]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
Merge pull request #1706 from lifeng68/do_remount
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
1ed6ba91 80#include "confile_utils.h"
8f3e280e 81#include "error.h"
1b09f2c0 82#include "log.h"
d8e48992 83#include "lxcaufs.h"
025ed0f3 84#include "lxclock.h"
8f3e280e
CB
85#include "lxcoverlay.h"
86#include "lxcseccomp.h"
4355ab5f 87#include "namespace.h"
8f3e280e
CB
88#include "network.h"
89#include "parse.h"
90#include "utils.h"
fe4de9a6 91#include "lsm/lsm.h"
d0a36f2c 92
e37dda71 93#if HAVE_LIBCAP
495d2046
SG
94#include <sys/capability.h>
95#endif
96
6ff05e18
SG
97#if HAVE_SYS_PERSONALITY_H
98#include <sys/personality.h>
99#endif
100
edaf8b1b
SG
101#if IS_BIONIC
102#include <../include/lxcmntent.h>
a04f5407
CB
103#ifndef HAVE_PRLIMIT
104#include <../include/prlimit.h>
105#endif
edaf8b1b
SG
106#else
107#include <mntent.h>
108#endif
109
36eb9bde 110lxc_log_define(lxc_conf, lxc);
e5bda9ee 111
e37dda71 112#if HAVE_LIBCAP
b09094da
MN
113#ifndef CAP_SETFCAP
114#define CAP_SETFCAP 31
115#endif
116
117#ifndef CAP_MAC_OVERRIDE
118#define CAP_MAC_OVERRIDE 32
119#endif
120
121#ifndef CAP_MAC_ADMIN
122#define CAP_MAC_ADMIN 33
123#endif
495d2046 124#endif
b09094da
MN
125
126#ifndef PR_CAPBSET_DROP
127#define PR_CAPBSET_DROP 24
128#endif
129
9818cae4
SG
130#ifndef LO_FLAGS_AUTOCLEAR
131#define LO_FLAGS_AUTOCLEAR 4
132#endif
133
bc5b27d6
DK
134#ifndef CAP_SETUID
135#define CAP_SETUID 7
136#endif
137
138#ifndef CAP_SETGID
139#define CAP_SETGID 6
140#endif
141
0769b82a
CS
142/* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144#ifndef CAP_SYS_ADMIN
145#define CAP_SYS_ADMIN 21
146#endif
147
2d76d1d7
SG
/*
 * pivot_root() compatibility shim.
 *
 * When the build did not detect a C library wrapper (HAVE_PIVOT_ROOT unset),
 * provide a file-local one that issues the raw system call. If the syscall
 * number is not known at compile time the wrapper fails with errno set to
 * ENOSYS. Otherwise only declare the C library's symbol.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
162
/*
 * sethostname() compatibility shim.
 *
 * Only compiled when the C library lacks sethostname() (HAVE_SETHOSTNAME
 * unset). Issues the raw system call, or fails with errno set to ENOSYS when
 * the syscall number is not known at compile time.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
ecec0126
SG
176#ifndef MS_PRIVATE
177#define MS_PRIVATE (1<<18)
178#endif
179
8912711c
CB
180#ifndef MS_LAZYTIME
181#define MS_LAZYTIME (1<<25)
182#endif
183
5ef5c9a3
CB
184/* memfd_create() */
185#ifndef MFD_CLOEXEC
186#define MFD_CLOEXEC 0x0001U
187#endif
188
189#ifndef MFD_ALLOW_SEALING
190#define MFD_ALLOW_SEALING 0x0002U
191#endif
192
/*
 * memfd_create() compatibility shim.
 *
 * When the C library provides memfd_create() (HAVE_MEMFD_CREATE set) only the
 * declaration is emitted. Otherwise a file-local wrapper issues the raw
 * system call. If the kernel headers do not define __NR_memfd_create, the
 * number is taken from the per-architecture table below; on an architecture
 * missing from the table the wrapper fails with errno set to ENOSYS.
 */
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#endif
236
72d0e1cb 237char *lxchook_names[NUM_LXC_HOOKS] = {
52492063 238 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
72d0e1cb 239
a589434e 240typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
0ad19a3f 241
998ac676
RT
242struct mount_opt {
243 char *name;
244 int clear;
245 int flag;
246};
247
81810dd1
DL
248struct caps_opt {
249 char *name;
250 int value;
251};
252
c6d09e15
WB
253struct limit_opt {
254 char *name;
255 int value;
256};
257
858377e4
SH
258/*
259 * The lxc_conf of the container currently being worked on in an
260 * API call
261 * This is used in the error calls
262 */
263#ifdef HAVE_TLS
264__thread struct lxc_conf *current_config;
265#else
266struct lxc_conf *current_config;
267#endif
268
0769b82a
CS
269/* Declare this here, since we don't want to reshuffle the whole file. */
270static int in_caplist(int cap, struct lxc_list *caps);
271
a589434e
JN
272static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
273static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
274static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
275static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
276static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
277static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
278
279static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
280 [LXC_NET_VETH] = instantiate_veth,
281 [LXC_NET_MACVLAN] = instantiate_macvlan,
282 [LXC_NET_VLAN] = instantiate_vlan,
283 [LXC_NET_PHYS] = instantiate_phys,
284 [LXC_NET_EMPTY] = instantiate_empty,
285 [LXC_NET_NONE] = instantiate_none,
0ad19a3f 286};
287
74a2b586
JK
288static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
289static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
290static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
291static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
292static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 293static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586 294
a589434e 295static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
74a2b586
JK
296 [LXC_NET_VETH] = shutdown_veth,
297 [LXC_NET_MACVLAN] = shutdown_macvlan,
298 [LXC_NET_VLAN] = shutdown_vlan,
299 [LXC_NET_PHYS] = shutdown_phys,
300 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 301 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
302};
303
998ac676 304static struct mount_opt mount_opt[] = {
470b359b
CB
305 { "async", 1, MS_SYNCHRONOUS },
306 { "atime", 1, MS_NOATIME },
307 { "bind", 0, MS_BIND },
88d413d5 308 { "defaults", 0, 0 },
88d413d5 309 { "dev", 1, MS_NODEV },
470b359b 310 { "diratime", 1, MS_NODIRATIME },
88d413d5 311 { "dirsync", 0, MS_DIRSYNC },
470b359b 312 { "exec", 1, MS_NOEXEC },
8912711c 313 { "lazytime", 0, MS_LAZYTIME },
88d413d5 314 { "mand", 0, MS_MANDLOCK },
88d413d5 315 { "noatime", 0, MS_NOATIME },
470b359b 316 { "nodev", 0, MS_NODEV },
88d413d5 317 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
318 { "noexec", 0, MS_NOEXEC },
319 { "nomand", 1, MS_MANDLOCK },
320 { "norelatime", 1, MS_RELATIME },
321 { "nostrictatime", 1, MS_STRICTATIME },
322 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
323 { "rbind", 0, MS_BIND|MS_REC },
324 { "relatime", 0, MS_RELATIME },
470b359b
CB
325 { "remount", 0, MS_REMOUNT },
326 { "ro", 0, MS_RDONLY },
327 { "rw", 1, MS_RDONLY },
88d413d5 328 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
329 { "suid", 1, MS_NOSUID },
330 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 331 { NULL, 0, 0 },
998ac676
RT
332};
333
e37dda71 334#if HAVE_LIBCAP
81810dd1 335static struct caps_opt caps_opt[] = {
a6afdde9 336 { "chown", CAP_CHOWN },
1e11be34
DL
337 { "dac_override", CAP_DAC_OVERRIDE },
338 { "dac_read_search", CAP_DAC_READ_SEARCH },
339 { "fowner", CAP_FOWNER },
340 { "fsetid", CAP_FSETID },
81810dd1
DL
341 { "kill", CAP_KILL },
342 { "setgid", CAP_SETGID },
343 { "setuid", CAP_SETUID },
344 { "setpcap", CAP_SETPCAP },
345 { "linux_immutable", CAP_LINUX_IMMUTABLE },
346 { "net_bind_service", CAP_NET_BIND_SERVICE },
347 { "net_broadcast", CAP_NET_BROADCAST },
348 { "net_admin", CAP_NET_ADMIN },
349 { "net_raw", CAP_NET_RAW },
350 { "ipc_lock", CAP_IPC_LOCK },
351 { "ipc_owner", CAP_IPC_OWNER },
352 { "sys_module", CAP_SYS_MODULE },
353 { "sys_rawio", CAP_SYS_RAWIO },
354 { "sys_chroot", CAP_SYS_CHROOT },
355 { "sys_ptrace", CAP_SYS_PTRACE },
356 { "sys_pacct", CAP_SYS_PACCT },
357 { "sys_admin", CAP_SYS_ADMIN },
358 { "sys_boot", CAP_SYS_BOOT },
359 { "sys_nice", CAP_SYS_NICE },
360 { "sys_resource", CAP_SYS_RESOURCE },
361 { "sys_time", CAP_SYS_TIME },
362 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
363 { "mknod", CAP_MKNOD },
364 { "lease", CAP_LEASE },
57b837e2
CB
365#ifdef CAP_AUDIT_READ
366 { "audit_read", CAP_AUDIT_READ },
367#endif
9527e566 368#ifdef CAP_AUDIT_WRITE
81810dd1 369 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
370#endif
371#ifdef CAP_AUDIT_CONTROL
81810dd1 372 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 373#endif
81810dd1
DL
374 { "setfcap", CAP_SETFCAP },
375 { "mac_override", CAP_MAC_OVERRIDE },
376 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
377#ifdef CAP_SYSLOG
378 { "syslog", CAP_SYSLOG },
379#endif
380#ifdef CAP_WAKE_ALARM
381 { "wake_alarm", CAP_WAKE_ALARM },
382#endif
2b54359b
CB
383#ifdef CAP_BLOCK_SUSPEND
384 { "block_suspend", CAP_BLOCK_SUSPEND },
385#endif
81810dd1 386};
495d2046
SG
387#else
388static struct caps_opt caps_opt[] = {};
389#endif
81810dd1 390
c6d09e15
WB
391static struct limit_opt limit_opt[] = {
392#ifdef RLIMIT_AS
393 { "as", RLIMIT_AS },
394#endif
395#ifdef RLIMIT_CORE
396 { "core", RLIMIT_CORE },
397#endif
398#ifdef RLIMIT_CPU
399 { "cpu", RLIMIT_CPU },
400#endif
401#ifdef RLIMIT_DATA
402 { "data", RLIMIT_DATA },
403#endif
404#ifdef RLIMIT_FSIZE
405 { "fsize", RLIMIT_FSIZE },
406#endif
407#ifdef RLIMIT_LOCKS
408 { "locks", RLIMIT_LOCKS },
409#endif
410#ifdef RLIMIT_MEMLOCK
411 { "memlock", RLIMIT_MEMLOCK },
412#endif
413#ifdef RLIMIT_MSGQUEUE
414 { "msgqueue", RLIMIT_MSGQUEUE },
415#endif
416#ifdef RLIMIT_NICE
417 { "nice", RLIMIT_NICE },
418#endif
419#ifdef RLIMIT_NOFILE
420 { "nofile", RLIMIT_NOFILE },
421#endif
422#ifdef RLIMIT_NPROC
423 { "nproc", RLIMIT_NPROC },
424#endif
425#ifdef RLIMIT_RSS
426 { "rss", RLIMIT_RSS },
427#endif
428#ifdef RLIMIT_RTPRIO
429 { "rtprio", RLIMIT_RTPRIO },
430#endif
431#ifdef RLIMIT_RTTIME
432 { "rttime", RLIMIT_RTTIME },
433#endif
434#ifdef RLIMIT_SIGPENDING
435 { "sigpending", RLIMIT_SIGPENDING },
436#endif
437#ifdef RLIMIT_STACK
438 { "stack", RLIMIT_STACK },
439#endif
440};
441
91c3830e
SH
442static int run_buffer(char *buffer)
443{
ebec9176 444 struct lxc_popen_FILE *f;
91c3830e 445 char *output;
8e7da691 446 int ret;
91c3830e 447
ebec9176 448 f = lxc_popen(buffer);
91c3830e 449 if (!f) {
062b72c6 450 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
451 return -1;
452 }
453
454 output = malloc(LXC_LOG_BUFFER_SIZE);
455 if (!output) {
062b72c6 456 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 457 lxc_pclose(f);
91c3830e
SH
458 return -1;
459 }
460
062b72c6
CB
461 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
462 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
463
464 free(output);
465
ebec9176 466 ret = lxc_pclose(f);
8e7da691 467 if (ret == -1) {
062b72c6 468 SYSERROR("Script exited with error.");
91c3830e 469 return -1;
8e7da691 470 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 471 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
472 return -1;
473 } else if (WIFSIGNALED(ret)) {
062b72c6 474 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 475 return -1;
91c3830e
SH
476 }
477
478 return 0;
479}
480
/*
 * run_script_argv - build "<script> <name> <section> <hook> [args...]" and
 * run it through run_buffer().
 *
 * @name:    container name, second word of the command line.
 * @section: config section, third word of the command line.
 * @script:  script to execute, first word of the command line.
 * @hook:    hook name, fourth word of the command line.
 * @lxcpath: not used by this function.
 * @argsin:  optional NULL-terminated array of extra arguments; may be NULL.
 *
 * The command line is assembled in a heap buffer. The length is derived from
 * caller-supplied strings, so it must not be placed on the stack: alloca()
 * cannot report failure and would simply overflow the stack for large inputs.
 *
 * Returns the result of run_buffer(), or -1 if the command line cannot be
 * built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	(void)lxcpath;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* One leading space per extra argument. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces plus the terminating NUL byte. */
	size += 3;

	/* Offsets into the buffer are tracked as int below. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			free(buffer);
			return -1;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
531
062b72c6
CB
/*
 * run_script - build "<script> <name> <section> [args...]" and run it
 * through run_buffer().
 *
 * @name:    container name, second word of the command line.
 * @section: config section, third word of the command line.
 * @script:  script to execute, first word of the command line.
 * @...:     extra arguments as char pointers; the list MUST be terminated
 *           by a NULL pointer, which is what stops both argument walks.
 *
 * The command line is assembled in a heap buffer rather than with alloca(),
 * because its length is derived from caller-supplied strings and alloca()
 * cannot report failure. Every va_start() is paired with a va_end() on all
 * return paths, including the "args too long" error path.
 *
 * Returns the result of run_buffer(), or -1 if the command line cannot be
 * built.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: one leading space per extra argument. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces plus the terminating NUL byte. */
	size += 3;

	/* Offsets into the buffer are tracked as int below. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
583
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately after
 * it is opened so that no name is left behind in the rootfs.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (rootfs is NULL or empty, cannot
 *           be resolved by realpath(), or is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Reject both an output error (negative) and truncation. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open descriptor is what pins the fs; the name is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
626
e2a7e8dc
SH
/*
 * When a remount is requested, carry over the restrictive flags (nosuid,
 * nodev, read-only, noexec) that the existing mount already has, so the
 * remount does not drop them.
 *
 * @s:     path to query with statvfs(); when NULL, @d is queried instead.
 * @d:     fallback path used when @s is NULL.
 * @flags: the mount flags requested by the caller.
 *
 * Returns @flags unchanged if MS_REMOUNT is not set, if both paths are NULL,
 * if statvfs() fails, or if the build has no statvfs support. Otherwise
 * returns @flags with the restrictive flags found on the path added.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	const unsigned long inherited = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *target = s ? s : d;
	struct statvfs vfs;

	if (!(flags & MS_REMOUNT) || !target)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	return flags | (vfs.f_flag & inherited);
#else
	return flags;
#endif
}
663
4fb3cba5 664static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 665{
368bbc02 666 int r;
80e80c40 667 int i;
b06b8511
CS
668 static struct {
669 int match_mask;
670 int match_flag;
671 const char *source;
672 const char *destination;
673 const char *fstype;
674 unsigned long flags;
675 const char *options;
676 } default_mounts[] = {
677 /* Read-only bind-mounting... In older kernels, doing that required
678 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
679 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
680 * kernel 2.6.26 onwards. However, this apparently does not work on
681 * kernel 3.8. Unfortunately, on that very same kernel, doing the
682 * same trick as above doesn't seem to work either, there one needs
683 * to ALSO specify MS_BIND for the remount, otherwise the entire
684 * fs is remounted read-only or the mount fails because it's busy...
685 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
686 * 2.6.32...
368bbc02 687 */
f24a52d5 688 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
689 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
690 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
697 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
705 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 706 };
368bbc02 707
b06b8511
CS
708 for (i = 0; default_mounts[i].match_mask; i++) {
709 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
710 char *source = NULL;
711 char *destination = NULL;
712 int saved_errno;
e2a7e8dc 713 unsigned long mflags;
b06b8511
CS
714
715 if (default_mounts[i].source) {
716 /* will act like strdup if %r is not present */
8ede5f4c 717 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
718 if (!source) {
719 SYSERROR("memory allocation error");
720 return -1;
721 }
722 }
cc4fd506
SH
723 if (!default_mounts[i].destination) {
724 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 725 free(source);
cc4fd506
SH
726 return -1;
727 }
728 /* will act like strdup if %r is not present */
729 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
730 if (!destination) {
731 saved_errno = errno;
732 SYSERROR("memory allocation error");
733 free(source);
734 errno = saved_errno;
735 return -1;
b06b8511 736 }
e2a7e8dc
SH
737 mflags = add_required_remount_flags(source, destination,
738 default_mounts[i].flags);
592fd47a 739 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 740 saved_errno = errno;
b88ff9a0
SG
741 if (r < 0 && errno == ENOENT) {
742 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
743 r = 0;
744 }
745 else if (r < 0)
e2a7e8dc 746 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 747
b06b8511
CS
748 free(source);
749 free(destination);
750 if (r < 0) {
b06b8511
CS
751 errno = saved_errno;
752 return -1;
753 }
368bbc02 754 }
368bbc02
CS
755 }
756
b06b8511 757 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
758 int cg_flags;
759
760 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
761 /* If the type of cgroup mount was not specified, it depends on the
762 * container's capabilities as to what makes sense: if we have
763 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
764 * anyway, so we may as well default to read-write; then the admin
765 * will not be given a false sense of security. (And if they really
766 * want mixed r/o r/w, then they can explicitly specify :mixed.)
767 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
768 * :mixed, because then the container can't remount it read-write. */
769 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
770 int has_sys_admin = 0;
b0ee5983
CB
771
772 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 773 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 774 else
0769b82a 775 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
776
777 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 778 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 779 else
0769b82a 780 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
781 }
782
8ede5f4c 783 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 784 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 785 return -1;
368bbc02
CS
786 }
787 }
788
368bbc02 789 return 0;
368bbc02
CS
790}
791
/*
 * setup_utsname - set the hostname from @utsname->nodename.
 *
 * A NULL @utsname means there is nothing to configure and counts as success.
 *
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
806
69aa6655
DE
807struct dev_symlinks {
808 const char *oldpath;
809 const char *name;
810};
811
812static const struct dev_symlinks dev_symlinks[] = {
813 {"/proc/self/fd", "fd"},
814 {"/proc/self/fd/0", "stdin"},
815 {"/proc/self/fd/1", "stdout"},
816 {"/proc/self/fd/2", "stderr"},
817};
818
819static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
820{
821 char path[MAXPATHLEN];
822 int ret,i;
09227be2 823 struct stat s;
69aa6655
DE
824
825
826 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
827 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 828 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
829 if (ret < 0 || ret >= MAXPATHLEN)
830 return -1;
09227be2
MW
831
832 /*
833 * Stat the path first. If we don't get an error
834 * accept it as is and don't try to create it
835 */
836 if (!stat(path, &s)) {
837 continue;
838 }
839
69aa6655 840 ret = symlink(d->oldpath, path);
09227be2 841
69aa6655 842 if (ret && errno != EEXIST) {
09227be2
MW
843 if ( errno == EROFS ) {
844 WARN("Warning: Read Only file system while creating %s", path);
845 } else {
846 SYSERROR("Error creating %s", path);
847 return -1;
848 }
69aa6655
DE
849 }
850 }
851 return 0;
852}
853
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * @pp:   address of the heap-allocated list. When *pp is NULL a new string
 *        "container_ttys=<name>" is allocated; otherwise " <name>" is
 *        appended to the existing string, which is grown with realloc().
 * @name: pty name to add.
 *
 * Returns true on success, false if memory could not be allocated. On
 * failure *pp keeps its previous value.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t name_len = strlen(name);
	size_t cur_len;
	char *grown;

	if (*pp == NULL) {
		grown = malloc(name_len + prefix_len + 1);
		if (grown == NULL)
			return false;

		memcpy(grown, prefix, prefix_len);
		/* Copies the terminating NUL byte as well. */
		memcpy(grown + prefix_len, name, name_len + 1);
		*pp = grown;
		return true;
	}

	cur_len = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	grown = realloc(*pp, cur_len + name_len + 2);
	if (grown == NULL)
		return false;

	grown[cur_len] = ' ';
	memcpy(grown + cur_len + 1, name, name_len + 1);
	*pp = grown;
	return true;
}
876
9e1045e3 877static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 878{
9e1045e3 879 int i, ret;
393903d1
SH
880 const struct lxc_tty_info *tty_info = &conf->tty_info;
881 char *ttydir = conf->ttydir;
7c6ef2a2 882 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 883
e8bd4e43 884 if (!conf->rootfs.path)
bc9bd0e3
DL
885 return 0;
886
b0a33c1e 887 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 888 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
889
e8bd4e43 890 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 891 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
892 ERROR("pathname too long for ttys");
893 return -1;
894 }
9e1045e3 895
7c6ef2a2
SH
896 if (ttydir) {
897 /* create dev/lxc/tty%d" */
9e1045e3
CB
898 ret = snprintf(lxcpath, sizeof(lxcpath),
899 "/dev/%s/tty%d", ttydir, i + 1);
900 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
901 ERROR("pathname too long for ttys");
902 return -1;
903 }
9e1045e3 904
7c6ef2a2 905 ret = creat(lxcpath, 0660);
9e1045e3
CB
906 if (ret < 0 && errno != EEXIST) {
907 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
908 return -1;
909 }
4d44e274
SH
910 if (ret >= 0)
911 close(ret);
9e1045e3 912
7c6ef2a2 913 ret = unlink(path);
9e1045e3
CB
914 if (ret < 0 && errno != ENOENT) {
915 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
916 return -1;
917 }
b0a33c1e 918
9e1045e3
CB
919 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
920 if (ret < 0) {
921 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
922 pty_info->name, path);
923 continue;
924 }
9e1045e3
CB
925 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
926 path);
13954cce 927
9e1045e3
CB
928 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
929 ttydir, i + 1);
930 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
931 ERROR("tty pathname too long");
932 return -1;
933 }
9e1045e3 934
7c6ef2a2 935 ret = symlink(lxcpath, path);
9e1045e3
CB
936 if (ret < 0) {
937 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
938 path, lxcpath);
7c6ef2a2
SH
939 return -1;
940 }
941 } else {
9e1045e3
CB
942 /* If we populated /dev, then we need to create
943 * /dev/ttyN
944 */
945 ret = access(path, F_OK);
946 if (ret < 0) {
c6883f38 947 ret = creat(path, 0660);
9e1045e3
CB
948 if (ret < 0) {
949 SYSERROR("failed to create \"%s\"", path);
c6883f38 950 /* this isn't fatal, continue */
025ed0f3 951 } else {
c6883f38 952 close(ret);
025ed0f3 953 }
c6883f38 954 }
9e1045e3
CB
955
956 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
957 if (ret < 0) {
e8bd4e43 958 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
959 continue;
960 }
9e1045e3
CB
961
962 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
963 path);
393903d1 964 }
9e1045e3 965
e8bd4e43 966 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
967 ERROR("Error setting up container_ttys string");
968 return -1;
b0a33c1e 969 }
970 }
971
9e1045e3 972 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 973 return 0;
974}
975
/*
 * setup_rootfs_pivot_root - make @rootfs the root of the mount namespace.
 *
 * Directory descriptors for the old and new root are opened up front so the
 * function can move between them with fchdir(). After chdir'ing into the new
 * root, pivot_root(".", ".") is called, the old root is lazily detached, and
 * the working directory is set back to the new root.
 *
 * Returns 0 on success, -1 on failure. Both descriptors are closed on every
 * path.
 */
static int setup_rootfs_pivot_root(const char *rootfs)
{
	int old_fd;
	int new_fd = -1;

	old_fd = open("/", O_DIRECTORY | O_RDONLY);
	if (old_fd < 0) {
		SYSERROR("Error opening old-/ for fchdir");
		return -1;
	}

	new_fd = open(rootfs, O_DIRECTORY | O_RDONLY);
	if (new_fd < 0) {
		SYSERROR("Error opening new-/ for fchdir");
		goto on_error;
	}

	/* change into new root fs */
	if (fchdir(new_fd) != 0) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		goto on_error;
	}

	/* pivot_root into our new root fs */
	if (pivot_root(".", ".") != 0) {
		SYSERROR("pivot_root syscall failed");
		goto on_error;
	}

	/*
	 * at this point the old-root is mounted on top of our new-root
	 * To unmounted it we must not be chdir'd into it, so escape back
	 * to old-root
	 */
	if (fchdir(old_fd) < 0) {
		SYSERROR("Error entering oldroot");
		goto on_error;
	}

	if (umount2(".", MNT_DETACH) < 0) {
		SYSERROR("Error detaching old root");
		goto on_error;
	}

	if (fchdir(new_fd) < 0) {
		SYSERROR("Error re-entering newroot");
		goto on_error;
	}

	close(old_fd);
	close(new_fd);

	DEBUG("pivot_root syscall to '%s' successful", rootfs);

	return 0;

on_error:
	/* old_fd is always valid here; new_fd is -1 if its open() failed. */
	close(old_fd);
	if (new_fd != -1)
		close(new_fd);
	return -1;
}
1036
bc6928ff 1037/*
87da4ec3
SH
1038 * Just create a path for /dev under $lxcpath/$name and in rootfs
1039 * If we hit an error, log it but don't fail yet.
91c3830e 1040 */
14221cbb 1041static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
91c3830e
SH
1042{
1043 int ret;
87da4ec3
SH
1044 size_t clen;
1045 char *path;
91c3830e 1046
14221cbb 1047 INFO("Mounting container /dev");
bc6928ff 1048
14221cbb 1049 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1050 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1051 path = alloca(clen);
bc6928ff 1052
ec50007f 1053 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
87da4ec3 1054 if (ret < 0 || ret >= clen)
91c3830e 1055 return -1;
bc6928ff 1056
87da4ec3 1057 if (!dir_exists(path)) {
14221cbb 1058 WARN("No /dev in container.");
87da4ec3
SH
1059 WARN("Proceeding without autodev setup");
1060 return 0;
bc6928ff 1061 }
87da4ec3 1062
1ec0e8e3 1063 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
ec50007f 1064 rootfs->path ? rootfs->mount : NULL);
1ec0e8e3 1065 if (ret != 0) {
87da4ec3 1066 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1ec0e8e3 1067 return -1;
91c3830e 1068 }
87da4ec3
SH
1069
1070 INFO("Mounted tmpfs onto %s", path);
1071
ec50007f 1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87da4ec3 1073 if (ret < 0 || ret >= clen)
91c3830e 1074 return -1;
87da4ec3 1075
bc6928ff
MW
1076 /*
1077 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1078 * If not, then create it and exit if that fails...
1079 */
87da4ec3 1080 if (!dir_exists(path)) {
bc6928ff
MW
1081 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1082 if (ret) {
1083 SYSERROR("Failed to create /dev/pts in container");
1084 return -1;
1085 }
91c3830e
SH
1086 }
1087
14221cbb 1088 INFO("Mounted container /dev");
91c3830e
SH
1089 return 0;
1090}
1091
/* Description of one character device node to provide in the container's
 * /dev: node name, mknod() mode and device major/minor numbers.
 */
struct lxc_devs {
	const char *name;
	mode_t mode;
	int maj;
	int min;
};

/* Device nodes that lxc_fill_autodev() creates (or bind-mounts) in /dev. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1107
27245ff7 1108static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1109{
1110 int ret;
c6883f38
SH
1111 char path[MAXPATHLEN];
1112 int i;
3a32201c 1113 mode_t cmask;
c6883f38 1114
ec50007f 1115 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1116 if (ret < 0 || ret >= MAXPATHLEN) {
1117 ERROR("Error calculating container /dev location");
c6883f38 1118 return -1;
f7bee6c6 1119 }
91c3830e 1120
0bbf8572
CB
1121 /* ignore, just don't try to fill in */
1122 if (!dir_exists(path))
9cb4d183
SH
1123 return 0;
1124
0bbf8572 1125 INFO("populating container /dev");
3a32201c 1126 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1127 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1128 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1129
ec50007f 1130 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1131 if (ret < 0 || ret >= MAXPATHLEN)
1132 return -1;
0bbf8572 1133
c6883f38 1134 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1135 if (ret < 0) {
9cb4d183
SH
1136 char hostpath[MAXPATHLEN];
1137 FILE *pathfile;
1138
0bbf8572
CB
1139 if (errno == EEXIST) {
1140 DEBUG("\"%s\" device already existed", path);
1141 continue;
1142 }
1143
1144 /* Unprivileged containers cannot create devices, so
1145 * bind mount the device from the host.
1146 */
9cb4d183
SH
1147 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1148 if (ret < 0 || ret >= MAXPATHLEN)
1149 return -1;
1150 pathfile = fopen(path, "wb");
1151 if (!pathfile) {
1152 SYSERROR("Failed to create device mount target '%s'", path);
1153 return -1;
1154 }
1155 fclose(pathfile);
0bbf8572
CB
1156 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1157 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1158 return -1;
1159 }
0bbf8572
CB
1160 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1161 } else {
1162 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1163 }
1164 }
3a32201c 1165 umask(cmask);
c6883f38 1166
0bbf8572 1167 INFO("populated container /dev");
c6883f38
SH
1168 return 0;
1169}
1170
9aa76a17 1171static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1172{
9aa76a17 1173 int ret;
91c3e281
CB
1174 struct bdev *bdev;
1175 const struct lxc_rootfs *rootfs;
cc28d0b0 1176
91c3e281 1177 rootfs = &conf->rootfs;
a0f379bf 1178 if (!rootfs->path) {
91c3e281
CB
1179 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1180 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1181 return -1;
1182 }
c69bd12f 1183 return 0;
a0f379bf 1184 }
0ad19a3f 1185
12297168 1186 if (access(rootfs->mount, F_OK)) {
91c3e281 1187 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1188 rootfs->mount);
b1789442
DL
1189 return -1;
1190 }
1191
91c3e281 1192 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1193 if (!bdev) {
1194 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1195 rootfs->path, rootfs->mount,
1196 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1197 return -1;
9be53773 1198 }
9aa76a17
CB
1199
1200 ret = bdev->ops->mount(bdev);
1201 bdev_put(bdev);
1202 if (ret < 0) {
91c3e281
CB
1203 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1204 rootfs->path, rootfs->mount,
1205 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1206 return -1;
1207 }
0ad19a3f 1208
91c3e281
CB
1209 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1210 rootfs->path, rootfs->mount,
1211 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1212
ac778708
DL
1213 return 0;
1214}
1215
91e93c71
AV
1216int prepare_ramfs_root(char *root)
1217{
eab15c1e 1218 char buf[LXC_LINELEN], *p;
91e93c71
AV
1219 char nroot[PATH_MAX];
1220 FILE *f;
1221 int i;
1222 char *p2;
1223
1224 if (realpath(root, nroot) == NULL)
39c7b795 1225 return -errno;
91e93c71
AV
1226
1227 if (chdir("/") == -1)
39c7b795 1228 return -errno;
91e93c71
AV
1229
1230 /*
1231 * We could use here MS_MOVE, but in userns this mount is
1232 * locked and can't be moved.
1233 */
39c7b795 1234 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1235 SYSERROR("Failed to move %s into /", root);
39c7b795 1236 return -errno;
91e93c71
AV
1237 }
1238
39c7b795 1239 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1240 SYSERROR("Failed to make . rprivate");
39c7b795 1241 return -errno;
91e93c71
AV
1242 }
1243
1244 /*
1245 * The following code cleans up inhereted mounts which are not
1246 * required for CT.
1247 *
1248 * The mountinfo file shows not all mounts, if a few points have been
1249 * unmounted between read operations from the mountinfo. So we need to
1250 * read mountinfo a few times.
1251 *
1252 * This loop can be skipped if a container uses unserns, because all
1253 * inherited mounts are locked and we should live with all this trash.
1254 */
1255 while (1) {
1256 int progress = 0;
1257
1258 f = fopen("./proc/self/mountinfo", "r");
1259 if (!f) {
1260 SYSERROR("Unable to open /proc/self/mountinfo");
1261 return -1;
1262 }
eab15c1e 1263 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1264 for (p = buf, i=0; p && i < 4; i++)
1265 p = strchr(p+1, ' ');
1266 if (!p)
1267 continue;
1268 p2 = strchr(p+1, ' ');
1269 if (!p2)
1270 continue;
1271
1272 *p2 = '\0';
1273 *p = '.';
1274
1275 if (strcmp(p + 1, "/") == 0)
1276 continue;
1277 if (strcmp(p + 1, "/proc") == 0)
1278 continue;
1279
1280 if (umount2(p, MNT_DETACH) == 0)
1281 progress++;
1282 }
1283 fclose(f);
1284 if (!progress)
1285 break;
1286 }
1287
8bea9fae
PR
1288 /* This also can be skipped if a container uses unserns */
1289 umount2("./proc", MNT_DETACH);
91e93c71
AV
1290
1291 /* It is weird, but chdir("..") moves us in a new root */
1292 if (chdir("..") == -1) {
1293 SYSERROR("Unable to change working directory");
1294 return -1;
1295 }
1296
1297 if (chroot(".") == -1) {
1298 SYSERROR("Unable to chroot");
1299 return -1;
1300 }
1301
1302 return 0;
1303}
1304
74a3920a 1305static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1306{
39c7b795
CB
1307 if (!rootfs->path) {
1308 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1309 return 0;
39c7b795 1310 }
ac778708 1311
91e93c71 1312 if (detect_ramfs_rootfs()) {
39c7b795
CB
1313 DEBUG("detected that container is on ramfs");
1314 if (prepare_ramfs_root(rootfs->mount)) {
1315 ERROR("failed to prepare minimal ramfs root");
91e93c71 1316 return -1;
39c7b795
CB
1317 }
1318
1319 DEBUG("prepared ramfs root for container");
1320 return 0;
1321 }
1322
1323 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1324 ERROR("failed to pivot root");
25368b52 1325 return -1;
c69bd12f
DL
1326 }
1327
39c7b795 1328 DEBUG("finished pivot root");
25368b52 1329 return 0;
0ad19a3f 1330}
1331
/*
 * Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to its
 * ptmx node.
 *
 * Nothing is done when @num_pts is 0. /dev/ptmx is preferably provided as a
 * bind mount of /dev/pts/ptmx; if the bind mount fails a symlink is created
 * instead.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int ret;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* An accessible /dev/pts/ptmx means an older instance is mounted:
	 * get rid of it first.
	 */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		ret = umount("/dev/pts");
		if (ret < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists; an existing one is fine. */
	ret = mkdir("/dev/pts", 0755);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	ret = mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts);
	if (ret < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Drop whatever currently sits at /dev/ptmx. */
	if (access("/dev/ptmx", F_OK) == 0) {
		ret = remove("/dev/ptmx");
		if (ret < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* A bind mount needs a target: create an empty /dev/ptmx file. */
	ret = open("/dev/ptmx", O_CREAT, 0666);
	if (ret < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(ret);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
	if (ret == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: fall through and try a symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file created above is in the way of the symlink. */
	ret = remove("/dev/ptmx");
	if (ret < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
	if (ret < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1416
cccc74b5
DL
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave untouched". When the build does not have
 * HAVE_SYS_PERSONALITY_H set this function does nothing.
 *
 * Returns 0 on success (or when nothing was done), -1 on failure.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1433
3d7d929a
CB
1434static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1435 const struct lxc_console *console)
6e590161 1436{
63376d7d 1437 char path[MAXPATHLEN];
0728ebf4 1438 int ret, fd;
52e35957 1439
8b1b1210
CB
1440 if (console->path && !strcmp(console->path, "none"))
1441 return 0;
1442
7c6ef2a2 1443 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1444 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1445 return -1;
52e35957 1446
8b1b1210
CB
1447 /* When we are asked to setup a console we remove any previous
1448 * /dev/console bind-mounts.
1449 */
a7ba3c7f
CB
1450 if (file_exists(path)) {
1451 ret = lxc_unstack_mountpoint(path, false);
1452 if (ret < 0) {
8b1b1210 1453 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1454 return -ret;
1455 } else {
1456 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1457 }
953fe44f 1458
a7ba3c7f
CB
1459 ret = unlink(path);
1460 if (ret < 0) {
1461 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1462 return -errno;
1463 }
8b1b1210
CB
1464 }
1465
1466 /* For unprivileged containers autodev or automounts will already have
1467 * taken care of creating /dev/console.
1468 */
0728ebf4
TA
1469 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1470 if (fd < 0) {
1471 if (errno != EEXIST) {
1472 SYSERROR("failed to create console");
3d7d929a 1473 return -errno;
0728ebf4
TA
1474 }
1475 } else {
1476 close(fd);
52e35957
DL
1477 }
1478
0728ebf4 1479 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1480 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1481 return -errno;
63376d7d 1482 }
13954cce 1483
3d7d929a 1484 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1485 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1486 return -1;
1487 }
1488
3d7d929a 1489 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1490 return 0;
1491}
1492
3d7d929a
CB
1493static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1494 const struct lxc_console *console,
1495 char *ttydir)
7c6ef2a2 1496{
7c6ef2a2 1497 int ret;
3d7d929a 1498 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1499
1500 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1501 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1502 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1503 return -1;
3d7d929a 1504
7c6ef2a2
SH
1505 ret = mkdir(path, 0755);
1506 if (ret && errno != EEXIST) {
959aee9c 1507 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1508 return -errno;
7c6ef2a2 1509 }
3d7d929a 1510 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1511
3d7d929a
CB
1512 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1513 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1514 return -1;
1515
7c6ef2a2 1516 ret = creat(lxcpath, 0660);
3d7d929a 1517 if (ret == -1 && errno != EEXIST) {
959aee9c 1518 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1519 return -errno;
7c6ef2a2 1520 }
4d44e274
SH
1521 if (ret >= 0)
1522 close(ret);
7c6ef2a2 1523
2a12fefd
CB
1524 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1525 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1526 return -1;
2a12fefd
CB
1527
1528 /* When we are asked to setup a console we remove any previous
1529 * /dev/console bind-mounts.
1530 */
1531 if (console->path && !strcmp(console->path, "none")) {
1532 struct stat st;
1533 ret = stat(path, &st);
1534 if (ret < 0) {
1535 if (errno == ENOENT)
1536 return 0;
1537 SYSERROR("failed stat() \"%s\"", path);
1538 return -errno;
1539 }
1540
1541 /* /dev/console must be character device with major number 5 and
1542 * minor number 1. If not, give benefit of the doubt and assume
1543 * the user has mounted something else right there on purpose.
1544 */
1545 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1546 return 0;
1547
1548 /* In case the user requested a bind-mount for /dev/console and
1549 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1550 * /dev/<ttydir/console.
1551 * Note, we only move the uppermost mount and clear all other
1552 * mounts underneath for safety.
1553 * If it is a character device created via mknod() we simply
1554 * rename it.
2a12fefd
CB
1555 */
1556 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1557 if (ret < 0) {
1558 if (errno != EINVAL) {
1559 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1560 return -errno;
1561 }
1562 /* path was not a mountpoint */
1563 ret = rename(path, lxcpath);
1564 if (ret < 0) {
1565 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1566 return -errno;
1567 }
1568 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1569 } else {
1570 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1571 }
a7ba3c7f
CB
1572
1573 /* Clear all remaining bind-mounts. */
1574 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1575 if (ret < 0) {
a7ba3c7f
CB
1576 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1577 return -ret;
1578 } else {
1579 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1580 }
1581 } else {
1582 if (file_exists(path)) {
1583 ret = lxc_unstack_mountpoint(path, false);
1584 if (ret < 0) {
2a12fefd 1585 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1586 return -ret;
1587 } else {
1588 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1589 }
2a12fefd
CB
1590 }
1591
1592 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1593 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1594 return -1;
1595 }
1596 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1597 }
1598
2a12fefd 1599 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1600 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1601 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1602 return -1;
3d7d929a 1603
2a12fefd
CB
1604 ret = unlink(path);
1605 if (ret && errno != ENOENT) {
1606 SYSERROR("error unlinking %s", path);
1607 return -errno;
1608 }
1609
7c6ef2a2 1610 ret = symlink(lxcpath, path);
3d7d929a
CB
1611 if (ret < 0) {
1612 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1613 return -1;
1614 }
1615
3d7d929a 1616 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1617 return 0;
1618}
1619
3d7d929a
CB
1620static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1621 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1622{
3d7d929a
CB
1623 /* We don't have a rootfs, /dev/console will be shared. */
1624 if (!rootfs->path) {
1625 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1626 return 0;
3d7d929a
CB
1627 }
1628
7c6ef2a2 1629 if (!ttydir)
3d7d929a 1630 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1631
3d7d929a 1632 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1633}
1634
998ac676
RT
1635static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1636{
1637 struct mount_opt *mo;
1638
1639 /* If opt is found in mount_opt, set or clear flags.
1640 * Otherwise append it to data. */
1641
1642 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1643 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1644 if (mo->clear)
1645 *flags &= ~mo->flag;
1646 else
1647 *flags |= mo->flag;
1648 return;
1649 }
1650 }
1651
1652 if (strlen(*data))
1653 strcat(*data, ",");
1654 strcat(*data, opt);
1655}
1656
/*
 * Split the comma separated option string @mntopts into mount flags and
 * a data string.
 *
 * *@mntflags receives the accumulated flags (0 if there are none).
 * *@mntdata receives a newly allocated string holding the remaining
 * options, or NULL when there are none; the caller has to free() it.
 * A NULL @mntopts is valid and yields 0 / NULL.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *data;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the input. */
	data = malloc(strlen(copy) + 1);
	if (data == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	data[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &data);
		tok = strtok_r(NULL, ",", &state);
	}

	if (data[0] == '\0')
		free(data);
	else
		*mntdata = data;
	free(copy);

	return 0;
}
1695
6fd5e769
SH
/*
 * Terminate @word in place at its first space or tab character, if any.
 */
static void null_endofword(char *word)
{
	/* strcspn() stops at the first blank or at the existing '\0'. */
	word[strcspn(word, " \t")] = '\0';
}
1702
/*
 * Advance past @nfields blank-separated fields in @src.
 *
 * A field ends at a space or tab; exactly one such separator is consumed
 * per field. Returns a pointer to what follows, or to the terminating '\0'
 * when the string ends first.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining;

	for (remaining = nfields; remaining > 0; remaining--) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1720
911324ef
DL
/*
 * Perform one mount described by the arguments.
 *
 * The filesystem is first mounted through safe_mount() with MS_REMOUNT
 * masked out. If MS_REMOUNT or MS_BIND was requested a second mount() call
 * with MS_REMOUNT follows so that the requested flags take effect. When
 * statvfs() is available, flags found on @fsname (nosuid, nodev unless @dev
 * is set, ro, noexec) are carried over into that remount, and a pure bind
 * mount that needs no additional flags skips the remount altogether.
 *
 * If @optional is non-zero a failing mount is only logged and 0 is
 * returned. Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs) != 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	if (mountflags & (MS_REMOUNT | MS_BIND)) {
		/* Flags the caller explicitly asked for that need a remount. */
		unsigned long rqd_flags = (mountflags & MS_RDONLY) ? MS_RDONLY : 0;

		DEBUG("remounting %s on %s to respect bind or remount options",
		      fsname ? fsname : "(none)", target ? target : "(none)");
#ifdef HAVE_STATVFS
		if (statvfs(fsname, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
			if (!dev)
				required_flags |= sb.f_flag & MS_NODEV;

			DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

			/* For a plain bind mount request there is nothing to
			 * do if every required flag is already part of
			 * mountflags and no read-only remount was asked for.
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags) && rqd_flags == 0) {
				DEBUG("mountflags already was %lu, skipping remount",
				      mountflags);
				goto skipremount;
			}

			mountflags |= required_flags;
		}
#endif

		if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
			if (!optional) {
				SYSERROR("failed to mount '%s' on '%s'",
					 fsname, target);
				return -1;
			}

			INFO("failed to mount '%s' on '%s' (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}
	}

#ifdef HAVE_STATVFS
skipremount:
#endif
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1797
4e4ca161
SH
/*
 * Remove 'optional', 'create=dir', and 'create=file' from mntent->mnt_opts.
 *
 * The option string is edited in place. For each of the three names the
 * first occurrence is removed together with one adjoining comma, so that no
 * dangling separator is left behind (e.g. "ro,optional" becomes "ro", not
 * "ro,").
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const list[] = {
		"create=dir",
		"create=file",
		"optional",
		NULL
	};
	size_t i;

	for (i = 0; list[i]; i++) {
		char *opts = mntent->mnt_opts;
		char *p, *next;

		p = strstr(opts, list[i]);
		if (!p)
			continue;

		next = strchr(p, ',');
		if (next) {
			/* More options follow: shift them (and the '\0') down
			 * over the culled option and its trailing comma.
			 */
			memmove(p, next + 1, strlen(next + 1) + 1);
			continue;
		}

		/* No more mntopts, so just chop it here; also drop the comma
		 * that separated it from the preceding option.
		 */
		if (p > opts && p[-1] == ',')
			p--;
		*p = '\0';
	}
}
1822
/*
 * Create the mount target @path for @mntent when the entry asks for it.
 *
 * For "overlay*" and "aufs*" mount types the respective helper is called
 * first to create the directories those filesystems need. With the option
 * "create=dir" @path is created as a directory (including parents); with
 * "create=file" an empty file is created at @path if it is not accessible
 * yet, after trying to create its parent directory.
 *
 * Returns 0 on success, -1 if a helper failed or the target could not be
 * created. Failing to create the parent directory of a file target is only
 * warned about.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		FILE *pathfile;
		char *pathcopy;

		/* dirname() may modify its argument and may return a pointer
		 * that is not the one passed in, so keep the allocation in
		 * its own variable and free exactly that.
		 */
		pathcopy = strdup(path);
		if (!pathcopy || mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1863
ec50007f
CB
1864/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1865 * without a rootfs. */
db4aba38 1866static inline int mount_entry_on_generic(struct mntent *mntent,
0a2dddd4
CB
1867 const char* path, const struct lxc_rootfs *rootfs,
1868 const char *lxc_name, const char *lxc_path)
4d5b72a1
NC
1869{
1870 unsigned long mntflags;
1871 char *mntdata;
1872 int ret;
1873 bool optional = hasmntopt(mntent, "optional") != NULL;
ae7a770e 1874 bool dev = hasmntopt(mntent, "dev") != NULL;
4d5b72a1 1875
ec50007f
CB
1876 char *rootfs_path = NULL;
1877 if (rootfs && rootfs->path)
1878 rootfs_path = rootfs->mount;
1879
0a2dddd4 1880 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
34cfffb3 1881
608e3567
SH
1882 if (ret < 0)
1883 return optional ? 0 : -1;
1884
4e4ca161
SH
1885 cull_mntent_opt(mntent);
1886
a17b1e65
SG
1887 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
1888 free(mntdata);
1889 return -1;
1890 }
1891
6e46cc0d 1892 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 1893 mntdata, optional, dev, rootfs_path);
68c152ef 1894
911324ef 1895 free(mntdata);
911324ef
DL
1896 return ret;
1897}
1898
db4aba38
NC
/*
 * Mount @mntent for a container that has no rootfs.
 *
 * For such containers all mounts are treated as absolute paths starting at
 * / on the host, so a leading '/' is added to the target when missing.
 *
 * Returns the result of mount_entry_on_generic(), or -1 if the target path
 * does not fit into the buffer.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	const char *lead = (mntent->mnt_dir[0] == '/') ? "" : "/";
	char target[MAXPATHLEN];
	int len;

	len = snprintf(target, sizeof(target), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
1918
4e4ca161 1919static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 1920 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
1921 const char *lxc_name,
1922 const char *lxc_path)
911324ef 1923{
013bd428 1924 char *aux;
59760f5d 1925 char path[MAXPATHLEN];
80a881b2 1926 int r, ret = 0, offset;
67e571de 1927 const char *lxcpath;
0ad19a3f 1928
593e8478 1929 lxcpath = lxc_global_config_value("lxc.lxcpath");
2a59a681
SH
1930 if (!lxcpath) {
1931 ERROR("Out of memory");
1932 return -1;
1933 }
1934
80a881b2 1935 /* if rootfs->path is a blockdev path, allow container fstab to
2a59a681
SH
1936 * use $lxcpath/CN/rootfs as the target prefix */
1937 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
80a881b2
SH
1938 if (r < 0 || r >= MAXPATHLEN)
1939 goto skipvarlib;
1940
1941 aux = strstr(mntent->mnt_dir, path);
1942 if (aux) {
1943 offset = strlen(path);
1944 goto skipabs;
1945 }
1946
1947skipvarlib:
013bd428
DL
1948 aux = strstr(mntent->mnt_dir, rootfs->path);
1949 if (!aux) {
1950 WARN("ignoring mount point '%s'", mntent->mnt_dir);
db4aba38 1951 return ret;
013bd428 1952 }
80a881b2
SH
1953 offset = strlen(rootfs->path);
1954
1955skipabs:
013bd428 1956
9ba8130c 1957 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
80a881b2
SH
1958 aux + offset);
1959 if (r < 0 || r >= MAXPATHLEN) {
1960 WARN("pathnme too long for '%s'", mntent->mnt_dir);
a17b1e65
SG
1961 return -1;
1962 }
1963
0a2dddd4 1964 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 1965}
d330fe7b 1966
4e4ca161 1967static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
1968 const struct lxc_rootfs *rootfs,
1969 const char *lxc_name,
1970 const char *lxc_path)
911324ef
DL
1971{
1972 char path[MAXPATHLEN];
911324ef 1973 int ret;
d330fe7b 1974
34cfffb3 1975 /* relative to root mount point */
6e46cc0d 1976 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 1977 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
1978 ERROR("path name too long");
1979 return -1;
1980 }
911324ef 1981
0a2dddd4 1982 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
1983}
1984
80a881b2 1985static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
0a2dddd4 1986 const char *lxc_name, const char *lxc_path)
911324ef 1987{
aaf901be
AM
1988 struct mntent mntent;
1989 char buf[4096];
911324ef 1990 int ret = -1;
e76b8764 1991
aaf901be 1992 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
e76b8764 1993
911324ef 1994 if (!rootfs->path) {
aaf901be 1995 if (mount_entry_on_systemfs(&mntent))
e76b8764 1996 goto out;
911324ef 1997 continue;
e76b8764
CDC
1998 }
1999
911324ef 2000 /* We have a separate root, mounts are relative to it */
aaf901be 2001 if (mntent.mnt_dir[0] != '/') {
0a2dddd4 2002 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef
DL
2003 goto out;
2004 continue;
2005 }
cd54d859 2006
0a2dddd4 2007 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
911324ef 2008 goto out;
0ad19a3f 2009 }
cd54d859 2010
0ad19a3f 2011 ret = 0;
cd54d859
DL
2012
2013 INFO("mount points have been setup");
0ad19a3f 2014out:
e7938e9e
MN
2015 return ret;
2016}
2017
/* Mount all entries listed in the fstab file at @fstab.
 *
 * A NULL @fstab means there is nothing to do and counts as success.
 * Returns -1 if the file cannot be opened, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fp;
	int rc;

	if (fstab == NULL)
		return 0;

	fp = setmntent(fstab, "r");
	if (fp == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, fp, lxc_name, lxc_path);
	endmntent(fp);

	return rc;
}
2038
5ef5c9a3 2039FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2040{
5ef5c9a3 2041 int ret;
e7938e9e 2042 char *mount_entry;
5ef5c9a3
CB
2043 struct lxc_list *iterator;
2044 FILE *file;
2045 int fd = -1;
2046
2047 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2048 if (fd < 0) {
2049 if (errno != ENOSYS)
2050 return NULL;
2051 file = tmpfile();
2052 } else {
2053 file = fdopen(fd, "r+");
2054 }
e7938e9e 2055
e7938e9e 2056 if (!file) {
fad6ef95 2057 int saved_errno = errno;
5ef5c9a3
CB
2058 if (fd != -1)
2059 close(fd);
fad6ef95 2060 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
9fc7f8c0 2061 return NULL;
e7938e9e
MN
2062 }
2063
2064 lxc_list_for_each(iterator, mount) {
2065 mount_entry = iterator->elem;
5ef5c9a3
CB
2066 ret = fprintf(file, "%s\n", mount_entry);
2067 if (ret < strlen(mount_entry))
2068 WARN("Could not write mount entry to anonymous mount file.");
2069 }
2070
2071 if (fseek(file, 0, SEEK_SET) < 0) {
2072 fclose(file);
2073 return NULL;
e7938e9e
MN
2074 }
2075
9fc7f8c0
TA
2076 return file;
2077}
2078
5ef5c9a3
CB
/* Mount the entries of the @mount list.
 *
 * The list is first dumped into an anonymous fstab-style file by
 * make_anonymous_mount_file() and then processed by mount_file_entries().
 *
 * Returns -1 if the anonymous file cannot be created, otherwise the result
 * of mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	rc = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return rc;
}
2095
bab88e68
CS
2096static int parse_cap(const char *cap)
2097{
2098 char *ptr = NULL;
84760c11 2099 size_t i;
2100 int capid = -1;
bab88e68 2101
7035407c
DE
2102 if (!strcmp(cap, "none"))
2103 return -2;
2104
bab88e68
CS
2105 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2106
2107 if (strcmp(cap, caps_opt[i].name))
2108 continue;
2109
2110 capid = caps_opt[i].value;
2111 break;
2112 }
2113
2114 if (capid < 0) {
2115 /* try to see if it's numeric, so the user may specify
2116 * capabilities that the running kernel knows about but
2117 * we don't */
2118 errno = 0;
2119 capid = strtol(cap, &ptr, 10);
2120 if (!ptr || *ptr != '\0' || errno != 0)
2121 /* not a valid number */
2122 capid = -1;
2123 else if (capid > lxc_caps_last_cap())
2124 /* we have a number but it's not a valid
2125 * capability */
2126 capid = -1;
2127 }
2128
2129 return capid;
2130}
2131
0769b82a
CS
2132int in_caplist(int cap, struct lxc_list *caps)
2133{
2134 struct lxc_list *iterator;
2135 int capid;
2136
2137 lxc_list_for_each(iterator, caps) {
2138 capid = parse_cap(iterator->elem);
2139 if (capid == cap)
2140 return 1;
2141 }
2142
2143 return 0;
2144}
2145
81810dd1
DL
2146static int setup_caps(struct lxc_list *caps)
2147{
2148 struct lxc_list *iterator;
2149 char *drop_entry;
bab88e68 2150 int capid;
81810dd1
DL
2151
2152 lxc_list_for_each(iterator, caps) {
2153
2154 drop_entry = iterator->elem;
2155
bab88e68 2156 capid = parse_cap(drop_entry);
d55bc1ad 2157
81810dd1 2158 if (capid < 0) {
1e11be34
DL
2159 ERROR("unknown capability %s", drop_entry);
2160 return -1;
81810dd1
DL
2161 }
2162
2163 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2164
2165 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2166 SYSERROR("failed to remove %s capability", drop_entry);
2167 return -1;
2168 }
81810dd1
DL
2169
2170 }
2171
1fb86a7c
SH
2172 DEBUG("capabilities have been setup");
2173
2174 return 0;
2175}
2176
2177static int dropcaps_except(struct lxc_list *caps)
2178{
2179 struct lxc_list *iterator;
2180 char *keep_entry;
1fb86a7c
SH
2181 int i, capid;
2182 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2183 INFO("found %d capabilities", numcaps);
1fb86a7c 2184
2caf9a97
SH
2185 if (numcaps <= 0 || numcaps > 200)
2186 return -1;
2187
1fb86a7c
SH
2188 // caplist[i] is 1 if we keep capability i
2189 int *caplist = alloca(numcaps * sizeof(int));
2190 memset(caplist, 0, numcaps * sizeof(int));
2191
2192 lxc_list_for_each(iterator, caps) {
2193
2194 keep_entry = iterator->elem;
2195
bab88e68 2196 capid = parse_cap(keep_entry);
1fb86a7c 2197
7035407c
DE
2198 if (capid == -2)
2199 continue;
2200
1fb86a7c
SH
2201 if (capid < 0) {
2202 ERROR("unknown capability %s", keep_entry);
2203 return -1;
2204 }
2205
8255688a 2206 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2207
2208 caplist[capid] = 1;
2209 }
2210 for (i=0; i<numcaps; i++) {
2211 if (caplist[i])
2212 continue;
2213 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2214 SYSERROR("failed to remove capability %d", i);
2215 return -1;
2216 }
1fb86a7c
SH
2217 }
2218
2219 DEBUG("capabilities have been setup");
81810dd1
DL
2220
2221 return 0;
2222}
2223
0ad19a3f 2224static int setup_hw_addr(char *hwaddr, const char *ifname)
2225{
2226 struct sockaddr sockaddr;
2227 struct ifreq ifr;
fad6ef95 2228 int ret, fd, saved_errno;
0ad19a3f 2229
3cfc0f3a
MN
2230 ret = lxc_convert_mac(hwaddr, &sockaddr);
2231 if (ret) {
2232 ERROR("mac address '%s' conversion failed : %s",
2233 hwaddr, strerror(-ret));
0ad19a3f 2234 return -1;
2235 }
2236
2237 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2238 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2239 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2240
2241 fd = socket(AF_INET, SOCK_DGRAM, 0);
2242 if (fd < 0) {
3ab87b66 2243 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2244 return -1;
2245 }
2246
2247 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2248 saved_errno = errno;
0ad19a3f 2249 close(fd);
2250 if (ret)
fad6ef95 2251 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2252
5da6aa8c 2253 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2254
0ad19a3f 2255 return ret;
2256}
2257
82d5ae15 2258static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2259{
82d5ae15
DL
2260 struct lxc_list *iterator;
2261 struct lxc_inetdev *inetdev;
3cfc0f3a 2262 int err;
0ad19a3f 2263
82d5ae15
DL
2264 lxc_list_for_each(iterator, ip) {
2265
2266 inetdev = iterator->elem;
2267
0093bb8c
DL
2268 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2269 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2270 if (err) {
2271 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2272 ifindex, strerror(-err));
82d5ae15
DL
2273 return -1;
2274 }
2275 }
2276
2277 return 0;
0ad19a3f 2278}
2279
82d5ae15 2280static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2281{
82d5ae15 2282 struct lxc_list *iterator;
7fa9074f 2283 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2284 int err;
0ad19a3f 2285
82d5ae15
DL
2286 lxc_list_for_each(iterator, ip) {
2287
2288 inet6dev = iterator->elem;
2289
b3df193c 2290 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2291 &inet6dev->mcast, &inet6dev->acast,
2292 inet6dev->prefix);
3cfc0f3a
MN
2293 if (err) {
2294 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2295 ifindex, strerror(-err));
82d5ae15 2296 return -1;
3cfc0f3a 2297 }
82d5ae15
DL
2298 }
2299
2300 return 0;
0ad19a3f 2301}
2302
e337179a 2303static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
0ad19a3f 2304{
0ad19a3f 2305 char ifname[IFNAMSIZ];
3cfc0f3a 2306 int err;
d1826cf1
CB
2307 const char *net_type_name;
2308 char *current_ifname = ifname;
0ad19a3f 2309
82d5ae15
DL
2310 /* empty network namespace */
2311 if (!netdev->ifindex) {
b0efbac4 2312 if (netdev->flags & IFF_UP) {
d472214b 2313 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2314 if (err) {
2315 ERROR("failed to set the loopback up : %s",
2316 strerror(-err));
82d5ae15
DL
2317 return -1;
2318 }
82d5ae15 2319 }
d1826cf1
CB
2320
2321 if (netdev->type == LXC_NET_EMPTY)
2322 return 0;
2323
2324 if (netdev->type == LXC_NET_NONE)
40790553 2325 return 0;
d1826cf1
CB
2326
2327 if (netdev->type != LXC_NET_VETH) {
2328 net_type_name = lxc_net_type_to_str(netdev->type);
2329 ERROR("%s networks are not supported for containers "
2330 "not setup up by privileged users",
2331 net_type_name);
2332 return -1;
2333 }
2334
40790553 2335 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2336 }
13954cce 2337
b466dc33 2338 /* get the new ifindex in case of physical netdev */
40790553 2339 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2340 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2341 ERROR("failed to get ifindex for %s",
2342 netdev->link);
2343 return -1;
2344 }
40790553 2345 }
b466dc33 2346
82d5ae15
DL
2347 /* retrieve the name of the interface */
2348 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2349 ERROR("no interface corresponding to index '%d'",
82d5ae15 2350 netdev->ifindex);
0ad19a3f 2351 return -1;
2352 }
13954cce 2353
018ef520 2354 /* default: let the system to choose one interface name */
9d083402 2355 if (!netdev->name)
fb6d9b2f
DL
2356 netdev->name = netdev->type == LXC_NET_PHYS ?
2357 netdev->link : "eth%d";
018ef520 2358
82d5ae15 2359 /* rename the interface name */
40790553
SH
2360 if (strcmp(ifname, netdev->name) != 0) {
2361 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2362 if (err) {
2363 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2364 strerror(-err));
2365 return -1;
2366 }
018ef520
DL
2367 }
2368
2369 /* Re-read the name of the interface because its name has changed
2370 * and would be automatically allocated by the system
2371 */
82d5ae15 2372 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2373 ERROR("no interface corresponding to index '%d'",
82d5ae15 2374 netdev->ifindex);
018ef520 2375 return -1;
0ad19a3f 2376 }
2377
82d5ae15
DL
2378 /* set a mac address */
2379 if (netdev->hwaddr) {
2380 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2381 ERROR("failed to setup hw address for '%s'",
82d5ae15 2382 current_ifname);
0ad19a3f 2383 return -1;
2384 }
2385 }
2386
82d5ae15
DL
2387 /* setup ipv4 addresses on the interface */
2388 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2389 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2390 ifname);
2391 return -1;
2392 }
2393
82d5ae15
DL
2394 /* setup ipv6 addresses on the interface */
2395 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2396 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2397 ifname);
2398 return -1;
2399 }
2400
82d5ae15 2401 /* set the network device up */
b0efbac4 2402 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2403 int err;
2404
d472214b 2405 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2406 if (err) {
2407 ERROR("failed to set '%s' up : %s", current_ifname,
2408 strerror(-err));
0ad19a3f 2409 return -1;
2410 }
2411
2412 /* the network is up, make the loopback up too */
d472214b 2413 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2414 if (err) {
2415 ERROR("failed to set the loopback up : %s",
2416 strerror(-err));
0ad19a3f 2417 return -1;
2418 }
2419 }
2420
f8fee0e2
MK
2421 /* We can only set up the default routes after bringing
2422 * up the interface, sine bringing up the interface adds
2423 * the link-local routes and we can't add a default
2424 * route if the gateway is not reachable. */
2425
2426 /* setup ipv4 gateway on the interface */
2427 if (netdev->ipv4_gateway) {
2428 if (!(netdev->flags & IFF_UP)) {
2429 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2430 return -1;
2431 }
2432
2433 if (lxc_list_empty(&netdev->ipv4)) {
2434 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2435 return -1;
2436 }
2437
2438 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2439 if (err) {
fc739df5
SG
2440 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2441 if (err) {
2442 ERROR("failed to add ipv4 dest for '%s': %s",
2443 ifname, strerror(-err));
2444 }
2445
2446 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2447 if (err) {
2448 ERROR("failed to setup ipv4 gateway for '%s': %s",
2449 ifname, strerror(-err));
2450 if (netdev->ipv4_gateway_auto) {
2451 char buf[INET_ADDRSTRLEN];
2452 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2453 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2454 }
2455 return -1;
19a26f82 2456 }
f8fee0e2
MK
2457 }
2458 }
2459
2460 /* setup ipv6 gateway on the interface */
2461 if (netdev->ipv6_gateway) {
2462 if (!(netdev->flags & IFF_UP)) {
2463 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2464 return -1;
2465 }
2466
2467 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2468 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2469 return -1;
2470 }
2471
2472 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2473 if (err) {
fc739df5
SG
2474 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2475 if (err) {
2476 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2477 ifname, strerror(-err));
19a26f82 2478 }
fc739df5
SG
2479
2480 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2481 if (err) {
2482 ERROR("failed to setup ipv6 gateway for '%s': %s",
2483 ifname, strerror(-err));
2484 if (netdev->ipv6_gateway_auto) {
2485 char buf[INET6_ADDRSTRLEN];
2486 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2487 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2488 }
2489 return -1;
2490 }
f8fee0e2
MK
2491 }
2492 }
2493
cd54d859
DL
2494 DEBUG("'%s' has been setup", current_ifname);
2495
0ad19a3f 2496 return 0;
2497}
2498
e337179a
CB
2499static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2500 struct lxc_list *network)
0ad19a3f 2501{
82d5ae15 2502 struct lxc_list *iterator;
82d5ae15 2503 struct lxc_netdev *netdev;
0ad19a3f 2504
c302b476
CB
2505 lxc_log_configured_netdevs(conf);
2506
5f4535a3 2507 lxc_list_for_each(iterator, network) {
5f4535a3 2508 netdev = iterator->elem;
82d5ae15 2509
f9373e40
CB
2510 /* REMOVE in LXC 3.0 */
2511 if (netdev->idx < 0) {
2512 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2513 "networks is DEPRECATED, please switch to using "
2514 "\"lxc.net.[i].* keys\"");
2515 }
2516
e337179a 2517 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
82d5ae15
DL
2518 ERROR("failed to setup netdev");
2519 return -1;
2520 }
2521 }
cd54d859 2522
5f4535a3
DL
2523 if (!lxc_list_empty(network))
2524 INFO("network has been setup");
cd54d859
DL
2525
2526 return 0;
0ad19a3f 2527}
2528
c6d09e15
WB
2529static int parse_resource(const char *res) {
2530 size_t i;
2531 int resid = -1;
2532
2533 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2534 if (strcmp(res, limit_opt[i].name) == 0)
2535 return limit_opt[i].value;
2536 }
2537
2538 /* try to see if it's numeric, so the user may specify
2539 * resources that the running kernel knows about but
2540 * we don't */
2541 if (lxc_safe_int(res, &resid) == 0)
2542 return resid;
2543 return -1;
2544}
2545
2546int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2547 struct lxc_list *it;
2548 struct lxc_limit *lim;
2549 int resid;
2550
2551 lxc_list_for_each(it, limits) {
2552 lim = it->elem;
2553
2554 resid = parse_resource(lim->resource);
2555 if (resid < 0) {
2556 ERROR("unknown resource %s", lim->resource);
2557 return -1;
2558 }
2559
2560 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2561 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2562 return -1;
2563 }
2564 }
2565 return 0;
2566}
2567
2af6bd1b 2568/* try to move physical nics to the init netns */
5610055a 2569void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2570{
64d2fcb5 2571 int i, oldfd;
4ec31c52 2572 char ifname[IFNAMSIZ];
2af6bd1b 2573
5610055a 2574 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2575 return;
2576
64d2fcb5 2577 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2578
64d2fcb5
CB
2579 oldfd = lxc_preserve_ns(getpid(), "net");
2580 if (oldfd < 0) {
2581 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2582 return;
2583 }
64d2fcb5 2584
2af6bd1b
SH
2585 if (setns(netnsfd, 0) != 0) {
2586 SYSERROR("Failed to enter container netns to reset nics");
2587 close(oldfd);
2588 return;
2589 }
2590 for (i=0; i<conf->num_savednics; i++) {
2591 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2592 /* retrieve the name of the interface */
2593 if (!if_indextoname(s->ifindex, ifname)) {
2594 WARN("no interface corresponding to index '%d'", s->ifindex);
2595 continue;
2596 }
5610055a 2597 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2598 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2599 free(s->orig_name);
2af6bd1b 2600 }
5610055a
WB
2601 conf->num_savednics = 0;
2602
2af6bd1b
SH
2603 if (setns(oldfd, 0) != 0)
2604 SYSERROR("Failed to re-enter monitor's netns");
2605 close(oldfd);
2606}
2607
ae9242c8
SH
2608static char *default_rootfs_mount = LXCROOTFSMOUNT;
2609
7b379ab3 2610struct lxc_conf *lxc_conf_init(void)
089cd8b8 2611{
7b379ab3 2612 struct lxc_conf *new;
26ddeedd 2613 int i;
7b379ab3 2614
13277ec4 2615 new = malloc(sizeof(*new));
7b379ab3 2616 if (!new) {
13277ec4 2617 ERROR("lxc_conf_init : %s", strerror(errno));
7b379ab3
MN
2618 return NULL;
2619 }
2620 memset(new, 0, sizeof(*new));
2621
4b73005c 2622 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2623 new->personality = -1;
124fa0a8 2624 new->autodev = 1;
596a818d
DE
2625 new->console.log_path = NULL;
2626 new->console.log_fd = -1;
28a4b0e5 2627 new->console.path = NULL;
63376d7d 2628 new->console.peer = -1;
b5159817
DE
2629 new->console.peerpty.busy = -1;
2630 new->console.peerpty.master = -1;
2631 new->console.peerpty.slave = -1;
63376d7d
DL
2632 new->console.master = -1;
2633 new->console.slave = -1;
2634 new->console.name[0] = '\0';
d2e30e99 2635 new->maincmd_fd = -1;
76a26f55 2636 new->nbd_idx = -1;
54c30e29 2637 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2638 if (!new->rootfs.mount) {
13277ec4 2639 ERROR("lxc_conf_init : %s", strerror(errno));
53f3f048
SH
2640 free(new);
2641 return NULL;
2642 }
858377e4 2643 new->logfd = -1;
7b379ab3
MN
2644 lxc_list_init(&new->cgroup);
2645 lxc_list_init(&new->network);
2646 lxc_list_init(&new->mount_list);
81810dd1 2647 lxc_list_init(&new->caps);
1fb86a7c 2648 lxc_list_init(&new->keepcaps);
f6d3e3e4 2649 lxc_list_init(&new->id_map);
f979ac15 2650 lxc_list_init(&new->includes);
4184c3e1 2651 lxc_list_init(&new->aliens);
7c661726 2652 lxc_list_init(&new->environment);
c6d09e15 2653 lxc_list_init(&new->limits);
26ddeedd
SH
2654 for (i=0; i<NUM_LXC_HOOKS; i++)
2655 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2656 lxc_list_init(&new->groups);
fe4de9a6
DE
2657 new->lsm_aa_profile = NULL;
2658 new->lsm_se_context = NULL;
5112cd70 2659 new->tmp_umount_proc = 0;
7b379ab3 2660
9f30a190
MM
2661 for (i = 0; i < LXC_NS_MAX; i++)
2662 new->inherit_ns_fd[i] = -1;
2663
72bb04e4
PT
2664 /* if running in a new user namespace, init and COMMAND
2665 * default to running as UID/GID 0 when using lxc-execute */
2666 new->init_uid = 0;
2667 new->init_gid = 0;
2668
7b379ab3 2669 return new;
089cd8b8
DL
2670}
2671
a589434e 2672static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2673{
b0ee5983
CB
2674 char *veth1, *veth2;
2675 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
b7b2fde4
CB
2676 int bridge_index, err;
2677 unsigned int mtu = 0;
13954cce 2678
8bee8851 2679 if (netdev->priv.veth_attr.pair) {
e892973e 2680 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2681 if (handler->conf->reboot)
2682 lxc_netdev_delete_by_name(veth1);
2683 } else {
9ba8130c
SH
2684 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2685 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2686 ERROR("veth1 name too long");
2687 return -1;
2688 }
a0265685 2689 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2690 if (!veth1) {
2691 ERROR("failed to allocate a temporary name");
2692 return -1;
2693 }
74a2b586
JK
2694 /* store away for deconf */
2695 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2696 }
82d5ae15 2697
0e391e57 2698 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2699 veth2 = lxc_mkifname(veth2buf);
ad40563e 2700 if (!veth2) {
82d5ae15 2701 ERROR("failed to allocate a temporary name");
ad40563e 2702 goto out_delete;
0ad19a3f 2703 }
2704
3cfc0f3a
MN
2705 err = lxc_veth_create(veth1, veth2);
2706 if (err) {
b0ee5983
CB
2707 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2708 veth2, strerror(-err));
ad40563e 2709 goto out_delete;
0ad19a3f 2710 }
13954cce 2711
49684c0b
CS
2712 /* changing the high byte of the mac address to 0xfe, the bridge interface
2713 * will always keep the host's mac address and not take the mac address
2714 * of a container */
2715 err = setup_private_host_hw_addr(veth1);
2716 if (err) {
b0ee5983
CB
2717 ERROR("failed to change mac address of host interface \"%s\": %s",
2718 veth1, strerror(-err));
49684c0b
CS
2719 goto out_delete;
2720 }
2721
af651aa9
SN
2722 netdev->ifindex = if_nametoindex(veth2);
2723 if (!netdev->ifindex) {
b0ee5983 2724 ERROR("failed to retrieve the index for \"%s\"", veth2);
af651aa9
SN
2725 goto out_delete;
2726 }
2727
82d5ae15 2728 if (netdev->mtu) {
b7b2fde4 2729 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
b0ee5983 2730 WARN("failed to parse mtu from");
b7b2fde4 2731 else
b0ee5983 2732 INFO("retrieved mtu %d", mtu);
e54864d3 2733 } else if (netdev->link) {
e9280f65 2734 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2735 if (bridge_index) {
2736 mtu = netdev_get_mtu(bridge_index);
b0ee5983 2737 INFO("retrieved mtu %d from %s", mtu, netdev->link);
729e8bf6
CB
2738 } else {
2739 mtu = netdev_get_mtu(netdev->ifindex);
b0ee5983 2740 INFO("retrieved mtu %d from %s", mtu, veth2);
729e8bf6 2741 }
e54864d3
NC
2742 }
2743
2744 if (mtu) {
2745 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2746 if (!err)
e54864d3 2747 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2748 if (err) {
b0ee5983
CB
2749 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2750 "and \"%s\": %s",
e54864d3 2751 mtu, veth1, veth2, strerror(-err));
eb14c10a 2752 goto out_delete;
75d09f83
DL
2753 }
2754 }
2755
3cfc0f3a 2756 if (netdev->link) {
c43cbc04 2757 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2758 if (err) {
b0ee5983
CB
2759 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2760 veth1, netdev->link, strerror(-err));
3cfc0f3a
MN
2761 goto out_delete;
2762 }
b0ee5983 2763 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
eb14c10a
DL
2764 }
2765
d472214b 2766 err = lxc_netdev_up(veth1);
6e35af2e 2767 if (err) {
b0ee5983 2768 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
6e35af2e 2769 goto out_delete;
0ad19a3f 2770 }
2771
e3b4c4c4 2772 if (netdev->upscript) {
751d9dcd
DL
2773 err = run_script(handler->name, "net", netdev->upscript, "up",
2774 "veth", veth1, (char*) NULL);
2775 if (err)
e3b4c4c4 2776 goto out_delete;
e3b4c4c4
ST
2777 }
2778
b0ee5983
CB
2779 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2780 netdev->ifindex);
82d5ae15 2781
6ab9ab6d 2782 return 0;
eb14c10a
DL
2783
2784out_delete:
b316d209
CB
2785 if (netdev->ifindex != 0)
2786 lxc_netdev_delete_by_name(veth1);
f10fad2f 2787 if (!netdev->priv.veth_attr.pair)
ad40563e 2788 free(veth1);
f10fad2f 2789 free(veth2);
6ab9ab6d 2790 return -1;
13954cce 2791}
d957ae2d 2792
74a2b586
JK
2793static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2794{
2795 char *veth1;
2796 int err;
2797
2798 if (netdev->priv.veth_attr.pair)
2799 veth1 = netdev->priv.veth_attr.pair;
2800 else
2801 veth1 = netdev->priv.veth_attr.veth1;
2802
2803 if (netdev->downscript) {
2804 err = run_script(handler->name, "net", netdev->downscript,
2805 "down", "veth", veth1, (char*) NULL);
2806 if (err)
2807 return -1;
2808 }
2809 return 0;
2810}
2811
a589434e 2812static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2813{
0e391e57 2814 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2815 int err;
d957ae2d
MT
2816
2817 if (!netdev->link) {
2818 ERROR("no link specified for macvlan netdev");
2819 return -1;
2820 }
13954cce 2821
9ba8130c
SH
2822 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2823 if (err >= sizeof(peerbuf))
2824 return -1;
82d5ae15 2825
a0265685 2826 peer = lxc_mkifname(peerbuf);
ad40563e 2827 if (!peer) {
82d5ae15
DL
2828 ERROR("failed to make a temporary name");
2829 return -1;
0ad19a3f 2830 }
2831
3cfc0f3a
MN
2832 err = lxc_macvlan_create(netdev->link, peer,
2833 netdev->priv.macvlan_attr.mode);
2834 if (err) {
2835 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2836 peer, netdev->link, strerror(-err));
ad40563e 2837 goto out;
0ad19a3f 2838 }
2839
82d5ae15
DL
2840 netdev->ifindex = if_nametoindex(peer);
2841 if (!netdev->ifindex) {
36eb9bde 2842 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2843 goto out;
22ebac19 2844 }
2845
e3b4c4c4 2846 if (netdev->upscript) {
751d9dcd
DL
2847 err = run_script(handler->name, "net", netdev->upscript, "up",
2848 "macvlan", netdev->link, (char*) NULL);
2849 if (err)
ad40563e 2850 goto out;
e3b4c4c4
ST
2851 }
2852
a589434e 2853 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2854 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2855
d957ae2d 2856 return 0;
ad40563e
ÇO
2857out:
2858 lxc_netdev_delete_by_name(peer);
2859 free(peer);
2860 return -1;
0ad19a3f 2861}
2862
74a2b586
JK
2863static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2864{
2865 int err;
2866
2867 if (netdev->downscript) {
2868 err = run_script(handler->name, "net", netdev->downscript,
2869 "down", "macvlan", netdev->link,
2870 (char*) NULL);
2871 if (err)
2872 return -1;
2873 }
2874 return 0;
2875}
2876
a589434e
JN
2877/* XXX: merge with instantiate_macvlan */
2878static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2879{
2880 char peer[IFNAMSIZ];
3cfc0f3a 2881 int err;
82f58d03 2882 static uint16_t vlan_cntr = 0;
b7b2fde4 2883 unsigned int mtu = 0;
26c39028
JHS
2884
2885 if (!netdev->link) {
2886 ERROR("no link specified for vlan netdev");
2887 return -1;
2888 }
2889
82f58d03 2890 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2891 if (err >= sizeof(peer)) {
2892 ERROR("peer name too long");
2893 return -1;
2894 }
26c39028 2895
3cfc0f3a
MN
2896 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2897 if (err) {
2898 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2899 peer, netdev->link, strerror(-err));
26c39028
JHS
2900 return -1;
2901 }
2902
2903 netdev->ifindex = if_nametoindex(peer);
2904 if (!netdev->ifindex) {
2905 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2906 lxc_netdev_delete_by_name(peer);
26c39028
JHS
2907 return -1;
2908 }
2909
a589434e 2910 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 2911 netdev->ifindex);
b4fb7de1 2912 if (netdev->mtu) {
b7b2fde4
CB
2913 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2914 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2915 netdev->ifindex, netdev->name);
2916 return -1;
2917 }
2918 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
2919 if (err) {
2920 ERROR("failed to set mtu '%s' for %s : %s",
2921 netdev->mtu, peer, strerror(-err));
2922 lxc_netdev_delete_by_name(peer);
2923 return -1;
2924 }
2925 }
e892973e 2926
26c39028
JHS
2927 return 0;
2928}
2929
74a2b586
JK
/* Shutdown hook for vlan devices: does nothing and always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2934
a589434e 2935static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2936{
6168e99f
DL
2937 if (!netdev->link) {
2938 ERROR("no link specified for the physical interface");
2939 return -1;
2940 }
2941
9d083402 2942 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 2943 if (!netdev->ifindex) {
9d083402 2944 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 2945 return -1;
2946 }
2947
e3b4c4c4
ST
2948 if (netdev->upscript) {
2949 int err;
751d9dcd
DL
2950 err = run_script(handler->name, "net", netdev->upscript,
2951 "up", "phys", netdev->link, (char*) NULL);
2952 if (err)
e3b4c4c4 2953 return -1;
e3b4c4c4
ST
2954 }
2955
82d5ae15 2956 return 0;
0ad19a3f 2957}
2958
74a2b586
JK
2959static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2960{
2961 int err;
2962
2963 if (netdev->downscript) {
2964 err = run_script(handler->name, "net", netdev->downscript,
2965 "down", "phys", netdev->link, (char*) NULL);
2966 if (err)
2967 return -1;
2968 }
2969 return 0;
2970}
2971
a589434e 2972static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
2973{
2974 netdev->ifindex = 0;
2975 return 0;
2976}
2977
a589434e 2978static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2979{
82d5ae15 2980 netdev->ifindex = 0;
e3b4c4c4
ST
2981 if (netdev->upscript) {
2982 int err;
751d9dcd
DL
2983 err = run_script(handler->name, "net", netdev->upscript,
2984 "up", "empty", (char*) NULL);
2985 if (err)
e3b4c4c4 2986 return -1;
e3b4c4c4 2987 }
82d5ae15 2988 return 0;
0ad19a3f 2989}
2990
74a2b586
JK
2991static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2992{
2993 int err;
2994
2995 if (netdev->downscript) {
2996 err = run_script(handler->name, "net", netdev->downscript,
2997 "down", "empty", (char*) NULL);
2998 if (err)
2999 return -1;
3000 }
3001 return 0;
3002}
3003
26b797f3
SH
/* Shutdown hook for the "none" network type: does nothing and always
 * returns 0.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3008
3009int lxc_requests_empty_network(struct lxc_handler *handler)
3010{
3011 struct lxc_list *network = &handler->conf->network;
3012 struct lxc_list *iterator;
3013 struct lxc_netdev *netdev;
3014 bool found_none = false, found_nic = false;
3015
3016 if (lxc_list_empty(network))
3017 return 0;
3018
3019 lxc_list_for_each(iterator, network) {
3020
3021 netdev = iterator->elem;
3022
3023 if (netdev->type == LXC_NET_NONE)
3024 found_none = true;
3025 else
3026 found_nic = true;
3027 }
3028 if (found_none && !found_nic)
3029 return 1;
3030 return 0;
3031}
3032
e337179a 3033int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
0ad19a3f 3034{
e337179a 3035 bool am_root;
82d5ae15 3036 struct lxc_netdev *netdev;
e337179a
CB
3037 struct lxc_list *iterator;
3038 struct lxc_list *network = &handler->conf->network;
cbef6c52 3039
e337179a
CB
3040 /* We need to be root. */
3041 am_root = (getuid() == 0);
cbef6c52
SH
3042 if (!am_root)
3043 return 0;
0ad19a3f 3044
5f4535a3 3045 lxc_list_for_each(iterator, network) {
5f4535a3 3046 netdev = iterator->elem;
13954cce 3047
e337179a
CB
3048 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3049 ERROR("invalid network configuration type '%d'",
3050 netdev->type);
56637458
CB
3051 return -1;
3052 }
3053
e337179a
CB
3054 if (netdev->type != LXC_NET_MACVLAN &&
3055 netdev->priv.macvlan_attr.mode) {
3056 ERROR("Invalid macvlan.mode for a non-macvlan netdev");
56637458
CB
3057 return -1;
3058 }
3059
e337179a
CB
3060 if (netdev->type != LXC_NET_VETH &&
3061 netdev->priv.veth_attr.pair) {
3062 ERROR("Invalid veth pair for a non-veth netdev");
56637458
CB
3063 return -1;
3064 }
3065
e337179a
CB
3066 if (netdev->type != LXC_NET_VLAN &&
3067 netdev->priv.vlan_attr.vid > 0) {
3068 ERROR("Invalid vlan.id for a non-macvlan netdev");
82d5ae15
DL
3069 return -1;
3070 }
0ad19a3f 3071
e3b4c4c4 3072 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3073 ERROR("failed to create netdev");
3074 return -1;
3075 }
e3b4c4c4 3076
0ad19a3f 3077 }
3078
3079 return 0;
3080}
3081
358daf49 3082bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3083{
e97946ae 3084 int ret;
74a2b586 3085 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3086 struct lxc_list *iterator;
3087 struct lxc_netdev *netdev;
358daf49 3088 bool deleted_all = true;
7fef7a06
DL
3089
3090 lxc_list_for_each(iterator, network) {
3091 netdev = iterator->elem;
d472214b 3092
74a2b586 3093 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3094 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3095 WARN("Failed to rename interface with index %d "
3096 "to its initial name \"%s\".",
3097 netdev->ifindex, netdev->link);
d472214b 3098 continue;
d8f8e352 3099 }
d472214b 3100
74a2b586 3101 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3102 WARN("Failed to destroy netdev");
74a2b586
JK
3103 }
3104
d8f8e352
DL
3105 /* Recent kernel remove the virtual interfaces when the network
3106 * namespace is destroyed but in case we did not moved the
3107 * interface to the network namespace, we have to destroy it
3108 */
e97946ae
CB
3109 if (netdev->ifindex != 0) {
3110 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3111 if (-ret == ENODEV) {
3112 INFO("Interface \"%s\" with index %d already "
3113 "deleted or existing in different network "
3114 "namespace.",
3115 netdev->name ? netdev->name : "(null)",
3116 netdev->ifindex);
3117 } else if (ret < 0) {
3118 deleted_all = false;
3119 WARN("Failed to remove interface \"%s\" with "
3120 "index %d: %s.",
3121 netdev->name ? netdev->name : "(null)",
3122 netdev->ifindex, strerror(-ret));
3123 } else {
3124 INFO("Removed interface \"%s\" with index %d.",
3125 netdev->name ? netdev->name : "(null)",
3126 netdev->ifindex);
3127 }
e97946ae
CB
3128 }
3129
3130 /* Explicitly delete host veth device to prevent lingering
3131 * devices. We had issues in LXD around this.
3132 */
b316d209 3133 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3134 char *hostveth;
3135 if (netdev->priv.veth_attr.pair) {
e97946ae 3136 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3137 ret = lxc_netdev_delete_by_name(hostveth);
3138 if (ret < 0) {
3139 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3140 } else {
3141 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3142 }
3143 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3144 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3145 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3146 if (ret < 0) {
3147 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3148 } else {
3149 INFO("Removed interface \"%s\" from host.", hostveth);
3150 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3151 }
e97946ae
CB
3152 }
3153 }
7fef7a06 3154 }
358daf49
CB
3155
3156 return deleted_all;
7fef7a06
DL
3157}
3158
45e854dc
SG
/* Location of the lxc-user-nic helper binary executed by unpriv_assign_nic(). */
#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"

/*
 * lxc-user-nic returns "interface_name:interface_name\n".
 * The expansion is parenthesized so the macro stays a single operand when it
 * is used inside a larger expression.
 */
#define MAX_BUFFER_SIZE (IFNAMSIZ * 2 + 2)
c43cbc04
SH
3163static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3164 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3165{
3166 pid_t child;
a7242d9a
ÇO
3167 int bytes, pipefd[2];
3168 char *token, *saveptr = NULL;
fe1f672f 3169 char buffer[MAX_BUFFER_SIZE];
091045f8 3170 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3171
3172 if (netdev->type != LXC_NET_VETH) {
3173 ERROR("nic type %d not support for unprivileged use",
091045f8 3174 netdev->type);
cbef6c52
SH
3175 return -1;
3176 }
3177
091045f8 3178 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3179 SYSERROR("pipe failed");
3180 return -1;
3181 }
3182
091045f8
CB
3183 child = fork();
3184 if (child < 0) {
cbef6c52 3185 SYSERROR("fork");
a7242d9a
ÇO
3186 close(pipefd[0]);
3187 close(pipefd[1]);
3188 return -1;
3189 }
3190
3191 if (child == 0) { // child
091045f8
CB
3192 /* Call lxc-user-nic pid type bridge. */
3193 int ret;
3194 char pidstr[LXC_NUMSTRLEN64];
3195
3196 close(pipefd[0]); /* Close the read-end of the pipe. */
3197
3198 /* Redirect stdout to write-end of the pipe. */
3199 ret = dup2(pipefd[1], STDOUT_FILENO);
3200 close(pipefd[1]); /* Close the write-end of the pipe. */
3201 if (ret < 0) {
3202 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3203 exit(EXIT_FAILURE);
3204 }
a7242d9a 3205
091045f8 3206 if (netdev->link)
cff7b5eb 3207 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3208 else
cff7b5eb 3209 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3210
3211 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3212 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3213 exit(EXIT_FAILURE);
3214 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3215
3216 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3217 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3218 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3219 pidstr, "veth", netdev_link, netdev->name, NULL);
3220
3221 SYSERROR("Failed to exec lxc-user-nic.");
3222 exit(EXIT_FAILURE);
a7242d9a
ÇO
3223 }
3224
3225 /* close the write-end of the pipe */
3226 close(pipefd[1]);
3227
fe1f672f 3228 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3229 if (bytes < 0)
3230 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3231 buffer[bytes - 1] = '\0';
3232
3233 if (wait_for_pid(child) != 0) {
3234 close(pipefd[0]);
cbef6c52
SH
3235 return -1;
3236 }
3237
a7242d9a
ÇO
3238 /* close the read-end of the pipe */
3239 close(pipefd[0]);
cbef6c52 3240
a7242d9a
ÇO
3241 /* fill netdev->name field */
3242 token = strtok_r(buffer, ":", &saveptr);
3243 if (!token)
3244 return -1;
091045f8
CB
3245
3246 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3247 if (!netdev->name) {
091045f8 3248 SYSERROR("Failed to allocate memory.");
658979c5
SH
3249 return -1;
3250 }
091045f8 3251 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3252 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3253
3254 /* fill netdev->veth_attr.pair field */
3255 token = strtok_r(NULL, ":", &saveptr);
3256 if (!token)
3257 return -1;
091045f8 3258
a7242d9a 3259 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3260 if (!netdev->priv.veth_attr.pair) {
091045f8 3261 ERROR("Failed to allocate memory.");
658979c5
SH
3262 return -1;
3263 }
45e854dc 3264
a7242d9a 3265 return 0;
cbef6c52
SH
3266}
3267
c43cbc04
SH
3268int lxc_assign_network(const char *lxcpath, char *lxcname,
3269 struct lxc_list *network, pid_t pid)
0ad19a3f 3270{
82d5ae15 3271 struct lxc_list *iterator;
82d5ae15 3272 struct lxc_netdev *netdev;
f2e206ff 3273 char ifname[IFNAMSIZ];
cbef6c52 3274 int am_root = (getuid() == 0);
3cfc0f3a 3275 int err;
0ad19a3f 3276
5f4535a3 3277 lxc_list_for_each(iterator, network) {
82d5ae15 3278
5f4535a3 3279 netdev = iterator->elem;
82d5ae15 3280
fbb16259 3281 if (netdev->type == LXC_NET_VETH && !am_root) {
72ccbbe1
SC
3282 if (netdev->mtu)
3283 INFO("mtu ignored due to insufficient privilege");
c43cbc04 3284 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3285 return -1;
e337179a
CB
3286 /* lxc-user-nic has moved the nic to the new ns.
3287 * unpriv_assign_nic() fills in netdev->name.
3288 * netdev->ifindex will be filed in at
3289 * lxc_setup_netdev_in_child_namespaces.
3290 */
cbef6c52
SH
3291 continue;
3292 }
236087a6 3293
fbb16259
SH
3294 /* empty network namespace, nothing to move */
3295 if (!netdev->ifindex)
3296 continue;
3297
f2e206ff 3298 /* retrieve the name of the interface */
3299 if (!if_indextoname(netdev->ifindex, ifname)) {
3300 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3301 return -1;
3302 }
3303
3304 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3305 if (err) {
3306 ERROR("failed to move '%s' to the container : %s",
3307 netdev->link, strerror(-err));
82d5ae15
DL
3308 return -1;
3309 }
3310
198cbbaa 3311 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3312 }
3313
3314 return 0;
3315}
3316
251d0d2a
DE
3317static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3318 size_t buf_size)
f6d3e3e4 3319{
29053180
CB
3320 char path[MAXPATHLEN];
3321 int fd, ret;
f6d3e3e4 3322
29053180
CB
3323 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3324 idtype == ID_TYPE_UID ? 'u' : 'g');
3325 if (ret < 0 || ret >= MAXPATHLEN) {
3326 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
3327 return -E2BIG;
3328 }
29053180
CB
3329
3330 fd = open(path, O_WRONLY);
3331 if (fd < 0) {
3332 SYSERROR("failed to open \"%s\"", path);
3333 return -1;
f6d3e3e4 3334 }
29053180
CB
3335
3336 errno = 0;
3337 ret = lxc_write_nointr(fd, buf, buf_size);
3338 if (ret != buf_size) {
3339 SYSERROR("failed to write %cid mapping to \"%s\"",
3340 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3341 close(fd);
3342 return -1;
3343 }
3344 close(fd);
3345
3346 return 0;
f6d3e3e4
SH
3347}
3348
6e50e704
CB
3349/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3350 *
3351 * @return 1 if functional binary was found
3352 * @return 0 if binary exists but is lacking privilege
3353 * @return -ENOENT if binary does not exist
3354 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3355 *
3356 */
df6a2945
CB
3357static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3358{
3359 char *path;
3360 int ret;
3361 struct stat st;
3362 int fret = 0;
3363
6e50e704
CB
3364 if (cap != CAP_SETUID && cap != CAP_SETGID)
3365 return -EINVAL;
3366
df6a2945
CB
3367 path = on_path(binary, NULL);
3368 if (!path)
3369 return -ENOENT;
3370
3371 ret = stat(path, &st);
3372 if (ret < 0) {
3373 fret = -errno;
3374 goto cleanup;
3375 }
3376
3377 /* Check if the binary is setuid. */
3378 if (st.st_mode & S_ISUID) {
3379 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3380 fret = 1;
3381 goto cleanup;
3382 }
3383
69924fff 3384 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3385 /* Check if it has the CAP_SETUID capability. */
3386 if ((cap & CAP_SETUID) &&
3387 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3388 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3389 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3390 "and CAP_PERMITTED sets.", path);
3391 fret = 1;
3392 goto cleanup;
3393 }
3394
3395 /* Check if it has the CAP_SETGID capability. */
3396 if ((cap & CAP_SETGID) &&
3397 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3398 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3399 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3400 "and CAP_PERMITTED sets.", path);
3401 fret = 1;
3402 goto cleanup;
3403 }
d6018f88 3404 #else
69924fff
CB
3405 /* If we cannot check for file capabilities we need to give the benefit
3406 * of the doubt. Otherwise we might fail even though all the necessary
3407 * file capabilities are set.
3408 */
d6018f88
CB
3409 DEBUG("Cannot check for file capabilites as full capability support is "
3410 "missing. Manual intervention needed.");
3411 fret = 1;
df6a2945
CB
3412 #endif
3413
3414cleanup:
3415 free(path);
3416 return fret;
3417}
3418
986ef930
CB
/*
 * Run the command line passed in @args through "/bin/sh -c".
 * execl() only returns on failure, in which case -1 is reported.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);
	return -1;
}
3424
f6d3e3e4
SH
3425int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3426{
f6d3e3e4 3427 struct id_map *map;
4bc3b759 3428 struct lxc_list *iterator;
251d0d2a 3429 enum idtype type;
986ef930 3430 char u_or_g;
4bc3b759 3431 char *pos;
99d43365 3432 int fill, left;
986ef930
CB
3433 char cmd_output[MAXPATHLEN];
3434 /* strlen("new@idmap") = 9
3435 * +
3436 * strlen(" ") = 1
3437 * +
3438 * LXC_NUMSTRLEN64
3439 * +
3440 * strlen(" ") = 1
3441 *
3442 * We add some additional space to make sure that we really have
3443 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3444 */
3445 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3446 int ret = 0, uidmap = 0, gidmap = 0;
3447 bool use_shadow = false, had_entry = false;
df6a2945
CB
3448
3449 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3450 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3451 * will protected it by preventing another user from being handed the
3452 * range by shadow.
3453 */
df6a2945 3454 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
3455 if (uidmap == -ENOENT)
3456 WARN("newuidmap binary is missing");
3457 else if (!uidmap)
3458 WARN("newuidmap is lacking necessary privileges");
3459
df6a2945 3460 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
3461 if (gidmap == -ENOENT)
3462 WARN("newgidmap binary is missing");
3463 else if (!gidmap)
3464 WARN("newgidmap is lacking necessary privileges");
3465
df6a2945
CB
3466 if (uidmap > 0 && gidmap > 0) {
3467 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3468 use_shadow = true;
df6a2945 3469 } else {
99d43365
CB
3470 /* In case unprivileged users run application containers via
3471 * execute() or a start*() there are valid cases where they may
3472 * only want to map their own {g,u}id. Let's not block them from
3473 * doing so by requiring geteuid() == 0.
3474 */
3475 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3476 "write directly with euid %d.", geteuid());
0e6e3a41 3477 }
251d0d2a 3478
986ef930
CB
3479 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3480 type++, u_or_g = 'g') {
3481 pos = mapbuf;
3482
0e6e3a41 3483 if (use_shadow)
986ef930 3484 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 3485
cf3ef16d 3486 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3487 /* The kernel only takes <= 4k for writes to
3488 * /proc/<nr>/[ug]id_map
3489 */
251d0d2a 3490 map = iterator->elem;
cf3ef16d
SH
3491 if (map->idtype != type)
3492 continue;
3493
4bc3b759
CB
3494 had_entry = true;
3495
986ef930 3496 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 3497 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3498 use_shadow ? " " : "", map->nsid,
3499 map->hostid, map->range,
0e6e3a41 3500 use_shadow ? "" : "\n");
cf3ef16d 3501 if (fill <= 0 || fill >= left)
4bc3b759
CB
3502 SYSERROR("Too many {g,u}id mappings defined.");
3503
cf3ef16d 3504 pos += fill;
251d0d2a 3505 }
cf3ef16d 3506 if (!had_entry)
4f7521b4 3507 continue;
cf3ef16d 3508
986ef930
CB
3509 /* Try to catch the ouput of new{g,u}idmap to make debugging
3510 * easier.
3511 */
3512 if (use_shadow) {
3513 ret = run_command(cmd_output, sizeof(cmd_output),
3514 lxc_map_ids_exec_wrapper,
3515 (void *)mapbuf);
3516 if (ret < 0) {
3517 ERROR("new%cidmap failed to write mapping: %s",
3518 u_or_g, cmd_output);
3519 return -1;
3520 }
d1838f34 3521 } else {
986ef930
CB
3522 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3523 if (ret < 0)
3524 return -1;
d1838f34 3525 }
986ef930
CB
3526
3527 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3528 }
251d0d2a 3529
986ef930 3530 return 0;
f6d3e3e4
SH
3531}
3532
cf3ef16d 3533/*
7b50c609
TS
3534 * return the host uid/gid to which the container root is mapped in
3535 * *val.
0b3a6504 3536 * Return true if id was found, false otherwise.
cf3ef16d 3537 */
2a9a80cb 3538bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3539 unsigned long *val)
cf3ef16d
SH
3540{
3541 struct lxc_list *it;
3542 struct id_map *map;
3543
3544 lxc_list_for_each(it, &conf->id_map) {
3545 map = it->elem;
7b50c609 3546 if (map->idtype != idtype)
cf3ef16d
SH
3547 continue;
3548 if (map->nsid != 0)
3549 continue;
2a9a80cb
SH
3550 *val = map->hostid;
3551 return true;
cf3ef16d 3552 }
2a9a80cb 3553 return false;
cf3ef16d
SH
3554}
3555
2133f58c 3556int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3557{
3558 struct lxc_list *it;
3559 struct id_map *map;
3560 lxc_list_for_each(it, &conf->id_map) {
3561 map = it->elem;
2133f58c 3562 if (map->idtype != idtype)
cf3ef16d
SH
3563 continue;
3564 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3565 return (id - map->hostid) + map->nsid;
cf3ef16d 3566 }
57d116ab 3567 return -1;
cf3ef16d
SH
3568}
3569
339efad9 3570int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3571{
3572 struct lxc_list *it;
3573 struct id_map *map;
2133f58c 3574 unsigned int freeid = 0;
cf3ef16d
SH
3575again:
3576 lxc_list_for_each(it, &conf->id_map) {
3577 map = it->elem;
2133f58c 3578 if (map->idtype != idtype)
cf3ef16d
SH
3579 continue;
3580 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3581 freeid = map->nsid + map->range;
3582 goto again;
3583 }
3584 }
3585 return freeid;
3586}
3587
19a26f82
MK
3588int lxc_find_gateway_addresses(struct lxc_handler *handler)
3589{
3590 struct lxc_list *network = &handler->conf->network;
3591 struct lxc_list *iterator;
3592 struct lxc_netdev *netdev;
3593 int link_index;
3594
3595 lxc_list_for_each(iterator, network) {
3596 netdev = iterator->elem;
3597
3598 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3599 continue;
3600
3601 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3602 ERROR("gateway = auto only supported for "
3603 "veth and macvlan");
3604 return -1;
3605 }
3606
3607 if (!netdev->link) {
3608 ERROR("gateway = auto needs a link interface");
3609 return -1;
3610 }
3611
3612 link_index = if_nametoindex(netdev->link);
3613 if (!link_index)
3614 return -EINVAL;
3615
3616 if (netdev->ipv4_gateway_auto) {
3617 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3618 ERROR("failed to automatically find ipv4 gateway "
3619 "address from link interface '%s'", netdev->link);
3620 return -1;
3621 }
3622 }
3623
3624 if (netdev->ipv6_gateway_auto) {
3625 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3626 ERROR("failed to automatically find ipv6 gateway "
3627 "address from link interface '%s'", netdev->link);
3628 return -1;
3629 }
3630 }
3631 }
3632
3633 return 0;
3634}
3635
5e4a62bf 3636int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3637{
5e4a62bf 3638 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3639 int i, ret;
b0a33c1e 3640
5e4a62bf
DL
3641 /* no tty in the configuration */
3642 if (!conf->tty)
b0a33c1e 3643 return 0;
3644
9e1045e3 3645 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 3646 if (!tty_info->pty_info) {
9e1045e3
CB
3647 SYSERROR("failed to allocate struct *pty_info");
3648 return -ENOMEM;
b0a33c1e 3649 }
3650
985d15b1 3651 for (i = 0; i < conf->tty; i++) {
b0a33c1e 3652 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3653
025ed0f3
SH
3654 process_lock();
3655 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 3656 pty_info->name, NULL, NULL);
025ed0f3
SH
3657 process_unlock();
3658 if (ret) {
9e1045e3 3659 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
3660 tty_info->nbtty = i;
3661 lxc_delete_tty(tty_info);
9e1045e3 3662 return -ENOTTY;
b0a33c1e 3663 }
3664
9e1045e3 3665 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
3666 pty_info->name, pty_info->master, pty_info->slave);
3667
3ec1648d 3668 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
3669 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3670 if (ret < 0)
3671 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3672 "pty device \"%s\": %s",
3673 pty_info->master, pty_info->name, strerror(errno));
3674
3675 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3676 if (ret < 0)
3677 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3678 "pty device \"%s\": %s",
3679 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 3680
b0a33c1e 3681 pty_info->busy = 0;
3682 }
3683
985d15b1 3684 tty_info->nbtty = conf->tty;
1ac470c0 3685
9e1045e3 3686 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 3687 return 0;
b0a33c1e 3688}
3689
3690void lxc_delete_tty(struct lxc_tty_info *tty_info)
3691{
3692 int i;
3693
3694 for (i = 0; i < tty_info->nbtty; i++) {
3695 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3696
3697 close(pty_info->master);
3698 close(pty_info->slave);
3699 }
3700
3701 free(tty_info->pty_info);
e00c0242 3702 tty_info->pty_info = NULL;
b0a33c1e 3703 tty_info->nbtty = 0;
3704}
3705
f4f52cb5
CB
3706
/*
 * Execute lxc-usernsexec with the argument vector passed in @args.
 * execvp() only returns on failure, in which case -1 is reported.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);
	return -1;
}
3712
f6d3e3e4 3713/*
7b50c609
TS
3714 * chown_mapped_root: for an unprivileged user with uid/gid X to
3715 * chown a dir to subuid/subgid Y, he needs to run chown as root
3716 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3717 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3718 * root is privileged with respect to hostuid/hostgid X, allowing
3719 * him to do the chown.
f6d3e3e4 3720 */
c4d10a05 3721int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3722{
f4f52cb5 3723 uid_t rootuid, rootgid;
2a9a80cb 3724 unsigned long val;
a7ef8753 3725 char *chownpath = path;
f4f52cb5
CB
3726 int hostuid, hostgid, ret;
3727 struct stat sb;
3728 char map1[100], map2[100], map3[100], map4[100], map5[100];
3729 char ugid[100];
3730 char *args1[] = {"lxc-usernsexec",
3731 "-m", map1,
3732 "-m", map2,
3733 "-m", map3,
3734 "-m", map5,
3735 "--", "chown", ugid, path,
3736 NULL};
3737 char *args2[] = {"lxc-usernsexec",
3738 "-m", map1,
3739 "-m", map2,
3740 "-m", map3,
3741 "-m", map4,
3742 "-m", map5,
3743 "--", "chown", ugid, path,
3744 NULL};
3745 char cmd_output[MAXPATHLEN];
3746
3747 hostuid = geteuid();
3748 hostgid = getegid();
f6d3e3e4 3749
2a9a80cb 3750 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3751 ERROR("No uid mapping for container root");
c4d10a05 3752 return -1;
f6d3e3e4 3753 }
f4f52cb5 3754 rootuid = (uid_t)val;
7b50c609 3755 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3756 ERROR("No gid mapping for container root");
7b50c609
TS
3757 return -1;
3758 }
f4f52cb5 3759 rootgid = (gid_t)val;
2a9a80cb 3760
a7ef8753 3761 /*
f4f52cb5 3762 * In case of overlay, we want only the writeable layer to be chowned
a7ef8753 3763 */
1f92162d 3764 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3765 chownpath = strchr(path, ':');
3766 if (!chownpath) {
3767 ERROR("Bad overlay path: %s", path);
3768 return -1;
3769 }
f4f52cb5 3770 chownpath = strchr(chownpath + 1, ':');
a7ef8753
SH
3771 if (!chownpath) {
3772 ERROR("Bad overlay path: %s", path);
3773 return -1;
3774 }
3775 chownpath++;
3776 }
3777 path = chownpath;
f4f52cb5 3778 if (hostuid == 0) {
7b50c609 3779 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3780 ERROR("Error chowning %s", path);
3781 return -1;
3782 }
3783 return 0;
3784 }
f3d7e4ca 3785
f4f52cb5 3786 if (rootuid == hostuid) {
f3d7e4ca 3787 // nothing to do
b103ceac 3788 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
3789 return 0;
3790 }
3791
bbdbf8f0 3792 /* save the current gid of "path" */
f4f52cb5
CB
3793 if (stat(path, &sb) < 0) {
3794 ERROR("Error stat %s", path);
f6d3e3e4
SH
3795 return -1;
3796 }
7b50c609 3797
bbdbf8f0
CB
3798 /* Update the path argument in case this was overlayfs. */
3799 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3800 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3801
f4f52cb5
CB
3802 /*
3803 * A file has to be group-owned by a gid mapped into the
3804 * container, or the container won't be privileged over it.
3805 */
3806 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3807 if (sb.st_uid == hostuid &&
3808 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3809 chown(path, -1, hostgid) < 0) {
3810 ERROR("Failed chgrping %s", path);
3811 return -1;
3812 }
f6d3e3e4 3813
f4f52cb5
CB
3814 // "u:0:rootuid:1"
3815 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3816 if (ret < 0 || ret >= 100) {
3817 ERROR("Error uid printing map string");
3818 return -1;
3819 }
7b50c609 3820
f4f52cb5
CB
3821 // "u:hostuid:hostuid:1"
3822 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3823 if (ret < 0 || ret >= 100) {
3824 ERROR("Error uid printing map string");
3825 return -1;
3826 }
c4d10a05 3827
f4f52cb5
CB
3828 // "g:0:rootgid:1"
3829 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3830 if (ret < 0 || ret >= 100) {
3831 ERROR("Error gid printing map string");
3832 return -1;
3833 }
98e5ba51 3834
f4f52cb5
CB
3835 // "g:pathgid:rootgid+pathgid:1"
3836 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3837 rootgid + (gid_t)sb.st_gid);
3838 if (ret < 0 || ret >= 100) {
3839 ERROR("Error gid printing map string");
3840 return -1;
3841 }
c4d10a05 3842
f4f52cb5
CB
3843 // "g:hostgid:hostgid:1"
3844 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3845 if (ret < 0 || ret >= 100) {
3846 ERROR("Error gid printing map string");
3847 return -1;
3848 }
7b50c609 3849
f4f52cb5
CB
3850 // "0:pathgid" (chown)
3851 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3852 if (ret < 0 || ret >= 100) {
3853 ERROR("Error owner printing format string for chown");
3854 return -1;
3855 }
7b50c609 3856
f4f52cb5
CB
3857 if (hostgid == sb.st_gid)
3858 ret = run_command(cmd_output, sizeof(cmd_output),
3859 chown_mapped_root_exec_wrapper,
3860 (void *)args1);
3861 else
3862 ret = run_command(cmd_output, sizeof(cmd_output),
3863 chown_mapped_root_exec_wrapper,
3864 (void *)args2);
3865 if (ret < 0)
3866 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3867
f4f52cb5 3868 return ret;
f6d3e3e4
SH
3869}
3870
54117de5 3871int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3872{
c4d10a05 3873 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3874 return 0;
c4d10a05 3875
54117de5
CB
3876 if (!strcmp(c->console.name, ""))
3877 return 0;
3878
3879 if (chown_mapped_root(c->console.name, c) < 0) {
3880 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
3881 return -1;
3882 }
3883
54117de5
CB
3884 TRACE("chowned console \"%s\"", c->console.name);
3885
f6d3e3e4
SH
3886 return 0;
3887}
3888
943144d9
CB
3889/* NOTE: Must not be called from inside the container namespace! */
3890int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3891{
3892 int mounted;
3893
943144d9 3894 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3895 if (mounted == -1) {
943144d9 3896 SYSERROR("failed to mount /proc in the container");
01958b1f 3897 /* continue only if there is no rootfs */
943144d9 3898 if (conf->rootfs.path)
01958b1f 3899 return -1;
5112cd70 3900 } else if (mounted == 1) {
943144d9 3901 conf->tmp_umount_proc = 1;
5112cd70 3902 }
943144d9 3903
5112cd70
SH
3904 return 0;
3905}
3906
3907void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3908{
3909 if (lxc_conf->tmp_umount_proc == 1) {
3910 umount("/proc");
3911 lxc_conf->tmp_umount_proc = 0;
3912 }
3913}
3914
/*
 * Walk /proc/self/mountinfo and call mount() with MS_SLAVE on every entry
 * whose options field contains "shared".
 *
 * Failures are logged but never abort container startup.
 */
void remount_all_slave(void)
{
	FILE *mountinfo;
	char *entry = NULL;
	size_t cap = 0;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&entry, &cap, mountinfo) != -1) {
		char *mntpoint, *optional;

		/* Field 4 of the line is used as the mount target. */
		mntpoint = get_field(entry, 4);
		if (!mntpoint)
			continue;

		/* Two fields further on: where "shared" is looked for. */
		optional = get_field(mntpoint, 2);
		if (!optional)
			continue;

		null_endofword(optional);
		if (!strstr(optional, "shared"))
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(entry);
}
3948
2322903b
SH
3949void lxc_execute_bind_init(struct lxc_conf *conf)
3950{
3951 int ret;
9d9c111c
SH
3952 char path[PATH_MAX], destpath[PATH_MAX], *p;
3953
3954 /* If init exists in the container, don't bind mount a static one */
3955 p = choose_init(conf->rootfs.mount);
3956 if (p) {
3957 free(p);
3958 return;
3959 }
2322903b
SH
3960
3961 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3962 if (ret < 0 || ret >= PATH_MAX) {
3963 WARN("Path name too long searching for lxc.init.static");
3964 return;
3965 }
3966
3967 if (!file_exists(path)) {
3968 INFO("%s does not exist on host", path);
3969 return;
3970 }
3971
3972 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3973 if (ret < 0 || ret >= PATH_MAX) {
3974 WARN("Path name too long for container's lxc.init.static");
3975 return;
3976 }
3977
3978 if (!file_exists(destpath)) {
3979 FILE * pathfile = fopen(destpath, "wb");
3980 if (!pathfile) {
3981 SYSERROR("Failed to create mount target '%s'", destpath);
3982 return;
3983 }
3984 fclose(pathfile);
3985 }
3986
592fd47a 3987 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3988 if (ret < 0)
3989 SYSERROR("Failed to bind lxc.init.static into container");
3990 INFO("lxc.init.static bound into container at %s", path);
3991}
3992
35120d9c
SH
3993/*
3994 * This does the work of remounting / if it is shared, calling the
3995 * container pre-mount hooks, and mounting the rootfs.
3996 */
3997int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 3998{
35120d9c
SH
3999 if (conf->rootfs_setup) {
4000 /*
4001 * rootfs was set up in another namespace. bind-mount it
4002 * to give us a mount in our own ns so we can pivot_root to it
4003 */
4004 const char *path = conf->rootfs.mount;
4005 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4006 ERROR("Failed to bind-mount container / onto itself");
145832ba 4007 return -1;
35120d9c 4008 }
145832ba 4009 return 0;
35120d9c 4010 }
d4ef7c50 4011
e995d7a2
SH
4012 remount_all_slave();
4013
35120d9c
SH
4014 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4015 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4016 return -1;
4017 }
4018
9aa76a17 4019 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
4020 ERROR("failed to setup rootfs for '%s'", name);
4021 return -1;
4022 }
4023
4024 conf->rootfs_setup = true;
4025 return 0;
4026}
4027
1c1c7051
SH
4028static bool verify_start_hooks(struct lxc_conf *conf)
4029{
4030 struct lxc_list *it;
4031 char path[MAXPATHLEN];
4032 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4033 char *hookname = it->elem;
4034 struct stat st;
4035 int ret;
4036
4037 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 4038 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
4039 if (ret < 0 || ret >= MAXPATHLEN)
4040 return false;
4041 ret = stat(path, &st);
4042 if (ret) {
7b6753e7 4043 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4044 hookname);
4045 return false;
4046 }
6a0c909a 4047 return true;
1c1c7051
SH
4048 }
4049
4050 return true;
4051}
4052
ae467c54 4053static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
e8bd4e43 4054{
ae467c54
CB
4055 int i;
4056 int *ttyfds;
4057 struct lxc_pty_info *pty_info;
e8bd4e43
SH
4058 struct lxc_conf *conf = handler->conf;
4059 const struct lxc_tty_info *tty_info = &conf->tty_info;
e8bd4e43 4060 int sock = handler->ttysock[0];
ae467c54
CB
4061 int ret = -1;
4062 size_t num_ttyfds = (2 * conf->tty);
e8bd4e43 4063
ae467c54
CB
4064 ttyfds = malloc(num_ttyfds * sizeof(int));
4065 if (!ttyfds)
4066 return -1;
4067
4068 for (i = 0; i < num_ttyfds; i++) {
4069 pty_info = &tty_info->pty_info[i / 2];
4070 ttyfds[i++] = pty_info->slave;
4071 ttyfds[i] = pty_info->master;
4072 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
f07fa8df
CB
4073 "parent",
4074 pty_info->name, pty_info->master, pty_info->slave);
e8bd4e43
SH
4075 }
4076
ae467c54
CB
4077 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4078 if (ret < 0)
4079 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4080 strerror(errno));
4081 else
4082 TRACE("sent %d ttys to parent", conf->tty);
4083
e8bd4e43
SH
4084 close(handler->ttysock[0]);
4085 close(handler->ttysock[1]);
4086
ae467c54
CB
4087 for (i = 0; i < num_ttyfds; i++)
4088 close(ttyfds[i]);
e8bd4e43 4089
ae467c54
CB
4090 free(ttyfds);
4091
4092 return ret;
e8bd4e43
SH
4093}
4094
35120d9c
SH
4095int lxc_setup(struct lxc_handler *handler)
4096{
4097 const char *name = handler->name;
4098 struct lxc_conf *lxc_conf = handler->conf;
4099 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4100
4101 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4102 ERROR("Error setting up rootfs mount after spawn");
4103 return -1;
4104 }
4105
6c544cb3
MM
4106 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4107 if (setup_utsname(lxc_conf->utsname)) {
4108 ERROR("failed to setup the utsname for '%s'", name);
4109 return -1;
4110 }
0ad19a3f 4111 }
4112
e337179a
CB
4113 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4114 &lxc_conf->network)) {
36eb9bde 4115 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4116 return -1;
0ad19a3f 4117 }
4118
bc6928ff 4119 if (lxc_conf->autodev > 0) {
14221cbb 4120 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4121 ERROR("failed to mount /dev in the container");
c6883f38
SH
4122 return -1;
4123 }
4124 }
4125
368bbc02
CS
4126 /* do automatic mounts (mainly /proc and /sys), but exclude
4127 * those that need to wait until other stuff has finished
4128 */
4fb3cba5 4129 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4130 ERROR("failed to setup the automatic mounts for '%s'", name);
4131 return -1;
4132 }
4133
0a2dddd4 4134 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4135 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4136 return -1;
576f946d 4137 }
4138
0a2dddd4 4139 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4140 ERROR("failed to setup the mount entries for '%s'", name);
4141 return -1;
4142 }
4143
7b6753e7 4144 /* Make sure any start hooks are in the container */
1c1c7051
SH
4145 if (!verify_start_hooks(lxc_conf))
4146 return -1;
4147
2322903b
SH
4148 if (lxc_conf->is_execute)
4149 lxc_execute_bind_init(lxc_conf);
4150
368bbc02
CS
4151 /* now mount only cgroup, if wanted;
4152 * before, /sys could not have been mounted
4153 * (is either mounted automatically or via fstab entries)
4154 */
4fb3cba5 4155 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4156 ERROR("failed to setup the automatic mounts for '%s'", name);
4157 return -1;
4158 }
4159
283678ed 4160 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4161 ERROR("failed to run mount hooks for container '%s'.", name);
4162 return -1;
4163 }
4164
bc6928ff 4165 if (lxc_conf->autodev > 0) {
283678ed 4166 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4167 ERROR("failed to run autodev hooks for container '%s'.", name);
4168 return -1;
4169 }
27245ff7 4170 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4171 ERROR("failed to populate /dev in the container");
4172 return -1;
4173 }
4174 }
368bbc02 4175
3d7d929a 4176 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4177 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4178 return -1;
6e590161 4179 }
4180
69aa6655
DE
4181 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4182 ERROR("failed to setup /dev symlinks for '%s'", name);
4183 return -1;
4184 }
4185
5112cd70 4186 /* mount /proc if it's not already there */
943144d9 4187 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4188 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4189 return -1;
e075f5d9 4190 }
e075f5d9 4191
ac778708 4192 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4193 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4194 return -1;
ed502555 4195 }
4196
70761e5e 4197 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4198 ERROR("failed to setup the new pts instance");
95b5ffaf 4199 return -1;
3c26f34e 4200 }
4201
e8bd4e43
SH
4202 if (lxc_create_tty(name, lxc_conf)) {
4203 ERROR("failed to create the ttys");
4204 return -1;
4205 }
4206
ae467c54 4207 if (lxc_send_ttys_to_parent(handler) < 0) {
e8bd4e43
SH
4208 ERROR("failure sending console info to parent");
4209 return -1;
4210 }
4211
9e1045e3 4212 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
4213 ERROR("failed to setup the ttys for '%s'", name);
4214 return -1;
4215 }
4216
4217 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4218 SYSERROR("failed to set environment variable for container ptys");
4219
4220
cccc74b5
DL
4221 if (setup_personality(lxc_conf->personality)) {
4222 ERROR("failed to setup personality");
4223 return -1;
4224 }
4225
97a8f74f
SG
4226 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4227 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4228 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4229 return -1;
4230 }
97a8f74f
SG
4231 if (dropcaps_except(&lxc_conf->keepcaps)) {
4232 ERROR("failed to keep requested caps");
4233 return -1;
4234 }
4235 } else if (setup_caps(&lxc_conf->caps)) {
4236 ERROR("failed to drop capabilities");
4237 return -1;
81810dd1
DL
4238 }
4239
cd54d859
DL
4240 NOTICE("'%s' is setup.", name);
4241
0ad19a3f 4242 return 0;
4243}
26ddeedd 4244
283678ed
SH
4245int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4246 const char *lxcpath, char *argv[])
26ddeedd
SH
4247{
4248 int which = -1;
4249 struct lxc_list *it;
4250
4251 if (strcmp(hook, "pre-start") == 0)
4252 which = LXCHOOK_PRESTART;
5ea6163a
SH
4253 else if (strcmp(hook, "pre-mount") == 0)
4254 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4255 else if (strcmp(hook, "mount") == 0)
4256 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4257 else if (strcmp(hook, "autodev") == 0)
4258 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4259 else if (strcmp(hook, "start") == 0)
4260 which = LXCHOOK_START;
52492063
WB
4261 else if (strcmp(hook, "stop") == 0)
4262 which = LXCHOOK_STOP;
26ddeedd
SH
4263 else if (strcmp(hook, "post-stop") == 0)
4264 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4265 else if (strcmp(hook, "clone") == 0)
4266 which = LXCHOOK_CLONE;
37cf711b
SY
4267 else if (strcmp(hook, "destroy") == 0)
4268 which = LXCHOOK_DESTROY;
26ddeedd
SH
4269 else
4270 return -1;
4271 lxc_list_for_each(it, &conf->hooks[which]) {
4272 int ret;
4273 char *hookname = it->elem;
283678ed 4274 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4275 if (ret)
4276 return ret;
4277 }
4278 return 0;
4279}
72d0e1cb 4280
72d0e1cb
SG
4281int lxc_clear_config_caps(struct lxc_conf *c)
4282{
9ebb03ad 4283 struct lxc_list *it,*next;
72d0e1cb 4284
9ebb03ad 4285 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4286 lxc_list_del(it);
4287 free(it->elem);
4288 free(it);
4289 }
4290 return 0;
4291}
4292
74a3920a 4293static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4294 struct lxc_list *it, *next;
4295
4355ab5f 4296 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4297 lxc_list_del(it);
4298 free(it->elem);
4299 free(it);
4300 }
4301 return 0;
4302}
4303
4355ab5f
SH
4304int lxc_clear_idmaps(struct lxc_conf *c)
4305{
4306 return lxc_free_idmap(&c->id_map);
4307}
4308
1fb86a7c
SH
4309int lxc_clear_config_keepcaps(struct lxc_conf *c)
4310{
4311 struct lxc_list *it,*next;
4312
4313 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4314 lxc_list_del(it);
4315 free(it->elem);
4316 free(it);
4317 }
4318 return 0;
4319}
4320
12a50cc6 4321int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4322{
9ebb03ad 4323 struct lxc_list *it,*next;
72d0e1cb 4324 bool all = false;
a6390f01 4325 const char *k = NULL;
72d0e1cb
SG
4326
4327 if (strcmp(key, "lxc.cgroup") == 0)
4328 all = true;
a6390f01
WB
4329 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4330 k = key + sizeof("lxc.cgroup.")-1;
4331 else
4332 return -1;
72d0e1cb 4333
9ebb03ad 4334 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4335 struct lxc_cgroup *cg = it->elem;
4336 if (!all && strcmp(cg->subsystem, k) != 0)
4337 continue;
4338 lxc_list_del(it);
4339 free(cg->subsystem);
4340 free(cg->value);
4341 free(cg);
4342 free(it);
4343 }
4344 return 0;
4345}
4346
c6d09e15
WB
4347int lxc_clear_limits(struct lxc_conf *c, const char *key)
4348{
4349 struct lxc_list *it, *next;
4350 bool all = false;
4351 const char *k = NULL;
4352
240d4b74 4353 if (strcmp(key, "lxc.limit") == 0
4354 || strcmp(key, "lxc.prlimit"))
c6d09e15
WB
4355 all = true;
4356 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4357 k = key + sizeof("lxc.limit.")-1;
240d4b74 4358 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
4359 k = key + sizeof("lxc.prlimit.")-1;
c6d09e15
WB
4360 else
4361 return -1;
4362
4363 lxc_list_for_each_safe(it, &c->limits, next) {
4364 struct lxc_limit *lim = it->elem;
4365 if (!all && strcmp(lim->resource, k) != 0)
4366 continue;
4367 lxc_list_del(it);
4368 free(lim->resource);
4369 free(lim);
4370 free(it);
4371 }
4372 return 0;
4373}
4374
ee1e7aa0
SG
4375int lxc_clear_groups(struct lxc_conf *c)
4376{
4377 struct lxc_list *it,*next;
4378
4379 lxc_list_for_each_safe(it, &c->groups, next) {
4380 lxc_list_del(it);
4381 free(it->elem);
4382 free(it);
4383 }
4384 return 0;
4385}
4386
ab799c0b
SG
4387int lxc_clear_environment(struct lxc_conf *c)
4388{
4389 struct lxc_list *it,*next;
4390
4391 lxc_list_for_each_safe(it, &c->environment, next) {
4392 lxc_list_del(it);
4393 free(it->elem);
4394 free(it);
4395 }
4396 return 0;
4397}
4398
4399
72d0e1cb
SG
4400int lxc_clear_mount_entries(struct lxc_conf *c)
4401{
9ebb03ad 4402 struct lxc_list *it,*next;
72d0e1cb 4403
9ebb03ad 4404 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4405 lxc_list_del(it);
4406 free(it->elem);
4407 free(it);
4408 }
4409 return 0;
4410}
4411
b099e9e9
SH
4412int lxc_clear_automounts(struct lxc_conf *c)
4413{
4414 c->auto_mounts = 0;
4415 return 0;
4416}
4417
12a50cc6 4418int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4419{
9ebb03ad 4420 struct lxc_list *it,*next;
17ed13a3 4421 bool all = false, done = false;
a6390f01 4422 const char *k = NULL;
72d0e1cb
SG
4423 int i;
4424
17ed13a3
SH
4425 if (strcmp(key, "lxc.hook") == 0)
4426 all = true;
a6390f01
WB
4427 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4428 k = key + sizeof("lxc.hook.")-1;
4429 else
4430 return -1;
17ed13a3 4431
72d0e1cb 4432 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4433 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4434 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4435 lxc_list_del(it);
4436 free(it->elem);
4437 free(it);
4438 }
4439 done = true;
72d0e1cb
SG
4440 }
4441 }
17ed13a3
SH
4442
4443 if (!done) {
4444 ERROR("Invalid hook key: %s", key);
4445 return -1;
4446 }
72d0e1cb
SG
4447 return 0;
4448}
8eb5694b 4449
74a3920a 4450static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4451{
4452 int i;
4453
0cf45501 4454 if (!conf->saved_nics)
7b35f3d6
SH
4455 return;
4456 for (i=0; i < conf->num_savednics; i++)
4457 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4458 free(conf->saved_nics);
4459}
4460
4184c3e1
SH
4461static inline void lxc_clear_aliens(struct lxc_conf *conf)
4462{
4463 struct lxc_list *it,*next;
4464
4465 lxc_list_for_each_safe(it, &conf->aliens, next) {
4466 lxc_list_del(it);
4467 free(it->elem);
4468 free(it);
4469 }
4470}
4471
c7b15d1e 4472void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
4473{
4474 struct lxc_list *it,*next;
4475
4476 lxc_list_for_each_safe(it, &conf->includes, next) {
4477 lxc_list_del(it);
4478 free(it->elem);
4479 free(it);
4480 }
4481}
4482
8eb5694b
SH
4483void lxc_conf_free(struct lxc_conf *conf)
4484{
4485 if (!conf)
4486 return;
858377e4
SH
4487 if (current_config == conf)
4488 current_config = NULL;
f10fad2f
ME
4489 free(conf->console.log_path);
4490 free(conf->console.path);
4491 free(conf->rootfs.mount);
b3b8c97f 4492 free(conf->rootfs.bdev_type);
f10fad2f
ME
4493 free(conf->rootfs.options);
4494 free(conf->rootfs.path);
f10fad2f 4495 free(conf->logfile);
858377e4
SH
4496 if (conf->logfd != -1)
4497 close(conf->logfd);
f10fad2f
ME
4498 free(conf->utsname);
4499 free(conf->ttydir);
4500 free(conf->fstab);
4501 free(conf->rcfile);
4502 free(conf->init_cmd);
6b0d5538 4503 free(conf->unexpanded_config);
393903d1 4504 free(conf->pty_names);
76d0127f 4505 free(conf->syslog);
c302b476 4506 lxc_free_networks(&conf->network);
f10fad2f
ME
4507 free(conf->lsm_aa_profile);
4508 free(conf->lsm_se_context);
769872f9 4509 lxc_seccomp_free(conf);
8eb5694b 4510 lxc_clear_config_caps(conf);
1fb86a7c 4511 lxc_clear_config_keepcaps(conf);
8eb5694b 4512 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4513 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4514 lxc_clear_mount_entries(conf);
7b35f3d6 4515 lxc_clear_saved_nics(conf);
27c27d73 4516 lxc_clear_idmaps(conf);
ee1e7aa0 4517 lxc_clear_groups(conf);
f979ac15 4518 lxc_clear_includes(conf);
761d81ca 4519 lxc_clear_aliens(conf);
ab799c0b 4520 lxc_clear_environment(conf);
240d4b74 4521 lxc_clear_limits(conf, "lxc.prlimit");
8eb5694b
SH
4522 free(conf);
4523}
4355ab5f
SH
4524
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback to run once the parent has signalled the child. */
	int (*fn)(void *);
	/* Optional name of @fn, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Synchronisation pipe: p[0] is read by the child, p[1] written by
	 * the parent.
	 */
	int p[2];
};
4531
4532static int run_userns_fn(void *data)
4533{
4534 struct userns_fn_data *d = data;
4535 char c;
4355ab5f 4536
f8aa4bf3 4537 /* Close write end of the pipe. */
4355ab5f 4538 close(d->p[1]);
f8aa4bf3
CB
4539
4540 /* Wait for parent to finish establishing a new mapping in the user
4541 * namespace we are executing in.
4542 */
4355ab5f
SH
4543 if (read(d->p[0], &c, 1) != 1)
4544 return -1;
f8aa4bf3
CB
4545
4546 /* Close read end of the pipe. */
4355ab5f 4547 close(d->p[0]);
f8aa4bf3 4548
c9b7c33e
CB
4549 if (d->fn_name)
4550 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 4551 /* Call function to run. */
4355ab5f
SH
4552 return d->fn(d->arg);
4553}
4554
339efad9 4555static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
4556 enum idtype idtype)
4557{
4558 struct lxc_list *it;
4559 struct id_map *map;
4560 struct id_map *retmap = NULL;
4561
4562 lxc_list_for_each(it, &conf->id_map) {
4563 map = it->elem;
4564 if (map->idtype != idtype)
4565 continue;
4566
4567 if (id >= map->hostid && id < map->hostid + map->range) {
4568 retmap = map;
4569 break;
4570 }
4571 }
4572
4573 if (!retmap)
4574 return NULL;
4575
4576 retmap = malloc(sizeof(*retmap));
4577 if (!retmap)
4578 return NULL;
4579
4580 memcpy(retmap, map, sizeof(*retmap));
4581 return retmap;
4582}
4583
4355ab5f 4584/*
f8aa4bf3
CB
4585 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4586 * existing one or establish a new one.
4355ab5f 4587 */
28a2d9e7 4588static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 4589{
28a2d9e7 4590 int hostid_mapped;
f8aa4bf3 4591 struct id_map *entry = NULL;
f8aa4bf3 4592
28a2d9e7
CB
4593 /* Reuse existing mapping. */
4594 entry = mapped_hostid_entry(conf, id, type);
4595 if (entry)
4596 return entry;
f8aa4bf3 4597
28a2d9e7
CB
4598 /* Find new mapping. */
4599 hostid_mapped = find_unmapped_nsid(conf, type);
4600 if (hostid_mapped < 0) {
4601 DEBUG("failed to find free mapping for id %d", id);
4602 return NULL;
f8aa4bf3 4603 }
f8aa4bf3 4604
28a2d9e7
CB
4605 entry = malloc(sizeof(*entry));
4606 if (!entry)
4607 return NULL;
4355ab5f 4608
28a2d9e7
CB
4609 entry->idtype = type;
4610 entry->nsid = hostid_mapped;
4611 entry->hostid = (unsigned long)id;
4612 entry->range = 1;
4355ab5f 4613
28a2d9e7 4614 return entry;
4355ab5f
SH
4615}
4616
f8aa4bf3
CB
4617/* Run a function in a new user namespace.
4618 * The caller's euid/egid will be mapped if it is not already.
4619 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4620 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4621 * This means we require only to establish a mapping from:
4622 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4623 * - the container root -> some sub{g,u}id
4624 * The former we add, if the user did not specifiy a mapping. The latter we
4625 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4626 * there to start the container in the first place.
4355ab5f 4627 */
c9b7c33e
CB
4628int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4629 const char *fn_name)
4355ab5f 4630{
f8aa4bf3
CB
4631 pid_t pid;
4632 uid_t euid, egid;
4355ab5f 4633 struct userns_fn_data d;
4355ab5f 4634 int p[2];
f8aa4bf3
CB
4635 struct lxc_list *it;
4636 struct id_map *map;
4637 char c = '1';
4638 int ret = -1;
4639 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4640 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4641 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4642
4355ab5f 4643 ret = pipe(p);
4355ab5f
SH
4644 if (ret < 0) {
4645 SYSERROR("opening pipe");
4646 return -1;
4647 }
4648 d.fn = fn;
c9b7c33e 4649 d.fn_name = fn_name;
4355ab5f
SH
4650 d.arg = data;
4651 d.p[0] = p[0];
4652 d.p[1] = p[1];
f8aa4bf3
CB
4653
4654 /* Clone child in new user namespace. */
4355ab5f 4655 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
4656 if (pid < 0) {
4657 ERROR("failed to clone child process in new user namespace");
4658 goto on_error;
4659 }
4660
4355ab5f 4661 close(p[0]);
4355ab5f
SH
4662 p[0] = -1;
4663
f8aa4bf3
CB
4664 /* Find container root. */
4665 lxc_list_for_each(it, &conf->id_map) {
4666 map = it->elem;
4667
4668 if (map->nsid != 0)
4669 continue;
4670
4671 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4672 container_root_uid = malloc(sizeof(*container_root_uid));
4673 if (!container_root_uid)
4674 goto on_error;
4675 container_root_uid->idtype = map->idtype;
4676 container_root_uid->hostid = map->hostid;
4677 container_root_uid->nsid = 0;
4678 container_root_uid->range = map->range;
4679 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4680 container_root_gid = malloc(sizeof(*container_root_gid));
4681 if (!container_root_gid)
4682 goto on_error;
4683 container_root_gid->idtype = map->idtype;
4684 container_root_gid->hostid = map->hostid;
4685 container_root_gid->nsid = 0;
4686 container_root_gid->range = map->range;
4687 }
4688
4689 /* Found container root. */
4690 if (container_root_uid && container_root_gid)
4691 break;
4692 }
4693
4694 /* This is actually checked earlier but it can't hurt. */
4695 if (!container_root_uid || !container_root_gid) {
4696 ERROR("no mapping for container root found");
4697 goto on_error;
4698 }
4699
1d90e064
CB
4700 host_uid_map = container_root_uid;
4701 host_gid_map = container_root_gid;
4702
f8aa4bf3
CB
4703 /* Check whether the {g,u}id of the user has a mapping. */
4704 euid = geteuid();
4705 egid = getegid();
1d90e064 4706 if (euid != container_root_uid->hostid)
28a2d9e7
CB
4707 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4708
1d90e064 4709 if (egid != container_root_gid->hostid)
28a2d9e7
CB
4710 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4711
4712 if (!host_uid_map) {
4713 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4714 goto on_error;
4715 }
4716
28a2d9e7
CB
4717 if (!host_gid_map) {
4718 DEBUG("failed to find mapping for gid %d", egid);
4719 goto on_error;
4720 }
4721
4722 /* Allocate new {g,u}id map list. */
4723 idmap = malloc(sizeof(*idmap));
4724 if (!idmap)
4725 goto on_error;
4726 lxc_list_init(idmap);
4727
f8aa4bf3
CB
4728 /* Add container root to the map. */
4729 tmplist = malloc(sizeof(*tmplist));
4730 if (!tmplist)
4731 goto on_error;
4732 lxc_list_add_elem(tmplist, container_root_uid);
4733 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4734
1d90e064 4735 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4736 /* idmap will now keep track of that memory. */
4737 container_root_uid = NULL;
4738
4739 /* Add container root to the map. */
4740 tmplist = malloc(sizeof(*tmplist));
4741 if (!tmplist)
4742 goto on_error;
4743 lxc_list_add_elem(tmplist, host_uid_map);
4744 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4745 }
1d90e064
CB
4746 /* idmap will now keep track of that memory. */
4747 container_root_uid = NULL;
4748 /* idmap will now keep track of that memory. */
4749 host_uid_map = NULL;
f8aa4bf3
CB
4750
4751 tmplist = malloc(sizeof(*tmplist));
4752 if (!tmplist)
4753 goto on_error;
4754 lxc_list_add_elem(tmplist, container_root_gid);
4755 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4756
1d90e064 4757 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4758 /* idmap will now keep track of that memory. */
4759 container_root_gid = NULL;
4760
4761 tmplist = malloc(sizeof(*tmplist));
4762 if (!tmplist)
4763 goto on_error;
4764 lxc_list_add_elem(tmplist, host_gid_map);
4765 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4766 }
1d90e064
CB
4767 /* idmap will now keep track of that memory. */
4768 container_root_gid = NULL;
4769 /* idmap will now keep track of that memory. */
4770 host_gid_map = NULL;
f8aa4bf3 4771
4b73005c
CB
4772 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4773 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
f8aa4bf3
CB
4774 lxc_list_for_each(it, idmap) {
4775 map = it->elem;
4776 TRACE("establishing %cid mapping for \"%d\" in new "
4777 "user namespace: nsuid %lu - hostid %lu - range "
4778 "%lu",
4779 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4780 map->nsid, map->hostid, map->range);
4781 }
4355ab5f
SH
4782 }
4783
f8aa4bf3 4784 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4785 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
4786 if (ret < 0) {
4787 ERROR("error setting up {g,u}id mappings for child process "
4788 "\"%d\"",
4789 pid);
4790 goto on_error;
4355ab5f
SH
4791 }
4792
f8aa4bf3 4793 /* Tell child to proceed. */
4355ab5f 4794 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
4795 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4796 goto on_error;
4355ab5f
SH
4797 }
4798
f8aa4bf3 4799 /* Wait for child to finish. */
3139aead
SG
4800 ret = wait_for_pid(pid);
4801
f8aa4bf3 4802on_error:
1d90e064
CB
4803 if (idmap)
4804 lxc_free_idmap(idmap);
4805 if (container_root_uid)
4806 free(container_root_uid);
4807 if (container_root_gid)
4808 free(container_root_gid);
4809 if (host_uid_map && (host_uid_map != container_root_uid))
4810 free(host_uid_map);
4811 if (host_gid_map && (host_gid_map != container_root_gid))
4812 free(host_gid_map);
3139aead 4813
4355ab5f
SH
4814 if (p[0] != -1)
4815 close(p[0]);
4816 close(p[1]);
f8aa4bf3
CB
4817
4818 return ret;
4355ab5f 4819}
97e9cfa0 4820
/*
 * Return a strdup()ed copy of the passwd name for the effective uid, or NULL
 * if the lookup or the copy fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4832
/*
 * Return a strdup()ed copy of the group name for the effective gid, or NULL
 * if the lookup or the copy fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4844
a96a8e8c 4845/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4846void suggest_default_idmap(void)
4847{
4848 FILE *f;
4849 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4850 char *line = NULL;
4851 char *uname, *gname;
4852 size_t len = 0;
4853
4854 if (!(uname = getuname()))
4855 return;
4856
4857 if (!(gname = getgname())) {
4858 free(uname);
4859 return;
4860 }
4861
4862 f = fopen(subuidfile, "r");
4863 if (!f) {
4864 ERROR("Your system is not configured with subuids");
4865 free(gname);
4866 free(uname);
4867 return;
4868 }
4869 while (getline(&line, &len, f) != -1) {
b7930180 4870 size_t no_newline = 0;
97e9cfa0
SH
4871 char *p = strchr(line, ':'), *p2;
4872 if (*line == '#')
4873 continue;
4874 if (!p)
4875 continue;
4876 *p = '\0';
4877 p++;
4878 if (strcmp(line, uname))
4879 continue;
4880 p2 = strchr(p, ':');
4881 if (!p2)
4882 continue;
4883 *p2 = '\0';
4884 p2++;
4885 if (!*p2)
4886 continue;
b7930180
CB
4887 no_newline = strcspn(p2, "\n");
4888 p2[no_newline] = '\0';
4889
b7b2fde4
CB
4890 if (lxc_safe_uint(p, &uid) < 0)
4891 WARN("Could not parse UID.");
4892 if (lxc_safe_uint(p2, &urange) < 0)
4893 WARN("Could not parse UID range.");
97e9cfa0
SH
4894 }
4895 fclose(f);
4896
6be7389a 4897 f = fopen(subgidfile, "r");
97e9cfa0
SH
4898 if (!f) {
4899 ERROR("Your system is not configured with subgids");
4900 free(gname);
4901 free(uname);
4902 return;
4903 }
4904 while (getline(&line, &len, f) != -1) {
b7930180 4905 size_t no_newline = 0;
97e9cfa0
SH
4906 char *p = strchr(line, ':'), *p2;
4907 if (*line == '#')
4908 continue;
4909 if (!p)
4910 continue;
4911 *p = '\0';
4912 p++;
4913 if (strcmp(line, uname))
4914 continue;
4915 p2 = strchr(p, ':');
4916 if (!p2)
4917 continue;
4918 *p2 = '\0';
4919 p2++;
4920 if (!*p2)
4921 continue;
b7930180
CB
4922 no_newline = strcspn(p2, "\n");
4923 p2[no_newline] = '\0';
4924
b7b2fde4
CB
4925 if (lxc_safe_uint(p, &gid) < 0)
4926 WARN("Could not parse GID.");
4927 if (lxc_safe_uint(p2, &grange) < 0)
4928 WARN("Could not parse GID range.");
97e9cfa0
SH
4929 }
4930 fclose(f);
4931
f10fad2f 4932 free(line);
97e9cfa0
SH
4933
4934 if (!urange || !grange) {
4935 ERROR("You do not have subuids or subgids allocated");
4936 ERROR("Unprivileged containers require subuids and subgids");
4937 return;
4938 }
4939
4940 ERROR("You must either run as root, or define uid mappings");
4941 ERROR("To pass uid mappings to lxc-create, you could create");
4942 ERROR("~/.config/lxc/default.conf:");
4943 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4944 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4945 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4946
4947 free(gname);
4948 free(uname);
4949}
aaf26830 4950
a7307747
SH
4951static void free_cgroup_settings(struct lxc_list *result)
4952{
4953 struct lxc_list *iterator, *next;
4954
4955 lxc_list_for_each_safe(iterator, result, next) {
4956 lxc_list_del(iterator);
4957 free(iterator);
4958 }
4959 free(result);
4960}
4961
aaf26830
KT
4962/*
4963 * Return the list of cgroup_settings sorted according to the following rules
4964 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4965 */
4966struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4967{
4968 struct lxc_list *result;
4969 struct lxc_list *memsw_limit = NULL;
4970 struct lxc_list *it = NULL;
4971 struct lxc_cgroup *cg = NULL;
4972 struct lxc_list *item = NULL;
4973
4974 result = malloc(sizeof(*result));
fac7c663
KT
4975 if (!result) {
4976 ERROR("failed to allocate memory to sort cgroup settings");
4977 return NULL;
4978 }
aaf26830
KT
4979 lxc_list_init(result);
4980
4981 /*Iterate over the cgroup settings and copy them to the output list*/
4982 lxc_list_for_each(it, cgroup_settings) {
4983 item = malloc(sizeof(*item));
fac7c663
KT
4984 if (!item) {
4985 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 4986 free_cgroup_settings(result);
fac7c663
KT
4987 return NULL;
4988 }
aaf26830
KT
4989 item->elem = it->elem;
4990 cg = it->elem;
4991 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4992 /* Store the memsw_limit location */
4993 memsw_limit = item;
4994 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 4995 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
4996 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4997 item->elem = memsw_limit->elem;
4998 memsw_limit->elem = it->elem;
4999 }
5000 lxc_list_add_tail(result, item);
5001 }
5002
5003 return result;
a7307747 5004}