]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
conf: mount_entry_create_dir_file()
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
1d52bdf7
CB
23
24#define _GNU_SOURCE
d06245b8
NC
25#include "config.h"
26
8f3e280e
CB
27#include <dirent.h>
28#include <errno.h>
29#include <fcntl.h>
30#include <grp.h>
31#include <inttypes.h>
32#include <libgen.h>
33#include <pwd.h>
34#include <stdarg.h>
0ad19a3f 35#include <stdio.h>
0ad19a3f 36#include <stdlib.h>
0ad19a3f 37#include <string.h>
8f3e280e 38#include <time.h>
0ad19a3f 39#include <unistd.h>
8f3e280e
CB
40#include <arpa/inet.h>
41#include <linux/loop.h>
8f3e280e
CB
42#include <net/if.h>
43#include <netinet/in.h>
44#include <sys/mman.h>
45#include <sys/mount.h>
46#include <sys/param.h>
47#include <sys/prctl.h>
48#include <sys/stat.h>
49#include <sys/socket.h>
ce831b3b 50#include <sys/sysmacros.h>
2d76d1d7 51#include <sys/syscall.h>
97e9cfa0 52#include <sys/types.h>
8f3e280e
CB
53#include <sys/utsname.h>
54#include <sys/wait.h>
1d52bdf7 55
af6824fc
ST
56/* makedev() */
57#ifdef MAJOR_IN_MKDEV
58# include <sys/mkdev.h>
59#endif
af6824fc 60
614305f3 61#ifdef HAVE_STATVFS
2938f7c8 62#include <sys/statvfs.h>
614305f3 63#endif
e827ff7e
SG
64
65#if HAVE_PTY_H
b0a33c1e 66#include <pty.h>
e827ff7e
SG
67#else
68#include <../include/openpty.h>
69#endif
0ad19a3f 70
5ef5c9a3
CB
71#ifdef HAVE_LINUX_MEMFD_H
72#include <linux/memfd.h>
73#endif
74
e8bd4e43 75#include "af_unix.h"
8f3e280e
CB
76#include "bdev.h"
77#include "caps.h" /* for lxc_caps_last_cap() */
78#include "cgroup.h"
1b09f2c0 79#include "conf.h"
1ed6ba91 80#include "confile_utils.h"
8f3e280e 81#include "error.h"
1b09f2c0 82#include "log.h"
d8e48992 83#include "lxcaufs.h"
025ed0f3 84#include "lxclock.h"
8f3e280e
CB
85#include "lxcoverlay.h"
86#include "lxcseccomp.h"
4355ab5f 87#include "namespace.h"
8f3e280e
CB
88#include "network.h"
89#include "parse.h"
90#include "utils.h"
fe4de9a6 91#include "lsm/lsm.h"
d0a36f2c 92
e37dda71 93#if HAVE_LIBCAP
495d2046
SG
94#include <sys/capability.h>
95#endif
96
6ff05e18
SG
97#if HAVE_SYS_PERSONALITY_H
98#include <sys/personality.h>
99#endif
100
edaf8b1b
SG
101#if IS_BIONIC
102#include <../include/lxcmntent.h>
a04f5407
CB
103#ifndef HAVE_PRLIMIT
104#include <../include/prlimit.h>
105#endif
edaf8b1b
SG
106#else
107#include <mntent.h>
108#endif
109
36eb9bde 110lxc_log_define(lxc_conf, lxc);
e5bda9ee 111
e37dda71 112#if HAVE_LIBCAP
b09094da
MN
113#ifndef CAP_SETFCAP
114#define CAP_SETFCAP 31
115#endif
116
117#ifndef CAP_MAC_OVERRIDE
118#define CAP_MAC_OVERRIDE 32
119#endif
120
121#ifndef CAP_MAC_ADMIN
122#define CAP_MAC_ADMIN 33
123#endif
495d2046 124#endif
b09094da
MN
125
126#ifndef PR_CAPBSET_DROP
127#define PR_CAPBSET_DROP 24
128#endif
129
9818cae4
SG
130#ifndef LO_FLAGS_AUTOCLEAR
131#define LO_FLAGS_AUTOCLEAR 4
132#endif
133
bc5b27d6
DK
134#ifndef CAP_SETUID
135#define CAP_SETUID 7
136#endif
137
138#ifndef CAP_SETGID
139#define CAP_SETGID 6
140#endif
141
0769b82a
CS
142/* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144#ifndef CAP_SYS_ADMIN
145#define CAP_SYS_ADMIN 21
146#endif
147
2d76d1d7
SG
148/* Define pivot_root() if missing from the C library */
149#ifndef HAVE_PIVOT_ROOT
/*
 * Local stand-in for pivot_root(), compiled only when HAVE_PIVOT_ROOT is not
 * defined. Forwards to the raw syscall when __NR_pivot_root is known at build
 * time; otherwise fails with -1 and errno set to ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
159#else
160extern int pivot_root(const char * new_root, const char * put_old);
161#endif
162
163/* Define sethostname() if missing from the C library */
164#ifndef HAVE_SETHOSTNAME
/*
 * Local stand-in for sethostname(), compiled only when HAVE_SETHOSTNAME is not
 * defined. Forwards to the raw syscall when __NR_sethostname is known at build
 * time; otherwise fails with -1 and errno set to ENOSYS.
 */
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
174#endif
175
ecec0126
SG
176#ifndef MS_PRIVATE
177#define MS_PRIVATE (1<<18)
178#endif
179
8912711c
CB
180#ifndef MS_LAZYTIME
181#define MS_LAZYTIME (1<<25)
182#endif
183
5ef5c9a3
CB
184/* memfd_create() */
185#ifndef MFD_CLOEXEC
186#define MFD_CLOEXEC 0x0001U
187#endif
188
189#ifndef MFD_ALLOW_SEALING
190#define MFD_ALLOW_SEALING 0x0002U
191#endif
192
193#ifndef HAVE_MEMFD_CREATE
/*
 * Local stand-in for memfd_create(), compiled only when HAVE_MEMFD_CREATE is
 * not defined. If the system headers do not supply __NR_memfd_create, a
 * per-architecture syscall number is filled in below. When no number is
 * available at all the call fails with -1 and errno set to ENOSYS.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#if !defined(__NR_memfd_create)
#  if defined(__i386__)
#    define __NR_memfd_create 356
#  elif defined(__x86_64__)
#    define __NR_memfd_create 319
#  elif defined(__arm__)
#    define __NR_memfd_create 385
#  elif defined(__aarch64__)
#    define __NR_memfd_create 279
#  elif defined(__s390__)
#    define __NR_memfd_create 350
#  elif defined(__powerpc__)
#    define __NR_memfd_create 360
#  elif defined(__sparc__)
#    define __NR_memfd_create 348
#  elif defined(__blackfin__)
#    define __NR_memfd_create 390
#  elif defined(__ia64__)
#    define __NR_memfd_create 1340
#  elif defined(_MIPS_SIM)
#    if _MIPS_SIM == _MIPS_SIM_ABI32
#      define __NR_memfd_create 4354
#    endif
#    if _MIPS_SIM == _MIPS_SIM_NABI32
#      define __NR_memfd_create 6318
#    endif
#    if _MIPS_SIM == _MIPS_SIM_ABI64
#      define __NR_memfd_create 5314
#    endif
#  endif
#endif

#if defined(__NR_memfd_create)
	return syscall(__NR_memfd_create, name, flags);
#else
	errno = ENOSYS;
	return -1;
#endif
}
233#else
234extern int memfd_create(const char *name, unsigned int flags);
235#endif
236
72d0e1cb 237char *lxchook_names[NUM_LXC_HOOKS] = {
52492063 238 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
72d0e1cb 239
/* Callback signature used by the netdev_conf[] and netdev_deconf[] tables. */
typedef int (*instantiate_cb)(struct lxc_handler *handler,
			      struct lxc_netdev *netdev);
0ad19a3f 241
998ac676
RT
/* One row of the mount option table: an option keyword and its flag data. */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" */
	int clear;	/* presumably non-zero when the keyword clears `flag` - confirm against the parser */
	int flag;	/* MS_* bit(s) associated with the keyword */
};
247
81810dd1
DL
/* One row of the capability table: a capability keyword and its number. */
struct caps_opt {
	char *name;	/* capability keyword, e.g. "sys_admin" */
	int value;	/* matching CAP_* constant */
};
252
c6d09e15
WB
/* One row of the resource limit table: a limit keyword and its number. */
struct limit_opt {
	char *name;	/* limit keyword, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
257
858377e4
SH
/*
 * The lxc_conf of the container currently being worked on in an API call.
 * This is used in the error calls. When HAVE_TLS is defined the pointer is
 * declared __thread, otherwise it is a plain global.
 */
#ifdef HAVE_TLS
__thread
#endif
struct lxc_conf *current_config;
268
0769b82a
CS
/*
 * Forward declaration so in_caplist() can be called before its definition
 * without reshuffling the whole file.
 */
static int in_caplist(int cap, struct lxc_list *caps);
271
a589434e
JN
272static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
273static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
274static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
275static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
276static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
277static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
278
279static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
280 [LXC_NET_VETH] = instantiate_veth,
281 [LXC_NET_MACVLAN] = instantiate_macvlan,
282 [LXC_NET_VLAN] = instantiate_vlan,
283 [LXC_NET_PHYS] = instantiate_phys,
284 [LXC_NET_EMPTY] = instantiate_empty,
285 [LXC_NET_NONE] = instantiate_none,
0ad19a3f 286};
287
74a2b586
JK
288static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
289static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
290static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
291static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
292static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 293static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586 294
a589434e 295static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
74a2b586
JK
296 [LXC_NET_VETH] = shutdown_veth,
297 [LXC_NET_MACVLAN] = shutdown_macvlan,
298 [LXC_NET_VLAN] = shutdown_vlan,
299 [LXC_NET_PHYS] = shutdown_phys,
300 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 301 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
302};
303
998ac676 304static struct mount_opt mount_opt[] = {
470b359b
CB
305 { "async", 1, MS_SYNCHRONOUS },
306 { "atime", 1, MS_NOATIME },
307 { "bind", 0, MS_BIND },
88d413d5 308 { "defaults", 0, 0 },
88d413d5 309 { "dev", 1, MS_NODEV },
470b359b 310 { "diratime", 1, MS_NODIRATIME },
88d413d5 311 { "dirsync", 0, MS_DIRSYNC },
470b359b 312 { "exec", 1, MS_NOEXEC },
8912711c 313 { "lazytime", 0, MS_LAZYTIME },
88d413d5 314 { "mand", 0, MS_MANDLOCK },
88d413d5 315 { "noatime", 0, MS_NOATIME },
470b359b 316 { "nodev", 0, MS_NODEV },
88d413d5 317 { "nodiratime", 0, MS_NODIRATIME },
470b359b
CB
318 { "noexec", 0, MS_NOEXEC },
319 { "nomand", 1, MS_MANDLOCK },
320 { "norelatime", 1, MS_RELATIME },
321 { "nostrictatime", 1, MS_STRICTATIME },
322 { "nosuid", 0, MS_NOSUID },
88d413d5
SW
323 { "rbind", 0, MS_BIND|MS_REC },
324 { "relatime", 0, MS_RELATIME },
470b359b
CB
325 { "remount", 0, MS_REMOUNT },
326 { "ro", 0, MS_RDONLY },
327 { "rw", 1, MS_RDONLY },
88d413d5 328 { "strictatime", 0, MS_STRICTATIME },
470b359b
CB
329 { "suid", 1, MS_NOSUID },
330 { "sync", 0, MS_SYNCHRONOUS },
88d413d5 331 { NULL, 0, 0 },
998ac676
RT
332};
333
e37dda71 334#if HAVE_LIBCAP
81810dd1 335static struct caps_opt caps_opt[] = {
a6afdde9 336 { "chown", CAP_CHOWN },
1e11be34
DL
337 { "dac_override", CAP_DAC_OVERRIDE },
338 { "dac_read_search", CAP_DAC_READ_SEARCH },
339 { "fowner", CAP_FOWNER },
340 { "fsetid", CAP_FSETID },
81810dd1
DL
341 { "kill", CAP_KILL },
342 { "setgid", CAP_SETGID },
343 { "setuid", CAP_SETUID },
344 { "setpcap", CAP_SETPCAP },
345 { "linux_immutable", CAP_LINUX_IMMUTABLE },
346 { "net_bind_service", CAP_NET_BIND_SERVICE },
347 { "net_broadcast", CAP_NET_BROADCAST },
348 { "net_admin", CAP_NET_ADMIN },
349 { "net_raw", CAP_NET_RAW },
350 { "ipc_lock", CAP_IPC_LOCK },
351 { "ipc_owner", CAP_IPC_OWNER },
352 { "sys_module", CAP_SYS_MODULE },
353 { "sys_rawio", CAP_SYS_RAWIO },
354 { "sys_chroot", CAP_SYS_CHROOT },
355 { "sys_ptrace", CAP_SYS_PTRACE },
356 { "sys_pacct", CAP_SYS_PACCT },
357 { "sys_admin", CAP_SYS_ADMIN },
358 { "sys_boot", CAP_SYS_BOOT },
359 { "sys_nice", CAP_SYS_NICE },
360 { "sys_resource", CAP_SYS_RESOURCE },
361 { "sys_time", CAP_SYS_TIME },
362 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
363 { "mknod", CAP_MKNOD },
364 { "lease", CAP_LEASE },
57b837e2
CB
365#ifdef CAP_AUDIT_READ
366 { "audit_read", CAP_AUDIT_READ },
367#endif
9527e566 368#ifdef CAP_AUDIT_WRITE
81810dd1 369 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
370#endif
371#ifdef CAP_AUDIT_CONTROL
81810dd1 372 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 373#endif
81810dd1
DL
374 { "setfcap", CAP_SETFCAP },
375 { "mac_override", CAP_MAC_OVERRIDE },
376 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
377#ifdef CAP_SYSLOG
378 { "syslog", CAP_SYSLOG },
379#endif
380#ifdef CAP_WAKE_ALARM
381 { "wake_alarm", CAP_WAKE_ALARM },
382#endif
2b54359b
CB
383#ifdef CAP_BLOCK_SUSPEND
384 { "block_suspend", CAP_BLOCK_SUSPEND },
385#endif
81810dd1 386};
495d2046
SG
387#else
388static struct caps_opt caps_opt[] = {};
389#endif
81810dd1 390
c6d09e15
WB
391static struct limit_opt limit_opt[] = {
392#ifdef RLIMIT_AS
393 { "as", RLIMIT_AS },
394#endif
395#ifdef RLIMIT_CORE
396 { "core", RLIMIT_CORE },
397#endif
398#ifdef RLIMIT_CPU
399 { "cpu", RLIMIT_CPU },
400#endif
401#ifdef RLIMIT_DATA
402 { "data", RLIMIT_DATA },
403#endif
404#ifdef RLIMIT_FSIZE
405 { "fsize", RLIMIT_FSIZE },
406#endif
407#ifdef RLIMIT_LOCKS
408 { "locks", RLIMIT_LOCKS },
409#endif
410#ifdef RLIMIT_MEMLOCK
411 { "memlock", RLIMIT_MEMLOCK },
412#endif
413#ifdef RLIMIT_MSGQUEUE
414 { "msgqueue", RLIMIT_MSGQUEUE },
415#endif
416#ifdef RLIMIT_NICE
417 { "nice", RLIMIT_NICE },
418#endif
419#ifdef RLIMIT_NOFILE
420 { "nofile", RLIMIT_NOFILE },
421#endif
422#ifdef RLIMIT_NPROC
423 { "nproc", RLIMIT_NPROC },
424#endif
425#ifdef RLIMIT_RSS
426 { "rss", RLIMIT_RSS },
427#endif
428#ifdef RLIMIT_RTPRIO
429 { "rtprio", RLIMIT_RTPRIO },
430#endif
431#ifdef RLIMIT_RTTIME
432 { "rttime", RLIMIT_RTTIME },
433#endif
434#ifdef RLIMIT_SIGPENDING
435 { "sigpending", RLIMIT_SIGPENDING },
436#endif
437#ifdef RLIMIT_STACK
438 { "stack", RLIMIT_STACK },
439#endif
440};
441
91c3830e
SH
442static int run_buffer(char *buffer)
443{
ebec9176 444 struct lxc_popen_FILE *f;
91c3830e 445 char *output;
8e7da691 446 int ret;
91c3830e 447
ebec9176 448 f = lxc_popen(buffer);
91c3830e 449 if (!f) {
062b72c6 450 SYSERROR("Failed to popen() %s.", buffer);
91c3830e
SH
451 return -1;
452 }
453
454 output = malloc(LXC_LOG_BUFFER_SIZE);
455 if (!output) {
062b72c6 456 ERROR("Failed to allocate memory for %s.", buffer);
ebec9176 457 lxc_pclose(f);
91c3830e
SH
458 return -1;
459 }
460
062b72c6
CB
461 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
462 DEBUG("Script %s with output: %s.", buffer, output);
91c3830e
SH
463
464 free(output);
465
ebec9176 466 ret = lxc_pclose(f);
8e7da691 467 if (ret == -1) {
062b72c6 468 SYSERROR("Script exited with error.");
91c3830e 469 return -1;
8e7da691 470 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
062b72c6 471 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
8e7da691
DE
472 return -1;
473 } else if (WIFSIGNALED(ret)) {
062b72c6 474 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
8e7da691 475 return -1;
91c3830e
SH
476 }
477
478 return 0;
479}
480
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it with run_buffer().
 *
 * @argsin may be NULL; otherwise it is a NULL-terminated array of extra
 * arguments appended in order. @lxcpath is accepted but not used here.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	size_t needed;
	char **arg;
	char *cmd;
	int used;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* script, name and section plus two blanks and the trailing NUL */
	needed = strlen(script) + strlen(name) + strlen(section) + 3;

	/* the hook name and the blank in front of it */
	needed += strlen(hook) + 1;

	/* every extra argument and the blank in front of it */
	for (arg = argsin; arg && *arg; arg++)
		needed += strlen(*arg) + 1;

	if (needed > INT_MAX)
		return -1;

	cmd = alloca(needed);
	if (!cmd) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(cmd, needed, "%s %s %s %s", script, name, section, hook);
	if (used < 0 || (size_t)used >= needed) {
		ERROR("Script name too long.");
		return -1;
	}

	for (arg = argsin; arg && *arg; arg++) {
		int room = needed - used;
		int written = snprintf(cmd + used, room, " %s", *arg);

		if (written < 0 || written >= room) {
			ERROR("Script args too long.");
			return -1;
		}
		used += written;
	}

	return run_buffer(cmd);
}
531
062b72c6
CB
/*
 * Build the command line "<script> <name> <section> [args...]" and execute it
 * with run_buffer().
 *
 * The variadic arguments are extra command-line words of type char * and the
 * list MUST be terminated by a NULL pointer.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the arguments: each one needs its length plus a blank. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two blanks between the three fixed words and the trailing NUL */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = alloca(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		return -1;
	}

	/* Second pass: append the arguments to the command line. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* every va_start() must be paired with va_end() before returning */
			va_end(ap);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	return run_buffer(buffer);
}
583
0c547523
SH
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * Returns -1 on error.
 * Returns -2 if nothing needed to be pinned (rootfs is NULL or empty, cannot
 * be resolved by realpath(), or is not a directory).
 * Returns an open fd (>= 0) if we pinned it; the caller owns the descriptor.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	/* snprintf() may also fail outright, not only truncate */
	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return -1;

	/* The open descriptor keeps the fs busy; the name is no longer needed. */
	(void)unlink(absrootfspin);
	return fd;
}
626
e2a7e8dc
SH
/*
 * If we are asking to remount something, make sure that any NOSUID, NODEV,
 * RDONLY and NOEXEC restrictions reported by statvfs() are kept.
 *
 * The filesystem inspected is @s, or @d when @s is NULL. @flags is returned
 * unchanged when MS_REMOUNT is not set, when both paths are NULL, when
 * statvfs() fails, or when the build has no statvfs() support.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *target = s ? s : d;
	struct statvfs sb;
	size_t idx;

	if ((flags & MS_REMOUNT) == 0 || target == NULL)
		return flags;

	if (statvfs(target, &sb) < 0)
		return flags;

	for (idx = 0; idx < sizeof(sticky) / sizeof(sticky[0]); idx++) {
		if (sb.f_flag & sticky[idx])
			flags |= sticky[idx];
	}

	return flags;
#else
	return flags;
#endif
}
663
4fb3cba5 664static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 665{
368bbc02 666 int r;
80e80c40 667 int i;
b06b8511
CS
668 static struct {
669 int match_mask;
670 int match_flag;
671 const char *source;
672 const char *destination;
673 const char *fstype;
674 unsigned long flags;
675 const char *options;
676 } default_mounts[] = {
677 /* Read-only bind-mounting... In older kernels, doing that required
678 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
679 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
680 * kernel 2.6.26 onwards. However, this apparently does not work on
681 * kernel 3.8. Unfortunately, on that very same kernel, doing the
682 * same trick as above doesn't seem to work either, there one needs
683 * to ALSO specify MS_BIND for the remount, otherwise the entire
684 * fs is remounted read-only or the mount fails because it's busy...
685 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
686 * 2.6.32...
368bbc02 687 */
f24a52d5 688 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
592fd47a
SH
689 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
690 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
f24a52d5
SG
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
592fd47a 693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
f24a52d5
SG
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
697 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
705 { 0, 0, NULL, NULL, NULL, 0, NULL }
b06b8511 706 };
368bbc02 707
b06b8511
CS
708 for (i = 0; default_mounts[i].match_mask; i++) {
709 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
710 char *source = NULL;
711 char *destination = NULL;
712 int saved_errno;
e2a7e8dc 713 unsigned long mflags;
b06b8511
CS
714
715 if (default_mounts[i].source) {
716 /* will act like strdup if %r is not present */
8ede5f4c 717 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
b06b8511
CS
718 if (!source) {
719 SYSERROR("memory allocation error");
720 return -1;
721 }
722 }
cc4fd506
SH
723 if (!default_mounts[i].destination) {
724 ERROR("BUG: auto mounts destination %d was NULL", i);
b2f44b4d 725 free(source);
cc4fd506
SH
726 return -1;
727 }
728 /* will act like strdup if %r is not present */
729 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
730 if (!destination) {
731 saved_errno = errno;
732 SYSERROR("memory allocation error");
733 free(source);
734 errno = saved_errno;
735 return -1;
b06b8511 736 }
e2a7e8dc
SH
737 mflags = add_required_remount_flags(source, destination,
738 default_mounts[i].flags);
592fd47a 739 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
b06b8511 740 saved_errno = errno;
b88ff9a0
SG
741 if (r < 0 && errno == ENOENT) {
742 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
743 r = 0;
744 }
745 else if (r < 0)
e2a7e8dc 746 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
f24a52d5 747
b06b8511
CS
748 free(source);
749 free(destination);
750 if (r < 0) {
b06b8511
CS
751 errno = saved_errno;
752 return -1;
753 }
368bbc02 754 }
368bbc02
CS
755 }
756
b06b8511 757 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
758 int cg_flags;
759
760 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
761 /* If the type of cgroup mount was not specified, it depends on the
762 * container's capabilities as to what makes sense: if we have
763 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
764 * anyway, so we may as well default to read-write; then the admin
765 * will not be given a false sense of security. (And if they really
766 * want mixed r/o r/w, then they can explicitly specify :mixed.)
767 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
768 * :mixed, because then the container can't remount it read-write. */
769 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
770 int has_sys_admin = 0;
b0ee5983
CB
771
772 if (!lxc_list_empty(&conf->keepcaps))
0769b82a 773 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
b0ee5983 774 else
0769b82a 775 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
b0ee5983
CB
776
777 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
0769b82a 778 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
b0ee5983 779 else
0769b82a 780 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
0769b82a
CS
781 }
782
8ede5f4c 783 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
368bbc02 784 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 785 return -1;
368bbc02
CS
786 }
787 }
788
368bbc02 789 return 0;
368bbc02
CS
790}
791
/*
 * Set the hostname to utsname->nodename.
 *
 * Returns 0 on success or when @utsname is NULL (nothing to do), -1 if
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
806
69aa6655
DE
/* One symlink to create below the container's /dev directory. */
struct dev_symlinks {
	const char *oldpath;	/* what the symlink points to */
	const char *name;	/* name of the link below /dev */
};

/* The standard /dev links into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
818
819static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
820{
821 char path[MAXPATHLEN];
822 int ret,i;
09227be2 823 struct stat s;
69aa6655
DE
824
825
826 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
827 const struct dev_symlinks *d = &dev_symlinks[i];
ec50007f 828 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
69aa6655
DE
829 if (ret < 0 || ret >= MAXPATHLEN)
830 return -1;
09227be2
MW
831
832 /*
833 * Stat the path first. If we don't get an error
834 * accept it as is and don't try to create it
835 */
836 if (!stat(path, &s)) {
837 continue;
838 }
839
69aa6655 840 ret = symlink(d->oldpath, path);
09227be2 841
69aa6655 842 if (ret && errno != EEXIST) {
09227be2
MW
843 if ( errno == EROFS ) {
844 WARN("Warning: Read Only file system while creating %s", path);
845 } else {
846 SYSERROR("Error creating %s", path);
847 return -1;
848 }
69aa6655
DE
849 }
850 }
851 return 0;
852}
853
393903d1
SH
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * When *pp is NULL a new string "container_ttys=<name>" is allocated;
 * otherwise " <name>" is appended to the existing string, which is grown with
 * realloc(). Returns true on success and false on allocation failure, in
 * which case *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefixlen = sizeof(prefix) - 1;
	const size_t namelen = strlen(name);
	size_t oldlen;
	char *buf;

	if (*pp == NULL) {
		buf = malloc(namelen + prefixlen + 1);
		if (buf == NULL)
			return false;

		memcpy(buf, prefix, prefixlen);
		memcpy(buf + prefixlen, name, namelen + 1);
		*pp = buf;
		return true;
	}

	oldlen = strlen(*pp);
	/* old text + blank + name + NUL */
	buf = realloc(*pp, oldlen + namelen + 2);
	if (buf == NULL)
		return false;

	buf[oldlen] = ' ';
	memcpy(buf + oldlen + 1, name, namelen + 1);
	*pp = buf;
	return true;
}
876
9e1045e3 877static int lxc_setup_tty(struct lxc_conf *conf)
393903d1 878{
9e1045e3 879 int i, ret;
393903d1
SH
880 const struct lxc_tty_info *tty_info = &conf->tty_info;
881 char *ttydir = conf->ttydir;
7c6ef2a2 882 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
b0a33c1e 883
e8bd4e43 884 if (!conf->rootfs.path)
bc9bd0e3
DL
885 return 0;
886
b0a33c1e 887 for (i = 0; i < tty_info->nbtty; i++) {
b0a33c1e 888 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
889
e8bd4e43 890 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
9e1045e3 891 if (ret < 0 || (size_t)ret >= sizeof(path)) {
7c6ef2a2
SH
892 ERROR("pathname too long for ttys");
893 return -1;
894 }
9e1045e3 895
7c6ef2a2
SH
896 if (ttydir) {
897 /* create dev/lxc/tty%d" */
9e1045e3
CB
898 ret = snprintf(lxcpath, sizeof(lxcpath),
899 "/dev/%s/tty%d", ttydir, i + 1);
900 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
7c6ef2a2
SH
901 ERROR("pathname too long for ttys");
902 return -1;
903 }
9e1045e3 904
7c6ef2a2 905 ret = creat(lxcpath, 0660);
9e1045e3
CB
906 if (ret < 0 && errno != EEXIST) {
907 SYSERROR("failed to create \"%s\"", lxcpath);
7c6ef2a2
SH
908 return -1;
909 }
4d44e274
SH
910 if (ret >= 0)
911 close(ret);
9e1045e3 912
7c6ef2a2 913 ret = unlink(path);
9e1045e3
CB
914 if (ret < 0 && errno != ENOENT) {
915 SYSERROR("failed to unlink \"%s\"", path);
7c6ef2a2
SH
916 return -1;
917 }
b0a33c1e 918
9e1045e3
CB
919 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
920 if (ret < 0) {
921 WARN("failed to bind mount \"%s\" onto \"%s\"",
7c6ef2a2
SH
922 pty_info->name, path);
923 continue;
924 }
9e1045e3
CB
925 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
926 path);
13954cce 927
9e1045e3
CB
928 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
929 ttydir, i + 1);
930 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
9ba8130c
SH
931 ERROR("tty pathname too long");
932 return -1;
933 }
9e1045e3 934
7c6ef2a2 935 ret = symlink(lxcpath, path);
9e1045e3
CB
936 if (ret < 0) {
937 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
938 path, lxcpath);
7c6ef2a2
SH
939 return -1;
940 }
941 } else {
9e1045e3
CB
942 /* If we populated /dev, then we need to create
943 * /dev/ttyN
944 */
945 ret = access(path, F_OK);
946 if (ret < 0) {
c6883f38 947 ret = creat(path, 0660);
9e1045e3
CB
948 if (ret < 0) {
949 SYSERROR("failed to create \"%s\"", path);
c6883f38 950 /* this isn't fatal, continue */
025ed0f3 951 } else {
c6883f38 952 close(ret);
025ed0f3 953 }
c6883f38 954 }
9e1045e3
CB
955
956 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
957 if (ret < 0) {
e8bd4e43 958 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
7c6ef2a2
SH
959 continue;
960 }
9e1045e3
CB
961
962 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
963 path);
393903d1 964 }
9e1045e3 965
e8bd4e43 966 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
393903d1
SH
967 ERROR("Error setting up container_ttys string");
968 return -1;
b0a33c1e 969 }
970 }
971
9e1045e3 972 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
b0a33c1e 973 return 0;
974}
975
59bb8698 976static int setup_rootfs_pivot_root(const char *rootfs)
bf601689 977{
2d489f9e 978 int oldroot = -1, newroot = -1;
bf601689 979
2d489f9e
SH
980 oldroot = open("/", O_DIRECTORY | O_RDONLY);
981 if (oldroot < 0) {
982 SYSERROR("Error opening old-/ for fchdir");
9ba8130c
SH
983 return -1;
984 }
2d489f9e
SH
985 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
986 if (newroot < 0) {
987 SYSERROR("Error opening new-/ for fchdir");
988 goto fail;
c08556c6 989 }
bf601689 990
cc6f6dd7 991 /* change into new root fs */
2d489f9e 992 if (fchdir(newroot)) {
cc6f6dd7 993 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
2d489f9e 994 goto fail;
cc6f6dd7
DL
995 }
996
cc6f6dd7 997 /* pivot_root into our new root fs */
2d489f9e 998 if (pivot_root(".", ".")) {
cc6f6dd7 999 SYSERROR("pivot_root syscall failed");
2d489f9e 1000 goto fail;
bf601689 1001 }
cc6f6dd7 1002
2d489f9e
SH
1003 /*
1004 * at this point the old-root is mounted on top of our new-root
1005 * To unmounted it we must not be chdir'd into it, so escape back
1006 * to old-root
1007 */
1008 if (fchdir(oldroot) < 0) {
1009 SYSERROR("Error entering oldroot");
1010 goto fail;
1011 }
7981ea46 1012 if (umount2(".", MNT_DETACH) < 0) {
2d489f9e
SH
1013 SYSERROR("Error detaching old root");
1014 goto fail;
cc6f6dd7
DL
1015 }
1016
2d489f9e
SH
1017 if (fchdir(newroot) < 0) {
1018 SYSERROR("Error re-entering newroot");
1019 goto fail;
1020 }
cc6f6dd7 1021
2d489f9e
SH
1022 close(oldroot);
1023 close(newroot);
bf601689 1024
2d489f9e 1025 DEBUG("pivot_root syscall to '%s' successful", rootfs);
bf601689 1026
bf601689 1027 return 0;
2d489f9e
SH
1028
1029fail:
1030 if (oldroot != -1)
1031 close(oldroot);
1032 if (newroot != -1)
1033 close(newroot);
1034 return -1;
bf601689
MH
1035}
1036
bc6928ff 1037/*
87da4ec3
SH
1038 * Just create a path for /dev under $lxcpath/$name and in rootfs
1039 * If we hit an error, log it but don't fail yet.
91c3830e 1040 */
14221cbb 1041static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
91c3830e
SH
1042{
1043 int ret;
87da4ec3
SH
1044 size_t clen;
1045 char *path;
91c3830e 1046
14221cbb 1047 INFO("Mounting container /dev");
bc6928ff 1048
14221cbb 1049 /* $(rootfs->mount) + "/dev/pts" + '\0' */
ec50007f 1050 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
87da4ec3 1051 path = alloca(clen);
bc6928ff 1052
ec50007f 1053 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
87da4ec3 1054 if (ret < 0 || ret >= clen)
91c3830e 1055 return -1;
bc6928ff 1056
87da4ec3 1057 if (!dir_exists(path)) {
14221cbb 1058 WARN("No /dev in container.");
87da4ec3
SH
1059 WARN("Proceeding without autodev setup");
1060 return 0;
bc6928ff 1061 }
87da4ec3 1062
1ec0e8e3 1063 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
ec50007f 1064 rootfs->path ? rootfs->mount : NULL);
1ec0e8e3 1065 if (ret != 0) {
87da4ec3 1066 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1ec0e8e3 1067 return -1;
91c3830e 1068 }
87da4ec3
SH
1069
1070 INFO("Mounted tmpfs onto %s", path);
1071
ec50007f 1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
87da4ec3 1073 if (ret < 0 || ret >= clen)
91c3830e 1074 return -1;
87da4ec3 1075
bc6928ff
MW
1076 /*
1077 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1078 * If not, then create it and exit if that fails...
1079 */
87da4ec3 1080 if (!dir_exists(path)) {
bc6928ff
MW
1081 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1082 if (ret) {
1083 SYSERROR("Failed to create /dev/pts in container");
1084 return -1;
1085 }
91c3830e
SH
1086 }
1087
14221cbb 1088 INFO("Mounted container /dev");
91c3830e
SH
1089 return 0;
1090}
1091
/* One device node that lxc_fill_autodev() places under the container's /dev. */
struct lxc_devs {
	const char *name;	/* node name below /dev */
	mode_t mode;		/* file type and permission bits passed to mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* Character devices created in an autodev /dev; all requested with mode 0777. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | 0777, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | 0777, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | 0777, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | 0777, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | 0777, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | 0777, .maj = 5, .min = 0 },
};
1107
27245ff7 1108static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
c6883f38
SH
1109{
1110 int ret;
c6883f38
SH
1111 char path[MAXPATHLEN];
1112 int i;
3a32201c 1113 mode_t cmask;
c6883f38 1114
ec50007f 1115 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
91c3830e
SH
1116 if (ret < 0 || ret >= MAXPATHLEN) {
1117 ERROR("Error calculating container /dev location");
c6883f38 1118 return -1;
f7bee6c6 1119 }
91c3830e 1120
0bbf8572
CB
1121 /* ignore, just don't try to fill in */
1122 if (!dir_exists(path))
9cb4d183
SH
1123 return 0;
1124
0bbf8572 1125 INFO("populating container /dev");
3a32201c 1126 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1127 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1128 const struct lxc_devs *d = &lxc_devs[i];
0728ebf4 1129
ec50007f 1130 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
c6883f38
SH
1131 if (ret < 0 || ret >= MAXPATHLEN)
1132 return -1;
0bbf8572 1133
c6883f38 1134 ret = mknod(path, d->mode, makedev(d->maj, d->min));
0bbf8572 1135 if (ret < 0) {
9cb4d183
SH
1136 char hostpath[MAXPATHLEN];
1137 FILE *pathfile;
1138
0bbf8572
CB
1139 if (errno == EEXIST) {
1140 DEBUG("\"%s\" device already existed", path);
1141 continue;
1142 }
1143
1144 /* Unprivileged containers cannot create devices, so
1145 * bind mount the device from the host.
1146 */
9cb4d183
SH
1147 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1148 if (ret < 0 || ret >= MAXPATHLEN)
1149 return -1;
1150 pathfile = fopen(path, "wb");
1151 if (!pathfile) {
1152 SYSERROR("Failed to create device mount target '%s'", path);
1153 return -1;
1154 }
1155 fclose(pathfile);
0bbf8572
CB
1156 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1157 SYSERROR("Failed bind mounting device %s from host into container", d->name);
9cb4d183
SH
1158 return -1;
1159 }
0bbf8572
CB
1160 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1161 } else {
1162 DEBUG("created device node \"%s\"", path);
c6883f38
SH
1163 }
1164 }
3a32201c 1165 umask(cmask);
c6883f38 1166
0bbf8572 1167 INFO("populated container /dev");
c6883f38
SH
1168 return 0;
1169}
1170
9aa76a17 1171static int lxc_setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1172{
9aa76a17 1173 int ret;
91c3e281
CB
1174 struct bdev *bdev;
1175 const struct lxc_rootfs *rootfs;
cc28d0b0 1176
91c3e281 1177 rootfs = &conf->rootfs;
a0f379bf 1178 if (!rootfs->path) {
91c3e281
CB
1179 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1180 SYSERROR("Failed to make / rslave.");
a0f379bf
DW
1181 return -1;
1182 }
c69bd12f 1183 return 0;
a0f379bf 1184 }
0ad19a3f 1185
12297168 1186 if (access(rootfs->mount, F_OK)) {
91c3e281 1187 SYSERROR("Failed to access to \"%s\". Check it is present.",
12297168 1188 rootfs->mount);
b1789442
DL
1189 return -1;
1190 }
1191
91c3e281 1192 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9aa76a17
CB
1193 if (!bdev) {
1194 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
91c3e281
CB
1195 rootfs->path, rootfs->mount,
1196 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1197 return -1;
9be53773 1198 }
9aa76a17
CB
1199
1200 ret = bdev->ops->mount(bdev);
1201 bdev_put(bdev);
1202 if (ret < 0) {
91c3e281
CB
1203 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1204 rootfs->path, rootfs->mount,
1205 rootfs->options ? rootfs->options : "(null)");
c3f0a28c 1206 return -1;
1207 }
0ad19a3f 1208
91c3e281
CB
1209 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1210 rootfs->path, rootfs->mount,
1211 rootfs->options ? rootfs->options : "(null)");
9aa76a17 1212
ac778708
DL
1213 return 0;
1214}
1215
91e93c71
AV
1216int prepare_ramfs_root(char *root)
1217{
eab15c1e 1218 char buf[LXC_LINELEN], *p;
91e93c71
AV
1219 char nroot[PATH_MAX];
1220 FILE *f;
1221 int i;
1222 char *p2;
1223
1224 if (realpath(root, nroot) == NULL)
39c7b795 1225 return -errno;
91e93c71
AV
1226
1227 if (chdir("/") == -1)
39c7b795 1228 return -errno;
91e93c71
AV
1229
1230 /*
1231 * We could use here MS_MOVE, but in userns this mount is
1232 * locked and can't be moved.
1233 */
39c7b795 1234 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
91e93c71 1235 SYSERROR("Failed to move %s into /", root);
39c7b795 1236 return -errno;
91e93c71
AV
1237 }
1238
39c7b795 1239 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
91e93c71 1240 SYSERROR("Failed to make . rprivate");
39c7b795 1241 return -errno;
91e93c71
AV
1242 }
1243
1244 /*
1245 * The following code cleans up inhereted mounts which are not
1246 * required for CT.
1247 *
1248 * The mountinfo file shows not all mounts, if a few points have been
1249 * unmounted between read operations from the mountinfo. So we need to
1250 * read mountinfo a few times.
1251 *
1252 * This loop can be skipped if a container uses unserns, because all
1253 * inherited mounts are locked and we should live with all this trash.
1254 */
1255 while (1) {
1256 int progress = 0;
1257
1258 f = fopen("./proc/self/mountinfo", "r");
1259 if (!f) {
1260 SYSERROR("Unable to open /proc/self/mountinfo");
1261 return -1;
1262 }
eab15c1e 1263 while (fgets(buf, LXC_LINELEN, f)) {
91e93c71
AV
1264 for (p = buf, i=0; p && i < 4; i++)
1265 p = strchr(p+1, ' ');
1266 if (!p)
1267 continue;
1268 p2 = strchr(p+1, ' ');
1269 if (!p2)
1270 continue;
1271
1272 *p2 = '\0';
1273 *p = '.';
1274
1275 if (strcmp(p + 1, "/") == 0)
1276 continue;
1277 if (strcmp(p + 1, "/proc") == 0)
1278 continue;
1279
1280 if (umount2(p, MNT_DETACH) == 0)
1281 progress++;
1282 }
1283 fclose(f);
1284 if (!progress)
1285 break;
1286 }
1287
8bea9fae
PR
1288 /* This also can be skipped if a container uses unserns */
1289 umount2("./proc", MNT_DETACH);
91e93c71
AV
1290
1291 /* It is weird, but chdir("..") moves us in a new root */
1292 if (chdir("..") == -1) {
1293 SYSERROR("Unable to change working directory");
1294 return -1;
1295 }
1296
1297 if (chroot(".") == -1) {
1298 SYSERROR("Unable to chroot");
1299 return -1;
1300 }
1301
1302 return 0;
1303}
1304
74a3920a 1305static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1306{
39c7b795
CB
1307 if (!rootfs->path) {
1308 DEBUG("container does not have a rootfs, so not doing pivot root");
ac778708 1309 return 0;
39c7b795 1310 }
ac778708 1311
91e93c71 1312 if (detect_ramfs_rootfs()) {
39c7b795
CB
1313 DEBUG("detected that container is on ramfs");
1314 if (prepare_ramfs_root(rootfs->mount)) {
1315 ERROR("failed to prepare minimal ramfs root");
91e93c71 1316 return -1;
39c7b795
CB
1317 }
1318
1319 DEBUG("prepared ramfs root for container");
1320 return 0;
1321 }
1322
1323 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1324 ERROR("failed to pivot root");
25368b52 1325 return -1;
c69bd12f
DL
1326 }
1327
39c7b795 1328 DEBUG("finished pivot root");
25368b52 1329 return 0;
0ad19a3f 1330}
1331
/*
 * Mount a new devpts instance on /dev/pts limited to @num_pts ptys and make
 * /dev/ptmx refer to its ptmx node.
 *
 * /dev/ptmx is preferably a bind mount of /dev/pts/ptmx; if the bind mount
 * fails a symlink is created instead. With @num_pts == 0 nothing is done.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	int rc;
	char mntopts[256];
	const char *base_opts = "newinstance,ptmxmode=0666,mode=0620,gid=5";

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	rc = snprintf(mntopts, sizeof(mntopts), "%s,max=%d", base_opts, num_pts);
	if (rc < 0 || (size_t)rc >= sizeof(mntopts))
		return -1;

	/* Unmount old devpts instance. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Create mountpoint for devpts instance. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount new devpts instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", mntopts);

	/* Remove any pre-existing /dev/ptmx file. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	rc = open("/dev/ptmx", O_CREAT, 0666);
	if (rc < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(rc);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: fall through and try a symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* Remove the dummy /dev/ptmx file we created above. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1422
/*
 * Apply the execution domain @persona with personality().
 *
 * A value of -1 means "leave unchanged". When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H is false) this is a no-op.
 *
 * Returns 0 on success or when nothing was done, -1 on failure.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1439
3d7d929a
CB
1440static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1441 const struct lxc_console *console)
6e590161 1442{
63376d7d 1443 char path[MAXPATHLEN];
0728ebf4 1444 int ret, fd;
52e35957 1445
8b1b1210
CB
1446 if (console->path && !strcmp(console->path, "none"))
1447 return 0;
1448
7c6ef2a2 1449 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
3d7d929a 1450 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1451 return -1;
52e35957 1452
8b1b1210
CB
1453 /* When we are asked to setup a console we remove any previous
1454 * /dev/console bind-mounts.
1455 */
a7ba3c7f
CB
1456 if (file_exists(path)) {
1457 ret = lxc_unstack_mountpoint(path, false);
1458 if (ret < 0) {
8b1b1210 1459 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1460 return -ret;
1461 } else {
1462 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1463 }
953fe44f 1464
a7ba3c7f
CB
1465 ret = unlink(path);
1466 if (ret < 0) {
1467 SYSERROR("error unlinking %s", path);
8b1b1210
CB
1468 return -errno;
1469 }
8b1b1210
CB
1470 }
1471
1472 /* For unprivileged containers autodev or automounts will already have
1473 * taken care of creating /dev/console.
1474 */
0728ebf4
TA
1475 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1476 if (fd < 0) {
1477 if (errno != EEXIST) {
1478 SYSERROR("failed to create console");
3d7d929a 1479 return -errno;
0728ebf4
TA
1480 }
1481 } else {
1482 close(fd);
52e35957
DL
1483 }
1484
0728ebf4 1485 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
3d7d929a
CB
1486 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1487 return -errno;
63376d7d 1488 }
13954cce 1489
3d7d929a 1490 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
63376d7d 1491 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1492 return -1;
1493 }
1494
3d7d929a 1495 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
7c6ef2a2
SH
1496 return 0;
1497}
1498
3d7d929a
CB
1499static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1500 const struct lxc_console *console,
1501 char *ttydir)
7c6ef2a2 1502{
7c6ef2a2 1503 int ret;
3d7d929a 1504 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
7c6ef2a2
SH
1505
1506 /* create rootfs/dev/<ttydir> directory */
3d7d929a
CB
1507 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1508 if (ret < 0 || (size_t)ret >= sizeof(path))
7c6ef2a2 1509 return -1;
3d7d929a 1510
7c6ef2a2
SH
1511 ret = mkdir(path, 0755);
1512 if (ret && errno != EEXIST) {
959aee9c 1513 SYSERROR("failed with errno %d to create %s", errno, path);
3d7d929a 1514 return -errno;
7c6ef2a2 1515 }
3d7d929a 1516 DEBUG("created directory for console and tty devices at \%s\"", path);
7c6ef2a2 1517
3d7d929a
CB
1518 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1519 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1520 return -1;
1521
7c6ef2a2 1522 ret = creat(lxcpath, 0660);
3d7d929a 1523 if (ret == -1 && errno != EEXIST) {
959aee9c 1524 SYSERROR("error %d creating %s", errno, lxcpath);
3d7d929a 1525 return -errno;
7c6ef2a2 1526 }
4d44e274
SH
1527 if (ret >= 0)
1528 close(ret);
7c6ef2a2 1529
2a12fefd
CB
1530 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1531 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
7c6ef2a2 1532 return -1;
2a12fefd
CB
1533
1534 /* When we are asked to setup a console we remove any previous
1535 * /dev/console bind-mounts.
1536 */
1537 if (console->path && !strcmp(console->path, "none")) {
1538 struct stat st;
1539 ret = stat(path, &st);
1540 if (ret < 0) {
1541 if (errno == ENOENT)
1542 return 0;
1543 SYSERROR("failed stat() \"%s\"", path);
1544 return -errno;
1545 }
1546
1547 /* /dev/console must be character device with major number 5 and
1548 * minor number 1. If not, give benefit of the doubt and assume
1549 * the user has mounted something else right there on purpose.
1550 */
1551 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1552 return 0;
1553
1554 /* In case the user requested a bind-mount for /dev/console and
1555 * requests a ttydir we move the mount to the
a7ba3c7f
CB
1556 * /dev/<ttydir/console.
1557 * Note, we only move the uppermost mount and clear all other
1558 * mounts underneath for safety.
1559 * If it is a character device created via mknod() we simply
1560 * rename it.
2a12fefd
CB
1561 */
1562 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1563 if (ret < 0) {
1564 if (errno != EINVAL) {
1565 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1566 return -errno;
1567 }
1568 /* path was not a mountpoint */
1569 ret = rename(path, lxcpath);
1570 if (ret < 0) {
1571 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1572 return -errno;
1573 }
1574 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1575 } else {
1576 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1577 }
a7ba3c7f
CB
1578
1579 /* Clear all remaining bind-mounts. */
1580 ret = lxc_unstack_mountpoint(path, false);
2a12fefd 1581 if (ret < 0) {
a7ba3c7f
CB
1582 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1583 return -ret;
1584 } else {
1585 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1586 }
1587 } else {
1588 if (file_exists(path)) {
1589 ret = lxc_unstack_mountpoint(path, false);
1590 if (ret < 0) {
2a12fefd 1591 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
a7ba3c7f
CB
1592 return -ret;
1593 } else {
1594 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
2a12fefd 1595 }
2a12fefd
CB
1596 }
1597
1598 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1599 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1600 return -1;
1601 }
1602 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
7c6ef2a2
SH
1603 }
1604
2a12fefd 1605 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
9ba8130c 1606 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
3d7d929a 1607 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
9ba8130c 1608 return -1;
3d7d929a 1609
2a12fefd
CB
1610 ret = unlink(path);
1611 if (ret && errno != ENOENT) {
1612 SYSERROR("error unlinking %s", path);
1613 return -errno;
1614 }
1615
7c6ef2a2 1616 ret = symlink(lxcpath, path);
3d7d929a
CB
1617 if (ret < 0) {
1618 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
7c6ef2a2
SH
1619 return -1;
1620 }
1621
3d7d929a 1622 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
6e590161 1623 return 0;
1624}
1625
3d7d929a
CB
1626static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1627 const struct lxc_console *console, char *ttydir)
7c6ef2a2 1628{
3d7d929a
CB
1629 /* We don't have a rootfs, /dev/console will be shared. */
1630 if (!rootfs->path) {
1631 DEBUG("/dev/console will be shared with the host");
7c6ef2a2 1632 return 0;
3d7d929a
CB
1633 }
1634
7c6ef2a2 1635 if (!ttydir)
3d7d929a 1636 return lxc_setup_dev_console(rootfs, console);
7c6ef2a2 1637
3d7d929a 1638 return lxc_setup_ttydir_console(rootfs, console, ttydir);
7c6ef2a2
SH
1639}
1640
998ac676
RT
1641static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1642{
1643 struct mount_opt *mo;
1644
1645 /* If opt is found in mount_opt, set or clear flags.
1646 * Otherwise append it to data. */
1647
1648 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1649 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1650 if (mo->clear)
1651 *flags &= ~mo->flag;
1652 else
1653 *flags |= mo->flag;
1654 return;
1655 }
1656 }
1657
1658 if (strlen(*data))
1659 strcat(*data, ",");
1660 strcat(*data, opt);
1661}
1662
/*
 * Split the comma-separated option string @mntopts into mount flags and a
 * data string.
 *
 * On success *@mntflags holds the accumulated flags and *@mntdata is either
 * NULL (no data options) or a malloc()ed string the caller must free.
 * A NULL @mntopts yields zero flags and NULL data.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *data;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the option string. */
	data = malloc(strlen(copy) + 1);
	if (!data) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	*data = 0;

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &data);
		tok = strtok_r(NULL, ",", &state);
	}

	if (*data)
		*mntdata = data;
	else
		free(data);
	free(copy);

	return 0;
}
1701
/*
 * Terminate @word in place at its first space or tab (or leave it unchanged
 * if it contains neither).
 */
static void null_endofword(char *word)
{
	char *end = word + strcspn(word, " \t");

	*end = '\0';
}
1708
/*
 * Skip @nfields blank-separated fields in @src.
 *
 * Every single space or tab counts as one separator. Returns a pointer to
 * the character following the @nfields-th separator, or to the terminating
 * NUL byte if the string ends first.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Jump to the next separator (or the end of the string). */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1726
/*
 * Perform one mount and, for bind mounts and remount requests, a follow-up
 * remount so that the requested flags are actually applied.
 *
 * The first mount is done without MS_REMOUNT. When statvfs() is available,
 * the nosuid/nodev/ro/noexec flags reported for @fsname are added to the
 * remount flags (nodev only if @dev is not set); a plain bind mount whose
 * flags already cover them, and which did not ask for read-only, skips the
 * remount.
 *
 * With @optional set, mount failures are logged and reported as success.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
	unsigned long rqd_flags;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	/* Only bind mounts and explicit remounts need the second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto out;

	DEBUG("remounting %s on %s to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	/* A requested read-only flag always forces the remount. */
	rqd_flags = mountflags & MS_RDONLY;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
		if ((sb.f_flag & MS_NODEV) && !dev)
			required_flags |= MS_NODEV;

		DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

		/*
		 * If this was a bind mount request, and required_flags
		 * does not have any flags which are not already in
		 * mountflags, then skip the remount
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("mountflags already was %lu, skipping remount",
			      mountflags);
			goto out;
		}

		mountflags |= required_flags;
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

out:
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1803
/*
 * Strip the lxc-specific options 'create=dir', 'create=file' and 'optional'
 * from mntent->mnt_opts in place.
 *
 * Only the first occurrence of each option is removed. An option followed by
 * a comma is cut out together with that comma; an option at the end of the
 * string simply truncates the string at the option's start.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* Shift the remaining options (and the NUL) down. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1828
/*
 * Create the mount target @path for @mntent when the entry asks for it.
 *
 * overlay and aufs entries first go through ovl_mkdir()/aufs_mkdir().
 * 'create=dir' creates @path as a directory (with parents); 'create=file'
 * creates the parent directories and an empty file, but only when @path
 * cannot be accessed yet.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int ret = 0;
	int fd;
	char *copy;
	const char *type = mntent->mnt_type;

	if (strncmp(type, "overlay", 7) == 0)
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (strncmp(type, "aufs", 4) == 0)
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	/* Nothing more to do unless a missing file target was requested. */
	if (!hasmntopt(mntent, "create=file") || !access(path, F_OK))
		return 0;

	/* dirname() may modify its argument, so operate on a copy. */
	copy = strdup(path);
	if (!copy)
		return -1;

	ret = mkdir_p(dirname(copy), 0755);
	free(copy);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("Failed to create directory \"%s\"", path);
		return -1;
	}

	fd = open(path, O_CREAT, 0644);
	if (fd < 0)
		return -1;
	close(fd);

	return 0;
}
1877
ec50007f
CB
1878/* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1879 * without a rootfs. */
db4aba38 1880static inline int mount_entry_on_generic(struct mntent *mntent,
d8b712bc
CB
1881 const char *path,
1882 const struct lxc_rootfs *rootfs,
1883 const char *lxc_name,
1884 const char *lxc_path)
4d5b72a1 1885{
d8b712bc 1886 int ret;
4d5b72a1
NC
1887 unsigned long mntflags;
1888 char *mntdata;
d8b712bc 1889 bool dev, optional;
ec50007f 1890 char *rootfs_path = NULL;
d8b712bc
CB
1891
1892 optional = hasmntopt(mntent, "optional") != NULL;
1893 dev = hasmntopt(mntent, "dev") != NULL;
1894
ec50007f
CB
1895 if (rootfs && rootfs->path)
1896 rootfs_path = rootfs->mount;
1897
d8b712bc
CB
1898 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
1899 lxc_path);
1900 if (ret < 0) {
1901 if (optional)
1902 return 0;
608e3567 1903
d8b712bc
CB
1904 return -1;
1905 }
4e4ca161
SH
1906 cull_mntent_opt(mntent);
1907
d8b712bc
CB
1908 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
1909 if (ret < 0)
a17b1e65 1910 return -1;
a17b1e65 1911
6e46cc0d 1912 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
ae7a770e 1913 mntdata, optional, dev, rootfs_path);
68c152ef 1914
911324ef 1915 free(mntdata);
911324ef
DL
1916 return ret;
1917}
1918
/*
 * Mount an entry for a container that has no rootfs.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a leading '/' is added to
 * the target when it is missing.
 *
 * Returns 0 on success, -1 on failure.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	int len;
	char path[MAXPATHLEN];
	const char *lead = (mntent->mnt_dir[0] == '/') ? "" : "/";

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1936
4e4ca161 1937static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2 1938 const struct lxc_rootfs *rootfs,
0a2dddd4
CB
1939 const char *lxc_name,
1940 const char *lxc_path)
911324ef 1941{
bdd2b34c 1942 int offset;
013bd428 1943 char *aux;
67e571de 1944 const char *lxcpath;
bdd2b34c
CB
1945 char path[MAXPATHLEN];
1946 int ret = 0;
0ad19a3f 1947
593e8478 1948 lxcpath = lxc_global_config_value("lxc.lxcpath");
bdd2b34c 1949 if (!lxcpath)
2a59a681 1950 return -1;
2a59a681 1951
bdd2b34c
CB
1952 /* If rootfs->path is a blockdev path, allow container fstab to use
1953 * <lxcpath>/<name>/rootfs" as the target prefix.
1954 */
1955 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1956 if (ret < 0 || ret >= MAXPATHLEN)
80a881b2
SH
1957 goto skipvarlib;
1958
1959 aux = strstr(mntent->mnt_dir, path);
1960 if (aux) {
1961 offset = strlen(path);
1962 goto skipabs;
1963 }
1964
1965skipvarlib:
013bd428
DL
1966 aux = strstr(mntent->mnt_dir, rootfs->path);
1967 if (!aux) {
bdd2b34c 1968 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
db4aba38 1969 return ret;
013bd428 1970 }
80a881b2
SH
1971 offset = strlen(rootfs->path);
1972
1973skipabs:
bdd2b34c
CB
1974 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
1975 if (ret < 0 || ret >= MAXPATHLEN)
a17b1e65 1976 return -1;
a17b1e65 1977
0a2dddd4 1978 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef 1979}
d330fe7b 1980
4e4ca161 1981static int mount_entry_on_relative_rootfs(struct mntent *mntent,
0a2dddd4
CB
1982 const struct lxc_rootfs *rootfs,
1983 const char *lxc_name,
1984 const char *lxc_path)
911324ef
DL
1985{
1986 char path[MAXPATHLEN];
911324ef 1987 int ret;
d330fe7b 1988
34cfffb3 1989 /* relative to root mount point */
6e46cc0d 1990 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1433c9f9 1991 if (ret < 0 || ret >= sizeof(path)) {
9ba8130c
SH
1992 ERROR("path name too long");
1993 return -1;
1994 }
911324ef 1995
0a2dddd4 1996 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
911324ef
DL
1997}
1998
80a881b2 1999static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
1ae3c19f 2000 const char *lxc_name, const char *lxc_path)
911324ef 2001{
aaf901be
AM
2002 struct mntent mntent;
2003 char buf[4096];
911324ef 2004 int ret = -1;
e76b8764 2005
aaf901be 2006 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
1ae3c19f
CB
2007 if (!rootfs->path)
2008 ret = mount_entry_on_systemfs(&mntent);
2009 else if (mntent.mnt_dir[0] != '/')
2010 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2011 lxc_name, lxc_path);
2012 else
2013 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2014 lxc_name, lxc_path);
2015 if (ret < 0)
2016 return -1;
0ad19a3f 2017 }
2018 ret = 0;
cd54d859 2019
1ae3c19f 2020 INFO("Set up mount entries");
e7938e9e
MN
2021 return ret;
2022}
2023
/*
 * Mount the entries listed in the fstab-format file named by @fstab.
 *
 * Returns 0 when @fstab is NULL (nothing to do), -1 when the file cannot be
 * opened, otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fp;
	int rc;

	if (fstab == NULL)
		return 0;

	fp = setmntent(fstab, "r");
	if (fp == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, fp, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(fp);
	return rc;
}
2046
5ef5c9a3 2047FILE *make_anonymous_mount_file(struct lxc_list *mount)
e7938e9e 2048{
5ef5c9a3 2049 int ret;
e7938e9e 2050 char *mount_entry;
5ef5c9a3 2051 struct lxc_list *iterator;
6bd04140 2052 FILE *f;
5ef5c9a3
CB
2053 int fd = -1;
2054
2055 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2056 if (fd < 0) {
2057 if (errno != ENOSYS)
2058 return NULL;
6bd04140
CB
2059 f = tmpfile();
2060 TRACE("Created temporary mount file");
5ef5c9a3 2061 } else {
6bd04140
CB
2062 f = fdopen(fd, "r+");
2063 TRACE("Created anonymous mount file");
5ef5c9a3 2064 }
e7938e9e 2065
6bd04140
CB
2066 if (!f) {
2067 SYSERROR("Could not create mount file");
5ef5c9a3
CB
2068 if (fd != -1)
2069 close(fd);
9fc7f8c0 2070 return NULL;
e7938e9e
MN
2071 }
2072
2073 lxc_list_for_each(iterator, mount) {
2074 mount_entry = iterator->elem;
6bd04140 2075 ret = fprintf(f, "%s\n", mount_entry);
5ef5c9a3 2076 if (ret < strlen(mount_entry))
6bd04140 2077 WARN("Could not write mount entry to mount file");
5ef5c9a3
CB
2078 }
2079
6bd04140
CB
2080 ret = fseek(f, 0, SEEK_SET);
2081 if (ret < 0) {
2082 SYSERROR("Failed to seek mount file");
2083 fclose(f);
5ef5c9a3 2084 return NULL;
e7938e9e
MN
2085 }
2086
6bd04140 2087 return f;
9fc7f8c0
TA
2088}
2089
5ef5c9a3
CB
/*
 * Mount the entries held in the @mount list: the list is first dumped into
 * an anonymous file by make_anonymous_mount_file() and that file is then
 * processed by mount_file_entries().
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *fp = make_anonymous_mount_file(mount);

	if (fp == NULL)
		return -1;

	rc = mount_file_entries(rootfs, fp, lxc_name, lxc_path);
	fclose(fp);

	return rc;
}
2106
bab88e68
CS
2107static int parse_cap(const char *cap)
2108{
2109 char *ptr = NULL;
84760c11 2110 size_t i;
2111 int capid = -1;
bab88e68 2112
7035407c
DE
2113 if (!strcmp(cap, "none"))
2114 return -2;
2115
bab88e68
CS
2116 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2117
2118 if (strcmp(cap, caps_opt[i].name))
2119 continue;
2120
2121 capid = caps_opt[i].value;
2122 break;
2123 }
2124
2125 if (capid < 0) {
2126 /* try to see if it's numeric, so the user may specify
2127 * capabilities that the running kernel knows about but
2128 * we don't */
2129 errno = 0;
2130 capid = strtol(cap, &ptr, 10);
2131 if (!ptr || *ptr != '\0' || errno != 0)
2132 /* not a valid number */
2133 capid = -1;
2134 else if (capid > lxc_caps_last_cap())
2135 /* we have a number but it's not a valid
2136 * capability */
2137 capid = -1;
2138 }
2139
2140 return capid;
2141}
2142
0769b82a
CS
2143int in_caplist(int cap, struct lxc_list *caps)
2144{
2145 struct lxc_list *iterator;
2146 int capid;
2147
2148 lxc_list_for_each(iterator, caps) {
2149 capid = parse_cap(iterator->elem);
2150 if (capid == cap)
2151 return 1;
2152 }
2153
2154 return 0;
2155}
2156
81810dd1
DL
2157static int setup_caps(struct lxc_list *caps)
2158{
2159 struct lxc_list *iterator;
2160 char *drop_entry;
bab88e68 2161 int capid;
81810dd1
DL
2162
2163 lxc_list_for_each(iterator, caps) {
2164
2165 drop_entry = iterator->elem;
2166
bab88e68 2167 capid = parse_cap(drop_entry);
d55bc1ad 2168
81810dd1 2169 if (capid < 0) {
1e11be34
DL
2170 ERROR("unknown capability %s", drop_entry);
2171 return -1;
81810dd1
DL
2172 }
2173
2174 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2175
2176 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2177 SYSERROR("failed to remove %s capability", drop_entry);
2178 return -1;
2179 }
81810dd1
DL
2180
2181 }
2182
1fb86a7c
SH
2183 DEBUG("capabilities have been setup");
2184
2185 return 0;
2186}
2187
2188static int dropcaps_except(struct lxc_list *caps)
2189{
2190 struct lxc_list *iterator;
2191 char *keep_entry;
1fb86a7c
SH
2192 int i, capid;
2193 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2194 INFO("found %d capabilities", numcaps);
1fb86a7c 2195
2caf9a97
SH
2196 if (numcaps <= 0 || numcaps > 200)
2197 return -1;
2198
1fb86a7c
SH
2199 // caplist[i] is 1 if we keep capability i
2200 int *caplist = alloca(numcaps * sizeof(int));
2201 memset(caplist, 0, numcaps * sizeof(int));
2202
2203 lxc_list_for_each(iterator, caps) {
2204
2205 keep_entry = iterator->elem;
2206
bab88e68 2207 capid = parse_cap(keep_entry);
1fb86a7c 2208
7035407c
DE
2209 if (capid == -2)
2210 continue;
2211
1fb86a7c
SH
2212 if (capid < 0) {
2213 ERROR("unknown capability %s", keep_entry);
2214 return -1;
2215 }
2216
8255688a 2217 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2218
2219 caplist[capid] = 1;
2220 }
2221 for (i=0; i<numcaps; i++) {
2222 if (caplist[i])
2223 continue;
2224 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2225 SYSERROR("failed to remove capability %d", i);
2226 return -1;
2227 }
1fb86a7c
SH
2228 }
2229
2230 DEBUG("capabilities have been setup");
81810dd1
DL
2231
2232 return 0;
2233}
2234
0ad19a3f 2235static int setup_hw_addr(char *hwaddr, const char *ifname)
2236{
2237 struct sockaddr sockaddr;
2238 struct ifreq ifr;
fad6ef95 2239 int ret, fd, saved_errno;
0ad19a3f 2240
3cfc0f3a
MN
2241 ret = lxc_convert_mac(hwaddr, &sockaddr);
2242 if (ret) {
2243 ERROR("mac address '%s' conversion failed : %s",
2244 hwaddr, strerror(-ret));
0ad19a3f 2245 return -1;
2246 }
2247
2248 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2249 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2250 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2251
2252 fd = socket(AF_INET, SOCK_DGRAM, 0);
2253 if (fd < 0) {
3ab87b66 2254 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2255 return -1;
2256 }
2257
2258 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
fad6ef95 2259 saved_errno = errno;
0ad19a3f 2260 close(fd);
2261 if (ret)
fad6ef95 2262 ERROR("ioctl failure : %s", strerror(saved_errno));
0ad19a3f 2263
5da6aa8c 2264 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2265
0ad19a3f 2266 return ret;
2267}
2268
82d5ae15 2269static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2270{
82d5ae15
DL
2271 struct lxc_list *iterator;
2272 struct lxc_inetdev *inetdev;
3cfc0f3a 2273 int err;
0ad19a3f 2274
82d5ae15
DL
2275 lxc_list_for_each(iterator, ip) {
2276
2277 inetdev = iterator->elem;
2278
0093bb8c
DL
2279 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2280 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2281 if (err) {
2282 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2283 ifindex, strerror(-err));
82d5ae15
DL
2284 return -1;
2285 }
2286 }
2287
2288 return 0;
0ad19a3f 2289}
2290
82d5ae15 2291static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2292{
82d5ae15 2293 struct lxc_list *iterator;
7fa9074f 2294 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2295 int err;
0ad19a3f 2296
82d5ae15
DL
2297 lxc_list_for_each(iterator, ip) {
2298
2299 inet6dev = iterator->elem;
2300
b3df193c 2301 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2302 &inet6dev->mcast, &inet6dev->acast,
2303 inet6dev->prefix);
3cfc0f3a
MN
2304 if (err) {
2305 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2306 ifindex, strerror(-err));
82d5ae15 2307 return -1;
3cfc0f3a 2308 }
82d5ae15
DL
2309 }
2310
2311 return 0;
0ad19a3f 2312}
2313
e337179a 2314static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
0ad19a3f 2315{
0ad19a3f 2316 char ifname[IFNAMSIZ];
3cfc0f3a 2317 int err;
d1826cf1
CB
2318 const char *net_type_name;
2319 char *current_ifname = ifname;
0ad19a3f 2320
82d5ae15
DL
2321 /* empty network namespace */
2322 if (!netdev->ifindex) {
b0efbac4 2323 if (netdev->flags & IFF_UP) {
d472214b 2324 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2325 if (err) {
2326 ERROR("failed to set the loopback up : %s",
2327 strerror(-err));
82d5ae15
DL
2328 return -1;
2329 }
82d5ae15 2330 }
d1826cf1
CB
2331
2332 if (netdev->type == LXC_NET_EMPTY)
2333 return 0;
2334
2335 if (netdev->type == LXC_NET_NONE)
40790553 2336 return 0;
d1826cf1
CB
2337
2338 if (netdev->type != LXC_NET_VETH) {
2339 net_type_name = lxc_net_type_to_str(netdev->type);
2340 ERROR("%s networks are not supported for containers "
2341 "not setup up by privileged users",
2342 net_type_name);
2343 return -1;
2344 }
2345
40790553 2346 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2347 }
13954cce 2348
b466dc33 2349 /* get the new ifindex in case of physical netdev */
40790553 2350 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2351 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2352 ERROR("failed to get ifindex for %s",
2353 netdev->link);
2354 return -1;
2355 }
40790553 2356 }
b466dc33 2357
82d5ae15
DL
2358 /* retrieve the name of the interface */
2359 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2360 ERROR("no interface corresponding to index '%d'",
82d5ae15 2361 netdev->ifindex);
0ad19a3f 2362 return -1;
2363 }
13954cce 2364
018ef520 2365 /* default: let the system to choose one interface name */
9d083402 2366 if (!netdev->name)
fb6d9b2f
DL
2367 netdev->name = netdev->type == LXC_NET_PHYS ?
2368 netdev->link : "eth%d";
018ef520 2369
82d5ae15 2370 /* rename the interface name */
40790553
SH
2371 if (strcmp(ifname, netdev->name) != 0) {
2372 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2373 if (err) {
2374 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2375 strerror(-err));
2376 return -1;
2377 }
018ef520
DL
2378 }
2379
2380 /* Re-read the name of the interface because its name has changed
2381 * and would be automatically allocated by the system
2382 */
82d5ae15 2383 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2384 ERROR("no interface corresponding to index '%d'",
82d5ae15 2385 netdev->ifindex);
018ef520 2386 return -1;
0ad19a3f 2387 }
2388
82d5ae15
DL
2389 /* set a mac address */
2390 if (netdev->hwaddr) {
2391 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2392 ERROR("failed to setup hw address for '%s'",
82d5ae15 2393 current_ifname);
0ad19a3f 2394 return -1;
2395 }
2396 }
2397
82d5ae15
DL
2398 /* setup ipv4 addresses on the interface */
2399 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2400 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2401 ifname);
2402 return -1;
2403 }
2404
82d5ae15
DL
2405 /* setup ipv6 addresses on the interface */
2406 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2407 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2408 ifname);
2409 return -1;
2410 }
2411
82d5ae15 2412 /* set the network device up */
b0efbac4 2413 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2414 int err;
2415
d472214b 2416 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2417 if (err) {
2418 ERROR("failed to set '%s' up : %s", current_ifname,
2419 strerror(-err));
0ad19a3f 2420 return -1;
2421 }
2422
2423 /* the network is up, make the loopback up too */
d472214b 2424 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2425 if (err) {
2426 ERROR("failed to set the loopback up : %s",
2427 strerror(-err));
0ad19a3f 2428 return -1;
2429 }
2430 }
2431
f8fee0e2
MK
2432 /* We can only set up the default routes after bringing
2433 * up the interface, sine bringing up the interface adds
2434 * the link-local routes and we can't add a default
2435 * route if the gateway is not reachable. */
2436
2437 /* setup ipv4 gateway on the interface */
2438 if (netdev->ipv4_gateway) {
2439 if (!(netdev->flags & IFF_UP)) {
2440 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2441 return -1;
2442 }
2443
2444 if (lxc_list_empty(&netdev->ipv4)) {
2445 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2446 return -1;
2447 }
2448
2449 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2450 if (err) {
fc739df5
SG
2451 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2452 if (err) {
2453 ERROR("failed to add ipv4 dest for '%s': %s",
2454 ifname, strerror(-err));
2455 }
2456
2457 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2458 if (err) {
2459 ERROR("failed to setup ipv4 gateway for '%s': %s",
2460 ifname, strerror(-err));
2461 if (netdev->ipv4_gateway_auto) {
2462 char buf[INET_ADDRSTRLEN];
2463 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2464 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2465 }
2466 return -1;
19a26f82 2467 }
f8fee0e2
MK
2468 }
2469 }
2470
2471 /* setup ipv6 gateway on the interface */
2472 if (netdev->ipv6_gateway) {
2473 if (!(netdev->flags & IFF_UP)) {
2474 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2475 return -1;
2476 }
2477
2478 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2479 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2480 return -1;
2481 }
2482
2483 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2484 if (err) {
fc739df5
SG
2485 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2486 if (err) {
2487 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2488 ifname, strerror(-err));
19a26f82 2489 }
fc739df5
SG
2490
2491 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2492 if (err) {
2493 ERROR("failed to setup ipv6 gateway for '%s': %s",
2494 ifname, strerror(-err));
2495 if (netdev->ipv6_gateway_auto) {
2496 char buf[INET6_ADDRSTRLEN];
2497 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2498 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2499 }
2500 return -1;
2501 }
f8fee0e2
MK
2502 }
2503 }
2504
cd54d859
DL
2505 DEBUG("'%s' has been setup", current_ifname);
2506
0ad19a3f 2507 return 0;
2508}
2509
e337179a
CB
2510static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2511 struct lxc_list *network)
0ad19a3f 2512{
82d5ae15 2513 struct lxc_list *iterator;
82d5ae15 2514 struct lxc_netdev *netdev;
0ad19a3f 2515
c302b476
CB
2516 lxc_log_configured_netdevs(conf);
2517
5f4535a3 2518 lxc_list_for_each(iterator, network) {
5f4535a3 2519 netdev = iterator->elem;
82d5ae15 2520
f9373e40
CB
2521 /* REMOVE in LXC 3.0 */
2522 if (netdev->idx < 0) {
2523 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2524 "networks is DEPRECATED, please switch to using "
2525 "\"lxc.net.[i].* keys\"");
2526 }
2527
e337179a 2528 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
82d5ae15
DL
2529 ERROR("failed to setup netdev");
2530 return -1;
2531 }
2532 }
cd54d859 2533
5f4535a3
DL
2534 if (!lxc_list_empty(network))
2535 INFO("network has been setup");
cd54d859
DL
2536
2537 return 0;
0ad19a3f 2538}
2539
c6d09e15
WB
2540static int parse_resource(const char *res) {
2541 size_t i;
2542 int resid = -1;
2543
2544 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2545 if (strcmp(res, limit_opt[i].name) == 0)
2546 return limit_opt[i].value;
2547 }
2548
2549 /* try to see if it's numeric, so the user may specify
2550 * resources that the running kernel knows about but
2551 * we don't */
2552 if (lxc_safe_int(res, &resid) == 0)
2553 return resid;
2554 return -1;
2555}
2556
2557int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2558 struct lxc_list *it;
2559 struct lxc_limit *lim;
2560 int resid;
2561
2562 lxc_list_for_each(it, limits) {
2563 lim = it->elem;
2564
2565 resid = parse_resource(lim->resource);
2566 if (resid < 0) {
2567 ERROR("unknown resource %s", lim->resource);
2568 return -1;
2569 }
2570
2571 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2572 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2573 return -1;
2574 }
2575 }
2576 return 0;
2577}
2578
2af6bd1b 2579/* try to move physical nics to the init netns */
5610055a 2580void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2af6bd1b 2581{
64d2fcb5 2582 int i, oldfd;
4ec31c52 2583 char ifname[IFNAMSIZ];
2af6bd1b 2584
5610055a 2585 if (netnsfd < 0 || conf->num_savednics == 0)
2af6bd1b
SH
2586 return;
2587
64d2fcb5 2588 INFO("Running to reset %d nic names.", conf->num_savednics);
5610055a 2589
64d2fcb5
CB
2590 oldfd = lxc_preserve_ns(getpid(), "net");
2591 if (oldfd < 0) {
2592 SYSERROR("Failed to open monitor netns fd.");
2af6bd1b
SH
2593 return;
2594 }
64d2fcb5 2595
2af6bd1b
SH
2596 if (setns(netnsfd, 0) != 0) {
2597 SYSERROR("Failed to enter container netns to reset nics");
2598 close(oldfd);
2599 return;
2600 }
2601 for (i=0; i<conf->num_savednics; i++) {
2602 struct saved_nic *s = &conf->saved_nics[i];
f2e206ff 2603 /* retrieve the name of the interface */
2604 if (!if_indextoname(s->ifindex, ifname)) {
2605 WARN("no interface corresponding to index '%d'", s->ifindex);
2606 continue;
2607 }
5610055a 2608 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
f2e206ff 2609 WARN("Error moving nic name:%s back to host netns", ifname);
5610055a 2610 free(s->orig_name);
2af6bd1b 2611 }
5610055a
WB
2612 conf->num_savednics = 0;
2613
2af6bd1b
SH
2614 if (setns(oldfd, 0) != 0)
2615 SYSERROR("Failed to re-enter monitor's netns");
2616 close(oldfd);
2617}
2618
ae9242c8
SH
2619static char *default_rootfs_mount = LXCROOTFSMOUNT;
2620
7b379ab3 2621struct lxc_conf *lxc_conf_init(void)
089cd8b8 2622{
7b379ab3 2623 struct lxc_conf *new;
26ddeedd 2624 int i;
7b379ab3 2625
13277ec4 2626 new = malloc(sizeof(*new));
7b379ab3 2627 if (!new) {
13277ec4 2628 ERROR("lxc_conf_init : %s", strerror(errno));
7b379ab3
MN
2629 return NULL;
2630 }
2631 memset(new, 0, sizeof(*new));
2632
4b73005c 2633 new->loglevel = LXC_LOG_LEVEL_NOTSET;
cccc74b5 2634 new->personality = -1;
124fa0a8 2635 new->autodev = 1;
596a818d
DE
2636 new->console.log_path = NULL;
2637 new->console.log_fd = -1;
28a4b0e5 2638 new->console.path = NULL;
63376d7d 2639 new->console.peer = -1;
b5159817
DE
2640 new->console.peerpty.busy = -1;
2641 new->console.peerpty.master = -1;
2642 new->console.peerpty.slave = -1;
63376d7d
DL
2643 new->console.master = -1;
2644 new->console.slave = -1;
2645 new->console.name[0] = '\0';
d2e30e99 2646 new->maincmd_fd = -1;
76a26f55 2647 new->nbd_idx = -1;
54c30e29 2648 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048 2649 if (!new->rootfs.mount) {
13277ec4 2650 ERROR("lxc_conf_init : %s", strerror(errno));
53f3f048
SH
2651 free(new);
2652 return NULL;
2653 }
858377e4 2654 new->logfd = -1;
7b379ab3
MN
2655 lxc_list_init(&new->cgroup);
2656 lxc_list_init(&new->network);
2657 lxc_list_init(&new->mount_list);
81810dd1 2658 lxc_list_init(&new->caps);
1fb86a7c 2659 lxc_list_init(&new->keepcaps);
f6d3e3e4 2660 lxc_list_init(&new->id_map);
f979ac15 2661 lxc_list_init(&new->includes);
4184c3e1 2662 lxc_list_init(&new->aliens);
7c661726 2663 lxc_list_init(&new->environment);
c6d09e15 2664 lxc_list_init(&new->limits);
26ddeedd
SH
2665 for (i=0; i<NUM_LXC_HOOKS; i++)
2666 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2667 lxc_list_init(&new->groups);
fe4de9a6
DE
2668 new->lsm_aa_profile = NULL;
2669 new->lsm_se_context = NULL;
5112cd70 2670 new->tmp_umount_proc = 0;
7b379ab3 2671
9f30a190
MM
2672 for (i = 0; i < LXC_NS_MAX; i++)
2673 new->inherit_ns_fd[i] = -1;
2674
72bb04e4
PT
2675 /* if running in a new user namespace, init and COMMAND
2676 * default to running as UID/GID 0 when using lxc-execute */
2677 new->init_uid = 0;
2678 new->init_gid = 0;
2679
7b379ab3 2680 return new;
089cd8b8
DL
2681}
2682
a589434e 2683static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2684{
b0ee5983
CB
2685 char *veth1, *veth2;
2686 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
b7b2fde4
CB
2687 int bridge_index, err;
2688 unsigned int mtu = 0;
13954cce 2689
8bee8851 2690 if (netdev->priv.veth_attr.pair) {
e892973e 2691 veth1 = netdev->priv.veth_attr.pair;
8bee8851
WB
2692 if (handler->conf->reboot)
2693 lxc_netdev_delete_by_name(veth1);
2694 } else {
9ba8130c
SH
2695 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2696 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2697 ERROR("veth1 name too long");
2698 return -1;
2699 }
a0265685 2700 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2701 if (!veth1) {
2702 ERROR("failed to allocate a temporary name");
2703 return -1;
2704 }
74a2b586
JK
2705 /* store away for deconf */
2706 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2707 }
82d5ae15 2708
0e391e57 2709 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2710 veth2 = lxc_mkifname(veth2buf);
ad40563e 2711 if (!veth2) {
82d5ae15 2712 ERROR("failed to allocate a temporary name");
ad40563e 2713 goto out_delete;
0ad19a3f 2714 }
2715
3cfc0f3a
MN
2716 err = lxc_veth_create(veth1, veth2);
2717 if (err) {
b0ee5983
CB
2718 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2719 veth2, strerror(-err));
ad40563e 2720 goto out_delete;
0ad19a3f 2721 }
13954cce 2722
49684c0b
CS
2723 /* changing the high byte of the mac address to 0xfe, the bridge interface
2724 * will always keep the host's mac address and not take the mac address
2725 * of a container */
2726 err = setup_private_host_hw_addr(veth1);
2727 if (err) {
b0ee5983
CB
2728 ERROR("failed to change mac address of host interface \"%s\": %s",
2729 veth1, strerror(-err));
49684c0b
CS
2730 goto out_delete;
2731 }
2732
af651aa9
SN
2733 netdev->ifindex = if_nametoindex(veth2);
2734 if (!netdev->ifindex) {
b0ee5983 2735 ERROR("failed to retrieve the index for \"%s\"", veth2);
af651aa9
SN
2736 goto out_delete;
2737 }
2738
82d5ae15 2739 if (netdev->mtu) {
b7b2fde4 2740 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
b0ee5983 2741 WARN("failed to parse mtu from");
b7b2fde4 2742 else
b0ee5983 2743 INFO("retrieved mtu %d", mtu);
e54864d3 2744 } else if (netdev->link) {
e9280f65 2745 bridge_index = if_nametoindex(netdev->link);
729e8bf6
CB
2746 if (bridge_index) {
2747 mtu = netdev_get_mtu(bridge_index);
b0ee5983 2748 INFO("retrieved mtu %d from %s", mtu, netdev->link);
729e8bf6
CB
2749 } else {
2750 mtu = netdev_get_mtu(netdev->ifindex);
b0ee5983 2751 INFO("retrieved mtu %d from %s", mtu, veth2);
729e8bf6 2752 }
e54864d3
NC
2753 }
2754
2755 if (mtu) {
2756 err = lxc_netdev_set_mtu(veth1, mtu);
3cfc0f3a 2757 if (!err)
e54864d3 2758 err = lxc_netdev_set_mtu(veth2, mtu);
3cfc0f3a 2759 if (err) {
b0ee5983
CB
2760 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2761 "and \"%s\": %s",
e54864d3 2762 mtu, veth1, veth2, strerror(-err));
eb14c10a 2763 goto out_delete;
75d09f83
DL
2764 }
2765 }
2766
3cfc0f3a 2767 if (netdev->link) {
c43cbc04 2768 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
3cfc0f3a 2769 if (err) {
b0ee5983
CB
2770 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2771 veth1, netdev->link, strerror(-err));
3cfc0f3a
MN
2772 goto out_delete;
2773 }
b0ee5983 2774 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
eb14c10a
DL
2775 }
2776
d472214b 2777 err = lxc_netdev_up(veth1);
6e35af2e 2778 if (err) {
b0ee5983 2779 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
6e35af2e 2780 goto out_delete;
0ad19a3f 2781 }
2782
e3b4c4c4 2783 if (netdev->upscript) {
751d9dcd
DL
2784 err = run_script(handler->name, "net", netdev->upscript, "up",
2785 "veth", veth1, (char*) NULL);
2786 if (err)
e3b4c4c4 2787 goto out_delete;
e3b4c4c4
ST
2788 }
2789
b0ee5983
CB
2790 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2791 netdev->ifindex);
82d5ae15 2792
6ab9ab6d 2793 return 0;
eb14c10a
DL
2794
2795out_delete:
b316d209
CB
2796 if (netdev->ifindex != 0)
2797 lxc_netdev_delete_by_name(veth1);
f10fad2f 2798 if (!netdev->priv.veth_attr.pair)
ad40563e 2799 free(veth1);
f10fad2f 2800 free(veth2);
6ab9ab6d 2801 return -1;
13954cce 2802}
d957ae2d 2803
74a2b586
JK
2804static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2805{
2806 char *veth1;
2807 int err;
2808
2809 if (netdev->priv.veth_attr.pair)
2810 veth1 = netdev->priv.veth_attr.pair;
2811 else
2812 veth1 = netdev->priv.veth_attr.veth1;
2813
2814 if (netdev->downscript) {
2815 err = run_script(handler->name, "net", netdev->downscript,
2816 "down", "veth", veth1, (char*) NULL);
2817 if (err)
2818 return -1;
2819 }
2820 return 0;
2821}
2822
a589434e 2823static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2824{
0e391e57 2825 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2826 int err;
d957ae2d
MT
2827
2828 if (!netdev->link) {
2829 ERROR("no link specified for macvlan netdev");
2830 return -1;
2831 }
13954cce 2832
9ba8130c
SH
2833 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2834 if (err >= sizeof(peerbuf))
2835 return -1;
82d5ae15 2836
a0265685 2837 peer = lxc_mkifname(peerbuf);
ad40563e 2838 if (!peer) {
82d5ae15
DL
2839 ERROR("failed to make a temporary name");
2840 return -1;
0ad19a3f 2841 }
2842
3cfc0f3a
MN
2843 err = lxc_macvlan_create(netdev->link, peer,
2844 netdev->priv.macvlan_attr.mode);
2845 if (err) {
2846 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2847 peer, netdev->link, strerror(-err));
ad40563e 2848 goto out;
0ad19a3f 2849 }
2850
82d5ae15
DL
2851 netdev->ifindex = if_nametoindex(peer);
2852 if (!netdev->ifindex) {
36eb9bde 2853 ERROR("failed to retrieve the index for %s", peer);
ad40563e 2854 goto out;
22ebac19 2855 }
2856
e3b4c4c4 2857 if (netdev->upscript) {
751d9dcd
DL
2858 err = run_script(handler->name, "net", netdev->upscript, "up",
2859 "macvlan", netdev->link, (char*) NULL);
2860 if (err)
ad40563e 2861 goto out;
e3b4c4c4
ST
2862 }
2863
a589434e 2864 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
e892973e 2865 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 2866
d957ae2d 2867 return 0;
ad40563e
ÇO
2868out:
2869 lxc_netdev_delete_by_name(peer);
2870 free(peer);
2871 return -1;
0ad19a3f 2872}
2873
74a2b586
JK
2874static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2875{
2876 int err;
2877
2878 if (netdev->downscript) {
2879 err = run_script(handler->name, "net", netdev->downscript,
2880 "down", "macvlan", netdev->link,
2881 (char*) NULL);
2882 if (err)
2883 return -1;
2884 }
2885 return 0;
2886}
2887
a589434e
JN
2888/* XXX: merge with instantiate_macvlan */
2889static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
2890{
2891 char peer[IFNAMSIZ];
3cfc0f3a 2892 int err;
82f58d03 2893 static uint16_t vlan_cntr = 0;
b7b2fde4 2894 unsigned int mtu = 0;
26c39028
JHS
2895
2896 if (!netdev->link) {
2897 ERROR("no link specified for vlan netdev");
2898 return -1;
2899 }
2900
82f58d03 2901 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
9ba8130c
SH
2902 if (err >= sizeof(peer)) {
2903 ERROR("peer name too long");
2904 return -1;
2905 }
26c39028 2906
3cfc0f3a
MN
2907 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2908 if (err) {
2909 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2910 peer, netdev->link, strerror(-err));
26c39028
JHS
2911 return -1;
2912 }
2913
2914 netdev->ifindex = if_nametoindex(peer);
2915 if (!netdev->ifindex) {
2916 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 2917 lxc_netdev_delete_by_name(peer);
26c39028
JHS
2918 return -1;
2919 }
2920
a589434e 2921 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
e892973e 2922 netdev->ifindex);
b4fb7de1 2923 if (netdev->mtu) {
b7b2fde4
CB
2924 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2925 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2926 netdev->ifindex, netdev->name);
2927 return -1;
2928 }
2929 err = lxc_netdev_set_mtu(peer, mtu);
b4fb7de1
VL
2930 if (err) {
2931 ERROR("failed to set mtu '%s' for %s : %s",
2932 netdev->mtu, peer, strerror(-err));
2933 lxc_netdev_delete_by_name(peer);
2934 return -1;
2935 }
2936 }
e892973e 2937
26c39028
JHS
2938 return 0;
2939}
2940
74a2b586
JK
/* Nothing is done when a vlan device is shut down; always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2945
a589434e 2946static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2947{
6168e99f
DL
2948 if (!netdev->link) {
2949 ERROR("no link specified for the physical interface");
2950 return -1;
2951 }
2952
9d083402 2953 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 2954 if (!netdev->ifindex) {
9d083402 2955 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 2956 return -1;
2957 }
2958
e3b4c4c4
ST
2959 if (netdev->upscript) {
2960 int err;
751d9dcd
DL
2961 err = run_script(handler->name, "net", netdev->upscript,
2962 "up", "phys", netdev->link, (char*) NULL);
2963 if (err)
e3b4c4c4 2964 return -1;
e3b4c4c4
ST
2965 }
2966
82d5ae15 2967 return 0;
0ad19a3f 2968}
2969
74a2b586
JK
2970static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2971{
2972 int err;
2973
2974 if (netdev->downscript) {
2975 err = run_script(handler->name, "net", netdev->downscript,
2976 "down", "phys", netdev->link, (char*) NULL);
2977 if (err)
2978 return -1;
2979 }
2980 return 0;
2981}
2982
a589434e 2983static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
26b797f3
SH
2984{
2985 netdev->ifindex = 0;
2986 return 0;
2987}
2988
a589434e 2989static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2990{
82d5ae15 2991 netdev->ifindex = 0;
e3b4c4c4
ST
2992 if (netdev->upscript) {
2993 int err;
751d9dcd
DL
2994 err = run_script(handler->name, "net", netdev->upscript,
2995 "up", "empty", (char*) NULL);
2996 if (err)
e3b4c4c4 2997 return -1;
e3b4c4c4 2998 }
82d5ae15 2999 return 0;
0ad19a3f 3000}
3001
74a2b586
JK
3002static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3003{
3004 int err;
3005
3006 if (netdev->downscript) {
3007 err = run_script(handler->name, "net", netdev->downscript,
3008 "down", "empty", (char*) NULL);
3009 if (err)
3010 return -1;
3011 }
3012 return 0;
3013}
3014
26b797f3
SH
/* Nothing is done when a "none" network is shut down; always returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3019
3020int lxc_requests_empty_network(struct lxc_handler *handler)
3021{
3022 struct lxc_list *network = &handler->conf->network;
3023 struct lxc_list *iterator;
3024 struct lxc_netdev *netdev;
3025 bool found_none = false, found_nic = false;
3026
3027 if (lxc_list_empty(network))
3028 return 0;
3029
3030 lxc_list_for_each(iterator, network) {
3031
3032 netdev = iterator->elem;
3033
3034 if (netdev->type == LXC_NET_NONE)
3035 found_none = true;
3036 else
3037 found_nic = true;
3038 }
3039 if (found_none && !found_nic)
3040 return 1;
3041 return 0;
3042}
3043
e337179a 3044int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
0ad19a3f 3045{
e337179a 3046 bool am_root;
82d5ae15 3047 struct lxc_netdev *netdev;
e337179a
CB
3048 struct lxc_list *iterator;
3049 struct lxc_list *network = &handler->conf->network;
cbef6c52 3050
e337179a
CB
3051 /* We need to be root. */
3052 am_root = (getuid() == 0);
cbef6c52
SH
3053 if (!am_root)
3054 return 0;
0ad19a3f 3055
5f4535a3 3056 lxc_list_for_each(iterator, network) {
5f4535a3 3057 netdev = iterator->elem;
13954cce 3058
e337179a
CB
3059 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3060 ERROR("invalid network configuration type '%d'",
3061 netdev->type);
56637458
CB
3062 return -1;
3063 }
3064
e337179a
CB
3065 if (netdev->type != LXC_NET_MACVLAN &&
3066 netdev->priv.macvlan_attr.mode) {
3067 ERROR("Invalid macvlan.mode for a non-macvlan netdev");
56637458
CB
3068 return -1;
3069 }
3070
e337179a
CB
3071 if (netdev->type != LXC_NET_VETH &&
3072 netdev->priv.veth_attr.pair) {
3073 ERROR("Invalid veth pair for a non-veth netdev");
56637458
CB
3074 return -1;
3075 }
3076
e337179a
CB
3077 if (netdev->type != LXC_NET_VLAN &&
3078 netdev->priv.vlan_attr.vid > 0) {
3079 ERROR("Invalid vlan.id for a non-macvlan netdev");
82d5ae15
DL
3080 return -1;
3081 }
0ad19a3f 3082
e3b4c4c4 3083 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3084 ERROR("failed to create netdev");
3085 return -1;
3086 }
e3b4c4c4 3087
0ad19a3f 3088 }
3089
3090 return 0;
3091}
3092
358daf49 3093bool lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3094{
e97946ae 3095 int ret;
74a2b586 3096 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3097 struct lxc_list *iterator;
3098 struct lxc_netdev *netdev;
358daf49 3099 bool deleted_all = true;
7fef7a06
DL
3100
3101 lxc_list_for_each(iterator, network) {
3102 netdev = iterator->elem;
d472214b 3103
74a2b586 3104 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352 3105 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
358daf49
CB
3106 WARN("Failed to rename interface with index %d "
3107 "to its initial name \"%s\".",
3108 netdev->ifindex, netdev->link);
d472214b 3109 continue;
d8f8e352 3110 }
d472214b 3111
74a2b586 3112 if (netdev_deconf[netdev->type](handler, netdev)) {
e97946ae 3113 WARN("Failed to destroy netdev");
74a2b586
JK
3114 }
3115
d8f8e352
DL
3116 /* Recent kernel remove the virtual interfaces when the network
3117 * namespace is destroyed but in case we did not moved the
3118 * interface to the network namespace, we have to destroy it
3119 */
e97946ae
CB
3120 if (netdev->ifindex != 0) {
3121 ret = lxc_netdev_delete_by_index(netdev->ifindex);
358daf49
CB
3122 if (-ret == ENODEV) {
3123 INFO("Interface \"%s\" with index %d already "
3124 "deleted or existing in different network "
3125 "namespace.",
3126 netdev->name ? netdev->name : "(null)",
3127 netdev->ifindex);
3128 } else if (ret < 0) {
3129 deleted_all = false;
3130 WARN("Failed to remove interface \"%s\" with "
3131 "index %d: %s.",
3132 netdev->name ? netdev->name : "(null)",
3133 netdev->ifindex, strerror(-ret));
3134 } else {
3135 INFO("Removed interface \"%s\" with index %d.",
3136 netdev->name ? netdev->name : "(null)",
3137 netdev->ifindex);
3138 }
e97946ae
CB
3139 }
3140
3141 /* Explicitly delete host veth device to prevent lingering
3142 * devices. We had issues in LXD around this.
3143 */
b316d209 3144 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
358daf49
CB
3145 char *hostveth;
3146 if (netdev->priv.veth_attr.pair) {
e97946ae 3147 hostveth = netdev->priv.veth_attr.pair;
358daf49
CB
3148 ret = lxc_netdev_delete_by_name(hostveth);
3149 if (ret < 0) {
3150 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3151 } else {
3152 INFO("Removed interface \"%s\" from host.", hostveth);
358daf49
CB
3153 }
3154 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
e97946ae 3155 hostveth = netdev->priv.veth_attr.veth1;
e97946ae 3156 ret = lxc_netdev_delete_by_name(hostveth);
358daf49
CB
3157 if (ret < 0) {
3158 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3159 } else {
3160 INFO("Removed interface \"%s\" from host.", hostveth);
3161 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3162 }
e97946ae
CB
3163 }
3164 }
7fef7a06 3165 }
358daf49
CB
3166
3167 return deleted_all;
7fef7a06
DL
3168}
3169
45e854dc
SG
3170#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3171
fe1f672f 3172/* lxc-user-nic returns "interface_name:interface_name\n" */
eab15c1e 3173#define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
c43cbc04
SH
3174static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3175 struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3176{
3177 pid_t child;
a7242d9a
ÇO
3178 int bytes, pipefd[2];
3179 char *token, *saveptr = NULL;
fe1f672f 3180 char buffer[MAX_BUFFER_SIZE];
091045f8 3181 char netdev_link[IFNAMSIZ + 1];
cbef6c52
SH
3182
3183 if (netdev->type != LXC_NET_VETH) {
3184 ERROR("nic type %d not support for unprivileged use",
091045f8 3185 netdev->type);
cbef6c52
SH
3186 return -1;
3187 }
3188
091045f8 3189 if (pipe(pipefd) < 0) {
a7242d9a
ÇO
3190 SYSERROR("pipe failed");
3191 return -1;
3192 }
3193
091045f8
CB
3194 child = fork();
3195 if (child < 0) {
cbef6c52 3196 SYSERROR("fork");
a7242d9a
ÇO
3197 close(pipefd[0]);
3198 close(pipefd[1]);
3199 return -1;
3200 }
3201
3202 if (child == 0) { // child
091045f8
CB
3203 /* Call lxc-user-nic pid type bridge. */
3204 int ret;
3205 char pidstr[LXC_NUMSTRLEN64];
3206
3207 close(pipefd[0]); /* Close the read-end of the pipe. */
3208
3209 /* Redirect stdout to write-end of the pipe. */
3210 ret = dup2(pipefd[1], STDOUT_FILENO);
3211 close(pipefd[1]); /* Close the write-end of the pipe. */
3212 if (ret < 0) {
3213 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3214 exit(EXIT_FAILURE);
3215 }
a7242d9a 3216
091045f8 3217 if (netdev->link)
cff7b5eb 3218 strncpy(netdev_link, netdev->link, IFNAMSIZ);
091045f8 3219 else
cff7b5eb 3220 strncpy(netdev_link, "none", IFNAMSIZ);
091045f8
CB
3221
3222 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3223 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3224 exit(EXIT_FAILURE);
3225 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3226
3227 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3228 lxcname, pidstr, netdev_link, netdev->name);
c43cbc04 3229 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
091045f8
CB
3230 pidstr, "veth", netdev_link, netdev->name, NULL);
3231
3232 SYSERROR("Failed to exec lxc-user-nic.");
3233 exit(EXIT_FAILURE);
a7242d9a
ÇO
3234 }
3235
3236 /* close the write-end of the pipe */
3237 close(pipefd[1]);
3238
fe1f672f 3239 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
091045f8
CB
3240 if (bytes < 0)
3241 SYSERROR("Failed to read from pipe file descriptor.");
a7242d9a
ÇO
3242 buffer[bytes - 1] = '\0';
3243
3244 if (wait_for_pid(child) != 0) {
3245 close(pipefd[0]);
cbef6c52
SH
3246 return -1;
3247 }
3248
a7242d9a
ÇO
3249 /* close the read-end of the pipe */
3250 close(pipefd[0]);
cbef6c52 3251
a7242d9a
ÇO
3252 /* fill netdev->name field */
3253 token = strtok_r(buffer, ":", &saveptr);
3254 if (!token)
3255 return -1;
091045f8
CB
3256
3257 netdev->name = malloc(IFNAMSIZ + 1);
658979c5 3258 if (!netdev->name) {
091045f8 3259 SYSERROR("Failed to allocate memory.");
658979c5
SH
3260 return -1;
3261 }
091045f8 3262 memset(netdev->name, 0, IFNAMSIZ + 1);
658979c5 3263 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3264
3265 /* fill netdev->veth_attr.pair field */
3266 token = strtok_r(NULL, ":", &saveptr);
3267 if (!token)
3268 return -1;
091045f8 3269
a7242d9a 3270 netdev->priv.veth_attr.pair = strdup(token);
658979c5 3271 if (!netdev->priv.veth_attr.pair) {
091045f8 3272 ERROR("Failed to allocate memory.");
658979c5
SH
3273 return -1;
3274 }
45e854dc 3275
a7242d9a 3276 return 0;
cbef6c52
SH
3277}
3278
c43cbc04
SH
3279int lxc_assign_network(const char *lxcpath, char *lxcname,
3280 struct lxc_list *network, pid_t pid)
0ad19a3f 3281{
82d5ae15 3282 struct lxc_list *iterator;
82d5ae15 3283 struct lxc_netdev *netdev;
f2e206ff 3284 char ifname[IFNAMSIZ];
cbef6c52 3285 int am_root = (getuid() == 0);
3cfc0f3a 3286 int err;
0ad19a3f 3287
5f4535a3 3288 lxc_list_for_each(iterator, network) {
82d5ae15 3289
5f4535a3 3290 netdev = iterator->elem;
82d5ae15 3291
fbb16259 3292 if (netdev->type == LXC_NET_VETH && !am_root) {
72ccbbe1
SC
3293 if (netdev->mtu)
3294 INFO("mtu ignored due to insufficient privilege");
c43cbc04 3295 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
cbef6c52 3296 return -1;
e337179a
CB
3297 /* lxc-user-nic has moved the nic to the new ns.
3298 * unpriv_assign_nic() fills in netdev->name.
3299 * netdev->ifindex will be filed in at
3300 * lxc_setup_netdev_in_child_namespaces.
3301 */
cbef6c52
SH
3302 continue;
3303 }
236087a6 3304
fbb16259
SH
3305 /* empty network namespace, nothing to move */
3306 if (!netdev->ifindex)
3307 continue;
3308
f2e206ff 3309 /* retrieve the name of the interface */
3310 if (!if_indextoname(netdev->ifindex, ifname)) {
3311 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3312 return -1;
3313 }
3314
3315 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3cfc0f3a
MN
3316 if (err) {
3317 ERROR("failed to move '%s' to the container : %s",
3318 netdev->link, strerror(-err));
82d5ae15
DL
3319 return -1;
3320 }
3321
198cbbaa 3322 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
0ad19a3f 3323 }
3324
3325 return 0;
3326}
3327
251d0d2a
DE
3328static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3329 size_t buf_size)
f6d3e3e4 3330{
29053180
CB
3331 char path[MAXPATHLEN];
3332 int fd, ret;
f6d3e3e4 3333
29053180
CB
3334 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3335 idtype == ID_TYPE_UID ? 'u' : 'g');
3336 if (ret < 0 || ret >= MAXPATHLEN) {
3337 ERROR("failed to create path \"%s\"", path);
f6d3e3e4
SH
3338 return -E2BIG;
3339 }
29053180
CB
3340
3341 fd = open(path, O_WRONLY);
3342 if (fd < 0) {
3343 SYSERROR("failed to open \"%s\"", path);
3344 return -1;
f6d3e3e4 3345 }
29053180
CB
3346
3347 errno = 0;
3348 ret = lxc_write_nointr(fd, buf, buf_size);
3349 if (ret != buf_size) {
3350 SYSERROR("failed to write %cid mapping to \"%s\"",
3351 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3352 close(fd);
3353 return -1;
3354 }
3355 close(fd);
3356
3357 return 0;
f6d3e3e4
SH
3358}
3359
6e50e704
CB
3360/* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3361 *
3362 * @return 1 if functional binary was found
3363 * @return 0 if binary exists but is lacking privilege
3364 * @return -ENOENT if binary does not exist
3365 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3366 *
3367 */
df6a2945
CB
3368static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3369{
3370 char *path;
3371 int ret;
3372 struct stat st;
3373 int fret = 0;
3374
6e50e704
CB
3375 if (cap != CAP_SETUID && cap != CAP_SETGID)
3376 return -EINVAL;
3377
df6a2945
CB
3378 path = on_path(binary, NULL);
3379 if (!path)
3380 return -ENOENT;
3381
3382 ret = stat(path, &st);
3383 if (ret < 0) {
3384 fret = -errno;
3385 goto cleanup;
3386 }
3387
3388 /* Check if the binary is setuid. */
3389 if (st.st_mode & S_ISUID) {
3390 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3391 fret = 1;
3392 goto cleanup;
3393 }
3394
69924fff 3395 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
df6a2945
CB
3396 /* Check if it has the CAP_SETUID capability. */
3397 if ((cap & CAP_SETUID) &&
3398 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3399 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3400 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3401 "and CAP_PERMITTED sets.", path);
3402 fret = 1;
3403 goto cleanup;
3404 }
3405
3406 /* Check if it has the CAP_SETGID capability. */
3407 if ((cap & CAP_SETGID) &&
3408 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3409 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3410 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3411 "and CAP_PERMITTED sets.", path);
3412 fret = 1;
3413 goto cleanup;
3414 }
d6018f88 3415 #else
69924fff
CB
3416 /* If we cannot check for file capabilities we need to give the benefit
3417 * of the doubt. Otherwise we might fail even though all the necessary
3418 * file capabilities are set.
3419 */
d6018f88
CB
3420 DEBUG("Cannot check for file capabilites as full capability support is "
3421 "missing. Manual intervention needed.");
3422 fret = 1;
df6a2945
CB
3423 #endif
3424
3425cleanup:
3426 free(path);
3427 return fret;
3428}
3429
986ef930
CB
/* Execute the command line passed in @args through "/bin/sh -c".
 *
 * @args is treated as a NUL-terminated command string. The function only
 * returns (with -1) when execl() itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = (char *)args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	/* Only reached when the exec did not happen. */
	return -1;
}
3435
f6d3e3e4
SH
3436int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3437{
f6d3e3e4 3438 struct id_map *map;
4bc3b759 3439 struct lxc_list *iterator;
251d0d2a 3440 enum idtype type;
986ef930 3441 char u_or_g;
4bc3b759 3442 char *pos;
99d43365 3443 int fill, left;
986ef930
CB
3444 char cmd_output[MAXPATHLEN];
3445 /* strlen("new@idmap") = 9
3446 * +
3447 * strlen(" ") = 1
3448 * +
3449 * LXC_NUMSTRLEN64
3450 * +
3451 * strlen(" ") = 1
3452 *
3453 * We add some additional space to make sure that we really have
3454 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3455 */
3456 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3457 int ret = 0, uidmap = 0, gidmap = 0;
3458 bool use_shadow = false, had_entry = false;
df6a2945
CB
3459
3460 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3461 * ranges, then insist that root also reserve ranges in subuid. This
22038de5
SH
3462 * will protected it by preventing another user from being handed the
3463 * range by shadow.
3464 */
df6a2945 3465 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
6e50e704
CB
3466 if (uidmap == -ENOENT)
3467 WARN("newuidmap binary is missing");
3468 else if (!uidmap)
3469 WARN("newuidmap is lacking necessary privileges");
3470
df6a2945 3471 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
6e50e704
CB
3472 if (gidmap == -ENOENT)
3473 WARN("newgidmap binary is missing");
3474 else if (!gidmap)
3475 WARN("newgidmap is lacking necessary privileges");
3476
df6a2945
CB
3477 if (uidmap > 0 && gidmap > 0) {
3478 DEBUG("Functional newuidmap and newgidmap binary found.");
4bc3b759 3479 use_shadow = true;
df6a2945 3480 } else {
99d43365
CB
3481 /* In case unprivileged users run application containers via
3482 * execute() or a start*() there are valid cases where they may
3483 * only want to map their own {g,u}id. Let's not block them from
3484 * doing so by requiring geteuid() == 0.
3485 */
3486 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3487 "write directly with euid %d.", geteuid());
0e6e3a41 3488 }
251d0d2a 3489
986ef930
CB
3490 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3491 type++, u_or_g = 'g') {
3492 pos = mapbuf;
3493
0e6e3a41 3494 if (use_shadow)
986ef930 3495 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
4f7521b4 3496
cf3ef16d 3497 lxc_list_for_each(iterator, idmap) {
4bc3b759
CB
3498 /* The kernel only takes <= 4k for writes to
3499 * /proc/<nr>/[ug]id_map
3500 */
251d0d2a 3501 map = iterator->elem;
cf3ef16d
SH
3502 if (map->idtype != type)
3503 continue;
3504
4bc3b759
CB
3505 had_entry = true;
3506
986ef930 3507 left = LXC_IDMAPLEN - (pos - mapbuf);
d1838f34 3508 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
4bc3b759
CB
3509 use_shadow ? " " : "", map->nsid,
3510 map->hostid, map->range,
0e6e3a41 3511 use_shadow ? "" : "\n");
cf3ef16d 3512 if (fill <= 0 || fill >= left)
4bc3b759
CB
3513 SYSERROR("Too many {g,u}id mappings defined.");
3514
cf3ef16d 3515 pos += fill;
251d0d2a 3516 }
cf3ef16d 3517 if (!had_entry)
4f7521b4 3518 continue;
cf3ef16d 3519
986ef930
CB
3520 /* Try to catch the ouput of new{g,u}idmap to make debugging
3521 * easier.
3522 */
3523 if (use_shadow) {
3524 ret = run_command(cmd_output, sizeof(cmd_output),
3525 lxc_map_ids_exec_wrapper,
3526 (void *)mapbuf);
3527 if (ret < 0) {
3528 ERROR("new%cidmap failed to write mapping: %s",
3529 u_or_g, cmd_output);
3530 return -1;
3531 }
d1838f34 3532 } else {
986ef930
CB
3533 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3534 if (ret < 0)
3535 return -1;
d1838f34 3536 }
986ef930
CB
3537
3538 memset(mapbuf, 0, sizeof(mapbuf));
f6d3e3e4 3539 }
251d0d2a 3540
986ef930 3541 return 0;
f6d3e3e4
SH
3542}
3543
cf3ef16d 3544/*
7b50c609
TS
3545 * return the host uid/gid to which the container root is mapped in
3546 * *val.
0b3a6504 3547 * Return true if id was found, false otherwise.
cf3ef16d 3548 */
2a9a80cb 3549bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3550 unsigned long *val)
cf3ef16d
SH
3551{
3552 struct lxc_list *it;
3553 struct id_map *map;
3554
3555 lxc_list_for_each(it, &conf->id_map) {
3556 map = it->elem;
7b50c609 3557 if (map->idtype != idtype)
cf3ef16d
SH
3558 continue;
3559 if (map->nsid != 0)
3560 continue;
2a9a80cb
SH
3561 *val = map->hostid;
3562 return true;
cf3ef16d 3563 }
2a9a80cb 3564 return false;
cf3ef16d
SH
3565}
3566
2133f58c 3567int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3568{
3569 struct lxc_list *it;
3570 struct id_map *map;
3571 lxc_list_for_each(it, &conf->id_map) {
3572 map = it->elem;
2133f58c 3573 if (map->idtype != idtype)
cf3ef16d
SH
3574 continue;
3575 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3576 return (id - map->hostid) + map->nsid;
cf3ef16d 3577 }
57d116ab 3578 return -1;
cf3ef16d
SH
3579}
3580
339efad9 3581int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3582{
3583 struct lxc_list *it;
3584 struct id_map *map;
2133f58c 3585 unsigned int freeid = 0;
cf3ef16d
SH
3586again:
3587 lxc_list_for_each(it, &conf->id_map) {
3588 map = it->elem;
2133f58c 3589 if (map->idtype != idtype)
cf3ef16d
SH
3590 continue;
3591 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3592 freeid = map->nsid + map->range;
3593 goto again;
3594 }
3595 }
3596 return freeid;
3597}
3598
19a26f82
MK
3599int lxc_find_gateway_addresses(struct lxc_handler *handler)
3600{
3601 struct lxc_list *network = &handler->conf->network;
3602 struct lxc_list *iterator;
3603 struct lxc_netdev *netdev;
3604 int link_index;
3605
3606 lxc_list_for_each(iterator, network) {
3607 netdev = iterator->elem;
3608
3609 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3610 continue;
3611
3612 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3613 ERROR("gateway = auto only supported for "
3614 "veth and macvlan");
3615 return -1;
3616 }
3617
3618 if (!netdev->link) {
3619 ERROR("gateway = auto needs a link interface");
3620 return -1;
3621 }
3622
3623 link_index = if_nametoindex(netdev->link);
3624 if (!link_index)
3625 return -EINVAL;
3626
3627 if (netdev->ipv4_gateway_auto) {
3628 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3629 ERROR("failed to automatically find ipv4 gateway "
3630 "address from link interface '%s'", netdev->link);
3631 return -1;
3632 }
3633 }
3634
3635 if (netdev->ipv6_gateway_auto) {
3636 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3637 ERROR("failed to automatically find ipv6 gateway "
3638 "address from link interface '%s'", netdev->link);
3639 return -1;
3640 }
3641 }
3642 }
3643
3644 return 0;
3645}
3646
5e4a62bf 3647int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3648{
5e4a62bf 3649 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3650 int i, ret;
b0a33c1e 3651
5e4a62bf
DL
3652 /* no tty in the configuration */
3653 if (!conf->tty)
b0a33c1e 3654 return 0;
3655
9e1045e3 3656 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
b0a33c1e 3657 if (!tty_info->pty_info) {
9e1045e3
CB
3658 SYSERROR("failed to allocate struct *pty_info");
3659 return -ENOMEM;
b0a33c1e 3660 }
3661
985d15b1 3662 for (i = 0; i < conf->tty; i++) {
b0a33c1e 3663 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3664
025ed0f3
SH
3665 process_lock();
3666 ret = openpty(&pty_info->master, &pty_info->slave,
9e1045e3 3667 pty_info->name, NULL, NULL);
025ed0f3
SH
3668 process_unlock();
3669 if (ret) {
9e1045e3 3670 SYSERROR("failed to create pty device number %d", i);
985d15b1
MT
3671 tty_info->nbtty = i;
3672 lxc_delete_tty(tty_info);
9e1045e3 3673 return -ENOTTY;
b0a33c1e 3674 }
3675
9e1045e3 3676 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
5332bb84
DL
3677 pty_info->name, pty_info->master, pty_info->slave);
3678
3ec1648d 3679 /* Prevent leaking the file descriptors to the container */
9e1045e3
CB
3680 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3681 if (ret < 0)
3682 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3683 "pty device \"%s\": %s",
3684 pty_info->master, pty_info->name, strerror(errno));
3685
3686 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3687 if (ret < 0)
3688 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3689 "pty device \"%s\": %s",
3690 pty_info->slave, pty_info->name, strerror(errno));
b035ad62 3691
b0a33c1e 3692 pty_info->busy = 0;
3693 }
3694
985d15b1 3695 tty_info->nbtty = conf->tty;
1ac470c0 3696
9e1045e3 3697 INFO("finished allocating %d pts devices", conf->tty);
985d15b1 3698 return 0;
b0a33c1e 3699}
3700
3701void lxc_delete_tty(struct lxc_tty_info *tty_info)
3702{
3703 int i;
3704
3705 for (i = 0; i < tty_info->nbtty; i++) {
3706 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3707
3708 close(pty_info->master);
3709 close(pty_info->slave);
3710 }
3711
3712 free(tty_info->pty_info);
e00c0242 3713 tty_info->pty_info = NULL;
b0a33c1e 3714 tty_info->nbtty = 0;
3715}
3716
f4f52cb5
CB
3717
/* Execute "lxc-usernsexec" with the argument vector passed in @args.
 *
 * The function only returns (with -1) when execvp() itself fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	execvp("lxc-usernsexec", args);

	/* Only reached when the exec did not happen. */
	return -1;
}
3723
f6d3e3e4 3724/*
7b50c609
TS
3725 * chown_mapped_root: for an unprivileged user with uid/gid X to
3726 * chown a dir to subuid/subgid Y, he needs to run chown as root
3727 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3728 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3729 * root is privileged with respect to hostuid/hostgid X, allowing
3730 * him to do the chown.
f6d3e3e4 3731 */
c4d10a05 3732int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3733{
f4f52cb5 3734 uid_t rootuid, rootgid;
2a9a80cb 3735 unsigned long val;
a7ef8753 3736 char *chownpath = path;
f4f52cb5
CB
3737 int hostuid, hostgid, ret;
3738 struct stat sb;
3739 char map1[100], map2[100], map3[100], map4[100], map5[100];
3740 char ugid[100];
3741 char *args1[] = {"lxc-usernsexec",
3742 "-m", map1,
3743 "-m", map2,
3744 "-m", map3,
3745 "-m", map5,
3746 "--", "chown", ugid, path,
3747 NULL};
3748 char *args2[] = {"lxc-usernsexec",
3749 "-m", map1,
3750 "-m", map2,
3751 "-m", map3,
3752 "-m", map4,
3753 "-m", map5,
3754 "--", "chown", ugid, path,
3755 NULL};
3756 char cmd_output[MAXPATHLEN];
3757
3758 hostuid = geteuid();
3759 hostgid = getegid();
f6d3e3e4 3760
2a9a80cb 3761 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
bc80f098 3762 ERROR("No uid mapping for container root");
c4d10a05 3763 return -1;
f6d3e3e4 3764 }
f4f52cb5 3765 rootuid = (uid_t)val;
7b50c609 3766 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
bc80f098 3767 ERROR("No gid mapping for container root");
7b50c609
TS
3768 return -1;
3769 }
f4f52cb5 3770 rootgid = (gid_t)val;
2a9a80cb 3771
a7ef8753 3772 /*
f4f52cb5 3773 * In case of overlay, we want only the writeable layer to be chowned
a7ef8753 3774 */
1f92162d 3775 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3776 chownpath = strchr(path, ':');
3777 if (!chownpath) {
3778 ERROR("Bad overlay path: %s", path);
3779 return -1;
3780 }
f4f52cb5 3781 chownpath = strchr(chownpath + 1, ':');
a7ef8753
SH
3782 if (!chownpath) {
3783 ERROR("Bad overlay path: %s", path);
3784 return -1;
3785 }
3786 chownpath++;
3787 }
3788 path = chownpath;
f4f52cb5 3789 if (hostuid == 0) {
7b50c609 3790 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3791 ERROR("Error chowning %s", path);
3792 return -1;
3793 }
3794 return 0;
3795 }
f3d7e4ca 3796
f4f52cb5 3797 if (rootuid == hostuid) {
f3d7e4ca 3798 // nothing to do
b103ceac 3799 INFO("Container root is our uid; no need to chown");
f3d7e4ca
SH
3800 return 0;
3801 }
3802
bbdbf8f0 3803 /* save the current gid of "path" */
f4f52cb5
CB
3804 if (stat(path, &sb) < 0) {
3805 ERROR("Error stat %s", path);
f6d3e3e4
SH
3806 return -1;
3807 }
7b50c609 3808
bbdbf8f0
CB
3809 /* Update the path argument in case this was overlayfs. */
3810 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3811 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3812
f4f52cb5
CB
3813 /*
3814 * A file has to be group-owned by a gid mapped into the
3815 * container, or the container won't be privileged over it.
3816 */
3817 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3818 if (sb.st_uid == hostuid &&
3819 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3820 chown(path, -1, hostgid) < 0) {
3821 ERROR("Failed chgrping %s", path);
3822 return -1;
3823 }
f6d3e3e4 3824
f4f52cb5
CB
3825 // "u:0:rootuid:1"
3826 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3827 if (ret < 0 || ret >= 100) {
3828 ERROR("Error uid printing map string");
3829 return -1;
3830 }
7b50c609 3831
f4f52cb5
CB
3832 // "u:hostuid:hostuid:1"
3833 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3834 if (ret < 0 || ret >= 100) {
3835 ERROR("Error uid printing map string");
3836 return -1;
3837 }
c4d10a05 3838
f4f52cb5
CB
3839 // "g:0:rootgid:1"
3840 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3841 if (ret < 0 || ret >= 100) {
3842 ERROR("Error gid printing map string");
3843 return -1;
3844 }
98e5ba51 3845
f4f52cb5
CB
3846 // "g:pathgid:rootgid+pathgid:1"
3847 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3848 rootgid + (gid_t)sb.st_gid);
3849 if (ret < 0 || ret >= 100) {
3850 ERROR("Error gid printing map string");
3851 return -1;
3852 }
c4d10a05 3853
f4f52cb5
CB
3854 // "g:hostgid:hostgid:1"
3855 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3856 if (ret < 0 || ret >= 100) {
3857 ERROR("Error gid printing map string");
3858 return -1;
3859 }
7b50c609 3860
f4f52cb5
CB
3861 // "0:pathgid" (chown)
3862 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3863 if (ret < 0 || ret >= 100) {
3864 ERROR("Error owner printing format string for chown");
3865 return -1;
3866 }
7b50c609 3867
f4f52cb5
CB
3868 if (hostgid == sb.st_gid)
3869 ret = run_command(cmd_output, sizeof(cmd_output),
3870 chown_mapped_root_exec_wrapper,
3871 (void *)args1);
3872 else
3873 ret = run_command(cmd_output, sizeof(cmd_output),
3874 chown_mapped_root_exec_wrapper,
3875 (void *)args2);
3876 if (ret < 0)
3877 ERROR("lxc-usernsexec failed: %s", cmd_output);
7b50c609 3878
f4f52cb5 3879 return ret;
f6d3e3e4
SH
3880}
3881
54117de5 3882int lxc_ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3883{
c4d10a05 3884 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3885 return 0;
c4d10a05 3886
54117de5
CB
3887 if (!strcmp(c->console.name, ""))
3888 return 0;
3889
3890 if (chown_mapped_root(c->console.name, c) < 0) {
3891 ERROR("failed to chown console \"%s\"", c->console.name);
c4d10a05
SH
3892 return -1;
3893 }
3894
54117de5
CB
3895 TRACE("chowned console \"%s\"", c->console.name);
3896
f6d3e3e4
SH
3897 return 0;
3898}
3899
943144d9
CB
3900/* NOTE: Must not be called from inside the container namespace! */
3901int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
5112cd70
SH
3902{
3903 int mounted;
3904
943144d9 3905 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
5112cd70 3906 if (mounted == -1) {
943144d9 3907 SYSERROR("failed to mount /proc in the container");
01958b1f 3908 /* continue only if there is no rootfs */
943144d9 3909 if (conf->rootfs.path)
01958b1f 3910 return -1;
5112cd70 3911 } else if (mounted == 1) {
943144d9 3912 conf->tmp_umount_proc = 1;
5112cd70 3913 }
943144d9 3914
5112cd70
SH
3915 return 0;
3916}
3917
3918void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3919{
3920 if (lxc_conf->tmp_umount_proc == 1) {
3921 umount("/proc");
3922 lxc_conf->tmp_umount_proc = 0;
3923 }
3924}
3925
/* Walk /proc/self/mountinfo and remount every entry whose options field
 * contains "shared" with MS_SLAVE.
 *
 * This is best effort: failing to open the file or to remount an entry is
 * logged and otherwise ignored.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t cap = 0;
	FILE *mounts = fopen("/proc/self/mountinfo", "r");

	if (!mounts) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &cap, mounts) != -1) {
		char *target, *opts;

		target = get_field(line, 4);
		if (!target)
			continue;

		/* The options field is located relative to the target. */
		opts = get_field(target, 2);
		if (!opts)
			continue;

		null_endofword(opts);
		if (strstr(opts, "shared")) {
			null_endofword(target);
			if (mount(NULL, target, NULL, MS_SLAVE, NULL)) {
				SYSERROR("Failed to make %s rslave", target);
				ERROR("Continuing...");
			}
		}
	}

	fclose(mounts);
	free(line);
}
3959
2322903b
SH
3960void lxc_execute_bind_init(struct lxc_conf *conf)
3961{
3962 int ret;
9d9c111c
SH
3963 char path[PATH_MAX], destpath[PATH_MAX], *p;
3964
3965 /* If init exists in the container, don't bind mount a static one */
3966 p = choose_init(conf->rootfs.mount);
3967 if (p) {
3968 free(p);
3969 return;
3970 }
2322903b
SH
3971
3972 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3973 if (ret < 0 || ret >= PATH_MAX) {
3974 WARN("Path name too long searching for lxc.init.static");
3975 return;
3976 }
3977
3978 if (!file_exists(path)) {
3979 INFO("%s does not exist on host", path);
3980 return;
3981 }
3982
3983 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3984 if (ret < 0 || ret >= PATH_MAX) {
3985 WARN("Path name too long for container's lxc.init.static");
3986 return;
3987 }
3988
3989 if (!file_exists(destpath)) {
3990 FILE * pathfile = fopen(destpath, "wb");
3991 if (!pathfile) {
3992 SYSERROR("Failed to create mount target '%s'", destpath);
3993 return;
3994 }
3995 fclose(pathfile);
3996 }
3997
592fd47a 3998 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
2322903b
SH
3999 if (ret < 0)
4000 SYSERROR("Failed to bind lxc.init.static into container");
4001 INFO("lxc.init.static bound into container at %s", path);
4002}
4003
35120d9c
SH
4004/*
4005 * This does the work of remounting / if it is shared, calling the
4006 * container pre-mount hooks, and mounting the rootfs.
4007 */
4008int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 4009{
35120d9c
SH
4010 if (conf->rootfs_setup) {
4011 /*
4012 * rootfs was set up in another namespace. bind-mount it
4013 * to give us a mount in our own ns so we can pivot_root to it
4014 */
4015 const char *path = conf->rootfs.mount;
4016 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4017 ERROR("Failed to bind-mount container / onto itself");
145832ba 4018 return -1;
35120d9c 4019 }
145832ba 4020 return 0;
35120d9c 4021 }
d4ef7c50 4022
e995d7a2
SH
4023 remount_all_slave();
4024
35120d9c
SH
4025 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4026 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4027 return -1;
4028 }
4029
9aa76a17 4030 if (lxc_setup_rootfs(conf)) {
35120d9c
SH
4031 ERROR("failed to setup rootfs for '%s'", name);
4032 return -1;
4033 }
4034
4035 conf->rootfs_setup = true;
4036 return 0;
4037}
4038
1c1c7051
SH
4039static bool verify_start_hooks(struct lxc_conf *conf)
4040{
4041 struct lxc_list *it;
4042 char path[MAXPATHLEN];
4043 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4044 char *hookname = it->elem;
4045 struct stat st;
4046 int ret;
4047
4048 ret = snprintf(path, MAXPATHLEN, "%s%s",
7b6753e7 4049 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
1c1c7051
SH
4050 if (ret < 0 || ret >= MAXPATHLEN)
4051 return false;
4052 ret = stat(path, &st);
4053 if (ret) {
7b6753e7 4054 SYSERROR("Start hook %s not found in container",
1c1c7051
SH
4055 hookname);
4056 return false;
4057 }
6a0c909a 4058 return true;
1c1c7051
SH
4059 }
4060
4061 return true;
4062}
4063
ae467c54 4064static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
e8bd4e43 4065{
ae467c54
CB
4066 int i;
4067 int *ttyfds;
4068 struct lxc_pty_info *pty_info;
e8bd4e43
SH
4069 struct lxc_conf *conf = handler->conf;
4070 const struct lxc_tty_info *tty_info = &conf->tty_info;
e8bd4e43 4071 int sock = handler->ttysock[0];
ae467c54
CB
4072 int ret = -1;
4073 size_t num_ttyfds = (2 * conf->tty);
e8bd4e43 4074
ae467c54
CB
4075 ttyfds = malloc(num_ttyfds * sizeof(int));
4076 if (!ttyfds)
4077 return -1;
4078
4079 for (i = 0; i < num_ttyfds; i++) {
4080 pty_info = &tty_info->pty_info[i / 2];
4081 ttyfds[i++] = pty_info->slave;
4082 ttyfds[i] = pty_info->master;
4083 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
f07fa8df
CB
4084 "parent",
4085 pty_info->name, pty_info->master, pty_info->slave);
e8bd4e43
SH
4086 }
4087
ae467c54
CB
4088 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4089 if (ret < 0)
4090 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4091 strerror(errno));
4092 else
4093 TRACE("sent %d ttys to parent", conf->tty);
4094
e8bd4e43
SH
4095 close(handler->ttysock[0]);
4096 close(handler->ttysock[1]);
4097
ae467c54
CB
4098 for (i = 0; i < num_ttyfds; i++)
4099 close(ttyfds[i]);
e8bd4e43 4100
ae467c54
CB
4101 free(ttyfds);
4102
4103 return ret;
e8bd4e43
SH
4104}
4105
35120d9c
SH
4106int lxc_setup(struct lxc_handler *handler)
4107{
4108 const char *name = handler->name;
4109 struct lxc_conf *lxc_conf = handler->conf;
4110 const char *lxcpath = handler->lxcpath;
35120d9c
SH
4111
4112 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4113 ERROR("Error setting up rootfs mount after spawn");
4114 return -1;
4115 }
4116
6c544cb3
MM
4117 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4118 if (setup_utsname(lxc_conf->utsname)) {
4119 ERROR("failed to setup the utsname for '%s'", name);
4120 return -1;
4121 }
0ad19a3f 4122 }
4123
e337179a
CB
4124 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4125 &lxc_conf->network)) {
36eb9bde 4126 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4127 return -1;
0ad19a3f 4128 }
4129
bc6928ff 4130 if (lxc_conf->autodev > 0) {
14221cbb 4131 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
91c3830e 4132 ERROR("failed to mount /dev in the container");
c6883f38
SH
4133 return -1;
4134 }
4135 }
4136
368bbc02
CS
4137 /* do automatic mounts (mainly /proc and /sys), but exclude
4138 * those that need to wait until other stuff has finished
4139 */
4fb3cba5 4140 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4141 ERROR("failed to setup the automatic mounts for '%s'", name);
4142 return -1;
4143 }
4144
0a2dddd4 4145 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
36eb9bde 4146 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4147 return -1;
576f946d 4148 }
4149
0a2dddd4 4150 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
e7938e9e
MN
4151 ERROR("failed to setup the mount entries for '%s'", name);
4152 return -1;
4153 }
4154
7b6753e7 4155 /* Make sure any start hooks are in the container */
1c1c7051
SH
4156 if (!verify_start_hooks(lxc_conf))
4157 return -1;
4158
2322903b
SH
4159 if (lxc_conf->is_execute)
4160 lxc_execute_bind_init(lxc_conf);
4161
368bbc02
CS
4162 /* now mount only cgroup, if wanted;
4163 * before, /sys could not have been mounted
4164 * (is either mounted automatically or via fstab entries)
4165 */
4fb3cba5 4166 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4167 ERROR("failed to setup the automatic mounts for '%s'", name);
4168 return -1;
4169 }
4170
283678ed 4171 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4172 ERROR("failed to run mount hooks for container '%s'.", name);
4173 return -1;
4174 }
4175
bc6928ff 4176 if (lxc_conf->autodev > 0) {
283678ed 4177 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4178 ERROR("failed to run autodev hooks for container '%s'.", name);
4179 return -1;
4180 }
27245ff7 4181 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
91c3830e
SH
4182 ERROR("failed to populate /dev in the container");
4183 return -1;
4184 }
4185 }
368bbc02 4186
3d7d929a 4187 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4188 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4189 return -1;
6e590161 4190 }
4191
69aa6655
DE
4192 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4193 ERROR("failed to setup /dev symlinks for '%s'", name);
4194 return -1;
4195 }
4196
5112cd70 4197 /* mount /proc if it's not already there */
943144d9 4198 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4199 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4200 return -1;
e075f5d9 4201 }
e075f5d9 4202
ac778708 4203 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4204 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4205 return -1;
ed502555 4206 }
4207
70761e5e 4208 if (lxc_setup_devpts(lxc_conf->pts)) {
36eb9bde 4209 ERROR("failed to setup the new pts instance");
95b5ffaf 4210 return -1;
3c26f34e 4211 }
4212
e8bd4e43
SH
4213 if (lxc_create_tty(name, lxc_conf)) {
4214 ERROR("failed to create the ttys");
4215 return -1;
4216 }
4217
ae467c54 4218 if (lxc_send_ttys_to_parent(handler) < 0) {
e8bd4e43
SH
4219 ERROR("failure sending console info to parent");
4220 return -1;
4221 }
4222
9e1045e3 4223 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
e8bd4e43
SH
4224 ERROR("failed to setup the ttys for '%s'", name);
4225 return -1;
4226 }
4227
4228 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4229 SYSERROR("failed to set environment variable for container ptys");
4230
4231
cccc74b5
DL
4232 if (setup_personality(lxc_conf->personality)) {
4233 ERROR("failed to setup personality");
4234 return -1;
4235 }
4236
97a8f74f
SG
4237 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4238 if (!lxc_list_empty(&lxc_conf->caps)) {
7389ca26 4239 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
f6d3e3e4
SH
4240 return -1;
4241 }
97a8f74f
SG
4242 if (dropcaps_except(&lxc_conf->keepcaps)) {
4243 ERROR("failed to keep requested caps");
4244 return -1;
4245 }
4246 } else if (setup_caps(&lxc_conf->caps)) {
4247 ERROR("failed to drop capabilities");
4248 return -1;
81810dd1
DL
4249 }
4250
f4152036 4251 NOTICE("Container \"%s\" is set up", name);
cd54d859 4252
0ad19a3f 4253 return 0;
4254}
26ddeedd 4255
283678ed
SH
4256int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4257 const char *lxcpath, char *argv[])
26ddeedd
SH
4258{
4259 int which = -1;
4260 struct lxc_list *it;
4261
4262 if (strcmp(hook, "pre-start") == 0)
4263 which = LXCHOOK_PRESTART;
5ea6163a
SH
4264 else if (strcmp(hook, "pre-mount") == 0)
4265 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4266 else if (strcmp(hook, "mount") == 0)
4267 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4268 else if (strcmp(hook, "autodev") == 0)
4269 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4270 else if (strcmp(hook, "start") == 0)
4271 which = LXCHOOK_START;
52492063
WB
4272 else if (strcmp(hook, "stop") == 0)
4273 which = LXCHOOK_STOP;
26ddeedd
SH
4274 else if (strcmp(hook, "post-stop") == 0)
4275 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4276 else if (strcmp(hook, "clone") == 0)
4277 which = LXCHOOK_CLONE;
37cf711b
SY
4278 else if (strcmp(hook, "destroy") == 0)
4279 which = LXCHOOK_DESTROY;
26ddeedd
SH
4280 else
4281 return -1;
4282 lxc_list_for_each(it, &conf->hooks[which]) {
4283 int ret;
4284 char *hookname = it->elem;
283678ed 4285 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4286 if (ret)
4287 return ret;
4288 }
4289 return 0;
4290}
72d0e1cb 4291
72d0e1cb
SG
4292int lxc_clear_config_caps(struct lxc_conf *c)
4293{
9ebb03ad 4294 struct lxc_list *it,*next;
72d0e1cb 4295
9ebb03ad 4296 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4297 lxc_list_del(it);
4298 free(it->elem);
4299 free(it);
4300 }
4301 return 0;
4302}
4303
74a3920a 4304static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4305 struct lxc_list *it, *next;
4306
4355ab5f 4307 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4308 lxc_list_del(it);
4309 free(it->elem);
4310 free(it);
4311 }
4312 return 0;
4313}
4314
4355ab5f
SH
4315int lxc_clear_idmaps(struct lxc_conf *c)
4316{
4317 return lxc_free_idmap(&c->id_map);
4318}
4319
1fb86a7c
SH
4320int lxc_clear_config_keepcaps(struct lxc_conf *c)
4321{
4322 struct lxc_list *it,*next;
4323
4324 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4325 lxc_list_del(it);
4326 free(it->elem);
4327 free(it);
4328 }
4329 return 0;
4330}
4331
12a50cc6 4332int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4333{
9ebb03ad 4334 struct lxc_list *it,*next;
72d0e1cb 4335 bool all = false;
a6390f01 4336 const char *k = NULL;
72d0e1cb
SG
4337
4338 if (strcmp(key, "lxc.cgroup") == 0)
4339 all = true;
a6390f01
WB
4340 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4341 k = key + sizeof("lxc.cgroup.")-1;
4342 else
4343 return -1;
72d0e1cb 4344
9ebb03ad 4345 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4346 struct lxc_cgroup *cg = it->elem;
4347 if (!all && strcmp(cg->subsystem, k) != 0)
4348 continue;
4349 lxc_list_del(it);
4350 free(cg->subsystem);
4351 free(cg->value);
4352 free(cg);
4353 free(it);
4354 }
4355 return 0;
4356}
4357
c6d09e15
WB
4358int lxc_clear_limits(struct lxc_conf *c, const char *key)
4359{
4360 struct lxc_list *it, *next;
4361 bool all = false;
4362 const char *k = NULL;
4363
240d4b74 4364 if (strcmp(key, "lxc.limit") == 0
4365 || strcmp(key, "lxc.prlimit"))
c6d09e15
WB
4366 all = true;
4367 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4368 k = key + sizeof("lxc.limit.")-1;
240d4b74 4369 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
4370 k = key + sizeof("lxc.prlimit.")-1;
c6d09e15
WB
4371 else
4372 return -1;
4373
4374 lxc_list_for_each_safe(it, &c->limits, next) {
4375 struct lxc_limit *lim = it->elem;
4376 if (!all && strcmp(lim->resource, k) != 0)
4377 continue;
4378 lxc_list_del(it);
4379 free(lim->resource);
4380 free(lim);
4381 free(it);
4382 }
4383 return 0;
4384}
4385
ee1e7aa0
SG
4386int lxc_clear_groups(struct lxc_conf *c)
4387{
4388 struct lxc_list *it,*next;
4389
4390 lxc_list_for_each_safe(it, &c->groups, next) {
4391 lxc_list_del(it);
4392 free(it->elem);
4393 free(it);
4394 }
4395 return 0;
4396}
4397
ab799c0b
SG
4398int lxc_clear_environment(struct lxc_conf *c)
4399{
4400 struct lxc_list *it,*next;
4401
4402 lxc_list_for_each_safe(it, &c->environment, next) {
4403 lxc_list_del(it);
4404 free(it->elem);
4405 free(it);
4406 }
4407 return 0;
4408}
4409
4410
72d0e1cb
SG
4411int lxc_clear_mount_entries(struct lxc_conf *c)
4412{
9ebb03ad 4413 struct lxc_list *it,*next;
72d0e1cb 4414
9ebb03ad 4415 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4416 lxc_list_del(it);
4417 free(it->elem);
4418 free(it);
4419 }
4420 return 0;
4421}
4422
b099e9e9
SH
4423int lxc_clear_automounts(struct lxc_conf *c)
4424{
4425 c->auto_mounts = 0;
4426 return 0;
4427}
4428
12a50cc6 4429int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4430{
9ebb03ad 4431 struct lxc_list *it,*next;
17ed13a3 4432 bool all = false, done = false;
a6390f01 4433 const char *k = NULL;
72d0e1cb
SG
4434 int i;
4435
17ed13a3
SH
4436 if (strcmp(key, "lxc.hook") == 0)
4437 all = true;
a6390f01
WB
4438 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4439 k = key + sizeof("lxc.hook.")-1;
4440 else
4441 return -1;
17ed13a3 4442
72d0e1cb 4443 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4444 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4445 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4446 lxc_list_del(it);
4447 free(it->elem);
4448 free(it);
4449 }
4450 done = true;
72d0e1cb
SG
4451 }
4452 }
17ed13a3
SH
4453
4454 if (!done) {
4455 ERROR("Invalid hook key: %s", key);
4456 return -1;
4457 }
72d0e1cb
SG
4458 return 0;
4459}
8eb5694b 4460
74a3920a 4461static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4462{
4463 int i;
4464
0cf45501 4465 if (!conf->saved_nics)
7b35f3d6
SH
4466 return;
4467 for (i=0; i < conf->num_savednics; i++)
4468 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4469 free(conf->saved_nics);
4470}
4471
4184c3e1
SH
4472static inline void lxc_clear_aliens(struct lxc_conf *conf)
4473{
4474 struct lxc_list *it,*next;
4475
4476 lxc_list_for_each_safe(it, &conf->aliens, next) {
4477 lxc_list_del(it);
4478 free(it->elem);
4479 free(it);
4480 }
4481}
4482
c7b15d1e 4483void lxc_clear_includes(struct lxc_conf *conf)
f979ac15
SH
4484{
4485 struct lxc_list *it,*next;
4486
4487 lxc_list_for_each_safe(it, &conf->includes, next) {
4488 lxc_list_del(it);
4489 free(it->elem);
4490 free(it);
4491 }
4492}
4493
8eb5694b
SH
4494void lxc_conf_free(struct lxc_conf *conf)
4495{
4496 if (!conf)
4497 return;
858377e4
SH
4498 if (current_config == conf)
4499 current_config = NULL;
f10fad2f
ME
4500 free(conf->console.log_path);
4501 free(conf->console.path);
4502 free(conf->rootfs.mount);
b3b8c97f 4503 free(conf->rootfs.bdev_type);
f10fad2f
ME
4504 free(conf->rootfs.options);
4505 free(conf->rootfs.path);
f10fad2f 4506 free(conf->logfile);
858377e4
SH
4507 if (conf->logfd != -1)
4508 close(conf->logfd);
f10fad2f
ME
4509 free(conf->utsname);
4510 free(conf->ttydir);
4511 free(conf->fstab);
4512 free(conf->rcfile);
4513 free(conf->init_cmd);
6b0d5538 4514 free(conf->unexpanded_config);
393903d1 4515 free(conf->pty_names);
76d0127f 4516 free(conf->syslog);
c302b476 4517 lxc_free_networks(&conf->network);
f10fad2f
ME
4518 free(conf->lsm_aa_profile);
4519 free(conf->lsm_se_context);
769872f9 4520 lxc_seccomp_free(conf);
8eb5694b 4521 lxc_clear_config_caps(conf);
1fb86a7c 4522 lxc_clear_config_keepcaps(conf);
8eb5694b 4523 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4524 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4525 lxc_clear_mount_entries(conf);
7b35f3d6 4526 lxc_clear_saved_nics(conf);
27c27d73 4527 lxc_clear_idmaps(conf);
ee1e7aa0 4528 lxc_clear_groups(conf);
f979ac15 4529 lxc_clear_includes(conf);
761d81ca 4530 lxc_clear_aliens(conf);
ab799c0b 4531 lxc_clear_environment(conf);
240d4b74 4532 lxc_clear_limits(conf, "lxc.prlimit");
8eb5694b
SH
4533 free(conf);
4534}
4355ab5f
SH
4535
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function the child calls once the parent told it to proceed. */
	int (*fn)(void *);
	/* Name of @fn used for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Pipe used by the parent to signal the child: [0] read, [1] write. */
	int p[2];
};
4542
4543static int run_userns_fn(void *data)
4544{
4545 struct userns_fn_data *d = data;
4546 char c;
4355ab5f 4547
f8aa4bf3 4548 /* Close write end of the pipe. */
4355ab5f 4549 close(d->p[1]);
f8aa4bf3
CB
4550
4551 /* Wait for parent to finish establishing a new mapping in the user
4552 * namespace we are executing in.
4553 */
4355ab5f
SH
4554 if (read(d->p[0], &c, 1) != 1)
4555 return -1;
f8aa4bf3
CB
4556
4557 /* Close read end of the pipe. */
4355ab5f 4558 close(d->p[0]);
f8aa4bf3 4559
c9b7c33e
CB
4560 if (d->fn_name)
4561 TRACE("calling function \"%s\"", d->fn_name);
f8aa4bf3 4562 /* Call function to run. */
4355ab5f
SH
4563 return d->fn(d->arg);
4564}
4565
339efad9 4566static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
f8aa4bf3
CB
4567 enum idtype idtype)
4568{
4569 struct lxc_list *it;
4570 struct id_map *map;
4571 struct id_map *retmap = NULL;
4572
4573 lxc_list_for_each(it, &conf->id_map) {
4574 map = it->elem;
4575 if (map->idtype != idtype)
4576 continue;
4577
4578 if (id >= map->hostid && id < map->hostid + map->range) {
4579 retmap = map;
4580 break;
4581 }
4582 }
4583
4584 if (!retmap)
4585 return NULL;
4586
4587 retmap = malloc(sizeof(*retmap));
4588 if (!retmap)
4589 return NULL;
4590
4591 memcpy(retmap, map, sizeof(*retmap));
4592 return retmap;
4593}
4594
4355ab5f 4595/*
f8aa4bf3
CB
4596 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4597 * existing one or establish a new one.
4355ab5f 4598 */
28a2d9e7 4599static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4355ab5f 4600{
28a2d9e7 4601 int hostid_mapped;
f8aa4bf3 4602 struct id_map *entry = NULL;
f8aa4bf3 4603
28a2d9e7
CB
4604 /* Reuse existing mapping. */
4605 entry = mapped_hostid_entry(conf, id, type);
4606 if (entry)
4607 return entry;
f8aa4bf3 4608
28a2d9e7
CB
4609 /* Find new mapping. */
4610 hostid_mapped = find_unmapped_nsid(conf, type);
4611 if (hostid_mapped < 0) {
4612 DEBUG("failed to find free mapping for id %d", id);
4613 return NULL;
f8aa4bf3 4614 }
f8aa4bf3 4615
28a2d9e7
CB
4616 entry = malloc(sizeof(*entry));
4617 if (!entry)
4618 return NULL;
4355ab5f 4619
28a2d9e7
CB
4620 entry->idtype = type;
4621 entry->nsid = hostid_mapped;
4622 entry->hostid = (unsigned long)id;
4623 entry->range = 1;
4355ab5f 4624
28a2d9e7 4625 return entry;
4355ab5f
SH
4626}
4627
f8aa4bf3
CB
4628/* Run a function in a new user namespace.
4629 * The caller's euid/egid will be mapped if it is not already.
4630 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4631 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4632 * This means we require only to establish a mapping from:
4633 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4634 * - the container root -> some sub{g,u}id
4635 * The former we add, if the user did not specifiy a mapping. The latter we
4636 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4637 * there to start the container in the first place.
4355ab5f 4638 */
c9b7c33e
CB
4639int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4640 const char *fn_name)
4355ab5f 4641{
f8aa4bf3
CB
4642 pid_t pid;
4643 uid_t euid, egid;
4355ab5f 4644 struct userns_fn_data d;
4355ab5f 4645 int p[2];
f8aa4bf3
CB
4646 struct lxc_list *it;
4647 struct id_map *map;
4648 char c = '1';
4649 int ret = -1;
4650 struct lxc_list *idmap = NULL, *tmplist = NULL;
28a2d9e7
CB
4651 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4652 *host_uid_map = NULL, *host_gid_map = NULL;
4355ab5f 4653
4355ab5f 4654 ret = pipe(p);
4355ab5f
SH
4655 if (ret < 0) {
4656 SYSERROR("opening pipe");
4657 return -1;
4658 }
4659 d.fn = fn;
c9b7c33e 4660 d.fn_name = fn_name;
4355ab5f
SH
4661 d.arg = data;
4662 d.p[0] = p[0];
4663 d.p[1] = p[1];
f8aa4bf3
CB
4664
4665 /* Clone child in new user namespace. */
4355ab5f 4666 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
f8aa4bf3
CB
4667 if (pid < 0) {
4668 ERROR("failed to clone child process in new user namespace");
4669 goto on_error;
4670 }
4671
4355ab5f 4672 close(p[0]);
4355ab5f
SH
4673 p[0] = -1;
4674
f8aa4bf3
CB
4675 /* Find container root. */
4676 lxc_list_for_each(it, &conf->id_map) {
4677 map = it->elem;
4678
4679 if (map->nsid != 0)
4680 continue;
4681
4682 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4683 container_root_uid = malloc(sizeof(*container_root_uid));
4684 if (!container_root_uid)
4685 goto on_error;
4686 container_root_uid->idtype = map->idtype;
4687 container_root_uid->hostid = map->hostid;
4688 container_root_uid->nsid = 0;
4689 container_root_uid->range = map->range;
4690 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4691 container_root_gid = malloc(sizeof(*container_root_gid));
4692 if (!container_root_gid)
4693 goto on_error;
4694 container_root_gid->idtype = map->idtype;
4695 container_root_gid->hostid = map->hostid;
4696 container_root_gid->nsid = 0;
4697 container_root_gid->range = map->range;
4698 }
4699
4700 /* Found container root. */
4701 if (container_root_uid && container_root_gid)
4702 break;
4703 }
4704
4705 /* This is actually checked earlier but it can't hurt. */
4706 if (!container_root_uid || !container_root_gid) {
4707 ERROR("no mapping for container root found");
4708 goto on_error;
4709 }
4710
1d90e064
CB
4711 host_uid_map = container_root_uid;
4712 host_gid_map = container_root_gid;
4713
f8aa4bf3
CB
4714 /* Check whether the {g,u}id of the user has a mapping. */
4715 euid = geteuid();
4716 egid = getegid();
1d90e064 4717 if (euid != container_root_uid->hostid)
28a2d9e7
CB
4718 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4719
1d90e064 4720 if (egid != container_root_gid->hostid)
28a2d9e7
CB
4721 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4722
4723 if (!host_uid_map) {
4724 DEBUG("failed to find mapping for uid %d", euid);
f8aa4bf3
CB
4725 goto on_error;
4726 }
4727
28a2d9e7
CB
4728 if (!host_gid_map) {
4729 DEBUG("failed to find mapping for gid %d", egid);
4730 goto on_error;
4731 }
4732
4733 /* Allocate new {g,u}id map list. */
4734 idmap = malloc(sizeof(*idmap));
4735 if (!idmap)
4736 goto on_error;
4737 lxc_list_init(idmap);
4738
f8aa4bf3
CB
4739 /* Add container root to the map. */
4740 tmplist = malloc(sizeof(*tmplist));
4741 if (!tmplist)
4742 goto on_error;
4743 lxc_list_add_elem(tmplist, container_root_uid);
4744 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4745
1d90e064 4746 if (host_uid_map && (host_uid_map != container_root_uid)) {
28a2d9e7
CB
4747 /* idmap will now keep track of that memory. */
4748 container_root_uid = NULL;
4749
4750 /* Add container root to the map. */
4751 tmplist = malloc(sizeof(*tmplist));
4752 if (!tmplist)
4753 goto on_error;
4754 lxc_list_add_elem(tmplist, host_uid_map);
4755 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4756 }
1d90e064
CB
4757 /* idmap will now keep track of that memory. */
4758 container_root_uid = NULL;
4759 /* idmap will now keep track of that memory. */
4760 host_uid_map = NULL;
f8aa4bf3
CB
4761
4762 tmplist = malloc(sizeof(*tmplist));
4763 if (!tmplist)
4764 goto on_error;
4765 lxc_list_add_elem(tmplist, container_root_gid);
4766 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4767
1d90e064 4768 if (host_gid_map && (host_gid_map != container_root_gid)) {
28a2d9e7
CB
4769 /* idmap will now keep track of that memory. */
4770 container_root_gid = NULL;
4771
4772 tmplist = malloc(sizeof(*tmplist));
4773 if (!tmplist)
4774 goto on_error;
4775 lxc_list_add_elem(tmplist, host_gid_map);
4776 lxc_list_add_tail(idmap, tmplist);
28a2d9e7 4777 }
1d90e064
CB
4778 /* idmap will now keep track of that memory. */
4779 container_root_gid = NULL;
4780 /* idmap will now keep track of that memory. */
4781 host_gid_map = NULL;
f8aa4bf3 4782
4b73005c
CB
4783 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4784 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
f8aa4bf3
CB
4785 lxc_list_for_each(it, idmap) {
4786 map = it->elem;
4787 TRACE("establishing %cid mapping for \"%d\" in new "
4788 "user namespace: nsuid %lu - hostid %lu - range "
4789 "%lu",
4790 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4791 map->nsid, map->hostid, map->range);
4792 }
4355ab5f
SH
4793 }
4794
f8aa4bf3 4795 /* Set up {g,u}id mapping for user namespace of child process. */
4355ab5f 4796 ret = lxc_map_ids(idmap, pid);
f8aa4bf3
CB
4797 if (ret < 0) {
4798 ERROR("error setting up {g,u}id mappings for child process "
4799 "\"%d\"",
4800 pid);
4801 goto on_error;
4355ab5f
SH
4802 }
4803
f8aa4bf3 4804 /* Tell child to proceed. */
4355ab5f 4805 if (write(p[1], &c, 1) != 1) {
f8aa4bf3
CB
4806 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4807 goto on_error;
4355ab5f
SH
4808 }
4809
f8aa4bf3 4810 /* Wait for child to finish. */
3139aead
SG
4811 ret = wait_for_pid(pid);
4812
f8aa4bf3 4813on_error:
1d90e064
CB
4814 if (idmap)
4815 lxc_free_idmap(idmap);
4816 if (container_root_uid)
4817 free(container_root_uid);
4818 if (container_root_gid)
4819 free(container_root_gid);
4820 if (host_uid_map && (host_uid_map != container_root_uid))
4821 free(host_uid_map);
4822 if (host_gid_map && (host_gid_map != container_root_gid))
4823 free(host_gid_map);
3139aead 4824
4355ab5f
SH
4825 if (p[0] != -1)
4826 close(p[0]);
4827 close(p[1]);
f8aa4bf3
CB
4828
4829 return ret;
4355ab5f 4830}
97e9cfa0 4831
/*
 * Return a newly allocated copy of the user name belonging to the effective
 * uid, or NULL if the lookup or the allocation fails. The caller frees it.
 *
 * Not thread-safe, do not use from api without first forking.
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4843
/*
 * Return a newly allocated copy of the group name belonging to the effective
 * gid, or NULL if the lookup or the allocation fails. The caller frees it.
 *
 * Not thread-safe, do not use from api without first forking.
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4855
a96a8e8c 4856/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4857void suggest_default_idmap(void)
4858{
4859 FILE *f;
4860 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4861 char *line = NULL;
4862 char *uname, *gname;
4863 size_t len = 0;
4864
4865 if (!(uname = getuname()))
4866 return;
4867
4868 if (!(gname = getgname())) {
4869 free(uname);
4870 return;
4871 }
4872
4873 f = fopen(subuidfile, "r");
4874 if (!f) {
4875 ERROR("Your system is not configured with subuids");
4876 free(gname);
4877 free(uname);
4878 return;
4879 }
4880 while (getline(&line, &len, f) != -1) {
b7930180 4881 size_t no_newline = 0;
97e9cfa0
SH
4882 char *p = strchr(line, ':'), *p2;
4883 if (*line == '#')
4884 continue;
4885 if (!p)
4886 continue;
4887 *p = '\0';
4888 p++;
4889 if (strcmp(line, uname))
4890 continue;
4891 p2 = strchr(p, ':');
4892 if (!p2)
4893 continue;
4894 *p2 = '\0';
4895 p2++;
4896 if (!*p2)
4897 continue;
b7930180
CB
4898 no_newline = strcspn(p2, "\n");
4899 p2[no_newline] = '\0';
4900
b7b2fde4
CB
4901 if (lxc_safe_uint(p, &uid) < 0)
4902 WARN("Could not parse UID.");
4903 if (lxc_safe_uint(p2, &urange) < 0)
4904 WARN("Could not parse UID range.");
97e9cfa0
SH
4905 }
4906 fclose(f);
4907
6be7389a 4908 f = fopen(subgidfile, "r");
97e9cfa0
SH
4909 if (!f) {
4910 ERROR("Your system is not configured with subgids");
4911 free(gname);
4912 free(uname);
4913 return;
4914 }
4915 while (getline(&line, &len, f) != -1) {
b7930180 4916 size_t no_newline = 0;
97e9cfa0
SH
4917 char *p = strchr(line, ':'), *p2;
4918 if (*line == '#')
4919 continue;
4920 if (!p)
4921 continue;
4922 *p = '\0';
4923 p++;
4924 if (strcmp(line, uname))
4925 continue;
4926 p2 = strchr(p, ':');
4927 if (!p2)
4928 continue;
4929 *p2 = '\0';
4930 p2++;
4931 if (!*p2)
4932 continue;
b7930180
CB
4933 no_newline = strcspn(p2, "\n");
4934 p2[no_newline] = '\0';
4935
b7b2fde4
CB
4936 if (lxc_safe_uint(p, &gid) < 0)
4937 WARN("Could not parse GID.");
4938 if (lxc_safe_uint(p2, &grange) < 0)
4939 WARN("Could not parse GID range.");
97e9cfa0
SH
4940 }
4941 fclose(f);
4942
f10fad2f 4943 free(line);
97e9cfa0
SH
4944
4945 if (!urange || !grange) {
4946 ERROR("You do not have subuids or subgids allocated");
4947 ERROR("Unprivileged containers require subuids and subgids");
4948 return;
4949 }
4950
4951 ERROR("You must either run as root, or define uid mappings");
4952 ERROR("To pass uid mappings to lxc-create, you could create");
4953 ERROR("~/.config/lxc/default.conf:");
4954 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4955 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4956 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4957
4958 free(gname);
4959 free(uname);
4960}
aaf26830 4961
a7307747
SH
4962static void free_cgroup_settings(struct lxc_list *result)
4963{
4964 struct lxc_list *iterator, *next;
4965
4966 lxc_list_for_each_safe(iterator, result, next) {
4967 lxc_list_del(iterator);
4968 free(iterator);
4969 }
4970 free(result);
4971}
4972
aaf26830
KT
4973/*
4974 * Return the list of cgroup_settings sorted according to the following rules
4975 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4976 */
4977struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4978{
4979 struct lxc_list *result;
4980 struct lxc_list *memsw_limit = NULL;
4981 struct lxc_list *it = NULL;
4982 struct lxc_cgroup *cg = NULL;
4983 struct lxc_list *item = NULL;
4984
4985 result = malloc(sizeof(*result));
fac7c663
KT
4986 if (!result) {
4987 ERROR("failed to allocate memory to sort cgroup settings");
4988 return NULL;
4989 }
aaf26830
KT
4990 lxc_list_init(result);
4991
4992 /*Iterate over the cgroup settings and copy them to the output list*/
4993 lxc_list_for_each(it, cgroup_settings) {
4994 item = malloc(sizeof(*item));
fac7c663
KT
4995 if (!item) {
4996 ERROR("failed to allocate memory to sort cgroup settings");
a7307747 4997 free_cgroup_settings(result);
fac7c663
KT
4998 return NULL;
4999 }
aaf26830
KT
5000 item->elem = it->elem;
5001 cg = it->elem;
5002 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5003 /* Store the memsw_limit location */
5004 memsw_limit = item;
5005 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4d5b72a1 5006 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
aaf26830
KT
5007 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5008 item->elem = memsw_limit->elem;
5009 memsw_limit->elem = it->elem;
5010 }
5011 lxc_list_add_tail(result, item);
5012 }
5013
5014 return result;
a7307747 5015}