]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/conf.c
lxc_mount_auto_mounts: honor existing nodev etc at remounts
[mirror_lxc.git] / src / lxc / conf.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
d06245b8
NC
23#include "config.h"
24
0ad19a3f 25#include <stdio.h>
0ad19a3f 26#include <stdlib.h>
e3b4c4c4 27#include <stdarg.h>
0ad19a3f 28#include <errno.h>
29#include <string.h>
30#include <dirent.h>
0ad19a3f 31#include <unistd.h>
bc6928ff 32#include <inttypes.h>
e3b4c4c4 33#include <sys/wait.h>
2d76d1d7 34#include <sys/syscall.h>
97e9cfa0
SH
35#include <sys/types.h>
36#include <pwd.h>
37#include <grp.h>
4a0ba80d 38#include <time.h>
2938f7c8 39#include <sys/statvfs.h>
e827ff7e
SG
40
41#if HAVE_PTY_H
b0a33c1e 42#include <pty.h>
e827ff7e
SG
43#else
44#include <../include/openpty.h>
45#endif
0ad19a3f 46
b3ecde1e
DL
47#include <linux/loop.h>
48
0ad19a3f 49#include <sys/types.h>
50#include <sys/utsname.h>
51#include <sys/param.h>
52#include <sys/stat.h>
53#include <sys/socket.h>
54#include <sys/mount.h>
55#include <sys/mman.h>
81810dd1 56#include <sys/prctl.h>
0ad19a3f 57
58#include <arpa/inet.h>
59#include <fcntl.h>
60#include <netinet/in.h>
61#include <net/if.h>
6f4a3756 62#include <libgen.h>
0ad19a3f 63
e5bda9ee 64#include "network.h"
65#include "error.h"
b2718c72 66#include "parse.h"
1b09f2c0
DL
67#include "utils.h"
68#include "conf.h"
69#include "log.h"
d55bc1ad 70#include "caps.h" /* for lxc_caps_last_cap() */
9be53773 71#include "bdev.h"
368bbc02 72#include "cgroup.h"
025ed0f3 73#include "lxclock.h"
4355ab5f 74#include "namespace.h"
fe4de9a6 75#include "lsm/lsm.h"
d0a36f2c 76
495d2046
SG
77#if HAVE_SYS_CAPABILITY_H
78#include <sys/capability.h>
79#endif
80
6ff05e18
SG
81#if HAVE_SYS_PERSONALITY_H
82#include <sys/personality.h>
83#endif
84
edaf8b1b
SG
85#if IS_BIONIC
86#include <../include/lxcmntent.h>
87#else
88#include <mntent.h>
89#endif
90
769872f9
SH
91#include "lxcseccomp.h"
92
36eb9bde 93lxc_log_define(lxc_conf, lxc);
e5bda9ee 94
0ad19a3f 95#define MAXHWLEN 18
96#define MAXINDEXLEN 20
442cbbe6 97#define MAXMTULEN 16
0ad19a3f 98#define MAXLINELEN 128
99
495d2046 100#if HAVE_SYS_CAPABILITY_H
b09094da
MN
101#ifndef CAP_SETFCAP
102#define CAP_SETFCAP 31
103#endif
104
105#ifndef CAP_MAC_OVERRIDE
106#define CAP_MAC_OVERRIDE 32
107#endif
108
109#ifndef CAP_MAC_ADMIN
110#define CAP_MAC_ADMIN 33
111#endif
495d2046 112#endif
b09094da
MN
113
114#ifndef PR_CAPBSET_DROP
115#define PR_CAPBSET_DROP 24
116#endif
117
9818cae4
SG
118#ifndef LO_FLAGS_AUTOCLEAR
119#define LO_FLAGS_AUTOCLEAR 4
120#endif
121
0769b82a
CS
122/* needed for cgroup automount checks, regardless of whether we
123 * have included linux/capability.h or not */
124#ifndef CAP_SYS_ADMIN
125#define CAP_SYS_ADMIN 21
126#endif
127
2d76d1d7
SG
128/* Define pivot_root() if missing from the C library */
129#ifndef HAVE_PIVOT_ROOT
/*
 * Fallback pivot_root() for C libraries that do not provide it: issue the
 * raw system call when its number is known, otherwise fail with ENOSYS.
 */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
139#else
140extern int pivot_root(const char * new_root, const char * put_old);
141#endif
142
143/* Define sethostname() if missing from the C library */
144#ifndef HAVE_SETHOSTNAME
/*
 * Fallback sethostname() for C libraries that do not provide it: issue the
 * raw system call when its number is known, otherwise fail with ENOSYS.
 */
static int sethostname(const char * name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
154#endif
155
72f919c4
SG
156/* Define __S_ISTYPE if missing from the C library */
157#ifndef __S_ISTYPE
158#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
159#endif
160
72d0e1cb 161char *lxchook_names[NUM_LXC_HOOKS] = {
148e91f5 162 "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
72d0e1cb 163
e3b4c4c4 164typedef int (*instanciate_cb)(struct lxc_handler *, struct lxc_netdev *);
0ad19a3f 165
998ac676
RT
/* One row of the mount option table. */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" or "nosuid" */
	int clear;	/* presumably non-zero when the keyword clears 'flag'
			 * instead of setting it - confirm in the option parser */
	int flag;	/* MS_* bit(s) associated with the keyword */
};
171
81810dd1
DL
/* One row of the capability name table. */
struct caps_opt {
	char *name;	/* lower-case capability name without the "CAP_" prefix */
	int value;	/* matching CAP_* constant */
};
176
0769b82a
CS
177/* Declare this here, since we don't want to reshuffle the whole file. */
178static int in_caplist(int cap, struct lxc_list *caps);
179
e3b4c4c4
ST
180static int instanciate_veth(struct lxc_handler *, struct lxc_netdev *);
181static int instanciate_macvlan(struct lxc_handler *, struct lxc_netdev *);
182static int instanciate_vlan(struct lxc_handler *, struct lxc_netdev *);
183static int instanciate_phys(struct lxc_handler *, struct lxc_netdev *);
184static int instanciate_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 185static int instanciate_none(struct lxc_handler *, struct lxc_netdev *);
82d5ae15 186
24654103
DL
187static instanciate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
188 [LXC_NET_VETH] = instanciate_veth,
189 [LXC_NET_MACVLAN] = instanciate_macvlan,
190 [LXC_NET_VLAN] = instanciate_vlan,
191 [LXC_NET_PHYS] = instanciate_phys,
192 [LXC_NET_EMPTY] = instanciate_empty,
26b797f3 193 [LXC_NET_NONE] = instanciate_none,
0ad19a3f 194};
195
74a2b586
JK
196static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
197static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
198static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
199static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
200static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
26b797f3 201static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
74a2b586
JK
202
203static instanciate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
204 [LXC_NET_VETH] = shutdown_veth,
205 [LXC_NET_MACVLAN] = shutdown_macvlan,
206 [LXC_NET_VLAN] = shutdown_vlan,
207 [LXC_NET_PHYS] = shutdown_phys,
208 [LXC_NET_EMPTY] = shutdown_empty,
26b797f3 209 [LXC_NET_NONE] = shutdown_none,
74a2b586
JK
210};
211
998ac676 212static struct mount_opt mount_opt[] = {
88d413d5
SW
213 { "defaults", 0, 0 },
214 { "ro", 0, MS_RDONLY },
215 { "rw", 1, MS_RDONLY },
216 { "suid", 1, MS_NOSUID },
217 { "nosuid", 0, MS_NOSUID },
218 { "dev", 1, MS_NODEV },
219 { "nodev", 0, MS_NODEV },
220 { "exec", 1, MS_NOEXEC },
221 { "noexec", 0, MS_NOEXEC },
222 { "sync", 0, MS_SYNCHRONOUS },
223 { "async", 1, MS_SYNCHRONOUS },
224 { "dirsync", 0, MS_DIRSYNC },
225 { "remount", 0, MS_REMOUNT },
226 { "mand", 0, MS_MANDLOCK },
227 { "nomand", 1, MS_MANDLOCK },
228 { "atime", 1, MS_NOATIME },
229 { "noatime", 0, MS_NOATIME },
230 { "diratime", 1, MS_NODIRATIME },
231 { "nodiratime", 0, MS_NODIRATIME },
232 { "bind", 0, MS_BIND },
233 { "rbind", 0, MS_BIND|MS_REC },
234 { "relatime", 0, MS_RELATIME },
235 { "norelatime", 1, MS_RELATIME },
236 { "strictatime", 0, MS_STRICTATIME },
237 { "nostrictatime", 1, MS_STRICTATIME },
238 { NULL, 0, 0 },
998ac676
RT
239};
240
495d2046 241#if HAVE_SYS_CAPABILITY_H
81810dd1 242static struct caps_opt caps_opt[] = {
a6afdde9 243 { "chown", CAP_CHOWN },
1e11be34
DL
244 { "dac_override", CAP_DAC_OVERRIDE },
245 { "dac_read_search", CAP_DAC_READ_SEARCH },
246 { "fowner", CAP_FOWNER },
247 { "fsetid", CAP_FSETID },
81810dd1
DL
248 { "kill", CAP_KILL },
249 { "setgid", CAP_SETGID },
250 { "setuid", CAP_SETUID },
251 { "setpcap", CAP_SETPCAP },
252 { "linux_immutable", CAP_LINUX_IMMUTABLE },
253 { "net_bind_service", CAP_NET_BIND_SERVICE },
254 { "net_broadcast", CAP_NET_BROADCAST },
255 { "net_admin", CAP_NET_ADMIN },
256 { "net_raw", CAP_NET_RAW },
257 { "ipc_lock", CAP_IPC_LOCK },
258 { "ipc_owner", CAP_IPC_OWNER },
259 { "sys_module", CAP_SYS_MODULE },
260 { "sys_rawio", CAP_SYS_RAWIO },
261 { "sys_chroot", CAP_SYS_CHROOT },
262 { "sys_ptrace", CAP_SYS_PTRACE },
263 { "sys_pacct", CAP_SYS_PACCT },
264 { "sys_admin", CAP_SYS_ADMIN },
265 { "sys_boot", CAP_SYS_BOOT },
266 { "sys_nice", CAP_SYS_NICE },
267 { "sys_resource", CAP_SYS_RESOURCE },
268 { "sys_time", CAP_SYS_TIME },
269 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
270 { "mknod", CAP_MKNOD },
271 { "lease", CAP_LEASE },
9527e566 272#ifdef CAP_AUDIT_WRITE
81810dd1 273 { "audit_write", CAP_AUDIT_WRITE },
9527e566
FW
274#endif
275#ifdef CAP_AUDIT_CONTROL
81810dd1 276 { "audit_control", CAP_AUDIT_CONTROL },
9527e566 277#endif
81810dd1
DL
278 { "setfcap", CAP_SETFCAP },
279 { "mac_override", CAP_MAC_OVERRIDE },
280 { "mac_admin", CAP_MAC_ADMIN },
5170c716
CS
281#ifdef CAP_SYSLOG
282 { "syslog", CAP_SYSLOG },
283#endif
284#ifdef CAP_WAKE_ALARM
285 { "wake_alarm", CAP_WAKE_ALARM },
286#endif
81810dd1 287};
495d2046
SG
288#else
289static struct caps_opt caps_opt[] = {};
290#endif
81810dd1 291
f0d02950
JTLB
292const char *dev_base_path = "/dev/.lxc";
293const char *dev_user_path = "/dev/.lxc/user";
294
91c3830e
SH
295static int run_buffer(char *buffer)
296{
ebec9176 297 struct lxc_popen_FILE *f;
91c3830e 298 char *output;
8e7da691 299 int ret;
91c3830e 300
ebec9176 301 f = lxc_popen(buffer);
91c3830e
SH
302 if (!f) {
303 SYSERROR("popen failed");
304 return -1;
305 }
306
307 output = malloc(LXC_LOG_BUFFER_SIZE);
308 if (!output) {
309 ERROR("failed to allocate memory for script output");
ebec9176 310 lxc_pclose(f);
91c3830e
SH
311 return -1;
312 }
313
ebec9176 314 while(fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
91c3830e
SH
315 DEBUG("script output: %s", output);
316
317 free(output);
318
ebec9176 319 ret = lxc_pclose(f);
8e7da691 320 if (ret == -1) {
91c3830e
SH
321 SYSERROR("Script exited on error");
322 return -1;
8e7da691
DE
323 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
324 ERROR("Script exited with status %d", WEXITSTATUS(ret));
325 return -1;
326 } else if (WIFSIGNALED(ret)) {
327 ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret),
328 strsignal(WTERMSIG(ret)));
329 return -1;
91c3830e
SH
330 }
331
332 return 0;
333}
334
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it via run_buffer().
 *
 * name:    container name, passed as first script argument
 * section: config section, passed as second script argument
 * script:  path of the script to run
 * hook:    hook name, passed as third script argument
 * lxcpath: currently unused by this function
 * argsin:  optional NULL-terminated array of extra arguments (may be NULL)
 *
 * The command line is built on the heap: its length depends on
 * configuration supplied strings, so an unbounded stack allocation is not
 * safe here.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook, const char *lxcpath,
			   char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	(void)lxcpath;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* every extra argument is preceded by one blank */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two separating blanks plus the terminating NUL */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("failed to allocate memory");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long");
		goto err;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		size_t len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || (size_t)rc >= len) {
			ERROR("Script args too long");
			goto err;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;

err:
	free(buffer);
	return -1;
}
384
751d9dcd
DL
/*
 * Build the command line "<script> <name> <section> [args...]" from a
 * NULL-terminated variadic list of string arguments and execute it via
 * run_buffer().
 *
 * name:    container name, passed as first script argument
 * section: config section, passed as second script argument
 * script:  path of the script to run
 * ...:     extra 'char *' arguments; the list MUST end with a NULL pointer
 *
 * The command line is built on the heap (its length depends on caller
 * supplied strings), and va_end() is called on every path that leaves the
 * function after va_start().
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be built.
 */
static int run_script(const char *name, const char *section,
		      const char *script, ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* first pass: every extra argument is preceded by one blank */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* separating blanks plus the terminating NUL */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("failed to allocate memory");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long");
		free(buffer);
		return -1;
	}

	/* second pass: append the extra arguments */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || (size_t)rc >= len) {
			ERROR("Script args too long");
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
436
/*
 * Per-line callback used while scanning a list of filesystem types: treat
 * the (trimmed) line as a filesystem type and try to mount
 * cbarg->rootfs on cbarg->target with it.
 *
 * buffer: one line of the file being scanned; modified in place
 * data:   pointer to a struct with the same layout as 'struct cbarg' below
 *
 * Returns 1 when the mount succeeded, 0 when this line should be skipped
 * or the mount attempt failed, and -1 when the mount options could not be
 * parsed.
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		const char *options;
	} *cbarg = data;

	/*
	 * Start from known values so that the free() on the error path is
	 * well defined even if parse_mntopts() fails before storing a result.
	 */
	unsigned long mntflags = 0;
	char *mntdata = NULL;
	char *fstype;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev"))
		return 0;

	/* strip leading and trailing garbage around the type name */
	fstype = buffer;
	fstype += lxc_char_left_gc(fstype, strlen(fstype));
	fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';

	/* ignore blank line and comment */
	if (fstype[0] == '\0' || fstype[0] == '#')
		return 0;

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      cbarg->rootfs, cbarg->target, fstype);

	if (parse_mntopts(cbarg->options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	if (mount(cbarg->rootfs, cbarg->target, fstype, mntflags, mntdata)) {
		/* not fatal: the caller goes on with the next candidate type */
		DEBUG("mount failed with error: %s", strerror(errno));
		free(mntdata);
		return 0;
	}
	free(mntdata);

	INFO("mounted '%s' on '%s', with fstype '%s'",
	     cbarg->rootfs, cbarg->target, fstype);

	return 1;
}
481
a17b1e65
SG
482static int mount_unknown_fs(const char *rootfs, const char *target,
483 const char *options)
78ae2fcc 484{
a6afdde9 485 int i;
78ae2fcc 486
487 struct cbarg {
488 const char *rootfs;
a6afdde9 489 const char *target;
a17b1e65 490 const char *options;
78ae2fcc 491 } cbarg = {
492 .rootfs = rootfs,
a6afdde9 493 .target = target,
a17b1e65 494 .options = options,
78ae2fcc 495 };
496
a6afdde9
DL
497 /*
498 * find the filesystem type with brute force:
499 * first we check with /etc/filesystems, in case the modules
78ae2fcc 500 * are auto-loaded and fall back to the supported kernel fs
501 */
502 char *fsfile[] = {
503 "/etc/filesystems",
504 "/proc/filesystems",
505 };
506
a6afdde9
DL
507 for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
508
509 int ret;
510
511 if (access(fsfile[i], F_OK))
512 continue;
513
514 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
515 if (ret < 0) {
516 ERROR("failed to parse '%s'", fsfile[i]);
517 return -1;
518 }
519
520 if (ret)
521 return 0;
78ae2fcc 522 }
523
a6afdde9
DL
524 ERROR("failed to determine fs type for '%s'", rootfs);
525 return -1;
526}
527
a17b1e65
SG
/*
 * Mount a directory rootfs: recursively bind-mount 'rootfs' on 'target',
 * adding whatever flags and data the option string 'options' yields.
 *
 * Returns the result of mount(2), or -1 when the options cannot be parsed.
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	/*
	 * Start from known values so that the free() on the error path is
	 * well defined even if parse_mntopts() fails before storing a result.
	 */
	unsigned long mntflags = 0;
	char *mntdata = NULL;
	int ret;

	if (parse_mntopts(options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata);
	free(mntdata);

	return ret;
}
545
/*
 * Attach the image file 'rootfs' to the loop device open on 'fd' and mark
 * the device autoclear.
 *
 * rootfs: path of the backing file
 * fd:     open file descriptor of a free loop device
 * loinfo: caller-provided scratch structure; overwritten here
 *
 * If the backing file was attached but the autoclear status could not be
 * set, the device is detached again so that it is not left bound.
 *
 * Returns 0 on success, -1 on error.
 */
static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
{
	int rfd;
	int ret = -1;

	rfd = open(rootfs, O_RDWR);
	if (rfd < 0) {
		SYSERROR("failed to open '%s'", rootfs);
		return -1;
	}

	memset(loinfo, 0, sizeof(*loinfo));

	loinfo->lo_flags = LO_FLAGS_AUTOCLEAR;

	if (ioctl(fd, LOOP_SET_FD, rfd)) {
		SYSERROR("failed to LOOP_SET_FD");
		goto out;
	}

	if (ioctl(fd, LOOP_SET_STATUS64, loinfo)) {
		SYSERROR("failed to LOOP_SET_STATUS64");
		/* without autoclear nobody would ever release the device */
		if (ioctl(fd, LOOP_CLR_FD, 0))
			SYSERROR("failed to LOOP_CLR_FD");
		goto out;
	}

	ret = 0;
out:
	close(rfd);

	return ret;
}
577
a17b1e65
SG
/*
 * Mount a regular-file rootfs: look in /dev for the first loop device that
 * is not in use, attach 'rootfs' to it and mount the device on 'target'.
 *
 * Only the first free loop device is tried. Returns 0 on success and -1 if
 * /dev cannot be read, no free loop device is found, or setting up or
 * mounting the device fails.
 */
static int mount_rootfs_file(const char *rootfs, const char *target,
			     const char *options)
{
	struct dirent *direntp;
	struct loop_info64 loinfo;
	int ret = -1, fd, rc;
	DIR *dir;
	char path[MAXPATHLEN];

	dir = opendir("/dev");
	if (!dir) {
		SYSERROR("failed to open '/dev'");
		return -1;
	}

	/*
	 * readdir() is used rather than the deprecated readdir_r(): the
	 * directory stream is private to this function.
	 */
	while ((direntp = readdir(dir))) {

		/* only "loop*" entries matter; this also skips "." and ".." */
		if (strncmp(direntp->d_name, "loop", 4))
			continue;

		rc = snprintf(path, MAXPATHLEN, "/dev/%s", direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN)
			continue;

		fd = open(path, O_RDWR);
		if (fd < 0)
			continue;

		/* a device that reports a status is already in use */
		if (ioctl(fd, LOOP_GET_STATUS64, &loinfo) == 0) {
			close(fd);
			continue;
		}

		/* ENXIO is the "no backing file attached" answer we look for */
		if (errno != ENXIO) {
			WARN("unexpected error for ioctl on '%s': %m",
			     direntp->d_name);
			close(fd);
			continue;
		}

		DEBUG("found '%s' free lodev", path);

		ret = setup_lodev(rootfs, fd, &loinfo);
		if (!ret)
			ret = mount_unknown_fs(path, target, options);
		close(fd);

		break;
	}

	if (closedir(dir))
		WARN("failed to close directory");

	return ret;
}
642
a17b1e65
SG
/*
 * Mount a block-device rootfs: the filesystem type is not known up front,
 * so the work is handed to mount_unknown_fs().
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	int rc = mount_unknown_fs(rootfs, target, options);

	return rc;
}
648
0c547523
SH
/*
 * pin_rootfs
 * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
 * the duration of the container run, to prevent the container from marking
 * the underlying fs readonly on shutdown. unlink the file immediately so
 * no name pollution happens.
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, path cannot be
 * resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || rootfs[0] == '\0')
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* reject both an encoding error and a truncated path */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;
	/* the open descriptor keeps the file alive; the name is not needed */
	(void)unlink(absrootfspin);
	return fd;
}
691
e2a7e8dc
SH
/*
 * When a remount is requested, carry over the restrictive flags (nosuid,
 * nodev, read-only, noexec) that the filesystem currently has, so that the
 * remount keeps them.
 *
 * s:     mount source; when NULL, 'd' is inspected instead
 * d:     mount destination
 * flags: mount flags about to be used
 *
 * Returns 'flags' unchanged if it does not contain MS_REMOUNT, if neither
 * path is given, or if statvfs() fails; otherwise 'flags' with the
 * restrictive bits found on the filesystem added.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *path = s ? s : d;
	struct statvfs vfs;
	size_t k;

	if (!(flags & MS_REMOUNT) || !path)
		return flags;

	if (statvfs(path, &vfs) < 0)
		return flags;

	for (k = 0; k < sizeof(sticky) / sizeof(sticky[0]); k++) {
		if (vfs.f_flag & sticky[k])
			flags |= sticky[k];
	}

	return flags;
}
724
4fb3cba5 725static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
368bbc02 726{
368bbc02 727 int r;
b06b8511
CS
728 size_t i;
729 static struct {
730 int match_mask;
731 int match_flag;
732 const char *source;
733 const char *destination;
734 const char *fstype;
735 unsigned long flags;
736 const char *options;
737 } default_mounts[] = {
738 /* Read-only bind-mounting... In older kernels, doing that required
739 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
740 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
741 * kernel 2.6.26 onwards. However, this apparently does not work on
742 * kernel 3.8. Unfortunately, on that very same kernel, doing the
743 * same trick as above doesn't seem to work either, there one needs
744 * to ALSO specify MS_BIND for the remount, otherwise the entire
745 * fs is remounted read-only or the mount fails because it's busy...
746 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
747 * 2.6.32...
368bbc02 748 */
b06b8511
CS
749 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
750 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
751 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
752 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
753 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
754 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
755 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
756 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
757 { 0, 0, NULL, NULL, NULL, 0, NULL }
758 };
368bbc02 759
b06b8511
CS
760 for (i = 0; default_mounts[i].match_mask; i++) {
761 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
762 char *source = NULL;
763 char *destination = NULL;
764 int saved_errno;
e2a7e8dc 765 unsigned long mflags;
b06b8511
CS
766
767 if (default_mounts[i].source) {
768 /* will act like strdup if %r is not present */
769 source = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].source);
770 if (!source) {
771 SYSERROR("memory allocation error");
772 return -1;
773 }
774 }
775 if (default_mounts[i].destination) {
776 /* will act like strdup if %r is not present */
777 destination = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].destination);
778 if (!destination) {
779 saved_errno = errno;
780 SYSERROR("memory allocation error");
781 free(source);
782 errno = saved_errno;
783 return -1;
784 }
785 }
e2a7e8dc
SH
786 mflags = add_required_remount_flags(source, destination,
787 default_mounts[i].flags);
788 r = mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options);
b06b8511 789 saved_errno = errno;
c414be25 790 if (r < 0)
e2a7e8dc 791 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
b06b8511
CS
792 free(source);
793 free(destination);
794 if (r < 0) {
b06b8511
CS
795 errno = saved_errno;
796 return -1;
797 }
368bbc02 798 }
368bbc02
CS
799 }
800
b06b8511 801 if (flags & LXC_AUTO_CGROUP_MASK) {
0769b82a
CS
802 int cg_flags;
803
804 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
805 /* If the type of cgroup mount was not specified, it depends on the
806 * container's capabilities as to what makes sense: if we have
807 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
808 * anyway, so we may as well default to read-write; then the admin
809 * will not be given a false sense of security. (And if they really
810 * want mixed r/o r/w, then they can explicitly specify :mixed.)
811 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
812 * :mixed, because then the container can't remount it read-write. */
813 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
814 int has_sys_admin = 0;
815 if (!lxc_list_empty(&conf->keepcaps)) {
816 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
817 } else {
818 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
819 }
820 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC) {
821 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
822 } else {
823 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
824 }
825 }
826
827 if (!cgroup_mount(conf->rootfs.mount, handler, cg_flags)) {
368bbc02 828 SYSERROR("error mounting /sys/fs/cgroup");
b06b8511 829 return -1;
368bbc02
CS
830 }
831 }
832
368bbc02 833 return 0;
368bbc02
CS
834}
835
a17b1e65 836static int mount_rootfs(const char *rootfs, const char *target, const char *options)
0ad19a3f 837{
b09ef133 838 char absrootfs[MAXPATHLEN];
78ae2fcc 839 struct stat s;
a6afdde9 840 int i;
78ae2fcc 841
a17b1e65 842 typedef int (*rootfs_cb)(const char *, const char *, const char *);
78ae2fcc 843
844 struct rootfs_type {
845 int type;
846 rootfs_cb cb;
847 } rtfs_type[] = {
2656d231
DL
848 { S_IFDIR, mount_rootfs_dir },
849 { S_IFBLK, mount_rootfs_block },
850 { S_IFREG, mount_rootfs_file },
78ae2fcc 851 };
0ad19a3f 852
4c8ab83b 853 if (!realpath(rootfs, absrootfs)) {
36eb9bde 854 SYSERROR("failed to get real path for '%s'", rootfs);
4c8ab83b 855 return -1;
856 }
b09ef133 857
b09ef133 858 if (access(absrootfs, F_OK)) {
36eb9bde 859 SYSERROR("'%s' is not accessible", absrootfs);
b09ef133 860 return -1;
861 }
862
78ae2fcc 863 if (stat(absrootfs, &s)) {
36eb9bde 864 SYSERROR("failed to stat '%s'", absrootfs);
9b0f0477 865 return -1;
866 }
867
78ae2fcc 868 for (i = 0; i < sizeof(rtfs_type)/sizeof(rtfs_type[0]); i++) {
9b0f0477 869
78ae2fcc 870 if (!__S_ISTYPE(s.st_mode, rtfs_type[i].type))
871 continue;
9b0f0477 872
a17b1e65 873 return rtfs_type[i].cb(absrootfs, target, options);
78ae2fcc 874 }
9b0f0477 875
36eb9bde 876 ERROR("unsupported rootfs type for '%s'", absrootfs);
78ae2fcc 877 return -1;
0ad19a3f 878}
879
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL 'utsname' means there is nothing to configure and counts as
 * success. Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *host;

	if (utsname == NULL)
		return 0;

	host = utsname->nodename;

	if (sethostname(host, strlen(host)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", host);
		return -1;
	}

	INFO("'%s' hostname has been setup", host);

	return 0;
}
894
69aa6655
DE
/* A symlink to create below the container's /dev. */
struct dev_symlinks {
	const char *oldpath;	/* what the link points to */
	const char *name;	/* link name, relative to /dev */
};

/* The symlinks created by setup_dev_symlinks(). */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
906
907static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
908{
909 char path[MAXPATHLEN];
910 int ret,i;
09227be2 911 struct stat s;
69aa6655
DE
912
913
914 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
915 const struct dev_symlinks *d = &dev_symlinks[i];
916 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, d->name);
917 if (ret < 0 || ret >= MAXPATHLEN)
918 return -1;
09227be2
MW
919
920 /*
921 * Stat the path first. If we don't get an error
922 * accept it as is and don't try to create it
923 */
924 if (!stat(path, &s)) {
925 continue;
926 }
927
69aa6655 928 ret = symlink(d->oldpath, path);
09227be2 929
69aa6655 930 if (ret && errno != EEXIST) {
09227be2
MW
931 if ( errno == EROFS ) {
932 WARN("Warning: Read Only file system while creating %s", path);
933 } else {
934 SYSERROR("Error creating %s", path);
935 return -1;
936 }
69aa6655
DE
937 }
938 }
939 return 0;
940}
941
33fcb7a0 942static int setup_tty(const struct lxc_rootfs *rootfs,
7c6ef2a2 943 const struct lxc_tty_info *tty_info, char *ttydir)
b0a33c1e 944{
7c6ef2a2
SH
945 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
946 int i, ret;
b0a33c1e 947
bc9bd0e3
DL
948 if (!rootfs->path)
949 return 0;
950
b0a33c1e 951 for (i = 0; i < tty_info->nbtty; i++) {
952
953 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
954
7c6ef2a2 955 ret = snprintf(path, sizeof(path), "%s/dev/tty%d",
12297168 956 rootfs->mount, i + 1);
7c6ef2a2
SH
957 if (ret >= sizeof(path)) {
958 ERROR("pathname too long for ttys");
959 return -1;
960 }
961 if (ttydir) {
962 /* create dev/lxc/tty%d" */
9ba8130c 963 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/tty%d",
7c6ef2a2
SH
964 rootfs->mount, ttydir, i + 1);
965 if (ret >= sizeof(lxcpath)) {
966 ERROR("pathname too long for ttys");
967 return -1;
968 }
969 ret = creat(lxcpath, 0660);
970 if (ret==-1 && errno != EEXIST) {
959aee9c 971 SYSERROR("error creating %s", lxcpath);
7c6ef2a2
SH
972 return -1;
973 }
4d44e274
SH
974 if (ret >= 0)
975 close(ret);
7c6ef2a2
SH
976 ret = unlink(path);
977 if (ret && errno != ENOENT) {
959aee9c 978 SYSERROR("error unlinking %s", path);
7c6ef2a2
SH
979 return -1;
980 }
b0a33c1e 981
7c6ef2a2
SH
982 if (mount(pty_info->name, lxcpath, "none", MS_BIND, 0)) {
983 WARN("failed to mount '%s'->'%s'",
984 pty_info->name, path);
985 continue;
986 }
13954cce 987
9ba8130c
SH
988 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d", ttydir, i+1);
989 if (ret >= sizeof(lxcpath)) {
990 ERROR("tty pathname too long");
991 return -1;
992 }
7c6ef2a2
SH
993 ret = symlink(lxcpath, path);
994 if (ret) {
959aee9c 995 SYSERROR("failed to create symlink for tty %d", i+1);
7c6ef2a2
SH
996 return -1;
997 }
998 } else {
c6883f38
SH
999 /* If we populated /dev, then we need to create /dev/ttyN */
1000 if (access(path, F_OK)) {
1001 ret = creat(path, 0660);
1002 if (ret==-1) {
959aee9c 1003 SYSERROR("error creating %s", path);
c6883f38 1004 /* this isn't fatal, continue */
025ed0f3 1005 } else {
c6883f38 1006 close(ret);
025ed0f3 1007 }
c6883f38 1008 }
7c6ef2a2
SH
1009 if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
1010 WARN("failed to mount '%s'->'%s'",
1011 pty_info->name, path);
1012 continue;
1013 }
b0a33c1e 1014 }
1015 }
1016
cd54d859
DL
1017 INFO("%d tty(s) has been setup", tty_info->nbtty);
1018
b0a33c1e 1019 return 0;
1020}
1021
7a7ff0c6 1022static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
bf601689
MH
1023{
1024 struct lxc_list *mountlist, *listentry, *iterator;
2c7d90ac 1025 char *pivotdir, *mountpoint, *mountentry, *saveptr = NULL;
bf601689
MH
1026 int found;
1027 void **cbparm;
1028
1029 mountentry = buffer;
1030 cbparm = (void **)data;
1031
1032 mountlist = cbparm[0];
1033 pivotdir = cbparm[1];
1034
1035 /* parse entry, first field is mountname, ignore */
2796cf79 1036 mountpoint = strtok_r(mountentry, " ", &saveptr);
bf601689
MH
1037 if (!mountpoint)
1038 return -1;
1039
1040 /* second field is mountpoint */
2796cf79 1041 mountpoint = strtok_r(NULL, " ", &saveptr);
bf601689
MH
1042 if (!mountpoint)
1043 return -1;
1044
1045 /* only consider mountpoints below old root fs */
1046 if (strncmp(mountpoint, pivotdir, strlen(pivotdir)))
1047 return 0;
1048
1049 /* filter duplicate mountpoints */
1050 found = 0;
1051 lxc_list_for_each(iterator, mountlist) {
1052 if (!strcmp(iterator->elem, mountpoint)) {
1053 found = 1;
1054 break;
1055 }
1056 }
1057 if (found)
1058 return 0;
1059
1060 /* add entry to list */
1061 listentry = malloc(sizeof(*listentry));
1062 if (!listentry) {
1063 SYSERROR("malloc for mountpoint listentry failed");
1064 return -1;
1065 }
1066
1067 listentry->elem = strdup(mountpoint);
1068 if (!listentry->elem) {
1069 SYSERROR("strdup failed");
00b6be44 1070 free(listentry);
bf601689
MH
1071 return -1;
1072 }
1073 lxc_list_add_tail(mountlist, listentry);
1074
1075 return 0;
1076}
1077
cc6f6dd7 1078static int umount_oldrootfs(const char *oldrootfs)
bf601689 1079{
2382ecff 1080 char path[MAXPATHLEN];
bf601689 1081 void *cbparm[2];
9ebb03ad 1082 struct lxc_list mountlist, *iterator, *next;
bf601689 1083 int ok, still_mounted, last_still_mounted;
9ba8130c 1084 int rc;
bf601689
MH
1085
1086 /* read and parse /proc/mounts in old root fs */
1087 lxc_list_init(&mountlist);
1088
cc6f6dd7 1089 /* oldrootfs is on the top tree directory now */
9ba8130c
SH
1090 rc = snprintf(path, sizeof(path), "/%s", oldrootfs);
1091 if (rc >= sizeof(path)) {
1092 ERROR("rootfs name too long");
1093 return -1;
1094 }
bf601689 1095 cbparm[0] = &mountlist;
bf601689 1096
cc6f6dd7 1097 cbparm[1] = strdup(path);
bf601689
MH
1098 if (!cbparm[1]) {
1099 SYSERROR("strdup failed");
1100 return -1;
1101 }
1102
9ba8130c
SH
1103 rc = snprintf(path, sizeof(path), "%s/proc/mounts", oldrootfs);
1104 if (rc >= sizeof(path)) {
1105 ERROR("container proc/mounts name too long");
1106 return -1;
1107 }
cc6f6dd7
DL
1108
1109 ok = lxc_file_for_each_line(path,
1110 setup_rootfs_pivot_root_cb, &cbparm);
bf601689
MH
1111 if (ok < 0) {
1112 SYSERROR("failed to read or parse mount list '%s'", path);
1113 return -1;
1114 }
1115
1116 /* umount filesystems until none left or list no longer shrinks */
1117 still_mounted = 0;
1118 do {
1119 last_still_mounted = still_mounted;
1120 still_mounted = 0;
1121
9ebb03ad 1122 lxc_list_for_each_safe(iterator, &mountlist, next) {
bf601689 1123
c08556c6 1124 /* umount normally */
bf601689
MH
1125 if (!umount(iterator->elem)) {
1126 DEBUG("umounted '%s'", (char *)iterator->elem);
1127 lxc_list_del(iterator);
1128 continue;
1129 }
1130
bf601689
MH
1131 still_mounted++;
1132 }
7df119ee 1133
bf601689
MH
1134 } while (still_mounted > 0 && still_mounted != last_still_mounted);
1135
7df119ee 1136
c08556c6
DL
1137 lxc_list_for_each(iterator, &mountlist) {
1138
1139 /* let's try a lazy umount */
1140 if (!umount2(iterator->elem, MNT_DETACH)) {
1141 INFO("lazy unmount of '%s'", (char *)iterator->elem);
1142 continue;
1143 }
1144
1145 /* be more brutal (nfs) */
1146 if (!umount2(iterator->elem, MNT_FORCE)) {
1147 INFO("forced unmount of '%s'", (char *)iterator->elem);
1148 continue;
1149 }
1150
7df119ee 1151 WARN("failed to unmount '%s'", (char *)iterator->elem);
c08556c6 1152 }
bf601689 1153
cc6f6dd7
DL
1154 return 0;
1155}
1156
/*
 * Make @rootfs the root filesystem by means of pivot_root and dispose
 * of the old root afterwards.
 *
 * @rootfs  : directory which becomes the new root
 * @pivotdir: directory, relative to @rootfs, on which the old root is
 *            put; "lxc_putold" is used when it is NULL
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_rootfs_pivot_root(const char *rootfs, const char *pivotdir)
{
	const char *olddir = pivotdir ? pivotdir : "lxc_putold";
	char putold[MAXPATHLEN];
	int created = 0;
	int len;

	/* change into new root fs */
	if (chdir(rootfs) != 0) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		return -1;
	}

	/* compute the full path to the pivot directory under rootfs */
	len = snprintf(putold, sizeof(putold), "%s/%s", rootfs, olddir);
	if (len < 0 || (size_t)len >= sizeof(putold)) {
		ERROR("pivot dir name too long");
		return -1;
	}

	/* create the pivot directory when missing and remember we did */
	if (access(putold, F_OK) != 0) {
		if (mkdir_p(putold, 0755) < 0) {
			SYSERROR("failed to create pivotdir '%s'", putold);
			return -1;
		}

		created = 1;
		DEBUG("created '%s' directory", putold);
	}

	DEBUG("mountpoint for old rootfs is '%s'", putold);

	/* pivot_root into our new root fs */
	if (pivot_root(".", putold) != 0) {
		SYSERROR("pivot_root syscall failed");
		return -1;
	}

	if (chdir("/") != 0) {
		SYSERROR("can't chdir to / after pivot_root");
		return -1;
	}

	DEBUG("pivot_root syscall to '%s' successful", rootfs);

	/* from here on the old root is addressed by its relative name */
	if (umount_oldrootfs(olddir) != 0)
		return -1;

	/* failing to remove the temporary mountpoint is not fatal */
	if (created && rmdir(olddir) != 0)
		WARN("can't remove mountpoint '%s': %m", olddir);

	return 0;
}
1216
bc6928ff
MW
/*
 * Check to see if a directory has something mounted on it and,
 * if it does, return the fstype.
 *
 * @dir   : directory to look for in /proc/self/mounts
 * @fstype: buffer of at least MAX_FSTYPE_LEN bytes receiving the
 *          filesystem type, or NULL when the caller does not need it
 *
 * Returns: # of matching entries in /proc/self/mounts
 *	if != 0 fstype is filled with the last filesystem value.
 *	if == 0 no matches found, fstype unchanged.
 *	0 is also returned when @dir is not an accessible directory or
 *	when /proc/self/mounts cannot be opened.
 *
 * ToDo: Maybe return the mount options in another parameter...
 */

#define LINELEN 4096
#define MAX_FSTYPE_LEN 128
static int mount_check_fs(const char *dir, char *fstype)
{
	char buf[LINELEN];
	char *mntdir, *mnttype, *sep;
	struct stat s;
	FILE *f;
	int found_fs = 0;

	DEBUG("entering mount_check_fs for %s", dir);

	if (access(dir, F_OK) != 0 || stat(dir, &s) != 0 || !S_ISDIR(s.st_mode))
		return 0;

	f = fopen("/proc/self/mounts", "r");
	if (!f)
		return 0;

	while (fgets(buf, sizeof(buf), f)) {
		/* skip the first field (mount source) */
		sep = strchr(buf, ' ');
		if (!sep)
			continue;
		mntdir = sep + 1;

		/* isolate the second field (mountpoint) */
		sep = strchr(mntdir, ' ');
		if (!sep)
			continue;
		*sep = '\0';

		/* Compare the directory in the entry to desired */
		if (strcmp(mntdir, dir) != 0)
			continue;

		/* isolate the third field (filesystem type) */
		mnttype = sep + 1;
		sep = strchr(mnttype, ' ');
		if (!sep)
			continue;
		*sep = '\0';

		++found_fs;

		/* bounded copy, always NUL terminated */
		if (fstype)
			snprintf(fstype, MAX_FSTYPE_LEN, "%s", mnttype);
	}

	fclose(f);

	/*
	 * fstype is NULL for some callers and is left untouched (possibly
	 * uninitialized) when nothing matched, so only print it when it
	 * has actually been filled in.
	 */
	DEBUG("mount_check_fs returning %d last %s", found_fs,
	      (found_fs && fstype) ? fstype : "(none)");

	return found_fs;
}
1286
1287/*
1288 * Locate a devtmpfs mount (should be on /dev) and create a container
1289 * subdirectory on it which we can then bind mount to the container
1290 * /dev instead of mounting a tmpfs there.
1291 * If we fail, return NULL.
1292 * Else return the pointer to the name buffer with the string to
1293 * the devtmpfs subdirectory.
1294 */
1295
74a3920a 1296static char *mk_devtmpfs(const char *name, char *path, const char *lxcpath)
bc6928ff
MW
1297{
1298 int ret;
1299 struct stat s;
1300 char tmp_path[MAXPATHLEN];
1301 char fstype[MAX_FSTYPE_LEN];
bc6928ff
MW
1302 uint64_t hash;
1303
f0d02950 1304 if ( 0 != access(dev_base_path, F_OK) || 0 != stat(dev_base_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
bc6928ff 1305 /* This is just making /dev/.lxc it better work or we're done */
f0d02950 1306 ret = mkdir(dev_base_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
bc6928ff
MW
1307 if ( ret ) {
1308 SYSERROR( "Unable to create /dev/.lxc for autodev" );
1309 return NULL;
1310 }
1311 }
1312
1313 /*
1314 * Programmers notes:
1315 * We can not do mounts in this area of code that we want
1316 * to be visible in the host. Consequently, /dev/.lxc must
1317 * be set up earlier if we need a tmpfs mounted there.
1318 * That only affects the rare cases where autodev is enabled
1319 * for a container and devtmpfs is not mounted on /dev in the
1320 * host. In that case, we'll fall back to the old method
1321 * of mounting a tmpfs in the container and have no visibility
1322 * into the container /dev.
1323 */
1324 if( ! mount_check_fs( "/dev", fstype )
1325 || strcmp( "devtmpfs", fstype ) ) {
1326 /* Either /dev was not mounted or was not devtmpfs */
1327
1328 if ( ! mount_check_fs( "/dev/.lxc", NULL ) ) {
1329 /*
1330 * /dev/.lxc is not already mounted
1331 * Doing a mount here does no good, since
1332 * it's not visible in the host.
1333 */
1334
1335 ERROR("/dev/.lxc is not setup - taking fallback" );
1336 return NULL;
1337 }
1338 }
1339
f0d02950 1340 if ( 0 != access(dev_user_path, F_OK) || 0 != stat(dev_user_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
bc6928ff
MW
1341 /*
1342 * This is making /dev/.lxc/user path for non-priv users.
1343 * If this doesn't work, we'll have to fall back in the
1344 * case of non-priv users. It's mode 1777 like /tmp.
1345 */
f0d02950 1346 ret = mkdir(dev_user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX);
bc6928ff
MW
1347 if ( ret ) {
1348 /* Issue an error but don't fail yet! */
1349 ERROR("Unable to create /dev/.lxc/user");
1350 }
1351 /* Umask tends to screw us up here */
f0d02950 1352 chmod(dev_user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX);
bc6928ff
MW
1353 }
1354
1355 /*
1356 * Since the container name must be unique within a given
1357 * lxcpath, we're going to use a hash of the path
1358 * /lxcpath/name as our hash name in /dev/.lxc/
1359 */
1360
1361 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s", lxcpath, name);
1362 if (ret < 0 || ret >= MAXPATHLEN)
1363 return NULL;
1364
1365 hash = fnv_64a_buf(tmp_path, ret, FNV1A_64_INIT);
1366
f0d02950 1367 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, dev_base_path, name, hash);
bc6928ff
MW
1368 if (ret < 0 || ret >= MAXPATHLEN)
1369 return NULL;
1370
1371 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1372 ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1373 if ( ret ) {
f0d02950
JTLB
1374 /* Something must have failed with the dev_base_path...
1375 * Maybe unpriv user. Try dev_user_path now... */
bc6928ff
MW
1376 INFO("Setup in /dev/.lxc failed. Trying /dev/.lxc/user." );
1377
f0d02950 1378 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, dev_user_path, name, hash);
bc6928ff
MW
1379 if (ret < 0 || ret >= MAXPATHLEN)
1380 return NULL;
1381
1382 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1383 ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1384 if ( ret ) {
1385 ERROR("Container /dev setup in host /dev failed - taking fallback" );
1386 return NULL;
1387 }
1388 }
1389 }
1390 }
1391
1392 strcpy( path, tmp_path );
1393 return path;
1394}
1395
91c3830e
SH
/*
 * Mount a /dev for the container under @root and make sure it has a
 * pts directory.
 *
 * @name   : container name
 * @root   : directory the container root is mounted on
 * @lxcpath: lxcpath the container lives in
 *
 * When mk_devtmpfs() provides a directory it is bind mounted on
 * <root>/dev and <lxcpath>/<name>/rootfs.dev is made a symlink to it.
 * Otherwise <lxcpath>/<name>/rootfs.dev is bind mounted when something
 * is mounted on it, and a small tmpfs is mounted as a last resort.
 *
 * Do we want to add options for max size of /dev and a file to
 * specify which devices to create?
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_autodev(const char *name, char *root, const char *lxcpath)
{
	int ret;
	struct stat s;
	char path[MAXPATHLEN];
	char host_path[MAXPATHLEN];
	char devtmpfs_path[MAXPATHLEN];

	INFO("Mounting /dev under %s", root);

	/* a return value equal to the buffer size means truncation, too */
	ret = snprintf(host_path, MAXPATHLEN, "%s/%s/rootfs.dev", lxcpath, name);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	if (mk_devtmpfs(name, devtmpfs_path, lxcpath)) {
		/*
		 * Get rid of old links and directories.
		 * This could be either a symlink and we remove it,
		 * or an empty directory and we remove it,
		 * or non-existent and we don't care,
		 * or a non-empty directory, and we will then emit an error
		 * but we will not fail out the process.
		 */
		unlink(host_path);
		rmdir(host_path);
		ret = symlink(devtmpfs_path, host_path);

		if (ret < 0) {
			SYSERROR("WARNING: Failed to create symlink '%s'->'%s'", host_path, devtmpfs_path);
		}
		DEBUG("Bind mounting %s to %s", devtmpfs_path, path);
		ret = mount(devtmpfs_path, path, NULL, MS_BIND, 0);
	} else {
		/* Only mount a tmpfs on here if we don't already have a mount */
		if (!mount_check_fs(host_path, NULL)) {
			/* the tmpfs goes on the container /dev, log that path */
			DEBUG("Mounting tmpfs to %s", path);
			ret = mount("none", path, "tmpfs", 0, "size=100000,mode=755");
		} else {
			/* This allows someone to manually set up a mount */
			DEBUG("Bind mounting %s to %s", host_path, path);
			ret = mount(host_path, path, NULL, MS_BIND, 0);
		}
	}
	if (ret) {
		SYSERROR("Failed to mount /dev at %s", root);
		return -1;
	}

	ret = snprintf(path, MAXPATHLEN, "%s/dev/pts", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	/*
	 * If we are running on a devtmpfs mapping, dev/pts may already exist.
	 * If not, then create it and exit if that fails...
	 */
	if (access(path, F_OK) != 0 || stat(path, &s) != 0 || !S_ISDIR(s.st_mode)) {
		ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
		if (ret) {
			SYSERROR("Failed to create /dev/pts in container");
			return -1;
		}
	}

	INFO("Mounted /dev under %s", root);
	return 0;
}
1469
/* One device node created by setup_autodev() in the container /dev. */
struct lxc_devs {
	const char *name;	/* file name below <root>/dev */
	mode_t mode;		/* file type and permissions passed to mknod() */
	int maj;		/* major number passed to makedev() */
	int min;		/* minor number passed to makedev() */
};

/* Device nodes populated by setup_autodev(), in creation order. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
	{ .name = "console", .mode = S_IFCHR | S_IRUSR | S_IWUSR,           .maj = 5, .min = 1 },
};
1486
74a3920a 1487static int setup_autodev(const char *root)
c6883f38
SH
1488{
1489 int ret;
c6883f38
SH
1490 char path[MAXPATHLEN];
1491 int i;
3a32201c 1492 mode_t cmask;
c6883f38 1493
959aee9c 1494 INFO("Creating initial consoles under %s/dev", root);
91c3830e 1495
c6883f38 1496 ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
91c3830e
SH
1497 if (ret < 0 || ret >= MAXPATHLEN) {
1498 ERROR("Error calculating container /dev location");
c6883f38 1499 return -1;
f7bee6c6 1500 }
91c3830e 1501
959aee9c 1502 INFO("Populating /dev under %s", root);
3a32201c 1503 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
c6883f38 1504 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
74a3920a 1505 const struct lxc_devs *d = &lxc_devs[i];
c6883f38
SH
1506 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", root, d->name);
1507 if (ret < 0 || ret >= MAXPATHLEN)
1508 return -1;
1509 ret = mknod(path, d->mode, makedev(d->maj, d->min));
91c3830e 1510 if (ret && errno != EEXIST) {
959aee9c 1511 SYSERROR("Error creating %s", d->name);
c6883f38
SH
1512 return -1;
1513 }
1514 }
3a32201c 1515 umask(cmask);
c6883f38 1516
959aee9c 1517 INFO("Populated /dev under %s", root);
c6883f38
SH
1518 return 0;
1519}
1520
f0d02950
JTLB
1521/*
1522 * Locate allocated devtmpfs mount and purge it.
1523 * path lookup mostly taken from mk_devtmpfs
1524 */
1525int lxc_delete_autodev(struct lxc_handler *handler)
1526{
1527 int ret;
1528 struct stat s;
1529 struct lxc_conf *lxc_conf = handler->conf;
1530 const char *name = handler->name;
1531 const char *lxcpath = handler->lxcpath;
1532 char tmp_path[MAXPATHLEN];
1533 uint64_t hash;
1534
1535 if ( lxc_conf->autodev <= 0 )
1536 return 0;
1537
1c90734d
JTLB
1538 /* don't clean on reboot */
1539 if ( lxc_conf->reboot == 1 )
1540 return 0;
f0d02950
JTLB
1541
1542 /*
1543 * Use the same logic as mk_devtmpfs to compute candidate
1544 * path for cleanup.
1545 */
1546
1547 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s", lxcpath, name);
1548 if (ret < 0 || ret >= MAXPATHLEN)
1549 return -1;
1550
1551 hash = fnv_64a_buf(tmp_path, ret, FNV1A_64_INIT);
1552
1553 /* Probe /dev/.lxc/<container name>.<hash> */
1554 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, dev_base_path, name, hash);
1555 if (ret < 0 || ret >= MAXPATHLEN)
1556 return -1;
1557
1558 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1559 /* Probe /dev/.lxc/user/<container name>.<hash> */
1560 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, dev_user_path, name, hash);
1561 if (ret < 0 || ret >= MAXPATHLEN)
1562 return -1;
1563
1564 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1565 WARN("Failed to locate autodev /dev/.lxc and /dev/.lxc/user." );
1566 return -1;
1567 }
1568 }
1569
1570 /* Do the cleanup */
1571 INFO("Cleaning %s", tmp_path );
1572 if ( 0 != lxc_rmdir_onedev(tmp_path, NULL) ) {
1573 ERROR("Failed to cleanup autodev" );
1574 }
1575
1576 return 0;
1577}
1578
cc28d0b0
SH
1579/*
1580 * I'll forgive you for asking whether all of this is needed :) The
1581 * answer is yes.
1582 * pivot_root will fail if the new root, the put_old dir, or the parent
1583 * of current->fs->root are MS_SHARED. (parent of current->fs_root may
1584 * or may not be current->fs_root - if we assumed it always was, we could
1585 * just mount --make-rslave /). So,
1586 * 1. mount a tiny tmpfs to be parent of current->fs->root.
1587 * 2. make that MS_SLAVE
1588 * 3. make a 'root' directory under that
1589 * 4. mount --rbind / under the $tinyroot/root.
1590 * 5. make that rslave
1591 * 6. chdir and chroot into $tinyroot/root
1592 * 7. $tinyroot will be unmounted by our parent in start.c
1593 */
1594static int chroot_into_slave(struct lxc_conf *conf)
1595{
1596 char path[MAXPATHLEN];
1597 const char *destpath = conf->rootfs.mount;
1598 int ret;
1599
1600 if (mount(destpath, destpath, NULL, MS_BIND, 0)) {
1601 SYSERROR("failed to mount %s bind", destpath);
1602 return -1;
1603 }
1604 if (mount("", destpath, NULL, MS_SLAVE, 0)) {
1605 SYSERROR("failed to make %s slave", destpath);
1606 return -1;
1607 }
58ab99ae 1608 if (mount("none", destpath, "tmpfs", 0, "size=10000,mode=755")) {
cc28d0b0
SH
1609 SYSERROR("Failed to mount tmpfs / at %s", destpath);
1610 return -1;
1611 }
1612 ret = snprintf(path, MAXPATHLEN, "%s/root", destpath);
1613 if (ret < 0 || ret >= MAXPATHLEN) {
1614 ERROR("out of memory making root path");
1615 return -1;
1616 }
1617 if (mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
1618 SYSERROR("Failed to create /dev/pts in container");
1619 return -1;
1620 }
1621 if (mount("/", path, NULL, MS_BIND|MS_REC, 0)) {
1622 SYSERROR("Failed to rbind mount / to %s", path);
1623 return -1;
1624 }
1625 if (mount("", destpath, NULL, MS_SLAVE|MS_REC, 0)) {
1626 SYSERROR("Failed to make tmp-/ at %s rslave", path);
1627 return -1;
1628 }
cc28d0b0
SH
1629 if (chroot(path)) {
1630 SYSERROR("Failed to chroot into tmp-/");
1631 return -1;
1632 }
6b9324bd
SG
1633 if (chdir("/")) {
1634 SYSERROR("Failed to chdir into tmp-/");
1635 return -1;
1636 }
959aee9c 1637 INFO("Chrooted into tmp-/ at %s", path);
cc28d0b0
SH
1638 return 0;
1639}
1640
1641static int setup_rootfs(struct lxc_conf *conf)
0ad19a3f 1642{
cc28d0b0
SH
1643 const struct lxc_rootfs *rootfs = &conf->rootfs;
1644
a0f379bf
DW
1645 if (!rootfs->path) {
1646 if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0)) {
1647 SYSERROR("Failed to make / rslave");
1648 return -1;
1649 }
c69bd12f 1650 return 0;
a0f379bf 1651 }
0ad19a3f 1652
12297168 1653 if (access(rootfs->mount, F_OK)) {
b1789442 1654 SYSERROR("failed to access to '%s', check it is present",
12297168 1655 rootfs->mount);
b1789442
DL
1656 return -1;
1657 }
1658
9be53773 1659 // First try mounting rootfs using a bdev
76a26f55 1660 struct bdev *bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
9be53773 1661 if (bdev && bdev->ops->mount(bdev) == 0) {
59d66af2 1662 bdev_put(bdev);
9be53773
SH
1663 DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
1664 return 0;
1665 }
59d66af2
SH
1666 if (bdev)
1667 bdev_put(bdev);
a17b1e65 1668 if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) {
a6afdde9 1669 ERROR("failed to mount rootfs");
c3f0a28c 1670 return -1;
1671 }
0ad19a3f 1672
12297168 1673 DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
c69bd12f 1674
ac778708
DL
1675 return 0;
1676}
1677
74a3920a 1678static int setup_pivot_root(const struct lxc_rootfs *rootfs)
ac778708 1679{
ac778708
DL
1680 if (!rootfs->path)
1681 return 0;
1682
12297168 1683 if (setup_rootfs_pivot_root(rootfs->mount, rootfs->pivot)) {
cc6f6dd7 1684 ERROR("failed to setup pivot root");
25368b52 1685 return -1;
c69bd12f
DL
1686 }
1687
25368b52 1688 return 0;
0ad19a3f 1689}
1690
d852c78c 1691static int setup_pts(int pts)
3c26f34e 1692{
77890c6d
SW
1693 char target[PATH_MAX];
1694
d852c78c
DL
1695 if (!pts)
1696 return 0;
3c26f34e 1697
1698 if (!access("/dev/pts/ptmx", F_OK) && umount("/dev/pts")) {
36eb9bde 1699 SYSERROR("failed to umount 'dev/pts'");
3c26f34e 1700 return -1;
1701 }
1702
7e40254a
JTLB
1703 if (mkdir("/dev/pts", 0755)) {
1704 if ( errno != EEXIST ) {
1705 SYSERROR("failed to create '/dev/pts'");
1706 return -1;
1707 }
1708 }
1709
a6afdde9 1710 if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
67e5a20a 1711 "newinstance,ptmxmode=0666,mode=0620,gid=5")) {
36eb9bde 1712 SYSERROR("failed to mount a new instance of '/dev/pts'");
3c26f34e 1713 return -1;
1714 }
1715
3c26f34e 1716 if (access("/dev/ptmx", F_OK)) {
1717 if (!symlink("/dev/pts/ptmx", "/dev/ptmx"))
1718 goto out;
36eb9bde 1719 SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'");
3c26f34e 1720 return -1;
1721 }
1722
77890c6d
SW
1723 if (realpath("/dev/ptmx", target) && !strcmp(target, "/dev/pts/ptmx"))
1724 goto out;
1725
3c26f34e 1726 /* fallback here, /dev/pts/ptmx exists just mount bind */
1727 if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0)) {
36eb9bde 1728 SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'");
3c26f34e 1729 return -1;
1730 }
cd54d859
DL
1731
1732 INFO("created new pts instance");
d852c78c 1733
3c26f34e 1734out:
1735 return 0;
1736}
1737
cccc74b5
DL
/*
 * Set the execution domain of the calling process to @persona.
 *
 * Nothing is done when @persona is -1 or when the code is built without
 * HAVE_SYS_PERSONALITY_H.
 *
 * Returns 0 on success, -1 when personality() fails.
 */
static int setup_personality(int persona)
{
	#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
	#endif

	return 0;
}
1754
7c6ef2a2 1755static int setup_dev_console(const struct lxc_rootfs *rootfs,
33fcb7a0 1756 const struct lxc_console *console)
6e590161 1757{
63376d7d
DL
1758 char path[MAXPATHLEN];
1759 struct stat s;
7c6ef2a2 1760 int ret;
52e35957 1761
7c6ef2a2
SH
1762 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1763 if (ret >= sizeof(path)) {
959aee9c 1764 ERROR("console path too long");
7c6ef2a2
SH
1765 return -1;
1766 }
52e35957 1767
63376d7d 1768 if (access(path, F_OK)) {
466978b0 1769 WARN("rootfs specified but no console found at '%s'", path);
63376d7d 1770 return 0;
52e35957
DL
1771 }
1772
b5159817
DE
1773 if (console->master < 0) {
1774 INFO("no console");
f78a1f32
DL
1775 return 0;
1776 }
ed502555 1777
63376d7d
DL
1778 if (stat(path, &s)) {
1779 SYSERROR("failed to stat '%s'", path);
1780 return -1;
1781 }
1782
1783 if (chmod(console->name, s.st_mode)) {
1784 SYSERROR("failed to set mode '0%o' to '%s'",
1785 s.st_mode, console->name);
1786 return -1;
1787 }
13954cce 1788
63376d7d
DL
1789 if (mount(console->name, path, "none", MS_BIND, 0)) {
1790 ERROR("failed to mount '%s' on '%s'", console->name, path);
6e590161 1791 return -1;
1792 }
1793
63376d7d 1794 INFO("console has been setup");
7c6ef2a2
SH
1795 return 0;
1796}
1797
1798static int setup_ttydir_console(const struct lxc_rootfs *rootfs,
1799 const struct lxc_console *console,
1800 char *ttydir)
1801{
1802 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1803 int ret;
1804
1805 /* create rootfs/dev/<ttydir> directory */
1806 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount,
1807 ttydir);
1808 if (ret >= sizeof(path))
1809 return -1;
1810 ret = mkdir(path, 0755);
1811 if (ret && errno != EEXIST) {
959aee9c 1812 SYSERROR("failed with errno %d to create %s", errno, path);
7c6ef2a2
SH
1813 return -1;
1814 }
959aee9c 1815 INFO("created %s", path);
7c6ef2a2
SH
1816
1817 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console",
1818 rootfs->mount, ttydir);
1819 if (ret >= sizeof(lxcpath)) {
959aee9c 1820 ERROR("console path too long");
7c6ef2a2
SH
1821 return -1;
1822 }
1823
1824 snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1825 ret = unlink(path);
1826 if (ret && errno != ENOENT) {
959aee9c 1827 SYSERROR("error unlinking %s", path);
7c6ef2a2
SH
1828 return -1;
1829 }
1830
1831 ret = creat(lxcpath, 0660);
1832 if (ret==-1 && errno != EEXIST) {
959aee9c 1833 SYSERROR("error %d creating %s", errno, lxcpath);
7c6ef2a2
SH
1834 return -1;
1835 }
4d44e274
SH
1836 if (ret >= 0)
1837 close(ret);
7c6ef2a2 1838
b5159817
DE
1839 if (console->master < 0) {
1840 INFO("no console");
7c6ef2a2
SH
1841 return 0;
1842 }
1843
1844 if (mount(console->name, lxcpath, "none", MS_BIND, 0)) {
1845 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1846 return -1;
1847 }
1848
1849 /* create symlink from rootfs/dev/console to 'lxc/console' */
9ba8130c
SH
1850 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1851 if (ret >= sizeof(lxcpath)) {
1852 ERROR("lxc/console path too long");
1853 return -1;
1854 }
7c6ef2a2
SH
1855 ret = symlink(lxcpath, path);
1856 if (ret) {
1857 SYSERROR("failed to create symlink for console");
1858 return -1;
1859 }
1860
1861 INFO("console has been setup on %s", lxcpath);
cd54d859 1862
6e590161 1863 return 0;
1864}
1865
7c6ef2a2
SH
1866static int setup_console(const struct lxc_rootfs *rootfs,
1867 const struct lxc_console *console,
1868 char *ttydir)
1869{
1870 /* We don't have a rootfs, /dev/console will be shared */
1871 if (!rootfs->path)
1872 return 0;
1873 if (!ttydir)
1874 return setup_dev_console(rootfs, console);
1875
1876 return setup_ttydir_console(rootfs, console, ttydir);
1877}
1878
1bd051a6
SH
1879static int setup_kmsg(const struct lxc_rootfs *rootfs,
1880 const struct lxc_console *console)
1881{
1882 char kpath[MAXPATHLEN];
1883 int ret;
1884
222fea5a
DE
1885 if (!rootfs->path)
1886 return 0;
1bd051a6
SH
1887 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1888 if (ret < 0 || ret >= sizeof(kpath))
1889 return -1;
1890
1891 ret = unlink(kpath);
1892 if (ret && errno != ENOENT) {
959aee9c 1893 SYSERROR("error unlinking %s", kpath);
1bd051a6
SH
1894 return -1;
1895 }
1896
1897 ret = symlink("console", kpath);
1898 if (ret) {
1899 SYSERROR("failed to create symlink for kmsg");
1900 return -1;
1901 }
1902
1903 return 0;
1904}
1905
998ac676
RT
1906static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1907{
1908 struct mount_opt *mo;
1909
1910 /* If opt is found in mount_opt, set or clear flags.
1911 * Otherwise append it to data. */
1912
1913 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1914 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1915 if (mo->clear)
1916 *flags &= ~mo->flag;
1917 else
1918 *flags |= mo->flag;
1919 return;
1920 }
1921 }
1922
1923 if (strlen(*data))
1924 strcat(*data, ",");
1925 strcat(*data, opt);
1926}
1927
/*
 * Split a comma separated mount option string into mount flags and a
 * data string.
 *
 * @mntopts : option string, may be NULL
 * @mntflags: receives the flags; reset to 0 first
 * @mntdata : receives a malloc'ed string with the options which are not
 *            flags, or NULL when there are none; the caller frees it
 *
 * Returns 0 on success, -1 when memory cannot be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *rest;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() writes to its input, so work on a copy */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* the data string can never get longer than the options */
	rest = malloc(strlen(copy) + 1);
	if (!rest) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	rest[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok) {
		parse_mntopt(tok, mntflags, &rest);
		tok = strtok_r(NULL, ",", &state);
	}

	/* only hand a data string out when there is something in it */
	if (rest[0] != '\0')
		*mntdata = rest;
	else
		free(rest);

	free(copy);

	return 0;
}
1966
6fd5e769
SH
/* Terminate @word in place at its first space or tab, if it has one. */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1973
/*
 * skip @nfields spaces in @src
 *
 * For each field the characters up to the next space or tab are
 * skipped, then that single separator is. Skipping stops early at the
 * end of the string.
 *
 * Returns a pointer into @src, possibly to its terminating NUL.
 */
static char *get_field(char *src, int nfields)
{
	char *cursor = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		cursor += strcspn(cursor, " \t");
		if (*cursor == '\0')
			break;
		cursor++;
	}

	return cursor;
}
1991
911324ef
DL
/*
 * Mount @fsname on @target and, for bind mounts and remount requests,
 * remount it so that the requested flags are applied.
 *
 * @fsname    : source of the mount
 * @target    : mountpoint
 * @fstype    : filesystem type
 * @mountflags: mount flags; MS_REMOUNT is masked out for the first mount
 * @data      : filesystem specific data
 * @optional  : when non-zero a failing mount is logged and reported as
 *              success
 *
 * Before remounting, the nosuid, nodev, ro and noexec flags reported by
 * statvfs() for @fsname are added to the flags. A plain bind mount whose
 * flags include all of those already is not remounted.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional)
{
	struct statvfs sb;
	int remount = (mountflags & (MS_REMOUNT | MS_BIND)) != 0;

	if (mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data) != 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}
		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	if (remount) {
		DEBUG("remounting %s on %s to respect bind or remount options",
		      fsname ? fsname : "(none)", target ? target : "(none)");

		if (statvfs(fsname, &sb) == 0) {
			/* flags of the source which must not get lost */
			unsigned long required_flags = sb.f_flag &
				(MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC);

			DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

			/*
			 * If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags)) {
				DEBUG("mountflags already was %lu, skipping remount",
				      mountflags);
				remount = 0;
			} else {
				mountflags |= required_flags;
			}
		}
	}

	if (remount &&
	    mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) != 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}
		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
2060
4e4ca161
SH
/*
 * Remove 'optional', 'create=dir', and 'create=file' from mntopt
 *
 * The option string of @mntent is rewritten in place. Only whole,
 * comma delimited options are removed, so an option which merely
 * contains one of those words is left alone, and removing the last
 * option does not leave a dangling comma behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		NULL
	};
	char *opts = mntent->mnt_opts;
	char *in, *out;
	int wrote = 0;

	if (!opts)
		return;

	/* out never gets ahead of in, so copying within opts is safe */
	in = out = opts;
	while (*in) {
		size_t len = strcspn(in, ",");
		char *next = in[len] ? in + len + 1 : in + len;
		int drop = 0;
		int i;

		for (i = 0; culled[i]; i++) {
			if (strlen(culled[i]) == len &&
			    strncmp(in, culled[i], len) == 0) {
				drop = 1;
				break;
			}
		}

		if (!drop) {
			if (wrote)
				*out++ = ',';
			memmove(out, in, len);
			out += len;
			wrote = 1;
		}

		in = next;
	}
	*out = '\0';
}
2085
/*
 * Mount one fstab style entry as is, i.e. with the mountpoint taken
 * from the entry unchanged.
 *
 * The lxc specific options are honoured first: "create=dir" and
 * "create=file" create the mount target when possible (a failure only
 * triggers a warning) and "optional" makes a failing mount non fatal.
 * They are removed from the option string before it is parsed.
 *
 * Returns the result of mount_entry(), or -1 when the options cannot be
 * parsed.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	unsigned long mntflags;
	char *mntdata;
	int ret;
	bool optional = hasmntopt(mntent, "optional") != NULL;

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(mntent->mnt_dir, 0755) < 0)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
	}

	if (hasmntopt(mntent, "create=file") && access(mntent->mnt_dir, F_OK)) {
		FILE *pathfile;
		/*
		 * dirname() may modify its argument and may return a
		 * pointer to static storage, so the copy is what has to
		 * be freed, never the value dirname() returns.
		 */
		char *dircopy = strdup(mntent->mnt_dir);

		if (!dircopy) {
			WARN("Failed to create target directory");
		} else {
			if (mkdir_p(dirname(dircopy), 0755) < 0)
				WARN("Failed to create target directory");
			free(dircopy);
		}

		pathfile = fopen(mntent->mnt_dir, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
		else
			fclose(pathfile);
	}

	cull_mntent_opt(mntent);

	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount_entry(mntent->mnt_fsname, mntent->mnt_dir,
			  mntent->mnt_type, mntflags, mntdata, optional);

	free(mntdata);

	return ret;
}
2132
4e4ca161 2133static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
80a881b2
SH
2134 const struct lxc_rootfs *rootfs,
2135 const char *lxc_name)
911324ef 2136{
013bd428 2137 char *aux;
59760f5d 2138 char path[MAXPATHLEN];
911324ef
DL
2139 unsigned long mntflags;
2140 char *mntdata;
80a881b2 2141 int r, ret = 0, offset;
67e571de 2142 const char *lxcpath;
34cfffb3
SG
2143 FILE *pathfile = NULL;
2144 char *pathdirname = NULL;
4f1d50d1 2145 bool optional = hasmntopt(mntent, "optional") != NULL;
0ad19a3f 2146
593e8478 2147 lxcpath = lxc_global_config_value("lxc.lxcpath");
2a59a681
SH
2148 if (!lxcpath) {
2149 ERROR("Out of memory");
2150 return -1;
2151 }
2152
80a881b2 2153 /* if rootfs->path is a blockdev path, allow container fstab to
2a59a681
SH
2154 * use $lxcpath/CN/rootfs as the target prefix */
2155 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
80a881b2
SH
2156 if (r < 0 || r >= MAXPATHLEN)
2157 goto skipvarlib;
2158
2159 aux = strstr(mntent->mnt_dir, path);
2160 if (aux) {
2161 offset = strlen(path);
2162 goto skipabs;
2163 }
2164
2165skipvarlib:
013bd428
DL
2166 aux = strstr(mntent->mnt_dir, rootfs->path);
2167 if (!aux) {
2168 WARN("ignoring mount point '%s'", mntent->mnt_dir);
2169 goto out;
2170 }
80a881b2
SH
2171 offset = strlen(rootfs->path);
2172
2173skipabs:
013bd428 2174
9ba8130c 2175 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
80a881b2
SH
2176 aux + offset);
2177 if (r < 0 || r >= MAXPATHLEN) {
2178 WARN("pathnme too long for '%s'", mntent->mnt_dir);
2179 ret = -1;
2180 goto out;
2181 }
2182
34cfffb3 2183 if (hasmntopt(mntent, "create=dir")) {
119126b6 2184 if (mkdir_p(path, 0755) < 0) {
34cfffb3
SG
2185 WARN("Failed to create mount target '%s'", path);
2186 ret = -1;
2187 }
2188 }
2189
2190 if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
2191 pathdirname = strdup(path);
2192 pathdirname = dirname(pathdirname);
119126b6
SG
2193 if (mkdir_p(pathdirname, 0755) < 0) {
2194 WARN("Failed to create target directory");
2195 }
34cfffb3
SG
2196 pathfile = fopen(path, "wb");
2197 if (!pathfile) {
2198 WARN("Failed to create mount target '%s'", path);
2199 ret = -1;
2200 }
2201 else
2202 fclose(pathfile);
2203 }
4e4ca161 2204 cull_mntent_opt(mntent);
d330fe7b 2205
a17b1e65
SG
2206 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
2207 free(mntdata);
2208 return -1;
2209 }
2210
013bd428 2211 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
1fc64d22 2212 mntflags, mntdata, optional);
0ad19a3f 2213
a17b1e65
SG
2214 free(mntdata);
2215
013bd428 2216out:
34cfffb3 2217 free(pathdirname);
911324ef
DL
2218 return ret;
2219}
d330fe7b 2220
/*
 * Mount one fstab entry whose target is relative: the real target is
 * "<rootfs>/<mnt_dir>".
 *
 * Returns the result of mount_entry(), or -1 when the combined path does
 * not fit into the buffer or the options cannot be parsed.
 */
static int mount_entry_on_relative_rootfs(struct mntent *mntent,
					  const char *rootfs)
{
	char path[MAXPATHLEN];
	unsigned long mntflags;
	char *mntdata = NULL;
	int ret;
	FILE *pathfile;
	char *pathcopy;
	bool optional = hasmntopt(mntent, "optional") != NULL;

	/* relative to root mount point */
	ret = snprintf(path, sizeof(path), "%s/%s", rootfs, mntent->mnt_dir);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0)
			WARN("Failed to create mount target '%s'", path);
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		/* dirname() may modify its argument and may return a pointer
		 * to static storage, so work on a private copy and free the
		 * copy itself, never the value dirname() returned */
		pathcopy = strdup(path);
		if (!pathcopy || mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", path);
		else
			fclose(pathfile);
	}

	cull_mntent_opt(mntent);

	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
			  mntflags, mntdata, optional);

	free(mntdata);

	return ret;
}
2275
80a881b2
SH
2276static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
2277 const char *lxc_name)
911324ef 2278{
aaf901be
AM
2279 struct mntent mntent;
2280 char buf[4096];
911324ef 2281 int ret = -1;
e76b8764 2282
aaf901be 2283 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
e76b8764 2284
911324ef 2285 if (!rootfs->path) {
aaf901be 2286 if (mount_entry_on_systemfs(&mntent))
e76b8764 2287 goto out;
911324ef 2288 continue;
e76b8764
CDC
2289 }
2290
911324ef 2291 /* We have a separate root, mounts are relative to it */
aaf901be
AM
2292 if (mntent.mnt_dir[0] != '/') {
2293 if (mount_entry_on_relative_rootfs(&mntent,
911324ef
DL
2294 rootfs->mount))
2295 goto out;
2296 continue;
2297 }
cd54d859 2298
aaf901be 2299 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name))
911324ef 2300 goto out;
0ad19a3f 2301 }
cd54d859 2302
0ad19a3f 2303 ret = 0;
cd54d859
DL
2304
2305 INFO("mount points have been setup");
0ad19a3f 2306out:
e7938e9e
MN
2307 return ret;
2308}
2309
80a881b2
SH
/*
 * Mount all entries listed in the fstab file 'fstab'. A NULL fstab means
 * there is nothing to do and counts as success.
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name)
{
	FILE *fp;
	int rc;

	if (fstab == NULL)
		return 0;

	fp = setmntent(fstab, "r");
	if (fp == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, fp, lxc_name);
	endmntent(fp);

	return rc;
}
2330
80a881b2
SH
2331static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list *mount,
2332 const char *lxc_name)
e7938e9e
MN
2333{
2334 FILE *file;
2335 struct lxc_list *iterator;
2336 char *mount_entry;
2337 int ret;
2338
2339 file = tmpfile();
2340 if (!file) {
2341 ERROR("tmpfile error: %m");
2342 return -1;
2343 }
2344
2345 lxc_list_for_each(iterator, mount) {
2346 mount_entry = iterator->elem;
1d6b1976 2347 fprintf(file, "%s\n", mount_entry);
e7938e9e
MN
2348 }
2349
2350 rewind(file);
2351
80a881b2 2352 ret = mount_file_entries(rootfs, file, lxc_name);
e7938e9e
MN
2353
2354 fclose(file);
2355 return ret;
2356}
2357
bab88e68
CS
2358static int parse_cap(const char *cap)
2359{
2360 char *ptr = NULL;
2361 int i, capid = -1;
2362
7035407c
DE
2363 if (!strcmp(cap, "none"))
2364 return -2;
2365
bab88e68
CS
2366 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2367
2368 if (strcmp(cap, caps_opt[i].name))
2369 continue;
2370
2371 capid = caps_opt[i].value;
2372 break;
2373 }
2374
2375 if (capid < 0) {
2376 /* try to see if it's numeric, so the user may specify
2377 * capabilities that the running kernel knows about but
2378 * we don't */
2379 errno = 0;
2380 capid = strtol(cap, &ptr, 10);
2381 if (!ptr || *ptr != '\0' || errno != 0)
2382 /* not a valid number */
2383 capid = -1;
2384 else if (capid > lxc_caps_last_cap())
2385 /* we have a number but it's not a valid
2386 * capability */
2387 capid = -1;
2388 }
2389
2390 return capid;
2391}
2392
0769b82a
CS
2393int in_caplist(int cap, struct lxc_list *caps)
2394{
2395 struct lxc_list *iterator;
2396 int capid;
2397
2398 lxc_list_for_each(iterator, caps) {
2399 capid = parse_cap(iterator->elem);
2400 if (capid == cap)
2401 return 1;
2402 }
2403
2404 return 0;
2405}
2406
81810dd1
DL
2407static int setup_caps(struct lxc_list *caps)
2408{
2409 struct lxc_list *iterator;
2410 char *drop_entry;
bab88e68 2411 int capid;
81810dd1
DL
2412
2413 lxc_list_for_each(iterator, caps) {
2414
2415 drop_entry = iterator->elem;
2416
bab88e68 2417 capid = parse_cap(drop_entry);
d55bc1ad 2418
81810dd1 2419 if (capid < 0) {
1e11be34
DL
2420 ERROR("unknown capability %s", drop_entry);
2421 return -1;
81810dd1
DL
2422 }
2423
2424 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2425
2426 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
3ec1648d
SH
2427 SYSERROR("failed to remove %s capability", drop_entry);
2428 return -1;
2429 }
81810dd1
DL
2430
2431 }
2432
1fb86a7c
SH
2433 DEBUG("capabilities have been setup");
2434
2435 return 0;
2436}
2437
2438static int dropcaps_except(struct lxc_list *caps)
2439{
2440 struct lxc_list *iterator;
2441 char *keep_entry;
1fb86a7c
SH
2442 int i, capid;
2443 int numcaps = lxc_caps_last_cap() + 1;
959aee9c 2444 INFO("found %d capabilities", numcaps);
1fb86a7c 2445
2caf9a97
SH
2446 if (numcaps <= 0 || numcaps > 200)
2447 return -1;
2448
1fb86a7c
SH
2449 // caplist[i] is 1 if we keep capability i
2450 int *caplist = alloca(numcaps * sizeof(int));
2451 memset(caplist, 0, numcaps * sizeof(int));
2452
2453 lxc_list_for_each(iterator, caps) {
2454
2455 keep_entry = iterator->elem;
2456
bab88e68 2457 capid = parse_cap(keep_entry);
1fb86a7c 2458
7035407c
DE
2459 if (capid == -2)
2460 continue;
2461
1fb86a7c
SH
2462 if (capid < 0) {
2463 ERROR("unknown capability %s", keep_entry);
2464 return -1;
2465 }
2466
8255688a 2467 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
1fb86a7c
SH
2468
2469 caplist[capid] = 1;
2470 }
2471 for (i=0; i<numcaps; i++) {
2472 if (caplist[i])
2473 continue;
2474 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
3ec1648d
SH
2475 SYSERROR("failed to remove capability %d", i);
2476 return -1;
2477 }
1fb86a7c
SH
2478 }
2479
2480 DEBUG("capabilities have been setup");
81810dd1
DL
2481
2482 return 0;
2483}
2484
0ad19a3f 2485static int setup_hw_addr(char *hwaddr, const char *ifname)
2486{
2487 struct sockaddr sockaddr;
2488 struct ifreq ifr;
2489 int ret, fd;
2490
3cfc0f3a
MN
2491 ret = lxc_convert_mac(hwaddr, &sockaddr);
2492 if (ret) {
2493 ERROR("mac address '%s' conversion failed : %s",
2494 hwaddr, strerror(-ret));
0ad19a3f 2495 return -1;
2496 }
2497
2498 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
5da6aa8c 2499 ifr.ifr_name[IFNAMSIZ-1] = '\0';
0ad19a3f 2500 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2501
2502 fd = socket(AF_INET, SOCK_DGRAM, 0);
2503 if (fd < 0) {
3ab87b66 2504 ERROR("socket failure : %s", strerror(errno));
0ad19a3f 2505 return -1;
2506 }
2507
2508 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2509 close(fd);
2510 if (ret)
3ab87b66 2511 ERROR("ioctl failure : %s", strerror(errno));
0ad19a3f 2512
5da6aa8c 2513 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
cd54d859 2514
0ad19a3f 2515 return ret;
2516}
2517
82d5ae15 2518static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2519{
82d5ae15
DL
2520 struct lxc_list *iterator;
2521 struct lxc_inetdev *inetdev;
3cfc0f3a 2522 int err;
0ad19a3f 2523
82d5ae15
DL
2524 lxc_list_for_each(iterator, ip) {
2525
2526 inetdev = iterator->elem;
2527
0093bb8c
DL
2528 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2529 &inetdev->bcast, inetdev->prefix);
3cfc0f3a
MN
2530 if (err) {
2531 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2532 ifindex, strerror(-err));
82d5ae15
DL
2533 return -1;
2534 }
2535 }
2536
2537 return 0;
0ad19a3f 2538}
2539
82d5ae15 2540static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
0ad19a3f 2541{
82d5ae15 2542 struct lxc_list *iterator;
7fa9074f 2543 struct lxc_inet6dev *inet6dev;
3cfc0f3a 2544 int err;
0ad19a3f 2545
82d5ae15
DL
2546 lxc_list_for_each(iterator, ip) {
2547
2548 inet6dev = iterator->elem;
2549
b3df193c 2550 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
0093bb8c
DL
2551 &inet6dev->mcast, &inet6dev->acast,
2552 inet6dev->prefix);
3cfc0f3a
MN
2553 if (err) {
2554 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2555 ifindex, strerror(-err));
82d5ae15 2556 return -1;
3cfc0f3a 2557 }
82d5ae15
DL
2558 }
2559
2560 return 0;
0ad19a3f 2561}
2562
82d5ae15 2563static int setup_netdev(struct lxc_netdev *netdev)
0ad19a3f 2564{
0ad19a3f 2565 char ifname[IFNAMSIZ];
0ad19a3f 2566 char *current_ifname = ifname;
3cfc0f3a 2567 int err;
0ad19a3f 2568
82d5ae15
DL
2569 /* empty network namespace */
2570 if (!netdev->ifindex) {
b0efbac4 2571 if (netdev->flags & IFF_UP) {
d472214b 2572 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2573 if (err) {
2574 ERROR("failed to set the loopback up : %s",
2575 strerror(-err));
82d5ae15
DL
2576 return -1;
2577 }
82d5ae15 2578 }
40790553
SH
2579 if (netdev->type != LXC_NET_VETH)
2580 return 0;
2581 netdev->ifindex = if_nametoindex(netdev->name);
0ad19a3f 2582 }
13954cce 2583
b466dc33 2584 /* get the new ifindex in case of physical netdev */
40790553 2585 if (netdev->type == LXC_NET_PHYS) {
b466dc33
BP
2586 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2587 ERROR("failed to get ifindex for %s",
2588 netdev->link);
2589 return -1;
2590 }
40790553 2591 }
b466dc33 2592
82d5ae15
DL
2593 /* retrieve the name of the interface */
2594 if (!if_indextoname(netdev->ifindex, current_ifname)) {
36eb9bde 2595 ERROR("no interface corresponding to index '%d'",
82d5ae15 2596 netdev->ifindex);
0ad19a3f 2597 return -1;
2598 }
13954cce 2599
018ef520 2600 /* default: let the system to choose one interface name */
9d083402 2601 if (!netdev->name)
fb6d9b2f
DL
2602 netdev->name = netdev->type == LXC_NET_PHYS ?
2603 netdev->link : "eth%d";
018ef520 2604
82d5ae15 2605 /* rename the interface name */
40790553
SH
2606 if (strcmp(ifname, netdev->name) != 0) {
2607 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2608 if (err) {
2609 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2610 strerror(-err));
2611 return -1;
2612 }
018ef520
DL
2613 }
2614
2615 /* Re-read the name of the interface because its name has changed
2616 * and would be automatically allocated by the system
2617 */
82d5ae15 2618 if (!if_indextoname(netdev->ifindex, current_ifname)) {
018ef520 2619 ERROR("no interface corresponding to index '%d'",
82d5ae15 2620 netdev->ifindex);
018ef520 2621 return -1;
0ad19a3f 2622 }
2623
82d5ae15
DL
2624 /* set a mac address */
2625 if (netdev->hwaddr) {
2626 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
36eb9bde 2627 ERROR("failed to setup hw address for '%s'",
82d5ae15 2628 current_ifname);
0ad19a3f 2629 return -1;
2630 }
2631 }
2632
82d5ae15
DL
2633 /* setup ipv4 addresses on the interface */
2634 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
36eb9bde 2635 ERROR("failed to setup ip addresses for '%s'",
0ad19a3f 2636 ifname);
2637 return -1;
2638 }
2639
82d5ae15
DL
2640 /* setup ipv6 addresses on the interface */
2641 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
36eb9bde 2642 ERROR("failed to setup ipv6 addresses for '%s'",
0ad19a3f 2643 ifname);
2644 return -1;
2645 }
2646
82d5ae15 2647 /* set the network device up */
b0efbac4 2648 if (netdev->flags & IFF_UP) {
3cfc0f3a
MN
2649 int err;
2650
d472214b 2651 err = lxc_netdev_up(current_ifname);
3cfc0f3a
MN
2652 if (err) {
2653 ERROR("failed to set '%s' up : %s", current_ifname,
2654 strerror(-err));
0ad19a3f 2655 return -1;
2656 }
2657
2658 /* the network is up, make the loopback up too */
d472214b 2659 err = lxc_netdev_up("lo");
3cfc0f3a
MN
2660 if (err) {
2661 ERROR("failed to set the loopback up : %s",
2662 strerror(-err));
0ad19a3f 2663 return -1;
2664 }
2665 }
2666
f8fee0e2
MK
2667 /* We can only set up the default routes after bringing
2668 * up the interface, sine bringing up the interface adds
2669 * the link-local routes and we can't add a default
2670 * route if the gateway is not reachable. */
2671
2672 /* setup ipv4 gateway on the interface */
2673 if (netdev->ipv4_gateway) {
2674 if (!(netdev->flags & IFF_UP)) {
2675 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2676 return -1;
2677 }
2678
2679 if (lxc_list_empty(&netdev->ipv4)) {
2680 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2681 return -1;
2682 }
2683
2684 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2685 if (err) {
fc739df5
SG
2686 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2687 if (err) {
2688 ERROR("failed to add ipv4 dest for '%s': %s",
2689 ifname, strerror(-err));
2690 }
2691
2692 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2693 if (err) {
2694 ERROR("failed to setup ipv4 gateway for '%s': %s",
2695 ifname, strerror(-err));
2696 if (netdev->ipv4_gateway_auto) {
2697 char buf[INET_ADDRSTRLEN];
2698 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2699 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2700 }
2701 return -1;
19a26f82 2702 }
f8fee0e2
MK
2703 }
2704 }
2705
2706 /* setup ipv6 gateway on the interface */
2707 if (netdev->ipv6_gateway) {
2708 if (!(netdev->flags & IFF_UP)) {
2709 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2710 return -1;
2711 }
2712
2713 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2714 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2715 return -1;
2716 }
2717
2718 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2719 if (err) {
fc739df5
SG
2720 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2721 if (err) {
2722 ERROR("failed to add ipv6 dest for '%s': %s",
f8fee0e2 2723 ifname, strerror(-err));
19a26f82 2724 }
fc739df5
SG
2725
2726 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2727 if (err) {
2728 ERROR("failed to setup ipv6 gateway for '%s': %s",
2729 ifname, strerror(-err));
2730 if (netdev->ipv6_gateway_auto) {
2731 char buf[INET6_ADDRSTRLEN];
2732 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2733 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2734 }
2735 return -1;
2736 }
f8fee0e2
MK
2737 }
2738 }
2739
cd54d859
DL
2740 DEBUG("'%s' has been setup", current_ifname);
2741
0ad19a3f 2742 return 0;
2743}
2744
5f4535a3 2745static int setup_network(struct lxc_list *network)
0ad19a3f 2746{
82d5ae15 2747 struct lxc_list *iterator;
82d5ae15 2748 struct lxc_netdev *netdev;
0ad19a3f 2749
5f4535a3 2750 lxc_list_for_each(iterator, network) {
cd54d859 2751
5f4535a3 2752 netdev = iterator->elem;
82d5ae15
DL
2753
2754 if (setup_netdev(netdev)) {
2755 ERROR("failed to setup netdev");
2756 return -1;
2757 }
2758 }
cd54d859 2759
5f4535a3
DL
2760 if (!lxc_list_empty(network))
2761 INFO("network has been setup");
cd54d859
DL
2762
2763 return 0;
0ad19a3f 2764}
2765
2af6bd1b
SH
2766/* try to move physical nics to the init netns */
2767void restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2768{
2769 int i, ret, oldfd;
2770 char path[MAXPATHLEN];
2771
2772 if (netnsfd < 0)
2773 return;
2774
2775 ret = snprintf(path, MAXPATHLEN, "/proc/self/ns/net");
2776 if (ret < 0 || ret >= MAXPATHLEN) {
2777 WARN("Failed to open monitor netns fd");
2778 return;
2779 }
2780 if ((oldfd = open(path, O_RDONLY)) < 0) {
2781 SYSERROR("Failed to open monitor netns fd");
2782 return;
2783 }
2784 if (setns(netnsfd, 0) != 0) {
2785 SYSERROR("Failed to enter container netns to reset nics");
2786 close(oldfd);
2787 return;
2788 }
2789 for (i=0; i<conf->num_savednics; i++) {
2790 struct saved_nic *s = &conf->saved_nics[i];
2791 if (lxc_netdev_move_by_index(s->ifindex, 1))
2792 WARN("Error moving nic index:%d back to host netns",
2793 s->ifindex);
2794 }
2795 if (setns(oldfd, 0) != 0)
2796 SYSERROR("Failed to re-enter monitor's netns");
2797 close(oldfd);
2798}
2799
2800void lxc_rename_phys_nics_on_shutdown(int netnsfd, struct lxc_conf *conf)
7b35f3d6
SH
2801{
2802 int i;
2803
2af6bd1b
SH
2804 if (conf->num_savednics == 0)
2805 return;
2806
7b35f3d6 2807 INFO("running to reset %d nic names", conf->num_savednics);
2af6bd1b 2808 restore_phys_nics_to_netns(netnsfd, conf);
7b35f3d6
SH
2809 for (i=0; i<conf->num_savednics; i++) {
2810 struct saved_nic *s = &conf->saved_nics[i];
959aee9c 2811 INFO("resetting nic %d to %s", s->ifindex, s->orig_name);
7b35f3d6
SH
2812 lxc_netdev_rename_by_index(s->ifindex, s->orig_name);
2813 free(s->orig_name);
2814 }
2815 conf->num_savednics = 0;
7b35f3d6
SH
2816}
2817
ae9242c8
SH
2818static char *default_rootfs_mount = LXCROOTFSMOUNT;
2819
7b379ab3 2820struct lxc_conf *lxc_conf_init(void)
089cd8b8 2821{
7b379ab3 2822 struct lxc_conf *new;
26ddeedd 2823 int i;
7b379ab3
MN
2824
2825 new = malloc(sizeof(*new));
2826 if (!new) {
2827 ERROR("lxc_conf_init : %m");
2828 return NULL;
2829 }
2830 memset(new, 0, sizeof(*new));
2831
b40a606e 2832 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
cccc74b5 2833 new->personality = -1;
bc6928ff 2834 new->autodev = -1;
596a818d
DE
2835 new->console.log_path = NULL;
2836 new->console.log_fd = -1;
28a4b0e5 2837 new->console.path = NULL;
63376d7d 2838 new->console.peer = -1;
b5159817
DE
2839 new->console.peerpty.busy = -1;
2840 new->console.peerpty.master = -1;
2841 new->console.peerpty.slave = -1;
63376d7d
DL
2842 new->console.master = -1;
2843 new->console.slave = -1;
2844 new->console.name[0] = '\0';
d2e30e99 2845 new->maincmd_fd = -1;
76a26f55 2846 new->nbd_idx = -1;
54c30e29 2847 new->rootfs.mount = strdup(default_rootfs_mount);
53f3f048
SH
2848 if (!new->rootfs.mount) {
2849 ERROR("lxc_conf_init : %m");
2850 free(new);
2851 return NULL;
2852 }
2f3f41d0 2853 new->kmsg = 1;
7b379ab3
MN
2854 lxc_list_init(&new->cgroup);
2855 lxc_list_init(&new->network);
2856 lxc_list_init(&new->mount_list);
81810dd1 2857 lxc_list_init(&new->caps);
1fb86a7c 2858 lxc_list_init(&new->keepcaps);
f6d3e3e4 2859 lxc_list_init(&new->id_map);
f979ac15 2860 lxc_list_init(&new->includes);
4184c3e1 2861 lxc_list_init(&new->aliens);
7c661726 2862 lxc_list_init(&new->environment);
26ddeedd
SH
2863 for (i=0; i<NUM_LXC_HOOKS; i++)
2864 lxc_list_init(&new->hooks[i]);
ee1e7aa0 2865 lxc_list_init(&new->groups);
fe4de9a6
DE
2866 new->lsm_aa_profile = NULL;
2867 new->lsm_se_context = NULL;
5112cd70 2868 new->tmp_umount_proc = 0;
7b379ab3 2869
9f30a190
MM
2870 for (i = 0; i < LXC_NS_MAX; i++)
2871 new->inherit_ns_fd[i] = -1;
2872
7b379ab3 2873 return new;
089cd8b8
DL
2874}
2875
e3b4c4c4 2876static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2877{
8634bc19 2878 char veth1buf[IFNAMSIZ], *veth1;
0e391e57 2879 char veth2buf[IFNAMSIZ], *veth2;
3cfc0f3a 2880 int err;
13954cce 2881
e892973e
DL
2882 if (netdev->priv.veth_attr.pair)
2883 veth1 = netdev->priv.veth_attr.pair;
8634bc19 2884 else {
9ba8130c
SH
2885 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2886 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2887 ERROR("veth1 name too long");
2888 return -1;
2889 }
a0265685 2890 veth1 = lxc_mkifname(veth1buf);
ad40563e
ÇO
2891 if (!veth1) {
2892 ERROR("failed to allocate a temporary name");
2893 return -1;
2894 }
74a2b586
JK
2895 /* store away for deconf */
2896 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
8634bc19 2897 }
82d5ae15 2898
0e391e57 2899 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
a0265685 2900 veth2 = lxc_mkifname(veth2buf);
ad40563e 2901 if (!veth2) {
82d5ae15 2902 ERROR("failed to allocate a temporary name");
ad40563e 2903 goto out_delete;
0ad19a3f 2904 }
2905
3cfc0f3a
MN
2906 err = lxc_veth_create(veth1, veth2);
2907 if (err) {
2908 ERROR("failed to create %s-%s : %s", veth1, veth2,
2909 strerror(-err));
ad40563e 2910 goto out_delete;
0ad19a3f 2911 }
13954cce 2912
49684c0b
CS
2913 /* changing the high byte of the mac address to 0xfe, the bridge interface
2914 * will always keep the host's mac address and not take the mac address
2915 * of a container */
2916 err = setup_private_host_hw_addr(veth1);
2917 if (err) {
2918 ERROR("failed to change mac address of host interface '%s' : %s",
2919 veth1, strerror(-err));
2920 goto out_delete;
2921 }
2922
82d5ae15 2923 if (netdev->mtu) {
d472214b 2924 err = lxc_netdev_set_mtu(veth1, atoi(netdev->mtu));
3cfc0f3a 2925 if (!err)
d472214b 2926 err = lxc_netdev_set_mtu(veth2, atoi(netdev->mtu));
3cfc0f3a
MN
2927 if (err) {
2928 ERROR("failed to set mtu '%s' for %s-%s : %s",
2929 netdev->mtu, veth1, veth2, strerror(-err));
eb14c10a 2930 goto out_delete;
75d09f83
DL
2931 }
2932 }
2933
3cfc0f3a
MN
2934 if (netdev->link) {
2935 err = lxc_bridge_attach(netdev->link, veth1);
2936 if (err) {
2937 ERROR("failed to attach '%s' to the bridge '%s' : %s",
2938 veth1, netdev->link, strerror(-err));
2939 goto out_delete;
2940 }
eb14c10a
DL
2941 }
2942
82d5ae15
DL
2943 netdev->ifindex = if_nametoindex(veth2);
2944 if (!netdev->ifindex) {
36eb9bde 2945 ERROR("failed to retrieve the index for %s", veth2);
eb14c10a
DL
2946 goto out_delete;
2947 }
2948
d472214b 2949 err = lxc_netdev_up(veth1);
6e35af2e
DL
2950 if (err) {
2951 ERROR("failed to set %s up : %s", veth1, strerror(-err));
2952 goto out_delete;
0ad19a3f 2953 }
2954
e3b4c4c4 2955 if (netdev->upscript) {
751d9dcd
DL
2956 err = run_script(handler->name, "net", netdev->upscript, "up",
2957 "veth", veth1, (char*) NULL);
2958 if (err)
e3b4c4c4 2959 goto out_delete;
e3b4c4c4
ST
2960 }
2961
82d5ae15
DL
2962 DEBUG("instanciated veth '%s/%s', index is '%d'",
2963 veth1, veth2, netdev->ifindex);
2964
6ab9ab6d 2965 return 0;
eb14c10a
DL
2966
2967out_delete:
b84f58b9 2968 lxc_netdev_delete_by_name(veth1);
ad40563e
ÇO
2969 if (!netdev->priv.veth_attr.pair && veth1)
2970 free(veth1);
2971 if(veth2)
2972 free(veth2);
6ab9ab6d 2973 return -1;
13954cce 2974}
d957ae2d 2975
74a2b586
JK
2976static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2977{
2978 char *veth1;
2979 int err;
2980
2981 if (netdev->priv.veth_attr.pair)
2982 veth1 = netdev->priv.veth_attr.pair;
2983 else
2984 veth1 = netdev->priv.veth_attr.veth1;
2985
2986 if (netdev->downscript) {
2987 err = run_script(handler->name, "net", netdev->downscript,
2988 "down", "veth", veth1, (char*) NULL);
2989 if (err)
2990 return -1;
2991 }
2992 return 0;
2993}
2994
e3b4c4c4 2995static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 2996{
0e391e57 2997 char peerbuf[IFNAMSIZ], *peer;
3cfc0f3a 2998 int err;
d957ae2d
MT
2999
3000 if (!netdev->link) {
3001 ERROR("no link specified for macvlan netdev");
3002 return -1;
3003 }
13954cce 3004
9ba8130c
SH
3005 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
3006 if (err >= sizeof(peerbuf))
3007 return -1;
82d5ae15 3008
a0265685 3009 peer = lxc_mkifname(peerbuf);
ad40563e 3010 if (!peer) {
82d5ae15
DL
3011 ERROR("failed to make a temporary name");
3012 return -1;
0ad19a3f 3013 }
3014
3cfc0f3a
MN
3015 err = lxc_macvlan_create(netdev->link, peer,
3016 netdev->priv.macvlan_attr.mode);
3017 if (err) {
3018 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
3019 peer, netdev->link, strerror(-err));
ad40563e 3020 goto out;
0ad19a3f 3021 }
3022
82d5ae15
DL
3023 netdev->ifindex = if_nametoindex(peer);
3024 if (!netdev->ifindex) {
36eb9bde 3025 ERROR("failed to retrieve the index for %s", peer);
ad40563e 3026 goto out;
22ebac19 3027 }
3028
e3b4c4c4 3029 if (netdev->upscript) {
751d9dcd
DL
3030 err = run_script(handler->name, "net", netdev->upscript, "up",
3031 "macvlan", netdev->link, (char*) NULL);
3032 if (err)
ad40563e 3033 goto out;
e3b4c4c4
ST
3034 }
3035
e892973e
DL
3036 DEBUG("instanciated macvlan '%s', index is '%d' and mode '%d'",
3037 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
0ad19a3f 3038
d957ae2d 3039 return 0;
ad40563e
ÇO
3040out:
3041 lxc_netdev_delete_by_name(peer);
3042 free(peer);
3043 return -1;
0ad19a3f 3044}
3045
74a2b586
JK
3046static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
3047{
3048 int err;
3049
3050 if (netdev->downscript) {
3051 err = run_script(handler->name, "net", netdev->downscript,
3052 "down", "macvlan", netdev->link,
3053 (char*) NULL);
3054 if (err)
3055 return -1;
3056 }
3057 return 0;
3058}
3059
26c39028 3060/* XXX: merge with instanciate_macvlan */
e3b4c4c4 3061static int instanciate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
26c39028
JHS
3062{
3063 char peer[IFNAMSIZ];
3cfc0f3a 3064 int err;
26c39028
JHS
3065
3066 if (!netdev->link) {
3067 ERROR("no link specified for vlan netdev");
3068 return -1;
3069 }
3070
9ba8130c
SH
3071 err = snprintf(peer, sizeof(peer), "vlan%d", netdev->priv.vlan_attr.vid);
3072 if (err >= sizeof(peer)) {
3073 ERROR("peer name too long");
3074 return -1;
3075 }
26c39028 3076
3cfc0f3a
MN
3077 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
3078 if (err) {
3079 ERROR("failed to create vlan interface '%s' on '%s' : %s",
3080 peer, netdev->link, strerror(-err));
26c39028
JHS
3081 return -1;
3082 }
3083
3084 netdev->ifindex = if_nametoindex(peer);
3085 if (!netdev->ifindex) {
3086 ERROR("failed to retrieve the ifindex for %s", peer);
b84f58b9 3087 lxc_netdev_delete_by_name(peer);
26c39028
JHS
3088 return -1;
3089 }
3090
e892973e
DL
3091 DEBUG("instanciated vlan '%s', ifindex is '%d'", " vlan1000",
3092 netdev->ifindex);
3093
26c39028
JHS
3094 return 0;
3095}
3096
74a2b586
JK
/* Shutdown handler for vlan devices: performs no action, always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3101
e3b4c4c4 3102static int instanciate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3103{
6168e99f
DL
3104 if (!netdev->link) {
3105 ERROR("no link specified for the physical interface");
3106 return -1;
3107 }
3108
9d083402 3109 netdev->ifindex = if_nametoindex(netdev->link);
82d5ae15 3110 if (!netdev->ifindex) {
9d083402 3111 ERROR("failed to retrieve the index for %s", netdev->link);
0ad19a3f 3112 return -1;
3113 }
3114
e3b4c4c4
ST
3115 if (netdev->upscript) {
3116 int err;
751d9dcd
DL
3117 err = run_script(handler->name, "net", netdev->upscript,
3118 "up", "phys", netdev->link, (char*) NULL);
3119 if (err)
e3b4c4c4 3120 return -1;
e3b4c4c4
ST
3121 }
3122
82d5ae15 3123 return 0;
0ad19a3f 3124}
3125
74a2b586
JK
3126static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3127{
3128 int err;
3129
3130 if (netdev->downscript) {
3131 err = run_script(handler->name, "net", netdev->downscript,
3132 "down", "phys", netdev->link, (char*) NULL);
3133 if (err)
3134 return -1;
3135 }
3136 return 0;
3137}
3138
26b797f3
SH
3139static int instanciate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
3140{
3141 netdev->ifindex = 0;
3142 return 0;
3143}
3144
e3b4c4c4 3145static int instanciate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
0ad19a3f 3146{
82d5ae15 3147 netdev->ifindex = 0;
e3b4c4c4
ST
3148 if (netdev->upscript) {
3149 int err;
751d9dcd
DL
3150 err = run_script(handler->name, "net", netdev->upscript,
3151 "up", "empty", (char*) NULL);
3152 if (err)
e3b4c4c4 3153 return -1;
e3b4c4c4 3154 }
82d5ae15 3155 return 0;
0ad19a3f 3156}
3157
74a2b586
JK
3158static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3159{
3160 int err;
3161
3162 if (netdev->downscript) {
3163 err = run_script(handler->name, "net", netdev->downscript,
3164 "down", "empty", (char*) NULL);
3165 if (err)
3166 return -1;
3167 }
3168 return 0;
3169}
3170
26b797f3
SH
/* Shutdown handler for the "none" type: performs no action, always returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3175
3176int lxc_requests_empty_network(struct lxc_handler *handler)
3177{
3178 struct lxc_list *network = &handler->conf->network;
3179 struct lxc_list *iterator;
3180 struct lxc_netdev *netdev;
3181 bool found_none = false, found_nic = false;
3182
3183 if (lxc_list_empty(network))
3184 return 0;
3185
3186 lxc_list_for_each(iterator, network) {
3187
3188 netdev = iterator->elem;
3189
3190 if (netdev->type == LXC_NET_NONE)
3191 found_none = true;
3192 else
3193 found_nic = true;
3194 }
3195 if (found_none && !found_nic)
3196 return 1;
3197 return 0;
3198}
3199
e3b4c4c4 3200int lxc_create_network(struct lxc_handler *handler)
0ad19a3f 3201{
e3b4c4c4 3202 struct lxc_list *network = &handler->conf->network;
82d5ae15 3203 struct lxc_list *iterator;
82d5ae15 3204 struct lxc_netdev *netdev;
cbef6c52
SH
3205 int am_root = (getuid() == 0);
3206
3207 if (!am_root)
3208 return 0;
0ad19a3f 3209
5f4535a3 3210 lxc_list_for_each(iterator, network) {
0ad19a3f 3211
5f4535a3 3212 netdev = iterator->elem;
13954cce 3213
24654103 3214 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
82d5ae15 3215 ERROR("invalid network configuration type '%d'",
5f4535a3 3216 netdev->type);
82d5ae15
DL
3217 return -1;
3218 }
0ad19a3f 3219
e3b4c4c4 3220 if (netdev_conf[netdev->type](handler, netdev)) {
82d5ae15
DL
3221 ERROR("failed to create netdev");
3222 return -1;
3223 }
e3b4c4c4 3224
0ad19a3f 3225 }
3226
3227 return 0;
3228}
3229
74a2b586 3230void lxc_delete_network(struct lxc_handler *handler)
7fef7a06 3231{
74a2b586 3232 struct lxc_list *network = &handler->conf->network;
7fef7a06
DL
3233 struct lxc_list *iterator;
3234 struct lxc_netdev *netdev;
3235
3236 lxc_list_for_each(iterator, network) {
3237 netdev = iterator->elem;
d472214b 3238
74a2b586 3239 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
d8f8e352
DL
3240 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3241 WARN("failed to rename to the initial name the " \
3242 "netdev '%s'", netdev->link);
d472214b 3243 continue;
d8f8e352 3244 }
d472214b 3245
74a2b586
JK
3246 if (netdev_deconf[netdev->type](handler, netdev)) {
3247 WARN("failed to destroy netdev");
3248 }
3249
d8f8e352
DL
3250 /* Recent kernel remove the virtual interfaces when the network
3251 * namespace is destroyed but in case we did not moved the
3252 * interface to the network namespace, we have to destroy it
3253 */
74a2b586
JK
3254 if (netdev->ifindex != 0 &&
3255 lxc_netdev_delete_by_index(netdev->ifindex))
d8f8e352 3256 WARN("failed to remove interface '%s'", netdev->name);
7fef7a06
DL
3257 }
3258}
3259
45e854dc
SG
3260#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3261
fe1f672f
ÇO
3262/* lxc-user-nic returns "interface_name:interface_name\n" */
3263#define MAX_BUFFER_SIZE IFNAMSIZ*2 + 2
74a3920a 3264static int unpriv_assign_nic(struct lxc_netdev *netdev, pid_t pid)
cbef6c52
SH
3265{
3266 pid_t child;
a7242d9a
ÇO
3267 int bytes, pipefd[2];
3268 char *token, *saveptr = NULL;
fe1f672f 3269 char buffer[MAX_BUFFER_SIZE];
cbef6c52
SH
3270
3271 if (netdev->type != LXC_NET_VETH) {
3272 ERROR("nic type %d not support for unprivileged use",
3273 netdev->type);
3274 return -1;
3275 }
3276
a7242d9a
ÇO
3277 if(pipe(pipefd) < 0) {
3278 SYSERROR("pipe failed");
3279 return -1;
3280 }
3281
cbef6c52
SH
3282 if ((child = fork()) < 0) {
3283 SYSERROR("fork");
a7242d9a
ÇO
3284 close(pipefd[0]);
3285 close(pipefd[1]);
3286 return -1;
3287 }
3288
3289 if (child == 0) { // child
3290 /* close the read-end of the pipe */
3291 close(pipefd[0]);
3292 /* redirect the stdout to write-end of the pipe */
3293 dup2(pipefd[1], STDOUT_FILENO);
3294 /* close the write-end of the pipe */
fe1f672f 3295 close(pipefd[1]);
a7242d9a
ÇO
3296
3297 // Call lxc-user-nic pid type bridge
3298 char pidstr[20];
3299 char *args[] = {LXC_USERNIC_PATH, pidstr, "veth", netdev->link, netdev->name, NULL };
3300 snprintf(pidstr, 19, "%lu", (unsigned long) pid);
3301 pidstr[19] = '\0';
3302 execvp(args[0], args);
3303 SYSERROR("execvp lxc-user-nic");
3304 exit(1);
3305 }
3306
3307 /* close the write-end of the pipe */
3308 close(pipefd[1]);
3309
fe1f672f 3310 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
a7242d9a
ÇO
3311 if (bytes < 0) {
3312 SYSERROR("read failed");
3313 }
3314 buffer[bytes - 1] = '\0';
3315
3316 if (wait_for_pid(child) != 0) {
3317 close(pipefd[0]);
cbef6c52
SH
3318 return -1;
3319 }
3320
a7242d9a
ÇO
3321 /* close the read-end of the pipe */
3322 close(pipefd[0]);
cbef6c52 3323
a7242d9a
ÇO
3324 /* fill netdev->name field */
3325 token = strtok_r(buffer, ":", &saveptr);
3326 if (!token)
3327 return -1;
658979c5
SH
3328 netdev->name = malloc(IFNAMSIZ+1);
3329 if (!netdev->name) {
3330 ERROR("Out of memory");
3331 return -1;
3332 }
3333 memset(netdev->name, 0, IFNAMSIZ+1);
3334 strncpy(netdev->name, token, IFNAMSIZ);
a7242d9a
ÇO
3335
3336 /* fill netdev->veth_attr.pair field */
3337 token = strtok_r(NULL, ":", &saveptr);
3338 if (!token)
3339 return -1;
3340 netdev->priv.veth_attr.pair = strdup(token);
658979c5
SH
3341 if (!netdev->priv.veth_attr.pair) {
3342 ERROR("Out of memory");
3343 return -1;
3344 }
45e854dc 3345
a7242d9a 3346 return 0;
cbef6c52
SH
3347}
3348
5f4535a3 3349int lxc_assign_network(struct lxc_list *network, pid_t pid)
0ad19a3f 3350{
82d5ae15 3351 struct lxc_list *iterator;
82d5ae15 3352 struct lxc_netdev *netdev;
cbef6c52 3353 int am_root = (getuid() == 0);
3cfc0f3a 3354 int err;
0ad19a3f 3355
5f4535a3 3356 lxc_list_for_each(iterator, network) {
82d5ae15 3357
5f4535a3 3358 netdev = iterator->elem;
82d5ae15 3359
fbb16259 3360 if (netdev->type == LXC_NET_VETH && !am_root) {
cbef6c52
SH
3361 if (unpriv_assign_nic(netdev, pid))
3362 return -1;
658979c5
SH
3363 // lxc-user-nic has moved the nic to the new ns.
3364 // unpriv_assign_nic() fills in netdev->name.
3365 // netdev->ifindex will be filed in at setup_netdev.
cbef6c52
SH
3366 continue;
3367 }
236087a6 3368
fbb16259
SH
3369 /* empty network namespace, nothing to move */
3370 if (!netdev->ifindex)
3371 continue;
3372
d472214b 3373 err = lxc_netdev_move_by_index(netdev->ifindex, pid);
3cfc0f3a
MN
3374 if (err) {
3375 ERROR("failed to move '%s' to the container : %s",
3376 netdev->link, strerror(-err));
82d5ae15
DL
3377 return -1;
3378 }
3379
c1c75c04 3380 DEBUG("move '%s' to '%d'", netdev->name, pid);
0ad19a3f 3381 }
3382
3383 return 0;
3384}
3385
251d0d2a
DE
3386static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3387 size_t buf_size)
f6d3e3e4
SH
3388{
3389 char path[PATH_MAX];
e4ccd113 3390 int ret, closeret;
f6d3e3e4
SH
3391 FILE *f;
3392
3393 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
3394 if (ret < 0 || ret >= PATH_MAX) {
03fadd16 3395 fprintf(stderr, "%s: path name too long\n", __func__);
f6d3e3e4
SH
3396 return -E2BIG;
3397 }
3398 f = fopen(path, "w");
3399 if (!f) {
3400 perror("open");
3401 return -EINVAL;
3402 }
251d0d2a 3403 ret = fwrite(buf, buf_size, 1, f);
f6d3e3e4 3404 if (ret < 0)
e4ccd113
SH
3405 SYSERROR("writing id mapping");
3406 closeret = fclose(f);
3407 if (closeret)
3408 SYSERROR("writing id mapping");
3409 return ret < 0 ? ret : closeret;
f6d3e3e4
SH
3410}
3411
3412int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3413{
3414 struct lxc_list *iterator;
3415 struct id_map *map;
8afb3e61 3416 int ret = 0, use_shadow = 0;
251d0d2a 3417 enum idtype type;
8afb3e61
SG
3418 char *buf = NULL, *pos, *cmdpath = NULL;
3419
9d9c111c 3420 cmdpath = on_path("newuidmap", NULL);
8afb3e61
SG
3421 if (cmdpath) {
3422 use_shadow = 1;
3423 free(cmdpath);
3424 }
3425
0e6e3a41
SG
3426 if (!use_shadow && geteuid()) {
3427 ERROR("Missing newuidmap/newgidmap");
3428 return -1;
3429 }
251d0d2a
DE
3430
3431 for(type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) {
4f7521b4 3432 int left, fill;
cf3ef16d
SH
3433 int had_entry = 0;
3434 if (!buf) {
3435 buf = pos = malloc(4096);
4f7521b4
SH
3436 if (!buf)
3437 return -ENOMEM;
cf3ef16d
SH
3438 }
3439 pos = buf;
0e6e3a41 3440 if (use_shadow)
d1838f34 3441 pos += sprintf(buf, "new%cidmap %d",
cf3ef16d
SH
3442 type == ID_TYPE_UID ? 'u' : 'g',
3443 pid);
4f7521b4 3444
cf3ef16d
SH
3445 lxc_list_for_each(iterator, idmap) {
3446 /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
251d0d2a 3447 map = iterator->elem;
cf3ef16d
SH
3448 if (map->idtype != type)
3449 continue;
3450
3451 had_entry = 1;
3452 left = 4096 - (pos - buf);
d1838f34 3453 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
0e6e3a41 3454 use_shadow ? " " : "",
d1838f34 3455 map->nsid, map->hostid, map->range,
0e6e3a41 3456 use_shadow ? "" : "\n");
cf3ef16d
SH
3457 if (fill <= 0 || fill >= left)
3458 SYSERROR("snprintf failed, too many mappings");
3459 pos += fill;
251d0d2a 3460 }
cf3ef16d 3461 if (!had_entry)
4f7521b4 3462 continue;
cf3ef16d 3463
0e6e3a41 3464 if (!use_shadow) {
cf3ef16d 3465 ret = write_id_mapping(type, pid, buf, pos-buf);
d1838f34
MS
3466 } else {
3467 left = 4096 - (pos - buf);
3468 fill = snprintf(pos, left, "\n");
3469 if (fill <= 0 || fill >= left)
3470 SYSERROR("snprintf failed, too many mappings");
3471 pos += fill;
cf3ef16d 3472 ret = system(buf);
d1838f34 3473 }
cf3ef16d 3474
f6d3e3e4
SH
3475 if (ret)
3476 break;
3477 }
251d0d2a 3478
4f7521b4
SH
3479 if (buf)
3480 free(buf);
f6d3e3e4
SH
3481 return ret;
3482}
3483
cf3ef16d 3484/*
7b50c609
TS
3485 * return the host uid/gid to which the container root is mapped in
3486 * *val.
0b3a6504 3487 * Return true if id was found, false otherwise.
cf3ef16d 3488 */
2a9a80cb 3489bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3ec1648d 3490 unsigned long *val)
cf3ef16d
SH
3491{
3492 struct lxc_list *it;
3493 struct id_map *map;
3494
3495 lxc_list_for_each(it, &conf->id_map) {
3496 map = it->elem;
7b50c609 3497 if (map->idtype != idtype)
cf3ef16d
SH
3498 continue;
3499 if (map->nsid != 0)
3500 continue;
2a9a80cb
SH
3501 *val = map->hostid;
3502 return true;
cf3ef16d 3503 }
2a9a80cb 3504 return false;
cf3ef16d
SH
3505}
3506
2133f58c 3507int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3508{
3509 struct lxc_list *it;
3510 struct id_map *map;
3511 lxc_list_for_each(it, &conf->id_map) {
3512 map = it->elem;
2133f58c 3513 if (map->idtype != idtype)
cf3ef16d
SH
3514 continue;
3515 if (id >= map->hostid && id < map->hostid + map->range)
57d116ab 3516 return (id - map->hostid) + map->nsid;
cf3ef16d 3517 }
57d116ab 3518 return -1;
cf3ef16d
SH
3519}
3520
2133f58c 3521int find_unmapped_nsuid(struct lxc_conf *conf, enum idtype idtype)
cf3ef16d
SH
3522{
3523 struct lxc_list *it;
3524 struct id_map *map;
2133f58c 3525 unsigned int freeid = 0;
cf3ef16d
SH
3526again:
3527 lxc_list_for_each(it, &conf->id_map) {
3528 map = it->elem;
2133f58c 3529 if (map->idtype != idtype)
cf3ef16d
SH
3530 continue;
3531 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3532 freeid = map->nsid + map->range;
3533 goto again;
3534 }
3535 }
3536 return freeid;
3537}
3538
19a26f82
MK
3539int lxc_find_gateway_addresses(struct lxc_handler *handler)
3540{
3541 struct lxc_list *network = &handler->conf->network;
3542 struct lxc_list *iterator;
3543 struct lxc_netdev *netdev;
3544 int link_index;
3545
3546 lxc_list_for_each(iterator, network) {
3547 netdev = iterator->elem;
3548
3549 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3550 continue;
3551
3552 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3553 ERROR("gateway = auto only supported for "
3554 "veth and macvlan");
3555 return -1;
3556 }
3557
3558 if (!netdev->link) {
3559 ERROR("gateway = auto needs a link interface");
3560 return -1;
3561 }
3562
3563 link_index = if_nametoindex(netdev->link);
3564 if (!link_index)
3565 return -EINVAL;
3566
3567 if (netdev->ipv4_gateway_auto) {
3568 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3569 ERROR("failed to automatically find ipv4 gateway "
3570 "address from link interface '%s'", netdev->link);
3571 return -1;
3572 }
3573 }
3574
3575 if (netdev->ipv6_gateway_auto) {
3576 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3577 ERROR("failed to automatically find ipv6 gateway "
3578 "address from link interface '%s'", netdev->link);
3579 return -1;
3580 }
3581 }
3582 }
3583
3584 return 0;
3585}
3586
5e4a62bf 3587int lxc_create_tty(const char *name, struct lxc_conf *conf)
b0a33c1e 3588{
5e4a62bf 3589 struct lxc_tty_info *tty_info = &conf->tty_info;
025ed0f3 3590 int i, ret;
b0a33c1e 3591
5e4a62bf
DL
3592 /* no tty in the configuration */
3593 if (!conf->tty)
b0a33c1e 3594 return 0;
3595
13954cce 3596 tty_info->pty_info =
e4e7d59d 3597 malloc(sizeof(*tty_info->pty_info)*conf->tty);
b0a33c1e 3598 if (!tty_info->pty_info) {
36eb9bde 3599 SYSERROR("failed to allocate pty_info");
985d15b1 3600 return -1;
b0a33c1e 3601 }
3602
985d15b1 3603 for (i = 0; i < conf->tty; i++) {
13954cce 3604
b0a33c1e 3605 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3606
025ed0f3
SH
3607 process_lock();
3608 ret = openpty(&pty_info->master, &pty_info->slave,
3609 pty_info->name, NULL, NULL);
3610 process_unlock();
3611 if (ret) {
36eb9bde 3612 SYSERROR("failed to create pty #%d", i);
985d15b1
MT
3613 tty_info->nbtty = i;
3614 lxc_delete_tty(tty_info);
3615 return -1;
b0a33c1e 3616 }
3617
5332bb84
DL
3618 DEBUG("allocated pty '%s' (%d/%d)",
3619 pty_info->name, pty_info->master, pty_info->slave);
3620
3ec1648d 3621 /* Prevent leaking the file descriptors to the container */
b035ad62
MS
3622 fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3623 fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3624
b0a33c1e 3625 pty_info->busy = 0;
3626 }
3627
985d15b1 3628 tty_info->nbtty = conf->tty;
1ac470c0
DL
3629
3630 INFO("tty's configured");
3631
985d15b1 3632 return 0;
b0a33c1e 3633}
3634
3635void lxc_delete_tty(struct lxc_tty_info *tty_info)
3636{
3637 int i;
3638
3639 for (i = 0; i < tty_info->nbtty; i++) {
3640 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3641
3642 close(pty_info->master);
3643 close(pty_info->slave);
3644 }
3645
3646 free(tty_info->pty_info);
3647 tty_info->nbtty = 0;
3648}
3649
f6d3e3e4 3650/*
7b50c609
TS
3651 * chown_mapped_root: for an unprivileged user with uid/gid X to
3652 * chown a dir to subuid/subgid Y, he needs to run chown as root
3653 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3654 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3655 * root is privileged with respect to hostuid/hostgid X, allowing
3656 * him to do the chown.
f6d3e3e4 3657 */
c4d10a05 3658int chown_mapped_root(char *path, struct lxc_conf *conf)
f6d3e3e4 3659{
7b50c609
TS
3660 uid_t rootuid;
3661 gid_t rootgid;
c4d10a05 3662 pid_t pid;
2a9a80cb 3663 unsigned long val;
a7ef8753 3664 char *chownpath = path;
f6d3e3e4 3665
2a9a80cb 3666 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
c4d10a05
SH
3667 ERROR("No mapping for container root");
3668 return -1;
f6d3e3e4 3669 }
7b50c609
TS
3670 rootuid = (uid_t) val;
3671 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3672 ERROR("No mapping for container root");
3673 return -1;
3674 }
3675 rootgid = (gid_t) val;
2a9a80cb 3676
a7ef8753
SH
3677 /*
3678 * In case of overlay, we want only the writeable layer
3679 * to be chowned
3680 */
1f92162d 3681 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
a7ef8753
SH
3682 chownpath = strchr(path, ':');
3683 if (!chownpath) {
3684 ERROR("Bad overlay path: %s", path);
3685 return -1;
3686 }
3687 chownpath = strchr(chownpath+1, ':');
3688 if (!chownpath) {
3689 ERROR("Bad overlay path: %s", path);
3690 return -1;
3691 }
3692 chownpath++;
3693 }
3694 path = chownpath;
c4d10a05 3695 if (geteuid() == 0) {
7b50c609 3696 if (chown(path, rootuid, rootgid) < 0) {
c4d10a05
SH
3697 ERROR("Error chowning %s", path);
3698 return -1;
3699 }
3700 return 0;
3701 }
f3d7e4ca 3702
7b50c609 3703 if (rootuid == geteuid()) {
f3d7e4ca
SH
3704 // nothing to do
3705 INFO("%s: container root is our uid; no need to chown" ,__func__);
3706 return 0;
3707 }
3708
c4d10a05
SH
3709 pid = fork();
3710 if (pid < 0) {
3711 SYSERROR("Failed forking");
f6d3e3e4
SH
3712 return -1;
3713 }
c4d10a05 3714 if (!pid) {
7b50c609
TS
3715 int hostuid = geteuid(), hostgid = getegid(), ret;
3716 struct stat sb;
3717 char map1[100], map2[100], map3[100], map4[100], map5[100];
3718 char ugid[100];
3719 char *args1[] = { "lxc-usernsexec", "-m", map1, "-m", map2,
3720 "-m", map3, "-m", map5,
3721 "--", "chown", ugid, path, NULL };
3722 char *args2[] = { "lxc-usernsexec", "-m", map1, "-m", map2,
3723 "-m", map3, "-m", map4, "-m", map5,
3724 "--", "chown", ugid, path, NULL };
3725
3726 // save the current gid of "path"
3727 if (stat(path, &sb) < 0) {
3728 ERROR("Error stat %s", path);
3729 return -1;
3730 }
f6d3e3e4 3731
9a7c2aba
SH
3732 /*
3733 * A file has to be group-owned by a gid mapped into the
3734 * container, or the container won't be privileged over it.
3735 */
3736 if (sb.st_uid == geteuid() &&
3737 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3738 chown(path, -1, hostgid) < 0) {
3739 ERROR("Failed chgrping %s", path);
7b50c609
TS
3740 return -1;
3741 }
3742
3743 // "u:0:rootuid:1"
3744 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
c4d10a05
SH
3745 if (ret < 0 || ret >= 100) {
3746 ERROR("Error uid printing map string");
f6d3e3e4
SH
3747 return -1;
3748 }
c4d10a05 3749
98e5ba51
SH
3750 // "u:hostuid:hostuid:1"
3751 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3752 if (ret < 0 || ret >= 100) {
3753 ERROR("Error uid printing map string");
3754 return -1;
3755 }
3756
7b50c609
TS
3757 // "g:0:rootgid:1"
3758 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
c4d10a05 3759 if (ret < 0 || ret >= 100) {
7b50c609 3760 ERROR("Error gid printing map string");
c4d10a05
SH
3761 return -1;
3762 }
3763
7b50c609 3764 // "g:pathgid:rootgid+pathgid:1"
b4c1e35d
SG
3765 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3766 rootgid + (gid_t)sb.st_gid);
7b50c609
TS
3767 if (ret < 0 || ret >= 100) {
3768 ERROR("Error gid printing map string");
3769 return -1;
3770 }
3771
3772 // "g:hostgid:hostgid:1"
3773 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3774 if (ret < 0 || ret >= 100) {
3775 ERROR("Error gid printing map string");
3776 return -1;
3777 }
3778
3779 // "0:pathgid" (chown)
b4c1e35d 3780 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
7b50c609
TS
3781 if (ret < 0 || ret >= 100) {
3782 ERROR("Error owner printing format string for chown");
3783 return -1;
3784 }
3785
3786 if (hostgid == sb.st_gid)
3787 ret = execvp("lxc-usernsexec", args1);
3788 else
3789 ret = execvp("lxc-usernsexec", args2);
c4d10a05
SH
3790 SYSERROR("Failed executing usernsexec");
3791 exit(1);
f6d3e3e4 3792 }
c4d10a05 3793 return wait_for_pid(pid);
f6d3e3e4
SH
3794}
3795
c4d10a05 3796int ttys_shift_ids(struct lxc_conf *c)
f6d3e3e4 3797{
c4d10a05 3798 int i;
f6d3e3e4 3799
c4d10a05 3800 if (lxc_list_empty(&c->id_map))
f6d3e3e4 3801 return 0;
c4d10a05
SH
3802
3803 for (i = 0; i < c->tty_info.nbtty; i++) {
3804 struct lxc_pty_info *pty_info = &c->tty_info.pty_info[i];
3805
3806 if (chown_mapped_root(pty_info->name, c) < 0) {
3807 ERROR("Failed to chown %s", pty_info->name);
f6d3e3e4
SH
3808 return -1;
3809 }
3810 }
3811
29b10e4f 3812 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
c4d10a05
SH
3813 ERROR("Failed to chown %s", c->console.name);
3814 return -1;
3815 }
3816
f6d3e3e4
SH
3817 return 0;
3818}
3819
bc6928ff
MW
/*
 * This routine is called when the configuration does not already specify a value
 * for autodev (mounting a file system on /dev and populating it in a container).
 * If a hard override value has not been specified, then we try to apply some
 * heuristics to determine if we should switch to autodev mode.
 *
 * For instance, if the container has an /etc/systemd/system directory then it
 * is probably running systemd as the init process and it needs the autodev
 * mount to prevent it from mounting devtmpfs on /dev on its own, causing conflicts
 * in the host.
 *
 * We may also want to enable autodev if the host has devtmpfs mounted on its
 * /dev, as this then enables us to use subdirectories under /dev for the container
 * /dev directories and we can fake udev devices.
 */
/* Start arguments as handed to check_autodev(): the command line to exec. */
struct start_args {
	char *const *argv;
};

/* Upper bound on the number of symlinks followed by check_autodev(). */
#define MAX_SYMLINK_DEPTH 32

/*
 * Heuristically decide whether the container should use autodev.
 *
 * @rootfs : path of the container root filesystem.
 * @data   : optional struct start_args; argv[0], if set, replaces the
 *           default command "/sbin/init".
 *
 * The command path is followed through symlinks (each one re-anchored
 * under the resolved rootfs); if any path seen along the way contains
 * "systemd", autodev is wanted.
 *
 * Returns 1 if autodev should be enabled, 0 if it is not required and
 * -2 if this cannot be determined (bad rootfs, inaccessible command,
 * over-long path).
 */
static int check_autodev( const char *rootfs, void *data )
{
	struct start_args *arg = data;
	int ret;
	int loop_count = 0;
	struct stat s;
	char absrootfs[MAXPATHLEN];
	char path[MAXPATHLEN];
	char abs_path[MAXPATHLEN];
	char *command = "/sbin/init";

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if( arg && arg->argv[0] ) {
		command = arg->argv[0];
		DEBUG("Set exec command to %s", command );
	}

	/* snprintf always NUL-terminates; strncpy would not on truncation */
	ret = snprintf(path, sizeof(path), "%s", command);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return -2;

	if ( 0 != access(path, F_OK) || 0 != stat(path, &s) )
		return -2;

	/* Dereference down the symlink merry path testing as we go. */
	/* If anything references systemd in the path - set autodev! */
	/* Renormalize to the rootfs before each dereference */
	/* Relative symlinks should fall out in the wash even with .. */
	while( 1 ) {
		if ( strstr( path, "systemd" ) ) {
			INFO("Container with systemd init detected - enabling autodev!");
			return 1;
		}

		/* a result >= the buffer size means the path was truncated */
		ret = snprintf(abs_path, sizeof(abs_path), "%s/%s", absrootfs, path);
		if (ret < 0 || (size_t)ret >= sizeof(abs_path))
			return -2;

		ret = readlink( abs_path, path, MAXPATHLEN-1 );

		if ( ( ret <= 0 ) || ( ++loop_count > MAX_SYMLINK_DEPTH ) ) {
			break; /* Break out for other tests */
		}
		path[ret] = '\0';
	}

	/*
	 * Add future checks here.
	 * Return positive if we should go autodev
	 * Return 0 if we should NOT go autodev
	 * Return negative if we encounter an error or can not determine...
	 */

	/* All else fails, we don't need autodev */
	INFO("Autodev not required.");
	return 0;
}
3901
5112cd70
SH
/*
 * do_tmp_proc_mount: Mount /proc inside container if not already
 * mounted
 *
 * @rootfs : the rootfs where proc should be mounted
 *
 * <rootfs>/proc/self is read as a symlink: if it cannot be read, proc
 * is mounted; if it does not point to "1", whatever is mounted on
 * <rootfs>/proc is lazily unmounted and proc is mounted afresh.
 *
 * Returns < 0 on failure, 0 if the correct proc was already mounted
 * and 1 if a new proc was mounted.
 */
static int do_tmp_proc_mount(const char *rootfs)
{
	char path[MAXPATHLEN];
	char link[20];
	int linklen, rc;

	rc = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
	if (rc < 0 || rc >= MAXPATHLEN) {
		SYSERROR("proc path name too long");
		return -1;
	}

	memset(link, 0, 20);
	linklen = readlink(path, link, 20);
	INFO("I am %d, /proc/self points to '%s'", getpid(), link);

	/* from here on path names the proc mount point itself */
	snprintf(path, MAXPATHLEN, "%s/proc", rootfs);

	if (linklen >= 0) {
		/* can't be longer than rootfs/proc/1 */
		if (strncmp(link, "1", linklen) == 0) {
			/* the right proc is already mounted */
			return 0;
		}

		/* wrong /procs mounted */
		umount2(path, MNT_DETACH); /* ignore failure */
	}

	/* either nothing or the wrong proc was there: mount a fresh one */
	if (mount("proc", path, "proc", 0, NULL))
		return -1;

	INFO("Mounted /proc in container for security transition");
	return 1;
}
3943
3944int tmp_proc_mount(struct lxc_conf *lxc_conf)
3945{
3946 int mounted;
3947
3948 if (lxc_conf->rootfs.path == NULL || strlen(lxc_conf->rootfs.path) == 0) {
3949 if (mount("proc", "/proc", "proc", 0, NULL)) {
3950 SYSERROR("Failed mounting /proc, proceeding");
3951 mounted = 0;
3952 } else
3953 mounted = 1;
3954 } else
3955 mounted = do_tmp_proc_mount(lxc_conf->rootfs.mount);
3956 if (mounted == -1) {
3957 SYSERROR("failed to mount /proc in the container.");
3958 return -1;
3959 } else if (mounted == 1) {
3960 lxc_conf->tmp_umount_proc = 1;
3961 }
3962 return 0;
3963}
3964
3965void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3966{
3967 if (lxc_conf->tmp_umount_proc == 1) {
3968 umount("/proc");
3969 lxc_conf->tmp_umount_proc = 0;
3970 }
3971}
3972
e995d7a2
SH
/*
 * Walk /proc/self/mountinfo and change any shared entries to slave.
 *
 * For every line, the mount target is taken with get_field(line, 4)
 * and the options examined are taken with get_field(target, 2); if
 * they contain "shared" the target is remounted with MS_SLAVE.
 * All failures are logged and otherwise ignored.
 */
static void remount_all_slave(void)
{
	FILE *mounts = fopen("/proc/self/mountinfo", "r");
	char *line = NULL;
	size_t cap = 0;

	if (!mounts) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &cap, mounts) != -1) {
		char *target = get_field(line, 4);
		char *opts;

		if (!target)
			continue;

		opts = get_field(target, 2);
		if (!opts)
			continue;

		/* terminate opts first so the search stays within that field */
		null_endofword(opts);
		if (strstr(opts, "shared") == NULL)
			continue;

		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(mounts);
	free(line);
}
4007
2322903b
SH
4008void lxc_execute_bind_init(struct lxc_conf *conf)
4009{
4010 int ret;
9d9c111c
SH
4011 char path[PATH_MAX], destpath[PATH_MAX], *p;
4012
4013 /* If init exists in the container, don't bind mount a static one */
4014 p = choose_init(conf->rootfs.mount);
4015 if (p) {
4016 free(p);
4017 return;
4018 }
2322903b
SH
4019
4020 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
4021 if (ret < 0 || ret >= PATH_MAX) {
4022 WARN("Path name too long searching for lxc.init.static");
4023 return;
4024 }
4025
4026 if (!file_exists(path)) {
4027 INFO("%s does not exist on host", path);
4028 return;
4029 }
4030
4031 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4032 if (ret < 0 || ret >= PATH_MAX) {
4033 WARN("Path name too long for container's lxc.init.static");
4034 return;
4035 }
4036
4037 if (!file_exists(destpath)) {
4038 FILE * pathfile = fopen(destpath, "wb");
4039 if (!pathfile) {
4040 SYSERROR("Failed to create mount target '%s'", destpath);
4041 return;
4042 }
4043 fclose(pathfile);
4044 }
4045
4046 ret = mount(path, destpath, "none", MS_BIND, NULL);
4047 if (ret < 0)
4048 SYSERROR("Failed to bind lxc.init.static into container");
4049 INFO("lxc.init.static bound into container at %s", path);
4050}
4051
35120d9c
SH
4052/*
4053 * This does the work of remounting / if it is shared, calling the
4054 * container pre-mount hooks, and mounting the rootfs.
4055 */
4056int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
0ad19a3f 4057{
35120d9c
SH
4058 if (conf->rootfs_setup) {
4059 /*
4060 * rootfs was set up in another namespace. bind-mount it
4061 * to give us a mount in our own ns so we can pivot_root to it
4062 */
4063 const char *path = conf->rootfs.mount;
4064 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4065 ERROR("Failed to bind-mount container / onto itself");
4066 return false;
4067 }
4068 }
d4ef7c50 4069
cd698bdd 4070 if (detect_ramfs_rootfs()) {
35120d9c 4071 if (chroot_into_slave(conf)) {
cd698bdd
FK
4072 ERROR("Failed to chroot into slave /");
4073 return -1;
4074 }
4075 }
4076
e995d7a2
SH
4077 remount_all_slave();
4078
35120d9c
SH
4079 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4080 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4081 return -1;
4082 }
4083
4084 if (setup_rootfs(conf)) {
4085 ERROR("failed to setup rootfs for '%s'", name);
4086 return -1;
4087 }
4088
4089 conf->rootfs_setup = true;
4090 return 0;
4091}
4092
1c1c7051
SH
4093static bool verify_start_hooks(struct lxc_conf *conf)
4094{
4095 struct lxc_list *it;
4096 char path[MAXPATHLEN];
4097 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4098 char *hookname = it->elem;
4099 struct stat st;
4100 int ret;
4101
4102 ret = snprintf(path, MAXPATHLEN, "%s%s",
4103 conf->rootfs.mount, hookname);
4104 if (ret < 0 || ret >= MAXPATHLEN)
4105 return false;
4106 ret = stat(path, &st);
4107 if (ret) {
4108 SYSERROR("Start hook %s not found in container rootfs",
4109 hookname);
4110 return false;
4111 }
4112 }
4113
4114 return true;
4115}
4116
35120d9c
SH
4117int lxc_setup(struct lxc_handler *handler)
4118{
4119 const char *name = handler->name;
4120 struct lxc_conf *lxc_conf = handler->conf;
4121 const char *lxcpath = handler->lxcpath;
4122 void *data = handler->data;
4123
4124 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4125 ERROR("Error setting up rootfs mount after spawn");
4126 return -1;
4127 }
4128
6c544cb3
MM
4129 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4130 if (setup_utsname(lxc_conf->utsname)) {
4131 ERROR("failed to setup the utsname for '%s'", name);
4132 return -1;
4133 }
0ad19a3f 4134 }
4135
5f4535a3 4136 if (setup_network(&lxc_conf->network)) {
36eb9bde 4137 ERROR("failed to setup the network for '%s'", name);
95b5ffaf 4138 return -1;
0ad19a3f 4139 }
4140
bc6928ff
MW
4141 if (lxc_conf->autodev < 0) {
4142 lxc_conf->autodev = check_autodev(lxc_conf->rootfs.mount, data);
4143 }
4144
4145 if (lxc_conf->autodev > 0) {
4146 if (mount_autodev(name, lxc_conf->rootfs.mount, lxcpath)) {
91c3830e 4147 ERROR("failed to mount /dev in the container");
c6883f38
SH
4148 return -1;
4149 }
4150 }
4151
368bbc02
CS
4152 /* do automatic mounts (mainly /proc and /sys), but exclude
4153 * those that need to wait until other stuff has finished
4154 */
4fb3cba5 4155 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4156 ERROR("failed to setup the automatic mounts for '%s'", name);
4157 return -1;
4158 }
4159
80a881b2 4160 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) {
36eb9bde 4161 ERROR("failed to setup the mounts for '%s'", name);
95b5ffaf 4162 return -1;
576f946d 4163 }
4164
c1dc38c2 4165 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name)) {
e7938e9e
MN
4166 ERROR("failed to setup the mount entries for '%s'", name);
4167 return -1;
4168 }
4169
1c1c7051
SH
4170 /* Make sure any start hooks are in the rootfs */
4171 if (!verify_start_hooks(lxc_conf))
4172 return -1;
4173
2322903b
SH
4174 if (lxc_conf->is_execute)
4175 lxc_execute_bind_init(lxc_conf);
4176
368bbc02
CS
4177 /* now mount only cgroup, if wanted;
4178 * before, /sys could not have been mounted
4179 * (is either mounted automatically or via fstab entries)
4180 */
4fb3cba5 4181 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
368bbc02
CS
4182 ERROR("failed to setup the automatic mounts for '%s'", name);
4183 return -1;
4184 }
4185
283678ed 4186 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
773fb9ca
SH
4187 ERROR("failed to run mount hooks for container '%s'.", name);
4188 return -1;
4189 }
4190
bc6928ff 4191 if (lxc_conf->autodev > 0) {
283678ed 4192 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
f7bee6c6
MW
4193 ERROR("failed to run autodev hooks for container '%s'.", name);
4194 return -1;
4195 }
91c3830e
SH
4196 if (setup_autodev(lxc_conf->rootfs.mount)) {
4197 ERROR("failed to populate /dev in the container");
4198 return -1;
4199 }
4200 }
368bbc02 4201
37903589 4202 if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
36eb9bde 4203 ERROR("failed to setup the console for '%s'", name);
95b5ffaf 4204 return -1;
6e590161 4205 }
4206
7e0e1d94
AV
4207 if (lxc_conf->kmsg) {
4208 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4209 ERROR("failed to setup kmsg for '%s'", name);
4210 }
1bd051a6 4211
37903589 4212 if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
36eb9bde 4213 ERROR("failed to setup the ttys for '%s'", name);
95b5ffaf 4214 return -1;
b0a33c1e 4215 }
4216
69aa6655
DE
4217 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4218 ERROR("failed to setup /dev symlinks for '%s'", name);
4219 return -1;
4220 }
4221
5112cd70
SH
4222 /* mount /proc if it's not already there */
4223 if (tmp_proc_mount(lxc_conf) < 0) {
fe4de9a6 4224 ERROR("failed to LSM mount proc for '%s'", name);
e075f5d9 4225 return -1;
e075f5d9 4226 }
e075f5d9 4227
ac778708 4228 if (setup_pivot_root(&lxc_conf->rootfs)) {
36eb9bde 4229 ERROR("failed to set rootfs for '%s'", name);
95b5ffaf 4230 return -1;
ed502555 4231 }
4232
571e6ec8 4233 if (setup_pts(lxc_conf->pts)) {
36eb9bde 4234 ERROR("failed to setup the new pts instance");
95b5ffaf 4235 return -1;
3c26f34e 4236 }
4237
cccc74b5
DL
4238 if (setup_personality(lxc_conf->personality)) {
4239 ERROR("failed to setup personality");
4240 return -1;
4241 }
4242
f6d3e3e4 4243 if (lxc_list_empty(&lxc_conf->id_map)) {
1fb86a7c
SH
4244 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4245 if (!lxc_list_empty(&lxc_conf->caps)) {
4246 ERROR("Simultaneously requested dropping and keeping caps");
4247 return -1;
4248 }
4249 if (dropcaps_except(&lxc_conf->keepcaps)) {
959aee9c 4250 ERROR("failed to keep requested caps");
1fb86a7c
SH
4251 return -1;
4252 }
4253 } else if (setup_caps(&lxc_conf->caps)) {
f6d3e3e4
SH
4254 ERROR("failed to drop capabilities");
4255 return -1;
4256 }
81810dd1
DL
4257 }
4258
cd54d859
DL
4259 NOTICE("'%s' is setup.", name);
4260
0ad19a3f 4261 return 0;
4262}
26ddeedd 4263
283678ed
SH
4264int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4265 const char *lxcpath, char *argv[])
26ddeedd
SH
4266{
4267 int which = -1;
4268 struct lxc_list *it;
4269
4270 if (strcmp(hook, "pre-start") == 0)
4271 which = LXCHOOK_PRESTART;
5ea6163a
SH
4272 else if (strcmp(hook, "pre-mount") == 0)
4273 which = LXCHOOK_PREMOUNT;
26ddeedd
SH
4274 else if (strcmp(hook, "mount") == 0)
4275 which = LXCHOOK_MOUNT;
f7bee6c6
MW
4276 else if (strcmp(hook, "autodev") == 0)
4277 which = LXCHOOK_AUTODEV;
26ddeedd
SH
4278 else if (strcmp(hook, "start") == 0)
4279 which = LXCHOOK_START;
4280 else if (strcmp(hook, "post-stop") == 0)
4281 which = LXCHOOK_POSTSTOP;
148e91f5
SH
4282 else if (strcmp(hook, "clone") == 0)
4283 which = LXCHOOK_CLONE;
26ddeedd
SH
4284 else
4285 return -1;
4286 lxc_list_for_each(it, &conf->hooks[which]) {
4287 int ret;
4288 char *hookname = it->elem;
283678ed 4289 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
26ddeedd
SH
4290 if (ret)
4291 return ret;
4292 }
4293 return 0;
4294}
72d0e1cb 4295
427b3a21 4296static void lxc_remove_nic(struct lxc_list *it)
72d0e1cb
SG
4297{
4298 struct lxc_netdev *netdev = it->elem;
9ebb03ad 4299 struct lxc_list *it2,*next;
72d0e1cb
SG
4300
4301 lxc_list_del(it);
4302
4303 if (netdev->link)
4304 free(netdev->link);
4305 if (netdev->name)
4306 free(netdev->name);
c9bb9a85
DE
4307 if (netdev->type == LXC_NET_VETH && netdev->priv.veth_attr.pair)
4308 free(netdev->priv.veth_attr.pair);
72d0e1cb
SG
4309 if (netdev->upscript)
4310 free(netdev->upscript);
4311 if (netdev->hwaddr)
4312 free(netdev->hwaddr);
4313 if (netdev->mtu)
4314 free(netdev->mtu);
4315 if (netdev->ipv4_gateway)
4316 free(netdev->ipv4_gateway);
4317 if (netdev->ipv6_gateway)
4318 free(netdev->ipv6_gateway);
9ebb03ad 4319 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4320 lxc_list_del(it2);
4321 free(it2->elem);
4322 free(it2);
4323 }
9ebb03ad 4324 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4325 lxc_list_del(it2);
4326 free(it2->elem);
4327 free(it2);
4328 }
d95db067 4329 free(netdev);
72d0e1cb
SG
4330 free(it);
4331}
4332
4333/* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
12a50cc6 4334int lxc_clear_nic(struct lxc_conf *c, const char *key)
72d0e1cb
SG
4335{
4336 char *p1;
4337 int ret, idx, i;
4338 struct lxc_list *it;
4339 struct lxc_netdev *netdev;
4340
4341 p1 = index(key, '.');
4342 if (!p1 || *(p1+1) == '\0')
4343 p1 = NULL;
4344
4345 ret = sscanf(key, "%d", &idx);
4346 if (ret != 1) return -1;
4347 if (idx < 0)
4348 return -1;
4349
4350 i = 0;
4351 lxc_list_for_each(it, &c->network) {
4352 if (i == idx)
4353 break;
4354 i++;
4355 }
4356 if (i < idx) // we don't have that many nics defined
4357 return -1;
4358
4359 if (!it || !it->elem)
4360 return -1;
4361
4362 netdev = it->elem;
4363
4364 if (!p1) {
4365 lxc_remove_nic(it);
52d21d40 4366 } else if (strcmp(p1, ".ipv4") == 0) {
9ebb03ad
DE
4367 struct lxc_list *it2,*next;
4368 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
72d0e1cb
SG
4369 lxc_list_del(it2);
4370 free(it2->elem);
4371 free(it2);
4372 }
52d21d40 4373 } else if (strcmp(p1, ".ipv6") == 0) {
9ebb03ad
DE
4374 struct lxc_list *it2,*next;
4375 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
72d0e1cb
SG
4376 lxc_list_del(it2);
4377 free(it2->elem);
4378 free(it2);
4379 }
52d21d40 4380 } else if (strcmp(p1, ".link") == 0) {
72d0e1cb
SG
4381 if (netdev->link) {
4382 free(netdev->link);
4383 netdev->link = NULL;
4384 }
52d21d40 4385 } else if (strcmp(p1, ".name") == 0) {
72d0e1cb
SG
4386 if (netdev->name) {
4387 free(netdev->name);
4388 netdev->name = NULL;
4389 }
52d21d40 4390 } else if (strcmp(p1, ".script.up") == 0) {
72d0e1cb
SG
4391 if (netdev->upscript) {
4392 free(netdev->upscript);
4393 netdev->upscript = NULL;
4394 }
52d21d40 4395 } else if (strcmp(p1, ".hwaddr") == 0) {
72d0e1cb
SG
4396 if (netdev->hwaddr) {
4397 free(netdev->hwaddr);
4398 netdev->hwaddr = NULL;
4399 }
52d21d40 4400 } else if (strcmp(p1, ".mtu") == 0) {
72d0e1cb
SG
4401 if (netdev->mtu) {
4402 free(netdev->mtu);
4403 netdev->mtu = NULL;
4404 }
52d21d40 4405 } else if (strcmp(p1, ".ipv4_gateway") == 0) {
72d0e1cb
SG
4406 if (netdev->ipv4_gateway) {
4407 free(netdev->ipv4_gateway);
4408 netdev->ipv4_gateway = NULL;
4409 }
52d21d40 4410 } else if (strcmp(p1, ".ipv6_gateway") == 0) {
72d0e1cb
SG
4411 if (netdev->ipv6_gateway) {
4412 free(netdev->ipv6_gateway);
4413 netdev->ipv6_gateway = NULL;
4414 }
4415 }
4416 else return -1;
4417
4418 return 0;
4419}
4420
4421int lxc_clear_config_network(struct lxc_conf *c)
4422{
9ebb03ad
DE
4423 struct lxc_list *it,*next;
4424 lxc_list_for_each_safe(it, &c->network, next) {
72d0e1cb
SG
4425 lxc_remove_nic(it);
4426 }
4427 return 0;
4428}
4429
4430int lxc_clear_config_caps(struct lxc_conf *c)
4431{
9ebb03ad 4432 struct lxc_list *it,*next;
72d0e1cb 4433
9ebb03ad 4434 lxc_list_for_each_safe(it, &c->caps, next) {
72d0e1cb
SG
4435 lxc_list_del(it);
4436 free(it->elem);
4437 free(it);
4438 }
4439 return 0;
4440}
4441
74a3920a 4442static int lxc_free_idmap(struct lxc_list *id_map) {
27c27d73
SH
4443 struct lxc_list *it, *next;
4444
4355ab5f 4445 lxc_list_for_each_safe(it, id_map, next) {
27c27d73
SH
4446 lxc_list_del(it);
4447 free(it->elem);
4448 free(it);
4449 }
4450 return 0;
4451}
4452
4355ab5f
SH
4453int lxc_clear_idmaps(struct lxc_conf *c)
4454{
4455 return lxc_free_idmap(&c->id_map);
4456}
4457
1fb86a7c
SH
4458int lxc_clear_config_keepcaps(struct lxc_conf *c)
4459{
4460 struct lxc_list *it,*next;
4461
4462 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4463 lxc_list_del(it);
4464 free(it->elem);
4465 free(it);
4466 }
4467 return 0;
4468}
4469
12a50cc6 4470int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
72d0e1cb 4471{
9ebb03ad 4472 struct lxc_list *it,*next;
72d0e1cb 4473 bool all = false;
12a50cc6 4474 const char *k = key + 11;
72d0e1cb
SG
4475
4476 if (strcmp(key, "lxc.cgroup") == 0)
4477 all = true;
4478
9ebb03ad 4479 lxc_list_for_each_safe(it, &c->cgroup, next) {
72d0e1cb
SG
4480 struct lxc_cgroup *cg = it->elem;
4481 if (!all && strcmp(cg->subsystem, k) != 0)
4482 continue;
4483 lxc_list_del(it);
4484 free(cg->subsystem);
4485 free(cg->value);
4486 free(cg);
4487 free(it);
4488 }
4489 return 0;
4490}
4491
ee1e7aa0
SG
4492int lxc_clear_groups(struct lxc_conf *c)
4493{
4494 struct lxc_list *it,*next;
4495
4496 lxc_list_for_each_safe(it, &c->groups, next) {
4497 lxc_list_del(it);
4498 free(it->elem);
4499 free(it);
4500 }
4501 return 0;
4502}
4503
ab799c0b
SG
4504int lxc_clear_environment(struct lxc_conf *c)
4505{
4506 struct lxc_list *it,*next;
4507
4508 lxc_list_for_each_safe(it, &c->environment, next) {
4509 lxc_list_del(it);
4510 free(it->elem);
4511 free(it);
4512 }
4513 return 0;
4514}
4515
4516
72d0e1cb
SG
4517int lxc_clear_mount_entries(struct lxc_conf *c)
4518{
9ebb03ad 4519 struct lxc_list *it,*next;
72d0e1cb 4520
9ebb03ad 4521 lxc_list_for_each_safe(it, &c->mount_list, next) {
72d0e1cb
SG
4522 lxc_list_del(it);
4523 free(it->elem);
4524 free(it);
4525 }
4526 return 0;
4527}
4528
b099e9e9
SH
4529int lxc_clear_automounts(struct lxc_conf *c)
4530{
4531 c->auto_mounts = 0;
4532 return 0;
4533}
4534
12a50cc6 4535int lxc_clear_hooks(struct lxc_conf *c, const char *key)
72d0e1cb 4536{
9ebb03ad 4537 struct lxc_list *it,*next;
17ed13a3 4538 bool all = false, done = false;
12a50cc6 4539 const char *k = key + 9;
72d0e1cb
SG
4540 int i;
4541
17ed13a3
SH
4542 if (strcmp(key, "lxc.hook") == 0)
4543 all = true;
4544
72d0e1cb 4545 for (i=0; i<NUM_LXC_HOOKS; i++) {
17ed13a3 4546 if (all || strcmp(k, lxchook_names[i]) == 0) {
9ebb03ad 4547 lxc_list_for_each_safe(it, &c->hooks[i], next) {
17ed13a3
SH
4548 lxc_list_del(it);
4549 free(it->elem);
4550 free(it);
4551 }
4552 done = true;
72d0e1cb
SG
4553 }
4554 }
17ed13a3
SH
4555
4556 if (!done) {
4557 ERROR("Invalid hook key: %s", key);
4558 return -1;
4559 }
72d0e1cb
SG
4560 return 0;
4561}
8eb5694b 4562
74a3920a 4563static void lxc_clear_saved_nics(struct lxc_conf *conf)
7b35f3d6
SH
4564{
4565 int i;
4566
0cf45501 4567 if (!conf->saved_nics)
7b35f3d6
SH
4568 return;
4569 for (i=0; i < conf->num_savednics; i++)
4570 free(conf->saved_nics[i].orig_name);
7b35f3d6
SH
4571 free(conf->saved_nics);
4572}
4573
4184c3e1
SH
4574static inline void lxc_clear_aliens(struct lxc_conf *conf)
4575{
4576 struct lxc_list *it,*next;
4577
4578 lxc_list_for_each_safe(it, &conf->aliens, next) {
4579 lxc_list_del(it);
4580 free(it->elem);
4581 free(it);
4582 }
4583}
4584
f979ac15
SH
4585static inline void lxc_clear_includes(struct lxc_conf *conf)
4586{
4587 struct lxc_list *it,*next;
4588
4589 lxc_list_for_each_safe(it, &conf->includes, next) {
4590 lxc_list_del(it);
4591 free(it->elem);
4592 free(it);
4593 }
4594}
4595
8eb5694b
SH
4596void lxc_conf_free(struct lxc_conf *conf)
4597{
4598 if (!conf)
4599 return;
b91f00d3
SH
4600 if (conf->console.log_path)
4601 free(conf->console.log_path);
8eb5694b
SH
4602 if (conf->console.path)
4603 free(conf->console.path);
54c30e29 4604 if (conf->rootfs.mount)
8eb5694b 4605 free(conf->rootfs.mount);
a17b1e65
SG
4606 if (conf->rootfs.options)
4607 free(conf->rootfs.options);
d95db067
DE
4608 if (conf->rootfs.path)
4609 free(conf->rootfs.path);
a58878d6
SH
4610 if (conf->rootfs.pivot)
4611 free(conf->rootfs.pivot);
4612 if (conf->logfile)
4613 free(conf->logfile);
d95db067
DE
4614 if (conf->utsname)
4615 free(conf->utsname);
4616 if (conf->ttydir)
4617 free(conf->ttydir);
4618 if (conf->fstab)
4619 free(conf->fstab);
fc7e8864
WM
4620 if (conf->rcfile)
4621 free(conf->rcfile);
6b0d5538 4622 free(conf->unexpanded_config);
8eb5694b 4623 lxc_clear_config_network(conf);
fe4de9a6
DE
4624 if (conf->lsm_aa_profile)
4625 free(conf->lsm_aa_profile);
4626 if (conf->lsm_se_context)
4627 free(conf->lsm_se_context);
769872f9 4628 lxc_seccomp_free(conf);
8eb5694b 4629 lxc_clear_config_caps(conf);
1fb86a7c 4630 lxc_clear_config_keepcaps(conf);
8eb5694b 4631 lxc_clear_cgroups(conf, "lxc.cgroup");
17ed13a3 4632 lxc_clear_hooks(conf, "lxc.hook");
8eb5694b 4633 lxc_clear_mount_entries(conf);
7b35f3d6 4634 lxc_clear_saved_nics(conf);
27c27d73 4635 lxc_clear_idmaps(conf);
ee1e7aa0 4636 lxc_clear_groups(conf);
f979ac15 4637 lxc_clear_includes(conf);
761d81ca 4638 lxc_clear_aliens(conf);
ab799c0b 4639 lxc_clear_environment(conf);
8eb5694b
SH
4640 free(conf);
4641}
4355ab5f
SH
4642
/*
 * Argument bundle handed to run_userns_fn() by userns_exec_1().
 */
struct userns_fn_data {
	int (*fn)(void *);	/* callback to run once the child is released */
	void *arg;		/* opaque argument passed to fn */
	int p[2];		/* pipe: child reads p[0], parent writes p[1] */
};
4648
4649static int run_userns_fn(void *data)
4650{
4651 struct userns_fn_data *d = data;
4652 char c;
4653 // we're not sharing with the parent any more, if it was a thread
4654
4655 close(d->p[1]);
4656 if (read(d->p[0], &c, 1) != 1)
4657 return -1;
4658 close(d->p[0]);
4659 return d->fn(d->arg);
4660}
4661
4662/*
8b227008
TS
4663 * Add ID_TYPE_UID/ID_TYPE_GID entries to an existing lxc_conf,
4664 * if they are not already there.
4355ab5f 4665 */
8b227008
TS
4666static struct lxc_list *idmap_add_id(struct lxc_conf *conf,
4667 uid_t uid, gid_t gid)
4355ab5f 4668{
8b227008
TS
4669 int hostuid_mapped = mapped_hostid(uid, conf, ID_TYPE_UID);
4670 int hostgid_mapped = mapped_hostid(gid, conf, ID_TYPE_GID);
4355ab5f
SH
4671 struct lxc_list *new = NULL, *tmp, *it, *next;
4672 struct id_map *entry;
4673
3ec1648d
SH
4674 new = malloc(sizeof(*new));
4675 if (!new) {
4676 ERROR("Out of memory building id map");
4677 return NULL;
4678 }
4679 lxc_list_init(new);
4680
8b227008
TS
4681 if (hostuid_mapped < 0) {
4682 hostuid_mapped = find_unmapped_nsuid(conf, ID_TYPE_UID);
4683 if (hostuid_mapped < 0)
3ec1648d
SH
4684 goto err;
4685 tmp = malloc(sizeof(*tmp));
4686 if (!tmp)
4687 goto err;
4355ab5f
SH
4688 entry = malloc(sizeof(*entry));
4689 if (!entry) {
3ec1648d
SH
4690 free(tmp);
4691 goto err;
4355ab5f 4692 }
3ec1648d 4693 tmp->elem = entry;
4355ab5f 4694 entry->idtype = ID_TYPE_UID;
8b227008
TS
4695 entry->nsid = hostuid_mapped;
4696 entry->hostid = (unsigned long) uid;
4697 entry->range = 1;
4698 lxc_list_add_tail(new, tmp);
4699 }
4700 if (hostgid_mapped < 0) {
4701 hostgid_mapped = find_unmapped_nsuid(conf, ID_TYPE_GID);
4702 if (hostgid_mapped < 0)
4703 goto err;
4704 tmp = malloc(sizeof(*tmp));
4705 if (!tmp)
4706 goto err;
4707 entry = malloc(sizeof(*entry));
4708 if (!entry) {
4709 free(tmp);
4710 goto err;
4711 }
4712 tmp->elem = entry;
4713 entry->idtype = ID_TYPE_GID;
4714 entry->nsid = hostgid_mapped;
4715 entry->hostid = (unsigned long) gid;
4355ab5f 4716 entry->range = 1;
3ec1648d 4717 lxc_list_add_tail(new, tmp);
4355ab5f
SH
4718 }
4719 lxc_list_for_each_safe(it, &conf->id_map, next) {
4720 tmp = malloc(sizeof(*tmp));
4721 if (!tmp)
4722 goto err;
4723 entry = malloc(sizeof(*entry));
4724 if (!entry) {
4725 free(tmp);
4726 goto err;
4727 }
4728 memset(entry, 0, sizeof(*entry));
4729 memcpy(entry, it->elem, sizeof(*entry));
4730 tmp->elem = entry;
3ec1648d 4731 lxc_list_add_tail(new, tmp);
4355ab5f
SH
4732 }
4733
4734 return new;
4735
4736err:
8b227008 4737 ERROR("Out of memory building a new uid/gid map");
908fde6a
SH
4738 if (new)
4739 lxc_free_idmap(new);
c30ac545 4740 free(new);
4355ab5f
SH
4741 return NULL;
4742}
4743
4744/*
4745 * Run a function in a new user namespace.
8b227008 4746 * The caller's euid/egid will be mapped in if it is not already.
4355ab5f
SH
4747 */
4748int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data)
4749{
4750 int ret, pid;
4751 struct userns_fn_data d;
4752 char c = '1';
4753 int p[2];
4754 struct lxc_list *idmap;
4755
4355ab5f 4756 ret = pipe(p);
4355ab5f
SH
4757 if (ret < 0) {
4758 SYSERROR("opening pipe");
4759 return -1;
4760 }
4761 d.fn = fn;
4762 d.arg = data;
4763 d.p[0] = p[0];
4764 d.p[1] = p[1];
4765 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4766 if (pid < 0)
4767 goto err;
4355ab5f 4768 close(p[0]);
4355ab5f
SH
4769 p[0] = -1;
4770
8b227008
TS
4771 if ((idmap = idmap_add_id(conf, geteuid(), getegid())) == NULL) {
4772 ERROR("Error adding self to container uid/gid map");
4355ab5f
SH
4773 goto err;
4774 }
4775
4776 ret = lxc_map_ids(idmap, pid);
4777 lxc_free_idmap(idmap);
88dd66fc 4778 free(idmap);
565e571c 4779 if (ret) {
4355ab5f
SH
4780 ERROR("Error setting up child mappings");
4781 goto err;
4782 }
4783
4784 // kick the child
4785 if (write(p[1], &c, 1) != 1) {
4786 SYSERROR("writing to pipe to child");
4787 goto err;
4788 }
4789
3139aead
SG
4790 ret = wait_for_pid(pid);
4791
4792 close(p[1]);
4793 return ret;
4794
4355ab5f 4795err:
4355ab5f
SH
4796 if (p[0] != -1)
4797 close(p[0]);
4798 close(p[1]);
4355ab5f
SH
4799 return -1;
4800}
97e9cfa0 4801
/*
 * Return a malloc'ed copy of the login name belonging to the effective
 * uid, or NULL if it cannot be looked up (or copied).  The caller frees
 * the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4813
/*
 * Return a malloc'ed copy of the group name belonging to the effective
 * gid, or NULL if it cannot be looked up (or copied).  The caller frees
 * the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4825
a96a8e8c 4826/* not thread-safe, do not use from api without first forking */
97e9cfa0
SH
4827void suggest_default_idmap(void)
4828{
4829 FILE *f;
4830 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4831 char *line = NULL;
4832 char *uname, *gname;
4833 size_t len = 0;
4834
4835 if (!(uname = getuname()))
4836 return;
4837
4838 if (!(gname = getgname())) {
4839 free(uname);
4840 return;
4841 }
4842
4843 f = fopen(subuidfile, "r");
4844 if (!f) {
4845 ERROR("Your system is not configured with subuids");
4846 free(gname);
4847 free(uname);
4848 return;
4849 }
4850 while (getline(&line, &len, f) != -1) {
4851 char *p = strchr(line, ':'), *p2;
4852 if (*line == '#')
4853 continue;
4854 if (!p)
4855 continue;
4856 *p = '\0';
4857 p++;
4858 if (strcmp(line, uname))
4859 continue;
4860 p2 = strchr(p, ':');
4861 if (!p2)
4862 continue;
4863 *p2 = '\0';
4864 p2++;
4865 if (!*p2)
4866 continue;
4867 uid = atoi(p);
4868 urange = atoi(p2);
4869 }
4870 fclose(f);
4871
4872 f = fopen(subuidfile, "r");
4873 if (!f) {
4874 ERROR("Your system is not configured with subgids");
4875 free(gname);
4876 free(uname);
4877 return;
4878 }
4879 while (getline(&line, &len, f) != -1) {
4880 char *p = strchr(line, ':'), *p2;
4881 if (*line == '#')
4882 continue;
4883 if (!p)
4884 continue;
4885 *p = '\0';
4886 p++;
4887 if (strcmp(line, uname))
4888 continue;
4889 p2 = strchr(p, ':');
4890 if (!p2)
4891 continue;
4892 *p2 = '\0';
4893 p2++;
4894 if (!*p2)
4895 continue;
4896 gid = atoi(p);
4897 grange = atoi(p2);
4898 }
4899 fclose(f);
4900
4901 if (line)
4902 free(line);
4903
4904 if (!urange || !grange) {
4905 ERROR("You do not have subuids or subgids allocated");
4906 ERROR("Unprivileged containers require subuids and subgids");
4907 return;
4908 }
4909
4910 ERROR("You must either run as root, or define uid mappings");
4911 ERROR("To pass uid mappings to lxc-create, you could create");
4912 ERROR("~/.config/lxc/default.conf:");
4913 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4914 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4915 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4916
4917 free(gname);
4918 free(uname);
4919}