]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
conf: remove stack allocations
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #ifndef _GNU_SOURCE
25 #define _GNU_SOURCE 1
26 #endif
27 #include <arpa/inet.h>
28 #include <dirent.h>
29 #include <errno.h>
30 #include <fcntl.h>
31 #include <grp.h>
32 #include <inttypes.h>
33 #include <libgen.h>
34 #include <linux/loop.h>
35 #include <net/if.h>
36 #include <netinet/in.h>
37 #include <pwd.h>
38 #include <stdarg.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <sys/mman.h>
43 #include <sys/mount.h>
44 #include <sys/param.h>
45 #include <sys/prctl.h>
46 #include <sys/sendfile.h>
47 #include <sys/socket.h>
48 #include <sys/stat.h>
49 #include <sys/syscall.h>
50 #include <sys/sysmacros.h>
51 #include <sys/types.h>
52 #include <sys/utsname.h>
53 #include <sys/wait.h>
54 #include <time.h>
55 #include <unistd.h>
56
57 #include "af_unix.h"
58 #include "caps.h"
59 #include "cgroup.h"
60 #include "conf.h"
61 #include "config.h"
62 #include "confile.h"
63 #include "confile_utils.h"
64 #include "error.h"
65 #include "log.h"
66 #include "lsm/lsm.h"
67 #include "lxclock.h"
68 #include "lxcseccomp.h"
69 #include "macro.h"
70 #include "memory_utils.h"
71 #include "namespace.h"
72 #include "network.h"
73 #include "parse.h"
74 #include "raw_syscalls.h"
75 #include "ringbuf.h"
76 #include "start.h"
77 #include "storage.h"
78 #include "storage/overlay.h"
79 #include "syscall_wrappers.h"
80 #include "terminal.h"
81 #include "utils.h"
82
83 #ifdef MAJOR_IN_MKDEV
84 #include <sys/mkdev.h>
85 #endif
86
87 #ifdef HAVE_STATVFS
88 #include <sys/statvfs.h>
89 #endif
90
91 #if HAVE_PTY_H
92 #include <pty.h>
93 #else
94 #include <../include/openpty.h>
95 #endif
96
97 #if HAVE_LIBCAP
98 #include <sys/capability.h>
99 #endif
100
101 #if HAVE_SYS_PERSONALITY_H
102 #include <sys/personality.h>
103 #endif
104
105 #ifndef HAVE_STRLCAT
106 #include "include/strlcat.h"
107 #endif
108
109 #if IS_BIONIC
110 #include <../include/lxcmntent.h>
111 #else
112 #include <mntent.h>
113 #endif
114
115 #if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
116 #include <../include/prlimit.h>
117 #endif
118
lxc_log_define(conf, lxc);

/* Configuration of the container the current API call is operating on.
 * The error-reporting calls consult it. When the build provides TLS
 * (HAVE_TLS) each thread gets its own pointer; otherwise there is a single
 * process-wide one.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
thread_local struct lxc_conf *current_config;
#endif
129
130 char *lxchook_names[NUM_LXC_HOOKS] = {
131 "pre-start",
132 "pre-mount",
133 "mount",
134 "autodev",
135 "start",
136 "stop",
137 "post-stop",
138 "clone",
139 "destroy",
140 "start-host"
141 };
142
/* One row of a mount-option keyword table. */
struct mount_opt {
	/* Option keyword as written in a mount options string. */
	char *name;
	/* NOTE(review): presumably non-zero when the keyword clears @flag
	 * instead of setting it (e.g. "rw" is paired with MS_RDONLY) - confirm
	 * against the option parser.
	 */
	int clear;
	/* MS_* bit(s) associated with the keyword. */
	int flag;
};
148
/* Maps a capability keyword to its numeric CAP_* value. */
struct caps_opt {
	char *name;	/* keyword, e.g. "sys_admin" */
	int value;	/* matching CAP_* constant */
};
153
/* Maps a resource-limit keyword to its RLIMIT_* resource identifier. */
struct limit_opt {
	char *name;	/* keyword, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
158
159 static struct mount_opt mount_opt[] = {
160 { "async", 1, MS_SYNCHRONOUS },
161 { "atime", 1, MS_NOATIME },
162 { "bind", 0, MS_BIND },
163 { "defaults", 0, 0 },
164 { "dev", 1, MS_NODEV },
165 { "diratime", 1, MS_NODIRATIME },
166 { "dirsync", 0, MS_DIRSYNC },
167 { "exec", 1, MS_NOEXEC },
168 { "lazytime", 0, MS_LAZYTIME },
169 { "mand", 0, MS_MANDLOCK },
170 { "noatime", 0, MS_NOATIME },
171 { "nodev", 0, MS_NODEV },
172 { "nodiratime", 0, MS_NODIRATIME },
173 { "noexec", 0, MS_NOEXEC },
174 { "nomand", 1, MS_MANDLOCK },
175 { "norelatime", 1, MS_RELATIME },
176 { "nostrictatime", 1, MS_STRICTATIME },
177 { "nosuid", 0, MS_NOSUID },
178 { "rbind", 0, MS_BIND|MS_REC },
179 { "relatime", 0, MS_RELATIME },
180 { "remount", 0, MS_REMOUNT },
181 { "ro", 0, MS_RDONLY },
182 { "rw", 1, MS_RDONLY },
183 { "strictatime", 0, MS_STRICTATIME },
184 { "suid", 1, MS_NOSUID },
185 { "sync", 0, MS_SYNCHRONOUS },
186 { NULL, 0, 0 },
187 };
188
189 static struct mount_opt propagation_opt[] = {
190 { "private", 0, MS_PRIVATE },
191 { "shared", 0, MS_SHARED },
192 { "slave", 0, MS_SLAVE },
193 { "unbindable", 0, MS_UNBINDABLE },
194 { "rprivate", 0, MS_PRIVATE|MS_REC },
195 { "rshared", 0, MS_SHARED|MS_REC },
196 { "rslave", 0, MS_SLAVE|MS_REC },
197 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
198 { NULL, 0, 0 },
199 };
200
201 static struct caps_opt caps_opt[] = {
202 #if HAVE_LIBCAP
203 { "chown", CAP_CHOWN },
204 { "dac_override", CAP_DAC_OVERRIDE },
205 { "dac_read_search", CAP_DAC_READ_SEARCH },
206 { "fowner", CAP_FOWNER },
207 { "fsetid", CAP_FSETID },
208 { "kill", CAP_KILL },
209 { "setgid", CAP_SETGID },
210 { "setuid", CAP_SETUID },
211 { "setpcap", CAP_SETPCAP },
212 { "linux_immutable", CAP_LINUX_IMMUTABLE },
213 { "net_bind_service", CAP_NET_BIND_SERVICE },
214 { "net_broadcast", CAP_NET_BROADCAST },
215 { "net_admin", CAP_NET_ADMIN },
216 { "net_raw", CAP_NET_RAW },
217 { "ipc_lock", CAP_IPC_LOCK },
218 { "ipc_owner", CAP_IPC_OWNER },
219 { "sys_module", CAP_SYS_MODULE },
220 { "sys_rawio", CAP_SYS_RAWIO },
221 { "sys_chroot", CAP_SYS_CHROOT },
222 { "sys_ptrace", CAP_SYS_PTRACE },
223 { "sys_pacct", CAP_SYS_PACCT },
224 { "sys_admin", CAP_SYS_ADMIN },
225 { "sys_boot", CAP_SYS_BOOT },
226 { "sys_nice", CAP_SYS_NICE },
227 { "sys_resource", CAP_SYS_RESOURCE },
228 { "sys_time", CAP_SYS_TIME },
229 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
230 { "mknod", CAP_MKNOD },
231 { "lease", CAP_LEASE },
232 #ifdef CAP_AUDIT_READ
233 { "audit_read", CAP_AUDIT_READ },
234 #endif
235 #ifdef CAP_AUDIT_WRITE
236 { "audit_write", CAP_AUDIT_WRITE },
237 #endif
238 #ifdef CAP_AUDIT_CONTROL
239 { "audit_control", CAP_AUDIT_CONTROL },
240 #endif
241 { "setfcap", CAP_SETFCAP },
242 { "mac_override", CAP_MAC_OVERRIDE },
243 { "mac_admin", CAP_MAC_ADMIN },
244 #ifdef CAP_SYSLOG
245 { "syslog", CAP_SYSLOG },
246 #endif
247 #ifdef CAP_WAKE_ALARM
248 { "wake_alarm", CAP_WAKE_ALARM },
249 #endif
250 #ifdef CAP_BLOCK_SUSPEND
251 { "block_suspend", CAP_BLOCK_SUSPEND },
252 #endif
253 #endif
254 };
255
256 static struct limit_opt limit_opt[] = {
257 #ifdef RLIMIT_AS
258 { "as", RLIMIT_AS },
259 #endif
260 #ifdef RLIMIT_CORE
261 { "core", RLIMIT_CORE },
262 #endif
263 #ifdef RLIMIT_CPU
264 { "cpu", RLIMIT_CPU },
265 #endif
266 #ifdef RLIMIT_DATA
267 { "data", RLIMIT_DATA },
268 #endif
269 #ifdef RLIMIT_FSIZE
270 { "fsize", RLIMIT_FSIZE },
271 #endif
272 #ifdef RLIMIT_LOCKS
273 { "locks", RLIMIT_LOCKS },
274 #endif
275 #ifdef RLIMIT_MEMLOCK
276 { "memlock", RLIMIT_MEMLOCK },
277 #endif
278 #ifdef RLIMIT_MSGQUEUE
279 { "msgqueue", RLIMIT_MSGQUEUE },
280 #endif
281 #ifdef RLIMIT_NICE
282 { "nice", RLIMIT_NICE },
283 #endif
284 #ifdef RLIMIT_NOFILE
285 { "nofile", RLIMIT_NOFILE },
286 #endif
287 #ifdef RLIMIT_NPROC
288 { "nproc", RLIMIT_NPROC },
289 #endif
290 #ifdef RLIMIT_RSS
291 { "rss", RLIMIT_RSS },
292 #endif
293 #ifdef RLIMIT_RTPRIO
294 { "rtprio", RLIMIT_RTPRIO },
295 #endif
296 #ifdef RLIMIT_RTTIME
297 { "rttime", RLIMIT_RTTIME },
298 #endif
299 #ifdef RLIMIT_SIGPENDING
300 { "sigpending", RLIMIT_SIGPENDING },
301 #endif
302 #ifdef RLIMIT_STACK
303 { "stack", RLIMIT_STACK },
304 #endif
305 };
306
307 static int run_buffer(char *buffer)
308 {
309 int ret;
310 char *output;
311 struct lxc_popen_FILE *f;
312
313 f = lxc_popen(buffer);
314 if (!f) {
315 SYSERROR("Failed to popen() %s", buffer);
316 return -1;
317 }
318
319 output = malloc(LXC_LOG_BUFFER_SIZE);
320 if (!output) {
321 ERROR("Failed to allocate memory for %s", buffer);
322 lxc_pclose(f);
323 return -1;
324 }
325
326 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
327 DEBUG("Script %s with output: %s", buffer, output);
328
329 free(output);
330
331 ret = lxc_pclose(f);
332 if (ret == -1) {
333 SYSERROR("Script exited with error");
334 return -1;
335 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
336 ERROR("Script exited with status %d", WEXITSTATUS(ret));
337 return -1;
338 } else if (WIFSIGNALED(ret)) {
339 ERROR("Script terminated by signal %d", WTERMSIG(ret));
340 return -1;
341 }
342
343 return 0;
344 }
345
/* Build the command line for a hook script and run it via run_buffer().
 *
 * @name         container name
 * @hook_version 0: name, section and hook name are passed as positional
 *               arguments; 1: hook name and section are exported through
 *               LXC_HOOK_TYPE / LXC_HOOK_SECTION (and, for the "net"
 *               section, LXC_NET_TYPE / LXC_NET_PARENT / LXC_NET_PEER)
 * @section      configuration section the hook belongs to
 * @script       script to execute
 * @hookname     name of the hook
 * @argv         optional NULL-terminated list of extra arguments appended
 *               to the command line
 *
 * Note that for hook version 1 the environment of the calling process is
 * modified with setenv().
 *
 * Returns -EFBIG if the command line would exceed INT_MAX bytes, -ENOMEM if
 * the buffer cannot be allocated, -1 on any other setup failure and the
 * result of run_buffer() otherwise.
 */
int run_script_argv(const char *name, unsigned int hook_version,
		    const char *section, const char *script,
		    const char *hookname, char **argv)
{
	int buf_pos, i, ret;
	char *buffer;
	int fret = -1;
	size_t size = 0;

	if (hook_version == 0)
		INFO("Executing script \"%s\" for container \"%s\", config "
		     "section \"%s\"", script, name, section);
	else
		INFO("Executing script \"%s\" for container \"%s\"", script, name);

	/* Each extra argument is preceded by one blank. */
	for (i = 0; argv && argv[i]; i++)
		size += strlen(argv[i]) + 1;

	/* "exec" + ' ' + script + '\0' */
	size += STRLITERALLEN("exec");
	size++;
	size += strlen(script);
	size++;

	if (size > INT_MAX)
		return -EFBIG;

	if (hook_version == 0) {
		/* ' ' + hookname, ' ' + name, ' ' + section */
		size += strlen(hookname);
		size++;

		size += strlen(name);
		size++;

		size += strlen(section);
		size++;

		if (size > INT_MAX)
			return -EFBIG;
	}

	buffer = malloc(size);
	if (!buffer)
		return -ENOMEM;

	if (hook_version == 0)
		buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
	else
		buf_pos = snprintf(buffer, size, "exec %s", script);
	if (buf_pos < 0 || (size_t)buf_pos >= size) {
		ERROR("Failed to create command line for script \"%s\"", script);
		goto on_error;
	}

	if (hook_version == 1) {
		ret = setenv("LXC_HOOK_TYPE", hookname, 1);
		if (ret < 0) {
			SYSERROR("Failed to set environment variable: "
				 "LXC_HOOK_TYPE=%s", hookname);
			goto on_error;
		}
		TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);

		ret = setenv("LXC_HOOK_SECTION", section, 1);
		if (ret < 0) {
			SYSERROR("Failed to set environment variable: "
				 "LXC_HOOK_SECTION=%s", section);
			goto on_error;
		}
		TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);

		if (strcmp(section, "net") == 0) {
			char *parent;
			bool is_veth;

			if (!argv || !argv[0])
				goto on_error;

			ret = setenv("LXC_NET_TYPE", argv[0], 1);
			if (ret < 0) {
				SYSERROR("Failed to set environment variable: "
					 "LXC_NET_TYPE=%s", argv[0]);
				goto on_error;
			}
			TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);

			parent = argv[1] ? argv[1] : "";
			is_veth = strcmp(argv[0], "veth") == 0;

			if (is_veth) {
				/* argv is NULL-terminated, so argv[2] may only
				 * be inspected when argv[1] is not already the
				 * terminator.
				 */
				char *peer = (argv[1] && argv[2]) ? argv[2] : "";

				ret = setenv("LXC_NET_PEER", peer, 1);
				if (ret < 0) {
					SYSERROR("Failed to set environment "
						 "variable: LXC_NET_PEER=%s", peer);
					goto on_error;
				}
				TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
			}

			if (is_veth || strcmp(argv[0], "macvlan") == 0 ||
			    strcmp(argv[0], "phys") == 0) {
				ret = setenv("LXC_NET_PARENT", parent, 1);
				if (ret < 0) {
					SYSERROR("Failed to set environment "
						 "variable: LXC_NET_PARENT=%s", parent);
					goto on_error;
				}
				TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
			}
		}
	}

	for (i = 0; argv && argv[i]; i++) {
		size_t len = size - buf_pos;

		ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto on_error;
		}
		buf_pos += ret;
	}

	fret = run_buffer(buffer);

on_error:
	free(buffer);
	return fret;
}
487
488 int run_script(const char *name, const char *section, const char *script, ...)
489 {
490 __do_free char *buffer = NULL;
491 int ret;
492 char *p;
493 va_list ap;
494 size_t size = 0;
495
496 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
497 script, name, section);
498
499 va_start(ap, script);
500 while ((p = va_arg(ap, char *)))
501 size += strlen(p) + 1;
502 va_end(ap);
503
504 size += STRLITERALLEN("exec");
505 size += strlen(script);
506 size += strlen(name);
507 size += strlen(section);
508 size += 4;
509
510 if (size > INT_MAX)
511 return -1;
512
513 buffer = must_realloc(NULL, size);
514 ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
515 if (ret < 0 || ret >= size)
516 return -1;
517
518 va_start(ap, script);
519 while ((p = va_arg(ap, char *))) {
520 int len = size - ret;
521 int rc;
522 rc = snprintf(buffer + ret, len, " %s", p);
523 if (rc < 0 || rc >= len) {
524 va_end(ap);
525 return -1;
526 }
527 ret += rc;
528 }
529 va_end(ap);
530
531 return run_buffer(buffer);
532 }
533
534 /* pin_rootfs
535 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
536 * the duration of the container run, to prevent the container from marking
537 * the underlying fs readonly on shutdown. unlink the file immediately so
538 * no name pollution is happens.
539 * don't unlink on NFS to avoid random named stale handles.
540 * return -1 on error.
541 * return -2 if nothing needed to be pinned.
542 * return an open fd (>=0) if we pinned it.
543 */
544 int pin_rootfs(const char *rootfs)
545 {
546 int fd, ret;
547 char absrootfspin[PATH_MAX];
548 char *absrootfs;
549 struct stat s;
550 struct statfs sfs;
551
552 if (rootfs == NULL || strlen(rootfs) == 0)
553 return -2;
554
555 absrootfs = realpath(rootfs, NULL);
556 if (!absrootfs)
557 return -2;
558
559 ret = stat(absrootfs, &s);
560 if (ret < 0) {
561 free(absrootfs);
562 return -1;
563 }
564
565 if (!S_ISDIR(s.st_mode)) {
566 free(absrootfs);
567 return -2;
568 }
569
570 ret = snprintf(absrootfspin, PATH_MAX, "%s/.lxc-keep", absrootfs);
571 free(absrootfs);
572 if (ret < 0 || ret >= PATH_MAX)
573 return -1;
574
575 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
576 if (fd < 0)
577 return fd;
578
579 ret = fstatfs (fd, &sfs);
580 if (ret < 0)
581 return fd;
582
583 if (sfs.f_type == NFS_SUPER_MAGIC) {
584 DEBUG("Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
585 return fd;
586 }
587
588 (void)unlink(absrootfspin);
589
590 return fd;
591 }
592
/* Extend @flags with the mount flags already in effect on the filesystem
 * that holds @s (or @d when @s is NULL), so that a remount honours existing
 * restrictions such as NOEXEC.
 *
 * NOSUID, NODEV, RDONLY and NOEXEC are only carried over when @flags
 * contains MS_REMOUNT; the atime related flags are always carried over.
 * @flags is returned unchanged when both paths are NULL, when statvfs()
 * fails or when the build lacks statvfs() support.
 *
 * NOTE(review): statvfs(3) documents f_flag in terms of ST_* constants while
 * this code tests MS_* bits - confirm that the two agree for every flag
 * listed below.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	/* Inherited only when a remount is requested. */
	static const unsigned long remount_only[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	/* Inherited unconditionally. */
	static const unsigned long always[] = {
		MS_NOATIME, MS_NODIRATIME, MS_LAZYTIME, MS_RELATIME,
		MS_STRICTATIME,
	};
	const char *path = s ? s : d;
	unsigned long inherited = 0;
	struct statvfs vfs;
	size_t idx;

	if (!path)
		return flags;

	if (statvfs(path, &vfs) < 0)
		return flags;

	if (flags & MS_REMOUNT) {
		for (idx = 0; idx < sizeof(remount_only) / sizeof(remount_only[0]); idx++) {
			if (vfs.f_flag & remount_only[idx])
				inherited |= remount_only[idx];
		}
	}

	for (idx = 0; idx < sizeof(always) / sizeof(always[0]); idx++) {
		if (vfs.f_flag & always[idx])
			inherited |= always[idx];
	}

	return flags | inherited;
#else
	return flags;
#endif
}
641
642 static int add_shmount_to_list(struct lxc_conf *conf)
643 {
644 char new_mount[PATH_MAX];
645 /* Offset for the leading '/' since the path_cont
646 * is absolute inside the container.
647 */
648 int offset = 1, ret = -1;
649
650 ret = snprintf(new_mount, sizeof(new_mount),
651 "%s %s none bind,create=dir 0 0", conf->shmount.path_host,
652 conf->shmount.path_cont + offset);
653 if (ret < 0 || (size_t)ret >= sizeof(new_mount))
654 return -1;
655
656 return add_elem_to_mount_list(new_mount, conf);
657 }
658
659 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
660 {
661 int i, r;
662 static struct {
663 int match_mask;
664 int match_flag;
665 const char *source;
666 const char *destination;
667 const char *fstype;
668 unsigned long flags;
669 const char *options;
670 } default_mounts[] = {
671 /* Read-only bind-mounting... In older kernels, doing that
672 * required to do one MS_BIND mount and then
673 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
674 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
675 * onwards. However, this apparently does not work on kernel
676 * 3.8. Unfortunately, on that very same kernel, doing the same
677 * trick as above doesn't seem to work either, there one needs
678 * to ALSO specify MS_BIND for the remount, otherwise the
679 * entire fs is remounted read-only or the mount fails because
680 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
681 * kernels as low as 2.6.32...
682 */
683 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
684 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
685 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
686 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
687 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
688 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
689 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
690 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
692 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
693 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
694 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
695 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
696 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
697 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
700 { 0, 0, NULL, NULL, NULL, 0, NULL }
701 };
702
703 for (i = 0; default_mounts[i].match_mask; i++) {
704 int saved_errno;
705 unsigned long mflags;
706 char *destination = NULL;
707 char *source = NULL;
708 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
709 continue;
710
711 if (default_mounts[i].source) {
712 /* will act like strdup if %r is not present */
713 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
714 if (!source)
715 return -1;
716 }
717
718 if (!default_mounts[i].destination) {
719 ERROR("BUG: auto mounts destination %d was NULL", i);
720 free(source);
721 return -1;
722 }
723
724 /* will act like strdup if %r is not present */
725 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
726 if (!destination) {
727 saved_errno = errno;
728 free(source);
729 errno = saved_errno;
730 return -1;
731 }
732
733 mflags = add_required_remount_flags(source, destination,
734 default_mounts[i].flags);
735 r = safe_mount(source, destination, default_mounts[i].fstype,
736 mflags, default_mounts[i].options,
737 conf->rootfs.path ? conf->rootfs.mount : NULL);
738 saved_errno = errno;
739 if (r < 0 && errno == ENOENT) {
740 INFO("Mount source or target for \"%s\" on \"%s\" does "
741 "not exist. Skipping", source, destination);
742 r = 0;
743 } else if (r < 0) {
744 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
745 }
746
747 free(source);
748 free(destination);
749 if (r < 0) {
750 errno = saved_errno;
751 return -1;
752 }
753 }
754
755 if (flags & LXC_AUTO_CGROUP_MASK) {
756 int cg_flags;
757
758 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
759 /* If the type of cgroup mount was not specified, it depends on
760 * the container's capabilities as to what makes sense: if we
761 * have CAP_SYS_ADMIN, the read-only part can be remounted
762 * read-write anyway, so we may as well default to read-write;
763 * then the admin will not be given a false sense of security.
764 * (And if they really want mixed r/o r/w, then they can
765 * explicitly specify :mixed.) OTOH, if the container lacks
766 * CAP_SYS_ADMIN, do only default to :mixed, because then the
767 * container can't remount it read-write.
768 */
769 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
770 int has_sys_admin = 0;
771
772 if (!lxc_list_empty(&conf->keepcaps))
773 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
774 else
775 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
776
777 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
778 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
779 else
780 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
781 }
782
783 if (flags & LXC_AUTO_CGROUP_FORCE)
784 cg_flags |= LXC_AUTO_CGROUP_FORCE;
785
786 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
787 handler,
788 conf->rootfs.path ? conf->rootfs.mount : "",
789 cg_flags)) {
790 SYSERROR("Failed to mount \"/sys/fs/cgroup\"");
791 return -1;
792 }
793 }
794
795 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
796 int ret = add_shmount_to_list(conf);
797 if (ret < 0) {
798 ERROR("Failed to add shmount entry to container config");
799 return -1;
800 }
801 }
802
803 return 0;
804 }
805
/* Apply the node name stored in @utsname as the hostname.
 *
 * A NULL @utsname means there is nothing to configure and counts as success.
 * Returns 0 on success and -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0) {
		SYSERROR("Failed to set the hostname to \"%s\"", hostname);
		return -1;
	}

	INFO("Set hostname to \"%s\"", hostname);

	return 0;
}
823
/* A symlink to create below the container's /dev directory. */
struct dev_symlinks {
	const char *oldpath;	/* target the symlink points to */
	const char *name;	/* name of the symlink inside /dev */
};

/* Standard file-descriptor links: /dev/fd, /dev/stdin, /dev/stdout and
 * /dev/stderr, all pointing into /proc/self/fd.
 */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
835
836 static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
837 {
838 int i, ret;
839 char path[PATH_MAX];
840 struct stat s;
841
842 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
843 const struct dev_symlinks *d = &dev_symlinks[i];
844
845 ret = snprintf(path, sizeof(path), "%s/dev/%s",
846 rootfs->path ? rootfs->mount : "", d->name);
847 if (ret < 0 || ret >= PATH_MAX)
848 return -1;
849
850 /* Stat the path first. If we don't get an error accept it as
851 * is and don't try to create it
852 */
853 ret = stat(path, &s);
854 if (ret == 0)
855 continue;
856
857 ret = symlink(d->oldpath, path);
858 if (ret && errno != EEXIST) {
859 if (errno == EROFS) {
860 WARN("Failed to create \"%s\". Read-only filesystem", path);
861 } else {
862 SYSERROR("Failed to create \"%s\"", path);
863 return -1;
864 }
865 }
866 }
867
868 return 0;
869 }
870
/* Build a space-separated list of ptys to pass to systemd.
 *
 * @pp   address of the heap-allocated list; when *pp is NULL a new list
 *       "container_ttys=<name>" is allocated, otherwise " <name>" is appended
 *       to the existing one
 * @name tty name to add
 *
 * Returns true on success. On allocation failure false is returned and *pp
 * is left as it was.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t name_len = strlen(name);
	size_t cur_len;
	char *p;

	if (!*pp) {
		p = malloc(prefix_len + name_len + 1);
		if (!p)
			return false;

		memcpy(p, prefix, prefix_len);
		/* name_len + 1 copies the terminating NUL byte as well. */
		memcpy(p + prefix_len, name, name_len + 1);
		*pp = p;
		return true;
	}

	cur_len = strlen(*pp);
	/* old list + ' ' + name + '\0' */
	p = realloc(*pp, cur_len + name_len + 2);
	if (!p)
		return false;

	p[cur_len] = ' ';
	memcpy(p + cur_len + 1, name, name_len + 1);
	*pp = p;

	return true;
}
897
898 static int lxc_setup_ttys(struct lxc_conf *conf)
899 {
900 int i, ret;
901 const struct lxc_tty_info *ttys = &conf->ttys;
902 char *ttydir = ttys->dir;
903 char path[PATH_MAX], lxcpath[PATH_MAX];
904
905 if (!conf->rootfs.path)
906 return 0;
907
908 for (i = 0; i < ttys->max; i++) {
909 struct lxc_terminal_info *tty = &ttys->tty[i];
910
911 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
912 if (ret < 0 || (size_t)ret >= sizeof(path))
913 return -1;
914
915 if (ttydir) {
916 /* create dev/lxc/tty%d" */
917 ret = snprintf(lxcpath, sizeof(lxcpath),
918 "/dev/%s/tty%d", ttydir, i + 1);
919 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
920 return -1;
921
922 ret = mknod(lxcpath, S_IFREG | 0000, 0);
923 if (ret < 0 && errno != EEXIST) {
924 SYSERROR("Failed to create \"%s\"", lxcpath);
925 return -1;
926 }
927
928 ret = unlink(path);
929 if (ret < 0 && errno != ENOENT) {
930 SYSERROR("Failed to unlink \"%s\"", path);
931 return -1;
932 }
933
934 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
935 if (ret < 0) {
936 SYSWARN("Failed to bind mount \"%s\" onto \"%s\"",
937 tty->name, lxcpath);
938 continue;
939 }
940 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name,
941 lxcpath);
942
943 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
944 ttydir, i + 1);
945 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
946 return -1;
947
948 ret = symlink(lxcpath, path);
949 if (ret < 0) {
950 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
951 path, lxcpath);
952 return -1;
953 }
954 } else {
955 /* If we populated /dev, then we need to create
956 * /dev/ttyN
957 */
958 ret = mknod(path, S_IFREG | 0000, 0);
959 if (ret < 0) /* this isn't fatal, continue */
960 SYSERROR("Failed to create \"%s\"", path);
961
962 ret = mount(tty->name, path, "none", MS_BIND, 0);
963 if (ret < 0) {
964 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
965 continue;
966 }
967
968 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
969 }
970
971 if (!append_ttyname(&conf->ttys.tty_names, tty->name)) {
972 ERROR("Error setting up container_ttys string");
973 return -1;
974 }
975 }
976
977 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
978 return 0;
979 }
980
981 int lxc_allocate_ttys(struct lxc_conf *conf)
982 {
983 size_t i;
984 int ret;
985 struct lxc_tty_info *ttys = &conf->ttys;
986
987 /* no tty in the configuration */
988 if (ttys->max == 0)
989 return 0;
990
991 ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
992 if (!ttys->tty)
993 return -ENOMEM;
994
995 for (i = 0; i < ttys->max; i++) {
996 struct lxc_terminal_info *tty = &ttys->tty[i];
997
998 tty->master = -EBADF;
999 tty->slave = -EBADF;
1000 ret = openpty(&tty->master, &tty->slave, NULL, NULL, NULL);
1001 if (ret < 0) {
1002 SYSERROR("Failed to create tty %zu", i);
1003 ttys->max = i;
1004 lxc_delete_tty(ttys);
1005 return -ENOTTY;
1006 }
1007
1008 ret = ttyname_r(tty->slave, tty->name, sizeof(tty->name));
1009 if (ret < 0) {
1010 SYSERROR("Failed to retrieve name of tty %zu slave", i);
1011 ttys->max = i;
1012 lxc_delete_tty(ttys);
1013 return -ENOTTY;
1014 }
1015
1016 DEBUG("Created tty \"%s\" with master fd %d and slave fd %d",
1017 tty->name, tty->master, tty->slave);
1018
1019 /* Prevent leaking the file descriptors to the container */
1020 ret = fd_cloexec(tty->master, true);
1021 if (ret < 0)
1022 SYSWARN("Failed to set FD_CLOEXEC flag on master fd %d of "
1023 "tty device \"%s\"", tty->master, tty->name);
1024
1025 ret = fd_cloexec(tty->slave, true);
1026 if (ret < 0)
1027 SYSWARN("Failed to set FD_CLOEXEC flag on slave fd %d of "
1028 "tty device \"%s\"", tty->slave, tty->name);
1029
1030 tty->busy = 0;
1031 }
1032
1033 INFO("Finished creating %zu tty devices", ttys->max);
1034 return 0;
1035 }
1036
1037 void lxc_delete_tty(struct lxc_tty_info *ttys)
1038 {
1039 int i;
1040
1041 if (!ttys->tty)
1042 return;
1043
1044 for (i = 0; i < ttys->max; i++) {
1045 struct lxc_terminal_info *tty = &ttys->tty[i];
1046
1047 if (tty->master >= 0) {
1048 close(tty->master);
1049 tty->master = -EBADF;
1050 }
1051
1052 if (tty->slave >= 0) {
1053 close(tty->slave);
1054 tty->slave = -EBADF;
1055 }
1056 }
1057
1058 free(ttys->tty);
1059 ttys->tty = NULL;
1060 }
1061
1062 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1063 {
1064 int i;
1065 int ret = -1;
1066 struct lxc_conf *conf = handler->conf;
1067 struct lxc_tty_info *ttys = &conf->ttys;
1068 int sock = handler->data_sock[0];
1069
1070 if (ttys->max == 0)
1071 return 0;
1072
1073 for (i = 0; i < ttys->max; i++) {
1074 int ttyfds[2];
1075 struct lxc_terminal_info *tty = &ttys->tty[i];
1076
1077 ttyfds[0] = tty->master;
1078 ttyfds[1] = tty->slave;
1079
1080 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1081 if (ret < 0)
1082 break;
1083
1084 TRACE("Sent tty \"%s\" with master fd %d and slave fd %d to "
1085 "parent", tty->name, tty->master, tty->slave);
1086 }
1087
1088 if (ret < 0)
1089 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
1090 else
1091 TRACE("Sent %zu ttys to parent", ttys->max);
1092
1093 return ret;
1094 }
1095
1096 static int lxc_create_ttys(struct lxc_handler *handler)
1097 {
1098 int ret = -1;
1099 struct lxc_conf *conf = handler->conf;
1100
1101 ret = lxc_allocate_ttys(conf);
1102 if (ret < 0) {
1103 ERROR("Failed to allocate ttys");
1104 goto on_error;
1105 }
1106
1107 ret = lxc_send_ttys_to_parent(handler);
1108 if (ret < 0) {
1109 ERROR("Failed to send ttys to parent");
1110 goto on_error;
1111 }
1112
1113 if (!conf->is_execute) {
1114 ret = lxc_setup_ttys(conf);
1115 if (ret < 0) {
1116 ERROR("Failed to setup ttys");
1117 goto on_error;
1118 }
1119 }
1120
1121 if (conf->ttys.tty_names) {
1122 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
1123 if (ret < 0)
1124 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
1125 }
1126
1127 ret = 0;
1128
1129 on_error:
1130 lxc_delete_tty(&conf->ttys);
1131
1132 return ret;
1133 }
1134
1135 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1136 * error, log it but don't fail yet.
1137 */
1138 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1139 const char *lxcpath)
1140 {
1141 __do_free char *path = NULL;
1142 int ret;
1143 size_t clen;
1144 mode_t cur_mask;
1145
1146 INFO("Preparing \"/dev\"");
1147
1148 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1149 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1150 path = must_realloc(NULL, clen);
1151
1152 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1153 if (ret < 0 || (size_t)ret >= clen)
1154 return -1;
1155
1156 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1157 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1158 if (ret < 0 && errno != EEXIST) {
1159 SYSERROR("Failed to create \"/dev\" directory");
1160 ret = -errno;
1161 goto reset_umask;
1162 }
1163
1164 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1165 rootfs->path ? rootfs->mount : NULL);
1166 if (ret < 0) {
1167 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1168 goto reset_umask;
1169 }
1170 TRACE("Mounted tmpfs on \"%s\"", path);
1171
1172 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1173 if (ret < 0 || (size_t)ret >= clen) {
1174 ret = -1;
1175 goto reset_umask;
1176 }
1177
1178 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1179 * If not, then create it and exit if that fails...
1180 */
1181 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1182 if (ret < 0 && errno != EEXIST) {
1183 SYSERROR("Failed to create directory \"%s\"", path);
1184 ret = -errno;
1185 goto reset_umask;
1186 }
1187
1188 ret = 0;
1189
1190 reset_umask:
1191 (void)umask(cur_mask);
1192
1193 INFO("Prepared \"/dev\"");
1194 return ret;
1195 }
1196
/* One device node that gets created below the container's "/dev". */
struct lxc_device_node {
	/* file name of the node, relative to "/dev" */
	const char *name;
	/* file type and permission bits handed to mknod() */
	const mode_t mode;
	/* major device number */
	const int maj;
	/* minor device number */
	const int min;
};
1203
1204 static const struct lxc_device_node lxc_devices[] = {
1205 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1206 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1207 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1208 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
1209 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1210 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1211 };
1212
1213
/* State used by lxc_fill_autodev() to decide how device nodes are provided.
 * The numeric order matters: the code tests for ">= LXC_DEVNODE_MKNOD".
 */
enum {
	/* mknod() was refused with EPERM: bind-mount the host's nodes */
	LXC_DEVNODE_BIND = 0,
	/* initial state: try to create nodes with mknod() */
	LXC_DEVNODE_MKNOD = 1,
	/* mknod() works but the created node could not be opened */
	LXC_DEVNODE_PARTIAL = 2,
	/* mknod() works and the created node could be opened */
	LXC_DEVNODE_OPEN = 3,
};
1220
1221 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1222 {
1223 int i, ret;
1224 char path[PATH_MAX];
1225 mode_t cmask;
1226 int use_mknod = LXC_DEVNODE_MKNOD;
1227
1228 ret = snprintf(path, PATH_MAX, "%s/dev",
1229 rootfs->path ? rootfs->mount : "");
1230 if (ret < 0 || ret >= PATH_MAX)
1231 return -1;
1232
1233 /* ignore, just don't try to fill in */
1234 if (!dir_exists(path))
1235 return 0;
1236
1237 INFO("Populating \"/dev\"");
1238
1239 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1240 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
1241 char hostpath[PATH_MAX];
1242 const struct lxc_device_node *device = &lxc_devices[i];
1243
1244 ret = snprintf(path, PATH_MAX, "%s/dev/%s",
1245 rootfs->path ? rootfs->mount : "", device->name);
1246 if (ret < 0 || ret >= PATH_MAX)
1247 return -1;
1248
1249 if (use_mknod >= LXC_DEVNODE_MKNOD) {
1250 ret = mknod(path, device->mode, makedev(device->maj, device->min));
1251 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1252 DEBUG("Created device node \"%s\"", path);
1253 } else if (ret < 0) {
1254 if (errno != EPERM) {
1255 SYSERROR("Failed to create device node \"%s\"", path);
1256 return -1;
1257 }
1258
1259 use_mknod = LXC_DEVNODE_BIND;
1260 }
1261
1262 /* Device nodes are fully useable. */
1263 if (use_mknod == LXC_DEVNODE_OPEN)
1264 continue;
1265
1266 if (use_mknod == LXC_DEVNODE_MKNOD) {
1267 /* See
1268 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1269 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1270 */
1271 ret = open(path, O_RDONLY | O_CLOEXEC);
1272 if (ret >= 0) {
1273 close(ret);
1274 /* Device nodes are fully useable. */
1275 use_mknod = LXC_DEVNODE_OPEN;
1276 continue;
1277 }
1278
1279 SYSTRACE("Failed to open \"%s\" device", path);
1280 /* Device nodes are only partially useable. */
1281 use_mknod = LXC_DEVNODE_PARTIAL;
1282 }
1283 }
1284
1285 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1286 /* If we are dealing with partially functional device
1287 * nodes the prio mknod() call will have created the
1288 * device node so we can use it as a bind-mount target.
1289 */
1290 ret = mknod(path, S_IFREG | 0000, 0);
1291 if (ret < 0 && errno != EEXIST) {
1292 SYSERROR("Failed to create file \"%s\"", path);
1293 return -1;
1294 }
1295 }
1296
1297 /* Fallback to bind-mounting the device from the host. */
1298 ret = snprintf(hostpath, PATH_MAX, "/dev/%s", device->name);
1299 if (ret < 0 || ret >= PATH_MAX)
1300 return -1;
1301
1302 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1303 rootfs->path ? rootfs->mount : NULL);
1304 if (ret < 0) {
1305 SYSERROR("Failed to bind mount host device node \"%s\" "
1306 "onto \"%s\"", hostpath, path);
1307 return -1;
1308 }
1309 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"",
1310 hostpath, path);
1311 }
1312 (void)umask(cmask);
1313
1314 INFO("Populated \"/dev\"");
1315 return 0;
1316 }
1317
1318 static int lxc_mount_rootfs(struct lxc_conf *conf)
1319 {
1320 int ret;
1321 struct lxc_storage *bdev;
1322 const struct lxc_rootfs *rootfs = &conf->rootfs;
1323
1324 if (!rootfs->path) {
1325 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
1326 if (ret < 0) {
1327 SYSERROR("Failed to remount \"/\" MS_REC | MS_SLAVE");
1328 return -1;
1329 }
1330
1331 return 0;
1332 }
1333
1334 ret = access(rootfs->mount, F_OK);
1335 if (ret != 0) {
1336 SYSERROR("Failed to access to \"%s\". Check it is present",
1337 rootfs->mount);
1338 return -1;
1339 }
1340
1341 bdev = storage_init(conf);
1342 if (!bdev) {
1343 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1344 rootfs->path, rootfs->mount,
1345 rootfs->options ? rootfs->options : "(null)");
1346 return -1;
1347 }
1348
1349 ret = bdev->ops->mount(bdev);
1350 storage_put(bdev);
1351 if (ret < 0) {
1352 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1353 rootfs->path, rootfs->mount,
1354 rootfs->options ? rootfs->options : "(null)");
1355 return -1;
1356 }
1357
1358 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
1359 rootfs->path, rootfs->mount,
1360 rootfs->options ? rootfs->options : "(null)");
1361
1362 return 0;
1363 }
1364
1365 int lxc_chroot(const struct lxc_rootfs *rootfs)
1366 {
1367 int i, ret;
1368 char *p, *p2;
1369 char buf[LXC_LINELEN];
1370 char *nroot;
1371 FILE *f;
1372 char *root = rootfs->mount;
1373
1374 nroot = realpath(root, NULL);
1375 if (!nroot) {
1376 SYSERROR("Failed to resolve \"%s\"", root);
1377 return -1;
1378 }
1379
1380 ret = chdir("/");
1381 if (ret < 0) {
1382 free(nroot);
1383 return -1;
1384 }
1385
1386 /* We could use here MS_MOVE, but in userns this mount is locked and
1387 * can't be moved.
1388 */
1389 ret = mount(nroot, "/", NULL, MS_REC | MS_BIND, NULL);
1390 if (ret < 0) {
1391 SYSERROR("Failed to mount \"%s\" onto \"/\" as MS_REC | MS_BIND", nroot);
1392 free(nroot);
1393 return -1;
1394 }
1395 free(nroot);
1396
1397 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
1398 if (ret < 0) {
1399 SYSERROR("Failed to remount \"/\"");
1400 return -1;
1401 }
1402
1403 /* The following code cleans up inherited mounts which are not required
1404 * for CT.
1405 *
1406 * The mountinfo file shows not all mounts, if a few points have been
1407 * unmounted between read operations from the mountinfo. So we need to
1408 * read mountinfo a few times.
1409 *
1410 * This loop can be skipped if a container uses userns, because all
1411 * inherited mounts are locked and we should live with all this trash.
1412 */
1413 for (;;) {
1414 int progress = 0;
1415
1416 f = fopen("./proc/self/mountinfo", "r");
1417 if (!f) {
1418 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
1419 return -1;
1420 }
1421
1422 while (fgets(buf, LXC_LINELEN, f)) {
1423 for (p = buf, i=0; p && i < 4; i++)
1424 p = strchr(p+1, ' ');
1425
1426 if (!p)
1427 continue;
1428
1429 p2 = strchr(p+1, ' ');
1430 if (!p2)
1431 continue;
1432
1433 *p2 = '\0';
1434 *p = '.';
1435
1436 if (strcmp(p + 1, "/") == 0)
1437 continue;
1438
1439 if (strcmp(p + 1, "/proc") == 0)
1440 continue;
1441
1442 ret = umount2(p, MNT_DETACH);
1443 if (ret == 0)
1444 progress++;
1445 }
1446
1447 fclose(f);
1448
1449 if (!progress)
1450 break;
1451 }
1452
1453 /* This also can be skipped if a container uses userns. */
1454 (void)umount2("./proc", MNT_DETACH);
1455
1456 /* It is weird, but chdir("..") moves us in a new root */
1457 ret = chdir("..");
1458 if (ret < 0) {
1459 SYSERROR("Failed to chdir(\"..\")");
1460 return -1;
1461 }
1462
1463 ret = chroot(".");
1464 if (ret < 0) {
1465 SYSERROR("Failed to chroot(\".\")");
1466 return -1;
1467 }
1468
1469 return 0;
1470 }
1471
1472 /* (The following explanation is copied verbatim from the kernel.)
1473 *
1474 * pivot_root Semantics:
1475 * Moves the root file system of the current process to the directory put_old,
1476 * makes new_root as the new root file system of the current process, and sets
1477 * root/cwd of all processes which had them on the current root to new_root.
1478 *
1479 * Restrictions:
1480 * The new_root and put_old must be directories, and must not be on the
1481 * same file system as the current process root. The put_old must be
1482 * underneath new_root, i.e. adding a non-zero number of /.. to the string
1483 * pointed to by put_old must yield the same directory as new_root. No other
1484 * file system may be mounted on put_old. After all, new_root is a mountpoint.
1485 *
1486 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
1487 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
1488 * in this situation.
1489 *
1490 * Notes:
1491 * - we don't move root/cwd if they are not at the root (reason: if something
1492 * cared enough to change them, it's probably wrong to force them elsewhere)
1493 * - it's okay to pick a root that isn't the root of a file system, e.g.
1494 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
1495 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
1496 * first.
1497 */
1498 static int lxc_pivot_root(const char *rootfs)
1499 {
1500 int oldroot;
1501 int newroot = -1, ret = -1;
1502
1503 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1504 if (oldroot < 0) {
1505 SYSERROR("Failed to open old root directory");
1506 return -1;
1507 }
1508
1509 newroot = open(rootfs, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1510 if (newroot < 0) {
1511 SYSERROR("Failed to open new root directory");
1512 goto on_error;
1513 }
1514
1515 /* change into new root fs */
1516 ret = fchdir(newroot);
1517 if (ret < 0) {
1518 ret = -1;
1519 SYSERROR("Failed to change to new rootfs \"%s\"", rootfs);
1520 goto on_error;
1521 }
1522
1523 /* pivot_root into our new root fs */
1524 ret = pivot_root(".", ".");
1525 if (ret < 0) {
1526 ret = -1;
1527 SYSERROR("Failed to pivot_root()");
1528 goto on_error;
1529 }
1530
1531 /* At this point the old-root is mounted on top of our new-root. To
1532 * unmounted it we must not be chdir'd into it, so escape back to
1533 * old-root.
1534 */
1535 ret = fchdir(oldroot);
1536 if (ret < 0) {
1537 ret = -1;
1538 SYSERROR("Failed to enter old root directory");
1539 goto on_error;
1540 }
1541
1542 /* Make oldroot rslave to make sure our umounts don't propagate to the
1543 * host.
1544 */
1545 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
1546 if (ret < 0) {
1547 ret = -1;
1548 SYSERROR("Failed to make oldroot rslave");
1549 goto on_error;
1550 }
1551
1552 ret = umount2(".", MNT_DETACH);
1553 if (ret < 0) {
1554 ret = -1;
1555 SYSERROR("Failed to detach old root directory");
1556 goto on_error;
1557 }
1558
1559 ret = fchdir(newroot);
1560 if (ret < 0) {
1561 ret = -1;
1562 SYSERROR("Failed to re-enter new root directory");
1563 goto on_error;
1564 }
1565
1566 ret = 0;
1567
1568 TRACE("pivot_root(\"%s\") successful", rootfs);
1569
1570 on_error:
1571 close(oldroot);
1572
1573 if (newroot >= 0)
1574 close(newroot);
1575
1576 return ret;
1577 }
1578
1579 static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
1580 {
1581 if (!rootfs->path) {
1582 DEBUG("Container does not have a rootfs");
1583 return 0;
1584 }
1585
1586 if (detect_ramfs_rootfs())
1587 return lxc_chroot(rootfs);
1588
1589 return lxc_pivot_root(rootfs->mount);
1590 }
1591
1592 static const struct id_map *find_mapped_nsid_entry(struct lxc_conf *conf,
1593 unsigned id,
1594 enum idtype idtype)
1595 {
1596 struct lxc_list *it;
1597 struct id_map *map;
1598 struct id_map *retmap = NULL;
1599
1600 /* Shortcut for container's root mappings. */
1601 if (id == 0) {
1602 if (idtype == ID_TYPE_UID)
1603 return conf->root_nsuid_map;
1604
1605 if (idtype == ID_TYPE_GID)
1606 return conf->root_nsgid_map;
1607 }
1608
1609 lxc_list_for_each(it, &conf->id_map) {
1610 map = it->elem;
1611 if (map->idtype != idtype)
1612 continue;
1613
1614 if (id >= map->nsid && id < map->nsid + map->range) {
1615 retmap = map;
1616 break;
1617 }
1618 }
1619
1620 return retmap;
1621 }
1622
1623 static int lxc_setup_devpts(struct lxc_conf *conf)
1624 {
1625 int ret;
1626 char **opts;
1627 char devpts_mntopts[256];
1628 char *mntopt_sets[5];
1629 char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
1630
1631 if (conf->pty_max <= 0) {
1632 DEBUG("No new devpts instance will be mounted since no pts "
1633 "devices are requested");
1634 return 0;
1635 }
1636
1637 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1638 default_devpts_mntopts, conf->pty_max);
1639 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1640 return -1;
1641
1642 ret = umount2("/dev/pts", MNT_DETACH);
1643 if (ret < 0)
1644 SYSWARN("Failed to unmount old devpts instance");
1645 else
1646 DEBUG("Unmounted old devpts instance");
1647
1648 /* Create mountpoint for devpts instance. */
1649 ret = mkdir("/dev/pts", 0755);
1650 if (ret < 0 && errno != EEXIST) {
1651 SYSERROR("Failed to create \"/dev/pts\" directory");
1652 return -1;
1653 }
1654
1655 /* gid=5 && max= */
1656 mntopt_sets[0] = devpts_mntopts;
1657
1658 /* !gid=5 && max= */
1659 mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;
1660
1661 /* gid=5 && !max= */
1662 mntopt_sets[2] = default_devpts_mntopts;
1663
1664 /* !gid=5 && !max= */
1665 mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;
1666
1667 /* end */
1668 mntopt_sets[4] = NULL;
1669
1670 for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
1671 /* mount new devpts instance */
1672 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
1673 if (ret == 0)
1674 break;
1675 }
1676
1677 if (ret < 0) {
1678 SYSERROR("Failed to mount new devpts instance");
1679 return -1;
1680 }
1681 DEBUG("Mount new devpts instance with options \"%s\"", *opts);
1682
1683 /* Remove any pre-existing /dev/ptmx file. */
1684 ret = remove("/dev/ptmx");
1685 if (ret < 0) {
1686 if (errno != ENOENT) {
1687 SYSERROR("Failed to remove existing \"/dev/ptmx\" file");
1688 return -1;
1689 }
1690 } else {
1691 DEBUG("Removed existing \"/dev/ptmx\" file");
1692 }
1693
1694 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
1695 ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
1696 if (ret < 0 && errno != EEXIST) {
1697 SYSERROR("Failed to create dummy \"/dev/ptmx\" file as bind mount target");
1698 return -1;
1699 }
1700 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
1701
1702 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
1703 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
1704 if (!ret) {
1705 DEBUG("Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1706 return 0;
1707 } else {
1708 /* Fallthrough and try to create a symlink. */
1709 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1710 }
1711
1712 /* Remove the dummy /dev/ptmx file we created above. */
1713 ret = remove("/dev/ptmx");
1714 if (ret < 0) {
1715 SYSERROR("Failed to remove existing \"/dev/ptmx\"");
1716 return -1;
1717 }
1718
1719 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1720 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
1721 if (ret < 0) {
1722 SYSERROR("Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
1723 return -1;
1724 }
1725 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
1726
1727 return 0;
1728 }
1729
/* Apply the personality @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". When the build lacks
 * <sys/personality.h> support this function does nothing.
 *
 * Returns 0 on success (or when nothing was done), -1 if personality() failed.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("Failed to set personality to \"0x%x\"", persona);
			return -1;
		}

		INFO("Set personality to \"0x%x\"", persona);
	}
#endif

	return 0;
}
1749
1750 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1751 const struct lxc_terminal *console)
1752 {
1753 int ret;
1754 char path[PATH_MAX];
1755 char *rootfs_path = rootfs->path ? rootfs->mount : "";
1756
1757 if (console->path && !strcmp(console->path, "none"))
1758 return 0;
1759
1760 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1761 if (ret < 0 || (size_t)ret >= sizeof(path))
1762 return -1;
1763
1764 /* When we are asked to setup a console we remove any previous
1765 * /dev/console bind-mounts.
1766 */
1767 if (file_exists(path)) {
1768 ret = lxc_unstack_mountpoint(path, false);
1769 if (ret < 0) {
1770 SYSERROR("Failed to unmount \"%s\"", path);
1771 return -ret;
1772 } else {
1773 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
1774 }
1775 }
1776
1777 /* For unprivileged containers autodev or automounts will already have
1778 * taken care of creating /dev/console.
1779 */
1780 ret = mknod(path, S_IFREG | 0000, 0);
1781 if (ret < 0 && errno != EEXIST) {
1782 SYSERROR("Failed to create console");
1783 return -errno;
1784 }
1785
1786 ret = fchmod(console->slave, S_IXUSR | S_IXGRP);
1787 if (ret < 0) {
1788 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1789 S_IXUSR | S_IXGRP, console->name);
1790 return -errno;
1791 }
1792
1793 ret = safe_mount(console->name, path, "none", MS_BIND, 0, rootfs_path);
1794 if (ret < 0) {
1795 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, path);
1796 return -1;
1797 }
1798
1799 DEBUG("Mounted pts device \"%s\" onto \"%s\"", console->name, path);
1800 return 0;
1801 }
1802
1803 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1804 const struct lxc_terminal *console,
1805 char *ttydir)
1806 {
1807 int ret;
1808 char path[PATH_MAX], lxcpath[PATH_MAX];
1809 char *rootfs_path = rootfs->path ? rootfs->mount : "";
1810
1811 if (console->path && !strcmp(console->path, "none"))
1812 return 0;
1813
1814 /* create rootfs/dev/<ttydir> directory */
1815 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
1816 if (ret < 0 || (size_t)ret >= sizeof(path))
1817 return -1;
1818
1819 ret = mkdir(path, 0755);
1820 if (ret && errno != EEXIST) {
1821 SYSERROR("Failed to create \"%s\"", path);
1822 return -errno;
1823 }
1824 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1825
1826 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
1827 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1828 return -1;
1829
1830 ret = mknod(lxcpath, S_IFREG | 0000, 0);
1831 if (ret < 0 && errno != EEXIST) {
1832 SYSERROR("Failed to create \"%s\"", lxcpath);
1833 return -errno;
1834 }
1835
1836 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1837 if (ret < 0 || (size_t)ret >= sizeof(path))
1838 return -1;
1839
1840 if (file_exists(path)) {
1841 ret = lxc_unstack_mountpoint(path, false);
1842 if (ret < 0) {
1843 SYSERROR("Failed to unmount \"%s\"", path);
1844 return -ret;
1845 } else {
1846 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
1847 }
1848 }
1849
1850 ret = mknod(path, S_IFREG | 0000, 0);
1851 if (ret < 0 && errno != EEXIST) {
1852 SYSERROR("Failed to create console");
1853 return -errno;
1854 }
1855
1856 ret = fchmod(console->slave, S_IXUSR | S_IXGRP);
1857 if (ret < 0) {
1858 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1859 S_IXUSR | S_IXGRP, console->name);
1860 return -errno;
1861 }
1862
1863 /* bind mount console->name to '/dev/<ttydir>/console' */
1864 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
1865 if (ret < 0) {
1866 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
1867 return -1;
1868 }
1869 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1870
1871 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
1872 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1873 if (ret < 0) {
1874 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
1875 return -1;
1876 }
1877 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1878
1879 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
1880 return 0;
1881 }
1882
/* Set up the container console: directly on "/dev/console" when no @ttydir is
 * given, otherwise below "/dev/<ttydir>". Returns the helper's result.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir)
{
	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir)
		      : lxc_setup_dev_console(rootfs, console);
}
1892
1893 static void parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
1894 {
1895 struct mount_opt *mo;
1896
1897 /* If opt is found in mount_opt, set or clear flags.
1898 * Otherwise append it to data. */
1899
1900 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1901 if (strncmp(opt, mo->name, strlen(mo->name)) == 0) {
1902 if (mo->clear)
1903 *flags &= ~mo->flag;
1904 else
1905 *flags |= mo->flag;
1906 return;
1907 }
1908 }
1909
1910 if (strlen(*data))
1911 (void)strlcat(*data, ",", size);
1912
1913 (void)strlcat(*data, opt, size);
1914 }
1915
1916 int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
1917 {
1918 char *data, *p, *s;
1919 size_t size;
1920
1921 *mntdata = NULL;
1922 *mntflags = 0L;
1923
1924 if (!mntopts)
1925 return 0;
1926
1927 s = strdup(mntopts);
1928 if (!s)
1929 return -1;
1930
1931 size = strlen(s) + 1;
1932 data = malloc(size);
1933 if (!data) {
1934 free(s);
1935 return -1;
1936 }
1937 *data = 0;
1938
1939 lxc_iterate_parts(p, s, ",")
1940 parse_mntopt(p, mntflags, &data, size);
1941
1942 if (*data)
1943 *mntdata = data;
1944 else
1945 free(data);
1946 free(s);
1947
1948 return 0;
1949 }
1950
1951 static void parse_propagationopt(char *opt, unsigned long *flags)
1952 {
1953 struct mount_opt *mo;
1954
1955 /* If opt is found in propagation_opt, set or clear flags. */
1956 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
1957 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1958 continue;
1959
1960 if (mo->clear)
1961 *flags &= ~mo->flag;
1962 else
1963 *flags |= mo->flag;
1964
1965 return;
1966 }
1967 }
1968
1969 int parse_propagationopts(const char *mntopts, unsigned long *pflags)
1970 {
1971 char *p, *s;
1972
1973 if (!mntopts)
1974 return 0;
1975
1976 s = strdup(mntopts);
1977 if (!s) {
1978 SYSERROR("Failed to allocate memory");
1979 return -ENOMEM;
1980 }
1981
1982 *pflags = 0L;
1983 lxc_iterate_parts(p, s, ",")
1984 parse_propagationopt(p, pflags);
1985 free(s);
1986
1987 return 0;
1988 }
1989
/* Terminate @word at its first space or tab, so that only the leading word
 * remains. A string without whitespace is left as it is.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1996
/* Skip @nfields whitespace-separated fields in @src.
 *
 * For each field the non-blank characters are skipped, followed by exactly one
 * space or tab. Returns a pointer into @src: either the position after the
 * last skipped separator or the terminating NUL byte if @src ran out first.
 */
static char *get_field(char *src, int nfields)
{
	char *p = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		p += strcspn(p, " \t");
		if (*p == '\0')
			break;

		/* step over the single separator */
		p++;
	}

	return p;
}
2015
2016 static int mount_entry(const char *fsname, const char *target,
2017 const char *fstype, unsigned long mountflags,
2018 unsigned long pflags, const char *data, bool optional,
2019 bool dev, bool relative, const char *rootfs)
2020 {
2021 int ret;
2022 char srcbuf[PATH_MAX];
2023 const char *srcpath = fsname;
2024 #ifdef HAVE_STATVFS
2025 struct statvfs sb;
2026 #endif
2027
2028 if (relative) {
2029 ret = snprintf(srcbuf, PATH_MAX, "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
2030 if (ret < 0 || ret >= PATH_MAX) {
2031 ERROR("source path is too long");
2032 return -1;
2033 }
2034 srcpath = srcbuf;
2035 }
2036
2037 ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
2038 rootfs);
2039 if (ret < 0) {
2040 if (optional) {
2041 SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
2042 srcpath ? srcpath : "(null)", target);
2043 return 0;
2044 }
2045
2046 SYSERROR("Failed to mount \"%s\" on \"%s\"",
2047 srcpath ? srcpath : "(null)", target);
2048 return -1;
2049 }
2050
2051 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
2052 unsigned long rqd_flags = 0;
2053
2054 DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
2055 "options", srcpath ? srcpath : "(none)", target ? target : "(none)");
2056
2057 if (mountflags & MS_RDONLY)
2058 rqd_flags |= MS_RDONLY;
2059 #ifdef HAVE_STATVFS
2060 if (srcpath && statvfs(srcpath, &sb) == 0) {
2061 unsigned long required_flags = rqd_flags;
2062
2063 if (sb.f_flag & MS_NOSUID)
2064 required_flags |= MS_NOSUID;
2065
2066 if (sb.f_flag & MS_NODEV && !dev)
2067 required_flags |= MS_NODEV;
2068
2069 if (sb.f_flag & MS_RDONLY)
2070 required_flags |= MS_RDONLY;
2071
2072 if (sb.f_flag & MS_NOEXEC)
2073 required_flags |= MS_NOEXEC;
2074
2075 DEBUG("Flags for \"%s\" were %lu, required extra flags "
2076 "are %lu", srcpath, sb.f_flag, required_flags);
2077
2078 /* If this was a bind mount request, and required_flags
2079 * does not have any flags which are not already in
2080 * mountflags, then skip the remount.
2081 */
2082 if (!(mountflags & MS_REMOUNT)) {
2083 if (!(required_flags & ~mountflags) &&
2084 rqd_flags == 0) {
2085 DEBUG("Mountflags already were %lu, "
2086 "skipping remount", mountflags);
2087 goto skipremount;
2088 }
2089 }
2090
2091 mountflags |= required_flags;
2092 }
2093 #endif
2094
2095 ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
2096 if (ret < 0) {
2097 if (optional) {
2098 SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
2099 srcpath ? srcpath : "(null)", target);
2100 return 0;
2101 }
2102
2103 SYSERROR("Failed to mount \"%s\" on \"%s\"",
2104 srcpath ? srcpath : "(null)", target);
2105 return -1;
2106 }
2107 }
2108
2109 #ifdef HAVE_STATVFS
2110 skipremount:
2111 #endif
2112 if (pflags) {
2113 ret = mount(NULL, target, NULL, pflags, NULL);
2114 if (ret < 0) {
2115 if (optional) {
2116 SYSINFO("Failed to change mount propagation "
2117 "for \"%s\" (optional)", target);
2118 return 0;
2119 } else {
2120 SYSERROR("Failed to change mount propagation "
2121 "for \"%s\" (optional)", target);
2122 return -1;
2123 }
2124 }
2125 DEBUG("Changed mount propagation for \"%s\"", target);
2126 }
2127
2128 DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
2129 srcpath ? srcpath : "(null)", target, fstype);
2130
2131 return 0;
2132 }
2133
/* Remove "create=dir", "create=file", "optional" and "relative" from
 * mntent->mnt_opts in place.
 *
 * For each keyword only the first occurrence is handled: the text from the
 * keyword up to and including the next comma is cut out, or, if no comma
 * follows, the string is truncated at the keyword.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
	};
	size_t i;

	for (i = 0; i < sizeof(culled) / sizeof(culled[0]); i++) {
		char *start, *comma;

		start = strstr(mntent->mnt_opts, culled[i]);
		if (!start)
			continue;

		comma = strchr(start, ',');
		if (comma)
			/* close the gap, including the terminating NUL */
			memmove(start, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*start = '\0';
	}
}
2163
2164 static int mount_entry_create_dir_file(const struct mntent *mntent,
2165 const char *path,
2166 const struct lxc_rootfs *rootfs,
2167 const char *lxc_name, const char *lxc_path)
2168 {
2169 int ret;
2170 char *p1, *p2;
2171
2172 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
2173 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
2174 if (ret < 0)
2175 return -1;
2176 }
2177
2178 if (hasmntopt(mntent, "create=dir")) {
2179 ret = mkdir_p(path, 0755);
2180 if (ret < 0 && errno != EEXIST) {
2181 SYSERROR("Failed to create directory \"%s\"", path);
2182 return -1;
2183 }
2184 }
2185
2186 if (!hasmntopt(mntent, "create=file"))
2187 return 0;
2188
2189 ret = access(path, F_OK);
2190 if (ret == 0)
2191 return 0;
2192
2193 p1 = strdup(path);
2194 if (!p1)
2195 return -1;
2196
2197 p2 = dirname(p1);
2198
2199 ret = mkdir_p(p2, 0755);
2200 free(p1);
2201 if (ret < 0 && errno != EEXIST) {
2202 SYSERROR("Failed to create directory \"%s\"", path);
2203 return -1;
2204 }
2205
2206 ret = mknod(path, S_IFREG | 0000, 0);
2207 if (ret < 0 && errno != EEXIST)
2208 return -errno;
2209
2210 return 0;
2211 }
2212
2213 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2214 * without a rootfs. */
2215 static inline int mount_entry_on_generic(struct mntent *mntent,
2216 const char *path,
2217 const struct lxc_rootfs *rootfs,
2218 const char *lxc_name,
2219 const char *lxc_path)
2220 {
2221 int ret;
2222 unsigned long mntflags;
2223 char *mntdata;
2224 bool dev, optional, relative;
2225 unsigned long pflags = 0;
2226 char *rootfs_path = NULL;
2227
2228 optional = hasmntopt(mntent, "optional") != NULL;
2229 dev = hasmntopt(mntent, "dev") != NULL;
2230 relative = hasmntopt(mntent, "relative") != NULL;
2231
2232 if (rootfs && rootfs->path)
2233 rootfs_path = rootfs->mount;
2234
2235 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2236 lxc_path);
2237 if (ret < 0) {
2238 if (optional)
2239 return 0;
2240
2241 return -1;
2242 }
2243 cull_mntent_opt(mntent);
2244
2245 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2246 if (ret < 0)
2247 return -1;
2248
2249 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2250 if (ret < 0)
2251 return -1;
2252
2253 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
2254 pflags, mntdata, optional, dev, relative, rootfs_path);
2255
2256 free(mntdata);
2257 return ret;
2258 }
2259
2260 static inline int mount_entry_on_systemfs(struct mntent *mntent)
2261 {
2262 int ret;
2263 char path[PATH_MAX];
2264
2265 /* For containers created without a rootfs all mounts are treated as
2266 * absolute paths starting at / on the host.
2267 */
2268 if (mntent->mnt_dir[0] != '/')
2269 ret = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);
2270 else
2271 ret = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
2272 if (ret < 0 || ret >= sizeof(path))
2273 return -1;
2274
2275 return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
2276 }
2277
2278 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
2279 const struct lxc_rootfs *rootfs,
2280 const char *lxc_name,
2281 const char *lxc_path)
2282 {
2283 int offset;
2284 char *aux;
2285 const char *lxcpath;
2286 char path[PATH_MAX];
2287 int ret = 0;
2288
2289 lxcpath = lxc_global_config_value("lxc.lxcpath");
2290 if (!lxcpath)
2291 return -1;
2292
2293 /* If rootfs->path is a blockdev path, allow container fstab to use
2294 * <lxcpath>/<name>/rootfs" as the target prefix.
2295 */
2296 ret = snprintf(path, PATH_MAX, "%s/%s/rootfs", lxcpath, lxc_name);
2297 if (ret < 0 || ret >= PATH_MAX)
2298 goto skipvarlib;
2299
2300 aux = strstr(mntent->mnt_dir, path);
2301 if (aux) {
2302 offset = strlen(path);
2303 goto skipabs;
2304 }
2305
2306 skipvarlib:
2307 aux = strstr(mntent->mnt_dir, rootfs->path);
2308 if (!aux) {
2309 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
2310 return ret;
2311 }
2312 offset = strlen(rootfs->path);
2313
2314 skipabs:
2315 ret = snprintf(path, PATH_MAX, "%s/%s", rootfs->mount, aux + offset);
2316 if (ret < 0 || ret >= PATH_MAX)
2317 return -1;
2318
2319 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2320 }
2321
2322 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2323 const struct lxc_rootfs *rootfs,
2324 const char *lxc_name,
2325 const char *lxc_path)
2326 {
2327 int ret;
2328 char path[PATH_MAX];
2329
2330 /* relative to root mount point */
2331 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2332 if (ret < 0 || (size_t)ret >= sizeof(path))
2333 return -1;
2334
2335 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2336 }
2337
2338 static int mount_file_entries(const struct lxc_conf *conf,
2339 const struct lxc_rootfs *rootfs, FILE *file,
2340 const char *lxc_name, const char *lxc_path)
2341 {
2342 char buf[PATH_MAX];
2343 struct mntent mntent;
2344
2345 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2346 int ret;
2347
2348 if (!rootfs->path)
2349 ret = mount_entry_on_systemfs(&mntent);
2350 else if (mntent.mnt_dir[0] != '/')
2351 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2352 lxc_name, lxc_path);
2353 else
2354 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2355 lxc_name, lxc_path);
2356 if (ret < 0)
2357 return -1;
2358 }
2359
2360 if (!feof(file) || ferror(file)) {
2361 ERROR("Failed to parse mount entries");
2362 return -1;
2363 }
2364
2365 return 0;
2366 }
2367
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success and a
 * negative value if the file cannot be opened or an entry fails to mount.
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *entries;
	int rc;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(entries);

	return rc;
}
2391
2392 /*
2393 * In order for nested containers to be able to mount /proc and /sys they need
2394 * to see a "pure" proc and sysfs mount points with nothing mounted on top
2395 * (like lxcfs).
2396 * For this we provide proc and sysfs in /dev/.lxc/{proc,sys} while using an
2397 * apparmor rule to deny access to them. This is mostly for convenience: The
2398 * container's root user can mount them anyway and thus has access to the two
2399 * file systems. But a non-root user in the container should not be allowed to
2400 * access them as a side effect without explicitly allowing it.
2401 */
2402 static const char nesting_helpers[] =
2403 "proc dev/.lxc/proc proc create=dir,optional 0 0\n"
2404 "sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
2405
2406 FILE *make_anonymous_mount_file(struct lxc_list *mount,
2407 bool include_nesting_helpers)
2408 {
2409 int ret;
2410 char *mount_entry;
2411 struct lxc_list *iterator;
2412 int fd = -1;
2413
2414 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
2415 if (fd < 0) {
2416 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2417
2418 if (errno != ENOSYS)
2419 return NULL;
2420
2421 fd = lxc_make_tmpfile(template, true);
2422 if (fd < 0) {
2423 SYSERROR("Could not create temporary mount file");
2424 return NULL;
2425 }
2426
2427 TRACE("Created temporary mount file");
2428 }
2429
2430 lxc_list_for_each (iterator, mount) {
2431 size_t len;
2432
2433 mount_entry = iterator->elem;
2434 len = strlen(mount_entry);
2435
2436 ret = lxc_write_nointr(fd, mount_entry, len);
2437 if (ret != len)
2438 goto on_error;
2439
2440 ret = lxc_write_nointr(fd, "\n", 1);
2441 if (ret != 1)
2442 goto on_error;
2443 }
2444
2445 if (include_nesting_helpers) {
2446 ret = lxc_write_nointr(fd, nesting_helpers,
2447 STRARRAYLEN(nesting_helpers));
2448 if (ret != STRARRAYLEN(nesting_helpers))
2449 goto on_error;
2450 }
2451
2452 ret = lseek(fd, 0, SEEK_SET);
2453 if (ret < 0)
2454 goto on_error;
2455
2456 return fdopen(fd, "r+");
2457
2458 on_error:
2459 SYSERROR("Failed to write mount entry to temporary mount file");
2460 close(fd);
2461 return NULL;
2462 }
2463
2464 static int setup_mount_entries(const struct lxc_conf *conf,
2465 const struct lxc_rootfs *rootfs,
2466 struct lxc_list *mount, const char *lxc_name,
2467 const char *lxc_path)
2468 {
2469 int ret;
2470 FILE *f;
2471
2472 f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
2473 if (!f)
2474 return -1;
2475
2476 ret = mount_file_entries(conf, rootfs, f, lxc_name, lxc_path);
2477 fclose(f);
2478
2479 return ret;
2480 }
2481
2482 static int parse_cap(const char *cap)
2483 {
2484 size_t i;
2485 int capid = -1;
2486 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2487 char *ptr = NULL;
2488
2489 if (strcmp(cap, "none") == 0)
2490 return -2;
2491
2492 for (i = 0; i < end; i++) {
2493 if (strcmp(cap, caps_opt[i].name))
2494 continue;
2495
2496 capid = caps_opt[i].value;
2497 break;
2498 }
2499
2500 if (capid < 0) {
2501 /* Try to see if it's numeric, so the user may specify
2502 * capabilities that the running kernel knows about but we
2503 * don't
2504 */
2505 errno = 0;
2506 capid = strtol(cap, &ptr, 10);
2507 if (!ptr || *ptr != '\0' || errno != 0)
2508 /* not a valid number */
2509 capid = -1;
2510 else if (capid > lxc_caps_last_cap())
2511 /* we have a number but it's not a valid
2512 * capability */
2513 capid = -1;
2514 }
2515
2516 return capid;
2517 }
2518
2519 int in_caplist(int cap, struct lxc_list *caps)
2520 {
2521 int capid;
2522 struct lxc_list *iterator;
2523
2524 lxc_list_for_each (iterator, caps) {
2525 capid = parse_cap(iterator->elem);
2526 if (capid == cap)
2527 return 1;
2528 }
2529
2530 return 0;
2531 }
2532
2533 static int setup_caps(struct lxc_list *caps)
2534 {
2535 int capid;
2536 char *drop_entry;
2537 struct lxc_list *iterator;
2538
2539 lxc_list_for_each (iterator, caps) {
2540 int ret;
2541
2542 drop_entry = iterator->elem;
2543
2544 capid = parse_cap(drop_entry);
2545 if (capid < 0) {
2546 ERROR("unknown capability %s", drop_entry);
2547 return -1;
2548 }
2549
2550 ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
2551 prctl_arg(0), prctl_arg(0));
2552 if (ret < 0) {
2553 SYSERROR("Failed to remove %s capability", drop_entry);
2554 return -1;
2555 }
2556 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
2557 }
2558
2559 DEBUG("Capabilities have been setup");
2560 return 0;
2561 }
2562
2563 static int dropcaps_except(struct lxc_list *caps)
2564 {
2565 __do_free int *caplist = NULL;
2566 int i, capid, numcaps;
2567 char *keep_entry;
2568 struct lxc_list *iterator;
2569
2570 numcaps = lxc_caps_last_cap() + 1;
2571 if (numcaps <= 0 || numcaps > 200)
2572 return -1;
2573 TRACE("Found %d capabilities", numcaps);
2574
2575 /* caplist[i] is 1 if we keep capability i */
2576 caplist = must_realloc(NULL, numcaps * sizeof(int));
2577 memset(caplist, 0, numcaps * sizeof(int));
2578
2579 lxc_list_for_each (iterator, caps) {
2580 keep_entry = iterator->elem;
2581
2582 capid = parse_cap(keep_entry);
2583 if (capid == -2)
2584 continue;
2585
2586 if (capid < 0) {
2587 ERROR("Unknown capability %s", keep_entry);
2588 return -1;
2589 }
2590
2591 DEBUG("Keep capability %s (%d)", keep_entry, capid);
2592 caplist[capid] = 1;
2593 }
2594
2595 for (i = 0; i < numcaps; i++) {
2596 int ret;
2597
2598 if (caplist[i])
2599 continue;
2600
2601 ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
2602 prctl_arg(0), prctl_arg(0));
2603 if (ret < 0) {
2604 SYSERROR("Failed to remove capability %d", i);
2605 return -1;
2606 }
2607 }
2608
2609 DEBUG("Capabilities have been setup");
2610 return 0;
2611 }
2612
2613 static int parse_resource(const char *res)
2614 {
2615 int ret;
2616 size_t i;
2617 int resid = -1;
2618
2619 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
2620 if (strcmp(res, limit_opt[i].name) == 0)
2621 return limit_opt[i].value;
2622
2623 /* Try to see if it's numeric, so the user may specify
2624 * resources that the running kernel knows about but
2625 * we don't.
2626 */
2627 ret = lxc_safe_int(res, &resid);
2628 if (ret < 0)
2629 return -1;
2630
2631 return resid;
2632 }
2633
2634 int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2635 {
2636 int resid;
2637 struct lxc_list *it;
2638 struct lxc_limit *lim;
2639
2640 lxc_list_for_each (it, limits) {
2641 lim = it->elem;
2642
2643 resid = parse_resource(lim->resource);
2644 if (resid < 0) {
2645 ERROR("Unknown resource %s", lim->resource);
2646 return -1;
2647 }
2648
2649 #if HAVE_PRLIMIT || HAVE_PRLIMIT64
2650 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2651 SYSERROR("Failed to set limit %s", lim->resource);
2652 return -1;
2653 }
2654
2655 TRACE("Setup \"%s\" limit", lim->resource);
2656 #else
2657 ERROR("Cannot set limit \"%s\" as prlimit is missing", lim->resource);
2658 return -1;
2659 #endif
2660 }
2661
2662 return 0;
2663 }
2664
2665 int setup_sysctl_parameters(struct lxc_list *sysctls)
2666 {
2667 struct lxc_list *it;
2668 struct lxc_sysctl *elem;
2669 int ret = 0;
2670 char *tmp = NULL;
2671 char filename[PATH_MAX] = {0};
2672
2673 lxc_list_for_each (it, sysctls) {
2674 elem = it->elem;
2675 tmp = lxc_string_replace(".", "/", elem->key);
2676 if (!tmp) {
2677 ERROR("Failed to replace key %s", elem->key);
2678 return -1;
2679 }
2680
2681 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2682 free(tmp);
2683 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2684 ERROR("Error setting up sysctl parameters path");
2685 return -1;
2686 }
2687
2688 ret = lxc_write_to_file(filename, elem->value,
2689 strlen(elem->value), false, 0666);
2690 if (ret < 0) {
2691 SYSERROR("Failed to setup sysctl parameters %s to %s",
2692 elem->key, elem->value);
2693 return -1;
2694 }
2695 }
2696
2697 return 0;
2698 }
2699
2700 int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2701 {
2702 struct lxc_list *it;
2703 struct lxc_proc *elem;
2704 int ret = 0;
2705 char *tmp = NULL;
2706 char filename[PATH_MAX] = {0};
2707
2708 lxc_list_for_each (it, procs) {
2709 elem = it->elem;
2710 tmp = lxc_string_replace(".", "/", elem->filename);
2711 if (!tmp) {
2712 ERROR("Failed to replace key %s", elem->filename);
2713 return -1;
2714 }
2715
2716 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2717 free(tmp);
2718 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2719 ERROR("Error setting up proc filesystem path");
2720 return -1;
2721 }
2722
2723 ret = lxc_write_to_file(filename, elem->value,
2724 strlen(elem->value), false, 0666);
2725 if (ret < 0) {
2726 SYSERROR("Failed to setup proc filesystem %s to %s",
2727 elem->filename, elem->value);
2728 return -1;
2729 }
2730 }
2731
2732 return 0;
2733 }
2734
2735 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2736
2737 struct lxc_conf *lxc_conf_init(void)
2738 {
2739 int i;
2740 struct lxc_conf *new;
2741
2742 new = malloc(sizeof(*new));
2743 if (!new)
2744 return NULL;
2745 memset(new, 0, sizeof(*new));
2746
2747 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2748 new->personality = -1;
2749 new->autodev = 1;
2750 new->console.buffer_size = 0;
2751 new->console.log_path = NULL;
2752 new->console.log_fd = -1;
2753 new->console.log_size = 0;
2754 new->console.path = NULL;
2755 new->console.peer = -1;
2756 new->console.proxy.busy = -1;
2757 new->console.proxy.master = -1;
2758 new->console.proxy.slave = -1;
2759 new->console.master = -1;
2760 new->console.slave = -1;
2761 new->console.name[0] = '\0';
2762 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
2763 new->maincmd_fd = -1;
2764 new->monitor_signal_pdeath = SIGKILL;
2765 new->nbd_idx = -1;
2766 new->rootfs.mount = strdup(default_rootfs_mount);
2767 if (!new->rootfs.mount) {
2768 free(new);
2769 return NULL;
2770 }
2771 new->rootfs.managed = true;
2772 new->logfd = -1;
2773 lxc_list_init(&new->cgroup);
2774 lxc_list_init(&new->cgroup2);
2775 lxc_list_init(&new->network);
2776 lxc_list_init(&new->mount_list);
2777 lxc_list_init(&new->caps);
2778 lxc_list_init(&new->keepcaps);
2779 lxc_list_init(&new->id_map);
2780 new->root_nsuid_map = NULL;
2781 new->root_nsgid_map = NULL;
2782 lxc_list_init(&new->includes);
2783 lxc_list_init(&new->aliens);
2784 lxc_list_init(&new->environment);
2785 lxc_list_init(&new->limits);
2786 lxc_list_init(&new->sysctls);
2787 lxc_list_init(&new->procs);
2788 new->hooks_version = 0;
2789 for (i = 0; i < NUM_LXC_HOOKS; i++)
2790 lxc_list_init(&new->hooks[i]);
2791 lxc_list_init(&new->groups);
2792 lxc_list_init(&new->state_clients);
2793 new->lsm_aa_profile = NULL;
2794 lxc_list_init(&new->lsm_aa_raw);
2795 new->lsm_se_context = NULL;
2796 new->tmp_umount_proc = false;
2797 new->tmp_umount_proc = 0;
2798 new->shmount.path_host = NULL;
2799 new->shmount.path_cont = NULL;
2800
2801 /* if running in a new user namespace, init and COMMAND
2802 * default to running as UID/GID 0 when using lxc-execute */
2803 new->init_uid = 0;
2804 new->init_gid = 0;
2805 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2806 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
2807
2808 return new;
2809 }
2810
2811 int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2812 size_t buf_size)
2813 {
2814 int fd, ret;
2815 char path[PATH_MAX];
2816
2817 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
2818 size_t buflen;
2819
2820 ret = snprintf(path, PATH_MAX, "/proc/%d/setgroups", pid);
2821 if (ret < 0 || ret >= PATH_MAX)
2822 return -E2BIG;
2823
2824 fd = open(path, O_WRONLY);
2825 if (fd < 0 && errno != ENOENT) {
2826 SYSERROR("Failed to open \"%s\"", path);
2827 return -1;
2828 }
2829
2830 if (fd >= 0) {
2831 buflen = STRLITERALLEN("deny\n");
2832 errno = 0;
2833 ret = lxc_write_nointr(fd, "deny\n", buflen);
2834 close(fd);
2835 if (ret != buflen) {
2836 SYSERROR("Failed to write \"deny\" to "
2837 "\"/proc/%d/setgroups\"", pid);
2838 return -1;
2839 }
2840 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
2841 }
2842 }
2843
2844 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid,
2845 idtype == ID_TYPE_UID ? 'u' : 'g');
2846 if (ret < 0 || ret >= PATH_MAX)
2847 return -E2BIG;
2848
2849 fd = open(path, O_WRONLY);
2850 if (fd < 0) {
2851 SYSERROR("Failed to open \"%s\"", path);
2852 return -1;
2853 }
2854
2855 errno = 0;
2856 ret = lxc_write_nointr(fd, buf, buf_size);
2857 close(fd);
2858 if (ret != buf_size) {
2859 SYSERROR("Failed to write %cid mapping to \"%s\"",
2860 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2861 return -1;
2862 }
2863
2864 return 0;
2865 }
2866
2867 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2868 *
2869 * @return 1 if functional binary was found
2870 * @return 0 if binary exists but is lacking privilege
2871 * @return -ENOENT if binary does not exist
2872 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2873 */
2874 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2875 {
2876 char *path;
2877 int ret;
2878 struct stat st;
2879 int fret = 0;
2880
2881 if (cap != CAP_SETUID && cap != CAP_SETGID)
2882 return -EINVAL;
2883
2884 path = on_path(binary, NULL);
2885 if (!path)
2886 return -ENOENT;
2887
2888 ret = stat(path, &st);
2889 if (ret < 0) {
2890 fret = -errno;
2891 goto cleanup;
2892 }
2893
2894 /* Check if the binary is setuid. */
2895 if (st.st_mode & S_ISUID) {
2896 DEBUG("The binary \"%s\" does have the setuid bit set", path);
2897 fret = 1;
2898 goto cleanup;
2899 }
2900
2901 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
2902 /* Check if it has the CAP_SETUID capability. */
2903 if ((cap & CAP_SETUID) &&
2904 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2905 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2906 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2907 "and CAP_PERMITTED sets", path);
2908 fret = 1;
2909 goto cleanup;
2910 }
2911
2912 /* Check if it has the CAP_SETGID capability. */
2913 if ((cap & CAP_SETGID) &&
2914 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2915 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2916 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2917 "and CAP_PERMITTED sets", path);
2918 fret = 1;
2919 goto cleanup;
2920 }
2921 #else
2922 /* If we cannot check for file capabilities we need to give the benefit
2923 * of the doubt. Otherwise we might fail even though all the necessary
2924 * file capabilities are set.
2925 */
2926 DEBUG("Cannot check for file capabilities as full capability support is "
2927 "missing. Manual intervention needed");
2928 fret = 1;
2929 #endif
2930
2931 cleanup:
2932 free(path);
2933 return fret;
2934 }
2935
/* Replace the current process with "/bin/sh -c <args>".
 *
 * @args is the command line as a NUL-terminated string. Only returns (with
 * -1) if the exec failed.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *command = args;

	execl("/bin/sh", "sh", "-c", command, (char *)NULL);

	return -1;
}
2941
2942 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2943 {
2944 int fill, left;
2945 char u_or_g;
2946 char *pos;
2947 char cmd_output[PATH_MAX];
2948 struct id_map *map;
2949 struct lxc_list *iterator;
2950 enum idtype type;
2951 /* strlen("new@idmap") = 9
2952 * +
2953 * strlen(" ") = 1
2954 * +
2955 * INTTYPE_TO_STRLEN(uint32_t)
2956 * +
2957 * strlen(" ") = 1
2958 *
2959 * We add some additional space to make sure that we really have
2960 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2961 */
2962 int ret = 0, gidmap = 0, uidmap = 0;
2963 char mapbuf[9 + 1 + INTTYPE_TO_STRLEN(uint32_t) + 1 + LXC_IDMAPLEN] = {0};
2964 bool had_entry = false, use_shadow = false;
2965 int hostuid, hostgid;
2966
2967 hostuid = geteuid();
2968 hostgid = getegid();
2969
2970 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2971 * ranges, then insist that root also reserve ranges in subuid. This
2972 * will protected it by preventing another user from being handed the
2973 * range by shadow.
2974 */
2975 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
2976 if (uidmap == -ENOENT)
2977 WARN("newuidmap binary is missing");
2978 else if (!uidmap)
2979 WARN("newuidmap is lacking necessary privileges");
2980
2981 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
2982 if (gidmap == -ENOENT)
2983 WARN("newgidmap binary is missing");
2984 else if (!gidmap)
2985 WARN("newgidmap is lacking necessary privileges");
2986
2987 if (uidmap > 0 && gidmap > 0) {
2988 DEBUG("Functional newuidmap and newgidmap binary found");
2989 use_shadow = true;
2990 } else {
2991 /* In case unprivileged users run application containers via
2992 * execute() or a start*() there are valid cases where they may
2993 * only want to map their own {g,u}id. Let's not block them from
2994 * doing so by requiring geteuid() == 0.
2995 */
2996 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2997 "write directly with euid %d", hostuid);
2998 }
2999
3000 /* Check if we really need to use newuidmap and newgidmap.
3001 * If the user is only remapping his own {g,u}id, we don't need it.
3002 */
3003 if (use_shadow && lxc_list_len(idmap) == 2) {
3004 use_shadow = false;
3005 lxc_list_for_each(iterator, idmap) {
3006 map = iterator->elem;
3007 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
3008 map->nsid == hostuid && map->hostid == hostuid)
3009 continue;
3010 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
3011 map->nsid == hostgid && map->hostid == hostgid)
3012 continue;
3013 use_shadow = true;
3014 break;
3015 }
3016 }
3017
3018 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3019 type++, u_or_g = 'g') {
3020 pos = mapbuf;
3021
3022 if (use_shadow)
3023 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3024
3025 lxc_list_for_each(iterator, idmap) {
3026 map = iterator->elem;
3027 if (map->idtype != type)
3028 continue;
3029
3030 had_entry = true;
3031
3032 left = LXC_IDMAPLEN - (pos - mapbuf);
3033 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3034 use_shadow ? " " : "", map->nsid,
3035 map->hostid, map->range,
3036 use_shadow ? "" : "\n");
3037 if (fill <= 0 || fill >= left) {
3038 /* The kernel only takes <= 4k for writes to
3039 * /proc/<pid>/{g,u}id_map
3040 */
3041 SYSERROR("Too many %cid mappings defined", u_or_g);
3042 return -1;
3043 }
3044
3045 pos += fill;
3046 }
3047 if (!had_entry)
3048 continue;
3049
3050 /* Try to catch the output of new{g,u}idmap to make debugging
3051 * easier.
3052 */
3053 if (use_shadow) {
3054 ret = run_command(cmd_output, sizeof(cmd_output),
3055 lxc_map_ids_exec_wrapper,
3056 (void *)mapbuf);
3057 if (ret < 0) {
3058 ERROR("new%cidmap failed to write mapping \"%s\": %s",
3059 u_or_g, cmd_output, mapbuf);
3060 return -1;
3061 }
3062 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
3063 } else {
3064 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3065 if (ret < 0) {
3066 ERROR("Failed to write mapping: %s", mapbuf);
3067 return -1;
3068 }
3069 TRACE("Wrote mapping \"%s\"", mapbuf);
3070 }
3071
3072 memset(mapbuf, 0, sizeof(mapbuf));
3073 }
3074
3075 return 0;
3076 }
3077
3078 /* Return the host uid/gid to which the container root is mapped in val.
3079 * Return true if id was found, false otherwise.
3080 */
3081 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3082 unsigned long *val)
3083 {
3084 unsigned nsid;
3085 struct id_map *map;
3086 struct lxc_list *it;
3087
3088 if (idtype == ID_TYPE_UID)
3089 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
3090 else
3091 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
3092
3093 lxc_list_for_each (it, &conf->id_map) {
3094 map = it->elem;
3095 if (map->idtype != idtype)
3096 continue;
3097 if (map->nsid != nsid)
3098 continue;
3099 *val = map->hostid;
3100 return true;
3101 }
3102
3103 return false;
3104 }
3105
3106 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3107 {
3108 struct id_map *map;
3109 struct lxc_list *it;
3110
3111 lxc_list_for_each (it, &conf->id_map) {
3112 map = it->elem;
3113 if (map->idtype != idtype)
3114 continue;
3115
3116 if (id >= map->hostid && id < map->hostid + map->range)
3117 return (id - map->hostid) + map->nsid;
3118 }
3119
3120 return -1;
3121 }
3122
3123 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3124 {
3125 struct id_map *map;
3126 struct lxc_list *it;
3127 unsigned int freeid = 0;
3128
3129 again:
3130 lxc_list_for_each (it, &conf->id_map) {
3131 map = it->elem;
3132 if (map->idtype != idtype)
3133 continue;
3134
3135 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3136 freeid = map->nsid + map->range;
3137 goto again;
3138 }
3139 }
3140
3141 return freeid;
3142 }
3143
/* Replace the current process with lxc-usernsexec.
 *
 * @args is the NULL-terminated argument vector to pass. Only returns (with
 * -1) if the exec failed.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
3149
3150 /* chown_mapped_root: for an unprivileged user with uid/gid X to
3151 * chown a dir to subuid/subgid Y, he needs to run chown as root
3152 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3153 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3154 * root is privileged with respect to hostuid/hostgid X, allowing
3155 * him to do the chown.
3156 */
3157 int chown_mapped_root(const char *path, struct lxc_conf *conf)
3158 {
3159 uid_t rootuid, rootgid;
3160 unsigned long val;
3161 int hostuid, hostgid, ret;
3162 struct stat sb;
3163 char map1[100], map2[100], map3[100], map4[100], map5[100];
3164 char ugid[100];
3165 const char *args1[] = {"lxc-usernsexec",
3166 "-m", map1,
3167 "-m", map2,
3168 "-m", map3,
3169 "-m", map5,
3170 "--", "chown", ugid, path,
3171 NULL};
3172 const char *args2[] = {"lxc-usernsexec",
3173 "-m", map1,
3174 "-m", map2,
3175 "-m", map3,
3176 "-m", map4,
3177 "-m", map5,
3178 "--", "chown", ugid, path,
3179 NULL};
3180 char cmd_output[PATH_MAX];
3181
3182 hostuid = geteuid();
3183 hostgid = getegid();
3184
3185 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3186 ERROR("No uid mapping for container root");
3187 return -1;
3188 }
3189 rootuid = (uid_t)val;
3190
3191 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3192 ERROR("No gid mapping for container root");
3193 return -1;
3194 }
3195 rootgid = (gid_t)val;
3196
3197 if (hostuid == 0) {
3198 if (chown(path, rootuid, rootgid) < 0) {
3199 ERROR("Error chowning %s", path);
3200 return -1;
3201 }
3202
3203 return 0;
3204 }
3205
3206 if (rootuid == hostuid) {
3207 /* nothing to do */
3208 INFO("Container root is our uid; no need to chown");
3209 return 0;
3210 }
3211
3212 /* save the current gid of "path" */
3213 if (stat(path, &sb) < 0) {
3214 ERROR("Error stat %s", path);
3215 return -1;
3216 }
3217
3218 /* Update the path argument in case this was overlayfs. */
3219 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3220 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3221
3222 /*
3223 * A file has to be group-owned by a gid mapped into the
3224 * container, or the container won't be privileged over it.
3225 */
3226 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3227 if (sb.st_uid == hostuid &&
3228 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3229 chown(path, -1, hostgid) < 0) {
3230 ERROR("Failed chgrping %s", path);
3231 return -1;
3232 }
3233
3234 /* "u:0:rootuid:1" */
3235 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3236 if (ret < 0 || ret >= 100) {
3237 ERROR("Error uid printing map string");
3238 return -1;
3239 }
3240
3241 /* "u:hostuid:hostuid:1" */
3242 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3243 if (ret < 0 || ret >= 100) {
3244 ERROR("Error uid printing map string");
3245 return -1;
3246 }
3247
3248 /* "g:0:rootgid:1" */
3249 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3250 if (ret < 0 || ret >= 100) {
3251 ERROR("Error gid printing map string");
3252 return -1;
3253 }
3254
3255 /* "g:pathgid:rootgid+pathgid:1" */
3256 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3257 rootgid + (gid_t)sb.st_gid);
3258 if (ret < 0 || ret >= 100) {
3259 ERROR("Error gid printing map string");
3260 return -1;
3261 }
3262
3263 /* "g:hostgid:hostgid:1" */
3264 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3265 if (ret < 0 || ret >= 100) {
3266 ERROR("Error gid printing map string");
3267 return -1;
3268 }
3269
3270 /* "0:pathgid" (chown) */
3271 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3272 if (ret < 0 || ret >= 100) {
3273 ERROR("Error owner printing format string for chown");
3274 return -1;
3275 }
3276
3277 if (hostgid == sb.st_gid)
3278 ret = run_command(cmd_output, sizeof(cmd_output),
3279 chown_mapped_root_exec_wrapper,
3280 (void *)args1);
3281 else
3282 ret = run_command(cmd_output, sizeof(cmd_output),
3283 chown_mapped_root_exec_wrapper,
3284 (void *)args2);
3285 if (ret < 0)
3286 ERROR("lxc-usernsexec failed: %s", cmd_output);
3287
3288 return ret;
3289 }
3290
3291 /* NOTE: Must not be called from inside the container namespace! */
3292 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
3293 {
3294 int mounted;
3295
3296 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
3297 if (mounted == -1) {
3298 SYSERROR("Failed to mount proc in the container");
3299 /* continue only if there is no rootfs */
3300 if (conf->rootfs.path)
3301 return -1;
3302 } else if (mounted == 1) {
3303 conf->tmp_umount_proc = true;
3304 }
3305
3306 return 0;
3307 }
3308
3309 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3310 {
3311 if (!lxc_conf->tmp_umount_proc)
3312 return;
3313
3314 (void)umount2("/proc", MNT_DETACH);
3315 lxc_conf->tmp_umount_proc = false;
3316 }
3317
3318 /* Walk /proc/mounts and change any shared entries to slave. */
3319 void remount_all_slave(void)
3320 {
3321 int memfd, mntinfo_fd, ret;
3322 ssize_t copied;
3323 FILE *f;
3324 size_t len = 0;
3325 char *line = NULL;
3326
3327 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
3328 if (mntinfo_fd < 0) {
3329 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
3330 return;
3331 }
3332
3333 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3334 if (memfd < 0) {
3335 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3336
3337 if (errno != ENOSYS) {
3338 SYSERROR("Failed to create temporary in-memory file");
3339 close(mntinfo_fd);
3340 return;
3341 }
3342
3343 memfd = lxc_make_tmpfile(template, true);
3344 if (memfd < 0) {
3345 close(mntinfo_fd);
3346 WARN("Failed to create temporary file");
3347 return;
3348 }
3349 }
3350
3351 again:
3352 copied = lxc_sendfile_nointr(memfd, mntinfo_fd, NULL, LXC_SENDFILE_MAX);
3353 if (copied < 0) {
3354 if (errno == EINTR)
3355 goto again;
3356
3357 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
3358 close(mntinfo_fd);
3359 close(memfd);
3360 return;
3361 }
3362 close(mntinfo_fd);
3363
3364 /* After a successful fdopen() memfd will be closed when calling
3365 * fclose(f). Calling close(memfd) afterwards is undefined.
3366 */
3367 ret = lseek(memfd, 0, SEEK_SET);
3368 if (ret < 0) {
3369 SYSERROR("Failed to reset file descriptor offset");
3370 close(memfd);
3371 return;
3372 }
3373
3374 f = fdopen(memfd, "r");
3375 if (!f) {
3376 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark "
3377 "all shared. Continuing");
3378 close(memfd);
3379 return;
3380 }
3381
3382 while (getline(&line, &len, f) != -1) {
3383 char *opts, *target;
3384
3385 target = get_field(line, 4);
3386 if (!target)
3387 continue;
3388
3389 opts = get_field(target, 2);
3390 if (!opts)
3391 continue;
3392
3393 null_endofword(opts);
3394 if (!strstr(opts, "shared"))
3395 continue;
3396
3397 null_endofword(target);
3398 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3399 if (ret < 0) {
3400 SYSERROR("Failed to make \"%s\" MS_SLAVE", target);
3401 ERROR("Continuing...");
3402 continue;
3403 }
3404 TRACE("Remounted \"%s\" as MS_SLAVE", target);
3405 }
3406 fclose(f);
3407 free(line);
3408 TRACE("Remounted all mount table entries as MS_SLAVE");
3409 }
3410
3411 static int lxc_execute_bind_init(struct lxc_handler *handler)
3412 {
3413 int ret;
3414 char *p;
3415 char path[PATH_MAX], destpath[PATH_MAX];
3416 struct lxc_conf *conf = handler->conf;
3417
3418 /* If init exists in the container, don't bind mount a static one */
3419 p = choose_init(conf->rootfs.mount);
3420 if (p) {
3421 char *old = p;
3422
3423 p = strdup(old + strlen(conf->rootfs.mount));
3424 free(old);
3425 if (!p)
3426 return -ENOMEM;
3427
3428 INFO("Found existing init at \"%s\"", p);
3429 goto out;
3430 }
3431
3432 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3433 if (ret < 0 || ret >= PATH_MAX)
3434 return -1;
3435
3436 if (!file_exists(path)) {
3437 ERROR("The file \"%s\" does not exist on host", path);
3438 return -1;
3439 }
3440
3441 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
3442 if (ret < 0 || ret >= PATH_MAX)
3443 return -1;
3444
3445 if (!file_exists(destpath)) {
3446 ret = mknod(destpath, S_IFREG | 0000, 0);
3447 if (ret < 0 && errno != EEXIST) {
3448 SYSERROR("Failed to create dummy \"%s\" file as bind mount target", destpath);
3449 return -1;
3450 }
3451 }
3452
3453 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3454 if (ret < 0) {
3455 SYSERROR("Failed to bind mount lxc.init.static into container");
3456 return -1;
3457 }
3458
3459 p = strdup(destpath + strlen(conf->rootfs.mount));
3460 if (!p)
3461 return -ENOMEM;
3462
3463 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
3464 out:
3465 ((struct execute_args *)handler->data)->init_fd = -1;
3466 ((struct execute_args *)handler->data)->init_path = p;
3467 return 0;
3468 }
3469
3470 /* This does the work of remounting / if it is shared, calling the container
3471 * pre-mount hooks, and mounting the rootfs.
3472 */
3473 int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
3474 const char *lxcpath)
3475 {
3476 int ret;
3477
3478 if (conf->rootfs_setup) {
3479 const char *path = conf->rootfs.mount;
3480
3481 /* The rootfs was set up in another namespace. bind-mount it to
3482 * give us a mount in our own ns so we can pivot_root to it
3483 */
3484 ret = mount(path, path, "rootfs", MS_BIND, NULL);
3485 if (ret < 0) {
3486 ERROR("Failed to bind mount container / onto itself");
3487 return -1;
3488 }
3489
3490 TRACE("Bind mounted container / onto itself");
3491 return 0;
3492 }
3493
3494 remount_all_slave();
3495
3496 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
3497 if (ret < 0) {
3498 ERROR("Failed to run pre-mount hooks");
3499 return -1;
3500 }
3501
3502 ret = lxc_mount_rootfs(conf);
3503 if (ret < 0) {
3504 ERROR("Failed to setup rootfs for");
3505 return -1;
3506 }
3507
3508 conf->rootfs_setup = true;
3509 return 0;
3510 }
3511
3512 static bool verify_start_hooks(struct lxc_conf *conf)
3513 {
3514 char path[PATH_MAX];
3515 struct lxc_list *it;
3516
3517 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
3518 int ret;
3519 char *hookname = it->elem;
3520
3521 ret = snprintf(path, PATH_MAX, "%s%s",
3522 conf->rootfs.path ? conf->rootfs.mount : "",
3523 hookname);
3524 if (ret < 0 || ret >= PATH_MAX)
3525 return false;
3526
3527 ret = access(path, X_OK);
3528 if (ret < 0) {
3529 SYSERROR("Start hook \"%s\" not found in container",
3530 hookname);
3531 return false;
3532 }
3533
3534 return true;
3535 }
3536
3537 return true;
3538 }
3539
3540 static bool execveat_supported(void)
3541 {
3542 lxc_raw_execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
3543 if (errno == ENOSYS)
3544 return false;
3545
3546 return true;
3547 }
3548
3549 int lxc_setup(struct lxc_handler *handler)
3550 {
3551 int ret;
3552 const char *lxcpath = handler->lxcpath, *name = handler->name;
3553 struct lxc_conf *lxc_conf = handler->conf;
3554
3555 ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
3556 if (ret < 0) {
3557 ERROR("Failed to setup rootfs");
3558 return -1;
3559 }
3560
3561 if (handler->nsfd[LXC_NS_UTS] == -1) {
3562 ret = setup_utsname(lxc_conf->utsname);
3563 if (ret < 0) {
3564 ERROR("Failed to setup the utsname %s", name);
3565 return -1;
3566 }
3567 }
3568
3569 ret = lxc_setup_keyring();
3570 if (ret < 0)
3571 return -1;
3572
3573 ret = lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network);
3574 if (ret < 0) {
3575 ERROR("Failed to setup network");
3576 return -1;
3577 }
3578
3579 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
3580 if (ret < 0) {
3581 ERROR("Failed to send network device names and ifindices to parent");
3582 return -1;
3583 }
3584
3585 if (lxc_conf->autodev > 0) {
3586 ret = mount_autodev(name, &lxc_conf->rootfs, lxcpath);
3587 if (ret < 0) {
3588 ERROR("Failed to mount \"/dev\"");
3589 return -1;
3590 }
3591 }
3592
3593 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3594 * need to wait until other stuff has finished.
3595 */
3596 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
3597 if (ret < 0) {
3598 ERROR("Failed to setup first automatic mounts");
3599 return -1;
3600 }
3601
3602 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
3603 if (ret < 0) {
3604 ERROR("Failed to setup mounts");
3605 return -1;
3606 }
3607
3608 if (lxc_conf->is_execute) {
3609 if (execveat_supported()) {
3610 int fd;
3611 char path[PATH_MAX];
3612
3613 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3614 if (ret < 0 || ret >= PATH_MAX) {
3615 ERROR("Path to init.lxc.static too long");
3616 return -1;
3617 }
3618
3619 fd = open(path, O_PATH | O_CLOEXEC);
3620 if (fd < 0) {
3621 SYSERROR("Unable to open lxc.init.static");
3622 return -1;
3623 }
3624
3625 ((struct execute_args *)handler->data)->init_fd = fd;
3626 ((struct execute_args *)handler->data)->init_path = NULL;
3627 } else {
3628 ret = lxc_execute_bind_init(handler);
3629 if (ret < 0) {
3630 ERROR("Failed to bind-mount the lxc init system");
3631 return -1;
3632 }
3633 }
3634 }
3635
3636 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3637 * mounted. It is guaranteed to be mounted now either through
3638 * automatically or via fstab entries.
3639 */
3640 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
3641 if (ret < 0) {
3642 ERROR("Failed to setup remaining automatic mounts");
3643 return -1;
3644 }
3645
3646 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
3647 if (ret < 0) {
3648 ERROR("Failed to run mount hooks");
3649 return -1;
3650 }
3651
3652 if (lxc_conf->autodev > 0) {
3653 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
3654 if (ret < 0) {
3655 ERROR("Failed to run autodev hooks");
3656 return -1;
3657 }
3658
3659 ret = lxc_fill_autodev(&lxc_conf->rootfs);
3660 if (ret < 0) {
3661 ERROR("Failed to populate \"/dev\"");
3662 return -1;
3663 }
3664 }
3665
3666 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3667 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3668 &lxc_conf->mount_list, name, lxcpath);
3669 if (ret < 0) {
3670 ERROR("Failed to setup mount entries");
3671 return -1;
3672 }
3673 }
3674
3675 /* Make sure any start hooks are in the container */
3676 if (!verify_start_hooks(lxc_conf)) {
3677 ERROR("Failed to verify start hooks");
3678 return -1;
3679 }
3680
3681 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
3682 lxc_conf->ttys.dir);
3683 if (ret < 0) {
3684 ERROR("Failed to setup console");
3685 return -1;
3686 }
3687
3688 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3689 if (ret < 0) {
3690 ERROR("Failed to setup \"/dev\" symlinks");
3691 return -1;
3692 }
3693
3694 ret = lxc_create_tmp_proc_mount(lxc_conf);
3695 if (ret < 0) {
3696 ERROR("Failed to \"/proc\" LSMs");
3697 return -1;
3698 }
3699
3700 ret = lxc_setup_rootfs_switch_root(&lxc_conf->rootfs);
3701 if (ret < 0) {
3702 ERROR("Failed to pivot root into rootfs");
3703 return -1;
3704 }
3705
3706 ret = lxc_setup_devpts(lxc_conf);
3707 if (ret < 0) {
3708 ERROR("Failed to setup new devpts instance");
3709 return -1;
3710 }
3711
3712 ret = lxc_create_ttys(handler);
3713 if (ret < 0)
3714 return -1;
3715
3716 ret = setup_personality(lxc_conf->personality);
3717 if (ret < 0) {
3718 ERROR("Failed to set personality");
3719 return -1;
3720 }
3721
3722 /* Set sysctl value to a path under /proc/sys as determined from the
3723 * key. For e.g. net.ipv4.ip_forward translated to
3724 * /proc/sys/net/ipv4/ip_forward.
3725 */
3726 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3727 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
3728 if (ret < 0) {
3729 ERROR("Failed to setup sysctl parameters");
3730 return -1;
3731 }
3732 }
3733
3734 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3735 if (!lxc_list_empty(&lxc_conf->caps)) {
3736 ERROR("Container requests lxc.cap.drop and "
3737 "lxc.cap.keep: either use lxc.cap.drop or "
3738 "lxc.cap.keep, not both");
3739 return -1;
3740 }
3741
3742 if (dropcaps_except(&lxc_conf->keepcaps)) {
3743 ERROR("Failed to keep capabilities");
3744 return -1;
3745 }
3746 } else if (setup_caps(&lxc_conf->caps)) {
3747 ERROR("Failed to drop capabilities");
3748 return -1;
3749 }
3750
3751 NOTICE("The container \"%s\" is set up", name);
3752
3753 return 0;
3754 }
3755
3756 int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
3757 char *argv[])
3758 {
3759 struct lxc_list *it;
3760 int which = -1;
3761
3762 if (strcmp(hookname, "pre-start") == 0)
3763 which = LXCHOOK_PRESTART;
3764 else if (strcmp(hookname, "start-host") == 0)
3765 which = LXCHOOK_START_HOST;
3766 else if (strcmp(hookname, "pre-mount") == 0)
3767 which = LXCHOOK_PREMOUNT;
3768 else if (strcmp(hookname, "mount") == 0)
3769 which = LXCHOOK_MOUNT;
3770 else if (strcmp(hookname, "autodev") == 0)
3771 which = LXCHOOK_AUTODEV;
3772 else if (strcmp(hookname, "start") == 0)
3773 which = LXCHOOK_START;
3774 else if (strcmp(hookname, "stop") == 0)
3775 which = LXCHOOK_STOP;
3776 else if (strcmp(hookname, "post-stop") == 0)
3777 which = LXCHOOK_POSTSTOP;
3778 else if (strcmp(hookname, "clone") == 0)
3779 which = LXCHOOK_CLONE;
3780 else if (strcmp(hookname, "destroy") == 0)
3781 which = LXCHOOK_DESTROY;
3782 else
3783 return -1;
3784
3785 lxc_list_for_each (it, &conf->hooks[which]) {
3786 int ret;
3787 char *hook = it->elem;
3788
3789 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
3790 hookname, argv);
3791 if (ret < 0)
3792 return -1;
3793 }
3794
3795 return 0;
3796 }
3797
3798 int lxc_clear_config_caps(struct lxc_conf *c)
3799 {
3800 struct lxc_list *it, *next;
3801
3802 lxc_list_for_each_safe (it, &c->caps, next) {
3803 lxc_list_del(it);
3804 free(it->elem);
3805 free(it);
3806 }
3807
3808 return 0;
3809 }
3810
3811 static int lxc_free_idmap(struct lxc_list *id_map)
3812 {
3813 struct lxc_list *it, *next;
3814
3815 lxc_list_for_each_safe (it, id_map, next) {
3816 lxc_list_del(it);
3817 free(it->elem);
3818 free(it);
3819 }
3820
3821 return 0;
3822 }
3823
3824 int lxc_clear_idmaps(struct lxc_conf *c)
3825 {
3826 return lxc_free_idmap(&c->id_map);
3827 }
3828
3829 int lxc_clear_config_keepcaps(struct lxc_conf *c)
3830 {
3831 struct lxc_list *it, *next;
3832
3833 lxc_list_for_each_safe (it, &c->keepcaps, next) {
3834 lxc_list_del(it);
3835 free(it->elem);
3836 free(it);
3837 }
3838
3839 return 0;
3840 }
3841
3842 int lxc_clear_namespace(struct lxc_conf *c)
3843 {
3844 int i;
3845 for (i = 0; i < LXC_NS_MAX; i++) {
3846 free(c->ns_share[i]);
3847 c->ns_share[i] = NULL;
3848 }
3849 return 0;
3850 }
3851
3852 int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
3853 {
3854 char *global_token, *namespaced_token;
3855 size_t namespaced_token_len;
3856 struct lxc_list *it, *next, *list;
3857 const char *k = key;
3858 bool all = false;
3859
3860 if (version == CGROUP2_SUPER_MAGIC) {
3861 global_token = "lxc.cgroup2";
3862 namespaced_token = "lxc.cgroup2.";
3863 namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
3864 list = &c->cgroup2;
3865 } else if (version == CGROUP_SUPER_MAGIC) {
3866 global_token = "lxc.cgroup";
3867 namespaced_token = "lxc.cgroup.";
3868 namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
3869 list = &c->cgroup;
3870 } else {
3871 return -EINVAL;
3872 }
3873
3874 if (strcmp(key, global_token) == 0)
3875 all = true;
3876 else if (strncmp(key, namespaced_token, namespaced_token_len) == 0)
3877 k += namespaced_token_len;
3878 else
3879 return -EINVAL;
3880
3881 lxc_list_for_each_safe (it, list, next) {
3882 struct lxc_cgroup *cg = it->elem;
3883
3884 if (!all && strcmp(cg->subsystem, k) != 0)
3885 continue;
3886
3887 lxc_list_del(it);
3888 free(cg->subsystem);
3889 free(cg->value);
3890 free(cg);
3891 free(it);
3892 }
3893
3894 return 0;
3895 }
3896
3897 int lxc_clear_limits(struct lxc_conf *c, const char *key)
3898 {
3899 struct lxc_list *it, *next;
3900 const char *k = NULL;
3901 bool all = false;
3902
3903 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
3904 all = true;
3905 else if (strncmp(key, "lxc.limit.", STRLITERALLEN("lxc.limit.")) == 0)
3906 k = key + STRLITERALLEN("lxc.limit.");
3907 else if (strncmp(key, "lxc.prlimit.", STRLITERALLEN("lxc.prlimit.")) == 0)
3908 k = key + STRLITERALLEN("lxc.prlimit.");
3909 else
3910 return -1;
3911
3912 lxc_list_for_each_safe (it, &c->limits, next) {
3913 struct lxc_limit *lim = it->elem;
3914
3915 if (!all && strcmp(lim->resource, k) != 0)
3916 continue;
3917
3918 lxc_list_del(it);
3919 free(lim->resource);
3920 free(lim);
3921 free(it);
3922 }
3923
3924 return 0;
3925 }
3926
3927 int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3928 {
3929 struct lxc_list *it, *next;
3930 const char *k = NULL;
3931 bool all = false;
3932
3933 if (strcmp(key, "lxc.sysctl") == 0)
3934 all = true;
3935 else if (strncmp(key, "lxc.sysctl.", STRLITERALLEN("lxc.sysctl.")) == 0)
3936 k = key + STRLITERALLEN("lxc.sysctl.");
3937 else
3938 return -1;
3939
3940 lxc_list_for_each_safe (it, &c->sysctls, next) {
3941 struct lxc_sysctl *elem = it->elem;
3942
3943 if (!all && strcmp(elem->key, k) != 0)
3944 continue;
3945
3946 lxc_list_del(it);
3947 free(elem->key);
3948 free(elem->value);
3949 free(elem);
3950 free(it);
3951 }
3952
3953 return 0;
3954 }
3955
3956 int lxc_clear_procs(struct lxc_conf *c, const char *key)
3957 {
3958 struct lxc_list *it, *next;
3959 const char *k = NULL;
3960 bool all = false;
3961
3962 if (strcmp(key, "lxc.proc") == 0)
3963 all = true;
3964 else if (strncmp(key, "lxc.proc.", STRLITERALLEN("lxc.proc.")) == 0)
3965 k = key + STRLITERALLEN("lxc.proc.");
3966 else
3967 return -1;
3968
3969 lxc_list_for_each_safe (it, &c->procs, next) {
3970 struct lxc_proc *proc = it->elem;
3971
3972 if (!all && strcmp(proc->filename, k) != 0)
3973 continue;
3974
3975 lxc_list_del(it);
3976 free(proc->filename);
3977 free(proc->value);
3978 free(proc);
3979 free(it);
3980 }
3981
3982 return 0;
3983 }
3984
3985 int lxc_clear_groups(struct lxc_conf *c)
3986 {
3987 struct lxc_list *it, *next;
3988
3989 lxc_list_for_each_safe (it, &c->groups, next) {
3990 lxc_list_del(it);
3991 free(it->elem);
3992 free(it);
3993 }
3994
3995 return 0;
3996 }
3997
3998 int lxc_clear_environment(struct lxc_conf *c)
3999 {
4000 struct lxc_list *it, *next;
4001
4002 lxc_list_for_each_safe (it, &c->environment, next) {
4003 lxc_list_del(it);
4004 free(it->elem);
4005 free(it);
4006 }
4007
4008 return 0;
4009 }
4010
4011 int lxc_clear_mount_entries(struct lxc_conf *c)
4012 {
4013 struct lxc_list *it, *next;
4014
4015 lxc_list_for_each_safe (it, &c->mount_list, next) {
4016 lxc_list_del(it);
4017 free(it->elem);
4018 free(it);
4019 }
4020
4021 return 0;
4022 }
4023
4024 int lxc_clear_automounts(struct lxc_conf *c)
4025 {
4026 c->auto_mounts = 0;
4027 return 0;
4028 }
4029
4030 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4031 {
4032 int i;
4033 struct lxc_list *it, *next;
4034 const char *k = NULL;
4035 bool all = false, done = false;
4036
4037 if (strcmp(key, "lxc.hook") == 0)
4038 all = true;
4039 else if (strncmp(key, "lxc.hook.", STRLITERALLEN("lxc.hook.")) == 0)
4040 k = key + STRLITERALLEN("lxc.hook.");
4041 else
4042 return -1;
4043
4044 for (i = 0; i < NUM_LXC_HOOKS; i++) {
4045 if (all || strcmp(k, lxchook_names[i]) == 0) {
4046 lxc_list_for_each_safe (it, &c->hooks[i], next) {
4047 lxc_list_del(it);
4048 free(it->elem);
4049 free(it);
4050 }
4051
4052 done = true;
4053 }
4054 }
4055
4056 if (!done) {
4057 ERROR("Invalid hook key: %s", key);
4058 return -1;
4059 }
4060
4061 return 0;
4062 }
4063
4064 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4065 {
4066 struct lxc_list *it, *next;
4067
4068 lxc_list_for_each_safe (it, &conf->aliens, next) {
4069 lxc_list_del(it);
4070 free(it->elem);
4071 free(it);
4072 }
4073 }
4074
4075 void lxc_clear_includes(struct lxc_conf *conf)
4076 {
4077 struct lxc_list *it, *next;
4078
4079 lxc_list_for_each_safe (it, &conf->includes, next) {
4080 lxc_list_del(it);
4081 free(it->elem);
4082 free(it);
4083 }
4084 }
4085
4086 int lxc_clear_apparmor_raw(struct lxc_conf *c)
4087 {
4088 struct lxc_list *it, *next;
4089
4090 lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
4091 lxc_list_del(it);
4092 free(it->elem);
4093 free(it);
4094 }
4095
4096 return 0;
4097 }
4098
/* Release a container configuration and everything it references.
 *
 * A NULL @conf is accepted and ignored. If the global current_config points
 * at @conf it is reset to NULL first. The function then frees the individual
 * string members, closes the log file descriptor (unless it is -1), calls the
 * various lxc_clear_*() / *_free() helpers for the list and sub-structure
 * members, and finally frees @conf itself.
 */
void lxc_conf_free(struct lxc_conf *conf)
{
	if (!conf)
		return;

	/* Do not leave the global pointer dangling. */
	if (current_config == conf)
		current_config = NULL;
	lxc_terminal_conf_free(&conf->console);
	free(conf->rootfs.mount);
	free(conf->rootfs.bdev_type);
	free(conf->rootfs.options);
	free(conf->rootfs.path);
	free(conf->logfile);
	/* -1 marks "no log fd". */
	if (conf->logfd != -1)
		close(conf->logfd);
	free(conf->utsname);
	free(conf->ttys.dir);
	free(conf->ttys.tty_names);
	free(conf->fstab);
	free(conf->rcfile);
	free(conf->execute_cmd);
	free(conf->init_cmd);
	free(conf->init_cwd);
	free(conf->unexpanded_config);
	free(conf->syslog);
	lxc_free_networks(&conf->network);
	free(conf->lsm_aa_profile);
	free(conf->lsm_aa_profile_computed);
	free(conf->lsm_se_context);
	lxc_seccomp_free(conf);
	lxc_clear_config_caps(conf);
	lxc_clear_config_keepcaps(conf);
	/* The bare keys make the keyed helpers drop every entry. */
	lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
	lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
	lxc_clear_hooks(conf, "lxc.hook");
	lxc_clear_mount_entries(conf);
	lxc_clear_idmaps(conf);
	lxc_clear_groups(conf);
	lxc_clear_includes(conf);
	lxc_clear_aliens(conf);
	lxc_clear_environment(conf);
	lxc_clear_limits(conf, "lxc.prlimit");
	lxc_clear_sysctls(conf, "lxc.sysctl");
	lxc_clear_procs(conf, "lxc.proc");
	lxc_clear_apparmor_raw(conf);
	lxc_clear_namespace(conf);
	free(conf->cgroup_meta.dir);
	free(conf->cgroup_meta.controllers);
	free(conf->shmount.path_host);
	free(conf->shmount.path_cont);
	free(conf);
}
4151
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Function the child calls once it has been told to proceed. */
	int (*fn)(void *);
	/* Optional name of @fn, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Argument passed to @fn. */
	void *arg;
	/* Synchronization pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
4158
4159 static int run_userns_fn(void *data)
4160 {
4161 int ret;
4162 char c;
4163 struct userns_fn_data *d = data;
4164
4165 /* Close write end of the pipe. */
4166 close(d->p[1]);
4167
4168 /* Wait for parent to finish establishing a new mapping in the user
4169 * namespace we are executing in.
4170 */
4171 ret = lxc_read_nointr(d->p[0], &c, 1);
4172 /* Close read end of the pipe. */
4173 close(d->p[0]);
4174 if (ret != 1)
4175 return -1;
4176
4177 if (d->fn_name)
4178 TRACE("Calling function \"%s\"", d->fn_name);
4179
4180 /* Call function to run. */
4181 return d->fn(d->arg);
4182 }
4183
4184 static struct id_map *mapped_nsid_add(struct lxc_conf *conf, unsigned id,
4185 enum idtype idtype)
4186 {
4187 const struct id_map *map;
4188 struct id_map *retmap;
4189
4190 map = find_mapped_nsid_entry(conf, id, idtype);
4191 if (!map)
4192 return NULL;
4193
4194 retmap = malloc(sizeof(*retmap));
4195 if (!retmap)
4196 return NULL;
4197
4198 memcpy(retmap, map, sizeof(*retmap));
4199 return retmap;
4200 }
4201
4202 static struct id_map *find_mapped_hostid_entry(struct lxc_conf *conf,
4203 unsigned id, enum idtype idtype)
4204 {
4205 struct id_map *map;
4206 struct lxc_list *it;
4207 struct id_map *retmap = NULL;
4208
4209 lxc_list_for_each (it, &conf->id_map) {
4210 map = it->elem;
4211 if (map->idtype != idtype)
4212 continue;
4213
4214 if (id >= map->hostid && id < map->hostid + map->range) {
4215 retmap = map;
4216 break;
4217 }
4218 }
4219
4220 return retmap;
4221 }
4222
4223 /* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4224 * existing one or establish a new one.
4225 */
4226 static struct id_map *mapped_hostid_add(struct lxc_conf *conf, uid_t id,
4227 enum idtype type)
4228 {
4229 int hostid_mapped;
4230 struct id_map *entry = NULL, *tmp = NULL;
4231
4232 entry = malloc(sizeof(*entry));
4233 if (!entry)
4234 return NULL;
4235
4236 /* Reuse existing mapping. */
4237 tmp = find_mapped_hostid_entry(conf, id, type);
4238 if (tmp)
4239 return memcpy(entry, tmp, sizeof(*entry));
4240
4241 /* Find new mapping. */
4242 hostid_mapped = find_unmapped_nsid(conf, type);
4243 if (hostid_mapped < 0) {
4244 DEBUG("Failed to find free mapping for id %d", id);
4245 free(entry);
4246 return NULL;
4247 }
4248
4249 entry->idtype = type;
4250 entry->nsid = hostid_mapped;
4251 entry->hostid = (unsigned long)id;
4252 entry->range = 1;
4253
4254 return entry;
4255 }
4256
4257 struct lxc_list *get_minimal_idmap(struct lxc_conf *conf)
4258 {
4259 uid_t euid, egid;
4260 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4261 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
4262 struct lxc_list *idmap = NULL, *tmplist = NULL;
4263 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4264 *host_uid_map = NULL, *host_gid_map = NULL;
4265
4266 /* Find container root mappings. */
4267 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
4268 if (!container_root_uid) {
4269 DEBUG("Failed to find mapping for namespace uid %d", 0);
4270 goto on_error;
4271 }
4272 euid = geteuid();
4273 if (euid >= container_root_uid->hostid &&
4274 euid < (container_root_uid->hostid + container_root_uid->range))
4275 host_uid_map = container_root_uid;
4276
4277 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
4278 if (!container_root_gid) {
4279 DEBUG("Failed to find mapping for namespace gid %d", 0);
4280 goto on_error;
4281 }
4282 egid = getegid();
4283 if (egid >= container_root_gid->hostid &&
4284 egid < (container_root_gid->hostid + container_root_gid->range))
4285 host_gid_map = container_root_gid;
4286
4287 /* Check whether the {g,u}id of the user has a mapping. */
4288 if (!host_uid_map)
4289 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
4290 if (!host_uid_map) {
4291 DEBUG("Failed to find mapping for uid %d", euid);
4292 goto on_error;
4293 }
4294
4295 if (!host_gid_map)
4296 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
4297 if (!host_gid_map) {
4298 DEBUG("Failed to find mapping for gid %d", egid);
4299 goto on_error;
4300 }
4301
4302 /* Allocate new {g,u}id map list. */
4303 idmap = malloc(sizeof(*idmap));
4304 if (!idmap)
4305 goto on_error;
4306 lxc_list_init(idmap);
4307
4308 /* Add container root to the map. */
4309 tmplist = malloc(sizeof(*tmplist));
4310 if (!tmplist)
4311 goto on_error;
4312 lxc_list_add_elem(tmplist, container_root_uid);
4313 lxc_list_add_tail(idmap, tmplist);
4314
4315 if (host_uid_map && (host_uid_map != container_root_uid)) {
4316 /* idmap will now keep track of that memory. */
4317 container_root_uid = NULL;
4318
4319 /* Add container root to the map. */
4320 tmplist = malloc(sizeof(*tmplist));
4321 if (!tmplist)
4322 goto on_error;
4323 lxc_list_add_elem(tmplist, host_uid_map);
4324 lxc_list_add_tail(idmap, tmplist);
4325 }
4326 /* idmap will now keep track of that memory. */
4327 container_root_uid = NULL;
4328 /* idmap will now keep track of that memory. */
4329 host_uid_map = NULL;
4330
4331 tmplist = malloc(sizeof(*tmplist));
4332 if (!tmplist)
4333 goto on_error;
4334 lxc_list_add_elem(tmplist, container_root_gid);
4335 lxc_list_add_tail(idmap, tmplist);
4336
4337 if (host_gid_map && (host_gid_map != container_root_gid)) {
4338 /* idmap will now keep track of that memory. */
4339 container_root_gid = NULL;
4340
4341 tmplist = malloc(sizeof(*tmplist));
4342 if (!tmplist)
4343 goto on_error;
4344 lxc_list_add_elem(tmplist, host_gid_map);
4345 lxc_list_add_tail(idmap, tmplist);
4346 }
4347 /* idmap will now keep track of that memory. */
4348 container_root_gid = NULL;
4349 /* idmap will now keep track of that memory. */
4350 host_gid_map = NULL;
4351
4352 TRACE("Allocated minimal idmapping");
4353 return idmap;
4354
4355 on_error:
4356 if (idmap) {
4357 lxc_free_idmap(idmap);
4358 free(idmap);
4359 }
4360 if (container_root_uid)
4361 free(container_root_uid);
4362 if (container_root_gid)
4363 free(container_root_gid);
4364 if (host_uid_map && (host_uid_map != container_root_uid))
4365 free(host_uid_map);
4366 if (host_gid_map && (host_gid_map != container_root_gid))
4367 free(host_gid_map);
4368
4369 return NULL;
4370 }
4371
4372 /* Run a function in a new user namespace.
4373 * The caller's euid/egid will be mapped if it is not already.
4374 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4375 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4376 * This means we require only to establish a mapping from:
4377 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4378 * - the container root -> some sub{g,u}id
4379 * The former we add, if the user did not specify a mapping. The latter we
4380 * retrieve from the container's configured {g,u}id mappings as it must have been
4381 * there to start the container in the first place.
4382 */
4383 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4384 const char *fn_name)
4385 {
4386 pid_t pid;
4387 int p[2];
4388 struct userns_fn_data d;
4389 struct lxc_list *idmap;
4390 int ret = -1, status = -1;
4391 char c = '1';
4392
4393 if (!conf)
4394 return -EINVAL;
4395
4396 idmap = get_minimal_idmap(conf);
4397 if (!idmap)
4398 return -1;
4399
4400 ret = pipe2(p, O_CLOEXEC);
4401 if (ret < 0) {
4402 SYSERROR("Failed to create pipe");
4403 return -1;
4404 }
4405 d.fn = fn;
4406 d.fn_name = fn_name;
4407 d.arg = data;
4408 d.p[0] = p[0];
4409 d.p[1] = p[1];
4410
4411 /* Clone child in new user namespace. */
4412 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER);
4413 if (pid < 0) {
4414 ERROR("Failed to clone process in new user namespace");
4415 goto on_error;
4416 }
4417
4418 close(p[0]);
4419 p[0] = -1;
4420
4421 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4422 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4423 struct id_map *map;
4424 struct lxc_list *it;
4425
4426 lxc_list_for_each (it, idmap) {
4427 map = it->elem;
4428 TRACE("Establishing %cid mapping for \"%d\" in new "
4429 "user namespace: nsuid %lu - hostid %lu - range "
4430 "%lu",
4431 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4432 map->nsid, map->hostid, map->range);
4433 }
4434 }
4435
4436 /* Set up {g,u}id mapping for user namespace of child process. */
4437 ret = lxc_map_ids(idmap, pid);
4438 if (ret < 0) {
4439 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4440 goto on_error;
4441 }
4442
4443 /* Tell child to proceed. */
4444 if (lxc_write_nointr(p[1], &c, 1) != 1) {
4445 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4446 goto on_error;
4447 }
4448
4449 on_error:
4450 if (p[0] != -1)
4451 close(p[0]);
4452 close(p[1]);
4453
4454 /* Wait for child to finish. */
4455 if (pid > 0)
4456 status = wait_for_pid(pid);
4457
4458 if (status < 0)
4459 ret = -1;
4460
4461 return ret;
4462 }
4463
4464 int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4465 const char *fn_name)
4466 {
4467 pid_t pid;
4468 uid_t euid, egid;
4469 int p[2];
4470 struct id_map *map;
4471 struct lxc_list *cur;
4472 struct userns_fn_data d;
4473 int ret = -1;
4474 char c = '1';
4475 struct lxc_list *idmap = NULL, *tmplist = NULL;
4476 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4477 *host_uid_map = NULL, *host_gid_map = NULL;
4478
4479 if (!conf)
4480 return -EINVAL;
4481
4482 ret = pipe2(p, O_CLOEXEC);
4483 if (ret < 0) {
4484 SYSERROR("opening pipe");
4485 return -1;
4486 }
4487 d.fn = fn;
4488 d.fn_name = fn_name;
4489 d.arg = data;
4490 d.p[0] = p[0];
4491 d.p[1] = p[1];
4492
4493 /* Clone child in new user namespace. */
4494 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4495 if (pid < 0) {
4496 ERROR("Failed to clone process in new user namespace");
4497 goto on_error;
4498 }
4499
4500 close(p[0]);
4501 p[0] = -1;
4502
4503 euid = geteuid();
4504 egid = getegid();
4505
4506 /* Allocate new {g,u}id map list. */
4507 idmap = malloc(sizeof(*idmap));
4508 if (!idmap)
4509 goto on_error;
4510 lxc_list_init(idmap);
4511
4512 /* Find container root. */
4513 lxc_list_for_each (cur, &conf->id_map) {
4514 struct id_map *tmpmap;
4515
4516 tmplist = malloc(sizeof(*tmplist));
4517 if (!tmplist)
4518 goto on_error;
4519
4520 tmpmap = malloc(sizeof(*tmpmap));
4521 if (!tmpmap) {
4522 free(tmplist);
4523 goto on_error;
4524 }
4525
4526 memset(tmpmap, 0, sizeof(*tmpmap));
4527 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4528 tmplist->elem = tmpmap;
4529
4530 lxc_list_add_tail(idmap, tmplist);
4531
4532 map = cur->elem;
4533
4534 if (map->idtype == ID_TYPE_UID)
4535 if (euid >= map->hostid && euid < map->hostid + map->range)
4536 host_uid_map = map;
4537
4538 if (map->idtype == ID_TYPE_GID)
4539 if (egid >= map->hostid && egid < map->hostid + map->range)
4540 host_gid_map = map;
4541
4542 if (map->nsid != 0)
4543 continue;
4544
4545 if (map->idtype == ID_TYPE_UID)
4546 if (container_root_uid == NULL)
4547 container_root_uid = map;
4548
4549 if (map->idtype == ID_TYPE_GID)
4550 if (container_root_gid == NULL)
4551 container_root_gid = map;
4552 }
4553
4554 if (!container_root_uid || !container_root_gid) {
4555 ERROR("No mapping for container root found");
4556 goto on_error;
4557 }
4558
4559 /* Check whether the {g,u}id of the user has a mapping. */
4560 if (!host_uid_map)
4561 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
4562 else
4563 host_uid_map = container_root_uid;
4564
4565 if (!host_gid_map)
4566 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
4567 else
4568 host_gid_map = container_root_gid;
4569
4570 if (!host_uid_map) {
4571 DEBUG("Failed to find mapping for uid %d", euid);
4572 goto on_error;
4573 }
4574
4575 if (!host_gid_map) {
4576 DEBUG("Failed to find mapping for gid %d", egid);
4577 goto on_error;
4578 }
4579
4580 if (host_uid_map && (host_uid_map != container_root_uid)) {
4581 /* Add container root to the map. */
4582 tmplist = malloc(sizeof(*tmplist));
4583 if (!tmplist)
4584 goto on_error;
4585 lxc_list_add_elem(tmplist, host_uid_map);
4586 lxc_list_add_tail(idmap, tmplist);
4587 }
4588 /* idmap will now keep track of that memory. */
4589 host_uid_map = NULL;
4590
4591 if (host_gid_map && (host_gid_map != container_root_gid)) {
4592 tmplist = malloc(sizeof(*tmplist));
4593 if (!tmplist)
4594 goto on_error;
4595 lxc_list_add_elem(tmplist, host_gid_map);
4596 lxc_list_add_tail(idmap, tmplist);
4597 }
4598 /* idmap will now keep track of that memory. */
4599 host_gid_map = NULL;
4600
4601 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4602 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4603 lxc_list_for_each (cur, idmap) {
4604 map = cur->elem;
4605 TRACE("establishing %cid mapping for \"%d\" in new "
4606 "user namespace: nsuid %lu - hostid %lu - range "
4607 "%lu",
4608 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4609 map->nsid, map->hostid, map->range);
4610 }
4611 }
4612
4613 /* Set up {g,u}id mapping for user namespace of child process. */
4614 ret = lxc_map_ids(idmap, pid);
4615 if (ret < 0) {
4616 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
4617 goto on_error;
4618 }
4619
4620 /* Tell child to proceed. */
4621 if (lxc_write_nointr(p[1], &c, 1) != 1) {
4622 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4623 goto on_error;
4624 }
4625
4626 on_error:
4627 if (p[0] != -1)
4628 close(p[0]);
4629 close(p[1]);
4630
4631 /* Wait for child to finish. */
4632 if (pid > 0)
4633 ret = wait_for_pid(pid);
4634
4635 if (idmap) {
4636 lxc_free_idmap(idmap);
4637 free(idmap);
4638 }
4639
4640 if (host_uid_map && (host_uid_map != container_root_uid))
4641 free(host_uid_map);
4642 if (host_gid_map && (host_gid_map != container_root_gid))
4643 free(host_gid_map);
4644
4645 return ret;
4646 }
4647
/* Return a newly allocated copy of the user name belonging to the effective
 * uid, or NULL if the password record cannot be obtained or memory runs out.
 * The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd record;
	struct passwd *found = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long hint;
	int err;

	/* Fall back to a fixed size when the system gives no suggestion. */
	hint = sysconf(_SC_GETPW_R_SIZE_MAX);
	scratch_len = (hint == -1) ? 1024 : (size_t)hint;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getpwuid_r(geteuid(), &record, scratch, scratch_len, &found);
	if (found) {
		/* pw_name points into scratch, so copy before releasing it. */
		name = strdup(record.pw_name);
		free(scratch);
		return name;
	}

	if (err == 0)
		WARN("Could not find matched password record.");

	ERROR("Failed to get password record - %u", geteuid());
	free(scratch);
	return NULL;
}
4681
/* Return a newly allocated copy of the group name belonging to the effective
 * gid, or NULL if the group record cannot be obtained or memory runs out.
 * The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group record;
	struct group *found = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long hint;
	int err;

	/* Fall back to a fixed size when the system gives no suggestion. */
	hint = sysconf(_SC_GETGR_R_SIZE_MAX);
	scratch_len = (hint == -1) ? 1024 : (size_t)hint;

	scratch = malloc(scratch_len);
	if (!scratch)
		return NULL;

	err = getgrgid_r(getegid(), &record, scratch, scratch_len, &found);
	if (found) {
		/* gr_name points into scratch, so copy before releasing it. */
		name = strdup(record.gr_name);
		free(scratch);
		return name;
	}

	if (err == 0)
		WARN("Could not find matched group record");

	ERROR("Failed to get group record - %u", getegid());
	free(scratch);
	return NULL;
}
4715
4716 /* not thread-safe, do not use from api without first forking */
4717 void suggest_default_idmap(void)
4718 {
4719 char *uname, *gname;
4720 FILE *f;
4721 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4722 size_t len = 0;
4723 char *line = NULL;
4724
4725 uname = getuname();
4726 if (!uname)
4727 return;
4728
4729 gname = getgname();
4730 if (!gname) {
4731 free(uname);
4732 return;
4733 }
4734
4735 f = fopen(subuidfile, "r");
4736 if (!f) {
4737 ERROR("Your system is not configured with subuids");
4738 free(gname);
4739 free(uname);
4740 return;
4741 }
4742
4743 while (getline(&line, &len, f) != -1) {
4744 char *p, *p2;
4745 size_t no_newline = 0;
4746
4747 p = strchr(line, ':');
4748 if (*line == '#')
4749 continue;
4750 if (!p)
4751 continue;
4752 *p = '\0';
4753 p++;
4754
4755 if (strcmp(line, uname))
4756 continue;
4757
4758 p2 = strchr(p, ':');
4759 if (!p2)
4760 continue;
4761 *p2 = '\0';
4762 p2++;
4763 if (!*p2)
4764 continue;
4765 no_newline = strcspn(p2, "\n");
4766 p2[no_newline] = '\0';
4767
4768 if (lxc_safe_uint(p, &uid) < 0)
4769 WARN("Could not parse UID");
4770 if (lxc_safe_uint(p2, &urange) < 0)
4771 WARN("Could not parse UID range");
4772 }
4773 fclose(f);
4774
4775 f = fopen(subgidfile, "r");
4776 if (!f) {
4777 ERROR("Your system is not configured with subgids");
4778 free(gname);
4779 free(uname);
4780 return;
4781 }
4782
4783 while (getline(&line, &len, f) != -1) {
4784 char *p, *p2;
4785 size_t no_newline = 0;
4786
4787 p = strchr(line, ':');
4788 if (*line == '#')
4789 continue;
4790 if (!p)
4791 continue;
4792 *p = '\0';
4793 p++;
4794
4795 if (strcmp(line, uname))
4796 continue;
4797
4798 p2 = strchr(p, ':');
4799 if (!p2)
4800 continue;
4801 *p2 = '\0';
4802 p2++;
4803 if (!*p2)
4804 continue;
4805 no_newline = strcspn(p2, "\n");
4806 p2[no_newline] = '\0';
4807
4808 if (lxc_safe_uint(p, &gid) < 0)
4809 WARN("Could not parse GID");
4810 if (lxc_safe_uint(p2, &grange) < 0)
4811 WARN("Could not parse GID range");
4812 }
4813 fclose(f);
4814
4815 free(line);
4816
4817 if (!urange || !grange) {
4818 ERROR("You do not have subuids or subgids allocated");
4819 ERROR("Unprivileged containers require subuids and subgids");
4820 free(uname);
4821 free(gname);
4822 return;
4823 }
4824
4825 ERROR("You must either run as root, or define uid mappings");
4826 ERROR("To pass uid mappings to lxc-create, you could create");
4827 ERROR("~/.config/lxc/default.conf:");
4828 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4829 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4830 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
4831
4832 free(gname);
4833 free(uname);
4834 }
4835
4836 static void free_cgroup_settings(struct lxc_list *result)
4837 {
4838 struct lxc_list *iterator, *next;
4839
4840 lxc_list_for_each_safe (iterator, result, next) {
4841 lxc_list_del(iterator);
4842 free(iterator);
4843 }
4844 free(result);
4845 }
4846
4847 /* Return the list of cgroup_settings sorted according to the following rules
4848 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4849 */
4850 struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
4851 {
4852 struct lxc_list *result;
4853 struct lxc_cgroup *cg = NULL;
4854 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
4855
4856 result = malloc(sizeof(*result));
4857 if (!result)
4858 return NULL;
4859 lxc_list_init(result);
4860
4861 /* Iterate over the cgroup settings and copy them to the output list. */
4862 lxc_list_for_each (it, cgroup_settings) {
4863 item = malloc(sizeof(*item));
4864 if (!item) {
4865 free_cgroup_settings(result);
4866 return NULL;
4867 }
4868
4869 item->elem = it->elem;
4870 cg = it->elem;
4871 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4872 /* Store the memsw_limit location */
4873 memsw_limit = item;
4874 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4875 memsw_limit != NULL) {
4876 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4877 * before lxc.cgroup.memory.limit_in_bytes, swap these
4878 * two items */
4879 item->elem = memsw_limit->elem;
4880 memsw_limit->elem = it->elem;
4881 }
4882 lxc_list_add_tail(result, item);
4883 }
4884
4885 return result;
4886 }