]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
Merge pull request #3636 from brauner/2021-01-27/fixes
[mirror_lxc.git] / src / lxc / conf.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #include <arpa/inet.h>
7 #include <dirent.h>
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <grp.h>
11 #include <inttypes.h>
12 #include <libgen.h>
13 #include <linux/loop.h>
14 #include <net/if.h>
15 #include <netinet/in.h>
16 #include <pwd.h>
17 #include <stdarg.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <sys/mman.h>
22 #include <sys/mount.h>
23 #include <sys/param.h>
24 #include <sys/prctl.h>
25 #include <sys/sendfile.h>
26 #include <sys/socket.h>
27 #include <sys/stat.h>
28 #include <sys/syscall.h>
29 #include <sys/sysmacros.h>
30 #include <sys/types.h>
31 #include <sys/utsname.h>
32 #include <sys/wait.h>
33 #include <time.h>
34 #include <unistd.h>
35
36 #include "af_unix.h"
37 #include "caps.h"
38 #include "cgroups/cgroup.h"
39 #include "conf.h"
40 #include "config.h"
41 #include "confile.h"
42 #include "confile_utils.h"
43 #include "error.h"
44 #include "log.h"
45 #include "lsm/lsm.h"
46 #include "lxclock.h"
47 #include "lxcseccomp.h"
48 #include "macro.h"
49 #include "memory_utils.h"
50 #include "mount_utils.h"
51 #include "namespace.h"
52 #include "network.h"
53 #include "parse.h"
54 #include "process_utils.h"
55 #include "ringbuf.h"
56 #include "start.h"
57 #include "storage/storage.h"
58 #include "storage/overlay.h"
59 #include "syscall_wrappers.h"
60 #include "terminal.h"
61 #include "utils.h"
62 #include "uuid.h"
63
64 #ifdef MAJOR_IN_MKDEV
65 #include <sys/mkdev.h>
66 #endif
67
68 #ifdef HAVE_STATVFS
69 #include <sys/statvfs.h>
70 #endif
71
72 #if HAVE_OPENPTY
73 #include <pty.h>
74 #else
75 #include <../include/openpty.h>
76 #endif
77
78 #if HAVE_LIBCAP
79 #include <sys/capability.h>
80 #endif
81
82 #if HAVE_SYS_PERSONALITY_H
83 #include <sys/personality.h>
84 #endif
85
86 #ifndef HAVE_STRLCAT
87 #include "include/strlcat.h"
88 #endif
89
90 #if IS_BIONIC
91 #include <../include/lxcmntent.h>
92 #else
93 #include <mntent.h>
94 #endif
95
96 #if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
97 #include <../include/prlimit.h>
98 #endif
99
100 lxc_log_define(conf, lxc);
101
/* Configuration of the container that the API call in progress is operating
 * on. The error calls make use of it. When HAVE_TLS is defined the variable
 * is declared thread_local.
 */
#ifdef HAVE_TLS
thread_local
#endif
struct lxc_conf *current_config;
110
111 char *lxchook_names[NUM_LXC_HOOKS] = {
112 "pre-start",
113 "pre-mount",
114 "mount",
115 "autodev",
116 "start",
117 "stop",
118 "post-stop",
119 "clone",
120 "destroy",
121 "start-host"
122 };
123
/* One textual mount option and the mount flag bits it maps to. */
struct mount_opt {
	char *name;	/* option keyword as written in a mount entry */
	int clear;	/* non-zero for keywords that negate @flag
			 * (presumably: clear the bits instead of setting
			 * them - confirm against the option parser) */
	int flag;	/* MS_* bits associated with the keyword */
};

/* Capability name paired with its numeric CAP_* value. */
struct caps_opt {
	char *name;
	int value;
};

/* Resource limit name paired with its numeric RLIMIT_* value. */
struct limit_opt {
	char *name;
	int value;
};
139
140 static struct mount_opt mount_opt[] = {
141 { "async", 1, MS_SYNCHRONOUS },
142 { "atime", 1, MS_NOATIME },
143 { "bind", 0, MS_BIND },
144 { "defaults", 0, 0 },
145 { "dev", 1, MS_NODEV },
146 { "diratime", 1, MS_NODIRATIME },
147 { "dirsync", 0, MS_DIRSYNC },
148 { "exec", 1, MS_NOEXEC },
149 { "lazytime", 0, MS_LAZYTIME },
150 { "mand", 0, MS_MANDLOCK },
151 { "noatime", 0, MS_NOATIME },
152 { "nodev", 0, MS_NODEV },
153 { "nodiratime", 0, MS_NODIRATIME },
154 { "noexec", 0, MS_NOEXEC },
155 { "nomand", 1, MS_MANDLOCK },
156 { "norelatime", 1, MS_RELATIME },
157 { "nostrictatime", 1, MS_STRICTATIME },
158 { "nosuid", 0, MS_NOSUID },
159 { "rbind", 0, MS_BIND|MS_REC },
160 { "relatime", 0, MS_RELATIME },
161 { "remount", 0, MS_REMOUNT },
162 { "ro", 0, MS_RDONLY },
163 { "rw", 1, MS_RDONLY },
164 { "strictatime", 0, MS_STRICTATIME },
165 { "suid", 1, MS_NOSUID },
166 { "sync", 0, MS_SYNCHRONOUS },
167 { NULL, 0, 0 },
168 };
169
170 static struct mount_opt propagation_opt[] = {
171 { "private", 0, MS_PRIVATE },
172 { "shared", 0, MS_SHARED },
173 { "slave", 0, MS_SLAVE },
174 { "unbindable", 0, MS_UNBINDABLE },
175 { "rprivate", 0, MS_PRIVATE|MS_REC },
176 { "rshared", 0, MS_SHARED|MS_REC },
177 { "rslave", 0, MS_SLAVE|MS_REC },
178 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
179 { NULL, 0, 0 },
180 };
181
182 static struct caps_opt caps_opt[] = {
183 #if HAVE_LIBCAP
184 { "chown", CAP_CHOWN },
185 { "dac_override", CAP_DAC_OVERRIDE },
186 { "dac_read_search", CAP_DAC_READ_SEARCH },
187 { "fowner", CAP_FOWNER },
188 { "fsetid", CAP_FSETID },
189 { "kill", CAP_KILL },
190 { "setgid", CAP_SETGID },
191 { "setuid", CAP_SETUID },
192 { "setpcap", CAP_SETPCAP },
193 { "linux_immutable", CAP_LINUX_IMMUTABLE },
194 { "net_bind_service", CAP_NET_BIND_SERVICE },
195 { "net_broadcast", CAP_NET_BROADCAST },
196 { "net_admin", CAP_NET_ADMIN },
197 { "net_raw", CAP_NET_RAW },
198 { "ipc_lock", CAP_IPC_LOCK },
199 { "ipc_owner", CAP_IPC_OWNER },
200 { "sys_module", CAP_SYS_MODULE },
201 { "sys_rawio", CAP_SYS_RAWIO },
202 { "sys_chroot", CAP_SYS_CHROOT },
203 { "sys_ptrace", CAP_SYS_PTRACE },
204 { "sys_pacct", CAP_SYS_PACCT },
205 { "sys_admin", CAP_SYS_ADMIN },
206 { "sys_boot", CAP_SYS_BOOT },
207 { "sys_nice", CAP_SYS_NICE },
208 { "sys_resource", CAP_SYS_RESOURCE },
209 { "sys_time", CAP_SYS_TIME },
210 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
211 { "mknod", CAP_MKNOD },
212 { "lease", CAP_LEASE },
213 { "audit_write", CAP_AUDIT_WRITE },
214 { "audit_control", CAP_AUDIT_CONTROL },
215 { "setfcap", CAP_SETFCAP },
216 { "mac_override", CAP_MAC_OVERRIDE },
217 { "mac_admin", CAP_MAC_ADMIN },
218 { "syslog", CAP_SYSLOG },
219 { "wake_alarm", CAP_WAKE_ALARM },
220 { "block_suspend", CAP_BLOCK_SUSPEND },
221 { "audit_read", CAP_AUDIT_READ },
222 { "perfmon", CAP_PERFMON },
223 { "bpf", CAP_BPF },
224 { "checkpoint_restore", CAP_CHECKPOINT_RESTORE },
225 #endif
226 };
227
228 static struct limit_opt limit_opt[] = {
229 #ifdef RLIMIT_AS
230 { "as", RLIMIT_AS },
231 #endif
232 #ifdef RLIMIT_CORE
233 { "core", RLIMIT_CORE },
234 #endif
235 #ifdef RLIMIT_CPU
236 { "cpu", RLIMIT_CPU },
237 #endif
238 #ifdef RLIMIT_DATA
239 { "data", RLIMIT_DATA },
240 #endif
241 #ifdef RLIMIT_FSIZE
242 { "fsize", RLIMIT_FSIZE },
243 #endif
244 #ifdef RLIMIT_LOCKS
245 { "locks", RLIMIT_LOCKS },
246 #endif
247 #ifdef RLIMIT_MEMLOCK
248 { "memlock", RLIMIT_MEMLOCK },
249 #endif
250 #ifdef RLIMIT_MSGQUEUE
251 { "msgqueue", RLIMIT_MSGQUEUE },
252 #endif
253 #ifdef RLIMIT_NICE
254 { "nice", RLIMIT_NICE },
255 #endif
256 #ifdef RLIMIT_NOFILE
257 { "nofile", RLIMIT_NOFILE },
258 #endif
259 #ifdef RLIMIT_NPROC
260 { "nproc", RLIMIT_NPROC },
261 #endif
262 #ifdef RLIMIT_RSS
263 { "rss", RLIMIT_RSS },
264 #endif
265 #ifdef RLIMIT_RTPRIO
266 { "rtprio", RLIMIT_RTPRIO },
267 #endif
268 #ifdef RLIMIT_RTTIME
269 { "rttime", RLIMIT_RTTIME },
270 #endif
271 #ifdef RLIMIT_SIGPENDING
272 { "sigpending", RLIMIT_SIGPENDING },
273 #endif
274 #ifdef RLIMIT_STACK
275 { "stack", RLIMIT_STACK },
276 #endif
277 };
278
279 static int run_buffer(char *buffer)
280 {
281 __do_free char *output = NULL;
282 __do_lxc_pclose struct lxc_popen_FILE *f = NULL;
283 int fd, ret;
284
285 f = lxc_popen(buffer);
286 if (!f)
287 return log_error_errno(-1, errno, "Failed to popen() %s", buffer);
288
289 output = malloc(LXC_LOG_BUFFER_SIZE);
290 if (!output)
291 return log_error_errno(-1, ENOMEM, "Failed to allocate memory for %s", buffer);
292
293 fd = fileno(f->f);
294 if (fd < 0)
295 return log_error_errno(-1, errno, "Failed to retrieve underlying file descriptor");
296
297 for (int i = 0; i < 10; i++) {
298 ssize_t bytes_read;
299
300 bytes_read = lxc_read_nointr(fd, output, LXC_LOG_BUFFER_SIZE - 1);
301 if (bytes_read > 0) {
302 output[bytes_read] = '\0';
303 DEBUG("Script %s produced output: %s", buffer, output);
304 continue;
305 }
306
307 break;
308 }
309
310 ret = lxc_pclose(move_ptr(f));
311 if (ret == -1)
312 return log_error_errno(-1, errno, "Script exited with error");
313 else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0)
314 return log_error(-1, "Script exited with status %d", WEXITSTATUS(ret));
315 else if (WIFSIGNALED(ret))
316 return log_error(-1, "Script terminated by signal %d", WTERMSIG(ret));
317
318 return 0;
319 }
320
321 int run_script_argv(const char *name, unsigned int hook_version,
322 const char *section, const char *script,
323 const char *hookname, char **argv)
324 {
325 __do_free char *buffer = NULL;
326 int buf_pos, i, ret;
327 size_t size = 0;
328
329 if (hook_version == 0)
330 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
331 script, name, section);
332 else
333 INFO("Executing script \"%s\" for container \"%s\"", script, name);
334
335 for (i = 0; argv && argv[i]; i++)
336 size += strlen(argv[i]) + 1;
337
338 size += STRLITERALLEN("exec");
339 size++;
340 size += strlen(script);
341 size++;
342
343 if (size > INT_MAX)
344 return -EFBIG;
345
346 if (hook_version == 0) {
347 size += strlen(hookname);
348 size++;
349
350 size += strlen(name);
351 size++;
352
353 size += strlen(section);
354 size++;
355
356 if (size > INT_MAX)
357 return -EFBIG;
358 }
359
360 buffer = malloc(size);
361 if (!buffer)
362 return -ENOMEM;
363
364 if (hook_version == 0)
365 buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
366 else
367 buf_pos = snprintf(buffer, size, "exec %s", script);
368 if (buf_pos < 0 || (size_t)buf_pos >= size)
369 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
370
371 if (hook_version == 1) {
372 ret = setenv("LXC_HOOK_TYPE", hookname, 1);
373 if (ret < 0) {
374 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_TYPE=%s", hookname);
375 }
376 TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);
377
378 ret = setenv("LXC_HOOK_SECTION", section, 1);
379 if (ret < 0)
380 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_HOOK_SECTION=%s", section);
381 TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);
382
383 if (strcmp(section, "net") == 0) {
384 char *parent;
385
386 if (!argv || !argv[0])
387 return -1;
388
389 ret = setenv("LXC_NET_TYPE", argv[0], 1);
390 if (ret < 0)
391 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_TYPE=%s", argv[0]);
392 TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);
393
394 parent = argv[1] ? argv[1] : "";
395
396 if (strcmp(argv[0], "macvlan") == 0) {
397 ret = setenv("LXC_NET_PARENT", parent, 1);
398 if (ret < 0)
399 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
400 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
401 } else if (strcmp(argv[0], "phys") == 0) {
402 ret = setenv("LXC_NET_PARENT", parent, 1);
403 if (ret < 0)
404 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
405 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
406 } else if (strcmp(argv[0], "veth") == 0) {
407 char *peer = argv[2] ? argv[2] : "";
408
409 ret = setenv("LXC_NET_PEER", peer, 1);
410 if (ret < 0)
411 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PEER=%s", peer);
412 TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
413
414 ret = setenv("LXC_NET_PARENT", parent, 1);
415 if (ret < 0)
416 return log_error_errno(-1, errno, "Failed to set environment variable: LXC_NET_PARENT=%s", parent);
417 TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
418 }
419 }
420 }
421
422 for (i = 0; argv && argv[i]; i++) {
423 size_t len = size - buf_pos;
424
425 ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
426 if (ret < 0 || (size_t)ret >= len)
427 return log_error_errno(-1, errno, "Failed to create command line for script \"%s\"", script);
428 buf_pos += ret;
429 }
430
431 return run_buffer(buffer);
432 }
433
434 int run_script(const char *name, const char *section, const char *script, ...)
435 {
436 __do_free char *buffer = NULL;
437 int ret;
438 char *p;
439 va_list ap;
440 size_t size = 0;
441
442 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
443 script, name, section);
444
445 va_start(ap, script);
446 while ((p = va_arg(ap, char *)))
447 size += strlen(p) + 1;
448 va_end(ap);
449
450 size += STRLITERALLEN("exec");
451 size += strlen(script);
452 size += strlen(name);
453 size += strlen(section);
454 size += 4;
455
456 if (size > INT_MAX)
457 return -1;
458
459 buffer = must_realloc(NULL, size);
460 ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
461 if (ret < 0 || ret >= size)
462 return -1;
463
464 va_start(ap, script);
465 while ((p = va_arg(ap, char *))) {
466 int len = size - ret;
467 int rc;
468 rc = snprintf(buffer + ret, len, " %s", p);
469 if (rc < 0 || rc >= len) {
470 va_end(ap);
471 return -1;
472 }
473 ret += rc;
474 }
475 va_end(ap);
476
477 return run_buffer(buffer);
478 }
479
480 /* pin_rootfs
481 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
482 * the duration of the container run, to prevent the container from marking
483 * the underlying fs readonly on shutdown. unlink the file immediately so
484 * no name pollution is happens.
485 * don't unlink on NFS to avoid random named stale handles.
486 * return -1 on error.
487 * return -2 if nothing needed to be pinned.
488 * return an open fd (>=0) if we pinned it.
489 */
490 int pin_rootfs(const char *rootfs)
491 {
492 __do_free char *absrootfs = NULL;
493 int fd, ret;
494 char absrootfspin[PATH_MAX];
495 struct stat s;
496 struct statfs sfs;
497
498 if (rootfs == NULL || strlen(rootfs) == 0)
499 return -2;
500
501 absrootfs = realpath(rootfs, NULL);
502 if (!absrootfs)
503 return -2;
504
505 ret = stat(absrootfs, &s);
506 if (ret < 0)
507 return -1;
508
509 if (!S_ISDIR(s.st_mode))
510 return -2;
511
512 ret = snprintf(absrootfspin, sizeof(absrootfspin), "%s/.lxc-keep", absrootfs);
513 if (ret < 0 || (size_t)ret >= sizeof(absrootfspin))
514 return -1;
515
516 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR | O_CLOEXEC);
517 if (fd < 0)
518 return fd;
519
520 ret = fstatfs (fd, &sfs);
521 if (ret < 0)
522 return fd;
523
524 if (sfs.f_type == NFS_SUPER_MAGIC)
525 return log_debug(fd, "Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
526
527 (void)unlink(absrootfspin);
528
529 return fd;
530 }
531
/* Extend @flags with the flags already active on an existing mount so that a
 * (re)mount does not silently drop them.
 *
 * @s      path to inspect; when NULL, @d is inspected instead
 * @d      fallback path
 * @flags  requested mount flags
 *
 * The access-time related flags found on the inspected path are always
 * carried over. NOSUID, NODEV, RDONLY and NOEXEC are carried over only when
 * @flags contains MS_REMOUNT. If no path is available, statvfs() fails, or
 * the build lacks statvfs support, @flags is returned unchanged.
 *
 * NOTE(review): f_flag is tested against MS_* constants here; confirm that
 * they match the bits statvfs() reports for every flag in the two masks.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	const unsigned long remount_only = MS_NOSUID | MS_NODEV | MS_RDONLY |
					   MS_NOEXEC;
	const unsigned long always = MS_NOATIME | MS_NODIRATIME | MS_LAZYTIME |
				     MS_RELATIME | MS_STRICTATIME;
	const char *target = s ? s : d;
	unsigned long inherit = always;
	struct statvfs vfs;

	if (!target)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	if (flags & MS_REMOUNT)
		inherit |= remount_only;

	return flags | (vfs.f_flag & inherit);
#else
	return flags;
#endif
}
580
581 static int add_shmount_to_list(struct lxc_conf *conf)
582 {
583 char new_mount[PATH_MAX];
584 /* Offset for the leading '/' since the path_cont
585 * is absolute inside the container.
586 */
587 int offset = 1, ret = -1;
588
589 ret = snprintf(new_mount, sizeof(new_mount),
590 "%s %s none bind,create=dir 0 0", conf->shmount.path_host,
591 conf->shmount.path_cont + offset);
592 if (ret < 0 || (size_t)ret >= sizeof(new_mount))
593 return -1;
594
595 return add_elem_to_mount_list(new_mount, conf);
596 }
597
598 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
599 {
600 int i, ret;
601 static struct {
602 int match_mask;
603 int match_flag;
604 const char *source;
605 const char *destination;
606 const char *fstype;
607 unsigned long flags;
608 const char *options;
609 bool requires_cap_net_admin;
610 } default_mounts[] = {
611 /* Read-only bind-mounting... In older kernels, doing that
612 * required to do one MS_BIND mount and then
613 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
614 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
615 * onwards. However, this apparently does not work on kernel
616 * 3.8. Unfortunately, on that very same kernel, doing the same
617 * trick as above doesn't seem to work either, there one needs
618 * to ALSO specify MS_BIND for the remount, otherwise the
619 * entire fs is remounted read-only or the mount fails because
620 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
621 * kernels as low as 2.6.32...
622 */
623 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
624 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
625 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL, true },
626 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL, false },
627 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
628 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL, true },
629 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL, false },
630 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
631 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
632 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL, false },
633 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL, false },
634 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL, false },
635 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL, false },
636 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL, false },
637 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL, false },
638 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL, false },
639 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL, false },
640 { 0, 0, NULL, NULL, NULL, 0, NULL, false }
641 };
642 struct lxc_rootfs *rootfs = &conf->rootfs;
643 bool has_cap_net_admin;
644
645 if (flags & LXC_AUTO_PROC_MASK) {
646 ret = mkdirat(rootfs->mntpt_fd, "proc" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
647 if (ret < 0 && errno != EEXIST)
648 return log_error_errno(-errno, errno,
649 "Failed to create proc mountpoint under %d", rootfs->mntpt_fd);
650 }
651
652 if (flags & LXC_AUTO_SYS_MASK) {
653 ret = mkdirat(rootfs->mntpt_fd, "sys" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
654 if (ret < 0 && errno != EEXIST)
655 return log_error_errno(-errno, errno,
656 "Failed to create sysfs mountpoint under %d", rootfs->mntpt_fd);
657 }
658
659 has_cap_net_admin = lxc_wants_cap(CAP_NET_ADMIN, conf);
660 for (i = 0; default_mounts[i].match_mask; i++) {
661 __do_free char *destination = NULL, *source = NULL;
662 int saved_errno;
663 unsigned long mflags;
664 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
665 continue;
666
667 if (default_mounts[i].source) {
668 /* will act like strdup if %r is not present */
669 source = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].source);
670 if (!source)
671 return -1;
672 }
673
674 if (!default_mounts[i].destination)
675 return log_error(-1, "BUG: auto mounts destination %d was NULL", i);
676
677 if (!has_cap_net_admin && default_mounts[i].requires_cap_net_admin) {
678 TRACE("Container does not have CAP_NET_ADMIN. Skipping \"%s\" mount", default_mounts[i].source ?: "(null)");
679 continue;
680 }
681
682 /* will act like strdup if %r is not present */
683 destination = lxc_string_replace("%r", rootfs->path ? rootfs->mount : "", default_mounts[i].destination);
684 if (!destination)
685 return -1;
686
687 mflags = add_required_remount_flags(source, destination,
688 default_mounts[i].flags);
689 ret = safe_mount(source, destination, default_mounts[i].fstype,
690 mflags, default_mounts[i].options,
691 rootfs->path ? rootfs->mount : NULL);
692 saved_errno = errno;
693 if (ret < 0 && errno == ENOENT) {
694 INFO("Mount source or target for \"%s\" on \"%s\" does not exist. Skipping", source, destination);
695 ret = 0;
696 } else if (ret < 0) {
697 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
698 }
699
700 if (ret < 0) {
701 errno = saved_errno;
702 return -1;
703 }
704 }
705
706 if (flags & LXC_AUTO_CGROUP_MASK) {
707 int cg_flags;
708
709 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
710 /* If the type of cgroup mount was not specified, it depends on
711 * the container's capabilities as to what makes sense: if we
712 * have CAP_SYS_ADMIN, the read-only part can be remounted
713 * read-write anyway, so we may as well default to read-write;
714 * then the admin will not be given a false sense of security.
715 * (And if they really want mixed r/o r/w, then they can
716 * explicitly specify :mixed.) OTOH, if the container lacks
717 * CAP_SYS_ADMIN, do only default to :mixed, because then the
718 * container can't remount it read-write.
719 */
720 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
721 int has_sys_admin = 0;
722
723 if (!lxc_list_empty(&conf->keepcaps))
724 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
725 else
726 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
727
728 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
729 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
730 else
731 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
732 }
733
734 if (flags & LXC_AUTO_CGROUP_FORCE)
735 cg_flags |= LXC_AUTO_CGROUP_FORCE;
736
737 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
738 handler,
739 rootfs->path ? rootfs->mount : "",
740 cg_flags))
741 return log_error_errno(-1, errno, "Failed to mount \"/sys/fs/cgroup\"");
742 }
743
744 if (flags & LXC_AUTO_SHMOUNTS_MASK) {
745 ret = add_shmount_to_list(conf);
746 if (ret < 0)
747 return log_error(-1, "Failed to add shmount entry to container config");
748 }
749
750 return 0;
751 }
752
/* Set the hostname to the nodename stored in @utsname.
 *
 * Does nothing and returns 0 if @utsname is NULL. Returns -1 if
 * sethostname() fails and 0 on success.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0)
		return log_error_errno(-1, errno, "Failed to set the hostname to \"%s\"",
				       hostname);

	INFO("Set hostname to \"%s\"", hostname);

	return 0;
}
769
/* A symlink to create below /dev: @name is the link, @oldpath its target. */
struct dev_symlinks {
	const char *oldpath;
	const char *name;
};

/* The standard descriptor links pointing into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
781
782 static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
783 {
784 int i, ret;
785 char path[PATH_MAX];
786 struct stat s;
787
788 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
789 const struct dev_symlinks *d = &dev_symlinks[i];
790
791 ret = snprintf(path, sizeof(path), "%s/dev/%s",
792 rootfs->path ? rootfs->mount : "", d->name);
793 if (ret < 0 || (size_t)ret >= sizeof(path))
794 return -1;
795
796 /* Stat the path first. If we don't get an error accept it as
797 * is and don't try to create it
798 */
799 ret = stat(path, &s);
800 if (ret == 0)
801 continue;
802
803 ret = symlink(d->oldpath, path);
804 if (ret && errno != EEXIST) {
805 if (errno == EROFS)
806 WARN("Failed to create \"%s\". Read-only filesystem", path);
807 else
808 return log_error_errno(-1, errno, "Failed to create \"%s\"", path);
809 }
810 }
811
812 return 0;
813 }
814
/* Build a space-separated list of ptys to pass to systemd.
 *
 * @pp    address of the list; if *pp is NULL a new string
 *        "container_ttys=<name>" is allocated, otherwise " <name>" is
 *        appended to the existing string
 * @name  name to add
 *
 * Returns false if memory cannot be allocated (*pp is left untouched in that
 * case) and true otherwise.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *list;

	if (!*pp) {
		list = malloc(prefix_len + name_len + 1);
		if (!list)
			return false;

		memcpy(list, prefix, prefix_len);
		/* copies the terminating NUL as well */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	cur_len = strlen(*pp);
	/* room for the separating space and the terminating NUL */
	list = realloc(*pp, cur_len + name_len + 2);
	if (!list)
		return false;

	list[cur_len] = ' ';
	memcpy(list + cur_len + 1, name, name_len + 1);
	*pp = list;

	return true;
}
841
842 static int lxc_setup_ttys(struct lxc_conf *conf)
843 {
844 int i, ret;
845 const struct lxc_tty_info *ttys = &conf->ttys;
846 char *ttydir = ttys->dir;
847 char path[PATH_MAX], lxcpath[PATH_MAX];
848
849 if (!conf->rootfs.path)
850 return 0;
851
852 for (i = 0; i < ttys->max; i++) {
853 struct lxc_terminal_info *tty = &ttys->tty[i];
854
855 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
856 if (ret < 0 || (size_t)ret >= sizeof(path))
857 return -1;
858
859 if (ttydir) {
860 /* create dev/lxc/tty%d" */
861 ret = snprintf(lxcpath, sizeof(lxcpath),
862 "/dev/%s/tty%d", ttydir, i + 1);
863 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
864 return -1;
865
866 ret = mknod(lxcpath, S_IFREG | 0000, 0);
867 if (ret < 0 && errno != EEXIST) {
868 SYSERROR("Failed to create \"%s\"", lxcpath);
869 return -1;
870 }
871
872 ret = unlink(path);
873 if (ret < 0 && errno != ENOENT) {
874 SYSERROR("Failed to unlink \"%s\"", path);
875 return -1;
876 }
877
878 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
879 if (ret < 0) {
880 SYSWARN("Failed to bind mount \"%s\" onto \"%s\"", tty->name, lxcpath);
881 continue;
882 }
883 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, lxcpath);
884
885 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
886 ttydir, i + 1);
887 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
888 return -1;
889
890 ret = symlink(lxcpath, path);
891 if (ret < 0)
892 return log_error_errno(-1, errno, "Failed to create symlink \"%s\" -> \"%s\"", path, lxcpath);
893 } else {
894 /* If we populated /dev, then we need to create
895 * /dev/ttyN
896 */
897 ret = mknod(path, S_IFREG | 0000, 0);
898 if (ret < 0) /* this isn't fatal, continue */
899 SYSERROR("Failed to create \"%s\"", path);
900
901 ret = mount(tty->name, path, "none", MS_BIND, 0);
902 if (ret < 0) {
903 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
904 continue;
905 }
906
907 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
908 }
909
910 if (!append_ttyname(&conf->ttys.tty_names, tty->name))
911 return log_error(-1, "Error setting up container_ttys string");
912 }
913
914 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
915 return 0;
916 }
917
918 define_cleanup_function(struct lxc_tty_info *, lxc_delete_tty);
919
920 static int lxc_allocate_ttys(struct lxc_conf *conf)
921 {
922 struct lxc_terminal_info *tty_new = NULL;
923 int ret;
924 call_cleaner(lxc_delete_tty) struct lxc_tty_info *ttys = &conf->ttys;
925
926 /* no tty in the configuration */
927 if (ttys->max == 0)
928 return 0;
929
930 tty_new = malloc(sizeof(struct lxc_terminal_info) * ttys->max);
931 if (!tty_new)
932 return -ENOMEM;
933 ttys->tty = tty_new;
934
935 for (size_t i = 0; i < ttys->max; i++) {
936 struct lxc_terminal_info *tty = &ttys->tty[i];
937
938 tty->ptx = -EBADF;
939 tty->pty = -EBADF;
940 ret = openpty(&tty->ptx, &tty->pty, NULL, NULL, NULL);
941 if (ret < 0) {
942 ttys->max = i;
943 return log_error_errno(-ENOTTY, ENOTTY, "Failed to create tty %zu", i);
944 }
945
946 ret = ttyname_r(tty->pty, tty->name, sizeof(tty->name));
947 if (ret < 0) {
948 ttys->max = i;
949 return log_error_errno(-ENOTTY, ENOTTY, "Failed to retrieve name of tty %zu pty", i);
950 }
951
952 DEBUG("Created tty \"%s\" with ptx fd %d and pty fd %d",
953 tty->name, tty->ptx, tty->pty);
954
955 /* Prevent leaking the file descriptors to the container */
956 ret = fd_cloexec(tty->ptx, true);
957 if (ret < 0)
958 SYSWARN("Failed to set FD_CLOEXEC flag on ptx fd %d of tty device \"%s\"",
959 tty->ptx, tty->name);
960
961 ret = fd_cloexec(tty->pty, true);
962 if (ret < 0)
963 SYSWARN("Failed to set FD_CLOEXEC flag on pty fd %d of tty device \"%s\"",
964 tty->pty, tty->name);
965
966 tty->busy = -1;
967 }
968
969 INFO("Finished creating %zu tty devices", ttys->max);
970 move_ptr(ttys);
971 return 0;
972 }
973
974 void lxc_delete_tty(struct lxc_tty_info *ttys)
975 {
976 if (!ttys->tty)
977 return;
978
979 for (int i = 0; i < ttys->max; i++) {
980 struct lxc_terminal_info *tty = &ttys->tty[i];
981 close_prot_errno_disarm(tty->ptx);
982 close_prot_errno_disarm(tty->pty);
983 }
984
985 free_disarm(ttys->tty);
986 }
987
988 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
989 {
990 int i;
991 int ret = -1;
992 struct lxc_conf *conf = handler->conf;
993 struct lxc_tty_info *ttys = &conf->ttys;
994 int sock = handler->data_sock[0];
995
996 if (ttys->max == 0)
997 return 0;
998
999 for (i = 0; i < ttys->max; i++) {
1000 int ttyfds[2];
1001 struct lxc_terminal_info *tty = &ttys->tty[i];
1002
1003 ttyfds[0] = tty->ptx;
1004 ttyfds[1] = tty->pty;
1005
1006 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1007 if (ret < 0)
1008 break;
1009
1010 TRACE("Sent tty \"%s\" with ptx fd %d and pty fd %d to parent",
1011 tty->name, tty->ptx, tty->pty);
1012 }
1013
1014 if (ret < 0)
1015 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
1016 else
1017 TRACE("Sent %zu ttys to parent", ttys->max);
1018
1019 return ret;
1020 }
1021
1022 static int lxc_create_ttys(struct lxc_handler *handler)
1023 {
1024 int ret = -1;
1025 struct lxc_conf *conf = handler->conf;
1026
1027 ret = lxc_allocate_ttys(conf);
1028 if (ret < 0) {
1029 ERROR("Failed to allocate ttys");
1030 goto on_error;
1031 }
1032
1033 ret = lxc_send_ttys_to_parent(handler);
1034 if (ret < 0) {
1035 ERROR("Failed to send ttys to parent");
1036 goto on_error;
1037 }
1038
1039 if (!conf->is_execute) {
1040 ret = lxc_setup_ttys(conf);
1041 if (ret < 0) {
1042 ERROR("Failed to setup ttys");
1043 goto on_error;
1044 }
1045 }
1046
1047 if (conf->ttys.tty_names) {
1048 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
1049 if (ret < 0)
1050 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
1051 }
1052
1053 ret = 0;
1054
1055 on_error:
1056 lxc_delete_tty(&conf->ttys);
1057
1058 return ret;
1059 }
1060
1061 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1062 * error, log it but don't fail yet.
1063 */
1064 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1065 int autodevtmpfssize, const char *lxcpath)
1066 {
1067 const char *path = rootfs->path ? rootfs->mount : NULL;
1068 int ret;
1069 mode_t cur_mask;
1070 char mount_options[128];
1071
1072 INFO("Preparing \"/dev\"");
1073
1074 sprintf(mount_options, "size=%d,mode=755", (autodevtmpfssize != 0) ? autodevtmpfssize : 500000);
1075 DEBUG("Using mount options: %s", mount_options);
1076
1077 cur_mask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1078 ret = mkdirat(rootfs->mntpt_fd, "dev" , S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1079 if (ret < 0 && errno != EEXIST) {
1080 SYSERROR("Failed to create \"/dev\" directory");
1081 ret = -errno;
1082 goto reset_umask;
1083 }
1084
1085 ret = safe_mount_beneath_at(rootfs->mntpt_fd, "none", "dev", "tmpfs", 0, mount_options);
1086 if (ret < 0) {
1087 __do_free char *fallback_path = NULL;
1088
1089 if (errno != ENOSYS) {
1090 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1091 goto reset_umask;
1092 }
1093
1094 if (path) {
1095 fallback_path = must_make_path(path, "/dev", NULL);
1096 ret = safe_mount("none", fallback_path, "tmpfs", 0, mount_options, path);
1097 } else {
1098 ret = safe_mount("none", "dev", "tmpfs", 0, mount_options, NULL);
1099 }
1100 if (ret < 0) {
1101 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1102 goto reset_umask;
1103 }
1104 }
1105 TRACE("Mounted tmpfs on \"%s\"", path);
1106
1107 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1108 * If not, then create it and exit if that fails...
1109 */
1110 ret = mkdirat(rootfs->mntpt_fd, "dev/pts", S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1111 if (ret < 0 && errno != EEXIST) {
1112 SYSERROR("Failed to create directory \"%s\"", path);
1113 ret = -errno;
1114 goto reset_umask;
1115 }
1116
1117 ret = 0;
1118
1119 reset_umask:
1120 (void)umask(cur_mask);
1121
1122 INFO("Prepared \"/dev\"");
1123 return ret;
1124 }
1125
/* Describes one device node that lxc_fill_autodev() provides below the
 * container's "dev" directory.
 */
struct lxc_device_node {
	const char *name;	/* file name relative to the "dev" directory */
	const mode_t mode;	/* file type and permission bits passed to mknodat() */
	const int maj;		/* device major number passed to makedev() */
	const int min;		/* device minor number passed to makedev() */
};
1132
/* Device nodes that lxc_fill_autodev() creates with mknodat() or bind-mounts
 * from the host's "/dev/<name>". Every entry is a character device whose
 * requested mode has all permission bits set; lxc_fill_autodev() sets a umask
 * that masks the execute bits before creating them.
 */
static const struct lxc_device_node lxc_devices[] = {
	{ "full",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
	{ "null",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
	{ "random",  S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
	{ "tty",     S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
	{ "zero",    S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
};
1141
1142
/* States lxc_fill_autodev() uses to track how device nodes are provided.
 *
 * The numeric order matters: lxc_fill_autodev() tests
 * "use_mknod >= LXC_DEVNODE_MKNOD" to decide whether to keep calling
 * mknodat() for the remaining devices.
 */
enum {
	LXC_DEVNODE_BIND,	/* mknodat() failed with EPERM: bind-mount host nodes */
	LXC_DEVNODE_MKNOD,	/* initial state: try mknodat() and probe the result */
	LXC_DEVNODE_PARTIAL,	/* node was created but could not be opened */
	LXC_DEVNODE_OPEN,	/* created node could be opened: mknodat() suffices */
};
1149
1150 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1151 {
1152 __do_close int dev_dir_fd = -EBADF;
1153 int i, ret;
1154 mode_t cmask;
1155 int use_mknod = LXC_DEVNODE_MKNOD;
1156
1157 /* ignore, just don't try to fill in */
1158 if (!exists_dir_at(rootfs->mntpt_fd, "dev"))
1159 return 0;
1160
1161 dev_dir_fd = openat(rootfs->mntpt_fd, "dev/", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOFOLLOW);
1162 if (dev_dir_fd < 0)
1163 return -errno;
1164
1165 INFO("Populating \"/dev\"");
1166
1167 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1168 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
1169 char hostpath[PATH_MAX], path[PATH_MAX];
1170 const struct lxc_device_node *device = &lxc_devices[i];
1171
1172 if (use_mknod >= LXC_DEVNODE_MKNOD) {
1173 ret = mknodat(dev_dir_fd, device->name, device->mode, makedev(device->maj, device->min));
1174 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1175 DEBUG("Created device node \"%s\"", device->name);
1176 } else if (ret < 0) {
1177 if (errno != EPERM)
1178 return log_error_errno(-1, errno, "Failed to create device node \"%s\"", device->name);
1179
1180 use_mknod = LXC_DEVNODE_BIND;
1181 }
1182
1183 /* Device nodes are fully useable. */
1184 if (use_mknod == LXC_DEVNODE_OPEN)
1185 continue;
1186
1187 if (use_mknod == LXC_DEVNODE_MKNOD) {
1188 __do_close int fd = -EBADF;
1189 /* See
1190 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1191 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1192 */
1193 fd = openat(dev_dir_fd, device->name, O_RDONLY | O_CLOEXEC);
1194 if (fd >= 0) {
1195 /* Device nodes are fully useable. */
1196 use_mknod = LXC_DEVNODE_OPEN;
1197 continue;
1198 }
1199
1200 SYSTRACE("Failed to open \"%s\" device", device->name);
1201 /* Device nodes are only partially useable. */
1202 use_mknod = LXC_DEVNODE_PARTIAL;
1203 }
1204 }
1205
1206 if (use_mknod != LXC_DEVNODE_PARTIAL) {
1207 /* If we are dealing with partially functional device
1208 * nodes the prio mknod() call will have created the
1209 * device node so we can use it as a bind-mount target.
1210 */
1211 ret = mknodat(dev_dir_fd, device->name, S_IFREG | 0000, 0);
1212 if (ret < 0 && errno != EEXIST)
1213 return log_error_errno(-1, errno, "Failed to create file \"%s\"", device->name);
1214 }
1215
1216 /* Fallback to bind-mounting the device from the host. */
1217 ret = snprintf(hostpath, sizeof(hostpath), "/dev/%s", device->name);
1218 if (ret < 0 || (size_t)ret >= sizeof(hostpath))
1219 return ret_errno(EIO);
1220
1221 ret = safe_mount_beneath_at(dev_dir_fd, hostpath, device->name, NULL, MS_BIND, NULL);
1222 if (ret < 0) {
1223 const char *mntpt = rootfs->path ? rootfs->mount : NULL;
1224 if (errno == ENOSYS) {
1225 ret = snprintf(path, sizeof(path), "%s/dev/%s", mntpt, device->name);
1226 if (ret < 0 || ret >= sizeof(path))
1227 return log_error(-1, "Failed to create device path for %s", device->name);
1228 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL);
1229 }
1230 }
1231 if (ret < 0)
1232 return log_error_errno(-1, errno, "Failed to bind mount host device node \"%s\" onto \"%s\"", hostpath, device->name);
1233 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"", hostpath, device->name);
1234 }
1235 (void)umask(cmask);
1236
1237 INFO("Populated \"/dev\"");
1238 return 0;
1239 }
1240
1241 static int lxc_mount_rootfs(struct lxc_conf *conf)
1242 {
1243 int ret;
1244 struct lxc_storage *bdev;
1245 struct lxc_rootfs *rootfs = &conf->rootfs;
1246
1247 if (!rootfs->path) {
1248 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
1249 if (ret < 0)
1250 return log_error_errno(-1, errno, "Failed to recursively turn root mount tree into dependent mount");
1251
1252 rootfs->mntpt_fd = openat(-1, "/", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH);
1253 if (rootfs->mntpt_fd < 0)
1254 return -errno;
1255
1256 return 0;
1257 }
1258
1259 ret = access(rootfs->mount, F_OK);
1260 if (ret != 0)
1261 return log_error_errno(-1, errno, "Failed to access to \"%s\". Check it is present",
1262 rootfs->mount);
1263
1264 bdev = storage_init(conf);
1265 if (!bdev)
1266 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1267 rootfs->path, rootfs->mount,
1268 rootfs->options ? rootfs->options : "(null)");
1269
1270 ret = bdev->ops->mount(bdev);
1271 storage_put(bdev);
1272 if (ret < 0)
1273 return log_error(-1, "Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1274 rootfs->path, rootfs->mount,
1275 rootfs->options ? rootfs->options : "(null)");
1276
1277 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
1278 rootfs->path, rootfs->mount,
1279 rootfs->options ? rootfs->options : "(null)");
1280
1281 rootfs->mntpt_fd = openat(-1, rootfs->mount, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH);
1282 if (rootfs->mntpt_fd < 0)
1283 return -errno;
1284
1285 return 0;
1286 }
1287
/* Switch into the container's rootfs using chroot().
 *
 * The resolved rootfs->mount is recursively bind-mounted onto "/", the
 * inherited mounts visible beneath it are lazily detached and finally the
 * process chroot()s into the new root.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_chroot(const struct lxc_rootfs *rootfs)
{
	__do_free char *nroot = NULL;
	int i, ret;
	char *root = rootfs->mount;

	nroot = realpath(root, NULL);
	if (!nroot)
		return log_error_errno(-1, errno, "Failed to resolve \"%s\"", root);

	ret = chdir("/");
	if (ret < 0)
		return -1;

	/* We could use here MS_MOVE, but in userns this mount is locked and
	 * can't be moved.
	 */
	ret = mount(nroot, "/", NULL, MS_REC | MS_BIND, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"/\" as MS_REC | MS_BIND", nroot);

	ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to remount \"/\"");

	/* The following code cleans up inherited mounts which are not required
	 * for CT.
	 *
	 * The mountinfo file shows not all mounts, if a few points have been
	 * unmounted between read operations from the mountinfo. So we need to
	 * read mountinfo a few times.
	 *
	 * This loop can be skipped if a container uses userns, because all
	 * inherited mounts are locked and we should live with all this trash.
	 */
	for (;;) {
		__do_fclose FILE *f = NULL;
		__do_free char *line = NULL;
		char *slider1, *slider2;
		int progress = 0;
		size_t len = 0;

		/* Relative path: resolved against the current working directory. */
		f = fopen("./proc/self/mountinfo", "re");
		if (!f)
			return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");

		while (getline(&line, &len, f) > 0) {
			/* Advance to the fourth space, i.e. the separator in
			 * front of the fifth field (presumably the mount point,
			 * see proc(5)).
			 */
			for (slider1 = line, i = 0; slider1 && i < 4; i++)
				slider1 = strchr(slider1 + 1, ' ');

			if (!slider1)
				continue;

			slider2 = strchr(slider1 + 1, ' ');
			if (!slider2)
				continue;

			/* Terminate the field and replace the leading space
			 * with '.', so that slider1 is the relative path
			 * ".<field>".
			 */
			*slider2 = '\0';
			*slider1 = '.';

			/* Keep "/" and "/proc" mounted for now. */
			if (strcmp(slider1 + 1, "/") == 0)
				continue;

			if (strcmp(slider1 + 1, "/proc") == 0)
				continue;

			ret = umount2(slider1, MNT_DETACH);
			if (ret == 0)
				progress++;
		}

		/* Stop once a full pass did not detach anything. */
		if (!progress)
			break;
	}

	/* This also can be skipped if a container uses userns. */
	(void)umount2("./proc", MNT_DETACH);

	/* It is weird, but chdir("..") moves us in a new root */
	ret = chdir("..");
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to chdir(\"..\")");

	ret = chroot(".");
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to chroot(\".\")");

	return 0;
}
1377
1378 /* (The following explanation is copied verbatim from the kernel.)
1379 *
1380 * pivot_root Semantics:
1381 * Moves the root file system of the current process to the directory put_old,
1382 * makes new_root as the new root file system of the current process, and sets
1383 * root/cwd of all processes which had them on the current root to new_root.
1384 *
1385 * Restrictions:
1386 * The new_root and put_old must be directories, and must not be on the
1387 * same file system as the current process root. The put_old must be
1388 * underneath new_root, i.e. adding a non-zero number of /.. to the string
1389 * pointed to by put_old must yield the same directory as new_root. No other
1390 * file system may be mounted on put_old. After all, new_root is a mountpoint.
1391 *
1392 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
1393 * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
1394 * in this situation.
1395 *
1396 * Notes:
1397 * - we don't move root/cwd if they are not at the root (reason: if something
1398 * cared enough to change them, it's probably wrong to force them elsewhere)
1399 * - it's okay to pick a root that isn't the root of a file system, e.g.
1400 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
1401 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
1402 * first.
1403 */
1404 static int lxc_pivot_root(const char *rootfs)
1405 {
1406 __do_close int oldroot = -EBADF, newroot = -EBADF;
1407 int ret;
1408
1409 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1410 if (oldroot < 0)
1411 return log_error_errno(-1, errno, "Failed to open old root directory");
1412
1413 newroot = open(rootfs, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
1414 if (newroot < 0)
1415 return log_error_errno(-1, errno, "Failed to open new root directory");
1416
1417 /* change into new root fs */
1418 ret = fchdir(newroot);
1419 if (ret < 0)
1420 return log_error_errno(-1, errno, "Failed to change to new rootfs \"%s\"", rootfs);
1421
1422 /* pivot_root into our new root fs */
1423 ret = pivot_root(".", ".");
1424 if (ret < 0)
1425 return log_error_errno(-1, errno, "Failed to pivot_root()");
1426
1427 /* At this point the old-root is mounted on top of our new-root. To
1428 * unmounted it we must not be chdir'd into it, so escape back to
1429 * old-root.
1430 */
1431 ret = fchdir(oldroot);
1432 if (ret < 0)
1433 return log_error_errno(-1, errno, "Failed to enter old root directory");
1434
1435 /* Make oldroot a depedent mount to make sure our umounts don't propagate to the
1436 * host.
1437 */
1438 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
1439 if (ret < 0)
1440 return log_error_errno(-1, errno, "Failed to recursively turn old root mount tree into dependent mount");
1441
1442 ret = umount2(".", MNT_DETACH);
1443 if (ret < 0)
1444 return log_error_errno(-1, errno, "Failed to detach old root directory");
1445
1446 ret = fchdir(newroot);
1447 if (ret < 0)
1448 return log_error_errno(-1, errno, "Failed to re-enter new root directory");
1449
1450 TRACE("pivot_root(\"%s\") successful", rootfs);
1451
1452 return 0;
1453 }
1454
1455 static int lxc_setup_rootfs_switch_root(const struct lxc_rootfs *rootfs)
1456 {
1457 if (!rootfs->path)
1458 return log_debug(0, "Container does not have a rootfs");
1459
1460 if (detect_ramfs_rootfs())
1461 return lxc_chroot(rootfs);
1462
1463 return lxc_pivot_root(rootfs->mount);
1464 }
1465
1466 static const struct id_map *find_mapped_nsid_entry(const struct lxc_conf *conf,
1467 unsigned id,
1468 enum idtype idtype)
1469 {
1470 struct lxc_list *it;
1471 struct id_map *map;
1472 struct id_map *retmap = NULL;
1473
1474 /* Shortcut for container's root mappings. */
1475 if (id == 0) {
1476 if (idtype == ID_TYPE_UID)
1477 return conf->root_nsuid_map;
1478
1479 if (idtype == ID_TYPE_GID)
1480 return conf->root_nsgid_map;
1481 }
1482
1483 lxc_list_for_each(it, &conf->id_map) {
1484 map = it->elem;
1485 if (map->idtype != idtype)
1486 continue;
1487
1488 if (id >= map->nsid && id < map->nsid + map->range) {
1489 retmap = map;
1490 break;
1491 }
1492 }
1493
1494 return retmap;
1495 }
1496
1497 int lxc_setup_devpts_parent(struct lxc_handler *handler)
1498 {
1499 int ret;
1500
1501 if (handler->conf->pty_max <= 0)
1502 return 0;
1503
1504 ret = lxc_abstract_unix_recv_fds(handler->data_sock[1], &handler->conf->devpts_fd, 1,
1505 &handler->conf->devpts_fd, sizeof(handler->conf->devpts_fd));
1506 if (ret < 0)
1507 return log_error_errno(-1, errno, "Failed to receive devpts fd from child");
1508
1509 TRACE("Received devpts file descriptor %d from child", handler->conf->devpts_fd);
1510 return 0;
1511 }
1512
/* Mount a new devpts instance on "/dev/pts", send an fd for it to the parent
 * over handler->data_sock[0] and make "/dev/ptmx" refer to "/dev/pts/ptmx".
 *
 * Nothing is done when conf->pty_max is not positive. Returns 0 on success
 * and -1 on failure.
 */
static int lxc_setup_devpts_child(struct lxc_handler *handler)
{
	__do_close int devpts_fd = -EBADF;
	int ret;
	char **opts;
	char devpts_mntopts[256];
	char *mntopt_sets[5];
	char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
	struct lxc_conf *conf = handler->conf;
	int sock = handler->data_sock[0];

	if (conf->pty_max <= 0)
		return log_debug(0, "No new devpts instance will be mounted since no pts devices are requested");

	ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
		       default_devpts_mntopts, conf->pty_max);
	if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
		return -1;

	/* Best effort: detach whatever is currently mounted on /dev/pts. */
	(void)umount2("/dev/pts", MNT_DETACH);

	/* Create mountpoint for devpts instance. */
	ret = mkdir("/dev/pts", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-1, errno, "Failed to create \"/dev/pts\" directory");

	/* Option sets are tried in this order until one mount succeeds. The
	 * "!gid=5" variants skip the leading "gid=5," of the option strings.
	 */

	/* gid=5 && max= */
	mntopt_sets[0] = devpts_mntopts;

	/* !gid=5 && max= */
	mntopt_sets[1] = devpts_mntopts + STRLITERALLEN("gid=5") + 1;

	/* gid=5 && !max= */
	mntopt_sets[2] = default_devpts_mntopts;

	/* !gid=5 && !max= */
	mntopt_sets[3] = default_devpts_mntopts + STRLITERALLEN("gid=5") + 1;

	/* end */
	mntopt_sets[4] = NULL;

	for (ret = -1, opts = mntopt_sets; opts && *opts; opts++) {
		/* mount new devpts instance */
		ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, *opts);
		if (ret == 0)
			break;
	}

	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount new devpts instance");
	DEBUG("Mount new devpts instance with options \"%s\"", *opts);

	devpts_fd = openat(-EBADF, "/dev/pts", O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOFOLLOW);
	if (devpts_fd < 0) {
		/* No fd to pass: send -EBADF as plain data instead. */
		devpts_fd = -EBADF;
		TRACE("Failed to create detached devpts mount");
		ret = lxc_abstract_unix_send_fds(sock, NULL, 0, &devpts_fd, sizeof(int));
	} else {
		ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0);
	}
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to send devpts fd to parent");

	TRACE("Sent devpts file descriptor %d to parent", devpts_fd);

	/* Remove any pre-existing /dev/ptmx file. */
	ret = remove("/dev/ptmx");
	if (ret < 0) {
		if (errno != ENOENT)
			return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\" file");
	} else {
		DEBUG("Removed existing \"/dev/ptmx\" file");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	ret = mknod("/dev/ptmx", S_IFREG | 0000, 0);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-1, errno, "Failed to create dummy \"/dev/ptmx\" file as bind mount target");
	DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
	if (!ret)
		return log_debug(0, "Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
	else
		/* Fallthrough and try to create a symlink. */
		ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* Remove the dummy /dev/ptmx file we created above. */
	ret = remove("/dev/ptmx");
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to remove existing \"/dev/ptmx\"");

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");

	DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
	return 0;
}
1614
/* Set the process execution domain to @persona via personality().
 *
 * A @persona of -1 means "leave unchanged". When the build lacks
 * <sys/personality.h> the function does nothing.
 *
 * Returns 0 on success (or when nothing was done) and -1 on failure.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	/* Declared here so builds without personality support do not carry an
	 * unused variable.
	 */
	int ret;

	if (persona == -1)
		return 0;

	ret = personality(persona);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to set personality to \"0x%x\"", persona);

	INFO("Set personality to \"0x%x\"", persona);
#else
	(void)persona;
#endif

	return 0;
}
1632
1633 static inline bool wants_console(const struct lxc_terminal *terminal)
1634 {
1635 return !terminal->path || strcmp(terminal->path, "none");
1636 }
1637
1638 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1639 const struct lxc_terminal *console,
1640 int pty_mnt_fd)
1641 {
1642 int ret;
1643 char path[PATH_MAX];
1644 char *rootfs_path = rootfs->path ? rootfs->mount : "";
1645
1646 if (!wants_console(console))
1647 return 0;
1648
1649 /*
1650 * When we are asked to setup a console we remove any previous
1651 * /dev/console bind-mounts.
1652 */
1653 if (exists_file_at(rootfs->dev_mntpt_fd, "console")) {
1654 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1655 if (ret < 0 || (size_t)ret >= sizeof(path))
1656 return -1;
1657
1658 ret = lxc_unstack_mountpoint(path, false);
1659 if (ret < 0)
1660 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1661 else
1662 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
1663 }
1664
1665 /*
1666 * For unprivileged containers autodev or automounts will already have
1667 * taken care of creating /dev/console.
1668 */
1669 ret = mknodat(rootfs->dev_mntpt_fd, "console", S_IFREG | 0000, 0);
1670 if (ret < 0 && errno != EEXIST)
1671 return log_error_errno(-errno, errno, "Failed to create console");
1672
1673 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
1674 if (ret < 0)
1675 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
1676
1677 if (pty_mnt_fd >= 0) {
1678 ret = move_mount(pty_mnt_fd, "", rootfs->dev_mntpt_fd, "console", MOVE_MOUNT_F_EMPTY_PATH);
1679 if (!ret) {
1680 DEBUG("Moved mount \"%s\" onto \"%s\"", console->name, path);
1681 goto finish;
1682 }
1683
1684 if (ret && errno != ENOSYS)
1685 return log_error_errno(-1, errno,
1686 "Failed to mount %d(%s) on \"%s\"",
1687 pty_mnt_fd, console->name, path);
1688 }
1689
1690 ret = safe_mount_beneath_at(rootfs->dev_mntpt_fd, console->name, "console", NULL, MS_BIND, NULL);
1691 if (ret < 0) {
1692 if (errno == ENOSYS) {
1693 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1694 if (ret < 0 || (size_t)ret >= sizeof(path))
1695 return -1;
1696
1697 ret = safe_mount(console->name, path, "none", MS_BIND, NULL, rootfs_path);
1698 if (ret < 0)
1699 return log_error_errno(-1, errno, "Failed to mount %d(%s) on \"%s\"", pty_mnt_fd, console->name, path);
1700 }
1701 }
1702
1703 finish:
1704 DEBUG("Mounted pty device %d(%s) onto \"%s\"", pty_mnt_fd, console->name, path);
1705 return 0;
1706 }
1707
1708 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1709 const struct lxc_terminal *console,
1710 char *ttydir, int pty_mnt_fd)
1711 {
1712 int ret;
1713 char path[PATH_MAX], lxcpath[PATH_MAX];
1714 char *rootfs_path = rootfs->path ? rootfs->mount : "";
1715
1716 if (!wants_console(console))
1717 return 0;
1718
1719 /* create rootfs/dev/<ttydir> directory */
1720 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
1721 if (ret < 0 || (size_t)ret >= sizeof(path))
1722 return -1;
1723
1724 ret = mkdir(path, 0755);
1725 if (ret && errno != EEXIST)
1726 return log_error_errno(-errno, errno, "Failed to create \"%s\"", path);
1727 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1728
1729 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
1730 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1731 return -1;
1732
1733 ret = mknod(lxcpath, S_IFREG | 0000, 0);
1734 if (ret < 0 && errno != EEXIST)
1735 return log_error_errno(-errno, errno, "Failed to create \"%s\"", lxcpath);
1736
1737 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1738 if (ret < 0 || (size_t)ret >= sizeof(path))
1739 return -1;
1740
1741 if (file_exists(path)) {
1742 ret = lxc_unstack_mountpoint(path, false);
1743 if (ret < 0)
1744 return log_error_errno(-ret, errno, "Failed to unmount \"%s\"", path);
1745 else
1746 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
1747 }
1748
1749 ret = mknod(path, S_IFREG | 0000, 0);
1750 if (ret < 0 && errno != EEXIST)
1751 return log_error_errno(-errno, errno, "Failed to create console");
1752
1753 ret = fchmod(console->pty, S_IXUSR | S_IXGRP);
1754 if (ret < 0)
1755 return log_error_errno(-errno, errno, "Failed to set mode \"0%o\" to \"%s\"", S_IXUSR | S_IXGRP, console->name);
1756
1757 /* bind mount console->name to '/dev/<ttydir>/console' */
1758 if (pty_mnt_fd >= 0) {
1759 ret = move_mount(pty_mnt_fd, "", -EBADF, lxcpath, MOVE_MOUNT_F_EMPTY_PATH);
1760 if (!ret) {
1761 DEBUG("Moved mount \"%s\" onto \"%s\"", console->name, lxcpath);
1762 goto finish;
1763 }
1764
1765 if (ret && errno != ENOSYS)
1766 return log_error_errno(-1, errno,
1767 "Failed to mount %d(%s) on \"%s\"",
1768 pty_mnt_fd, console->name, lxcpath);
1769 }
1770
1771 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
1772 if (ret < 0)
1773 return log_error_errno(-1, errno, "Failed to mount %d(%s) on \"%s\"", pty_mnt_fd, console->name, lxcpath);
1774 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1775
1776 finish:
1777 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
1778 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1779 if (ret < 0)
1780 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
1781 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1782
1783 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
1784 return 0;
1785 }
1786
/* Set up the container's console, below "dev/<ttydir>" when @ttydir is given
 * and directly at "dev/console" otherwise.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir,
			     int pty_mnt_fd)
{
	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir, pty_mnt_fd)
		      : lxc_setup_dev_console(rootfs, console, pty_mnt_fd);
}
1797
1798 static int parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
1799 {
1800 ssize_t ret;
1801
1802 /* If '=' is contained in opt, the option must go into data. */
1803 if (!strchr(opt, '=')) {
1804 /*
1805 * If opt is found in mount_opt, set or clear flags.
1806 * Otherwise append it to data.
1807 */
1808 size_t opt_len = strlen(opt);
1809 for (struct mount_opt *mo = &mount_opt[0]; mo->name != NULL; mo++) {
1810 size_t mo_name_len = strlen(mo->name);
1811
1812 if (opt_len == mo_name_len && strncmp(opt, mo->name, mo_name_len) == 0) {
1813 if (mo->clear)
1814 *flags &= ~mo->flag;
1815 else
1816 *flags |= mo->flag;
1817 return 0;
1818 }
1819 }
1820 }
1821
1822 if (strlen(*data)) {
1823 ret = strlcat(*data, ",", size);
1824 if (ret < 0)
1825 return log_error_errno(ret, errno, "Failed to append \",\" to %s", *data);
1826 }
1827
1828 ret = strlcat(*data, opt, size);
1829 if (ret < 0)
1830 return log_error_errno(ret, errno, "Failed to append \"%s\" to %s", opt, *data);
1831
1832 return 0;
1833 }
1834
1835 int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
1836 {
1837 __do_free char *mntopts_new = NULL, *mntopts_dup = NULL;
1838 char *mntopt_cur = NULL;
1839 size_t size;
1840
1841 if (*mntdata || *mntflags)
1842 return ret_errno(EINVAL);
1843
1844 if (!mntopts)
1845 return 0;
1846
1847 mntopts_dup = strdup(mntopts);
1848 if (!mntopts_dup)
1849 return ret_errno(ENOMEM);
1850
1851 size = strlen(mntopts_dup) + 1;
1852 mntopts_new = zalloc(size);
1853 if (!mntopts_new)
1854 return ret_errno(ENOMEM);
1855
1856 lxc_iterate_parts(mntopt_cur, mntopts_dup, ",")
1857 if (parse_mntopt(mntopt_cur, mntflags, &mntopts_new, size) < 0)
1858 return ret_errno(EINVAL);
1859
1860 if (*mntopts_new)
1861 *mntdata = move_ptr(mntopts_new);
1862
1863 return 0;
1864 }
1865
1866 static void parse_propagationopt(char *opt, unsigned long *flags)
1867 {
1868 struct mount_opt *mo;
1869
1870 /* If opt is found in propagation_opt, set or clear flags. */
1871 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
1872 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1873 continue;
1874
1875 if (mo->clear)
1876 *flags &= ~mo->flag;
1877 else
1878 *flags |= mo->flag;
1879
1880 return;
1881 }
1882 }
1883
1884 int parse_propagationopts(const char *mntopts, unsigned long *pflags)
1885 {
1886 __do_free char *s = NULL;
1887 char *p;
1888
1889 if (!mntopts)
1890 return 0;
1891
1892 s = strdup(mntopts);
1893 if (!s)
1894 return log_error_errno(-ENOMEM, errno, "Failed to allocate memory");
1895
1896 *pflags = 0L;
1897 lxc_iterate_parts(p, s, ",")
1898 parse_propagationopt(p, pflags);
1899
1900 return 0;
1901 }
1902
/* Terminate @word in place at its first space or tab character. A string
 * without either character is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1909
/* Skip @nfields whitespace-delimited fields in @src.
 *
 * Each step advances past the current run of non-blank characters and past
 * exactly one following space or tab. Returns a pointer into @src: either
 * the position reached, or the terminating NUL byte when the string ends
 * first.
 */
static char *get_field(char *src, int nfields)
{
	char *cursor = src;

	for (int remaining = nfields; remaining > 0; remaining--) {
		cursor += strcspn(cursor, " \t");
		if (*cursor == '\0')
			break;

		/* Step over the single separator. */
		cursor++;
	}

	return cursor;
}
1928
1929 static int mount_entry(const char *fsname, const char *target,
1930 const char *fstype, unsigned long mountflags,
1931 unsigned long pflags, const char *data, bool optional,
1932 bool dev, bool relative, const char *rootfs)
1933 {
1934 int ret;
1935 char srcbuf[PATH_MAX];
1936 const char *srcpath = fsname;
1937 #ifdef HAVE_STATVFS
1938 struct statvfs sb;
1939 #endif
1940
1941 if (relative) {
1942 ret = snprintf(srcbuf, sizeof(srcbuf), "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
1943 if (ret < 0 || ret >= sizeof(srcbuf))
1944 return log_error_errno(-1, errno, "source path is too long");
1945 srcpath = srcbuf;
1946 }
1947
1948 ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
1949 rootfs);
1950 if (ret < 0) {
1951 if (optional)
1952 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
1953 srcpath ? srcpath : "(null)", target);
1954
1955 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
1956 srcpath ? srcpath : "(null)", target);
1957 }
1958
1959 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
1960
1961 DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
1962 srcpath ? srcpath : "(none)", target ? target : "(none)");
1963
1964 #ifdef HAVE_STATVFS
1965 if (srcpath && statvfs(srcpath, &sb) == 0) {
1966 unsigned long required_flags = 0;
1967
1968 if (sb.f_flag & MS_NOSUID)
1969 required_flags |= MS_NOSUID;
1970
1971 if (sb.f_flag & MS_NODEV && !dev)
1972 required_flags |= MS_NODEV;
1973
1974 if (sb.f_flag & MS_RDONLY)
1975 required_flags |= MS_RDONLY;
1976
1977 if (sb.f_flag & MS_NOEXEC)
1978 required_flags |= MS_NOEXEC;
1979
1980 DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
1981 srcpath, sb.f_flag, required_flags);
1982
1983 /* If this was a bind mount request, and required_flags
1984 * does not have any flags which are not already in
1985 * mountflags, then skip the remount.
1986 */
1987 if (!(mountflags & MS_REMOUNT) &&
1988 (!(required_flags & ~mountflags) && !(mountflags & MS_RDONLY))) {
1989 DEBUG("Mountflags already were %lu, skipping remount", mountflags);
1990 goto skipremount;
1991 }
1992
1993 mountflags |= required_flags;
1994 }
1995 #endif
1996
1997 ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
1998 if (ret < 0) {
1999 if (optional)
2000 return log_info_errno(0, errno, "Failed to mount \"%s\" on \"%s\" (optional)",
2001 srcpath ? srcpath : "(null)",
2002 target);
2003
2004 return log_error_errno(-1, errno, "Failed to mount \"%s\" on \"%s\"",
2005 srcpath ? srcpath : "(null)",
2006 target);
2007 }
2008 }
2009
2010 #ifdef HAVE_STATVFS
2011 skipremount:
2012 #endif
2013 if (pflags) {
2014 ret = mount(NULL, target, NULL, pflags, NULL);
2015 if (ret < 0) {
2016 if (optional)
2017 return log_info_errno(0, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
2018 else
2019 return log_error_errno(-1, errno, "Failed to change mount propagation for \"%s\" (optional)", target);
2020 }
2021 DEBUG("Changed mount propagation for \"%s\"", target);
2022 }
2023
2024 DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
2025 srcpath ? srcpath : "(null)", target, fstype);
2026
2027 return 0;
2028 }
2029
/* Remove the LXC-specific options "create=dir", "create=file", "optional" and
 * "relative" from mntent->mnt_opts in place.
 *
 * The option string is processed one comma-separated element at a time. An
 * element is dropped when it equals one of the names above or starts with one
 * of them followed by '='. All other elements are kept unchanged and in
 * order; empty elements are dropped as well, so the result never has a
 * leading, trailing or doubled comma.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
	};
	char *opts = mntent->mnt_opts;
	char *rd, *wr;

	if (!opts)
		return;

	/* rd walks the original elements, wr writes the kept ones; wr never
	 * overtakes rd, so compacting in place is safe.
	 */
	rd = wr = opts;
	while (*rd) {
		size_t len = strcspn(rd, ",");
		bool keep = len > 0;

		for (size_t i = 0; keep && i < sizeof(culled) / sizeof(culled[0]); i++) {
			size_t clen = strlen(culled[i]);

			if (len >= clen && strncmp(rd, culled[i], clen) == 0 &&
			    (len == clen || rd[clen] == '='))
				keep = false;
		}

		if (keep) {
			if (wr != opts)
				*wr++ = ',';
			memmove(wr, rd, len);
			wr += len;
		}

		rd += len;
		if (*rd == ',')
			rd++;
	}

	*wr = '\0';
}
2059
2060 static int mount_entry_create_dir_file(const struct mntent *mntent,
2061 const char *path,
2062 const struct lxc_rootfs *rootfs,
2063 const char *lxc_name, const char *lxc_path)
2064 {
2065 __do_free char *p1 = NULL;
2066 int ret;
2067 char *p2;
2068
2069 if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
2070 ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
2071 if (ret < 0)
2072 return -1;
2073 }
2074
2075 if (hasmntopt(mntent, "create=dir")) {
2076 ret = mkdir_p(path, 0755);
2077 if (ret < 0 && errno != EEXIST)
2078 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
2079 }
2080
2081 if (!hasmntopt(mntent, "create=file"))
2082 return 0;
2083
2084 ret = access(path, F_OK);
2085 if (ret == 0)
2086 return 0;
2087
2088 p1 = strdup(path);
2089 if (!p1)
2090 return -1;
2091
2092 p2 = dirname(p1);
2093
2094 ret = mkdir_p(p2, 0755);
2095 if (ret < 0 && errno != EEXIST)
2096 return log_error_errno(-1, errno, "Failed to create directory \"%s\"", path);
2097
2098 ret = mknod(path, S_IFREG | 0000, 0);
2099 if (ret < 0 && errno != EEXIST)
2100 return -errno;
2101
2102 return 0;
2103 }
2104
2105 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2106 * without a rootfs. */
2107 static inline int mount_entry_on_generic(struct mntent *mntent,
2108 const char *path,
2109 const struct lxc_rootfs *rootfs,
2110 const char *lxc_name,
2111 const char *lxc_path)
2112 {
2113 __do_free char *mntdata = NULL;
2114 unsigned long mntflags = 0, pflags = 0;
2115 char *rootfs_path = NULL;
2116 int ret;
2117 bool dev, optional, relative;
2118
2119 optional = hasmntopt(mntent, "optional") != NULL;
2120 dev = hasmntopt(mntent, "dev") != NULL;
2121 relative = hasmntopt(mntent, "relative") != NULL;
2122
2123 if (rootfs && rootfs->path)
2124 rootfs_path = rootfs->mount;
2125
2126 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2127 lxc_path);
2128 if (ret < 0) {
2129 if (optional)
2130 return 0;
2131
2132 return -1;
2133 }
2134 cull_mntent_opt(mntent);
2135
2136 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2137 if (ret < 0)
2138 return -1;
2139
2140 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2141 if (ret < 0)
2142 return ret;
2143
2144 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
2145 pflags, mntdata, optional, dev, relative, rootfs_path);
2146
2147 return ret;
2148 }
2149
2150 static inline int mount_entry_on_systemfs(struct mntent *mntent)
2151 {
2152 int ret;
2153 char path[PATH_MAX];
2154
2155 /* For containers created without a rootfs all mounts are treated as
2156 * absolute paths starting at / on the host.
2157 */
2158 if (mntent->mnt_dir[0] != '/')
2159 ret = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);
2160 else
2161 ret = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
2162 if (ret < 0 || ret >= sizeof(path))
2163 return -1;
2164
2165 return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
2166 }
2167
2168 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
2169 const struct lxc_rootfs *rootfs,
2170 const char *lxc_name,
2171 const char *lxc_path)
2172 {
2173 int offset;
2174 char *aux;
2175 const char *lxcpath;
2176 char path[PATH_MAX];
2177 int ret = 0;
2178
2179 lxcpath = lxc_global_config_value("lxc.lxcpath");
2180 if (!lxcpath)
2181 return -1;
2182
2183 /* If rootfs->path is a blockdev path, allow container fstab to use
2184 * <lxcpath>/<name>/rootfs" as the target prefix.
2185 */
2186 ret = snprintf(path, PATH_MAX, "%s/%s/rootfs", lxcpath, lxc_name);
2187 if (ret < 0 || ret >= PATH_MAX)
2188 goto skipvarlib;
2189
2190 aux = strstr(mntent->mnt_dir, path);
2191 if (aux) {
2192 offset = strlen(path);
2193 goto skipabs;
2194 }
2195
2196 skipvarlib:
2197 aux = strstr(mntent->mnt_dir, rootfs->path);
2198 if (!aux)
2199 return log_warn(ret, "Ignoring mount point \"%s\"", mntent->mnt_dir);
2200 offset = strlen(rootfs->path);
2201
2202 skipabs:
2203 ret = snprintf(path, PATH_MAX, "%s/%s", rootfs->mount, aux + offset);
2204 if (ret < 0 || ret >= PATH_MAX)
2205 return -1;
2206
2207 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2208 }
2209
2210 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2211 const struct lxc_rootfs *rootfs,
2212 const char *lxc_name,
2213 const char *lxc_path)
2214 {
2215 int ret;
2216 char path[PATH_MAX];
2217
2218 /* relative to root mount point */
2219 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2220 if (ret < 0 || (size_t)ret >= sizeof(path))
2221 return -1;
2222
2223 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2224 }
2225
2226 static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
2227 const char *lxc_name, const char *lxc_path)
2228 {
2229 char buf[PATH_MAX];
2230 struct mntent mntent;
2231
2232 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2233 int ret;
2234
2235 if (!rootfs->path)
2236 ret = mount_entry_on_systemfs(&mntent);
2237 else if (mntent.mnt_dir[0] != '/')
2238 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2239 lxc_name, lxc_path);
2240 else
2241 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2242 lxc_name, lxc_path);
2243 if (ret < 0)
2244 return -1;
2245 }
2246
2247 if (!feof(file) || ferror(file))
2248 return log_error(-1, "Failed to parse mount entries");
2249
2250 return 0;
2251 }
2252
/*
 * Cleanup handler for FILE pointers obtained from setmntent(): closes the
 * stream with endmntent() unless the pointer is NULL.
 */
static inline void __auto_endmntent__(FILE **f)
{
	FILE *stream = *f;

	if (!stream)
		return;

	endmntent(stream);
}

#define __do_endmntent __attribute__((__cleanup__(__auto_endmntent__)))
2260
2261 static int setup_mount(const struct lxc_conf *conf,
2262 const struct lxc_rootfs *rootfs, const char *fstab,
2263 const char *lxc_name, const char *lxc_path)
2264 {
2265 __do_endmntent FILE *f = NULL;
2266 int ret;
2267
2268 if (!fstab)
2269 return 0;
2270
2271 f = setmntent(fstab, "re");
2272 if (!f)
2273 return log_error_errno(-1, errno, "Failed to open \"%s\"", fstab);
2274
2275 ret = mount_file_entries(rootfs, f, lxc_name, lxc_path);
2276 if (ret < 0)
2277 ERROR("Failed to set up mount entries");
2278
2279 return ret;
2280 }
2281
/*
 * Extra mount entries used to support nested containers.
 *
 * To mount /proc and /sys themselves, nested containers need to see "pure"
 * proc and sysfs mount points with nothing (such as lxcfs) mounted on top.
 * These are therefore provided under /dev/.lxc/{proc,sys}, and an apparmor
 * rule denies access to them. This is mostly a convenience: the container's
 * root user can mount both file systems anyway, but a non-root user inside
 * the container should not gain access as a side effect unless it has been
 * explicitly allowed.
 */
static const char nesting_helpers[] =
	"proc dev/.lxc/proc proc create=dir,optional 0 0\n"
	"sys dev/.lxc/sys sysfs create=dir,optional 0 0\n";
2295
2296 FILE *make_anonymous_mount_file(struct lxc_list *mount,
2297 bool include_nesting_helpers)
2298 {
2299 __do_close int fd = -EBADF;
2300 FILE *f;
2301 int ret;
2302 char *mount_entry;
2303 struct lxc_list *iterator;
2304
2305 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
2306 if (fd < 0) {
2307 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2308
2309 if (errno != ENOSYS)
2310 return NULL;
2311
2312 fd = lxc_make_tmpfile(template, true);
2313 if (fd < 0)
2314 return log_error_errno(NULL, errno, "Could not create temporary mount file");
2315
2316 TRACE("Created temporary mount file");
2317 }
2318
2319 lxc_list_for_each (iterator, mount) {
2320 size_t len;
2321
2322 mount_entry = iterator->elem;
2323 len = strlen(mount_entry);
2324
2325 ret = lxc_write_nointr(fd, mount_entry, len);
2326 if (ret != len)
2327 return NULL;
2328
2329 ret = lxc_write_nointr(fd, "\n", 1);
2330 if (ret != 1)
2331 return NULL;
2332 }
2333
2334 if (include_nesting_helpers) {
2335 ret = lxc_write_nointr(fd, nesting_helpers,
2336 STRARRAYLEN(nesting_helpers));
2337 if (ret != STRARRAYLEN(nesting_helpers))
2338 return NULL;
2339 }
2340
2341 ret = lseek(fd, 0, SEEK_SET);
2342 if (ret < 0)
2343 return NULL;
2344
2345 f = fdopen(fd, "re+");
2346 if (f)
2347 move_fd(fd); /* Transfer ownership of fd. */
2348 return f;
2349 }
2350
2351 static int setup_mount_entries(const struct lxc_conf *conf,
2352 const struct lxc_rootfs *rootfs,
2353 struct lxc_list *mount, const char *lxc_name,
2354 const char *lxc_path)
2355 {
2356 __do_fclose FILE *f = NULL;
2357
2358 f = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
2359 if (!f)
2360 return -1;
2361
2362 return mount_file_entries(rootfs, f, lxc_name, lxc_path);
2363 }
2364
2365 static int parse_cap(const char *cap)
2366 {
2367 size_t i;
2368 int capid = -1;
2369 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2370 char *ptr = NULL;
2371
2372 if (strcmp(cap, "none") == 0)
2373 return -2;
2374
2375 for (i = 0; i < end; i++) {
2376 if (strcmp(cap, caps_opt[i].name))
2377 continue;
2378
2379 capid = caps_opt[i].value;
2380 break;
2381 }
2382
2383 if (capid < 0) {
2384 /* Try to see if it's numeric, so the user may specify
2385 * capabilities that the running kernel knows about but we
2386 * don't
2387 */
2388 errno = 0;
2389 capid = strtol(cap, &ptr, 10);
2390 if (!ptr || *ptr != '\0' || errno != 0)
2391 /* not a valid number */
2392 capid = -1;
2393 else if (capid > lxc_caps_last_cap())
2394 /* we have a number but it's not a valid
2395 * capability */
2396 capid = -1;
2397 }
2398
2399 return capid;
2400 }
2401
2402 int in_caplist(int cap, struct lxc_list *caps)
2403 {
2404 int capid;
2405 struct lxc_list *iterator;
2406
2407 lxc_list_for_each (iterator, caps) {
2408 capid = parse_cap(iterator->elem);
2409 if (capid == cap)
2410 return 1;
2411 }
2412
2413 return 0;
2414 }
2415
2416 static int setup_caps(struct lxc_list *caps)
2417 {
2418 int capid;
2419 char *drop_entry;
2420 struct lxc_list *iterator;
2421
2422 lxc_list_for_each (iterator, caps) {
2423 int ret;
2424
2425 drop_entry = iterator->elem;
2426
2427 capid = parse_cap(drop_entry);
2428 if (capid < 0)
2429 return log_error(-1, "unknown capability %s", drop_entry);
2430
2431 ret = prctl(PR_CAPBSET_DROP, prctl_arg(capid), prctl_arg(0),
2432 prctl_arg(0), prctl_arg(0));
2433 if (ret < 0)
2434 return log_error_errno(-1, errno, "Failed to remove %s capability", drop_entry);
2435 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
2436 }
2437
2438 DEBUG("Capabilities have been setup");
2439 return 0;
2440 }
2441
2442 static int dropcaps_except(struct lxc_list *caps)
2443 {
2444 __do_free int *caplist = NULL;
2445 int i, capid, numcaps;
2446 char *keep_entry;
2447 struct lxc_list *iterator;
2448
2449 numcaps = lxc_caps_last_cap() + 1;
2450 if (numcaps <= 0 || numcaps > 200)
2451 return -1;
2452 TRACE("Found %d capabilities", numcaps);
2453
2454 /* caplist[i] is 1 if we keep capability i */
2455 caplist = must_realloc(NULL, numcaps * sizeof(int));
2456 memset(caplist, 0, numcaps * sizeof(int));
2457
2458 lxc_list_for_each (iterator, caps) {
2459 keep_entry = iterator->elem;
2460
2461 capid = parse_cap(keep_entry);
2462 if (capid == -2)
2463 continue;
2464
2465 if (capid < 0)
2466 return log_error(-1, "Unknown capability %s", keep_entry);
2467
2468 DEBUG("Keep capability %s (%d)", keep_entry, capid);
2469 caplist[capid] = 1;
2470 }
2471
2472 for (i = 0; i < numcaps; i++) {
2473 int ret;
2474
2475 if (caplist[i])
2476 continue;
2477
2478 ret = prctl(PR_CAPBSET_DROP, prctl_arg(i), prctl_arg(0),
2479 prctl_arg(0), prctl_arg(0));
2480 if (ret < 0)
2481 return log_error_errno(-1, errno, "Failed to remove capability %d", i);
2482 }
2483
2484 DEBUG("Capabilities have been setup");
2485 return 0;
2486 }
2487
2488 static int parse_resource(const char *res)
2489 {
2490 int ret;
2491 size_t i;
2492 int resid = -1;
2493
2494 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
2495 if (strcmp(res, limit_opt[i].name) == 0)
2496 return limit_opt[i].value;
2497
2498 /* Try to see if it's numeric, so the user may specify
2499 * resources that the running kernel knows about but
2500 * we don't.
2501 */
2502 ret = lxc_safe_int(res, &resid);
2503 if (ret < 0)
2504 return -1;
2505
2506 return resid;
2507 }
2508
2509 int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2510 {
2511 int resid;
2512 struct lxc_list *it;
2513 struct lxc_limit *lim;
2514
2515 lxc_list_for_each (it, limits) {
2516 lim = it->elem;
2517
2518 resid = parse_resource(lim->resource);
2519 if (resid < 0)
2520 return log_error(-1, "Unknown resource %s", lim->resource);
2521
2522 #if HAVE_PRLIMIT || HAVE_PRLIMIT64
2523 if (prlimit(pid, resid, &lim->limit, NULL) != 0)
2524 return log_error_errno(-1, errno, "Failed to set limit %s", lim->resource);
2525
2526 TRACE("Setup \"%s\" limit", lim->resource);
2527 #else
2528 return log_error(-1, "Cannot set limit \"%s\" as prlimit is missing", lim->resource);
2529 #endif
2530 }
2531
2532 return 0;
2533 }
2534
2535 int setup_sysctl_parameters(struct lxc_list *sysctls)
2536 {
2537 __do_free char *tmp = NULL;
2538 struct lxc_list *it;
2539 struct lxc_sysctl *elem;
2540 int ret = 0;
2541 char filename[PATH_MAX] = {0};
2542
2543 lxc_list_for_each (it, sysctls) {
2544 elem = it->elem;
2545 tmp = lxc_string_replace(".", "/", elem->key);
2546 if (!tmp)
2547 return log_error(-1, "Failed to replace key %s", elem->key);
2548
2549 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2550 if (ret < 0 || (size_t)ret >= sizeof(filename))
2551 return log_error(-1, "Error setting up sysctl parameters path");
2552
2553 ret = lxc_write_to_file(filename, elem->value,
2554 strlen(elem->value), false, 0666);
2555 if (ret < 0)
2556 return log_error_errno(-1, errno, "Failed to setup sysctl parameters %s to %s",
2557 elem->key, elem->value);
2558 }
2559
2560 return 0;
2561 }
2562
2563 int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2564 {
2565 __do_free char *tmp = NULL;
2566 struct lxc_list *it;
2567 struct lxc_proc *elem;
2568 int ret = 0;
2569 char filename[PATH_MAX] = {0};
2570
2571 lxc_list_for_each (it, procs) {
2572 elem = it->elem;
2573 tmp = lxc_string_replace(".", "/", elem->filename);
2574 if (!tmp)
2575 return log_error(-1, "Failed to replace key %s", elem->filename);
2576
2577 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2578 if (ret < 0 || (size_t)ret >= sizeof(filename))
2579 return log_error(-1, "Error setting up proc filesystem path");
2580
2581 ret = lxc_write_to_file(filename, elem->value,
2582 strlen(elem->value), false, 0666);
2583 if (ret < 0)
2584 return log_error_errno(-1, errno, "Failed to setup proc filesystem %s to %s", elem->filename, elem->value);
2585 }
2586
2587 return 0;
2588 }
2589
2590 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2591
2592 struct lxc_conf *lxc_conf_init(void)
2593 {
2594 int i;
2595 struct lxc_conf *new;
2596
2597 new = malloc(sizeof(*new));
2598 if (!new)
2599 return NULL;
2600 memset(new, 0, sizeof(*new));
2601
2602 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2603 new->personality = -1;
2604 new->autodev = 1;
2605 new->console.buffer_size = 0;
2606 new->console.log_path = NULL;
2607 new->console.log_fd = -1;
2608 new->console.log_size = 0;
2609 new->console.path = NULL;
2610 new->console.peer = -1;
2611 new->console.proxy.busy = -1;
2612 new->console.proxy.ptx = -1;
2613 new->console.proxy.pty = -1;
2614 new->console.ptx = -1;
2615 new->console.pty = -1;
2616 new->console.name[0] = '\0';
2617 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
2618 new->maincmd_fd = -1;
2619 new->monitor_signal_pdeath = SIGKILL;
2620 new->nbd_idx = -1;
2621 new->rootfs.mount = strdup(default_rootfs_mount);
2622 if (!new->rootfs.mount) {
2623 free(new);
2624 return NULL;
2625 }
2626 new->rootfs.managed = true;
2627 new->rootfs.mntpt_fd = -EBADF;
2628 new->rootfs.dev_mntpt_fd = -EBADF;
2629 new->logfd = -1;
2630 lxc_list_init(&new->cgroup);
2631 lxc_list_init(&new->cgroup2);
2632 lxc_list_init(&new->devices);
2633 lxc_list_init(&new->network);
2634 lxc_list_init(&new->mount_list);
2635 lxc_list_init(&new->caps);
2636 lxc_list_init(&new->keepcaps);
2637 lxc_list_init(&new->id_map);
2638 new->root_nsuid_map = NULL;
2639 new->root_nsgid_map = NULL;
2640 lxc_list_init(&new->includes);
2641 lxc_list_init(&new->aliens);
2642 lxc_list_init(&new->environment);
2643 lxc_list_init(&new->limits);
2644 lxc_list_init(&new->sysctls);
2645 lxc_list_init(&new->procs);
2646 new->hooks_version = 0;
2647 for (i = 0; i < NUM_LXC_HOOKS; i++)
2648 lxc_list_init(&new->hooks[i]);
2649 lxc_list_init(&new->groups);
2650 lxc_list_init(&new->state_clients);
2651 new->lsm_aa_profile = NULL;
2652 lxc_list_init(&new->lsm_aa_raw);
2653 new->lsm_se_context = NULL;
2654 new->lsm_se_keyring_context = NULL;
2655 new->keyring_disable_session = false;
2656 new->tmp_umount_proc = false;
2657 new->tmp_umount_proc = 0;
2658 new->shmount.path_host = NULL;
2659 new->shmount.path_cont = NULL;
2660
2661 /* if running in a new user namespace, init and COMMAND
2662 * default to running as UID/GID 0 when using lxc-execute */
2663 new->init_uid = 0;
2664 new->init_gid = 0;
2665 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2666 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
2667 memset(&new->timens, 0, sizeof(struct timens_offsets));
2668 seccomp_conf_init(new);
2669
2670 return new;
2671 }
2672
2673 int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2674 size_t buf_size)
2675 {
2676 __do_close int fd = -EBADF;
2677 int ret;
2678 char path[PATH_MAX];
2679
2680 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
2681 __do_close int setgroups_fd = -EBADF;
2682
2683 ret = snprintf(path, PATH_MAX, "/proc/%d/setgroups", pid);
2684 if (ret < 0 || ret >= PATH_MAX)
2685 return -E2BIG;
2686
2687 setgroups_fd = open(path, O_WRONLY);
2688 if (setgroups_fd < 0 && errno != ENOENT)
2689 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
2690
2691 if (setgroups_fd >= 0) {
2692 ret = lxc_write_nointr(setgroups_fd, "deny\n",
2693 STRLITERALLEN("deny\n"));
2694 if (ret != STRLITERALLEN("deny\n"))
2695 return log_error_errno(-1, errno, "Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid);
2696 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
2697 }
2698 }
2699
2700 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid,
2701 idtype == ID_TYPE_UID ? 'u' : 'g');
2702 if (ret < 0 || ret >= PATH_MAX)
2703 return -E2BIG;
2704
2705 fd = open(path, O_WRONLY | O_CLOEXEC);
2706 if (fd < 0)
2707 return log_error_errno(-1, errno, "Failed to open \"%s\"", path);
2708
2709 ret = lxc_write_nointr(fd, buf, buf_size);
2710 if (ret != buf_size)
2711 return log_error_errno(-1, errno, "Failed to write %cid mapping to \"%s\"",
2712 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2713
2714 return 0;
2715 }
2716
2717 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2718 *
2719 * @return 1 if functional binary was found
2720 * @return 0 if binary exists but is lacking privilege
2721 * @return -ENOENT if binary does not exist
2722 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2723 */
2724 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2725 {
2726 __do_free char *path = NULL;
2727 int ret;
2728 struct stat st;
2729
2730 errno = EINVAL;
2731 if (cap != CAP_SETUID && cap != CAP_SETGID)
2732 return -1;
2733
2734 errno = ENOENT;
2735 path = on_path(binary, NULL);
2736 if (!path)
2737 return -1;
2738
2739 ret = stat(path, &st);
2740 if (ret < 0)
2741 return -1;
2742
2743 /* Check if the binary is setuid. */
2744 if (st.st_mode & S_ISUID)
2745 return log_debug(1, "The binary \"%s\" does have the setuid bit set", path);
2746
2747 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
2748 /* Check if it has the CAP_SETUID capability. */
2749 if ((cap & CAP_SETUID) &&
2750 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2751 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED))
2752 return log_debug(1, "The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
2753
2754 /* Check if it has the CAP_SETGID capability. */
2755 if ((cap & CAP_SETGID) &&
2756 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2757 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED))
2758 return log_debug(1, "The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE and CAP_PERMITTED sets", path);
2759 #else
2760 /* If we cannot check for file capabilities we need to give the benefit
2761 * of the doubt. Otherwise we might fail even though all the necessary
2762 * file capabilities are set.
2763 */
2764 DEBUG("Cannot check for file capabilities as full capability support is missing. Manual intervention needed");
2765 #endif
2766
2767 return 1;
2768 }
2769
/*
 * Execute the command line passed in @args through "/bin/sh -c". Only
 * returns (with -1) if the exec itself fails.
 */
static int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
2775
2776 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2777 {
2778 int fill, left;
2779 char u_or_g;
2780 char *pos;
2781 char cmd_output[PATH_MAX];
2782 struct id_map *map;
2783 struct lxc_list *iterator;
2784 enum idtype type;
2785 int ret = 0, gidmap = 0, uidmap = 0;
2786 char mapbuf[STRLITERALLEN("new@idmap") + STRLITERALLEN(" ") +
2787 INTTYPE_TO_STRLEN(pid_t) + STRLITERALLEN(" ") +
2788 LXC_IDMAPLEN] = {0};
2789 bool had_entry = false, use_shadow = false;
2790 int hostuid, hostgid;
2791
2792 hostuid = geteuid();
2793 hostgid = getegid();
2794
2795 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2796 * ranges, then insist that root also reserve ranges in subuid. This
2797 * will protected it by preventing another user from being handed the
2798 * range by shadow.
2799 */
2800 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
2801 if (uidmap == -ENOENT)
2802 WARN("newuidmap binary is missing");
2803 else if (!uidmap)
2804 WARN("newuidmap is lacking necessary privileges");
2805
2806 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
2807 if (gidmap == -ENOENT)
2808 WARN("newgidmap binary is missing");
2809 else if (!gidmap)
2810 WARN("newgidmap is lacking necessary privileges");
2811
2812 if (uidmap > 0 && gidmap > 0) {
2813 DEBUG("Functional newuidmap and newgidmap binary found");
2814 use_shadow = true;
2815 } else {
2816 /* In case unprivileged users run application containers via
2817 * execute() or a start*() there are valid cases where they may
2818 * only want to map their own {g,u}id. Let's not block them from
2819 * doing so by requiring geteuid() == 0.
2820 */
2821 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2822 "write directly with euid %d", hostuid);
2823 }
2824
2825 /* Check if we really need to use newuidmap and newgidmap.
2826 * If the user is only remapping his own {g,u}id, we don't need it.
2827 */
2828 if (use_shadow && lxc_list_len(idmap) == 2) {
2829 use_shadow = false;
2830 lxc_list_for_each(iterator, idmap) {
2831 map = iterator->elem;
2832 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2833 map->nsid == hostuid && map->hostid == hostuid)
2834 continue;
2835 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2836 map->nsid == hostgid && map->hostid == hostgid)
2837 continue;
2838 use_shadow = true;
2839 break;
2840 }
2841 }
2842
2843 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2844 type++, u_or_g = 'g') {
2845 pos = mapbuf;
2846
2847 if (use_shadow)
2848 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
2849
2850 lxc_list_for_each(iterator, idmap) {
2851 map = iterator->elem;
2852 if (map->idtype != type)
2853 continue;
2854
2855 had_entry = true;
2856
2857 left = LXC_IDMAPLEN - (pos - mapbuf);
2858 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
2859 use_shadow ? " " : "", map->nsid,
2860 map->hostid, map->range,
2861 use_shadow ? "" : "\n");
2862 /*
2863 * The kernel only takes <= 4k for writes to
2864 * /proc/<pid>/{g,u}id_map
2865 */
2866 if (fill <= 0 || fill >= left)
2867 return log_error_errno(-1, errno, "Too many %cid mappings defined", u_or_g);
2868
2869 pos += fill;
2870 }
2871 if (!had_entry)
2872 continue;
2873
2874 /* Try to catch the output of new{g,u}idmap to make debugging
2875 * easier.
2876 */
2877 if (use_shadow) {
2878 ret = run_command(cmd_output, sizeof(cmd_output),
2879 lxc_map_ids_exec_wrapper,
2880 (void *)mapbuf);
2881 if (ret < 0)
2882 return log_error(-1, "new%cidmap failed to write mapping \"%s\": %s", u_or_g, cmd_output, mapbuf);
2883 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
2884 } else {
2885 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
2886 if (ret < 0)
2887 return log_error(-1, "Failed to write mapping: %s", mapbuf);
2888 TRACE("Wrote mapping \"%s\"", mapbuf);
2889 }
2890
2891 memset(mapbuf, 0, sizeof(mapbuf));
2892 }
2893
2894 return 0;
2895 }
2896
2897 /*
2898 * Return the host uid/gid to which the container root is mapped in val.
2899 * Return true if id was found, false otherwise.
2900 */
2901 static id_t get_mapped_rootid(const struct lxc_conf *conf, enum idtype idtype)
2902 {
2903 unsigned nsid;
2904 struct id_map *map;
2905 struct lxc_list *it;
2906
2907 if (idtype == ID_TYPE_UID)
2908 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
2909 else
2910 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
2911
2912 lxc_list_for_each (it, &conf->id_map) {
2913 map = it->elem;
2914 if (map->idtype != idtype)
2915 continue;
2916 if (map->nsid != nsid)
2917 continue;
2918 return map->hostid;
2919 }
2920
2921 if (idtype == ID_TYPE_UID)
2922 return LXC_INVALID_UID;
2923
2924 return LXC_INVALID_GID;
2925 }
2926
2927 int mapped_hostid(unsigned id, const struct lxc_conf *conf, enum idtype idtype)
2928 {
2929 struct id_map *map;
2930 struct lxc_list *it;
2931
2932 lxc_list_for_each (it, &conf->id_map) {
2933 map = it->elem;
2934 if (map->idtype != idtype)
2935 continue;
2936
2937 if (id >= map->hostid && id < map->hostid + map->range)
2938 return (id - map->hostid) + map->nsid;
2939 }
2940
2941 return -1;
2942 }
2943
2944 int find_unmapped_nsid(const struct lxc_conf *conf, enum idtype idtype)
2945 {
2946 struct id_map *map;
2947 struct lxc_list *it;
2948 unsigned int freeid = 0;
2949
2950 again:
2951 lxc_list_for_each (it, &conf->id_map) {
2952 map = it->elem;
2953 if (map->idtype != idtype)
2954 continue;
2955
2956 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2957 freeid = map->nsid + map->range;
2958 goto again;
2959 }
2960 }
2961
2962 return freeid;
2963 }
2964
2965 /* NOTE: Must not be called from inside the container namespace! */
2966 static int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
2967 {
2968 int mounted;
2969
2970 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
2971 if (mounted == -1) {
2972 SYSERROR("Failed to mount proc in the container");
2973 /* continue only if there is no rootfs */
2974 if (conf->rootfs.path)
2975 return -1;
2976 } else if (mounted == 1) {
2977 conf->tmp_umount_proc = true;
2978 }
2979
2980 return 0;
2981 }
2982
2983 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2984 {
2985 if (!lxc_conf->tmp_umount_proc)
2986 return;
2987
2988 (void)umount2("/proc", MNT_DETACH);
2989 lxc_conf->tmp_umount_proc = false;
2990 }
2991
2992 /* Walk /proc/mounts and change any shared entries to dependent mounts. */
2993 void turn_into_dependent_mounts(void)
2994 {
2995 __do_free char *line = NULL;
2996 __do_fclose FILE *f = NULL;
2997 __do_close int memfd = -EBADF, mntinfo_fd = -EBADF;
2998 size_t len = 0;
2999 ssize_t copied;
3000 int ret;
3001
3002 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
3003 if (mntinfo_fd < 0) {
3004 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
3005 return;
3006 }
3007
3008 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3009 if (memfd < 0) {
3010 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3011
3012 if (errno != ENOSYS) {
3013 SYSERROR("Failed to create temporary in-memory file");
3014 return;
3015 }
3016
3017 memfd = lxc_make_tmpfile(template, true);
3018 if (memfd < 0) {
3019 WARN("Failed to create temporary file");
3020 return;
3021 }
3022 }
3023
3024 copied = fd_to_fd(mntinfo_fd, memfd);
3025 if (copied < 0) {
3026 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
3027 return;
3028 }
3029
3030 ret = lseek(memfd, 0, SEEK_SET);
3031 if (ret < 0) {
3032 SYSERROR("Failed to reset file descriptor offset");
3033 return;
3034 }
3035
3036 f = fdopen(memfd, "re");
3037 if (!f) {
3038 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark all shared. Continuing");
3039 return;
3040 }
3041
3042 /*
3043 * After a successful fdopen() memfd will be closed when calling
3044 * fclose(f). Calling close(memfd) afterwards is undefined.
3045 */
3046 move_fd(memfd);
3047
3048 while (getline(&line, &len, f) != -1) {
3049 char *opts, *target;
3050
3051 target = get_field(line, 4);
3052 if (!target)
3053 continue;
3054
3055 opts = get_field(target, 2);
3056 if (!opts)
3057 continue;
3058
3059 null_endofword(opts);
3060 if (!strstr(opts, "shared"))
3061 continue;
3062
3063 null_endofword(target);
3064 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3065 if (ret < 0) {
3066 SYSERROR("Failed to recursively turn old root mount tree into dependent mount. Continuing...");
3067 continue;
3068 }
3069 TRACE("Recursively turned old root mount tree into dependent mount");
3070 }
3071 TRACE("Turned all mount table entries into dependent mount");
3072 }
3073
3074 static int lxc_execute_bind_init(struct lxc_handler *handler)
3075 {
3076 int ret;
3077 char *p;
3078 char path[PATH_MAX], destpath[PATH_MAX];
3079 struct lxc_conf *conf = handler->conf;
3080
3081 /* If init exists in the container, don't bind mount a static one */
3082 p = choose_init(conf->rootfs.mount);
3083 if (p) {
3084 __do_free char *old = p;
3085
3086 p = strdup(old + strlen(conf->rootfs.mount));
3087 if (!p)
3088 return -ENOMEM;
3089
3090 INFO("Found existing init at \"%s\"", p);
3091 goto out;
3092 }
3093
3094 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3095 if (ret < 0 || ret >= PATH_MAX)
3096 return -1;
3097
3098 if (!file_exists(path))
3099 return log_error_errno(-1, errno, "The file \"%s\" does not exist on host", path);
3100
3101 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
3102 if (ret < 0 || ret >= PATH_MAX)
3103 return -1;
3104
3105 if (!file_exists(destpath)) {
3106 ret = mknod(destpath, S_IFREG | 0000, 0);
3107 if (ret < 0 && errno != EEXIST)
3108 return log_error_errno(-1, errno, "Failed to create dummy \"%s\" file as bind mount target", destpath);
3109 }
3110
3111 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3112 if (ret < 0)
3113 return log_error_errno(-1, errno, "Failed to bind mount lxc.init.static into container");
3114
3115 p = strdup(destpath + strlen(conf->rootfs.mount));
3116 if (!p)
3117 return -ENOMEM;
3118
3119 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
3120 out:
3121 ((struct execute_args *)handler->data)->init_fd = -1;
3122 ((struct execute_args *)handler->data)->init_path = p;
3123 return 0;
3124 }
3125
3126 /* This does the work of remounting / if it is shared, calling the container
3127 * pre-mount hooks, and mounting the rootfs.
3128 */
3129 int lxc_setup_rootfs_prepare_root(struct lxc_conf *conf, const char *name,
3130 const char *lxcpath)
3131 {
3132 int ret;
3133
3134 if (conf->rootfs_setup) {
3135 const char *path = conf->rootfs.mount;
3136
3137 /* The rootfs was set up in another namespace. bind-mount it to
3138 * give us a mount in our own ns so we can pivot_root to it
3139 */
3140 ret = mount(path, path, "rootfs", MS_BIND, NULL);
3141 if (ret < 0)
3142 return log_error(-1, "Failed to bind mount container / onto itself");
3143
3144 conf->rootfs.mntpt_fd = openat(-EBADF, path, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH | O_NOCTTY);
3145 if (conf->rootfs.mntpt_fd < 0)
3146 return log_error_errno(-errno, errno, "Failed to open file descriptor for container rootfs");
3147
3148 return log_trace(0, "Bind mounted container / onto itself");
3149 }
3150
3151 turn_into_dependent_mounts();
3152
3153 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
3154 if (ret < 0)
3155 return log_error(-1, "Failed to run pre-mount hooks");
3156
3157 ret = lxc_mount_rootfs(conf);
3158 if (ret < 0)
3159 return log_error(-1, "Failed to setup rootfs for");
3160
3161 conf->rootfs_setup = true;
3162 return 0;
3163 }
3164
3165 static bool verify_start_hooks(struct lxc_conf *conf)
3166 {
3167 char path[PATH_MAX];
3168 struct lxc_list *it;
3169
3170 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
3171 int ret;
3172 char *hookname = it->elem;
3173
3174 ret = snprintf(path, PATH_MAX, "%s%s",
3175 conf->rootfs.path ? conf->rootfs.mount : "",
3176 hookname);
3177 if (ret < 0 || ret >= PATH_MAX)
3178 return false;
3179
3180 ret = access(path, X_OK);
3181 if (ret < 0)
3182 return log_error_errno(false, errno, "Start hook \"%s\" not found in container", hookname);
3183
3184 return true;
3185 }
3186
3187 return true;
3188 }
3189
3190 static bool execveat_supported(void)
3191 {
3192 execveat(-1, "", NULL, NULL, AT_EMPTY_PATH);
3193 if (errno == ENOSYS)
3194 return false;
3195
3196 return true;
3197 }
3198
3199 static int lxc_setup_boot_id(void)
3200 {
3201 int ret;
3202 const char *boot_id_path = "/proc/sys/kernel/random/boot_id";
3203 const char *mock_boot_id_path = "/dev/.lxc-boot-id";
3204 lxc_id128_t n;
3205
3206 if (access(boot_id_path, F_OK))
3207 return 0;
3208
3209 memset(&n, 0, sizeof(n));
3210 if (lxc_id128_randomize(&n)) {
3211 SYSERROR("Failed to generate random data for uuid");
3212 return -1;
3213 }
3214
3215 ret = lxc_id128_write(mock_boot_id_path, n);
3216 if (ret < 0) {
3217 SYSERROR("Failed to write uuid to %s", mock_boot_id_path);
3218 return -1;
3219 }
3220
3221 ret = chmod(mock_boot_id_path, 0444);
3222 if (ret < 0) {
3223 SYSERROR("Failed to chown %s", mock_boot_id_path);
3224 (void)unlink(mock_boot_id_path);
3225 return -1;
3226 }
3227
3228 ret = mount(mock_boot_id_path, boot_id_path, NULL, MS_BIND, NULL);
3229 if (ret < 0) {
3230 SYSERROR("Failed to mount %s to %s", mock_boot_id_path,
3231 boot_id_path);
3232 (void)unlink(mock_boot_id_path);
3233 return -1;
3234 }
3235
3236 ret = mount(NULL, boot_id_path, NULL,
3237 (MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOSUID | MS_NOEXEC |
3238 MS_NODEV),
3239 NULL);
3240 if (ret < 0) {
3241 SYSERROR("Failed to remount %s read-only", boot_id_path);
3242 (void)unlink(mock_boot_id_path);
3243 return -1;
3244 }
3245
3246 return 0;
3247 }
3248
3249 static int lxc_setup_keyring(struct lsm_ops *lsm_ops, const struct lxc_conf *conf)
3250 {
3251 key_serial_t keyring;
3252 int ret = 0;
3253
3254 if (conf->lsm_se_keyring_context)
3255 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_keyring_context);
3256 else if (conf->lsm_se_context)
3257 ret = lsm_ops->keyring_label_set(lsm_ops, conf->lsm_se_context);
3258 if (ret < 0)
3259 return log_error_errno(-1, errno, "Failed to set keyring context");
3260
3261 /*
3262 * Try to allocate a new session keyring for the container to prevent
3263 * information leaks.
3264 */
3265 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
3266 prctl_arg(0), prctl_arg(0), prctl_arg(0));
3267 if (keyring < 0) {
3268 switch (errno) {
3269 case ENOSYS:
3270 DEBUG("The keyctl() syscall is not supported or blocked");
3271 break;
3272 case EACCES:
3273 __fallthrough;
3274 case EPERM:
3275 DEBUG("Failed to access kernel keyring. Continuing...");
3276 break;
3277 default:
3278 SYSERROR("Failed to create kernel keyring");
3279 break;
3280 }
3281 }
3282
3283 return ret;
3284 }
3285
3286 int lxc_setup(struct lxc_handler *handler)
3287 {
3288 __do_close int pty_mnt_fd = -EBADF;
3289 int ret;
3290 const char *lxcpath = handler->lxcpath, *name = handler->name;
3291 struct lxc_conf *lxc_conf = handler->conf;
3292
3293 ret = lxc_setup_rootfs_prepare_root(lxc_conf, name, lxcpath);
3294 if (ret < 0)
3295 return log_error(-1, "Failed to setup rootfs");
3296
3297 if (handler->nsfd[LXC_NS_UTS] == -EBADF) {
3298 ret = setup_utsname(lxc_conf->utsname);
3299 if (ret < 0)
3300 return log_error(-1, "Failed to setup the utsname %s", name);
3301 }
3302
3303 if (!lxc_conf->keyring_disable_session) {
3304 ret = lxc_setup_keyring(handler->lsm_ops, lxc_conf);
3305 if (ret < 0)
3306 return log_error(-1, "Failed to setup container keyring");
3307 }
3308
3309 if (handler->ns_clone_flags & CLONE_NEWNET) {
3310 ret = lxc_setup_network_in_child_namespaces(lxc_conf,
3311 &lxc_conf->network);
3312 if (ret < 0)
3313 return log_error(-1, "Failed to setup network");
3314
3315 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
3316 if (ret < 0)
3317 return log_error(-1, "Failed to send network device names and ifindices to parent");
3318 }
3319
3320 if (wants_console(&lxc_conf->console)) {
3321 pty_mnt_fd = open_tree(-EBADF, lxc_conf->console.name,
3322 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
3323 if (pty_mnt_fd < 0)
3324 SYSTRACE("Failed to create detached mount for container's console \"%s\"",
3325 lxc_conf->console.name);
3326 else
3327 TRACE("Created detached mount for container's console \"%s\"",
3328 lxc_conf->console.name);
3329 }
3330
3331 if (lxc_conf->autodev > 0) {
3332 ret = mount_autodev(name, &lxc_conf->rootfs, lxc_conf->autodevtmpfssize, lxcpath);
3333 if (ret < 0)
3334 return log_error(-1, "Failed to mount \"/dev\"");
3335 }
3336
3337 lxc_conf->rootfs.dev_mntpt_fd = openat(lxc_conf->rootfs.mntpt_fd, "dev",
3338 O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_NOFOLLOW);
3339 if (lxc_conf->rootfs.dev_mntpt_fd < 0 && errno != ENOENT)
3340 return log_error_errno(-errno, errno, "Failed to open \"/dev\"");
3341
3342 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3343 * need to wait until other stuff has finished.
3344 */
3345 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
3346 if (ret < 0)
3347 return log_error(-1, "Failed to setup first automatic mounts");
3348
3349 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
3350 if (ret < 0)
3351 return log_error(-1, "Failed to setup mounts");
3352
3353 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3354 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3355 &lxc_conf->mount_list, name, lxcpath);
3356 if (ret < 0)
3357 return log_error(-1, "Failed to setup mount entries");
3358 }
3359
3360 if (lxc_conf->is_execute) {
3361 if (execveat_supported()) {
3362 int fd;
3363 char path[STRLITERALLEN(SBINDIR) + STRLITERALLEN("/init.lxc.static") + 1];
3364
3365 ret = snprintf(path, sizeof(path), SBINDIR "/init.lxc.static");
3366 if (ret < 0 || ret >= PATH_MAX)
3367 return log_error(-1, "Path to init.lxc.static too long");
3368
3369 fd = open(path, O_NOCTTY | O_NOFOLLOW | O_CLOEXEC | O_PATH);
3370 if (fd < 0)
3371 return log_error_errno(-1, errno, "Unable to open lxc.init.static");
3372
3373 ((struct execute_args *)handler->data)->init_fd = fd;
3374 ((struct execute_args *)handler->data)->init_path = NULL;
3375 } else {
3376 ret = lxc_execute_bind_init(handler);
3377 if (ret < 0)
3378 return log_error(-1, "Failed to bind-mount the lxc init system");
3379 }
3380 }
3381
3382 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3383 * mounted. It is guaranteed to be mounted now either through
3384 * automatically or via fstab entries.
3385 */
3386 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
3387 if (ret < 0)
3388 return log_error(-1, "Failed to setup remaining automatic mounts");
3389
3390 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
3391 if (ret < 0)
3392 return log_error(-1, "Failed to run mount hooks");
3393
3394 if (lxc_conf->autodev > 0) {
3395 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
3396 if (ret < 0)
3397 return log_error(-1, "Failed to run autodev hooks");
3398
3399 ret = lxc_fill_autodev(&lxc_conf->rootfs);
3400 if (ret < 0)
3401 return log_error(-1, "Failed to populate \"/dev\"");
3402 }
3403
3404 /* Make sure any start hooks are in the container */
3405 if (!verify_start_hooks(lxc_conf))
3406 return log_error(-1, "Failed to verify start hooks");
3407
3408 ret = lxc_create_tmp_proc_mount(lxc_conf);
3409 if (ret < 0)
3410 return log_error(-1, "Failed to \"/proc\" LSMs");
3411
3412 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
3413 lxc_conf->ttys.dir, pty_mnt_fd);
3414 if (ret < 0)
3415 return log_error(-1, "Failed to setup console");
3416
3417 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3418 if (ret < 0)
3419 return log_error(-1, "Failed to setup \"/dev\" symlinks");
3420
3421 ret = lxc_setup_rootfs_switch_root(&lxc_conf->rootfs);
3422 if (ret < 0)
3423 return log_error(-1, "Failed to pivot root into rootfs");
3424
3425 /* Setting the boot-id is best-effort for now. */
3426 if (lxc_conf->autodev > 0)
3427 (void)lxc_setup_boot_id();
3428
3429 ret = lxc_setup_devpts_child(handler);
3430 if (ret < 0)
3431 return log_error(-1, "Failed to setup new devpts instance");
3432
3433 ret = lxc_create_ttys(handler);
3434 if (ret < 0)
3435 return -1;
3436
3437 ret = setup_personality(lxc_conf->personality);
3438 if (ret < 0)
3439 return log_error(-1, "Failed to set personality");
3440
3441 /* Set sysctl value to a path under /proc/sys as determined from the
3442 * key. For e.g. net.ipv4.ip_forward translated to
3443 * /proc/sys/net/ipv4/ip_forward.
3444 */
3445 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3446 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
3447 if (ret < 0)
3448 return log_error(-1, "Failed to setup sysctl parameters");
3449 }
3450
3451 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3452 if (!lxc_list_empty(&lxc_conf->caps))
3453 return log_error(-1, "Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both");
3454
3455 if (dropcaps_except(&lxc_conf->keepcaps))
3456 return log_error(-1, "Failed to keep capabilities");
3457 } else if (setup_caps(&lxc_conf->caps)) {
3458 return log_error(-1, "Failed to drop capabilities");
3459 }
3460
3461 close_prot_errno_disarm(lxc_conf->rootfs.mntpt_fd)
3462 close_prot_errno_disarm(lxc_conf->rootfs.dev_mntpt_fd)
3463 NOTICE("The container \"%s\" is set up", name);
3464
3465 return 0;
3466 }
3467
3468 int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
3469 char *argv[])
3470 {
3471 struct lxc_list *it;
3472 int which;
3473
3474 for (which = 0; which < NUM_LXC_HOOKS; which ++) {
3475 if (strcmp(hookname, lxchook_names[which]) == 0)
3476 break;
3477 }
3478
3479 if (which >= NUM_LXC_HOOKS)
3480 return -1;
3481
3482 lxc_list_for_each (it, &conf->hooks[which]) {
3483 int ret;
3484 char *hook = it->elem;
3485
3486 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
3487 hookname, argv);
3488 if (ret < 0)
3489 return -1;
3490 }
3491
3492 return 0;
3493 }
3494
3495 int lxc_clear_config_caps(struct lxc_conf *c)
3496 {
3497 struct lxc_list *it, *next;
3498
3499 lxc_list_for_each_safe (it, &c->caps, next) {
3500 lxc_list_del(it);
3501 free(it->elem);
3502 free(it);
3503 }
3504
3505 return 0;
3506 }
3507
3508 static int lxc_free_idmap(struct lxc_list *id_map)
3509 {
3510 struct lxc_list *it, *next;
3511
3512 lxc_list_for_each_safe(it, id_map, next) {
3513 lxc_list_del(it);
3514 free(it->elem);
3515 free(it);
3516 }
3517
3518 return 0;
3519 }
3520
3521 static int __lxc_free_idmap(struct lxc_list *id_map)
3522 {
3523 lxc_free_idmap(id_map);
3524 free(id_map);
3525 return 0;
3526 }
3527 define_cleanup_function(struct lxc_list *, __lxc_free_idmap);
3528
3529 int lxc_clear_idmaps(struct lxc_conf *c)
3530 {
3531 return lxc_free_idmap(&c->id_map);
3532 }
3533
3534 int lxc_clear_config_keepcaps(struct lxc_conf *c)
3535 {
3536 struct lxc_list *it, *next;
3537
3538 lxc_list_for_each_safe (it, &c->keepcaps, next) {
3539 lxc_list_del(it);
3540 free(it->elem);
3541 free(it);
3542 }
3543
3544 return 0;
3545 }
3546
3547 int lxc_clear_namespace(struct lxc_conf *c)
3548 {
3549 int i;
3550 for (i = 0; i < LXC_NS_MAX; i++) {
3551 free(c->ns_share[i]);
3552 c->ns_share[i] = NULL;
3553 }
3554 return 0;
3555 }
3556
3557 int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
3558 {
3559 char *global_token, *namespaced_token;
3560 size_t namespaced_token_len;
3561 struct lxc_list *it, *next, *list;
3562 const char *k = key;
3563 bool all = false;
3564
3565 if (version == CGROUP2_SUPER_MAGIC) {
3566 global_token = "lxc.cgroup2";
3567 namespaced_token = "lxc.cgroup2.";
3568 namespaced_token_len = STRLITERALLEN("lxc.cgroup2.");
3569 list = &c->cgroup2;
3570 } else if (version == CGROUP_SUPER_MAGIC) {
3571 global_token = "lxc.cgroup";
3572 namespaced_token = "lxc.cgroup.";
3573 namespaced_token_len = STRLITERALLEN("lxc.cgroup.");
3574 list = &c->cgroup;
3575 } else {
3576 return -EINVAL;
3577 }
3578
3579 if (strcmp(key, global_token) == 0)
3580 all = true;
3581 else if (strncmp(key, namespaced_token, namespaced_token_len) == 0)
3582 k += namespaced_token_len;
3583 else
3584 return -EINVAL;
3585
3586 lxc_list_for_each_safe (it, list, next) {
3587 struct lxc_cgroup *cg = it->elem;
3588
3589 if (!all && strcmp(cg->subsystem, k) != 0)
3590 continue;
3591
3592 lxc_list_del(it);
3593 free(cg->subsystem);
3594 free(cg->value);
3595 free(cg);
3596 free(it);
3597 }
3598
3599 return 0;
3600 }
3601
3602 static void lxc_clear_devices(struct lxc_conf *conf)
3603 {
3604 struct lxc_list *list = &conf->devices;
3605 struct lxc_list *it, *next;
3606
3607 lxc_list_for_each_safe(it, list, next) {
3608 lxc_list_del(it);
3609 free(it);
3610 }
3611 }
3612
3613 int lxc_clear_limits(struct lxc_conf *c, const char *key)
3614 {
3615 struct lxc_list *it, *next;
3616 const char *k = NULL;
3617 bool all = false;
3618
3619 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
3620 all = true;
3621 else if (strncmp(key, "lxc.limit.", STRLITERALLEN("lxc.limit.")) == 0)
3622 k = key + STRLITERALLEN("lxc.limit.");
3623 else if (strncmp(key, "lxc.prlimit.", STRLITERALLEN("lxc.prlimit.")) == 0)
3624 k = key + STRLITERALLEN("lxc.prlimit.");
3625 else
3626 return -1;
3627
3628 lxc_list_for_each_safe (it, &c->limits, next) {
3629 struct lxc_limit *lim = it->elem;
3630
3631 if (!all && strcmp(lim->resource, k) != 0)
3632 continue;
3633
3634 lxc_list_del(it);
3635 free(lim->resource);
3636 free(lim);
3637 free(it);
3638 }
3639
3640 return 0;
3641 }
3642
3643 int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3644 {
3645 struct lxc_list *it, *next;
3646 const char *k = NULL;
3647 bool all = false;
3648
3649 if (strcmp(key, "lxc.sysctl") == 0)
3650 all = true;
3651 else if (strncmp(key, "lxc.sysctl.", STRLITERALLEN("lxc.sysctl.")) == 0)
3652 k = key + STRLITERALLEN("lxc.sysctl.");
3653 else
3654 return -1;
3655
3656 lxc_list_for_each_safe (it, &c->sysctls, next) {
3657 struct lxc_sysctl *elem = it->elem;
3658
3659 if (!all && strcmp(elem->key, k) != 0)
3660 continue;
3661
3662 lxc_list_del(it);
3663 free(elem->key);
3664 free(elem->value);
3665 free(elem);
3666 free(it);
3667 }
3668
3669 return 0;
3670 }
3671
3672 int lxc_clear_procs(struct lxc_conf *c, const char *key)
3673 {
3674 struct lxc_list *it, *next;
3675 const char *k = NULL;
3676 bool all = false;
3677
3678 if (strcmp(key, "lxc.proc") == 0)
3679 all = true;
3680 else if (strncmp(key, "lxc.proc.", STRLITERALLEN("lxc.proc.")) == 0)
3681 k = key + STRLITERALLEN("lxc.proc.");
3682 else
3683 return -1;
3684
3685 lxc_list_for_each_safe (it, &c->procs, next) {
3686 struct lxc_proc *proc = it->elem;
3687
3688 if (!all && strcmp(proc->filename, k) != 0)
3689 continue;
3690
3691 lxc_list_del(it);
3692 free(proc->filename);
3693 free(proc->value);
3694 free(proc);
3695 free(it);
3696 }
3697
3698 return 0;
3699 }
3700
3701 int lxc_clear_groups(struct lxc_conf *c)
3702 {
3703 struct lxc_list *it, *next;
3704
3705 lxc_list_for_each_safe (it, &c->groups, next) {
3706 lxc_list_del(it);
3707 free(it->elem);
3708 free(it);
3709 }
3710
3711 return 0;
3712 }
3713
3714 int lxc_clear_environment(struct lxc_conf *c)
3715 {
3716 struct lxc_list *it, *next;
3717
3718 lxc_list_for_each_safe (it, &c->environment, next) {
3719 lxc_list_del(it);
3720 free(it->elem);
3721 free(it);
3722 }
3723
3724 return 0;
3725 }
3726
3727 int lxc_clear_mount_entries(struct lxc_conf *c)
3728 {
3729 struct lxc_list *it, *next;
3730
3731 lxc_list_for_each_safe (it, &c->mount_list, next) {
3732 lxc_list_del(it);
3733 free(it->elem);
3734 free(it);
3735 }
3736
3737 return 0;
3738 }
3739
3740 int lxc_clear_automounts(struct lxc_conf *c)
3741 {
3742 c->auto_mounts = 0;
3743 return 0;
3744 }
3745
3746 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
3747 {
3748 int i;
3749 struct lxc_list *it, *next;
3750 const char *k = NULL;
3751 bool all = false, done = false;
3752
3753 if (strcmp(key, "lxc.hook") == 0)
3754 all = true;
3755 else if (strncmp(key, "lxc.hook.", STRLITERALLEN("lxc.hook.")) == 0)
3756 k = key + STRLITERALLEN("lxc.hook.");
3757 else
3758 return -1;
3759
3760 for (i = 0; i < NUM_LXC_HOOKS; i++) {
3761 if (all || strcmp(k, lxchook_names[i]) == 0) {
3762 lxc_list_for_each_safe (it, &c->hooks[i], next) {
3763 lxc_list_del(it);
3764 free(it->elem);
3765 free(it);
3766 }
3767
3768 done = true;
3769 }
3770 }
3771
3772 if (!done)
3773 return log_error(-1, "Invalid hook key: %s", key);
3774
3775 return 0;
3776 }
3777
3778 static inline void lxc_clear_aliens(struct lxc_conf *conf)
3779 {
3780 struct lxc_list *it, *next;
3781
3782 lxc_list_for_each_safe (it, &conf->aliens, next) {
3783 lxc_list_del(it);
3784 free(it->elem);
3785 free(it);
3786 }
3787 }
3788
3789 void lxc_clear_includes(struct lxc_conf *conf)
3790 {
3791 struct lxc_list *it, *next;
3792
3793 lxc_list_for_each_safe (it, &conf->includes, next) {
3794 lxc_list_del(it);
3795 free(it->elem);
3796 free(it);
3797 }
3798 }
3799
3800 int lxc_clear_apparmor_raw(struct lxc_conf *c)
3801 {
3802 struct lxc_list *it, *next;
3803
3804 lxc_list_for_each_safe (it, &c->lsm_aa_raw, next) {
3805 lxc_list_del(it);
3806 free(it->elem);
3807 free(it);
3808 }
3809
3810 return 0;
3811 }
3812
3813 void lxc_conf_free(struct lxc_conf *conf)
3814 {
3815 if (!conf)
3816 return;
3817
3818 if (current_config == conf)
3819 current_config = NULL;
3820 lxc_terminal_conf_free(&conf->console);
3821 free(conf->rootfs.mount);
3822 free(conf->rootfs.bdev_type);
3823 free(conf->rootfs.options);
3824 free(conf->rootfs.path);
3825 free(conf->rootfs.data);
3826 close_prot_errno_disarm(conf->rootfs.mntpt_fd);
3827 close_prot_errno_disarm(conf->rootfs.dev_mntpt_fd);
3828 free(conf->logfile);
3829 if (conf->logfd != -1)
3830 close(conf->logfd);
3831 free(conf->utsname);
3832 free(conf->ttys.dir);
3833 free(conf->ttys.tty_names);
3834 free(conf->fstab);
3835 free(conf->rcfile);
3836 free(conf->execute_cmd);
3837 free(conf->init_cmd);
3838 free(conf->init_cwd);
3839 free(conf->unexpanded_config);
3840 free(conf->syslog);
3841 lxc_free_networks(&conf->network);
3842 free(conf->lsm_aa_profile);
3843 free(conf->lsm_aa_profile_computed);
3844 free(conf->lsm_se_context);
3845 lxc_seccomp_free(&conf->seccomp);
3846 lxc_clear_config_caps(conf);
3847 lxc_clear_config_keepcaps(conf);
3848 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
3849 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
3850 lxc_clear_devices(conf);
3851 lxc_clear_hooks(conf, "lxc.hook");
3852 lxc_clear_mount_entries(conf);
3853 lxc_clear_idmaps(conf);
3854 lxc_clear_groups(conf);
3855 lxc_clear_includes(conf);
3856 lxc_clear_aliens(conf);
3857 lxc_clear_environment(conf);
3858 lxc_clear_limits(conf, "lxc.prlimit");
3859 lxc_clear_sysctls(conf, "lxc.sysctl");
3860 lxc_clear_procs(conf, "lxc.proc");
3861 lxc_clear_apparmor_raw(conf);
3862 lxc_clear_namespace(conf);
3863 free(conf->cgroup_meta.dir);
3864 free(conf->cgroup_meta.monitor_dir);
3865 free(conf->cgroup_meta.monitor_pivot_dir);
3866 free(conf->cgroup_meta.container_dir);
3867 free(conf->cgroup_meta.namespace_dir);
3868 free(conf->cgroup_meta.controllers);
3869 free(conf->shmount.path_host);
3870 free(conf->shmount.path_cont);
3871 free(conf);
3872 }
3873
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback executed in the child once the parent signalled it. */
	int (*fn)(void *);
	/* Optional name of the callback, only used for trace logging. */
	const char *fn_name;
	/* Opaque argument passed to fn. */
	void *arg;
	/* Synchronization pipe: the child reads one byte from p[0] and
	 * closes p[1].
	 */
	int p[2];
};
3880
3881 static int run_userns_fn(void *data)
3882 {
3883 struct userns_fn_data *d = data;
3884 int ret;
3885 char c;
3886
3887 close_prot_errno_disarm(d->p[1]);
3888
3889 /*
3890 * Wait for parent to finish establishing a new mapping in the user
3891 * namespace we are executing in.
3892 */
3893 ret = lxc_read_nointr(d->p[0], &c, 1);
3894 close_prot_errno_disarm(d->p[0]);
3895 if (ret != 1)
3896 return -1;
3897
3898 if (d->fn_name)
3899 TRACE("Calling function \"%s\"", d->fn_name);
3900
3901 /* Call function to run. */
3902 return d->fn(d->arg);
3903 }
3904
3905 static struct id_map *mapped_nsid_add(const struct lxc_conf *conf, unsigned id,
3906 enum idtype idtype)
3907 {
3908 const struct id_map *map;
3909 struct id_map *retmap;
3910
3911 map = find_mapped_nsid_entry(conf, id, idtype);
3912 if (!map)
3913 return NULL;
3914
3915 retmap = malloc(sizeof(*retmap));
3916 if (!retmap)
3917 return NULL;
3918
3919 memcpy(retmap, map, sizeof(*retmap));
3920 return retmap;
3921 }
3922
3923 static struct id_map *find_mapped_hostid_entry(const struct lxc_conf *conf,
3924 unsigned id, enum idtype idtype)
3925 {
3926 struct id_map *map;
3927 struct lxc_list *it;
3928 struct id_map *retmap = NULL;
3929
3930 lxc_list_for_each (it, &conf->id_map) {
3931 map = it->elem;
3932 if (map->idtype != idtype)
3933 continue;
3934
3935 if (id >= map->hostid && id < map->hostid + map->range) {
3936 retmap = map;
3937 break;
3938 }
3939 }
3940
3941 return retmap;
3942 }
3943
3944 /* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3945 * existing one or establish a new one.
3946 */
3947 static struct id_map *mapped_hostid_add(const struct lxc_conf *conf, uid_t id,
3948 enum idtype type)
3949 {
3950 __do_free struct id_map *entry = NULL;
3951 int hostid_mapped;
3952 struct id_map *tmp = NULL;
3953
3954 entry = malloc(sizeof(*entry));
3955 if (!entry)
3956 return NULL;
3957
3958 /* Reuse existing mapping. */
3959 tmp = find_mapped_hostid_entry(conf, id, type);
3960 if (tmp) {
3961 memcpy(entry, tmp, sizeof(*entry));
3962 } else {
3963 /* Find new mapping. */
3964 hostid_mapped = find_unmapped_nsid(conf, type);
3965 if (hostid_mapped < 0)
3966 return log_debug(NULL, "Failed to find free mapping for id %d", id);
3967
3968 entry->idtype = type;
3969 entry->nsid = hostid_mapped;
3970 entry->hostid = (unsigned long)id;
3971 entry->range = 1;
3972 }
3973
3974 return move_ptr(entry);
3975 }
3976
3977 static struct lxc_list *get_minimal_idmap(const struct lxc_conf *conf,
3978 uid_t *resuid, gid_t *resgid)
3979 {
3980 __do_free struct id_map *container_root_uid = NULL,
3981 *container_root_gid = NULL,
3982 *host_uid_map = NULL, *host_gid_map = NULL;
3983 __do_free struct lxc_list *idmap = NULL;
3984 uid_t euid, egid;
3985 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
3986 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
3987 struct lxc_list *tmplist = NULL;
3988
3989 /* Find container root mappings. */
3990 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
3991 if (!container_root_uid)
3992 return log_debug(NULL, "Failed to find mapping for namespace uid %d", 0);
3993 euid = geteuid();
3994 if (euid >= container_root_uid->hostid &&
3995 euid < (container_root_uid->hostid + container_root_uid->range))
3996 host_uid_map = move_ptr(container_root_uid);
3997
3998 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
3999 if (!container_root_gid)
4000 return log_debug(NULL, "Failed to find mapping for namespace gid %d", 0);
4001 egid = getegid();
4002 if (egid >= container_root_gid->hostid &&
4003 egid < (container_root_gid->hostid + container_root_gid->range))
4004 host_gid_map = move_ptr(container_root_gid);
4005
4006 /* Check whether the {g,u}id of the user has a mapping. */
4007 if (!host_uid_map)
4008 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
4009 if (!host_uid_map)
4010 return log_debug(NULL, "Failed to find mapping for uid %d", euid);
4011
4012 if (!host_gid_map)
4013 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
4014 if (!host_gid_map)
4015 return log_debug(NULL, "Failed to find mapping for gid %d", egid);
4016
4017 /* Allocate new {g,u}id map list. */
4018 idmap = malloc(sizeof(*idmap));
4019 if (!idmap)
4020 return NULL;
4021 lxc_list_init(idmap);
4022
4023 /* Add container root to the map. */
4024 tmplist = malloc(sizeof(*tmplist));
4025 if (!tmplist)
4026 return NULL;
4027 /* idmap will now keep track of that memory. */
4028 lxc_list_add_elem(tmplist, move_ptr(host_uid_map));
4029 lxc_list_add_tail(idmap, tmplist);
4030
4031 if (container_root_uid) {
4032 /* Add container root to the map. */
4033 tmplist = malloc(sizeof(*tmplist));
4034 if (!tmplist)
4035 return NULL;
4036 /* idmap will now keep track of that memory. */
4037 lxc_list_add_elem(tmplist, move_ptr(container_root_uid));
4038 lxc_list_add_tail(idmap, tmplist);
4039 }
4040
4041 tmplist = malloc(sizeof(*tmplist));
4042 if (!tmplist)
4043 return NULL;
4044 /* idmap will now keep track of that memory. */
4045 lxc_list_add_elem(tmplist, move_ptr(host_gid_map));
4046 lxc_list_add_tail(idmap, tmplist);
4047
4048 if (container_root_gid) {
4049 tmplist = malloc(sizeof(*tmplist));
4050 if (!tmplist)
4051 return NULL;
4052 /* idmap will now keep track of that memory. */
4053 lxc_list_add_elem(tmplist, move_ptr(container_root_gid));
4054 lxc_list_add_tail(idmap, tmplist);
4055 }
4056
4057 TRACE("Allocated minimal idmapping for ns uid %d and ns gid %d", nsuid, nsgid);
4058
4059 if (resuid)
4060 *resuid = nsuid;
4061 if (resgid)
4062 *resgid = nsgid;
4063 return move_ptr(idmap);
4064 }
4065
4066 /*
4067 * Run a function in a new user namespace.
4068 * The caller's euid/egid will be mapped if it is not already.
4069 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4070 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4071 * This means we require only to establish a mapping from:
4072 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4073 * - the container root -> some sub{g,u}id
4074 * The former we add, if the user did not specify a mapping. The latter we
4075 * retrieve from the container's configured {g,u}id mappings as it must have been
4076 * there to start the container in the first place.
4077 */
4078 int userns_exec_1(const struct lxc_conf *conf, int (*fn)(void *), void *data,
4079 const char *fn_name)
4080 {
4081 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
4082 int ret = -1, status = -1;
4083 char c = '1';
4084 struct userns_fn_data d = {
4085 .arg = data,
4086 .fn = fn,
4087 .fn_name = fn_name,
4088 };
4089 pid_t pid;
4090 int pipe_fds[2];
4091
4092 if (!conf)
4093 return -EINVAL;
4094
4095 idmap = get_minimal_idmap(conf, NULL, NULL);
4096 if (!idmap)
4097 return ret_errno(ENOENT);
4098
4099 ret = pipe2(pipe_fds, O_CLOEXEC);
4100 if (ret < 0)
4101 return -errno;
4102
4103 d.p[0] = pipe_fds[0];
4104 d.p[1] = pipe_fds[1];
4105
4106 /* Clone child in new user namespace. */
4107 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER, NULL);
4108 if (pid < 0) {
4109 ERROR("Failed to clone process in new user namespace");
4110 goto on_error;
4111 }
4112
4113 close_prot_errno_disarm(pipe_fds[0]);
4114
4115 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4116 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4117 struct id_map *map;
4118 struct lxc_list *it;
4119
4120 lxc_list_for_each(it, idmap) {
4121 map = it->elem;
4122 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4123 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4124 }
4125 }
4126
4127 /* Set up {g,u}id mapping for user namespace of child process. */
4128 ret = lxc_map_ids(idmap, pid);
4129 if (ret < 0) {
4130 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4131 goto on_error;
4132 }
4133
4134 /* Tell child to proceed. */
4135 if (lxc_write_nointr(pipe_fds[1], &c, 1) != 1) {
4136 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4137 goto on_error;
4138 }
4139
4140 on_error:
4141 close_prot_errno_disarm(pipe_fds[0]);
4142 close_prot_errno_disarm(pipe_fds[1]);
4143
4144 /* Wait for child to finish. */
4145 if (pid > 0)
4146 status = wait_for_pid(pid);
4147
4148 if (status < 0)
4149 ret = -1;
4150
4151 return ret;
4152 }
4153
4154 int userns_exec_minimal(const struct lxc_conf *conf,
4155 int (*fn_parent)(void *), void *fn_parent_data,
4156 int (*fn_child)(void *), void *fn_child_data)
4157 {
4158 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
4159 uid_t resuid = LXC_INVALID_UID;
4160 gid_t resgid = LXC_INVALID_GID;
4161 char c = '1';
4162 ssize_t ret;
4163 pid_t pid;
4164 int sock_fds[2];
4165
4166 if (!conf || !fn_child)
4167 return ret_errno(EINVAL);
4168
4169 idmap = get_minimal_idmap(conf, &resuid, &resgid);
4170 if (!idmap)
4171 return ret_errno(ENOENT);
4172
4173 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4174 if (ret < 0)
4175 return -errno;
4176
4177 pid = fork();
4178 if (pid < 0) {
4179 SYSERROR("Failed to create new process");
4180 goto on_error;
4181 }
4182
4183 if (pid == 0) {
4184 close_prot_errno_disarm(sock_fds[1]);
4185
4186 ret = unshare(CLONE_NEWUSER);
4187 if (ret < 0) {
4188 SYSERROR("Failed to unshare new user namespace");
4189 _exit(EXIT_FAILURE);
4190 }
4191
4192 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4193 if (ret != 1)
4194 _exit(EXIT_FAILURE);
4195
4196 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4197 if (ret != 1)
4198 _exit(EXIT_FAILURE);
4199
4200 close_prot_errno_disarm(sock_fds[0]);
4201
4202 if (!lxc_setgroups(0, NULL) && errno != EPERM)
4203 _exit(EXIT_FAILURE);
4204
4205 ret = setresgid(resgid, resgid, resgid);
4206 if (ret < 0) {
4207 SYSERROR("Failed to setresgid(%d, %d, %d)",
4208 resgid, resgid, resgid);
4209 _exit(EXIT_FAILURE);
4210 }
4211
4212 ret = setresuid(resuid, resuid, resuid);
4213 if (ret < 0) {
4214 SYSERROR("Failed to setresuid(%d, %d, %d)",
4215 resuid, resuid, resuid);
4216 _exit(EXIT_FAILURE);
4217 }
4218
4219 ret = fn_child(fn_child_data);
4220 if (ret) {
4221 SYSERROR("Running function in new user namespace failed");
4222 _exit(EXIT_FAILURE);
4223 }
4224
4225 _exit(EXIT_SUCCESS);
4226 }
4227
4228 close_prot_errno_disarm(sock_fds[0]);
4229
4230 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4231 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4232 struct id_map *map;
4233 struct lxc_list *it;
4234
4235 lxc_list_for_each(it, idmap) {
4236 map = it->elem;
4237 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4238 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4239 }
4240 }
4241
4242 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4243 if (ret != 1) {
4244 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4245 goto on_error;
4246 }
4247
4248 /* Set up {g,u}id mapping for user namespace of child process. */
4249 ret = lxc_map_ids(idmap, pid);
4250 if (ret < 0) {
4251 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4252 goto on_error;
4253 }
4254
4255 /* Tell child to proceed. */
4256 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4257 if (ret != 1) {
4258 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4259 goto on_error;
4260 }
4261
4262 if (fn_parent && fn_parent(fn_parent_data)) {
4263 SYSERROR("Running parent function failed");
4264 _exit(EXIT_FAILURE);
4265 }
4266
4267 on_error:
4268 close_prot_errno_disarm(sock_fds[0]);
4269 close_prot_errno_disarm(sock_fds[1]);
4270
4271 /* Wait for child to finish. */
4272 if (pid < 0)
4273 return -1;
4274
4275 return wait_for_pid(pid);
4276 }
4277
4278 int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4279 const char *fn_name)
4280 {
4281 pid_t pid;
4282 uid_t euid, egid;
4283 int p[2];
4284 struct id_map *map;
4285 struct lxc_list *cur;
4286 struct userns_fn_data d;
4287 int ret = -1;
4288 char c = '1';
4289 struct lxc_list *idmap = NULL, *tmplist = NULL;
4290 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4291 *host_uid_map = NULL, *host_gid_map = NULL;
4292
4293 if (!conf)
4294 return -EINVAL;
4295
4296 ret = pipe2(p, O_CLOEXEC);
4297 if (ret < 0) {
4298 SYSERROR("opening pipe");
4299 return -1;
4300 }
4301 d.fn = fn;
4302 d.fn_name = fn_name;
4303 d.arg = data;
4304 d.p[0] = p[0];
4305 d.p[1] = p[1];
4306
4307 /* Clone child in new user namespace. */
4308 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER, NULL);
4309 if (pid < 0) {
4310 ERROR("Failed to clone process in new user namespace");
4311 goto on_error;
4312 }
4313
4314 close(p[0]);
4315 p[0] = -1;
4316
4317 euid = geteuid();
4318 egid = getegid();
4319
4320 /* Allocate new {g,u}id map list. */
4321 idmap = malloc(sizeof(*idmap));
4322 if (!idmap)
4323 goto on_error;
4324 lxc_list_init(idmap);
4325
4326 /* Find container root. */
4327 lxc_list_for_each (cur, &conf->id_map) {
4328 struct id_map *tmpmap;
4329
4330 tmplist = malloc(sizeof(*tmplist));
4331 if (!tmplist)
4332 goto on_error;
4333
4334 tmpmap = malloc(sizeof(*tmpmap));
4335 if (!tmpmap) {
4336 free(tmplist);
4337 goto on_error;
4338 }
4339
4340 memset(tmpmap, 0, sizeof(*tmpmap));
4341 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4342 tmplist->elem = tmpmap;
4343
4344 lxc_list_add_tail(idmap, tmplist);
4345
4346 map = cur->elem;
4347
4348 if (map->idtype == ID_TYPE_UID)
4349 if (euid >= map->hostid && euid < map->hostid + map->range)
4350 host_uid_map = map;
4351
4352 if (map->idtype == ID_TYPE_GID)
4353 if (egid >= map->hostid && egid < map->hostid + map->range)
4354 host_gid_map = map;
4355
4356 if (map->nsid != 0)
4357 continue;
4358
4359 if (map->idtype == ID_TYPE_UID)
4360 if (container_root_uid == NULL)
4361 container_root_uid = map;
4362
4363 if (map->idtype == ID_TYPE_GID)
4364 if (container_root_gid == NULL)
4365 container_root_gid = map;
4366 }
4367
4368 if (!container_root_uid || !container_root_gid) {
4369 ERROR("No mapping for container root found");
4370 goto on_error;
4371 }
4372
4373 /* Check whether the {g,u}id of the user has a mapping. */
4374 if (!host_uid_map)
4375 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
4376 else
4377 host_uid_map = container_root_uid;
4378
4379 if (!host_gid_map)
4380 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
4381 else
4382 host_gid_map = container_root_gid;
4383
4384 if (!host_uid_map) {
4385 DEBUG("Failed to find mapping for uid %d", euid);
4386 goto on_error;
4387 }
4388
4389 if (!host_gid_map) {
4390 DEBUG("Failed to find mapping for gid %d", egid);
4391 goto on_error;
4392 }
4393
4394 if (host_uid_map && (host_uid_map != container_root_uid)) {
4395 /* Add container root to the map. */
4396 tmplist = malloc(sizeof(*tmplist));
4397 if (!tmplist)
4398 goto on_error;
4399 lxc_list_add_elem(tmplist, host_uid_map);
4400 lxc_list_add_tail(idmap, tmplist);
4401 }
4402 /* idmap will now keep track of that memory. */
4403 host_uid_map = NULL;
4404
4405 if (host_gid_map && (host_gid_map != container_root_gid)) {
4406 tmplist = malloc(sizeof(*tmplist));
4407 if (!tmplist)
4408 goto on_error;
4409 lxc_list_add_elem(tmplist, host_gid_map);
4410 lxc_list_add_tail(idmap, tmplist);
4411 }
4412 /* idmap will now keep track of that memory. */
4413 host_gid_map = NULL;
4414
4415 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4416 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4417 lxc_list_for_each (cur, idmap) {
4418 map = cur->elem;
4419 TRACE("establishing %cid mapping for \"%d\" in new "
4420 "user namespace: nsuid %lu - hostid %lu - range "
4421 "%lu",
4422 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4423 map->nsid, map->hostid, map->range);
4424 }
4425 }
4426
4427 /* Set up {g,u}id mapping for user namespace of child process. */
4428 ret = lxc_map_ids(idmap, pid);
4429 if (ret < 0) {
4430 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
4431 goto on_error;
4432 }
4433
4434 /* Tell child to proceed. */
4435 if (lxc_write_nointr(p[1], &c, 1) != 1) {
4436 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4437 goto on_error;
4438 }
4439
4440 on_error:
4441 if (p[0] != -1)
4442 close(p[0]);
4443 close(p[1]);
4444
4445 /* Wait for child to finish. */
4446 if (pid > 0)
4447 ret = wait_for_pid(pid);
4448
4449 if (idmap)
4450 __lxc_free_idmap(idmap);
4451
4452 if (host_uid_map && (host_uid_map != container_root_uid))
4453 free(host_uid_map);
4454 if (host_gid_map && (host_gid_map != container_root_gid))
4455 free(host_gid_map);
4456
4457 return ret;
4458 }
4459
4460 static int add_idmap_entry(struct lxc_list *idmap, enum idtype idtype,
4461 unsigned long nsid, unsigned long hostid,
4462 unsigned long range)
4463 {
4464 __do_free struct id_map *new_idmap = NULL;
4465 __do_free struct lxc_list *new_list = NULL;
4466
4467 new_idmap = zalloc(sizeof(*new_idmap));
4468 if (!new_idmap)
4469 return ret_errno(ENOMEM);
4470
4471 new_idmap->idtype = idtype;
4472 new_idmap->hostid = hostid;
4473 new_idmap->nsid = nsid;
4474 new_idmap->range = range;
4475
4476 new_list = zalloc(sizeof(*new_list));
4477 if (!new_list)
4478 return ret_errno(ENOMEM);
4479
4480 new_list->elem = move_ptr(new_idmap);
4481 lxc_list_add_tail(idmap, move_ptr(new_list));
4482
4483 INFO("Adding id map: type %c nsid %lu hostid %lu range %lu",
4484 idtype == ID_TYPE_UID ? 'u' : 'g', nsid, hostid, range);
4485 return 0;
4486 }
4487
4488 int userns_exec_mapped_root(const char *path, int path_fd,
4489 const struct lxc_conf *conf)
4490 {
4491 call_cleaner(__lxc_free_idmap) struct lxc_list *idmap = NULL;
4492 __do_close int fd = -EBADF;
4493 int target_fd = -EBADF;
4494 char c = '1';
4495 ssize_t ret;
4496 pid_t pid;
4497 int sock_fds[2];
4498 uid_t container_host_uid, hostuid;
4499 gid_t container_host_gid, hostgid;
4500 struct stat st;
4501
4502 if (!conf || (!path && path_fd < 0))
4503 return ret_errno(EINVAL);
4504
4505 if (!path)
4506 path = "(null)";
4507
4508 container_host_uid = get_mapped_rootid(conf, ID_TYPE_UID);
4509 if (!uid_valid(container_host_uid))
4510 return log_error(-1, "No uid mapping for container root");
4511
4512 container_host_gid = get_mapped_rootid(conf, ID_TYPE_GID);
4513 if (!gid_valid(container_host_gid))
4514 return log_error(-1, "No gid mapping for container root");
4515
4516 if (path_fd < 0) {
4517 fd = open(path, O_CLOEXEC | O_NOCTTY);
4518 if (fd < 0)
4519 return log_error_errno(-errno, errno, "Failed to open \"%s\"", path);
4520 target_fd = fd;
4521 } else {
4522 target_fd = path_fd;
4523 }
4524
4525 hostuid = geteuid();
4526 /* We are root so chown directly. */
4527 if (hostuid == 0) {
4528 ret = fchown(target_fd, container_host_uid, container_host_gid);
4529 if (ret)
4530 return log_error_errno(-errno, errno,
4531 "Failed to fchown(%d(%s), %d, %d)",
4532 target_fd, path, container_host_uid,
4533 container_host_gid);
4534 return log_trace(0, "Chowned %d(%s) to uid %d and %d", target_fd, path,
4535 container_host_uid, container_host_gid);
4536 }
4537
4538 /* The container's root host id matches */
4539 if (container_host_uid == hostuid)
4540 return log_info(0, "Container root id is mapped to our uid");
4541
4542 /* Get the current ids of our target. */
4543 ret = fstat(target_fd, &st);
4544 if (ret)
4545 return log_error_errno(-errno, errno, "Failed to stat \"%s\"", path);
4546
4547 hostgid = getegid();
4548 if (st.st_uid == hostuid && mapped_hostid(st.st_gid, conf, ID_TYPE_GID) < 0) {
4549 ret = fchown(target_fd, -1, hostgid);
4550 if (ret)
4551 return log_error_errno(-errno, errno,
4552 "Failed to fchown(%d(%s), -1, %d)",
4553 target_fd, path, hostgid);
4554 TRACE("Chowned %d(%s) to -1:%d", target_fd, path, hostgid);
4555 }
4556
4557 idmap = malloc(sizeof(*idmap));
4558 if (!idmap)
4559 return -ENOMEM;
4560 lxc_list_init(idmap);
4561
4562 /* "u:0:rootuid:1" */
4563 ret = add_idmap_entry(idmap, ID_TYPE_UID, 0, container_host_uid, 1);
4564 if (ret < 0)
4565 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4566
4567 /* "u:hostuid:hostuid:1" */
4568 ret = add_idmap_entry(idmap, ID_TYPE_UID, hostuid, hostuid, 1);
4569 if (ret < 0)
4570 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4571
4572 /* "g:0:rootgid:1" */
4573 ret = add_idmap_entry(idmap, ID_TYPE_GID, 0, container_host_gid, 1);
4574 if (ret < 0)
4575 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4576
4577 /* "g:hostgid:hostgid:1" */
4578 ret = add_idmap_entry(idmap, ID_TYPE_GID, hostgid, hostgid, 1);
4579 if (ret < 0)
4580 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4581
4582 if (hostgid != st.st_gid) {
4583 /* "g:pathgid:rootgid+pathgid:1" */
4584 ret = add_idmap_entry(idmap, ID_TYPE_GID, st.st_gid,
4585 container_host_gid + (gid_t)st.st_gid, 1);
4586 if (ret < 0)
4587 return log_error_errno(ret, -ret, "Failed to add idmap entry");
4588 }
4589
4590 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sock_fds);
4591 if (ret < 0)
4592 return -errno;
4593
4594 pid = fork();
4595 if (pid < 0) {
4596 SYSERROR("Failed to create new process");
4597 goto on_error;
4598 }
4599
4600 if (pid == 0) {
4601 close_prot_errno_disarm(sock_fds[1]);
4602
4603 ret = unshare(CLONE_NEWUSER);
4604 if (ret < 0) {
4605 SYSERROR("Failed to unshare new user namespace");
4606 _exit(EXIT_FAILURE);
4607 }
4608
4609 ret = lxc_write_nointr(sock_fds[0], &c, 1);
4610 if (ret != 1)
4611 _exit(EXIT_FAILURE);
4612
4613 ret = lxc_read_nointr(sock_fds[0], &c, 1);
4614 if (ret != 1)
4615 _exit(EXIT_FAILURE);
4616
4617 close_prot_errno_disarm(sock_fds[0]);
4618
4619 if (!lxc_switch_uid_gid(0, 0))
4620 _exit(EXIT_FAILURE);
4621
4622 if (!lxc_setgroups(0, NULL))
4623 _exit(EXIT_FAILURE);
4624
4625 ret = fchown(target_fd, 0, st.st_gid);
4626 if (ret) {
4627 SYSERROR("Failed to chown %d(%s) to 0:%d", target_fd, path, st.st_gid);
4628 _exit(EXIT_FAILURE);
4629 }
4630
4631 TRACE("Chowned %d(%s) to 0:%d", target_fd, path, st.st_gid);
4632 _exit(EXIT_SUCCESS);
4633 }
4634
4635 close_prot_errno_disarm(sock_fds[0]);
4636
4637 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4638 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4639 struct id_map *map;
4640 struct lxc_list *it;
4641
4642 lxc_list_for_each(it, idmap) {
4643 map = it->elem;
4644 TRACE("Establishing %cid mapping for \"%d\" in new user namespace: nsuid %lu - hostid %lu - range %lu",
4645 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid, map->nsid, map->hostid, map->range);
4646 }
4647 }
4648
4649 ret = lxc_read_nointr(sock_fds[1], &c, 1);
4650 if (ret != 1) {
4651 SYSERROR("Failed waiting for child process %d\" to tell us to proceed", pid);
4652 goto on_error;
4653 }
4654
4655 /* Set up {g,u}id mapping for user namespace of child process. */
4656 ret = lxc_map_ids(idmap, pid);
4657 if (ret < 0) {
4658 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4659 goto on_error;
4660 }
4661
4662 /* Tell child to proceed. */
4663 ret = lxc_write_nointr(sock_fds[1], &c, 1);
4664 if (ret != 1) {
4665 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4666 goto on_error;
4667 }
4668
4669 on_error:
4670 close_prot_errno_disarm(sock_fds[0]);
4671 close_prot_errno_disarm(sock_fds[1]);
4672
4673 /* Wait for child to finish. */
4674 if (pid < 0)
4675 return -1;
4676
4677 return wait_for_pid(pid);
4678 }
4679
4680 /* not thread-safe, do not use from api without first forking */
4681 static char *getuname(void)
4682 {
4683 __do_free char *buf = NULL;
4684 struct passwd pwent;
4685 struct passwd *pwentp = NULL;
4686 size_t bufsize;
4687 int ret;
4688
4689 bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
4690 if (bufsize == -1)
4691 bufsize = 1024;
4692
4693 buf = malloc(bufsize);
4694 if (!buf)
4695 return NULL;
4696
4697 ret = getpwuid_r(geteuid(), &pwent, buf, bufsize, &pwentp);
4698 if (!pwentp) {
4699 if (ret == 0)
4700 WARN("Could not find matched password record.");
4701
4702 return log_error(NULL, "Failed to get password record - %u", geteuid());
4703 }
4704
4705 return strdup(pwent.pw_name);
4706 }
4707
4708 /* not thread-safe, do not use from api without first forking */
4709 static char *getgname(void)
4710 {
4711 __do_free char *buf = NULL;
4712 struct group grent;
4713 struct group *grentp = NULL;
4714 size_t bufsize;
4715 int ret;
4716
4717 bufsize = sysconf(_SC_GETGR_R_SIZE_MAX);
4718 if (bufsize == -1)
4719 bufsize = 1024;
4720
4721 buf = malloc(bufsize);
4722 if (!buf)
4723 return NULL;
4724
4725 ret = getgrgid_r(getegid(), &grent, buf, bufsize, &grentp);
4726 if (!grentp) {
4727 if (ret == 0)
4728 WARN("Could not find matched group record");
4729
4730 return log_error(NULL, "Failed to get group record - %u", getegid());
4731 }
4732
4733 return strdup(grent.gr_name);
4734 }
4735
4736 /* not thread-safe, do not use from api without first forking */
4737 void suggest_default_idmap(void)
4738 {
4739 __do_free char *gname = NULL, *line = NULL, *uname = NULL;
4740 __do_fclose FILE *subuid_f = NULL, *subgid_f = NULL;
4741 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4742 size_t len = 0;
4743
4744 uname = getuname();
4745 if (!uname)
4746 return;
4747
4748 gname = getgname();
4749 if (!gname)
4750 return;
4751
4752 subuid_f = fopen(subuidfile, "re");
4753 if (!subuid_f) {
4754 ERROR("Your system is not configured with subuids");
4755 return;
4756 }
4757
4758 while (getline(&line, &len, subuid_f) != -1) {
4759 char *p, *p2;
4760 size_t no_newline = 0;
4761
4762 p = strchr(line, ':');
4763 if (*line == '#')
4764 continue;
4765 if (!p)
4766 continue;
4767 *p = '\0';
4768 p++;
4769
4770 if (strcmp(line, uname))
4771 continue;
4772
4773 p2 = strchr(p, ':');
4774 if (!p2)
4775 continue;
4776 *p2 = '\0';
4777 p2++;
4778 if (!*p2)
4779 continue;
4780 no_newline = strcspn(p2, "\n");
4781 p2[no_newline] = '\0';
4782
4783 if (lxc_safe_uint(p, &uid) < 0)
4784 WARN("Could not parse UID");
4785 if (lxc_safe_uint(p2, &urange) < 0)
4786 WARN("Could not parse UID range");
4787 }
4788
4789 subgid_f = fopen(subgidfile, "re");
4790 if (!subgid_f) {
4791 ERROR("Your system is not configured with subgids");
4792 return;
4793 }
4794
4795 while (getline(&line, &len, subgid_f) != -1) {
4796 char *p, *p2;
4797 size_t no_newline = 0;
4798
4799 p = strchr(line, ':');
4800 if (*line == '#')
4801 continue;
4802 if (!p)
4803 continue;
4804 *p = '\0';
4805 p++;
4806
4807 if (strcmp(line, uname))
4808 continue;
4809
4810 p2 = strchr(p, ':');
4811 if (!p2)
4812 continue;
4813 *p2 = '\0';
4814 p2++;
4815 if (!*p2)
4816 continue;
4817 no_newline = strcspn(p2, "\n");
4818 p2[no_newline] = '\0';
4819
4820 if (lxc_safe_uint(p, &gid) < 0)
4821 WARN("Could not parse GID");
4822 if (lxc_safe_uint(p2, &grange) < 0)
4823 WARN("Could not parse GID range");
4824 }
4825
4826 if (!urange || !grange) {
4827 ERROR("You do not have subuids or subgids allocated");
4828 ERROR("Unprivileged containers require subuids and subgids");
4829 return;
4830 }
4831
4832 ERROR("You must either run as root, or define uid mappings");
4833 ERROR("To pass uid mappings to lxc-create, you could create");
4834 ERROR("~/.config/lxc/default.conf:");
4835 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4836 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4837 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
4838 }
4839
4840 static void free_cgroup_settings(struct lxc_list *result)
4841 {
4842 struct lxc_list *iterator, *next;
4843
4844 lxc_list_for_each_safe (iterator, result, next) {
4845 lxc_list_del(iterator);
4846 free_disarm(iterator);
4847 }
4848 free_disarm(result);
4849 }
4850
4851 /* Return the list of cgroup_settings sorted according to the following rules
4852 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4853 */
4854 struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
4855 {
4856 struct lxc_list *result;
4857 struct lxc_cgroup *cg = NULL;
4858 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
4859
4860 result = malloc(sizeof(*result));
4861 if (!result)
4862 return NULL;
4863 lxc_list_init(result);
4864
4865 /* Iterate over the cgroup settings and copy them to the output list. */
4866 lxc_list_for_each (it, cgroup_settings) {
4867 item = malloc(sizeof(*item));
4868 if (!item) {
4869 free_cgroup_settings(result);
4870 return NULL;
4871 }
4872
4873 item->elem = it->elem;
4874 cg = it->elem;
4875 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4876 /* Store the memsw_limit location */
4877 memsw_limit = item;
4878 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4879 memsw_limit != NULL) {
4880 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4881 * before lxc.cgroup.memory.limit_in_bytes, swap these
4882 * two items */
4883 item->elem = memsw_limit->elem;
4884 memsw_limit->elem = it->elem;
4885 }
4886 lxc_list_add_tail(result, item);
4887 }
4888
4889 return result;
4890 }