]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
Merge pull request #1860 from kilobyte/master
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "caps.h" /* for lxc_caps_last_cap() */
77 #include "cgroup.h"
78 #include "conf.h"
79 #include "confile_utils.h"
80 #include "error.h"
81 #include "log.h"
82 #include "lxclock.h"
83 #include "lxcseccomp.h"
84 #include "namespace.h"
85 #include "network.h"
86 #include "parse.h"
87 #include "storage.h"
88 #include "storage/aufs.h"
89 #include "storage/overlay.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
/* Fallback definitions for constants that older system headers may lack.
 * Each value is only defined when the headers included above did not
 * already provide a macro of the same name.
 */
#if HAVE_LIBCAP
#if !defined(CAP_SETFCAP)
#define CAP_SETFCAP 31
#endif

#if !defined(CAP_MAC_OVERRIDE)
#define CAP_MAC_OVERRIDE 32
#endif

#if !defined(CAP_MAC_ADMIN)
#define CAP_MAC_ADMIN 33
#endif
#endif /* HAVE_LIBCAP */

#if !defined(PR_CAPBSET_DROP)
#define PR_CAPBSET_DROP 24
#endif

#if !defined(LO_FLAGS_AUTOCLEAR)
#define LO_FLAGS_AUTOCLEAR 4
#endif

#if !defined(CAP_SETUID)
#define CAP_SETUID 7
#endif

#if !defined(CAP_SETGID)
#define CAP_SETGID 6
#endif

/* Needed for the cgroup automount checks, regardless of whether
 * linux/capability.h has been included or not.
 */
#if !defined(CAP_SYS_ADMIN)
#define CAP_SYS_ADMIN 21
#endif
147
/* pivot_root() shim: when the C library does not provide the function, call
 * the raw syscall if its number is known, otherwise fail with ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
162
/* sethostname() shim: only compiled when the C library lacks the function.
 * Uses the raw syscall if its number is known, otherwise fails with ENOSYS.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
/* Mount flags that older <sys/mount.h> headers may not define. */
#if !defined(MS_PRIVATE)
#define MS_PRIVATE (1<<18)
#endif

#if !defined(MS_LAZYTIME)
#define MS_LAZYTIME (1<<25)
#endif

/* Flags for memfd_create(), for headers that do not define them. */
#if !defined(MFD_CLOEXEC)
#define MFD_CLOEXEC 0x0001U
#endif

#if !defined(MFD_ALLOW_SEALING)
#define MFD_ALLOW_SEALING 0x0002U
#endif
192
/* memfd_create() shim.
 *
 * When the C library already provides memfd_create() it is only declared.
 * Otherwise a static wrapper around the raw syscall is defined; if the kernel
 * headers do not define __NR_memfd_create, a per-architecture fallback number
 * is used. On architectures without a known number the wrapper fails with
 * ENOSYS.
 */
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else
#ifndef __NR_memfd_create
#if defined __i386__
#define __NR_memfd_create 356
#elif defined __x86_64__
#define __NR_memfd_create 319
#elif defined __arm__
#define __NR_memfd_create 385
#elif defined __aarch64__
#define __NR_memfd_create 279
#elif defined __s390__
#define __NR_memfd_create 350
#elif defined __powerpc__
#define __NR_memfd_create 360
#elif defined __sparc__
#define __NR_memfd_create 348
#elif defined __blackfin__
#define __NR_memfd_create 390
#elif defined __ia64__
#define __NR_memfd_create 1340
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32
#define __NR_memfd_create 4354
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32
#define __NR_memfd_create 6318
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64
#define __NR_memfd_create 5314
#endif
#endif
#endif /* !__NR_memfd_create */

static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#endif /* HAVE_MEMFD_CREATE */
236
237 char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy",
240 "start-host"};
241
/* One entry of the mount option table: maps an option name to a mount flag. */
struct mount_opt {
	char *name; /* option name, e.g. "ro" or "nosuid" */
	int clear;  /* 1 for names that negate `flag` (e.g. "rw" vs MS_RDONLY);
		     * presumably the consumer then clears the flag - confirm
		     * against the option parser */
	int flag;   /* MS_* bit(s) the option refers to */
};

/* One entry of the capability table: maps a capability name to its number. */
struct caps_opt {
	char *name; /* lower-case name without the "CAP_" prefix */
	int value;  /* CAP_* constant */
};

/* One entry of the resource limit table: maps a limit name to its number. */
struct limit_opt {
	char *name; /* lower-case name without the "RLIMIT_" prefix */
	int value;  /* RLIMIT_* constant */
};
257
/*
 * The lxc_conf of the container currently being worked on in an API call.
 * This is used in the error calls. It is thread-local when the toolchain
 * supports TLS and a plain global otherwise.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
268
269 /* Declare this here, since we don't want to reshuffle the whole file. */
270 static int in_caplist(int cap, struct lxc_list *caps);
271
272 static struct mount_opt mount_opt[] = {
273 { "async", 1, MS_SYNCHRONOUS },
274 { "atime", 1, MS_NOATIME },
275 { "bind", 0, MS_BIND },
276 { "defaults", 0, 0 },
277 { "dev", 1, MS_NODEV },
278 { "diratime", 1, MS_NODIRATIME },
279 { "dirsync", 0, MS_DIRSYNC },
280 { "exec", 1, MS_NOEXEC },
281 { "lazytime", 0, MS_LAZYTIME },
282 { "mand", 0, MS_MANDLOCK },
283 { "noatime", 0, MS_NOATIME },
284 { "nodev", 0, MS_NODEV },
285 { "nodiratime", 0, MS_NODIRATIME },
286 { "noexec", 0, MS_NOEXEC },
287 { "nomand", 1, MS_MANDLOCK },
288 { "norelatime", 1, MS_RELATIME },
289 { "nostrictatime", 1, MS_STRICTATIME },
290 { "nosuid", 0, MS_NOSUID },
291 { "rbind", 0, MS_BIND|MS_REC },
292 { "relatime", 0, MS_RELATIME },
293 { "remount", 0, MS_REMOUNT },
294 { "ro", 0, MS_RDONLY },
295 { "rw", 1, MS_RDONLY },
296 { "strictatime", 0, MS_STRICTATIME },
297 { "suid", 1, MS_NOSUID },
298 { "sync", 0, MS_SYNCHRONOUS },
299 { NULL, 0, 0 },
300 };
301
302 #if HAVE_LIBCAP
303 static struct caps_opt caps_opt[] = {
304 { "chown", CAP_CHOWN },
305 { "dac_override", CAP_DAC_OVERRIDE },
306 { "dac_read_search", CAP_DAC_READ_SEARCH },
307 { "fowner", CAP_FOWNER },
308 { "fsetid", CAP_FSETID },
309 { "kill", CAP_KILL },
310 { "setgid", CAP_SETGID },
311 { "setuid", CAP_SETUID },
312 { "setpcap", CAP_SETPCAP },
313 { "linux_immutable", CAP_LINUX_IMMUTABLE },
314 { "net_bind_service", CAP_NET_BIND_SERVICE },
315 { "net_broadcast", CAP_NET_BROADCAST },
316 { "net_admin", CAP_NET_ADMIN },
317 { "net_raw", CAP_NET_RAW },
318 { "ipc_lock", CAP_IPC_LOCK },
319 { "ipc_owner", CAP_IPC_OWNER },
320 { "sys_module", CAP_SYS_MODULE },
321 { "sys_rawio", CAP_SYS_RAWIO },
322 { "sys_chroot", CAP_SYS_CHROOT },
323 { "sys_ptrace", CAP_SYS_PTRACE },
324 { "sys_pacct", CAP_SYS_PACCT },
325 { "sys_admin", CAP_SYS_ADMIN },
326 { "sys_boot", CAP_SYS_BOOT },
327 { "sys_nice", CAP_SYS_NICE },
328 { "sys_resource", CAP_SYS_RESOURCE },
329 { "sys_time", CAP_SYS_TIME },
330 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
331 { "mknod", CAP_MKNOD },
332 { "lease", CAP_LEASE },
333 #ifdef CAP_AUDIT_READ
334 { "audit_read", CAP_AUDIT_READ },
335 #endif
336 #ifdef CAP_AUDIT_WRITE
337 { "audit_write", CAP_AUDIT_WRITE },
338 #endif
339 #ifdef CAP_AUDIT_CONTROL
340 { "audit_control", CAP_AUDIT_CONTROL },
341 #endif
342 { "setfcap", CAP_SETFCAP },
343 { "mac_override", CAP_MAC_OVERRIDE },
344 { "mac_admin", CAP_MAC_ADMIN },
345 #ifdef CAP_SYSLOG
346 { "syslog", CAP_SYSLOG },
347 #endif
348 #ifdef CAP_WAKE_ALARM
349 { "wake_alarm", CAP_WAKE_ALARM },
350 #endif
351 #ifdef CAP_BLOCK_SUSPEND
352 { "block_suspend", CAP_BLOCK_SUSPEND },
353 #endif
354 };
355 #else
356 static struct caps_opt caps_opt[] = {};
357 #endif
358
359 static struct limit_opt limit_opt[] = {
360 #ifdef RLIMIT_AS
361 { "as", RLIMIT_AS },
362 #endif
363 #ifdef RLIMIT_CORE
364 { "core", RLIMIT_CORE },
365 #endif
366 #ifdef RLIMIT_CPU
367 { "cpu", RLIMIT_CPU },
368 #endif
369 #ifdef RLIMIT_DATA
370 { "data", RLIMIT_DATA },
371 #endif
372 #ifdef RLIMIT_FSIZE
373 { "fsize", RLIMIT_FSIZE },
374 #endif
375 #ifdef RLIMIT_LOCKS
376 { "locks", RLIMIT_LOCKS },
377 #endif
378 #ifdef RLIMIT_MEMLOCK
379 { "memlock", RLIMIT_MEMLOCK },
380 #endif
381 #ifdef RLIMIT_MSGQUEUE
382 { "msgqueue", RLIMIT_MSGQUEUE },
383 #endif
384 #ifdef RLIMIT_NICE
385 { "nice", RLIMIT_NICE },
386 #endif
387 #ifdef RLIMIT_NOFILE
388 { "nofile", RLIMIT_NOFILE },
389 #endif
390 #ifdef RLIMIT_NPROC
391 { "nproc", RLIMIT_NPROC },
392 #endif
393 #ifdef RLIMIT_RSS
394 { "rss", RLIMIT_RSS },
395 #endif
396 #ifdef RLIMIT_RTPRIO
397 { "rtprio", RLIMIT_RTPRIO },
398 #endif
399 #ifdef RLIMIT_RTTIME
400 { "rttime", RLIMIT_RTTIME },
401 #endif
402 #ifdef RLIMIT_SIGPENDING
403 { "sigpending", RLIMIT_SIGPENDING },
404 #endif
405 #ifdef RLIMIT_STACK
406 { "stack", RLIMIT_STACK },
407 #endif
408 };
409
410 static int run_buffer(char *buffer)
411 {
412 struct lxc_popen_FILE *f;
413 char *output;
414 int ret;
415
416 f = lxc_popen(buffer);
417 if (!f) {
418 SYSERROR("Failed to popen() %s.", buffer);
419 return -1;
420 }
421
422 output = malloc(LXC_LOG_BUFFER_SIZE);
423 if (!output) {
424 ERROR("Failed to allocate memory for %s.", buffer);
425 lxc_pclose(f);
426 return -1;
427 }
428
429 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
430 DEBUG("Script %s with output: %s.", buffer, output);
431
432 free(output);
433
434 ret = lxc_pclose(f);
435 if (ret == -1) {
436 SYSERROR("Script exited with error.");
437 return -1;
438 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
439 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
440 return -1;
441 } else if (WIFSIGNALED(ret)) {
442 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
443 return -1;
444 }
445
446 return 0;
447 }
448
/* Build the command line "<script> <name> <section> <hook> [args...]" and run
 * it via run_buffer().
 *
 * @argsin is an optional NULL-terminated vector of extra arguments; @lxcpath
 * is accepted but not used here. Returns the result of run_buffer(), or -1
 * when the command line cannot be assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	char **arg;
	char *cmd;
	int used;
	size_t cmdlen = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Every extra argument is preceded by one blank. */
	for (arg = argsin; arg && *arg; arg++)
		cmdlen += strlen(*arg) + 1;

	/* Three blanks between the four fixed words plus the terminating
	 * '\0' byte.
	 */
	cmdlen += strlen(hook) + 1;
	cmdlen += strlen(script) + strlen(name) + strlen(section) + 3;

	if (cmdlen > INT_MAX)
		return -1;

	cmd = alloca(cmdlen);
	if (cmd == NULL) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(cmd, cmdlen, "%s %s %s %s", script, name, section, hook);
	if (used < 0 || (size_t)used >= cmdlen) {
		ERROR("Script name too long.");
		return -1;
	}

	for (arg = argsin; arg && *arg; arg++) {
		int room = cmdlen - used;
		int written = snprintf(cmd + used, room, " %s", *arg);

		if (written < 0 || written >= room) {
			ERROR("Script args too long.");
			return -1;
		}
		used += written;
	}

	return run_buffer(cmd);
}
499
/* Build the command line "<script> <name> <section> [args...]" and run it via
 * run_buffer().
 *
 * The variadic arguments are extra "char *" words appended to the command
 * line; the list MUST be terminated by a NULL pointer.
 *
 * Returns the result of run_buffer(), or -1 when the command line cannot be
 * assembled.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: every extra argument needs its length plus one blank. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	/* Two blanks between the three fixed words plus the terminating '\0'. */
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = alloca(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() must be matched by va_end(), also
			 * on the error path.
			 */
			va_end(ap);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	return run_buffer(buffer);
}
550
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and return
 * the fd so the caller can keep it open for the duration of the container
 * run, to prevent the container from marking the underlying fs readonly on
 * shutdown. The file is unlinked immediately so that no name pollution
 * happens.
 * return -1 on error.
 * return -2 if nothing needed to be pinned.
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	/* A path that cannot be resolved is treated as "nothing to pin". */
	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	/* Only directory-backed root filesystems are pinned. */
	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open fd keeps the file alive; the name is no longer needed. */
	(void)unlink(absrootfspin);
	return fd;
}
593
/*
 * If we are asking to remount something, make sure that any NOSUID, NODEV,
 * RDONLY and NOEXEC restrictions already in effect on it are honored.
 *
 * @s is looked at first and @d is used when @s is NULL. @flags is returned
 * unchanged when it does not contain MS_REMOUNT, when both paths are NULL,
 * when statvfs() fails, or when statvfs support is compiled out.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *target = s ? s : d;
	struct statvfs vfs;
	size_t i;

	if (!(flags & MS_REMOUNT) || !target)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	/* Carry over every restrictive flag the mount currently has. */
	for (i = 0; i < sizeof(sticky) / sizeof(sticky[0]); i++) {
		if (vfs.f_flag & sticky[i])
			flags |= sticky[i];
	}

	return flags;
#else
	return flags;
#endif
}
630
631 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
632 {
633 int r;
634 int i;
635 static struct {
636 int match_mask;
637 int match_flag;
638 const char *source;
639 const char *destination;
640 const char *fstype;
641 unsigned long flags;
642 const char *options;
643 } default_mounts[] = {
644 /* Read-only bind-mounting... In older kernels, doing that required
645 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
646 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
647 * kernel 2.6.26 onwards. However, this apparently does not work on
648 * kernel 3.8. Unfortunately, on that very same kernel, doing the
649 * same trick as above doesn't seem to work either, there one needs
650 * to ALSO specify MS_BIND for the remount, otherwise the entire
651 * fs is remounted read-only or the mount fails because it's busy...
652 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
653 * 2.6.32...
654 */
655 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
656 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
657 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
658 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
659 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
660 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
661 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
662 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
663 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
664 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
665 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
666 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
667 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
668 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
669 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
670 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
671 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
672 { 0, 0, NULL, NULL, NULL, 0, NULL }
673 };
674
675 for (i = 0; default_mounts[i].match_mask; i++) {
676 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
677 char *source = NULL;
678 char *destination = NULL;
679 int saved_errno;
680 unsigned long mflags;
681
682 if (default_mounts[i].source) {
683 /* will act like strdup if %r is not present */
684 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
685 if (!source) {
686 SYSERROR("memory allocation error");
687 return -1;
688 }
689 }
690 if (!default_mounts[i].destination) {
691 ERROR("BUG: auto mounts destination %d was NULL", i);
692 free(source);
693 return -1;
694 }
695 /* will act like strdup if %r is not present */
696 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
697 if (!destination) {
698 saved_errno = errno;
699 SYSERROR("memory allocation error");
700 free(source);
701 errno = saved_errno;
702 return -1;
703 }
704 mflags = add_required_remount_flags(source, destination,
705 default_mounts[i].flags);
706 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
707 saved_errno = errno;
708 if (r < 0 && errno == ENOENT) {
709 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
710 r = 0;
711 }
712 else if (r < 0)
713 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
714
715 free(source);
716 free(destination);
717 if (r < 0) {
718 errno = saved_errno;
719 return -1;
720 }
721 }
722 }
723
724 if (flags & LXC_AUTO_CGROUP_MASK) {
725 int cg_flags;
726
727 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
728 /* If the type of cgroup mount was not specified, it depends on the
729 * container's capabilities as to what makes sense: if we have
730 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
731 * anyway, so we may as well default to read-write; then the admin
732 * will not be given a false sense of security. (And if they really
733 * want mixed r/o r/w, then they can explicitly specify :mixed.)
734 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
735 * :mixed, because then the container can't remount it read-write. */
736 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
737 int has_sys_admin = 0;
738
739 if (!lxc_list_empty(&conf->keepcaps))
740 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
741 else
742 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
743
744 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
745 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
746 else
747 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
748 }
749
750 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
751 SYSERROR("error mounting /sys/fs/cgroup");
752 return -1;
753 }
754 }
755
756 return 0;
757 }
758
/* Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to set up and counts as success.
 * Returns 0 on success, -1 when sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);

	return 0;
}
773
/* A symlink to create below the container's /dev directory. */
struct dev_symlinks {
	const char *oldpath; /* link target */
	const char *name;    /* link name, relative to /dev */
};

/* Symlinks from /dev to the process's file descriptor entries in /proc. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
785
786 static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
787 {
788 char path[MAXPATHLEN];
789 int ret,i;
790 struct stat s;
791
792
793 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
794 const struct dev_symlinks *d = &dev_symlinks[i];
795 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
796 if (ret < 0 || ret >= MAXPATHLEN)
797 return -1;
798
799 /*
800 * Stat the path first. If we don't get an error
801 * accept it as is and don't try to create it
802 */
803 if (!stat(path, &s)) {
804 continue;
805 }
806
807 ret = symlink(d->oldpath, path);
808
809 if (ret && errno != EEXIST) {
810 if ( errno == EROFS ) {
811 WARN("Warning: Read Only file system while creating %s", path);
812 } else {
813 SYSERROR("Error creating %s", path);
814 return -1;
815 }
816 }
817 }
818 return 0;
819 }
820
/* Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) a new string "container_ttys=<name>" is
 * allocated. Later calls grow the string and append " <name>". Returns false
 * when memory cannot be allocated; in that case *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefixlen = sizeof(prefix) - 1;
	const size_t namelen = strlen(name);
	size_t oldlen;
	char *list;

	if (*pp == NULL) {
		/* sizeof(prefix) already accounts for the trailing '\0'. */
		list = malloc(sizeof(prefix) + namelen);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefixlen);
		memcpy(list + prefixlen, name, namelen + 1);
		*pp = list;
		return true;
	}

	oldlen = strlen(*pp);
	/* Room for the separating blank and the trailing '\0'. */
	list = realloc(*pp, oldlen + namelen + 2);
	if (list == NULL)
		return false;

	list[oldlen] = ' ';
	memcpy(list + oldlen + 1, name, namelen + 1);
	*pp = list;
	return true;
}
841
842 static int lxc_setup_ttys(struct lxc_conf *conf)
843 {
844 int i, ret;
845 const struct lxc_tty_info *tty_info = &conf->tty_info;
846 char *ttydir = conf->ttydir;
847 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
848
849 if (!conf->rootfs.path)
850 return 0;
851
852 for (i = 0; i < tty_info->nbtty; i++) {
853 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
854
855 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
856 if (ret < 0 || (size_t)ret >= sizeof(path))
857 return -1;
858
859 if (ttydir) {
860 /* create dev/lxc/tty%d" */
861 ret = snprintf(lxcpath, sizeof(lxcpath),
862 "/dev/%s/tty%d", ttydir, i + 1);
863 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
864 return -1;
865
866 ret = creat(lxcpath, 0660);
867 if (ret < 0 && errno != EEXIST) {
868 SYSERROR("Failed to create \"%s\"", lxcpath);
869 return -1;
870 }
871 if (ret >= 0)
872 close(ret);
873
874 ret = unlink(path);
875 if (ret < 0 && errno != ENOENT) {
876 SYSERROR("Failed to unlink \"%s\"", path);
877 return -1;
878 }
879
880 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
881 if (ret < 0) {
882 WARN("Failed to bind mount \"%s\" onto \"%s\"",
883 pty_info->name, path);
884 continue;
885 }
886 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
887 path);
888
889 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
890 ttydir, i + 1);
891 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
892 return -1;
893
894 ret = symlink(lxcpath, path);
895 if (ret < 0) {
896 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
897 path, lxcpath);
898 return -1;
899 }
900 } else {
901 /* If we populated /dev, then we need to create
902 * /dev/ttyN
903 */
904 ret = access(path, F_OK);
905 if (ret < 0) {
906 ret = creat(path, 0660);
907 if (ret < 0) {
908 SYSERROR("Failed to create \"%s\"", path);
909 /* this isn't fatal, continue */
910 } else {
911 close(ret);
912 }
913 }
914
915 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
916 if (ret < 0) {
917 SYSERROR("Failed to mount '%s'->'%s'", pty_info->name, path);
918 continue;
919 }
920
921 DEBUG("Bind mounted \"%s\" onto \"%s\"", pty_info->name,
922 path);
923 }
924
925 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
926 ERROR("Error setting up container_ttys string");
927 return -1;
928 }
929 }
930
931 INFO("Finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
932 return 0;
933 }
934
935 int lxc_allocate_ttys(const char *name, struct lxc_conf *conf)
936 {
937 struct lxc_tty_info *tty_info = &conf->tty_info;
938 int i, ret;
939
940 /* no tty in the configuration */
941 if (!conf->tty)
942 return 0;
943
944 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
945 if (!tty_info->pty_info) {
946 SYSERROR("failed to allocate struct *pty_info");
947 return -ENOMEM;
948 }
949
950 for (i = 0; i < conf->tty; i++) {
951 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
952
953 process_lock();
954 ret = openpty(&pty_info->master, &pty_info->slave,
955 pty_info->name, NULL, NULL);
956 process_unlock();
957 if (ret) {
958 SYSERROR("failed to create pty device number %d", i);
959 tty_info->nbtty = i;
960 lxc_delete_tty(tty_info);
961 return -ENOTTY;
962 }
963
964 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
965 pty_info->name, pty_info->master, pty_info->slave);
966
967 /* Prevent leaking the file descriptors to the container */
968 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
969 if (ret < 0)
970 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
971 "pty device \"%s\": %s",
972 pty_info->master, pty_info->name, strerror(errno));
973
974 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
975 if (ret < 0)
976 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
977 "pty device \"%s\": %s",
978 pty_info->slave, pty_info->name, strerror(errno));
979
980 pty_info->busy = 0;
981 }
982
983 tty_info->nbtty = conf->tty;
984
985 INFO("finished allocating %d pts devices", conf->tty);
986 return 0;
987 }
988
989 void lxc_delete_tty(struct lxc_tty_info *tty_info)
990 {
991 int i;
992
993 for (i = 0; i < tty_info->nbtty; i++) {
994 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
995
996 close(pty_info->master);
997 close(pty_info->slave);
998 }
999
1000 free(tty_info->pty_info);
1001 tty_info->pty_info = NULL;
1002 tty_info->nbtty = 0;
1003 }
1004
1005 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1006 {
1007 int i;
1008 struct lxc_conf *conf = handler->conf;
1009 struct lxc_tty_info *tty_info = &conf->tty_info;
1010 int sock = handler->data_sock[0];
1011 int ret = -1;
1012
1013 if (!conf->tty)
1014 return 0;
1015
1016 for (i = 0; i < conf->tty; i++) {
1017 int ttyfds[2];
1018 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
1019
1020 ttyfds[0] = pty_info->master;
1021 ttyfds[1] = pty_info->slave;
1022
1023 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1024 if (ret < 0)
1025 break;
1026
1027 TRACE("Send pty \"%s\" with master fd %d and slave fd %d to "
1028 "parent", pty_info->name, pty_info->master, pty_info->slave);
1029 }
1030
1031 if (ret < 0)
1032 ERROR("Failed to send %d ttys to parent: %s", conf->tty,
1033 strerror(errno));
1034 else
1035 TRACE("Sent %d ttys to parent", conf->tty);
1036
1037 return ret;
1038 }
1039
1040 static int lxc_create_ttys(struct lxc_handler *handler)
1041 {
1042 int ret = -1;
1043 struct lxc_conf *conf = handler->conf;
1044
1045 ret = lxc_allocate_ttys(handler->name, conf);
1046 if (ret < 0) {
1047 ERROR("Failed to allocate ttys");
1048 goto on_error;
1049 }
1050
1051 ret = lxc_send_ttys_to_parent(handler);
1052 if (ret < 0) {
1053 ERROR("Failed to send ttys to parent");
1054 goto on_error;
1055 }
1056
1057 if (!conf->is_execute) {
1058 ret = lxc_setup_ttys(conf);
1059 if (ret < 0) {
1060 ERROR("Failed to setup ttys");
1061 goto on_error;
1062 }
1063 }
1064
1065 if (conf->pty_names) {
1066 ret = setenv("container_ttys", conf->pty_names, 1);
1067 if (ret < 0)
1068 SYSERROR("Failed to set \"container_ttys=%s\"", conf->pty_names);
1069 }
1070
1071 ret = 0;
1072
1073 on_error:
1074 lxc_delete_tty(&conf->tty_info);
1075
1076 return ret;
1077 }
1078
1079 static int setup_rootfs_pivot_root(const char *rootfs)
1080 {
1081 int oldroot = -1, newroot = -1;
1082
1083 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1084 if (oldroot < 0) {
1085 SYSERROR("Error opening old-/ for fchdir");
1086 return -1;
1087 }
1088 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1089 if (newroot < 0) {
1090 SYSERROR("Error opening new-/ for fchdir");
1091 goto fail;
1092 }
1093
1094 /* change into new root fs */
1095 if (fchdir(newroot)) {
1096 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
1097 goto fail;
1098 }
1099
1100 /* pivot_root into our new root fs */
1101 if (pivot_root(".", ".")) {
1102 SYSERROR("pivot_root syscall failed");
1103 goto fail;
1104 }
1105
1106 /*
1107 * at this point the old-root is mounted on top of our new-root
1108 * To unmounted it we must not be chdir'd into it, so escape back
1109 * to old-root
1110 */
1111 if (fchdir(oldroot) < 0) {
1112 SYSERROR("Error entering oldroot");
1113 goto fail;
1114 }
1115 if (umount2(".", MNT_DETACH) < 0) {
1116 SYSERROR("Error detaching old root");
1117 goto fail;
1118 }
1119
1120 if (fchdir(newroot) < 0) {
1121 SYSERROR("Error re-entering newroot");
1122 goto fail;
1123 }
1124
1125 close(oldroot);
1126 close(newroot);
1127
1128 DEBUG("pivot_root syscall to '%s' successful", rootfs);
1129
1130 return 0;
1131
1132 fail:
1133 if (oldroot != -1)
1134 close(oldroot);
1135 if (newroot != -1)
1136 close(newroot);
1137 return -1;
1138 }
1139
1140 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1141 * error, log it but don't fail yet.
1142 */
1143 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1144 const char *lxcpath)
1145 {
1146 int ret;
1147 size_t clen;
1148 char *path;
1149
1150 INFO("Preparing \"/dev\"");
1151
1152 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1153 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1154 path = alloca(clen);
1155
1156 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1157 if (ret < 0 || (size_t)ret >= clen)
1158 return -1;
1159
1160 if (!dir_exists(path)) {
1161 WARN("\"/dev\" directory does not exist. Proceeding without "
1162 "autodev being set up");
1163 return 0;
1164 }
1165
1166 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1167 rootfs->path ? rootfs->mount : NULL);
1168 if (ret < 0) {
1169 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1170 return -1;
1171 }
1172 INFO("Mounted tmpfs on \"%s\"", path);
1173
1174 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1175 if (ret < 0 || (size_t)ret >= clen)
1176 return -1;
1177
1178 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1179 * If not, then create it and exit if that fails...
1180 */
1181 if (!dir_exists(path)) {
1182 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1183 if (ret < 0) {
1184 SYSERROR("Failed to create directory \"%s\"", path);
1185 return -1;
1186 }
1187 }
1188
1189 INFO("Prepared \"/dev\"");
1190 return 0;
1191 }
1192
/* Description of one device node that is populated under "/dev". */
struct lxc_devs {
	const char *name; /* node name relative to "/dev" */
	mode_t mode;      /* file type and permission bits for mknod() */
	int maj;          /* device major number */
	int min;          /* device minor number */
};

/* Device nodes created by lxc_fill_autodev(). Every entry is a character
 * device requested with rwx permission bits for user, group and other.
 */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1208
1209 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1210 {
1211 int ret;
1212 char path[MAXPATHLEN];
1213 int i;
1214 mode_t cmask;
1215
1216 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1217 rootfs->path ? rootfs->mount : "");
1218 if (ret < 0 || ret >= MAXPATHLEN)
1219 return -1;
1220
1221 /* ignore, just don't try to fill in */
1222 if (!dir_exists(path))
1223 return 0;
1224
1225 INFO("Populating \"/dev\"");
1226
1227 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1228 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1229 const struct lxc_devs *d = &lxc_devs[i];
1230
1231 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1232 rootfs->path ? rootfs->mount : "", d->name);
1233 if (ret < 0 || ret >= MAXPATHLEN)
1234 return -1;
1235
1236 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1237 if (ret < 0) {
1238 FILE *pathfile;
1239 char hostpath[MAXPATHLEN];
1240
1241 if (errno == EEXIST) {
1242 DEBUG("\"%s\" device already existed", path);
1243 continue;
1244 }
1245
1246 /* Unprivileged containers cannot create devices, so
1247 * bind mount the device from the host.
1248 */
1249 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1250 if (ret < 0 || ret >= MAXPATHLEN)
1251 return -1;
1252
1253 pathfile = fopen(path, "wb");
1254 if (!pathfile) {
1255 SYSERROR("Failed to create file \"%s\"", path);
1256 return -1;
1257 }
1258 fclose(pathfile);
1259
1260 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1261 rootfs->path ? rootfs->mount : NULL);
1262 if (ret < 0) {
1263 SYSERROR("Failed to bind mount \"%s\" from "
1264 "host into container",
1265 d->name);
1266 return -1;
1267 }
1268 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1269 path);
1270 } else {
1271 DEBUG("Created device node \"%s\"", path);
1272 }
1273 }
1274 umask(cmask);
1275
1276 INFO("Populated \"/dev\"");
1277 return 0;
1278 }
1279
1280 static int lxc_setup_rootfs(struct lxc_conf *conf)
1281 {
1282 int ret;
1283 struct lxc_storage *bdev;
1284 const struct lxc_rootfs *rootfs;
1285
1286 rootfs = &conf->rootfs;
1287 if (!rootfs->path) {
1288 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1289 SYSERROR("Failed to make / rslave.");
1290 return -1;
1291 }
1292 return 0;
1293 }
1294
1295 if (access(rootfs->mount, F_OK)) {
1296 SYSERROR("Failed to access to \"%s\". Check it is present.",
1297 rootfs->mount);
1298 return -1;
1299 }
1300
1301 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1302 if (!bdev) {
1303 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1304 rootfs->path, rootfs->mount,
1305 rootfs->options ? rootfs->options : "(null)");
1306 return -1;
1307 }
1308
1309 ret = bdev->ops->mount(bdev);
1310 storage_put(bdev);
1311 if (ret < 0) {
1312 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1313 rootfs->path, rootfs->mount,
1314 rootfs->options ? rootfs->options : "(null)");
1315 return -1;
1316 }
1317
1318 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1319 rootfs->path, rootfs->mount,
1320 rootfs->options ? rootfs->options : "(null)");
1321
1322 return 0;
1323 }
1324
1325 int prepare_ramfs_root(char *root)
1326 {
1327 char buf[LXC_LINELEN], *p;
1328 char nroot[PATH_MAX];
1329 FILE *f;
1330 int i;
1331 char *p2;
1332
1333 if (realpath(root, nroot) == NULL)
1334 return -errno;
1335
1336 if (chdir("/") == -1)
1337 return -errno;
1338
1339 /*
1340 * We could use here MS_MOVE, but in userns this mount is
1341 * locked and can't be moved.
1342 */
1343 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
1344 SYSERROR("Failed to move %s into /", root);
1345 return -errno;
1346 }
1347
1348 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
1349 SYSERROR("Failed to make . rprivate");
1350 return -errno;
1351 }
1352
1353 /*
1354 * The following code cleans up inhereted mounts which are not
1355 * required for CT.
1356 *
1357 * The mountinfo file shows not all mounts, if a few points have been
1358 * unmounted between read operations from the mountinfo. So we need to
1359 * read mountinfo a few times.
1360 *
1361 * This loop can be skipped if a container uses unserns, because all
1362 * inherited mounts are locked and we should live with all this trash.
1363 */
1364 while (1) {
1365 int progress = 0;
1366
1367 f = fopen("./proc/self/mountinfo", "r");
1368 if (!f) {
1369 SYSERROR("Unable to open /proc/self/mountinfo");
1370 return -1;
1371 }
1372 while (fgets(buf, LXC_LINELEN, f)) {
1373 for (p = buf, i=0; p && i < 4; i++)
1374 p = strchr(p+1, ' ');
1375 if (!p)
1376 continue;
1377 p2 = strchr(p+1, ' ');
1378 if (!p2)
1379 continue;
1380
1381 *p2 = '\0';
1382 *p = '.';
1383
1384 if (strcmp(p + 1, "/") == 0)
1385 continue;
1386 if (strcmp(p + 1, "/proc") == 0)
1387 continue;
1388
1389 if (umount2(p, MNT_DETACH) == 0)
1390 progress++;
1391 }
1392 fclose(f);
1393 if (!progress)
1394 break;
1395 }
1396
1397 /* This also can be skipped if a container uses unserns */
1398 umount2("./proc", MNT_DETACH);
1399
1400 /* It is weird, but chdir("..") moves us in a new root */
1401 if (chdir("..") == -1) {
1402 SYSERROR("Unable to change working directory");
1403 return -1;
1404 }
1405
1406 if (chroot(".") == -1) {
1407 SYSERROR("Unable to chroot");
1408 return -1;
1409 }
1410
1411 return 0;
1412 }
1413
1414 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1415 {
1416 if (!rootfs->path) {
1417 DEBUG("container does not have a rootfs, so not doing pivot root");
1418 return 0;
1419 }
1420
1421 if (detect_ramfs_rootfs()) {
1422 DEBUG("detected that container is on ramfs");
1423 if (prepare_ramfs_root(rootfs->mount)) {
1424 ERROR("failed to prepare minimal ramfs root");
1425 return -1;
1426 }
1427
1428 DEBUG("prepared ramfs root for container");
1429 return 0;
1430 }
1431
1432 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1433 ERROR("failed to pivot root");
1434 return -1;
1435 }
1436
1437 DEBUG("finished pivot root");
1438 return 0;
1439 }
1440
/* Mount a new devpts instance on "/dev/pts", limited to @num_pts ptys, and
 * make "/dev/ptmx" refer to the instance's own ptmx node.
 *
 * "/dev/ptmx" is preferably a bind mount of "/dev/pts/ptmx"; if the bind
 * mount fails a symlink is created instead.
 *
 * Returns 0 on success or when @num_pts is 0 (nothing to do), -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *default_devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char devpts_mntopts[256];
	int rc, fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts devices are requested");
		return 0;
	}

	rc = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%d",
		      default_devpts_mntopts, num_pts);
	if (rc < 0 || (size_t)rc >= sizeof(devpts_mntopts))
		return -1;

	/* An existing "/dev/pts/ptmx" indicates an old devpts instance:
	 * unmount it.
	 */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint for the new instance exists. */
	rc = mkdir("/dev/pts", 0755);
	if (rc < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new devpts instance. */
	rc = mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts);
	if (rc < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Get rid of a pre-existing "/dev/ptmx". */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty "/dev/ptmx" file serves as the bind mount target. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount "/dev/pts/ptmx" onto "/dev/ptmx". */
	rc = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
	if (rc == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* The bind mount failed: carry on and try a symlink instead. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file created above would be in the way of the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink "/dev/ptmx" -> "/dev/pts/ptmx". */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1531
/* Apply the execution domain @persona via personality().
 *
 * A value of -1 means nothing is to be set. When the build has no
 * <sys/personality.h> support the function does nothing.
 *
 * Returns 0 on success (or when there was nothing to do), -1 on failure.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1548
1549 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1550 const struct lxc_console *console)
1551 {
1552 char path[MAXPATHLEN];
1553 int ret, fd;
1554
1555 if (console->path && !strcmp(console->path, "none"))
1556 return 0;
1557
1558 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1559 if (ret < 0 || (size_t)ret >= sizeof(path))
1560 return -1;
1561
1562 /* When we are asked to setup a console we remove any previous
1563 * /dev/console bind-mounts.
1564 */
1565 if (file_exists(path)) {
1566 ret = lxc_unstack_mountpoint(path, false);
1567 if (ret < 0) {
1568 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1569 return -ret;
1570 } else {
1571 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1572 }
1573
1574 ret = unlink(path);
1575 if (ret < 0) {
1576 SYSERROR("error unlinking %s", path);
1577 return -errno;
1578 }
1579 }
1580
1581 /* For unprivileged containers autodev or automounts will already have
1582 * taken care of creating /dev/console.
1583 */
1584 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1585 if (fd < 0) {
1586 if (errno != EEXIST) {
1587 SYSERROR("failed to create console");
1588 return -errno;
1589 }
1590 } else {
1591 close(fd);
1592 }
1593
1594 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1595 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1596 return -errno;
1597 }
1598
1599 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1600 ERROR("failed to mount '%s' on '%s'", console->name, path);
1601 return -1;
1602 }
1603
1604 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1605 return 0;
1606 }
1607
1608 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1609 const struct lxc_console *console,
1610 char *ttydir)
1611 {
1612 int ret;
1613 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1614
1615 /* create rootfs/dev/<ttydir> directory */
1616 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1617 if (ret < 0 || (size_t)ret >= sizeof(path))
1618 return -1;
1619
1620 ret = mkdir(path, 0755);
1621 if (ret && errno != EEXIST) {
1622 SYSERROR("failed with errno %d to create %s", errno, path);
1623 return -errno;
1624 }
1625 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1626
1627 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1628 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1629 return -1;
1630
1631 ret = creat(lxcpath, 0660);
1632 if (ret == -1 && errno != EEXIST) {
1633 SYSERROR("error %d creating %s", errno, lxcpath);
1634 return -errno;
1635 }
1636 if (ret >= 0)
1637 close(ret);
1638
1639 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1640 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1641 return -1;
1642
1643 /* When we are asked to setup a console we remove any previous
1644 * /dev/console bind-mounts.
1645 */
1646 if (console->path && !strcmp(console->path, "none")) {
1647 struct stat st;
1648 ret = stat(path, &st);
1649 if (ret < 0) {
1650 if (errno == ENOENT)
1651 return 0;
1652 SYSERROR("failed stat() \"%s\"", path);
1653 return -errno;
1654 }
1655
1656 /* /dev/console must be character device with major number 5 and
1657 * minor number 1. If not, give benefit of the doubt and assume
1658 * the user has mounted something else right there on purpose.
1659 */
1660 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1661 return 0;
1662
1663 /* In case the user requested a bind-mount for /dev/console and
1664 * requests a ttydir we move the mount to the
1665 * /dev/<ttydir/console.
1666 * Note, we only move the uppermost mount and clear all other
1667 * mounts underneath for safety.
1668 * If it is a character device created via mknod() we simply
1669 * rename it.
1670 */
1671 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1672 if (ret < 0) {
1673 if (errno != EINVAL) {
1674 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1675 return -errno;
1676 }
1677 /* path was not a mountpoint */
1678 ret = rename(path, lxcpath);
1679 if (ret < 0) {
1680 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1681 return -errno;
1682 }
1683 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1684 } else {
1685 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1686 }
1687
1688 /* Clear all remaining bind-mounts. */
1689 ret = lxc_unstack_mountpoint(path, false);
1690 if (ret < 0) {
1691 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1692 return -ret;
1693 } else {
1694 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1695 }
1696 } else {
1697 if (file_exists(path)) {
1698 ret = lxc_unstack_mountpoint(path, false);
1699 if (ret < 0) {
1700 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1701 return -ret;
1702 } else {
1703 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1704 }
1705 }
1706
1707 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1708 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1709 return -1;
1710 }
1711 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1712 }
1713
1714 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1715 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1716 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1717 return -1;
1718
1719 ret = unlink(path);
1720 if (ret && errno != ENOENT) {
1721 SYSERROR("error unlinking %s", path);
1722 return -errno;
1723 }
1724
1725 ret = symlink(lxcpath, path);
1726 if (ret < 0) {
1727 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1728 return -1;
1729 }
1730
1731 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1732 return 0;
1733 }
1734
1735 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1736 const struct lxc_console *console, char *ttydir)
1737 {
1738 /* We don't have a rootfs, /dev/console will be shared. */
1739 if (!rootfs->path) {
1740 DEBUG("/dev/console will be shared with the host");
1741 return 0;
1742 }
1743
1744 if (!ttydir)
1745 return lxc_setup_dev_console(rootfs, console);
1746
1747 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1748 }
1749
1750 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1751 {
1752 struct mount_opt *mo;
1753
1754 /* If opt is found in mount_opt, set or clear flags.
1755 * Otherwise append it to data. */
1756
1757 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1758 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1759 if (mo->clear)
1760 *flags &= ~mo->flag;
1761 else
1762 *flags |= mo->flag;
1763 return;
1764 }
1765 }
1766
1767 if (strlen(*data))
1768 strcat(*data, ",");
1769 strcat(*data, opt);
1770 }
1771
/* Split the comma separated option string @mntopts into mount flags and a
 * data string.
 *
 * *@mntflags receives the flags collected by parse_mntopt(). *@mntdata
 * receives a newly allocated string holding the remaining options, or NULL
 * when there are none; the caller is responsible for freeing it.
 *
 * Returns 0 on success (including @mntopts == NULL), -1 if memory could not
 * be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the input. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1810
/* Terminate @word in place at its first space or tab character. A string
 * that contains neither is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1817
/*
 * Skip @nfields whitespace separated fields in @src.
 *
 * A field ends at a single space or tab. The returned pointer refers to the
 * character following the @nfields-th separator, or to the terminating NUL
 * byte if @src holds fewer separators than that.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		/* Jump to the next separator (or the end of the string). */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;

		/* Step over the separator itself. */
		cur++;
		skipped++;
	}

	return cur;
}
1835
/* Mount @fsname on @target and, for bind or remount requests, follow up with
 * a remount so that the requested flags are applied.
 *
 * The first mount goes through safe_mount() with MS_REMOUNT masked out. For
 * bind/remount requests the flags found on @fsname via statvfs() (nosuid,
 * nodev unless @dev is set, ro, noexec) are added to the remount when statvfs
 * support is compiled in. A plain bind mount that would not gain any flag
 * from this is not remounted.
 *
 * Returns 0 on success, and also on failure when @optional is set; returns -1
 * on failure otherwise.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	unsigned long base_flags = 0;
	int rc;
#ifdef HAVE_STATVFS
	struct statvfs vfs;
#endif

	rc = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			rootfs);
	if (rc < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

	/* Only bind and remount requests need the second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto mounted;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	if (mountflags & MS_RDONLY)
		base_flags |= MS_RDONLY;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &vfs) == 0) {
		unsigned long extra_flags = base_flags;

		if (vfs.f_flag & MS_NOSUID)
			extra_flags |= MS_NOSUID;

		if ((vfs.f_flag & MS_NODEV) && !dev)
			extra_flags |= MS_NODEV;

		if (vfs.f_flag & MS_RDONLY)
			extra_flags |= MS_RDONLY;

		if (vfs.f_flag & MS_NOEXEC)
			extra_flags |= MS_NOEXEC;

		DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
		      fsname, vfs.f_flag, extra_flags);

		/* If this was a bind mount request, and extra_flags does not
		 * have any flags which are not already in mountflags, then
		 * skip the remount.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(extra_flags & ~mountflags) && base_flags == 0) {
			DEBUG("Mountflags already were %lu, skipping remount",
			      mountflags);
			goto mounted;
		}

		mountflags |= extra_flags;
	}
#endif

	rc = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
	if (rc < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

mounted:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1926
/* Strip "create=dir", "create=file" and "optional" from the option string of
 * @mntent, editing mntent->mnt_opts in place.
 *
 * For each keyword the first occurrence is located with strstr(). If a comma
 * follows, everything up to and including that comma is removed; otherwise
 * the string is cut off where the keyword starts.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *sep;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		sep = strchr(hit, ',');
		if (sep)
			/* Pull the remaining options over the culled one. */
			memmove(hit, sep + 1, strlen(sep + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1950
/* Create whatever the mount entry needs before it can be mounted on @path.
 *
 * For "overlay"/"aufs" type entries the respective mkdir helper is called
 * first. "create=dir" creates @path as a directory. "create=file" creates the
 * parent directories of @path and an empty file at @path, but only when @path
 * is not accessible yet.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	char *pathcopy, *parent;
	int fd;
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0)
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (strncmp(mntent->mnt_type, "aufs", 4) == 0)
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	/* Nothing more to do unless a missing file was requested. */
	if (!hasmntopt(mntent, "create=file") || access(path, F_OK) == 0)
		return 0;

	/* dirname() may modify its argument, hence the copy. */
	pathcopy = strdup(path);
	if (!pathcopy)
		return -1;

	parent = dirname(pathcopy);

	ret = mkdir_p(parent, 0755);
	free(pathcopy);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("Failed to create directory \"%s\"", path);
		return -1;
	}

	fd = open(path, O_CREAT, 0644);
	if (fd < 0)
		return -1;
	close(fd);

	return 0;
}
1999
2000 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2001 * without a rootfs. */
2002 static inline int mount_entry_on_generic(struct mntent *mntent,
2003 const char *path,
2004 const struct lxc_rootfs *rootfs,
2005 const char *lxc_name,
2006 const char *lxc_path)
2007 {
2008 int ret;
2009 unsigned long mntflags;
2010 char *mntdata;
2011 bool dev, optional;
2012 char *rootfs_path = NULL;
2013
2014 optional = hasmntopt(mntent, "optional") != NULL;
2015 dev = hasmntopt(mntent, "dev") != NULL;
2016
2017 if (rootfs && rootfs->path)
2018 rootfs_path = rootfs->mount;
2019
2020 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2021 lxc_path);
2022 if (ret < 0) {
2023 if (optional)
2024 return 0;
2025
2026 return -1;
2027 }
2028 cull_mntent_opt(mntent);
2029
2030 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2031 if (ret < 0)
2032 return -1;
2033
2034 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
2035 mntdata, optional, dev, rootfs_path);
2036
2037 free(mntdata);
2038 return ret;
2039 }
2040
/* Mount @mntent for a container that has no rootfs.
 *
 * The mount directory is used as an absolute path; a leading '/' is added
 * when it is missing. Returns -1 if the path does not fit into the buffer,
 * otherwise the result of mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *lead;
	int len;

	/* For containers created without a rootfs all mounts are treated as
	 * absolute paths starting at / on the host.
	 */
	lead = (mntent->mnt_dir[0] == '/') ? "" : "/";

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2058
2059 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
2060 const struct lxc_rootfs *rootfs,
2061 const char *lxc_name,
2062 const char *lxc_path)
2063 {
2064 int offset;
2065 char *aux;
2066 const char *lxcpath;
2067 char path[MAXPATHLEN];
2068 int ret = 0;
2069
2070 lxcpath = lxc_global_config_value("lxc.lxcpath");
2071 if (!lxcpath)
2072 return -1;
2073
2074 /* If rootfs->path is a blockdev path, allow container fstab to use
2075 * <lxcpath>/<name>/rootfs" as the target prefix.
2076 */
2077 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2078 if (ret < 0 || ret >= MAXPATHLEN)
2079 goto skipvarlib;
2080
2081 aux = strstr(mntent->mnt_dir, path);
2082 if (aux) {
2083 offset = strlen(path);
2084 goto skipabs;
2085 }
2086
2087 skipvarlib:
2088 aux = strstr(mntent->mnt_dir, rootfs->path);
2089 if (!aux) {
2090 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
2091 return ret;
2092 }
2093 offset = strlen(rootfs->path);
2094
2095 skipabs:
2096 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2097 if (ret < 0 || ret >= MAXPATHLEN)
2098 return -1;
2099
2100 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2101 }
2102
2103 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2104 const struct lxc_rootfs *rootfs,
2105 const char *lxc_name,
2106 const char *lxc_path)
2107 {
2108 char path[MAXPATHLEN];
2109 int ret;
2110
2111 /* relative to root mount point */
2112 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2113 if (ret < 0 || ret >= sizeof(path)) {
2114 ERROR("path name too long");
2115 return -1;
2116 }
2117
2118 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2119 }
2120
2121 /* This logs a NOTICE() when a user specifies mounts that would conflict with
2122 * devices liblxc sets up automatically.
2123 */
2124 static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2125 const char *dest)
2126 {
2127 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2128 bool needs_warning = false;
2129
2130 clean_mnt_fsname = lxc_deslashify(src);
2131 if (!clean_mnt_fsname)
2132 return;
2133
2134 clean_mnt_dir = lxc_deslashify(dest);
2135 if (!clean_mnt_dir) {
2136 free(clean_mnt_fsname);
2137 return;
2138 }
2139
2140 tmp = clean_mnt_dir;
2141 if (*tmp == '/')
2142 tmp++;
2143
2144 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2145 free(clean_mnt_dir);
2146 free(clean_mnt_fsname);
2147 return;
2148 }
2149
2150 if (!conf->autodev && !conf->pts && !conf->tty &&
2151 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2152 free(clean_mnt_dir);
2153 free(clean_mnt_fsname);
2154 return;
2155 }
2156
2157 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2158 needs_warning = true;
2159 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2160 needs_warning = true;
2161 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2162 needs_warning = true;
2163 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2164 needs_warning = true;
2165 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2166 needs_warning = true;
2167 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2168 needs_warning = true;
2169 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2170 needs_warning = true;
2171 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2172 needs_warning = true;
2173 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2174 needs_warning = true;
2175 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2176 needs_warning = true;
2177 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2178 needs_warning = true;
2179
2180 if (needs_warning)
2181 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2182 "automatic device setup under \"/dev\"",
2183 clean_mnt_fsname, clean_mnt_dir);
2184
2185 free(clean_mnt_dir);
2186 free(clean_mnt_fsname);
2187 }
2188
2189 static int mount_file_entries(const struct lxc_conf *conf,
2190 const struct lxc_rootfs *rootfs, FILE *file,
2191 const char *lxc_name, const char *lxc_path)
2192 {
2193 struct mntent mntent;
2194 char buf[4096];
2195 int ret = -1;
2196
2197 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2198 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2199
2200 if (!rootfs->path)
2201 ret = mount_entry_on_systemfs(&mntent);
2202 else if (mntent.mnt_dir[0] != '/')
2203 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2204 lxc_name, lxc_path);
2205 else
2206 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2207 lxc_name, lxc_path);
2208 if (ret < 0)
2209 return -1;
2210 }
2211 ret = 0;
2212
2213 INFO("Set up mount entries");
2214 return ret;
2215 }
2216
/* Mount all entries of the fstab file @fstab.
 *
 * Returns 0 when @fstab is NULL or all entries were mounted, -1 when the file
 * cannot be opened or an entry fails to mount.
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *stream;
	int rc;

	if (fstab == NULL)
		return 0;

	stream = setmntent(fstab, "r");
	if (stream == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, stream, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	/* The stream is closed whether or not mounting succeeded. */
	endmntent(stream);
	return rc;
}
2240
2241 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2242 {
2243 int ret;
2244 char *mount_entry;
2245 struct lxc_list *iterator;
2246 FILE *f;
2247 int fd = -1;
2248
2249 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2250 if (fd < 0) {
2251 if (errno != ENOSYS)
2252 return NULL;
2253 f = tmpfile();
2254 TRACE("Created temporary mount file");
2255 } else {
2256 f = fdopen(fd, "r+");
2257 TRACE("Created anonymous mount file");
2258 }
2259
2260 if (!f) {
2261 SYSERROR("Could not create mount file");
2262 if (fd != -1)
2263 close(fd);
2264 return NULL;
2265 }
2266
2267 lxc_list_for_each(iterator, mount) {
2268 mount_entry = iterator->elem;
2269 ret = fprintf(f, "%s\n", mount_entry);
2270 if (ret < strlen(mount_entry))
2271 WARN("Could not write mount entry to mount file");
2272 }
2273
2274 ret = fseek(f, 0, SEEK_SET);
2275 if (ret < 0) {
2276 SYSERROR("Failed to seek mount file");
2277 fclose(f);
2278 return NULL;
2279 }
2280
2281 return f;
2282 }
2283
/* Mount all entries contained in the list @mount.
 *
 * The entries are first written to an anonymous file which is then processed
 * like an fstab. Returns -1 if that file cannot be created, otherwise the
 * result of mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	FILE *entries;
	int rc;

	entries = make_anonymous_mount_file(mount);
	if (entries == NULL)
		return -1;

	rc = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return rc;
}
2301
2302 static int parse_cap(const char *cap)
2303 {
2304 char *ptr = NULL;
2305 size_t i;
2306 int capid = -1;
2307
2308 if (!strcmp(cap, "none"))
2309 return -2;
2310
2311 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2312
2313 if (strcmp(cap, caps_opt[i].name))
2314 continue;
2315
2316 capid = caps_opt[i].value;
2317 break;
2318 }
2319
2320 if (capid < 0) {
2321 /* try to see if it's numeric, so the user may specify
2322 * capabilities that the running kernel knows about but
2323 * we don't */
2324 errno = 0;
2325 capid = strtol(cap, &ptr, 10);
2326 if (!ptr || *ptr != '\0' || errno != 0)
2327 /* not a valid number */
2328 capid = -1;
2329 else if (capid > lxc_caps_last_cap())
2330 /* we have a number but it's not a valid
2331 * capability */
2332 capid = -1;
2333 }
2334
2335 return capid;
2336 }
2337
2338 int in_caplist(int cap, struct lxc_list *caps)
2339 {
2340 struct lxc_list *iterator;
2341 int capid;
2342
2343 lxc_list_for_each(iterator, caps) {
2344 capid = parse_cap(iterator->elem);
2345 if (capid == cap)
2346 return 1;
2347 }
2348
2349 return 0;
2350 }
2351
2352 static int setup_caps(struct lxc_list *caps)
2353 {
2354 struct lxc_list *iterator;
2355 char *drop_entry;
2356 int capid;
2357
2358 lxc_list_for_each(iterator, caps) {
2359
2360 drop_entry = iterator->elem;
2361
2362 capid = parse_cap(drop_entry);
2363
2364 if (capid < 0) {
2365 ERROR("unknown capability %s", drop_entry);
2366 return -1;
2367 }
2368
2369 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2370
2371 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2372 SYSERROR("failed to remove %s capability", drop_entry);
2373 return -1;
2374 }
2375
2376 }
2377
2378 DEBUG("capabilities have been setup");
2379
2380 return 0;
2381 }
2382
2383 static int dropcaps_except(struct lxc_list *caps)
2384 {
2385 struct lxc_list *iterator;
2386 char *keep_entry;
2387 int i, capid;
2388 int numcaps = lxc_caps_last_cap() + 1;
2389 INFO("found %d capabilities", numcaps);
2390
2391 if (numcaps <= 0 || numcaps > 200)
2392 return -1;
2393
2394 /* caplist[i] is 1 if we keep capability i */
2395 int *caplist = alloca(numcaps * sizeof(int));
2396 memset(caplist, 0, numcaps * sizeof(int));
2397
2398 lxc_list_for_each(iterator, caps) {
2399
2400 keep_entry = iterator->elem;
2401
2402 capid = parse_cap(keep_entry);
2403
2404 if (capid == -2)
2405 continue;
2406
2407 if (capid < 0) {
2408 ERROR("unknown capability %s", keep_entry);
2409 return -1;
2410 }
2411
2412 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2413
2414 caplist[capid] = 1;
2415 }
2416 for (i=0; i<numcaps; i++) {
2417 if (caplist[i])
2418 continue;
2419 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2420 SYSERROR("failed to remove capability %d", i);
2421 return -1;
2422 }
2423 }
2424
2425 DEBUG("capabilities have been setup");
2426
2427 return 0;
2428 }
2429
2430 static int parse_resource(const char *res) {
2431 size_t i;
2432 int resid = -1;
2433
2434 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2435 if (strcmp(res, limit_opt[i].name) == 0)
2436 return limit_opt[i].value;
2437 }
2438
2439 /* try to see if it's numeric, so the user may specify
2440 * resources that the running kernel knows about but
2441 * we don't */
2442 if (lxc_safe_int(res, &resid) == 0)
2443 return resid;
2444 return -1;
2445 }
2446
2447 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2448 struct lxc_list *it;
2449 struct lxc_limit *lim;
2450 int resid;
2451
2452 lxc_list_for_each(it, limits) {
2453 lim = it->elem;
2454
2455 resid = parse_resource(lim->resource);
2456 if (resid < 0) {
2457 ERROR("unknown resource %s", lim->resource);
2458 return -1;
2459 }
2460
2461 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2462 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2463 return -1;
2464 }
2465 }
2466 return 0;
2467 }
2468
2469 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2470
2471 struct lxc_conf *lxc_conf_init(void)
2472 {
2473 struct lxc_conf *new;
2474 int i;
2475
2476 new = malloc(sizeof(*new));
2477 if (!new) {
2478 ERROR("lxc_conf_init : %s", strerror(errno));
2479 return NULL;
2480 }
2481 memset(new, 0, sizeof(*new));
2482
2483 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2484 new->personality = -1;
2485 new->autodev = 1;
2486 new->console.log_path = NULL;
2487 new->console.log_fd = -1;
2488 new->console.path = NULL;
2489 new->console.peer = -1;
2490 new->console.peerpty.busy = -1;
2491 new->console.peerpty.master = -1;
2492 new->console.peerpty.slave = -1;
2493 new->console.master = -1;
2494 new->console.slave = -1;
2495 new->console.name[0] = '\0';
2496 new->maincmd_fd = -1;
2497 new->nbd_idx = -1;
2498 new->rootfs.mount = strdup(default_rootfs_mount);
2499 if (!new->rootfs.mount) {
2500 ERROR("lxc_conf_init : %s", strerror(errno));
2501 free(new);
2502 return NULL;
2503 }
2504 new->logfd = -1;
2505 lxc_list_init(&new->cgroup);
2506 lxc_list_init(&new->network);
2507 lxc_list_init(&new->mount_list);
2508 lxc_list_init(&new->caps);
2509 lxc_list_init(&new->keepcaps);
2510 lxc_list_init(&new->id_map);
2511 lxc_list_init(&new->includes);
2512 lxc_list_init(&new->aliens);
2513 lxc_list_init(&new->environment);
2514 lxc_list_init(&new->limits);
2515 for (i=0; i<NUM_LXC_HOOKS; i++)
2516 lxc_list_init(&new->hooks[i]);
2517 lxc_list_init(&new->groups);
2518 new->lsm_aa_profile = NULL;
2519 new->lsm_se_context = NULL;
2520 new->tmp_umount_proc = 0;
2521
2522 for (i = 0; i < LXC_NS_MAX; i++)
2523 new->inherit_ns_fd[i] = -1;
2524
2525 /* if running in a new user namespace, init and COMMAND
2526 * default to running as UID/GID 0 when using lxc-execute */
2527 new->init_uid = 0;
2528 new->init_gid = 0;
2529 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2530
2531 return new;
2532 }
2533
2534 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2535 size_t buf_size)
2536 {
2537 char path[MAXPATHLEN];
2538 int fd, ret;
2539
2540 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2541 idtype == ID_TYPE_UID ? 'u' : 'g');
2542 if (ret < 0 || ret >= MAXPATHLEN) {
2543 ERROR("failed to create path \"%s\"", path);
2544 return -E2BIG;
2545 }
2546
2547 fd = open(path, O_WRONLY);
2548 if (fd < 0) {
2549 SYSERROR("failed to open \"%s\"", path);
2550 return -1;
2551 }
2552
2553 errno = 0;
2554 ret = lxc_write_nointr(fd, buf, buf_size);
2555 if (ret != buf_size) {
2556 SYSERROR("failed to write %cid mapping to \"%s\"",
2557 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2558 close(fd);
2559 return -1;
2560 }
2561 close(fd);
2562
2563 return 0;
2564 }
2565
2566 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2567 *
2568 * @return 1 if functional binary was found
2569 * @return 0 if binary exists but is lacking privilege
2570 * @return -ENOENT if binary does not exist
2571 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2572 *
2573 */
2574 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2575 {
2576 char *path;
2577 int ret;
2578 struct stat st;
2579 int fret = 0;
2580
2581 if (cap != CAP_SETUID && cap != CAP_SETGID)
2582 return -EINVAL;
2583
2584 path = on_path(binary, NULL);
2585 if (!path)
2586 return -ENOENT;
2587
2588 ret = stat(path, &st);
2589 if (ret < 0) {
2590 fret = -errno;
2591 goto cleanup;
2592 }
2593
2594 /* Check if the binary is setuid. */
2595 if (st.st_mode & S_ISUID) {
2596 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
2597 fret = 1;
2598 goto cleanup;
2599 }
2600
2601 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
2602 /* Check if it has the CAP_SETUID capability. */
2603 if ((cap & CAP_SETUID) &&
2604 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2605 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2606 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2607 "and CAP_PERMITTED sets.", path);
2608 fret = 1;
2609 goto cleanup;
2610 }
2611
2612 /* Check if it has the CAP_SETGID capability. */
2613 if ((cap & CAP_SETGID) &&
2614 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2615 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2616 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2617 "and CAP_PERMITTED sets.", path);
2618 fret = 1;
2619 goto cleanup;
2620 }
2621 #else
2622 /* If we cannot check for file capabilities we need to give the benefit
2623 * of the doubt. Otherwise we might fail even though all the necessary
2624 * file capabilities are set.
2625 */
2626 DEBUG("Cannot check for file capabilites as full capability support is "
2627 "missing. Manual intervention needed.");
2628 fret = 1;
2629 #endif
2630
2631 cleanup:
2632 free(path);
2633 return fret;
2634 }
2635
/*
 * Run the command line passed in `args` through "/bin/sh -c".
 *
 * Only returns (with -1) if execl() fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	/* execl() only comes back on failure. */
	return -1;
}
2641
2642 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2643 {
2644 struct id_map *map;
2645 struct lxc_list *iterator;
2646 enum idtype type;
2647 char u_or_g;
2648 char *pos;
2649 int fill, left;
2650 char cmd_output[MAXPATHLEN];
2651 /* strlen("new@idmap") = 9
2652 * +
2653 * strlen(" ") = 1
2654 * +
2655 * LXC_NUMSTRLEN64
2656 * +
2657 * strlen(" ") = 1
2658 *
2659 * We add some additional space to make sure that we really have
2660 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2661 */
2662 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2663 int ret = 0, uidmap = 0, gidmap = 0;
2664 bool use_shadow = false, had_entry = false;
2665
2666 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2667 * ranges, then insist that root also reserve ranges in subuid. This
2668 * will protected it by preventing another user from being handed the
2669 * range by shadow.
2670 */
2671 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
2672 if (uidmap == -ENOENT)
2673 WARN("newuidmap binary is missing");
2674 else if (!uidmap)
2675 WARN("newuidmap is lacking necessary privileges");
2676
2677 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
2678 if (gidmap == -ENOENT)
2679 WARN("newgidmap binary is missing");
2680 else if (!gidmap)
2681 WARN("newgidmap is lacking necessary privileges");
2682
2683 if (uidmap > 0 && gidmap > 0) {
2684 DEBUG("Functional newuidmap and newgidmap binary found.");
2685 use_shadow = true;
2686 } else {
2687 /* In case unprivileged users run application containers via
2688 * execute() or a start*() there are valid cases where they may
2689 * only want to map their own {g,u}id. Let's not block them from
2690 * doing so by requiring geteuid() == 0.
2691 */
2692 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2693 "write directly with euid %d.", geteuid());
2694 }
2695
2696 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2697 type++, u_or_g = 'g') {
2698 pos = mapbuf;
2699
2700 if (use_shadow)
2701 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
2702
2703 lxc_list_for_each(iterator, idmap) {
2704 map = iterator->elem;
2705 if (map->idtype != type)
2706 continue;
2707
2708 had_entry = true;
2709
2710 left = LXC_IDMAPLEN - (pos - mapbuf);
2711 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
2712 use_shadow ? " " : "", map->nsid,
2713 map->hostid, map->range,
2714 use_shadow ? "" : "\n");
2715 if (fill <= 0 || fill >= left) {
2716 /* The kernel only takes <= 4k for writes to
2717 * /proc/<pid>/{g,u}id_map
2718 */
2719 SYSERROR("Too many %cid mappings defined", u_or_g);
2720 return -1;
2721 }
2722
2723 pos += fill;
2724 }
2725 if (!had_entry)
2726 continue;
2727
2728 /* Try to catch the ouput of new{g,u}idmap to make debugging
2729 * easier.
2730 */
2731 if (use_shadow) {
2732 ret = run_command(cmd_output, sizeof(cmd_output),
2733 lxc_map_ids_exec_wrapper,
2734 (void *)mapbuf);
2735 if (ret < 0) {
2736 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2737 u_or_g, cmd_output, mapbuf);
2738 return -1;
2739 }
2740 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
2741 } else {
2742 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
2743 if (ret < 0) {
2744 ERROR("Failed to write mapping: %s", mapbuf);
2745 return -1;
2746 }
2747 TRACE("Wrote mapping \"%s\"", mapbuf);
2748 }
2749
2750 memset(mapbuf, 0, sizeof(mapbuf));
2751 }
2752
2753 return 0;
2754 }
2755
2756 /*
2757 * return the host uid/gid to which the container root is mapped in
2758 * *val.
2759 * Return true if id was found, false otherwise.
2760 */
2761 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
2762 unsigned long *val)
2763 {
2764 struct lxc_list *it;
2765 struct id_map *map;
2766
2767 lxc_list_for_each(it, &conf->id_map) {
2768 map = it->elem;
2769 if (map->idtype != idtype)
2770 continue;
2771 if (map->nsid != 0)
2772 continue;
2773 *val = map->hostid;
2774 return true;
2775 }
2776 return false;
2777 }
2778
2779 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
2780 {
2781 struct lxc_list *it;
2782 struct id_map *map;
2783 lxc_list_for_each(it, &conf->id_map) {
2784 map = it->elem;
2785 if (map->idtype != idtype)
2786 continue;
2787 if (id >= map->hostid && id < map->hostid + map->range)
2788 return (id - map->hostid) + map->nsid;
2789 }
2790 return -1;
2791 }
2792
2793 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
2794 {
2795 struct lxc_list *it;
2796 struct id_map *map;
2797 unsigned int freeid = 0;
2798 again:
2799 lxc_list_for_each(it, &conf->id_map) {
2800 map = it->elem;
2801 if (map->idtype != idtype)
2802 continue;
2803 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2804 freeid = map->nsid + map->range;
2805 goto again;
2806 }
2807 }
2808 return freeid;
2809 }
2810
/*
 * Execute "lxc-usernsexec" with the argument vector passed in `args`.
 *
 * Only returns (with -1) if execvp() fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	/* execvp() only comes back on failure. */
	return -1;
}
2816
2817 /*
2818 * chown_mapped_root: for an unprivileged user with uid/gid X to
2819 * chown a dir to subuid/subgid Y, he needs to run chown as root
2820 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
2821 * nsid Y is mapped to hostuid/hostgid X. That way, the container
2822 * root is privileged with respect to hostuid/hostgid X, allowing
2823 * him to do the chown.
2824 */
2825 int chown_mapped_root(char *path, struct lxc_conf *conf)
2826 {
2827 uid_t rootuid, rootgid;
2828 unsigned long val;
2829 int hostuid, hostgid, ret;
2830 struct stat sb;
2831 char map1[100], map2[100], map3[100], map4[100], map5[100];
2832 char ugid[100];
2833 char *args1[] = {"lxc-usernsexec",
2834 "-m", map1,
2835 "-m", map2,
2836 "-m", map3,
2837 "-m", map5,
2838 "--", "chown", ugid, path,
2839 NULL};
2840 char *args2[] = {"lxc-usernsexec",
2841 "-m", map1,
2842 "-m", map2,
2843 "-m", map3,
2844 "-m", map4,
2845 "-m", map5,
2846 "--", "chown", ugid, path,
2847 NULL};
2848 char cmd_output[MAXPATHLEN];
2849
2850 hostuid = geteuid();
2851 hostgid = getegid();
2852
2853 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
2854 ERROR("No uid mapping for container root");
2855 return -1;
2856 }
2857 rootuid = (uid_t)val;
2858 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
2859 ERROR("No gid mapping for container root");
2860 return -1;
2861 }
2862 rootgid = (gid_t)val;
2863
2864 if (hostuid == 0) {
2865 if (chown(path, rootuid, rootgid) < 0) {
2866 ERROR("Error chowning %s", path);
2867 return -1;
2868 }
2869 return 0;
2870 }
2871
2872 if (rootuid == hostuid) {
2873 /* nothing to do */
2874 INFO("Container root is our uid; no need to chown");
2875 return 0;
2876 }
2877
2878 /* save the current gid of "path" */
2879 if (stat(path, &sb) < 0) {
2880 ERROR("Error stat %s", path);
2881 return -1;
2882 }
2883
2884 /* Update the path argument in case this was overlayfs. */
2885 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
2886 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
2887
2888 /*
2889 * A file has to be group-owned by a gid mapped into the
2890 * container, or the container won't be privileged over it.
2891 */
2892 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
2893 if (sb.st_uid == hostuid &&
2894 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
2895 chown(path, -1, hostgid) < 0) {
2896 ERROR("Failed chgrping %s", path);
2897 return -1;
2898 }
2899
2900 /* "u:0:rootuid:1" */
2901 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
2902 if (ret < 0 || ret >= 100) {
2903 ERROR("Error uid printing map string");
2904 return -1;
2905 }
2906
2907 /* "u:hostuid:hostuid:1" */
2908 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
2909 if (ret < 0 || ret >= 100) {
2910 ERROR("Error uid printing map string");
2911 return -1;
2912 }
2913
2914 /* "g:0:rootgid:1" */
2915 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
2916 if (ret < 0 || ret >= 100) {
2917 ERROR("Error gid printing map string");
2918 return -1;
2919 }
2920
2921 /* "g:pathgid:rootgid+pathgid:1" */
2922 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
2923 rootgid + (gid_t)sb.st_gid);
2924 if (ret < 0 || ret >= 100) {
2925 ERROR("Error gid printing map string");
2926 return -1;
2927 }
2928
2929 /* "g:hostgid:hostgid:1" */
2930 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
2931 if (ret < 0 || ret >= 100) {
2932 ERROR("Error gid printing map string");
2933 return -1;
2934 }
2935
2936 /* "0:pathgid" (chown) */
2937 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
2938 if (ret < 0 || ret >= 100) {
2939 ERROR("Error owner printing format string for chown");
2940 return -1;
2941 }
2942
2943 if (hostgid == sb.st_gid)
2944 ret = run_command(cmd_output, sizeof(cmd_output),
2945 chown_mapped_root_exec_wrapper,
2946 (void *)args1);
2947 else
2948 ret = run_command(cmd_output, sizeof(cmd_output),
2949 chown_mapped_root_exec_wrapper,
2950 (void *)args2);
2951 if (ret < 0)
2952 ERROR("lxc-usernsexec failed: %s", cmd_output);
2953
2954 return ret;
2955 }
2956
2957 int lxc_ttys_shift_ids(struct lxc_conf *c)
2958 {
2959 if (lxc_list_empty(&c->id_map))
2960 return 0;
2961
2962 if (!strcmp(c->console.name, ""))
2963 return 0;
2964
2965 if (chown_mapped_root(c->console.name, c) < 0) {
2966 ERROR("failed to chown console \"%s\"", c->console.name);
2967 return -1;
2968 }
2969
2970 TRACE("chowned console \"%s\"", c->console.name);
2971
2972 return 0;
2973 }
2974
2975 /* NOTE: Must not be called from inside the container namespace! */
2976 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
2977 {
2978 int mounted;
2979
2980 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
2981 if (mounted == -1) {
2982 SYSERROR("failed to mount /proc in the container");
2983 /* continue only if there is no rootfs */
2984 if (conf->rootfs.path)
2985 return -1;
2986 } else if (mounted == 1) {
2987 conf->tmp_umount_proc = 1;
2988 }
2989
2990 return 0;
2991 }
2992
2993 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2994 {
2995 if (lxc_conf->tmp_umount_proc == 1) {
2996 umount("/proc");
2997 lxc_conf->tmp_umount_proc = 0;
2998 }
2999 }
3000
/*
 * Walk /proc/self/mountinfo and turn every mount whose options contain
 * "shared" into a slave mount.
 *
 * Failures are logged but never abort the walk; if the file cannot be
 * opened the function just logs and returns.
 */
void remount_all_slave(void)
{
	char *buf = NULL;
	size_t buflen = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &buflen, mountinfo) != -1) {
		char *mntpoint, *propagation;

		/* NOTE(review): presumably field 4 is the mount point and
		 * the field two further on holds the propagation flags;
		 * verify against get_field().
		 */
		mntpoint = get_field(buf, 4);
		if (mntpoint == NULL)
			continue;

		propagation = get_field(mntpoint, 2);
		if (propagation == NULL)
			continue;

		null_endofword(propagation);
		if (strstr(propagation, "shared") == NULL)
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
3034
3035 void lxc_execute_bind_init(struct lxc_conf *conf)
3036 {
3037 int ret;
3038 char path[PATH_MAX], destpath[PATH_MAX], *p;
3039
3040 /* If init exists in the container, don't bind mount a static one */
3041 p = choose_init(conf->rootfs.mount);
3042 if (p) {
3043 free(p);
3044 return;
3045 }
3046
3047 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3048 if (ret < 0 || ret >= PATH_MAX) {
3049 WARN("Path name too long searching for lxc.init.static");
3050 return;
3051 }
3052
3053 if (!file_exists(path)) {
3054 INFO("%s does not exist on host", path);
3055 return;
3056 }
3057
3058 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3059 if (ret < 0 || ret >= PATH_MAX) {
3060 WARN("Path name too long for container's lxc.init.static");
3061 return;
3062 }
3063
3064 if (!file_exists(destpath)) {
3065 FILE * pathfile = fopen(destpath, "wb");
3066 if (!pathfile) {
3067 SYSERROR("Failed to create mount target '%s'", destpath);
3068 return;
3069 }
3070 fclose(pathfile);
3071 }
3072
3073 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3074 if (ret < 0)
3075 SYSERROR("Failed to bind lxc.init.static into container");
3076 INFO("lxc.init.static bound into container at %s", path);
3077 }
3078
3079 /*
3080 * This does the work of remounting / if it is shared, calling the
3081 * container pre-mount hooks, and mounting the rootfs.
3082 */
3083 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
3084 {
3085 if (conf->rootfs_setup) {
3086 /*
3087 * rootfs was set up in another namespace. bind-mount it
3088 * to give us a mount in our own ns so we can pivot_root to it
3089 */
3090 const char *path = conf->rootfs.mount;
3091 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3092 ERROR("Failed to bind-mount container / onto itself");
3093 return -1;
3094 }
3095 return 0;
3096 }
3097
3098 remount_all_slave();
3099
3100 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3101 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3102 return -1;
3103 }
3104
3105 if (lxc_setup_rootfs(conf)) {
3106 ERROR("failed to setup rootfs for '%s'", name);
3107 return -1;
3108 }
3109
3110 conf->rootfs_setup = true;
3111 return 0;
3112 }
3113
3114 static bool verify_start_hooks(struct lxc_conf *conf)
3115 {
3116 struct lxc_list *it;
3117 char path[MAXPATHLEN];
3118 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3119 char *hookname = it->elem;
3120 struct stat st;
3121 int ret;
3122
3123 ret = snprintf(path, MAXPATHLEN, "%s%s",
3124 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
3125 if (ret < 0 || ret >= MAXPATHLEN)
3126 return false;
3127 ret = stat(path, &st);
3128 if (ret) {
3129 SYSERROR("Start hook %s not found in container",
3130 hookname);
3131 return false;
3132 }
3133 return true;
3134 }
3135
3136 return true;
3137 }
3138
3139 int lxc_setup(struct lxc_handler *handler)
3140 {
3141 int ret;
3142 const char *name = handler->name;
3143 struct lxc_conf *lxc_conf = handler->conf;
3144 const char *lxcpath = handler->lxcpath;
3145
3146 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
3147 ERROR("Error setting up rootfs mount after spawn");
3148 return -1;
3149 }
3150
3151 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
3152 if (setup_utsname(lxc_conf->utsname)) {
3153 ERROR("failed to setup the utsname for '%s'", name);
3154 return -1;
3155 }
3156 }
3157
3158 if (lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network)) {
3159 ERROR("failed to setup the network for '%s'", name);
3160 return -1;
3161 }
3162
3163 if (lxc_network_send_name_and_ifindex_to_parent(handler) < 0) {
3164 ERROR("Failed to network device names and ifindices to parent");
3165 return -1;
3166 }
3167
3168 if (lxc_conf->autodev > 0) {
3169 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
3170 ERROR("failed to mount /dev in the container");
3171 return -1;
3172 }
3173 }
3174
3175 /* do automatic mounts (mainly /proc and /sys), but exclude
3176 * those that need to wait until other stuff has finished
3177 */
3178 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
3179 ERROR("failed to setup the automatic mounts for '%s'", name);
3180 return -1;
3181 }
3182
3183 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
3184 ERROR("failed to setup the mounts for '%s'", name);
3185 return -1;
3186 }
3187
3188 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
3189 ERROR("failed to setup the mount entries for '%s'", name);
3190 return -1;
3191 }
3192
3193 /* Make sure any start hooks are in the container */
3194 if (!verify_start_hooks(lxc_conf))
3195 return -1;
3196
3197 if (lxc_conf->is_execute)
3198 lxc_execute_bind_init(lxc_conf);
3199
3200 /* now mount only cgroup, if wanted;
3201 * before, /sys could not have been mounted
3202 * (is either mounted automatically or via fstab entries)
3203 */
3204 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
3205 ERROR("failed to setup the automatic mounts for '%s'", name);
3206 return -1;
3207 }
3208
3209 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
3210 ERROR("failed to run mount hooks for container '%s'.", name);
3211 return -1;
3212 }
3213
3214 if (lxc_conf->autodev > 0) {
3215 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
3216 ERROR("failed to run autodev hooks for container '%s'.", name);
3217 return -1;
3218 }
3219
3220 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
3221 ERROR("failed to populate /dev in the container");
3222 return -1;
3223 }
3224 }
3225
3226 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
3227 lxc_conf->ttydir);
3228 if (ret < 0) {
3229 ERROR("Failed to setup console");
3230 return -1;
3231 }
3232
3233 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3234 if (ret < 0) {
3235 ERROR("Failed to setup /dev symlinks");
3236 return -1;
3237 }
3238
3239 /* mount /proc if it's not already there */
3240 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
3241 ERROR("failed to LSM mount proc for '%s'", name);
3242 return -1;
3243 }
3244
3245 if (setup_pivot_root(&lxc_conf->rootfs)) {
3246 ERROR("failed to set rootfs for '%s'", name);
3247 return -1;
3248 }
3249
3250 if (lxc_setup_devpts(lxc_conf->pts)) {
3251 ERROR("failed to setup the new pts instance");
3252 return -1;
3253 }
3254
3255 ret = lxc_create_ttys(handler);
3256 if (ret < 0)
3257 return -1;
3258
3259 if (setup_personality(lxc_conf->personality)) {
3260 ERROR("failed to setup personality");
3261 return -1;
3262 }
3263
3264 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3265 if (!lxc_list_empty(&lxc_conf->caps)) {
3266 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
3267 return -1;
3268 }
3269 if (dropcaps_except(&lxc_conf->keepcaps)) {
3270 ERROR("failed to keep requested caps");
3271 return -1;
3272 }
3273 } else if (setup_caps(&lxc_conf->caps)) {
3274 ERROR("failed to drop capabilities");
3275 return -1;
3276 }
3277
3278 NOTICE("Container \"%s\" is set up", name);
3279
3280 return 0;
3281 }
3282
3283 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
3284 const char *lxcpath, char *argv[])
3285 {
3286 int which = -1;
3287 struct lxc_list *it;
3288
3289 if (strcmp(hook, "pre-start") == 0)
3290 which = LXCHOOK_PRESTART;
3291 else if (strcmp(hook, "start-host") == 0)
3292 which = LXCHOOK_START_HOST;
3293 else if (strcmp(hook, "pre-mount") == 0)
3294 which = LXCHOOK_PREMOUNT;
3295 else if (strcmp(hook, "mount") == 0)
3296 which = LXCHOOK_MOUNT;
3297 else if (strcmp(hook, "autodev") == 0)
3298 which = LXCHOOK_AUTODEV;
3299 else if (strcmp(hook, "start") == 0)
3300 which = LXCHOOK_START;
3301 else if (strcmp(hook, "stop") == 0)
3302 which = LXCHOOK_STOP;
3303 else if (strcmp(hook, "post-stop") == 0)
3304 which = LXCHOOK_POSTSTOP;
3305 else if (strcmp(hook, "clone") == 0)
3306 which = LXCHOOK_CLONE;
3307 else if (strcmp(hook, "destroy") == 0)
3308 which = LXCHOOK_DESTROY;
3309 else
3310 return -1;
3311 lxc_list_for_each(it, &conf->hooks[which]) {
3312 int ret;
3313 char *hookname = it->elem;
3314 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
3315 if (ret)
3316 return ret;
3317 }
3318 return 0;
3319 }
3320
3321 int lxc_clear_config_caps(struct lxc_conf *c)
3322 {
3323 struct lxc_list *it, *next;
3324
3325 lxc_list_for_each_safe(it, &c->caps, next) {
3326 lxc_list_del(it);
3327 free(it->elem);
3328 free(it);
3329 }
3330 return 0;
3331 }
3332
3333 static int lxc_free_idmap(struct lxc_list *id_map) {
3334 struct lxc_list *it, *next;
3335
3336 lxc_list_for_each_safe(it, id_map, next) {
3337 lxc_list_del(it);
3338 free(it->elem);
3339 free(it);
3340 }
3341 return 0;
3342 }
3343
3344 int lxc_clear_idmaps(struct lxc_conf *c)
3345 {
3346 return lxc_free_idmap(&c->id_map);
3347 }
3348
3349 int lxc_clear_config_keepcaps(struct lxc_conf *c)
3350 {
3351 struct lxc_list *it,*next;
3352
3353 lxc_list_for_each_safe(it, &c->keepcaps, next) {
3354 lxc_list_del(it);
3355 free(it->elem);
3356 free(it);
3357 }
3358 return 0;
3359 }
3360
3361 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
3362 {
3363 struct lxc_list *it,*next;
3364 bool all = false;
3365 const char *k = NULL;
3366
3367 if (strcmp(key, "lxc.cgroup") == 0)
3368 all = true;
3369 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
3370 k = key + sizeof("lxc.cgroup.")-1;
3371 else
3372 return -1;
3373
3374 lxc_list_for_each_safe(it, &c->cgroup, next) {
3375 struct lxc_cgroup *cg = it->elem;
3376 if (!all && strcmp(cg->subsystem, k) != 0)
3377 continue;
3378 lxc_list_del(it);
3379 free(cg->subsystem);
3380 free(cg->value);
3381 free(cg);
3382 free(it);
3383 }
3384 return 0;
3385 }
3386
3387 int lxc_clear_limits(struct lxc_conf *c, const char *key)
3388 {
3389 struct lxc_list *it, *next;
3390 bool all = false;
3391 const char *k = NULL;
3392
3393 if (strcmp(key, "lxc.limit") == 0
3394 || strcmp(key, "lxc.prlimit"))
3395 all = true;
3396 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
3397 k = key + sizeof("lxc.limit.")-1;
3398 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
3399 k = key + sizeof("lxc.prlimit.")-1;
3400 else
3401 return -1;
3402
3403 lxc_list_for_each_safe(it, &c->limits, next) {
3404 struct lxc_limit *lim = it->elem;
3405 if (!all && strcmp(lim->resource, k) != 0)
3406 continue;
3407 lxc_list_del(it);
3408 free(lim->resource);
3409 free(lim);
3410 free(it);
3411 }
3412 return 0;
3413 }
3414
3415 int lxc_clear_groups(struct lxc_conf *c)
3416 {
3417 struct lxc_list *it,*next;
3418
3419 lxc_list_for_each_safe(it, &c->groups, next) {
3420 lxc_list_del(it);
3421 free(it->elem);
3422 free(it);
3423 }
3424 return 0;
3425 }
3426
3427 int lxc_clear_environment(struct lxc_conf *c)
3428 {
3429 struct lxc_list *it,*next;
3430
3431 lxc_list_for_each_safe(it, &c->environment, next) {
3432 lxc_list_del(it);
3433 free(it->elem);
3434 free(it);
3435 }
3436 return 0;
3437 }
3438
3439 int lxc_clear_mount_entries(struct lxc_conf *c)
3440 {
3441 struct lxc_list *it,*next;
3442
3443 lxc_list_for_each_safe(it, &c->mount_list, next) {
3444 lxc_list_del(it);
3445 free(it->elem);
3446 free(it);
3447 }
3448 return 0;
3449 }
3450
3451 int lxc_clear_automounts(struct lxc_conf *c)
3452 {
3453 c->auto_mounts = 0;
3454 return 0;
3455 }
3456
3457 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
3458 {
3459 struct lxc_list *it,*next;
3460 bool all = false, done = false;
3461 const char *k = NULL;
3462 int i;
3463
3464 if (strcmp(key, "lxc.hook") == 0)
3465 all = true;
3466 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
3467 k = key + sizeof("lxc.hook.")-1;
3468 else
3469 return -1;
3470
3471 for (i=0; i<NUM_LXC_HOOKS; i++) {
3472 if (all || strcmp(k, lxchook_names[i]) == 0) {
3473 lxc_list_for_each_safe(it, &c->hooks[i], next) {
3474 lxc_list_del(it);
3475 free(it->elem);
3476 free(it);
3477 }
3478 done = true;
3479 }
3480 }
3481
3482 if (!done) {
3483 ERROR("Invalid hook key: %s", key);
3484 return -1;
3485 }
3486 return 0;
3487 }
3488
3489 static inline void lxc_clear_aliens(struct lxc_conf *conf)
3490 {
3491 struct lxc_list *it,*next;
3492
3493 lxc_list_for_each_safe(it, &conf->aliens, next) {
3494 lxc_list_del(it);
3495 free(it->elem);
3496 free(it);
3497 }
3498 }
3499
3500 void lxc_clear_includes(struct lxc_conf *conf)
3501 {
3502 struct lxc_list *it,*next;
3503
3504 lxc_list_for_each_safe(it, &conf->includes, next) {
3505 lxc_list_del(it);
3506 free(it->elem);
3507 free(it);
3508 }
3509 }
3510
3511 void lxc_conf_free(struct lxc_conf *conf)
3512 {
3513 if (!conf)
3514 return;
3515 if (current_config == conf)
3516 current_config = NULL;
3517 free(conf->console.log_path);
3518 free(conf->console.path);
3519 free(conf->rootfs.mount);
3520 free(conf->rootfs.bdev_type);
3521 free(conf->rootfs.options);
3522 free(conf->rootfs.path);
3523 free(conf->logfile);
3524 if (conf->logfd != -1)
3525 close(conf->logfd);
3526 free(conf->utsname);
3527 free(conf->ttydir);
3528 free(conf->fstab);
3529 free(conf->rcfile);
3530 free(conf->execute_cmd);
3531 free(conf->init_cmd);
3532 free(conf->unexpanded_config);
3533 free(conf->pty_names);
3534 free(conf->syslog);
3535 lxc_free_networks(&conf->network);
3536 free(conf->lsm_aa_profile);
3537 free(conf->lsm_se_context);
3538 lxc_seccomp_free(conf);
3539 lxc_clear_config_caps(conf);
3540 lxc_clear_config_keepcaps(conf);
3541 lxc_clear_cgroups(conf, "lxc.cgroup");
3542 lxc_clear_hooks(conf, "lxc.hook");
3543 lxc_clear_mount_entries(conf);
3544 lxc_clear_idmaps(conf);
3545 lxc_clear_groups(conf);
3546 lxc_clear_includes(conf);
3547 lxc_clear_aliens(conf);
3548 lxc_clear_environment(conf);
3549 lxc_clear_limits(conf, "lxc.prlimit");
3550 free(conf->cgroup_meta.dir);
3551 free(conf->cgroup_meta.controllers);
3552 free(conf);
3553 }
3554
/* Argument bundle handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback executed by the child once it has been told to proceed. */
	int (*fn)(void *);

	/* Optional name of the callback, used for trace logging only. */
	const char *fn_name;

	/* Opaque argument passed through to fn. */
	void *arg;

	/* Pipe used for synchronization: [0] is read by the child, [1] is
	 * closed by the child right away.
	 */
	int p[2];
};
3561
3562 static int run_userns_fn(void *data)
3563 {
3564 struct userns_fn_data *d = data;
3565 char c;
3566
3567 /* Close write end of the pipe. */
3568 close(d->p[1]);
3569
3570 /* Wait for parent to finish establishing a new mapping in the user
3571 * namespace we are executing in.
3572 */
3573 if (read(d->p[0], &c, 1) != 1)
3574 return -1;
3575
3576 /* Close read end of the pipe. */
3577 close(d->p[0]);
3578
3579 if (d->fn_name)
3580 TRACE("calling function \"%s\"", d->fn_name);
3581 /* Call function to run. */
3582 return d->fn(d->arg);
3583 }
3584
3585 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
3586 enum idtype idtype)
3587 {
3588 struct lxc_list *it;
3589 struct id_map *map;
3590 struct id_map *retmap = NULL;
3591
3592 lxc_list_for_each(it, &conf->id_map) {
3593 map = it->elem;
3594 if (map->idtype != idtype)
3595 continue;
3596
3597 if (id >= map->hostid && id < map->hostid + map->range) {
3598 retmap = map;
3599 break;
3600 }
3601 }
3602
3603 if (!retmap)
3604 return NULL;
3605
3606 retmap = malloc(sizeof(*retmap));
3607 if (!retmap)
3608 return NULL;
3609
3610 memcpy(retmap, map, sizeof(*retmap));
3611 return retmap;
3612 }
3613
3614 /*
3615 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3616 * existing one or establish a new one.
3617 */
3618 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
3619 {
3620 int hostid_mapped;
3621 struct id_map *entry = NULL;
3622
3623 /* Reuse existing mapping. */
3624 entry = mapped_hostid_entry(conf, id, type);
3625 if (entry)
3626 return entry;
3627
3628 /* Find new mapping. */
3629 hostid_mapped = find_unmapped_nsid(conf, type);
3630 if (hostid_mapped < 0) {
3631 DEBUG("failed to find free mapping for id %d", id);
3632 return NULL;
3633 }
3634
3635 entry = malloc(sizeof(*entry));
3636 if (!entry)
3637 return NULL;
3638
3639 entry->idtype = type;
3640 entry->nsid = hostid_mapped;
3641 entry->hostid = (unsigned long)id;
3642 entry->range = 1;
3643
3644 return entry;
3645 }
3646
3647 /* Run a function in a new user namespace.
3648 * The caller's euid/egid will be mapped if it is not already.
3649 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3650 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3651 * This means we require only to establish a mapping from:
3652 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3653 * - the container root -> some sub{g,u}id
3654 * The former we add, if the user did not specifiy a mapping. The latter we
3655 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
3656 * there to start the container in the first place.
3657 */
3658 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
3659 const char *fn_name)
3660 {
3661 pid_t pid;
3662 uid_t euid, egid;
3663 struct userns_fn_data d;
3664 int p[2];
3665 struct lxc_list *it;
3666 struct id_map *map;
3667 char c = '1';
3668 int ret = -1;
3669 struct lxc_list *idmap = NULL, *tmplist = NULL;
3670 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3671 *host_uid_map = NULL, *host_gid_map = NULL;
3672
3673 ret = pipe(p);
3674 if (ret < 0) {
3675 SYSERROR("opening pipe");
3676 return -1;
3677 }
3678 d.fn = fn;
3679 d.fn_name = fn_name;
3680 d.arg = data;
3681 d.p[0] = p[0];
3682 d.p[1] = p[1];
3683
3684 /* Clone child in new user namespace. */
3685 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
3686 if (pid < 0) {
3687 ERROR("failed to clone child process in new user namespace");
3688 goto on_error;
3689 }
3690
3691 close(p[0]);
3692 p[0] = -1;
3693
3694 euid = geteuid();
3695 egid = getegid();
3696
3697 /* Find container root. */
3698 lxc_list_for_each(it, &conf->id_map) {
3699 map = it->elem;
3700
3701 if (map->nsid != 0)
3702 continue;
3703
3704 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
3705 container_root_uid = malloc(sizeof(*container_root_uid));
3706 if (!container_root_uid)
3707 goto on_error;
3708 container_root_uid->idtype = map->idtype;
3709 container_root_uid->hostid = map->hostid;
3710 container_root_uid->nsid = 0;
3711 container_root_uid->range = map->range;
3712
3713 /* Check if container root mapping contains a mapping
3714 * for user's uid.
3715 */
3716 if (euid >= map->hostid && euid < map->hostid + map->range)
3717 host_uid_map = container_root_uid;
3718 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
3719 container_root_gid = malloc(sizeof(*container_root_gid));
3720 if (!container_root_gid)
3721 goto on_error;
3722 container_root_gid->idtype = map->idtype;
3723 container_root_gid->hostid = map->hostid;
3724 container_root_gid->nsid = 0;
3725 container_root_gid->range = map->range;
3726
3727 /* Check if container root mapping contains a mapping
3728 * for user's gid.
3729 */
3730 if (egid >= map->hostid && egid < map->hostid + map->range)
3731 host_gid_map = container_root_gid;
3732 }
3733
3734 /* Found container root. */
3735 if (container_root_uid && container_root_gid)
3736 break;
3737 }
3738
3739 /* This is actually checked earlier but it can't hurt. */
3740 if (!container_root_uid || !container_root_gid) {
3741 ERROR("no mapping for container root found");
3742 goto on_error;
3743 }
3744
3745 /* Check whether the {g,u}id of the user has a mapping. */
3746 if (!host_uid_map)
3747 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3748
3749 if (!host_gid_map)
3750 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3751
3752 if (!host_uid_map) {
3753 DEBUG("failed to find mapping for uid %d", euid);
3754 goto on_error;
3755 }
3756
3757 if (!host_gid_map) {
3758 DEBUG("failed to find mapping for gid %d", egid);
3759 goto on_error;
3760 }
3761
3762 /* Allocate new {g,u}id map list. */
3763 idmap = malloc(sizeof(*idmap));
3764 if (!idmap)
3765 goto on_error;
3766 lxc_list_init(idmap);
3767
3768 /* Add container root to the map. */
3769 tmplist = malloc(sizeof(*tmplist));
3770 if (!tmplist)
3771 goto on_error;
3772 lxc_list_add_elem(tmplist, container_root_uid);
3773 lxc_list_add_tail(idmap, tmplist);
3774
3775 if (host_uid_map && (host_uid_map != container_root_uid)) {
3776 /* idmap will now keep track of that memory. */
3777 container_root_uid = NULL;
3778
3779 /* Add container root to the map. */
3780 tmplist = malloc(sizeof(*tmplist));
3781 if (!tmplist)
3782 goto on_error;
3783 lxc_list_add_elem(tmplist, host_uid_map);
3784 lxc_list_add_tail(idmap, tmplist);
3785 }
3786 /* idmap will now keep track of that memory. */
3787 container_root_uid = NULL;
3788 /* idmap will now keep track of that memory. */
3789 host_uid_map = NULL;
3790
3791 tmplist = malloc(sizeof(*tmplist));
3792 if (!tmplist)
3793 goto on_error;
3794 lxc_list_add_elem(tmplist, container_root_gid);
3795 lxc_list_add_tail(idmap, tmplist);
3796
3797 if (host_gid_map && (host_gid_map != container_root_gid)) {
3798 /* idmap will now keep track of that memory. */
3799 container_root_gid = NULL;
3800
3801 tmplist = malloc(sizeof(*tmplist));
3802 if (!tmplist)
3803 goto on_error;
3804 lxc_list_add_elem(tmplist, host_gid_map);
3805 lxc_list_add_tail(idmap, tmplist);
3806 }
3807 /* idmap will now keep track of that memory. */
3808 container_root_gid = NULL;
3809 /* idmap will now keep track of that memory. */
3810 host_gid_map = NULL;
3811
3812 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3813 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
3814 lxc_list_for_each(it, idmap) {
3815 map = it->elem;
3816 TRACE("establishing %cid mapping for \"%d\" in new "
3817 "user namespace: nsuid %lu - hostid %lu - range "
3818 "%lu",
3819 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
3820 map->nsid, map->hostid, map->range);
3821 }
3822 }
3823
3824 /* Set up {g,u}id mapping for user namespace of child process. */
3825 ret = lxc_map_ids(idmap, pid);
3826 if (ret < 0) {
3827 ERROR("error setting up {g,u}id mappings for child process "
3828 "\"%d\"", pid);
3829 goto on_error;
3830 }
3831
3832 /* Tell child to proceed. */
3833 if (write(p[1], &c, 1) != 1) {
3834 SYSERROR("failed telling child process \"%d\" to proceed", pid);
3835 goto on_error;
3836 }
3837
3838 /* Wait for child to finish. */
3839 ret = wait_for_pid(pid);
3840
3841 on_error:
3842 if (idmap)
3843 lxc_free_idmap(idmap);
3844 if (container_root_uid)
3845 free(container_root_uid);
3846 if (container_root_gid)
3847 free(container_root_gid);
3848 if (host_uid_map && (host_uid_map != container_root_uid))
3849 free(host_uid_map);
3850 if (host_gid_map && (host_gid_map != container_root_gid))
3851 free(host_gid_map);
3852
3853 if (p[0] != -1)
3854 close(p[0]);
3855 close(p[1]);
3856
3857 return ret;
3858 }
3859
3860 int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
3861 const char *fn_name)
3862 {
3863 pid_t pid;
3864 uid_t euid, egid;
3865 struct userns_fn_data d;
3866 int p[2];
3867 struct id_map *map;
3868 struct lxc_list *cur;
3869 char c = '1';
3870 int ret = -1;
3871 struct lxc_list *idmap = NULL, *tmplist = NULL;
3872 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3873 *host_uid_map = NULL, *host_gid_map = NULL;
3874
3875 ret = pipe(p);
3876 if (ret < 0) {
3877 SYSERROR("opening pipe");
3878 return -1;
3879 }
3880 d.fn = fn;
3881 d.fn_name = fn_name;
3882 d.arg = data;
3883 d.p[0] = p[0];
3884 d.p[1] = p[1];
3885
3886 /* Clone child in new user namespace. */
3887 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
3888 if (pid < 0) {
3889 ERROR("failed to clone child process in new user namespace");
3890 goto on_error;
3891 }
3892
3893 close(p[0]);
3894 p[0] = -1;
3895
3896 euid = geteuid();
3897 egid = getegid();
3898
3899 /* Allocate new {g,u}id map list. */
3900 idmap = malloc(sizeof(*idmap));
3901 if (!idmap)
3902 goto on_error;
3903 lxc_list_init(idmap);
3904
3905 /* Find container root. */
3906 lxc_list_for_each(cur, &conf->id_map) {
3907 struct id_map *tmpmap;
3908
3909 tmplist = malloc(sizeof(*tmplist));
3910 if (!tmplist)
3911 goto on_error;
3912
3913 tmpmap = malloc(sizeof(*tmpmap));
3914 if (!tmpmap) {
3915 free(tmplist);
3916 goto on_error;
3917 }
3918
3919 memset(tmpmap, 0, sizeof(*tmpmap));
3920 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
3921 tmplist->elem = tmpmap;
3922
3923 lxc_list_add_tail(idmap, tmplist);
3924
3925 map = cur->elem;
3926
3927 if (map->idtype == ID_TYPE_UID)
3928 if (euid >= map->hostid && euid < map->hostid + map->range)
3929 host_uid_map = map;
3930
3931 if (map->idtype == ID_TYPE_GID)
3932 if (egid >= map->hostid && egid < map->hostid + map->range)
3933 host_gid_map = map;
3934
3935 if (map->nsid != 0)
3936 continue;
3937
3938 if (map->idtype == ID_TYPE_UID)
3939 if (container_root_uid == NULL)
3940 container_root_uid = map;
3941
3942 if (map->idtype == ID_TYPE_GID)
3943 if (container_root_gid == NULL)
3944 container_root_gid = map;
3945 }
3946
3947 if (!container_root_uid || !container_root_gid) {
3948 ERROR("No mapping for container root found");
3949 goto on_error;
3950 }
3951
3952 /* Check whether the {g,u}id of the user has a mapping. */
3953 if (!host_uid_map)
3954 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3955 else
3956 host_uid_map = container_root_uid;
3957
3958 if (!host_gid_map)
3959 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3960 else
3961 host_gid_map = container_root_gid;
3962
3963 if (!host_uid_map) {
3964 DEBUG("Failed to find mapping for uid %d", euid);
3965 goto on_error;
3966 }
3967
3968 if (!host_gid_map) {
3969 DEBUG("Failed to find mapping for gid %d", egid);
3970 goto on_error;
3971 }
3972
3973 if (host_uid_map && (host_uid_map != container_root_uid)) {
3974 /* Add container root to the map. */
3975 tmplist = malloc(sizeof(*tmplist));
3976 if (!tmplist)
3977 goto on_error;
3978 lxc_list_add_elem(tmplist, host_uid_map);
3979 lxc_list_add_tail(idmap, tmplist);
3980 }
3981 /* idmap will now keep track of that memory. */
3982 host_uid_map = NULL;
3983
3984 if (host_gid_map && (host_gid_map != container_root_gid)) {
3985 tmplist = malloc(sizeof(*tmplist));
3986 if (!tmplist)
3987 goto on_error;
3988 lxc_list_add_elem(tmplist, host_gid_map);
3989 lxc_list_add_tail(idmap, tmplist);
3990 }
3991 /* idmap will now keep track of that memory. */
3992 host_gid_map = NULL;
3993
3994 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3995 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
3996 lxc_list_for_each(cur, idmap) {
3997 map = cur->elem;
3998 TRACE("establishing %cid mapping for \"%d\" in new "
3999 "user namespace: nsuid %lu - hostid %lu - range "
4000 "%lu",
4001 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4002 map->nsid, map->hostid, map->range);
4003 }
4004 }
4005
4006 /* Set up {g,u}id mapping for user namespace of child process. */
4007 ret = lxc_map_ids(idmap, pid);
4008 if (ret < 0) {
4009 ERROR("error setting up {g,u}id mappings for child process "
4010 "\"%d\"", pid);
4011 goto on_error;
4012 }
4013
4014 /* Tell child to proceed. */
4015 if (write(p[1], &c, 1) != 1) {
4016 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4017 goto on_error;
4018 }
4019
4020 /* Wait for child to finish. */
4021 ret = wait_for_pid(pid);
4022
4023 on_error:
4024 if (idmap)
4025 lxc_free_idmap(idmap);
4026 if (host_uid_map && (host_uid_map != container_root_uid))
4027 free(host_uid_map);
4028 if (host_gid_map && (host_gid_map != container_root_gid))
4029 free(host_gid_map);
4030
4031 if (p[0] != -1)
4032 close(p[0]);
4033 close(p[1]);
4034
4035 return ret;
4036 }
4037
/* Return a heap-allocated copy of the login name belonging to the effective
 * uid, or NULL if the lookup fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pwent = getpwuid(geteuid());

	return pwent ? strdup(pwent->pw_name) : NULL;
}
4049
/* Return a heap-allocated copy of the group name belonging to the effective
 * gid, or NULL if the lookup fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *grent = getgrgid(getegid());

	return grent ? strdup(grent->gr_name) : NULL;
}
4061
4062 /* not thread-safe, do not use from api without first forking */
4063 void suggest_default_idmap(void)
4064 {
4065 FILE *f;
4066 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4067 char *line = NULL;
4068 char *uname, *gname;
4069 size_t len = 0;
4070
4071 if (!(uname = getuname()))
4072 return;
4073
4074 if (!(gname = getgname())) {
4075 free(uname);
4076 return;
4077 }
4078
4079 f = fopen(subuidfile, "r");
4080 if (!f) {
4081 ERROR("Your system is not configured with subuids");
4082 free(gname);
4083 free(uname);
4084 return;
4085 }
4086 while (getline(&line, &len, f) != -1) {
4087 size_t no_newline = 0;
4088 char *p = strchr(line, ':'), *p2;
4089 if (*line == '#')
4090 continue;
4091 if (!p)
4092 continue;
4093 *p = '\0';
4094 p++;
4095 if (strcmp(line, uname))
4096 continue;
4097 p2 = strchr(p, ':');
4098 if (!p2)
4099 continue;
4100 *p2 = '\0';
4101 p2++;
4102 if (!*p2)
4103 continue;
4104 no_newline = strcspn(p2, "\n");
4105 p2[no_newline] = '\0';
4106
4107 if (lxc_safe_uint(p, &uid) < 0)
4108 WARN("Could not parse UID.");
4109 if (lxc_safe_uint(p2, &urange) < 0)
4110 WARN("Could not parse UID range.");
4111 }
4112 fclose(f);
4113
4114 f = fopen(subgidfile, "r");
4115 if (!f) {
4116 ERROR("Your system is not configured with subgids");
4117 free(gname);
4118 free(uname);
4119 return;
4120 }
4121 while (getline(&line, &len, f) != -1) {
4122 size_t no_newline = 0;
4123 char *p = strchr(line, ':'), *p2;
4124 if (*line == '#')
4125 continue;
4126 if (!p)
4127 continue;
4128 *p = '\0';
4129 p++;
4130 if (strcmp(line, uname))
4131 continue;
4132 p2 = strchr(p, ':');
4133 if (!p2)
4134 continue;
4135 *p2 = '\0';
4136 p2++;
4137 if (!*p2)
4138 continue;
4139 no_newline = strcspn(p2, "\n");
4140 p2[no_newline] = '\0';
4141
4142 if (lxc_safe_uint(p, &gid) < 0)
4143 WARN("Could not parse GID.");
4144 if (lxc_safe_uint(p2, &grange) < 0)
4145 WARN("Could not parse GID range.");
4146 }
4147 fclose(f);
4148
4149 free(line);
4150
4151 if (!urange || !grange) {
4152 ERROR("You do not have subuids or subgids allocated");
4153 ERROR("Unprivileged containers require subuids and subgids");
4154 return;
4155 }
4156
4157 ERROR("You must either run as root, or define uid mappings");
4158 ERROR("To pass uid mappings to lxc-create, you could create");
4159 ERROR("~/.config/lxc/default.conf:");
4160 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4161 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4162 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
4163
4164 free(gname);
4165 free(uname);
4166 }
4167
4168 static void free_cgroup_settings(struct lxc_list *result)
4169 {
4170 struct lxc_list *iterator, *next;
4171
4172 lxc_list_for_each_safe(iterator, result, next) {
4173 lxc_list_del(iterator);
4174 free(iterator);
4175 }
4176 free(result);
4177 }
4178
4179 /*
4180 * Return the list of cgroup_settings sorted according to the following rules
4181 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4182 */
4183 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4184 {
4185 struct lxc_list *result;
4186 struct lxc_list *memsw_limit = NULL;
4187 struct lxc_list *it = NULL;
4188 struct lxc_cgroup *cg = NULL;
4189 struct lxc_list *item = NULL;
4190
4191 result = malloc(sizeof(*result));
4192 if (!result) {
4193 ERROR("failed to allocate memory to sort cgroup settings");
4194 return NULL;
4195 }
4196 lxc_list_init(result);
4197
4198 /*Iterate over the cgroup settings and copy them to the output list*/
4199 lxc_list_for_each(it, cgroup_settings) {
4200 item = malloc(sizeof(*item));
4201 if (!item) {
4202 ERROR("failed to allocate memory to sort cgroup settings");
4203 free_cgroup_settings(result);
4204 return NULL;
4205 }
4206 item->elem = it->elem;
4207 cg = it->elem;
4208 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4209 /* Store the memsw_limit location */
4210 memsw_limit = item;
4211 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4212 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
4213 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4214 item->elem = memsw_limit->elem;
4215 memsw_limit->elem = it->elem;
4216 }
4217 lxc_list_add_tail(result, item);
4218 }
4219
4220 return result;
4221 }