]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/namespace.c
Merge pull request #2034 from brauner/2017-12-14/use_clone_in_run_command
[mirror_lxc.git] / src / lxc / namespace.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2009
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include <alloca.h>
25 #include <errno.h>
26 #include <fcntl.h>
27 #include <sched.h>
28 #include <signal.h>
29 #include <unistd.h>
30 #include <sys/param.h>
31 #include <sys/stat.h>
32 #include <sys/syscall.h>
33 #include <sys/types.h>
34
35 #include "log.h"
36 #include "namespace.h"
37 #include "utils.h"
38
/* Invoke the project's lxc_log_define() macro (declared outside this file)
 * with the names "lxc_namespace" and "lxc". Presumably this sets up the log
 * category that the ERROR() calls in this file report through - confirm
 * against log.h.
 */
lxc_log_define(lxc_namespace, lxc);
40
/* Bundles a callback and its argument so that both can be handed to clone()
 * through its single void pointer argument. lxc_clone() fills it in and
 * do_clone() unpacks it again.
 */
struct clone_arg {
	int (*fn)(void *); /* function to call from do_clone() */
	void *arg;         /* opaque argument passed to fn */
};
45
46 static int do_clone(void *arg)
47 {
48 struct clone_arg *clone_arg = arg;
49 return clone_arg->fn(clone_arg->arg);
50 }
51
52 pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
53 {
54 struct clone_arg clone_arg = {
55 .fn = fn,
56 .arg = arg,
57 };
58
59 size_t stack_size = lxc_getpagesize();
60 void *stack = alloca(stack_size);
61 pid_t ret;
62
63 #ifdef __ia64__
64 ret = __clone2(do_clone, stack, stack_size, flags | SIGCHLD, &clone_arg);
65 #else
66 ret = clone(do_clone, stack + stack_size, flags | SIGCHLD, &clone_arg);
67 #endif
68 if (ret < 0)
69 ERROR("Failed to clone (%#x): %s.", flags, strerror(errno));
70
71 return ret;
72 }
73
/**
 * lxc_raw_clone - fork()-like wrapper around the raw clone system call
 *
 * @flags: clone flags; SIGCHLD is always or'ed in
 *
 * This is based on raw_clone in systemd but adapted to our needs. It uses
 * copy-on-write semantics and doesn't pass a stack. CLONE_VM is tricky and
 * doesn't really matter to us, so it is disallowed.
 *
 * The nice thing about this is that we get fork() behavior: lxc_raw_clone()
 * returns 0 in the child and the child pid in the parent.
 *
 * If @flags contains CLONE_VM, CLONE_PARENT_SETTID, CLONE_CHILD_SETTID,
 * CLONE_CHILD_CLEARTID or CLONE_SETTLS, no process is created, errno is set
 * to EINVAL and -EINVAL is returned. errno is only written by this function
 * itself on that rejection path.
 */
pid_t lxc_raw_clone(unsigned long flags)
{
	/* These flags are of no interest to us, so we don't jump through any
	 * hoops to retrieve their arguments and pass them to the kernel.
	 */
	if (flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
		     CLONE_CHILD_CLEARTID | CLONE_SETTLS)) {
		errno = EINVAL;
		return -EINVAL;
	}

#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
	/* On s390/s390x and cris the order of the first and second arguments
	 * of the system call is reversed.
	 */
	return (int)syscall(__NR_clone, NULL, flags | SIGCHLD);
#elif defined(__sparc__) && defined(__arch64__)
	{
		/**
		 * sparc64 always returns the other process id in %o0, and
		 * a boolean flag whether this is the child or the parent in
		 * %o1. Inline assembly is needed to get the flag returned
		 * in %o1.
		 */
		int in_child;
		int child_pid;
		asm volatile("mov %2, %%g1\n\t"
			     "mov %3, %%o0\n\t"
			     "mov 0 , %%o1\n\t"
			     "t 0x6d\n\t"
			     "mov %%o1, %0\n\t"
			     "mov %%o0, %1"
			     : "=r"(in_child), "=r"(child_pid)
			     : "i"(__NR_clone), "r"(flags | SIGCHLD)
			     : "%o1", "%o0", "%g1");
		if (in_child)
			return 0;
		else
			return child_pid;
	}
#elif defined(__ia64__)
	/* On ia64 the stack and stack size are passed as separate arguments. */
	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0);
#else
	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL);
#endif
}
129
130 /* Leave the user namespace at the first position in the array of structs so
131 * that we always attach to it first when iterating over the struct and using
132 * setns() to switch namespaces. This especially affects lxc_attach(): Suppose
133 * you cloned a new user namespace and mount namespace as an unprivileged user
134 * on the host and want to setns() to the mount namespace. This requires you to
135 * attach to the user namespace first otherwise the kernel will fail this check:
136 *
137 * if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
138 * !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
139 * !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
140 * return -EPERM;
141 *
142 * in
143 *
144 * linux/fs/namespace.c:mntns_install().
145 */
146 const struct ns_info ns_info[LXC_NS_MAX] = {
147 [LXC_NS_USER] = { "user", CLONE_NEWUSER, "CLONE_NEWUSER", "LXC_USER_NS" },
148 [LXC_NS_MNT] = { "mnt", CLONE_NEWNS, "CLONE_NEWNS", "LXC_MNT_NS" },
149 [LXC_NS_PID] = { "pid", CLONE_NEWPID, "CLONE_NEWPID", "LXC_PID_NS" },
150 [LXC_NS_UTS] = { "uts", CLONE_NEWUTS, "CLONE_NEWUTS", "LXC_UTS_NS" },
151 [LXC_NS_IPC] = { "ipc", CLONE_NEWIPC, "CLONE_NEWIPC", "LXC_IPC_NS" },
152 [LXC_NS_NET] = { "net", CLONE_NEWNET, "CLONE_NEWNET", "LXC_NET_NS" },
153 [LXC_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS" }
154 };
155
156 int lxc_namespace_2_cloneflag(const char *namespace)
157 {
158 int i;
159 for (i = 0; i < LXC_NS_MAX; i++)
160 if (!strcasecmp(ns_info[i].proc_name, namespace))
161 return ns_info[i].clone_flag;
162
163 ERROR("Invalid namespace name \"%s\"", namespace);
164 return -EINVAL;
165 }
166
167 int lxc_namespace_2_ns_idx(const char *namespace)
168 {
169 int i;
170 for (i = 0; i < LXC_NS_MAX; i++)
171 if (!strcmp(ns_info[i].proc_name, namespace))
172 return i;
173
174 ERROR("Invalid namespace name \"%s\"", namespace);
175 return -EINVAL;
176 }
177
/* Parse a '|'-separated list of namespace names into clone flags.
 *
 * @flaglist : list such as "mnt|pid"; it is tokenized in place with
 *             strtok_r(), so the buffer is modified
 * @flags    : the clone flag of every listed namespace is or'ed into *flags
 *
 * Returns 0 on success. Returns -1 if @flaglist is NULL or if any token is
 * rejected by lxc_namespace_2_cloneflag(); in that case *flags is left
 * untouched, so the caller never sees a half-applied list.
 */
int lxc_fill_namespace_flags(char *flaglist, int *flags)
{
	char *token, *saveptr = NULL;
	int collected = 0;

	if (!flaglist) {
		ERROR("At least one namespace is needed.");
		return -1;
	}

	for (token = strtok_r(flaglist, "|", &saveptr); token;
	     token = strtok_r(NULL, "|", &saveptr)) {
		int aflag = lxc_namespace_2_cloneflag(token);

		if (aflag < 0)
			return -1;

		collected |= aflag;
	}

	/* Commit only once every token has been validated. */
	*flags |= collected;
	return 0;
}