]>
Commit | Line | Data |
---|---|---|
5bb3ba8a DL |
1 | /* |
2 | * lxc: linux Container library | |
3 | * | |
4 | * (C) Copyright IBM Corp. 2007, 2009 | |
5 | * | |
6 | * Authors: | |
9afe19d6 | 7 | * Daniel Lezcano <daniel.lezcano at free.fr> |
5bb3ba8a DL |
8 | * |
9 | * This library is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License as published by the Free Software Foundation; either | |
12 | * version 2.1 of the License, or (at your option) any later version. | |
13 | * | |
14 | * This library is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | * Lesser General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU Lesser General Public | |
20 | * License along with this library; if not, write to the Free Software | |
250b1eec | 21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
5bb3ba8a DL |
22 | */ |
23 | ||
d38dd64a CB |
24 | #ifndef _GNU_SOURCE |
25 | #define _GNU_SOURCE 1 | |
26 | #endif | |
5bb3ba8a DL |
27 | #include <alloca.h> |
28 | #include <errno.h> | |
a2028b8f | 29 | #include <fcntl.h> |
8ab93249 | 30 | #include <sched.h> |
5bb3ba8a | 31 | #include <signal.h> |
81c75799 | 32 | #include <sys/param.h> |
81c75799 | 33 | #include <sys/stat.h> |
8ab93249 | 34 | #include <sys/syscall.h> |
a2028b8f | 35 | #include <sys/types.h> |
d38dd64a | 36 | #include <unistd.h> |
5bb3ba8a | 37 | |
d38dd64a | 38 | #include "config.h" |
81c75799 | 39 | #include "log.h" |
a2028b8f CB |
40 | #include "namespace.h" |
41 | #include "utils.h" | |
81c75799 | 42 | |
ac2cecc4 | 43 | lxc_log_define(namespace, lxc); |
5bb3ba8a DL |
44 | |
/* Bundle handed to do_clone(): the callback to run and the opaque pointer
 * that is passed to it.
 */
struct clone_arg {
	int (*fn)(void *); /* callback invoked by do_clone() */
	void *arg;         /* argument forwarded to fn */
};
49 | ||
50 | static int do_clone(void *arg) | |
51 | { | |
52 | struct clone_arg *clone_arg = arg; | |
53 | return clone_arg->fn(clone_arg->arg); | |
54 | } | |
55 | ||
56 | pid_t lxc_clone(int (*fn)(void *), void *arg, int flags) | |
57 | { | |
58 | struct clone_arg clone_arg = { | |
59 | .fn = fn, | |
60 | .arg = arg, | |
61 | }; | |
62 | ||
a2028b8f | 63 | size_t stack_size = lxc_getpagesize(); |
92c64f7e | 64 | void *stack = alloca(stack_size); |
5bb3ba8a DL |
65 | pid_t ret; |
66 | ||
246091b9 | 67 | #ifdef __ia64__ |
8ab93249 | 68 | ret = __clone2(do_clone, stack, stack_size, flags | SIGCHLD, &clone_arg); |
246091b9 | 69 | #else |
92c64f7e | 70 | ret = clone(do_clone, stack + stack_size, flags | SIGCHLD, &clone_arg); |
246091b9 | 71 | #endif |
5bb3ba8a | 72 | if (ret < 0) |
6d1400b5 | 73 | SYSERROR("Failed to clone (%#x)", flags); |
5bb3ba8a DL |
74 | |
75 | return ret; | |
76 | } | |
39a5d5fe | 77 | |
8ab93249 CB |
/**
 * This is based on raw_clone in systemd but adapted to our needs. This uses
 * copy on write semantics and doesn't pass a stack. CLONE_VM is tricky and
 * doesn't really matter to us so disallow it.
 *
 * The nice thing about this is that we get fork() behavior. That is
 * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
 *
 * SIGCHLD is always or'ed into @flags before the system call is made.
 *
 * If @flags contains any of CLONE_VM, CLONE_PARENT_SETTID,
 * CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID or CLONE_SETTLS the call is
 * rejected: errno is set to EINVAL and -EINVAL is returned. errno is not
 * touched by this function when the flags are acceptable; it is then only
 * modified by the system call itself.
 */
pid_t lxc_raw_clone(unsigned long flags)
{
	/* These flags don't interest us at all so we don't jump through any
	 * hoops of retrieving them and passing them to the kernel.
	 */
	if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
		      CLONE_CHILD_CLEARTID | CLONE_SETTLS))) {
		/* Only report EINVAL when we actually reject the request so
		 * that a stale error code does not leak into the success path.
		 */
		errno = EINVAL;
		return -EINVAL;
	}

#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
	/* On s390/s390x and cris the order of the first and second arguments
	 * of the system call is reversed.
	 */
	return (int)syscall(__NR_clone, NULL, flags | SIGCHLD);
#elif defined(__sparc__) && defined(__arch64__)
	{
		/**
		 * sparc64 always returns the other process id in %o0, and
		 * a boolean flag whether this is the child or the parent in
		 * %o1. Inline assembly is needed to get the flag returned
		 * in %o1.
		 */
		int in_child;
		int child_pid;
		asm volatile("mov %2, %%g1\n\t"
			     "mov %3, %%o0\n\t"
			     "mov 0 , %%o1\n\t"
			     "t 0x6d\n\t"
			     "mov %%o1, %0\n\t"
			     "mov %%o0, %1"
			     : "=r"(in_child), "=r"(child_pid)
			     : "i"(__NR_clone), "r"(flags | SIGCHLD)
			     : "%o1", "%o0", "%g1");

		if (in_child)
			return 0;
		else
			return child_pid;
	}
#elif defined(__ia64__)
	/* On ia64 the stack and stack size are passed as separate arguments. */
	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0);
#else
	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL);
#endif
}
134 | ||
0c2a98bd CB |
/* Callback flavour of lxc_raw_clone(): the child runs fn(args) and leaves
 * with that result as its exit status; the parent gets the child's pid.
 *
 * Returns -1 when lxc_raw_clone() reports a negative value.
 */
pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags)
{
	pid_t child = lxc_raw_clone(flags);

	if (child < 0)
		return -1;

	if (child > 0)
		return child;

	/* We are the child. exit() is not thread-safe and might mess with the
	 * parent's signal handlers and other stuff when exec() fails, so leave
	 * through _exit().
	 */
	_exit(fn(args));
}
151 | ||
29ed9c13 CB |
152 | /* Leave the user namespace at the first position in the array of structs so |
153 | * that we always attach to it first when iterating over the struct and using | |
154 | * setns() to switch namespaces. This especially affects lxc_attach(): Suppose | |
155 | * you cloned a new user namespace and mount namespace as an unprivileged user | |
156 | * on the host and want to setns() to the mount namespace. This requires you to | |
157 | * attach to the user namespace first otherwise the kernel will fail this check: | |
158 | * | |
159 | * if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || | |
160 | * !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || | |
161 | * !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | |
162 | * return -EPERM; | |
163 | * | |
164 | * in | |
165 | * | |
166 | * linux/fs/namespace.c:mntns_install(). | |
167 | */ | |
9662e444 | 168 | const struct ns_info ns_info[LXC_NS_MAX] = { |
18b3b9c1 CB |
169 | [LXC_NS_USER] = { "user", CLONE_NEWUSER, "CLONE_NEWUSER", "LXC_USER_NS" }, |
170 | [LXC_NS_MNT] = { "mnt", CLONE_NEWNS, "CLONE_NEWNS", "LXC_MNT_NS" }, | |
171 | [LXC_NS_PID] = { "pid", CLONE_NEWPID, "CLONE_NEWPID", "LXC_PID_NS" }, | |
172 | [LXC_NS_UTS] = { "uts", CLONE_NEWUTS, "CLONE_NEWUTS", "LXC_UTS_NS" }, | |
173 | [LXC_NS_IPC] = { "ipc", CLONE_NEWIPC, "CLONE_NEWIPC", "LXC_IPC_NS" }, | |
174 | [LXC_NS_NET] = { "net", CLONE_NEWNET, "CLONE_NEWNET", "LXC_NET_NS" }, | |
175 | [LXC_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS" } | |
39a5d5fe CS |
176 | }; |
177 | ||
28d9e29e | 178 | int lxc_namespace_2_cloneflag(const char *namespace) |
39a5d5fe | 179 | { |
9662e444 | 180 | int i; |
727b9b16 | 181 | |
9662e444 CB |
182 | for (i = 0; i < LXC_NS_MAX; i++) |
183 | if (!strcasecmp(ns_info[i].proc_name, namespace)) | |
184 | return ns_info[i].clone_flag; | |
39a5d5fe | 185 | |
28d9e29e CB |
186 | ERROR("Invalid namespace name \"%s\"", namespace); |
187 | return -EINVAL; | |
188 | } | |
189 | ||
190 | int lxc_namespace_2_ns_idx(const char *namespace) | |
191 | { | |
192 | int i; | |
727b9b16 | 193 | |
28d9e29e CB |
194 | for (i = 0; i < LXC_NS_MAX; i++) |
195 | if (!strcmp(ns_info[i].proc_name, namespace)) | |
196 | return i; | |
197 | ||
198 | ERROR("Invalid namespace name \"%s\"", namespace); | |
199 | return -EINVAL; | |
39a5d5fe CS |
200 | } |
201 | ||
/* Rewrite, in place, the long namespace identifiers documented for
 * lxc-attach and lxc-unshare into the standard short ones:
 *
 *    "MOUNT"   -> "MNT"
 *    "NETWORK" -> "NET"
 *    "UTSNAME" -> "UTS"
 *
 * Every occurrence in @namespaces is shortened by sliding the tail of the
 * string (terminating NUL byte included) to the left with memmove(). For
 * example "IPC|MOUNT|PID" becomes "IPC|MNT|PID".
 *
 * Returns -1 if @namespaces is NULL, 0 otherwise.
 */
extern int lxc_namespace_2_std_identifiers(char *namespaces)
{
	static const char *const long_names[] = { "NETWORK", "UTSNAME" };
	char *hit;
	size_t idx;

	if (!namespaces)
		return -1;

	/* "MOUNT": keep the 'M', drop "OU", keep "NT" and everything after. */
	for (hit = strstr(namespaces, "MOUNT"); hit; hit = strstr(namespaces, "MOUNT"))
		memmove(hit + 1, hit + 3, strlen(hit) - 2);

	/* Seven-letter names: keep the first three letters, drop the next four. */
	for (idx = 0; idx < sizeof(long_names) / sizeof(long_names[0]); idx++) {
		hit = strstr(namespaces, long_names[idx]);
		while (hit) {
			memmove(hit + 3, hit + 7, strlen(hit) - 6);
			hit = strstr(namespaces, long_names[idx]);
		}
	}

	return 0;
}
235 | ||
39a5d5fe CS |
236 | int lxc_fill_namespace_flags(char *flaglist, int *flags) |
237 | { | |
803fd7bf | 238 | char *token; |
39a5d5fe CS |
239 | int aflag; |
240 | ||
241 | if (!flaglist) { | |
9662e444 | 242 | ERROR("At least one namespace is needed."); |
39a5d5fe CS |
243 | return -1; |
244 | } | |
245 | ||
803fd7bf | 246 | lxc_iterate_parts(token, flaglist, "|") { |
39a5d5fe CS |
247 | aflag = lxc_namespace_2_cloneflag(token); |
248 | if (aflag < 0) | |
249 | return -1; | |
250 | ||
251 | *flags |= aflag; | |
39a5d5fe | 252 | } |
727b9b16 | 253 | |
39a5d5fe CS |
254 | return 0; |
255 | } |