git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/namespace.c
tree-wide: fix includes to fix bionic builds
[mirror_lxc.git] / src / lxc / namespace.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2009
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #ifndef _GNU_SOURCE
25 #define _GNU_SOURCE 1
26 #endif
27 #include <alloca.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <sched.h>
31 #include <signal.h>
32 #include <sys/param.h>
33 #include <sys/stat.h>
34 #include <sys/syscall.h>
35 #include <sys/types.h>
36 #include <unistd.h>
37
38 #include "config.h"
39 #include "log.h"
40 #include "namespace.h"
41 #include "utils.h"
42
43 lxc_log_define(namespace, lxc);
44
/* Bundles a callback with its argument so that both can be passed through the
 * single void pointer that do_clone() receives.
 */
struct clone_arg {
	/* Callback that do_clone() invokes. */
	int (*fn)(void *);

	/* Opaque pointer forwarded unchanged to fn. */
	void *arg;
};
49
50 static int do_clone(void *arg)
51 {
52 struct clone_arg *clone_arg = arg;
53 return clone_arg->fn(clone_arg->arg);
54 }
55
56 pid_t lxc_clone(int (*fn)(void *), void *arg, int flags)
57 {
58 struct clone_arg clone_arg = {
59 .fn = fn,
60 .arg = arg,
61 };
62
63 size_t stack_size = lxc_getpagesize();
64 void *stack = alloca(stack_size);
65 pid_t ret;
66
67 #ifdef __ia64__
68 ret = __clone2(do_clone, stack, stack_size, flags | SIGCHLD, &clone_arg);
69 #else
70 ret = clone(do_clone, stack + stack_size, flags | SIGCHLD, &clone_arg);
71 #endif
72 if (ret < 0)
73 SYSERROR("Failed to clone (%#x)", flags);
74
75 return ret;
76 }
77
/**
 * This is based on raw_clone in systemd but adapted to our needs. This uses
 * copy on write semantics and doesn't pass a stack. CLONE_VM is tricky and
 * doesn't really matter to us so disallow it.
 *
 * The nice thing about this is that we get fork() behavior. That is
 * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
 *
 * @flags : clone flags; SIGCHLD is always or'ed in. CLONE_VM,
 *          CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID and
 *          CLONE_SETTLS are rejected.
 *
 * Returns -EINVAL with errno set to EINVAL when a rejected flag is passed.
 * Otherwise returns the result of the raw clone system call. errno is not
 * modified by this function itself unless flags are rejected.
 */
pid_t lxc_raw_clone(unsigned long flags)
{
	/* These flags don't interest us at all so we don't jump through any
	 * hoops of retrieving them and passing them to the kernel.
	 */
	if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
		      CLONE_CHILD_CLEARTID | CLONE_SETTLS))) {
		errno = EINVAL;
		return -EINVAL;
	}

#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
	/* On s390/s390x and cris the order of the first and second arguments
	 * of the system call is reversed.
	 */
	return (int)syscall(__NR_clone, NULL, flags | SIGCHLD);
#elif defined(__sparc__) && defined(__arch64__)
	{
		/**
		 * sparc64 always returns the other process id in %o0, and
		 * a boolean flag whether this is the child or the parent in
		 * %o1. Inline assembly is needed to get the flag returned
		 * in %o1.
		 *
		 * NOTE(review): only %o0 and %o1 are read back here; a failed
		 * system call does not appear to be detected on this path.
		 * Confirm against the sparc64 syscall ABI.
		 */
		int in_child;
		int child_pid;
		asm volatile("mov %2, %%g1\n\t"
			     "mov %3, %%o0\n\t"
			     "mov 0 , %%o1\n\t"
			     "t 0x6d\n\t"
			     "mov %%o1, %0\n\t"
			     "mov %%o0, %1"
			     : "=r"(in_child), "=r"(child_pid)
			     : "i"(__NR_clone), "r"(flags | SIGCHLD)
			     : "%o1", "%o0", "%g1");

		if (in_child)
			return 0;
		else
			return child_pid;
	}
#elif defined(__ia64__)
	/* On ia64 the stack and stack size are passed as separate arguments. */
	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0);
#else
	return (int)syscall(__NR_clone, flags | SIGCHLD, NULL);
#endif
}
134
/* Clone with lxc_raw_clone(flags) and run fn(args) in the child.
 *
 * @fn    : callback executed in the child
 * @args  : opaque pointer forwarded to fn
 * @flags : passed unchanged to lxc_raw_clone()
 *
 * Returns the child's pid in the parent, or -1 when lxc_raw_clone() reported
 * a negative value. The child does not return from this function: it calls
 * _exit() with the value returned by fn.
 */
pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags)
{
	pid_t child = lxc_raw_clone(flags);

	if (child == 0) {
		/* Use _exit() here: exit() is not thread-safe and might mess
		 * with the parent's signal handlers and other stuff when
		 * exec() fails.
		 */
		_exit(fn(args));
	}

	return (child < 0) ? -1 : child;
}
151
152 /* Leave the user namespace at the first position in the array of structs so
153 * that we always attach to it first when iterating over the struct and using
154 * setns() to switch namespaces. This especially affects lxc_attach(): Suppose
155 * you cloned a new user namespace and mount namespace as an unprivileged user
156 * on the host and want to setns() to the mount namespace. This requires you to
157 * attach to the user namespace first otherwise the kernel will fail this check:
158 *
159 * if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
160 * !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
161 * !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
162 * return -EPERM;
163 *
164 * in
165 *
166 * linux/fs/namespace.c:mntns_install().
167 */
168 const struct ns_info ns_info[LXC_NS_MAX] = {
169 [LXC_NS_USER] = { "user", CLONE_NEWUSER, "CLONE_NEWUSER", "LXC_USER_NS" },
170 [LXC_NS_MNT] = { "mnt", CLONE_NEWNS, "CLONE_NEWNS", "LXC_MNT_NS" },
171 [LXC_NS_PID] = { "pid", CLONE_NEWPID, "CLONE_NEWPID", "LXC_PID_NS" },
172 [LXC_NS_UTS] = { "uts", CLONE_NEWUTS, "CLONE_NEWUTS", "LXC_UTS_NS" },
173 [LXC_NS_IPC] = { "ipc", CLONE_NEWIPC, "CLONE_NEWIPC", "LXC_IPC_NS" },
174 [LXC_NS_NET] = { "net", CLONE_NEWNET, "CLONE_NEWNET", "LXC_NET_NS" },
175 [LXC_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, "CLONE_NEWCGROUP", "LXC_CGROUP_NS" }
176 };
177
178 int lxc_namespace_2_cloneflag(const char *namespace)
179 {
180 int i;
181
182 for (i = 0; i < LXC_NS_MAX; i++)
183 if (!strcasecmp(ns_info[i].proc_name, namespace))
184 return ns_info[i].clone_flag;
185
186 ERROR("Invalid namespace name \"%s\"", namespace);
187 return -EINVAL;
188 }
189
190 int lxc_namespace_2_ns_idx(const char *namespace)
191 {
192 int i;
193
194 for (i = 0; i < LXC_NS_MAX; i++)
195 if (!strcmp(ns_info[i].proc_name, namespace))
196 return i;
197
198 ERROR("Invalid namespace name \"%s\"", namespace);
199 return -EINVAL;
200 }
201
202 extern int lxc_namespace_2_std_identifiers(char *namespaces)
203 {
204 char **it;
205 char *del;
206
207 /* The identifiers for namespaces used with lxc-attach and lxc-unshare
208 * as given on the manpage do not align with the standard identifiers.
209 * This affects network, mount, and uts namespaces. The standard identifiers
210 * are: "mnt", "uts", and "net" whereas lxc-attach and lxc-unshare uses
211 * "MOUNT", "UTSNAME", and "NETWORK". So let's use some cheap memmove()s
212 * to replace them by their standard identifiers.
213 * Let's illustrate this with an example:
214 * Assume the string:
215 *
216 * "IPC|MOUNT|PID"
217 *
218 * then we memmove()
219 *
220 * dest: del + 1 == OUNT|PID
221 * src: del + 3 == NT|PID
222 */
223 if (!namespaces)
224 return -1;
225
226 while ((del = strstr(namespaces, "MOUNT")))
227 memmove(del + 1, del + 3, strlen(del) - 2);
228
229 for (it = (char *[]){"NETWORK", "UTSNAME", NULL}; it && *it; it++)
230 while ((del = strstr(namespaces, *it)))
231 memmove(del + 3, del + 7, strlen(del) - 6);
232
233 return 0;
234 }
235
236 int lxc_fill_namespace_flags(char *flaglist, int *flags)
237 {
238 char *token;
239 int aflag;
240
241 if (!flaglist) {
242 ERROR("At least one namespace is needed.");
243 return -1;
244 }
245
246 lxc_iterate_parts(token, flaglist, "|") {
247 aflag = lxc_namespace_2_cloneflag(token);
248 if (aflag < 0)
249 return -1;
250
251 *flags |= aflag;
252 }
253
254 return 0;
255 }