]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
1f5596dd CB |
3 | #ifndef _GNU_SOURCE |
4 | #define _GNU_SOURCE | |
5 | #endif | |
6 | ||
7 | #ifndef FUSE_USE_VERSION | |
237e200e | 8 | #define FUSE_USE_VERSION 26 |
1f5596dd CB |
9 | #endif |
10 | ||
11 | #define _FILE_OFFSET_BITS 64 | |
237e200e | 12 | |
237e200e | 13 | #include <dirent.h> |
29a73c2f | 14 | #include <errno.h> |
237e200e SH |
15 | #include <fcntl.h> |
16 | #include <fuse.h> | |
0ecddf02 | 17 | #include <inttypes.h> |
237e200e | 18 | #include <libgen.h> |
dee86006 CB |
19 | #include <linux/magic.h> |
20 | #include <linux/sched.h> | |
237e200e | 21 | #include <pthread.h> |
29a73c2f | 22 | #include <sched.h> |
db1b32f6 | 23 | #include <stdarg.h> |
29a73c2f | 24 | #include <stdbool.h> |
0ecddf02 | 25 | #include <stdint.h> |
29a73c2f CB |
26 | #include <stdio.h> |
27 | #include <stdlib.h> | |
28 | #include <string.h> | |
29a73c2f CB |
29 | #include <sys/epoll.h> |
30 | #include <sys/mman.h> | |
31 | #include <sys/mount.h> | |
237e200e SH |
32 | #include <sys/param.h> |
33 | #include <sys/socket.h> | |
29a73c2f | 34 | #include <sys/syscall.h> |
0ecddf02 | 35 | #include <sys/sysinfo.h> |
d89504c4 | 36 | #include <sys/vfs.h> |
dee86006 CB |
37 | #include <time.h> |
38 | #include <unistd.h> | |
39 | #include <wait.h> | |
237e200e | 40 | |
ce8fc84c | 41 | #include "api_extensions.h" |
237e200e | 42 | #include "bindings.h" |
580fe4df | 43 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
44 | #include "cgroups/cgroup.h" |
45 | #include "cgroups/cgroup_utils.h" | |
dee86006 | 46 | #include "config.h" |
c9236032 | 47 | #include "memory_utils.h" |
1f5596dd | 48 | #include "proc_cpuview.h" |
8364a99c | 49 | #include "syscall_numbers.h" |
1d81c6a6 | 50 | #include "utils.h" |
237e200e | 51 | |
/* Set by the constructor once a pidfd could be opened and signalled. */
static bool can_use_pidfd;

/*
 * Non-zero while liblxcfs is considered usable. Written by the constructor
 * and toggled from the SIGUSR2 handler, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;

/* Report whether liblxcfs is currently functional (reload_successful != 0). */
bool liblxcfs_functional(void)
{
	if (reload_successful == 0)
		return false;

	return true;
}
2aa59b2e | 60 | |
/*
 * pivot_root(): use the C library's symbol when the build system found one
 * (HAVE_PIVOT_ROOT), otherwise fall back to invoking the raw syscall.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
70 | ||
/*
 * Cache mapping a pid namespace to the pid of its init process.
 *
 * To find the init pid for $qpid:
 *   1. stat /proc/$qpid/ns/pid.
 *   2. Look the resulting inode number up in the store.
 *      a. Not present: fork a child into $qpid's namespace which sends us
 *         ucred.pid = 1, read the translated init pid, and cache it together
 *         with the creation time of /proc/$initpid in a new entry.
 *      b. Present: check that /proc/$initpid still matches what was saved.
 *         If it does, return the cached init pid; otherwise drop the entry
 *         and continue with a.
 */
struct pidns_init_store {
	ino_t ino;     /* inode number of /proc/$pid/ns/pid */
	pid_t initpid; /* pid of init in that namespace */
	int init_pidfd; /* pidfd for initpid, negative when none is held */
	int64_t ctime; /* time at which /proc/$initpid was created */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time of the last successful validation */
};
93 | ||
/* Bucket count for the init pid cache; HASH() maps an inode to a bucket. */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* The cache itself (chained buckets) and the mutex that protects it. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 100 | |
/* Acquire @l; a failure is reported through log_exit(). */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
109 | ||
/* Global cgroup driver handle; assigned from cgroup_init() in the constructor. */
struct cgroup_ops *cgroup_ops;
29a73c2f | 111 | |
/* Release @l; a failure is reported through log_exit(). */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
120 | ||
4e1e4115 | 121 | static inline void store_lock(void) |
237e200e | 122 | { |
4e1e4115 | 123 | mutex_lock(&pidns_store_mutex); |
237e200e SH |
124 | } |
125 | ||
4e1e4115 | 126 | static inline void store_unlock(void) |
237e200e | 127 | { |
4e1e4115 | 128 | mutex_unlock(&pidns_store_mutex); |
237e200e SH |
129 | } |
130 | ||
/*
 * Buffer size needed for "/proc/<pid>":
 *
 *   "/proc/"       STRLITERALLEN("/proc/")
 * + <pid-as-str>   INTTYPE_TO_STRLEN(uint64_t), i.e. sized for a 64-bit
 *                  integer rather than for pid_t
 * + "\0"           1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
139 | ||
bc189096 | 140 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 141 | { |
bc189096 | 142 | int ret; |
237e200e | 143 | |
bc189096 CB |
144 | if (entry->init_pidfd < 0) |
145 | return ret_errno(ENOSYS); | |
7dd6560a | 146 | |
bc189096 CB |
147 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
148 | if (ret < 0) { | |
149 | if (errno == ENOSYS) | |
150 | return ret_errno(ENOSYS); | |
7dd6560a | 151 | |
bc189096 | 152 | return 0; |
2aa59b2e CB |
153 | } |
154 | ||
bc189096 CB |
155 | return 1; |
156 | } | |
157 | ||
158 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
159 | { | |
160 | struct stat st; | |
161 | char path[LXCFS_PROC_PID_LEN]; | |
162 | ||
163 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
164 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
165 | return 0; | |
166 | ||
167 | return 1; | |
168 | } | |
169 | ||
/*
 * Decide whether a cache entry still refers to a live init process. The
 * pidfd probe is tried first; only when it reports an error (negative
 * return) is the stat()-based check consulted.
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict >= 0)
		return verdict == 1;

	return initpid_still_valid_stat(entry) == 1;
}
181 | ||
182 | /* Must be called under store_lock */ | |
2aa59b2e | 183 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 184 | { |
2aa59b2e CB |
185 | struct pidns_init_store *it; |
186 | int ino_hash; | |
237e200e | 187 | |
2aa59b2e CB |
188 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
189 | entry->initpid); | |
7dd6560a | 190 | |
2aa59b2e CB |
191 | ino_hash = HASH(entry->ino); |
192 | if (pidns_hash_table[ino_hash] == entry) { | |
193 | pidns_hash_table[ino_hash] = entry->next; | |
194 | close_prot_errno_disarm(entry->init_pidfd); | |
195 | free_disarm(entry); | |
237e200e SH |
196 | return; |
197 | } | |
198 | ||
2aa59b2e CB |
199 | it = pidns_hash_table[ino_hash]; |
200 | while (it) { | |
201 | if (it->next == entry) { | |
202 | it->next = entry->next; | |
203 | close_prot_errno_disarm(entry->init_pidfd); | |
204 | free_disarm(entry); | |
237e200e SH |
205 | return; |
206 | } | |
2aa59b2e | 207 | it = it->next; |
237e200e SH |
208 | } |
209 | } | |
210 | ||
211 | #define PURGE_SECS 5 | |
212 | /* Must be called under store_lock */ | |
213 | static void prune_initpid_store(void) | |
214 | { | |
1ba088ae CB |
215 | static int64_t last_prune = 0; |
216 | int64_t now, threshold; | |
237e200e SH |
217 | |
218 | if (!last_prune) { | |
219 | last_prune = time(NULL); | |
220 | return; | |
221 | } | |
2aa59b2e | 222 | |
237e200e | 223 | now = time(NULL); |
b18d6121 | 224 | if (now < (last_prune + PURGE_SECS)) |
237e200e | 225 | return; |
7dd6560a | 226 | |
2aa59b2e | 227 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 228 | |
237e200e SH |
229 | last_prune = now; |
230 | threshold = now - 2 * PURGE_SECS; | |
231 | ||
2aa59b2e CB |
232 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
233 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
234 | if (entry->lastcheck < threshold) { | |
235 | struct pidns_init_store *cur = entry; | |
7dd6560a | 236 | |
2aa59b2e | 237 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 238 | |
237e200e | 239 | if (prev) |
2aa59b2e | 240 | prev->next = entry->next; |
237e200e | 241 | else |
2aa59b2e CB |
242 | pidns_hash_table[i] = entry->next; |
243 | entry = entry->next; | |
244 | close_prot_errno_disarm(cur->init_pidfd); | |
245 | free_disarm(cur); | |
237e200e | 246 | } else { |
2aa59b2e CB |
247 | prev = entry; |
248 | entry = entry->next; | |
237e200e SH |
249 | } |
250 | } | |
251 | } | |
252 | } | |
253 | ||
254 | /* Must be called under store_lock */ | |
fcdedd16 | 255 | static void save_initpid(ino_t pidns_inode, pid_t pid) |
237e200e | 256 | { |
1e5d03fe | 257 | __do_free struct pidns_init_store *entry = NULL; |
05b7a16d | 258 | __do_close int pidfd = -EBADF; |
536620fd | 259 | const struct lxcfs_opts *opts = fuse_get_context()->private_data; |
2aa59b2e | 260 | char path[LXCFS_PROC_PID_LEN]; |
2aa59b2e CB |
261 | struct stat st; |
262 | int ino_hash; | |
263 | ||
9973cc06 | 264 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
265 | pidfd = pidfd_open(pid, 0); |
266 | if (pidfd < 0) | |
267 | return; | |
268 | } | |
237e200e | 269 | |
2aa59b2e CB |
270 | snprintf(path, sizeof(path), "/proc/%d", pid); |
271 | if (stat(path, &st)) | |
272 | return; | |
7dd6560a | 273 | |
5ec289bf | 274 | entry = zalloc(sizeof(*entry)); |
0eb3756b | 275 | if (!entry) |
237e200e | 276 | return; |
2aa59b2e | 277 | |
97017213 | 278 | ino_hash = HASH(pidns_inode); |
1e5d03fe | 279 | *entry = (struct pidns_init_store){ |
fcdedd16 | 280 | .ino = pidns_inode, |
1e5d03fe CB |
281 | .initpid = pid, |
282 | .ctime = st.st_ctime, | |
283 | .next = pidns_hash_table[ino_hash], | |
284 | .lastcheck = time(NULL), | |
285 | .init_pidfd = move_fd(pidfd), | |
286 | }; | |
287 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
288 | |
289 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
290 | } |
291 | ||
292 | /* | |
293 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
294 | * entry for the inode number and creation time. Verify that the init pid | |
295 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
296 | * otherwise. | |
297 | * Must be called under store_lock | |
298 | */ | |
fcdedd16 | 299 | static struct pidns_init_store *lookup_verify_initpid(ino_t pidns_inode) |
237e200e | 300 | { |
fcdedd16 | 301 | struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; |
2aa59b2e CB |
302 | |
303 | while (entry) { | |
fcdedd16 | 304 | if (entry->ino == pidns_inode) { |
2aa59b2e CB |
305 | if (initpid_still_valid(entry)) { |
306 | entry->lastcheck = time(NULL); | |
307 | return entry; | |
237e200e | 308 | } |
2aa59b2e CB |
309 | |
310 | remove_initpid(entry); | |
237e200e SH |
311 | return NULL; |
312 | } | |
2aa59b2e | 313 | entry = entry->next; |
237e200e SH |
314 | } |
315 | ||
316 | return NULL; | |
317 | } | |
318 | ||
4ec5c9da | 319 | static int send_creds_clone_wrapper(void *arg) |
237e200e | 320 | { |
f1744de4 CB |
321 | int sock = PTR_TO_INT(arg); |
322 | char v = '1'; /* we are the child */ | |
323 | struct ucred cred = { | |
324 | .uid = 0, | |
325 | .gid = 0, | |
326 | .pid = 1, | |
327 | }; | |
328 | ||
329 | return send_creds(sock, &cred, v, true) != SEND_CREDS_OK; | |
237e200e SH |
330 | } |
331 | ||
87f7558b CB |
332 | /* |
333 | * Let's use the "standard stack limit" (i.e. glibc thread size default) for | |
334 | * stack sizes: 8MB. | |
335 | */ | |
336 | #define __LXCFS_STACK_SIZE (8 * 1024 * 1024) | |
6abff455 | 337 | pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags) |
87f7558b CB |
338 | { |
339 | pid_t ret; | |
340 | void *stack; | |
341 | ||
342 | stack = malloc(__LXCFS_STACK_SIZE); | |
343 | if (!stack) | |
344 | return ret_errno(ENOMEM); | |
345 | ||
346 | #ifdef __ia64__ | |
347 | ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
348 | #else | |
349 | ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
350 | #endif | |
351 | return ret; | |
352 | } | |
353 | ||
/* Buffer size needed for "/proc/<pid>/ns/pid" including the trailing NUL. */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
357 | ||
580fe4df CB |
358 | /* |
359 | * clone a task which switches to @task's namespace and writes '1'. | |
360 | * over a unix sock so we can read the task's reaper's pid in our | |
361 | * namespace | |
362 | * | |
363 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
364 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
365 | * the pidns and the parent pid outside are identical. Using clone prevents | |
366 | * this issue. | |
367 | */ | |
368 | static void write_task_init_pid_exit(int sock, pid_t target) | |
369 | { | |
05b7a16d | 370 | __do_close int fd = -EBADF; |
87f7558b | 371 | char path[LXCFS_PROC_PID_NS_LEN]; |
580fe4df | 372 | pid_t pid; |
87f7558b CB |
373 | |
374 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
375 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
376 | if (fd < 0) | |
377 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
378 | ||
379 | if (setns(fd, 0)) | |
380 | log_exit("Failed to setns to pid namespace of process %d", target); | |
381 | ||
f1744de4 | 382 | pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0); |
580fe4df | 383 | if (pid < 0) |
87f7558b CB |
384 | _exit(EXIT_FAILURE); |
385 | ||
580fe4df CB |
386 | if (pid != 0) { |
387 | if (!wait_for_pid(pid)) | |
87f7558b CB |
388 | _exit(EXIT_FAILURE); |
389 | ||
390 | _exit(EXIT_SUCCESS); | |
237e200e | 391 | } |
237e200e SH |
392 | } |
393 | ||
580fe4df | 394 | static pid_t get_init_pid_for_task(pid_t task) |
237e200e | 395 | { |
580fe4df | 396 | char v = '0'; |
87f7558b | 397 | pid_t pid_ret = -1; |
dac3dc93 CB |
398 | struct ucred cred = { |
399 | .pid = -1, | |
400 | .uid = -1, | |
401 | .gid = -1, | |
402 | }; | |
87f7558b CB |
403 | pid_t pid; |
404 | int sock[2]; | |
237e200e | 405 | |
87f7558b | 406 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 407 | return -1; |
237e200e | 408 | |
580fe4df CB |
409 | pid = fork(); |
410 | if (pid < 0) | |
411 | goto out; | |
87f7558b CB |
412 | |
413 | if (pid == 0) { | |
580fe4df CB |
414 | close(sock[1]); |
415 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 416 | _exit(EXIT_SUCCESS); |
237e200e | 417 | } |
7213ec5c | 418 | |
580fe4df CB |
419 | if (!recv_creds(sock[1], &cred, &v)) |
420 | goto out; | |
87f7558b CB |
421 | |
422 | pid_ret = cred.pid; | |
237e200e | 423 | |
580fe4df CB |
424 | out: |
425 | close(sock[0]); | |
426 | close(sock[1]); | |
427 | if (pid > 0) | |
428 | wait_for_pid(pid); | |
237e200e | 429 | |
87f7558b CB |
430 | return pid_ret; |
431 | } | |
2aa59b2e CB |
432 | |
433 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 434 | { |
580fe4df | 435 | pid_t answer = 0; |
2aa59b2e CB |
436 | char path[LXCFS_PROC_PID_NS_LEN]; |
437 | struct stat st; | |
438 | struct pidns_init_store *entry; | |
439 | ||
440 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
2aa59b2e | 441 | if (stat(path, &st)) |
4e1e4115 | 442 | return ret_errno(ESRCH); |
2aa59b2e | 443 | |
4e1e4115 | 444 | store_lock(); |
fcdedd16 WB |
445 | |
446 | entry = lookup_verify_initpid(st.st_ino); | |
2aa59b2e CB |
447 | if (entry) { |
448 | answer = entry->initpid; | |
580fe4df CB |
449 | goto out; |
450 | } | |
2aa59b2e | 451 | |
fcdedd16 | 452 | /* release the mutex as the following call is expensive */ |
4e1e4115 CB |
453 | store_unlock(); |
454 | ||
2aa59b2e | 455 | answer = get_init_pid_for_task(pid); |
4e1e4115 CB |
456 | |
457 | store_lock(); | |
fcdedd16 | 458 | |
580fe4df | 459 | if (answer > 0) |
fcdedd16 | 460 | save_initpid(st.st_ino, answer); |
b7672ded | 461 | |
580fe4df | 462 | out: |
2aa59b2e CB |
463 | /* |
464 | * Prune at the end in case we're returning the value we were about to | |
465 | * return. | |
466 | */ | |
580fe4df | 467 | prune_initpid_store(); |
4e1e4115 | 468 | store_unlock(); |
2aa59b2e | 469 | |
580fe4df | 470 | return answer; |
237e200e SH |
471 | } |
472 | ||
29a73c2f CB |
473 | /* |
474 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
475 | */ |
476 | ||
29a73c2f CB |
477 | static bool umount_if_mounted(void) |
478 | { | |
479 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 480 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
481 | return false; |
482 | } | |
483 | return true; | |
484 | } | |
485 | ||
/* Type of statfs.f_type; __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* True when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
492 | ||
0a4dea41 CB |
493 | /* |
494 | * looking at fs/proc_namespace.c, it appears we can | |
495 | * actually expect the rootfs entry to very specifically contain | |
496 | * " - rootfs rootfs " | |
497 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
498 | * the rootfs entry should always be skipped in mountinfo contents. | |
499 | */ | |
500 | static bool is_on_ramfs(void) | |
501 | { | |
87f7558b | 502 | __do_free char *line = NULL; |
757a63e7 | 503 | __do_free void *fopen_cache = NULL; |
87f7558b | 504 | __do_fclose FILE *f = NULL; |
0a4dea41 | 505 | size_t len = 0; |
0a4dea41 | 506 | |
757a63e7 | 507 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
508 | if (!f) |
509 | return false; | |
510 | ||
511 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
512 | int i; |
513 | char *p, *p2; | |
514 | ||
0a4dea41 CB |
515 | for (p = line, i = 0; p && i < 4; i++) |
516 | p = strchr(p + 1, ' '); | |
517 | if (!p) | |
518 | continue; | |
87f7558b | 519 | |
0a4dea41 CB |
520 | p2 = strchr(p + 1, ' '); |
521 | if (!p2) | |
522 | continue; | |
523 | *p2 = '\0'; | |
524 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 525 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 526 | p = strchr(p2 + 1, '-'); |
87f7558b | 527 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 528 | return true; |
0a4dea41 CB |
529 | } |
530 | } | |
87f7558b | 531 | |
0a4dea41 CB |
532 | return false; |
533 | } | |
534 | ||
cc309f33 | 535 | static int pivot_enter() |
0a4dea41 | 536 | { |
05b7a16d | 537 | __do_close int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 | 538 | |
3326c17e | 539 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
540 | if (oldroot < 0) |
541 | return log_error_errno(-1, errno, | |
542 | "Failed to open old root for fchdir"); | |
cc309f33 | 543 | |
3326c17e | 544 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
545 | if (newroot < 0) |
546 | return log_error_errno(-1, errno, | |
547 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
548 | |
549 | /* change into new root fs */ | |
87f7558b CB |
550 | if (fchdir(newroot) < 0) |
551 | return log_error_errno(-1, | |
552 | errno, "Failed to change directory to new rootfs: %s", | |
553 | ROOTDIR); | |
cc309f33 | 554 | |
0a4dea41 | 555 | /* pivot_root into our new root fs */ |
87f7558b CB |
556 | if (pivot_root(".", ".") < 0) |
557 | return log_error_errno(-1, errno, | |
558 | "pivot_root() syscall failed: %s", | |
559 | strerror(errno)); | |
0a4dea41 CB |
560 | |
561 | /* | |
562 | * At this point the old-root is mounted on top of our new-root. | |
563 | * To unmounted it we must not be chdir'd into it, so escape back | |
564 | * to the old-root. | |
565 | */ | |
87f7558b CB |
566 | if (fchdir(oldroot) < 0) |
567 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 568 | |
87f7558b CB |
569 | if (umount2(".", MNT_DETACH) < 0) |
570 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 571 | |
87f7558b CB |
572 | if (fchdir(newroot) < 0) |
573 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 574 | |
87f7558b | 575 | return 0; |
0a4dea41 CB |
576 | } |
577 | ||
578 | static int chroot_enter() | |
579 | { | |
580 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
581 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
582 | return -1; | |
583 | } | |
584 | ||
585 | if (chroot(".") < 0) { | |
586 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
587 | return -1; | |
588 | } | |
589 | ||
590 | if (chdir("/") < 0) { | |
591 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
592 | return -1; | |
593 | } | |
594 | ||
595 | return 0; | |
596 | } | |
597 | ||
0232cbac | 598 | static int permute_and_enter(void) |
29a73c2f | 599 | { |
0a4dea41 CB |
600 | struct statfs sb; |
601 | ||
602 | if (statfs("/", &sb) < 0) { | |
603 | lxcfs_error("%s\n", "Could not stat / mountpoint."); | |
cc309f33 | 604 | return -1; |
0a4dea41 CB |
605 | } |
606 | ||
607 | /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will | |
608 | * likely report TMPFS_MAGIC. Hence, when it reports no we still check | |
609 | * /proc/1/mountinfo. */ | |
610 | if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) | |
611 | return chroot_enter(); | |
29a73c2f | 612 | |
cc309f33 | 613 | if (pivot_enter() < 0) { |
0a4dea41 | 614 | lxcfs_error("%s\n", "Could not perform pivot root."); |
cc309f33 | 615 | return -1; |
29a73c2f CB |
616 | } |
617 | ||
cc309f33 | 618 | return 0; |
29a73c2f CB |
619 | } |
620 | ||
621 | /* Prepare our new clean root. */ | |
0232cbac | 622 | static int permute_prepare(void) |
29a73c2f CB |
623 | { |
624 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 625 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
626 | return -1; |
627 | } | |
628 | ||
629 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 630 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
631 | return -1; |
632 | } | |
633 | ||
634 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 635 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
636 | return -1; |
637 | } | |
638 | ||
639 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 640 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
641 | return -1; |
642 | } | |
643 | ||
644 | return 0; | |
645 | } | |
646 | ||
/*
 * Build the new root and enter it. Calls chroot() on ramfs, pivot_root() in
 * all other cases. Returns true only if both steps succeed; entering is not
 * attempted when preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
660 | ||
0a4dea41 | 661 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
662 | { |
663 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 664 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
665 | return false; |
666 | } | |
480262c9 | 667 | |
29a73c2f | 668 | if (!umount_if_mounted()) { |
b8defc3d | 669 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
670 | return false; |
671 | } | |
672 | ||
673 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 674 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
675 | return false; |
676 | } | |
677 | ||
1d81c6a6 | 678 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 679 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
680 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
681 | return false; | |
682 | } | |
683 | ||
480262c9 | 684 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 685 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
686 | return false; |
687 | } | |
480262c9 | 688 | |
29a73c2f | 689 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 690 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
691 | return false; |
692 | } | |
480262c9 | 693 | |
29a73c2f CB |
694 | return true; |
695 | } | |
696 | ||
0a4dea41 | 697 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 698 | { |
5fbea8a6 CB |
699 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
700 | return false; | |
51c7ca35 | 701 | |
5fbea8a6 CB |
702 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
703 | return false; | |
29a73c2f | 704 | |
5fbea8a6 CB |
705 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
706 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
707 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
708 | if ((*h)->fd < 0) | |
29a73c2f | 709 | return false; |
29a73c2f | 710 | } |
5fbea8a6 | 711 | |
29a73c2f CB |
712 | return true; |
713 | } | |
714 | ||
/*
 * Full private cgroup setup: prepare the mount namespace, mount the
 * hierarchies, then switch into the new root. Stops at the first failing
 * step and returns false; only a hierarchy mount failure is logged here.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
728 | ||
dee86006 | 729 | static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) |
b9b6bdc9 CB |
730 | { |
731 | int ret; | |
732 | ||
733 | if (reload_successful) { | |
734 | reload_successful = 0; | |
735 | ||
736 | /* write() is async signal safe */ | |
737 | ret = write(STDERR_FILENO, | |
738 | "Switched into non-virtualization mode\n", | |
739 | STRLITERALLEN("Switched into non-virtualization mode\n")); | |
740 | if (ret < 0) | |
741 | goto please_compiler; | |
742 | } else { | |
743 | reload_successful = 1; | |
744 | ||
745 | /* write() is async signal safe */ | |
746 | ret = write(STDERR_FILENO, "Switched into virtualization mode\n", | |
747 | STRLITERALLEN("Switched into virtualization mode\n")); | |
748 | if (ret < 0) | |
749 | goto please_compiler; | |
750 | } | |
751 | ||
752 | please_compiler: | |
753 | /* | |
754 | * The write() syscall is a function whose return value needs to be | |
755 | * checked. Otherwise the compiler will warn. This is how we | |
756 | * please our master. Another one could be to use | |
757 | * syscall(__NR_write, ...) directly but whatever. | |
758 | */ | |
759 | return; | |
760 | } | |
761 | ||
2243c5a9 | 762 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 763 | { |
05b7a16d | 764 | __do_close int init_ns = -EBADF, root_fd = -EBADF, |
de69569b | 765 | pidfd = -EBADF; |
4ec5c9da | 766 | int i = 0; |
2aa59b2e | 767 | pid_t pid; |
237e200e | 768 | |
c2357135 | 769 | lxcfs_info("Running constructor %s to reload liblxcfs", __func__); |
cc42d0c7 | 770 | |
5fbea8a6 | 771 | cgroup_ops = cgroup_init(); |
c2357135 CB |
772 | if (!cgroup_ops) { |
773 | lxcfs_info("Failed to initialize cgroup support"); | |
774 | goto broken_upgrade; | |
775 | } | |
237e200e | 776 | |
480262c9 | 777 | /* Preserve initial namespace. */ |
2aa59b2e CB |
778 | pid = getpid(); |
779 | init_ns = preserve_ns(pid, "mnt"); | |
c2357135 CB |
780 | if (init_ns < 0) { |
781 | lxcfs_info("Failed to preserve initial mount namespace"); | |
782 | goto broken_upgrade; | |
783 | } | |
480262c9 | 784 | |
480262c9 CB |
785 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
786 | * to privately mount lxcfs cgroups. */ | |
c2357135 | 787 | if (!cgfs_setup_controllers()) { |
2243c5a9 | 788 | log_exit("Failed to setup private cgroup mounts for lxcfs"); |
c2357135 CB |
789 | goto broken_upgrade; |
790 | } | |
480262c9 | 791 | |
c2357135 | 792 | if (setns(init_ns, 0) < 0) { |
2243c5a9 | 793 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); |
c2357135 CB |
794 | goto broken_upgrade; |
795 | } | |
29a73c2f | 796 | |
c2357135 | 797 | if (!init_cpuview()) { |
2243c5a9 | 798 | log_exit("Failed to init CPU view"); |
c2357135 CB |
799 | goto broken_upgrade; |
800 | } | |
056adcef | 801 | |
cc42d0c7 CB |
802 | lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); |
803 | lxcfs_info("hierarchies:"); | |
4ec5c9da CB |
804 | |
805 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
cc42d0c7 CB |
806 | char **controller_list = (*h)->controllers; |
807 | __do_free char *controllers = NULL; | |
808 | if (controller_list && *controller_list) | |
809 | controllers = lxc_string_join(",", (const char **)controller_list, false); | |
810 | lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); | |
4ec5c9da | 811 | } |
2aa59b2e CB |
812 | |
813 | pidfd = pidfd_open(pid, 0); | |
814 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
815 | can_use_pidfd = true; | |
cc42d0c7 | 816 | lxcfs_info("Kernel supports pidfds"); |
2aa59b2e | 817 | } |
ce8fc84c | 818 | |
cc42d0c7 | 819 | lxcfs_info("api_extensions:"); |
ce8fc84c | 820 | for (i = 0; i < nr_api_extensions; i++) |
cc42d0c7 | 821 | lxcfs_info("- %s", api_extensions[i]); |
de69569b CB |
822 | |
823 | root_fd = open("/", O_PATH | O_CLOEXEC); | |
c2357135 CB |
824 | if (root_fd < 0) |
825 | lxcfs_info("%s - Failed to open root directory", strerror(errno)); | |
826 | else if (fchdir(root_fd) < 0) | |
827 | lxcfs_info("%s - Failed to change to root directory", strerror(errno)); | |
828 | ||
dee86006 CB |
829 | if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) { |
830 | lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno)); | |
b9b6bdc9 | 831 | goto broken_upgrade; |
dee86006 | 832 | } |
b9b6bdc9 CB |
833 | |
834 | reload_successful = 1; | |
c2357135 | 835 | return; |
de69569b | 836 | |
c2357135 | 837 | broken_upgrade: |
b9b6bdc9 | 838 | reload_successful = 0; |
c2357135 | 839 | lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__); |
237e200e SH |
840 | } |
841 | ||
2243c5a9 | 842 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 843 | { |
cc42d0c7 CB |
844 | lxcfs_info("Running destructor %s", __func__); |
845 | ||
056adcef | 846 | free_cpuview(); |
2243c5a9 | 847 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 848 | } |