]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
1f5596dd CB |
3 | #ifndef _GNU_SOURCE |
4 | #define _GNU_SOURCE | |
5 | #endif | |
6 | ||
7 | #ifndef FUSE_USE_VERSION | |
237e200e | 8 | #define FUSE_USE_VERSION 26 |
1f5596dd CB |
9 | #endif |
10 | ||
11 | #define _FILE_OFFSET_BITS 64 | |
237e200e | 12 | |
237e200e | 13 | #include <dirent.h> |
29a73c2f | 14 | #include <errno.h> |
237e200e SH |
15 | #include <fcntl.h> |
16 | #include <fuse.h> | |
0ecddf02 | 17 | #include <inttypes.h> |
237e200e | 18 | #include <libgen.h> |
237e200e | 19 | #include <pthread.h> |
29a73c2f | 20 | #include <sched.h> |
db1b32f6 | 21 | #include <stdarg.h> |
29a73c2f | 22 | #include <stdbool.h> |
0ecddf02 | 23 | #include <stdint.h> |
29a73c2f CB |
24 | #include <stdio.h> |
25 | #include <stdlib.h> | |
26 | #include <string.h> | |
27 | #include <time.h> | |
28 | #include <unistd.h> | |
29 | #include <wait.h> | |
d89504c4 | 30 | #include <linux/magic.h> |
237e200e | 31 | #include <linux/sched.h> |
29a73c2f CB |
32 | #include <sys/epoll.h> |
33 | #include <sys/mman.h> | |
34 | #include <sys/mount.h> | |
237e200e | 35 | #include <sys/param.h> |
87f7558b | 36 | #include <signal.h> |
237e200e | 37 | #include <sys/socket.h> |
29a73c2f | 38 | #include <sys/syscall.h> |
0ecddf02 | 39 | #include <sys/sysinfo.h> |
d89504c4 | 40 | #include <sys/vfs.h> |
237e200e | 41 | |
ce8fc84c | 42 | #include "api_extensions.h" |
237e200e | 43 | #include "bindings.h" |
1d81c6a6 | 44 | #include "config.h" |
580fe4df | 45 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
46 | #include "cgroups/cgroup.h" |
47 | #include "cgroups/cgroup_utils.h" | |
c9236032 | 48 | #include "memory_utils.h" |
1f5596dd | 49 | #include "proc_cpuview.h" |
1d81c6a6 | 50 | #include "utils.h" |
237e200e | 51 | |
/* Set by lxcfs_init() once pidfd_open() and pidfd_send_signal() both worked. */
static bool can_use_pidfd;

/* Outcome of the most recent run of the library constructor. */
static bool reload_successful;

/*
 * Tell the caller whether the library constructor finished successfully.
 *
 * Returns the flag the constructor stored in reload_successful.
 */
bool liblxcfs_functional(void)
{
	return reload_successful;
}
2aa59b2e | 59 | |
29a73c2f CB |
/*
 * pivot_root() is not provided by every C library. When the build did not
 * detect it (HAVE_PIVOT_ROOT undefined) supply a thin wrapper around the raw
 * system call; if even the syscall number is unknown, fail with ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
74 | ||
237e200e SH |
/*
 * Cache that remembers which pid is init for a given pid namespace.
 *
 * To find the init pid for $qpid:
 *   1. stat /proc/$qpid/ns/pid.
 *   2. Look the resulting inode number up in the store.
 *      a. Not found: fork a child into qpid's namespace that sends us
 *         ucred.pid = 1; the pid we read back is the init pid. Cache it,
 *         together with the creation time of /proc/initpid, in a new entry.
 *      b. Found: verify that /proc/initpid still matches what was saved.
 *         If it does, return the cached initpid; otherwise drop the entry
 *         and continue with a.
 */
struct pidns_init_store {
	/* inode number of /proc/$pid/ns/pid; this is the lookup key */
	ino_t ino;
	/* pid of init in that namespace */
	pid_t initpid;
	/* pidfd referring to initpid, or a negative value when none is held */
	int init_pidfd;
	/* time at which /proc/$initpid was created */
	long int ctime;
	/* next entry in the same hash bucket */
	struct pidns_init_store *next;
	/* time(NULL) when the entry was created or last successfully verified */
	long int lastcheck;
};
97 | ||
/*
 * Bucket count of the init pid cache and the function mapping an inode
 * number to its bucket. The original note here simply points at how the
 * kernel allocates these as the reason for the chosen size.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads of the cache; every chain is linked through ->next. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Serialises all access to pidns_hash_table. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 104 | |
237e200e SH |
/*
 * Lock @l. A non-zero result from pthread_mutex_lock() is reported through
 * log_exit() together with its strerror() text.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
113 | ||
/* Global cgroup driver handle; the constructor assigns it from cgroup_init(). */
struct cgroup_ops *cgroup_ops;
29a73c2f | 115 | |
237e200e SH |
/*
 * Unlock @l. A non-zero result from pthread_mutex_unlock() is reported
 * through log_exit() together with its strerror() text.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
124 | ||
125 | static void store_lock(void) | |
126 | { | |
127 | lock_mutex(&pidns_store_mutex); | |
128 | } | |
129 | ||
130 | static void store_unlock(void) | |
131 | { | |
132 | unlock_mutex(&pidns_store_mutex); | |
133 | } | |
134 | ||
2aa59b2e CB |
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"      STRLITERALLEN("/proc/")  (6 characters)
 *   <pid-as-str>  INTTYPE_TO_STRLEN(uint64_t)
 *   "\0"          1
 *
 * The pid part is sized for a uint64_t rather than a pid_t.
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
143 | ||
bc189096 | 144 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 145 | { |
bc189096 | 146 | int ret; |
237e200e | 147 | |
bc189096 CB |
148 | if (entry->init_pidfd < 0) |
149 | return ret_errno(ENOSYS); | |
7dd6560a | 150 | |
bc189096 CB |
151 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
152 | if (ret < 0) { | |
153 | if (errno == ENOSYS) | |
154 | return ret_errno(ENOSYS); | |
7dd6560a | 155 | |
bc189096 | 156 | return 0; |
2aa59b2e CB |
157 | } |
158 | ||
bc189096 CB |
159 | return 1; |
160 | } | |
161 | ||
162 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
163 | { | |
164 | struct stat st; | |
165 | char path[LXCFS_PROC_PID_LEN]; | |
166 | ||
167 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
168 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
169 | return 0; | |
170 | ||
171 | return 1; | |
172 | } | |
173 | ||
/*
 * Decide whether a cache entry still describes a live init process.
 * The pidfd probe is tried first; only when it returns a negative value
 * is the stat-based comparison consulted.
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict < 0)
		verdict = initpid_still_valid_stat(entry);

	return verdict == 1;
}
185 | ||
186 | /* Must be called under store_lock */ | |
2aa59b2e | 187 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 188 | { |
2aa59b2e CB |
189 | struct pidns_init_store *it; |
190 | int ino_hash; | |
237e200e | 191 | |
2aa59b2e CB |
192 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
193 | entry->initpid); | |
7dd6560a | 194 | |
2aa59b2e CB |
195 | ino_hash = HASH(entry->ino); |
196 | if (pidns_hash_table[ino_hash] == entry) { | |
197 | pidns_hash_table[ino_hash] = entry->next; | |
198 | close_prot_errno_disarm(entry->init_pidfd); | |
199 | free_disarm(entry); | |
237e200e SH |
200 | return; |
201 | } | |
202 | ||
2aa59b2e CB |
203 | it = pidns_hash_table[ino_hash]; |
204 | while (it) { | |
205 | if (it->next == entry) { | |
206 | it->next = entry->next; | |
207 | close_prot_errno_disarm(entry->init_pidfd); | |
208 | free_disarm(entry); | |
237e200e SH |
209 | return; |
210 | } | |
2aa59b2e | 211 | it = it->next; |
237e200e SH |
212 | } |
213 | } | |
214 | ||
215 | #define PURGE_SECS 5 | |
216 | /* Must be called under store_lock */ | |
217 | static void prune_initpid_store(void) | |
218 | { | |
219 | static long int last_prune = 0; | |
237e200e | 220 | long int now, threshold; |
237e200e SH |
221 | |
222 | if (!last_prune) { | |
223 | last_prune = time(NULL); | |
224 | return; | |
225 | } | |
2aa59b2e | 226 | |
237e200e SH |
227 | now = time(NULL); |
228 | if (now < last_prune + PURGE_SECS) | |
229 | return; | |
7dd6560a | 230 | |
2aa59b2e | 231 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 232 | |
237e200e SH |
233 | last_prune = now; |
234 | threshold = now - 2 * PURGE_SECS; | |
235 | ||
2aa59b2e CB |
236 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
237 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
238 | if (entry->lastcheck < threshold) { | |
239 | struct pidns_init_store *cur = entry; | |
7dd6560a | 240 | |
2aa59b2e | 241 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 242 | |
237e200e | 243 | if (prev) |
2aa59b2e | 244 | prev->next = entry->next; |
237e200e | 245 | else |
2aa59b2e CB |
246 | pidns_hash_table[i] = entry->next; |
247 | entry = entry->next; | |
248 | close_prot_errno_disarm(cur->init_pidfd); | |
249 | free_disarm(cur); | |
237e200e | 250 | } else { |
2aa59b2e CB |
251 | prev = entry; |
252 | entry = entry->next; | |
237e200e SH |
253 | } |
254 | } | |
255 | } | |
256 | } | |
257 | ||
258 | /* Must be called under store_lock */ | |
259 | static void save_initpid(struct stat *sb, pid_t pid) | |
260 | { | |
1e5d03fe | 261 | __do_free struct pidns_init_store *entry = NULL; |
2aa59b2e CB |
262 | __do_close_prot_errno int pidfd = -EBADF; |
263 | char path[LXCFS_PROC_PID_LEN]; | |
264 | struct lxcfs_opts *opts = fuse_get_context()->private_data; | |
265 | struct stat st; | |
266 | int ino_hash; | |
267 | ||
9973cc06 | 268 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
269 | pidfd = pidfd_open(pid, 0); |
270 | if (pidfd < 0) | |
271 | return; | |
272 | } | |
237e200e | 273 | |
2aa59b2e CB |
274 | snprintf(path, sizeof(path), "/proc/%d", pid); |
275 | if (stat(path, &st)) | |
276 | return; | |
7dd6560a | 277 | |
1e5d03fe CB |
278 | entry = malloc(sizeof(*entry)); |
279 | if (entry) | |
237e200e | 280 | return; |
2aa59b2e | 281 | |
1e5d03fe CB |
282 | ino_hash = HASH(entry->ino); |
283 | *entry = (struct pidns_init_store){ | |
284 | .ino = sb->st_ino, | |
285 | .initpid = pid, | |
286 | .ctime = st.st_ctime, | |
287 | .next = pidns_hash_table[ino_hash], | |
288 | .lastcheck = time(NULL), | |
289 | .init_pidfd = move_fd(pidfd), | |
290 | }; | |
291 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
292 | |
293 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
294 | } |
295 | ||
296 | /* | |
297 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
298 | * entry for the inode number and creation time. Verify that the init pid | |
299 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
300 | * otherwise. | |
301 | * Must be called under store_lock | |
302 | */ | |
303 | static struct pidns_init_store *lookup_verify_initpid(struct stat *sb) | |
304 | { | |
2aa59b2e CB |
305 | struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)]; |
306 | ||
307 | while (entry) { | |
308 | if (entry->ino == sb->st_ino) { | |
309 | if (initpid_still_valid(entry)) { | |
310 | entry->lastcheck = time(NULL); | |
311 | return entry; | |
237e200e | 312 | } |
2aa59b2e CB |
313 | |
314 | remove_initpid(entry); | |
237e200e SH |
315 | return NULL; |
316 | } | |
2aa59b2e | 317 | entry = entry->next; |
237e200e SH |
318 | } |
319 | ||
320 | return NULL; | |
321 | } | |
322 | ||
4ec5c9da | 323 | static int send_creds_clone_wrapper(void *arg) |
237e200e | 324 | { |
f1744de4 CB |
325 | int sock = PTR_TO_INT(arg); |
326 | char v = '1'; /* we are the child */ | |
327 | struct ucred cred = { | |
328 | .uid = 0, | |
329 | .gid = 0, | |
330 | .pid = 1, | |
331 | }; | |
332 | ||
333 | return send_creds(sock, &cred, v, true) != SEND_CREDS_OK; | |
237e200e SH |
334 | } |
335 | ||
87f7558b CB |
336 | /* |
337 | * Let's use the "standard stack limit" (i.e. glibc thread size default) for | |
338 | * stack sizes: 8MB. | |
339 | */ | |
340 | #define __LXCFS_STACK_SIZE (8 * 1024 * 1024) | |
341 | static pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags) | |
342 | { | |
343 | pid_t ret; | |
344 | void *stack; | |
345 | ||
346 | stack = malloc(__LXCFS_STACK_SIZE); | |
347 | if (!stack) | |
348 | return ret_errno(ENOMEM); | |
349 | ||
350 | #ifdef __ia64__ | |
351 | ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
352 | #else | |
353 | ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
354 | #endif | |
355 | return ret; | |
356 | } | |
357 | ||
/*
 * Buffer size needed for a "/proc/<pid>/ns/pid" path: both literal parts,
 * a pid rendered with the width of a uint64_t, and the terminating \0.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
361 | ||
580fe4df CB |
362 | /* |
363 | * clone a task which switches to @task's namespace and writes '1'. | |
364 | * over a unix sock so we can read the task's reaper's pid in our | |
365 | * namespace | |
366 | * | |
367 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
368 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
369 | * the pidns and the parent pid outside are identical. Using clone prevents | |
370 | * this issue. | |
371 | */ | |
372 | static void write_task_init_pid_exit(int sock, pid_t target) | |
373 | { | |
87f7558b CB |
374 | __do_close_prot_errno int fd = -EBADF; |
375 | char path[LXCFS_PROC_PID_NS_LEN]; | |
580fe4df | 376 | pid_t pid; |
87f7558b CB |
377 | |
378 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
379 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
380 | if (fd < 0) | |
381 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
382 | ||
383 | if (setns(fd, 0)) | |
384 | log_exit("Failed to setns to pid namespace of process %d", target); | |
385 | ||
f1744de4 | 386 | pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0); |
580fe4df | 387 | if (pid < 0) |
87f7558b CB |
388 | _exit(EXIT_FAILURE); |
389 | ||
580fe4df CB |
390 | if (pid != 0) { |
391 | if (!wait_for_pid(pid)) | |
87f7558b CB |
392 | _exit(EXIT_FAILURE); |
393 | ||
394 | _exit(EXIT_SUCCESS); | |
237e200e | 395 | } |
237e200e SH |
396 | } |
397 | ||
580fe4df | 398 | static pid_t get_init_pid_for_task(pid_t task) |
237e200e | 399 | { |
580fe4df | 400 | char v = '0'; |
87f7558b CB |
401 | pid_t pid_ret = -1; |
402 | pid_t pid; | |
403 | int sock[2]; | |
580fe4df | 404 | struct ucred cred; |
237e200e | 405 | |
87f7558b | 406 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 407 | return -1; |
237e200e | 408 | |
580fe4df CB |
409 | pid = fork(); |
410 | if (pid < 0) | |
411 | goto out; | |
87f7558b CB |
412 | |
413 | if (pid == 0) { | |
580fe4df CB |
414 | close(sock[1]); |
415 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 416 | _exit(EXIT_SUCCESS); |
237e200e | 417 | } |
7213ec5c | 418 | |
580fe4df CB |
419 | if (!recv_creds(sock[1], &cred, &v)) |
420 | goto out; | |
87f7558b CB |
421 | |
422 | pid_ret = cred.pid; | |
237e200e | 423 | |
580fe4df CB |
424 | out: |
425 | close(sock[0]); | |
426 | close(sock[1]); | |
427 | if (pid > 0) | |
428 | wait_for_pid(pid); | |
237e200e | 429 | |
87f7558b CB |
430 | return pid_ret; |
431 | } | |
2aa59b2e CB |
432 | |
433 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 434 | { |
580fe4df | 435 | pid_t answer = 0; |
2aa59b2e CB |
436 | char path[LXCFS_PROC_PID_NS_LEN]; |
437 | struct stat st; | |
438 | struct pidns_init_store *entry; | |
439 | ||
440 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
b7672ded | 441 | |
580fe4df | 442 | store_lock(); |
2aa59b2e | 443 | if (stat(path, &st)) |
580fe4df | 444 | goto out; |
2aa59b2e CB |
445 | |
446 | entry = lookup_verify_initpid(&st); | |
447 | if (entry) { | |
448 | answer = entry->initpid; | |
580fe4df CB |
449 | goto out; |
450 | } | |
2aa59b2e CB |
451 | |
452 | answer = get_init_pid_for_task(pid); | |
580fe4df | 453 | if (answer > 0) |
2aa59b2e | 454 | save_initpid(&st, answer); |
b7672ded | 455 | |
580fe4df | 456 | out: |
2aa59b2e CB |
457 | /* |
458 | * Prune at the end in case we're returning the value we were about to | |
459 | * return. | |
460 | */ | |
580fe4df | 461 | prune_initpid_store(); |
2aa59b2e | 462 | |
580fe4df | 463 | store_unlock(); |
2aa59b2e | 464 | |
580fe4df | 465 | return answer; |
237e200e SH |
466 | } |
467 | ||
29a73c2f CB |
468 | /* |
469 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
470 | */ |
471 | ||
29a73c2f CB |
472 | static bool umount_if_mounted(void) |
473 | { | |
474 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 475 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
476 | return false; |
477 | } | |
478 | return true; | |
479 | } | |
480 | ||
2283e240 CB |
/*
 * The type of statfs.f_type is taken straight from the struct so that it
 * always matches the platform; __typeof__ should be safe to use with all
 * compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic wanted = (fs_type_magic)magic_val;

	return fs->f_type == wanted;
}
487 | ||
0a4dea41 CB |
488 | /* |
489 | * looking at fs/proc_namespace.c, it appears we can | |
490 | * actually expect the rootfs entry to very specifically contain | |
491 | * " - rootfs rootfs " | |
492 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
493 | * the rootfs entry should always be skipped in mountinfo contents. | |
494 | */ | |
495 | static bool is_on_ramfs(void) | |
496 | { | |
87f7558b | 497 | __do_free char *line = NULL; |
757a63e7 | 498 | __do_free void *fopen_cache = NULL; |
87f7558b | 499 | __do_fclose FILE *f = NULL; |
0a4dea41 | 500 | size_t len = 0; |
0a4dea41 | 501 | |
757a63e7 | 502 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
503 | if (!f) |
504 | return false; | |
505 | ||
506 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
507 | int i; |
508 | char *p, *p2; | |
509 | ||
0a4dea41 CB |
510 | for (p = line, i = 0; p && i < 4; i++) |
511 | p = strchr(p + 1, ' '); | |
512 | if (!p) | |
513 | continue; | |
87f7558b | 514 | |
0a4dea41 CB |
515 | p2 = strchr(p + 1, ' '); |
516 | if (!p2) | |
517 | continue; | |
518 | *p2 = '\0'; | |
519 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 520 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 521 | p = strchr(p2 + 1, '-'); |
87f7558b | 522 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 523 | return true; |
0a4dea41 CB |
524 | } |
525 | } | |
87f7558b | 526 | |
0a4dea41 CB |
527 | return false; |
528 | } | |
529 | ||
cc309f33 | 530 | static int pivot_enter() |
0a4dea41 | 531 | { |
87f7558b | 532 | __do_close_prot_errno int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 | 533 | |
3326c17e | 534 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
535 | if (oldroot < 0) |
536 | return log_error_errno(-1, errno, | |
537 | "Failed to open old root for fchdir"); | |
cc309f33 | 538 | |
3326c17e | 539 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
540 | if (newroot < 0) |
541 | return log_error_errno(-1, errno, | |
542 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
543 | |
544 | /* change into new root fs */ | |
87f7558b CB |
545 | if (fchdir(newroot) < 0) |
546 | return log_error_errno(-1, | |
547 | errno, "Failed to change directory to new rootfs: %s", | |
548 | ROOTDIR); | |
cc309f33 | 549 | |
0a4dea41 | 550 | /* pivot_root into our new root fs */ |
87f7558b CB |
551 | if (pivot_root(".", ".") < 0) |
552 | return log_error_errno(-1, errno, | |
553 | "pivot_root() syscall failed: %s", | |
554 | strerror(errno)); | |
0a4dea41 CB |
555 | |
556 | /* | |
557 | * At this point the old-root is mounted on top of our new-root. | |
558 | * To unmounted it we must not be chdir'd into it, so escape back | |
559 | * to the old-root. | |
560 | */ | |
87f7558b CB |
561 | if (fchdir(oldroot) < 0) |
562 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 563 | |
87f7558b CB |
564 | if (umount2(".", MNT_DETACH) < 0) |
565 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 566 | |
87f7558b CB |
567 | if (fchdir(newroot) < 0) |
568 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 569 | |
87f7558b | 570 | return 0; |
0a4dea41 CB |
571 | } |
572 | ||
573 | static int chroot_enter() | |
574 | { | |
575 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
576 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
577 | return -1; | |
578 | } | |
579 | ||
580 | if (chroot(".") < 0) { | |
581 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
582 | return -1; | |
583 | } | |
584 | ||
585 | if (chdir("/") < 0) { | |
586 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
587 | return -1; | |
588 | } | |
589 | ||
590 | return 0; | |
591 | } | |
592 | ||
/*
 * Enter the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() in every other case.
 *
 * Returns -1 when statfs("/") or pivot_enter() fails, 0 after a successful
 * pivot, and otherwise the result of chroot_enter().
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() alone is not reliable: when the ramfs is a tmpfs it
	 * will likely report TMPFS_MAGIC. So even when it says no, the
	 * mountinfo of this process is inspected as well.
	 */
	if (has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
615 | ||
616 | /* Prepare our new clean root. */ | |
0232cbac | 617 | static int permute_prepare(void) |
29a73c2f CB |
618 | { |
619 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 620 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
621 | return -1; |
622 | } | |
623 | ||
624 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 625 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
626 | return -1; |
627 | } | |
628 | ||
629 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 630 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
631 | return -1; |
632 | } | |
633 | ||
634 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 635 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
636 | return -1; |
637 | } | |
638 | ||
639 | return 0; | |
640 | } | |
641 | ||
0232cbac CB |
/*
 * Build the new root and switch into it. The switch uses chroot() on
 * ramfs and pivot_root() in all other cases.
 *
 * Returns true only if both the preparation and the switch succeeded; the
 * switch is not attempted when the preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
655 | ||
0a4dea41 | 656 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
657 | { |
658 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 659 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
660 | return false; |
661 | } | |
480262c9 | 662 | |
29a73c2f | 663 | if (!umount_if_mounted()) { |
b8defc3d | 664 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
665 | return false; |
666 | } | |
667 | ||
668 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 669 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
670 | return false; |
671 | } | |
672 | ||
1d81c6a6 | 673 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 674 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
675 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
676 | return false; | |
677 | } | |
678 | ||
480262c9 | 679 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 680 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
681 | return false; |
682 | } | |
480262c9 | 683 | |
29a73c2f | 684 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 685 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
686 | return false; |
687 | } | |
480262c9 | 688 | |
29a73c2f CB |
689 | return true; |
690 | } | |
691 | ||
0a4dea41 | 692 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 693 | { |
5fbea8a6 CB |
694 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
695 | return false; | |
51c7ca35 | 696 | |
5fbea8a6 CB |
697 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
698 | return false; | |
29a73c2f | 699 | |
5fbea8a6 CB |
700 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
701 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
702 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
703 | if ((*h)->fd < 0) | |
29a73c2f | 704 | return false; |
29a73c2f | 705 | } |
5fbea8a6 | 706 | |
29a73c2f CB |
707 | return true; |
708 | } | |
709 | ||
/*
 * Run the three stages of the private cgroup setup in order: prepare the
 * mounts, mount the hierarchies, then switch the root. The first failing
 * stage stops the sequence and makes the function return false.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
723 | ||
2243c5a9 | 724 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 725 | { |
de69569b CB |
726 | __do_close_prot_errno int init_ns = -EBADF, root_fd = -EBADF, |
727 | pidfd = -EBADF; | |
4ec5c9da | 728 | int i = 0; |
2aa59b2e | 729 | pid_t pid; |
237e200e | 730 | |
c2357135 | 731 | lxcfs_info("Running constructor %s to reload liblxcfs", __func__); |
cc42d0c7 | 732 | |
5fbea8a6 | 733 | cgroup_ops = cgroup_init(); |
c2357135 CB |
734 | if (!cgroup_ops) { |
735 | lxcfs_info("Failed to initialize cgroup support"); | |
736 | goto broken_upgrade; | |
737 | } | |
237e200e | 738 | |
480262c9 | 739 | /* Preserve initial namespace. */ |
2aa59b2e CB |
740 | pid = getpid(); |
741 | init_ns = preserve_ns(pid, "mnt"); | |
c2357135 CB |
742 | if (init_ns < 0) { |
743 | lxcfs_info("Failed to preserve initial mount namespace"); | |
744 | goto broken_upgrade; | |
745 | } | |
480262c9 | 746 | |
480262c9 CB |
747 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
748 | * to privately mount lxcfs cgroups. */ | |
c2357135 | 749 | if (!cgfs_setup_controllers()) { |
2243c5a9 | 750 | log_exit("Failed to setup private cgroup mounts for lxcfs"); |
c2357135 CB |
751 | goto broken_upgrade; |
752 | } | |
480262c9 | 753 | |
c2357135 | 754 | if (setns(init_ns, 0) < 0) { |
2243c5a9 | 755 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); |
c2357135 CB |
756 | goto broken_upgrade; |
757 | } | |
29a73c2f | 758 | |
c2357135 | 759 | if (!init_cpuview()) { |
2243c5a9 | 760 | log_exit("Failed to init CPU view"); |
c2357135 CB |
761 | goto broken_upgrade; |
762 | } | |
056adcef | 763 | |
cc42d0c7 CB |
764 | lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); |
765 | lxcfs_info("hierarchies:"); | |
4ec5c9da CB |
766 | |
767 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
cc42d0c7 CB |
768 | char **controller_list = (*h)->controllers; |
769 | __do_free char *controllers = NULL; | |
770 | if (controller_list && *controller_list) | |
771 | controllers = lxc_string_join(",", (const char **)controller_list, false); | |
772 | lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); | |
4ec5c9da | 773 | } |
2aa59b2e CB |
774 | |
775 | pidfd = pidfd_open(pid, 0); | |
776 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
777 | can_use_pidfd = true; | |
cc42d0c7 | 778 | lxcfs_info("Kernel supports pidfds"); |
2aa59b2e | 779 | } |
ce8fc84c | 780 | |
cc42d0c7 | 781 | lxcfs_info("api_extensions:"); |
ce8fc84c | 782 | for (i = 0; i < nr_api_extensions; i++) |
cc42d0c7 | 783 | lxcfs_info("- %s", api_extensions[i]); |
de69569b CB |
784 | |
785 | root_fd = open("/", O_PATH | O_CLOEXEC); | |
c2357135 CB |
786 | if (root_fd < 0) |
787 | lxcfs_info("%s - Failed to open root directory", strerror(errno)); | |
788 | else if (fchdir(root_fd) < 0) | |
789 | lxcfs_info("%s - Failed to change to root directory", strerror(errno)); | |
790 | ||
cbfc55fd | 791 | reload_successful = true; |
c2357135 | 792 | return; |
de69569b | 793 | |
c2357135 | 794 | broken_upgrade: |
cbfc55fd | 795 | reload_successful = false; |
c2357135 | 796 | lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__); |
237e200e SH |
797 | } |
798 | ||
2243c5a9 | 799 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 800 | { |
cc42d0c7 CB |
801 | lxcfs_info("Running destructor %s", __func__); |
802 | ||
056adcef | 803 | free_cpuview(); |
2243c5a9 | 804 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 805 | } |