]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
1f5596dd CB |
3 | #ifndef _GNU_SOURCE |
4 | #define _GNU_SOURCE | |
5 | #endif | |
6 | ||
7 | #ifndef FUSE_USE_VERSION | |
237e200e | 8 | #define FUSE_USE_VERSION 26 |
1f5596dd CB |
9 | #endif |
10 | ||
11 | #define _FILE_OFFSET_BITS 64 | |
237e200e | 12 | |
237e200e | 13 | #include <dirent.h> |
29a73c2f | 14 | #include <errno.h> |
237e200e SH |
15 | #include <fcntl.h> |
16 | #include <fuse.h> | |
0ecddf02 | 17 | #include <inttypes.h> |
237e200e | 18 | #include <libgen.h> |
237e200e | 19 | #include <pthread.h> |
29a73c2f | 20 | #include <sched.h> |
db1b32f6 | 21 | #include <stdarg.h> |
29a73c2f | 22 | #include <stdbool.h> |
0ecddf02 | 23 | #include <stdint.h> |
29a73c2f CB |
24 | #include <stdio.h> |
25 | #include <stdlib.h> | |
26 | #include <string.h> | |
27 | #include <time.h> | |
28 | #include <unistd.h> | |
29 | #include <wait.h> | |
d89504c4 | 30 | #include <linux/magic.h> |
237e200e | 31 | #include <linux/sched.h> |
29a73c2f CB |
32 | #include <sys/epoll.h> |
33 | #include <sys/mman.h> | |
34 | #include <sys/mount.h> | |
237e200e | 35 | #include <sys/param.h> |
87f7558b | 36 | #include <signal.h> |
237e200e | 37 | #include <sys/socket.h> |
29a73c2f | 38 | #include <sys/syscall.h> |
0ecddf02 | 39 | #include <sys/sysinfo.h> |
d89504c4 | 40 | #include <sys/vfs.h> |
237e200e | 41 | |
ce8fc84c | 42 | #include "api_extensions.h" |
237e200e | 43 | #include "bindings.h" |
1d81c6a6 | 44 | #include "config.h" |
580fe4df | 45 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
46 | #include "cgroups/cgroup.h" |
47 | #include "cgroups/cgroup_utils.h" | |
c9236032 | 48 | #include "memory_utils.h" |
1f5596dd | 49 | #include "proc_cpuview.h" |
1d81c6a6 | 50 | #include "utils.h" |
237e200e | 51 | |
2aa59b2e CB |
52 | static bool can_use_pidfd; |
53 | ||
29a73c2f CB |
/*
 * pivot_root() shim.
 *
 * When the build did not define HAVE_PIVOT_ROOT we provide our own wrapper
 * that issues the raw system call. If the system call number is unknown as
 * well, the wrapper fails with ENOSYS. Otherwise the C library's
 * implementation is declared and used.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
68 | ||
237e200e SH |
/*
 * Cache mapping a pid namespace to the pid of its init process.
 *
 * To find the init pid for $qpid:
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Look the resulting inode number up in the store.
 *    a. Not found: fork a child into $qpid's pid namespace which sends us
 *       ucred.pid = 1; the pid we receive is the init pid. Cache it
 *       together with the creation time of /proc/<initpid> in a new store
 *       entry.
 *    b. Found: verify that /proc/<initpid> still matches what was saved.
 *       If it does, return the cached init pid; otherwise drop the entry
 *       and continue with a.
 */
struct pidns_init_store {
	/* inode number of /proc/$pid/ns/pid */
	ino_t ino;
	/* pid of the init process in that namespace */
	pid_t initpid;
	/* pidfd referring to initpid, or a negative value when none is held */
	int init_pidfd;
	/* time at which /proc/$initpid was created */
	long int ctime;
	/* next entry in the same hash bucket */
	struct pidns_init_store *next;
	/* time of the last successful validation; used when pruning */
	long int lastcheck;
};
91 | ||
/*
 * Fixed-size, chained hash table keyed by the pid namespace inode number.
 * (lol - look at how they are allocated in the kernel)
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads; every access must happen with pidns_store_mutex held. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 98 | |
237e200e SH |
/*
 * Acquire @l. pthread_mutex_lock() reports failure through its return
 * value, which is handed to log_exit() together with its textual form.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
107 | ||
77f4399a | 108 | struct cgroup_ops *cgroup_ops; |
29a73c2f | 109 | |
237e200e SH |
/*
 * Release @l. pthread_mutex_unlock() reports failure through its return
 * value, which is handed to log_exit() together with its textual form.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
118 | ||
119 | static void store_lock(void) | |
120 | { | |
121 | lock_mutex(&pidns_store_mutex); | |
122 | } | |
123 | ||
124 | static void store_unlock(void) | |
125 | { | |
126 | unlock_mutex(&pidns_store_mutex); | |
127 | } | |
128 | ||
2aa59b2e CB |
/*
 * Buffer size needed to hold "/proc/<pid>":
 *
 *   "/proc/"      = STRLITERALLEN("/proc/")
 * + <pid-as-str>  = INTTYPE_TO_STRLEN(uint64_t)
 * + '\0'          = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
137 | ||
bc189096 | 138 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 139 | { |
bc189096 | 140 | int ret; |
237e200e | 141 | |
bc189096 CB |
142 | if (entry->init_pidfd < 0) |
143 | return ret_errno(ENOSYS); | |
7dd6560a | 144 | |
bc189096 CB |
145 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
146 | if (ret < 0) { | |
147 | if (errno == ENOSYS) | |
148 | return ret_errno(ENOSYS); | |
7dd6560a | 149 | |
bc189096 | 150 | return 0; |
2aa59b2e CB |
151 | } |
152 | ||
bc189096 CB |
153 | return 1; |
154 | } | |
155 | ||
156 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
157 | { | |
158 | struct stat st; | |
159 | char path[LXCFS_PROC_PID_LEN]; | |
160 | ||
161 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
162 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
163 | return 0; | |
164 | ||
165 | return 1; | |
166 | } | |
167 | ||
/*
 * Decide whether the init pid cached in @entry is still valid.
 *
 * The pidfd based check is tried first; only when it returns a negative
 * value is the stat(2) based check consulted instead.
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int valid = initpid_still_valid_pidfd(entry);

	if (valid >= 0)
		return valid == 1;

	return initpid_still_valid_stat(entry) == 1;
}
179 | ||
180 | /* Must be called under store_lock */ | |
2aa59b2e | 181 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 182 | { |
2aa59b2e CB |
183 | struct pidns_init_store *it; |
184 | int ino_hash; | |
237e200e | 185 | |
2aa59b2e CB |
186 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
187 | entry->initpid); | |
7dd6560a | 188 | |
2aa59b2e CB |
189 | ino_hash = HASH(entry->ino); |
190 | if (pidns_hash_table[ino_hash] == entry) { | |
191 | pidns_hash_table[ino_hash] = entry->next; | |
192 | close_prot_errno_disarm(entry->init_pidfd); | |
193 | free_disarm(entry); | |
237e200e SH |
194 | return; |
195 | } | |
196 | ||
2aa59b2e CB |
197 | it = pidns_hash_table[ino_hash]; |
198 | while (it) { | |
199 | if (it->next == entry) { | |
200 | it->next = entry->next; | |
201 | close_prot_errno_disarm(entry->init_pidfd); | |
202 | free_disarm(entry); | |
237e200e SH |
203 | return; |
204 | } | |
2aa59b2e | 205 | it = it->next; |
237e200e SH |
206 | } |
207 | } | |
208 | ||
209 | #define PURGE_SECS 5 | |
210 | /* Must be called under store_lock */ | |
211 | static void prune_initpid_store(void) | |
212 | { | |
213 | static long int last_prune = 0; | |
237e200e | 214 | long int now, threshold; |
237e200e SH |
215 | |
216 | if (!last_prune) { | |
217 | last_prune = time(NULL); | |
218 | return; | |
219 | } | |
2aa59b2e | 220 | |
237e200e SH |
221 | now = time(NULL); |
222 | if (now < last_prune + PURGE_SECS) | |
223 | return; | |
7dd6560a | 224 | |
2aa59b2e | 225 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 226 | |
237e200e SH |
227 | last_prune = now; |
228 | threshold = now - 2 * PURGE_SECS; | |
229 | ||
2aa59b2e CB |
230 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
231 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
232 | if (entry->lastcheck < threshold) { | |
233 | struct pidns_init_store *cur = entry; | |
7dd6560a | 234 | |
2aa59b2e | 235 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 236 | |
237e200e | 237 | if (prev) |
2aa59b2e | 238 | prev->next = entry->next; |
237e200e | 239 | else |
2aa59b2e CB |
240 | pidns_hash_table[i] = entry->next; |
241 | entry = entry->next; | |
242 | close_prot_errno_disarm(cur->init_pidfd); | |
243 | free_disarm(cur); | |
237e200e | 244 | } else { |
2aa59b2e CB |
245 | prev = entry; |
246 | entry = entry->next; | |
237e200e SH |
247 | } |
248 | } | |
249 | } | |
250 | } | |
251 | ||
252 | /* Must be called under store_lock */ | |
253 | static void save_initpid(struct stat *sb, pid_t pid) | |
254 | { | |
1e5d03fe | 255 | __do_free struct pidns_init_store *entry = NULL; |
2aa59b2e CB |
256 | __do_close_prot_errno int pidfd = -EBADF; |
257 | char path[LXCFS_PROC_PID_LEN]; | |
258 | struct lxcfs_opts *opts = fuse_get_context()->private_data; | |
259 | struct stat st; | |
260 | int ino_hash; | |
261 | ||
9973cc06 | 262 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
263 | pidfd = pidfd_open(pid, 0); |
264 | if (pidfd < 0) | |
265 | return; | |
266 | } | |
237e200e | 267 | |
2aa59b2e CB |
268 | snprintf(path, sizeof(path), "/proc/%d", pid); |
269 | if (stat(path, &st)) | |
270 | return; | |
7dd6560a | 271 | |
1e5d03fe CB |
272 | entry = malloc(sizeof(*entry)); |
273 | if (entry) | |
237e200e | 274 | return; |
2aa59b2e | 275 | |
1e5d03fe CB |
276 | ino_hash = HASH(entry->ino); |
277 | *entry = (struct pidns_init_store){ | |
278 | .ino = sb->st_ino, | |
279 | .initpid = pid, | |
280 | .ctime = st.st_ctime, | |
281 | .next = pidns_hash_table[ino_hash], | |
282 | .lastcheck = time(NULL), | |
283 | .init_pidfd = move_fd(pidfd), | |
284 | }; | |
285 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
286 | |
287 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
288 | } |
289 | ||
290 | /* | |
291 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
292 | * entry for the inode number and creation time. Verify that the init pid | |
293 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
294 | * otherwise. | |
295 | * Must be called under store_lock | |
296 | */ | |
297 | static struct pidns_init_store *lookup_verify_initpid(struct stat *sb) | |
298 | { | |
2aa59b2e CB |
299 | struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)]; |
300 | ||
301 | while (entry) { | |
302 | if (entry->ino == sb->st_ino) { | |
303 | if (initpid_still_valid(entry)) { | |
304 | entry->lastcheck = time(NULL); | |
305 | return entry; | |
237e200e | 306 | } |
2aa59b2e CB |
307 | |
308 | remove_initpid(entry); | |
237e200e SH |
309 | return NULL; |
310 | } | |
2aa59b2e | 311 | entry = entry->next; |
237e200e SH |
312 | } |
313 | ||
314 | return NULL; | |
315 | } | |
316 | ||
4ec5c9da | 317 | static int send_creds_clone_wrapper(void *arg) |
237e200e | 318 | { |
f1744de4 CB |
319 | int sock = PTR_TO_INT(arg); |
320 | char v = '1'; /* we are the child */ | |
321 | struct ucred cred = { | |
322 | .uid = 0, | |
323 | .gid = 0, | |
324 | .pid = 1, | |
325 | }; | |
326 | ||
327 | return send_creds(sock, &cred, v, true) != SEND_CREDS_OK; | |
237e200e SH |
328 | } |
329 | ||
87f7558b CB |
330 | /* |
331 | * Let's use the "standard stack limit" (i.e. glibc thread size default) for | |
332 | * stack sizes: 8MB. | |
333 | */ | |
334 | #define __LXCFS_STACK_SIZE (8 * 1024 * 1024) | |
335 | static pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags) | |
336 | { | |
337 | pid_t ret; | |
338 | void *stack; | |
339 | ||
340 | stack = malloc(__LXCFS_STACK_SIZE); | |
341 | if (!stack) | |
342 | return ret_errno(ENOMEM); | |
343 | ||
344 | #ifdef __ia64__ | |
345 | ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
346 | #else | |
347 | ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
348 | #endif | |
349 | return ret; | |
350 | } | |
351 | ||
/* Buffer size needed to hold "/proc/<pid>/ns/pid" including the trailing '\0'. */
#define LXCFS_PROC_PID_NS_LEN                   \
	(STRLITERALLEN("/proc/") +              \
	 INTTYPE_TO_STRLEN(uint64_t) +          \
	 STRLITERALLEN("/ns/pid") +             \
	 1)
355 | ||
580fe4df CB |
356 | /* |
357 | * clone a task which switches to @task's namespace and writes '1'. | |
358 | * over a unix sock so we can read the task's reaper's pid in our | |
359 | * namespace | |
360 | * | |
361 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
362 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
363 | * the pidns and the parent pid outside are identical. Using clone prevents | |
364 | * this issue. | |
365 | */ | |
366 | static void write_task_init_pid_exit(int sock, pid_t target) | |
367 | { | |
87f7558b CB |
368 | __do_close_prot_errno int fd = -EBADF; |
369 | char path[LXCFS_PROC_PID_NS_LEN]; | |
580fe4df | 370 | pid_t pid; |
87f7558b CB |
371 | |
372 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
373 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
374 | if (fd < 0) | |
375 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
376 | ||
377 | if (setns(fd, 0)) | |
378 | log_exit("Failed to setns to pid namespace of process %d", target); | |
379 | ||
f1744de4 | 380 | pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0); |
580fe4df | 381 | if (pid < 0) |
87f7558b CB |
382 | _exit(EXIT_FAILURE); |
383 | ||
580fe4df CB |
384 | if (pid != 0) { |
385 | if (!wait_for_pid(pid)) | |
87f7558b CB |
386 | _exit(EXIT_FAILURE); |
387 | ||
388 | _exit(EXIT_SUCCESS); | |
237e200e | 389 | } |
237e200e SH |
390 | } |
391 | ||
580fe4df | 392 | static pid_t get_init_pid_for_task(pid_t task) |
237e200e | 393 | { |
580fe4df | 394 | char v = '0'; |
87f7558b CB |
395 | pid_t pid_ret = -1; |
396 | pid_t pid; | |
397 | int sock[2]; | |
580fe4df | 398 | struct ucred cred; |
237e200e | 399 | |
87f7558b | 400 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 401 | return -1; |
237e200e | 402 | |
580fe4df CB |
403 | pid = fork(); |
404 | if (pid < 0) | |
405 | goto out; | |
87f7558b CB |
406 | |
407 | if (pid == 0) { | |
580fe4df CB |
408 | close(sock[1]); |
409 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 410 | _exit(EXIT_SUCCESS); |
237e200e | 411 | } |
7213ec5c | 412 | |
580fe4df CB |
413 | if (!recv_creds(sock[1], &cred, &v)) |
414 | goto out; | |
87f7558b CB |
415 | |
416 | pid_ret = cred.pid; | |
237e200e | 417 | |
580fe4df CB |
418 | out: |
419 | close(sock[0]); | |
420 | close(sock[1]); | |
421 | if (pid > 0) | |
422 | wait_for_pid(pid); | |
237e200e | 423 | |
87f7558b CB |
424 | return pid_ret; |
425 | } | |
2aa59b2e CB |
426 | |
427 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 428 | { |
580fe4df | 429 | pid_t answer = 0; |
2aa59b2e CB |
430 | char path[LXCFS_PROC_PID_NS_LEN]; |
431 | struct stat st; | |
432 | struct pidns_init_store *entry; | |
433 | ||
434 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
b7672ded | 435 | |
580fe4df | 436 | store_lock(); |
2aa59b2e | 437 | if (stat(path, &st)) |
580fe4df | 438 | goto out; |
2aa59b2e CB |
439 | |
440 | entry = lookup_verify_initpid(&st); | |
441 | if (entry) { | |
442 | answer = entry->initpid; | |
580fe4df CB |
443 | goto out; |
444 | } | |
2aa59b2e CB |
445 | |
446 | answer = get_init_pid_for_task(pid); | |
580fe4df | 447 | if (answer > 0) |
2aa59b2e | 448 | save_initpid(&st, answer); |
b7672ded | 449 | |
580fe4df | 450 | out: |
2aa59b2e CB |
451 | /* |
452 | * Prune at the end in case we're returning the value we were about to | |
453 | * return. | |
454 | */ | |
580fe4df | 455 | prune_initpid_store(); |
2aa59b2e | 456 | |
580fe4df | 457 | store_unlock(); |
2aa59b2e | 458 | |
580fe4df | 459 | return answer; |
237e200e SH |
460 | } |
461 | ||
29a73c2f CB |
462 | /* |
463 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
464 | */ |
465 | ||
29a73c2f CB |
466 | static bool umount_if_mounted(void) |
467 | { | |
468 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 469 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
470 | return false; |
471 | } | |
472 | return true; | |
473 | } | |
474 | ||
2283e240 CB |
/*
 * The type of statfs.f_type differs between platforms, so derive it from
 * the struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true if the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
481 | ||
0a4dea41 CB |
482 | /* |
483 | * looking at fs/proc_namespace.c, it appears we can | |
484 | * actually expect the rootfs entry to very specifically contain | |
485 | * " - rootfs rootfs " | |
486 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
487 | * the rootfs entry should always be skipped in mountinfo contents. | |
488 | */ | |
489 | static bool is_on_ramfs(void) | |
490 | { | |
87f7558b | 491 | __do_free char *line = NULL; |
757a63e7 | 492 | __do_free void *fopen_cache = NULL; |
87f7558b | 493 | __do_fclose FILE *f = NULL; |
0a4dea41 | 494 | size_t len = 0; |
0a4dea41 | 495 | |
757a63e7 | 496 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
497 | if (!f) |
498 | return false; | |
499 | ||
500 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
501 | int i; |
502 | char *p, *p2; | |
503 | ||
0a4dea41 CB |
504 | for (p = line, i = 0; p && i < 4; i++) |
505 | p = strchr(p + 1, ' '); | |
506 | if (!p) | |
507 | continue; | |
87f7558b | 508 | |
0a4dea41 CB |
509 | p2 = strchr(p + 1, ' '); |
510 | if (!p2) | |
511 | continue; | |
512 | *p2 = '\0'; | |
513 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 514 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 515 | p = strchr(p2 + 1, '-'); |
87f7558b | 516 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 517 | return true; |
0a4dea41 CB |
518 | } |
519 | } | |
87f7558b | 520 | |
0a4dea41 CB |
521 | return false; |
522 | } | |
523 | ||
cc309f33 | 524 | static int pivot_enter() |
0a4dea41 | 525 | { |
87f7558b | 526 | __do_close_prot_errno int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 CB |
527 | |
528 | oldroot = open("/", O_DIRECTORY | O_RDONLY); | |
87f7558b CB |
529 | if (oldroot < 0) |
530 | return log_error_errno(-1, errno, | |
531 | "Failed to open old root for fchdir"); | |
cc309f33 CB |
532 | |
533 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY); | |
87f7558b CB |
534 | if (newroot < 0) |
535 | return log_error_errno(-1, errno, | |
536 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
537 | |
538 | /* change into new root fs */ | |
87f7558b CB |
539 | if (fchdir(newroot) < 0) |
540 | return log_error_errno(-1, | |
541 | errno, "Failed to change directory to new rootfs: %s", | |
542 | ROOTDIR); | |
cc309f33 | 543 | |
0a4dea41 | 544 | /* pivot_root into our new root fs */ |
87f7558b CB |
545 | if (pivot_root(".", ".") < 0) |
546 | return log_error_errno(-1, errno, | |
547 | "pivot_root() syscall failed: %s", | |
548 | strerror(errno)); | |
0a4dea41 CB |
549 | |
550 | /* | |
551 | * At this point the old-root is mounted on top of our new-root. | |
552 | * To unmounted it we must not be chdir'd into it, so escape back | |
553 | * to the old-root. | |
554 | */ | |
87f7558b CB |
555 | if (fchdir(oldroot) < 0) |
556 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 557 | |
87f7558b CB |
558 | if (umount2(".", MNT_DETACH) < 0) |
559 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 560 | |
87f7558b CB |
561 | if (fchdir(newroot) < 0) |
562 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 563 | |
87f7558b | 564 | return 0; |
0a4dea41 CB |
565 | } |
566 | ||
567 | static int chroot_enter() | |
568 | { | |
569 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
570 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
571 | return -1; | |
572 | } | |
573 | ||
574 | if (chroot(".") < 0) { | |
575 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
576 | return -1; | |
577 | } | |
578 | ||
579 | if (chdir("/") < 0) { | |
580 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
581 | return -1; | |
582 | } | |
583 | ||
584 | return 0; | |
585 | } | |
586 | ||
/*
 * Enter the prepared root: via chroot_enter() when "/" looks like a ramfs,
 * via pivot_enter() otherwise.
 *
 * Returns -1 if "/" cannot be statfs'ed or pivoting fails, otherwise 0 or
 * the result of chroot_enter().
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;
	bool on_ramfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * the mountinfo file through is_on_ramfs().
	 */
	on_ramfs = has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
609 | ||
610 | /* Prepare our new clean root. */ | |
0232cbac | 611 | static int permute_prepare(void) |
29a73c2f CB |
612 | { |
613 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 614 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
615 | return -1; |
616 | } | |
617 | ||
618 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 619 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
620 | return -1; |
621 | } | |
622 | ||
623 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 624 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
625 | return -1; |
626 | } | |
627 | ||
628 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 629 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
630 | return -1; |
631 | } | |
632 | ||
633 | return 0; | |
634 | } | |
635 | ||
0232cbac CB |
/*
 * Prepare the new root and enter it. Entering calls chroot() on ramfs and
 * pivot_root() in all other cases.
 *
 * Returns true only if both steps succeeded; entering is not attempted
 * when preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
649 | ||
0a4dea41 | 650 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
651 | { |
652 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 653 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
654 | return false; |
655 | } | |
480262c9 | 656 | |
29a73c2f | 657 | if (!umount_if_mounted()) { |
b8defc3d | 658 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
659 | return false; |
660 | } | |
661 | ||
662 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 663 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
664 | return false; |
665 | } | |
666 | ||
1d81c6a6 | 667 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 668 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
669 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
670 | return false; | |
671 | } | |
672 | ||
480262c9 | 673 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 674 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
675 | return false; |
676 | } | |
480262c9 | 677 | |
29a73c2f | 678 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 679 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
680 | return false; |
681 | } | |
480262c9 | 682 | |
29a73c2f CB |
683 | return true; |
684 | } | |
685 | ||
0a4dea41 | 686 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 687 | { |
5fbea8a6 CB |
688 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
689 | return false; | |
51c7ca35 | 690 | |
5fbea8a6 CB |
691 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
692 | return false; | |
29a73c2f | 693 | |
5fbea8a6 CB |
694 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
695 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
696 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
697 | if ((*h)->fd < 0) | |
29a73c2f | 698 | return false; |
29a73c2f | 699 | } |
5fbea8a6 | 700 | |
29a73c2f CB |
701 | return true; |
702 | } | |
703 | ||
/*
 * Set up lxcfs' private cgroup mounts: prepare the mount namespace, mount
 * the hierarchies and finally switch root.
 *
 * Returns true only if all three steps succeed. A failure to mount the
 * hierarchies is additionally logged here together with errno.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root() ? true : false;
}
717 | ||
2243c5a9 | 718 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 719 | { |
2aa59b2e | 720 | __do_close_prot_errno int init_ns = -EBADF, pidfd = -EBADF; |
4ec5c9da | 721 | int i = 0; |
2aa59b2e | 722 | pid_t pid; |
5fbea8a6 | 723 | char *cret; |
e58dab00 | 724 | char cwd[MAXPATHLEN]; |
237e200e | 725 | |
cc42d0c7 CB |
726 | lxcfs_info("Running constructor %s", __func__); |
727 | ||
5fbea8a6 CB |
728 | cgroup_ops = cgroup_init(); |
729 | if (!cgroup_ops) | |
2243c5a9 | 730 | log_exit("Failed to initialize cgroup support"); |
237e200e | 731 | |
480262c9 | 732 | /* Preserve initial namespace. */ |
2aa59b2e CB |
733 | pid = getpid(); |
734 | init_ns = preserve_ns(pid, "mnt"); | |
2243c5a9 CB |
735 | if (init_ns < 0) |
736 | log_exit("Failed to preserve initial mount namespace"); | |
480262c9 | 737 | |
e58dab00 | 738 | cret = getcwd(cwd, MAXPATHLEN); |
4ec5c9da | 739 | if (!cret) |
2243c5a9 | 740 | log_exit("%s - Could not retrieve current working directory", strerror(errno)); |
e58dab00 | 741 | |
480262c9 CB |
742 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
743 | * to privately mount lxcfs cgroups. */ | |
2243c5a9 CB |
744 | if (!cgfs_setup_controllers()) |
745 | log_exit("Failed to setup private cgroup mounts for lxcfs"); | |
480262c9 | 746 | |
2243c5a9 CB |
747 | if (setns(init_ns, 0) < 0) |
748 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); | |
29a73c2f | 749 | |
e58dab00 | 750 | if (!cret || chdir(cwd) < 0) |
2243c5a9 | 751 | log_exit("%s - Could not change back to original working directory", strerror(errno)); |
e58dab00 | 752 | |
2243c5a9 CB |
753 | if (!init_cpuview()) |
754 | log_exit("Failed to init CPU view"); | |
056adcef | 755 | |
cc42d0c7 CB |
756 | lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); |
757 | lxcfs_info("hierarchies:"); | |
4ec5c9da CB |
758 | |
759 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
cc42d0c7 CB |
760 | char **controller_list = (*h)->controllers; |
761 | __do_free char *controllers = NULL; | |
762 | if (controller_list && *controller_list) | |
763 | controllers = lxc_string_join(",", (const char **)controller_list, false); | |
764 | lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); | |
4ec5c9da | 765 | } |
2aa59b2e CB |
766 | |
767 | pidfd = pidfd_open(pid, 0); | |
768 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
769 | can_use_pidfd = true; | |
cc42d0c7 | 770 | lxcfs_info("Kernel supports pidfds"); |
2aa59b2e | 771 | } |
ce8fc84c | 772 | |
cc42d0c7 | 773 | lxcfs_info("api_extensions:"); |
ce8fc84c | 774 | for (i = 0; i < nr_api_extensions; i++) |
cc42d0c7 | 775 | lxcfs_info("- %s", api_extensions[i]); |
237e200e SH |
776 | } |
777 | ||
2243c5a9 | 778 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 779 | { |
cc42d0c7 CB |
780 | lxcfs_info("Running destructor %s", __func__); |
781 | ||
056adcef | 782 | free_cpuview(); |
2243c5a9 | 783 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 784 | } |