]>
Commit | Line | Data |
---|---|---|
1 | /* SPDX-License-Identifier: LGPL-2.1+ */ | |
2 | ||
3 | #include "config.h" | |
4 | ||
5 | #include <dirent.h> | |
6 | #include <errno.h> | |
7 | #include <fcntl.h> | |
8 | #include <inttypes.h> | |
9 | #include <libgen.h> | |
10 | #include <linux/magic.h> | |
11 | #include <linux/sched.h> | |
12 | #include <pthread.h> | |
13 | #include <sched.h> | |
14 | #include <stdarg.h> | |
15 | #include <stdbool.h> | |
16 | #include <stdint.h> | |
17 | #include <stdio.h> | |
18 | #include <stdlib.h> | |
19 | #include <string.h> | |
20 | #include <sys/epoll.h> | |
21 | #include <sys/mman.h> | |
22 | #include <sys/mount.h> | |
23 | #include <sys/param.h> | |
24 | #include <sys/socket.h> | |
25 | #include <sys/syscall.h> | |
26 | #include <sys/sysinfo.h> | |
27 | #include <sys/vfs.h> | |
28 | #include <time.h> | |
29 | #include <unistd.h> | |
30 | #include <wait.h> | |
31 | ||
32 | #include "bindings.h" | |
33 | ||
34 | #include "api_extensions.h" | |
35 | #include "cgroup_fuse.h" | |
36 | #include "cgroups/cgroup.h" | |
37 | #include "cgroups/cgroup_utils.h" | |
38 | #include "memory_utils.h" | |
39 | #include "proc_cpuview.h" | |
40 | #include "syscall_numbers.h" | |
41 | #include "utils.h" | |
42 | ||
/*
 * Library-wide feature flags. All of them start out false (static storage)
 * and are only exposed to the rest of lxcfs through the liblxcfs_*()
 * accessors below.
 */
static bool can_use_pidfd;      /* set by lxcfs_init() when pidfd_open()/pidfd_send_signal() work */
static bool can_use_swap;       /* result of cgroup_ops->can_use_swap(), see lxcfs_init() */
static bool can_use_sys_cpu;    /* set by lxcfs_fuse_init() */
static bool has_versioned_opts; /* set by lxcfs_fuse_init() */
static bool memory_is_cgroupv2; /* memory hierarchy is a unified one, see lxcfs_init() */

/*
 * Non-zero once the constructor ran to completion. Also flipped from the
 * SIGUSR2 handler, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;
50 | ||
51 | bool liblxcfs_functional(void) | |
52 | { | |
53 | return reload_successful != 0; | |
54 | } | |
55 | ||
56 | bool liblxcfs_can_use_swap(void) | |
57 | { | |
58 | return can_use_swap; | |
59 | } | |
60 | ||
61 | bool liblxcfs_can_use_sys_cpu(void) | |
62 | { | |
63 | return can_use_sys_cpu; | |
64 | } | |
65 | ||
66 | bool liblxcfs_has_versioned_opts(void) | |
67 | { | |
68 | return has_versioned_opts; | |
69 | } | |
70 | ||
71 | bool liblxcfs_memory_is_cgroupv2(void) | |
72 | { | |
73 | return memory_is_cgroupv2; | |
74 | } | |
75 | ||
/*
 * pivot_root(): use the C library's symbol when the build system found one
 * (HAVE_PIVOT_ROOT), otherwise fall back to issuing the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
85 | ||
/*
 * Cache of "which pid is init for this pid namespace".
 *
 * To find the init pid for some $qpid:
 *   1. stat /proc/$qpid/ns/pid to obtain the namespace inode number;
 *   2. look that inode number up in the store:
 *      - miss: fork a helper into $qpid's pid namespace which sends us
 *        ucred.pid = 1; the pid we receive is the namespace's init as seen
 *        from here. Store it together with the ctime of /proc/$initpid.
 *      - hit: check that the cached init is still the same process. If it
 *        is, return the cached pid; if not, drop the entry and treat it as
 *        a miss.
 */
struct pidns_init_store {
	ino_t ino;                     /* inode number of /proc/$pid/ns/pid */
	pid_t initpid;                 /* pid of init in that namespace */
	int init_pidfd;                /* pidfd for initpid, negative when none is held */
	int64_t ctime;                 /* st_ctime of /proc/$initpid when cached */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck;             /* time(NULL) of the last successful lookup */
};
108 | ||
/*
 * The store is a fixed-size, chained hash table keyed by the pid namespace
 * inode number; HASH() folds a key into a bucket index.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads; every access is expected to happen under pidns_store_mutex. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
115 | ||
/*
 * Acquire @l. A non-zero return from pthread_mutex_lock() is reported
 * through log_exit() together with its strerror() text.
 */
static void mutex_lock(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
124 | ||
/* Global cgroup driver handle: assigned in lxcfs_init(), released in lxcfs_exit(). */
struct cgroup_ops *cgroup_ops;
126 | ||
/*
 * Release @l. A non-zero return from pthread_mutex_unlock() is reported
 * through log_exit() together with its strerror() text.
 */
static void mutex_unlock(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
135 | ||
136 | static inline void store_lock(void) | |
137 | { | |
138 | mutex_lock(&pidns_store_mutex); | |
139 | } | |
140 | ||
141 | static inline void store_unlock(void) | |
142 | { | |
143 | mutex_unlock(&pidns_store_mutex); | |
144 | } | |
145 | ||
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"       STRLITERALLEN("/proc/")
 * + <pid-as-str>   INTTYPE_TO_STRLEN(uint64_t), deliberately sized for a
 *                  64-bit integer rather than pid_t
 * + '\0'           1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
154 | ||
155 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) | |
156 | { | |
157 | int ret; | |
158 | ||
159 | if (entry->init_pidfd < 0) | |
160 | return ret_errno(ENOSYS); | |
161 | ||
162 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); | |
163 | if (ret < 0) { | |
164 | if (errno == ENOSYS) | |
165 | return ret_errno(ENOSYS); | |
166 | ||
167 | return 0; | |
168 | } | |
169 | ||
170 | return 1; | |
171 | } | |
172 | ||
173 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
174 | { | |
175 | struct stat st; | |
176 | char path[LXCFS_PROC_PID_LEN]; | |
177 | ||
178 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
179 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
180 | return 0; | |
181 | ||
182 | return 1; | |
183 | } | |
184 | ||
/*
 * Decide whether a cache entry still refers to a live init process.
 * The pidfd probe is tried first; the stat() based probe is only consulted
 * when the pidfd probe reports an error (negative return).
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict < 0)
		verdict = initpid_still_valid_stat(entry);

	return verdict == 1;
}
196 | ||
197 | /* Must be called under store_lock */ | |
198 | static void remove_initpid(struct pidns_init_store *entry) | |
199 | { | |
200 | struct pidns_init_store *it; | |
201 | int ino_hash; | |
202 | ||
203 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", | |
204 | entry->initpid); | |
205 | ||
206 | ino_hash = HASH(entry->ino); | |
207 | if (pidns_hash_table[ino_hash] == entry) { | |
208 | pidns_hash_table[ino_hash] = entry->next; | |
209 | close_prot_errno_disarm(entry->init_pidfd); | |
210 | free_disarm(entry); | |
211 | return; | |
212 | } | |
213 | ||
214 | it = pidns_hash_table[ino_hash]; | |
215 | while (it) { | |
216 | if (it->next == entry) { | |
217 | it->next = entry->next; | |
218 | close_prot_errno_disarm(entry->init_pidfd); | |
219 | free_disarm(entry); | |
220 | return; | |
221 | } | |
222 | it = it->next; | |
223 | } | |
224 | } | |
225 | ||
226 | #define PURGE_SECS 5 | |
227 | /* Must be called under store_lock */ | |
228 | static void prune_initpid_store(void) | |
229 | { | |
230 | static int64_t last_prune = 0; | |
231 | int64_t now, threshold; | |
232 | ||
233 | if (!last_prune) { | |
234 | last_prune = time(NULL); | |
235 | return; | |
236 | } | |
237 | ||
238 | now = time(NULL); | |
239 | if (now < (last_prune + PURGE_SECS)) | |
240 | return; | |
241 | ||
242 | lxcfs_debug("Pruning init pid cache"); | |
243 | ||
244 | last_prune = now; | |
245 | threshold = now - 2 * PURGE_SECS; | |
246 | ||
247 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { | |
248 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
249 | if (entry->lastcheck < threshold) { | |
250 | struct pidns_init_store *cur = entry; | |
251 | ||
252 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); | |
253 | ||
254 | if (prev) | |
255 | prev->next = entry->next; | |
256 | else | |
257 | pidns_hash_table[i] = entry->next; | |
258 | entry = entry->next; | |
259 | close_prot_errno_disarm(cur->init_pidfd); | |
260 | free_disarm(cur); | |
261 | } else { | |
262 | prev = entry; | |
263 | entry = entry->next; | |
264 | } | |
265 | } | |
266 | } | |
267 | } | |
268 | ||
269 | static void clear_initpid_store(void) | |
270 | { | |
271 | store_lock(); | |
272 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { | |
273 | for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) { | |
274 | struct pidns_init_store *cur = entry; | |
275 | ||
276 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); | |
277 | ||
278 | pidns_hash_table[i] = entry->next; | |
279 | entry = entry->next; | |
280 | close_prot_errno_disarm(cur->init_pidfd); | |
281 | free_disarm(cur); | |
282 | } | |
283 | } | |
284 | store_unlock(); | |
285 | } | |
286 | ||
287 | /* Must be called under store_lock */ | |
288 | static void save_initpid(ino_t pidns_inode, pid_t pid) | |
289 | { | |
290 | __do_free struct pidns_init_store *entry = NULL; | |
291 | __do_close int pidfd = -EBADF; | |
292 | const struct lxcfs_opts *opts = fuse_get_context()->private_data; | |
293 | char path[LXCFS_PROC_PID_LEN]; | |
294 | struct stat st; | |
295 | int ino_hash; | |
296 | ||
297 | if (opts && opts->use_pidfd && can_use_pidfd) { | |
298 | pidfd = pidfd_open(pid, 0); | |
299 | if (pidfd < 0) | |
300 | return; | |
301 | } | |
302 | ||
303 | snprintf(path, sizeof(path), "/proc/%d", pid); | |
304 | if (stat(path, &st)) | |
305 | return; | |
306 | ||
307 | entry = zalloc(sizeof(*entry)); | |
308 | if (!entry) | |
309 | return; | |
310 | ||
311 | ino_hash = HASH(pidns_inode); | |
312 | *entry = (struct pidns_init_store){ | |
313 | .ino = pidns_inode, | |
314 | .initpid = pid, | |
315 | .ctime = st.st_ctime, | |
316 | .next = pidns_hash_table[ino_hash], | |
317 | .lastcheck = time(NULL), | |
318 | .init_pidfd = move_fd(pidfd), | |
319 | }; | |
320 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
321 | ||
322 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
323 | } | |
324 | ||
325 | /* | |
326 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
327 | * entry for the inode number and creation time. Verify that the init pid | |
328 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
329 | * otherwise. | |
330 | * Must be called under store_lock | |
331 | */ | |
332 | static pid_t lookup_verify_initpid(ino_t pidns_inode) | |
333 | { | |
334 | struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; | |
335 | ||
336 | while (entry) { | |
337 | if (entry->ino == pidns_inode) { | |
338 | if (initpid_still_valid(entry)) { | |
339 | entry->lastcheck = time(NULL); | |
340 | return entry->initpid; | |
341 | } | |
342 | ||
343 | remove_initpid(entry); | |
344 | return ret_errno(ESRCH); | |
345 | } | |
346 | entry = entry->next; | |
347 | } | |
348 | ||
349 | return ret_errno(ESRCH); | |
350 | } | |
351 | ||
352 | static bool send_creds_ok(int sock_fd) | |
353 | { | |
354 | char v = '1'; /* we are the child */ | |
355 | struct ucred cred = { | |
356 | .uid = 0, | |
357 | .gid = 0, | |
358 | .pid = 1, | |
359 | }; | |
360 | ||
361 | return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK; | |
362 | } | |
363 | ||
364 | __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd) | |
365 | { | |
366 | /* | |
367 | * These flags don't interest at all so we don't jump through any hoops | |
368 | * of retrieving them and passing them to the kernel. | |
369 | */ | |
370 | errno = EINVAL; | |
371 | if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | | |
372 | CLONE_CHILD_CLEARTID | CLONE_SETTLS))) | |
373 | return -EINVAL; | |
374 | ||
375 | #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) | |
376 | /* On s390/s390x and cris the order of the first and second arguments | |
377 | * of the system call is reversed. | |
378 | */ | |
379 | return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd); | |
380 | #elif defined(__sparc__) && defined(__arch64__) | |
381 | { | |
382 | /* | |
383 | * sparc64 always returns the other process id in %o0, and a | |
384 | * boolean flag whether this is the child or the parent in %o1. | |
385 | * Inline assembly is needed to get the flag returned in %o1. | |
386 | */ | |
387 | register long g1 asm("g1") = __NR_clone; | |
388 | register long o0 asm("o0") = flags | SIGCHLD; | |
389 | register long o1 asm("o1") = 0; /* is parent/child indicator */ | |
390 | register long o2 asm("o2") = (unsigned long)pidfd; | |
391 | long is_error, retval, in_child; | |
392 | pid_t child_pid; | |
393 | ||
394 | asm volatile( | |
395 | #if defined(__arch64__) | |
396 | "t 0x6d\n\t" /* 64-bit trap */ | |
397 | #else | |
398 | "t 0x10\n\t" /* 32-bit trap */ | |
399 | #endif | |
400 | /* | |
401 | * catch errors: On sparc, the carry bit (csr) in the | |
402 | * processor status register (psr) is used instead of a | |
403 | * full register. | |
404 | */ | |
405 | "addx %%g0, 0, %%g1" | |
406 | : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */ | |
407 | : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ | |
408 | : "%cc"); /* clobbers */ | |
409 | ||
410 | is_error = g1; | |
411 | retval = o0; | |
412 | in_child = o1; | |
413 | ||
414 | if (is_error) { | |
415 | errno = retval; | |
416 | return -1; | |
417 | } | |
418 | ||
419 | if (in_child) | |
420 | return 0; | |
421 | ||
422 | child_pid = retval; | |
423 | return child_pid; | |
424 | } | |
425 | #elif defined(__ia64__) | |
426 | /* On ia64 the stack and stack size are passed as separate arguments. */ | |
427 | return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd); | |
428 | #else | |
429 | return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd); | |
430 | #endif | |
431 | } | |
432 | ||
/*
 * Buffer size needed for a "/proc/<pid>/ns/pid" path:
 * both literal parts, a 64-bit integer rendered as text, and the
 * terminating '\0'.
 */
#define LXCFS_PROC_PID_NS_LEN                                     \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
436 | ||
437 | /* | |
438 | * clone a task which switches to @task's namespace and writes '1'. | |
439 | * over a unix sock so we can read the task's reaper's pid in our | |
440 | * namespace | |
441 | * | |
442 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
443 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
444 | * the pidns and the parent pid outside are identical. Using clone prevents | |
445 | * this issue. | |
446 | */ | |
447 | static void write_task_init_pid_exit(int sock, pid_t target) | |
448 | { | |
449 | __do_close int fd = -EBADF; | |
450 | char path[LXCFS_PROC_PID_NS_LEN]; | |
451 | pid_t pid; | |
452 | ||
453 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
454 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
455 | if (fd < 0) | |
456 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
457 | ||
458 | if (setns(fd, 0)) | |
459 | log_exit("Failed to setns to pid namespace of process %d", target); | |
460 | ||
461 | pid = lxcfs_raw_clone(0, NULL); | |
462 | if (pid < 0) | |
463 | _exit(EXIT_FAILURE); | |
464 | ||
465 | if (pid == 0) { | |
466 | if (!send_creds_ok(sock)) | |
467 | _exit(EXIT_FAILURE); | |
468 | ||
469 | _exit(EXIT_SUCCESS); | |
470 | } | |
471 | ||
472 | if (!wait_for_pid(pid)) | |
473 | _exit(EXIT_FAILURE); | |
474 | ||
475 | _exit(EXIT_SUCCESS); | |
476 | } | |
477 | ||
478 | static pid_t scm_init_pid(pid_t task) | |
479 | { | |
480 | char v = '0'; | |
481 | pid_t pid_ret = -1; | |
482 | struct ucred cred = { | |
483 | .pid = -1, | |
484 | .uid = -1, | |
485 | .gid = -1, | |
486 | }; | |
487 | pid_t pid; | |
488 | int sock[2]; | |
489 | ||
490 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) | |
491 | return -1; | |
492 | ||
493 | pid = fork(); | |
494 | if (pid < 0) | |
495 | goto out; | |
496 | ||
497 | if (pid == 0) { | |
498 | close(sock[1]); | |
499 | write_task_init_pid_exit(sock[0], task); | |
500 | _exit(EXIT_SUCCESS); | |
501 | } | |
502 | ||
503 | if (!recv_creds(sock[1], &cred, &v)) | |
504 | goto out; | |
505 | ||
506 | pid_ret = cred.pid; | |
507 | ||
508 | out: | |
509 | close(sock[0]); | |
510 | close(sock[1]); | |
511 | if (pid > 0) | |
512 | wait_for_pid(pid); | |
513 | ||
514 | return pid_ret; | |
515 | } | |
516 | ||
517 | pid_t lookup_initpid_in_store(pid_t pid) | |
518 | { | |
519 | pid_t hashed_pid = 0; | |
520 | char path[LXCFS_PROC_PID_NS_LEN]; | |
521 | struct stat st; | |
522 | ||
523 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
524 | if (stat(path, &st)) | |
525 | return ret_errno(ESRCH); | |
526 | ||
527 | store_lock(); | |
528 | ||
529 | hashed_pid = lookup_verify_initpid(st.st_ino); | |
530 | if (hashed_pid < 0) { | |
531 | /* release the mutex as the following call is expensive */ | |
532 | store_unlock(); | |
533 | ||
534 | hashed_pid = scm_init_pid(pid); | |
535 | ||
536 | store_lock(); | |
537 | ||
538 | if (hashed_pid > 0) | |
539 | save_initpid(st.st_ino, hashed_pid); | |
540 | } | |
541 | ||
542 | /* | |
543 | * Prune at the end in case we're pruning the value | |
544 | * we were about to return. | |
545 | */ | |
546 | prune_initpid_store(); | |
547 | store_unlock(); | |
548 | ||
549 | return hashed_pid; | |
550 | } | |
551 | ||
552 | /* | |
553 | * Functions needed to set up cgroups in the constructor (lxcfs_init()). | |
554 | */ | |
555 | ||
556 | static bool umount_if_mounted(void) | |
557 | { | |
558 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
559 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); | |
560 | return false; | |
561 | } | |
562 | return true; | |
563 | } | |
564 | ||
/*
 * The type of statfs.f_type differs between platforms, so derive it from
 * the struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
571 | ||
572 | /* | |
573 | * looking at fs/proc_namespace.c, it appears we can | |
574 | * actually expect the rootfs entry to very specifically contain | |
575 | * " - rootfs rootfs " | |
576 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
577 | * the rootfs entry should always be skipped in mountinfo contents. | |
578 | */ | |
579 | static bool is_on_ramfs(void) | |
580 | { | |
581 | __do_free char *line = NULL; | |
582 | __do_free void *fopen_cache = NULL; | |
583 | __do_fclose FILE *f = NULL; | |
584 | size_t len = 0; | |
585 | ||
586 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); | |
587 | if (!f) | |
588 | return false; | |
589 | ||
590 | while (getline(&line, &len, f) != -1) { | |
591 | int i; | |
592 | char *p, *p2; | |
593 | ||
594 | for (p = line, i = 0; p && i < 4; i++) | |
595 | p = strchr(p + 1, ' '); | |
596 | if (!p) | |
597 | continue; | |
598 | ||
599 | p2 = strchr(p + 1, ' '); | |
600 | if (!p2) | |
601 | continue; | |
602 | *p2 = '\0'; | |
603 | if (strcmp(p + 1, "/") == 0) { | |
604 | /* This is '/'. Is it the ramfs? */ | |
605 | p = strchr(p2 + 1, '-'); | |
606 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) | |
607 | return true; | |
608 | } | |
609 | } | |
610 | ||
611 | return false; | |
612 | } | |
613 | ||
614 | static int pivot_enter(void) | |
615 | { | |
616 | __do_close int oldroot = -EBADF, newroot = -EBADF; | |
617 | ||
618 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); | |
619 | if (oldroot < 0) | |
620 | return log_error_errno(-1, errno, | |
621 | "Failed to open old root for fchdir"); | |
622 | ||
623 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); | |
624 | if (newroot < 0) | |
625 | return log_error_errno(-1, errno, | |
626 | "Failed to open new root for fchdir"); | |
627 | ||
628 | /* change into new root fs */ | |
629 | if (fchdir(newroot) < 0) | |
630 | return log_error_errno(-1, | |
631 | errno, "Failed to change directory to new rootfs: %s", | |
632 | ROOTDIR); | |
633 | ||
634 | /* pivot_root into our new root fs */ | |
635 | if (pivot_root(".", ".") < 0) | |
636 | return log_error_errno(-1, errno, | |
637 | "pivot_root() syscall failed: %s", | |
638 | strerror(errno)); | |
639 | ||
640 | /* | |
641 | * At this point the old-root is mounted on top of our new-root. | |
642 | * To unmounted it we must not be chdir'd into it, so escape back | |
643 | * to the old-root. | |
644 | */ | |
645 | if (fchdir(oldroot) < 0) | |
646 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
647 | ||
648 | if (umount2(".", MNT_DETACH) < 0) | |
649 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
650 | ||
651 | if (fchdir(newroot) < 0) | |
652 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
653 | ||
654 | return 0; | |
655 | } | |
656 | ||
657 | static int chroot_enter(void) | |
658 | { | |
659 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
660 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
661 | return -1; | |
662 | } | |
663 | ||
664 | if (chroot(".") < 0) { | |
665 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
666 | return -1; | |
667 | } | |
668 | ||
669 | if (chdir("/") < 0) { | |
670 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
671 | return -1; | |
672 | } | |
673 | ||
674 | return 0; | |
675 | } | |
676 | ||
/*
 * Enter the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise.
 *
 * Returns the result of chroot_enter() on the ramfs path, otherwise 0 on
 * success and -1 when statfs("/") or the pivot fails.
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * The magic number alone is not reliable: when the ramfs is really a
	 * tmpfs it will likely report TMPFS_MAGIC. So a negative answer is
	 * double-checked against the mountinfo file via is_on_ramfs().
	 */
	if (has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
699 | ||
700 | /* Prepare our new clean root. */ | |
701 | static int permute_prepare(void) | |
702 | { | |
703 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
704 | lxcfs_error("%s\n", "Failed to create directory for new root."); | |
705 | return -1; | |
706 | } | |
707 | ||
708 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
709 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); | |
710 | return -1; | |
711 | } | |
712 | ||
713 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
714 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); | |
715 | return -1; | |
716 | } | |
717 | ||
718 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
719 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); | |
720 | return -1; | |
721 | } | |
722 | ||
723 | return 0; | |
724 | } | |
725 | ||
/*
 * Switch into the private root: prepare it first, then enter it
 * (chroot() on ramfs, pivot_root() in all other cases).
 * Entering is not attempted when preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
739 | ||
740 | static bool cgfs_prepare_mounts(void) | |
741 | { | |
742 | if (!mkdir_p(BASEDIR, 0700)) { | |
743 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); | |
744 | return false; | |
745 | } | |
746 | ||
747 | if (!umount_if_mounted()) { | |
748 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); | |
749 | return false; | |
750 | } | |
751 | ||
752 | if (unshare(CLONE_NEWNS) < 0) { | |
753 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); | |
754 | return false; | |
755 | } | |
756 | ||
757 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); | |
758 | if (cgroup_ops->mntns_fd < 0) { | |
759 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); | |
760 | return false; | |
761 | } | |
762 | ||
763 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { | |
764 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); | |
765 | return false; | |
766 | } | |
767 | ||
768 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { | |
769 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); | |
770 | return false; | |
771 | } | |
772 | ||
773 | return true; | |
774 | } | |
775 | ||
776 | static bool cgfs_mount_hierarchies(void) | |
777 | { | |
778 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) | |
779 | return false; | |
780 | ||
781 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) | |
782 | return false; | |
783 | ||
784 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { | |
785 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
786 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
787 | if ((*h)->fd < 0) | |
788 | return false; | |
789 | } | |
790 | ||
791 | return true; | |
792 | } | |
793 | ||
/*
 * Run the three setup stages in order: prepare the private mounts, mount
 * the cgroup hierarchies, switch into the private root. Stops at the first
 * stage that fails; only the hierarchy stage logs an extra message here.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root() ? true : false;
}
807 | ||
808 | static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) | |
809 | { | |
810 | int ret; | |
811 | ||
812 | if (reload_successful) { | |
813 | reload_successful = 0; | |
814 | ||
815 | /* write() is async signal safe */ | |
816 | ret = write(STDERR_FILENO, | |
817 | "Switched into non-virtualization mode\n", | |
818 | STRLITERALLEN("Switched into non-virtualization mode\n")); | |
819 | if (ret < 0) | |
820 | goto please_compiler; | |
821 | } else { | |
822 | reload_successful = 1; | |
823 | ||
824 | /* write() is async signal safe */ | |
825 | ret = write(STDERR_FILENO, "Switched into virtualization mode\n", | |
826 | STRLITERALLEN("Switched into virtualization mode\n")); | |
827 | if (ret < 0) | |
828 | goto please_compiler; | |
829 | } | |
830 | ||
831 | please_compiler: | |
832 | /* | |
833 | * The write() syscall is a function whose return value needs to be | |
834 | * checked. Otherwise the compiler will warn.Another one could be to | |
835 | * use syscall(__NR_write, ...) directly but whatever. | |
836 | */ | |
837 | return; | |
838 | } | |
839 | ||
840 | static void __attribute__((constructor)) lxcfs_init(void) | |
841 | { | |
842 | __do_close int init_ns = -EBADF, root_fd = -EBADF, | |
843 | pidfd = -EBADF; | |
844 | int i = 0; | |
845 | pid_t pid; | |
846 | struct hierarchy *hierarchy; | |
847 | ||
848 | lxcfs_info("Running constructor %s to reload liblxcfs", __func__); | |
849 | ||
850 | cgroup_ops = cgroup_init(); | |
851 | if (!cgroup_ops) { | |
852 | lxcfs_info("Failed to initialize cgroup support"); | |
853 | goto broken_upgrade; | |
854 | } | |
855 | ||
856 | /* Preserve initial namespace. */ | |
857 | pid = getpid(); | |
858 | init_ns = preserve_ns(pid, "mnt"); | |
859 | if (init_ns < 0) { | |
860 | lxcfs_info("Failed to preserve initial mount namespace"); | |
861 | goto broken_upgrade; | |
862 | } | |
863 | ||
864 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace | |
865 | * to privately mount lxcfs cgroups. */ | |
866 | if (!cgfs_setup_controllers()) { | |
867 | log_exit("Failed to setup private cgroup mounts for lxcfs"); | |
868 | goto broken_upgrade; | |
869 | } | |
870 | ||
871 | if (setns(init_ns, 0) < 0) { | |
872 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); | |
873 | goto broken_upgrade; | |
874 | } | |
875 | ||
876 | if (!init_cpuview()) { | |
877 | log_exit("Failed to init CPU view"); | |
878 | goto broken_upgrade; | |
879 | } | |
880 | ||
881 | lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); | |
882 | lxcfs_info("hierarchies:"); | |
883 | ||
884 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
885 | char **controller_list = (*h)->controllers; | |
886 | __do_free char *controllers = NULL; | |
887 | if (controller_list && *controller_list) | |
888 | controllers = lxc_string_join(",", (const char **)controller_list, false); | |
889 | lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); | |
890 | } | |
891 | ||
892 | pidfd = pidfd_open(pid, 0); | |
893 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
894 | can_use_pidfd = true; | |
895 | lxcfs_info("Kernel supports pidfds"); | |
896 | } | |
897 | ||
898 | can_use_swap = cgroup_ops->can_use_swap(cgroup_ops); | |
899 | if (can_use_swap) | |
900 | lxcfs_info("Kernel supports swap accounting"); | |
901 | else | |
902 | lxcfs_info("Kernel does not support swap accounting"); | |
903 | ||
904 | hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory"); | |
905 | memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy); | |
906 | ||
907 | lxcfs_info("api_extensions:"); | |
908 | for (size_t nr = 0; nr < nr_api_extensions; nr++) | |
909 | lxcfs_info("- %s", api_extensions[nr]); | |
910 | ||
911 | root_fd = open("/", O_PATH | O_CLOEXEC); | |
912 | if (root_fd < 0) | |
913 | lxcfs_info("%s - Failed to open root directory", strerror(errno)); | |
914 | else if (fchdir(root_fd) < 0) | |
915 | lxcfs_info("%s - Failed to change to root directory", strerror(errno)); | |
916 | ||
917 | if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) { | |
918 | lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno)); | |
919 | goto broken_upgrade; | |
920 | } | |
921 | ||
922 | reload_successful = 1; | |
923 | return; | |
924 | ||
925 | broken_upgrade: | |
926 | reload_successful = 0; | |
927 | lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__); | |
928 | } | |
929 | ||
930 | static void __attribute__((destructor)) lxcfs_exit(void) | |
931 | { | |
932 | lxcfs_info("Running destructor %s", __func__); | |
933 | ||
934 | clear_initpid_store(); | |
935 | free_cpuview(); | |
936 | cgroup_exit(cgroup_ops); | |
937 | } | |
938 | ||
939 | void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data) | |
940 | { | |
941 | struct fuse_context *fc = fuse_get_context(); | |
942 | can_use_sys_cpu = true; | |
943 | has_versioned_opts = true; | |
944 | return fc->private_data; | |
945 | } |