]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
1f5596dd CB |
3 | #ifndef _GNU_SOURCE |
4 | #define _GNU_SOURCE | |
5 | #endif | |
6 | ||
7 | #ifndef FUSE_USE_VERSION | |
237e200e | 8 | #define FUSE_USE_VERSION 26 |
1f5596dd CB |
9 | #endif |
10 | ||
11 | #define _FILE_OFFSET_BITS 64 | |
237e200e | 12 | |
237e200e | 13 | #include <dirent.h> |
29a73c2f | 14 | #include <errno.h> |
237e200e SH |
15 | #include <fcntl.h> |
16 | #include <fuse.h> | |
0ecddf02 | 17 | #include <inttypes.h> |
237e200e | 18 | #include <libgen.h> |
dee86006 CB |
19 | #include <linux/magic.h> |
20 | #include <linux/sched.h> | |
237e200e | 21 | #include <pthread.h> |
29a73c2f | 22 | #include <sched.h> |
db1b32f6 | 23 | #include <stdarg.h> |
29a73c2f | 24 | #include <stdbool.h> |
0ecddf02 | 25 | #include <stdint.h> |
29a73c2f CB |
26 | #include <stdio.h> |
27 | #include <stdlib.h> | |
28 | #include <string.h> | |
29a73c2f CB |
29 | #include <sys/epoll.h> |
30 | #include <sys/mman.h> | |
31 | #include <sys/mount.h> | |
237e200e SH |
32 | #include <sys/param.h> |
33 | #include <sys/socket.h> | |
29a73c2f | 34 | #include <sys/syscall.h> |
0ecddf02 | 35 | #include <sys/sysinfo.h> |
d89504c4 | 36 | #include <sys/vfs.h> |
dee86006 CB |
37 | #include <time.h> |
38 | #include <unistd.h> | |
39 | #include <wait.h> | |
237e200e | 40 | |
ce8fc84c | 41 | #include "api_extensions.h" |
237e200e | 42 | #include "bindings.h" |
580fe4df | 43 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
44 | #include "cgroups/cgroup.h" |
45 | #include "cgroups/cgroup_utils.h" | |
dee86006 | 46 | #include "config.h" |
c9236032 | 47 | #include "memory_utils.h" |
1f5596dd | 48 | #include "proc_cpuview.h" |
8364a99c | 49 | #include "syscall_numbers.h" |
1d81c6a6 | 50 | #include "utils.h" |
237e200e | 51 | |
/* Set to true by the constructor once pidfd_open()/pidfd_send_signal() worked. */
static bool can_use_pidfd;

/*
 * Non-zero while liblxcfs is in virtualization mode. Written by the
 * constructor and flipped from the SIGUSR2 handler, hence sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;

/* Report whether the library is currently usable (reload_successful is set). */
bool liblxcfs_functional(void)
{
	return reload_successful ? true : false;
}
2aa59b2e | 60 | |
29a73c2f CB |
/*
 * pivot_root(): use the C library's symbol when the build system found one
 * (HAVE_PIVOT_ROOT), otherwise fall back to issuing the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
70 | ||
237e200e SH |
/*
 * Cache mapping a pid namespace to the pid of its init process.
 *
 * To find the init pid for $qpid:
 *   1. stat /proc/$qpid/ns/pid.
 *   2. Look the resulting inode number up in the store.
 *      a. Not cached: fork a child into $qpid's namespace which sends us
 *         ucred.pid = 1, read back the init pid, and cache it together with
 *         the ctime of /proc/$initpid in a new entry.
 *      b. Cached: verify that /proc/$initpid still matches what was saved.
 *         If it does, return the cached init pid; otherwise drop the entry
 *         and continue as in (a).
 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* pid of init in that namespace */
	int init_pidfd;			/* pidfd for initpid; negative when none was opened */
	int64_t ctime;			/* st_ctime of /proc/$initpid when the entry was saved */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	int64_t lastcheck;		/* time(NULL) of the last save or successful validation */
};
93 | ||
/*
 * The init pid cache is a fixed-size, chained hash table keyed by the pid
 * namespace inode number; pidns_store_mutex guards every access to it.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 100 | |
237e200e SH |
/* Acquire @l; a pthread_mutex_lock() error is handed to log_exit(). */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
109 | ||
/* Global cgroup driver handle: set from cgroup_init() in the constructor. */
struct cgroup_ops *cgroup_ops;

/* Release @l; a pthread_mutex_unlock() error is handed to log_exit(). */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
120 | ||
/*
 * Scope-exit helper: unlock the mutex that *@mutex refers to. A NULL pointer
 * means "nothing is held" and is ignored.
 */
static inline void unlock_mutex_function(pthread_mutex_t **mutex)
{
	pthread_mutex_t *held = *mutex;

	if (!held)
		return;

	unlock_mutex(held);
}
/*
 * Annotation for a "pthread_mutex_t *" local that should be unlocked when it
 * goes out of scope (presumably call_cleaner() wires up
 * unlock_mutex_function() - confirm against its definition).
 */
#define __do_unlock call_cleaner(unlock_mutex)
237e200e | 127 | |
fcdedd16 | 128 | static pthread_mutex_t* __attribute__((warn_unused_result)) store_lock(void) |
237e200e | 129 | { |
fcdedd16 WB |
130 | lock_mutex(&pidns_store_mutex); |
131 | return &pidns_store_mutex; | |
237e200e SH |
132 | } |
133 | ||
2aa59b2e CB |
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"     = STRLITERALLEN("/proc/")
 *   <pid-as-str> = INTTYPE_TO_STRLEN(uint64_t)
 *   \0           = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
142 | ||
bc189096 | 143 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 144 | { |
bc189096 | 145 | int ret; |
237e200e | 146 | |
bc189096 CB |
147 | if (entry->init_pidfd < 0) |
148 | return ret_errno(ENOSYS); | |
7dd6560a | 149 | |
bc189096 CB |
150 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
151 | if (ret < 0) { | |
152 | if (errno == ENOSYS) | |
153 | return ret_errno(ENOSYS); | |
7dd6560a | 154 | |
bc189096 | 155 | return 0; |
2aa59b2e CB |
156 | } |
157 | ||
bc189096 CB |
158 | return 1; |
159 | } | |
160 | ||
161 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
162 | { | |
163 | struct stat st; | |
164 | char path[LXCFS_PROC_PID_LEN]; | |
165 | ||
166 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
167 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
168 | return 0; | |
169 | ||
170 | return 1; | |
171 | } | |
172 | ||
/*
 * Decide whether a cached entry still describes a live init process. The
 * pidfd check is tried first; only when it reports a negative (unsupported)
 * result is the stat based check consulted.
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int verdict = initpid_still_valid_pidfd(entry);

	if (verdict >= 0)
		return verdict == 1;

	return initpid_still_valid_stat(entry) == 1;
}
184 | ||
185 | /* Must be called under store_lock */ | |
2aa59b2e | 186 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 187 | { |
2aa59b2e CB |
188 | struct pidns_init_store *it; |
189 | int ino_hash; | |
237e200e | 190 | |
2aa59b2e CB |
191 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
192 | entry->initpid); | |
7dd6560a | 193 | |
2aa59b2e CB |
194 | ino_hash = HASH(entry->ino); |
195 | if (pidns_hash_table[ino_hash] == entry) { | |
196 | pidns_hash_table[ino_hash] = entry->next; | |
197 | close_prot_errno_disarm(entry->init_pidfd); | |
198 | free_disarm(entry); | |
237e200e SH |
199 | return; |
200 | } | |
201 | ||
2aa59b2e CB |
202 | it = pidns_hash_table[ino_hash]; |
203 | while (it) { | |
204 | if (it->next == entry) { | |
205 | it->next = entry->next; | |
206 | close_prot_errno_disarm(entry->init_pidfd); | |
207 | free_disarm(entry); | |
237e200e SH |
208 | return; |
209 | } | |
2aa59b2e | 210 | it = it->next; |
237e200e SH |
211 | } |
212 | } | |
213 | ||
214 | #define PURGE_SECS 5 | |
215 | /* Must be called under store_lock */ | |
216 | static void prune_initpid_store(void) | |
217 | { | |
1ba088ae CB |
218 | static int64_t last_prune = 0; |
219 | int64_t now, threshold; | |
237e200e SH |
220 | |
221 | if (!last_prune) { | |
222 | last_prune = time(NULL); | |
223 | return; | |
224 | } | |
2aa59b2e | 225 | |
237e200e SH |
226 | now = time(NULL); |
227 | if (now < last_prune + PURGE_SECS) | |
228 | return; | |
7dd6560a | 229 | |
2aa59b2e | 230 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 231 | |
237e200e SH |
232 | last_prune = now; |
233 | threshold = now - 2 * PURGE_SECS; | |
234 | ||
2aa59b2e CB |
235 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
236 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
237 | if (entry->lastcheck < threshold) { | |
238 | struct pidns_init_store *cur = entry; | |
7dd6560a | 239 | |
2aa59b2e | 240 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 241 | |
237e200e | 242 | if (prev) |
2aa59b2e | 243 | prev->next = entry->next; |
237e200e | 244 | else |
2aa59b2e CB |
245 | pidns_hash_table[i] = entry->next; |
246 | entry = entry->next; | |
247 | close_prot_errno_disarm(cur->init_pidfd); | |
248 | free_disarm(cur); | |
237e200e | 249 | } else { |
2aa59b2e CB |
250 | prev = entry; |
251 | entry = entry->next; | |
237e200e SH |
252 | } |
253 | } | |
254 | } | |
255 | } | |
256 | ||
257 | /* Must be called under store_lock */ | |
fcdedd16 | 258 | static void save_initpid(ino_t pidns_inode, pid_t pid) |
237e200e | 259 | { |
1e5d03fe | 260 | __do_free struct pidns_init_store *entry = NULL; |
05b7a16d | 261 | __do_close int pidfd = -EBADF; |
2aa59b2e CB |
262 | char path[LXCFS_PROC_PID_LEN]; |
263 | struct lxcfs_opts *opts = fuse_get_context()->private_data; | |
264 | struct stat st; | |
265 | int ino_hash; | |
266 | ||
9973cc06 | 267 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
268 | pidfd = pidfd_open(pid, 0); |
269 | if (pidfd < 0) | |
270 | return; | |
271 | } | |
237e200e | 272 | |
2aa59b2e CB |
273 | snprintf(path, sizeof(path), "/proc/%d", pid); |
274 | if (stat(path, &st)) | |
275 | return; | |
7dd6560a | 276 | |
5ec289bf | 277 | entry = zalloc(sizeof(*entry)); |
0eb3756b | 278 | if (!entry) |
237e200e | 279 | return; |
2aa59b2e | 280 | |
97017213 | 281 | ino_hash = HASH(pidns_inode); |
1e5d03fe | 282 | *entry = (struct pidns_init_store){ |
fcdedd16 | 283 | .ino = pidns_inode, |
1e5d03fe CB |
284 | .initpid = pid, |
285 | .ctime = st.st_ctime, | |
286 | .next = pidns_hash_table[ino_hash], | |
287 | .lastcheck = time(NULL), | |
288 | .init_pidfd = move_fd(pidfd), | |
289 | }; | |
290 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
291 | |
292 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
293 | } |
294 | ||
295 | /* | |
296 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
297 | * entry for the inode number and creation time. Verify that the init pid | |
298 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
299 | * otherwise. | |
300 | * Must be called under store_lock | |
301 | */ | |
fcdedd16 | 302 | static struct pidns_init_store *lookup_verify_initpid(ino_t pidns_inode) |
237e200e | 303 | { |
fcdedd16 | 304 | struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; |
2aa59b2e CB |
305 | |
306 | while (entry) { | |
fcdedd16 | 307 | if (entry->ino == pidns_inode) { |
2aa59b2e CB |
308 | if (initpid_still_valid(entry)) { |
309 | entry->lastcheck = time(NULL); | |
310 | return entry; | |
237e200e | 311 | } |
2aa59b2e CB |
312 | |
313 | remove_initpid(entry); | |
237e200e SH |
314 | return NULL; |
315 | } | |
2aa59b2e | 316 | entry = entry->next; |
237e200e SH |
317 | } |
318 | ||
319 | return NULL; | |
320 | } | |
321 | ||
4ec5c9da | 322 | static int send_creds_clone_wrapper(void *arg) |
237e200e | 323 | { |
f1744de4 CB |
324 | int sock = PTR_TO_INT(arg); |
325 | char v = '1'; /* we are the child */ | |
326 | struct ucred cred = { | |
327 | .uid = 0, | |
328 | .gid = 0, | |
329 | .pid = 1, | |
330 | }; | |
331 | ||
332 | return send_creds(sock, &cred, v, true) != SEND_CREDS_OK; | |
237e200e SH |
333 | } |
334 | ||
87f7558b CB |
335 | /* |
336 | * Let's use the "standard stack limit" (i.e. glibc thread size default) for | |
337 | * stack sizes: 8MB. | |
338 | */ | |
339 | #define __LXCFS_STACK_SIZE (8 * 1024 * 1024) | |
6abff455 | 340 | pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags) |
87f7558b CB |
341 | { |
342 | pid_t ret; | |
343 | void *stack; | |
344 | ||
345 | stack = malloc(__LXCFS_STACK_SIZE); | |
346 | if (!stack) | |
347 | return ret_errno(ENOMEM); | |
348 | ||
349 | #ifdef __ia64__ | |
350 | ret = __clone2(fn, stack, __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
351 | #else | |
352 | ret = clone(fn, stack + __LXCFS_STACK_SIZE, flags | SIGCHLD, arg, NULL); | |
353 | #endif | |
354 | return ret; | |
355 | } | |
356 | ||
/*
 * Buffer size needed for a "/proc/<pid>/ns/pid" path: both string literals,
 * the widest decimal rendering of a uint64_t and the terminating \0.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
360 | ||
580fe4df CB |
361 | /* |
362 | * clone a task which switches to @task's namespace and writes '1'. | |
363 | * over a unix sock so we can read the task's reaper's pid in our | |
364 | * namespace | |
365 | * | |
366 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
367 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
368 | * the pidns and the parent pid outside are identical. Using clone prevents | |
369 | * this issue. | |
370 | */ | |
371 | static void write_task_init_pid_exit(int sock, pid_t target) | |
372 | { | |
05b7a16d | 373 | __do_close int fd = -EBADF; |
87f7558b | 374 | char path[LXCFS_PROC_PID_NS_LEN]; |
580fe4df | 375 | pid_t pid; |
87f7558b CB |
376 | |
377 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
378 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
379 | if (fd < 0) | |
380 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
381 | ||
382 | if (setns(fd, 0)) | |
383 | log_exit("Failed to setns to pid namespace of process %d", target); | |
384 | ||
f1744de4 | 385 | pid = lxcfs_clone(send_creds_clone_wrapper, INT_TO_PTR(sock), 0); |
580fe4df | 386 | if (pid < 0) |
87f7558b CB |
387 | _exit(EXIT_FAILURE); |
388 | ||
580fe4df CB |
389 | if (pid != 0) { |
390 | if (!wait_for_pid(pid)) | |
87f7558b CB |
391 | _exit(EXIT_FAILURE); |
392 | ||
393 | _exit(EXIT_SUCCESS); | |
237e200e | 394 | } |
237e200e SH |
395 | } |
396 | ||
580fe4df | 397 | static pid_t get_init_pid_for_task(pid_t task) |
237e200e | 398 | { |
580fe4df | 399 | char v = '0'; |
87f7558b | 400 | pid_t pid_ret = -1; |
dac3dc93 CB |
401 | struct ucred cred = { |
402 | .pid = -1, | |
403 | .uid = -1, | |
404 | .gid = -1, | |
405 | }; | |
87f7558b CB |
406 | pid_t pid; |
407 | int sock[2]; | |
237e200e | 408 | |
87f7558b | 409 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 410 | return -1; |
237e200e | 411 | |
580fe4df CB |
412 | pid = fork(); |
413 | if (pid < 0) | |
414 | goto out; | |
87f7558b CB |
415 | |
416 | if (pid == 0) { | |
580fe4df CB |
417 | close(sock[1]); |
418 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 419 | _exit(EXIT_SUCCESS); |
237e200e | 420 | } |
7213ec5c | 421 | |
580fe4df CB |
422 | if (!recv_creds(sock[1], &cred, &v)) |
423 | goto out; | |
87f7558b CB |
424 | |
425 | pid_ret = cred.pid; | |
237e200e | 426 | |
580fe4df CB |
427 | out: |
428 | close(sock[0]); | |
429 | close(sock[1]); | |
430 | if (pid > 0) | |
431 | wait_for_pid(pid); | |
237e200e | 432 | |
87f7558b CB |
433 | return pid_ret; |
434 | } | |
2aa59b2e CB |
435 | |
436 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 437 | { |
fcdedd16 | 438 | __do_unlock pthread_mutex_t *store_mutex = NULL; |
580fe4df | 439 | pid_t answer = 0; |
2aa59b2e CB |
440 | char path[LXCFS_PROC_PID_NS_LEN]; |
441 | struct stat st; | |
442 | struct pidns_init_store *entry; | |
443 | ||
444 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
b7672ded | 445 | |
2aa59b2e | 446 | if (stat(path, &st)) |
580fe4df | 447 | goto out; |
2aa59b2e | 448 | |
fcdedd16 WB |
449 | store_mutex = store_lock(); |
450 | ||
451 | entry = lookup_verify_initpid(st.st_ino); | |
2aa59b2e CB |
452 | if (entry) { |
453 | answer = entry->initpid; | |
580fe4df CB |
454 | goto out; |
455 | } | |
2aa59b2e | 456 | |
fcdedd16 WB |
457 | /* release the mutex as the following call is expensive */ |
458 | unlock_mutex(move_ptr(store_mutex)); | |
2aa59b2e | 459 | answer = get_init_pid_for_task(pid); |
fcdedd16 WB |
460 | store_mutex = store_lock(); |
461 | ||
580fe4df | 462 | if (answer > 0) |
fcdedd16 | 463 | save_initpid(st.st_ino, answer); |
b7672ded | 464 | |
580fe4df | 465 | out: |
2aa59b2e CB |
466 | /* |
467 | * Prune at the end in case we're returning the value we were about to | |
468 | * return. | |
469 | */ | |
580fe4df | 470 | prune_initpid_store(); |
2aa59b2e | 471 | |
580fe4df | 472 | return answer; |
237e200e SH |
473 | } |
474 | ||
29a73c2f CB |
475 | /* |
476 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
477 | */ |
478 | ||
29a73c2f CB |
479 | static bool umount_if_mounted(void) |
480 | { | |
481 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 482 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
483 | return false; |
484 | } | |
485 | return true; | |
486 | } | |
487 | ||
2283e240 CB |
/*
 * Type of statfs::f_type, taken straight from the struct.
 * __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic wanted = (fs_type_magic)magic_val;

	return fs->f_type == wanted;
}
494 | ||
0a4dea41 CB |
495 | /* |
496 | * looking at fs/proc_namespace.c, it appears we can | |
497 | * actually expect the rootfs entry to very specifically contain | |
498 | * " - rootfs rootfs " | |
499 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
500 | * the rootfs entry should always be skipped in mountinfo contents. | |
501 | */ | |
502 | static bool is_on_ramfs(void) | |
503 | { | |
87f7558b | 504 | __do_free char *line = NULL; |
757a63e7 | 505 | __do_free void *fopen_cache = NULL; |
87f7558b | 506 | __do_fclose FILE *f = NULL; |
0a4dea41 | 507 | size_t len = 0; |
0a4dea41 | 508 | |
757a63e7 | 509 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
510 | if (!f) |
511 | return false; | |
512 | ||
513 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
514 | int i; |
515 | char *p, *p2; | |
516 | ||
0a4dea41 CB |
517 | for (p = line, i = 0; p && i < 4; i++) |
518 | p = strchr(p + 1, ' '); | |
519 | if (!p) | |
520 | continue; | |
87f7558b | 521 | |
0a4dea41 CB |
522 | p2 = strchr(p + 1, ' '); |
523 | if (!p2) | |
524 | continue; | |
525 | *p2 = '\0'; | |
526 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 527 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 528 | p = strchr(p2 + 1, '-'); |
87f7558b | 529 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 530 | return true; |
0a4dea41 CB |
531 | } |
532 | } | |
87f7558b | 533 | |
0a4dea41 CB |
534 | return false; |
535 | } | |
536 | ||
cc309f33 | 537 | static int pivot_enter() |
0a4dea41 | 538 | { |
05b7a16d | 539 | __do_close int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 | 540 | |
3326c17e | 541 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
542 | if (oldroot < 0) |
543 | return log_error_errno(-1, errno, | |
544 | "Failed to open old root for fchdir"); | |
cc309f33 | 545 | |
3326c17e | 546 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
547 | if (newroot < 0) |
548 | return log_error_errno(-1, errno, | |
549 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
550 | |
551 | /* change into new root fs */ | |
87f7558b CB |
552 | if (fchdir(newroot) < 0) |
553 | return log_error_errno(-1, | |
554 | errno, "Failed to change directory to new rootfs: %s", | |
555 | ROOTDIR); | |
cc309f33 | 556 | |
0a4dea41 | 557 | /* pivot_root into our new root fs */ |
87f7558b CB |
558 | if (pivot_root(".", ".") < 0) |
559 | return log_error_errno(-1, errno, | |
560 | "pivot_root() syscall failed: %s", | |
561 | strerror(errno)); | |
0a4dea41 CB |
562 | |
563 | /* | |
564 | * At this point the old-root is mounted on top of our new-root. | |
565 | * To unmounted it we must not be chdir'd into it, so escape back | |
566 | * to the old-root. | |
567 | */ | |
87f7558b CB |
568 | if (fchdir(oldroot) < 0) |
569 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 570 | |
87f7558b CB |
571 | if (umount2(".", MNT_DETACH) < 0) |
572 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 573 | |
87f7558b CB |
574 | if (fchdir(newroot) < 0) |
575 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 576 | |
87f7558b | 577 | return 0; |
0a4dea41 CB |
578 | } |
579 | ||
580 | static int chroot_enter() | |
581 | { | |
582 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
583 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
584 | return -1; | |
585 | } | |
586 | ||
587 | if (chroot(".") < 0) { | |
588 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
589 | return -1; | |
590 | } | |
591 | ||
592 | if (chdir("/") < 0) { | |
593 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
594 | return -1; | |
595 | } | |
596 | ||
597 | return 0; | |
598 | } | |
599 | ||
/*
 * Switch into the prepared root: via chroot_enter() when "/" looks like the
 * initial ramfs, via pivot_enter() otherwise. Returns 0 on success, -1 on
 * failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * the mountinfo file through is_on_ramfs().
	 */
	if (has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
622 | ||
623 | /* Prepare our new clean root. */ | |
0232cbac | 624 | static int permute_prepare(void) |
29a73c2f CB |
625 | { |
626 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 627 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
628 | return -1; |
629 | } | |
630 | ||
631 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 632 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
633 | return -1; |
634 | } | |
635 | ||
636 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 637 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
638 | return -1; |
639 | } | |
640 | ||
641 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 642 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
643 | return -1; |
644 | } | |
645 | ||
646 | return 0; | |
647 | } | |
648 | ||
0232cbac CB |
/*
 * Build the new root and switch into it. Calls chroot() on ramfs,
 * pivot_root() in all other cases. The switch is only attempted when the
 * preparation succeeded.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
662 | ||
0a4dea41 | 663 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
664 | { |
665 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 666 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
667 | return false; |
668 | } | |
480262c9 | 669 | |
29a73c2f | 670 | if (!umount_if_mounted()) { |
b8defc3d | 671 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
672 | return false; |
673 | } | |
674 | ||
675 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 676 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
677 | return false; |
678 | } | |
679 | ||
1d81c6a6 | 680 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 681 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
682 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
683 | return false; | |
684 | } | |
685 | ||
480262c9 | 686 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 687 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
688 | return false; |
689 | } | |
480262c9 | 690 | |
29a73c2f | 691 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 692 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
693 | return false; |
694 | } | |
480262c9 | 695 | |
29a73c2f CB |
696 | return true; |
697 | } | |
698 | ||
0a4dea41 | 699 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 700 | { |
5fbea8a6 CB |
701 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
702 | return false; | |
51c7ca35 | 703 | |
5fbea8a6 CB |
704 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
705 | return false; | |
29a73c2f | 706 | |
5fbea8a6 CB |
707 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
708 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
709 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
710 | if ((*h)->fd < 0) | |
29a73c2f | 711 | return false; |
29a73c2f | 712 | } |
5fbea8a6 | 713 | |
29a73c2f CB |
714 | return true; |
715 | } | |
716 | ||
/*
 * Run the three setup stages in order - prepare the private mounts, mount
 * the cgroup hierarchies, switch into the new root - and stop at the first
 * one that fails.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
730 | ||
dee86006 | 731 | static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) |
b9b6bdc9 CB |
732 | { |
733 | int ret; | |
734 | ||
735 | if (reload_successful) { | |
736 | reload_successful = 0; | |
737 | ||
738 | /* write() is async signal safe */ | |
739 | ret = write(STDERR_FILENO, | |
740 | "Switched into non-virtualization mode\n", | |
741 | STRLITERALLEN("Switched into non-virtualization mode\n")); | |
742 | if (ret < 0) | |
743 | goto please_compiler; | |
744 | } else { | |
745 | reload_successful = 1; | |
746 | ||
747 | /* write() is async signal safe */ | |
748 | ret = write(STDERR_FILENO, "Switched into virtualization mode\n", | |
749 | STRLITERALLEN("Switched into virtualization mode\n")); | |
750 | if (ret < 0) | |
751 | goto please_compiler; | |
752 | } | |
753 | ||
754 | please_compiler: | |
755 | /* | |
756 | * The write() syscall is a function whose return value needs to be | |
757 | * checked. Otherwise the compiler will warn. This is how we | |
758 | * please our master. Another one could be to use | |
759 | * syscall(__NR_write, ...) directly but whatever. | |
760 | */ | |
761 | return; | |
762 | } | |
763 | ||
2243c5a9 | 764 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 765 | { |
05b7a16d | 766 | __do_close int init_ns = -EBADF, root_fd = -EBADF, |
de69569b | 767 | pidfd = -EBADF; |
4ec5c9da | 768 | int i = 0; |
2aa59b2e | 769 | pid_t pid; |
237e200e | 770 | |
c2357135 | 771 | lxcfs_info("Running constructor %s to reload liblxcfs", __func__); |
cc42d0c7 | 772 | |
5fbea8a6 | 773 | cgroup_ops = cgroup_init(); |
c2357135 CB |
774 | if (!cgroup_ops) { |
775 | lxcfs_info("Failed to initialize cgroup support"); | |
776 | goto broken_upgrade; | |
777 | } | |
237e200e | 778 | |
480262c9 | 779 | /* Preserve initial namespace. */ |
2aa59b2e CB |
780 | pid = getpid(); |
781 | init_ns = preserve_ns(pid, "mnt"); | |
c2357135 CB |
782 | if (init_ns < 0) { |
783 | lxcfs_info("Failed to preserve initial mount namespace"); | |
784 | goto broken_upgrade; | |
785 | } | |
480262c9 | 786 | |
480262c9 CB |
787 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
788 | * to privately mount lxcfs cgroups. */ | |
c2357135 | 789 | if (!cgfs_setup_controllers()) { |
2243c5a9 | 790 | log_exit("Failed to setup private cgroup mounts for lxcfs"); |
c2357135 CB |
791 | goto broken_upgrade; |
792 | } | |
480262c9 | 793 | |
c2357135 | 794 | if (setns(init_ns, 0) < 0) { |
2243c5a9 | 795 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); |
c2357135 CB |
796 | goto broken_upgrade; |
797 | } | |
29a73c2f | 798 | |
c2357135 | 799 | if (!init_cpuview()) { |
2243c5a9 | 800 | log_exit("Failed to init CPU view"); |
c2357135 CB |
801 | goto broken_upgrade; |
802 | } | |
056adcef | 803 | |
cc42d0c7 CB |
804 | lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); |
805 | lxcfs_info("hierarchies:"); | |
4ec5c9da CB |
806 | |
807 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
cc42d0c7 CB |
808 | char **controller_list = (*h)->controllers; |
809 | __do_free char *controllers = NULL; | |
810 | if (controller_list && *controller_list) | |
811 | controllers = lxc_string_join(",", (const char **)controller_list, false); | |
812 | lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); | |
4ec5c9da | 813 | } |
2aa59b2e CB |
814 | |
815 | pidfd = pidfd_open(pid, 0); | |
816 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
817 | can_use_pidfd = true; | |
cc42d0c7 | 818 | lxcfs_info("Kernel supports pidfds"); |
2aa59b2e | 819 | } |
ce8fc84c | 820 | |
cc42d0c7 | 821 | lxcfs_info("api_extensions:"); |
ce8fc84c | 822 | for (i = 0; i < nr_api_extensions; i++) |
cc42d0c7 | 823 | lxcfs_info("- %s", api_extensions[i]); |
de69569b CB |
824 | |
825 | root_fd = open("/", O_PATH | O_CLOEXEC); | |
c2357135 CB |
826 | if (root_fd < 0) |
827 | lxcfs_info("%s - Failed to open root directory", strerror(errno)); | |
828 | else if (fchdir(root_fd) < 0) | |
829 | lxcfs_info("%s - Failed to change to root directory", strerror(errno)); | |
830 | ||
dee86006 CB |
831 | if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) { |
832 | lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno)); | |
b9b6bdc9 | 833 | goto broken_upgrade; |
dee86006 | 834 | } |
b9b6bdc9 CB |
835 | |
836 | reload_successful = 1; | |
c2357135 | 837 | return; |
de69569b | 838 | |
c2357135 | 839 | broken_upgrade: |
b9b6bdc9 | 840 | reload_successful = 0; |
c2357135 | 841 | lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__); |
237e200e SH |
842 | } |
843 | ||
2243c5a9 | 844 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 845 | { |
cc42d0c7 CB |
846 | lxcfs_info("Running destructor %s", __func__); |
847 | ||
056adcef | 848 | free_cpuview(); |
2243c5a9 | 849 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 850 | } |