]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
1f5596dd CB |
3 | #ifndef _GNU_SOURCE |
4 | #define _GNU_SOURCE | |
5 | #endif | |
6 | ||
7 | #ifndef FUSE_USE_VERSION | |
237e200e | 8 | #define FUSE_USE_VERSION 26 |
1f5596dd CB |
9 | #endif |
10 | ||
11 | #define _FILE_OFFSET_BITS 64 | |
237e200e | 12 | |
237e200e | 13 | #include <dirent.h> |
29a73c2f | 14 | #include <errno.h> |
237e200e SH |
15 | #include <fcntl.h> |
16 | #include <fuse.h> | |
0ecddf02 | 17 | #include <inttypes.h> |
237e200e | 18 | #include <libgen.h> |
dee86006 CB |
19 | #include <linux/magic.h> |
20 | #include <linux/sched.h> | |
237e200e | 21 | #include <pthread.h> |
29a73c2f | 22 | #include <sched.h> |
db1b32f6 | 23 | #include <stdarg.h> |
29a73c2f | 24 | #include <stdbool.h> |
0ecddf02 | 25 | #include <stdint.h> |
29a73c2f CB |
26 | #include <stdio.h> |
27 | #include <stdlib.h> | |
28 | #include <string.h> | |
29a73c2f CB |
29 | #include <sys/epoll.h> |
30 | #include <sys/mman.h> | |
31 | #include <sys/mount.h> | |
237e200e SH |
32 | #include <sys/param.h> |
33 | #include <sys/socket.h> | |
29a73c2f | 34 | #include <sys/syscall.h> |
0ecddf02 | 35 | #include <sys/sysinfo.h> |
d89504c4 | 36 | #include <sys/vfs.h> |
dee86006 CB |
37 | #include <time.h> |
38 | #include <unistd.h> | |
39 | #include <wait.h> | |
237e200e | 40 | |
ce8fc84c | 41 | #include "api_extensions.h" |
237e200e | 42 | #include "bindings.h" |
580fe4df | 43 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
44 | #include "cgroups/cgroup.h" |
45 | #include "cgroups/cgroup_utils.h" | |
dee86006 | 46 | #include "config.h" |
c9236032 | 47 | #include "memory_utils.h" |
1f5596dd | 48 | #include "proc_cpuview.h" |
8364a99c | 49 | #include "syscall_numbers.h" |
1d81c6a6 | 50 | #include "utils.h" |
237e200e | 51 | |
2aa59b2e | 52 | static bool can_use_pidfd; |
b9b6bdc9 CB |
53 | |
54 | static volatile sig_atomic_t reload_successful; | |
cbfc55fd CB |
55 | |
56 | bool liblxcfs_functional(void) | |
57 | { | |
b9b6bdc9 | 58 | return reload_successful != 0; |
cbfc55fd | 59 | } |
2aa59b2e | 60 | |
29a73c2f CB |
/*
 * pivot_root(): when the build defines HAVE_PIVOT_ROOT only declare the
 * symbol, otherwise provide a local wrapper that issues the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
70 | ||
237e200e SH |
71 | /* |
72 | * A table caching which pid is init for a pid namespace. | |
73 | * When looking up which pid is init for $qpid, we first | |
74 | * 1. Stat /proc/$qpid/ns/pid. | |
75 | * 2. Check whether the ino_t is in our store. | |
76 | * a. if not, fork a child in qpid's ns to send us | |
77 | * ucred.pid = 1, and read the initpid. Cache | |
78 | * initpid and creation time for /proc/initpid | |
79 | * in a new store entry. | |
80 | * b. if so, verify that /proc/initpid still matches | |
81 | * what we have saved. If not, clear the store | |
82 | * entry and go back to a. If so, return the | |
83 | * cached initpid. | |
84 | */ | |
/* One cache entry: maps a pid namespace (by inode) to its init pid. */
struct pidns_init_store {
	ino_t ino;     /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	int init_pidfd; /* pidfd for initpid; negative when none was opened */
	int64_t ctime; /* the time at which /proc/$initpid was created (its st_ctime) */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time(NULL) when the entry was stored or last validated */
};
93 | ||
/* lol - look at how they are allocated in the kernel */
/* Number of buckets in the init pid cache. */
#define PIDNS_HASH_SIZE 4096
/* Map a value (here: a pid namespace inode number) to a bucket index. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads of the init pid cache; each bucket is a singly linked list. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Serializes access to pidns_hash_table, see store_lock()/store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 100 | |
/*
 * Lock @l. A non-zero result from pthread_mutex_lock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
109 | ||
77f4399a | 110 | struct cgroup_ops *cgroup_ops; |
29a73c2f | 111 | |
/*
 * Unlock @l. A non-zero result from pthread_mutex_unlock() is handed to
 * log_exit() together with its strerror() text.
 */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
120 | ||
/* Acquire pidns_store_mutex, the lock guarding the init pid cache. */
static inline void store_lock(void)
{
	mutex_lock(&pidns_store_mutex);
}
125 | ||
/* Release pidns_store_mutex, the lock guarding the init pid cache. */
static inline void store_unlock(void)
{
	mutex_unlock(&pidns_store_mutex);
}
130 | ||
2aa59b2e CB |
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"      = STRLITERALLEN("/proc/")
 *   <pid-as-str>  = INTTYPE_TO_STRLEN(uint64_t)
 *   \0            = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
139 | ||
bc189096 | 140 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 141 | { |
bc189096 | 142 | int ret; |
237e200e | 143 | |
bc189096 CB |
144 | if (entry->init_pidfd < 0) |
145 | return ret_errno(ENOSYS); | |
7dd6560a | 146 | |
bc189096 CB |
147 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
148 | if (ret < 0) { | |
149 | if (errno == ENOSYS) | |
150 | return ret_errno(ENOSYS); | |
7dd6560a | 151 | |
bc189096 | 152 | return 0; |
2aa59b2e CB |
153 | } |
154 | ||
bc189096 CB |
155 | return 1; |
156 | } | |
157 | ||
158 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
159 | { | |
160 | struct stat st; | |
161 | char path[LXCFS_PROC_PID_LEN]; | |
162 | ||
163 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
164 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
165 | return 0; | |
166 | ||
167 | return 1; | |
168 | } | |
169 | ||
/*
 * Decide whether the init pid cached in @entry is still valid.
 * Must be called under store_lock
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int valid = initpid_still_valid_pidfd(entry);

	/* A negative result means the pidfd check was unavailable: use stat(). */
	if (valid < 0)
		valid = initpid_still_valid_stat(entry);

	return valid == 1;
}
181 | ||
182 | /* Must be called under store_lock */ | |
2aa59b2e | 183 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 184 | { |
2aa59b2e CB |
185 | struct pidns_init_store *it; |
186 | int ino_hash; | |
237e200e | 187 | |
2aa59b2e CB |
188 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
189 | entry->initpid); | |
7dd6560a | 190 | |
2aa59b2e CB |
191 | ino_hash = HASH(entry->ino); |
192 | if (pidns_hash_table[ino_hash] == entry) { | |
193 | pidns_hash_table[ino_hash] = entry->next; | |
194 | close_prot_errno_disarm(entry->init_pidfd); | |
195 | free_disarm(entry); | |
237e200e SH |
196 | return; |
197 | } | |
198 | ||
2aa59b2e CB |
199 | it = pidns_hash_table[ino_hash]; |
200 | while (it) { | |
201 | if (it->next == entry) { | |
202 | it->next = entry->next; | |
203 | close_prot_errno_disarm(entry->init_pidfd); | |
204 | free_disarm(entry); | |
237e200e SH |
205 | return; |
206 | } | |
2aa59b2e | 207 | it = it->next; |
237e200e SH |
208 | } |
209 | } | |
210 | ||
211 | #define PURGE_SECS 5 | |
212 | /* Must be called under store_lock */ | |
213 | static void prune_initpid_store(void) | |
214 | { | |
1ba088ae CB |
215 | static int64_t last_prune = 0; |
216 | int64_t now, threshold; | |
237e200e SH |
217 | |
218 | if (!last_prune) { | |
219 | last_prune = time(NULL); | |
220 | return; | |
221 | } | |
2aa59b2e | 222 | |
237e200e | 223 | now = time(NULL); |
b18d6121 | 224 | if (now < (last_prune + PURGE_SECS)) |
237e200e | 225 | return; |
7dd6560a | 226 | |
2aa59b2e | 227 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 228 | |
237e200e SH |
229 | last_prune = now; |
230 | threshold = now - 2 * PURGE_SECS; | |
231 | ||
2aa59b2e CB |
232 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
233 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
234 | if (entry->lastcheck < threshold) { | |
235 | struct pidns_init_store *cur = entry; | |
7dd6560a | 236 | |
2aa59b2e | 237 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 238 | |
237e200e | 239 | if (prev) |
2aa59b2e | 240 | prev->next = entry->next; |
237e200e | 241 | else |
2aa59b2e CB |
242 | pidns_hash_table[i] = entry->next; |
243 | entry = entry->next; | |
244 | close_prot_errno_disarm(cur->init_pidfd); | |
245 | free_disarm(cur); | |
237e200e | 246 | } else { |
2aa59b2e CB |
247 | prev = entry; |
248 | entry = entry->next; | |
237e200e SH |
249 | } |
250 | } | |
251 | } | |
252 | } | |
253 | ||
c8f77ce4 CB |
254 | static void clear_initpid_store(void) |
255 | { | |
256 | store_lock(); | |
257 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { | |
258 | for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) { | |
259 | struct pidns_init_store *cur = entry; | |
260 | ||
261 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); | |
262 | ||
263 | pidns_hash_table[i] = entry->next; | |
264 | entry = entry->next; | |
265 | close_prot_errno_disarm(cur->init_pidfd); | |
266 | free_disarm(cur); | |
267 | } | |
268 | } | |
269 | store_unlock(); | |
270 | } | |
271 | ||
237e200e | 272 | /* Must be called under store_lock */ |
fcdedd16 | 273 | static void save_initpid(ino_t pidns_inode, pid_t pid) |
237e200e | 274 | { |
1e5d03fe | 275 | __do_free struct pidns_init_store *entry = NULL; |
05b7a16d | 276 | __do_close int pidfd = -EBADF; |
536620fd | 277 | const struct lxcfs_opts *opts = fuse_get_context()->private_data; |
2aa59b2e | 278 | char path[LXCFS_PROC_PID_LEN]; |
2aa59b2e CB |
279 | struct stat st; |
280 | int ino_hash; | |
281 | ||
9973cc06 | 282 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
283 | pidfd = pidfd_open(pid, 0); |
284 | if (pidfd < 0) | |
285 | return; | |
286 | } | |
237e200e | 287 | |
2aa59b2e CB |
288 | snprintf(path, sizeof(path), "/proc/%d", pid); |
289 | if (stat(path, &st)) | |
290 | return; | |
7dd6560a | 291 | |
5ec289bf | 292 | entry = zalloc(sizeof(*entry)); |
0eb3756b | 293 | if (!entry) |
237e200e | 294 | return; |
2aa59b2e | 295 | |
97017213 | 296 | ino_hash = HASH(pidns_inode); |
1e5d03fe | 297 | *entry = (struct pidns_init_store){ |
fcdedd16 | 298 | .ino = pidns_inode, |
1e5d03fe CB |
299 | .initpid = pid, |
300 | .ctime = st.st_ctime, | |
301 | .next = pidns_hash_table[ino_hash], | |
302 | .lastcheck = time(NULL), | |
303 | .init_pidfd = move_fd(pidfd), | |
304 | }; | |
305 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
306 | |
307 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
308 | } |
309 | ||
310 | /* | |
311 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
312 | * entry for the inode number and creation time. Verify that the init pid | |
313 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
314 | * otherwise. | |
315 | * Must be called under store_lock | |
316 | */ | |
cfda2e8a | 317 | static pid_t lookup_verify_initpid(ino_t pidns_inode) |
237e200e | 318 | { |
fcdedd16 | 319 | struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; |
2aa59b2e CB |
320 | |
321 | while (entry) { | |
fcdedd16 | 322 | if (entry->ino == pidns_inode) { |
2aa59b2e CB |
323 | if (initpid_still_valid(entry)) { |
324 | entry->lastcheck = time(NULL); | |
cfda2e8a | 325 | return entry->initpid; |
237e200e | 326 | } |
2aa59b2e CB |
327 | |
328 | remove_initpid(entry); | |
cfda2e8a | 329 | return ret_errno(ESRCH); |
237e200e | 330 | } |
2aa59b2e | 331 | entry = entry->next; |
237e200e SH |
332 | } |
333 | ||
cfda2e8a | 334 | return ret_errno(ESRCH); |
237e200e SH |
335 | } |
336 | ||
35acc247 | 337 | static bool send_creds_ok(int sock_fd) |
237e200e | 338 | { |
f1744de4 CB |
339 | char v = '1'; /* we are the child */ |
340 | struct ucred cred = { | |
341 | .uid = 0, | |
342 | .gid = 0, | |
343 | .pid = 1, | |
344 | }; | |
345 | ||
35acc247 | 346 | return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK; |
237e200e SH |
347 | } |
348 | ||
/*
 * Create a child process with the raw clone system call.
 *
 * @flags: clone flags; SIGCHLD is always OR-ed in. CLONE_VM,
 *         CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID and
 *         CLONE_SETTLS are rejected.
 * @pidfd: passed through to the kernel as the argument following the (NULL)
 *         stack pointer; callers in this file pass NULL.
 *
 * Returns -EINVAL for rejected flags, otherwise whatever the system call
 * wrapper returns (the sparc64 branch returns 0 in the child, the child pid
 * in the parent, and -1 with errno set on error).
 *
 * NOTE(review): errno is assigned EINVAL before the flag check and is not
 * reset on the success path.
 */
__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
{
	/*
	 * These flags don't interest at all so we don't jump through any hoops
	 * of retrieving them and passing them to the kernel.
	 */
	errno = EINVAL;
	if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
		      CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
		return -EINVAL;

#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
	/* On s390/s390x and cris the order of the first and second arguments
	 * of the system call is reversed.
	 */
	return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
#elif defined(__sparc__) && defined(__arch64__)
	{
		/*
		 * sparc64 always returns the other process id in %o0, and a
		 * boolean flag whether this is the child or the parent in %o1.
		 * Inline assembly is needed to get the flag returned in %o1.
		 */
		register long g1 asm("g1") = __NR_clone;
		register long o0 asm("o0") = flags | SIGCHLD;
		register long o1 asm("o1") = 0; /* is parent/child indicator */
		register long o2 asm("o2") = (unsigned long)pidfd;
		long is_error, retval, in_child;
		pid_t child_pid;

		/*
		 * NOTE(review): this branch is only compiled when __arch64__
		 * is defined, so the 32-bit trap below is never selected.
		 */
		asm volatile(
#if defined(__arch64__)
		    "t 0x6d\n\t" /* 64-bit trap */
#else
		    "t 0x10\n\t" /* 32-bit trap */
#endif
		    /*
		     * catch errors: On sparc, the carry bit (csr) in the
		     * processor status register (psr) is used instead of a
		     * full register.
		     */
		    "addx %%g0, 0, %%g1"
		    : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
		    : "r"(g1), "r"(o0), "r"(o1), "r"(o2)     /* inputs */
		    : "%cc");				     /* clobbers */

		is_error = g1;
		retval = o0;
		in_child = o1;

		if (is_error) {
			errno = retval;
			return -1;
		}

		if (in_child)
			return 0;

		child_pid = retval;
		return child_pid;
	}
#elif defined(__ia64__)
	/* On ia64 the stack and stack size are passed as separate arguments. */
	return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
#else
	return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
#endif
}
417 | ||
/*
 * Buffer size needed for a "/proc/<pid>/ns/pid" path: both literal parts,
 * the string length reserved for a uint64_t, and the trailing \0.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
421 | ||
580fe4df CB |
422 | /* |
423 | * clone a task which switches to @task's namespace and writes '1'. | |
424 | * over a unix sock so we can read the task's reaper's pid in our | |
425 | * namespace | |
426 | * | |
427 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
428 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
429 | * the pidns and the parent pid outside are identical. Using clone prevents | |
430 | * this issue. | |
431 | */ | |
432 | static void write_task_init_pid_exit(int sock, pid_t target) | |
433 | { | |
05b7a16d | 434 | __do_close int fd = -EBADF; |
87f7558b | 435 | char path[LXCFS_PROC_PID_NS_LEN]; |
580fe4df | 436 | pid_t pid; |
87f7558b CB |
437 | |
438 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
439 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
440 | if (fd < 0) | |
441 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
442 | ||
443 | if (setns(fd, 0)) | |
444 | log_exit("Failed to setns to pid namespace of process %d", target); | |
445 | ||
35acc247 | 446 | pid = lxcfs_raw_clone(0, NULL); |
580fe4df | 447 | if (pid < 0) |
87f7558b CB |
448 | _exit(EXIT_FAILURE); |
449 | ||
35acc247 CB |
450 | if (pid == 0) { |
451 | if (!send_creds_ok(sock)) | |
87f7558b CB |
452 | _exit(EXIT_FAILURE); |
453 | ||
454 | _exit(EXIT_SUCCESS); | |
237e200e | 455 | } |
35acc247 CB |
456 | |
457 | if (!wait_for_pid(pid)) | |
458 | _exit(EXIT_FAILURE); | |
459 | ||
460 | _exit(EXIT_SUCCESS); | |
237e200e SH |
461 | } |
462 | ||
8a07696e | 463 | static pid_t scm_init_pid(pid_t task) |
237e200e | 464 | { |
580fe4df | 465 | char v = '0'; |
87f7558b | 466 | pid_t pid_ret = -1; |
dac3dc93 CB |
467 | struct ucred cred = { |
468 | .pid = -1, | |
469 | .uid = -1, | |
470 | .gid = -1, | |
471 | }; | |
87f7558b CB |
472 | pid_t pid; |
473 | int sock[2]; | |
237e200e | 474 | |
87f7558b | 475 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 476 | return -1; |
237e200e | 477 | |
580fe4df CB |
478 | pid = fork(); |
479 | if (pid < 0) | |
480 | goto out; | |
87f7558b CB |
481 | |
482 | if (pid == 0) { | |
580fe4df CB |
483 | close(sock[1]); |
484 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 485 | _exit(EXIT_SUCCESS); |
237e200e | 486 | } |
7213ec5c | 487 | |
580fe4df CB |
488 | if (!recv_creds(sock[1], &cred, &v)) |
489 | goto out; | |
87f7558b CB |
490 | |
491 | pid_ret = cred.pid; | |
237e200e | 492 | |
580fe4df CB |
493 | out: |
494 | close(sock[0]); | |
495 | close(sock[1]); | |
496 | if (pid > 0) | |
497 | wait_for_pid(pid); | |
237e200e | 498 | |
87f7558b CB |
499 | return pid_ret; |
500 | } | |
2aa59b2e CB |
501 | |
502 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 503 | { |
cfda2e8a | 504 | pid_t hashed_pid = 0; |
2aa59b2e CB |
505 | char path[LXCFS_PROC_PID_NS_LEN]; |
506 | struct stat st; | |
2aa59b2e CB |
507 | |
508 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
2aa59b2e | 509 | if (stat(path, &st)) |
4e1e4115 | 510 | return ret_errno(ESRCH); |
2aa59b2e | 511 | |
4e1e4115 | 512 | store_lock(); |
fcdedd16 | 513 | |
cfda2e8a CB |
514 | hashed_pid = lookup_verify_initpid(st.st_ino); |
515 | if (hashed_pid < 0) { | |
516 | /* release the mutex as the following call is expensive */ | |
517 | store_unlock(); | |
2aa59b2e | 518 | |
8a07696e | 519 | hashed_pid = scm_init_pid(pid); |
4e1e4115 | 520 | |
cfda2e8a | 521 | store_lock(); |
4e1e4115 | 522 | |
cfda2e8a CB |
523 | if (hashed_pid > 0) |
524 | save_initpid(st.st_ino, hashed_pid); | |
525 | } | |
b7672ded | 526 | |
2aa59b2e | 527 | /* |
cfda2e8a CB |
528 | * Prune at the end in case we're pruning the value |
529 | * we were about to return. | |
2aa59b2e | 530 | */ |
580fe4df | 531 | prune_initpid_store(); |
4e1e4115 | 532 | store_unlock(); |
2aa59b2e | 533 | |
cfda2e8a | 534 | return hashed_pid; |
237e200e SH |
535 | } |
536 | ||
29a73c2f CB |
537 | /* |
538 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
539 | */ |
540 | ||
29a73c2f CB |
541 | static bool umount_if_mounted(void) |
542 | { | |
543 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 544 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
545 | return false; |
546 | } | |
547 | return true; | |
548 | } | |
549 | ||
2283e240 CB |
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true if the filesystem described by @fs has the magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
556 | ||
0a4dea41 CB |
557 | /* |
558 | * looking at fs/proc_namespace.c, it appears we can | |
559 | * actually expect the rootfs entry to very specifically contain | |
560 | * " - rootfs rootfs " | |
561 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
562 | * the rootfs entry should always be skipped in mountinfo contents. | |
563 | */ | |
564 | static bool is_on_ramfs(void) | |
565 | { | |
87f7558b | 566 | __do_free char *line = NULL; |
757a63e7 | 567 | __do_free void *fopen_cache = NULL; |
87f7558b | 568 | __do_fclose FILE *f = NULL; |
0a4dea41 | 569 | size_t len = 0; |
0a4dea41 | 570 | |
757a63e7 | 571 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
572 | if (!f) |
573 | return false; | |
574 | ||
575 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
576 | int i; |
577 | char *p, *p2; | |
578 | ||
0a4dea41 CB |
579 | for (p = line, i = 0; p && i < 4; i++) |
580 | p = strchr(p + 1, ' '); | |
581 | if (!p) | |
582 | continue; | |
87f7558b | 583 | |
0a4dea41 CB |
584 | p2 = strchr(p + 1, ' '); |
585 | if (!p2) | |
586 | continue; | |
587 | *p2 = '\0'; | |
588 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 589 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 590 | p = strchr(p2 + 1, '-'); |
87f7558b | 591 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 592 | return true; |
0a4dea41 CB |
593 | } |
594 | } | |
87f7558b | 595 | |
0a4dea41 CB |
596 | return false; |
597 | } | |
598 | ||
cc309f33 | 599 | static int pivot_enter() |
0a4dea41 | 600 | { |
05b7a16d | 601 | __do_close int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 | 602 | |
3326c17e | 603 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
604 | if (oldroot < 0) |
605 | return log_error_errno(-1, errno, | |
606 | "Failed to open old root for fchdir"); | |
cc309f33 | 607 | |
3326c17e | 608 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
609 | if (newroot < 0) |
610 | return log_error_errno(-1, errno, | |
611 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
612 | |
613 | /* change into new root fs */ | |
87f7558b CB |
614 | if (fchdir(newroot) < 0) |
615 | return log_error_errno(-1, | |
616 | errno, "Failed to change directory to new rootfs: %s", | |
617 | ROOTDIR); | |
cc309f33 | 618 | |
0a4dea41 | 619 | /* pivot_root into our new root fs */ |
87f7558b CB |
620 | if (pivot_root(".", ".") < 0) |
621 | return log_error_errno(-1, errno, | |
622 | "pivot_root() syscall failed: %s", | |
623 | strerror(errno)); | |
0a4dea41 CB |
624 | |
625 | /* | |
626 | * At this point the old-root is mounted on top of our new-root. | |
627 | * To unmounted it we must not be chdir'd into it, so escape back | |
628 | * to the old-root. | |
629 | */ | |
87f7558b CB |
630 | if (fchdir(oldroot) < 0) |
631 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 632 | |
87f7558b CB |
633 | if (umount2(".", MNT_DETACH) < 0) |
634 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 635 | |
87f7558b CB |
636 | if (fchdir(newroot) < 0) |
637 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 638 | |
87f7558b | 639 | return 0; |
0a4dea41 CB |
640 | } |
641 | ||
642 | static int chroot_enter() | |
643 | { | |
644 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
645 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
646 | return -1; | |
647 | } | |
648 | ||
649 | if (chroot(".") < 0) { | |
650 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
651 | return -1; | |
652 | } | |
653 | ||
654 | if (chdir("/") < 0) { | |
655 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
656 | return -1; | |
657 | } | |
658 | ||
659 | return 0; | |
660 | } | |
661 | ||
/*
 * Enter the prepared new root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise. Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/self/mountinfo via is_on_ramfs(). */
	if (!has_fs_type(&root_fs, RAMFS_MAGIC) && !is_on_ramfs()) {
		if (pivot_enter() < 0) {
			lxcfs_error("%s\n", "Could not perform pivot root.");
			return -1;
		}

		return 0;
	}

	return chroot_enter();
}
684 | ||
685 | /* Prepare our new clean root. */ | |
0232cbac | 686 | static int permute_prepare(void) |
29a73c2f CB |
687 | { |
688 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 689 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
690 | return -1; |
691 | } | |
692 | ||
693 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 694 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
695 | return -1; |
696 | } | |
697 | ||
698 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 699 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
700 | return -1; |
701 | } | |
702 | ||
703 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 704 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
705 | return -1; |
706 | } | |
707 | ||
708 | return 0; | |
709 | } | |
710 | ||
0232cbac CB |
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 * The new root is prepared first and only entered if that succeeded.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
724 | ||
0a4dea41 | 725 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
726 | { |
727 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 728 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
729 | return false; |
730 | } | |
480262c9 | 731 | |
29a73c2f | 732 | if (!umount_if_mounted()) { |
b8defc3d | 733 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
734 | return false; |
735 | } | |
736 | ||
737 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 738 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
739 | return false; |
740 | } | |
741 | ||
1d81c6a6 | 742 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 743 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
744 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
745 | return false; | |
746 | } | |
747 | ||
480262c9 | 748 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 749 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
750 | return false; |
751 | } | |
480262c9 | 752 | |
29a73c2f | 753 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 754 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
755 | return false; |
756 | } | |
480262c9 | 757 | |
29a73c2f CB |
758 | return true; |
759 | } | |
760 | ||
0a4dea41 | 761 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 762 | { |
5fbea8a6 CB |
763 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
764 | return false; | |
51c7ca35 | 765 | |
5fbea8a6 CB |
766 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
767 | return false; | |
29a73c2f | 768 | |
5fbea8a6 CB |
769 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
770 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
771 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
772 | if ((*h)->fd < 0) | |
29a73c2f | 773 | return false; |
29a73c2f | 774 | } |
5fbea8a6 | 775 | |
29a73c2f CB |
776 | return true; |
777 | } | |
778 | ||
/*
 * Prepare the private mounts, mount the cgroup hierarchies and finally switch
 * into the new root. Returns true only if all three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
792 | ||
dee86006 | 793 | static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) |
b9b6bdc9 CB |
794 | { |
795 | int ret; | |
796 | ||
797 | if (reload_successful) { | |
798 | reload_successful = 0; | |
799 | ||
800 | /* write() is async signal safe */ | |
801 | ret = write(STDERR_FILENO, | |
802 | "Switched into non-virtualization mode\n", | |
803 | STRLITERALLEN("Switched into non-virtualization mode\n")); | |
804 | if (ret < 0) | |
805 | goto please_compiler; | |
806 | } else { | |
807 | reload_successful = 1; | |
808 | ||
809 | /* write() is async signal safe */ | |
810 | ret = write(STDERR_FILENO, "Switched into virtualization mode\n", | |
811 | STRLITERALLEN("Switched into virtualization mode\n")); | |
812 | if (ret < 0) | |
813 | goto please_compiler; | |
814 | } | |
815 | ||
816 | please_compiler: | |
817 | /* | |
818 | * The write() syscall is a function whose return value needs to be | |
819 | * checked. Otherwise the compiler will warn. This is how we | |
820 | * please our master. Another one could be to use | |
821 | * syscall(__NR_write, ...) directly but whatever. | |
822 | */ | |
823 | return; | |
824 | } | |
825 | ||
/*
 * Library constructor.
 *
 * Initializes cgroup support, sets up the private cgroup mounts in a separate
 * mount namespace, switches back to the initial mount namespace, initializes
 * the CPU view, probes for pidfd support, changes into "/" and installs the
 * SIGUSR2 handler. reload_successful is set to 1 only if the end of the
 * function is reached; the broken_upgrade path sets it to 0.
 *
 * NOTE(review): three failure branches call log_exit() before
 * "goto broken_upgrade". If log_exit() terminates the process the goto is
 * unreachable and those failures do not take the broken_upgrade path -
 * confirm against the log_exit() definition.
 */
static void __attribute__((constructor)) lxcfs_init(void)
{
	/* All three fds are closed automatically when the function returns. */
	__do_close int init_ns = -EBADF, root_fd = -EBADF,
				pidfd = -EBADF;
	int i = 0;
	pid_t pid;

	lxcfs_info("Running constructor %s to reload liblxcfs", __func__);

	cgroup_ops = cgroup_init();
	if (!cgroup_ops) {
		lxcfs_info("Failed to initialize cgroup support");
		goto broken_upgrade;
	}

	/* Preserve initial namespace. */
	pid = getpid();
	init_ns = preserve_ns(pid, "mnt");
	if (init_ns < 0) {
		lxcfs_info("Failed to preserve initial mount namespace");
		goto broken_upgrade;
	}

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		log_exit("Failed to setup private cgroup mounts for lxcfs");
		goto broken_upgrade;
	}

	/* Return to the mount namespace preserved above. */
	if (setns(init_ns, 0) < 0) {
		log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
		goto broken_upgrade;
	}

	if (!init_cpuview()) {
		log_exit("Failed to init CPU view");
		goto broken_upgrade;
	}

	lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
	lxcfs_info("hierarchies:");

	/* Log index, fd and comma-joined controller list of every hierarchy. */
	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
		char **controller_list = (*h)->controllers;
		__do_free char *controllers = NULL;
		if (controller_list && *controller_list)
			controllers = lxc_string_join(",", (const char **)controller_list, false);
		lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
	}

	/* pidfds are usable if we can open one for ourselves and signal it. */
	pidfd = pidfd_open(pid, 0);
	if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
		can_use_pidfd = true;
		lxcfs_info("Kernel supports pidfds");
	}

	lxcfs_info("api_extensions:");
	for (i = 0; i < nr_api_extensions; i++)
		lxcfs_info("- %s", api_extensions[i]);

	/* Failing to change into "/" is only logged, not treated as fatal. */
	root_fd = open("/", O_PATH | O_CLOEXEC);
	if (root_fd < 0)
		lxcfs_info("%s - Failed to open root directory", strerror(errno));
	else if (fchdir(root_fd) < 0)
		lxcfs_info("%s - Failed to change to root directory", strerror(errno));

	if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
		lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
		goto broken_upgrade;
	}

	reload_successful = 1;
	return;

broken_upgrade:
	reload_successful = 0;
	lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
}
905 | ||
/*
 * Library destructor: empties the init pid cache, then calls free_cpuview()
 * and cgroup_exit() on the global cgroup_ops.
 */
static void __attribute__((destructor)) lxcfs_exit(void)
{
	lxcfs_info("Running destructor %s", __func__);

	clear_initpid_store();
	free_cpuview();
	cgroup_exit(cgroup_ops);
}