]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
f834b6bf SP |
3 | #include "config.h" |
4 | ||
237e200e | 5 | #include <dirent.h> |
29a73c2f | 6 | #include <errno.h> |
237e200e | 7 | #include <fcntl.h> |
0ecddf02 | 8 | #include <inttypes.h> |
237e200e | 9 | #include <libgen.h> |
dee86006 CB |
10 | #include <linux/magic.h> |
11 | #include <linux/sched.h> | |
237e200e | 12 | #include <pthread.h> |
29a73c2f | 13 | #include <sched.h> |
db1b32f6 | 14 | #include <stdarg.h> |
29a73c2f | 15 | #include <stdbool.h> |
0ecddf02 | 16 | #include <stdint.h> |
29a73c2f CB |
17 | #include <stdio.h> |
18 | #include <stdlib.h> | |
19 | #include <string.h> | |
29a73c2f CB |
20 | #include <sys/epoll.h> |
21 | #include <sys/mman.h> | |
22 | #include <sys/mount.h> | |
237e200e SH |
23 | #include <sys/param.h> |
24 | #include <sys/socket.h> | |
29a73c2f | 25 | #include <sys/syscall.h> |
0ecddf02 | 26 | #include <sys/sysinfo.h> |
d89504c4 | 27 | #include <sys/vfs.h> |
dee86006 CB |
28 | #include <time.h> |
29 | #include <unistd.h> | |
30 | #include <wait.h> | |
237e200e | 31 | |
237e200e | 32 | #include "bindings.h" |
e01afbb7 CB |
33 | |
34 | #include "api_extensions.h" | |
580fe4df | 35 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
36 | #include "cgroups/cgroup.h" |
37 | #include "cgroups/cgroup_utils.h" | |
c9236032 | 38 | #include "memory_utils.h" |
1f5596dd | 39 | #include "proc_cpuview.h" |
8364a99c | 40 | #include "syscall_numbers.h" |
1d81c6a6 | 41 | #include "utils.h" |
237e200e | 42 | |
/*
 * Capability flags. can_use_pidfd, can_use_swap and memory_is_cgroupv2 are
 * probed by the library constructor; can_use_sys_cpu and has_versioned_opts
 * are set from lxcfs_fuse_init().
 */
static bool can_use_pidfd;
static bool can_use_swap;
static bool can_use_sys_cpu;
static bool has_versioned_opts;
static bool memory_is_cgroupv2;

/*
 * Non-zero once the constructor finished successfully. The SIGUSR2 handler
 * flips it as well, which is why it is a volatile sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;
cbfc55fd CB |
50 | |
/* Returns true while reload_successful is non-zero. */
bool liblxcfs_functional(void)
{
	return reload_successful != 0;
}
2aa59b2e | 55 | |
c6805016 CB |
/* Returns the swap-accounting flag recorded by the constructor. */
bool liblxcfs_can_use_swap(void)
{
	return can_use_swap;
}
60 | ||
285aea40 CB |
/* Returns the flag that lxcfs_fuse_init() sets when HAVE_FUSE_RETURNS_DT_TYPE is enabled. */
bool liblxcfs_can_use_sys_cpu(void)
{
	return can_use_sys_cpu;
}
65 | ||
/* Returns true once lxcfs_fuse_init() has run and set has_versioned_opts. */
bool liblxcfs_has_versioned_opts(void)
{
	return has_versioned_opts;
}
70 | ||
50f7faee WB |
/* Returns whether the constructor found the "memory" hierarchy to be a unified one. */
bool liblxcfs_memory_is_cgroupv2(void)
{
	return memory_is_cgroupv2;
}
75 | ||
29a73c2f CB |
/*
 * pivot_root(): use the C library's implementation when the build detected
 * one, otherwise fall back to issuing the raw system call ourselves.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
85 | ||
237e200e SH |
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;     /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	int init_pidfd; /* pidfd for initpid, or negative when none was opened */
	int64_t ctime; /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time(NULL) of creation or of the last successful validation */
};
108 | ||
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Map a pid namespace inode number onto a bucket index. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of singly linked pidns_init_store entries, chained through ->next. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Taken through store_lock()/store_unlock() around accesses to the table. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 115 | |
/*
 * Acquire @l. A non-zero result from pthread_mutex_lock() is reported through
 * log_exit() together with its strerror() text.
 */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
124 | ||
/* Global cgroup driver handle: assigned from cgroup_init() in the constructor, passed to cgroup_exit() in the destructor. */
struct cgroup_ops *cgroup_ops;
29a73c2f | 126 | |
/*
 * Release @l. A non-zero result from pthread_mutex_unlock() is reported
 * through log_exit() together with its strerror() text.
 */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
135 | ||
/* Take the mutex guarding the init pid cache. */
static inline void store_lock(void)
{
	mutex_lock(&pidns_store_mutex);
}
140 | ||
/* Drop the mutex guarding the init pid cache. */
static inline void store_unlock(void)
{
	mutex_unlock(&pidns_store_mutex);
}
145 | ||
2aa59b2e CB |
/*
 * Buffer size for a "/proc/<pid>" path:
 *
 *   "/proc/"      = STRLITERALLEN("/proc/")
 *   <pid-as-str>  = INTTYPE_TO_STRLEN(uint64_t), i.e. sized for a 64-bit value
 *   \0            = 1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
154 | ||
bc189096 | 155 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 156 | { |
bc189096 | 157 | int ret; |
237e200e | 158 | |
bc189096 CB |
159 | if (entry->init_pidfd < 0) |
160 | return ret_errno(ENOSYS); | |
7dd6560a | 161 | |
bc189096 CB |
162 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
163 | if (ret < 0) { | |
164 | if (errno == ENOSYS) | |
165 | return ret_errno(ENOSYS); | |
7dd6560a | 166 | |
bc189096 | 167 | return 0; |
2aa59b2e CB |
168 | } |
169 | ||
bc189096 CB |
170 | return 1; |
171 | } | |
172 | ||
173 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
174 | { | |
175 | struct stat st; | |
176 | char path[LXCFS_PROC_PID_LEN]; | |
177 | ||
178 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
179 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
180 | return 0; | |
181 | ||
182 | return 1; | |
183 | } | |
184 | ||
/*
 * Decide whether @entry still describes a live init process. The pidfd probe
 * is tried first; only when it returns a negative value is the stat() based
 * check consulted.
 *
 * Must be called under store_lock
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	const int via_pidfd = initpid_still_valid_pidfd(entry);

	if (via_pidfd >= 0)
		return via_pidfd == 1;

	return initpid_still_valid_stat(entry) == 1;
}
196 | ||
197 | /* Must be called under store_lock */ | |
2aa59b2e | 198 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 199 | { |
2aa59b2e CB |
200 | struct pidns_init_store *it; |
201 | int ino_hash; | |
237e200e | 202 | |
2aa59b2e CB |
203 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
204 | entry->initpid); | |
7dd6560a | 205 | |
2aa59b2e CB |
206 | ino_hash = HASH(entry->ino); |
207 | if (pidns_hash_table[ino_hash] == entry) { | |
208 | pidns_hash_table[ino_hash] = entry->next; | |
209 | close_prot_errno_disarm(entry->init_pidfd); | |
210 | free_disarm(entry); | |
237e200e SH |
211 | return; |
212 | } | |
213 | ||
2aa59b2e CB |
214 | it = pidns_hash_table[ino_hash]; |
215 | while (it) { | |
216 | if (it->next == entry) { | |
217 | it->next = entry->next; | |
218 | close_prot_errno_disarm(entry->init_pidfd); | |
219 | free_disarm(entry); | |
237e200e SH |
220 | return; |
221 | } | |
2aa59b2e | 222 | it = it->next; |
237e200e SH |
223 | } |
224 | } | |
225 | ||
226 | #define PURGE_SECS 5 | |
227 | /* Must be called under store_lock */ | |
228 | static void prune_initpid_store(void) | |
229 | { | |
1ba088ae CB |
230 | static int64_t last_prune = 0; |
231 | int64_t now, threshold; | |
237e200e SH |
232 | |
233 | if (!last_prune) { | |
234 | last_prune = time(NULL); | |
235 | return; | |
236 | } | |
2aa59b2e | 237 | |
237e200e | 238 | now = time(NULL); |
b18d6121 | 239 | if (now < (last_prune + PURGE_SECS)) |
237e200e | 240 | return; |
7dd6560a | 241 | |
2aa59b2e | 242 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 243 | |
237e200e SH |
244 | last_prune = now; |
245 | threshold = now - 2 * PURGE_SECS; | |
246 | ||
2aa59b2e CB |
247 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
248 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
249 | if (entry->lastcheck < threshold) { | |
250 | struct pidns_init_store *cur = entry; | |
7dd6560a | 251 | |
2aa59b2e | 252 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 253 | |
237e200e | 254 | if (prev) |
2aa59b2e | 255 | prev->next = entry->next; |
237e200e | 256 | else |
2aa59b2e CB |
257 | pidns_hash_table[i] = entry->next; |
258 | entry = entry->next; | |
259 | close_prot_errno_disarm(cur->init_pidfd); | |
260 | free_disarm(cur); | |
237e200e | 261 | } else { |
2aa59b2e CB |
262 | prev = entry; |
263 | entry = entry->next; | |
237e200e SH |
264 | } |
265 | } | |
266 | } | |
267 | } | |
268 | ||
c8f77ce4 CB |
269 | static void clear_initpid_store(void) |
270 | { | |
271 | store_lock(); | |
272 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { | |
273 | for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) { | |
274 | struct pidns_init_store *cur = entry; | |
275 | ||
276 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); | |
277 | ||
278 | pidns_hash_table[i] = entry->next; | |
279 | entry = entry->next; | |
280 | close_prot_errno_disarm(cur->init_pidfd); | |
281 | free_disarm(cur); | |
282 | } | |
283 | } | |
284 | store_unlock(); | |
285 | } | |
286 | ||
237e200e | 287 | /* Must be called under store_lock */ |
fcdedd16 | 288 | static void save_initpid(ino_t pidns_inode, pid_t pid) |
237e200e | 289 | { |
1e5d03fe | 290 | __do_free struct pidns_init_store *entry = NULL; |
05b7a16d | 291 | __do_close int pidfd = -EBADF; |
536620fd | 292 | const struct lxcfs_opts *opts = fuse_get_context()->private_data; |
2aa59b2e | 293 | char path[LXCFS_PROC_PID_LEN]; |
2aa59b2e CB |
294 | struct stat st; |
295 | int ino_hash; | |
296 | ||
9973cc06 | 297 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
298 | pidfd = pidfd_open(pid, 0); |
299 | if (pidfd < 0) | |
300 | return; | |
301 | } | |
237e200e | 302 | |
2aa59b2e CB |
303 | snprintf(path, sizeof(path), "/proc/%d", pid); |
304 | if (stat(path, &st)) | |
305 | return; | |
7dd6560a | 306 | |
5ec289bf | 307 | entry = zalloc(sizeof(*entry)); |
0eb3756b | 308 | if (!entry) |
237e200e | 309 | return; |
2aa59b2e | 310 | |
97017213 | 311 | ino_hash = HASH(pidns_inode); |
1e5d03fe | 312 | *entry = (struct pidns_init_store){ |
fcdedd16 | 313 | .ino = pidns_inode, |
1e5d03fe CB |
314 | .initpid = pid, |
315 | .ctime = st.st_ctime, | |
316 | .next = pidns_hash_table[ino_hash], | |
317 | .lastcheck = time(NULL), | |
318 | .init_pidfd = move_fd(pidfd), | |
319 | }; | |
320 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
321 | |
322 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
323 | } |
324 | ||
325 | /* | |
326 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
327 | * entry for the inode number and creation time. Verify that the init pid | |
328 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
329 | * otherwise. | |
330 | * Must be called under store_lock | |
331 | */ | |
cfda2e8a | 332 | static pid_t lookup_verify_initpid(ino_t pidns_inode) |
237e200e | 333 | { |
fcdedd16 | 334 | struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; |
2aa59b2e CB |
335 | |
336 | while (entry) { | |
fcdedd16 | 337 | if (entry->ino == pidns_inode) { |
2aa59b2e CB |
338 | if (initpid_still_valid(entry)) { |
339 | entry->lastcheck = time(NULL); | |
cfda2e8a | 340 | return entry->initpid; |
237e200e | 341 | } |
2aa59b2e CB |
342 | |
343 | remove_initpid(entry); | |
cfda2e8a | 344 | return ret_errno(ESRCH); |
237e200e | 345 | } |
2aa59b2e | 346 | entry = entry->next; |
237e200e SH |
347 | } |
348 | ||
cfda2e8a | 349 | return ret_errno(ESRCH); |
237e200e SH |
350 | } |
351 | ||
35acc247 | 352 | static bool send_creds_ok(int sock_fd) |
237e200e | 353 | { |
f1744de4 CB |
354 | char v = '1'; /* we are the child */ |
355 | struct ucred cred = { | |
356 | .uid = 0, | |
357 | .gid = 0, | |
358 | .pid = 1, | |
359 | }; | |
360 | ||
35acc247 | 361 | return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK; |
237e200e SH |
362 | } |
363 | ||
/*
 * Thin wrapper around the raw clone system call that always adds SIGCHLD to
 * @flags and passes no child stack.
 *
 * @flags: clone flags; CLONE_VM, CLONE_PARENT_SETTID, CLONE_CHILD_SETTID,
 *         CLONE_CHILD_CLEARTID and CLONE_SETTLS are rejected with -EINVAL.
 * @pidfd: forwarded to the kernel as the argument following the stack
 *         pointer; callers in this file pass NULL.
 *
 * The argument order handed to the kernel differs per architecture, hence the
 * preprocessor branches below.
 */
__returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd)
{
	/*
	 * We are not interested in these flags, so we don't jump through any
	 * hoops of retrieving them and passing them to the kernel.
	 */
	errno = EINVAL;
	if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
		      CLONE_CHILD_CLEARTID | CLONE_SETTLS)))
		return -EINVAL;

#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
	/* On s390/s390x and cris the order of the first and second arguments
	 * of the system call is reversed.
	 */
	return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd);
#elif defined(__sparc__) && defined(__arch64__)
	{
		/*
		 * sparc64 always returns the other process id in %o0, and a
		 * boolean flag whether this is the child or the parent in %o1.
		 * Inline assembly is needed to get the flag returned in %o1.
		 */
		register long g1 asm("g1") = __NR_clone;
		register long o0 asm("o0") = flags | SIGCHLD;
		register long o1 asm("o1") = 0; /* is parent/child indicator */
		register long o2 asm("o2") = (unsigned long)pidfd;
		long is_error, retval, in_child;
		pid_t child_pid;

		asm volatile(
		/*
		 * The enclosing #elif already requires __arch64__, so only the
		 * 64-bit trap is ever selected here.
		 */
#if defined(__arch64__)
		    "t 0x6d\n\t" /* 64-bit trap */
#else
		    "t 0x10\n\t" /* 32-bit trap */
#endif
		    /*
		     * catch errors: On sparc, the carry bit (csr) in the
		     * processor status register (psr) is used instead of a
		     * full register.
		     */
		    "addx %%g0, 0, %%g1"
		    : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */
		    : "r"(g1), "r"(o0), "r"(o1), "r"(o2)     /* inputs */
		    : "%cc");				     /* clobbers */

		is_error = g1;
		retval = o0;
		in_child = o1;

		if (is_error) {
			errno = retval;
			return -1;
		}

		if (in_child)
			return 0;

		child_pid = retval;
		return child_pid;
	}
#elif defined(__ia64__)
	/* On ia64 the stack and stack size are passed as separate arguments. */
	return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd);
#else
	return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd);
#endif
}
432 | ||
/*
 * Buffer size for a "/proc/<pid>/ns/pid" path: both literal parts, room for
 * the decimal form of a 64-bit value and the terminating \0.
 */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
436 | ||
580fe4df CB |
437 | /* |
438 | * clone a task which switches to @task's namespace and writes '1'. | |
439 | * over a unix sock so we can read the task's reaper's pid in our | |
440 | * namespace | |
441 | * | |
442 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
443 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
444 | * the pidns and the parent pid outside are identical. Using clone prevents | |
445 | * this issue. | |
446 | */ | |
447 | static void write_task_init_pid_exit(int sock, pid_t target) | |
448 | { | |
05b7a16d | 449 | __do_close int fd = -EBADF; |
87f7558b | 450 | char path[LXCFS_PROC_PID_NS_LEN]; |
580fe4df | 451 | pid_t pid; |
87f7558b CB |
452 | |
453 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
454 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
455 | if (fd < 0) | |
456 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
457 | ||
458 | if (setns(fd, 0)) | |
459 | log_exit("Failed to setns to pid namespace of process %d", target); | |
460 | ||
35acc247 | 461 | pid = lxcfs_raw_clone(0, NULL); |
580fe4df | 462 | if (pid < 0) |
87f7558b CB |
463 | _exit(EXIT_FAILURE); |
464 | ||
35acc247 CB |
465 | if (pid == 0) { |
466 | if (!send_creds_ok(sock)) | |
87f7558b CB |
467 | _exit(EXIT_FAILURE); |
468 | ||
469 | _exit(EXIT_SUCCESS); | |
237e200e | 470 | } |
35acc247 CB |
471 | |
472 | if (!wait_for_pid(pid)) | |
473 | _exit(EXIT_FAILURE); | |
474 | ||
475 | _exit(EXIT_SUCCESS); | |
237e200e SH |
476 | } |
477 | ||
8a07696e | 478 | static pid_t scm_init_pid(pid_t task) |
237e200e | 479 | { |
580fe4df | 480 | char v = '0'; |
87f7558b | 481 | pid_t pid_ret = -1; |
dac3dc93 CB |
482 | struct ucred cred = { |
483 | .pid = -1, | |
484 | .uid = -1, | |
485 | .gid = -1, | |
486 | }; | |
87f7558b CB |
487 | pid_t pid; |
488 | int sock[2]; | |
237e200e | 489 | |
87f7558b | 490 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 491 | return -1; |
237e200e | 492 | |
580fe4df CB |
493 | pid = fork(); |
494 | if (pid < 0) | |
495 | goto out; | |
87f7558b CB |
496 | |
497 | if (pid == 0) { | |
580fe4df CB |
498 | close(sock[1]); |
499 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 500 | _exit(EXIT_SUCCESS); |
237e200e | 501 | } |
7213ec5c | 502 | |
580fe4df CB |
503 | if (!recv_creds(sock[1], &cred, &v)) |
504 | goto out; | |
87f7558b CB |
505 | |
506 | pid_ret = cred.pid; | |
237e200e | 507 | |
580fe4df CB |
508 | out: |
509 | close(sock[0]); | |
510 | close(sock[1]); | |
511 | if (pid > 0) | |
512 | wait_for_pid(pid); | |
237e200e | 513 | |
87f7558b CB |
514 | return pid_ret; |
515 | } | |
2aa59b2e CB |
516 | |
517 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 518 | { |
cfda2e8a | 519 | pid_t hashed_pid = 0; |
2aa59b2e CB |
520 | char path[LXCFS_PROC_PID_NS_LEN]; |
521 | struct stat st; | |
2aa59b2e CB |
522 | |
523 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
2aa59b2e | 524 | if (stat(path, &st)) |
4e1e4115 | 525 | return ret_errno(ESRCH); |
2aa59b2e | 526 | |
4e1e4115 | 527 | store_lock(); |
fcdedd16 | 528 | |
cfda2e8a CB |
529 | hashed_pid = lookup_verify_initpid(st.st_ino); |
530 | if (hashed_pid < 0) { | |
531 | /* release the mutex as the following call is expensive */ | |
532 | store_unlock(); | |
2aa59b2e | 533 | |
8a07696e | 534 | hashed_pid = scm_init_pid(pid); |
4e1e4115 | 535 | |
cfda2e8a | 536 | store_lock(); |
4e1e4115 | 537 | |
cfda2e8a CB |
538 | if (hashed_pid > 0) |
539 | save_initpid(st.st_ino, hashed_pid); | |
540 | } | |
b7672ded | 541 | |
2aa59b2e | 542 | /* |
cfda2e8a CB |
543 | * Prune at the end in case we're pruning the value |
544 | * we were about to return. | |
2aa59b2e | 545 | */ |
580fe4df | 546 | prune_initpid_store(); |
4e1e4115 | 547 | store_unlock(); |
2aa59b2e | 548 | |
cfda2e8a | 549 | return hashed_pid; |
237e200e SH |
550 | } |
551 | ||
29a73c2f CB |
552 | /* |
553 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
554 | */ |
555 | ||
29a73c2f CB |
556 | static bool umount_if_mounted(void) |
557 | { | |
558 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 559 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
560 | return false; |
561 | } | |
562 | return true; | |
563 | } | |
564 | ||
2283e240 CB |
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Returns true when the filesystem magic recorded in @fs equals @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic wanted = magic_val;

	if (fs->f_type != wanted)
		return false;

	return true;
}
571 | ||
0a4dea41 CB |
572 | /* |
573 | * looking at fs/proc_namespace.c, it appears we can | |
574 | * actually expect the rootfs entry to very specifically contain | |
575 | * " - rootfs rootfs " | |
576 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
577 | * the rootfs entry should always be skipped in mountinfo contents. | |
578 | */ | |
579 | static bool is_on_ramfs(void) | |
580 | { | |
87f7558b | 581 | __do_free char *line = NULL; |
757a63e7 | 582 | __do_free void *fopen_cache = NULL; |
87f7558b | 583 | __do_fclose FILE *f = NULL; |
0a4dea41 | 584 | size_t len = 0; |
0a4dea41 | 585 | |
757a63e7 | 586 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
587 | if (!f) |
588 | return false; | |
589 | ||
590 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
591 | int i; |
592 | char *p, *p2; | |
593 | ||
0a4dea41 CB |
594 | for (p = line, i = 0; p && i < 4; i++) |
595 | p = strchr(p + 1, ' '); | |
596 | if (!p) | |
597 | continue; | |
87f7558b | 598 | |
0a4dea41 CB |
599 | p2 = strchr(p + 1, ' '); |
600 | if (!p2) | |
601 | continue; | |
602 | *p2 = '\0'; | |
603 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 604 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 605 | p = strchr(p2 + 1, '-'); |
87f7558b | 606 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 607 | return true; |
0a4dea41 CB |
608 | } |
609 | } | |
87f7558b | 610 | |
0a4dea41 CB |
611 | return false; |
612 | } | |
613 | ||
9b96e96e | 614 | static int pivot_enter(void) |
0a4dea41 | 615 | { |
05b7a16d | 616 | __do_close int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 | 617 | |
3326c17e | 618 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
619 | if (oldroot < 0) |
620 | return log_error_errno(-1, errno, | |
621 | "Failed to open old root for fchdir"); | |
cc309f33 | 622 | |
3326c17e | 623 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
624 | if (newroot < 0) |
625 | return log_error_errno(-1, errno, | |
626 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
627 | |
628 | /* change into new root fs */ | |
87f7558b CB |
629 | if (fchdir(newroot) < 0) |
630 | return log_error_errno(-1, | |
631 | errno, "Failed to change directory to new rootfs: %s", | |
632 | ROOTDIR); | |
cc309f33 | 633 | |
0a4dea41 | 634 | /* pivot_root into our new root fs */ |
87f7558b CB |
635 | if (pivot_root(".", ".") < 0) |
636 | return log_error_errno(-1, errno, | |
637 | "pivot_root() syscall failed: %s", | |
638 | strerror(errno)); | |
0a4dea41 CB |
639 | |
640 | /* | |
641 | * At this point the old-root is mounted on top of our new-root. | |
642 | * To unmounted it we must not be chdir'd into it, so escape back | |
643 | * to the old-root. | |
644 | */ | |
87f7558b CB |
645 | if (fchdir(oldroot) < 0) |
646 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 647 | |
87f7558b CB |
648 | if (umount2(".", MNT_DETACH) < 0) |
649 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 650 | |
87f7558b CB |
651 | if (fchdir(newroot) < 0) |
652 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 653 | |
87f7558b | 654 | return 0; |
0a4dea41 CB |
655 | } |
656 | ||
/*
 * Enter the new root with chroot() instead of pivot_root(): ROOTDIR is
 * recursively bind-mounted onto "/", then the process chroots and changes
 * directory to "/".
 *
 * Returns 0 on success, -1 after logging the failing step.
 */
static int chroot_enter(void)
{
	if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
		lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
		return -1;
	}

	/*
	 * NOTE(review): chroot(".") targets the current working directory,
	 * not ROOTDIR explicitly - confirm the expected cwd at this point.
	 */
	if (chroot(".") < 0) {
		lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
		return -1;
	}

	if (chdir("/") < 0) {
		lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
		return -1;
	}

	return 0;
}
676 | ||
/*
 * Enter the prepared root: chroot_enter() when "/" is on a ramfs,
 * pivot_enter() otherwise.
 *
 * Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/self/mountinfo. */
	if (has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
699 | ||
700 | /* Prepare our new clean root. */ | |
0232cbac | 701 | static int permute_prepare(void) |
29a73c2f CB |
702 | { |
703 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 704 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
705 | return -1; |
706 | } | |
707 | ||
708 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 709 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
710 | return -1; |
711 | } | |
712 | ||
713 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 714 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
715 | return -1; |
716 | } | |
717 | ||
718 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 719 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
720 | return -1; |
721 | } | |
722 | ||
723 | return 0; | |
724 | } | |
725 | ||
0232cbac CB |
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 *
 * The new root is prepared first and only entered if that worked. Returns
 * true when both steps succeed.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
739 | ||
/*
 * Set up a private mount namespace holding an empty tmpfs on BASEDIR.
 *
 * Steps, in order: create BASEDIR, detach anything mounted there, unshare
 * the mount namespace, save a handle to the new namespace in
 * cgroup_ops->mntns_fd, make "/" recursively private and mount the tmpfs.
 *
 * Returns true on success, false after logging the failing step.
 */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* From here on getpid()'s "mnt" namespace is the freshly unshared one. */
	cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
	if (cgroup_ops->mntns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
775 | ||
0a4dea41 | 776 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 777 | { |
5fbea8a6 CB |
778 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
779 | return false; | |
51c7ca35 | 780 | |
5fbea8a6 CB |
781 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
782 | return false; | |
29a73c2f | 783 | |
5fbea8a6 CB |
784 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
785 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
786 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
787 | if ((*h)->fd < 0) | |
29a73c2f | 788 | return false; |
29a73c2f | 789 | } |
5fbea8a6 | 790 | |
29a73c2f CB |
791 | return true; |
792 | } | |
793 | ||
/*
 * Build lxcfs' private cgroup view: prepare the mount namespace, mount the
 * hierarchies into it and finally switch the root. Stops at the first step
 * that fails and returns false; returns true when all three succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
807 | ||
dee86006 | 808 | static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) |
b9b6bdc9 CB |
809 | { |
810 | int ret; | |
811 | ||
812 | if (reload_successful) { | |
813 | reload_successful = 0; | |
814 | ||
815 | /* write() is async signal safe */ | |
816 | ret = write(STDERR_FILENO, | |
817 | "Switched into non-virtualization mode\n", | |
818 | STRLITERALLEN("Switched into non-virtualization mode\n")); | |
819 | if (ret < 0) | |
820 | goto please_compiler; | |
821 | } else { | |
822 | reload_successful = 1; | |
823 | ||
824 | /* write() is async signal safe */ | |
825 | ret = write(STDERR_FILENO, "Switched into virtualization mode\n", | |
826 | STRLITERALLEN("Switched into virtualization mode\n")); | |
827 | if (ret < 0) | |
828 | goto please_compiler; | |
829 | } | |
830 | ||
831 | please_compiler: | |
832 | /* | |
833 | * The write() syscall is a function whose return value needs to be | |
4210ee1d CB |
834 | * checked. Otherwise the compiler will warn.Another one could be to |
835 | * use syscall(__NR_write, ...) directly but whatever. | |
b9b6bdc9 CB |
836 | */ |
837 | return; | |
838 | } | |
839 | ||
/*
 * Library constructor.
 *
 * Initializes cgroup support, builds the private cgroup mounts in a separate
 * mount namespace, switches back to the initial mount namespace, initializes
 * the CPU view, probes optional kernel features, changes directory to "/"
 * and installs the SIGUSR2 handler. reload_successful is set to 1 only when
 * the end is reached; the broken_upgrade path leaves it at 0.
 *
 * init_ns, root_fd and pidfd are closed automatically on return.
 */
static void __attribute__((constructor)) lxcfs_init(void)
{
	__do_close int init_ns = -EBADF, root_fd = -EBADF,
		       pidfd = -EBADF;
	int i = 0;
	pid_t pid;
	struct hierarchy *hierarchy;

	lxcfs_info("Running constructor %s to reload liblxcfs", __func__);

	cgroup_ops = cgroup_init();
	if (!cgroup_ops) {
		lxcfs_info("Failed to initialize cgroup support");
		goto broken_upgrade;
	}

	/* Preserve initial namespace. */
	pid = getpid();
	init_ns = preserve_ns(pid, "mnt");
	if (init_ns < 0) {
		lxcfs_info("Failed to preserve initial mount namespace");
		goto broken_upgrade;
	}

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		/*
		 * NOTE(review): log_exit() presumably terminates the process,
		 * which would make this and the following two gotos
		 * unreachable - confirm against its definition.
		 */
		log_exit("Failed to setup private cgroup mounts for lxcfs");
		goto broken_upgrade;
	}

	/* Return to the mount namespace we started in. */
	if (setns(init_ns, 0) < 0) {
		log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
		goto broken_upgrade;
	}

	if (!init_cpuview()) {
		log_exit("Failed to init CPU view");
		goto broken_upgrade;
	}

	lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd);
	lxcfs_info("hierarchies:");

	/* Log one line per hierarchy: index, fd and comma-joined controllers. */
	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
		char **controller_list = (*h)->controllers;
		__do_free char *controllers = NULL;
		if (controller_list && *controller_list)
			controllers = lxc_string_join(",", (const char **)controller_list, false);
		lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: "");
	}

	/* Probe pidfd support by opening and signalling our own pid. */
	pidfd = pidfd_open(pid, 0);
	if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) {
		can_use_pidfd = true;
		lxcfs_info("Kernel supports pidfds");
	}

	can_use_swap = cgroup_ops->can_use_swap(cgroup_ops);
	if (can_use_swap)
		lxcfs_info("Kernel supports swap accounting");
	else
		lxcfs_info("Kernel does not support swap accounting");

	hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory");
	memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy);

	lxcfs_info("api_extensions:");
	for (size_t nr = 0; nr < nr_api_extensions; nr++)
		lxcfs_info("- %s", api_extensions[nr]);

	/* Failing to reach "/" is only logged; it does not abort the constructor. */
	root_fd = open("/", O_PATH | O_CLOEXEC);
	if (root_fd < 0)
		lxcfs_info("%s - Failed to open root directory", strerror(errno));
	else if (fchdir(root_fd) < 0)
		lxcfs_info("%s - Failed to change to root directory", strerror(errno));

	if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) {
		lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno));
		goto broken_upgrade;
	}

	reload_successful = 1;
	return;

broken_upgrade:
	reload_successful = 0;
	lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__);
}
929 | ||
/*
 * Library destructor: empties the init pid cache, releases the CPU view and
 * hands cgroup_ops to cgroup_exit().
 */
static void __attribute__((destructor)) lxcfs_exit(void)
{
	lxcfs_info("Running destructor %s", __func__);

	clear_initpid_store();
	free_cpuview();
	cgroup_exit(cgroup_ops);
}
285aea40 | 938 | |
0d5383b7 | 939 | void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data) |
285aea40 CB |
940 | { |
941 | struct fuse_context *fc = fuse_get_context(); | |
888ab80a | 942 | #if HAVE_FUSE_RETURNS_DT_TYPE |
285aea40 | 943 | can_use_sys_cpu = true; |
888ab80a | 944 | #endif |
285aea40 CB |
945 | has_versioned_opts = true; |
946 | return fc->private_data; | |
947 | } |