]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
1f5596dd CB |
3 | #ifndef _GNU_SOURCE |
4 | #define _GNU_SOURCE | |
5 | #endif | |
6 | ||
f834b6bf SP |
7 | #include "config.h" |
8 | ||
237e200e | 9 | #include <dirent.h> |
29a73c2f | 10 | #include <errno.h> |
237e200e | 11 | #include <fcntl.h> |
0ecddf02 | 12 | #include <inttypes.h> |
237e200e | 13 | #include <libgen.h> |
dee86006 CB |
14 | #include <linux/magic.h> |
15 | #include <linux/sched.h> | |
237e200e | 16 | #include <pthread.h> |
29a73c2f | 17 | #include <sched.h> |
db1b32f6 | 18 | #include <stdarg.h> |
29a73c2f | 19 | #include <stdbool.h> |
0ecddf02 | 20 | #include <stdint.h> |
29a73c2f CB |
21 | #include <stdio.h> |
22 | #include <stdlib.h> | |
23 | #include <string.h> | |
29a73c2f CB |
24 | #include <sys/epoll.h> |
25 | #include <sys/mman.h> | |
26 | #include <sys/mount.h> | |
237e200e SH |
27 | #include <sys/param.h> |
28 | #include <sys/socket.h> | |
29a73c2f | 29 | #include <sys/syscall.h> |
0ecddf02 | 30 | #include <sys/sysinfo.h> |
d89504c4 | 31 | #include <sys/vfs.h> |
dee86006 CB |
32 | #include <time.h> |
33 | #include <unistd.h> | |
34 | #include <wait.h> | |
237e200e | 35 | |
237e200e | 36 | #include "bindings.h" |
e01afbb7 CB |
37 | |
38 | #include "api_extensions.h" | |
580fe4df | 39 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
40 | #include "cgroups/cgroup.h" |
41 | #include "cgroups/cgroup_utils.h" | |
c9236032 | 42 | #include "memory_utils.h" |
1f5596dd | 43 | #include "proc_cpuview.h" |
8364a99c | 44 | #include "syscall_numbers.h" |
1d81c6a6 | 45 | #include "utils.h" |
237e200e | 46 | |
/* Feature flags. They have static storage and therefore start out false. */
static bool can_use_pidfd;	/* set in lxcfs_init() after a pidfd round-trip worked */
static bool can_use_swap;	/* result of cgroup_ops->can_use_swap() in lxcfs_init() */
static bool can_use_sys_cpu;	/* set in lxcfs_fuse_init() */
static bool has_versioned_opts;	/* set in lxcfs_fuse_init() */

/*
 * Non-zero once the constructor completed successfully. The SIGUSR2 handler
 * flips it as well, which is why it is a volatile sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;
cbfc55fd CB |
53 | |
54 | bool liblxcfs_functional(void) | |
55 | { | |
b9b6bdc9 | 56 | return reload_successful != 0; |
cbfc55fd | 57 | } |
2aa59b2e | 58 | |
c6805016 CB |
59 | bool liblxcfs_can_use_swap(void) |
60 | { | |
61 | return can_use_swap; | |
62 | } | |
63 | ||
285aea40 CB |
64 | bool liblxcfs_can_use_sys_cpu(void) |
65 | { | |
66 | return can_use_sys_cpu; | |
67 | } | |
68 | ||
69 | bool liblxcfs_has_versioned_opts(void) | |
70 | { | |
71 | return has_versioned_opts; | |
72 | } | |
73 | ||
29a73c2f CB |
/*
 * pivot_root(): use the C library's symbol when the build detected one
 * (HAVE_PIVOT_ROOT), otherwise fall back to issuing the raw system call.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
83 | ||
237e200e SH |
/*
 * Cache that maps a pid namespace to the pid of its init process.
 *
 * Looking up the init pid for $qpid works as follows:
 * 1. stat() /proc/$qpid/ns/pid.
 * 2. Check whether that inode number is already in the store.
 *    a. If it is not, fork a child into qpid's namespace that sends us
 *       ucred.pid = 1, and read back the init pid. Cache the init pid
 *       and the creation time of /proc/<initpid> in a new store entry.
 *    b. If it is, verify that /proc/<initpid> still matches what was
 *       saved. If so, return the cached init pid; otherwise clear the
 *       store entry and continue with a.
 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* pid of init in that namespace */
	int init_pidfd;			/* pidfd for initpid; negative when none was opened */
	int64_t ctime;			/* st_ctime of /proc/$initpid when the entry was saved */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	int64_t lastcheck;		/* time() of insertion or of the last successful validation */
};
106 | ||
/*
 * Bucket count of the init pid cache and the (plain modulo) bucket selector.
 * The original note here points at how the kernel allocates these inode
 * numbers as the motivation for such a simple hash.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of singly-linked pidns_init_store entries and the mutex guarding them. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 113 | |
/* Lock @l; a non-zero pthread_mutex_lock() result is handed to log_exit(). */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
122 | ||
77f4399a | 123 | struct cgroup_ops *cgroup_ops; |
29a73c2f | 124 | |
/* Unlock @l; a non-zero pthread_mutex_unlock() result is handed to log_exit(). */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
133 | ||
4e1e4115 | 134 | static inline void store_lock(void) |
237e200e | 135 | { |
4e1e4115 | 136 | mutex_lock(&pidns_store_mutex); |
237e200e SH |
137 | } |
138 | ||
4e1e4115 | 139 | static inline void store_unlock(void) |
237e200e | 140 | { |
4e1e4115 | 141 | mutex_unlock(&pidns_store_mutex); |
237e200e SH |
142 | } |
143 | ||
2aa59b2e CB |
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"      STRLITERALLEN("/proc/")
 * + <pid-as-str>  INTTYPE_TO_STRLEN(uint64_t)
 * + '\0'          1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
152 | ||
bc189096 | 153 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 154 | { |
bc189096 | 155 | int ret; |
237e200e | 156 | |
bc189096 CB |
157 | if (entry->init_pidfd < 0) |
158 | return ret_errno(ENOSYS); | |
7dd6560a | 159 | |
bc189096 CB |
160 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
161 | if (ret < 0) { | |
162 | if (errno == ENOSYS) | |
163 | return ret_errno(ENOSYS); | |
7dd6560a | 164 | |
bc189096 | 165 | return 0; |
2aa59b2e CB |
166 | } |
167 | ||
bc189096 CB |
168 | return 1; |
169 | } | |
170 | ||
171 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
172 | { | |
173 | struct stat st; | |
174 | char path[LXCFS_PROC_PID_LEN]; | |
175 | ||
176 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
177 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
178 | return 0; | |
179 | ||
180 | return 1; | |
181 | } | |
182 | ||
/*
 * Decide whether a cached init pid is still usable. The pidfd probe is tried
 * first; only when it reports a negative result is the stat() based check
 * consulted.
 *
 * Must be called under store_lock.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int valid = initpid_still_valid_pidfd(entry);

	if (valid < 0)
		valid = initpid_still_valid_stat(entry);

	return valid == 1;
}
194 | ||
195 | /* Must be called under store_lock */ | |
2aa59b2e | 196 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 197 | { |
2aa59b2e CB |
198 | struct pidns_init_store *it; |
199 | int ino_hash; | |
237e200e | 200 | |
2aa59b2e CB |
201 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
202 | entry->initpid); | |
7dd6560a | 203 | |
2aa59b2e CB |
204 | ino_hash = HASH(entry->ino); |
205 | if (pidns_hash_table[ino_hash] == entry) { | |
206 | pidns_hash_table[ino_hash] = entry->next; | |
207 | close_prot_errno_disarm(entry->init_pidfd); | |
208 | free_disarm(entry); | |
237e200e SH |
209 | return; |
210 | } | |
211 | ||
2aa59b2e CB |
212 | it = pidns_hash_table[ino_hash]; |
213 | while (it) { | |
214 | if (it->next == entry) { | |
215 | it->next = entry->next; | |
216 | close_prot_errno_disarm(entry->init_pidfd); | |
217 | free_disarm(entry); | |
237e200e SH |
218 | return; |
219 | } | |
2aa59b2e | 220 | it = it->next; |
237e200e SH |
221 | } |
222 | } | |
223 | ||
224 | #define PURGE_SECS 5 | |
225 | /* Must be called under store_lock */ | |
226 | static void prune_initpid_store(void) | |
227 | { | |
1ba088ae CB |
228 | static int64_t last_prune = 0; |
229 | int64_t now, threshold; | |
237e200e SH |
230 | |
231 | if (!last_prune) { | |
232 | last_prune = time(NULL); | |
233 | return; | |
234 | } | |
2aa59b2e | 235 | |
237e200e | 236 | now = time(NULL); |
b18d6121 | 237 | if (now < (last_prune + PURGE_SECS)) |
237e200e | 238 | return; |
7dd6560a | 239 | |
2aa59b2e | 240 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 241 | |
237e200e SH |
242 | last_prune = now; |
243 | threshold = now - 2 * PURGE_SECS; | |
244 | ||
2aa59b2e CB |
245 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
246 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
247 | if (entry->lastcheck < threshold) { | |
248 | struct pidns_init_store *cur = entry; | |
7dd6560a | 249 | |
2aa59b2e | 250 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 251 | |
237e200e | 252 | if (prev) |
2aa59b2e | 253 | prev->next = entry->next; |
237e200e | 254 | else |
2aa59b2e CB |
255 | pidns_hash_table[i] = entry->next; |
256 | entry = entry->next; | |
257 | close_prot_errno_disarm(cur->init_pidfd); | |
258 | free_disarm(cur); | |
237e200e | 259 | } else { |
2aa59b2e CB |
260 | prev = entry; |
261 | entry = entry->next; | |
237e200e SH |
262 | } |
263 | } | |
264 | } | |
265 | } | |
266 | ||
c8f77ce4 CB |
267 | static void clear_initpid_store(void) |
268 | { | |
269 | store_lock(); | |
270 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { | |
271 | for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) { | |
272 | struct pidns_init_store *cur = entry; | |
273 | ||
274 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); | |
275 | ||
276 | pidns_hash_table[i] = entry->next; | |
277 | entry = entry->next; | |
278 | close_prot_errno_disarm(cur->init_pidfd); | |
279 | free_disarm(cur); | |
280 | } | |
281 | } | |
282 | store_unlock(); | |
283 | } | |
284 | ||
237e200e | 285 | /* Must be called under store_lock */ |
fcdedd16 | 286 | static void save_initpid(ino_t pidns_inode, pid_t pid) |
237e200e | 287 | { |
1e5d03fe | 288 | __do_free struct pidns_init_store *entry = NULL; |
05b7a16d | 289 | __do_close int pidfd = -EBADF; |
536620fd | 290 | const struct lxcfs_opts *opts = fuse_get_context()->private_data; |
2aa59b2e | 291 | char path[LXCFS_PROC_PID_LEN]; |
2aa59b2e CB |
292 | struct stat st; |
293 | int ino_hash; | |
294 | ||
9973cc06 | 295 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
296 | pidfd = pidfd_open(pid, 0); |
297 | if (pidfd < 0) | |
298 | return; | |
299 | } | |
237e200e | 300 | |
2aa59b2e CB |
301 | snprintf(path, sizeof(path), "/proc/%d", pid); |
302 | if (stat(path, &st)) | |
303 | return; | |
7dd6560a | 304 | |
5ec289bf | 305 | entry = zalloc(sizeof(*entry)); |
0eb3756b | 306 | if (!entry) |
237e200e | 307 | return; |
2aa59b2e | 308 | |
97017213 | 309 | ino_hash = HASH(pidns_inode); |
1e5d03fe | 310 | *entry = (struct pidns_init_store){ |
fcdedd16 | 311 | .ino = pidns_inode, |
1e5d03fe CB |
312 | .initpid = pid, |
313 | .ctime = st.st_ctime, | |
314 | .next = pidns_hash_table[ino_hash], | |
315 | .lastcheck = time(NULL), | |
316 | .init_pidfd = move_fd(pidfd), | |
317 | }; | |
318 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
319 | |
320 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
321 | } |
322 | ||
323 | /* | |
324 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
325 | * entry for the inode number and creation time. Verify that the init pid | |
326 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
327 | * otherwise. | |
328 | * Must be called under store_lock | |
329 | */ | |
cfda2e8a | 330 | static pid_t lookup_verify_initpid(ino_t pidns_inode) |
237e200e | 331 | { |
fcdedd16 | 332 | struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; |
2aa59b2e CB |
333 | |
334 | while (entry) { | |
fcdedd16 | 335 | if (entry->ino == pidns_inode) { |
2aa59b2e CB |
336 | if (initpid_still_valid(entry)) { |
337 | entry->lastcheck = time(NULL); | |
cfda2e8a | 338 | return entry->initpid; |
237e200e | 339 | } |
2aa59b2e CB |
340 | |
341 | remove_initpid(entry); | |
cfda2e8a | 342 | return ret_errno(ESRCH); |
237e200e | 343 | } |
2aa59b2e | 344 | entry = entry->next; |
237e200e SH |
345 | } |
346 | ||
cfda2e8a | 347 | return ret_errno(ESRCH); |
237e200e SH |
348 | } |
349 | ||
35acc247 | 350 | static bool send_creds_ok(int sock_fd) |
237e200e | 351 | { |
f1744de4 CB |
352 | char v = '1'; /* we are the child */ |
353 | struct ucred cred = { | |
354 | .uid = 0, | |
355 | .gid = 0, | |
356 | .pid = 1, | |
357 | }; | |
358 | ||
35acc247 | 359 | return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK; |
237e200e SH |
360 | } |
361 | ||
35acc247 | 362 | __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd) |
87f7558b | 363 | { |
35acc247 CB |
364 | /* |
365 | * These flags don't interest at all so we don't jump through any hoops | |
366 | * of retrieving them and passing them to the kernel. | |
367 | */ | |
368 | errno = EINVAL; | |
369 | if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | | |
370 | CLONE_CHILD_CLEARTID | CLONE_SETTLS))) | |
371 | return -EINVAL; | |
372 | ||
373 | #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) | |
374 | /* On s390/s390x and cris the order of the first and second arguments | |
375 | * of the system call is reversed. | |
376 | */ | |
377 | return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd); | |
378 | #elif defined(__sparc__) && defined(__arch64__) | |
379 | { | |
380 | /* | |
381 | * sparc64 always returns the other process id in %o0, and a | |
382 | * boolean flag whether this is the child or the parent in %o1. | |
383 | * Inline assembly is needed to get the flag returned in %o1. | |
384 | */ | |
385 | register long g1 asm("g1") = __NR_clone; | |
386 | register long o0 asm("o0") = flags | SIGCHLD; | |
387 | register long o1 asm("o1") = 0; /* is parent/child indicator */ | |
388 | register long o2 asm("o2") = (unsigned long)pidfd; | |
389 | long is_error, retval, in_child; | |
390 | pid_t child_pid; | |
391 | ||
392 | asm volatile( | |
393 | #if defined(__arch64__) | |
394 | "t 0x6d\n\t" /* 64-bit trap */ | |
395 | #else | |
396 | "t 0x10\n\t" /* 32-bit trap */ | |
397 | #endif | |
398 | /* | |
399 | * catch errors: On sparc, the carry bit (csr) in the | |
400 | * processor status register (psr) is used instead of a | |
401 | * full register. | |
402 | */ | |
403 | "addx %%g0, 0, %%g1" | |
404 | : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */ | |
405 | : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ | |
406 | : "%cc"); /* clobbers */ | |
407 | ||
408 | is_error = g1; | |
409 | retval = o0; | |
410 | in_child = o1; | |
411 | ||
412 | if (is_error) { | |
413 | errno = retval; | |
414 | return -1; | |
415 | } | |
87f7558b | 416 | |
35acc247 CB |
417 | if (in_child) |
418 | return 0; | |
87f7558b | 419 | |
35acc247 CB |
420 | child_pid = retval; |
421 | return child_pid; | |
422 | } | |
423 | #elif defined(__ia64__) | |
424 | /* On ia64 the stack and stack size are passed as separate arguments. */ | |
425 | return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd); | |
87f7558b | 426 | #else |
35acc247 | 427 | return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd); |
87f7558b | 428 | #endif |
87f7558b CB |
429 | } |
430 | ||
/*
 * Buffer size needed for a "/proc/<pid>/ns/pid" path: both literal parts, the
 * widest decimal rendering reserved via INTTYPE_TO_STRLEN(uint64_t), and the
 * terminating '\0'.
 */
#define LXCFS_PROC_PID_NS_LEN                                  \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
434 | ||
580fe4df CB |
435 | /* |
436 | * clone a task which switches to @task's namespace and writes '1'. | |
437 | * over a unix sock so we can read the task's reaper's pid in our | |
438 | * namespace | |
439 | * | |
440 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
441 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
442 | * the pidns and the parent pid outside are identical. Using clone prevents | |
443 | * this issue. | |
444 | */ | |
445 | static void write_task_init_pid_exit(int sock, pid_t target) | |
446 | { | |
05b7a16d | 447 | __do_close int fd = -EBADF; |
87f7558b | 448 | char path[LXCFS_PROC_PID_NS_LEN]; |
580fe4df | 449 | pid_t pid; |
87f7558b CB |
450 | |
451 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
452 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
453 | if (fd < 0) | |
454 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
455 | ||
456 | if (setns(fd, 0)) | |
457 | log_exit("Failed to setns to pid namespace of process %d", target); | |
458 | ||
35acc247 | 459 | pid = lxcfs_raw_clone(0, NULL); |
580fe4df | 460 | if (pid < 0) |
87f7558b CB |
461 | _exit(EXIT_FAILURE); |
462 | ||
35acc247 CB |
463 | if (pid == 0) { |
464 | if (!send_creds_ok(sock)) | |
87f7558b CB |
465 | _exit(EXIT_FAILURE); |
466 | ||
467 | _exit(EXIT_SUCCESS); | |
237e200e | 468 | } |
35acc247 CB |
469 | |
470 | if (!wait_for_pid(pid)) | |
471 | _exit(EXIT_FAILURE); | |
472 | ||
473 | _exit(EXIT_SUCCESS); | |
237e200e SH |
474 | } |
475 | ||
8a07696e | 476 | static pid_t scm_init_pid(pid_t task) |
237e200e | 477 | { |
580fe4df | 478 | char v = '0'; |
87f7558b | 479 | pid_t pid_ret = -1; |
dac3dc93 CB |
480 | struct ucred cred = { |
481 | .pid = -1, | |
482 | .uid = -1, | |
483 | .gid = -1, | |
484 | }; | |
87f7558b CB |
485 | pid_t pid; |
486 | int sock[2]; | |
237e200e | 487 | |
87f7558b | 488 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 489 | return -1; |
237e200e | 490 | |
580fe4df CB |
491 | pid = fork(); |
492 | if (pid < 0) | |
493 | goto out; | |
87f7558b CB |
494 | |
495 | if (pid == 0) { | |
580fe4df CB |
496 | close(sock[1]); |
497 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 498 | _exit(EXIT_SUCCESS); |
237e200e | 499 | } |
7213ec5c | 500 | |
580fe4df CB |
501 | if (!recv_creds(sock[1], &cred, &v)) |
502 | goto out; | |
87f7558b CB |
503 | |
504 | pid_ret = cred.pid; | |
237e200e | 505 | |
580fe4df CB |
506 | out: |
507 | close(sock[0]); | |
508 | close(sock[1]); | |
509 | if (pid > 0) | |
510 | wait_for_pid(pid); | |
237e200e | 511 | |
87f7558b CB |
512 | return pid_ret; |
513 | } | |
2aa59b2e CB |
514 | |
515 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 516 | { |
cfda2e8a | 517 | pid_t hashed_pid = 0; |
2aa59b2e CB |
518 | char path[LXCFS_PROC_PID_NS_LEN]; |
519 | struct stat st; | |
2aa59b2e CB |
520 | |
521 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
2aa59b2e | 522 | if (stat(path, &st)) |
4e1e4115 | 523 | return ret_errno(ESRCH); |
2aa59b2e | 524 | |
4e1e4115 | 525 | store_lock(); |
fcdedd16 | 526 | |
cfda2e8a CB |
527 | hashed_pid = lookup_verify_initpid(st.st_ino); |
528 | if (hashed_pid < 0) { | |
529 | /* release the mutex as the following call is expensive */ | |
530 | store_unlock(); | |
2aa59b2e | 531 | |
8a07696e | 532 | hashed_pid = scm_init_pid(pid); |
4e1e4115 | 533 | |
cfda2e8a | 534 | store_lock(); |
4e1e4115 | 535 | |
cfda2e8a CB |
536 | if (hashed_pid > 0) |
537 | save_initpid(st.st_ino, hashed_pid); | |
538 | } | |
b7672ded | 539 | |
2aa59b2e | 540 | /* |
cfda2e8a CB |
541 | * Prune at the end in case we're pruning the value |
542 | * we were about to return. | |
2aa59b2e | 543 | */ |
580fe4df | 544 | prune_initpid_store(); |
4e1e4115 | 545 | store_unlock(); |
2aa59b2e | 546 | |
cfda2e8a | 547 | return hashed_pid; |
237e200e SH |
548 | } |
549 | ||
29a73c2f CB |
550 | /* |
551 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
552 | */ |
553 | ||
29a73c2f CB |
554 | static bool umount_if_mounted(void) |
555 | { | |
556 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 557 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
558 | return false; |
559 | } | |
560 | return true; | |
561 | } | |
562 | ||
2283e240 CB |
/*
 * The type of statfs::f_type differs between platforms, so derive it from the
 * struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
569 | ||
0a4dea41 CB |
570 | /* |
571 | * looking at fs/proc_namespace.c, it appears we can | |
572 | * actually expect the rootfs entry to very specifically contain | |
573 | * " - rootfs rootfs " | |
574 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
575 | * the rootfs entry should always be skipped in mountinfo contents. | |
576 | */ | |
577 | static bool is_on_ramfs(void) | |
578 | { | |
87f7558b | 579 | __do_free char *line = NULL; |
757a63e7 | 580 | __do_free void *fopen_cache = NULL; |
87f7558b | 581 | __do_fclose FILE *f = NULL; |
0a4dea41 | 582 | size_t len = 0; |
0a4dea41 | 583 | |
757a63e7 | 584 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
585 | if (!f) |
586 | return false; | |
587 | ||
588 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
589 | int i; |
590 | char *p, *p2; | |
591 | ||
0a4dea41 CB |
592 | for (p = line, i = 0; p && i < 4; i++) |
593 | p = strchr(p + 1, ' '); | |
594 | if (!p) | |
595 | continue; | |
87f7558b | 596 | |
0a4dea41 CB |
597 | p2 = strchr(p + 1, ' '); |
598 | if (!p2) | |
599 | continue; | |
600 | *p2 = '\0'; | |
601 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 602 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 603 | p = strchr(p2 + 1, '-'); |
87f7558b | 604 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 605 | return true; |
0a4dea41 CB |
606 | } |
607 | } | |
87f7558b | 608 | |
0a4dea41 CB |
609 | return false; |
610 | } | |
611 | ||
9b96e96e | 612 | static int pivot_enter(void) |
0a4dea41 | 613 | { |
05b7a16d | 614 | __do_close int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 | 615 | |
3326c17e | 616 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
617 | if (oldroot < 0) |
618 | return log_error_errno(-1, errno, | |
619 | "Failed to open old root for fchdir"); | |
cc309f33 | 620 | |
3326c17e | 621 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
622 | if (newroot < 0) |
623 | return log_error_errno(-1, errno, | |
624 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
625 | |
626 | /* change into new root fs */ | |
87f7558b CB |
627 | if (fchdir(newroot) < 0) |
628 | return log_error_errno(-1, | |
629 | errno, "Failed to change directory to new rootfs: %s", | |
630 | ROOTDIR); | |
cc309f33 | 631 | |
0a4dea41 | 632 | /* pivot_root into our new root fs */ |
87f7558b CB |
633 | if (pivot_root(".", ".") < 0) |
634 | return log_error_errno(-1, errno, | |
635 | "pivot_root() syscall failed: %s", | |
636 | strerror(errno)); | |
0a4dea41 CB |
637 | |
638 | /* | |
639 | * At this point the old-root is mounted on top of our new-root. | |
640 | * To unmounted it we must not be chdir'd into it, so escape back | |
641 | * to the old-root. | |
642 | */ | |
87f7558b CB |
643 | if (fchdir(oldroot) < 0) |
644 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 645 | |
87f7558b CB |
646 | if (umount2(".", MNT_DETACH) < 0) |
647 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 648 | |
87f7558b CB |
649 | if (fchdir(newroot) < 0) |
650 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 651 | |
87f7558b | 652 | return 0; |
0a4dea41 CB |
653 | } |
654 | ||
9b96e96e | 655 | static int chroot_enter(void) |
0a4dea41 CB |
656 | { |
657 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
658 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
659 | return -1; | |
660 | } | |
661 | ||
662 | if (chroot(".") < 0) { | |
663 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
664 | return -1; | |
665 | } | |
666 | ||
667 | if (chdir("/") < 0) { | |
668 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
669 | return -1; | |
670 | } | |
671 | ||
672 | return 0; | |
673 | } | |
674 | ||
/*
 * Switch into the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise. Returns 0 on success, -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_sfs;

	if (statfs("/", &root_sfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still let
	 * is_on_ramfs() check the mountinfo contents.
	 */
	if (has_fs_type(&root_sfs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
697 | ||
698 | /* Prepare our new clean root. */ | |
0232cbac | 699 | static int permute_prepare(void) |
29a73c2f CB |
700 | { |
701 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 702 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
703 | return -1; |
704 | } | |
705 | ||
706 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 707 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
708 | return -1; |
709 | } | |
710 | ||
711 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 712 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
713 | return -1; |
714 | } | |
715 | ||
716 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 717 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
718 | return -1; |
719 | } | |
720 | ||
721 | return 0; | |
722 | } | |
723 | ||
0232cbac CB |
/*
 * Prepare the new root and enter it. Entering uses chroot() on ramfs and
 * pivot_root() in all other cases. The second step is only attempted when
 * the preparation succeeded.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
737 | ||
0a4dea41 | 738 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
739 | { |
740 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 741 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
742 | return false; |
743 | } | |
480262c9 | 744 | |
29a73c2f | 745 | if (!umount_if_mounted()) { |
b8defc3d | 746 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
747 | return false; |
748 | } | |
749 | ||
750 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 751 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
752 | return false; |
753 | } | |
754 | ||
1d81c6a6 | 755 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 756 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
757 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
758 | return false; | |
759 | } | |
760 | ||
480262c9 | 761 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 762 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
763 | return false; |
764 | } | |
480262c9 | 765 | |
29a73c2f | 766 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 767 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
768 | return false; |
769 | } | |
480262c9 | 770 | |
29a73c2f CB |
771 | return true; |
772 | } | |
773 | ||
0a4dea41 | 774 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 775 | { |
5fbea8a6 CB |
776 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
777 | return false; | |
51c7ca35 | 778 | |
5fbea8a6 CB |
779 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
780 | return false; | |
29a73c2f | 781 | |
5fbea8a6 CB |
782 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
783 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
784 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
785 | if ((*h)->fd < 0) | |
29a73c2f | 786 | return false; |
29a73c2f | 787 | } |
5fbea8a6 | 788 | |
29a73c2f CB |
789 | return true; |
790 | } | |
791 | ||
/*
 * Run the three setup stages for lxcfs' private cgroup mounts in order:
 * prepare the mount namespace, mount the hierarchies, switch the root.
 * Stops at, and returns false for, the first stage that fails.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root() ? true : false;
}
805 | ||
dee86006 | 806 | static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) |
b9b6bdc9 CB |
807 | { |
808 | int ret; | |
809 | ||
810 | if (reload_successful) { | |
811 | reload_successful = 0; | |
812 | ||
813 | /* write() is async signal safe */ | |
814 | ret = write(STDERR_FILENO, | |
815 | "Switched into non-virtualization mode\n", | |
816 | STRLITERALLEN("Switched into non-virtualization mode\n")); | |
817 | if (ret < 0) | |
818 | goto please_compiler; | |
819 | } else { | |
820 | reload_successful = 1; | |
821 | ||
822 | /* write() is async signal safe */ | |
823 | ret = write(STDERR_FILENO, "Switched into virtualization mode\n", | |
824 | STRLITERALLEN("Switched into virtualization mode\n")); | |
825 | if (ret < 0) | |
826 | goto please_compiler; | |
827 | } | |
828 | ||
829 | please_compiler: | |
830 | /* | |
831 | * The write() syscall is a function whose return value needs to be | |
4210ee1d CB |
832 | * checked. Otherwise the compiler will warn.Another one could be to |
833 | * use syscall(__NR_write, ...) directly but whatever. | |
b9b6bdc9 CB |
834 | */ |
835 | return; | |
836 | } | |
837 | ||
2243c5a9 | 838 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 839 | { |
05b7a16d | 840 | __do_close int init_ns = -EBADF, root_fd = -EBADF, |
de69569b | 841 | pidfd = -EBADF; |
4ec5c9da | 842 | int i = 0; |
2aa59b2e | 843 | pid_t pid; |
237e200e | 844 | |
c2357135 | 845 | lxcfs_info("Running constructor %s to reload liblxcfs", __func__); |
cc42d0c7 | 846 | |
5fbea8a6 | 847 | cgroup_ops = cgroup_init(); |
c2357135 CB |
848 | if (!cgroup_ops) { |
849 | lxcfs_info("Failed to initialize cgroup support"); | |
850 | goto broken_upgrade; | |
851 | } | |
237e200e | 852 | |
480262c9 | 853 | /* Preserve initial namespace. */ |
2aa59b2e CB |
854 | pid = getpid(); |
855 | init_ns = preserve_ns(pid, "mnt"); | |
c2357135 CB |
856 | if (init_ns < 0) { |
857 | lxcfs_info("Failed to preserve initial mount namespace"); | |
858 | goto broken_upgrade; | |
859 | } | |
480262c9 | 860 | |
480262c9 CB |
861 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
862 | * to privately mount lxcfs cgroups. */ | |
c2357135 | 863 | if (!cgfs_setup_controllers()) { |
2243c5a9 | 864 | log_exit("Failed to setup private cgroup mounts for lxcfs"); |
c2357135 CB |
865 | goto broken_upgrade; |
866 | } | |
480262c9 | 867 | |
c2357135 | 868 | if (setns(init_ns, 0) < 0) { |
2243c5a9 | 869 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); |
c2357135 CB |
870 | goto broken_upgrade; |
871 | } | |
29a73c2f | 872 | |
c2357135 | 873 | if (!init_cpuview()) { |
2243c5a9 | 874 | log_exit("Failed to init CPU view"); |
c2357135 CB |
875 | goto broken_upgrade; |
876 | } | |
056adcef | 877 | |
cc42d0c7 CB |
878 | lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); |
879 | lxcfs_info("hierarchies:"); | |
4ec5c9da CB |
880 | |
881 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
cc42d0c7 CB |
882 | char **controller_list = (*h)->controllers; |
883 | __do_free char *controllers = NULL; | |
884 | if (controller_list && *controller_list) | |
885 | controllers = lxc_string_join(",", (const char **)controller_list, false); | |
886 | lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); | |
4ec5c9da | 887 | } |
2aa59b2e CB |
888 | |
889 | pidfd = pidfd_open(pid, 0); | |
890 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
891 | can_use_pidfd = true; | |
cc42d0c7 | 892 | lxcfs_info("Kernel supports pidfds"); |
2aa59b2e | 893 | } |
ce8fc84c | 894 | |
c6805016 CB |
895 | can_use_swap = cgroup_ops->can_use_swap(cgroup_ops); |
896 | if (can_use_swap) | |
897 | lxcfs_info("Kernel supports swap accounting"); | |
898 | else | |
899 | lxcfs_info("Kernel does not support swap accounting"); | |
900 | ||
cc42d0c7 | 901 | lxcfs_info("api_extensions:"); |
3cf1e562 CB |
902 | for (size_t nr = 0; nr < nr_api_extensions; nr++) |
903 | lxcfs_info("- %s", api_extensions[nr]); | |
de69569b CB |
904 | |
905 | root_fd = open("/", O_PATH | O_CLOEXEC); | |
c2357135 CB |
906 | if (root_fd < 0) |
907 | lxcfs_info("%s - Failed to open root directory", strerror(errno)); | |
908 | else if (fchdir(root_fd) < 0) | |
909 | lxcfs_info("%s - Failed to change to root directory", strerror(errno)); | |
910 | ||
dee86006 CB |
911 | if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) { |
912 | lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno)); | |
b9b6bdc9 | 913 | goto broken_upgrade; |
dee86006 | 914 | } |
b9b6bdc9 CB |
915 | |
916 | reload_successful = 1; | |
c2357135 | 917 | return; |
de69569b | 918 | |
c2357135 | 919 | broken_upgrade: |
b9b6bdc9 | 920 | reload_successful = 0; |
c2357135 | 921 | lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__); |
237e200e SH |
922 | } |
923 | ||
2243c5a9 | 924 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 925 | { |
cc42d0c7 CB |
926 | lxcfs_info("Running destructor %s", __func__); |
927 | ||
c8f77ce4 | 928 | clear_initpid_store(); |
056adcef | 929 | free_cpuview(); |
2243c5a9 | 930 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 931 | } |
285aea40 | 932 | |
0d5383b7 | 933 | void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data) |
285aea40 CB |
934 | { |
935 | struct fuse_context *fc = fuse_get_context(); | |
936 | can_use_sys_cpu = true; | |
937 | has_versioned_opts = true; | |
938 | return fc->private_data; | |
939 | } |