]>
Commit | Line | Data |
---|---|---|
db0463bf | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
237e200e | 2 | |
f834b6bf SP |
3 | #include "config.h" |
4 | ||
237e200e | 5 | #include <dirent.h> |
29a73c2f | 6 | #include <errno.h> |
237e200e | 7 | #include <fcntl.h> |
0ecddf02 | 8 | #include <inttypes.h> |
237e200e | 9 | #include <libgen.h> |
dee86006 CB |
10 | #include <linux/magic.h> |
11 | #include <linux/sched.h> | |
237e200e | 12 | #include <pthread.h> |
29a73c2f | 13 | #include <sched.h> |
db1b32f6 | 14 | #include <stdarg.h> |
29a73c2f | 15 | #include <stdbool.h> |
0ecddf02 | 16 | #include <stdint.h> |
29a73c2f CB |
17 | #include <stdio.h> |
18 | #include <stdlib.h> | |
19 | #include <string.h> | |
29a73c2f CB |
20 | #include <sys/epoll.h> |
21 | #include <sys/mman.h> | |
22 | #include <sys/mount.h> | |
237e200e SH |
23 | #include <sys/param.h> |
24 | #include <sys/socket.h> | |
29a73c2f | 25 | #include <sys/syscall.h> |
0ecddf02 | 26 | #include <sys/sysinfo.h> |
d89504c4 | 27 | #include <sys/vfs.h> |
dee86006 CB |
28 | #include <time.h> |
29 | #include <unistd.h> | |
30 | #include <wait.h> | |
237e200e | 31 | |
237e200e | 32 | #include "bindings.h" |
e01afbb7 CB |
33 | |
34 | #include "api_extensions.h" | |
580fe4df | 35 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
36 | #include "cgroups/cgroup.h" |
37 | #include "cgroups/cgroup_utils.h" | |
c9236032 | 38 | #include "memory_utils.h" |
1f5596dd | 39 | #include "proc_cpuview.h" |
8364a99c | 40 | #include "syscall_numbers.h" |
1d81c6a6 | 41 | #include "utils.h" |
237e200e | 42 | |
/* Set by the constructor once pidfd_open()/pidfd_send_signal() were seen to work. */
static bool can_use_pidfd;
/* Set by the constructor from cgroup_ops->can_use_swap(). */
static bool can_use_swap;
/* Both are switched on by lxcfs_fuse_init(). */
static bool can_use_sys_cpu;
static bool has_versioned_opts;

/*
 * 1 after the constructor finished successfully, 0 otherwise. Also flipped
 * from the SIGUSR2 handler, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t reload_successful;
cbfc55fd CB |
49 | |
50 | bool liblxcfs_functional(void) | |
51 | { | |
b9b6bdc9 | 52 | return reload_successful != 0; |
cbfc55fd | 53 | } |
2aa59b2e | 54 | |
c6805016 CB |
55 | bool liblxcfs_can_use_swap(void) |
56 | { | |
57 | return can_use_swap; | |
58 | } | |
59 | ||
285aea40 CB |
60 | bool liblxcfs_can_use_sys_cpu(void) |
61 | { | |
62 | return can_use_sys_cpu; | |
63 | } | |
64 | ||
65 | bool liblxcfs_has_versioned_opts(void) | |
66 | { | |
67 | return has_versioned_opts; | |
68 | } | |
69 | ||
/*
 * pivot_root(): use the C library's declaration when the build found one
 * (HAVE_PIVOT_ROOT), otherwise fall back to issuing the system call directly.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
	return syscall(__NR_pivot_root, new_root, put_old);
}
#endif
79 | ||
/*
 * Cache mapping a pid namespace to the pid of its init process.
 *
 * Looking up the init pid for $qpid works as follows:
 * 1. stat() /proc/$qpid/ns/pid.
 * 2. Check whether that inode number is already in the store.
 *    a. If it is not, fork a child into $qpid's pid namespace that sends us
 *       ucred.pid = 1, and read the resulting init pid. Cache the init pid
 *       together with the creation time of /proc/initpid in a new entry.
 *    b. If it is, verify that /proc/initpid still matches what was saved.
 *       If it does, return the cached init pid; if not, drop the entry and
 *       continue with a.
 */
struct pidns_init_store {
	ino_t ino;     /* inode number for /proc/$pid/ns/pid */
	pid_t initpid; /* the pid of init in that ns */
	int init_pidfd; /* pidfd for initpid, or negative when none is held */
	int64_t ctime; /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next; /* next entry in the same hash bucket */
	int64_t lastcheck; /* time of the last successful validation */
};
102 | ||
/*
 * Fixed-size, chained hash table keyed by pid namespace inode number
 * (compare how the kernel allocates these).
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads; every access happens with pidns_store_mutex held. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
4ec5c9da | 109 | |
/* Acquire @l; any pthread error is reported through log_exit(). */
static void mutex_lock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
118 | ||
/*
 * Global cgroup driver handle: assigned from cgroup_init() in the constructor
 * and handed to cgroup_exit() in the destructor.
 */
struct cgroup_ops *cgroup_ops;
29a73c2f | 120 | |
/* Release @l; any pthread error is reported through log_exit(). */
static void mutex_unlock(pthread_mutex_t *l)
{
	const int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
129 | ||
4e1e4115 | 130 | static inline void store_lock(void) |
237e200e | 131 | { |
4e1e4115 | 132 | mutex_lock(&pidns_store_mutex); |
237e200e SH |
133 | } |
134 | ||
4e1e4115 | 135 | static inline void store_unlock(void) |
237e200e | 136 | { |
4e1e4115 | 137 | mutex_unlock(&pidns_store_mutex); |
237e200e SH |
138 | } |
139 | ||
/*
 * Buffer size needed for a "/proc/<pid>" path:
 *
 *   "/proc/"       STRLITERALLEN("/proc/")
 * + <pid-as-str>   INTTYPE_TO_STRLEN(uint64_t) (sized for uint64_t, not pid_t)
 * + "\0"           1
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
148 | ||
bc189096 | 149 | static int initpid_still_valid_pidfd(struct pidns_init_store *entry) |
237e200e | 150 | { |
bc189096 | 151 | int ret; |
237e200e | 152 | |
bc189096 CB |
153 | if (entry->init_pidfd < 0) |
154 | return ret_errno(ENOSYS); | |
7dd6560a | 155 | |
bc189096 CB |
156 | ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); |
157 | if (ret < 0) { | |
158 | if (errno == ENOSYS) | |
159 | return ret_errno(ENOSYS); | |
7dd6560a | 160 | |
bc189096 | 161 | return 0; |
2aa59b2e CB |
162 | } |
163 | ||
bc189096 CB |
164 | return 1; |
165 | } | |
166 | ||
167 | static int initpid_still_valid_stat(struct pidns_init_store *entry) | |
168 | { | |
169 | struct stat st; | |
170 | char path[LXCFS_PROC_PID_LEN]; | |
171 | ||
172 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); | |
173 | if (stat(path, &st) || entry->ctime != st.st_ctime) | |
174 | return 0; | |
175 | ||
176 | return 1; | |
177 | } | |
178 | ||
/*
 * Must be called under store_lock.
 *
 * Tries the pidfd based check first and only falls back to the stat() based
 * one when the former returns a negative value.
 */
static bool initpid_still_valid(struct pidns_init_store *entry)
{
	int valid = initpid_still_valid_pidfd(entry);

	if (valid < 0)
		valid = initpid_still_valid_stat(entry);

	return valid == 1;
}
190 | ||
191 | /* Must be called under store_lock */ | |
2aa59b2e | 192 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 193 | { |
2aa59b2e CB |
194 | struct pidns_init_store *it; |
195 | int ino_hash; | |
237e200e | 196 | |
2aa59b2e CB |
197 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
198 | entry->initpid); | |
7dd6560a | 199 | |
2aa59b2e CB |
200 | ino_hash = HASH(entry->ino); |
201 | if (pidns_hash_table[ino_hash] == entry) { | |
202 | pidns_hash_table[ino_hash] = entry->next; | |
203 | close_prot_errno_disarm(entry->init_pidfd); | |
204 | free_disarm(entry); | |
237e200e SH |
205 | return; |
206 | } | |
207 | ||
2aa59b2e CB |
208 | it = pidns_hash_table[ino_hash]; |
209 | while (it) { | |
210 | if (it->next == entry) { | |
211 | it->next = entry->next; | |
212 | close_prot_errno_disarm(entry->init_pidfd); | |
213 | free_disarm(entry); | |
237e200e SH |
214 | return; |
215 | } | |
2aa59b2e | 216 | it = it->next; |
237e200e SH |
217 | } |
218 | } | |
219 | ||
220 | #define PURGE_SECS 5 | |
221 | /* Must be called under store_lock */ | |
222 | static void prune_initpid_store(void) | |
223 | { | |
1ba088ae CB |
224 | static int64_t last_prune = 0; |
225 | int64_t now, threshold; | |
237e200e SH |
226 | |
227 | if (!last_prune) { | |
228 | last_prune = time(NULL); | |
229 | return; | |
230 | } | |
2aa59b2e | 231 | |
237e200e | 232 | now = time(NULL); |
b18d6121 | 233 | if (now < (last_prune + PURGE_SECS)) |
237e200e | 234 | return; |
7dd6560a | 235 | |
2aa59b2e | 236 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 237 | |
237e200e SH |
238 | last_prune = now; |
239 | threshold = now - 2 * PURGE_SECS; | |
240 | ||
2aa59b2e CB |
241 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
242 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
243 | if (entry->lastcheck < threshold) { | |
244 | struct pidns_init_store *cur = entry; | |
7dd6560a | 245 | |
2aa59b2e | 246 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 247 | |
237e200e | 248 | if (prev) |
2aa59b2e | 249 | prev->next = entry->next; |
237e200e | 250 | else |
2aa59b2e CB |
251 | pidns_hash_table[i] = entry->next; |
252 | entry = entry->next; | |
253 | close_prot_errno_disarm(cur->init_pidfd); | |
254 | free_disarm(cur); | |
237e200e | 255 | } else { |
2aa59b2e CB |
256 | prev = entry; |
257 | entry = entry->next; | |
237e200e SH |
258 | } |
259 | } | |
260 | } | |
261 | } | |
262 | ||
c8f77ce4 CB |
263 | static void clear_initpid_store(void) |
264 | { | |
265 | store_lock(); | |
266 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { | |
267 | for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) { | |
268 | struct pidns_init_store *cur = entry; | |
269 | ||
270 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); | |
271 | ||
272 | pidns_hash_table[i] = entry->next; | |
273 | entry = entry->next; | |
274 | close_prot_errno_disarm(cur->init_pidfd); | |
275 | free_disarm(cur); | |
276 | } | |
277 | } | |
278 | store_unlock(); | |
279 | } | |
280 | ||
237e200e | 281 | /* Must be called under store_lock */ |
fcdedd16 | 282 | static void save_initpid(ino_t pidns_inode, pid_t pid) |
237e200e | 283 | { |
1e5d03fe | 284 | __do_free struct pidns_init_store *entry = NULL; |
05b7a16d | 285 | __do_close int pidfd = -EBADF; |
536620fd | 286 | const struct lxcfs_opts *opts = fuse_get_context()->private_data; |
2aa59b2e | 287 | char path[LXCFS_PROC_PID_LEN]; |
2aa59b2e CB |
288 | struct stat st; |
289 | int ino_hash; | |
290 | ||
9973cc06 | 291 | if (opts && opts->use_pidfd && can_use_pidfd) { |
2aa59b2e CB |
292 | pidfd = pidfd_open(pid, 0); |
293 | if (pidfd < 0) | |
294 | return; | |
295 | } | |
237e200e | 296 | |
2aa59b2e CB |
297 | snprintf(path, sizeof(path), "/proc/%d", pid); |
298 | if (stat(path, &st)) | |
299 | return; | |
7dd6560a | 300 | |
5ec289bf | 301 | entry = zalloc(sizeof(*entry)); |
0eb3756b | 302 | if (!entry) |
237e200e | 303 | return; |
2aa59b2e | 304 | |
97017213 | 305 | ino_hash = HASH(pidns_inode); |
1e5d03fe | 306 | *entry = (struct pidns_init_store){ |
fcdedd16 | 307 | .ino = pidns_inode, |
1e5d03fe CB |
308 | .initpid = pid, |
309 | .ctime = st.st_ctime, | |
310 | .next = pidns_hash_table[ino_hash], | |
311 | .lastcheck = time(NULL), | |
312 | .init_pidfd = move_fd(pidfd), | |
313 | }; | |
314 | pidns_hash_table[ino_hash] = move_ptr(entry); | |
2aa59b2e CB |
315 | |
316 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
317 | } |
318 | ||
319 | /* | |
320 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
321 | * entry for the inode number and creation time. Verify that the init pid | |
322 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
323 | * otherwise. | |
324 | * Must be called under store_lock | |
325 | */ | |
cfda2e8a | 326 | static pid_t lookup_verify_initpid(ino_t pidns_inode) |
237e200e | 327 | { |
fcdedd16 | 328 | struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; |
2aa59b2e CB |
329 | |
330 | while (entry) { | |
fcdedd16 | 331 | if (entry->ino == pidns_inode) { |
2aa59b2e CB |
332 | if (initpid_still_valid(entry)) { |
333 | entry->lastcheck = time(NULL); | |
cfda2e8a | 334 | return entry->initpid; |
237e200e | 335 | } |
2aa59b2e CB |
336 | |
337 | remove_initpid(entry); | |
cfda2e8a | 338 | return ret_errno(ESRCH); |
237e200e | 339 | } |
2aa59b2e | 340 | entry = entry->next; |
237e200e SH |
341 | } |
342 | ||
cfda2e8a | 343 | return ret_errno(ESRCH); |
237e200e SH |
344 | } |
345 | ||
35acc247 | 346 | static bool send_creds_ok(int sock_fd) |
237e200e | 347 | { |
f1744de4 CB |
348 | char v = '1'; /* we are the child */ |
349 | struct ucred cred = { | |
350 | .uid = 0, | |
351 | .gid = 0, | |
352 | .pid = 1, | |
353 | }; | |
354 | ||
35acc247 | 355 | return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK; |
237e200e SH |
356 | } |
357 | ||
35acc247 | 358 | __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd) |
87f7558b | 359 | { |
35acc247 CB |
360 | /* |
361 | * These flags don't interest at all so we don't jump through any hoops | |
362 | * of retrieving them and passing them to the kernel. | |
363 | */ | |
364 | errno = EINVAL; | |
365 | if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | | |
366 | CLONE_CHILD_CLEARTID | CLONE_SETTLS))) | |
367 | return -EINVAL; | |
368 | ||
369 | #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) | |
370 | /* On s390/s390x and cris the order of the first and second arguments | |
371 | * of the system call is reversed. | |
372 | */ | |
373 | return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd); | |
374 | #elif defined(__sparc__) && defined(__arch64__) | |
375 | { | |
376 | /* | |
377 | * sparc64 always returns the other process id in %o0, and a | |
378 | * boolean flag whether this is the child or the parent in %o1. | |
379 | * Inline assembly is needed to get the flag returned in %o1. | |
380 | */ | |
381 | register long g1 asm("g1") = __NR_clone; | |
382 | register long o0 asm("o0") = flags | SIGCHLD; | |
383 | register long o1 asm("o1") = 0; /* is parent/child indicator */ | |
384 | register long o2 asm("o2") = (unsigned long)pidfd; | |
385 | long is_error, retval, in_child; | |
386 | pid_t child_pid; | |
387 | ||
388 | asm volatile( | |
389 | #if defined(__arch64__) | |
390 | "t 0x6d\n\t" /* 64-bit trap */ | |
391 | #else | |
392 | "t 0x10\n\t" /* 32-bit trap */ | |
393 | #endif | |
394 | /* | |
395 | * catch errors: On sparc, the carry bit (csr) in the | |
396 | * processor status register (psr) is used instead of a | |
397 | * full register. | |
398 | */ | |
399 | "addx %%g0, 0, %%g1" | |
400 | : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */ | |
401 | : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ | |
402 | : "%cc"); /* clobbers */ | |
403 | ||
404 | is_error = g1; | |
405 | retval = o0; | |
406 | in_child = o1; | |
407 | ||
408 | if (is_error) { | |
409 | errno = retval; | |
410 | return -1; | |
411 | } | |
87f7558b | 412 | |
35acc247 CB |
413 | if (in_child) |
414 | return 0; | |
87f7558b | 415 | |
35acc247 CB |
416 | child_pid = retval; |
417 | return child_pid; | |
418 | } | |
419 | #elif defined(__ia64__) | |
420 | /* On ia64 the stack and stack size are passed as separate arguments. */ | |
421 | return syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0), pidfd); | |
87f7558b | 422 | #else |
35acc247 | 423 | return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd); |
87f7558b | 424 | #endif |
87f7558b CB |
425 | } |
426 | ||
/* Buffer size needed for a "/proc/<pid>/ns/pid" path, including the trailing "\0". */
#define LXCFS_PROC_PID_NS_LEN                                    \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \
	 STRLITERALLEN("/ns/pid") + 1)
430 | ||
580fe4df CB |
431 | /* |
432 | * clone a task which switches to @task's namespace and writes '1'. | |
433 | * over a unix sock so we can read the task's reaper's pid in our | |
434 | * namespace | |
435 | * | |
436 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
437 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
438 | * the pidns and the parent pid outside are identical. Using clone prevents | |
439 | * this issue. | |
440 | */ | |
441 | static void write_task_init_pid_exit(int sock, pid_t target) | |
442 | { | |
05b7a16d | 443 | __do_close int fd = -EBADF; |
87f7558b | 444 | char path[LXCFS_PROC_PID_NS_LEN]; |
580fe4df | 445 | pid_t pid; |
87f7558b CB |
446 | |
447 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); | |
448 | fd = open(path, O_RDONLY | O_CLOEXEC); | |
449 | if (fd < 0) | |
450 | log_exit("write_task_init_pid_exit open of ns/pid"); | |
451 | ||
452 | if (setns(fd, 0)) | |
453 | log_exit("Failed to setns to pid namespace of process %d", target); | |
454 | ||
35acc247 | 455 | pid = lxcfs_raw_clone(0, NULL); |
580fe4df | 456 | if (pid < 0) |
87f7558b CB |
457 | _exit(EXIT_FAILURE); |
458 | ||
35acc247 CB |
459 | if (pid == 0) { |
460 | if (!send_creds_ok(sock)) | |
87f7558b CB |
461 | _exit(EXIT_FAILURE); |
462 | ||
463 | _exit(EXIT_SUCCESS); | |
237e200e | 464 | } |
35acc247 CB |
465 | |
466 | if (!wait_for_pid(pid)) | |
467 | _exit(EXIT_FAILURE); | |
468 | ||
469 | _exit(EXIT_SUCCESS); | |
237e200e SH |
470 | } |
471 | ||
8a07696e | 472 | static pid_t scm_init_pid(pid_t task) |
237e200e | 473 | { |
580fe4df | 474 | char v = '0'; |
87f7558b | 475 | pid_t pid_ret = -1; |
dac3dc93 CB |
476 | struct ucred cred = { |
477 | .pid = -1, | |
478 | .uid = -1, | |
479 | .gid = -1, | |
480 | }; | |
87f7558b CB |
481 | pid_t pid; |
482 | int sock[2]; | |
237e200e | 483 | |
87f7558b | 484 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) |
580fe4df | 485 | return -1; |
237e200e | 486 | |
580fe4df CB |
487 | pid = fork(); |
488 | if (pid < 0) | |
489 | goto out; | |
87f7558b CB |
490 | |
491 | if (pid == 0) { | |
580fe4df CB |
492 | close(sock[1]); |
493 | write_task_init_pid_exit(sock[0], task); | |
87f7558b | 494 | _exit(EXIT_SUCCESS); |
237e200e | 495 | } |
7213ec5c | 496 | |
580fe4df CB |
497 | if (!recv_creds(sock[1], &cred, &v)) |
498 | goto out; | |
87f7558b CB |
499 | |
500 | pid_ret = cred.pid; | |
237e200e | 501 | |
580fe4df CB |
502 | out: |
503 | close(sock[0]); | |
504 | close(sock[1]); | |
505 | if (pid > 0) | |
506 | wait_for_pid(pid); | |
237e200e | 507 | |
87f7558b CB |
508 | return pid_ret; |
509 | } | |
2aa59b2e CB |
510 | |
511 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 512 | { |
cfda2e8a | 513 | pid_t hashed_pid = 0; |
2aa59b2e CB |
514 | char path[LXCFS_PROC_PID_NS_LEN]; |
515 | struct stat st; | |
2aa59b2e CB |
516 | |
517 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
2aa59b2e | 518 | if (stat(path, &st)) |
4e1e4115 | 519 | return ret_errno(ESRCH); |
2aa59b2e | 520 | |
4e1e4115 | 521 | store_lock(); |
fcdedd16 | 522 | |
cfda2e8a CB |
523 | hashed_pid = lookup_verify_initpid(st.st_ino); |
524 | if (hashed_pid < 0) { | |
525 | /* release the mutex as the following call is expensive */ | |
526 | store_unlock(); | |
2aa59b2e | 527 | |
8a07696e | 528 | hashed_pid = scm_init_pid(pid); |
4e1e4115 | 529 | |
cfda2e8a | 530 | store_lock(); |
4e1e4115 | 531 | |
cfda2e8a CB |
532 | if (hashed_pid > 0) |
533 | save_initpid(st.st_ino, hashed_pid); | |
534 | } | |
b7672ded | 535 | |
2aa59b2e | 536 | /* |
cfda2e8a CB |
537 | * Prune at the end in case we're pruning the value |
538 | * we were about to return. | |
2aa59b2e | 539 | */ |
580fe4df | 540 | prune_initpid_store(); |
4e1e4115 | 541 | store_unlock(); |
2aa59b2e | 542 | |
cfda2e8a | 543 | return hashed_pid; |
237e200e SH |
544 | } |
545 | ||
29a73c2f CB |
546 | /* |
547 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
548 | */ |
549 | ||
29a73c2f CB |
550 | static bool umount_if_mounted(void) |
551 | { | |
552 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 553 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
554 | return false; |
555 | } | |
556 | return true; | |
557 | } | |
558 | ||
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* True when the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
565 | ||
0a4dea41 CB |
566 | /* |
567 | * looking at fs/proc_namespace.c, it appears we can | |
568 | * actually expect the rootfs entry to very specifically contain | |
569 | * " - rootfs rootfs " | |
570 | * IIUC, so long as we've chrooted so that rootfs is not our root, | |
571 | * the rootfs entry should always be skipped in mountinfo contents. | |
572 | */ | |
573 | static bool is_on_ramfs(void) | |
574 | { | |
87f7558b | 575 | __do_free char *line = NULL; |
757a63e7 | 576 | __do_free void *fopen_cache = NULL; |
87f7558b | 577 | __do_fclose FILE *f = NULL; |
0a4dea41 | 578 | size_t len = 0; |
0a4dea41 | 579 | |
757a63e7 | 580 | f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); |
0a4dea41 CB |
581 | if (!f) |
582 | return false; | |
583 | ||
584 | while (getline(&line, &len, f) != -1) { | |
87f7558b CB |
585 | int i; |
586 | char *p, *p2; | |
587 | ||
0a4dea41 CB |
588 | for (p = line, i = 0; p && i < 4; i++) |
589 | p = strchr(p + 1, ' '); | |
590 | if (!p) | |
591 | continue; | |
87f7558b | 592 | |
0a4dea41 CB |
593 | p2 = strchr(p + 1, ' '); |
594 | if (!p2) | |
595 | continue; | |
596 | *p2 = '\0'; | |
597 | if (strcmp(p + 1, "/") == 0) { | |
87f7558b | 598 | /* This is '/'. Is it the ramfs? */ |
0a4dea41 | 599 | p = strchr(p2 + 1, '-'); |
87f7558b | 600 | if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) |
0a4dea41 | 601 | return true; |
0a4dea41 CB |
602 | } |
603 | } | |
87f7558b | 604 | |
0a4dea41 CB |
605 | return false; |
606 | } | |
607 | ||
9b96e96e | 608 | static int pivot_enter(void) |
0a4dea41 | 609 | { |
05b7a16d | 610 | __do_close int oldroot = -EBADF, newroot = -EBADF; |
cc309f33 | 611 | |
3326c17e | 612 | oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
613 | if (oldroot < 0) |
614 | return log_error_errno(-1, errno, | |
615 | "Failed to open old root for fchdir"); | |
cc309f33 | 616 | |
3326c17e | 617 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY | O_CLOEXEC); |
87f7558b CB |
618 | if (newroot < 0) |
619 | return log_error_errno(-1, errno, | |
620 | "Failed to open new root for fchdir"); | |
cc309f33 CB |
621 | |
622 | /* change into new root fs */ | |
87f7558b CB |
623 | if (fchdir(newroot) < 0) |
624 | return log_error_errno(-1, | |
625 | errno, "Failed to change directory to new rootfs: %s", | |
626 | ROOTDIR); | |
cc309f33 | 627 | |
0a4dea41 | 628 | /* pivot_root into our new root fs */ |
87f7558b CB |
629 | if (pivot_root(".", ".") < 0) |
630 | return log_error_errno(-1, errno, | |
631 | "pivot_root() syscall failed: %s", | |
632 | strerror(errno)); | |
0a4dea41 CB |
633 | |
634 | /* | |
635 | * At this point the old-root is mounted on top of our new-root. | |
636 | * To unmounted it we must not be chdir'd into it, so escape back | |
637 | * to the old-root. | |
638 | */ | |
87f7558b CB |
639 | if (fchdir(oldroot) < 0) |
640 | return log_error_errno(-1, errno, "Failed to enter old root"); | |
0a4dea41 | 641 | |
87f7558b CB |
642 | if (umount2(".", MNT_DETACH) < 0) |
643 | return log_error_errno(-1, errno, "Failed to detach old root"); | |
0a4dea41 | 644 | |
87f7558b CB |
645 | if (fchdir(newroot) < 0) |
646 | return log_error_errno(-1, errno, "Failed to re-enter new root"); | |
cc309f33 | 647 | |
87f7558b | 648 | return 0; |
0a4dea41 CB |
649 | } |
650 | ||
9b96e96e | 651 | static int chroot_enter(void) |
0a4dea41 CB |
652 | { |
653 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
654 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
655 | return -1; | |
656 | } | |
657 | ||
658 | if (chroot(".") < 0) { | |
659 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
660 | return -1; | |
661 | } | |
662 | ||
663 | if (chdir("/") < 0) { | |
664 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
665 | return -1; | |
666 | } | |
667 | ||
668 | return 0; | |
669 | } | |
670 | ||
0232cbac | 671 | static int permute_and_enter(void) |
29a73c2f | 672 | { |
0a4dea41 CB |
673 | struct statfs sb; |
674 | ||
675 | if (statfs("/", &sb) < 0) { | |
676 | lxcfs_error("%s\n", "Could not stat / mountpoint."); | |
cc309f33 | 677 | return -1; |
0a4dea41 CB |
678 | } |
679 | ||
680 | /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will | |
681 | * likely report TMPFS_MAGIC. Hence, when it reports no we still check | |
682 | * /proc/1/mountinfo. */ | |
683 | if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) | |
684 | return chroot_enter(); | |
29a73c2f | 685 | |
cc309f33 | 686 | if (pivot_enter() < 0) { |
0a4dea41 | 687 | lxcfs_error("%s\n", "Could not perform pivot root."); |
cc309f33 | 688 | return -1; |
29a73c2f CB |
689 | } |
690 | ||
cc309f33 | 691 | return 0; |
29a73c2f CB |
692 | } |
693 | ||
694 | /* Prepare our new clean root. */ | |
0232cbac | 695 | static int permute_prepare(void) |
29a73c2f CB |
696 | { |
697 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 698 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
699 | return -1; |
700 | } | |
701 | ||
702 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 703 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
704 | return -1; |
705 | } | |
706 | ||
707 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 708 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
709 | return -1; |
710 | } | |
711 | ||
712 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 713 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
714 | return -1; |
715 | } | |
716 | ||
717 | return 0; | |
718 | } | |
719 | ||
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 * The new root is prepared first; entering it is skipped when that fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
733 | ||
0a4dea41 | 734 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
735 | { |
736 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 737 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
738 | return false; |
739 | } | |
480262c9 | 740 | |
29a73c2f | 741 | if (!umount_if_mounted()) { |
b8defc3d | 742 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
743 | return false; |
744 | } | |
745 | ||
746 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 747 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
748 | return false; |
749 | } | |
750 | ||
1d81c6a6 | 751 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 752 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
753 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
754 | return false; | |
755 | } | |
756 | ||
480262c9 | 757 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 758 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
759 | return false; |
760 | } | |
480262c9 | 761 | |
29a73c2f | 762 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 763 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
764 | return false; |
765 | } | |
480262c9 | 766 | |
29a73c2f CB |
767 | return true; |
768 | } | |
769 | ||
0a4dea41 | 770 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 771 | { |
5fbea8a6 CB |
772 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
773 | return false; | |
51c7ca35 | 774 | |
5fbea8a6 CB |
775 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
776 | return false; | |
29a73c2f | 777 | |
5fbea8a6 CB |
778 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
779 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
780 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
781 | if ((*h)->fd < 0) | |
29a73c2f | 782 | return false; |
29a73c2f | 783 | } |
5fbea8a6 | 784 | |
29a73c2f CB |
785 | return true; |
786 | } | |
787 | ||
/*
 * Run the three setup stages in order: prepare the private mounts, mount the
 * cgroup hierarchies, then switch to the new root. Stops at the first
 * failing stage and returns false.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies())
		return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts");

	return permute_root();
}
801 | ||
dee86006 | 802 | static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) |
b9b6bdc9 CB |
803 | { |
804 | int ret; | |
805 | ||
806 | if (reload_successful) { | |
807 | reload_successful = 0; | |
808 | ||
809 | /* write() is async signal safe */ | |
810 | ret = write(STDERR_FILENO, | |
811 | "Switched into non-virtualization mode\n", | |
812 | STRLITERALLEN("Switched into non-virtualization mode\n")); | |
813 | if (ret < 0) | |
814 | goto please_compiler; | |
815 | } else { | |
816 | reload_successful = 1; | |
817 | ||
818 | /* write() is async signal safe */ | |
819 | ret = write(STDERR_FILENO, "Switched into virtualization mode\n", | |
820 | STRLITERALLEN("Switched into virtualization mode\n")); | |
821 | if (ret < 0) | |
822 | goto please_compiler; | |
823 | } | |
824 | ||
825 | please_compiler: | |
826 | /* | |
827 | * The write() syscall is a function whose return value needs to be | |
4210ee1d CB |
828 | * checked. Otherwise the compiler will warn.Another one could be to |
829 | * use syscall(__NR_write, ...) directly but whatever. | |
b9b6bdc9 CB |
830 | */ |
831 | return; | |
832 | } | |
833 | ||
2243c5a9 | 834 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 835 | { |
05b7a16d | 836 | __do_close int init_ns = -EBADF, root_fd = -EBADF, |
de69569b | 837 | pidfd = -EBADF; |
4ec5c9da | 838 | int i = 0; |
2aa59b2e | 839 | pid_t pid; |
237e200e | 840 | |
c2357135 | 841 | lxcfs_info("Running constructor %s to reload liblxcfs", __func__); |
cc42d0c7 | 842 | |
5fbea8a6 | 843 | cgroup_ops = cgroup_init(); |
c2357135 CB |
844 | if (!cgroup_ops) { |
845 | lxcfs_info("Failed to initialize cgroup support"); | |
846 | goto broken_upgrade; | |
847 | } | |
237e200e | 848 | |
480262c9 | 849 | /* Preserve initial namespace. */ |
2aa59b2e CB |
850 | pid = getpid(); |
851 | init_ns = preserve_ns(pid, "mnt"); | |
c2357135 CB |
852 | if (init_ns < 0) { |
853 | lxcfs_info("Failed to preserve initial mount namespace"); | |
854 | goto broken_upgrade; | |
855 | } | |
480262c9 | 856 | |
480262c9 CB |
857 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
858 | * to privately mount lxcfs cgroups. */ | |
c2357135 | 859 | if (!cgfs_setup_controllers()) { |
2243c5a9 | 860 | log_exit("Failed to setup private cgroup mounts for lxcfs"); |
c2357135 CB |
861 | goto broken_upgrade; |
862 | } | |
480262c9 | 863 | |
c2357135 | 864 | if (setns(init_ns, 0) < 0) { |
2243c5a9 | 865 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); |
c2357135 CB |
866 | goto broken_upgrade; |
867 | } | |
29a73c2f | 868 | |
c2357135 | 869 | if (!init_cpuview()) { |
2243c5a9 | 870 | log_exit("Failed to init CPU view"); |
c2357135 CB |
871 | goto broken_upgrade; |
872 | } | |
056adcef | 873 | |
cc42d0c7 CB |
874 | lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); |
875 | lxcfs_info("hierarchies:"); | |
4ec5c9da CB |
876 | |
877 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
cc42d0c7 CB |
878 | char **controller_list = (*h)->controllers; |
879 | __do_free char *controllers = NULL; | |
880 | if (controller_list && *controller_list) | |
881 | controllers = lxc_string_join(",", (const char **)controller_list, false); | |
882 | lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); | |
4ec5c9da | 883 | } |
2aa59b2e CB |
884 | |
885 | pidfd = pidfd_open(pid, 0); | |
886 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
887 | can_use_pidfd = true; | |
cc42d0c7 | 888 | lxcfs_info("Kernel supports pidfds"); |
2aa59b2e | 889 | } |
ce8fc84c | 890 | |
c6805016 CB |
891 | can_use_swap = cgroup_ops->can_use_swap(cgroup_ops); |
892 | if (can_use_swap) | |
893 | lxcfs_info("Kernel supports swap accounting"); | |
894 | else | |
895 | lxcfs_info("Kernel does not support swap accounting"); | |
896 | ||
cc42d0c7 | 897 | lxcfs_info("api_extensions:"); |
3cf1e562 CB |
898 | for (size_t nr = 0; nr < nr_api_extensions; nr++) |
899 | lxcfs_info("- %s", api_extensions[nr]); | |
de69569b CB |
900 | |
901 | root_fd = open("/", O_PATH | O_CLOEXEC); | |
c2357135 CB |
902 | if (root_fd < 0) |
903 | lxcfs_info("%s - Failed to open root directory", strerror(errno)); | |
904 | else if (fchdir(root_fd) < 0) | |
905 | lxcfs_info("%s - Failed to change to root directory", strerror(errno)); | |
906 | ||
dee86006 CB |
907 | if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) { |
908 | lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno)); | |
b9b6bdc9 | 909 | goto broken_upgrade; |
dee86006 | 910 | } |
b9b6bdc9 CB |
911 | |
912 | reload_successful = 1; | |
c2357135 | 913 | return; |
de69569b | 914 | |
c2357135 | 915 | broken_upgrade: |
b9b6bdc9 | 916 | reload_successful = 0; |
c2357135 | 917 | lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__); |
237e200e SH |
918 | } |
919 | ||
2243c5a9 | 920 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 921 | { |
cc42d0c7 CB |
922 | lxcfs_info("Running destructor %s", __func__); |
923 | ||
c8f77ce4 | 924 | clear_initpid_store(); |
056adcef | 925 | free_cpuview(); |
2243c5a9 | 926 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 927 | } |
285aea40 | 928 | |
0d5383b7 | 929 | void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data) |
285aea40 CB |
930 | { |
931 | struct fuse_context *fc = fuse_get_context(); | |
932 | can_use_sys_cpu = true; | |
933 | has_versioned_opts = true; | |
934 | return fc->private_data; | |
935 | } |