]>
Commit | Line | Data |
---|---|---|
237e200e SH |
1 | /* lxcfs |
2 | * | |
3 | * Copyright © 2014-2016 Canonical, Inc | |
4 | * Author: Serge Hallyn <serge.hallyn@ubuntu.com> | |
5 | * | |
6 | * See COPYING file for details. | |
7 | */ | |
8 | ||
1f5596dd CB |
9 | #ifndef _GNU_SOURCE |
10 | #define _GNU_SOURCE | |
11 | #endif | |
12 | ||
13 | #ifndef FUSE_USE_VERSION | |
237e200e | 14 | #define FUSE_USE_VERSION 26 |
1f5596dd CB |
15 | #endif |
16 | ||
17 | #define _FILE_OFFSET_BITS 64 | |
237e200e | 18 | |
237e200e | 19 | #include <dirent.h> |
29a73c2f | 20 | #include <errno.h> |
237e200e SH |
21 | #include <fcntl.h> |
22 | #include <fuse.h> | |
0ecddf02 | 23 | #include <inttypes.h> |
237e200e | 24 | #include <libgen.h> |
237e200e | 25 | #include <pthread.h> |
29a73c2f | 26 | #include <sched.h> |
db1b32f6 | 27 | #include <stdarg.h> |
29a73c2f | 28 | #include <stdbool.h> |
0ecddf02 | 29 | #include <stdint.h> |
29a73c2f CB |
30 | #include <stdio.h> |
31 | #include <stdlib.h> | |
32 | #include <string.h> | |
33 | #include <time.h> | |
34 | #include <unistd.h> | |
35 | #include <wait.h> | |
d89504c4 | 36 | #include <linux/magic.h> |
237e200e | 37 | #include <linux/sched.h> |
29a73c2f CB |
38 | #include <sys/epoll.h> |
39 | #include <sys/mman.h> | |
40 | #include <sys/mount.h> | |
237e200e SH |
41 | #include <sys/param.h> |
42 | #include <sys/socket.h> | |
29a73c2f | 43 | #include <sys/syscall.h> |
0ecddf02 | 44 | #include <sys/sysinfo.h> |
d89504c4 | 45 | #include <sys/vfs.h> |
237e200e | 46 | |
237e200e | 47 | #include "bindings.h" |
1d81c6a6 | 48 | #include "config.h" |
580fe4df | 49 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
50 | #include "cgroups/cgroup.h" |
51 | #include "cgroups/cgroup_utils.h" | |
c9236032 | 52 | #include "memory_utils.h" |
1f5596dd | 53 | #include "proc_cpuview.h" |
1d81c6a6 | 54 | #include "utils.h" |
237e200e | 55 | |
2aa59b2e CB |
56 | static bool can_use_pidfd; |
57 | ||
29a73c2f CB |
/*
 * pivot_root() is not wrapped by every C library. When the build did not
 * detect one (HAVE_PIVOT_ROOT undefined) provide a thin wrapper around the
 * raw system call; if the syscall number is unknown as well, fail with
 * ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
72 | ||
237e200e SH |
73 | /* |
74 | * A table caching which pid is init for a pid namespace. | |
75 | * When looking up which pid is init for $qpid, we first | |
76 | * 1. Stat /proc/$qpid/ns/pid. | |
77 | * 2. Check whether the ino_t is in our store. | |
78 | * a. if not, fork a child in qpid's ns to send us | |
79 | * ucred.pid = 1, and read the initpid. Cache | |
80 | * initpid and creation time for /proc/initpid | |
81 | * in a new store entry. | |
82 | * b. if so, verify that /proc/initpid still matches | |
83 | * what we have saved. If not, clear the store | |
84 | * entry and go back to a. If so, return the | |
85 | * cached initpid. | |
86 | */ | |
/* One cache entry: maps a pid namespace to the pid of its init process. */
struct pidns_init_store {
	/* Inode number of /proc/$pid/ns/pid; identifies the pid namespace. */
	ino_t ino;
	/* Pid of init in that namespace. */
	pid_t initpid;
	/* Pidfd for initpid; negative when no pidfd is held. */
	int init_pidfd;
	/* st_ctime of /proc/$initpid recorded when the entry was stored. */
	long int ctime;
	/* Next entry in the same hash bucket. */
	struct pidns_init_store *next;
	/* Time of the last successful validation; consulted when pruning. */
	long int lastcheck;
};
95 | ||
/*
 * Bucket count of the init pid cache and the function mapping a pid
 * namespace inode number onto a bucket.
 * NOTE(review): the previous comment here only hinted at comparing with how
 * the kernel allocates these; the choice of 4096 is not explained.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Serialises access to pidns_hash_table (taken through store_lock()). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* The cache itself: one singly linked chain per bucket. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
4ec5c9da | 102 | |
237e200e SH |
/*
 * Lock @l. A non-zero result from pthread_mutex_lock() is handed to
 * log_exit() together with its strerror() text.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
111 | ||
77f4399a | 112 | struct cgroup_ops *cgroup_ops; |
29a73c2f | 113 | |
237e200e SH |
/*
 * Unlock @l. A non-zero result from pthread_mutex_unlock() is handed to
 * log_exit() together with its strerror() text.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err != 0)
		log_exit("%s - returned %d\n", strerror(err), err);
}
122 | ||
123 | static void store_lock(void) | |
124 | { | |
125 | lock_mutex(&pidns_store_mutex); | |
126 | } | |
127 | ||
128 | static void store_unlock(void) | |
129 | { | |
130 | unlock_mutex(&pidns_store_mutex); | |
131 | } | |
132 | ||
2aa59b2e CB |
/*
 * Buffer size for a "/proc/<pid>" path:
 *
 *   /proc/        = STRLITERALLEN("/proc/")
 *                 +
 *   <pid-as-str>  = INTTYPE_TO_STRLEN(uint64_t)
 *                 +
 *   \0            = 1
 *
 * The pid is sized as a uint64_t rather than a pid_t, so the buffer is at
 * least as large as needed as long as pid_t is no wider than 64 bits.
 */
#define LXCFS_PROC_PID_LEN \
	(STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + 1)
141 | ||
237e200e | 142 | /* Must be called under store_lock */ |
2aa59b2e | 143 | static bool initpid_still_valid(struct pidns_init_store *entry) |
237e200e | 144 | { |
2aa59b2e | 145 | bool valid = true; |
237e200e | 146 | |
2aa59b2e CB |
147 | if (entry->init_pidfd >= 0) { |
148 | if (pidfd_send_signal(entry->init_pidfd, 0, NULL, 0)) | |
149 | valid = false; | |
150 | } else { | |
151 | struct stat st; | |
152 | char path[LXCFS_PROC_PID_LEN]; | |
7dd6560a | 153 | |
2aa59b2e | 154 | snprintf(path, sizeof(path), "/proc/%d", entry->initpid); |
7dd6560a | 155 | |
2aa59b2e CB |
156 | if (stat(path, &st) || entry->ctime != st.st_ctime) |
157 | valid = false; | |
158 | } | |
159 | ||
160 | return valid; | |
237e200e SH |
161 | } |
162 | ||
163 | /* Must be called under store_lock */ | |
2aa59b2e | 164 | static void remove_initpid(struct pidns_init_store *entry) |
237e200e | 165 | { |
2aa59b2e CB |
166 | struct pidns_init_store *it; |
167 | int ino_hash; | |
237e200e | 168 | |
2aa59b2e CB |
169 | lxcfs_debug("Removing cached entry for pid %d from init pid cache", |
170 | entry->initpid); | |
7dd6560a | 171 | |
2aa59b2e CB |
172 | ino_hash = HASH(entry->ino); |
173 | if (pidns_hash_table[ino_hash] == entry) { | |
174 | pidns_hash_table[ino_hash] = entry->next; | |
175 | close_prot_errno_disarm(entry->init_pidfd); | |
176 | free_disarm(entry); | |
237e200e SH |
177 | return; |
178 | } | |
179 | ||
2aa59b2e CB |
180 | it = pidns_hash_table[ino_hash]; |
181 | while (it) { | |
182 | if (it->next == entry) { | |
183 | it->next = entry->next; | |
184 | close_prot_errno_disarm(entry->init_pidfd); | |
185 | free_disarm(entry); | |
237e200e SH |
186 | return; |
187 | } | |
2aa59b2e | 188 | it = it->next; |
237e200e SH |
189 | } |
190 | } | |
191 | ||
192 | #define PURGE_SECS 5 | |
193 | /* Must be called under store_lock */ | |
194 | static void prune_initpid_store(void) | |
195 | { | |
196 | static long int last_prune = 0; | |
237e200e | 197 | long int now, threshold; |
237e200e SH |
198 | |
199 | if (!last_prune) { | |
200 | last_prune = time(NULL); | |
201 | return; | |
202 | } | |
2aa59b2e | 203 | |
237e200e SH |
204 | now = time(NULL); |
205 | if (now < last_prune + PURGE_SECS) | |
206 | return; | |
7dd6560a | 207 | |
2aa59b2e | 208 | lxcfs_debug("Pruning init pid cache"); |
7dd6560a | 209 | |
237e200e SH |
210 | last_prune = now; |
211 | threshold = now - 2 * PURGE_SECS; | |
212 | ||
2aa59b2e CB |
213 | for (int i = 0; i < PIDNS_HASH_SIZE; i++) { |
214 | for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { | |
215 | if (entry->lastcheck < threshold) { | |
216 | struct pidns_init_store *cur = entry; | |
7dd6560a | 217 | |
2aa59b2e | 218 | lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); |
7dd6560a | 219 | |
237e200e | 220 | if (prev) |
2aa59b2e | 221 | prev->next = entry->next; |
237e200e | 222 | else |
2aa59b2e CB |
223 | pidns_hash_table[i] = entry->next; |
224 | entry = entry->next; | |
225 | close_prot_errno_disarm(cur->init_pidfd); | |
226 | free_disarm(cur); | |
237e200e | 227 | } else { |
2aa59b2e CB |
228 | prev = entry; |
229 | entry = entry->next; | |
237e200e SH |
230 | } |
231 | } | |
232 | } | |
233 | } | |
234 | ||
235 | /* Must be called under store_lock */ | |
236 | static void save_initpid(struct stat *sb, pid_t pid) | |
237 | { | |
2aa59b2e CB |
238 | __do_free struct pidns_init_store *e = NULL; |
239 | __do_close_prot_errno int pidfd = -EBADF; | |
240 | char path[LXCFS_PROC_PID_LEN]; | |
241 | struct lxcfs_opts *opts = fuse_get_context()->private_data; | |
242 | struct stat st; | |
243 | int ino_hash; | |
244 | ||
245 | if (opts->use_pidfd && can_use_pidfd) { | |
246 | pidfd = pidfd_open(pid, 0); | |
247 | if (pidfd < 0) | |
248 | return; | |
249 | } | |
237e200e | 250 | |
2aa59b2e CB |
251 | snprintf(path, sizeof(path), "/proc/%d", pid); |
252 | if (stat(path, &st)) | |
253 | return; | |
7dd6560a | 254 | |
2aa59b2e CB |
255 | e = malloc(sizeof(*e)); |
256 | if (!e) | |
237e200e | 257 | return; |
2aa59b2e | 258 | |
237e200e SH |
259 | e->ino = sb->st_ino; |
260 | e->initpid = pid; | |
2aa59b2e CB |
261 | e->ctime = st.st_ctime; |
262 | ino_hash = HASH(e->ino); | |
263 | e->next = pidns_hash_table[ino_hash]; | |
237e200e | 264 | e->lastcheck = time(NULL); |
2aa59b2e CB |
265 | e->init_pidfd = move_fd(pidfd); |
266 | pidns_hash_table[ino_hash] = move_ptr(e); | |
267 | ||
268 | lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); | |
237e200e SH |
269 | } |
270 | ||
271 | /* | |
272 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
273 | * entry for the inode number and creation time. Verify that the init pid | |
274 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
275 | * otherwise. | |
276 | * Must be called under store_lock | |
277 | */ | |
278 | static struct pidns_init_store *lookup_verify_initpid(struct stat *sb) | |
279 | { | |
2aa59b2e CB |
280 | struct pidns_init_store *entry = pidns_hash_table[HASH(sb->st_ino)]; |
281 | ||
282 | while (entry) { | |
283 | if (entry->ino == sb->st_ino) { | |
284 | if (initpid_still_valid(entry)) { | |
285 | entry->lastcheck = time(NULL); | |
286 | return entry; | |
237e200e | 287 | } |
2aa59b2e CB |
288 | |
289 | remove_initpid(entry); | |
237e200e SH |
290 | return NULL; |
291 | } | |
2aa59b2e | 292 | entry = entry->next; |
237e200e SH |
293 | } |
294 | ||
295 | return NULL; | |
296 | } | |
297 | ||
4ec5c9da | 298 | static int send_creds_clone_wrapper(void *arg) |
237e200e | 299 | { |
4ec5c9da CB |
300 | struct ucred cred; |
301 | char v; | |
302 | int sock = *(int *)arg; | |
ba59ea09 | 303 | |
4ec5c9da CB |
304 | /* we are the child */ |
305 | cred.uid = 0; | |
306 | cred.gid = 0; | |
307 | cred.pid = 1; | |
308 | v = '1'; | |
309 | if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) | |
310 | return 1; | |
311 | return 0; | |
237e200e SH |
312 | } |
313 | ||
580fe4df CB |
314 | /* |
315 | * clone a task which switches to @task's namespace and writes '1'. | |
316 | * over a unix sock so we can read the task's reaper's pid in our | |
317 | * namespace | |
318 | * | |
319 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
320 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
321 | * the pidns and the parent pid outside are identical. Using clone prevents | |
322 | * this issue. | |
323 | */ | |
324 | static void write_task_init_pid_exit(int sock, pid_t target) | |
325 | { | |
326 | char fnam[100]; | |
327 | pid_t pid; | |
328 | int fd, ret; | |
329 | size_t stack_size = sysconf(_SC_PAGESIZE); | |
330 | void *stack = alloca(stack_size); | |
237e200e | 331 | |
580fe4df CB |
332 | ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target); |
333 | if (ret < 0 || ret >= sizeof(fnam)) | |
334 | _exit(1); | |
f23fe717 | 335 | |
580fe4df CB |
336 | fd = open(fnam, O_RDONLY); |
337 | if (fd < 0) { | |
338 | perror("write_task_init_pid_exit open of ns/pid"); | |
339 | _exit(1); | |
237e200e | 340 | } |
580fe4df CB |
341 | if (setns(fd, 0)) { |
342 | perror("write_task_init_pid_exit setns 1"); | |
343 | close(fd); | |
344 | _exit(1); | |
345 | } | |
346 | pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock); | |
347 | if (pid < 0) | |
348 | _exit(1); | |
349 | if (pid != 0) { | |
350 | if (!wait_for_pid(pid)) | |
351 | _exit(1); | |
352 | _exit(0); | |
237e200e | 353 | } |
237e200e SH |
354 | } |
355 | ||
/*
 * Determine the pid, in our own pid namespace, of init of @task's pid
 * namespace.
 *
 * A forked helper enters the task's namespace and sends credentials naming
 * pid 1 over a socketpair; the pid found in the received credentials is
 * the answer.
 *
 * Returns that pid, or -1 if the socketpair, the fork or the receive fails.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred;
	char payload = '0';
	pid_t initpid = -1;
	pid_t helper;
	int sock[2];

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (helper > 0 && recv_creds(sock[1], &cred, &payload))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	/* Reap the helper if one was started. */
	if (helper > 0)
		wait_for_pid(helper);

	return initpid;
}
389 | ||
2aa59b2e CB |
390 | #define LXCFS_PROC_PID_NS_LEN \ |
391 | (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \ | |
392 | STRLITERALLEN("/ns/pid") + 1) | |
393 | ||
394 | pid_t lookup_initpid_in_store(pid_t pid) | |
237e200e | 395 | { |
580fe4df | 396 | pid_t answer = 0; |
2aa59b2e CB |
397 | char path[LXCFS_PROC_PID_NS_LEN]; |
398 | struct stat st; | |
399 | struct pidns_init_store *entry; | |
400 | ||
401 | snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); | |
b7672ded | 402 | |
580fe4df | 403 | store_lock(); |
2aa59b2e | 404 | if (stat(path, &st)) |
580fe4df | 405 | goto out; |
2aa59b2e CB |
406 | |
407 | entry = lookup_verify_initpid(&st); | |
408 | if (entry) { | |
409 | answer = entry->initpid; | |
580fe4df CB |
410 | goto out; |
411 | } | |
2aa59b2e CB |
412 | |
413 | answer = get_init_pid_for_task(pid); | |
580fe4df | 414 | if (answer > 0) |
2aa59b2e | 415 | save_initpid(&st, answer); |
b7672ded | 416 | |
580fe4df | 417 | out: |
2aa59b2e CB |
418 | /* |
419 | * Prune at the end in case we're returning the value we were about to | |
420 | * return. | |
421 | */ | |
580fe4df | 422 | prune_initpid_store(); |
2aa59b2e | 423 | |
580fe4df | 424 | store_unlock(); |
2aa59b2e | 425 | |
580fe4df | 426 | return answer; |
237e200e SH |
427 | } |
428 | ||
29a73c2f CB |
429 | /* |
430 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
431 | */ |
432 | ||
29a73c2f CB |
433 | static bool umount_if_mounted(void) |
434 | { | |
435 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 436 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
437 | return false; |
438 | } | |
439 | return true; | |
440 | } | |
441 | ||
2283e240 CB |
/*
 * The type of statfs::f_type is not the same everywhere, so derive it from
 * the struct itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true if @fs reports the filesystem magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
448 | ||
0a4dea41 CB |
/*
 * Report whether "/" is the kernel's initial rootfs.
 *
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Scans /proc/self/mountinfo for a line whose fifth space-separated field
 * (the mount point) is "/" and whose first '-' after that field starts
 * "- rootfs rootfs ". Returns false if the file cannot be opened.
 */
static bool is_on_ramfs(void)
{
	bool found = false;
	char *line = NULL;
	size_t cap = 0;
	FILE *mounts;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!found && getline(&line, &cap, mounts) != -1) {
		char *mnt = line, *mnt_end, *sep;
		int skipped;

		/* Step over four separators to land in front of the mount point. */
		for (skipped = 0; mnt && skipped < 4; skipped++)
			mnt = strchr(mnt + 1, ' ');
		if (!mnt)
			continue;

		mnt_end = strchr(mnt + 1, ' ');
		if (!mnt_end)
			continue;
		*mnt_end = '\0';

		/* Only the entry for '/' is interesting. */
		if (strcmp(mnt + 1, "/") != 0)
			continue;

		/* This is '/'. Is it the ramfs? */
		sep = strchr(mnt_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(line);
	fclose(mounts);
	return found;
}
491 | ||
cc309f33 | 492 | static int pivot_enter() |
0a4dea41 | 493 | { |
cc309f33 CB |
494 | int ret = -1, oldroot = -1, newroot = -1; |
495 | ||
496 | oldroot = open("/", O_DIRECTORY | O_RDONLY); | |
497 | if (oldroot < 0) { | |
498 | lxcfs_error("%s\n", "Failed to open old root for fchdir."); | |
499 | return ret; | |
500 | } | |
501 | ||
502 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY); | |
503 | if (newroot < 0) { | |
504 | lxcfs_error("%s\n", "Failed to open new root for fchdir."); | |
505 | goto err; | |
506 | } | |
507 | ||
508 | /* change into new root fs */ | |
509 | if (fchdir(newroot) < 0) { | |
510 | lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR); | |
511 | goto err; | |
512 | } | |
513 | ||
0a4dea41 CB |
514 | /* pivot_root into our new root fs */ |
515 | if (pivot_root(".", ".") < 0) { | |
516 | lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno)); | |
cc309f33 | 517 | goto err; |
0a4dea41 CB |
518 | } |
519 | ||
520 | /* | |
521 | * At this point the old-root is mounted on top of our new-root. | |
522 | * To unmounted it we must not be chdir'd into it, so escape back | |
523 | * to the old-root. | |
524 | */ | |
525 | if (fchdir(oldroot) < 0) { | |
526 | lxcfs_error("%s\n", "Failed to enter old root."); | |
cc309f33 | 527 | goto err; |
0a4dea41 CB |
528 | } |
529 | ||
530 | if (umount2(".", MNT_DETACH) < 0) { | |
531 | lxcfs_error("%s\n", "Failed to detach old root."); | |
cc309f33 | 532 | goto err; |
0a4dea41 CB |
533 | } |
534 | ||
535 | if (fchdir(newroot) < 0) { | |
536 | lxcfs_error("%s\n", "Failed to re-enter new root."); | |
cc309f33 | 537 | goto err; |
0a4dea41 CB |
538 | } |
539 | ||
cc309f33 CB |
540 | ret = 0; |
541 | ||
542 | err: | |
543 | if (oldroot > 0) | |
544 | close(oldroot); | |
545 | if (newroot > 0) | |
546 | close(newroot); | |
547 | ||
548 | return ret; | |
0a4dea41 CB |
549 | } |
550 | ||
551 | static int chroot_enter() | |
552 | { | |
553 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
554 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
555 | return -1; | |
556 | } | |
557 | ||
558 | if (chroot(".") < 0) { | |
559 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
560 | return -1; | |
561 | } | |
562 | ||
563 | if (chdir("/") < 0) { | |
564 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
565 | return -1; | |
566 | } | |
567 | ||
568 | return 0; | |
569 | } | |
570 | ||
/*
 * Switch into the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise.
 *
 * Returns 0 on success, -1 on failure (chroot_enter()'s result is passed
 * through unchanged).
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * the mountinfo table via is_on_ramfs().
	 */
	if (!has_fs_type(&rootfs, RAMFS_MAGIC) && !is_on_ramfs()) {
		if (pivot_enter() < 0) {
			lxcfs_error("%s\n", "Could not perform pivot root.");
			return -1;
		}

		return 0;
	}

	return chroot_enter();
}
593 | ||
594 | /* Prepare our new clean root. */ | |
0232cbac | 595 | static int permute_prepare(void) |
29a73c2f CB |
596 | { |
597 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 598 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
599 | return -1; |
600 | } | |
601 | ||
602 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 603 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
604 | return -1; |
605 | } | |
606 | ||
607 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 608 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
609 | return -1; |
610 | } | |
611 | ||
612 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 613 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
614 | return -1; |
615 | } | |
616 | ||
617 | return 0; | |
618 | } | |
619 | ||
0232cbac CB |
/*
 * Prepare the new root and enter it. Entering calls chroot() on ramfs and
 * pivot_root() in all other cases.
 *
 * Returns true only if both steps succeed; the second step is not
 * attempted when the first one fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
633 | ||
0a4dea41 | 634 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
635 | { |
636 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 637 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
638 | return false; |
639 | } | |
480262c9 | 640 | |
29a73c2f | 641 | if (!umount_if_mounted()) { |
b8defc3d | 642 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
643 | return false; |
644 | } | |
645 | ||
646 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 647 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
648 | return false; |
649 | } | |
650 | ||
1d81c6a6 | 651 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 652 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
653 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
654 | return false; | |
655 | } | |
656 | ||
480262c9 | 657 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 658 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
659 | return false; |
660 | } | |
480262c9 | 661 | |
29a73c2f | 662 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 663 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
664 | return false; |
665 | } | |
480262c9 | 666 | |
29a73c2f CB |
667 | return true; |
668 | } | |
669 | ||
0a4dea41 | 670 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 671 | { |
5fbea8a6 CB |
672 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
673 | return false; | |
51c7ca35 | 674 | |
5fbea8a6 CB |
675 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
676 | return false; | |
29a73c2f | 677 | |
5fbea8a6 CB |
678 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
679 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
680 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
681 | if ((*h)->fd < 0) | |
29a73c2f | 682 | return false; |
29a73c2f | 683 | } |
5fbea8a6 | 684 | |
29a73c2f CB |
685 | return true; |
686 | } | |
687 | ||
/*
 * Build the private cgroup mount setup: prepare the mount namespace, mount
 * the hierarchies, then switch into the new root.
 *
 * Returns true only if all three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
703 | ||
2243c5a9 | 704 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 705 | { |
2aa59b2e | 706 | __do_close_prot_errno int init_ns = -EBADF, pidfd = -EBADF; |
4ec5c9da | 707 | int i = 0; |
2aa59b2e | 708 | pid_t pid; |
5fbea8a6 | 709 | char *cret; |
e58dab00 | 710 | char cwd[MAXPATHLEN]; |
237e200e | 711 | |
5fbea8a6 CB |
712 | cgroup_ops = cgroup_init(); |
713 | if (!cgroup_ops) | |
2243c5a9 | 714 | log_exit("Failed to initialize cgroup support"); |
237e200e | 715 | |
480262c9 | 716 | /* Preserve initial namespace. */ |
2aa59b2e CB |
717 | pid = getpid(); |
718 | init_ns = preserve_ns(pid, "mnt"); | |
2243c5a9 CB |
719 | if (init_ns < 0) |
720 | log_exit("Failed to preserve initial mount namespace"); | |
480262c9 | 721 | |
e58dab00 | 722 | cret = getcwd(cwd, MAXPATHLEN); |
4ec5c9da | 723 | if (!cret) |
2243c5a9 | 724 | log_exit("%s - Could not retrieve current working directory", strerror(errno)); |
e58dab00 | 725 | |
480262c9 CB |
726 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
727 | * to privately mount lxcfs cgroups. */ | |
2243c5a9 CB |
728 | if (!cgfs_setup_controllers()) |
729 | log_exit("Failed to setup private cgroup mounts for lxcfs"); | |
480262c9 | 730 | |
2243c5a9 CB |
731 | if (setns(init_ns, 0) < 0) |
732 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); | |
29a73c2f | 733 | |
e58dab00 | 734 | if (!cret || chdir(cwd) < 0) |
2243c5a9 | 735 | log_exit("%s - Could not change back to original working directory", strerror(errno)); |
e58dab00 | 736 | |
2243c5a9 CB |
737 | if (!init_cpuview()) |
738 | log_exit("Failed to init CPU view"); | |
056adcef | 739 | |
4ec5c9da CB |
740 | fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd); |
741 | fprintf(stderr, "hierarchies:\n"); | |
742 | ||
743 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { | |
744 | __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false); | |
745 | fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: ""); | |
746 | } | |
2aa59b2e CB |
747 | |
748 | pidfd = pidfd_open(pid, 0); | |
749 | if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { | |
750 | can_use_pidfd = true; | |
751 | lxcfs_error("Kernel supports pidfds"); | |
752 | } | |
237e200e SH |
753 | } |
754 | ||
2243c5a9 | 755 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 756 | { |
0646f250 | 757 | lxcfs_debug("%s\n", "Running destructor for liblxcfs"); |
056adcef | 758 | free_cpuview(); |
2243c5a9 | 759 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 760 | } |