]>
Commit | Line | Data |
---|---|---|
237e200e SH |
1 | /* lxcfs |
2 | * | |
3 | * Copyright © 2014-2016 Canonical, Inc | |
4 | * Author: Serge Hallyn <serge.hallyn@ubuntu.com> | |
5 | * | |
6 | * See COPYING file for details. | |
7 | */ | |
8 | ||
1f5596dd CB |
9 | #ifndef _GNU_SOURCE |
10 | #define _GNU_SOURCE | |
11 | #endif | |
12 | ||
13 | #ifndef FUSE_USE_VERSION | |
237e200e | 14 | #define FUSE_USE_VERSION 26 |
1f5596dd CB |
15 | #endif |
16 | ||
17 | #define _FILE_OFFSET_BITS 64 | |
237e200e | 18 | |
237e200e | 19 | #include <dirent.h> |
29a73c2f | 20 | #include <errno.h> |
237e200e SH |
21 | #include <fcntl.h> |
22 | #include <fuse.h> | |
0ecddf02 | 23 | #include <inttypes.h> |
237e200e | 24 | #include <libgen.h> |
237e200e | 25 | #include <pthread.h> |
29a73c2f | 26 | #include <sched.h> |
db1b32f6 | 27 | #include <stdarg.h> |
29a73c2f | 28 | #include <stdbool.h> |
0ecddf02 | 29 | #include <stdint.h> |
29a73c2f CB |
30 | #include <stdio.h> |
31 | #include <stdlib.h> | |
32 | #include <string.h> | |
33 | #include <time.h> | |
34 | #include <unistd.h> | |
35 | #include <wait.h> | |
d89504c4 | 36 | #include <linux/magic.h> |
237e200e | 37 | #include <linux/sched.h> |
29a73c2f CB |
38 | #include <sys/epoll.h> |
39 | #include <sys/mman.h> | |
40 | #include <sys/mount.h> | |
237e200e SH |
41 | #include <sys/param.h> |
42 | #include <sys/socket.h> | |
29a73c2f | 43 | #include <sys/syscall.h> |
0ecddf02 | 44 | #include <sys/sysinfo.h> |
d89504c4 | 45 | #include <sys/vfs.h> |
237e200e | 46 | |
237e200e | 47 | #include "bindings.h" |
1d81c6a6 | 48 | #include "config.h" |
580fe4df | 49 | #include "cgroup_fuse.h" |
5fbea8a6 CB |
50 | #include "cgroups/cgroup.h" |
51 | #include "cgroups/cgroup_utils.h" | |
c9236032 | 52 | #include "memory_utils.h" |
1f5596dd | 53 | #include "proc_cpuview.h" |
1d81c6a6 | 54 | #include "utils.h" |
237e200e | 55 | |
29a73c2f CB |
56 | /* Define pivot_root() if missing from the C library */ |
57 | #ifndef HAVE_PIVOT_ROOT | |
/*
 * Fallback pivot_root() wrapper used when the C library provides none.
 * Issues the raw syscall when the kernel headers define its number;
 * otherwise fails with errno set to ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
67 | #else | |
68 | extern int pivot_root(const char * new_root, const char * put_old); | |
69 | #endif | |
70 | ||
237e200e SH |
71 | /* |
72 | * A table caching which pid is init for a pid namespace. | |
73 | * When looking up which pid is init for $qpid, we first | |
74 | * 1. Stat /proc/$qpid/ns/pid. | |
75 | * 2. Check whether the ino_t is in our store. | |
76 | * a. if not, fork a child in qpid's ns to send us | |
77 | * ucred.pid = 1, and read the initpid. Cache | |
78 | * initpid and creation time for /proc/initpid | |
79 | * in a new store entry. | |
80 | * b. if so, verify that /proc/initpid still matches | |
81 | * what we have saved. If not, clear the store | |
82 | * entry and go back to a. If so, return the | |
83 | * cached initpid. | |
84 | */ | |
/* One cache entry: the init pid recorded for a single pid namespace. */
struct pidns_init_store {
	/* inode number for /proc/$pid/ns/pid */
	ino_t ino;
	/* the pid of init in that ns */
	pid_t initpid;
	/* the time at which /proc/$initpid was created */
	long ctime;
	/* next entry in the same hash bucket */
	struct pidns_init_store *next;
	/* time(NULL) of the last successful lookup; drives pruning */
	long lastcheck;
};
92 | ||
93 | /* lol - look at how they are allocated in the kernel */ | |
94 | #define PIDNS_HASH_SIZE 4096 | |
95 | #define HASH(x) ((x) % PIDNS_HASH_SIZE) | |
96 | ||
97 | static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE]; | |
98 | static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER; | |
/* Lock @l; any pthread error is logged and terminates the process. */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
108 | ||
77f4399a | 109 | struct cgroup_ops *cgroup_ops; |
29a73c2f | 110 | |
237e200e SH |
/* Unlock @l; any pthread error is logged and terminates the process. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
120 | ||
121 | static void store_lock(void) | |
122 | { | |
123 | lock_mutex(&pidns_store_mutex); | |
124 | } | |
125 | ||
126 | static void store_unlock(void) | |
127 | { | |
128 | unlock_mutex(&pidns_store_mutex); | |
129 | } | |
130 | ||
131 | /* Must be called under store_lock */ | |
132 | static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb) | |
133 | { | |
134 | struct stat initsb; | |
135 | char fnam[100]; | |
136 | ||
137 | snprintf(fnam, 100, "/proc/%d", e->initpid); | |
138 | if (stat(fnam, &initsb) < 0) | |
139 | return false; | |
7dd6560a CB |
140 | |
141 | lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime, | |
142 | initsb.st_ctime, e->initpid); | |
143 | ||
237e200e SH |
144 | if (e->ctime != initsb.st_ctime) |
145 | return false; | |
146 | return true; | |
147 | } | |
148 | ||
149 | /* Must be called under store_lock */ | |
150 | static void remove_initpid(struct pidns_init_store *e) | |
151 | { | |
152 | struct pidns_init_store *tmp; | |
153 | int h; | |
154 | ||
7dd6560a CB |
155 | lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid); |
156 | ||
237e200e SH |
157 | h = HASH(e->ino); |
158 | if (pidns_hash_table[h] == e) { | |
159 | pidns_hash_table[h] = e->next; | |
54a6d46a | 160 | free_disarm(e); |
237e200e SH |
161 | return; |
162 | } | |
163 | ||
164 | tmp = pidns_hash_table[h]; | |
165 | while (tmp) { | |
166 | if (tmp->next == e) { | |
167 | tmp->next = e->next; | |
54a6d46a | 168 | free_disarm(e); |
237e200e SH |
169 | return; |
170 | } | |
171 | tmp = tmp->next; | |
172 | } | |
173 | } | |
174 | ||
175 | #define PURGE_SECS 5 | |
176 | /* Must be called under store_lock */ | |
177 | static void prune_initpid_store(void) | |
178 | { | |
179 | static long int last_prune = 0; | |
180 | struct pidns_init_store *e, *prev, *delme; | |
181 | long int now, threshold; | |
182 | int i; | |
183 | ||
184 | if (!last_prune) { | |
185 | last_prune = time(NULL); | |
186 | return; | |
187 | } | |
188 | now = time(NULL); | |
189 | if (now < last_prune + PURGE_SECS) | |
190 | return; | |
7dd6560a CB |
191 | |
192 | lxcfs_debug("%s\n", "Pruning."); | |
193 | ||
237e200e SH |
194 | last_prune = now; |
195 | threshold = now - 2 * PURGE_SECS; | |
196 | ||
197 | for (i = 0; i < PIDNS_HASH_SIZE; i++) { | |
198 | for (prev = NULL, e = pidns_hash_table[i]; e; ) { | |
199 | if (e->lastcheck < threshold) { | |
7dd6560a CB |
200 | |
201 | lxcfs_debug("Removing cached entry for %d.\n", e->initpid); | |
202 | ||
237e200e SH |
203 | delme = e; |
204 | if (prev) | |
205 | prev->next = e->next; | |
206 | else | |
207 | pidns_hash_table[i] = e->next; | |
208 | e = e->next; | |
54a6d46a | 209 | free_disarm(delme); |
237e200e SH |
210 | } else { |
211 | prev = e; | |
212 | e = e->next; | |
213 | } | |
214 | } | |
215 | } | |
216 | } | |
217 | ||
218 | /* Must be called under store_lock */ | |
219 | static void save_initpid(struct stat *sb, pid_t pid) | |
220 | { | |
221 | struct pidns_init_store *e; | |
222 | char fpath[100]; | |
223 | struct stat procsb; | |
224 | int h; | |
225 | ||
7dd6560a CB |
226 | lxcfs_debug("Save_initpid: adding entry for %d.\n", pid); |
227 | ||
237e200e SH |
228 | snprintf(fpath, 100, "/proc/%d", pid); |
229 | if (stat(fpath, &procsb) < 0) | |
230 | return; | |
231 | do { | |
232 | e = malloc(sizeof(*e)); | |
233 | } while (!e); | |
234 | e->ino = sb->st_ino; | |
235 | e->initpid = pid; | |
236 | e->ctime = procsb.st_ctime; | |
237 | h = HASH(e->ino); | |
238 | e->next = pidns_hash_table[h]; | |
239 | e->lastcheck = time(NULL); | |
240 | pidns_hash_table[h] = e; | |
241 | } | |
242 | ||
243 | /* | |
244 | * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store | |
245 | * entry for the inode number and creation time. Verify that the init pid | |
246 | * is still valid. If not, remove it. Return the entry if valid, NULL | |
247 | * otherwise. | |
248 | * Must be called under store_lock | |
249 | */ | |
250 | static struct pidns_init_store *lookup_verify_initpid(struct stat *sb) | |
251 | { | |
252 | int h = HASH(sb->st_ino); | |
253 | struct pidns_init_store *e = pidns_hash_table[h]; | |
254 | ||
255 | while (e) { | |
256 | if (e->ino == sb->st_ino) { | |
257 | if (initpid_still_valid(e, sb)) { | |
258 | e->lastcheck = time(NULL); | |
259 | return e; | |
260 | } | |
261 | remove_initpid(e); | |
262 | return NULL; | |
263 | } | |
264 | e = e->next; | |
265 | } | |
266 | ||
267 | return NULL; | |
268 | } | |
269 | ||
237e200e SH |
/* Name, ownership and mode bits describing one cgroup file. */
struct cgfs_files {
	char *name;
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
275 | ||
237e200e SH |
276 | static void print_subsystems(void) |
277 | { | |
5fbea8a6 | 278 | int i = 0; |
237e200e | 279 | |
0646f250 | 280 | fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd); |
cc97d34c | 281 | fprintf(stderr, "hierarchies:\n"); |
5fbea8a6 CB |
282 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { |
283 | __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false); | |
284 | fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: ""); | |
237e200e SH |
285 | } |
286 | } | |
287 | ||
/*
 * Return true if @file exists inside @cgroup of @controller's hierarchy.
 * Returns false when the controller has no fd, the path cannot be built, or
 * faccessat(F_OK) fails.
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	char *relpath;
	size_t bufsz;
	int hfd, n;

	hfd = get_cgroup_fd(controller);
	if (hfd < 0)
		return false;

	/* The *at() family needs a relative path:
	 * . + /cgroup + / + file + \0
	 */
	bufsz = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(bufsz);
	n = snprintf(relpath, bufsz, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (n < 0 || (size_t)n >= bufsz)
		return false;

	return faccessat(hfd, relpath, F_OK, 0) == 0;
}
309 | ||
580fe4df CB |
310 | #define SEND_CREDS_OK 0 |
311 | #define SEND_CREDS_NOTSK 1 | |
312 | #define SEND_CREDS_FAIL 2 | |
580fe4df | 313 | static int wait_for_pid(pid_t pid); |
580fe4df | 314 | static int send_creds_clone_wrapper(void *arg); |
237e200e | 315 | |
580fe4df CB |
316 | /* |
317 | * clone a task which switches to @task's namespace and writes '1'. | |
318 | * over a unix sock so we can read the task's reaper's pid in our | |
319 | * namespace | |
320 | * | |
321 | * Note: glibc's fork() does not respect pidns, which can lead to failed | |
322 | * assertions inside glibc (and thus failed forks) if the child's pid in | |
323 | * the pidns and the parent pid outside are identical. Using clone prevents | |
324 | * this issue. | |
325 | */ | |
326 | static void write_task_init_pid_exit(int sock, pid_t target) | |
327 | { | |
328 | char fnam[100]; | |
329 | pid_t pid; | |
330 | int fd, ret; | |
331 | size_t stack_size = sysconf(_SC_PAGESIZE); | |
332 | void *stack = alloca(stack_size); | |
237e200e | 333 | |
580fe4df CB |
334 | ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target); |
335 | if (ret < 0 || ret >= sizeof(fnam)) | |
336 | _exit(1); | |
f23fe717 | 337 | |
580fe4df CB |
338 | fd = open(fnam, O_RDONLY); |
339 | if (fd < 0) { | |
340 | perror("write_task_init_pid_exit open of ns/pid"); | |
341 | _exit(1); | |
237e200e | 342 | } |
580fe4df CB |
343 | if (setns(fd, 0)) { |
344 | perror("write_task_init_pid_exit setns 1"); | |
345 | close(fd); | |
346 | _exit(1); | |
347 | } | |
348 | pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock); | |
349 | if (pid < 0) | |
350 | _exit(1); | |
351 | if (pid != 0) { | |
352 | if (!wait_for_pid(pid)) | |
353 | _exit(1); | |
354 | _exit(0); | |
237e200e | 355 | } |
237e200e SH |
356 | } |
357 | ||
580fe4df CB |
358 | static int send_creds_clone_wrapper(void *arg) { |
359 | struct ucred cred; | |
360 | char v; | |
361 | int sock = *(int *)arg; | |
237e200e | 362 | |
580fe4df CB |
363 | /* we are the child */ |
364 | cred.uid = 0; | |
365 | cred.gid = 0; | |
366 | cred.pid = 1; | |
367 | v = '1'; | |
368 | if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) | |
369 | return 1; | |
237e200e SH |
370 | return 0; |
371 | } | |
372 | ||
580fe4df | 373 | static pid_t get_init_pid_for_task(pid_t task) |
237e200e | 374 | { |
580fe4df CB |
375 | int sock[2]; |
376 | pid_t pid; | |
377 | pid_t ret = -1; | |
378 | char v = '0'; | |
379 | struct ucred cred; | |
237e200e | 380 | |
580fe4df CB |
381 | if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { |
382 | perror("socketpair"); | |
383 | return -1; | |
237e200e SH |
384 | } |
385 | ||
580fe4df CB |
386 | pid = fork(); |
387 | if (pid < 0) | |
388 | goto out; | |
389 | if (!pid) { | |
390 | close(sock[1]); | |
391 | write_task_init_pid_exit(sock[0], task); | |
392 | _exit(0); | |
237e200e | 393 | } |
7213ec5c | 394 | |
580fe4df CB |
395 | if (!recv_creds(sock[1], &cred, &v)) |
396 | goto out; | |
397 | ret = cred.pid; | |
237e200e | 398 | |
580fe4df CB |
399 | out: |
400 | close(sock[0]); | |
401 | close(sock[1]); | |
402 | if (pid > 0) | |
403 | wait_for_pid(pid); | |
237e200e SH |
404 | return ret; |
405 | } | |
406 | ||
580fe4df | 407 | pid_t lookup_initpid_in_store(pid_t qpid) |
237e200e | 408 | { |
580fe4df CB |
409 | pid_t answer = 0; |
410 | struct stat sb; | |
411 | struct pidns_init_store *e; | |
412 | char fnam[100]; | |
b7672ded | 413 | |
580fe4df CB |
414 | snprintf(fnam, 100, "/proc/%d/ns/pid", qpid); |
415 | store_lock(); | |
416 | if (stat(fnam, &sb) < 0) | |
417 | goto out; | |
418 | e = lookup_verify_initpid(&sb); | |
419 | if (e) { | |
420 | answer = e->initpid; | |
421 | goto out; | |
422 | } | |
423 | answer = get_init_pid_for_task(qpid); | |
424 | if (answer > 0) | |
425 | save_initpid(&sb, answer); | |
b7672ded | 426 | |
580fe4df CB |
427 | out: |
428 | /* we prune at end in case we are returning | |
429 | * the value we were about to return */ | |
430 | prune_initpid_store(); | |
431 | store_unlock(); | |
432 | return answer; | |
237e200e SH |
433 | } |
434 | ||
/*
 * Reap child @pid, retrying when waitpid() is interrupted.
 * Returns 0 if the child exited normally with status 0, and -1 if @pid is
 * not positive, waitpid() failed, or the child exited abnormally or non-zero.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		/* EINTR and unexpected pids are retried; other errors are fatal. */
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
455 | ||
580fe4df CB |
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path @cg in place.
 * A path that consists of nothing but "/init.scope" becomes "/"; paths that
 * do not end in the suffix are left untouched.
 */
void prune_init_slice(char *cg)
{
	const size_t suffix_len = strlen(INITSCOPE);
	const size_t total_len = strlen(cg);
	char *tail;

	if (total_len < suffix_len)
		return;

	tail = cg + (total_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	/* Keep the leading '/' when the suffix is the whole path. */
	if (tail == cg)
		tail++;
	*tail = '\0';
}
473 | ||
580fe4df CB |
/* Argument bundle for a clone()d helper that runs @wrapped. */
struct pid_ns_clone_args {
	int *cpipe;
	int sock;
	pid_t tpid;
	/* pid_from_ns or pid_to_ns */
	int (*wrapped)(int, pid_t);
};
237e200e | 480 | |
29a73c2f CB |
481 | /* |
482 | * Functions needed to setup cgroups in the __constructor__. | |
29a73c2f CB |
483 | */ |
484 | ||
29a73c2f CB |
485 | static bool umount_if_mounted(void) |
486 | { | |
487 | if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) { | |
b8defc3d | 488 | lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno)); |
29a73c2f CB |
489 | return false; |
490 | } | |
491 | return true; | |
492 | } | |
493 | ||
2283e240 CB |
/* The type of statfs.f_type differs between platforms, so derive it.
 * __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true if the filesystem described by @fs has magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
500 | ||
0a4dea41 CB |
/*
 * Report whether "/" is the kernel's initial rootfs by scanning
 * /proc/self/mountinfo for an entry whose mount point is "/" and whose
 * trailer reads "- rootfs rootfs ".
 *
 * Looking at fs/proc_namespace.c, it appears we can actually expect the
 * rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 */
static bool is_on_ramfs(void)
{
	char *line = NULL;
	size_t cap = 0;
	bool found = false;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (!found && getline(&line, &cap, f) != -1) {
		char *field = line;
		char *field_end, *sep;
		int skipped;

		/* Advance to the space that precedes the mount-point field. */
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		field_end = strchr(field + 1, ' ');
		if (!field_end)
			continue;
		*field_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* this is '/'. is it the ramfs? */
		sep = strchr(field_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(line);
	fclose(f);
	return found;
}
543 | ||
cc309f33 | 544 | static int pivot_enter() |
0a4dea41 | 545 | { |
cc309f33 CB |
546 | int ret = -1, oldroot = -1, newroot = -1; |
547 | ||
548 | oldroot = open("/", O_DIRECTORY | O_RDONLY); | |
549 | if (oldroot < 0) { | |
550 | lxcfs_error("%s\n", "Failed to open old root for fchdir."); | |
551 | return ret; | |
552 | } | |
553 | ||
554 | newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY); | |
555 | if (newroot < 0) { | |
556 | lxcfs_error("%s\n", "Failed to open new root for fchdir."); | |
557 | goto err; | |
558 | } | |
559 | ||
560 | /* change into new root fs */ | |
561 | if (fchdir(newroot) < 0) { | |
562 | lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR); | |
563 | goto err; | |
564 | } | |
565 | ||
0a4dea41 CB |
566 | /* pivot_root into our new root fs */ |
567 | if (pivot_root(".", ".") < 0) { | |
568 | lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno)); | |
cc309f33 | 569 | goto err; |
0a4dea41 CB |
570 | } |
571 | ||
572 | /* | |
573 | * At this point the old-root is mounted on top of our new-root. | |
574 | * To unmounted it we must not be chdir'd into it, so escape back | |
575 | * to the old-root. | |
576 | */ | |
577 | if (fchdir(oldroot) < 0) { | |
578 | lxcfs_error("%s\n", "Failed to enter old root."); | |
cc309f33 | 579 | goto err; |
0a4dea41 CB |
580 | } |
581 | ||
582 | if (umount2(".", MNT_DETACH) < 0) { | |
583 | lxcfs_error("%s\n", "Failed to detach old root."); | |
cc309f33 | 584 | goto err; |
0a4dea41 CB |
585 | } |
586 | ||
587 | if (fchdir(newroot) < 0) { | |
588 | lxcfs_error("%s\n", "Failed to re-enter new root."); | |
cc309f33 | 589 | goto err; |
0a4dea41 CB |
590 | } |
591 | ||
cc309f33 CB |
592 | ret = 0; |
593 | ||
594 | err: | |
595 | if (oldroot > 0) | |
596 | close(oldroot); | |
597 | if (newroot > 0) | |
598 | close(newroot); | |
599 | ||
600 | return ret; | |
0a4dea41 CB |
601 | } |
602 | ||
603 | static int chroot_enter() | |
604 | { | |
605 | if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) { | |
606 | lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR); | |
607 | return -1; | |
608 | } | |
609 | ||
610 | if (chroot(".") < 0) { | |
611 | lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); | |
612 | return -1; | |
613 | } | |
614 | ||
615 | if (chdir("/") < 0) { | |
616 | lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); | |
617 | return -1; | |
618 | } | |
619 | ||
620 | return 0; | |
621 | } | |
622 | ||
/*
 * Switch into the prepared root: chroot_enter() when "/" is a ramfs,
 * pivot_enter() otherwise. Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;
	bool on_ramfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
	 * /proc/self/mountinfo via is_on_ramfs(). */
	on_ramfs = has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
645 | ||
646 | /* Prepare our new clean root. */ | |
0232cbac | 647 | static int permute_prepare(void) |
29a73c2f CB |
648 | { |
649 | if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) { | |
b8defc3d | 650 | lxcfs_error("%s\n", "Failed to create directory for new root."); |
29a73c2f CB |
651 | return -1; |
652 | } | |
653 | ||
654 | if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 655 | lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); |
29a73c2f CB |
656 | return -1; |
657 | } | |
658 | ||
659 | if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) { | |
b8defc3d | 660 | lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
661 | return -1; |
662 | } | |
663 | ||
664 | if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) { | |
b8defc3d | 665 | printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno)); |
29a73c2f CB |
666 | return -1; |
667 | } | |
668 | ||
669 | return 0; | |
670 | } | |
671 | ||
0232cbac CB |
/*
 * Build the new root and switch into it. Calls chroot() on ramfs,
 * pivot_root() in all other cases. Returns true only if both the
 * preparation and the switch succeed; the switch is not attempted when
 * preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
685 | ||
0a4dea41 | 686 | static bool cgfs_prepare_mounts(void) |
29a73c2f CB |
687 | { |
688 | if (!mkdir_p(BASEDIR, 0700)) { | |
b8defc3d | 689 | lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); |
29a73c2f CB |
690 | return false; |
691 | } | |
480262c9 | 692 | |
29a73c2f | 693 | if (!umount_if_mounted()) { |
b8defc3d | 694 | lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); |
480262c9 CB |
695 | return false; |
696 | } | |
697 | ||
698 | if (unshare(CLONE_NEWNS) < 0) { | |
b8defc3d | 699 | lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); |
480262c9 CB |
700 | return false; |
701 | } | |
702 | ||
1d81c6a6 | 703 | cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); |
0646f250 | 704 | if (cgroup_ops->mntns_fd < 0) { |
a257a8ee CB |
705 | lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); |
706 | return false; | |
707 | } | |
708 | ||
480262c9 | 709 | if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { |
b8defc3d | 710 | lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); |
29a73c2f CB |
711 | return false; |
712 | } | |
480262c9 | 713 | |
29a73c2f | 714 | if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) { |
b8defc3d | 715 | lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); |
29a73c2f CB |
716 | return false; |
717 | } | |
480262c9 | 718 | |
29a73c2f CB |
719 | return true; |
720 | } | |
721 | ||
0a4dea41 | 722 | static bool cgfs_mount_hierarchies(void) |
29a73c2f | 723 | { |
5fbea8a6 CB |
724 | if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755)) |
725 | return false; | |
51c7ca35 | 726 | |
5fbea8a6 CB |
727 | if (!cgroup_ops->mount(cgroup_ops, BASEDIR)) |
728 | return false; | |
29a73c2f | 729 | |
5fbea8a6 CB |
730 | for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { |
731 | __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL); | |
732 | (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); | |
733 | if ((*h)->fd < 0) | |
29a73c2f | 734 | return false; |
29a73c2f | 735 | } |
5fbea8a6 | 736 | |
29a73c2f CB |
737 | return true; |
738 | } | |
739 | ||
/*
 * Run the three setup stages in order: prepare the private mounts, mount the
 * cgroup hierarchies, then switch root. Stops at the first failure and
 * returns false; only a hierarchy-mount failure is logged here.
 */
static bool cgfs_setup_controllers(void)
{
	bool mounted;

	if (!cgfs_prepare_mounts())
		return false;

	mounted = cgfs_mount_hierarchies();
	if (!mounted) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
755 | ||
2243c5a9 | 756 | static void __attribute__((constructor)) lxcfs_init(void) |
237e200e | 757 | { |
2243c5a9 | 758 | __do_close_prot_errno int init_ns = -EBADF; |
5fbea8a6 | 759 | char *cret; |
e58dab00 | 760 | char cwd[MAXPATHLEN]; |
237e200e | 761 | |
5fbea8a6 CB |
762 | cgroup_ops = cgroup_init(); |
763 | if (!cgroup_ops) | |
2243c5a9 | 764 | log_exit("Failed to initialize cgroup support"); |
237e200e | 765 | |
480262c9 | 766 | /* Preserve initial namespace. */ |
1d81c6a6 | 767 | init_ns = preserve_ns(getpid(), "mnt"); |
2243c5a9 CB |
768 | if (init_ns < 0) |
769 | log_exit("Failed to preserve initial mount namespace"); | |
480262c9 | 770 | |
e58dab00 | 771 | cret = getcwd(cwd, MAXPATHLEN); |
2243c5a9 | 772 | log_exit("%s - Could not retrieve current working directory", strerror(errno)); |
e58dab00 | 773 | |
480262c9 CB |
774 | /* This function calls unshare(CLONE_NEWNS) our initial mount namespace |
775 | * to privately mount lxcfs cgroups. */ | |
2243c5a9 CB |
776 | if (!cgfs_setup_controllers()) |
777 | log_exit("Failed to setup private cgroup mounts for lxcfs"); | |
480262c9 | 778 | |
2243c5a9 CB |
779 | if (setns(init_ns, 0) < 0) |
780 | log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); | |
29a73c2f | 781 | |
e58dab00 | 782 | if (!cret || chdir(cwd) < 0) |
2243c5a9 | 783 | log_exit("%s - Could not change back to original working directory", strerror(errno)); |
e58dab00 | 784 | |
2243c5a9 CB |
785 | if (!init_cpuview()) |
786 | log_exit("Failed to init CPU view"); | |
056adcef | 787 | |
237e200e | 788 | print_subsystems(); |
237e200e SH |
789 | } |
790 | ||
2243c5a9 | 791 | static void __attribute__((destructor)) lxcfs_exit(void) |
237e200e | 792 | { |
0646f250 | 793 | lxcfs_debug("%s\n", "Running destructor for liblxcfs"); |
056adcef | 794 | free_cpuview(); |
2243c5a9 | 795 | cgroup_exit(cgroup_ops); |
1c4b4e38 | 796 | } |