]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Use hash table to store load information
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
/* Buffer length for a 64 bit integer printed in decimal: 2^64 - 1 has 20
 * digits, so 21 leaves room for one extra character (e.g. the trailing NUL). */
43 #define LXCFS_NUMSTRLEN64 21
44
/*
 * Define pivot_root() if missing from the C library.
 *
 * Without HAVE_PIVOT_ROOT a local fallback is compiled: it issues the raw
 * system call when __NR_pivot_root is known, and otherwise fails with errno
 * set to ENOSYS and a return value of -1. With HAVE_PIVOT_ROOT only an
 * extern declaration is emitted.
 */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
59
/*
 * Type tags: cgroup directories and files, followed by one constant per
 * /proc file named in the identifier.
 * NOTE(review): presumably the values stored in file_info.type - confirm
 * against the users of that field.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
71
/*
 * Per-file bookkeeping record.
 * NOTE(review): the code that fills these fields is not part of this section
 * of the file; the notes below only restate names and existing comments.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type; /* presumably one of the LXC_TYPE_* constants - TODO confirm */
	char *buf; // unused as of yet
	int buflen;
	int size; //actual data size
	int cached;
};
82
/* Hash table helpers. */
#define LOAD_SIZE 100 /* number of buckets in the hash table */

/*
 * calc_hash - map a name to a bucket index with the ELF hash algorithm.
 * @name: NUL-terminated string to hash.
 *
 * Returns an index in the range [0, LOAD_SIZE).
 */
static int calc_hash(char *name)
{
	unsigned int h = 0;
	char *p;

	for (p = name; *p != '\0'; p++) {
		unsigned int top;

		h = (h << 4) + *p;
		/* Fold the top nibble back into the low bits, then clear it. */
		top = h & 0xf0000000;
		if (top != 0)
			h ^= top >> 24;
		h &= ~top;
	}

	return (h & 0x7fffffff) % LOAD_SIZE;
}
99
/*
 * One entry of the load hash table. Entries that hash to the same bucket are
 * chained through @next; @pre makes unlinking possible without walking the
 * chain (see insert_node() and del_node()).
 */
struct load_node {
	char *cg; /* cgroup name; lookup key, freed together with the node */
	unsigned long avenrun[3]; /* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next; /* next node in the same bucket, or NULL */
	struct load_node **pre; /* address of the pointer that points at this node */
};
110
/* Head of one hash bucket: the chain of nodes plus the locks guarding it. */
struct load_head {
	/*
	 * This lock covers inserting and refreshing load_nodes. For the first
	 * load_node of each hash bucket, insert and refresh within that bucket
	 * are mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock covers reading the loadavg and deleting a load_node. Within
	 * each hash bucket, read and delete are mutually exclusive, while
	 * concurrent reads are allowed. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock covers reading the loadavg and inserting a load_node. For
	 * the first load_node of each hash bucket, read and insert are mutually
	 * exclusive, while concurrent reads are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first node of the bucket, or NULL when empty */
};
132
133 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
134 /*
135 * init_load initialize the hash table.
136 * Return 0 on success, return -1 on failure.
137 */
138 static int init_load(void)
139 {
140 int i;
141 int ret;
142
143 for (i = 0; i < LOAD_SIZE; i++) {
144 load_hash[i].next = NULL;
145 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
146 if (ret != 0) {
147 lxcfs_error("%s\n", "Failed to initialize lock");
148 goto out3;
149 }
150 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
151 if (ret != 0) {
152 lxcfs_error("%s\n", "Failed to initialize rdlock");
153 goto out2;
154 }
155 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
156 if (ret != 0) {
157 lxcfs_error("%s\n", "Failed to initialize rilock");
158 goto out1;
159 }
160 }
161 return 0;
162 out1:
163 pthread_rwlock_destroy(&load_hash[i].rdlock);
164 out2:
165 pthread_mutex_destroy(&load_hash[i].lock);
166 out3:
167 while (i > 0) {
168 i--;
169 pthread_mutex_destroy(&load_hash[i].lock);
170 pthread_rwlock_destroy(&load_hash[i].rdlock);
171 pthread_rwlock_destroy(&load_hash[i].rilock);
172 }
173 return -1;
174 }
175
176 static void insert_node(struct load_node **n, int locate)
177 {
178 struct load_node *f;
179
180 pthread_mutex_lock(&load_hash[locate].lock);
181 pthread_rwlock_wrlock(&load_hash[locate].rilock);
182 f = load_hash[locate].next;
183 load_hash[locate].next = *n;
184
185 (*n)->pre = &(load_hash[locate].next);
186 if (f)
187 f->pre = &((*n)->next);
188 (*n)->next = f;
189 pthread_mutex_unlock(&load_hash[locate].lock);
190 pthread_rwlock_unlock(&load_hash[locate].rilock);
191 }
192 /*
193 * locate_node() finds special node. Not return NULL means success.
194 * It should be noted that rdlock isn't unlocked at the end of code
195 * because this function is used to read special node. Delete is not
196 * allowed before read has ended.
197 * unlock rdlock only in proc_loadavg_read().
198 */
199 static struct load_node *locate_node(char *cg, int locate)
200 {
201 struct load_node *f = NULL;
202 int i = 0;
203
204 pthread_rwlock_rdlock(&load_hash[locate].rilock);
205 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
206 if (load_hash[locate].next == NULL) {
207 pthread_rwlock_unlock(&load_hash[locate].rilock);
208 return f;
209 }
210 f = load_hash[locate].next;
211 pthread_rwlock_unlock(&load_hash[locate].rilock);
212 while (f && ((i = strcmp(f->cg, cg)) != 0))
213 f = f->next;
214 return f;
215 }
216 /* Delete the load_node n and return the next node of it. */
217 static struct load_node *del_node(struct load_node *n, int locate)
218 {
219 struct load_node *g;
220
221 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
222 if (n->next == NULL) {
223 *(n->pre) = NULL;
224 } else {
225 *(n->pre) = n->next;
226 n->next->pre = n->pre;
227 }
228 g = n->next;
229 free(n->cg);
230 free(n);
231 pthread_rwlock_unlock(&load_hash[locate].rdlock);
232 return g;
233 }
234
/* Reserve buffer size to account for file size changes. */
#define BUF_RESERVE_SIZE 512

/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;          // inode number for /proc/$pid/ns/pid
	pid_t initpid;      // the pid of init in that ns
	long int ctime;     // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // time(NULL) of the last save or successful lookup; drives pruning
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of the store, indexed by HASH(inode number). */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Taken through store_lock() / store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * lock_mutex - lock @l, terminating the process if that fails.
 *
 * A failure is reported through lxcfs_error() with the pthread error code and
 * its strerror() text, followed by exit(1).
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
275
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* Printed as the "mount namespace" by print_subsystems(); -1 until assigned. */
static int cgroup_mount_ns_fd = -1;
296
/*
 * unlock_mutex - unlock @l, terminating the process if that fails.
 *
 * A failure is reported through lxcfs_error() with the pthread error code and
 * its strerror() text, followed by exit(1).
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
306
/*
 * Acquire the mutex protecting the pid-namespace init store
 * (pidns_store_mutex). Exits the process on failure, see lock_mutex().
 */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
311
/*
 * Release the mutex protecting the pid-namespace init store
 * (pidns_store_mutex). Exits the process on failure, see unlock_mutex().
 */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
316
317 /* Must be called under store_lock */
318 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
319 {
320 struct stat initsb;
321 char fnam[100];
322
323 snprintf(fnam, 100, "/proc/%d", e->initpid);
324 if (stat(fnam, &initsb) < 0)
325 return false;
326
327 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
328 initsb.st_ctime, e->initpid);
329
330 if (e->ctime != initsb.st_ctime)
331 return false;
332 return true;
333 }
334
335 /* Must be called under store_lock */
336 static void remove_initpid(struct pidns_init_store *e)
337 {
338 struct pidns_init_store *tmp;
339 int h;
340
341 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
342
343 h = HASH(e->ino);
344 if (pidns_hash_table[h] == e) {
345 pidns_hash_table[h] = e->next;
346 free(e);
347 return;
348 }
349
350 tmp = pidns_hash_table[h];
351 while (tmp) {
352 if (tmp->next == e) {
353 tmp->next = e->next;
354 free(e);
355 return;
356 }
357 tmp = tmp->next;
358 }
359 }
360
361 #define PURGE_SECS 5
362 /* Must be called under store_lock */
363 static void prune_initpid_store(void)
364 {
365 static long int last_prune = 0;
366 struct pidns_init_store *e, *prev, *delme;
367 long int now, threshold;
368 int i;
369
370 if (!last_prune) {
371 last_prune = time(NULL);
372 return;
373 }
374 now = time(NULL);
375 if (now < last_prune + PURGE_SECS)
376 return;
377
378 lxcfs_debug("%s\n", "Pruning.");
379
380 last_prune = now;
381 threshold = now - 2 * PURGE_SECS;
382
383 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
384 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
385 if (e->lastcheck < threshold) {
386
387 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
388
389 delme = e;
390 if (prev)
391 prev->next = e->next;
392 else
393 pidns_hash_table[i] = e->next;
394 e = e->next;
395 free(delme);
396 } else {
397 prev = e;
398 e = e->next;
399 }
400 }
401 }
402 }
403
404 /* Must be called under store_lock */
405 static void save_initpid(struct stat *sb, pid_t pid)
406 {
407 struct pidns_init_store *e;
408 char fpath[100];
409 struct stat procsb;
410 int h;
411
412 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
413
414 snprintf(fpath, 100, "/proc/%d", pid);
415 if (stat(fpath, &procsb) < 0)
416 return;
417 do {
418 e = malloc(sizeof(*e));
419 } while (!e);
420 e->ino = sb->st_ino;
421 e->initpid = pid;
422 e->ctime = procsb.st_ctime;
423 h = HASH(e->ino);
424 e->next = pidns_hash_table[h];
425 e->lastcheck = time(NULL);
426 pidns_hash_table[h] = e;
427 }
428
429 /*
430 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
431 * entry for the inode number and creation time. Verify that the init pid
432 * is still valid. If not, remove it. Return the entry if valid, NULL
433 * otherwise.
434 * Must be called under store_lock
435 */
436 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
437 {
438 int h = HASH(sb->st_ino);
439 struct pidns_init_store *e = pidns_hash_table[h];
440
441 while (e) {
442 if (e->ino == sb->st_ino) {
443 if (initpid_still_valid(e, sb)) {
444 e->lastcheck = time(NULL);
445 return e;
446 }
447 remove_initpid(e);
448 return NULL;
449 }
450 e = e->next;
451 }
452
453 return NULL;
454 }
455
/*
 * is_dir - test whether @path, resolved relative to directory fd @fd, is a
 * directory.
 * @path: path to examine (an absolute path ignores @fd).
 * @fd:   directory file descriptor that relative paths are resolved against.
 *
 * Returns 1 if the path could be stat'ed and is a directory, 0 otherwise
 * (including on any fstatat() error).
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;

	/*
	 * The last argument of fstatat() is a flags bitmask, not a descriptor.
	 * Passing the fd there made the call fail with EINVAL for almost every
	 * descriptor value; 0 requests the default behaviour.
	 */
	if (fstatat(fd, path, &statbuf, 0) != 0)
		return 0;

	return S_ISDIR(statbuf.st_mode) ? 1 : 0;
}
464
/*
 * must_copy_string - duplicate @str, retrying until the allocation succeeds.
 *
 * Returns NULL when @str is NULL, otherwise a heap copy that the caller must
 * free().
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	while ((copy = strdup(str)) == NULL)
		;

	return copy;
}
476
/*
 * drop_trailing_newlines - strip every '\n' at the end of @s, in place.
 *
 * Each trailing newline is overwritten with a NUL byte; other characters are
 * left untouched.
 */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		end--;
		s[end] = '\0';
	}
}
484
#define BATCH_SIZE 50
/*
 * dorealloc - make sure *mem can hold @newlen bytes, growing in batches.
 * @mem:    address of the buffer pointer (may point to NULL).
 * @oldlen: length the buffer was last sized for.
 * @newlen: length now required.
 *
 * Sizes are rounded up to whole batches of BATCH_SIZE bytes. The buffer is
 * only reallocated when it does not exist yet or when @newlen needs more
 * batches than @oldlen did. The reallocation is retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int need = (newlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem != NULL && need <= have)
		return;

	while ((grown = realloc(*mem, need * BATCH_SIZE)) == NULL)
		;

	*mem = grown;
}
/*
 * append_line - append @line to the growing buffer *contents.
 * @contents: address of the buffer pointer (grown through dorealloc()).
 * @len:      in/out, number of bytes in the buffer, not counting the NUL.
 * @line:     text to append; linelen + 1 bytes are copied from it.
 * @linelen:  length of @line without its terminating NUL.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	/* Copy the text plus the byte after it, so the result stays terminated. */
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
506
/*
 * slurp_file - read everything from @fd into a freshly allocated string.
 * @from: not used by this function.
 * @fd:   open file descriptor; always consumed (closed) by this function.
 *
 * The content is read line by line and concatenated; trailing newlines are
 * removed from the result. Returns the heap string (caller frees), or NULL
 * when the stream could not be set up or nothing was read.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	size_t linecap = 0, fulllen = 0;
	ssize_t linelen;
	FILE *f;

	f = fdopen(fd, "r");
	if (!f) {
		/*
		 * The stream never took ownership of fd and callers do not close
		 * it themselves, so release it here to avoid leaking it.
		 */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &linecap, f)) != -1)
		append_line(&contents, &fulllen, line, linelen);
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
528
/*
 * write_string - write @string to the file behind @fd.
 * @fnam:   not used by this function.
 * @string: NUL-terminated text to write.
 * @fd:     open file descriptor; always consumed (closed) by this function.
 *
 * Returns true when the whole string was written and the stream closed
 * without error, false otherwise.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/*
		 * The stream never took ownership of fd and callers do not close
		 * it themselves, so release it here to avoid leaking it.
		 */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}

	/* fclose() flushes, so a write error may only show up here. */
	if (fclose(f) < 0) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		return false;
	}

	return true;
}
549
/*
 * Name and ownership/mode information of one cgroup entry, as filled in by
 * cgfs_get_key() from an fstatat() result. Released with free_key().
 */
struct cgfs_files {
	char *name; /* heap-allocated entry name, freed by free_key() */
	uint32_t uid, gid; /* st_uid / st_gid of the entry */
	uint32_t mode; /* st_mode of the entry */
};
555
556 #define ALLOC_NUM 20
557 static bool store_hierarchy(char *stridx, char *h)
558 {
559 if (num_hierarchies % ALLOC_NUM == 0) {
560 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
561 n *= ALLOC_NUM;
562 char **tmp = realloc(hierarchies, n * sizeof(char *));
563 if (!tmp) {
564 lxcfs_error("%s\n", strerror(errno));
565 exit(1);
566 }
567 hierarchies = tmp;
568 }
569
570 hierarchies[num_hierarchies++] = must_copy_string(h);
571 return true;
572 }
573
574 static void print_subsystems(void)
575 {
576 int i;
577
578 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
579 fprintf(stderr, "hierarchies:\n");
580 for (i = 0; i < num_hierarchies; i++) {
581 if (hierarchies[i])
582 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
583 fd_hierarchies[i], hierarchies[i]);
584 }
585 }
586
/*
 * in_comma_list - test whether @needle is one element of @haystack.
 * @needle:   element to look for.
 * @haystack: comma-separated list, e.g. "cpu,cpuacct".
 *
 * Elements must match completely; a prefix of an element does not count.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	size_t nlen = strlen(needle);

	/* Check every element that is followed by a comma. */
	while (*cur != '\0') {
		const char *comma = strchr(cur, ',');

		if (comma == NULL)
			break;

		if ((size_t)(comma - cur) == nlen && strncmp(needle, cur, nlen) == 0)
			return true;

		cur = comma + 1;
	}

	/* What remains is the last element (possibly empty). */
	return strcmp(needle, cur) == 0;
}
605
606 /* do we need to do any massaging here? I'm not sure... */
607 /* Return the mounted controller and store the corresponding open file descriptor
608 * referring to the controller mountpoint in the private lxcfs namespace in
609 * @cfd.
610 */
611 static char *find_mounted_controller(const char *controller, int *cfd)
612 {
613 int i;
614
615 for (i = 0; i < num_hierarchies; i++) {
616 if (!hierarchies[i])
617 continue;
618 if (strcmp(hierarchies[i], controller) == 0) {
619 *cfd = fd_hierarchies[i];
620 return hierarchies[i];
621 }
622 if (in_comma_list(controller, hierarchies[i])) {
623 *cfd = fd_hierarchies[i];
624 return hierarchies[i];
625 }
626 }
627
628 return NULL;
629 }
630
/*
 * cgfs_set_value - write @value into @file of @cgroup under @controller.
 *
 * Returns true when the file could be opened for writing and the whole value
 * was written, false otherwise (including an unknown controller).
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	const char *dot;
	char *path;
	size_t pathlen;
	int cfd, fd, n;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	dot = (*cgroup == '/') ? "." : "";
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", dot, cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
657
/*
 * chown_all_cgroup_files - chown every entry directly inside a cgroup
 * directory. We do this when we create a cgroup on behalf of a user.
 * @dirname: directory path, relative to @fd.
 * @uid:     new owner.
 * @gid:     new group.
 * @fd:      directory file descriptor that @dirname is resolved against.
 *
 * Best effort: entries that cannot be chowned are logged and skipped.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	DIR *d;
	int fd1, ret;

	if (strlen(dirname) >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* No stream owns fd1 yet, so it must be closed by hand. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;

		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}

		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}

	/* closedir() also closes fd1. */
	closedir(d);
}
697
/*
 * cgfs_create - create cgroup @cg under @controller and hand it to uid:gid.
 *
 * The directory is created with mode 0755. Unless both @uid and @gid are 0,
 * the directory and the files inside it are chowned to @uid:@gid.
 *
 * Returns 0 on success, -EINVAL for an unknown controller, or -errno when
 * creating or chowning the directory fails.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	char *relpath;
	size_t relpath_len;
	int cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	relpath_len = strlen(cg) + 2;
	relpath = alloca(relpath_len);
	snprintf(relpath, relpath_len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, relpath, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, relpath, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(relpath, uid, gid, cfd);
	}

	return 0;
}
728
/*
 * recursive_rmdir - remove directory @dirname and the directories below it.
 * @dirname: path of the directory, relative to @cfd.
 * @fd:      open descriptor of @dirname itself; it is only duplicated here
 *           and stays owned by the caller.
 * @cfd:     directory descriptor that all paths are resolved against.
 *
 * Only sub-directories are descended into and removed; other entries are
 * left alone. Returns true when the directory stream was closed cleanly and
 * @dirname itself was removed, false otherwise.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, childfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}

		if (!S_ISDIR(mystat.st_mode))
			continue;

		/*
		 * Descend with a descriptor of the child itself. Reusing the
		 * parent's fd would make the recursion enumerate the parent
		 * again instead of the child's own entries.
		 */
		childfd = openat(cfd, pathname, O_DIRECTORY);
		if (childfd < 0) {
			lxcfs_debug("Failed to open %s: %s.\n", pathname, strerror(errno));
			continue;
		}

		if (!recursive_rmdir(pathname, childfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
		close(childfd);
	}

	ret = true;
	/*
	 * closedir() closes dupfd as well. It must not be closed a second time:
	 * the number could already belong to a descriptor opened elsewhere.
	 */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
787
/*
 * cgfs_remove - remove cgroup @cg (and the directories below it) under
 * @controller.
 *
 * Returns the result of recursive_rmdir(), or false when the controller is
 * unknown or the cgroup directory cannot be opened.
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	char *relpath;
	size_t relpath_len;
	int cfd, dirfd;
	bool removed;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	relpath_len = strlen(cg) + 2;
	relpath = alloca(relpath_len);
	snprintf(relpath, relpath_len, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd = openat(cfd, relpath, O_DIRECTORY);
	if (dirfd < 0)
		return false;

	removed = recursive_rmdir(relpath, dirfd, cfd);
	close(dirfd);

	return removed;
}
814
/*
 * cgfs_chmod_file - change the mode of @file under @controller to @mode.
 *
 * Returns true on success, false when the controller is unknown or
 * fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	char *relpath;
	size_t relpath_len;
	int cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	relpath_len = strlen(file) + 2;
	relpath = alloca(relpath_len);
	snprintf(relpath, relpath_len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) >= 0;
}
835
/*
 * chown_tasks_files - chown the "tasks" and "cgroup.procs" files of a cgroup.
 * @dirname: cgroup directory, relative to @fd.
 * @uid:     new owner.
 * @gid:     new group.
 * @fd:      directory descriptor that paths are resolved against.
 *
 * Returns 0 when both files were chowned, or -errno of the first failure
 * ("tasks" is tried first; "cgroup.procs" is skipped if it fails).
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two names, so it fits both. */
	size_t pathlen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *path = alloca(pathlen);

	snprintf(path, pathlen, "%s/tasks", dirname);
	if (fchownat(fd, path, uid, gid, 0) != 0)
		return -errno;

	snprintf(path, pathlen, "%s/cgroup.procs", dirname);
	if (fchownat(fd, path, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
851
/*
 * cgfs_chown_file - chown @file under @controller to @uid:@gid.
 *
 * When @file is a directory, its tasks files are chowned as well.
 *
 * Returns 0 on success, -EINVAL for an unknown controller, or -errno of the
 * failing chown.
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	char *relpath;
	size_t relpath_len;
	int cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	relpath_len = strlen(file) + 2;
	relpath = alloca(relpath_len);
	snprintf(relpath, relpath_len, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, relpath, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(relpath, cfd))
		return 0;

	// like cgmanager did, we want to chown the tasks file as well
	return chown_tasks_files(relpath, uid, gid, cfd);
}
877
/*
 * open_pids_file - open the cgroup.procs file of @cgroup for writing.
 *
 * Returns a write-only stdio stream that the caller must fclose(), or NULL
 * when the controller is unknown or the file cannot be opened.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		close(fd); /* no stream owns fd, so release it here */

	return f;
}
901
902 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
903 void ***list, size_t typesize,
904 void* (*iterator)(const char*, const char*, const char*))
905 {
906 int cfd, fd, ret;
907 size_t len;
908 char *cg, *tmpc;
909 char pathname[MAXPATHLEN];
910 size_t sz = 0, asz = 0;
911 struct dirent *dirent;
912 DIR *dir;
913
914 tmpc = find_mounted_controller(controller, &cfd);
915 *list = NULL;
916 if (!tmpc)
917 return false;
918
919 /* Make sure we pass a relative path to *at() family of functions. */
920 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
921 cg = alloca(len);
922 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
923 if (ret < 0 || (size_t)ret >= len) {
924 lxcfs_error("Pathname too long under %s\n", cgroup);
925 return false;
926 }
927
928 fd = openat(cfd, cg, O_DIRECTORY);
929 if (fd < 0)
930 return false;
931
932 dir = fdopendir(fd);
933 if (!dir)
934 return false;
935
936 while ((dirent = readdir(dir))) {
937 struct stat mystat;
938
939 if (!strcmp(dirent->d_name, ".") ||
940 !strcmp(dirent->d_name, ".."))
941 continue;
942
943 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
944 if (ret < 0 || ret >= MAXPATHLEN) {
945 lxcfs_error("Pathname too long under %s\n", cg);
946 continue;
947 }
948
949 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
950 if (ret) {
951 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
952 continue;
953 }
954 if ((!directories && !S_ISREG(mystat.st_mode)) ||
955 (directories && !S_ISDIR(mystat.st_mode)))
956 continue;
957
958 if (sz+2 >= asz) {
959 void **tmp;
960 asz += BATCH_SIZE;
961 do {
962 tmp = realloc(*list, asz * typesize);
963 } while (!tmp);
964 *list = tmp;
965 }
966 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
967 (*list)[sz+1] = NULL;
968 sz++;
969 }
970 if (closedir(dir) < 0) {
971 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
972 return false;
973 }
974 return true;
975 }
976
/*
 * make_children_list_entry - iterator callback producing a copy of the entry
 * name.
 * @controller: not used.
 * @cgroup:     not used.
 * @dir_entry:  name to duplicate.
 *
 * The duplication is retried until it succeeds; the caller owns the result.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while ((copy = strdup(dir_entry)) == NULL)
		;

	return copy;
}
985
/*
 * List the sub-directories of @cgroup under @controller.
 *
 * Thin wrapper around cgfs_iterate_cgroup() in directory mode, using
 * make_children_list_entry() to copy each directory name. On return *list is
 * either NULL or a NULL-terminated array of heap-allocated names.
 * Returns what cgfs_iterate_cgroup() returns.
 */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
990
991 void free_key(struct cgfs_files *k)
992 {
993 if (!k)
994 return;
995 free(k->name);
996 free(k);
997 }
998
/*
 * free_keys - release a NULL-terminated array of keys and the array itself.
 * A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (keys == NULL)
		return;

	for (cur = keys; *cur != NULL; cur++)
		free_key(*cur);

	free(keys);
}
1010
/*
 * cgfs_get_value - read @file of @cgroup under @controller into *value.
 * @value: out, receives a heap-allocated string with trailing newlines
 *         removed (caller frees); only written once the file was opened.
 *
 * Returns true when content was read, false otherwise (unknown controller,
 * file cannot be opened, or nothing could be read).
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	const char *dot;
	char *path;
	size_t pathlen;
	int cfd, fd, n;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	dot = (*cgroup == '/') ? "." : "";
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", dot, cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(path, fd);
	return *value != NULL;
}
1037
1038 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1039 {
1040 int ret, cfd;
1041 size_t len;
1042 char *fnam, *tmpc;
1043 struct stat sb;
1044 struct cgfs_files *newkey;
1045
1046 tmpc = find_mounted_controller(controller, &cfd);
1047 if (!tmpc)
1048 return false;
1049
1050 if (file && *file == '/')
1051 file++;
1052
1053 if (file && strchr(file, '/'))
1054 return NULL;
1055
1056 /* Make sure we pass a relative path to *at() family of functions.
1057 * . + /cgroup + / + file + \0
1058 */
1059 len = strlen(cgroup) + 3;
1060 if (file)
1061 len += strlen(file) + 1;
1062 fnam = alloca(len);
1063 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1064 file ? "/" : "", file ? file : "");
1065
1066 ret = fstatat(cfd, fnam, &sb, 0);
1067 if (ret < 0)
1068 return NULL;
1069
1070 do {
1071 newkey = malloc(sizeof(struct cgfs_files));
1072 } while (!newkey);
1073 if (file)
1074 newkey->name = must_copy_string(file);
1075 else if (strrchr(cgroup, '/'))
1076 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1077 else
1078 newkey->name = must_copy_string(cgroup);
1079 newkey->uid = sb.st_uid;
1080 newkey->gid = sb.st_gid;
1081 newkey->mode = sb.st_mode;
1082
1083 return newkey;
1084 }
1085
/*
 * make_key_list_entry - iterator callback producing the key for one entry.
 *
 * Returns the result of cgfs_get_key(); a failure is logged and NULL is
 * returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key != NULL)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		    cgroup);
	return NULL;
}
1095
/*
 * List the regular files of @cgroup under @controller as keys.
 *
 * Thin wrapper around cgfs_iterate_cgroup() in file mode, using
 * make_key_list_entry() to build a struct cgfs_files for each file. On return
 * *keys is either NULL or a NULL-terminated array (see free_keys()).
 * Returns what cgfs_iterate_cgroup() returns.
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1100
/*
 * is_child_cgroup - test whether @f is a directory directly inside @cgroup
 * under @controller.
 *
 * Returns false when the controller is unknown, the entry cannot be stat'ed,
 * or it is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	struct stat sb;
	const char *dot;
	char *path;
	size_t pathlen;
	int cfd, n;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	dot = (*cgroup == '/') ? "." : "";
	pathlen = strlen(cgroup) + strlen(f) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", dot, cgroup, f);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1128
/*
 * Result codes of send_creds(). Only SEND_CREDS_OK is checked in this part
 * of the file.
 * NOTE(review): NOTSK presumably means "no such task" - confirm at the
 * definition of send_creds().
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations of helpers that are used before they are defined. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1136
1137 /*
1138 * clone a task which switches to @task's namespace and writes '1'.
1139 * over a unix sock so we can read the task's reaper's pid in our
1140 * namespace
1141 *
1142 * Note: glibc's fork() does not respect pidns, which can lead to failed
1143 * assertions inside glibc (and thus failed forks) if the child's pid in
1144 * the pidns and the parent pid outside are identical. Using clone prevents
1145 * this issue.
1146 */
1147 static void write_task_init_pid_exit(int sock, pid_t target)
1148 {
1149 char fnam[100];
1150 pid_t pid;
1151 int fd, ret;
1152 size_t stack_size = sysconf(_SC_PAGESIZE);
1153 void *stack = alloca(stack_size);
1154
1155 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1156 if (ret < 0 || ret >= sizeof(fnam))
1157 _exit(1);
1158
1159 fd = open(fnam, O_RDONLY);
1160 if (fd < 0) {
1161 perror("write_task_init_pid_exit open of ns/pid");
1162 _exit(1);
1163 }
1164 if (setns(fd, 0)) {
1165 perror("write_task_init_pid_exit setns 1");
1166 close(fd);
1167 _exit(1);
1168 }
1169 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1170 if (pid < 0)
1171 _exit(1);
1172 if (pid != 0) {
1173 if (!wait_for_pid(pid))
1174 _exit(1);
1175 _exit(0);
1176 }
1177 }
1178
1179 static int send_creds_clone_wrapper(void *arg) {
1180 struct ucred cred;
1181 char v;
1182 int sock = *(int *)arg;
1183
1184 /* we are the child */
1185 cred.uid = 0;
1186 cred.gid = 0;
1187 cred.pid = 1;
1188 v = '1';
1189 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1190 return 1;
1191 return 0;
1192 }
1193
/*
 * get_init_pid_for_task - find, in our pid namespace, the init pid of the
 * namespace @task lives in.
 *
 * Forks a helper that runs write_task_init_pid_exit() on one end of a
 * datagram socket pair, then reads the credentials it sends on the other end.
 *
 * Returns the received pid, or -1 when the socket pair or fork fails or no
 * credentials are received.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred;
	int sock[2];
	pid_t child;
	pid_t initpid = -1;
	char v = '0';

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	child = fork();
	if (child == 0) {
		/* Helper process: never returns from this branch. */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (child > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (child > 0)
		wait_for_pid(child);

	return initpid;
}
1227
1228 static pid_t lookup_initpid_in_store(pid_t qpid)
1229 {
1230 pid_t answer = 0;
1231 struct stat sb;
1232 struct pidns_init_store *e;
1233 char fnam[100];
1234
1235 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1236 store_lock();
1237 if (stat(fnam, &sb) < 0)
1238 goto out;
1239 e = lookup_verify_initpid(&sb);
1240 if (e) {
1241 answer = e->initpid;
1242 goto out;
1243 }
1244 answer = get_init_pid_for_task(qpid);
1245 if (answer > 0)
1246 save_initpid(&sb, answer);
1247
1248 out:
1249 /* we prune at end in case we are returning
1250 * the value we were about to return */
1251 prune_initpid_store();
1252 store_unlock();
1253 return answer;
1254 }
1255
/*
 * Reap child @pid, retrying when waitpid() is interrupted.
 * Returns 0 only if the child exited normally with status 0; returns -1
 * for pid <= 0, for a waitpid() error other than EINTR, and for any other
 * kind of termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1276
1277
1278 /*
1279 * append pid to *src.
1280 * src: a pointer to a char* in which ot append the pid.
1281 * sz: the number of characters printed so far, minus trailing \0.
1282 * asz: the allocated size so far
1283 * pid: the pid to append
1284 */
1285 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1286 {
1287 char tmp[30];
1288
1289 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1290
1291 if (!*src || tmplen + *sz + 1 >= *asz) {
1292 char *tmp;
1293 do {
1294 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1295 } while (!tmp);
1296 *src = tmp;
1297 *asz += BUF_RESERVE_SIZE;
1298 }
1299 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1300 *sz += tmplen;
1301 }
1302
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id as mapped into pid's namespace.
 * The file is rewound first; lines that do not parse as three unsigned
 * numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned) if no range contains in_id
 * or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		unsigned int ns_base,	/* first id of the range in the idfile's namespace */
			     host_base,	/* first id of the range in the caller's namespace */
			     range;	/* number of ids in the range */

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * ids wrapped around - unexpected as this is a
			 * procfile, so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		/* skip ranges that do not contain in_id */
		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range, and neither range
		 * end wraps, so ns_base + (in_id - host_base) is below
		 * ns_base + range and cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	/* no answer found */
	return -1;
}
1346
/*
 * For is_privileged_over(): whether the calling uid is required to be
 * root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether @uid, seen through the uid_map of process @pid, has
 * privilege over @victim.
 * Returns false if either id is -1. Returns true straight away when
 * @req_ns_root is false and the two ids are equal. Otherwise @uid must map
 * to 0 in /proc/<pid>/uid_map and @victim must be mapped there at all.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	FILE *f;
	bool answer;
	int ret;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request does not require root in the namespace, having
	 * the same uid suffices (i.e. uid 1000 has write access to files
	 * owned by uid 1000).
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	f = fopen(fpath, "r");
	if (!f)
		return false;

	/*
	 * Reject unless the caller is root in its namespace, and unless the
	 * victim is mapped into the caller's namespace.
	 * XXX not sure the second check is needed given that fuse will be
	 * sending requests where the vfs has converted.
	 */
	answer = convert_id_to_ns(f, uid) == 0 &&
		 convert_id_to_ns(f, victim) != -1;

	fclose(f);
	return answer;
}
1402
/*
 * Check whether the "other" permission bits of @fmode grant what the
 * access mode in @req_mode (O_RDONLY, O_WRONLY or O_RDWR) asks for.
 * Callers shift the mode so the bits of interest sit in the "other"
 * position. Returns false for any other access-mode value.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t need;
	int acc = req_mode & O_ACCMODE;

	if (acc == O_RDONLY)
		need = S_IROTH;
	else if (acc == O_WRONLY)
		need = S_IWOTH;
	else if (acc == O_RDWR)
		need = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & need) == need;
}
1422
1423
/*
 * Return the path component of @taskcg that follows @querycg.
 * e.g. taskcg is a/b/c/d/e and querycg is a/b/c: we return "d".
 * When querycg is "/" or "./", one leading character of taskcg is skipped
 * and its first component is returned.
 * taskcg must be strictly longer than querycg; it is not checked that
 * querycg really is a prefix of taskcg.
 * Returns a newly allocated string, or NULL on bad input or allocation
 * failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t qlen = strlen(querycg);
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= qlen) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = qlen + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* keep only the first component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1449
/* Remove a single trailing '\n' from @x, if there is one. */
static void stripnewline(char *x)
{
	char *end = x + strlen(x);

	if (end != x && end[-1] == '\n')
		end[-1] = '\0';
}
1456
1457 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1458 {
1459 int cfd;
1460 char fnam[PROCLEN];
1461 FILE *f;
1462 char *answer = NULL;
1463 char *line = NULL;
1464 size_t len = 0;
1465 int ret;
1466 const char *h = find_mounted_controller(contrl, &cfd);
1467 if (!h)
1468 return NULL;
1469
1470 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1471 if (ret < 0 || ret >= PROCLEN)
1472 return NULL;
1473 if (!(f = fopen(fnam, "r")))
1474 return NULL;
1475
1476 while (getline(&line, &len, f) != -1) {
1477 char *c1, *c2;
1478 if (!line[0])
1479 continue;
1480 c1 = strchr(line, ':');
1481 if (!c1)
1482 goto out;
1483 c1++;
1484 c2 = strchr(c1, ':');
1485 if (!c2)
1486 goto out;
1487 *c2 = '\0';
1488 if (strcmp(c1, h) != 0)
1489 continue;
1490 c2++;
1491 stripnewline(c2);
1492 do {
1493 answer = strdup(c2);
1494 } while (!answer);
1495 break;
1496 }
1497
1498 out:
1499 fclose(f);
1500 free(line);
1501 return answer;
1502 }
1503
1504 /*
1505 * check whether a fuse context may access a cgroup dir or file
1506 *
1507 * If file is not null, it is a cgroup file to check under cg.
1508 * If file is null, then we are checking perms on cg itself.
1509 *
1510 * For files we can check the mode of the list_keys result.
1511 * For cgroups, we must make assumptions based on the files under the
1512 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1513 * yet.
1514 */
1515 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1516 {
1517 struct cgfs_files *k = NULL;
1518 bool ret = false;
1519
1520 k = cgfs_get_key(contrl, cg, file);
1521 if (!k)
1522 return false;
1523
1524 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1525 if (perms_include(k->mode >> 6, mode)) {
1526 ret = true;
1527 goto out;
1528 }
1529 }
1530 if (fc->gid == k->gid) {
1531 if (perms_include(k->mode >> 3, mode)) {
1532 ret = true;
1533 goto out;
1534 }
1535 }
1536 ret = perms_include(k->mode, mode);
1537
1538 out:
1539 free_key(k);
1540 return ret;
1541 }
1542
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place.
 * If that suffix is the whole string, the result is "/".
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + (cg_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		tail[1] = '\0';	/* keep the leading '/' */
	else
		tail[0] = '\0';
}
1560
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * If the answer is false and nextcg is not NULL, *nextcg is set to the
 * result of get_next_cgroup_dir() - the next cgroup directory under cg on
 * the way to the caller's cgroup - which the caller must free.
 * Returns false as well when the cgroup of pid cannot be determined; in
 * that case *nextcg is not touched.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	const char *cmp;
	bool inside;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise they pass in a cgroup without leading '/'. In the
	 * latter case the first character of the task's cgroup is skipped
	 * before comparing.
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 *       Serge, do you know?
	 */
	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = taskcg;
	else
		cmp = taskcg + 1;

	inside = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!inside && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return inside;
}
1603
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 * The root ("/" or "./") is always visible. Otherwise @cg is visible when
 * the task is in the root cgroup, when @cg equals the task's cgroup, or
 * when one of the two is a parent directory of the other.
 * Returns false if the cgroup of pid cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *raw;
	const char *task_cg;
	size_t target_len, task_len;
	bool visible;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	raw = get_pid_cgroup(pid, contrl);
	if (!raw)
		return false;
	prune_init_slice(raw);

	/* compare without the first character of the task's cgroup */
	task_cg = raw + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg, it can see everything. The prefix
		 * tests below cannot cover this, since they look for a '/'
		 * separator and the only one was chopped off above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		visible = false;
	}

	free(raw);
	return visible;
}
1654
1655 /*
1656 * given /cgroup/freezer/a/b, return "freezer".
1657 * the returned char* should NOT be freed.
1658 */
1659 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1660 {
1661 const char *p1;
1662 char *contr, *slash;
1663
1664 if (strlen(path) < 9) {
1665 errno = EACCES;
1666 return NULL;
1667 }
1668 if (*(path + 7) != '/') {
1669 errno = EINVAL;
1670 return NULL;
1671 }
1672 p1 = path + 8;
1673 contr = strdupa(p1);
1674 if (!contr) {
1675 errno = ENOMEM;
1676 return NULL;
1677 }
1678 slash = strstr(contr, "/");
1679 if (slash)
1680 *slash = '\0';
1681
1682 int i;
1683 for (i = 0; i < num_hierarchies; i++) {
1684 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1685 return hierarchies[i];
1686 }
1687 errno = ENOENT;
1688 return NULL;
1689 }
1690
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 * The result points into @path, just past the first '/' found at or after
 * offset 8, and errno is set to 0.
 * Returns NULL with errno EACCES if the path is shorter than 9 characters,
 * or with errno EINVAL if there is no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strchr(path + 8, '/');
	if (slash == NULL) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1711
/*
 * Split the last path element from the path in @cg.
 * @dir receives a newly allocated copy of @cg cut at its last '/' (the
 * whole of @cg if it has none) and must be freed by the caller.
 * @last is set to point at the last '/' inside @cg itself - so it includes
 * that '/' - or to NULL if @cg contains no '/'; it must not be freed.
 * Allocation is retried until it succeeds.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;
	char *sep;

	do {
		copy = strdup(cg);
	} while (!copy);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last == NULL)
		return;

	/* the last '/' sits at the same offset in the copy */
	sep = copy + (*last - cg);
	*sep = '\0';
}
1731
1732 /*
1733 * FUSE ops for /cgroup
1734 */
1735
1736 int cg_getattr(const char *path, struct stat *sb)
1737 {
1738 struct timespec now;
1739 struct fuse_context *fc = fuse_get_context();
1740 char * cgdir = NULL;
1741 char *last = NULL, *path1, *path2;
1742 struct cgfs_files *k = NULL;
1743 const char *cgroup;
1744 const char *controller = NULL;
1745 int ret = -ENOENT;
1746
1747
1748 if (!fc)
1749 return -EIO;
1750
1751 memset(sb, 0, sizeof(struct stat));
1752
1753 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1754 return -EINVAL;
1755
1756 sb->st_uid = sb->st_gid = 0;
1757 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1758 sb->st_size = 0;
1759
1760 if (strcmp(path, "/cgroup") == 0) {
1761 sb->st_mode = S_IFDIR | 00755;
1762 sb->st_nlink = 2;
1763 return 0;
1764 }
1765
1766 controller = pick_controller_from_path(fc, path);
1767 if (!controller)
1768 return -errno;
1769 cgroup = find_cgroup_in_path(path);
1770 if (!cgroup) {
1771 /* this is just /cgroup/controller, return it as a dir */
1772 sb->st_mode = S_IFDIR | 00755;
1773 sb->st_nlink = 2;
1774 return 0;
1775 }
1776
1777 get_cgdir_and_path(cgroup, &cgdir, &last);
1778
1779 if (!last) {
1780 path1 = "/";
1781 path2 = cgdir;
1782 } else {
1783 path1 = cgdir;
1784 path2 = last;
1785 }
1786
1787 pid_t initpid = lookup_initpid_in_store(fc->pid);
1788 if (initpid <= 0)
1789 initpid = fc->pid;
1790 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1791 * Then check that caller's cgroup is under path if last is a child
1792 * cgroup, or cgdir if last is a file */
1793
1794 if (is_child_cgroup(controller, path1, path2)) {
1795 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1796 ret = -ENOENT;
1797 goto out;
1798 }
1799 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1800 /* this is just /cgroup/controller, return it as a dir */
1801 sb->st_mode = S_IFDIR | 00555;
1802 sb->st_nlink = 2;
1803 ret = 0;
1804 goto out;
1805 }
1806 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1807 ret = -EACCES;
1808 goto out;
1809 }
1810
1811 // get uid, gid, from '/tasks' file and make up a mode
1812 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1813 sb->st_mode = S_IFDIR | 00755;
1814 k = cgfs_get_key(controller, cgroup, NULL);
1815 if (!k) {
1816 sb->st_uid = sb->st_gid = 0;
1817 } else {
1818 sb->st_uid = k->uid;
1819 sb->st_gid = k->gid;
1820 }
1821 free_key(k);
1822 sb->st_nlink = 2;
1823 ret = 0;
1824 goto out;
1825 }
1826
1827 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1828 sb->st_mode = S_IFREG | k->mode;
1829 sb->st_nlink = 1;
1830 sb->st_uid = k->uid;
1831 sb->st_gid = k->gid;
1832 sb->st_size = 0;
1833 free_key(k);
1834 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1835 ret = -ENOENT;
1836 goto out;
1837 }
1838 ret = 0;
1839 }
1840
1841 out:
1842 free(cgdir);
1843 return ret;
1844 }
1845
1846 int cg_opendir(const char *path, struct fuse_file_info *fi)
1847 {
1848 struct fuse_context *fc = fuse_get_context();
1849 const char *cgroup;
1850 struct file_info *dir_info;
1851 char *controller = NULL;
1852
1853 if (!fc)
1854 return -EIO;
1855
1856 if (strcmp(path, "/cgroup") == 0) {
1857 cgroup = NULL;
1858 controller = NULL;
1859 } else {
1860 // return list of keys for the controller, and list of child cgroups
1861 controller = pick_controller_from_path(fc, path);
1862 if (!controller)
1863 return -errno;
1864
1865 cgroup = find_cgroup_in_path(path);
1866 if (!cgroup) {
1867 /* this is just /cgroup/controller, return its contents */
1868 cgroup = "/";
1869 }
1870 }
1871
1872 pid_t initpid = lookup_initpid_in_store(fc->pid);
1873 if (initpid <= 0)
1874 initpid = fc->pid;
1875 if (cgroup) {
1876 if (!caller_may_see_dir(initpid, controller, cgroup))
1877 return -ENOENT;
1878 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1879 return -EACCES;
1880 }
1881
1882 /* we'll free this at cg_releasedir */
1883 dir_info = malloc(sizeof(*dir_info));
1884 if (!dir_info)
1885 return -ENOMEM;
1886 dir_info->controller = must_copy_string(controller);
1887 dir_info->cgroup = must_copy_string(cgroup);
1888 dir_info->type = LXC_TYPE_CGDIR;
1889 dir_info->buf = NULL;
1890 dir_info->file = NULL;
1891 dir_info->buflen = 0;
1892
1893 fi->fh = (unsigned long)dir_info;
1894 return 0;
1895 }
1896
1897 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1898 struct fuse_file_info *fi)
1899 {
1900 struct file_info *d = (struct file_info *)fi->fh;
1901 struct cgfs_files **list = NULL;
1902 int i, ret;
1903 char *nextcg = NULL;
1904 struct fuse_context *fc = fuse_get_context();
1905 char **clist = NULL;
1906
1907 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1908 return -EIO;
1909
1910 if (d->type != LXC_TYPE_CGDIR) {
1911 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1912 return -EIO;
1913 }
1914 if (!d->cgroup && !d->controller) {
1915 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1916 int i;
1917
1918 for (i = 0; i < num_hierarchies; i++) {
1919 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1920 return -EIO;
1921 }
1922 }
1923 return 0;
1924 }
1925
1926 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1927 // not a valid cgroup
1928 ret = -EINVAL;
1929 goto out;
1930 }
1931
1932 pid_t initpid = lookup_initpid_in_store(fc->pid);
1933 if (initpid <= 0)
1934 initpid = fc->pid;
1935 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1936 if (nextcg) {
1937 ret = filler(buf, nextcg, NULL, 0);
1938 free(nextcg);
1939 if (ret != 0) {
1940 ret = -EIO;
1941 goto out;
1942 }
1943 }
1944 ret = 0;
1945 goto out;
1946 }
1947
1948 for (i = 0; list[i]; i++) {
1949 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1950 ret = -EIO;
1951 goto out;
1952 }
1953 }
1954
1955 // now get the list of child cgroups
1956
1957 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1958 ret = 0;
1959 goto out;
1960 }
1961 if (clist) {
1962 for (i = 0; clist[i]; i++) {
1963 if (filler(buf, clist[i], NULL, 0) != 0) {
1964 ret = -EIO;
1965 goto out;
1966 }
1967 }
1968 }
1969 ret = 0;
1970
1971 out:
1972 free_keys(list);
1973 if (clist) {
1974 for (i = 0; clist[i]; i++)
1975 free(clist[i]);
1976 free(clist);
1977 }
1978 return ret;
1979 }
1980
1981 static void do_release_file_info(struct fuse_file_info *fi)
1982 {
1983 struct file_info *f = (struct file_info *)fi->fh;
1984
1985 if (!f)
1986 return;
1987
1988 fi->fh = 0;
1989
1990 free(f->controller);
1991 f->controller = NULL;
1992 free(f->cgroup);
1993 f->cgroup = NULL;
1994 free(f->file);
1995 f->file = NULL;
1996 free(f->buf);
1997 f->buf = NULL;
1998 free(f);
1999 }
2000
/*
 * FUSE releasedir: frees the file_info that cg_opendir attached to @fi.
 * @path is not used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2006
2007 int cg_open(const char *path, struct fuse_file_info *fi)
2008 {
2009 const char *cgroup;
2010 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2011 struct cgfs_files *k = NULL;
2012 struct file_info *file_info;
2013 struct fuse_context *fc = fuse_get_context();
2014 int ret;
2015
2016 if (!fc)
2017 return -EIO;
2018
2019 controller = pick_controller_from_path(fc, path);
2020 if (!controller)
2021 return -errno;
2022 cgroup = find_cgroup_in_path(path);
2023 if (!cgroup)
2024 return -errno;
2025
2026 get_cgdir_and_path(cgroup, &cgdir, &last);
2027 if (!last) {
2028 path1 = "/";
2029 path2 = cgdir;
2030 } else {
2031 path1 = cgdir;
2032 path2 = last;
2033 }
2034
2035 k = cgfs_get_key(controller, path1, path2);
2036 if (!k) {
2037 ret = -EINVAL;
2038 goto out;
2039 }
2040 free_key(k);
2041
2042 pid_t initpid = lookup_initpid_in_store(fc->pid);
2043 if (initpid <= 0)
2044 initpid = fc->pid;
2045 if (!caller_may_see_dir(initpid, controller, path1)) {
2046 ret = -ENOENT;
2047 goto out;
2048 }
2049 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2050 ret = -EACCES;
2051 goto out;
2052 }
2053
2054 /* we'll free this at cg_release */
2055 file_info = malloc(sizeof(*file_info));
2056 if (!file_info) {
2057 ret = -ENOMEM;
2058 goto out;
2059 }
2060 file_info->controller = must_copy_string(controller);
2061 file_info->cgroup = must_copy_string(path1);
2062 file_info->file = must_copy_string(path2);
2063 file_info->type = LXC_TYPE_CGFILE;
2064 file_info->buf = NULL;
2065 file_info->buflen = 0;
2066
2067 fi->fh = (unsigned long)file_info;
2068 ret = 0;
2069
2070 out:
2071 free(cgdir);
2072 return ret;
2073 }
2074
2075 int cg_access(const char *path, int mode)
2076 {
2077 int ret;
2078 const char *cgroup;
2079 char *path1, *path2, *controller;
2080 char *last = NULL, *cgdir = NULL;
2081 struct cgfs_files *k = NULL;
2082 struct fuse_context *fc = fuse_get_context();
2083
2084 if (strcmp(path, "/cgroup") == 0)
2085 return 0;
2086
2087 if (!fc)
2088 return -EIO;
2089
2090 controller = pick_controller_from_path(fc, path);
2091 if (!controller)
2092 return -errno;
2093 cgroup = find_cgroup_in_path(path);
2094 if (!cgroup) {
2095 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2096 if ((mode & W_OK) == 0)
2097 return 0;
2098 return -EACCES;
2099 }
2100
2101 get_cgdir_and_path(cgroup, &cgdir, &last);
2102 if (!last) {
2103 path1 = "/";
2104 path2 = cgdir;
2105 } else {
2106 path1 = cgdir;
2107 path2 = last;
2108 }
2109
2110 k = cgfs_get_key(controller, path1, path2);
2111 if (!k) {
2112 if ((mode & W_OK) == 0)
2113 ret = 0;
2114 else
2115 ret = -EACCES;
2116 goto out;
2117 }
2118 free_key(k);
2119
2120 pid_t initpid = lookup_initpid_in_store(fc->pid);
2121 if (initpid <= 0)
2122 initpid = fc->pid;
2123 if (!caller_may_see_dir(initpid, controller, path1)) {
2124 ret = -ENOENT;
2125 goto out;
2126 }
2127 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2128 ret = -EACCES;
2129 goto out;
2130 }
2131
2132 ret = 0;
2133
2134 out:
2135 free(cgdir);
2136 return ret;
2137 }
2138
/*
 * FUSE release: frees the file_info that cg_open attached to @fi.
 * @path is not used. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2144
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events, or until about
 * @timeout seconds have passed since the call started.
 * Returns true if an event was reported. Returns false on timeout (errno
 * is set to 0 when the deadline is found to have passed, otherwise to the
 * value left by epoll_wait()) or on error.
 * A wait interrupted by a signal is restarted with the remaining time.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* %m is only expanded in a format string, so pass the text explicitly */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	/* seconds left until the deadline */
	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { /* timeout */
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may change errno; keep the value from epoll_wait() */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2192
/*
 * Wait up to 2 seconds for @sockfd to become readable, then do a
 * non-blocking recv() of up to @len bytes into @buf.
 * Returns the recv() result, or -1 if the wait failed.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	return ready ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
}
2199
2200 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2201 {
2202 struct msghdr msg = { 0 };
2203 struct iovec iov;
2204 struct cmsghdr *cmsg;
2205 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2206 char buf[1];
2207 buf[0] = 'p';
2208
2209 if (pingfirst) {
2210 if (msgrecv(sock, buf, 1) != 1) {
2211 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2212 return SEND_CREDS_FAIL;
2213 }
2214 }
2215
2216 msg.msg_control = cmsgbuf;
2217 msg.msg_controllen = sizeof(cmsgbuf);
2218
2219 cmsg = CMSG_FIRSTHDR(&msg);
2220 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2221 cmsg->cmsg_level = SOL_SOCKET;
2222 cmsg->cmsg_type = SCM_CREDENTIALS;
2223 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2224
2225 msg.msg_name = NULL;
2226 msg.msg_namelen = 0;
2227
2228 buf[0] = v;
2229 iov.iov_base = buf;
2230 iov.iov_len = sizeof(buf);
2231 msg.msg_iov = &iov;
2232 msg.msg_iovlen = 1;
2233
2234 if (sendmsg(sock, &msg, 0) < 0) {
2235 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2236 if (errno == 3)
2237 return SEND_CREDS_NOTSK;
2238 return SEND_CREDS_FAIL;
2239 }
2240
2241 return SEND_CREDS_OK;
2242 }
2243
2244 static bool recv_creds(int sock, struct ucred *cred, char *v)
2245 {
2246 struct msghdr msg = { 0 };
2247 struct iovec iov;
2248 struct cmsghdr *cmsg;
2249 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2250 char buf[1];
2251 int ret;
2252 int optval = 1;
2253
2254 *v = '1';
2255
2256 cred->pid = -1;
2257 cred->uid = -1;
2258 cred->gid = -1;
2259
2260 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2261 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2262 return false;
2263 }
2264 buf[0] = '1';
2265 if (write(sock, buf, 1) != 1) {
2266 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2267 return false;
2268 }
2269
2270 msg.msg_name = NULL;
2271 msg.msg_namelen = 0;
2272 msg.msg_control = cmsgbuf;
2273 msg.msg_controllen = sizeof(cmsgbuf);
2274
2275 iov.iov_base = buf;
2276 iov.iov_len = sizeof(buf);
2277 msg.msg_iov = &iov;
2278 msg.msg_iovlen = 1;
2279
2280 if (!wait_for_sock(sock, 2)) {
2281 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2282 return false;
2283 }
2284 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2285 if (ret < 0) {
2286 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2287 return false;
2288 }
2289
2290 cmsg = CMSG_FIRSTHDR(&msg);
2291
2292 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2293 cmsg->cmsg_level == SOL_SOCKET &&
2294 cmsg->cmsg_type == SCM_CREDENTIALS) {
2295 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2296 }
2297 *v = buf[0];
2298
2299 return true;
2300 }
2301
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	/* pipe fd pair; the wrapper writes its ACK to cpipe[1] */
	int sock;	/* passed as first argument to @wrapped */
	pid_t tpid;	/* passed as second argument to @wrapped */
	int (*wrapped) (int, pid_t); /* pid_from_ns or pid_to_ns */
};
2308
2309 /*
2310 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2311 * with clone(). This simply writes '1' as ACK back to the parent
2312 * before calling the actual wrapped function.
2313 */
2314 static int pid_ns_clone_wrapper(void *arg) {
2315 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2316 char b = '1';
2317
2318 close(args->cpipe[0]);
2319 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2320 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2321 close(args->cpipe[1]);
2322 return args->wrapped(args->sock, args->tpid);
2323 }
2324
2325 /*
2326 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2327 * int value back over the socket. This shifts the pid from the
2328 * sender's pidns into tpid's pidns.
2329 */
2330 static int pid_to_ns(int sock, pid_t tpid)
2331 {
2332 char v = '0';
2333 struct ucred cred;
2334
2335 while (recv_creds(sock, &cred, &v)) {
2336 if (v == '1')
2337 return 0;
2338 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2339 return 1;
2340 }
2341 return 0;
2342 }
2343
2344
2345 /*
2346 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2347 * in your old pidns. Only children which you clone will be in the target
2348 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2349 * actually convert pids.
2350 *
2351 * Note: glibc's fork() does not respect pidns, which can lead to failed
2352 * assertions inside glibc (and thus failed forks) if the child's pid in
2353 * the pidns and the parent pid outside are identical. Using clone prevents
2354 * this issue.
2355 */
2356 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2357 {
2358 int newnsfd = -1, ret, cpipe[2];
2359 char fnam[100];
2360 pid_t cpid;
2361 char v;
2362
2363 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2364 if (ret < 0 || ret >= sizeof(fnam))
2365 _exit(1);
2366 newnsfd = open(fnam, O_RDONLY);
2367 if (newnsfd < 0)
2368 _exit(1);
2369 if (setns(newnsfd, 0) < 0)
2370 _exit(1);
2371 close(newnsfd);
2372
2373 if (pipe(cpipe) < 0)
2374 _exit(1);
2375
2376 struct pid_ns_clone_args args = {
2377 .cpipe = cpipe,
2378 .sock = sock,
2379 .tpid = tpid,
2380 .wrapped = &pid_to_ns
2381 };
2382 size_t stack_size = sysconf(_SC_PAGESIZE);
2383 void *stack = alloca(stack_size);
2384
2385 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2386 if (cpid < 0)
2387 _exit(1);
2388
2389 // give the child 1 second to be done forking and
2390 // write its ack
2391 if (!wait_for_sock(cpipe[0], 1))
2392 _exit(1);
2393 ret = read(cpipe[0], &v, 1);
2394 if (ret != sizeof(char) || v != '1')
2395 _exit(1);
2396
2397 if (!wait_for_pid(cpid))
2398 _exit(1);
2399 _exit(0);
2400 }
2401
2402 /*
2403 * To read cgroup files with a particular pid, we will setns into the child
2404 * pidns, open a pipe, fork a child - which will be the first to really be in
2405 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2406 */
2407 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2408 {
2409 int sock[2] = {-1, -1};
2410 char *tmpdata = NULL;
2411 int ret;
2412 pid_t qpid, cpid = -1;
2413 bool answer = false;
2414 char v = '0';
2415 struct ucred cred;
2416 size_t sz = 0, asz = 0;
2417
2418 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2419 return false;
2420
2421 /*
2422 * Now we read the pids from returned data one by one, pass
2423 * them into a child in the target namespace, read back the
2424 * translated pids, and put them into our to-return data
2425 */
2426
2427 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2428 perror("socketpair");
2429 free(tmpdata);
2430 return false;
2431 }
2432
2433 cpid = fork();
2434 if (cpid == -1)
2435 goto out;
2436
2437 if (!cpid) // child - exits when done
2438 pid_to_ns_wrapper(sock[1], tpid);
2439
2440 char *ptr = tmpdata;
2441 cred.uid = 0;
2442 cred.gid = 0;
2443 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2444 cred.pid = qpid;
2445 ret = send_creds(sock[0], &cred, v, true);
2446
2447 if (ret == SEND_CREDS_NOTSK)
2448 goto next;
2449 if (ret == SEND_CREDS_FAIL)
2450 goto out;
2451
2452 // read converted results
2453 if (!wait_for_sock(sock[0], 2)) {
2454 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2455 goto out;
2456 }
2457 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2458 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2459 goto out;
2460 }
2461 must_strcat_pid(d, &sz, &asz, qpid);
2462 next:
2463 ptr = strchr(ptr, '\n');
2464 if (!ptr)
2465 break;
2466 ptr++;
2467 }
2468
2469 cred.pid = getpid();
2470 v = '1';
2471 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2472 // failed to ask child to exit
2473 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2474 goto out;
2475 }
2476
2477 answer = true;
2478
2479 out:
2480 free(tmpdata);
2481 if (cpid != -1)
2482 wait_for_pid(cpid);
2483 if (sock[0] != -1) {
2484 close(sock[0]);
2485 close(sock[1]);
2486 }
2487 return answer;
2488 }
2489
2490 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2491 struct fuse_file_info *fi)
2492 {
2493 struct fuse_context *fc = fuse_get_context();
2494 struct file_info *f = (struct file_info *)fi->fh;
2495 struct cgfs_files *k = NULL;
2496 char *data = NULL;
2497 int ret, s;
2498 bool r;
2499
2500 if (f->type != LXC_TYPE_CGFILE) {
2501 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2502 return -EIO;
2503 }
2504
2505 if (offset)
2506 return 0;
2507
2508 if (!fc)
2509 return -EIO;
2510
2511 if (!f->controller)
2512 return -EINVAL;
2513
2514 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2515 return -EINVAL;
2516 }
2517 free_key(k);
2518
2519
2520 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2521 ret = -EACCES;
2522 goto out;
2523 }
2524
2525 if (strcmp(f->file, "tasks") == 0 ||
2526 strcmp(f->file, "/tasks") == 0 ||
2527 strcmp(f->file, "/cgroup.procs") == 0 ||
2528 strcmp(f->file, "cgroup.procs") == 0)
2529 // special case - we have to translate the pids
2530 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2531 else
2532 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2533
2534 if (!r) {
2535 ret = -EINVAL;
2536 goto out;
2537 }
2538
2539 if (!data) {
2540 ret = 0;
2541 goto out;
2542 }
2543 s = strlen(data);
2544 if (s > size)
2545 s = size;
2546 memcpy(buf, data, s);
2547 if (s > 0 && s < size && data[s-1] != '\n')
2548 buf[s++] = '\n';
2549
2550 ret = s;
2551
2552 out:
2553 free(data);
2554 return ret;
2555 }
2556
2557 static int pid_from_ns(int sock, pid_t tpid)
2558 {
2559 pid_t vpid;
2560 struct ucred cred;
2561 char v;
2562 int ret;
2563
2564 cred.uid = 0;
2565 cred.gid = 0;
2566 while (1) {
2567 if (!wait_for_sock(sock, 2)) {
2568 lxcfs_error("%s\n", "Timeout reading from parent.");
2569 return 1;
2570 }
2571 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2572 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2573 return 1;
2574 }
2575 if (vpid == -1) // done
2576 break;
2577 v = '0';
2578 cred.pid = vpid;
2579 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2580 v = '1';
2581 cred.pid = getpid();
2582 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2583 return 1;
2584 }
2585 }
2586 return 0;
2587 }
2588
2589 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2590 {
2591 int newnsfd = -1, ret, cpipe[2];
2592 char fnam[100];
2593 pid_t cpid;
2594 char v;
2595
2596 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2597 if (ret < 0 || ret >= sizeof(fnam))
2598 _exit(1);
2599 newnsfd = open(fnam, O_RDONLY);
2600 if (newnsfd < 0)
2601 _exit(1);
2602 if (setns(newnsfd, 0) < 0)
2603 _exit(1);
2604 close(newnsfd);
2605
2606 if (pipe(cpipe) < 0)
2607 _exit(1);
2608
2609 struct pid_ns_clone_args args = {
2610 .cpipe = cpipe,
2611 .sock = sock,
2612 .tpid = tpid,
2613 .wrapped = &pid_from_ns
2614 };
2615 size_t stack_size = sysconf(_SC_PAGESIZE);
2616 void *stack = alloca(stack_size);
2617
2618 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2619 if (cpid < 0)
2620 _exit(1);
2621
2622 // give the child 1 second to be done forking and
2623 // write its ack
2624 if (!wait_for_sock(cpipe[0], 1))
2625 _exit(1);
2626 ret = read(cpipe[0], &v, 1);
2627 if (ret != sizeof(char) || v != '1')
2628 _exit(1);
2629
2630 if (!wait_for_pid(cpid))
2631 _exit(1);
2632 _exit(0);
2633 }
2634
/*
 * Translate host @uid through /proc/@pid/uid_map.
 *
 * The result of convert_id_to_ns() is stored in *@answer.  Returns true
 * when a mapping was found, false when the map file cannot be opened or
 * the conversion yielded -1.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mappath[400];
	FILE *map;

	sprintf(mappath, "/proc/%d/uid_map", pid);
	map = fopen(mappath, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
2656
/*
 * get_pid_creds: read the first value of the "Uid:" and "Gid:" lines of
 * /proc/@pid/status into *@uid and *@gid.
 *
 * Both outputs are preset to -1 and keep that value if the file cannot
 * be opened or the corresponding line is not found.  Parsing stops at
 * the first malformed Uid/Gid line.
 * (XXX should we use euid here?)
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t u;
	gid_t g;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (!status) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), status)) {
		if (!strncmp(line, "Uid:", 4)) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = u;
			continue;
		}

		if (!strncmp(line, "Gid:", 4)) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = g;
		}
	}

	fclose(status);
}
2695
/*
 * May the requestor @r (host uid @r_uid) move victim @v to a new cgroup?
 * This is allowed if
 *  . they are the same task,
 *  . @r is root on the host,
 *  . they are owned by the same uid, or
 *  . @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (victim_uid == r_uid)
		return true;

	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(victim_uid, r, &mapped);
}
2721
2722 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2723 const char *file, const char *buf)
2724 {
2725 int sock[2] = {-1, -1};
2726 pid_t qpid, cpid = -1;
2727 FILE *pids_file = NULL;
2728 bool answer = false, fail = false;
2729
2730 pids_file = open_pids_file(contrl, cg);
2731 if (!pids_file)
2732 return false;
2733
2734 /*
2735 * write the pids to a socket, have helper in writer's pidns
2736 * call movepid for us
2737 */
2738 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2739 perror("socketpair");
2740 goto out;
2741 }
2742
2743 cpid = fork();
2744 if (cpid == -1)
2745 goto out;
2746
2747 if (!cpid) { // child
2748 fclose(pids_file);
2749 pid_from_ns_wrapper(sock[1], tpid);
2750 }
2751
2752 const char *ptr = buf;
2753 while (sscanf(ptr, "%d", &qpid) == 1) {
2754 struct ucred cred;
2755 char v;
2756
2757 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2758 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2759 goto out;
2760 }
2761
2762 if (recv_creds(sock[0], &cred, &v)) {
2763 if (v == '0') {
2764 if (!may_move_pid(tpid, tuid, cred.pid)) {
2765 fail = true;
2766 break;
2767 }
2768 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2769 fail = true;
2770 }
2771 }
2772
2773 ptr = strchr(ptr, '\n');
2774 if (!ptr)
2775 break;
2776 ptr++;
2777 }
2778
2779 /* All good, write the value */
2780 qpid = -1;
2781 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2782 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2783
2784 if (!fail)
2785 answer = true;
2786
2787 out:
2788 if (cpid != -1)
2789 wait_for_pid(cpid);
2790 if (sock[0] != -1) {
2791 close(sock[0]);
2792 close(sock[1]);
2793 }
2794 if (pids_file) {
2795 if (fclose(pids_file) != 0)
2796 answer = false;
2797 }
2798 return answer;
2799 }
2800
2801 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2802 struct fuse_file_info *fi)
2803 {
2804 struct fuse_context *fc = fuse_get_context();
2805 char *localbuf = NULL;
2806 struct cgfs_files *k = NULL;
2807 struct file_info *f = (struct file_info *)fi->fh;
2808 bool r;
2809
2810 if (f->type != LXC_TYPE_CGFILE) {
2811 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2812 return -EIO;
2813 }
2814
2815 if (offset)
2816 return 0;
2817
2818 if (!fc)
2819 return -EIO;
2820
2821 localbuf = alloca(size+1);
2822 localbuf[size] = '\0';
2823 memcpy(localbuf, buf, size);
2824
2825 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2826 size = -EINVAL;
2827 goto out;
2828 }
2829
2830 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2831 size = -EACCES;
2832 goto out;
2833 }
2834
2835 if (strcmp(f->file, "tasks") == 0 ||
2836 strcmp(f->file, "/tasks") == 0 ||
2837 strcmp(f->file, "/cgroup.procs") == 0 ||
2838 strcmp(f->file, "cgroup.procs") == 0)
2839 // special case - we have to translate the pids
2840 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2841 else
2842 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2843
2844 if (!r)
2845 size = -EINVAL;
2846
2847 out:
2848 free_key(k);
2849 return size;
2850 }
2851
2852 int cg_chown(const char *path, uid_t uid, gid_t gid)
2853 {
2854 struct fuse_context *fc = fuse_get_context();
2855 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2856 struct cgfs_files *k = NULL;
2857 const char *cgroup;
2858 int ret;
2859
2860 if (!fc)
2861 return -EIO;
2862
2863 if (strcmp(path, "/cgroup") == 0)
2864 return -EPERM;
2865
2866 controller = pick_controller_from_path(fc, path);
2867 if (!controller)
2868 return errno == ENOENT ? -EPERM : -errno;
2869
2870 cgroup = find_cgroup_in_path(path);
2871 if (!cgroup)
2872 /* this is just /cgroup/controller */
2873 return -EPERM;
2874
2875 get_cgdir_and_path(cgroup, &cgdir, &last);
2876
2877 if (!last) {
2878 path1 = "/";
2879 path2 = cgdir;
2880 } else {
2881 path1 = cgdir;
2882 path2 = last;
2883 }
2884
2885 if (is_child_cgroup(controller, path1, path2)) {
2886 // get uid, gid, from '/tasks' file and make up a mode
2887 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2888 k = cgfs_get_key(controller, cgroup, "tasks");
2889
2890 } else
2891 k = cgfs_get_key(controller, path1, path2);
2892
2893 if (!k) {
2894 ret = -EINVAL;
2895 goto out;
2896 }
2897
2898 /*
2899 * This being a fuse request, the uid and gid must be valid
2900 * in the caller's namespace. So we can just check to make
2901 * sure that the caller is root in his uid, and privileged
2902 * over the file's current owner.
2903 */
2904 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2905 ret = -EACCES;
2906 goto out;
2907 }
2908
2909 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2910
2911 out:
2912 free_key(k);
2913 free(cgdir);
2914
2915 return ret;
2916 }
2917
2918 int cg_chmod(const char *path, mode_t mode)
2919 {
2920 struct fuse_context *fc = fuse_get_context();
2921 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2922 struct cgfs_files *k = NULL;
2923 const char *cgroup;
2924 int ret;
2925
2926 if (!fc)
2927 return -EIO;
2928
2929 if (strcmp(path, "/cgroup") == 0)
2930 return -EPERM;
2931
2932 controller = pick_controller_from_path(fc, path);
2933 if (!controller)
2934 return errno == ENOENT ? -EPERM : -errno;
2935
2936 cgroup = find_cgroup_in_path(path);
2937 if (!cgroup)
2938 /* this is just /cgroup/controller */
2939 return -EPERM;
2940
2941 get_cgdir_and_path(cgroup, &cgdir, &last);
2942
2943 if (!last) {
2944 path1 = "/";
2945 path2 = cgdir;
2946 } else {
2947 path1 = cgdir;
2948 path2 = last;
2949 }
2950
2951 if (is_child_cgroup(controller, path1, path2)) {
2952 // get uid, gid, from '/tasks' file and make up a mode
2953 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2954 k = cgfs_get_key(controller, cgroup, "tasks");
2955
2956 } else
2957 k = cgfs_get_key(controller, path1, path2);
2958
2959 if (!k) {
2960 ret = -EINVAL;
2961 goto out;
2962 }
2963
2964 /*
2965 * This being a fuse request, the uid and gid must be valid
2966 * in the caller's namespace. So we can just check to make
2967 * sure that the caller is root in his uid, and privileged
2968 * over the file's current owner.
2969 */
2970 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2971 ret = -EPERM;
2972 goto out;
2973 }
2974
2975 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2976 ret = -EINVAL;
2977 goto out;
2978 }
2979
2980 ret = 0;
2981 out:
2982 free_key(k);
2983 free(cgdir);
2984 return ret;
2985 }
2986
2987 int cg_mkdir(const char *path, mode_t mode)
2988 {
2989 struct fuse_context *fc = fuse_get_context();
2990 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2991 const char *cgroup;
2992 int ret;
2993
2994 if (!fc)
2995 return -EIO;
2996
2997 controller = pick_controller_from_path(fc, path);
2998 if (!controller)
2999 return errno == ENOENT ? -EPERM : -errno;
3000
3001 cgroup = find_cgroup_in_path(path);
3002 if (!cgroup)
3003 return -errno;
3004
3005 get_cgdir_and_path(cgroup, &cgdir, &last);
3006 if (!last)
3007 path1 = "/";
3008 else
3009 path1 = cgdir;
3010
3011 pid_t initpid = lookup_initpid_in_store(fc->pid);
3012 if (initpid <= 0)
3013 initpid = fc->pid;
3014 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3015 if (!next)
3016 ret = -EINVAL;
3017 else if (last && strcmp(next, last) == 0)
3018 ret = -EEXIST;
3019 else
3020 ret = -EPERM;
3021 goto out;
3022 }
3023
3024 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3025 ret = -EACCES;
3026 goto out;
3027 }
3028 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3029 ret = -EACCES;
3030 goto out;
3031 }
3032
3033 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3034
3035 out:
3036 free(cgdir);
3037 free(next);
3038 return ret;
3039 }
3040
3041 int cg_rmdir(const char *path)
3042 {
3043 struct fuse_context *fc = fuse_get_context();
3044 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3045 const char *cgroup;
3046 int ret;
3047
3048 if (!fc)
3049 return -EIO;
3050
3051 controller = pick_controller_from_path(fc, path);
3052 if (!controller) /* Someone's trying to delete "/cgroup". */
3053 return -EPERM;
3054
3055 cgroup = find_cgroup_in_path(path);
3056 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3057 return -EPERM;
3058
3059 get_cgdir_and_path(cgroup, &cgdir, &last);
3060 if (!last) {
3061 /* Someone's trying to delete a cgroup on the same level as the
3062 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3063 * rmdir "/cgroup/blkio/init.slice".
3064 */
3065 ret = -EPERM;
3066 goto out;
3067 }
3068
3069 pid_t initpid = lookup_initpid_in_store(fc->pid);
3070 if (initpid <= 0)
3071 initpid = fc->pid;
3072 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3073 if (!last || (next && (strcmp(next, last) == 0)))
3074 ret = -EBUSY;
3075 else
3076 ret = -ENOENT;
3077 goto out;
3078 }
3079
3080 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3081 ret = -EACCES;
3082 goto out;
3083 }
3084 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3085 ret = -EACCES;
3086 goto out;
3087 }
3088
3089 if (!cgfs_remove(controller, cgroup)) {
3090 ret = -EINVAL;
3091 goto out;
3092 }
3093
3094 ret = 0;
3095
3096 out:
3097 free(cgdir);
3098 free(next);
3099 return ret;
3100 }
3101
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3108
/*
 * Extract selected "total_*" counters from the text of a memory.stat
 * file and store them, divided by 1024, in the matching output.
 *
 * @memstat is scanned line by line; a line is recognised by its key
 * prefix (total_cache, total_active_anon, total_inactive_anon,
 * total_active_file, total_inactive_file, total_unevictable).  An
 * output is only written when a number could actually be parsed after
 * the key, so a malformed line leaves the caller's value untouched.
 * Outputs whose key never appears are left as they were.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable)
{
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached        },
		{ "total_active_anon",   active_anon   },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file   },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable   },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);

	while (*memstat) {
		char *eol;

		for (size_t i = 0; i < nfields; i++) {
			size_t keylen = strlen(fields[i].key);
			unsigned long bytes;

			if (strncmp(memstat, fields[i].key, keylen) != 0)
				continue;

			/* Only store the value when a number was really parsed. */
			if (sscanf(memstat + keylen, "%lu", &bytes) == 1)
				*fields[i].dest = bytes / 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3142
/*
 * Look up the line starting with "<major>:<minor> <iotype>" in the
 * multi-line text @str and parse the unsigned number following that key
 * into *@v.  *@v is set to 0 first and stays 0 when no line matches.
 * Only the first matching line is considered.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur, *eol;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (cur = str; *cur; cur = eol + 1) {
		if (startswith(cur, key)) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		eol = strchr(cur, '\n');
		if (!eol)
			return;
	}
}
3165
3166 static int read_file(const char *path, char *buf, size_t size,
3167 struct file_info *d)
3168 {
3169 size_t linelen = 0, total_len = 0, rv = 0;
3170 char *line = NULL;
3171 char *cache = d->buf;
3172 size_t cache_size = d->buflen;
3173 FILE *f = fopen(path, "r");
3174 if (!f)
3175 return 0;
3176
3177 while (getline(&line, &linelen, f) != -1) {
3178 ssize_t l = snprintf(cache, cache_size, "%s", line);
3179 if (l < 0) {
3180 perror("Error writing to cache");
3181 rv = 0;
3182 goto err;
3183 }
3184 if (l >= cache_size) {
3185 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3186 rv = 0;
3187 goto err;
3188 }
3189 cache += l;
3190 cache_size -= l;
3191 total_len += l;
3192 }
3193
3194 d->size = total_len;
3195 if (total_len > size)
3196 total_len = size;
3197
3198 /* read from off 0 */
3199 memcpy(buf, d->buf, total_len);
3200 rv = total_len;
3201 err:
3202 fclose(f);
3203 free(line);
3204 return rv;
3205 }
3206
3207 /*
3208 * FUSE ops for /proc
3209 */
3210
/*
 * Read @file of the "memory" controller for @cgroup and return its
 * content parsed as a base-10 unsigned long, or (unsigned long)-1 when
 * the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = (unsigned long)-1;
	char *text = NULL;

	if (cgfs_get_value("memory", cgroup, file, &text))
		limit = strtoul(text, NULL, 10);

	free(text);
	return limit;
}
3223
/*
 * Return the smallest value of @file found in @cgroup or any of its
 * ancestors, walking upwards with dirname() until "/" is reached.
 * Ancestors for which get_memlimit() reports -1 are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *walk = strdupa(cgroup);
	unsigned long lowest = get_memlimit(walk, file);

	while (strcmp(walk, "/") != 0) {
		unsigned long limit;

		walk = dirname(walk);
		limit = get_memlimit(walk, file);
		if (limit == (unsigned long)-1)
			continue;
		if (limit < lowest)
			lowest = limit;
	}

	return lowest;
}
3240
3241 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3242 struct fuse_file_info *fi)
3243 {
3244 struct fuse_context *fc = fuse_get_context();
3245 struct file_info *d = (struct file_info *)fi->fh;
3246 char *cg;
3247 char *memusage_str = NULL, *memstat_str = NULL,
3248 *memswlimit_str = NULL, *memswusage_str = NULL;
3249 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3250 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3251 active_file = 0, inactive_file = 0, unevictable = 0,
3252 hostswtotal = 0;
3253 char *line = NULL;
3254 size_t linelen = 0, total_len = 0, rv = 0;
3255 char *cache = d->buf;
3256 size_t cache_size = d->buflen;
3257 FILE *f = NULL;
3258
3259 if (offset){
3260 if (offset > d->size)
3261 return -EINVAL;
3262 if (!d->cached)
3263 return 0;
3264 int left = d->size - offset;
3265 total_len = left > size ? size: left;
3266 memcpy(buf, cache + offset, total_len);
3267 return total_len;
3268 }
3269
3270 pid_t initpid = lookup_initpid_in_store(fc->pid);
3271 if (initpid <= 0)
3272 initpid = fc->pid;
3273 cg = get_pid_cgroup(initpid, "memory");
3274 if (!cg)
3275 return read_file("/proc/meminfo", buf, size, d);
3276 prune_init_slice(cg);
3277
3278 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3279 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3280 goto err;
3281 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3282 goto err;
3283
3284 // Following values are allowed to fail, because swapaccount might be turned
3285 // off for current kernel
3286 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3287 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3288 {
3289 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3290 memswusage = strtoul(memswusage_str, NULL, 10);
3291
3292 memswlimit = memswlimit / 1024;
3293 memswusage = memswusage / 1024;
3294 }
3295
3296 memusage = strtoul(memusage_str, NULL, 10);
3297 memlimit /= 1024;
3298 memusage /= 1024;
3299
3300 parse_memstat(memstat_str, &cached, &active_anon,
3301 &inactive_anon, &active_file, &inactive_file,
3302 &unevictable);
3303
3304 f = fopen("/proc/meminfo", "r");
3305 if (!f)
3306 goto err;
3307
3308 while (getline(&line, &linelen, f) != -1) {
3309 ssize_t l;
3310 char *printme, lbuf[100];
3311
3312 memset(lbuf, 0, 100);
3313 if (startswith(line, "MemTotal:")) {
3314 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3315 if (hosttotal < memlimit)
3316 memlimit = hosttotal;
3317 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3318 printme = lbuf;
3319 } else if (startswith(line, "MemFree:")) {
3320 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3321 printme = lbuf;
3322 } else if (startswith(line, "MemAvailable:")) {
3323 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3324 printme = lbuf;
3325 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3326 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3327 if (hostswtotal < memswlimit)
3328 memswlimit = hostswtotal;
3329 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3330 printme = lbuf;
3331 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3332 unsigned long swaptotal = memswlimit,
3333 swapusage = memswusage - memusage,
3334 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3335 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3336 printme = lbuf;
3337 } else if (startswith(line, "Slab:")) {
3338 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3339 printme = lbuf;
3340 } else if (startswith(line, "Buffers:")) {
3341 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3342 printme = lbuf;
3343 } else if (startswith(line, "Cached:")) {
3344 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3345 printme = lbuf;
3346 } else if (startswith(line, "SwapCached:")) {
3347 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3348 printme = lbuf;
3349 } else if (startswith(line, "Active:")) {
3350 snprintf(lbuf, 100, "Active: %8lu kB\n",
3351 active_anon + active_file);
3352 printme = lbuf;
3353 } else if (startswith(line, "Inactive:")) {
3354 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3355 inactive_anon + inactive_file);
3356 printme = lbuf;
3357 } else if (startswith(line, "Active(anon)")) {
3358 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3359 printme = lbuf;
3360 } else if (startswith(line, "Inactive(anon)")) {
3361 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3362 printme = lbuf;
3363 } else if (startswith(line, "Active(file)")) {
3364 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3365 printme = lbuf;
3366 } else if (startswith(line, "Inactive(file)")) {
3367 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3368 printme = lbuf;
3369 } else if (startswith(line, "Unevictable")) {
3370 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3371 printme = lbuf;
3372 } else if (startswith(line, "SReclaimable")) {
3373 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3374 printme = lbuf;
3375 } else if (startswith(line, "SUnreclaim")) {
3376 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3377 printme = lbuf;
3378 } else
3379 printme = line;
3380
3381 l = snprintf(cache, cache_size, "%s", printme);
3382 if (l < 0) {
3383 perror("Error writing to cache");
3384 rv = 0;
3385 goto err;
3386
3387 }
3388 if (l >= cache_size) {
3389 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3390 rv = 0;
3391 goto err;
3392 }
3393
3394 cache += l;
3395 cache_size -= l;
3396 total_len += l;
3397 }
3398
3399 d->cached = 1;
3400 d->size = total_len;
3401 if (total_len > size ) total_len = size;
3402 memcpy(buf, d->buf, total_len);
3403
3404 rv = total_len;
3405 err:
3406 if (f)
3407 fclose(f);
3408 free(line);
3409 free(cg);
3410 free(memusage_str);
3411 free(memswlimit_str);
3412 free(memswusage_str);
3413 free(memstat_str);
3414 return rv;
3415 }
3416
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg from the cpuset
 * controller.  The string returned is newly allocated and must be
 * freed by the caller; NULL is returned when the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3429
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true when @line is a "processor : N" line and cpu N is part of
 * @cpuset according to cpu_in_cpuset().
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3440
/*
 * Return true when @line starts with "processor", a colon and a decimal
 * cpu number, i.e. it is a "^processor" line of /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3452
3453 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3454 struct fuse_file_info *fi)
3455 {
3456 struct fuse_context *fc = fuse_get_context();
3457 struct file_info *d = (struct file_info *)fi->fh;
3458 char *cg;
3459 char *cpuset = NULL;
3460 char *line = NULL;
3461 size_t linelen = 0, total_len = 0, rv = 0;
3462 bool am_printing = false, firstline = true, is_s390x = false;
3463 int curcpu = -1, cpu;
3464 char *cache = d->buf;
3465 size_t cache_size = d->buflen;
3466 FILE *f = NULL;
3467
3468 if (offset){
3469 if (offset > d->size)
3470 return -EINVAL;
3471 if (!d->cached)
3472 return 0;
3473 int left = d->size - offset;
3474 total_len = left > size ? size: left;
3475 memcpy(buf, cache + offset, total_len);
3476 return total_len;
3477 }
3478
3479 pid_t initpid = lookup_initpid_in_store(fc->pid);
3480 if (initpid <= 0)
3481 initpid = fc->pid;
3482 cg = get_pid_cgroup(initpid, "cpuset");
3483 if (!cg)
3484 return read_file("proc/cpuinfo", buf, size, d);
3485 prune_init_slice(cg);
3486
3487 cpuset = get_cpuset(cg);
3488 if (!cpuset)
3489 goto err;
3490
3491 f = fopen("/proc/cpuinfo", "r");
3492 if (!f)
3493 goto err;
3494
3495 while (getline(&line, &linelen, f) != -1) {
3496 ssize_t l;
3497 if (firstline) {
3498 firstline = false;
3499 if (strstr(line, "IBM/S390") != NULL) {
3500 is_s390x = true;
3501 am_printing = true;
3502 continue;
3503 }
3504 }
3505 if (strncmp(line, "# processors:", 12) == 0)
3506 continue;
3507 if (is_processor_line(line)) {
3508 am_printing = cpuline_in_cpuset(line, cpuset);
3509 if (am_printing) {
3510 curcpu ++;
3511 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3512 if (l < 0) {
3513 perror("Error writing to cache");
3514 rv = 0;
3515 goto err;
3516 }
3517 if (l >= cache_size) {
3518 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3519 rv = 0;
3520 goto err;
3521 }
3522 cache += l;
3523 cache_size -= l;
3524 total_len += l;
3525 }
3526 continue;
3527 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3528 char *p;
3529 if (!cpu_in_cpuset(cpu, cpuset))
3530 continue;
3531 curcpu ++;
3532 p = strchr(line, ':');
3533 if (!p || !*p)
3534 goto err;
3535 p++;
3536 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3537 if (l < 0) {
3538 perror("Error writing to cache");
3539 rv = 0;
3540 goto err;
3541 }
3542 if (l >= cache_size) {
3543 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3544 rv = 0;
3545 goto err;
3546 }
3547 cache += l;
3548 cache_size -= l;
3549 total_len += l;
3550 continue;
3551
3552 }
3553 if (am_printing) {
3554 l = snprintf(cache, cache_size, "%s", line);
3555 if (l < 0) {
3556 perror("Error writing to cache");
3557 rv = 0;
3558 goto err;
3559 }
3560 if (l >= cache_size) {
3561 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3562 rv = 0;
3563 goto err;
3564 }
3565 cache += l;
3566 cache_size -= l;
3567 total_len += l;
3568 }
3569 }
3570
3571 if (is_s390x) {
3572 char *origcache = d->buf;
3573 ssize_t l;
3574 do {
3575 d->buf = malloc(d->buflen);
3576 } while (!d->buf);
3577 cache = d->buf;
3578 cache_size = d->buflen;
3579 total_len = 0;
3580 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3581 if (l < 0 || l >= cache_size) {
3582 free(origcache);
3583 goto err;
3584 }
3585 cache_size -= l;
3586 cache += l;
3587 total_len += l;
3588 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3589 if (l < 0 || l >= cache_size) {
3590 free(origcache);
3591 goto err;
3592 }
3593 cache_size -= l;
3594 cache += l;
3595 total_len += l;
3596 l = snprintf(cache, cache_size, "%s", origcache);
3597 free(origcache);
3598 if (l < 0 || l >= cache_size)
3599 goto err;
3600 total_len += l;
3601 }
3602
3603 d->cached = 1;
3604 d->size = total_len;
3605 if (total_len > size ) total_len = size;
3606
3607 /* read from off 0 */
3608 memcpy(buf, d->buf, total_len);
3609 rv = total_len;
3610 err:
3611 if (f)
3612 fclose(f);
3613 free(line);
3614 free(cpuset);
3615 free(cg);
3616 return rv;
3617 }
3618
3619 static uint64_t get_reaper_start_time(pid_t pid)
3620 {
3621 int ret;
3622 FILE *f;
3623 uint64_t starttime;
3624 /* strlen("/proc/") = 6
3625 * +
3626 * LXCFS_NUMSTRLEN64
3627 * +
3628 * strlen("/stat") = 5
3629 * +
3630 * \0 = 1
3631 * */
3632 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3633 char path[__PROC_PID_STAT_LEN];
3634 pid_t qpid;
3635
3636 qpid = lookup_initpid_in_store(pid);
3637 if (qpid <= 0) {
3638 /* Caller can check for EINVAL on 0. */
3639 errno = EINVAL;
3640 return 0;
3641 }
3642
3643 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3644 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3645 /* Caller can check for EINVAL on 0. */
3646 errno = EINVAL;
3647 return 0;
3648 }
3649
3650 f = fopen(path, "r");
3651 if (!f) {
3652 /* Caller can check for EINVAL on 0. */
3653 errno = EINVAL;
3654 return 0;
3655 }
3656
3657 /* Note that the *scanf() argument supression requires that length
3658 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3659 * at us. It's like telling someone you're not married and then asking
3660 * if you can bring your wife to the party.
3661 */
3662 ret = fscanf(f, "%*d " /* (1) pid %d */
3663 "%*s " /* (2) comm %s */
3664 "%*c " /* (3) state %c */
3665 "%*d " /* (4) ppid %d */
3666 "%*d " /* (5) pgrp %d */
3667 "%*d " /* (6) session %d */
3668 "%*d " /* (7) tty_nr %d */
3669 "%*d " /* (8) tpgid %d */
3670 "%*u " /* (9) flags %u */
3671 "%*u " /* (10) minflt %lu */
3672 "%*u " /* (11) cminflt %lu */
3673 "%*u " /* (12) majflt %lu */
3674 "%*u " /* (13) cmajflt %lu */
3675 "%*u " /* (14) utime %lu */
3676 "%*u " /* (15) stime %lu */
3677 "%*d " /* (16) cutime %ld */
3678 "%*d " /* (17) cstime %ld */
3679 "%*d " /* (18) priority %ld */
3680 "%*d " /* (19) nice %ld */
3681 "%*d " /* (20) num_threads %ld */
3682 "%*d " /* (21) itrealvalue %ld */
3683 "%" PRIu64, /* (22) starttime %llu */
3684 &starttime);
3685 if (ret != 1) {
3686 fclose(f);
3687 /* Caller can check for EINVAL on 0. */
3688 errno = EINVAL;
3689 return 0;
3690 }
3691
3692 fclose(f);
3693
3694 errno = 0;
3695 return starttime;
3696 }
3697
/*
 * Return the value of get_reaper_start_time() for @pid converted from
 * clock ticks to seconds, using sysconf(_SC_CLK_TCK).
 *
 * Returns 0 when the start time could not be retrieved or when the
 * number of clock ticks per second is not a positive value.  The latter
 * check also covers sysconf() returning 0 or returning -1 without
 * setting errno, either of which would otherwise end up as the divisor.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3719
/*
 * Age of @pid in whole seconds: the CLOCK_BOOTTIME reading minus the start
 * time returned by get_reaper_start_time_in_sec().
 *
 * Returns 0 when the start time is reported as 0 or when the clock cannot be
 * read. Only tv_sec is used; sub-second precision is deliberately ignored.
 */
static uint64_t get_reaper_age(pid_t pid)
{
	struct timespec since_boot;
	uint64_t started_at, now;

	started_at = get_reaper_start_time_in_sec(pid);
	if (started_at == 0)
		return 0;

	if (clock_gettime(CLOCK_BOOTTIME, &since_boot) < 0)
		return 0;

	now = since_boot.tv_sec;
	return now - started_at;
}
3748
3749 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3750 static int proc_stat_read(char *buf, size_t size, off_t offset,
3751 struct fuse_file_info *fi)
3752 {
3753 struct fuse_context *fc = fuse_get_context();
3754 struct file_info *d = (struct file_info *)fi->fh;
3755 char *cg;
3756 char *cpuset = NULL;
3757 char *line = NULL;
3758 size_t linelen = 0, total_len = 0, rv = 0;
3759 int curcpu = -1; /* cpu numbering starts at 0 */
3760 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3761 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3762 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3763 char cpuall[CPUALL_MAX_SIZE];
3764 /* reserve for cpu all */
3765 char *cache = d->buf + CPUALL_MAX_SIZE;
3766 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3767 FILE *f = NULL;
3768
3769 if (offset){
3770 if (offset > d->size)
3771 return -EINVAL;
3772 if (!d->cached)
3773 return 0;
3774 int left = d->size - offset;
3775 total_len = left > size ? size: left;
3776 memcpy(buf, d->buf + offset, total_len);
3777 return total_len;
3778 }
3779
3780 pid_t initpid = lookup_initpid_in_store(fc->pid);
3781 if (initpid <= 0)
3782 initpid = fc->pid;
3783 cg = get_pid_cgroup(initpid, "cpuset");
3784 if (!cg)
3785 return read_file("/proc/stat", buf, size, d);
3786 prune_init_slice(cg);
3787
3788 cpuset = get_cpuset(cg);
3789 if (!cpuset)
3790 goto err;
3791
3792 f = fopen("/proc/stat", "r");
3793 if (!f)
3794 goto err;
3795
3796 //skip first line
3797 if (getline(&line, &linelen, f) < 0) {
3798 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3799 goto err;
3800 }
3801
3802 while (getline(&line, &linelen, f) != -1) {
3803 ssize_t l;
3804 int cpu;
3805 char cpu_char[10]; /* That's a lot of cores */
3806 char *c;
3807
3808 if (strlen(line) == 0)
3809 continue;
3810 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3811 /* not a ^cpuN line containing a number N, just print it */
3812 l = snprintf(cache, cache_size, "%s", line);
3813 if (l < 0) {
3814 perror("Error writing to cache");
3815 rv = 0;
3816 goto err;
3817 }
3818 if (l >= cache_size) {
3819 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3820 rv = 0;
3821 goto err;
3822 }
3823 cache += l;
3824 cache_size -= l;
3825 total_len += l;
3826 continue;
3827 }
3828
3829 if (sscanf(cpu_char, "%d", &cpu) != 1)
3830 continue;
3831 if (!cpu_in_cpuset(cpu, cpuset))
3832 continue;
3833 curcpu ++;
3834
3835 c = strchr(line, ' ');
3836 if (!c)
3837 continue;
3838 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3839 if (l < 0) {
3840 perror("Error writing to cache");
3841 rv = 0;
3842 goto err;
3843
3844 }
3845 if (l >= cache_size) {
3846 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3847 rv = 0;
3848 goto err;
3849 }
3850
3851 cache += l;
3852 cache_size -= l;
3853 total_len += l;
3854
3855 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3856 &user,
3857 &nice,
3858 &system,
3859 &idle,
3860 &iowait,
3861 &irq,
3862 &softirq,
3863 &steal,
3864 &guest,
3865 &guest_nice) != 10)
3866 continue;
3867 user_sum += user;
3868 nice_sum += nice;
3869 system_sum += system;
3870 idle_sum += idle;
3871 iowait_sum += iowait;
3872 irq_sum += irq;
3873 softirq_sum += softirq;
3874 steal_sum += steal;
3875 guest_sum += guest;
3876 guest_nice_sum += guest_nice;
3877 }
3878
3879 cache = d->buf;
3880
3881 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3882 user_sum,
3883 nice_sum,
3884 system_sum,
3885 idle_sum,
3886 iowait_sum,
3887 irq_sum,
3888 softirq_sum,
3889 steal_sum,
3890 guest_sum,
3891 guest_nice_sum);
3892 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3893 memcpy(cache, cpuall, cpuall_len);
3894 cache += cpuall_len;
3895 } else {
3896 /* shouldn't happen */
3897 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3898 cpuall_len = 0;
3899 }
3900
3901 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3902 total_len += cpuall_len;
3903 d->cached = 1;
3904 d->size = total_len;
3905 if (total_len > size)
3906 total_len = size;
3907
3908 memcpy(buf, d->buf, total_len);
3909 rv = total_len;
3910
3911 err:
3912 if (f)
3913 fclose(f);
3914 free(line);
3915 free(cpuset);
3916 free(cg);
3917 return rv;
3918 }
3919
/* Retrieve the busy time of a group of tasks from the "cpuacct.usage" file of
 * the cpuacct cgroup of @task's init process, divided by 1000000000.
 *
 * This only makes sense when the container has been given its own cpuacct
 * cgroup. Otherwise the busy time of tasks that do not belong to the
 * container is counted as well. If someone has a clever solution for this,
 * please send a patch!
 *
 * Returns 0 when the init pid, the cgroup or the usage value is unavailable.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cpuacct_cg = NULL, *raw_usage = NULL;
	unsigned long busy = 0;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cpuacct_cg = get_pid_cgroup(reaper, "cpuacct");
	if (cpuacct_cg) {
		prune_init_slice(cpuacct_cg);
		if (cgfs_get_value("cpuacct", cpuacct_cg, "cpuacct.usage", &raw_usage)) {
			busy = strtoul(raw_usage, NULL, 10);
			busy /= 1000000000;
		}
	}

	free(cpuacct_cg);
	free(raw_usage);
	return busy;
}
3950
#if RELOADTEST
/* Leave a marker file behind so a reload test can tell this code ran. */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;
	close(marker);
}
#endif
3961
3962 /*
3963 * We read /proc/uptime and reuse its second field.
3964 * For the first field, we use the mtime for the reaper for
3965 * the calling pid as returned by getreaperage
3966 */
3967 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3968 struct fuse_file_info *fi)
3969 {
3970 struct fuse_context *fc = fuse_get_context();
3971 struct file_info *d = (struct file_info *)fi->fh;
3972 unsigned long int busytime = get_reaper_busy(fc->pid);
3973 char *cache = d->buf;
3974 ssize_t total_len = 0;
3975 uint64_t idletime, reaperage;
3976
3977 #if RELOADTEST
3978 iwashere();
3979 #endif
3980
3981 if (offset){
3982 if (!d->cached)
3983 return 0;
3984 if (offset > d->size)
3985 return -EINVAL;
3986 int left = d->size - offset;
3987 total_len = left > size ? size: left;
3988 memcpy(buf, cache + offset, total_len);
3989 return total_len;
3990 }
3991
3992 reaperage = get_reaper_age(fc->pid);
3993 /* To understand why this is done, please read the comment to the
3994 * get_reaper_busy() function.
3995 */
3996 idletime = reaperage;
3997 if (reaperage >= busytime)
3998 idletime = reaperage - busytime;
3999
4000 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4001 if (total_len < 0 || total_len >= d->buflen){
4002 lxcfs_error("%s\n", "failed to write to cache");
4003 return 0;
4004 }
4005
4006 d->size = (int)total_len;
4007 d->cached = 1;
4008
4009 if (total_len > size) total_len = size;
4010
4011 memcpy(buf, d->buf, total_len);
4012 return total_len;
4013 }
4014
4015 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4016 struct fuse_file_info *fi)
4017 {
4018 char dev_name[72];
4019 struct fuse_context *fc = fuse_get_context();
4020 struct file_info *d = (struct file_info *)fi->fh;
4021 char *cg;
4022 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4023 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4024 unsigned long read = 0, write = 0;
4025 unsigned long read_merged = 0, write_merged = 0;
4026 unsigned long read_sectors = 0, write_sectors = 0;
4027 unsigned long read_ticks = 0, write_ticks = 0;
4028 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4029 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4030 char *cache = d->buf;
4031 size_t cache_size = d->buflen;
4032 char *line = NULL;
4033 size_t linelen = 0, total_len = 0, rv = 0;
4034 unsigned int major = 0, minor = 0;
4035 int i = 0;
4036 FILE *f = NULL;
4037
4038 if (offset){
4039 if (offset > d->size)
4040 return -EINVAL;
4041 if (!d->cached)
4042 return 0;
4043 int left = d->size - offset;
4044 total_len = left > size ? size: left;
4045 memcpy(buf, cache + offset, total_len);
4046 return total_len;
4047 }
4048
4049 pid_t initpid = lookup_initpid_in_store(fc->pid);
4050 if (initpid <= 0)
4051 initpid = fc->pid;
4052 cg = get_pid_cgroup(initpid, "blkio");
4053 if (!cg)
4054 return read_file("/proc/diskstats", buf, size, d);
4055 prune_init_slice(cg);
4056
4057 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4058 goto err;
4059 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4060 goto err;
4061 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4062 goto err;
4063 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4064 goto err;
4065 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4066 goto err;
4067
4068
4069 f = fopen("/proc/diskstats", "r");
4070 if (!f)
4071 goto err;
4072
4073 while (getline(&line, &linelen, f) != -1) {
4074 ssize_t l;
4075 char lbuf[256];
4076
4077 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4078 if (i != 3)
4079 continue;
4080
4081 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4082 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4083 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4084 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4085 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4086 read_sectors = read_sectors/512;
4087 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4088 write_sectors = write_sectors/512;
4089
4090 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4091 rd_svctm = rd_svctm/1000000;
4092 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4093 rd_wait = rd_wait/1000000;
4094 read_ticks = rd_svctm + rd_wait;
4095
4096 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4097 wr_svctm = wr_svctm/1000000;
4098 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4099 wr_wait = wr_wait/1000000;
4100 write_ticks = wr_svctm + wr_wait;
4101
4102 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4103 tot_ticks = tot_ticks/1000000;
4104
4105 memset(lbuf, 0, 256);
4106 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4107 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4108 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4109 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4110 else
4111 continue;
4112
4113 l = snprintf(cache, cache_size, "%s", lbuf);
4114 if (l < 0) {
4115 perror("Error writing to fuse buf");
4116 rv = 0;
4117 goto err;
4118 }
4119 if (l >= cache_size) {
4120 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4121 rv = 0;
4122 goto err;
4123 }
4124 cache += l;
4125 cache_size -= l;
4126 total_len += l;
4127 }
4128
4129 d->cached = 1;
4130 d->size = total_len;
4131 if (total_len > size ) total_len = size;
4132 memcpy(buf, d->buf, total_len);
4133
4134 rv = total_len;
4135 err:
4136 free(cg);
4137 if (f)
4138 fclose(f);
4139 free(line);
4140 free(io_serviced_str);
4141 free(io_merged_str);
4142 free(io_service_bytes_str);
4143 free(io_wait_time_str);
4144 free(io_service_time_str);
4145 return rv;
4146 }
4147
4148 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4149 struct fuse_file_info *fi)
4150 {
4151 struct fuse_context *fc = fuse_get_context();
4152 struct file_info *d = (struct file_info *)fi->fh;
4153 char *cg = NULL;
4154 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4155 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4156 ssize_t total_len = 0, rv = 0;
4157 ssize_t l = 0;
4158 char *cache = d->buf;
4159
4160 if (offset) {
4161 if (offset > d->size)
4162 return -EINVAL;
4163 if (!d->cached)
4164 return 0;
4165 int left = d->size - offset;
4166 total_len = left > size ? size: left;
4167 memcpy(buf, cache + offset, total_len);
4168 return total_len;
4169 }
4170
4171 pid_t initpid = lookup_initpid_in_store(fc->pid);
4172 if (initpid <= 0)
4173 initpid = fc->pid;
4174 cg = get_pid_cgroup(initpid, "memory");
4175 if (!cg)
4176 return read_file("/proc/swaps", buf, size, d);
4177 prune_init_slice(cg);
4178
4179 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4180
4181 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4182 goto err;
4183
4184 memusage = strtoul(memusage_str, NULL, 10);
4185
4186 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4187 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4188
4189 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4190 memswusage = strtoul(memswusage_str, NULL, 10);
4191
4192 swap_total = (memswlimit - memlimit) / 1024;
4193 swap_free = (memswusage - memusage) / 1024;
4194 }
4195
4196 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4197
4198 /* When no mem + swap limit is specified or swapaccount=0*/
4199 if (!memswlimit) {
4200 char *line = NULL;
4201 size_t linelen = 0;
4202 FILE *f = fopen("/proc/meminfo", "r");
4203
4204 if (!f)
4205 goto err;
4206
4207 while (getline(&line, &linelen, f) != -1) {
4208 if (startswith(line, "SwapTotal:")) {
4209 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4210 } else if (startswith(line, "SwapFree:")) {
4211 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4212 }
4213 }
4214
4215 free(line);
4216 fclose(f);
4217 }
4218
4219 if (swap_total > 0) {
4220 l = snprintf(d->buf + total_len, d->size - total_len,
4221 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4222 swap_total, swap_free);
4223 total_len += l;
4224 }
4225
4226 if (total_len < 0 || l < 0) {
4227 perror("Error writing to cache");
4228 rv = 0;
4229 goto err;
4230 }
4231
4232 d->cached = 1;
4233 d->size = (int)total_len;
4234
4235 if (total_len > size) total_len = size;
4236 memcpy(buf, d->buf, total_len);
4237 rv = total_len;
4238
4239 err:
4240 free(cg);
4241 free(memswlimit_str);
4242 free(memlimit_str);
4243 free(memusage_str);
4244 free(memswusage_str);
4245 return rv;
4246 }
4247
/*
 * Count the bytes that can be read from @which.
 *
 * The whole file is read because the size is determined by consuming the
 * stream rather than by asking the filesystem. Returns 0 when the file
 * cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char chunk[4096];
	size_t got;
	ssize_t total = 0;
	FILE *stream = fopen(which, "r");

	if (!stream)
		return 0;

	while ((got = fread(chunk, 1, sizeof(chunk), stream)) > 0)
		total += got;

	fclose(stream);
	return total;
}
4264
/*
 * Fill @sb for a path below the virtual /proc directory.
 *
 * "/proc" is reported as a directory with mode 0555; each of the emulated
 * files is reported as a regular file with mode 0444 and size 0. Owner and
 * group are 0 and all three timestamps are set to the current time.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT for
 * any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_ctim = now;
	sb->st_mtim = now;
	sb->st_atim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(emulated_files) / sizeof(emulated_files[0]); i++) {
		if (strcmp(path, emulated_files[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4294
4295 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4296 struct fuse_file_info *fi)
4297 {
4298 if (filler(buf, ".", NULL, 0) != 0 ||
4299 filler(buf, "..", NULL, 0) != 0 ||
4300 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4301 filler(buf, "meminfo", NULL, 0) != 0 ||
4302 filler(buf, "stat", NULL, 0) != 0 ||
4303 filler(buf, "uptime", NULL, 0) != 0 ||
4304 filler(buf, "diskstats", NULL, 0) != 0 ||
4305 filler(buf, "swaps", NULL, 0) != 0 ||
4306 filler(buf, "loadavg", NULL, 0) != 0)
4307 return -EINVAL;
4308 return 0;
4309 }
4310
4311 int proc_open(const char *path, struct fuse_file_info *fi)
4312 {
4313 int type = -1;
4314 struct file_info *info;
4315
4316 if (strcmp(path, "/proc/meminfo") == 0)
4317 type = LXC_TYPE_PROC_MEMINFO;
4318 else if (strcmp(path, "/proc/cpuinfo") == 0)
4319 type = LXC_TYPE_PROC_CPUINFO;
4320 else if (strcmp(path, "/proc/uptime") == 0)
4321 type = LXC_TYPE_PROC_UPTIME;
4322 else if (strcmp(path, "/proc/stat") == 0)
4323 type = LXC_TYPE_PROC_STAT;
4324 else if (strcmp(path, "/proc/diskstats") == 0)
4325 type = LXC_TYPE_PROC_DISKSTATS;
4326 else if (strcmp(path, "/proc/swaps") == 0)
4327 type = LXC_TYPE_PROC_SWAPS;
4328 else if (strcmp(path, "/proc/loadavg") == 0)
4329 type = LXC_TYPE_PROC_LOADAVG;
4330 if (type == -1)
4331 return -ENOENT;
4332
4333 info = malloc(sizeof(*info));
4334 if (!info)
4335 return -ENOMEM;
4336
4337 memset(info, 0, sizeof(*info));
4338 info->type = type;
4339
4340 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4341 do {
4342 info->buf = malloc(info->buflen);
4343 } while (!info->buf);
4344 memset(info->buf, 0, info->buflen);
4345 /* set actual size to buffer size */
4346 info->size = info->buflen;
4347
4348 fi->fh = (unsigned long)info;
4349 return 0;
4350 }
4351
/*
 * Access check for the virtual /proc tree.
 *
 * "/proc" itself is granted whenever access(2) reports it readable. For all
 * other cases only read access is allowed: any bit in @mask other than R_OK
 * yields -EACCES.
 */
int proc_access(const char *path, int mask)
{
	int is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) != 0 ? -EACCES : 0;
}
4362
/*
 * Release handler for the emulated /proc files. Delegates to
 * do_release_file_info() (presumably freeing what proc_open() allocated -
 * confirm against its definition) and always returns 0.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
4368
4369 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4370 struct fuse_file_info *fi)
4371 {
4372 struct file_info *f = (struct file_info *) fi->fh;
4373
4374 switch (f->type) {
4375 case LXC_TYPE_PROC_MEMINFO:
4376 return proc_meminfo_read(buf, size, offset, fi);
4377 case LXC_TYPE_PROC_CPUINFO:
4378 return proc_cpuinfo_read(buf, size, offset, fi);
4379 case LXC_TYPE_PROC_UPTIME:
4380 return proc_uptime_read(buf, size, offset, fi);
4381 case LXC_TYPE_PROC_STAT:
4382 return proc_stat_read(buf, size, offset, fi);
4383 case LXC_TYPE_PROC_DISKSTATS:
4384 return proc_diskstats_read(buf, size, offset, fi);
4385 case LXC_TYPE_PROC_SWAPS:
4386 return proc_swaps_read(buf, size, offset, fi);
4387 case LXC_TYPE_PROC_LOADAVG:
4388 return proc_loadavg_read(buf, size, offset, fi);
4389 default:
4390 return -EINVAL;
4391 }
4392 }
4393
4394 /*
4395 * Functions needed to setup cgroups in the __constructor__.
4396 */
4397
/*
 * Create @dir and all of its missing parent directories with @mode.
 *
 * Each iteration creates the prefix of the path that ends where the next
 * component begins; directories that already exist are accepted.
 * Returns false on allocation failure or when mkdir() fails with anything
 * other than EEXIST.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *path = dir;
	const char *component = dir;
	const char *rest = dir;

	for (;;) {
		char *prefix;

		/* Skip separators, then find the end of the component. */
		component = rest + strspn(rest, "/");
		rest = component + strcspn(component, "/");

		prefix = strndup(path, component - path);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);

		/* An empty component means the end of the path was reached. */
		if (rest == component)
			break;
	}

	return true;
}
4421
4422 static bool umount_if_mounted(void)
4423 {
4424 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4425 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4426 return false;
4427 }
4428 return true;
4429 }
4430
/* The type of statfs.f_type differs between platforms, so derive it with
 * __typeof__, which should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
4437
/*
 * Looking at fs/proc_namespace.c, it appears we can expect the rootfs entry
 * to very specifically contain
 *     " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root, the rootfs
 * entry should always be skipped in mountinfo contents.
 *
 * Scans /proc/self/mountinfo for an entry whose mount point (fifth field) is
 * "/" and whose remainder contains "- rootfs rootfs ". Returns true if such
 * an entry exists, false otherwise or when the file cannot be opened.
 */
static bool is_on_ramfs(void)
{
	FILE *mounts;
	char *entry = NULL;
	size_t cap = 0;
	bool found = false;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!found && getline(&entry, &cap, mounts) != -1) {
		char *mnt, *mnt_end, *sep;
		int skipped;

		/* Advance to the space in front of the fifth field. */
		mnt = entry;
		for (skipped = 0; mnt && skipped < 4; skipped++)
			mnt = strchr(mnt + 1, ' ');
		if (!mnt)
			continue;

		mnt_end = strchr(mnt + 1, ' ');
		if (!mnt_end)
			continue;
		*mnt_end = '\0';

		if (strcmp(mnt + 1, "/") != 0)
			continue;

		/* this is '/'. is it the ramfs? */
		sep = strchr(mnt_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(entry);
	fclose(mounts);
	return found;
}
4480
4481 static int pivot_enter()
4482 {
4483 int ret = -1, oldroot = -1, newroot = -1;
4484
4485 oldroot = open("/", O_DIRECTORY | O_RDONLY);
4486 if (oldroot < 0) {
4487 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4488 return ret;
4489 }
4490
4491 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4492 if (newroot < 0) {
4493 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4494 goto err;
4495 }
4496
4497 /* change into new root fs */
4498 if (fchdir(newroot) < 0) {
4499 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4500 goto err;
4501 }
4502
4503 /* pivot_root into our new root fs */
4504 if (pivot_root(".", ".") < 0) {
4505 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4506 goto err;
4507 }
4508
4509 /*
4510 * At this point the old-root is mounted on top of our new-root.
4511 * To unmounted it we must not be chdir'd into it, so escape back
4512 * to the old-root.
4513 */
4514 if (fchdir(oldroot) < 0) {
4515 lxcfs_error("%s\n", "Failed to enter old root.");
4516 goto err;
4517 }
4518
4519 if (umount2(".", MNT_DETACH) < 0) {
4520 lxcfs_error("%s\n", "Failed to detach old root.");
4521 goto err;
4522 }
4523
4524 if (fchdir(newroot) < 0) {
4525 lxcfs_error("%s\n", "Failed to re-enter new root.");
4526 goto err;
4527 }
4528
4529 ret = 0;
4530
4531 err:
4532 if (oldroot > 0)
4533 close(oldroot);
4534 if (newroot > 0)
4535 close(newroot);
4536
4537 return ret;
4538 }
4539
4540 static int chroot_enter()
4541 {
4542 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4543 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4544 return -1;
4545 }
4546
4547 if (chroot(".") < 0) {
4548 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4549 return -1;
4550 }
4551
4552 if (chdir("/") < 0) {
4553 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4554 return -1;
4555 }
4556
4557 return 0;
4558 }
4559
4560 static int permute_and_enter(void)
4561 {
4562 struct statfs sb;
4563
4564 if (statfs("/", &sb) < 0) {
4565 lxcfs_error("%s\n", "Could not stat / mountpoint.");
4566 return -1;
4567 }
4568
4569 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
4570 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
4571 * /proc/1/mountinfo. */
4572 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
4573 return chroot_enter();
4574
4575 if (pivot_enter() < 0) {
4576 lxcfs_error("%s\n", "Could not perform pivot root.");
4577 return -1;
4578 }
4579
4580 return 0;
4581 }
4582
4583 /* Prepare our new clean root. */
4584 static int permute_prepare(void)
4585 {
4586 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4587 lxcfs_error("%s\n", "Failed to create directory for new root.");
4588 return -1;
4589 }
4590
4591 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4592 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4593 return -1;
4594 }
4595
4596 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4597 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
4598 return -1;
4599 }
4600
4601 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4602 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
4603 return -1;
4604 }
4605
4606 return 0;
4607 }
4608
/*
 * Build the new root and switch into it. Calls chroot() on ramfs,
 * pivot_root() in all other cases. Returns true only if both the
 * preparation and the switch succeed.
 */
static bool permute_root(void)
{
	/* The switch is attempted only when the preparation succeeded. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
4622
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC so that the mount
 * namespace of @pid can be referred to later.
 * Returns the file descriptor, or -1 on failure.
 */
static int preserve_mnt_ns(int pid)
{
	/* Room for both path fragments plus 21 characters for the pid. */
	char nspath[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/mnt", pid);
	if (written < 0 || (size_t)written >= sizeof(nspath))
		return -1;

	return open(nspath, O_RDONLY | O_CLOEXEC);
}
4635
4636 static bool cgfs_prepare_mounts(void)
4637 {
4638 if (!mkdir_p(BASEDIR, 0700)) {
4639 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
4640 return false;
4641 }
4642
4643 if (!umount_if_mounted()) {
4644 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
4645 return false;
4646 }
4647
4648 if (unshare(CLONE_NEWNS) < 0) {
4649 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
4650 return false;
4651 }
4652
4653 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
4654 if (cgroup_mount_ns_fd < 0) {
4655 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
4656 return false;
4657 }
4658
4659 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4660 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
4661 return false;
4662 }
4663
4664 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
4665 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
4666 return false;
4667 }
4668
4669 return true;
4670 }
4671
4672 static bool cgfs_mount_hierarchies(void)
4673 {
4674 char *target;
4675 size_t clen, len;
4676 int i, ret;
4677
4678 for (i = 0; i < num_hierarchies; i++) {
4679 char *controller = hierarchies[i];
4680
4681 clen = strlen(controller);
4682 len = strlen(BASEDIR) + clen + 2;
4683 target = malloc(len);
4684 if (!target)
4685 return false;
4686
4687 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4688 if (ret < 0 || ret >= len) {
4689 free(target);
4690 return false;
4691 }
4692 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4693 free(target);
4694 return false;
4695 }
4696 if (!strcmp(controller, "unified"))
4697 ret = mount("none", target, "cgroup2", 0, NULL);
4698 else
4699 ret = mount(controller, target, "cgroup", 0, controller);
4700 if (ret < 0) {
4701 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
4702 free(target);
4703 return false;
4704 }
4705
4706 fd_hierarchies[i] = open(target, O_DIRECTORY);
4707 if (fd_hierarchies[i] < 0) {
4708 free(target);
4709 return false;
4710 }
4711 free(target);
4712 }
4713 return true;
4714 }
4715
/*
 * Set up the private cgroup mounts in three steps: prepare the mount point,
 * mount all hierarchies, then switch into the new root. Stops at the first
 * step that fails and returns false; returns true when all succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
4731
4732 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
4733 {
4734 FILE *f;
4735 char *cret, *line = NULL;
4736 char cwd[MAXPATHLEN];
4737 size_t len = 0;
4738 int i, init_ns = -1;
4739 bool found_unified = false;
4740
4741 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4742 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
4743 return;
4744 }
4745
4746 while (getline(&line, &len, f) != -1) {
4747 char *idx, *p, *p2;
4748
4749 p = strchr(line, ':');
4750 if (!p)
4751 goto out;
4752 idx = line;
4753 *(p++) = '\0';
4754
4755 p2 = strrchr(p, ':');
4756 if (!p2)
4757 goto out;
4758 *p2 = '\0';
4759
4760 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4761 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4762 * because it parses out the empty string "" and later on passes
4763 * it to mount(). Let's skip such entries.
4764 */
4765 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
4766 found_unified = true;
4767 p = "unified";
4768 }
4769
4770 if (!store_hierarchy(line, p))
4771 goto out;
4772 }
4773
4774 /* Preserve initial namespace. */
4775 init_ns = preserve_mnt_ns(getpid());
4776 if (init_ns < 0) {
4777 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
4778 goto out;
4779 }
4780
4781 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
4782 if (!fd_hierarchies) {
4783 lxcfs_error("%s\n", strerror(errno));
4784 goto out;
4785 }
4786
4787 for (i = 0; i < num_hierarchies; i++)
4788 fd_hierarchies[i] = -1;
4789
4790 cret = getcwd(cwd, MAXPATHLEN);
4791 if (!cret)
4792 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
4793
4794 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4795 * to privately mount lxcfs cgroups. */
4796 if (!cgfs_setup_controllers()) {
4797 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
4798 goto out;
4799 }
4800
4801 if (setns(init_ns, 0) < 0) {
4802 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
4803 goto out;
4804 }
4805
4806 if (!cret || chdir(cwd) < 0)
4807 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
4808
4809 print_subsystems();
4810
4811 out:
4812 free(line);
4813 fclose(f);
4814 if (init_ns >= 0)
4815 close(init_ns);
4816 }
4817
4818 static void __attribute__((destructor)) free_subsystems(void)
4819 {
4820 int i;
4821
4822 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
4823
4824 for (i = 0; i < num_hierarchies; i++) {
4825 if (hierarchies[i])
4826 free(hierarchies[i]);
4827 if (fd_hierarchies && fd_hierarchies[i] >= 0)
4828 close(fd_hierarchies[i]);
4829 }
4830 free(hierarchies);
4831 free(fd_hierarchies);
4832
4833 if (cgroup_mount_ns_fd >= 0)
4834 close(cgroup_mount_ns_fd);
4835 }