]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
b08cf43e725d38ccf0608dcfda4d91a36c35c3a1
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
/* 2^64 - 1 = 18446744073709551615 has 20 decimal digits; 21 leaves room for one more character (e.g. the terminating NUL). */
43 #define LXCFS_NUMSTRLEN64 21
44
/*
 * pivot_root() wrapper.
 *
 * When the build reports that the C library already has pivot_root()
 * (HAVE_PIVOT_ROOT) only a declaration is emitted. Otherwise a local
 * definition is provided that issues the raw system call when its number
 * is known and fails with ENOSYS when it is not.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
59
/*
 * Type tags, one per kind of entry named by the constants below: cgroup
 * directories, cgroup files and individual /proc files.
 * NOTE(review): presumably the values stored in file_info.type -- the users
 * are not visible in this part of the file, confirm there.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
71
/*
 * Bookkeeping record describing one file.
 * NOTE(review): presumably attached to an open file handle; the code that
 * fills it in is not visible in this part of the file -- confirm there.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type; /* NOTE(review): presumably one of the LXC_TYPE_* values -- confirm */
	char *buf; // unused as of yet
	int buflen;
	int size; //actual data size
	int cached;
};
82
/* Parameters of the load hash table. */
#define LOAD_SIZE 100 /* number of buckets in the hash table */
#define FLUSH_TIME 5 /* the flush rate */
#define DEPTH_DIR 3 /* the depth of per cgroup */
/* Fixed-point helpers used to calculate loadavg. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
/* Integer part of a fixed-point value. */
#define LOAD_INT(x) ((x) >> FSHIFT)
/* First two decimal digits of the fractional part of a fixed-point value. */
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used by proc_loadavg_read().
 * 1 means loadavg is used, 0 means it is not.
 */
static int loadavg = 0;
100 static int calc_hash(char *name)
101 {
102 unsigned int hash = 0;
103 unsigned int x = 0;
104 /* ELFHash algorithm. */
105 while (*name) {
106 hash = (hash << 4) + *name++;
107 x = hash & 0xf0000000;
108 if (x != 0)
109 hash ^= (x >> 24);
110 hash &= ~x;
111 }
112 return ((hash & 0x7fffffff) % LOAD_SIZE);
113 }
114
/* One entry in the linked list of a load_hash bucket. */
struct load_node {
	char *cg; /* string compared against the lookup key in locate_node(); freed by del_node() */
	unsigned long avenrun[3]; /* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next; /* next node in the bucket, NULL at the tail */
	struct load_node **pre; /* address of the pointer that points at this node (the bucket head's or the predecessor's next) */
};
125
/* Head of one hash bucket: the locks protecting the bucket and its list. */
struct load_head {
	/*
	 * The lock covers inserting and refreshing a load_node. For the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket
	 * are mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock covers reading loadavg and deleting a load_node. For each
	 * hash bucket, read and delete are mutually exclusive, but concurrent
	 * reads are allowed. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock covers reading loadavg and inserting a load_node. For the
	 * first load_node of each hash bucket, read and insert are mutually
	 * exclusive, but concurrent reads are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first node of the bucket, NULL when empty */
};
147
148 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
149 /*
150 * init_load initialize the hash table.
151 * Return 0 on success, return -1 on failure.
152 */
153 static int init_load(void)
154 {
155 int i;
156 int ret;
157
158 for (i = 0; i < LOAD_SIZE; i++) {
159 load_hash[i].next = NULL;
160 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
161 if (ret != 0) {
162 lxcfs_error("%s\n", "Failed to initialize lock");
163 goto out3;
164 }
165 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
166 if (ret != 0) {
167 lxcfs_error("%s\n", "Failed to initialize rdlock");
168 goto out2;
169 }
170 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
171 if (ret != 0) {
172 lxcfs_error("%s\n", "Failed to initialize rilock");
173 goto out1;
174 }
175 }
176 return 0;
177 out1:
178 pthread_rwlock_destroy(&load_hash[i].rdlock);
179 out2:
180 pthread_mutex_destroy(&load_hash[i].lock);
181 out3:
182 while (i > 0) {
183 i--;
184 pthread_mutex_destroy(&load_hash[i].lock);
185 pthread_rwlock_destroy(&load_hash[i].rdlock);
186 pthread_rwlock_destroy(&load_hash[i].rilock);
187 }
188 return -1;
189 }
190
191 static void insert_node(struct load_node **n, int locate)
192 {
193 struct load_node *f;
194
195 pthread_mutex_lock(&load_hash[locate].lock);
196 pthread_rwlock_wrlock(&load_hash[locate].rilock);
197 f = load_hash[locate].next;
198 load_hash[locate].next = *n;
199
200 (*n)->pre = &(load_hash[locate].next);
201 if (f)
202 f->pre = &((*n)->next);
203 (*n)->next = f;
204 pthread_mutex_unlock(&load_hash[locate].lock);
205 pthread_rwlock_unlock(&load_hash[locate].rilock);
206 }
207 /*
208 * locate_node() finds special node. Not return NULL means success.
209 * It should be noted that rdlock isn't unlocked at the end of code
210 * because this function is used to read special node. Delete is not
211 * allowed before read has ended.
212 * unlock rdlock only in proc_loadavg_read().
213 */
214 static struct load_node *locate_node(char *cg, int locate)
215 {
216 struct load_node *f = NULL;
217 int i = 0;
218
219 pthread_rwlock_rdlock(&load_hash[locate].rilock);
220 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
221 if (load_hash[locate].next == NULL) {
222 pthread_rwlock_unlock(&load_hash[locate].rilock);
223 return f;
224 }
225 f = load_hash[locate].next;
226 pthread_rwlock_unlock(&load_hash[locate].rilock);
227 while (f && ((i = strcmp(f->cg, cg)) != 0))
228 f = f->next;
229 return f;
230 }
231 /* Delete the load_node n and return the next node of it. */
232 static struct load_node *del_node(struct load_node *n, int locate)
233 {
234 struct load_node *g;
235
236 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
237 if (n->next == NULL) {
238 *(n->pre) = NULL;
239 } else {
240 *(n->pre) = n->next;
241 n->next->pre = n->pre;
242 }
243 g = n->next;
244 free(n->cg);
245 free(n);
246 pthread_rwlock_unlock(&load_hash[locate].rdlock);
247 return g;
248 }
249
/*
 * Extra buffer space reserved to account for file size changes.
 * NOTE(review): the users of this constant are outside this part of the
 * file -- confirm how the reserve is applied.
 */
#define BUF_RESERVE_SIZE 512
252
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // time(NULL) when the entry was last stored or validated
};
274
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Bucket index for a key (the entries are keyed by their ino field). */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of singly linked pidns_init_store entries, indexed by HASH(ino). */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Taken by store_lock() and released by store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Lock @l. A failure to take the mutex is treated as fatal: the error is
 * logged and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
290
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* Printed as the "mount namespace" by print_subsystems(); -1 until set. */
static int cgroup_mount_ns_fd = -1;
311
/*
 * Unlock @l. A failure to release the mutex is treated as fatal: the error
 * is logged and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
321
322 static void store_lock(void)
323 {
324 lock_mutex(&pidns_store_mutex);
325 }
326
327 static void store_unlock(void)
328 {
329 unlock_mutex(&pidns_store_mutex);
330 }
331
332 /* Must be called under store_lock */
333 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
334 {
335 struct stat initsb;
336 char fnam[100];
337
338 snprintf(fnam, 100, "/proc/%d", e->initpid);
339 if (stat(fnam, &initsb) < 0)
340 return false;
341
342 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
343 initsb.st_ctime, e->initpid);
344
345 if (e->ctime != initsb.st_ctime)
346 return false;
347 return true;
348 }
349
350 /* Must be called under store_lock */
351 static void remove_initpid(struct pidns_init_store *e)
352 {
353 struct pidns_init_store *tmp;
354 int h;
355
356 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
357
358 h = HASH(e->ino);
359 if (pidns_hash_table[h] == e) {
360 pidns_hash_table[h] = e->next;
361 free(e);
362 return;
363 }
364
365 tmp = pidns_hash_table[h];
366 while (tmp) {
367 if (tmp->next == e) {
368 tmp->next = e->next;
369 free(e);
370 return;
371 }
372 tmp = tmp->next;
373 }
374 }
375
376 #define PURGE_SECS 5
377 /* Must be called under store_lock */
378 static void prune_initpid_store(void)
379 {
380 static long int last_prune = 0;
381 struct pidns_init_store *e, *prev, *delme;
382 long int now, threshold;
383 int i;
384
385 if (!last_prune) {
386 last_prune = time(NULL);
387 return;
388 }
389 now = time(NULL);
390 if (now < last_prune + PURGE_SECS)
391 return;
392
393 lxcfs_debug("%s\n", "Pruning.");
394
395 last_prune = now;
396 threshold = now - 2 * PURGE_SECS;
397
398 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
399 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
400 if (e->lastcheck < threshold) {
401
402 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
403
404 delme = e;
405 if (prev)
406 prev->next = e->next;
407 else
408 pidns_hash_table[i] = e->next;
409 e = e->next;
410 free(delme);
411 } else {
412 prev = e;
413 e = e->next;
414 }
415 }
416 }
417 }
418
419 /* Must be called under store_lock */
420 static void save_initpid(struct stat *sb, pid_t pid)
421 {
422 struct pidns_init_store *e;
423 char fpath[100];
424 struct stat procsb;
425 int h;
426
427 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
428
429 snprintf(fpath, 100, "/proc/%d", pid);
430 if (stat(fpath, &procsb) < 0)
431 return;
432 do {
433 e = malloc(sizeof(*e));
434 } while (!e);
435 e->ino = sb->st_ino;
436 e->initpid = pid;
437 e->ctime = procsb.st_ctime;
438 h = HASH(e->ino);
439 e->next = pidns_hash_table[h];
440 e->lastcheck = time(NULL);
441 pidns_hash_table[h] = e;
442 }
443
444 /*
445 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
446 * entry for the inode number and creation time. Verify that the init pid
447 * is still valid. If not, remove it. Return the entry if valid, NULL
448 * otherwise.
449 * Must be called under store_lock
450 */
451 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
452 {
453 int h = HASH(sb->st_ino);
454 struct pidns_init_store *e = pidns_hash_table[h];
455
456 while (e) {
457 if (e->ino == sb->st_ino) {
458 if (initpid_still_valid(e, sb)) {
459 e->lastcheck = time(NULL);
460 return e;
461 }
462 remove_initpid(e);
463 return NULL;
464 }
465 e = e->next;
466 }
467
468 return NULL;
469 }
470
/*
 * Report whether @path, resolved relative to the directory file descriptor
 * @fd, is a directory.
 *
 * Returns 1 if fstatat() succeeds and the result is a directory, 0 otherwise
 * (including when the path cannot be stat()ed).
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	int ret;

	/*
	 * The last argument of fstatat() is a flags bit mask. It used to be
	 * passed the descriptor number, which made the call fail or behave
	 * unpredictably depending on the value of @fd.
	 */
	ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;

	return 0;
}
479
/*
 * Duplicate @str on the heap, retrying until the allocation succeeds.
 * Returns NULL only when @str itself is NULL. The caller owns the copy.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	while ((copy = strdup(str)) == NULL)
		; /* keep trying until memory is available */

	return copy;
}
491
/* Strip every '\n' found at the end of @s by overwriting it with NUL. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		s[end - 1] = '\0';
		end--;
	}
}
499
#define BATCH_SIZE 50
/*
 * Make sure *mem can hold @newlen bytes, growing it in BATCH_SIZE steps.
 *
 * The buffer is (re)allocated when *mem is NULL or when @newlen falls into
 * a later batch than @oldlen. realloc() is retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int want = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;
	char *grown;

	/* Still inside the batch that is already allocated: nothing to do. */
	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);

	*mem = grown;
}
/*
 * Append @line (of length @linelen, followed by a NUL byte that is copied
 * as well) to the buffer *contents, which currently holds *len bytes.
 * The buffer is grown as needed and *len is advanced by @linelen.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t used = *len;
	size_t total = used + linelen;

	/* One extra byte for the terminator copied below. */
	dorealloc(contents, used, total + 1);
	memcpy(*contents + used, line, linelen + 1);
	*len = total;
}
521
/*
 * Read everything available from the open descriptor @fd into one heap
 * string and strip trailing newlines from it.
 *
 * @from is not used.
 * @fd is consumed: it is closed on every path, whether or not reading
 * succeeds, so the caller must not close it again.
 *
 * Returns the string (owned by the caller), or NULL when the stream cannot
 * be set up or no line could be read.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/*
		 * fdopen() leaves the descriptor open on failure. Close it
		 * here so a NULL return always means "fd is gone"; the
		 * caller cannot tell this case from an empty file.
		 */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
543
/*
 * Write @string to the open descriptor @fd.
 *
 * @fnam is not used.
 * @fd is consumed: it is closed on every path, so the caller must not close
 * it again.
 *
 * Returns true when the whole string was written and the stream was closed
 * without error, false otherwise.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	if (!(f = fdopen(fd, "w"))) {
		/* fdopen() does not close the descriptor on failure. */
		close(fd);
		return false;
	}
	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}
	/* fclose() flushes, so a write error may only show up here. */
	if (fclose(f) < 0) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
564
/*
 * Name, ownership and mode of one cgroup file or directory, as filled in
 * from fstatat() by cgfs_get_key(). Release with free_key().
 */
struct cgfs_files {
	char *name; /* heap-allocated, freed by free_key() */
	uint32_t uid, gid;
	uint32_t mode;
};
570
571 #define ALLOC_NUM 20
572 static bool store_hierarchy(char *stridx, char *h)
573 {
574 if (num_hierarchies % ALLOC_NUM == 0) {
575 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
576 n *= ALLOC_NUM;
577 char **tmp = realloc(hierarchies, n * sizeof(char *));
578 if (!tmp) {
579 lxcfs_error("%s\n", strerror(errno));
580 exit(1);
581 }
582 hierarchies = tmp;
583 }
584
585 hierarchies[num_hierarchies++] = must_copy_string(h);
586 return true;
587 }
588
589 static void print_subsystems(void)
590 {
591 int i;
592
593 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
594 fprintf(stderr, "hierarchies:\n");
595 for (i = 0; i < num_hierarchies; i++) {
596 if (hierarchies[i])
597 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
598 fd_hierarchies[i], hierarchies[i]);
599 }
600 }
601
/*
 * Return true if @needle is exactly one of the comma-separated items of
 * @haystack, false otherwise.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	size_t nlen = strlen(needle);
	const char *item = haystack;
	const char *comma;

	/* Compare every item that is followed by a comma ... */
	for (; *item != '\0'; item = comma + 1) {
		comma = strchr(item, ',');
		if (!comma)
			break;

		if ((size_t)(comma - item) == nlen &&
		    strncmp(needle, item, nlen) == 0)
			return true;
	}

	/* ... and finally whatever is left after the last comma. */
	return strcmp(needle, item) == 0;
}
620
621 /* do we need to do any massaging here? I'm not sure... */
622 /* Return the mounted controller and store the corresponding open file descriptor
623 * referring to the controller mountpoint in the private lxcfs namespace in
624 * @cfd.
625 */
626 static char *find_mounted_controller(const char *controller, int *cfd)
627 {
628 int i;
629
630 for (i = 0; i < num_hierarchies; i++) {
631 if (!hierarchies[i])
632 continue;
633 if (strcmp(hierarchies[i], controller) == 0) {
634 *cfd = fd_hierarchies[i];
635 return hierarchies[i];
636 }
637 if (in_comma_list(controller, hierarchies[i])) {
638 *cfd = fd_hierarchies[i];
639 return hierarchies[i];
640 }
641 }
642
643 return NULL;
644 }
645
/*
 * Write @value to @file inside @cgroup of the hierarchy providing
 * @controller.
 *
 * Returns true on success; false if the controller is not mounted, the file
 * cannot be opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	size_t pathlen;
	char *path;
	int cfd, fd, n;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* The *at() functions need a relative path:
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	/* write_string() takes over the descriptor. */
	return write_string(path, value, fd);
}
672
// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
//
// @dirname is resolved relative to the directory descriptor @fd. Failures
// are logged and otherwise ignored (best effort); entries whose path would
// not fit into MAXPATHLEN are skipped.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		// fdopendir() only takes ownership of the descriptor on success.
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	// closedir() also closes fd1.
	closedir(d);
}
712
/*
 * Create the cgroup @cg in the hierarchy providing @controller. Unless both
 * @uid and @gid are 0, the new directory and the files inside it are
 * chowned to @uid:@gid.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or the
 * negated errno of the failing mkdirat()/fchownat() call.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	size_t pathlen;
	char *path;
	int cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* The *at() functions need a relative path:
	 * . + /cg + \0
	 */
	pathlen = strlen(cg) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, path, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, path, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(path, uid, gid, cfd);
	}

	return 0;
}
743
/*
 * Remove the directory @dirname and every directory below it.
 *
 * @dirname: path of the directory, relative to @cfd.
 * @fd:      open descriptor of @dirname. It is only duplicated here and is
 *           left open for the caller.
 * @cfd:     directory descriptor that all paths are resolved against.
 *
 * Subdirectories are removed on a best-effort basis (failures are only
 * logged at debug level). Returns true when the directory stream could be
 * closed and @dirname itself was removed, false otherwise.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, childfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!S_ISDIR(mystat.st_mode))
			continue;

		/*
		 * Descend with a descriptor of the child itself. Handing down
		 * the parent's descriptor would make the nested call read the
		 * parent's entries (and share its read offset) instead of the
		 * child's.
		 */
		childfd = openat(cfd, pathname, O_DIRECTORY | O_NOFOLLOW);
		if (childfd < 0) {
			lxcfs_debug("Failed to open %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!recursive_rmdir(pathname, childfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
		close(childfd);
	}

	ret = true;
	/*
	 * closedir() also closes dupfd. It must not be closed a second time:
	 * the number could already belong to a descriptor opened by another
	 * thread.
	 */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
802
/*
 * Remove the cgroup @cg, including the directories below it, from the
 * hierarchy providing @controller.
 *
 * Returns false if the controller is not mounted or the cgroup directory
 * cannot be opened; otherwise the result of recursive_rmdir().
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	size_t pathlen;
	char *path;
	int cfd, dirfd;
	bool removed;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* The *at() functions need a relative path:
	 * . + /cg + \0
	 */
	pathlen = strlen(cg) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd = openat(cfd, path, O_DIRECTORY);
	if (dirfd < 0)
		return false;

	removed = recursive_rmdir(path, dirfd, cfd);
	close(dirfd);

	return removed;
}
829
/*
 * Change the mode of @file in the hierarchy providing @controller.
 *
 * Returns true on success, false if the controller is not mounted or
 * fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	size_t pathlen;
	char *path;
	int cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* The *at() functions need a relative path:
	 * . + /file + \0
	 */
	pathlen = strlen(file) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, path, mode, 0) >= 0;
}
850
/*
 * Chown the "tasks" and "cgroup.procs" files inside @dirname (resolved
 * relative to the directory descriptor @fd) to @uid:@gid.
 *
 * Returns 0 on success or the negated errno of the first fchownat() that
 * fails; "cgroup.procs" is not attempted if "tasks" fails.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two names. */
	size_t buflen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *path = alloca(buflen);
	int rc;

	snprintf(path, buflen, "%s/tasks", dirname);
	rc = fchownat(fd, path, uid, gid, 0);
	if (rc != 0)
		return -errno;

	snprintf(path, buflen, "%s/cgroup.procs", dirname);
	rc = fchownat(fd, path, uid, gid, 0);
	if (rc != 0)
		return -errno;

	return 0;
}
866
/*
 * Chown @file in the hierarchy providing @controller to @uid:@gid. When
 * is_dir() reports @file as a directory, its tasks files are chowned too.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or the
 * negated errno of a failing fchownat().
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	size_t pathlen;
	char *path;
	int cfd;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return -EINVAL;

	/* The *at() functions need a relative path:
	 * . + /file + \0
	 */
	pathlen = strlen(file) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(path, cfd))
		return 0;

	// like cgmanager did, we want to chown the tasks file as well
	return chown_tasks_files(path, uid, gid, cfd);
}
892
/*
 * Open the cgroup.procs file of @cgroup in the hierarchy providing
 * @controller for writing.
 *
 * Returns a stream the caller must fclose(), or NULL if the controller is
 * not mounted, the file cannot be opened, or no stream can be created.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	/* fdopen() leaves the descriptor open on failure; do not leak it. */
	if (!f)
		close(fd);

	return f;
}
916
917 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
918 void ***list, size_t typesize,
919 void* (*iterator)(const char*, const char*, const char*))
920 {
921 int cfd, fd, ret;
922 size_t len;
923 char *cg, *tmpc;
924 char pathname[MAXPATHLEN];
925 size_t sz = 0, asz = 0;
926 struct dirent *dirent;
927 DIR *dir;
928
929 tmpc = find_mounted_controller(controller, &cfd);
930 *list = NULL;
931 if (!tmpc)
932 return false;
933
934 /* Make sure we pass a relative path to *at() family of functions. */
935 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
936 cg = alloca(len);
937 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
938 if (ret < 0 || (size_t)ret >= len) {
939 lxcfs_error("Pathname too long under %s\n", cgroup);
940 return false;
941 }
942
943 fd = openat(cfd, cg, O_DIRECTORY);
944 if (fd < 0)
945 return false;
946
947 dir = fdopendir(fd);
948 if (!dir)
949 return false;
950
951 while ((dirent = readdir(dir))) {
952 struct stat mystat;
953
954 if (!strcmp(dirent->d_name, ".") ||
955 !strcmp(dirent->d_name, ".."))
956 continue;
957
958 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
959 if (ret < 0 || ret >= MAXPATHLEN) {
960 lxcfs_error("Pathname too long under %s\n", cg);
961 continue;
962 }
963
964 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
965 if (ret) {
966 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
967 continue;
968 }
969 if ((!directories && !S_ISREG(mystat.st_mode)) ||
970 (directories && !S_ISDIR(mystat.st_mode)))
971 continue;
972
973 if (sz+2 >= asz) {
974 void **tmp;
975 asz += BATCH_SIZE;
976 do {
977 tmp = realloc(*list, asz * typesize);
978 } while (!tmp);
979 *list = tmp;
980 }
981 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
982 (*list)[sz+1] = NULL;
983 sz++;
984 }
985 if (closedir(dir) < 0) {
986 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
987 return false;
988 }
989 return true;
990 }
991
/*
 * Iterator callback for cgfs_iterate_cgroup(): the list element for a child
 * cgroup is a heap copy of its directory entry name. @controller and
 * @cgroup are not used. Allocation is retried until it succeeds.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while ((copy = strdup(dir_entry)) == NULL)
		; /* keep trying until memory is available */

	return copy;
}
1000
1001 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1002 {
1003 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1004 }
1005
1006 void free_key(struct cgfs_files *k)
1007 {
1008 if (!k)
1009 return;
1010 free(k->name);
1011 free(k);
1012 }
1013
/*
 * Free a NULL-terminated array of keys: every element up to the first NULL
 * is released with free_key(), then the array itself. A NULL @keys is
 * ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (keys == NULL)
		return;

	for (cur = keys; *cur != NULL; cur++)
		free_key(*cur);

	free(keys);
}
1025
/*
 * Read @file inside @cgroup of the hierarchy providing @controller into a
 * newly allocated string stored in *@value (owned by the caller).
 *
 * Returns true when a string was read; false if the controller is not
 * mounted, the file cannot be opened, or slurp_file() returned NULL.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	size_t pathlen;
	char *path;
	int cfd, fd, n;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* The *at() functions need a relative path:
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	/* slurp_file() takes over the descriptor. */
	*value = slurp_file(path, fd);
	if (*value == NULL)
		return false;

	return true;
}
1052
1053 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1054 {
1055 int ret, cfd;
1056 size_t len;
1057 char *fnam, *tmpc;
1058 struct stat sb;
1059 struct cgfs_files *newkey;
1060
1061 tmpc = find_mounted_controller(controller, &cfd);
1062 if (!tmpc)
1063 return false;
1064
1065 if (file && *file == '/')
1066 file++;
1067
1068 if (file && strchr(file, '/'))
1069 return NULL;
1070
1071 /* Make sure we pass a relative path to *at() family of functions.
1072 * . + /cgroup + / + file + \0
1073 */
1074 len = strlen(cgroup) + 3;
1075 if (file)
1076 len += strlen(file) + 1;
1077 fnam = alloca(len);
1078 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1079 file ? "/" : "", file ? file : "");
1080
1081 ret = fstatat(cfd, fnam, &sb, 0);
1082 if (ret < 0)
1083 return NULL;
1084
1085 do {
1086 newkey = malloc(sizeof(struct cgfs_files));
1087 } while (!newkey);
1088 if (file)
1089 newkey->name = must_copy_string(file);
1090 else if (strrchr(cgroup, '/'))
1091 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1092 else
1093 newkey->name = must_copy_string(cgroup);
1094 newkey->uid = sb.st_uid;
1095 newkey->gid = sb.st_gid;
1096 newkey->mode = sb.st_mode;
1097
1098 return newkey;
1099 }
1100
/*
 * Iterator callback for cgfs_iterate_cgroup(): the list element for a file
 * is the record produced by cgfs_get_key(). A failure is logged and NULL is
 * returned as the element.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key == NULL)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return key;
}
1110
1111 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1112 {
1113 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1114 }
1115
/*
 * Return true if @f is a directory inside @cgroup in the hierarchy
 * providing @controller; false otherwise, including when the controller is
 * not mounted or the path cannot be stat()ed.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	struct stat st;
	size_t pathlen;
	char *path;
	int cfd, n;

	if (find_mounted_controller(controller, &cfd) == NULL)
		return false;

	/* The *at() functions need a relative path:
	 * . + /cgroup + / + f + \0
	 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	if (fstatat(cfd, path, &st, 0) < 0)
		return false;

	return S_ISDIR(st.st_mode) ? true : false;
}
1143
/* Result codes returned by send_creds(). */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations for helpers that are used before they are defined. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1151
1152 /*
1153 * clone a task which switches to @task's namespace and writes '1'.
1154 * over a unix sock so we can read the task's reaper's pid in our
1155 * namespace
1156 *
1157 * Note: glibc's fork() does not respect pidns, which can lead to failed
1158 * assertions inside glibc (and thus failed forks) if the child's pid in
1159 * the pidns and the parent pid outside are identical. Using clone prevents
1160 * this issue.
1161 */
1162 static void write_task_init_pid_exit(int sock, pid_t target)
1163 {
1164 char fnam[100];
1165 pid_t pid;
1166 int fd, ret;
1167 size_t stack_size = sysconf(_SC_PAGESIZE);
1168 void *stack = alloca(stack_size);
1169
1170 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1171 if (ret < 0 || ret >= sizeof(fnam))
1172 _exit(1);
1173
1174 fd = open(fnam, O_RDONLY);
1175 if (fd < 0) {
1176 perror("write_task_init_pid_exit open of ns/pid");
1177 _exit(1);
1178 }
1179 if (setns(fd, 0)) {
1180 perror("write_task_init_pid_exit setns 1");
1181 close(fd);
1182 _exit(1);
1183 }
1184 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1185 if (pid < 0)
1186 _exit(1);
1187 if (pid != 0) {
1188 if (!wait_for_pid(pid))
1189 _exit(1);
1190 _exit(0);
1191 }
1192 }
1193
1194 static int send_creds_clone_wrapper(void *arg) {
1195 struct ucred cred;
1196 char v;
1197 int sock = *(int *)arg;
1198
1199 /* we are the child */
1200 cred.uid = 0;
1201 cred.gid = 0;
1202 cred.pid = 1;
1203 v = '1';
1204 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1205 return 1;
1206 return 0;
1207 }
1208
/*
 * Determine the pid of the init process of @task's pid namespace.
 *
 * A helper process is forked which runs write_task_init_pid_exit() on one
 * end of a datagram socketpair; the credentials received on the other end
 * carry the pid we are after.
 *
 * Returns the pid taken from the received credentials, or -1 when the
 * socketpair, the fork or the receive failed.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred;
	int sock[2];
	char v = '0';
	pid_t helper;
	pid_t initpid = -1;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		/* child: talk over sock[0] only */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (helper > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (helper > 0)
		wait_for_pid(helper);
	return initpid;
}
1242
1243 static pid_t lookup_initpid_in_store(pid_t qpid)
1244 {
1245 pid_t answer = 0;
1246 struct stat sb;
1247 struct pidns_init_store *e;
1248 char fnam[100];
1249
1250 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1251 store_lock();
1252 if (stat(fnam, &sb) < 0)
1253 goto out;
1254 e = lookup_verify_initpid(&sb);
1255 if (e) {
1256 answer = e->initpid;
1257 goto out;
1258 }
1259 answer = get_init_pid_for_task(qpid);
1260 if (answer > 0)
1261 save_initpid(&sb, answer);
1262
1263 out:
1264 /* we prune at end in case we are returning
1265 * the value we were about to return */
1266 prune_initpid_store();
1267 store_unlock();
1268 return answer;
1269 }
1270
/*
 * Reap child @pid, retrying when waitpid() is interrupted by a signal.
 *
 * Returns 0 if the child terminated normally with exit status 0.
 * Returns -1 if @pid is not positive, if waitpid() fails for a reason
 * other than EINTR, or if the child did not exit cleanly.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1291
1292
1293 /*
1294 * append pid to *src.
1295 * src: a pointer to a char* in which ot append the pid.
1296 * sz: the number of characters printed so far, minus trailing \0.
1297 * asz: the allocated size so far
1298 * pid: the pid to append
1299 */
1300 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1301 {
1302 char tmp[30];
1303
1304 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1305
1306 if (!*src || tmplen + *sz + 1 >= *asz) {
1307 char *tmp;
1308 do {
1309 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1310 } while (!tmp);
1311 *src = tmp;
1312 *asz += BUF_RESERVE_SIZE;
1313 }
1314 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1315 *sz += tmplen;
1316 }
1317
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id that is valid
 * in the caller's namespace, return the id as mapped into pid's namespace.
 *
 * The file is rewound first, so the same FILE * can be used for several
 * lookups. Lines that do not parse as three unsigned numbers are skipped.
 * Returns the mapped id, or -1 (i.e. UINT_MAX) if no range contains
 * @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];
	unsigned int ns_base,   // first id of the range in the idfile's namespace
		     host_base, // first id of the range in the caller's namespace
		     range;     // number of ids in the range

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around - unexpected for a
			 * procfile, so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither sum
		 * wraps, so ns_base + (in_id - host_base) is smaller than
		 * ns_base + range and cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	// no matching range
	return -1;
}
1361
/*
 * For is_privileged_over(): whether the calling uid is required to be
 * root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the buffers used to build /proc/<pid>/... paths. */
#define PROCLEN 100

/*
 * Decide whether @uid, as seen by task @pid, is privileged over @victim.
 *
 * Returns false if either id is -1. With NS_ROOT_OPT an identical uid is
 * sufficient. Otherwise @uid must map to 0 in the user namespace described
 * by /proc/<pid>/uid_map and @victim must be mapped in that namespace.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	FILE *f;
	bool answer;
	int ret;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If root in the namespace is not required, having the same uid
	 * suffices (i.e. uid 1000 has write access to files owned by
	 * uid 1000).
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;

	f = fopen(fpath, "r");
	if (!f)
		return false;

	/*
	 * The caller must be root in its namespace, and the victim must be
	 * mapped into that namespace. The second lookup only runs when the
	 * first one succeeded.
	 * XXX it is unclear whether the victim check is needed given that
	 * fuse sends requests which the vfs has already converted.
	 */
	answer = convert_id_to_ns(f, uid) == 0 &&
		 convert_id_to_ns(f, victim) != -1;

	fclose(f);
	return answer;
}
1417
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * mode requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR; only the
 * O_ACCMODE bits are looked at). Callers shift @fmode so that the bits of
 * the class they want tested end up in the "other" position.
 *
 * Returns false for an unknown access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	int accmode = req_mode & O_ACCMODE;

	if (accmode == O_RDONLY)
		needed = S_IROTH;
	else if (accmode == O_WRONLY)
		needed = S_IWOTH;
	else if (accmode == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1437
1438
/*
 * Return the path component of @taskcg that follows the prefix @querycg.
 *
 * For querycg "/" or "./" one leading character of taskcg is skipped,
 * otherwise strlen(querycg) + 1 characters are skipped; the result is cut
 * at the next '/'. E.g. taskcg "a/b/c" and querycg "a" yield "b".
 *
 * Returns a newly allocated string which the caller must free, or NULL if
 * taskcg is not longer than querycg or the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (!strcmp(querycg, "/") || !strcmp(querycg, "./"))
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1464
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *end = x + strlen(x);

	if (end != x && end[-1] == '\n')
		end[-1] = '\0';
}
1471
1472 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1473 {
1474 int cfd;
1475 char fnam[PROCLEN];
1476 FILE *f;
1477 char *answer = NULL;
1478 char *line = NULL;
1479 size_t len = 0;
1480 int ret;
1481 const char *h = find_mounted_controller(contrl, &cfd);
1482 if (!h)
1483 return NULL;
1484
1485 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1486 if (ret < 0 || ret >= PROCLEN)
1487 return NULL;
1488 if (!(f = fopen(fnam, "r")))
1489 return NULL;
1490
1491 while (getline(&line, &len, f) != -1) {
1492 char *c1, *c2;
1493 if (!line[0])
1494 continue;
1495 c1 = strchr(line, ':');
1496 if (!c1)
1497 goto out;
1498 c1++;
1499 c2 = strchr(c1, ':');
1500 if (!c2)
1501 goto out;
1502 *c2 = '\0';
1503 if (strcmp(c1, h) != 0)
1504 continue;
1505 c2++;
1506 stripnewline(c2);
1507 do {
1508 answer = strdup(c2);
1509 } while (!answer);
1510 break;
1511 }
1512
1513 out:
1514 fclose(f);
1515 free(line);
1516 return answer;
1517 }
1518
1519 /*
1520 * check whether a fuse context may access a cgroup dir or file
1521 *
1522 * If file is not null, it is a cgroup file to check under cg.
1523 * If file is null, then we are checking perms on cg itself.
1524 *
1525 * For files we can check the mode of the list_keys result.
1526 * For cgroups, we must make assumptions based on the files under the
1527 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1528 * yet.
1529 */
1530 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1531 {
1532 struct cgfs_files *k = NULL;
1533 bool ret = false;
1534
1535 k = cgfs_get_key(contrl, cg, file);
1536 if (!k)
1537 return false;
1538
1539 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1540 if (perms_include(k->mode >> 6, mode)) {
1541 ret = true;
1542 goto out;
1543 }
1544 }
1545 if (fc->gid == k->gid) {
1546 if (perms_include(k->mode >> 3, mode)) {
1547 ret = true;
1548 goto out;
1549 }
1550 }
1551 ret = perms_include(k->mode, mode);
1552
1553 out:
1554 free_key(k);
1555 return ret;
1556 }
1557
#define INITSCOPE "/init.scope"

/*
 * Strip a trailing "/init.scope" from @cg in place. If @cg consists of
 * nothing but "/init.scope", it is reduced to "/".
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + (cg_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	/* keep the leading '/' when the suffix is the whole string */
	if (tail == cg)
		tail++;
	*tail = '\0';
}
1575
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * The test is a prefix comparison of pid's cgroup (with a trailing
 * "/init.scope" pruned) against @cg. If the answer is false and @nextcg
 * is not NULL, *nextcg is set to the result of get_next_cgroup_dir(),
 * which the caller must free. Returns false as well when pid's cgroup
 * cannot be determined; *nextcg is left untouched in that case.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	const char *linecmp;
	bool answer;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise they pass in a cgroup without leading '/'. In the
	 * latter case one leading character of the task's cgroup is
	 * skipped so that both sides have the same form.
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 *       Serge, do you know?
	 */
	linecmp = (*cg == '/' || strncmp(cg, "./", 2) == 0) ? taskcg : taskcg + 1;

	answer = strncmp(linecmp, cg, strlen(linecmp)) == 0;
	if (!answer && nextcg)
		*nextcg = get_next_cgroup_dir(linecmp, cg);

	free(taskcg);
	return answer;
}
1618
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is visible to everybody. Otherwise @cg is
 * visible if the task lives in the root cgroup, if @cg is the task's
 * cgroup, or if one of the two is a '/'-delimited prefix of the other.
 * Returns false when the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *c2;
	const char *task_cg;
	size_t target_len, task_len;
	bool answer;

	if (!strcmp(cg, "/") || !strcmp(cg, "./"))
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* compare without the first character of the task's cgroup */
	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg, it can see everything. The
		 * comparisons below do not cover this case: they test for
		 * the separating '/', but here that is the very first '/'
		 * which was skipped above.
		 */
		answer = true;
	} else if (strcmp(cg, task_cg) == 0) {
		answer = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		answer = strncmp(task_cg, cg, target_len) == 0 &&
			 task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		answer = strncmp(task_cg, cg, task_len) == 0 &&
			 cg[task_len] == '/';
	} else {
		answer = false;
	}

	free(c2);
	return answer;
}
1669
1670 /*
1671 * given /cgroup/freezer/a/b, return "freezer".
1672 * the returned char* should NOT be freed.
1673 */
1674 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1675 {
1676 const char *p1;
1677 char *contr, *slash;
1678
1679 if (strlen(path) < 9) {
1680 errno = EACCES;
1681 return NULL;
1682 }
1683 if (*(path + 7) != '/') {
1684 errno = EINVAL;
1685 return NULL;
1686 }
1687 p1 = path + 8;
1688 contr = strdupa(p1);
1689 if (!contr) {
1690 errno = ENOMEM;
1691 return NULL;
1692 }
1693 slash = strstr(contr, "/");
1694 if (slash)
1695 *slash = '\0';
1696
1697 int i;
1698 for (i = 0; i < num_hierarchies; i++) {
1699 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1700 return hierarchies[i];
1701 }
1702 errno = ENOENT;
1703 return NULL;
1704 }
1705
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * Returns a pointer into @path just behind the first '/' found at or
 * after offset 8, with errno set to 0. Returns NULL with errno EACCES if
 * @path is shorter than 9 characters, or with errno EINVAL if there is no
 * such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strchr(path + 8, '/');
	if (slash == NULL) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1726
/*
 * Split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg, cut off at its last '/';
 * it must be freed by the caller. @last points at the last '/' inside
 * @cg itself (so it includes the slash) and must not be freed.
 * If @cg contains no '/', @dir is a full copy of @cg and @last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy = NULL;
	char *cut;

	/* keep trying until the copy succeeds */
	while (!copy)
		copy = strdup(cg);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last == NULL)
		return;

	cut = strrchr(copy, '/');
	*cut = '\0';
}
1746
1747 /*
1748 * FUSE ops for /cgroup
1749 */
1750
1751 int cg_getattr(const char *path, struct stat *sb)
1752 {
1753 struct timespec now;
1754 struct fuse_context *fc = fuse_get_context();
1755 char * cgdir = NULL;
1756 char *last = NULL, *path1, *path2;
1757 struct cgfs_files *k = NULL;
1758 const char *cgroup;
1759 const char *controller = NULL;
1760 int ret = -ENOENT;
1761
1762
1763 if (!fc)
1764 return -EIO;
1765
1766 memset(sb, 0, sizeof(struct stat));
1767
1768 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1769 return -EINVAL;
1770
1771 sb->st_uid = sb->st_gid = 0;
1772 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1773 sb->st_size = 0;
1774
1775 if (strcmp(path, "/cgroup") == 0) {
1776 sb->st_mode = S_IFDIR | 00755;
1777 sb->st_nlink = 2;
1778 return 0;
1779 }
1780
1781 controller = pick_controller_from_path(fc, path);
1782 if (!controller)
1783 return -errno;
1784 cgroup = find_cgroup_in_path(path);
1785 if (!cgroup) {
1786 /* this is just /cgroup/controller, return it as a dir */
1787 sb->st_mode = S_IFDIR | 00755;
1788 sb->st_nlink = 2;
1789 return 0;
1790 }
1791
1792 get_cgdir_and_path(cgroup, &cgdir, &last);
1793
1794 if (!last) {
1795 path1 = "/";
1796 path2 = cgdir;
1797 } else {
1798 path1 = cgdir;
1799 path2 = last;
1800 }
1801
1802 pid_t initpid = lookup_initpid_in_store(fc->pid);
1803 if (initpid <= 0)
1804 initpid = fc->pid;
1805 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1806 * Then check that caller's cgroup is under path if last is a child
1807 * cgroup, or cgdir if last is a file */
1808
1809 if (is_child_cgroup(controller, path1, path2)) {
1810 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1811 ret = -ENOENT;
1812 goto out;
1813 }
1814 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1815 /* this is just /cgroup/controller, return it as a dir */
1816 sb->st_mode = S_IFDIR | 00555;
1817 sb->st_nlink = 2;
1818 ret = 0;
1819 goto out;
1820 }
1821 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1822 ret = -EACCES;
1823 goto out;
1824 }
1825
1826 // get uid, gid, from '/tasks' file and make up a mode
1827 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1828 sb->st_mode = S_IFDIR | 00755;
1829 k = cgfs_get_key(controller, cgroup, NULL);
1830 if (!k) {
1831 sb->st_uid = sb->st_gid = 0;
1832 } else {
1833 sb->st_uid = k->uid;
1834 sb->st_gid = k->gid;
1835 }
1836 free_key(k);
1837 sb->st_nlink = 2;
1838 ret = 0;
1839 goto out;
1840 }
1841
1842 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1843 sb->st_mode = S_IFREG | k->mode;
1844 sb->st_nlink = 1;
1845 sb->st_uid = k->uid;
1846 sb->st_gid = k->gid;
1847 sb->st_size = 0;
1848 free_key(k);
1849 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1850 ret = -ENOENT;
1851 goto out;
1852 }
1853 ret = 0;
1854 }
1855
1856 out:
1857 free(cgdir);
1858 return ret;
1859 }
1860
1861 int cg_opendir(const char *path, struct fuse_file_info *fi)
1862 {
1863 struct fuse_context *fc = fuse_get_context();
1864 const char *cgroup;
1865 struct file_info *dir_info;
1866 char *controller = NULL;
1867
1868 if (!fc)
1869 return -EIO;
1870
1871 if (strcmp(path, "/cgroup") == 0) {
1872 cgroup = NULL;
1873 controller = NULL;
1874 } else {
1875 // return list of keys for the controller, and list of child cgroups
1876 controller = pick_controller_from_path(fc, path);
1877 if (!controller)
1878 return -errno;
1879
1880 cgroup = find_cgroup_in_path(path);
1881 if (!cgroup) {
1882 /* this is just /cgroup/controller, return its contents */
1883 cgroup = "/";
1884 }
1885 }
1886
1887 pid_t initpid = lookup_initpid_in_store(fc->pid);
1888 if (initpid <= 0)
1889 initpid = fc->pid;
1890 if (cgroup) {
1891 if (!caller_may_see_dir(initpid, controller, cgroup))
1892 return -ENOENT;
1893 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1894 return -EACCES;
1895 }
1896
1897 /* we'll free this at cg_releasedir */
1898 dir_info = malloc(sizeof(*dir_info));
1899 if (!dir_info)
1900 return -ENOMEM;
1901 dir_info->controller = must_copy_string(controller);
1902 dir_info->cgroup = must_copy_string(cgroup);
1903 dir_info->type = LXC_TYPE_CGDIR;
1904 dir_info->buf = NULL;
1905 dir_info->file = NULL;
1906 dir_info->buflen = 0;
1907
1908 fi->fh = (unsigned long)dir_info;
1909 return 0;
1910 }
1911
1912 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1913 struct fuse_file_info *fi)
1914 {
1915 struct file_info *d = (struct file_info *)fi->fh;
1916 struct cgfs_files **list = NULL;
1917 int i, ret;
1918 char *nextcg = NULL;
1919 struct fuse_context *fc = fuse_get_context();
1920 char **clist = NULL;
1921
1922 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1923 return -EIO;
1924
1925 if (d->type != LXC_TYPE_CGDIR) {
1926 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1927 return -EIO;
1928 }
1929 if (!d->cgroup && !d->controller) {
1930 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1931 int i;
1932
1933 for (i = 0; i < num_hierarchies; i++) {
1934 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1935 return -EIO;
1936 }
1937 }
1938 return 0;
1939 }
1940
1941 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1942 // not a valid cgroup
1943 ret = -EINVAL;
1944 goto out;
1945 }
1946
1947 pid_t initpid = lookup_initpid_in_store(fc->pid);
1948 if (initpid <= 0)
1949 initpid = fc->pid;
1950 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1951 if (nextcg) {
1952 ret = filler(buf, nextcg, NULL, 0);
1953 free(nextcg);
1954 if (ret != 0) {
1955 ret = -EIO;
1956 goto out;
1957 }
1958 }
1959 ret = 0;
1960 goto out;
1961 }
1962
1963 for (i = 0; list[i]; i++) {
1964 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1965 ret = -EIO;
1966 goto out;
1967 }
1968 }
1969
1970 // now get the list of child cgroups
1971
1972 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1973 ret = 0;
1974 goto out;
1975 }
1976 if (clist) {
1977 for (i = 0; clist[i]; i++) {
1978 if (filler(buf, clist[i], NULL, 0) != 0) {
1979 ret = -EIO;
1980 goto out;
1981 }
1982 }
1983 }
1984 ret = 0;
1985
1986 out:
1987 free_keys(list);
1988 if (clist) {
1989 for (i = 0; clist[i]; i++)
1990 free(clist[i]);
1991 free(clist);
1992 }
1993 return ret;
1994 }
1995
1996 static void do_release_file_info(struct fuse_file_info *fi)
1997 {
1998 struct file_info *f = (struct file_info *)fi->fh;
1999
2000 if (!f)
2001 return;
2002
2003 fi->fh = 0;
2004
2005 free(f->controller);
2006 f->controller = NULL;
2007 free(f->cgroup);
2008 f->cgroup = NULL;
2009 free(f->file);
2010 f->file = NULL;
2011 free(f->buf);
2012 f->buf = NULL;
2013 free(f);
2014 }
2015
/*
 * releasedir: free the directory handle stored in fi->fh.
 * @path is not used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2021
2022 int cg_open(const char *path, struct fuse_file_info *fi)
2023 {
2024 const char *cgroup;
2025 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2026 struct cgfs_files *k = NULL;
2027 struct file_info *file_info;
2028 struct fuse_context *fc = fuse_get_context();
2029 int ret;
2030
2031 if (!fc)
2032 return -EIO;
2033
2034 controller = pick_controller_from_path(fc, path);
2035 if (!controller)
2036 return -errno;
2037 cgroup = find_cgroup_in_path(path);
2038 if (!cgroup)
2039 return -errno;
2040
2041 get_cgdir_and_path(cgroup, &cgdir, &last);
2042 if (!last) {
2043 path1 = "/";
2044 path2 = cgdir;
2045 } else {
2046 path1 = cgdir;
2047 path2 = last;
2048 }
2049
2050 k = cgfs_get_key(controller, path1, path2);
2051 if (!k) {
2052 ret = -EINVAL;
2053 goto out;
2054 }
2055 free_key(k);
2056
2057 pid_t initpid = lookup_initpid_in_store(fc->pid);
2058 if (initpid <= 0)
2059 initpid = fc->pid;
2060 if (!caller_may_see_dir(initpid, controller, path1)) {
2061 ret = -ENOENT;
2062 goto out;
2063 }
2064 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2065 ret = -EACCES;
2066 goto out;
2067 }
2068
2069 /* we'll free this at cg_release */
2070 file_info = malloc(sizeof(*file_info));
2071 if (!file_info) {
2072 ret = -ENOMEM;
2073 goto out;
2074 }
2075 file_info->controller = must_copy_string(controller);
2076 file_info->cgroup = must_copy_string(path1);
2077 file_info->file = must_copy_string(path2);
2078 file_info->type = LXC_TYPE_CGFILE;
2079 file_info->buf = NULL;
2080 file_info->buflen = 0;
2081
2082 fi->fh = (unsigned long)file_info;
2083 ret = 0;
2084
2085 out:
2086 free(cgdir);
2087 return ret;
2088 }
2089
2090 int cg_access(const char *path, int mode)
2091 {
2092 int ret;
2093 const char *cgroup;
2094 char *path1, *path2, *controller;
2095 char *last = NULL, *cgdir = NULL;
2096 struct cgfs_files *k = NULL;
2097 struct fuse_context *fc = fuse_get_context();
2098
2099 if (strcmp(path, "/cgroup") == 0)
2100 return 0;
2101
2102 if (!fc)
2103 return -EIO;
2104
2105 controller = pick_controller_from_path(fc, path);
2106 if (!controller)
2107 return -errno;
2108 cgroup = find_cgroup_in_path(path);
2109 if (!cgroup) {
2110 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2111 if ((mode & W_OK) == 0)
2112 return 0;
2113 return -EACCES;
2114 }
2115
2116 get_cgdir_and_path(cgroup, &cgdir, &last);
2117 if (!last) {
2118 path1 = "/";
2119 path2 = cgdir;
2120 } else {
2121 path1 = cgdir;
2122 path2 = last;
2123 }
2124
2125 k = cgfs_get_key(controller, path1, path2);
2126 if (!k) {
2127 if ((mode & W_OK) == 0)
2128 ret = 0;
2129 else
2130 ret = -EACCES;
2131 goto out;
2132 }
2133 free_key(k);
2134
2135 pid_t initpid = lookup_initpid_in_store(fc->pid);
2136 if (initpid <= 0)
2137 initpid = fc->pid;
2138 if (!caller_may_see_dir(initpid, controller, path1)) {
2139 ret = -ENOENT;
2140 goto out;
2141 }
2142 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2143 ret = -EACCES;
2144 goto out;
2145 }
2146
2147 ret = 0;
2148
2149 out:
2150 free(cgdir);
2151 return ret;
2152 }
2153
/*
 * release: free the file handle stored in fi->fh.
 * @path is not used. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2159
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable or is hung up, for roughly @timeout
 * seconds in total. Interrupted waits are restarted with the remaining
 * time.
 *
 * Returns true if an event was reported. Returns false on timeout or
 * error; errno is 0 when the deadline had already passed before waiting
 * and otherwise holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, deltatime, saved_errno;
	time_t now, starttime;

	starttime = time(NULL);
	if (starttime == (time_t)-1)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* report the errno text; "%m" is not expanded inside a %s argument */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	now = time(NULL);
	if (now == (time_t)-1) {
		close(epfd);
		return false;
	}

	/* seconds left until the deadline */
	deltatime = (int)((starttime + timeout) - now);
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may change errno, keep the value from epoll_wait() */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2207
/*
 * Receive up to @len bytes from @sockfd into @buf, after waiting for the
 * socket with a timeout of 2 (see wait_for_sock()).
 * Returns the result of recv(), or -1 if the wait failed.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (ready)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
2214
2215 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2216 {
2217 struct msghdr msg = { 0 };
2218 struct iovec iov;
2219 struct cmsghdr *cmsg;
2220 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2221 char buf[1];
2222 buf[0] = 'p';
2223
2224 if (pingfirst) {
2225 if (msgrecv(sock, buf, 1) != 1) {
2226 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2227 return SEND_CREDS_FAIL;
2228 }
2229 }
2230
2231 msg.msg_control = cmsgbuf;
2232 msg.msg_controllen = sizeof(cmsgbuf);
2233
2234 cmsg = CMSG_FIRSTHDR(&msg);
2235 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2236 cmsg->cmsg_level = SOL_SOCKET;
2237 cmsg->cmsg_type = SCM_CREDENTIALS;
2238 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2239
2240 msg.msg_name = NULL;
2241 msg.msg_namelen = 0;
2242
2243 buf[0] = v;
2244 iov.iov_base = buf;
2245 iov.iov_len = sizeof(buf);
2246 msg.msg_iov = &iov;
2247 msg.msg_iovlen = 1;
2248
2249 if (sendmsg(sock, &msg, 0) < 0) {
2250 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2251 if (errno == 3)
2252 return SEND_CREDS_NOTSK;
2253 return SEND_CREDS_FAIL;
2254 }
2255
2256 return SEND_CREDS_OK;
2257 }
2258
/*
 * Receive one byte plus SCM_CREDENTIALS over @sock.
 *
 * SO_PASSCRED is enabled on the socket and the byte '1' is written to it
 * before waiting (with a timeout of 2, see wait_for_sock()) for the
 * message.
 *
 * On success true is returned and *v holds the received byte. @cred is
 * filled from the control message only if the message carries a
 * well-formed SCM_CREDENTIALS header; otherwise all three fields stay -1.
 * On failure false is returned, *v is '1' and @cred holds -1 in all
 * fields.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1] = { '1' };
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	int optval = 1;
	int ret;

	/* defaults reported when nothing (usable) is received */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* kick off the exchange with a single byte */
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}

	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg != NULL &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
2316
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* pipe pair; the wrapper closes [0] and writes its ACK to [1] */
	int *cpipe;
	/* first argument passed on to @wrapped */
	int sock;
	/* second argument passed on to @wrapped */
	pid_t tpid;
	/* function to run after the ACK: pid_from_ns or pid_to_ns */
	int (*wrapped)(int, pid_t);
};
2323
2324 /*
2325 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2326 * with clone(). This simply writes '1' as ACK back to the parent
2327 * before calling the actual wrapped function.
2328 */
2329 static int pid_ns_clone_wrapper(void *arg) {
2330 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2331 char b = '1';
2332
2333 close(args->cpipe[0]);
2334 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2335 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2336 close(args->cpipe[1]);
2337 return args->wrapped(args->sock, args->tpid);
2338 }
2339
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket. This shifts the pid from the
 * sender's pidns into the pidns this function runs in.
 *
 * The loop ends when receiving fails or when the byte '1' arrives
 * (return 0), or when writing the pid back fails (return 1).
 * @tpid is not used.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			return 0;
		/* '1' is the request to stop */
		if (v == '1')
			return 0;
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			return 1;
	}
}
2358
2359
2360 /*
2361 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2362 * in your old pidns. Only children which you clone will be in the target
2363 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2364 * actually convert pids.
2365 *
2366 * Note: glibc's fork() does not respect pidns, which can lead to failed
2367 * assertions inside glibc (and thus failed forks) if the child's pid in
2368 * the pidns and the parent pid outside are identical. Using clone prevents
2369 * this issue.
2370 */
2371 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2372 {
2373 int newnsfd = -1, ret, cpipe[2];
2374 char fnam[100];
2375 pid_t cpid;
2376 char v;
2377
2378 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2379 if (ret < 0 || ret >= sizeof(fnam))
2380 _exit(1);
2381 newnsfd = open(fnam, O_RDONLY);
2382 if (newnsfd < 0)
2383 _exit(1);
2384 if (setns(newnsfd, 0) < 0)
2385 _exit(1);
2386 close(newnsfd);
2387
2388 if (pipe(cpipe) < 0)
2389 _exit(1);
2390
2391 struct pid_ns_clone_args args = {
2392 .cpipe = cpipe,
2393 .sock = sock,
2394 .tpid = tpid,
2395 .wrapped = &pid_to_ns
2396 };
2397 size_t stack_size = sysconf(_SC_PAGESIZE);
2398 void *stack = alloca(stack_size);
2399
2400 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2401 if (cpid < 0)
2402 _exit(1);
2403
2404 // give the child 1 second to be done forking and
2405 // write its ack
2406 if (!wait_for_sock(cpipe[0], 1))
2407 _exit(1);
2408 ret = read(cpipe[0], &v, 1);
2409 if (ret != sizeof(char) || v != '1')
2410 _exit(1);
2411
2412 if (!wait_for_pid(cpid))
2413 _exit(1);
2414 _exit(0);
2415 }
2416
2417 /*
2418 * To read cgroup files with a particular pid, we will setns into the child
2419 * pidns, open a pipe, fork a child - which will be the first to really be in
2420 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2421 */
2422 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2423 {
2424 int sock[2] = {-1, -1};
2425 char *tmpdata = NULL;
2426 int ret;
2427 pid_t qpid, cpid = -1;
2428 bool answer = false;
2429 char v = '0';
2430 struct ucred cred;
2431 size_t sz = 0, asz = 0;
2432
2433 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2434 return false;
2435
2436 /*
2437 * Now we read the pids from returned data one by one, pass
2438 * them into a child in the target namespace, read back the
2439 * translated pids, and put them into our to-return data
2440 */
2441
2442 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2443 perror("socketpair");
2444 free(tmpdata);
2445 return false;
2446 }
2447
2448 cpid = fork();
2449 if (cpid == -1)
2450 goto out;
2451
2452 if (!cpid) // child - exits when done
2453 pid_to_ns_wrapper(sock[1], tpid);
2454
2455 char *ptr = tmpdata;
2456 cred.uid = 0;
2457 cred.gid = 0;
2458 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2459 cred.pid = qpid;
2460 ret = send_creds(sock[0], &cred, v, true);
2461
2462 if (ret == SEND_CREDS_NOTSK)
2463 goto next;
2464 if (ret == SEND_CREDS_FAIL)
2465 goto out;
2466
2467 // read converted results
2468 if (!wait_for_sock(sock[0], 2)) {
2469 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2470 goto out;
2471 }
2472 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2473 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2474 goto out;
2475 }
2476 must_strcat_pid(d, &sz, &asz, qpid);
2477 next:
2478 ptr = strchr(ptr, '\n');
2479 if (!ptr)
2480 break;
2481 ptr++;
2482 }
2483
2484 cred.pid = getpid();
2485 v = '1';
2486 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2487 // failed to ask child to exit
2488 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2489 goto out;
2490 }
2491
2492 answer = true;
2493
2494 out:
2495 free(tmpdata);
2496 if (cpid != -1)
2497 wait_for_pid(cpid);
2498 if (sock[0] != -1) {
2499 close(sock[0]);
2500 close(sock[1]);
2501 }
2502 return answer;
2503 }
2504
2505 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2506 struct fuse_file_info *fi)
2507 {
2508 struct fuse_context *fc = fuse_get_context();
2509 struct file_info *f = (struct file_info *)fi->fh;
2510 struct cgfs_files *k = NULL;
2511 char *data = NULL;
2512 int ret, s;
2513 bool r;
2514
2515 if (f->type != LXC_TYPE_CGFILE) {
2516 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2517 return -EIO;
2518 }
2519
2520 if (offset)
2521 return 0;
2522
2523 if (!fc)
2524 return -EIO;
2525
2526 if (!f->controller)
2527 return -EINVAL;
2528
2529 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2530 return -EINVAL;
2531 }
2532 free_key(k);
2533
2534
2535 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2536 ret = -EACCES;
2537 goto out;
2538 }
2539
2540 if (strcmp(f->file, "tasks") == 0 ||
2541 strcmp(f->file, "/tasks") == 0 ||
2542 strcmp(f->file, "/cgroup.procs") == 0 ||
2543 strcmp(f->file, "cgroup.procs") == 0)
2544 // special case - we have to translate the pids
2545 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2546 else
2547 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2548
2549 if (!r) {
2550 ret = -EINVAL;
2551 goto out;
2552 }
2553
2554 if (!data) {
2555 ret = 0;
2556 goto out;
2557 }
2558 s = strlen(data);
2559 if (s > size)
2560 s = size;
2561 memcpy(buf, data, s);
2562 if (s > 0 && s < size && data[s-1] != '\n')
2563 buf[s++] = '\n';
2564
2565 ret = s;
2566
2567 out:
2568 free(data);
2569 return ret;
2570 }
2571
2572 static int pid_from_ns(int sock, pid_t tpid)
2573 {
2574 pid_t vpid;
2575 struct ucred cred;
2576 char v;
2577 int ret;
2578
2579 cred.uid = 0;
2580 cred.gid = 0;
2581 while (1) {
2582 if (!wait_for_sock(sock, 2)) {
2583 lxcfs_error("%s\n", "Timeout reading from parent.");
2584 return 1;
2585 }
2586 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2587 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2588 return 1;
2589 }
2590 if (vpid == -1) // done
2591 break;
2592 v = '0';
2593 cred.pid = vpid;
2594 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2595 v = '1';
2596 cred.pid = getpid();
2597 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2598 return 1;
2599 }
2600 }
2601 return 0;
2602 }
2603
/*
 * Runs in a forked helper and never returns: every path ends in _exit().
 *
 * Joins the pid namespace of @tpid via /proc/<tpid>/ns/pid, then clone()s
 * a child with pid_ns_clone_wrapper(), handing it @sock, @tpid, a pipe and
 * pid_from_ns as the function to wrap (presumably pid_ns_clone_wrapper()
 * calls args->wrapped in the child - confirm against its definition).
 *
 * Exit status is 0 only if the child acknowledged with '1' on the pipe
 * within one second and wait_for_pid() reported success for it; any setup
 * failure exits with status 1.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* build the path of the target's pid namespace handle */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* pipe on which the cloned child sends its ack */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* the child's stack is one page taken from this function's own stack */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* pass the top of the region: stack + stack_size */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2649
/*
 * Translate host uid @uid through /proc/@pid/uid_map.
 *
 * The mapped id, as computed by convert_id_to_ns(), is stored in *@answer.
 * Returns false when the map file cannot be opened or when the stored
 * result equals -1; true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mapfile[400];
	FILE *map;

	sprintf(mapfile, "/proc/%d/uid_map", pid);
	map = fopen(mapfile, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2671
/*
 * get_pid_creds: look up the uid and gid of @pid in /proc/@pid/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is stored in *@uid and
 * *@gid.  Both are preset to -1 and keep that value when the file cannot
 * be opened or the corresponding line is missing; parsing stops at the
 * first malformed line.
 * (XXX should we use euid here?)
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t parsed_uid;
	gid_t parsed_gid;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (!status) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, 400, status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
			continue;
		}

		if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
2710
/*
 * May the requestor @r (running as host uid @r_uid) move victim @v to a
 * new cgroup?  This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* the requestor has to be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid has to be visible from there */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2736
/*
 * Move the pids listed in @buf into cgroup @cg of controller @contrl on
 * behalf of the task @tpid running as uid @tuid.
 *
 * @buf holds decimal pids separated by newlines.  Each one is written to a
 * forked helper (pid_from_ns_wrapper(), which joins @tpid's pid namespace)
 * over a datagram socketpair; the helper answers with socket credentials,
 * and the pid reported by recv_creds() is the one written to the file
 * returned by open_pids_file(), provided may_move_pid() allows it.
 *
 * The kernel accepts one pid per write() on cgroup-v1 tasks/cgroup.procs
 * files, so the stream is flushed after every pid.  Without the flush,
 * stdio buffering can merge several pids into a single write of
 * concatenated digits.
 *
 * @file is not used.  Returns true only if no pid was refused, every write
 * and flush succeeded and the final fclose() succeeded.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;
	const char *ptr = buf;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child - pid_from_ns_wrapper() never returns
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			/* '0' marks a reply that carries the requested pid */
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				/* one pid per write(): push it out before the next one */
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0 ||
				    fflush(pids_file) != 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* a pid of -1 tells the helper to exit */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2815
2816 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2817 struct fuse_file_info *fi)
2818 {
2819 struct fuse_context *fc = fuse_get_context();
2820 char *localbuf = NULL;
2821 struct cgfs_files *k = NULL;
2822 struct file_info *f = (struct file_info *)fi->fh;
2823 bool r;
2824
2825 if (f->type != LXC_TYPE_CGFILE) {
2826 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2827 return -EIO;
2828 }
2829
2830 if (offset)
2831 return 0;
2832
2833 if (!fc)
2834 return -EIO;
2835
2836 localbuf = alloca(size+1);
2837 localbuf[size] = '\0';
2838 memcpy(localbuf, buf, size);
2839
2840 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2841 size = -EINVAL;
2842 goto out;
2843 }
2844
2845 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2846 size = -EACCES;
2847 goto out;
2848 }
2849
2850 if (strcmp(f->file, "tasks") == 0 ||
2851 strcmp(f->file, "/tasks") == 0 ||
2852 strcmp(f->file, "/cgroup.procs") == 0 ||
2853 strcmp(f->file, "cgroup.procs") == 0)
2854 // special case - we have to translate the pids
2855 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2856 else
2857 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2858
2859 if (!r)
2860 size = -EINVAL;
2861
2862 out:
2863 free_key(k);
2864 return size;
2865 }
2866
2867 int cg_chown(const char *path, uid_t uid, gid_t gid)
2868 {
2869 struct fuse_context *fc = fuse_get_context();
2870 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2871 struct cgfs_files *k = NULL;
2872 const char *cgroup;
2873 int ret;
2874
2875 if (!fc)
2876 return -EIO;
2877
2878 if (strcmp(path, "/cgroup") == 0)
2879 return -EPERM;
2880
2881 controller = pick_controller_from_path(fc, path);
2882 if (!controller)
2883 return errno == ENOENT ? -EPERM : -errno;
2884
2885 cgroup = find_cgroup_in_path(path);
2886 if (!cgroup)
2887 /* this is just /cgroup/controller */
2888 return -EPERM;
2889
2890 get_cgdir_and_path(cgroup, &cgdir, &last);
2891
2892 if (!last) {
2893 path1 = "/";
2894 path2 = cgdir;
2895 } else {
2896 path1 = cgdir;
2897 path2 = last;
2898 }
2899
2900 if (is_child_cgroup(controller, path1, path2)) {
2901 // get uid, gid, from '/tasks' file and make up a mode
2902 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2903 k = cgfs_get_key(controller, cgroup, "tasks");
2904
2905 } else
2906 k = cgfs_get_key(controller, path1, path2);
2907
2908 if (!k) {
2909 ret = -EINVAL;
2910 goto out;
2911 }
2912
2913 /*
2914 * This being a fuse request, the uid and gid must be valid
2915 * in the caller's namespace. So we can just check to make
2916 * sure that the caller is root in his uid, and privileged
2917 * over the file's current owner.
2918 */
2919 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2920 ret = -EACCES;
2921 goto out;
2922 }
2923
2924 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2925
2926 out:
2927 free_key(k);
2928 free(cgdir);
2929
2930 return ret;
2931 }
2932
2933 int cg_chmod(const char *path, mode_t mode)
2934 {
2935 struct fuse_context *fc = fuse_get_context();
2936 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2937 struct cgfs_files *k = NULL;
2938 const char *cgroup;
2939 int ret;
2940
2941 if (!fc)
2942 return -EIO;
2943
2944 if (strcmp(path, "/cgroup") == 0)
2945 return -EPERM;
2946
2947 controller = pick_controller_from_path(fc, path);
2948 if (!controller)
2949 return errno == ENOENT ? -EPERM : -errno;
2950
2951 cgroup = find_cgroup_in_path(path);
2952 if (!cgroup)
2953 /* this is just /cgroup/controller */
2954 return -EPERM;
2955
2956 get_cgdir_and_path(cgroup, &cgdir, &last);
2957
2958 if (!last) {
2959 path1 = "/";
2960 path2 = cgdir;
2961 } else {
2962 path1 = cgdir;
2963 path2 = last;
2964 }
2965
2966 if (is_child_cgroup(controller, path1, path2)) {
2967 // get uid, gid, from '/tasks' file and make up a mode
2968 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2969 k = cgfs_get_key(controller, cgroup, "tasks");
2970
2971 } else
2972 k = cgfs_get_key(controller, path1, path2);
2973
2974 if (!k) {
2975 ret = -EINVAL;
2976 goto out;
2977 }
2978
2979 /*
2980 * This being a fuse request, the uid and gid must be valid
2981 * in the caller's namespace. So we can just check to make
2982 * sure that the caller is root in his uid, and privileged
2983 * over the file's current owner.
2984 */
2985 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2986 ret = -EPERM;
2987 goto out;
2988 }
2989
2990 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2991 ret = -EINVAL;
2992 goto out;
2993 }
2994
2995 ret = 0;
2996 out:
2997 free_key(k);
2998 free(cgdir);
2999 return ret;
3000 }
3001
3002 int cg_mkdir(const char *path, mode_t mode)
3003 {
3004 struct fuse_context *fc = fuse_get_context();
3005 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3006 const char *cgroup;
3007 int ret;
3008
3009 if (!fc)
3010 return -EIO;
3011
3012 controller = pick_controller_from_path(fc, path);
3013 if (!controller)
3014 return errno == ENOENT ? -EPERM : -errno;
3015
3016 cgroup = find_cgroup_in_path(path);
3017 if (!cgroup)
3018 return -errno;
3019
3020 get_cgdir_and_path(cgroup, &cgdir, &last);
3021 if (!last)
3022 path1 = "/";
3023 else
3024 path1 = cgdir;
3025
3026 pid_t initpid = lookup_initpid_in_store(fc->pid);
3027 if (initpid <= 0)
3028 initpid = fc->pid;
3029 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3030 if (!next)
3031 ret = -EINVAL;
3032 else if (last && strcmp(next, last) == 0)
3033 ret = -EEXIST;
3034 else
3035 ret = -EPERM;
3036 goto out;
3037 }
3038
3039 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3040 ret = -EACCES;
3041 goto out;
3042 }
3043 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3044 ret = -EACCES;
3045 goto out;
3046 }
3047
3048 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3049
3050 out:
3051 free(cgdir);
3052 free(next);
3053 return ret;
3054 }
3055
3056 int cg_rmdir(const char *path)
3057 {
3058 struct fuse_context *fc = fuse_get_context();
3059 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3060 const char *cgroup;
3061 int ret;
3062
3063 if (!fc)
3064 return -EIO;
3065
3066 controller = pick_controller_from_path(fc, path);
3067 if (!controller) /* Someone's trying to delete "/cgroup". */
3068 return -EPERM;
3069
3070 cgroup = find_cgroup_in_path(path);
3071 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3072 return -EPERM;
3073
3074 get_cgdir_and_path(cgroup, &cgdir, &last);
3075 if (!last) {
3076 /* Someone's trying to delete a cgroup on the same level as the
3077 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3078 * rmdir "/cgroup/blkio/init.slice".
3079 */
3080 ret = -EPERM;
3081 goto out;
3082 }
3083
3084 pid_t initpid = lookup_initpid_in_store(fc->pid);
3085 if (initpid <= 0)
3086 initpid = fc->pid;
3087 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3088 if (!last || (next && (strcmp(next, last) == 0)))
3089 ret = -EBUSY;
3090 else
3091 ret = -ENOENT;
3092 goto out;
3093 }
3094
3095 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3096 ret = -EACCES;
3097 goto out;
3098 }
3099 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3100 ret = -EACCES;
3101 goto out;
3102 }
3103
3104 if (!cgfs_remove(controller, cgroup)) {
3105 ret = -EINVAL;
3106 goto out;
3107 }
3108
3109 ret = 0;
3110
3111 out:
3112 free(cgdir);
3113 free(next);
3114 return ret;
3115 }
3116
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3123
/*
 * Pick the hierarchical ("total_*") counters out of the text of a
 * memory.stat file.
 *
 * @memstat is scanned line by line.  For every line that starts with one
 * of the known keys, the number following the key is parsed into the
 * matching output parameter and then divided by 1024.  Outputs whose key
 * does not appear are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable)
{
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached },
		{ "total_active_anon",   active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *line;
	size_t i;

	for (line = memstat; *line; line++) {
		for (i = 0; i < nfields; i++) {
			size_t klen = strlen(fields[i].key);

			if (strncmp(line, fields[i].key, klen) != 0)
				continue;
			sscanf(line + klen, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}

		/* jump to the newline; the loop increment steps past it */
		line = strchr(line, '\n');
		if (!line)
			return;
	}
}
3157
/*
 * Find the line "<major>:<minor> <iotype> <value>" in the blkio statistics
 * text @str and store <value> in *@v.
 *
 * *@v is set to 0 first and stays 0 when no line starts with the key.
 * Only the first matching line is used.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *line, *nl;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);
	*v = 0;

	for (line = str; *line; line = nl + 1) {
		if (strncmp(line, key, keylen) == 0) {
			sscanf(line + keylen, "%lu", v);
			return;
		}

		nl = strchr(line, '\n');
		if (!nl)
			return;
	}
}
3180
3181 static int read_file(const char *path, char *buf, size_t size,
3182 struct file_info *d)
3183 {
3184 size_t linelen = 0, total_len = 0, rv = 0;
3185 char *line = NULL;
3186 char *cache = d->buf;
3187 size_t cache_size = d->buflen;
3188 FILE *f = fopen(path, "r");
3189 if (!f)
3190 return 0;
3191
3192 while (getline(&line, &linelen, f) != -1) {
3193 ssize_t l = snprintf(cache, cache_size, "%s", line);
3194 if (l < 0) {
3195 perror("Error writing to cache");
3196 rv = 0;
3197 goto err;
3198 }
3199 if (l >= cache_size) {
3200 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3201 rv = 0;
3202 goto err;
3203 }
3204 cache += l;
3205 cache_size -= l;
3206 total_len += l;
3207 }
3208
3209 d->size = total_len;
3210 if (total_len > size)
3211 total_len = size;
3212
3213 /* read from off 0 */
3214 memcpy(buf, d->buf, total_len);
3215 rv = total_len;
3216 err:
3217 fclose(f);
3218 free(line);
3219 return rv;
3220 }
3221
3222 /*
3223 * FUSE ops for /proc
3224 */
3225
/*
 * Read @file of the memory controller for @cgroup and parse it as a
 * base-10 unsigned number.  Returns (unsigned long)-1 when the value
 * cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *text = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, file, &text))
		limit = strtoul(text, NULL, 10);
	else
		limit = -1;

	free(text);
	return limit;
}
3238
/*
 * Return the smallest value of @file (a memory controller limit) found in
 * @cgroup or any of its ancestors.
 *
 * The walk starts at @cgroup and climbs with dirname() until the root is
 * reached.  Ancestors whose value cannot be read (get_memlimit() returns
 * -1) are ignored.
 *
 * dirname() ends at "/" for absolute paths but at "." for relative or
 * empty ones, so both are treated as the top of the walk; checking only
 * for "/" would loop forever on a path without a leading slash.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	/* dirname() may modify its argument: work on a stack copy */
	char *copy = strdupa(cgroup);
	unsigned long memlimit, retlimit;

	retlimit = get_memlimit(copy, file);

	while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy, file);
		if (memlimit != (unsigned long)-1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
3255
/*
 * FUSE read handler for /proc/meminfo.
 *
 * A read at offset 0 rebuilds the file: the host's /proc/meminfo is read
 * line by line and the memory-related lines are replaced with values taken
 * from the caller's memory cgroup (limit, usage, memory.stat and, when
 * available, the memsw counters).  The result is stored in the per-open
 * cache d->buf and the first @size bytes are copied to @buf.  A read at a
 * non-zero offset is served from that cache.
 *
 * If the caller's memory cgroup cannot be determined the host file is
 * passed through with read_file().
 *
 * Returns the number of bytes copied, 0 on failure to gather or format the
 * data, or -EINVAL when @offset lies beyond the cached size.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* continuation read: serve from the cache filled by the offset-0 read */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* use the stored init pid for the caller, or the caller itself */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	/* the limit is the minimum over the cgroup and all its ancestors */
	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	/* the *_in_bytes values are divided by 1024 to match the kB lines below */
	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable);

	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	/*
	 * Rewrite the host file line by line.  Lines with a known prefix are
	 * regenerated into lbuf; every other line is passed through as is.
	 */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			/* never report more than the host actually has */
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
			/* only rewritten when the memsw counters were readable */
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
			/* swap in use is computed as memsw usage minus memory usage */
			unsigned long swaptotal = memswlimit,
					swapusage = memswusage - memusage,
					swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			/* reported as 0 */
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			/* reported as 0 */
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			/* reported as 0 */
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
					active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
					inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			/* reported as 0 */
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			/* reported as 0 */
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		/* append the chosen line to the cache, failing if it does not fit */
		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	/* mark the cache valid so reads at a non-zero offset can use it */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	/* shared cleanup for success and failure; rv is still 0 on failure */
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3431
/*
 * Read cpuset.cpus for the cgroup @cg.
 * Returns the value in a newly allocated string which the caller must
 * free, or NULL if it cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3444
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line of /proc/cpuinfo and
 * cpu N is accepted by cpu_in_cpuset() for @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3455
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. starts with "processor", a colon and a number.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3467
3468 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3469 struct fuse_file_info *fi)
3470 {
3471 struct fuse_context *fc = fuse_get_context();
3472 struct file_info *d = (struct file_info *)fi->fh;
3473 char *cg;
3474 char *cpuset = NULL;
3475 char *line = NULL;
3476 size_t linelen = 0, total_len = 0, rv = 0;
3477 bool am_printing = false, firstline = true, is_s390x = false;
3478 int curcpu = -1, cpu;
3479 char *cache = d->buf;
3480 size_t cache_size = d->buflen;
3481 FILE *f = NULL;
3482
3483 if (offset){
3484 if (offset > d->size)
3485 return -EINVAL;
3486 if (!d->cached)
3487 return 0;
3488 int left = d->size - offset;
3489 total_len = left > size ? size: left;
3490 memcpy(buf, cache + offset, total_len);
3491 return total_len;
3492 }
3493
3494 pid_t initpid = lookup_initpid_in_store(fc->pid);
3495 if (initpid <= 0)
3496 initpid = fc->pid;
3497 cg = get_pid_cgroup(initpid, "cpuset");
3498 if (!cg)
3499 return read_file("proc/cpuinfo", buf, size, d);
3500 prune_init_slice(cg);
3501
3502 cpuset = get_cpuset(cg);
3503 if (!cpuset)
3504 goto err;
3505
3506 f = fopen("/proc/cpuinfo", "r");
3507 if (!f)
3508 goto err;
3509
3510 while (getline(&line, &linelen, f) != -1) {
3511 ssize_t l;
3512 if (firstline) {
3513 firstline = false;
3514 if (strstr(line, "IBM/S390") != NULL) {
3515 is_s390x = true;
3516 am_printing = true;
3517 continue;
3518 }
3519 }
3520 if (strncmp(line, "# processors:", 12) == 0)
3521 continue;
3522 if (is_processor_line(line)) {
3523 am_printing = cpuline_in_cpuset(line, cpuset);
3524 if (am_printing) {
3525 curcpu ++;
3526 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3527 if (l < 0) {
3528 perror("Error writing to cache");
3529 rv = 0;
3530 goto err;
3531 }
3532 if (l >= cache_size) {
3533 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3534 rv = 0;
3535 goto err;
3536 }
3537 cache += l;
3538 cache_size -= l;
3539 total_len += l;
3540 }
3541 continue;
3542 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3543 char *p;
3544 if (!cpu_in_cpuset(cpu, cpuset))
3545 continue;
3546 curcpu ++;
3547 p = strchr(line, ':');
3548 if (!p || !*p)
3549 goto err;
3550 p++;
3551 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3552 if (l < 0) {
3553 perror("Error writing to cache");
3554 rv = 0;
3555 goto err;
3556 }
3557 if (l >= cache_size) {
3558 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3559 rv = 0;
3560 goto err;
3561 }
3562 cache += l;
3563 cache_size -= l;
3564 total_len += l;
3565 continue;
3566
3567 }
3568 if (am_printing) {
3569 l = snprintf(cache, cache_size, "%s", line);
3570 if (l < 0) {
3571 perror("Error writing to cache");
3572 rv = 0;
3573 goto err;
3574 }
3575 if (l >= cache_size) {
3576 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3577 rv = 0;
3578 goto err;
3579 }
3580 cache += l;
3581 cache_size -= l;
3582 total_len += l;
3583 }
3584 }
3585
3586 if (is_s390x) {
3587 char *origcache = d->buf;
3588 ssize_t l;
3589 do {
3590 d->buf = malloc(d->buflen);
3591 } while (!d->buf);
3592 cache = d->buf;
3593 cache_size = d->buflen;
3594 total_len = 0;
3595 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3596 if (l < 0 || l >= cache_size) {
3597 free(origcache);
3598 goto err;
3599 }
3600 cache_size -= l;
3601 cache += l;
3602 total_len += l;
3603 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3604 if (l < 0 || l >= cache_size) {
3605 free(origcache);
3606 goto err;
3607 }
3608 cache_size -= l;
3609 cache += l;
3610 total_len += l;
3611 l = snprintf(cache, cache_size, "%s", origcache);
3612 free(origcache);
3613 if (l < 0 || l >= cache_size)
3614 goto err;
3615 total_len += l;
3616 }
3617
3618 d->cached = 1;
3619 d->size = total_len;
3620 if (total_len > size ) total_len = size;
3621
3622 /* read from off 0 */
3623 memcpy(buf, d->buf, total_len);
3624 rv = total_len;
3625 err:
3626 if (f)
3627 fclose(f);
3628 free(line);
3629 free(cpuset);
3630 free(cg);
3631 return rv;
3632 }
3633
3634 static uint64_t get_reaper_start_time(pid_t pid)
3635 {
3636 int ret;
3637 FILE *f;
3638 uint64_t starttime;
3639 /* strlen("/proc/") = 6
3640 * +
3641 * LXCFS_NUMSTRLEN64
3642 * +
3643 * strlen("/stat") = 5
3644 * +
3645 * \0 = 1
3646 * */
3647 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3648 char path[__PROC_PID_STAT_LEN];
3649 pid_t qpid;
3650
3651 qpid = lookup_initpid_in_store(pid);
3652 if (qpid <= 0) {
3653 /* Caller can check for EINVAL on 0. */
3654 errno = EINVAL;
3655 return 0;
3656 }
3657
3658 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3659 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3660 /* Caller can check for EINVAL on 0. */
3661 errno = EINVAL;
3662 return 0;
3663 }
3664
3665 f = fopen(path, "r");
3666 if (!f) {
3667 /* Caller can check for EINVAL on 0. */
3668 errno = EINVAL;
3669 return 0;
3670 }
3671
3672 /* Note that the *scanf() argument supression requires that length
3673 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3674 * at us. It's like telling someone you're not married and then asking
3675 * if you can bring your wife to the party.
3676 */
3677 ret = fscanf(f, "%*d " /* (1) pid %d */
3678 "%*s " /* (2) comm %s */
3679 "%*c " /* (3) state %c */
3680 "%*d " /* (4) ppid %d */
3681 "%*d " /* (5) pgrp %d */
3682 "%*d " /* (6) session %d */
3683 "%*d " /* (7) tty_nr %d */
3684 "%*d " /* (8) tpgid %d */
3685 "%*u " /* (9) flags %u */
3686 "%*u " /* (10) minflt %lu */
3687 "%*u " /* (11) cminflt %lu */
3688 "%*u " /* (12) majflt %lu */
3689 "%*u " /* (13) cmajflt %lu */
3690 "%*u " /* (14) utime %lu */
3691 "%*u " /* (15) stime %lu */
3692 "%*d " /* (16) cutime %ld */
3693 "%*d " /* (17) cstime %ld */
3694 "%*d " /* (18) priority %ld */
3695 "%*d " /* (19) nice %ld */
3696 "%*d " /* (20) num_threads %ld */
3697 "%*d " /* (21) itrealvalue %ld */
3698 "%" PRIu64, /* (22) starttime %llu */
3699 &starttime);
3700 if (ret != 1) {
3701 fclose(f);
3702 /* Caller can check for EINVAL on 0. */
3703 errno = EINVAL;
3704 return 0;
3705 }
3706
3707 fclose(f);
3708
3709 errno = 0;
3710 return starttime;
3711 }
3712
/*
 * Convert the value returned by get_reaper_start_time() for @pid from clock
 * ticks to whole seconds.
 *
 * Returns 0 when the start time cannot be retrieved or when the number of
 * clock ticks per second cannot be determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* Reject every non-positive answer: 0 would divide by zero below and a
	 * negative value would be converted to a huge unsigned divisor.
	 */
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3734
/*
 * Return, in whole seconds, how long ago the process tracked for @pid was
 * started: CLOCK_BOOTTIME "now" minus get_reaper_start_time_in_sec().
 *
 * Returns 0 when the start time is reported as 0 or when the clock cannot be
 * read. Only the tv_sec part of the clock is used, so the result has one
 * second of granularity.
 */
static uint64_t get_reaper_age(pid_t pid)
{
	struct timespec since_boot;
	uint64_t started_at;

	started_at = get_reaper_start_time_in_sec(pid);
	if (started_at == 0)
		return 0;

	if (clock_gettime(CLOCK_BOOTTIME, &since_boot) < 0)
		return 0;

	return (uint64_t)since_boot.tv_sec - started_at;
}
3763
3764 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3765 static int proc_stat_read(char *buf, size_t size, off_t offset,
3766 struct fuse_file_info *fi)
3767 {
3768 struct fuse_context *fc = fuse_get_context();
3769 struct file_info *d = (struct file_info *)fi->fh;
3770 char *cg;
3771 char *cpuset = NULL;
3772 char *line = NULL;
3773 size_t linelen = 0, total_len = 0, rv = 0;
3774 int curcpu = -1; /* cpu numbering starts at 0 */
3775 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3776 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3777 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3778 char cpuall[CPUALL_MAX_SIZE];
3779 /* reserve for cpu all */
3780 char *cache = d->buf + CPUALL_MAX_SIZE;
3781 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3782 FILE *f = NULL;
3783
3784 if (offset){
3785 if (offset > d->size)
3786 return -EINVAL;
3787 if (!d->cached)
3788 return 0;
3789 int left = d->size - offset;
3790 total_len = left > size ? size: left;
3791 memcpy(buf, d->buf + offset, total_len);
3792 return total_len;
3793 }
3794
3795 pid_t initpid = lookup_initpid_in_store(fc->pid);
3796 if (initpid <= 0)
3797 initpid = fc->pid;
3798 cg = get_pid_cgroup(initpid, "cpuset");
3799 if (!cg)
3800 return read_file("/proc/stat", buf, size, d);
3801 prune_init_slice(cg);
3802
3803 cpuset = get_cpuset(cg);
3804 if (!cpuset)
3805 goto err;
3806
3807 f = fopen("/proc/stat", "r");
3808 if (!f)
3809 goto err;
3810
3811 //skip first line
3812 if (getline(&line, &linelen, f) < 0) {
3813 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3814 goto err;
3815 }
3816
3817 while (getline(&line, &linelen, f) != -1) {
3818 ssize_t l;
3819 int cpu;
3820 char cpu_char[10]; /* That's a lot of cores */
3821 char *c;
3822
3823 if (strlen(line) == 0)
3824 continue;
3825 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3826 /* not a ^cpuN line containing a number N, just print it */
3827 l = snprintf(cache, cache_size, "%s", line);
3828 if (l < 0) {
3829 perror("Error writing to cache");
3830 rv = 0;
3831 goto err;
3832 }
3833 if (l >= cache_size) {
3834 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3835 rv = 0;
3836 goto err;
3837 }
3838 cache += l;
3839 cache_size -= l;
3840 total_len += l;
3841 continue;
3842 }
3843
3844 if (sscanf(cpu_char, "%d", &cpu) != 1)
3845 continue;
3846 if (!cpu_in_cpuset(cpu, cpuset))
3847 continue;
3848 curcpu ++;
3849
3850 c = strchr(line, ' ');
3851 if (!c)
3852 continue;
3853 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3854 if (l < 0) {
3855 perror("Error writing to cache");
3856 rv = 0;
3857 goto err;
3858
3859 }
3860 if (l >= cache_size) {
3861 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3862 rv = 0;
3863 goto err;
3864 }
3865
3866 cache += l;
3867 cache_size -= l;
3868 total_len += l;
3869
3870 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3871 &user,
3872 &nice,
3873 &system,
3874 &idle,
3875 &iowait,
3876 &irq,
3877 &softirq,
3878 &steal,
3879 &guest,
3880 &guest_nice) != 10)
3881 continue;
3882 user_sum += user;
3883 nice_sum += nice;
3884 system_sum += system;
3885 idle_sum += idle;
3886 iowait_sum += iowait;
3887 irq_sum += irq;
3888 softirq_sum += softirq;
3889 steal_sum += steal;
3890 guest_sum += guest;
3891 guest_nice_sum += guest_nice;
3892 }
3893
3894 cache = d->buf;
3895
3896 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3897 user_sum,
3898 nice_sum,
3899 system_sum,
3900 idle_sum,
3901 iowait_sum,
3902 irq_sum,
3903 softirq_sum,
3904 steal_sum,
3905 guest_sum,
3906 guest_nice_sum);
3907 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3908 memcpy(cache, cpuall, cpuall_len);
3909 cache += cpuall_len;
3910 } else {
3911 /* shouldn't happen */
3912 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3913 cpuall_len = 0;
3914 }
3915
3916 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3917 total_len += cpuall_len;
3918 d->cached = 1;
3919 d->size = total_len;
3920 if (total_len > size)
3921 total_len = size;
3922
3923 memcpy(buf, d->buf, total_len);
3924 rv = total_len;
3925
3926 err:
3927 if (f)
3928 fclose(f);
3929 free(line);
3930 free(cpuset);
3931 free(cg);
3932 return rv;
3933 }
3934
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 *
 * The value read from cpuacct.usage is divided by 10^9 before it is returned
 * (nanoseconds to seconds, per the cgroup-v1 cpuacct documentation). It is
 * parsed as unsigned long long so that a 32-bit unsigned long does not
 * saturate before the division. Returns 0 when no init pid, cgroup or usage
 * value can be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long long usage_raw;
	unsigned long usage = 0;

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (!cgroup)
		goto out;
	prune_init_slice(cgroup);
	if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		goto out;
	usage_raw = strtoull(usage_str, NULL, 10);
	usage = (unsigned long)(usage_raw / 1000000000ULL);

out:
	free(cgroup);
	free(usage_str);
	return usage;
}
3965
#if RELOADTEST
/*
 * Leave a marker file behind (only compiled in when RELOADTEST is set).
 * Failure to create the file is silently ignored.
 */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;
	close(marker);
}
#endif
3976
3977 /*
3978 * We read /proc/uptime and reuse its second field.
3979 * For the first field, we use the mtime for the reaper for
3980 * the calling pid as returned by getreaperage
3981 */
3982 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3983 struct fuse_file_info *fi)
3984 {
3985 struct fuse_context *fc = fuse_get_context();
3986 struct file_info *d = (struct file_info *)fi->fh;
3987 unsigned long int busytime = get_reaper_busy(fc->pid);
3988 char *cache = d->buf;
3989 ssize_t total_len = 0;
3990 uint64_t idletime, reaperage;
3991
3992 #if RELOADTEST
3993 iwashere();
3994 #endif
3995
3996 if (offset){
3997 if (!d->cached)
3998 return 0;
3999 if (offset > d->size)
4000 return -EINVAL;
4001 int left = d->size - offset;
4002 total_len = left > size ? size: left;
4003 memcpy(buf, cache + offset, total_len);
4004 return total_len;
4005 }
4006
4007 reaperage = get_reaper_age(fc->pid);
4008 /* To understand why this is done, please read the comment to the
4009 * get_reaper_busy() function.
4010 */
4011 idletime = reaperage;
4012 if (reaperage >= busytime)
4013 idletime = reaperage - busytime;
4014
4015 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4016 if (total_len < 0 || total_len >= d->buflen){
4017 lxcfs_error("%s\n", "failed to write to cache");
4018 return 0;
4019 }
4020
4021 d->size = (int)total_len;
4022 d->cached = 1;
4023
4024 if (total_len > size) total_len = size;
4025
4026 memcpy(buf, d->buf, total_len);
4027 return total_len;
4028 }
4029
4030 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4031 struct fuse_file_info *fi)
4032 {
4033 char dev_name[72];
4034 struct fuse_context *fc = fuse_get_context();
4035 struct file_info *d = (struct file_info *)fi->fh;
4036 char *cg;
4037 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4038 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4039 unsigned long read = 0, write = 0;
4040 unsigned long read_merged = 0, write_merged = 0;
4041 unsigned long read_sectors = 0, write_sectors = 0;
4042 unsigned long read_ticks = 0, write_ticks = 0;
4043 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4044 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4045 char *cache = d->buf;
4046 size_t cache_size = d->buflen;
4047 char *line = NULL;
4048 size_t linelen = 0, total_len = 0, rv = 0;
4049 unsigned int major = 0, minor = 0;
4050 int i = 0;
4051 FILE *f = NULL;
4052
4053 if (offset){
4054 if (offset > d->size)
4055 return -EINVAL;
4056 if (!d->cached)
4057 return 0;
4058 int left = d->size - offset;
4059 total_len = left > size ? size: left;
4060 memcpy(buf, cache + offset, total_len);
4061 return total_len;
4062 }
4063
4064 pid_t initpid = lookup_initpid_in_store(fc->pid);
4065 if (initpid <= 0)
4066 initpid = fc->pid;
4067 cg = get_pid_cgroup(initpid, "blkio");
4068 if (!cg)
4069 return read_file("/proc/diskstats", buf, size, d);
4070 prune_init_slice(cg);
4071
4072 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4073 goto err;
4074 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4075 goto err;
4076 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4077 goto err;
4078 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4079 goto err;
4080 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4081 goto err;
4082
4083
4084 f = fopen("/proc/diskstats", "r");
4085 if (!f)
4086 goto err;
4087
4088 while (getline(&line, &linelen, f) != -1) {
4089 ssize_t l;
4090 char lbuf[256];
4091
4092 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4093 if (i != 3)
4094 continue;
4095
4096 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4097 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4098 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4099 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4100 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4101 read_sectors = read_sectors/512;
4102 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4103 write_sectors = write_sectors/512;
4104
4105 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4106 rd_svctm = rd_svctm/1000000;
4107 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4108 rd_wait = rd_wait/1000000;
4109 read_ticks = rd_svctm + rd_wait;
4110
4111 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4112 wr_svctm = wr_svctm/1000000;
4113 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4114 wr_wait = wr_wait/1000000;
4115 write_ticks = wr_svctm + wr_wait;
4116
4117 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4118 tot_ticks = tot_ticks/1000000;
4119
4120 memset(lbuf, 0, 256);
4121 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4122 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4123 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4124 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4125 else
4126 continue;
4127
4128 l = snprintf(cache, cache_size, "%s", lbuf);
4129 if (l < 0) {
4130 perror("Error writing to fuse buf");
4131 rv = 0;
4132 goto err;
4133 }
4134 if (l >= cache_size) {
4135 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4136 rv = 0;
4137 goto err;
4138 }
4139 cache += l;
4140 cache_size -= l;
4141 total_len += l;
4142 }
4143
4144 d->cached = 1;
4145 d->size = total_len;
4146 if (total_len > size ) total_len = size;
4147 memcpy(buf, d->buf, total_len);
4148
4149 rv = total_len;
4150 err:
4151 free(cg);
4152 if (f)
4153 fclose(f);
4154 free(line);
4155 free(io_serviced_str);
4156 free(io_merged_str);
4157 free(io_service_bytes_str);
4158 free(io_wait_time_str);
4159 free(io_service_time_str);
4160 return rv;
4161 }
4162
4163 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4164 struct fuse_file_info *fi)
4165 {
4166 struct fuse_context *fc = fuse_get_context();
4167 struct file_info *d = (struct file_info *)fi->fh;
4168 char *cg = NULL;
4169 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4170 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4171 ssize_t total_len = 0, rv = 0;
4172 ssize_t l = 0;
4173 char *cache = d->buf;
4174
4175 if (offset) {
4176 if (offset > d->size)
4177 return -EINVAL;
4178 if (!d->cached)
4179 return 0;
4180 int left = d->size - offset;
4181 total_len = left > size ? size: left;
4182 memcpy(buf, cache + offset, total_len);
4183 return total_len;
4184 }
4185
4186 pid_t initpid = lookup_initpid_in_store(fc->pid);
4187 if (initpid <= 0)
4188 initpid = fc->pid;
4189 cg = get_pid_cgroup(initpid, "memory");
4190 if (!cg)
4191 return read_file("/proc/swaps", buf, size, d);
4192 prune_init_slice(cg);
4193
4194 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4195
4196 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4197 goto err;
4198
4199 memusage = strtoul(memusage_str, NULL, 10);
4200
4201 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4202 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4203
4204 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4205 memswusage = strtoul(memswusage_str, NULL, 10);
4206
4207 swap_total = (memswlimit - memlimit) / 1024;
4208 swap_free = (memswusage - memusage) / 1024;
4209 }
4210
4211 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4212
4213 /* When no mem + swap limit is specified or swapaccount=0*/
4214 if (!memswlimit) {
4215 char *line = NULL;
4216 size_t linelen = 0;
4217 FILE *f = fopen("/proc/meminfo", "r");
4218
4219 if (!f)
4220 goto err;
4221
4222 while (getline(&line, &linelen, f) != -1) {
4223 if (startswith(line, "SwapTotal:")) {
4224 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4225 } else if (startswith(line, "SwapFree:")) {
4226 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4227 }
4228 }
4229
4230 free(line);
4231 fclose(f);
4232 }
4233
4234 if (swap_total > 0) {
4235 l = snprintf(d->buf + total_len, d->size - total_len,
4236 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4237 swap_total, swap_free);
4238 total_len += l;
4239 }
4240
4241 if (total_len < 0 || l < 0) {
4242 perror("Error writing to cache");
4243 rv = 0;
4244 goto err;
4245 }
4246
4247 d->cached = 1;
4248 d->size = (int)total_len;
4249
4250 if (total_len > size) total_len = size;
4251 memcpy(buf, d->buf, total_len);
4252 rv = total_len;
4253
4254 err:
4255 free(cg);
4256 free(memswlimit_str);
4257 free(memlimit_str);
4258 free(memusage_str);
4259 free(memswusage_str);
4260 return rv;
4261 }
/*
 * Collect the lines of cgroup.procs for a cgroup and, up to @depth levels
 * down, for its child cgroups.
 * eg: from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs
 *
 * @pid_buf : array of strings that is grown with realloc(); every collected
 *            line is stored as a separately allocated copy that still carries
 *            its trailing newline. The caller owns array and strings.
 * @dpath   : the path of the cgroup, resolved relative to @cfd.
 *            eg: /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth   : how many levels of sub-directories are still descended into.
 * @sum     : number of entries already stored in @pid_buf.
 * @cfd     : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
 *
 * Sub-directories are visited before the cgroup's own cgroup.procs is read.
 * Returns the new number of entries in @pid_buf; a directory or file that
 * cannot be opened simply contributes nothing.
 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + \0 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* on success the DIR stream owns fd and closedir() releases it */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	while (depth > 0 && (file = readdir(dir)) != NULL) {
		if (strcmp(file->d_name, ".") == 0 ||
		    strcmp(file->d_name, "..") == 0)
			continue;
		if (file->d_type != DT_DIR)
			continue;

		/* path + '/' + d_name + \0 */
		do {
			path_dir = malloc(strlen(path) + 1 + strlen(file->d_name) + 1);
		} while (!path_dir);
		strcpy(path_dir, path);
		strcat(path_dir, "/");
		strcat(path_dir, file->d_name);
		sum = calc_pid(pid_buf, path_dir, depth - 1, sum, cfd);
		free(path_dir);
	}
	closedir(dir);

	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			pid[sum] = malloc(strlen(line) + 1);
		} while (pid[sum] == NULL);
		strcpy(pid[sum], line);
		sum++;
	}
	fclose(f);
out:
	/* getline() allocated this buffer; it used to be leaked */
	free(line);
	free(path);
	return sum;
}
4346 /*
4347 * calc_load calculates the load according to the following formula:
4348 * load1 = load0 * exp + active * (1 - exp)
4349 *
4350 * @load1: the new loadavg.
4351 * @load0: the former loadavg.
4352 * @active: the total number of running pid at this moment.
4353 * @exp: the fixed-point defined in the beginning.
4354 */
4355 static unsigned long
4356 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4357 {
4358 unsigned long newload;
4359
4360 active = active > 0 ? active * FIXED_1 : 0;
4361 newload = load * exp + active * (FIXED_1 - exp);
4362 if (active >= load)
4363 newload += FIXED_1 - 1;
4364
4365 return newload / FIXED_1;
4366 }
4367
4368 /*
4369 * Return 0 means that container p->cg is closed.
4370 * Return -1 means that error occurred in refresh.
4371 * Positive num equals the total number of pid.
4372 */
4373 static int refresh_load(struct load_node *p, char *path)
4374 {
4375 FILE *f = NULL;
4376 char **idbuf;
4377 char proc_path[256];
4378 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4379 char *line = NULL;
4380 size_t linelen = 0;
4381 int sum, length;
4382 DIR *dp;
4383 struct dirent *file;
4384
4385 do {
4386 idbuf = malloc(sizeof(char *));
4387 } while (!idbuf);
4388 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4389 /* normal exit */
4390 if (sum == 0)
4391 goto out;
4392
4393 for (i = 0; i < sum; i++) {
4394 /*clean up '\n' */
4395 length = strlen(idbuf[i])-1;
4396 idbuf[i][length] = '\0';
4397 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4398 if (ret < 0 || ret > 255) {
4399 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4400 i = sum;
4401 sum = -1;
4402 goto err_out;
4403 }
4404
4405 dp = opendir(proc_path);
4406 if (!dp) {
4407 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4408 continue;
4409 }
4410 while ((file = readdir(dp)) != NULL) {
4411 if (strncmp(file->d_name, ".", 1) == 0)
4412 continue;
4413 if (strncmp(file->d_name, "..", 1) == 0)
4414 continue;
4415 total_pid++;
4416 /* We make the biggest pid become last_pid.*/
4417 ret = atof(file->d_name);
4418 last_pid = (ret > last_pid) ? ret : last_pid;
4419
4420 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4421 if (ret < 0 || ret > 255) {
4422 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4423 i = sum;
4424 sum = -1;
4425 closedir(dp);
4426 goto err_out;
4427 }
4428 f = fopen(proc_path, "r");
4429 if (f != NULL) {
4430 while (getline(&line, &linelen, f) != -1) {
4431 /* Find State */
4432 if ((line[0] == 'S') && (line[1] == 't'))
4433 break;
4434 }
4435 if ((line[7] == 'R') || (line[7] == 'D'))
4436 run_pid++;
4437 fclose(f);
4438 }
4439 }
4440 closedir(dp);
4441 }
4442 /*Calculate the loadavg.*/
4443 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4444 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4445 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4446 p->run_pid = run_pid;
4447 p->total_pid = total_pid;
4448 p->last_pid = last_pid;
4449
4450 free(line);
4451 err_out:
4452 for (; i > 0; i--)
4453 free(idbuf[i-1]);
4454 out:
4455 free(idbuf);
4456 return sum;
4457 }
4458 /*
4459 * Traverse the hash table and update it.
4460 */
4461 void *load_begin(void *arg)
4462 {
4463
4464 char *path = NULL;
4465 int i, sum, length, ret;
4466 struct load_node *f;
4467 int first_node;
4468 clock_t time1, time2;
4469
4470 while (1) {
4471 time1 = clock();
4472 for (i = 0; i < LOAD_SIZE; i++) {
4473 pthread_mutex_lock(&load_hash[i].lock);
4474 if (load_hash[i].next == NULL) {
4475 pthread_mutex_unlock(&load_hash[i].lock);
4476 continue;
4477 }
4478 f = load_hash[i].next;
4479 first_node = 1;
4480 while (f) {
4481 length = strlen(f->cg) + 2;
4482 do {
4483 /* strlen(f->cg) + '.' or '' + \0 */
4484 path = malloc(length);
4485 } while (!path);
4486
4487 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4488 if (ret < 0 || ret > length - 1) {
4489 /* snprintf failed, ignore the node.*/
4490 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4491 goto out;
4492 }
4493 sum = refresh_load(f, path);
4494 if (sum == 0) {
4495 f = del_node(f, i);
4496 } else {
4497 out: f = f->next;
4498 }
4499 free(path);
4500 /* load_hash[i].lock locks only on the first node.*/
4501 if (first_node == 1) {
4502 first_node = 0;
4503 pthread_mutex_unlock(&load_hash[i].lock);
4504 }
4505 }
4506 }
4507 time2 = clock();
4508 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4509 }
4510 }
4511
4512 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4513 struct fuse_file_info *fi)
4514 {
4515 struct fuse_context *fc = fuse_get_context();
4516 struct file_info *d = (struct file_info *)fi->fh;
4517 pid_t initpid;
4518 char *cg;
4519 size_t total_len = 0;
4520 char *cache = d->buf;
4521 struct load_node *n;
4522 int hash;
4523 int cfd;
4524 unsigned long a, b, c;
4525
4526 if (offset) {
4527 if (offset > d->size)
4528 return -EINVAL;
4529 if (!d->cached)
4530 return 0;
4531 int left = d->size - offset;
4532 total_len = left > size ? size : left;
4533 memcpy(buf, cache + offset, total_len);
4534 return total_len;
4535 }
4536 if (!loadavg)
4537 return read_file("/proc/loadavg", buf, size, d);
4538
4539 initpid = lookup_initpid_in_store(fc->pid);
4540 if (initpid <= 0)
4541 initpid = fc->pid;
4542 cg = get_pid_cgroup(initpid, "cpu");
4543 if (!cg)
4544 return read_file("/proc/loadavg", buf, size, d);
4545
4546 prune_init_slice(cg);
4547 hash = calc_hash(cg);
4548 n = locate_node(cg, hash);
4549
4550 /* First time */
4551 if (n == NULL) {
4552 if (!find_mounted_controller("cpu", &cfd)) {
4553 /*
4554 * In locate_node() above, pthread_rwlock_unlock() isn't used
4555 * because delete is not allowed before read has ended.
4556 */
4557 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4558 return 0;
4559 }
4560 do {
4561 n = malloc(sizeof(struct load_node));
4562 } while (!n);
4563
4564 do {
4565 n->cg = malloc(strlen(cg)+1);
4566 } while (!n->cg);
4567 strcpy(n->cg, cg);
4568 n->avenrun[0] = 0;
4569 n->avenrun[1] = 0;
4570 n->avenrun[2] = 0;
4571 n->run_pid = 0;
4572 n->total_pid = 1;
4573 n->last_pid = initpid;
4574 n->cfd = cfd;
4575 insert_node(&n, hash);
4576 }
4577 a = n->avenrun[0] + (FIXED_1/200);
4578 b = n->avenrun[1] + (FIXED_1/200);
4579 c = n->avenrun[2] + (FIXED_1/200);
4580 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4581 LOAD_INT(a), LOAD_FRAC(a),
4582 LOAD_INT(b), LOAD_FRAC(b),
4583 LOAD_INT(c), LOAD_FRAC(c),
4584 n->run_pid, n->total_pid, n->last_pid);
4585 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4586 if (total_len < 0 || total_len >= d->buflen) {
4587 lxcfs_error("%s\n", "Failed to write to cache");
4588 return 0;
4589 }
4590 d->size = (int)total_len;
4591 d->cached = 1;
4592
4593 if (total_len > size)
4594 total_len = size;
4595 memcpy(buf, d->buf, total_len);
4596 return total_len;
4597 }
4598 /* Return a positive number on success, return 0 on failure.*/
4599 pthread_t load_daemon(int load_use)
4600 {
4601 int ret;
4602 pthread_t pid;
4603
4604 ret = init_load();
4605 if (ret == -1) {
4606 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4607 return 0;
4608 }
4609 ret = pthread_create(&pid, NULL, load_begin, NULL);
4610 if (ret != 0) {
4611 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4612 load_free();
4613 return 0;
4614 }
4615 /* use loadavg, here loadavg = 1*/
4616 loadavg = load_use;
4617 return pid;
4618 }
4619
/*
 * Return the number of bytes obtained by reading @which line by line until
 * getline() fails, or 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *chunk = NULL;
	size_t cap = 0;
	ssize_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		ssize_t got = getline(&chunk, &cap, fp);

		if (got == -1)
			break;
		total += got;
	}

	free(chunk);
	fclose(fp);
	return total;
}
4636
/*
 * Fill @sb for one of the paths handled below /proc.
 *
 * "/proc" is reported as a directory (mode 0555, two links); each of the
 * virtualized files is reported as an empty regular file (mode 0444, one
 * link). Owner and group are 0 and all three timestamps are the current
 * CLOCK_REALTIME time.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const proc_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(proc_files) / sizeof(proc_files[0]); i++) {
		if (strcmp(path, proc_files[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4666
4667 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4668 struct fuse_file_info *fi)
4669 {
4670 if (filler(buf, ".", NULL, 0) != 0 ||
4671 filler(buf, "..", NULL, 0) != 0 ||
4672 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4673 filler(buf, "meminfo", NULL, 0) != 0 ||
4674 filler(buf, "stat", NULL, 0) != 0 ||
4675 filler(buf, "uptime", NULL, 0) != 0 ||
4676 filler(buf, "diskstats", NULL, 0) != 0 ||
4677 filler(buf, "swaps", NULL, 0) != 0 ||
4678 filler(buf, "loadavg", NULL, 0) != 0)
4679 return -EINVAL;
4680 return 0;
4681 }
4682
4683 int proc_open(const char *path, struct fuse_file_info *fi)
4684 {
4685 int type = -1;
4686 struct file_info *info;
4687
4688 if (strcmp(path, "/proc/meminfo") == 0)
4689 type = LXC_TYPE_PROC_MEMINFO;
4690 else if (strcmp(path, "/proc/cpuinfo") == 0)
4691 type = LXC_TYPE_PROC_CPUINFO;
4692 else if (strcmp(path, "/proc/uptime") == 0)
4693 type = LXC_TYPE_PROC_UPTIME;
4694 else if (strcmp(path, "/proc/stat") == 0)
4695 type = LXC_TYPE_PROC_STAT;
4696 else if (strcmp(path, "/proc/diskstats") == 0)
4697 type = LXC_TYPE_PROC_DISKSTATS;
4698 else if (strcmp(path, "/proc/swaps") == 0)
4699 type = LXC_TYPE_PROC_SWAPS;
4700 else if (strcmp(path, "/proc/loadavg") == 0)
4701 type = LXC_TYPE_PROC_LOADAVG;
4702 if (type == -1)
4703 return -ENOENT;
4704
4705 info = malloc(sizeof(*info));
4706 if (!info)
4707 return -ENOMEM;
4708
4709 memset(info, 0, sizeof(*info));
4710 info->type = type;
4711
4712 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4713 do {
4714 info->buf = malloc(info->buflen);
4715 } while (!info->buf);
4716 memset(info->buf, 0, info->buflen);
4717 /* set actual size to buffer size */
4718 info->size = info->buflen;
4719
4720 fi->fh = (unsigned long)info;
4721 return 0;
4722 }
4723
/*
 * Access check for the /proc entries.
 *
 * "/proc" itself is granted whenever access(path, R_OK) succeeds. Everything
 * else is read-only: a @mask with any bit other than R_OK yields -EACCES,
 * otherwise 0 is returned.
 */
int proc_access(const char *path, int mask)
{
	const int wants_more_than_read = (mask & ~R_OK) != 0;

	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return wants_more_than_read ? -EACCES : 0;
}
4734
/*
 * Release handler for the /proc files: hands @fi to do_release_file_info()
 * and always reports success. @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
4740
4741 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4742 struct fuse_file_info *fi)
4743 {
4744 struct file_info *f = (struct file_info *) fi->fh;
4745
4746 switch (f->type) {
4747 case LXC_TYPE_PROC_MEMINFO:
4748 return proc_meminfo_read(buf, size, offset, fi);
4749 case LXC_TYPE_PROC_CPUINFO:
4750 return proc_cpuinfo_read(buf, size, offset, fi);
4751 case LXC_TYPE_PROC_UPTIME:
4752 return proc_uptime_read(buf, size, offset, fi);
4753 case LXC_TYPE_PROC_STAT:
4754 return proc_stat_read(buf, size, offset, fi);
4755 case LXC_TYPE_PROC_DISKSTATS:
4756 return proc_diskstats_read(buf, size, offset, fi);
4757 case LXC_TYPE_PROC_SWAPS:
4758 return proc_swaps_read(buf, size, offset, fi);
4759 case LXC_TYPE_PROC_LOADAVG:
4760 return proc_loadavg_read(buf, size, offset, fi);
4761 default:
4762 return -EINVAL;
4763 }
4764 }
4765
/*
 * Functions needed to setup cgroups in the __constructor__.
 */

/*
 * Create @dir and every missing parent directory with permissions @mode,
 * like "mkdir -p". Components that already exist are not an error.
 *
 * Returns true on success and false when a component cannot be created or
 * memory runs out.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* dir: start of the next component, tmp: the '/' (or end) after it */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		/* For a relative path the first prefix is empty; mkdir("")
		 * would fail with ENOENT, so there is nothing to create yet.
		 */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
4793
4794 static bool umount_if_mounted(void)
4795 {
4796 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4797 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4798 return false;
4799 }
4800 return true;
4801 }
4802
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
4809
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Scans /proc/self/mountinfo for an entry whose fifth field (the mount
 * point) is "/" and whose remainder, from the first '-' on, starts with
 * "- rootfs rootfs ". Returns true when such an entry exists, false
 * otherwise or when the file cannot be opened.
 */
static bool is_on_ramfs(void)
{
	FILE *mounts;
	char *entry = NULL;
	size_t cap = 0;
	bool found = false;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!found && getline(&entry, &cap, mounts) != -1) {
		char *mnt = entry, *mnt_end, *sep;
		int skipped;

		/* step over four blanks: mnt then points at the blank in
		 * front of the fifth field
		 */
		for (skipped = 0; mnt && skipped < 4; skipped++)
			mnt = strchr(mnt + 1, ' ');
		if (!mnt)
			continue;

		mnt_end = strchr(mnt + 1, ' ');
		if (!mnt_end)
			continue;
		*mnt_end = '\0';

		if (strcmp(mnt + 1, "/") != 0)
			continue;

		// this is '/'. is it the ramfs?
		sep = strchr(mnt_end + 1, '-');
		found = sep && strncmp(sep, "- rootfs rootfs ", 16) == 0;
	}

	free(entry);
	fclose(mounts);
	return found;
}
4852
4853 static int pivot_enter()
4854 {
4855 int ret = -1, oldroot = -1, newroot = -1;
4856
4857 oldroot = open("/", O_DIRECTORY | O_RDONLY);
4858 if (oldroot < 0) {
4859 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4860 return ret;
4861 }
4862
4863 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4864 if (newroot < 0) {
4865 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4866 goto err;
4867 }
4868
4869 /* change into new root fs */
4870 if (fchdir(newroot) < 0) {
4871 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4872 goto err;
4873 }
4874
4875 /* pivot_root into our new root fs */
4876 if (pivot_root(".", ".") < 0) {
4877 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4878 goto err;
4879 }
4880
4881 /*
4882 * At this point the old-root is mounted on top of our new-root.
4883 * To unmounted it we must not be chdir'd into it, so escape back
4884 * to the old-root.
4885 */
4886 if (fchdir(oldroot) < 0) {
4887 lxcfs_error("%s\n", "Failed to enter old root.");
4888 goto err;
4889 }
4890
4891 if (umount2(".", MNT_DETACH) < 0) {
4892 lxcfs_error("%s\n", "Failed to detach old root.");
4893 goto err;
4894 }
4895
4896 if (fchdir(newroot) < 0) {
4897 lxcfs_error("%s\n", "Failed to re-enter new root.");
4898 goto err;
4899 }
4900
4901 ret = 0;
4902
4903 err:
4904 if (oldroot > 0)
4905 close(oldroot);
4906 if (newroot > 0)
4907 close(newroot);
4908
4909 return ret;
4910 }
4911
4912 static int chroot_enter()
4913 {
4914 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4915 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4916 return -1;
4917 }
4918
4919 if (chroot(".") < 0) {
4920 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4921 return -1;
4922 }
4923
4924 if (chdir("/") < 0) {
4925 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4926 return -1;
4927 }
4928
4929 return 0;
4930 }
4931
4932 static int permute_and_enter(void)
4933 {
4934 struct statfs sb;
4935
4936 if (statfs("/", &sb) < 0) {
4937 lxcfs_error("%s\n", "Could not stat / mountpoint.");
4938 return -1;
4939 }
4940
4941 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
4942 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
4943 * /proc/1/mountinfo. */
4944 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
4945 return chroot_enter();
4946
4947 if (pivot_enter() < 0) {
4948 lxcfs_error("%s\n", "Could not perform pivot root.");
4949 return -1;
4950 }
4951
4952 return 0;
4953 }
4954
4955 /* Prepare our new clean root. */
4956 static int permute_prepare(void)
4957 {
4958 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4959 lxcfs_error("%s\n", "Failed to create directory for new root.");
4960 return -1;
4961 }
4962
4963 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4964 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4965 return -1;
4966 }
4967
4968 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4969 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
4970 return -1;
4971 }
4972
4973 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4974 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
4975 return -1;
4976 }
4977
4978 return 0;
4979 }
4980
/*
 * Switch to the private root: chroot() on ramfs, pivot_root() in all other
 * cases.
 *
 * Returns true only if both preparing the new root and entering it succeed;
 * entering is not attempted when preparation fails.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
4994
4995 static int preserve_mnt_ns(int pid)
4996 {
4997 int ret;
4998 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
4999 char path[len];
5000
5001 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
5002 if (ret < 0 || (size_t)ret >= len)
5003 return -1;
5004
5005 return open(path, O_RDONLY | O_CLOEXEC);
5006 }
5007
5008 static bool cgfs_prepare_mounts(void)
5009 {
5010 if (!mkdir_p(BASEDIR, 0700)) {
5011 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5012 return false;
5013 }
5014
5015 if (!umount_if_mounted()) {
5016 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5017 return false;
5018 }
5019
5020 if (unshare(CLONE_NEWNS) < 0) {
5021 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5022 return false;
5023 }
5024
5025 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5026 if (cgroup_mount_ns_fd < 0) {
5027 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5028 return false;
5029 }
5030
5031 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5032 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5033 return false;
5034 }
5035
5036 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5037 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5038 return false;
5039 }
5040
5041 return true;
5042 }
5043
5044 static bool cgfs_mount_hierarchies(void)
5045 {
5046 char *target;
5047 size_t clen, len;
5048 int i, ret;
5049
5050 for (i = 0; i < num_hierarchies; i++) {
5051 char *controller = hierarchies[i];
5052
5053 clen = strlen(controller);
5054 len = strlen(BASEDIR) + clen + 2;
5055 target = malloc(len);
5056 if (!target)
5057 return false;
5058
5059 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5060 if (ret < 0 || ret >= len) {
5061 free(target);
5062 return false;
5063 }
5064 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5065 free(target);
5066 return false;
5067 }
5068 if (!strcmp(controller, "unified"))
5069 ret = mount("none", target, "cgroup2", 0, NULL);
5070 else
5071 ret = mount(controller, target, "cgroup", 0, controller);
5072 if (ret < 0) {
5073 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5074 free(target);
5075 return false;
5076 }
5077
5078 fd_hierarchies[i] = open(target, O_DIRECTORY);
5079 if (fd_hierarchies[i] < 0) {
5080 free(target);
5081 return false;
5082 }
5083 free(target);
5084 }
5085 return true;
5086 }
5087
/*
 * Run the three setup stages in order: prepare the private mounts, mount the
 * cgroup hierarchies, then switch into the private root.
 *
 * Returns true only if all three stages succeed; a later stage is not
 * attempted once an earlier one has failed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (cgfs_mount_hierarchies())
		return permute_root();

	lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
	return false;
}
5103
5104 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5105 {
5106 FILE *f;
5107 char *cret, *line = NULL;
5108 char cwd[MAXPATHLEN];
5109 size_t len = 0;
5110 int i, init_ns = -1;
5111 bool found_unified = false;
5112
5113 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5114 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5115 return;
5116 }
5117
5118 while (getline(&line, &len, f) != -1) {
5119 char *idx, *p, *p2;
5120
5121 p = strchr(line, ':');
5122 if (!p)
5123 goto out;
5124 idx = line;
5125 *(p++) = '\0';
5126
5127 p2 = strrchr(p, ':');
5128 if (!p2)
5129 goto out;
5130 *p2 = '\0';
5131
5132 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5133 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5134 * because it parses out the empty string "" and later on passes
5135 * it to mount(). Let's skip such entries.
5136 */
5137 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5138 found_unified = true;
5139 p = "unified";
5140 }
5141
5142 if (!store_hierarchy(line, p))
5143 goto out;
5144 }
5145
5146 /* Preserve initial namespace. */
5147 init_ns = preserve_mnt_ns(getpid());
5148 if (init_ns < 0) {
5149 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5150 goto out;
5151 }
5152
5153 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5154 if (!fd_hierarchies) {
5155 lxcfs_error("%s\n", strerror(errno));
5156 goto out;
5157 }
5158
5159 for (i = 0; i < num_hierarchies; i++)
5160 fd_hierarchies[i] = -1;
5161
5162 cret = getcwd(cwd, MAXPATHLEN);
5163 if (!cret)
5164 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5165
5166 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5167 * to privately mount lxcfs cgroups. */
5168 if (!cgfs_setup_controllers()) {
5169 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5170 goto out;
5171 }
5172
5173 if (setns(init_ns, 0) < 0) {
5174 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5175 goto out;
5176 }
5177
5178 if (!cret || chdir(cwd) < 0)
5179 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5180
5181 print_subsystems();
5182
5183 out:
5184 free(line);
5185 fclose(f);
5186 if (init_ns >= 0)
5187 close(init_ns);
5188 }
5189
5190 static void __attribute__((destructor)) free_subsystems(void)
5191 {
5192 int i;
5193
5194 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5195
5196 for (i = 0; i < num_hierarchies; i++) {
5197 if (hierarchies[i])
5198 free(hierarchies[i]);
5199 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5200 close(fd_hierarchies[i]);
5201 }
5202 free(hierarchies);
5203 free(fd_hierarchies);
5204
5205 if (cgroup_mount_ns_fd >= 0)
5206 close(cgroup_mount_ns_fd);
5207 }