]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
loadavg: restart thread on library reload
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f
CB
19#include <sched.h>
20#include <stdbool.h>
0ecddf02 21#include <stdint.h>
29a73c2f
CB
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <time.h>
26#include <unistd.h>
27#include <wait.h>
d89504c4 28#include <linux/magic.h>
237e200e 29#include <linux/sched.h>
29a73c2f
CB
30#include <sys/epoll.h>
31#include <sys/mman.h>
32#include <sys/mount.h>
237e200e
SH
33#include <sys/param.h>
34#include <sys/socket.h>
29a73c2f 35#include <sys/syscall.h>
0ecddf02 36#include <sys/sysinfo.h>
d89504c4 37#include <sys/vfs.h>
237e200e 38
237e200e 39#include "bindings.h"
237e200e
SH
40#include "config.h" // for VERSION
41
0ecddf02
CB
/*
 * Buffer size for a 64-bit integer printed in decimal:
 * 2^64 - 1 = 18446744073709551615 has 20 digits, plus one byte for the
 * terminating NUL.
 */
#define LXCFS_NUMSTRLEN64 21
44
29a73c2f
CB
/*
 * pivot_root() for C libraries that do not export it: issue the raw syscall
 * when its number is known, otherwise fail with ENOSYS. When the C library
 * does provide it (HAVE_PIVOT_ROOT), only declare it.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
59
237e200e
SH
/* Identifiers for the kinds of files handled: cgroup entries and /proc files. */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
71
/* Per-file bookkeeping record. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
82
/* Parameters of the load hash table. */
#define LOAD_SIZE 100 /* the size of hash_table */
#define FLUSH_TIME 5 /* the flush rate */
#define DEPTH_DIR 3 /* the depth of per cgroup */
/* Fixed-point constants used to calculate loadavg. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * Switch consulted by proc_loadavg_read():
 * 1 means use loadavg, 0 means do not use it.
 */
static int loadavg = 0;
static volatile sig_atomic_t loadavg_stop = 0;
0e47acaa 101static int calc_hash(char *name)
102{
103 unsigned int hash = 0;
104 unsigned int x = 0;
105 /* ELFHash algorithm. */
106 while (*name) {
107 hash = (hash << 4) + *name++;
108 x = hash & 0xf0000000;
109 if (x != 0)
110 hash ^= (x >> 24);
111 hash &= ~x;
112 }
113 return ((hash & 0x7fffffff) % LOAD_SIZE);
114}
115
/* One entry of a load hash bucket; entries form a doubly linked list. */
struct load_node {
	char *cg;			/* cgroup name, compared with strcmp() on lookup */
	unsigned long avenrun[3];	/* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;
	struct load_node **pre;		/* address of the pointer that refers to this node */
};
126
127struct load_head {
128 /*
129 * The lock is about insert load_node and refresh load_node.To the first
130 * load_node of each hash bucket, insert and refresh in this hash bucket is
131 * mutually exclusive.
132 */
133 pthread_mutex_t lock;
134 /*
135 * The rdlock is about read loadavg and delete load_node.To each hash
136 * bucket, read and delete is mutually exclusive. But at the same time, we
137 * allow paratactic read operation. This rdlock is at list level.
138 */
139 pthread_rwlock_t rdlock;
140 /*
141 * The rilock is about read loadavg and insert load_node.To the first
142 * load_node of each hash bucket, read and insert is mutually exclusive.
143 * But at the same time, we allow paratactic read operation.
144 */
145 pthread_rwlock_t rilock;
146 struct load_node *next;
147};
148
149static struct load_head load_hash[LOAD_SIZE]; /* hash table */
150/*
151 * init_load initialize the hash table.
152 * Return 0 on success, return -1 on failure.
153 */
154static int init_load(void)
155{
156 int i;
157 int ret;
158
159 for (i = 0; i < LOAD_SIZE; i++) {
160 load_hash[i].next = NULL;
161 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
162 if (ret != 0) {
163 lxcfs_error("%s\n", "Failed to initialize lock");
164 goto out3;
165 }
166 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize rdlock");
169 goto out2;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rilock");
174 goto out1;
175 }
176 }
177 return 0;
178out1:
179 pthread_rwlock_destroy(&load_hash[i].rdlock);
180out2:
181 pthread_mutex_destroy(&load_hash[i].lock);
182out3:
183 while (i > 0) {
184 i--;
185 pthread_mutex_destroy(&load_hash[i].lock);
186 pthread_rwlock_destroy(&load_hash[i].rdlock);
187 pthread_rwlock_destroy(&load_hash[i].rilock);
188 }
189 return -1;
190}
191
192static void insert_node(struct load_node **n, int locate)
193{
194 struct load_node *f;
195
196 pthread_mutex_lock(&load_hash[locate].lock);
197 pthread_rwlock_wrlock(&load_hash[locate].rilock);
198 f = load_hash[locate].next;
199 load_hash[locate].next = *n;
200
201 (*n)->pre = &(load_hash[locate].next);
202 if (f)
203 f->pre = &((*n)->next);
204 (*n)->next = f;
205 pthread_mutex_unlock(&load_hash[locate].lock);
206 pthread_rwlock_unlock(&load_hash[locate].rilock);
207}
208/*
209 * locate_node() finds special node. Not return NULL means success.
210 * It should be noted that rdlock isn't unlocked at the end of code
211 * because this function is used to read special node. Delete is not
212 * allowed before read has ended.
213 * unlock rdlock only in proc_loadavg_read().
214 */
215static struct load_node *locate_node(char *cg, int locate)
216{
217 struct load_node *f = NULL;
218 int i = 0;
219
220 pthread_rwlock_rdlock(&load_hash[locate].rilock);
221 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
222 if (load_hash[locate].next == NULL) {
223 pthread_rwlock_unlock(&load_hash[locate].rilock);
224 return f;
225 }
226 f = load_hash[locate].next;
227 pthread_rwlock_unlock(&load_hash[locate].rilock);
228 while (f && ((i = strcmp(f->cg, cg)) != 0))
229 f = f->next;
230 return f;
231}
232/* Delete the load_node n and return the next node of it. */
233static struct load_node *del_node(struct load_node *n, int locate)
234{
235 struct load_node *g;
236
237 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
238 if (n->next == NULL) {
239 *(n->pre) = NULL;
240 } else {
241 *(n->pre) = n->next;
242 n->next->pre = n->pre;
243 }
244 g = n->next;
245 free(n->cg);
246 free(n);
247 pthread_rwlock_unlock(&load_hash[locate].rdlock);
248 return g;
249}
250
a83618e2 251static void load_free(void)
9c480eb7 252{
253 int i;
254 struct load_node *f, *p;
255
256 for (i = 0; i < LOAD_SIZE; i++) {
257 pthread_mutex_lock(&load_hash[i].lock);
258 pthread_rwlock_wrlock(&load_hash[i].rilock);
259 pthread_rwlock_wrlock(&load_hash[i].rdlock);
260 if (load_hash[i].next == NULL) {
261 pthread_mutex_unlock(&load_hash[i].lock);
262 pthread_mutex_destroy(&load_hash[i].lock);
263 pthread_rwlock_unlock(&load_hash[i].rilock);
264 pthread_rwlock_destroy(&load_hash[i].rilock);
265 pthread_rwlock_unlock(&load_hash[i].rdlock);
266 pthread_rwlock_destroy(&load_hash[i].rdlock);
267 continue;
268 }
269 for (f = load_hash[i].next; f; ) {
270 free(f->cg);
271 p = f->next;
272 free(f);
273 f = p;
274 }
275 pthread_mutex_unlock(&load_hash[i].lock);
276 pthread_mutex_destroy(&load_hash[i].lock);
277 pthread_rwlock_unlock(&load_hash[i].rilock);
278 pthread_rwlock_destroy(&load_hash[i].rilock);
279 pthread_rwlock_unlock(&load_hash[i].rdlock);
280 pthread_rwlock_destroy(&load_hash[i].rdlock);
281 }
282}
f34de69a
CB
/* Extra buffer space reserved to account for file size changes. */
#define BUF_RESERVE_SIZE 512
237e200e
SH
285
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;          /* inode number for /proc/$pid/ns/pid */
	pid_t initpid;      /* the pid of init in that ns */
	long int ctime;     /* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;
	long int lastcheck; /* time of the last successful validation or insertion */
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Chained hash table of cached entries, guarded by pidns_store_mutex. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Lock @l; a locking failure is logged and terminates the process. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
323
29a73c2f
CB
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
static int cgroup_mount_ns_fd = -1;
29a73c2f 344
237e200e
SH
/* Unlock @l; an unlocking failure is logged and terminates the process. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
354
355static void store_lock(void)
356{
357 lock_mutex(&pidns_store_mutex);
358}
359
360static void store_unlock(void)
361{
362 unlock_mutex(&pidns_store_mutex);
363}
364
365/* Must be called under store_lock */
366static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
367{
368 struct stat initsb;
369 char fnam[100];
370
371 snprintf(fnam, 100, "/proc/%d", e->initpid);
372 if (stat(fnam, &initsb) < 0)
373 return false;
7dd6560a
CB
374
375 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
376 initsb.st_ctime, e->initpid);
377
237e200e
SH
378 if (e->ctime != initsb.st_ctime)
379 return false;
380 return true;
381}
382
383/* Must be called under store_lock */
384static void remove_initpid(struct pidns_init_store *e)
385{
386 struct pidns_init_store *tmp;
387 int h;
388
7dd6560a
CB
389 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
390
237e200e
SH
391 h = HASH(e->ino);
392 if (pidns_hash_table[h] == e) {
393 pidns_hash_table[h] = e->next;
394 free(e);
395 return;
396 }
397
398 tmp = pidns_hash_table[h];
399 while (tmp) {
400 if (tmp->next == e) {
401 tmp->next = e->next;
402 free(e);
403 return;
404 }
405 tmp = tmp->next;
406 }
407}
408
409#define PURGE_SECS 5
410/* Must be called under store_lock */
411static void prune_initpid_store(void)
412{
413 static long int last_prune = 0;
414 struct pidns_init_store *e, *prev, *delme;
415 long int now, threshold;
416 int i;
417
418 if (!last_prune) {
419 last_prune = time(NULL);
420 return;
421 }
422 now = time(NULL);
423 if (now < last_prune + PURGE_SECS)
424 return;
7dd6560a
CB
425
426 lxcfs_debug("%s\n", "Pruning.");
427
237e200e
SH
428 last_prune = now;
429 threshold = now - 2 * PURGE_SECS;
430
431 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
432 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
433 if (e->lastcheck < threshold) {
7dd6560a
CB
434
435 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
436
237e200e
SH
437 delme = e;
438 if (prev)
439 prev->next = e->next;
440 else
441 pidns_hash_table[i] = e->next;
442 e = e->next;
443 free(delme);
444 } else {
445 prev = e;
446 e = e->next;
447 }
448 }
449 }
450}
451
452/* Must be called under store_lock */
453static void save_initpid(struct stat *sb, pid_t pid)
454{
455 struct pidns_init_store *e;
456 char fpath[100];
457 struct stat procsb;
458 int h;
459
7dd6560a
CB
460 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
461
237e200e
SH
462 snprintf(fpath, 100, "/proc/%d", pid);
463 if (stat(fpath, &procsb) < 0)
464 return;
465 do {
466 e = malloc(sizeof(*e));
467 } while (!e);
468 e->ino = sb->st_ino;
469 e->initpid = pid;
470 e->ctime = procsb.st_ctime;
471 h = HASH(e->ino);
472 e->next = pidns_hash_table[h];
473 e->lastcheck = time(NULL);
474 pidns_hash_table[h] = e;
475}
476
477/*
478 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
479 * entry for the inode number and creation time. Verify that the init pid
480 * is still valid. If not, remove it. Return the entry if valid, NULL
481 * otherwise.
482 * Must be called under store_lock
483 */
484static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
485{
486 int h = HASH(sb->st_ino);
487 struct pidns_init_store *e = pidns_hash_table[h];
488
489 while (e) {
490 if (e->ino == sb->st_ino) {
491 if (initpid_still_valid(e, sb)) {
492 e->lastcheck = time(NULL);
493 return e;
494 }
495 remove_initpid(e);
496 return NULL;
497 }
498 e = e->next;
499 }
500
501 return NULL;
502}
503
/*
 * Tell whether @path, resolved relative to the directory descriptor @fd,
 * is a directory (symbolic links are followed).
 *
 * Returns 1 for a directory, 0 otherwise (including when the path cannot be
 * stat()ed).
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;

	/*
	 * The last argument of fstatat() is a flags word, not a descriptor.
	 * Passing @fd there made the call fail with EINVAL for most descriptor
	 * values, so directories were never recognized.
	 */
	if (fstatat(fd, path, &statbuf, 0) != 0)
		return 0;

	return S_ISDIR(statbuf.st_mode) ? 1 : 0;
}
512
/*
 * Return a heap-allocated copy of @str, retrying the allocation until it
 * succeeds. A NULL @str yields NULL. The caller frees the result.
 */
static char *must_copy_string(const char *str)
{
	size_t size;
	char *copy;

	if (!str)
		return NULL;

	size = strlen(str) + 1;
	for (;;) {
		copy = malloc(size);
		if (copy)
			break;
	}

	memcpy(copy, str, size);
	return copy;
}
524
/* Strip every '\n' at the end of @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		end--;
		s[end] = '\0';
	}
}
532
#define BATCH_SIZE 50
/*
 * Make sure *mem can hold @newlen bytes. Memory is sized in multiples of
 * BATCH_SIZE; a reallocation only happens when *mem is NULL or @newlen needs
 * more batches than @oldlen did. The reallocation is retried until it
 * succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);
	*mem = grown;
}
/*
 * Append the @linelen bytes of @line, plus its terminating NUL, to the
 * growing buffer *contents whose current length is *len. *len is advanced by
 * @linelen.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	/* linelen + 1 also copies the NUL that terminates the line. */
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
554
/*
 * Read everything available from @fd and return it as one heap-allocated
 * string with trailing newlines removed.
 *
 * @from is not used. @fd is consumed in every case: it is closed through the
 * stream on the normal path and closed directly when no stream can be
 * created.
 *
 * Returns NULL when the stream cannot be opened or nothing was read; the
 * caller frees the result.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;
	FILE *f;

	f = fdopen(fd, "r");
	if (!f) {
		/* fdopen() leaves the descriptor open on failure; don't leak it. */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1)
		append_line(&contents, &fulllen, line, linelen);
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
576
/*
 * Write @string to the already opened descriptor @fd.
 *
 * @fnam is not used. @fd is consumed in every case: it is closed through the
 * stream on the normal and write-error paths and closed directly when no
 * stream can be created.
 *
 * Returns true only if the whole string was written and the stream closed
 * without error.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() leaves the descriptor open on failure; don't leak it. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}

	/* Buffered data is flushed here, so a failing close is a failed write. */
	if (fclose(f) < 0) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		return false;
	}

	return true;
}
597
237e200e
SH
/* Name, ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;	/* heap-allocated; released by free_key() */
	uint32_t uid, gid;
	uint32_t mode;
};
603
0619767c 604#define ALLOC_NUM 20
237e200e
SH
605static bool store_hierarchy(char *stridx, char *h)
606{
0619767c
SH
607 if (num_hierarchies % ALLOC_NUM == 0) {
608 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
609 n *= ALLOC_NUM;
610 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c 611 if (!tmp) {
b8defc3d 612 lxcfs_error("%s\n", strerror(errno));
0619767c
SH
613 exit(1);
614 }
237e200e 615 hierarchies = tmp;
237e200e 616 }
f676eb79 617
0619767c 618 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
619 return true;
620}
621
622static void print_subsystems(void)
623{
624 int i;
625
a257a8ee 626 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
cc97d34c 627 fprintf(stderr, "hierarchies:\n");
237e200e
SH
628 for (i = 0; i < num_hierarchies; i++) {
629 if (hierarchies[i])
b8defc3d
CB
630 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
631 fd_hierarchies[i], hierarchies[i]);
237e200e
SH
632 }
633}
634
/*
 * Tell whether @needle equals one of the comma-separated elements of
 * @haystack. Elements followed by a comma are compared by length and
 * content; whatever remains after the last comma is compared as a whole.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	size_t nlen = strlen(needle);
	const char *tok = haystack;

	for (;;) {
		const char *comma;

		if (*tok == '\0')
			break;

		comma = strchr(tok, ',');
		if (!comma)
			break;

		if (nlen == comma - tok && strncmp(needle, tok, nlen) == 0)
			return true;

		tok = comma + 1;
	}

	return strcmp(needle, tok) == 0;
}
653
654/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
655/* Return the mounted controller and store the corresponding open file descriptor
656 * referring to the controller mountpoint in the private lxcfs namespace in
657 * @cfd.
658 */
659static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
660{
661 int i;
662
663 for (i = 0; i < num_hierarchies; i++) {
664 if (!hierarchies[i])
665 continue;
5dd3e6fd
CB
666 if (strcmp(hierarchies[i], controller) == 0) {
667 *cfd = fd_hierarchies[i];
237e200e 668 return hierarchies[i];
5dd3e6fd
CB
669 }
670 if (in_comma_list(controller, hierarchies[i])) {
671 *cfd = fd_hierarchies[i];
237e200e 672 return hierarchies[i];
5dd3e6fd 673 }
237e200e
SH
674 }
675
676 return NULL;
677}
678
/*
 * Write @value to @file inside @cgroup of @controller.
 * Returns false when the controller is not mounted, the path cannot be
 * built or opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	const char *prefix;
	size_t len;
	char *fnam;
	int cfd, fd, ret;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	prefix = (*cgroup == '/') ? "." : "";
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", prefix, cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	/* write_string() takes care of the descriptor from here on. */
	return write_string(fnam, value, fd);
}
705
706// Chown all the files in the cgroup directory. We do this when we create
707// a cgroup on behalf of a user.
f23fe717 708static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e 709{
f23fe717 710 struct dirent *direntp;
237e200e
SH
711 char path[MAXPATHLEN];
712 size_t len;
713 DIR *d;
f23fe717 714 int fd1, ret;
237e200e
SH
715
716 len = strlen(dirname);
717 if (len >= MAXPATHLEN) {
b8defc3d 718 lxcfs_error("Pathname too long: %s\n", dirname);
237e200e
SH
719 return;
720 }
721
f23fe717
CB
722 fd1 = openat(fd, dirname, O_DIRECTORY);
723 if (fd1 < 0)
724 return;
725
726 d = fdopendir(fd1);
237e200e 727 if (!d) {
b8defc3d 728 lxcfs_error("Failed to open %s\n", dirname);
237e200e
SH
729 return;
730 }
731
f23fe717 732 while ((direntp = readdir(d))) {
237e200e
SH
733 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
734 continue;
735 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
736 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 737 lxcfs_error("Pathname too long under %s\n", dirname);
237e200e
SH
738 continue;
739 }
f23fe717 740 if (fchownat(fd, path, uid, gid, 0) < 0)
b8defc3d 741 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
237e200e
SH
742 }
743 closedir(d);
744}
745
/*
 * Create cgroup @cg under @controller and, unless the requested owner is
 * root:root, chown the new directory and the files inside it to @uid:@gid.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or -errno
 * from the failing mkdirat()/fchownat() call.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	size_t len;
	char *dirnam;
	int cfd;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	/* Nothing to chown when the requested owner is root:root. */
	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(dirnam, uid, gid, cfd);
	}

	return 0;
}
776
7213ec5c 777static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
237e200e 778{
b7672ded 779 struct dirent *direntp;
237e200e
SH
780 DIR *dir;
781 bool ret = false;
782 char pathname[MAXPATHLEN];
b7672ded
CB
783 int dupfd;
784
785 dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
786 if (dupfd < 0)
787 return false;
237e200e 788
b7672ded 789 dir = fdopendir(dupfd);
237e200e 790 if (!dir) {
7dd6560a 791 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
7213ec5c 792 close(dupfd);
237e200e
SH
793 return false;
794 }
795
b7672ded 796 while ((direntp = readdir(dir))) {
237e200e
SH
797 struct stat mystat;
798 int rc;
799
237e200e
SH
800 if (!strcmp(direntp->d_name, ".") ||
801 !strcmp(direntp->d_name, ".."))
802 continue;
803
804 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
805 if (rc < 0 || rc >= MAXPATHLEN) {
b8defc3d 806 lxcfs_error("%s\n", "Pathname too long.");
237e200e
SH
807 continue;
808 }
809
2e81a5e3
CB
810 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
811 if (rc) {
7dd6560a 812 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
237e200e
SH
813 continue;
814 }
7dd6560a 815 if (S_ISDIR(mystat.st_mode))
2e81a5e3 816 if (!recursive_rmdir(pathname, fd, cfd))
7dd6560a 817 lxcfs_debug("Error removing %s.\n", pathname);
237e200e
SH
818 }
819
820 ret = true;
821 if (closedir(dir) < 0) {
b8defc3d 822 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
237e200e
SH
823 ret = false;
824 }
825
2e81a5e3 826 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
7dd6560a 827 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
237e200e
SH
828 ret = false;
829 }
7213ec5c
CB
830
831 close(dupfd);
237e200e
SH
832
833 return ret;
834}
835
836bool cgfs_remove(const char *controller, const char *cg)
837{
b7672ded 838 int fd, cfd;
237e200e 839 size_t len;
f5a6d92e 840 char *dirnam, *tmpc;
7213ec5c 841 bool bret;
237e200e 842
f5a6d92e 843 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
844 if (!tmpc)
845 return false;
f5a6d92e
CB
846
847 /* Make sure we pass a relative path to *at() family of functions.
848 * . + /cg + \0
849 */
b7672ded 850 len = strlen(cg) + 2;
237e200e 851 dirnam = alloca(len);
b7672ded
CB
852 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
853
854 fd = openat(cfd, dirnam, O_DIRECTORY);
855 if (fd < 0)
856 return false;
857
7213ec5c
CB
858 bret = recursive_rmdir(dirnam, fd, cfd);
859 close(fd);
860 return bret;
237e200e
SH
861}
862
/*
 * Change the mode of @file, a path inside the hierarchy of @controller.
 * Returns false if the controller is not mounted or fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	size_t len;
	char *pathname;
	int cfd;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, pathname, mode, 0) >= 0;
}
883
0f657ce3 884static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
237e200e
SH
885{
886 size_t len;
887 char *fname;
888
889 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
890 fname = alloca(len);
891 snprintf(fname, len, "%s/tasks", dirname);
0f657ce3 892 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
893 return -errno;
894 snprintf(fname, len, "%s/cgroup.procs", dirname);
0f657ce3 895 if (fchownat(fd, fname, uid, gid, 0) != 0)
237e200e
SH
896 return -errno;
897 return 0;
898}
899
900int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
901{
5dd3e6fd 902 int cfd;
237e200e 903 size_t len;
f5a6d92e 904 char *pathname, *tmpc;
237e200e 905
f5a6d92e 906 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
907 if (!tmpc)
908 return -EINVAL;
f5a6d92e
CB
909
910 /* Make sure we pass a relative path to *at() family of functions.
911 * . + /file + \0
912 */
0f657ce3 913 len = strlen(file) + 2;
237e200e 914 pathname = alloca(len);
0f657ce3
CB
915 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
916 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
237e200e
SH
917 return -errno;
918
0f657ce3 919 if (is_dir(pathname, cfd))
237e200e 920 // like cgmanager did, we want to chown the tasks file as well
0f657ce3 921 return chown_tasks_files(pathname, uid, gid, cfd);
237e200e
SH
922
923 return 0;
924}
925
/*
 * Open "cgroup.procs" of @cgroup under @controller for writing.
 *
 * Returns a stream the caller must fclose(), or NULL if the controller is
 * not mounted or the file cannot be opened. No descriptor is left open on
 * failure.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() leaves fd open on failure; close it, keeping errno. */
		int saved_errno = errno;

		close(fd);
		errno = saved_errno;
	}

	return f;
}
949
f366da65
WB
950static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
951 void ***list, size_t typesize,
952 void* (*iterator)(const char*, const char*, const char*))
237e200e 953{
4ea38a4c 954 int cfd, fd, ret;
237e200e 955 size_t len;
4ea38a4c 956 char *cg, *tmpc;
237e200e 957 char pathname[MAXPATHLEN];
f366da65 958 size_t sz = 0, asz = 0;
4ea38a4c 959 struct dirent *dirent;
237e200e 960 DIR *dir;
237e200e 961
4ea38a4c 962 tmpc = find_mounted_controller(controller, &cfd);
f366da65 963 *list = NULL;
237e200e 964 if (!tmpc)
e97c834b 965 return false;
237e200e 966
f5a6d92e 967 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
968 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
969 cg = alloca(len);
970 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
971 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 972 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
973 return false;
974 }
237e200e 975
4ea38a4c
CB
976 fd = openat(cfd, cg, O_DIRECTORY);
977 if (fd < 0)
978 return false;
979
980 dir = fdopendir(fd);
237e200e
SH
981 if (!dir)
982 return false;
983
4ea38a4c 984 while ((dirent = readdir(dir))) {
237e200e 985 struct stat mystat;
237e200e 986
4ea38a4c
CB
987 if (!strcmp(dirent->d_name, ".") ||
988 !strcmp(dirent->d_name, ".."))
237e200e
SH
989 continue;
990
4ea38a4c
CB
991 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
992 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 993 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
994 continue;
995 }
996
4ea38a4c 997 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 998 if (ret) {
b8defc3d 999 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
1000 continue;
1001 }
f366da65
WB
1002 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1003 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1004 continue;
1005
1006 if (sz+2 >= asz) {
f366da65 1007 void **tmp;
237e200e
SH
1008 asz += BATCH_SIZE;
1009 do {
f366da65 1010 tmp = realloc(*list, asz * typesize);
237e200e
SH
1011 } while (!tmp);
1012 *list = tmp;
1013 }
4ea38a4c 1014 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1015 (*list)[sz+1] = NULL;
1016 sz++;
1017 }
1018 if (closedir(dir) < 0) {
b8defc3d 1019 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1020 return false;
1021 }
1022 return true;
1023}
1024
f366da65
WB
/*
 * List-entry callback for cgfs_list_children(): returns a heap-allocated
 * copy of @dir_entry, retrying the allocation until it succeeds.
 * @controller and @cgroup are not used.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	size_t size = strlen(dir_entry) + 1;
	char *copy;

	for (;;) {
		copy = malloc(size);
		if (copy)
			break;
	}

	memcpy(copy, dir_entry, size);
	return copy;
}
1033
1034bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1035{
1036 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1037}
1038
237e200e
SH
1039void free_key(struct cgfs_files *k)
1040{
1041 if (!k)
1042 return;
1043 free(k->name);
1044 free(k);
1045}
1046
/*
 * Release every key of the NULL-terminated array @keys and then the array
 * itself; a NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;

	for (cur = keys; *cur; cur++)
		free_key(*cur);

	free(keys);
}
1058
/*
 * Read @file inside @cgroup of @controller into a newly allocated string
 * stored in *value.
 * Returns false if the controller is not mounted, the path cannot be built
 * or opened, or nothing could be read.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	const char *prefix;
	size_t len;
	char *fnam;
	int cfd, fd, ret;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	prefix = (*cgroup == '/') ? "." : "";
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", prefix, cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	/* slurp_file() takes care of the descriptor from here on. */
	*value = slurp_file(fnam, fd);

	return *value != NULL;
}
1085
1086struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1087{
4ea38a4c 1088 int ret, cfd;
237e200e 1089 size_t len;
f5a6d92e 1090 char *fnam, *tmpc;
237e200e
SH
1091 struct stat sb;
1092 struct cgfs_files *newkey;
237e200e 1093
f5a6d92e 1094 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1095 if (!tmpc)
1096 return false;
1097
1098 if (file && *file == '/')
1099 file++;
1100
06081b29 1101 if (file && strchr(file, '/'))
237e200e
SH
1102 return NULL;
1103
f5a6d92e
CB
1104 /* Make sure we pass a relative path to *at() family of functions.
1105 * . + /cgroup + / + file + \0
1106 */
4ea38a4c 1107 len = strlen(cgroup) + 3;
237e200e
SH
1108 if (file)
1109 len += strlen(file) + 1;
1110 fnam = alloca(len);
4ea38a4c
CB
1111 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1112 file ? "/" : "", file ? file : "");
237e200e 1113
4ea38a4c 1114 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1115 if (ret < 0)
1116 return NULL;
1117
1118 do {
1119 newkey = malloc(sizeof(struct cgfs_files));
1120 } while (!newkey);
1121 if (file)
1122 newkey->name = must_copy_string(file);
06081b29
CB
1123 else if (strrchr(cgroup, '/'))
1124 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1125 else
1126 newkey->name = must_copy_string(cgroup);
1127 newkey->uid = sb.st_uid;
1128 newkey->gid = sb.st_gid;
1129 newkey->mode = sb.st_mode;
1130
1131 return newkey;
1132}
1133
/*
 * Entry constructor passed to cgfs_iterate_cgroup() by cgfs_list_keys():
 * the list element is the cgfs_get_key() record for @dir_entry. A failed
 * lookup is logged and NULL is returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		cgroup);
	return NULL;
}
1143
1144bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1145{
1146 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
1147}
1148
/*
 * Report whether @f names a directory inside @cgroup of @controller.
 * Returns false when the controller lookup fails, the path cannot be
 * formatted, fstatat() fails, or the entry is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, n;
	size_t pathlen;
	char *relpath;
	struct stat st;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(pathlen);
	n = snprintf(relpath, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	return fstatat(cfd, relpath, &st, 0) >= 0 && S_ISDIR(st.st_mode);
}
1176
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);

/*
 * clone a task which switches to @target's pid namespace and writes '1'
 * over a unix sock so we can read the task's reaper's pid in our
 * namespace.
 *
 * This function never returns normally: every path ends in _exit(). The
 * exit status is 0 only when the cloned child was reaped and itself
 * exited with status 0, and 1 on any failure.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}

	/* The new task starts at the top of the alloca'd stack. */
	pid = clone(send_creds_clone_wrapper, (char *)stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/* wait_for_pid() returns 0 on success and -1 on failure, so
		 * only a non-zero result is an error. */
		if (wait_for_pid(pid) != 0)
			_exit(1);
		_exit(0);
	}
}
1226
1227static int send_creds_clone_wrapper(void *arg) {
1228 struct ucred cred;
1229 char v;
1230 int sock = *(int *)arg;
237e200e
SH
1231
1232 /* we are the child */
1233 cred.uid = 0;
1234 cred.gid = 0;
1235 cred.pid = 1;
1236 v = '1';
1237 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1238 return 1;
1239 return 0;
237e200e
SH
1240}
1241
1242static pid_t get_init_pid_for_task(pid_t task)
1243{
1244 int sock[2];
1245 pid_t pid;
1246 pid_t ret = -1;
1247 char v = '0';
1248 struct ucred cred;
1249
1250 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1251 perror("socketpair");
1252 return -1;
1253 }
1254
1255 pid = fork();
1256 if (pid < 0)
1257 goto out;
1258 if (!pid) {
1259 close(sock[1]);
1260 write_task_init_pid_exit(sock[0], task);
1261 _exit(0);
1262 }
1263
1264 if (!recv_creds(sock[1], &cred, &v))
1265 goto out;
1266 ret = cred.pid;
1267
1268out:
1269 close(sock[0]);
1270 close(sock[1]);
1271 if (pid > 0)
1272 wait_for_pid(pid);
1273 return ret;
1274}
1275
1276static pid_t lookup_initpid_in_store(pid_t qpid)
1277{
1278 pid_t answer = 0;
1279 struct stat sb;
1280 struct pidns_init_store *e;
1281 char fnam[100];
1282
1283 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1284 store_lock();
1285 if (stat(fnam, &sb) < 0)
1286 goto out;
1287 e = lookup_verify_initpid(&sb);
1288 if (e) {
1289 answer = e->initpid;
1290 goto out;
1291 }
1292 answer = get_init_pid_for_task(qpid);
1293 if (answer > 0)
1294 save_initpid(&sb, answer);
1295
1296out:
1297 /* we prune at end in case we are returning
1298 * the value we were about to return */
1299 prune_initpid_store();
1300 store_unlock();
1301 return answer;
1302}
1303
/*
 * Reap child @pid, restarting waitpid() when it is interrupted (EINTR).
 * Returns 0 when the child exited normally with status 0, and -1 when
 * @pid is not positive, waitpid() fails, or the child terminated any
 * other way.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1324
1325
1326/*
1327 * append pid to *src.
1328 * src: a pointer to a char* in which ot append the pid.
1329 * sz: the number of characters printed so far, minus trailing \0.
1330 * asz: the allocated size so far
1331 * pid: the pid to append
1332 */
1333static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1334{
1335 char tmp[30];
1336
1337 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1338
1339 if (!*src || tmplen + *sz + 1 >= *asz) {
1340 char *tmp;
1341 do {
1342 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1343 } while (!tmp);
1344 *src = tmp;
1345 *asz += BUF_RESERVE_SIZE;
1346 }
bbfd0e33 1347 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1348 *sz += tmplen;
237e200e
SH
1349}
1350
/*
 * Given an open FILE * for a /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return the id mapped into pid's namespace.
 * The file is rewound and scanned line by line; lines that do not parse
 * as three unsigned numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned) when no range contains
 * @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,   // base id for a range in the idfile's namespace
		     host_base, // base id for a range in the caller's namespace
		     range_len; // number of ids in this range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range_len) != 3)
			continue;

		if (host_base + range_len < host_base || ns_base + range_len < ns_base) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range_len, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range_len)
			continue;

		/*
		 * host_base <= in_id < host_base + range_len and neither
		 * range wraps, so ns_base + (in_id - host_base) cannot wrap
		 * either.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
1394
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether @uid, seen through the uid_map of @pid, is privileged
 * over @victim.
 *  - either id being -1 is rejected;
 *  - when @req_ns_root is false, identical ids suffice (uid 1000 has
 *    write access to files owned by uid 1000);
 *  - otherwise @uid must map to 0 in pid's namespace and @victim must be
 *    mapped there at all.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char mappath[PROCLEN];
	int n;
	bool privileged;
	FILE *map;

	if (victim == -1 || uid == -1)
		return false;

	if (!req_ns_root && uid == victim)
		return true;

	n = snprintf(mappath, PROCLEN, "/proc/%d/uid_map", pid);
	if (n < 0 || n >= PROCLEN)
		return false;

	map = fopen(mappath, "r");
	if (!map)
		return false;

	/*
	 * Reject if the caller is not root in his namespace, then reject if
	 * the victim is not mapped into the caller's ns.
	 * XXX I'm not sure the second check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	privileged = convert_id_to_ns(map, uid) == 0 &&
		     convert_id_to_ns(map, victim) != (unsigned int)-1;

	fclose(map);
	return privileged;
}
1450
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode. Callers shift @fmode so
 * the bit triplet they care about lands in the "other" position.
 * An access mode other than O_RDONLY, O_WRONLY or O_RDWR yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1470
1471
/*
 * Return a heap copy of the path component of @taskcg that follows the
 * @querycg prefix, e.g. taskcg "a/b/c/d" with querycg "a/b" yields "c".
 * For querycg "/" or "./" only the first character of @taskcg is skipped.
 * @taskcg must be strictly longer than @querycg, otherwise an error is
 * logged and NULL is returned. NULL is also returned when strdup() fails.
 * The caller frees the result.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (!strcmp(querycg, "/") || !strcmp(querycg, "./"))
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* keep only the first remaining component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1497
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
1504
1505static char *get_pid_cgroup(pid_t pid, const char *contrl)
1506{
5dd3e6fd 1507 int cfd;
237e200e
SH
1508 char fnam[PROCLEN];
1509 FILE *f;
1510 char *answer = NULL;
1511 char *line = NULL;
1512 size_t len = 0;
1513 int ret;
5dd3e6fd 1514 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1515 if (!h)
1516 return NULL;
1517
1518 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1519 if (ret < 0 || ret >= PROCLEN)
1520 return NULL;
1521 if (!(f = fopen(fnam, "r")))
1522 return NULL;
1523
1524 while (getline(&line, &len, f) != -1) {
1525 char *c1, *c2;
1526 if (!line[0])
1527 continue;
1528 c1 = strchr(line, ':');
1529 if (!c1)
1530 goto out;
1531 c1++;
1532 c2 = strchr(c1, ':');
1533 if (!c2)
1534 goto out;
1535 *c2 = '\0';
1536 if (strcmp(c1, h) != 0)
1537 continue;
1538 c2++;
1539 stripnewline(c2);
1540 do {
1541 answer = strdup(c2);
1542 } while (!answer);
1543 break;
1544 }
1545
1546out:
1547 fclose(f);
1548 free(line);
1549 return answer;
1550}
1551
1552/*
1553 * check whether a fuse context may access a cgroup dir or file
1554 *
1555 * If file is not null, it is a cgroup file to check under cg.
1556 * If file is null, then we are checking perms on cg itself.
1557 *
1558 * For files we can check the mode of the list_keys result.
1559 * For cgroups, we must make assumptions based on the files under the
1560 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1561 * yet.
1562 */
1563static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1564{
1565 struct cgfs_files *k = NULL;
1566 bool ret = false;
1567
1568 k = cgfs_get_key(contrl, cg, file);
1569 if (!k)
1570 return false;
1571
1572 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1573 if (perms_include(k->mode >> 6, mode)) {
1574 ret = true;
1575 goto out;
1576 }
1577 }
1578 if (fc->gid == k->gid) {
1579 if (perms_include(k->mode >> 3, mode)) {
1580 ret = true;
1581 goto out;
1582 }
1583 }
1584 ret = perms_include(k->mode, mode);
1585
1586out:
1587 free_key(k);
1588 return ret;
1589}
1590
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place. When the whole string
 * is "/init.scope" the result is "/" rather than the empty string.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + (cg_len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';
	else
		suffix[0] = '\0';
}
1608
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * The test is a prefix comparison of pid's cgroup against @cg.
 * If the answer is false and nextcg is not NULL, then *nextcg is set to
 * the result of get_next_cgroup_dir() (the next cgroup directory under
 * cg, or NULL), which must be freed by the caller.
 * Returns false as well when pid's cgroup cannot be determined.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool within;
	const char *cmp;
	char *taskcg = get_pid_cgroup(pid, contrl);

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 *       Serge, do you know?
	 */
	cmp = (*cg == '/' || strncmp(cg, "./", 2) == 0) ? taskcg : taskcg + 1;

	within = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!within && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return within;
}
1651
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 * The root ("/" or "./") is always visible. Otherwise @cg is visible when
 * it equals pid's cgroup, or when one of the two is a path prefix of the
 * other that ends on a '/' boundary. Returns false when pid's cgroup
 * cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible = false;
	char *rawcg, *task_cg;
	size_t target_len, task_len;

	if (!strcmp(cg, "/") || !strcmp(cg, "./"))
		return true;

	rawcg = get_pid_cgroup(pid, contrl);
	if (!rawcg)
		return false;
	prune_init_slice(rawcg);

	/* compare without the leading character of the task's cgroup */
	task_cg = rawcg + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case
		 * is not handled by the comparisons below, since they test
		 * for the last /, but that is the first / that we've chopped
		 * off above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	}

	free(rawcg);
	return visible;
}
1702
1703/*
1704 * given /cgroup/freezer/a/b, return "freezer".
1705 * the returned char* should NOT be freed.
1706 */
1707static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1708{
1709 const char *p1;
1710 char *contr, *slash;
1711
99142521 1712 if (strlen(path) < 9) {
e254948f 1713 errno = EACCES;
237e200e 1714 return NULL;
99142521
CB
1715 }
1716 if (*(path + 7) != '/') {
1717 errno = EINVAL;
237e200e 1718 return NULL;
99142521 1719 }
3adc421c 1720 p1 = path + 8;
237e200e 1721 contr = strdupa(p1);
99142521
CB
1722 if (!contr) {
1723 errno = ENOMEM;
237e200e 1724 return NULL;
99142521 1725 }
237e200e
SH
1726 slash = strstr(contr, "/");
1727 if (slash)
1728 *slash = '\0';
1729
1730 int i;
3adc421c 1731 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
1732 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1733 return hierarchies[i];
1734 }
99142521 1735 errno = ENOENT;
237e200e
SH
1736 return NULL;
1737}
1738
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 * The result points into @path, just past the first '/' found at or after
 * offset 8, and errno is cleared. On failure NULL is returned with errno
 * set to EACCES (path shorter than 9 characters) or EINVAL (no such '/').
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strchr(path + 8, '/');
	if (slash == NULL) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1759
/*
 * split the last path element from the path in @cg.
 * @dir is newly allocated (a copy of @cg cut at its last '/') and should
 * be freed. @last points into @cg at that last '/' and must not be freed;
 * it is NULL, and @dir is a full copy, when @cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	const char *slash;

	*dir = NULL;
	while (!*dir)
		*dir = strdup(cg);

	slash = strrchr(cg, '/');
	*last = (char *)slash;
	if (slash)
		(*dir)[slash - cg] = '\0';
}
1779
1780/*
1781 * FUSE ops for /cgroup
1782 */
1783
1784int cg_getattr(const char *path, struct stat *sb)
1785{
1786 struct timespec now;
1787 struct fuse_context *fc = fuse_get_context();
1788 char * cgdir = NULL;
1789 char *last = NULL, *path1, *path2;
1790 struct cgfs_files *k = NULL;
1791 const char *cgroup;
1792 const char *controller = NULL;
1793 int ret = -ENOENT;
1794
1795
1796 if (!fc)
1797 return -EIO;
1798
1799 memset(sb, 0, sizeof(struct stat));
1800
1801 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1802 return -EINVAL;
1803
1804 sb->st_uid = sb->st_gid = 0;
1805 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1806 sb->st_size = 0;
1807
1808 if (strcmp(path, "/cgroup") == 0) {
1809 sb->st_mode = S_IFDIR | 00755;
1810 sb->st_nlink = 2;
1811 return 0;
1812 }
1813
1814 controller = pick_controller_from_path(fc, path);
1815 if (!controller)
2f7036d0 1816 return -errno;
237e200e
SH
1817 cgroup = find_cgroup_in_path(path);
1818 if (!cgroup) {
1819 /* this is just /cgroup/controller, return it as a dir */
1820 sb->st_mode = S_IFDIR | 00755;
1821 sb->st_nlink = 2;
1822 return 0;
1823 }
1824
1825 get_cgdir_and_path(cgroup, &cgdir, &last);
1826
1827 if (!last) {
1828 path1 = "/";
1829 path2 = cgdir;
1830 } else {
1831 path1 = cgdir;
1832 path2 = last;
1833 }
1834
1835 pid_t initpid = lookup_initpid_in_store(fc->pid);
1836 if (initpid <= 0)
1837 initpid = fc->pid;
1838 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1839 * Then check that caller's cgroup is under path if last is a child
1840 * cgroup, or cgdir if last is a file */
1841
1842 if (is_child_cgroup(controller, path1, path2)) {
1843 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1844 ret = -ENOENT;
1845 goto out;
1846 }
1847 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1848 /* this is just /cgroup/controller, return it as a dir */
1849 sb->st_mode = S_IFDIR | 00555;
1850 sb->st_nlink = 2;
1851 ret = 0;
1852 goto out;
1853 }
1854 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1855 ret = -EACCES;
1856 goto out;
1857 }
1858
1859 // get uid, gid, from '/tasks' file and make up a mode
1860 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1861 sb->st_mode = S_IFDIR | 00755;
1862 k = cgfs_get_key(controller, cgroup, NULL);
1863 if (!k) {
1864 sb->st_uid = sb->st_gid = 0;
1865 } else {
1866 sb->st_uid = k->uid;
1867 sb->st_gid = k->gid;
1868 }
1869 free_key(k);
1870 sb->st_nlink = 2;
1871 ret = 0;
1872 goto out;
1873 }
1874
1875 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1876 sb->st_mode = S_IFREG | k->mode;
1877 sb->st_nlink = 1;
1878 sb->st_uid = k->uid;
1879 sb->st_gid = k->gid;
1880 sb->st_size = 0;
1881 free_key(k);
1882 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1883 ret = -ENOENT;
1884 goto out;
1885 }
237e200e
SH
1886 ret = 0;
1887 }
1888
1889out:
1890 free(cgdir);
1891 return ret;
1892}
1893
1894int cg_opendir(const char *path, struct fuse_file_info *fi)
1895{
1896 struct fuse_context *fc = fuse_get_context();
1897 const char *cgroup;
1898 struct file_info *dir_info;
1899 char *controller = NULL;
1900
1901 if (!fc)
1902 return -EIO;
1903
1904 if (strcmp(path, "/cgroup") == 0) {
1905 cgroup = NULL;
1906 controller = NULL;
1907 } else {
1908 // return list of keys for the controller, and list of child cgroups
1909 controller = pick_controller_from_path(fc, path);
1910 if (!controller)
2f7036d0 1911 return -errno;
237e200e
SH
1912
1913 cgroup = find_cgroup_in_path(path);
1914 if (!cgroup) {
1915 /* this is just /cgroup/controller, return its contents */
1916 cgroup = "/";
1917 }
1918 }
1919
1920 pid_t initpid = lookup_initpid_in_store(fc->pid);
1921 if (initpid <= 0)
1922 initpid = fc->pid;
1923 if (cgroup) {
1924 if (!caller_may_see_dir(initpid, controller, cgroup))
1925 return -ENOENT;
1926 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1927 return -EACCES;
1928 }
1929
1930 /* we'll free this at cg_releasedir */
1931 dir_info = malloc(sizeof(*dir_info));
1932 if (!dir_info)
1933 return -ENOMEM;
1934 dir_info->controller = must_copy_string(controller);
1935 dir_info->cgroup = must_copy_string(cgroup);
1936 dir_info->type = LXC_TYPE_CGDIR;
1937 dir_info->buf = NULL;
1938 dir_info->file = NULL;
1939 dir_info->buflen = 0;
1940
1941 fi->fh = (unsigned long)dir_info;
1942 return 0;
1943}
1944
1945int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1946 struct fuse_file_info *fi)
1947{
1948 struct file_info *d = (struct file_info *)fi->fh;
1949 struct cgfs_files **list = NULL;
1950 int i, ret;
1951 char *nextcg = NULL;
1952 struct fuse_context *fc = fuse_get_context();
1953 char **clist = NULL;
1954
d639f863
CB
1955 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1956 return -EIO;
1957
237e200e 1958 if (d->type != LXC_TYPE_CGDIR) {
b8defc3d 1959 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
237e200e
SH
1960 return -EIO;
1961 }
1962 if (!d->cgroup && !d->controller) {
1963 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1964 int i;
1965
1966 for (i = 0; i < num_hierarchies; i++) {
1967 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1968 return -EIO;
1969 }
1970 }
1971 return 0;
1972 }
1973
1974 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1975 // not a valid cgroup
1976 ret = -EINVAL;
1977 goto out;
1978 }
1979
1980 pid_t initpid = lookup_initpid_in_store(fc->pid);
1981 if (initpid <= 0)
1982 initpid = fc->pid;
1983 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1984 if (nextcg) {
1985 ret = filler(buf, nextcg, NULL, 0);
1986 free(nextcg);
1987 if (ret != 0) {
1988 ret = -EIO;
1989 goto out;
1990 }
1991 }
1992 ret = 0;
1993 goto out;
1994 }
1995
1996 for (i = 0; list[i]; i++) {
1997 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1998 ret = -EIO;
1999 goto out;
2000 }
2001 }
2002
2003 // now get the list of child cgroups
2004
2005 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2006 ret = 0;
2007 goto out;
2008 }
f366da65
WB
2009 if (clist) {
2010 for (i = 0; clist[i]; i++) {
2011 if (filler(buf, clist[i], NULL, 0) != 0) {
2012 ret = -EIO;
2013 goto out;
2014 }
237e200e
SH
2015 }
2016 }
2017 ret = 0;
2018
2019out:
2020 free_keys(list);
2021 if (clist) {
2022 for (i = 0; clist[i]; i++)
2023 free(clist[i]);
2024 free(clist);
2025 }
2026 return ret;
2027}
2028
43215927 2029static void do_release_file_info(struct fuse_file_info *fi)
237e200e 2030{
43215927
SH
2031 struct file_info *f = (struct file_info *)fi->fh;
2032
237e200e
SH
2033 if (!f)
2034 return;
43215927
SH
2035
2036 fi->fh = 0;
2037
237e200e 2038 free(f->controller);
43215927 2039 f->controller = NULL;
237e200e 2040 free(f->cgroup);
43215927 2041 f->cgroup = NULL;
237e200e 2042 free(f->file);
43215927 2043 f->file = NULL;
237e200e 2044 free(f->buf);
43215927 2045 f->buf = NULL;
237e200e
SH
2046 free(f);
2047}
2048
/*
 * FUSE releasedir: drop the file_info attached to @fi. @path is unused.
 * Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);

	return 0;
}
2054
2055int cg_open(const char *path, struct fuse_file_info *fi)
2056{
2057 const char *cgroup;
2058 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2059 struct cgfs_files *k = NULL;
2060 struct file_info *file_info;
2061 struct fuse_context *fc = fuse_get_context();
2062 int ret;
2063
2064 if (!fc)
2065 return -EIO;
2066
2067 controller = pick_controller_from_path(fc, path);
2068 if (!controller)
2f7036d0 2069 return -errno;
237e200e
SH
2070 cgroup = find_cgroup_in_path(path);
2071 if (!cgroup)
bc70ba9b 2072 return -errno;
237e200e
SH
2073
2074 get_cgdir_and_path(cgroup, &cgdir, &last);
2075 if (!last) {
2076 path1 = "/";
2077 path2 = cgdir;
2078 } else {
2079 path1 = cgdir;
2080 path2 = last;
2081 }
2082
2083 k = cgfs_get_key(controller, path1, path2);
2084 if (!k) {
2085 ret = -EINVAL;
2086 goto out;
2087 }
2088 free_key(k);
2089
2090 pid_t initpid = lookup_initpid_in_store(fc->pid);
2091 if (initpid <= 0)
2092 initpid = fc->pid;
2093 if (!caller_may_see_dir(initpid, controller, path1)) {
2094 ret = -ENOENT;
2095 goto out;
2096 }
2097 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
2098 ret = -EACCES;
2099 goto out;
2100 }
2101
2102 /* we'll free this at cg_release */
2103 file_info = malloc(sizeof(*file_info));
2104 if (!file_info) {
2105 ret = -ENOMEM;
2106 goto out;
2107 }
2108 file_info->controller = must_copy_string(controller);
2109 file_info->cgroup = must_copy_string(path1);
2110 file_info->file = must_copy_string(path2);
2111 file_info->type = LXC_TYPE_CGFILE;
2112 file_info->buf = NULL;
2113 file_info->buflen = 0;
2114
2115 fi->fh = (unsigned long)file_info;
2116 ret = 0;
2117
2118out:
2119 free(cgdir);
2120 return ret;
2121}
2122
bddbb106
SH
2123int cg_access(const char *path, int mode)
2124{
6f0f6b83 2125 int ret;
bddbb106 2126 const char *cgroup;
6f0f6b83
CB
2127 char *path1, *path2, *controller;
2128 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2129 struct cgfs_files *k = NULL;
2130 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2131
9873c5e8 2132 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2133 return 0;
bddbb106
SH
2134
2135 if (!fc)
2136 return -EIO;
2137
2138 controller = pick_controller_from_path(fc, path);
2139 if (!controller)
2f7036d0 2140 return -errno;
bddbb106 2141 cgroup = find_cgroup_in_path(path);
575316c4
SH
2142 if (!cgroup) {
2143 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2144 if ((mode & W_OK) == 0)
2145 return 0;
2146 return -EACCES;
575316c4 2147 }
bddbb106
SH
2148
2149 get_cgdir_and_path(cgroup, &cgdir, &last);
2150 if (!last) {
2151 path1 = "/";
2152 path2 = cgdir;
2153 } else {
2154 path1 = cgdir;
2155 path2 = last;
2156 }
2157
2158 k = cgfs_get_key(controller, path1, path2);
2159 if (!k) {
3f441bc7
SH
2160 if ((mode & W_OK) == 0)
2161 ret = 0;
2162 else
2163 ret = -EACCES;
bddbb106
SH
2164 goto out;
2165 }
2166 free_key(k);
2167
2168 pid_t initpid = lookup_initpid_in_store(fc->pid);
2169 if (initpid <= 0)
2170 initpid = fc->pid;
2171 if (!caller_may_see_dir(initpid, controller, path1)) {
2172 ret = -ENOENT;
2173 goto out;
2174 }
2175 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2176 ret = -EACCES;
2177 goto out;
2178 }
2179
2180 ret = 0;
2181
2182out:
2183 free(cgdir);
2184 return ret;
2185}
2186
237e200e
SH
/*
 * FUSE release: drop the file_info attached to @fi. @path is unused.
 * Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);

	return 0;
}
2192
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events or roughly
 * @timeout seconds have passed since the call started. Interrupted waits
 * (EINTR) are restarted with the remaining time.
 * Returns true when an event arrived. Returns false on timeout (errno is
 * then 0 if the deadline had already passed, otherwise whatever
 * epoll_wait() left) or on any error.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* "%m" inside a %s argument is printed literally, so the
		 * errno text is formatted explicitly. */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may overwrite errno; keep epoll_wait()'s value */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2240
/*
 * Receive up to @len bytes from @sockfd into @buf after waiting at most
 * 2 seconds for the socket to become readable.
 * Returns -1 when the wait fails or times out, otherwise the result of
 * the non-blocking recv().
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (wait_for_sock(sockfd, 2))
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
2247
2248static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2249{
2250 struct msghdr msg = { 0 };
2251 struct iovec iov;
2252 struct cmsghdr *cmsg;
2253 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2254 char buf[1];
2255 buf[0] = 'p';
2256
2257 if (pingfirst) {
2258 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2259 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2260 return SEND_CREDS_FAIL;
2261 }
2262 }
2263
2264 msg.msg_control = cmsgbuf;
2265 msg.msg_controllen = sizeof(cmsgbuf);
2266
2267 cmsg = CMSG_FIRSTHDR(&msg);
2268 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2269 cmsg->cmsg_level = SOL_SOCKET;
2270 cmsg->cmsg_type = SCM_CREDENTIALS;
2271 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2272
2273 msg.msg_name = NULL;
2274 msg.msg_namelen = 0;
2275
2276 buf[0] = v;
2277 iov.iov_base = buf;
2278 iov.iov_len = sizeof(buf);
2279 msg.msg_iov = &iov;
2280 msg.msg_iovlen = 1;
2281
2282 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2283 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2284 if (errno == 3)
2285 return SEND_CREDS_NOTSK;
2286 return SEND_CREDS_FAIL;
2287 }
2288
2289 return SEND_CREDS_OK;
2290}
2291
/*
 * recv_creds - receive one payload byte plus SCM_CREDENTIALS from @sock.
 *
 * Enables SO_PASSCRED on @sock, writes a single '1' byte to the peer, waits
 * up to 2 (as passed to wait_for_sock()) for data and then reads one message.
 *
 * @cred is preset to -1/-1/-1 and only overwritten when the first control
 * message is a correctly sized SCM_CREDENTIALS one. @v is preset to '1' and
 * replaced with the received payload byte on success.
 *
 * Returns true when a message was received, false on any failure.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char payload[1];
	char control[CMSG_SPACE(sizeof(*cred))];
	struct iovec vec = {
		.iov_base = payload,
		.iov_len = sizeof(payload),
	};
	struct msghdr hdr = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = &vec,
		.msg_iovlen = 1,
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;
	int enable = 1;

	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* Tell the peer we are ready for its message. */
	payload[0] = '1';
	if (write(sock, payload, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}

	if (recvmsg(sock, &hdr, MSG_DONTWAIT) < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cm = CMSG_FIRSTHDR(&hdr);
	if (cm != NULL &&
	    cm->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cm->cmsg_level == SOL_SOCKET &&
	    cm->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cm), sizeof(*cred));

	*v = payload[0];
	return true;
}
2349
35174b0f
FG
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* pipe pair: the wrapper closes [0] and writes its ack to [1] */
	int *cpipe;
	/* first argument forwarded to the wrapped function */
	int sock;
	/* second argument forwarded to the wrapped function */
	pid_t tpid;
	/* pid_from_ns or pid_to_ns */
	int (*wrapped)(int, pid_t);
};
2356
2357/*
2358 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2359 * with clone(). This simply writes '1' as ACK back to the parent
2360 * before calling the actual wrapped function.
2361 */
2362static int pid_ns_clone_wrapper(void *arg) {
2363 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2364 char b = '1';
2365
2366 close(args->cpipe[0]);
b8defc3d
CB
2367 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2368 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2369 close(args->cpipe[1]);
2370 return args->wrapped(args->sock, args->tpid);
2371}
237e200e
SH
2372
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket. This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * Loops until recv_creds() fails or the payload byte '1' is received.
 * Returns 1 if writing a pid back fails, 0 otherwise. @tpid is not used
 * here; it is part of the signature shared with pid_from_ns.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			return 0;
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			return 1;
	}

	return 0;
}
2391
35174b0f 2392
237e200e
SH
2393/*
2394 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2395 * in your old pidns. Only children which you clone will be in the target
2396 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2397 * actually convert pids.
2398 *
2399 * Note: glibc's fork() does not respect pidns, which can lead to failed
2400 * assertions inside glibc (and thus failed forks) if the child's pid in
2401 * the pidns and the parent pid outside are identical. Using clone prevents
2402 * this issue.
237e200e
SH
2403 */
2404static void pid_to_ns_wrapper(int sock, pid_t tpid)
2405{
2406 int newnsfd = -1, ret, cpipe[2];
2407 char fnam[100];
2408 pid_t cpid;
2409 char v;
2410
2411 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2412 if (ret < 0 || ret >= sizeof(fnam))
2413 _exit(1);
2414 newnsfd = open(fnam, O_RDONLY);
2415 if (newnsfd < 0)
2416 _exit(1);
2417 if (setns(newnsfd, 0) < 0)
2418 _exit(1);
2419 close(newnsfd);
2420
2421 if (pipe(cpipe) < 0)
2422 _exit(1);
2423
35174b0f
FG
2424 struct pid_ns_clone_args args = {
2425 .cpipe = cpipe,
2426 .sock = sock,
2427 .tpid = tpid,
2428 .wrapped = &pid_to_ns
2429 };
2430 size_t stack_size = sysconf(_SC_PAGESIZE);
2431 void *stack = alloca(stack_size);
2432
2433 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2434 if (cpid < 0)
2435 _exit(1);
2436
237e200e
SH
2437 // give the child 1 second to be done forking and
2438 // write its ack
2439 if (!wait_for_sock(cpipe[0], 1))
2440 _exit(1);
2441 ret = read(cpipe[0], &v, 1);
2442 if (ret != sizeof(char) || v != '1')
2443 _exit(1);
2444
2445 if (!wait_for_pid(cpid))
2446 _exit(1);
2447 _exit(0);
2448}
2449
2450/*
2451 * To read cgroup files with a particular pid, we will setns into the child
2452 * pidns, open a pipe, fork a child - which will be the first to really be in
2453 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2454 */
2455bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2456{
2457 int sock[2] = {-1, -1};
2458 char *tmpdata = NULL;
2459 int ret;
2460 pid_t qpid, cpid = -1;
2461 bool answer = false;
2462 char v = '0';
2463 struct ucred cred;
2464 size_t sz = 0, asz = 0;
2465
2466 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2467 return false;
2468
2469 /*
2470 * Now we read the pids from returned data one by one, pass
2471 * them into a child in the target namespace, read back the
2472 * translated pids, and put them into our to-return data
2473 */
2474
2475 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2476 perror("socketpair");
2477 free(tmpdata);
2478 return false;
2479 }
2480
2481 cpid = fork();
2482 if (cpid == -1)
2483 goto out;
2484
2485 if (!cpid) // child - exits when done
2486 pid_to_ns_wrapper(sock[1], tpid);
2487
2488 char *ptr = tmpdata;
2489 cred.uid = 0;
2490 cred.gid = 0;
2491 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2492 cred.pid = qpid;
2493 ret = send_creds(sock[0], &cred, v, true);
2494
2495 if (ret == SEND_CREDS_NOTSK)
2496 goto next;
2497 if (ret == SEND_CREDS_FAIL)
2498 goto out;
2499
2500 // read converted results
2501 if (!wait_for_sock(sock[0], 2)) {
b8defc3d 2502 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
237e200e
SH
2503 goto out;
2504 }
2505 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2506 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
237e200e
SH
2507 goto out;
2508 }
2509 must_strcat_pid(d, &sz, &asz, qpid);
2510next:
2511 ptr = strchr(ptr, '\n');
2512 if (!ptr)
2513 break;
2514 ptr++;
2515 }
2516
2517 cred.pid = getpid();
2518 v = '1';
2519 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2520 // failed to ask child to exit
b8defc3d 2521 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
237e200e
SH
2522 goto out;
2523 }
2524
2525 answer = true;
2526
2527out:
2528 free(tmpdata);
2529 if (cpid != -1)
2530 wait_for_pid(cpid);
2531 if (sock[0] != -1) {
2532 close(sock[0]);
2533 close(sock[1]);
2534 }
2535 return answer;
2536}
2537
2538int cg_read(const char *path, char *buf, size_t size, off_t offset,
2539 struct fuse_file_info *fi)
2540{
2541 struct fuse_context *fc = fuse_get_context();
2542 struct file_info *f = (struct file_info *)fi->fh;
2543 struct cgfs_files *k = NULL;
2544 char *data = NULL;
2545 int ret, s;
2546 bool r;
2547
2548 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2549 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
237e200e
SH
2550 return -EIO;
2551 }
2552
2553 if (offset)
2554 return 0;
2555
2556 if (!fc)
2557 return -EIO;
2558
2559 if (!f->controller)
2560 return -EINVAL;
2561
2562 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2563 return -EINVAL;
2564 }
2565 free_key(k);
2566
2567
888f8f3c 2568 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2569 ret = -EACCES;
2570 goto out;
2571 }
2572
2573 if (strcmp(f->file, "tasks") == 0 ||
2574 strcmp(f->file, "/tasks") == 0 ||
2575 strcmp(f->file, "/cgroup.procs") == 0 ||
2576 strcmp(f->file, "cgroup.procs") == 0)
2577 // special case - we have to translate the pids
2578 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2579 else
2580 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2581
2582 if (!r) {
2583 ret = -EINVAL;
2584 goto out;
2585 }
2586
2587 if (!data) {
2588 ret = 0;
2589 goto out;
2590 }
2591 s = strlen(data);
2592 if (s > size)
2593 s = size;
2594 memcpy(buf, data, s);
2595 if (s > 0 && s < size && data[s-1] != '\n')
2596 buf[s++] = '\n';
2597
2598 ret = s;
2599
2600out:
2601 free(data);
2602 return ret;
2603}
2604
35174b0f 2605static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2606{
2607 pid_t vpid;
2608 struct ucred cred;
2609 char v;
2610 int ret;
2611
2612 cred.uid = 0;
2613 cred.gid = 0;
2614 while (1) {
2615 if (!wait_for_sock(sock, 2)) {
b8defc3d 2616 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2617 return 1;
237e200e
SH
2618 }
2619 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2620 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2621 return 1;
237e200e
SH
2622 }
2623 if (vpid == -1) // done
2624 break;
2625 v = '0';
2626 cred.pid = vpid;
2627 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2628 v = '1';
2629 cred.pid = getpid();
2630 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2631 return 1;
237e200e
SH
2632 }
2633 }
35174b0f 2634 return 0;
237e200e
SH
2635}
2636
2637static void pid_from_ns_wrapper(int sock, pid_t tpid)
2638{
2639 int newnsfd = -1, ret, cpipe[2];
2640 char fnam[100];
2641 pid_t cpid;
2642 char v;
2643
2644 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2645 if (ret < 0 || ret >= sizeof(fnam))
2646 _exit(1);
2647 newnsfd = open(fnam, O_RDONLY);
2648 if (newnsfd < 0)
2649 _exit(1);
2650 if (setns(newnsfd, 0) < 0)
2651 _exit(1);
2652 close(newnsfd);
2653
2654 if (pipe(cpipe) < 0)
2655 _exit(1);
2656
35174b0f
FG
2657 struct pid_ns_clone_args args = {
2658 .cpipe = cpipe,
2659 .sock = sock,
2660 .tpid = tpid,
2661 .wrapped = &pid_from_ns
2662 };
f0f8b851
SH
2663 size_t stack_size = sysconf(_SC_PAGESIZE);
2664 void *stack = alloca(stack_size);
35174b0f
FG
2665
2666 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2667 if (cpid < 0)
2668 _exit(1);
2669
237e200e
SH
2670 // give the child 1 second to be done forking and
2671 // write its ack
2672 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2673 _exit(1);
237e200e 2674 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2675 if (ret != sizeof(char) || v != '1')
2676 _exit(1);
237e200e
SH
2677
2678 if (!wait_for_pid(cpid))
2679 _exit(1);
2680 _exit(0);
237e200e
SH
2681}
2682
/*
 * Given host @uid, look up the uid it maps to in @pid's user namespace
 * by passing /proc/@pid/uid_map to convert_id_to_ns(). The result is
 * stored in @answer. Returns false if the map cannot be opened or the
 * conversion yields -1, true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mapfile[400];
	FILE *fp;

	sprintf(mapfile, "/proc/%d/uid_map", pid);
	fp = fopen(mapfile, "r");
	if (fp == NULL)
		return false;

	*answer = convert_id_to_ns(fp, uid);
	fclose(fp);

	return *answer != -1;
}
2704
/*
 * get_pid_creds: get the uid and gid of @pid from /proc/@pid/status.
 *
 * The first number following "Uid:" and "Gid:" is used (the real id per
 * proc(5); XXX should we use euid here?).
 *
 * @uid and @gid are preset to -1 and keep that value for any id that
 * could not be read. Parsing stops at the first malformed Uid:/Gid: line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;
	snprintf(line, sizeof(line), "/proc/%d/status", (int)pid);
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	/* the path buffer is reused as the line buffer from here on */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				/* pid_t is signed, so print it with %d */
				lxcfs_error("bad uid line for pid %d\n", (int)pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %d\n", (int)pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2743
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . they are owned by the same uid
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* @r must map to uid 0 in its own namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped))
		return false;
	if (mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped there as well. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2769
/*
 * do_write_pids - move the pids listed in @buf into cgroup @cg.
 *
 * A forked helper runs pid_from_ns_wrapper() for @tpid on one end of a
 * datagram socketpair. Every pid parsed from @buf is written to the
 * helper, which answers with an SCM_CREDENTIALS message; when the payload
 * byte is '0', the pid carried in the credentials is checked with
 * may_move_pid() and written to the file returned by open_pids_file().
 * A pid of -1 is written to the helper at the end to ask it to exit.
 *
 * Returns true only if no move was refused, no write to the pids file
 * failed and the pids file could be closed cleanly.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, helper = -1;
	FILE *pids_file = NULL;
	bool ok = false, fail = false;
	const char *cursor;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	helper = fork();
	if (helper == -1)
		goto out;

	if (helper == 0) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	cursor = buf;
	while (sscanf(cursor, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v) && v == '0') {
			if (!may_move_pid(tpid, tuid, cred.pid)) {
				fail = true;
				break;
			}
			if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
				fail = true;
		}

		/* advance to the start of the next line, if there is one */
		cursor = strchr(cursor, '\n');
		if (!cursor)
			break;
		cursor++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	ok = !fail;

out:
	if (helper != -1)
		wait_for_pid(helper);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		if (fclose(pids_file) != 0)
			ok = false;
	}
	return ok;
}
2848
2849int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2850 struct fuse_file_info *fi)
2851{
2852 struct fuse_context *fc = fuse_get_context();
2853 char *localbuf = NULL;
2854 struct cgfs_files *k = NULL;
2855 struct file_info *f = (struct file_info *)fi->fh;
2856 bool r;
2857
2858 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2859 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
2860 return -EIO;
2861 }
2862
2863 if (offset)
2864 return 0;
2865
2866 if (!fc)
2867 return -EIO;
2868
2869 localbuf = alloca(size+1);
2870 localbuf[size] = '\0';
2871 memcpy(localbuf, buf, size);
2872
2873 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2874 size = -EINVAL;
2875 goto out;
2876 }
2877
2878 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2879 size = -EACCES;
2880 goto out;
2881 }
2882
2883 if (strcmp(f->file, "tasks") == 0 ||
2884 strcmp(f->file, "/tasks") == 0 ||
2885 strcmp(f->file, "/cgroup.procs") == 0 ||
2886 strcmp(f->file, "cgroup.procs") == 0)
2887 // special case - we have to translate the pids
2888 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2889 else
2890 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2891
2892 if (!r)
2893 size = -EINVAL;
2894
2895out:
2896 free_key(k);
2897 return size;
2898}
2899
2900int cg_chown(const char *path, uid_t uid, gid_t gid)
2901{
2902 struct fuse_context *fc = fuse_get_context();
2903 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2904 struct cgfs_files *k = NULL;
2905 const char *cgroup;
2906 int ret;
2907
2908 if (!fc)
2909 return -EIO;
2910
2911 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2912 return -EPERM;
237e200e
SH
2913
2914 controller = pick_controller_from_path(fc, path);
2915 if (!controller)
bc70ba9b
CB
2916 return errno == ENOENT ? -EPERM : -errno;
2917
237e200e
SH
2918 cgroup = find_cgroup_in_path(path);
2919 if (!cgroup)
2920 /* this is just /cgroup/controller */
bc70ba9b 2921 return -EPERM;
237e200e
SH
2922
2923 get_cgdir_and_path(cgroup, &cgdir, &last);
2924
2925 if (!last) {
2926 path1 = "/";
2927 path2 = cgdir;
2928 } else {
2929 path1 = cgdir;
2930 path2 = last;
2931 }
2932
2933 if (is_child_cgroup(controller, path1, path2)) {
2934 // get uid, gid, from '/tasks' file and make up a mode
2935 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2936 k = cgfs_get_key(controller, cgroup, "tasks");
2937
2938 } else
2939 k = cgfs_get_key(controller, path1, path2);
2940
2941 if (!k) {
2942 ret = -EINVAL;
2943 goto out;
2944 }
2945
2946 /*
2947 * This being a fuse request, the uid and gid must be valid
2948 * in the caller's namespace. So we can just check to make
2949 * sure that the caller is root in his uid, and privileged
2950 * over the file's current owner.
2951 */
2952 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2953 ret = -EACCES;
2954 goto out;
2955 }
2956
2957 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2958
2959out:
2960 free_key(k);
2961 free(cgdir);
2962
2963 return ret;
2964}
2965
2966int cg_chmod(const char *path, mode_t mode)
2967{
2968 struct fuse_context *fc = fuse_get_context();
2969 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2970 struct cgfs_files *k = NULL;
2971 const char *cgroup;
2972 int ret;
2973
2974 if (!fc)
2975 return -EIO;
2976
2977 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2978 return -EPERM;
237e200e
SH
2979
2980 controller = pick_controller_from_path(fc, path);
2981 if (!controller)
bc70ba9b
CB
2982 return errno == ENOENT ? -EPERM : -errno;
2983
237e200e
SH
2984 cgroup = find_cgroup_in_path(path);
2985 if (!cgroup)
2986 /* this is just /cgroup/controller */
bc70ba9b 2987 return -EPERM;
237e200e
SH
2988
2989 get_cgdir_and_path(cgroup, &cgdir, &last);
2990
2991 if (!last) {
2992 path1 = "/";
2993 path2 = cgdir;
2994 } else {
2995 path1 = cgdir;
2996 path2 = last;
2997 }
2998
2999 if (is_child_cgroup(controller, path1, path2)) {
3000 // get uid, gid, from '/tasks' file and make up a mode
3001 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3002 k = cgfs_get_key(controller, cgroup, "tasks");
3003
3004 } else
3005 k = cgfs_get_key(controller, path1, path2);
3006
3007 if (!k) {
3008 ret = -EINVAL;
3009 goto out;
3010 }
3011
3012 /*
3013 * This being a fuse request, the uid and gid must be valid
3014 * in the caller's namespace. So we can just check to make
3015 * sure that the caller is root in his uid, and privileged
3016 * over the file's current owner.
3017 */
3018 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3019 ret = -EPERM;
3020 goto out;
3021 }
3022
3023 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3024 ret = -EINVAL;
3025 goto out;
3026 }
3027
3028 ret = 0;
3029out:
3030 free_key(k);
3031 free(cgdir);
3032 return ret;
3033}
3034
3035int cg_mkdir(const char *path, mode_t mode)
3036{
3037 struct fuse_context *fc = fuse_get_context();
3038 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3039 const char *cgroup;
3040 int ret;
3041
3042 if (!fc)
3043 return -EIO;
3044
237e200e
SH
3045 controller = pick_controller_from_path(fc, path);
3046 if (!controller)
2f7036d0 3047 return errno == ENOENT ? -EPERM : -errno;
237e200e
SH
3048
3049 cgroup = find_cgroup_in_path(path);
3050 if (!cgroup)
bc70ba9b 3051 return -errno;
237e200e
SH
3052
3053 get_cgdir_and_path(cgroup, &cgdir, &last);
3054 if (!last)
3055 path1 = "/";
3056 else
3057 path1 = cgdir;
3058
3059 pid_t initpid = lookup_initpid_in_store(fc->pid);
3060 if (initpid <= 0)
3061 initpid = fc->pid;
3062 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3063 if (!next)
3064 ret = -EINVAL;
3065 else if (last && strcmp(next, last) == 0)
3066 ret = -EEXIST;
3067 else
2f7036d0 3068 ret = -EPERM;
237e200e
SH
3069 goto out;
3070 }
3071
3072 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3073 ret = -EACCES;
3074 goto out;
3075 }
3076 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3077 ret = -EACCES;
3078 goto out;
3079 }
3080
3081 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3082
3083out:
3084 free(cgdir);
3085 free(next);
3086 return ret;
3087}
3088
3089int cg_rmdir(const char *path)
3090{
3091 struct fuse_context *fc = fuse_get_context();
3092 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3093 const char *cgroup;
3094 int ret;
3095
3096 if (!fc)
3097 return -EIO;
3098
3099 controller = pick_controller_from_path(fc, path);
e254948f
CB
3100 if (!controller) /* Someone's trying to delete "/cgroup". */
3101 return -EPERM;
237e200e
SH
3102
3103 cgroup = find_cgroup_in_path(path);
e254948f
CB
3104 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3105 return -EPERM;
237e200e
SH
3106
3107 get_cgdir_and_path(cgroup, &cgdir, &last);
3108 if (!last) {
e254948f
CB
3109 /* Someone's trying to delete a cgroup on the same level as the
3110 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3111 * rmdir "/cgroup/blkio/init.slice".
3112 */
3113 ret = -EPERM;
237e200e
SH
3114 goto out;
3115 }
3116
3117 pid_t initpid = lookup_initpid_in_store(fc->pid);
3118 if (initpid <= 0)
3119 initpid = fc->pid;
3120 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
de77249b 3121 if (!last || (next && (strcmp(next, last) == 0)))
237e200e
SH
3122 ret = -EBUSY;
3123 else
3124 ret = -ENOENT;
3125 goto out;
3126 }
3127
3128 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3129 ret = -EACCES;
3130 goto out;
3131 }
3132 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3133 ret = -EACCES;
3134 goto out;
3135 }
3136
3137 if (!cgfs_remove(controller, cgroup)) {
3138 ret = -EINVAL;
3139 goto out;
3140 }
3141
3142 ret = 0;
3143
3144out:
3145 free(cgdir);
3146 free(next);
3147 return ret;
3148}
3149
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3156
c6095b08
SH
/*
 * parse_memstat - pick selected "total_*" counters out of @memstat.
 *
 * @memstat is scanned line by line. For each line starting with one of
 * the known keys, the number following the key is parsed into the matching
 * output and then divided by 1024. Outputs whose key never appears are
 * left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable)
{
	const struct {
		const char *key;
		unsigned long *dst;
	} fields[] = {
		{ "total_cache",         cached },
		{ "total_active_anon",   active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *eol;
	size_t i;

	while (*memstat) {
		/* only the first matching key is applied to a line */
		for (i = 0; i < nfields; i++) {
			if (!startswith(memstat, fields[i].key))
				continue;
			sscanf(memstat + strlen(fields[i].key), "%lu", fields[i].dst);
			*fields[i].dst /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3190
/*
 * get_blkio_io_value - find the line in @str that starts with
 * "<major>:<minor> <iotype>" and parse the number after it into @v.
 * @v is set to 0 when no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *eol;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (; *str; str = eol + 1) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
	}
}
3213
3214static int read_file(const char *path, char *buf, size_t size,
3215 struct file_info *d)
3216{
3217 size_t linelen = 0, total_len = 0, rv = 0;
3218 char *line = NULL;
3219 char *cache = d->buf;
3220 size_t cache_size = d->buflen;
3221 FILE *f = fopen(path, "r");
3222 if (!f)
3223 return 0;
3224
3225 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3226 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3227 if (l < 0) {
3228 perror("Error writing to cache");
3229 rv = 0;
3230 goto err;
3231 }
3232 if (l >= cache_size) {
b8defc3d 3233 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3234 rv = 0;
3235 goto err;
3236 }
3237 cache += l;
3238 cache_size -= l;
3239 total_len += l;
3240 }
3241
3242 d->size = total_len;
a262ddb7
CB
3243 if (total_len > size)
3244 total_len = size;
237e200e
SH
3245
3246 /* read from off 0 */
3247 memcpy(buf, d->buf, total_len);
3248 rv = total_len;
3249 err:
3250 fclose(f);
3251 free(line);
3252 return rv;
3253}
3254
3255/*
3256 * FUSE ops for /proc
3257 */
3258
/*
 * get_memlimit - read @file of the "memory" controller for @cgroup and
 * return its contents parsed as a base-10 unsigned long, or
 * (unsigned long)-1 if the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *raw = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, file, &raw))
		limit = strtoul(raw, NULL, 10);
	else
		limit = -1;

	free(raw);
	return limit;
}
3271
/*
 * get_min_memlimit - return the smallest value of @file found for @cgroup
 * and each of its ancestors up to "/". Ancestors whose value cannot be
 * read (get_memlimit() returns -1) are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *walk = strdupa(cgroup);
	unsigned long lowest = get_memlimit(walk, file);

	while (strcmp(walk, "/") != 0) {
		unsigned long cur;

		walk = dirname(walk);
		cur = get_memlimit(walk, file);
		if (cur != -1 && cur < lowest)
			lowest = cur;
	}

	return lowest;
}
3288
3289static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3290 struct fuse_file_info *fi)
3291{
3292 struct fuse_context *fc = fuse_get_context();
3293 struct file_info *d = (struct file_info *)fi->fh;
3294 char *cg;
3295 char *memusage_str = NULL, *memstat_str = NULL,
018246ff 3296 *memswlimit_str = NULL, *memswusage_str = NULL;
237e200e 3297 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08 3298 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
594a10e6
WB
3299 active_file = 0, inactive_file = 0, unevictable = 0,
3300 hostswtotal = 0;
237e200e
SH
3301 char *line = NULL;
3302 size_t linelen = 0, total_len = 0, rv = 0;
3303 char *cache = d->buf;
3304 size_t cache_size = d->buflen;
3305 FILE *f = NULL;
3306
3307 if (offset){
3308 if (offset > d->size)
3309 return -EINVAL;
3310 if (!d->cached)
3311 return 0;
3312 int left = d->size - offset;
3313 total_len = left > size ? size: left;
3314 memcpy(buf, cache + offset, total_len);
3315 return total_len;
3316 }
3317
3318 pid_t initpid = lookup_initpid_in_store(fc->pid);
3319 if (initpid <= 0)
3320 initpid = fc->pid;
3321 cg = get_pid_cgroup(initpid, "memory");
3322 if (!cg)
3323 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3324 prune_init_slice(cg);
237e200e 3325
018246ff 3326 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
237e200e
SH
3327 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3328 goto err;
3329 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3330 goto err;
3331
3332 // Following values are allowed to fail, because swapaccount might be turned
3333 // off for current kernel
3334 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3335 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3336 {
018246ff 3337 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
237e200e
SH
3338 memswusage = strtoul(memswusage_str, NULL, 10);
3339
237e200e
SH
3340 memswlimit = memswlimit / 1024;
3341 memswusage = memswusage / 1024;
3342 }
3343
3344 memusage = strtoul(memusage_str, NULL, 10);
3345 memlimit /= 1024;
3346 memusage /= 1024;
3347
c6095b08
SH
3348 parse_memstat(memstat_str, &cached, &active_anon,
3349 &inactive_anon, &active_file, &inactive_file,
3350 &unevictable);
237e200e
SH
3351
3352 f = fopen("/proc/meminfo", "r");
3353 if (!f)
3354 goto err;
3355
3356 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3357 ssize_t l;
237e200e
SH
3358 char *printme, lbuf[100];
3359
3360 memset(lbuf, 0, 100);
3361 if (startswith(line, "MemTotal:")) {
594a10e6 3362 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3363 if (hosttotal < memlimit)
3364 memlimit = hosttotal;
3365 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3366 printme = lbuf;
3367 } else if (startswith(line, "MemFree:")) {
3368 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3369 printme = lbuf;
3370 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3371 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e
SH
3372 printme = lbuf;
3373 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
594a10e6 3374 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3375 if (hostswtotal < memswlimit)
3376 memswlimit = hostswtotal;
3377 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e
SH
3378 printme = lbuf;
3379 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
4127e51b 3380 unsigned long swaptotal = memswlimit,
b4665ce0
SH
3381 swapusage = memswusage - memusage,
3382 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3383 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3384 printme = lbuf;
da35d72a
SH
3385 } else if (startswith(line, "Slab:")) {
3386 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3387 printme = lbuf;
237e200e
SH
3388 } else if (startswith(line, "Buffers:")) {
3389 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3390 printme = lbuf;
3391 } else if (startswith(line, "Cached:")) {
3392 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3393 printme = lbuf;
3394 } else if (startswith(line, "SwapCached:")) {
3395 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3396 printme = lbuf;
2f306ad3 3397 } else if (startswith(line, "Active:")) {
c6095b08
SH
3398 snprintf(lbuf, 100, "Active: %8lu kB\n",
3399 active_anon + active_file);
3400 printme = lbuf;
2f306ad3 3401 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3402 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3403 inactive_anon + inactive_file);
3404 printme = lbuf;
3405 } else if (startswith(line, "Active(anon)")) {
3406 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3407 printme = lbuf;
3408 } else if (startswith(line, "Inactive(anon)")) {
3409 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3410 printme = lbuf;
3411 } else if (startswith(line, "Active(file)")) {
3412 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3413 printme = lbuf;
3414 } else if (startswith(line, "Inactive(file)")) {
3415 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3416 printme = lbuf;
3417 } else if (startswith(line, "Unevictable")) {
3418 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3419 printme = lbuf;
3420 } else if (startswith(line, "SReclaimable")) {
3421 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3422 printme = lbuf;
3423 } else if (startswith(line, "SUnreclaim")) {
3424 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3425 printme = lbuf;
237e200e
SH
3426 } else
3427 printme = line;
3428
3429 l = snprintf(cache, cache_size, "%s", printme);
3430 if (l < 0) {
3431 perror("Error writing to cache");
3432 rv = 0;
3433 goto err;
3434
3435 }
3436 if (l >= cache_size) {
b8defc3d 3437 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3438 rv = 0;
3439 goto err;
3440 }
3441
3442 cache += l;
3443 cache_size -= l;
3444 total_len += l;
3445 }
3446
3447 d->cached = 1;
3448 d->size = total_len;
3449 if (total_len > size ) total_len = size;
3450 memcpy(buf, d->buf, total_len);
3451
3452 rv = total_len;
3453err:
3454 if (f)
3455 fclose(f);
3456 free(line);
3457 free(cg);
3458 free(memusage_str);
3459 free(memswlimit_str);
3460 free(memswusage_str);
3461 free(memstat_str);
237e200e
SH
3462 return rv;
3463}
3464
/*
 * Fetch the "cpuset.cpus" value of the cpuset cgroup @cg.
 *
 * Returns the string produced by cgfs_get_value(), which the caller must
 * free, or NULL when the value could not be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3477
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Decide whether @line is a "processor : N" line whose CPU number N is
 * accepted by cpu_in_cpuset() for @cpuset.
 *
 * Lines that do not parse as a processor line yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3488
/*
 * Tell whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. whether a CPU number can be parsed after the "processor :" prefix.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3500
3501static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3502 struct fuse_file_info *fi)
3503{
3504 struct fuse_context *fc = fuse_get_context();
3505 struct file_info *d = (struct file_info *)fi->fh;
3506 char *cg;
3507 char *cpuset = NULL;
3508 char *line = NULL;
3509 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79
SH
3510 bool am_printing = false, firstline = true, is_s390x = false;
3511 int curcpu = -1, cpu;
237e200e
SH
3512 char *cache = d->buf;
3513 size_t cache_size = d->buflen;
3514 FILE *f = NULL;
3515
3516 if (offset){
3517 if (offset > d->size)
3518 return -EINVAL;
3519 if (!d->cached)
3520 return 0;
3521 int left = d->size - offset;
3522 total_len = left > size ? size: left;
3523 memcpy(buf, cache + offset, total_len);
3524 return total_len;
3525 }
3526
3527 pid_t initpid = lookup_initpid_in_store(fc->pid);
3528 if (initpid <= 0)
3529 initpid = fc->pid;
3530 cg = get_pid_cgroup(initpid, "cpuset");
3531 if (!cg)
3532 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3533 prune_init_slice(cg);
237e200e
SH
3534
3535 cpuset = get_cpuset(cg);
3536 if (!cpuset)
3537 goto err;
3538
3539 f = fopen("/proc/cpuinfo", "r");
3540 if (!f)
3541 goto err;
3542
3543 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3544 ssize_t l;
f676eb79
SH
3545 if (firstline) {
3546 firstline = false;
3547 if (strstr(line, "IBM/S390") != NULL) {
3548 is_s390x = true;
3549 am_printing = true;
5ed9d4e2 3550 continue;
f676eb79
SH
3551 }
3552 }
5ed9d4e2
SH
3553 if (strncmp(line, "# processors:", 12) == 0)
3554 continue;
237e200e
SH
3555 if (is_processor_line(line)) {
3556 am_printing = cpuline_in_cpuset(line, cpuset);
3557 if (am_printing) {
3558 curcpu ++;
3559 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3560 if (l < 0) {
3561 perror("Error writing to cache");
3562 rv = 0;
3563 goto err;
3564 }
3565 if (l >= cache_size) {
b8defc3d 3566 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3567 rv = 0;
3568 goto err;
3569 }
3570 cache += l;
3571 cache_size -= l;
3572 total_len += l;
3573 }
3574 continue;
f676eb79
SH
3575 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3576 char *p;
3577 if (!cpu_in_cpuset(cpu, cpuset))
3578 continue;
3579 curcpu ++;
3580 p = strchr(line, ':');
3581 if (!p || !*p)
3582 goto err;
3583 p++;
5ed9d4e2 3584 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3585 if (l < 0) {
3586 perror("Error writing to cache");
3587 rv = 0;
3588 goto err;
3589 }
3590 if (l >= cache_size) {
b8defc3d 3591 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
f676eb79
SH
3592 rv = 0;
3593 goto err;
3594 }
3595 cache += l;
3596 cache_size -= l;
3597 total_len += l;
3598 continue;
3599
237e200e
SH
3600 }
3601 if (am_printing) {
3602 l = snprintf(cache, cache_size, "%s", line);
3603 if (l < 0) {
3604 perror("Error writing to cache");
3605 rv = 0;
3606 goto err;
3607 }
3608 if (l >= cache_size) {
b8defc3d 3609 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3610 rv = 0;
3611 goto err;
3612 }
3613 cache += l;
3614 cache_size -= l;
3615 total_len += l;
3616 }
3617 }
3618
5ed9d4e2
SH
3619 if (is_s390x) {
3620 char *origcache = d->buf;
a262ddb7 3621 ssize_t l;
5ed9d4e2
SH
3622 do {
3623 d->buf = malloc(d->buflen);
3624 } while (!d->buf);
3625 cache = d->buf;
3626 cache_size = d->buflen;
3627 total_len = 0;
3628 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3629 if (l < 0 || l >= cache_size) {
3630 free(origcache);
3631 goto err;
3632 }
3633 cache_size -= l;
3634 cache += l;
3635 total_len += l;
3636 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3637 if (l < 0 || l >= cache_size) {
3638 free(origcache);
3639 goto err;
3640 }
3641 cache_size -= l;
3642 cache += l;
3643 total_len += l;
3644 l = snprintf(cache, cache_size, "%s", origcache);
3645 free(origcache);
3646 if (l < 0 || l >= cache_size)
3647 goto err;
3648 total_len += l;
3649 }
3650
237e200e
SH
3651 d->cached = 1;
3652 d->size = total_len;
3653 if (total_len > size ) total_len = size;
3654
3655 /* read from off 0 */
3656 memcpy(buf, d->buf, total_len);
3657 rv = total_len;
3658err:
3659 if (f)
3660 fclose(f);
3661 free(line);
3662 free(cpuset);
3663 free(cg);
3664 return rv;
3665}
3666
0ecddf02 3667static uint64_t get_reaper_start_time(pid_t pid)
9ac264cf 3668{
9ac264cf 3669 int ret;
0ecddf02
CB
3670 FILE *f;
3671 uint64_t starttime;
3672 /* strlen("/proc/") = 6
3673 * +
3674 * LXCFS_NUMSTRLEN64
3675 * +
3676 * strlen("/stat") = 5
3677 * +
3678 * \0 = 1
3679 * */
3680#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3681 char path[__PROC_PID_STAT_LEN];
9ac264cf
JB
3682 pid_t qpid;
3683
3684 qpid = lookup_initpid_in_store(pid);
0ecddf02
CB
3685 if (qpid <= 0) {
3686 /* Caller can check for EINVAL on 0. */
3687 errno = EINVAL;
9ac264cf 3688 return 0;
0ecddf02 3689 }
9ac264cf 3690
0ecddf02
CB
3691 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3692 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3693 /* Caller can check for EINVAL on 0. */
3694 errno = EINVAL;
9ac264cf 3695 return 0;
0ecddf02 3696 }
9ac264cf 3697
0ecddf02
CB
3698 f = fopen(path, "r");
3699 if (!f) {
3700 /* Caller can check for EINVAL on 0. */
3701 errno = EINVAL;
9ac264cf 3702 return 0;
0ecddf02 3703 }
9ac264cf 3704
0ecddf02
CB
3705 /* Note that the *scanf() argument supression requires that length
3706 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3707 * at us. It's like telling someone you're not married and then asking
3708 * if you can bring your wife to the party.
3709 */
3710 ret = fscanf(f, "%*d " /* (1) pid %d */
3711 "%*s " /* (2) comm %s */
3712 "%*c " /* (3) state %c */
3713 "%*d " /* (4) ppid %d */
3714 "%*d " /* (5) pgrp %d */
3715 "%*d " /* (6) session %d */
3716 "%*d " /* (7) tty_nr %d */
3717 "%*d " /* (8) tpgid %d */
3718 "%*u " /* (9) flags %u */
3719 "%*u " /* (10) minflt %lu */
3720 "%*u " /* (11) cminflt %lu */
3721 "%*u " /* (12) majflt %lu */
3722 "%*u " /* (13) cmajflt %lu */
3723 "%*u " /* (14) utime %lu */
3724 "%*u " /* (15) stime %lu */
3725 "%*d " /* (16) cutime %ld */
3726 "%*d " /* (17) cstime %ld */
3727 "%*d " /* (18) priority %ld */
3728 "%*d " /* (19) nice %ld */
3729 "%*d " /* (20) num_threads %ld */
3730 "%*d " /* (21) itrealvalue %ld */
3731 "%" PRIu64, /* (22) starttime %llu */
3732 &starttime);
3733 if (ret != 1) {
3734 fclose(f);
3735 /* Caller can check for EINVAL on 0. */
3736 errno = EINVAL;
3737 return 0;
3738 }
3739
3740 fclose(f);
3741
3742 errno = 0;
3743 return starttime;
3744}
3745
/*
 * Convert the value returned by get_reaper_start_time() for @pid from clock
 * ticks to whole seconds using sysconf(_SC_CLK_TCK).
 *
 * Returns 0 when the start time cannot be read or the tick rate is not a
 * positive number.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* Reject every non-positive result: 0 would divide by zero and a
	 * negative value would be converted to a huge unsigned divisor.
	 */
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3767
3768static uint64_t get_reaper_age(pid_t pid)
3769{
3770 uint64_t procstart, uptime, procage;
3771
3772 /* We need to substract the time the process has started since system
3773 * boot minus the time when the system has started to get the actual
3774 * reaper age.
3775 */
3776 procstart = get_reaper_start_time_in_sec(pid);
3777 procage = procstart;
3778 if (procstart > 0) {
3779 int ret;
3780 struct timespec spec;
3781
3782 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3783 if (ret < 0)
3784 return 0;
3785 /* We could make this more precise here by using the tv_nsec
3786 * field in the timespec struct and convert it to milliseconds
3787 * and then create a double for the seconds and milliseconds but
3788 * that seems more work than it is worth.
3789 */
3790 uptime = spec.tv_sec;
3791 procage = uptime - procstart;
3792 }
3793
3794 return procage;
3795}
3796
f34de69a 3797#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
3798static int proc_stat_read(char *buf, size_t size, off_t offset,
3799 struct fuse_file_info *fi)
3800{
3801 struct fuse_context *fc = fuse_get_context();
3802 struct file_info *d = (struct file_info *)fi->fh;
3803 char *cg;
3804 char *cpuset = NULL;
3805 char *line = NULL;
3806 size_t linelen = 0, total_len = 0, rv = 0;
3807 int curcpu = -1; /* cpu numbering starts at 0 */
7144f069 3808 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
237e200e 3809 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
7144f069 3810 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
237e200e
SH
3811 char cpuall[CPUALL_MAX_SIZE];
3812 /* reserve for cpu all */
3813 char *cache = d->buf + CPUALL_MAX_SIZE;
3814 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3815 FILE *f = NULL;
3816
3817 if (offset){
3818 if (offset > d->size)
3819 return -EINVAL;
3820 if (!d->cached)
3821 return 0;
3822 int left = d->size - offset;
3823 total_len = left > size ? size: left;
3824 memcpy(buf, d->buf + offset, total_len);
3825 return total_len;
3826 }
3827
3828 pid_t initpid = lookup_initpid_in_store(fc->pid);
3829 if (initpid <= 0)
3830 initpid = fc->pid;
3831 cg = get_pid_cgroup(initpid, "cpuset");
3832 if (!cg)
3833 return read_file("/proc/stat", buf, size, d);
6d2f6996 3834 prune_init_slice(cg);
237e200e
SH
3835
3836 cpuset = get_cpuset(cg);
3837 if (!cpuset)
3838 goto err;
3839
3840 f = fopen("/proc/stat", "r");
3841 if (!f)
3842 goto err;
3843
3844 //skip first line
3845 if (getline(&line, &linelen, f) < 0) {
b8defc3d 3846 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
237e200e
SH
3847 goto err;
3848 }
3849
3850 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3851 ssize_t l;
237e200e
SH
3852 int cpu;
3853 char cpu_char[10]; /* That's a lot of cores */
3854 char *c;
3855
b4665ce0
SH
3856 if (strlen(line) == 0)
3857 continue;
237e200e
SH
3858 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3859 /* not a ^cpuN line containing a number N, just print it */
9502bae2 3860 l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3861 if (l < 0) {
3862 perror("Error writing to cache");
3863 rv = 0;
3864 goto err;
3865 }
3866 if (l >= cache_size) {
b8defc3d 3867 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3868 rv = 0;
3869 goto err;
3870 }
3871 cache += l;
3872 cache_size -= l;
3873 total_len += l;
3874 continue;
3875 }
3876
3877 if (sscanf(cpu_char, "%d", &cpu) != 1)
3878 continue;
3879 if (!cpu_in_cpuset(cpu, cpuset))
3880 continue;
3881 curcpu ++;
3882
3883 c = strchr(line, ' ');
3884 if (!c)
3885 continue;
3886 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3887 if (l < 0) {
3888 perror("Error writing to cache");
3889 rv = 0;
3890 goto err;
3891
3892 }
3893 if (l >= cache_size) {
b8defc3d 3894 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3895 rv = 0;
3896 goto err;
3897 }
3898
3899 cache += l;
3900 cache_size -= l;
3901 total_len += l;
3902
7144f069
CB
3903 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3904 &user,
3905 &nice,
3906 &system,
3907 &idle,
3908 &iowait,
3909 &irq,
3910 &softirq,
3911 &steal,
3912 &guest,
3913 &guest_nice) != 10)
237e200e
SH
3914 continue;
3915 user_sum += user;
3916 nice_sum += nice;
3917 system_sum += system;
3918 idle_sum += idle;
3919 iowait_sum += iowait;
3920 irq_sum += irq;
3921 softirq_sum += softirq;
3922 steal_sum += steal;
3923 guest_sum += guest;
7144f069 3924 guest_nice_sum += guest_nice;
237e200e
SH
3925 }
3926
3927 cache = d->buf;
3928
7144f069
CB
3929 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3930 user_sum,
3931 nice_sum,
3932 system_sum,
3933 idle_sum,
3934 iowait_sum,
3935 irq_sum,
3936 softirq_sum,
3937 steal_sum,
3938 guest_sum,
3939 guest_nice_sum);
3940 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
237e200e
SH
3941 memcpy(cache, cpuall, cpuall_len);
3942 cache += cpuall_len;
7144f069 3943 } else {
237e200e 3944 /* shouldn't happen */
b8defc3d 3945 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
237e200e
SH
3946 cpuall_len = 0;
3947 }
3948
3949 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3950 total_len += cpuall_len;
3951 d->cached = 1;
3952 d->size = total_len;
7144f069
CB
3953 if (total_len > size)
3954 total_len = size;
237e200e
SH
3955
3956 memcpy(buf, d->buf, total_len);
3957 rv = total_len;
3958
3959err:
3960 if (f)
3961 fclose(f);
3962 free(line);
3963 free(cpuset);
3964 free(cg);
3965 return rv;
3966}
3967
0ecddf02
CB
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 *
 * The raw cpuacct.usage value is divided by 1000000000 before being
 * returned. Returns 0 when the init pid, the cgroup or the usage value
 * cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;
	pid_t initpid = lookup_initpid_in_store(task);

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (cgroup) {
		prune_init_slice(cgroup);
		if (cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
			usage = strtoul(usage_str, NULL, 10) / 1000000000;
	}

	free(cgroup);
	free(usage_str);
	return usage;
}
3998
#if RELOADTEST
/*
 * Create (or truncate) the marker file /tmp/lxcfs-iwashere.
 * Only compiled when RELOADTEST is set.
 */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd < 0)
		return;
	close(fd);
}
#endif
4009
4010/*
4011 * We read /proc/uptime and reuse its second field.
4012 * For the first field, we use the mtime for the reaper for
4013 * the calling pid as returned by getreaperage
4014 */
4015static int proc_uptime_read(char *buf, size_t size, off_t offset,
4016 struct fuse_file_info *fi)
4017{
4018 struct fuse_context *fc = fuse_get_context();
4019 struct file_info *d = (struct file_info *)fi->fh;
0ecddf02 4020 unsigned long int busytime = get_reaper_busy(fc->pid);
237e200e 4021 char *cache = d->buf;
a262ddb7 4022 ssize_t total_len = 0;
0ecddf02 4023 uint64_t idletime, reaperage;
237e200e
SH
4024
4025#if RELOADTEST
4026 iwashere();
4027#endif
4028
4029 if (offset){
237e200e
SH
4030 if (!d->cached)
4031 return 0;
bbdf646b
BM
4032 if (offset > d->size)
4033 return -EINVAL;
237e200e
SH
4034 int left = d->size - offset;
4035 total_len = left > size ? size: left;
4036 memcpy(buf, cache + offset, total_len);
4037 return total_len;
4038 }
4039
0ecddf02
CB
4040 reaperage = get_reaper_age(fc->pid);
4041 /* To understand why this is done, please read the comment to the
4042 * get_reaper_busy() function.
4043 */
4044 idletime = reaperage;
4045 if (reaperage >= busytime)
4046 idletime = reaperage - busytime;
237e200e 4047
bbdf646b
BM
4048 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4049 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 4050 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
4051 return 0;
4052 }
4053
4054 d->size = (int)total_len;
4055 d->cached = 1;
4056
4057 if (total_len > size) total_len = size;
4058
4059 memcpy(buf, d->buf, total_len);
4060 return total_len;
4061}
4062
4063static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4064 struct fuse_file_info *fi)
4065{
4066 char dev_name[72];
4067 struct fuse_context *fc = fuse_get_context();
4068 struct file_info *d = (struct file_info *)fi->fh;
4069 char *cg;
4070 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4071 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4072 unsigned long read = 0, write = 0;
4073 unsigned long read_merged = 0, write_merged = 0;
4074 unsigned long read_sectors = 0, write_sectors = 0;
4075 unsigned long read_ticks = 0, write_ticks = 0;
4076 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4077 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4078 char *cache = d->buf;
4079 size_t cache_size = d->buflen;
4080 char *line = NULL;
4081 size_t linelen = 0, total_len = 0, rv = 0;
4082 unsigned int major = 0, minor = 0;
4083 int i = 0;
4084 FILE *f = NULL;
4085
4086 if (offset){
4087 if (offset > d->size)
4088 return -EINVAL;
4089 if (!d->cached)
4090 return 0;
4091 int left = d->size - offset;
4092 total_len = left > size ? size: left;
4093 memcpy(buf, cache + offset, total_len);
4094 return total_len;
4095 }
4096
4097 pid_t initpid = lookup_initpid_in_store(fc->pid);
4098 if (initpid <= 0)
4099 initpid = fc->pid;
4100 cg = get_pid_cgroup(initpid, "blkio");
4101 if (!cg)
4102 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 4103 prune_init_slice(cg);
237e200e 4104
2209fe50 4105 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 4106 goto err;
2209fe50 4107 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 4108 goto err;
2209fe50 4109 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 4110 goto err;
2209fe50 4111 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 4112 goto err;
2209fe50 4113 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
4114 goto err;
4115
4116
4117 f = fopen("/proc/diskstats", "r");
4118 if (!f)
4119 goto err;
4120
4121 while (getline(&line, &linelen, f) != -1) {
a262ddb7 4122 ssize_t l;
2209fe50 4123 char lbuf[256];
237e200e
SH
4124
4125 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 4126 if (i != 3)
237e200e 4127 continue;
2209fe50
SH
4128
4129 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4130 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4131 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4132 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4133 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4134 read_sectors = read_sectors/512;
4135 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4136 write_sectors = write_sectors/512;
4137
4138 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4139 rd_svctm = rd_svctm/1000000;
4140 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4141 rd_wait = rd_wait/1000000;
4142 read_ticks = rd_svctm + rd_wait;
4143
4144 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4145 wr_svctm = wr_svctm/1000000;
4146 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4147 wr_wait = wr_wait/1000000;
4148 write_ticks = wr_svctm + wr_wait;
4149
4150 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4151 tot_ticks = tot_ticks/1000000;
237e200e
SH
4152
4153 memset(lbuf, 0, 256);
2db31eb6
SH
4154 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4155 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4156 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4157 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4158 else
4159 continue;
237e200e 4160
2209fe50 4161 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
4162 if (l < 0) {
4163 perror("Error writing to fuse buf");
4164 rv = 0;
4165 goto err;
4166 }
4167 if (l >= cache_size) {
b8defc3d 4168 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
4169 rv = 0;
4170 goto err;
4171 }
4172 cache += l;
4173 cache_size -= l;
4174 total_len += l;
4175 }
4176
4177 d->cached = 1;
4178 d->size = total_len;
4179 if (total_len > size ) total_len = size;
4180 memcpy(buf, d->buf, total_len);
4181
4182 rv = total_len;
4183err:
4184 free(cg);
4185 if (f)
4186 fclose(f);
4187 free(line);
4188 free(io_serviced_str);
4189 free(io_merged_str);
4190 free(io_service_bytes_str);
4191 free(io_wait_time_str);
4192 free(io_service_time_str);
4193 return rv;
4194}
4195
70dcc12e
SH
4196static int proc_swaps_read(char *buf, size_t size, off_t offset,
4197 struct fuse_file_info *fi)
4198{
4199 struct fuse_context *fc = fuse_get_context();
4200 struct file_info *d = (struct file_info *)fi->fh;
4201 char *cg = NULL;
018246ff 4202 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 4203 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
4204 ssize_t total_len = 0, rv = 0;
4205 ssize_t l = 0;
70dcc12e
SH
4206 char *cache = d->buf;
4207
4208 if (offset) {
4209 if (offset > d->size)
4210 return -EINVAL;
4211 if (!d->cached)
4212 return 0;
4213 int left = d->size - offset;
4214 total_len = left > size ? size: left;
4215 memcpy(buf, cache + offset, total_len);
4216 return total_len;
4217 }
4218
4219 pid_t initpid = lookup_initpid_in_store(fc->pid);
4220 if (initpid <= 0)
4221 initpid = fc->pid;
4222 cg = get_pid_cgroup(initpid, "memory");
4223 if (!cg)
4224 return read_file("/proc/swaps", buf, size, d);
6d2f6996 4225 prune_init_slice(cg);
70dcc12e 4226
018246ff 4227 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
4228
4229 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4230 goto err;
4231
70dcc12e
SH
4232 memusage = strtoul(memusage_str, NULL, 10);
4233
4234 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4235 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4236
018246ff 4237 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
4238 memswusage = strtoul(memswusage_str, NULL, 10);
4239
70dcc12e
SH
4240 swap_total = (memswlimit - memlimit) / 1024;
4241 swap_free = (memswusage - memusage) / 1024;
4242 }
4243
4244 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4245
4246 /* When no mem + swap limit is specified or swapaccount=0*/
4247 if (!memswlimit) {
4248 char *line = NULL;
4249 size_t linelen = 0;
4250 FILE *f = fopen("/proc/meminfo", "r");
4251
4252 if (!f)
4253 goto err;
4254
4255 while (getline(&line, &linelen, f) != -1) {
4256 if (startswith(line, "SwapTotal:")) {
4257 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4258 } else if (startswith(line, "SwapFree:")) {
4259 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4260 }
4261 }
4262
4263 free(line);
4264 fclose(f);
4265 }
4266
4267 if (swap_total > 0) {
a262ddb7
CB
4268 l = snprintf(d->buf + total_len, d->size - total_len,
4269 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4270 swap_total, swap_free);
4271 total_len += l;
70dcc12e
SH
4272 }
4273
a262ddb7 4274 if (total_len < 0 || l < 0) {
70dcc12e
SH
4275 perror("Error writing to cache");
4276 rv = 0;
4277 goto err;
4278 }
4279
4280 d->cached = 1;
4281 d->size = (int)total_len;
4282
4283 if (total_len > size) total_len = size;
4284 memcpy(buf, d->buf, total_len);
4285 rv = total_len;
4286
4287err:
4288 free(cg);
4289 free(memswlimit_str);
4290 free(memlimit_str);
4291 free(memusage_str);
4292 free(memswusage_str);
70dcc12e
SH
4293 return rv;
4294}
6db4f7a3 4295/*
4296 * Find the process pid from cgroup path.
4297 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
4298 * @pid_buf : put pid to pid_buf.
4299 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
4300 * @depth : the depth of cgroup in container.
4301 * @sum : return the number of pid.
4302 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
4303 */
4304static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4305{
4306 DIR *dir;
4307 int fd;
4308 struct dirent *file;
4309 FILE *f = NULL;
4310 size_t linelen = 0;
4311 char *line = NULL;
4312 int pd;
4313 char *path_dir, *path;
4314 char **pid;
4315
4316 /* path = dpath + "/cgroup.procs" + /0 */
4317 do {
4318 path = malloc(strlen(dpath) + 20);
4319 } while (!path);
4320
4321 strcpy(path, dpath);
4322 fd = openat(cfd, path, O_RDONLY);
4323 if (fd < 0)
4324 goto out;
4325
4326 dir = fdopendir(fd);
4327 if (dir == NULL) {
4328 close(fd);
4329 goto out;
4330 }
4331
4332 while (((file = readdir(dir)) != NULL) && depth > 0) {
4333 if (strncmp(file->d_name, ".", 1) == 0)
4334 continue;
4335 if (strncmp(file->d_name, "..", 1) == 0)
4336 continue;
4337 if (file->d_type == DT_DIR) {
4338 /* path + '/' + d_name +/0 */
4339 do {
4340 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4341 } while (!path_dir);
4342 strcpy(path_dir, path);
4343 strcat(path_dir, "/");
4344 strcat(path_dir, file->d_name);
4345 pd = depth - 1;
4346 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4347 free(path_dir);
4348 }
4349 }
4350 closedir(dir);
4351
4352 strcat(path, "/cgroup.procs");
4353 fd = openat(cfd, path, O_RDONLY);
4354 if (fd < 0)
4355 goto out;
4356
4357 f = fdopen(fd, "r");
4358 if (!f) {
4359 close(fd);
4360 goto out;
4361 }
4362
4363 while (getline(&line, &linelen, f) != -1) {
4364 do {
4365 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4366 } while (!pid);
4367 *pid_buf = pid;
4368 do {
4369 *(*pid_buf + sum) = malloc(strlen(line) + 1);
4370 } while (*(*pid_buf + sum) == NULL);
4371 strcpy(*(*pid_buf + sum), line);
4372 sum++;
4373 }
4374 fclose(f);
4375out:
4376 free(path);
4377 return sum;
4378}
4379/*
4380 * calc_load calculates the load according to the following formula:
4381 * load1 = load0 * exp + active * (1 - exp)
4382 *
4383 * @load1: the new loadavg.
4384 * @load0: the former loadavg.
4385 * @active: the total number of running pid at this moment.
4386 * @exp: the fixed-point defined in the beginning.
4387 */
4388static unsigned long
4389calc_load(unsigned long load, unsigned long exp, unsigned long active)
4390{
4391 unsigned long newload;
4392
4393 active = active > 0 ? active * FIXED_1 : 0;
4394 newload = load * exp + active * (FIXED_1 - exp);
4395 if (active >= load)
4396 newload += FIXED_1 - 1;
4397
4398 return newload / FIXED_1;
4399}
4400
4401/*
4402 * Return 0 means that container p->cg is closed.
4403 * Return -1 means that error occurred in refresh.
4404 * Positive num equals the total number of pid.
4405 */
4406static int refresh_load(struct load_node *p, char *path)
4407{
4408 FILE *f = NULL;
4409 char **idbuf;
4410 char proc_path[256];
4411 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4412 char *line = NULL;
4413 size_t linelen = 0;
4414 int sum, length;
4415 DIR *dp;
4416 struct dirent *file;
4417
4418 do {
4419 idbuf = malloc(sizeof(char *));
4420 } while (!idbuf);
4421 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4422 /* normal exit */
4423 if (sum == 0)
4424 goto out;
4425
4426 for (i = 0; i < sum; i++) {
4427 /*clean up '\n' */
4428 length = strlen(idbuf[i])-1;
4429 idbuf[i][length] = '\0';
4430 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4431 if (ret < 0 || ret > 255) {
4432 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4433 i = sum;
4434 sum = -1;
4435 goto err_out;
4436 }
4437
4438 dp = opendir(proc_path);
4439 if (!dp) {
4440 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4441 continue;
4442 }
4443 while ((file = readdir(dp)) != NULL) {
4444 if (strncmp(file->d_name, ".", 1) == 0)
4445 continue;
4446 if (strncmp(file->d_name, "..", 1) == 0)
4447 continue;
4448 total_pid++;
4449 /* We make the biggest pid become last_pid.*/
4450 ret = atof(file->d_name);
4451 last_pid = (ret > last_pid) ? ret : last_pid;
4452
4453 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4454 if (ret < 0 || ret > 255) {
4455 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4456 i = sum;
4457 sum = -1;
4458 closedir(dp);
4459 goto err_out;
4460 }
4461 f = fopen(proc_path, "r");
4462 if (f != NULL) {
4463 while (getline(&line, &linelen, f) != -1) {
4464 /* Find State */
4465 if ((line[0] == 'S') && (line[1] == 't'))
4466 break;
4467 }
4468 if ((line[7] == 'R') || (line[7] == 'D'))
4469 run_pid++;
4470 fclose(f);
4471 }
4472 }
4473 closedir(dp);
4474 }
4475 /*Calculate the loadavg.*/
4476 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4477 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4478 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4479 p->run_pid = run_pid;
4480 p->total_pid = total_pid;
4481 p->last_pid = last_pid;
4482
4483 free(line);
4484err_out:
4485 for (; i > 0; i--)
4486 free(idbuf[i-1]);
4487out:
4488 free(idbuf);
4489 return sum;
4490}
4491/*
4492 * Traverse the hash table and update it.
4493 */
4494void *load_begin(void *arg)
4495{
4496
4497 char *path = NULL;
4498 int i, sum, length, ret;
4499 struct load_node *f;
4500 int first_node;
4501 clock_t time1, time2;
4502
4503 while (1) {
a83618e2
JS
4504 if (loadavg_stop == 1)
4505 return NULL;
4506
6db4f7a3 4507 time1 = clock();
4508 for (i = 0; i < LOAD_SIZE; i++) {
4509 pthread_mutex_lock(&load_hash[i].lock);
4510 if (load_hash[i].next == NULL) {
4511 pthread_mutex_unlock(&load_hash[i].lock);
4512 continue;
4513 }
4514 f = load_hash[i].next;
4515 first_node = 1;
4516 while (f) {
4517 length = strlen(f->cg) + 2;
4518 do {
4519 /* strlen(f->cg) + '.' or '' + \0 */
4520 path = malloc(length);
4521 } while (!path);
4522
4523 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4524 if (ret < 0 || ret > length - 1) {
4525 /* snprintf failed, ignore the node.*/
4526 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4527 goto out;
4528 }
4529 sum = refresh_load(f, path);
4530 if (sum == 0) {
4531 f = del_node(f, i);
4532 } else {
4533out: f = f->next;
4534 }
4535 free(path);
4536 /* load_hash[i].lock locks only on the first node.*/
4537 if (first_node == 1) {
4538 first_node = 0;
4539 pthread_mutex_unlock(&load_hash[i].lock);
4540 }
4541 }
4542 }
a83618e2
JS
4543
4544 if (loadavg_stop == 1)
4545 return NULL;
4546
6db4f7a3 4547 time2 = clock();
4548 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4549 }
4550}
4551
4552static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4553 struct fuse_file_info *fi)
4554{
4555 struct fuse_context *fc = fuse_get_context();
4556 struct file_info *d = (struct file_info *)fi->fh;
4557 pid_t initpid;
4558 char *cg;
4559 size_t total_len = 0;
4560 char *cache = d->buf;
4561 struct load_node *n;
4562 int hash;
4563 int cfd;
4564 unsigned long a, b, c;
4565
4566 if (offset) {
4567 if (offset > d->size)
4568 return -EINVAL;
4569 if (!d->cached)
4570 return 0;
4571 int left = d->size - offset;
4572 total_len = left > size ? size : left;
4573 memcpy(buf, cache + offset, total_len);
4574 return total_len;
4575 }
4576 if (!loadavg)
4577 return read_file("/proc/loadavg", buf, size, d);
4578
4579 initpid = lookup_initpid_in_store(fc->pid);
4580 if (initpid <= 0)
4581 initpid = fc->pid;
4582 cg = get_pid_cgroup(initpid, "cpu");
4583 if (!cg)
4584 return read_file("/proc/loadavg", buf, size, d);
4585
4586 prune_init_slice(cg);
4587 hash = calc_hash(cg);
4588 n = locate_node(cg, hash);
4589
4590 /* First time */
4591 if (n == NULL) {
4592 if (!find_mounted_controller("cpu", &cfd)) {
4593 /*
4594 * In locate_node() above, pthread_rwlock_unlock() isn't used
4595 * because delete is not allowed before read has ended.
4596 */
4597 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4598 return 0;
4599 }
4600 do {
4601 n = malloc(sizeof(struct load_node));
4602 } while (!n);
4603
4604 do {
4605 n->cg = malloc(strlen(cg)+1);
4606 } while (!n->cg);
4607 strcpy(n->cg, cg);
4608 n->avenrun[0] = 0;
4609 n->avenrun[1] = 0;
4610 n->avenrun[2] = 0;
4611 n->run_pid = 0;
4612 n->total_pid = 1;
4613 n->last_pid = initpid;
4614 n->cfd = cfd;
4615 insert_node(&n, hash);
4616 }
4617 a = n->avenrun[0] + (FIXED_1/200);
4618 b = n->avenrun[1] + (FIXED_1/200);
4619 c = n->avenrun[2] + (FIXED_1/200);
4620 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4621 LOAD_INT(a), LOAD_FRAC(a),
4622 LOAD_INT(b), LOAD_FRAC(b),
4623 LOAD_INT(c), LOAD_FRAC(c),
4624 n->run_pid, n->total_pid, n->last_pid);
4625 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4626 if (total_len < 0 || total_len >= d->buflen) {
4627 lxcfs_error("%s\n", "Failed to write to cache");
4628 return 0;
4629 }
4630 d->size = (int)total_len;
4631 d->cached = 1;
4632
4633 if (total_len > size)
4634 total_len = size;
4635 memcpy(buf, d->buf, total_len);
4636 return total_len;
4637}
4638/* Return a positive number on success, return 0 on failure.*/
4639pthread_t load_daemon(int load_use)
4640{
4641 int ret;
4642 pthread_t pid;
4643
4644 ret = init_load();
4645 if (ret == -1) {
4646 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4647 return 0;
4648 }
4649 ret = pthread_create(&pid, NULL, load_begin, NULL);
4650 if (ret != 0) {
4651 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4652 load_free();
4653 return 0;
4654 }
4655 /* use loadavg, here loadavg = 1*/
4656 loadavg = load_use;
4657 return pid;
4658}
70dcc12e 4659
a83618e2
JS
4660/* Returns 0 on success. */
4661int stop_load_daemon(pthread_t pid)
4662{
4663 int s;
4664
4665 /* Signal the thread to gracefully stop */
4666 loadavg_stop = 1;
4667
4668 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
4669 if (s != 0) {
4670 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4671 return -1;
4672 }
4673
4674 load_free();
4675 loadavg_stop = 0;
4676
4677 return 0;
4678}
4679
237e200e
SH
/*
 * Return the number of bytes that can be read from the file @which, obtained
 * by reading it line by line and summing the line lengths.
 * Returns 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t capacity = 0;
	ssize_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		ssize_t got = getline(&linebuf, &capacity, fp);

		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
4696
/*
 * Fill @sb for a path below the emulated /proc tree.
 *
 * "/proc" is reported as a directory (mode 0555, two links); each of the
 * emulated files is reported as an empty regular file (mode 0444, one link).
 * Owner and group are 0 and all three timestamps are set to the current time.
 *
 * Returns 0 on success, -EINVAL if the current time cannot be read and
 * -ENOENT for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec stamp;
	size_t k;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &stamp) < 0)
		return -EINVAL;

	sb->st_gid = 0;
	sb->st_uid = 0;
	sb->st_ctim = stamp;
	sb->st_mtim = stamp;
	sb->st_atim = stamp;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (k = 0; k < sizeof(regular_files) / sizeof(regular_files[0]); k++) {
		if (strcmp(path, regular_files[k]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4726
4727int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4728 struct fuse_file_info *fi)
4729{
d639f863
CB
4730 if (filler(buf, ".", NULL, 0) != 0 ||
4731 filler(buf, "..", NULL, 0) != 0 ||
4732 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4733 filler(buf, "meminfo", NULL, 0) != 0 ||
4734 filler(buf, "stat", NULL, 0) != 0 ||
4735 filler(buf, "uptime", NULL, 0) != 0 ||
4736 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 4737 filler(buf, "swaps", NULL, 0) != 0 ||
4738 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
4739 return -EINVAL;
4740 return 0;
4741}
4742
4743int proc_open(const char *path, struct fuse_file_info *fi)
4744{
4745 int type = -1;
4746 struct file_info *info;
4747
4748 if (strcmp(path, "/proc/meminfo") == 0)
4749 type = LXC_TYPE_PROC_MEMINFO;
4750 else if (strcmp(path, "/proc/cpuinfo") == 0)
4751 type = LXC_TYPE_PROC_CPUINFO;
4752 else if (strcmp(path, "/proc/uptime") == 0)
4753 type = LXC_TYPE_PROC_UPTIME;
4754 else if (strcmp(path, "/proc/stat") == 0)
4755 type = LXC_TYPE_PROC_STAT;
4756 else if (strcmp(path, "/proc/diskstats") == 0)
4757 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
4758 else if (strcmp(path, "/proc/swaps") == 0)
4759 type = LXC_TYPE_PROC_SWAPS;
46be8eed 4760 else if (strcmp(path, "/proc/loadavg") == 0)
4761 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
4762 if (type == -1)
4763 return -ENOENT;
4764
4765 info = malloc(sizeof(*info));
4766 if (!info)
4767 return -ENOMEM;
4768
4769 memset(info, 0, sizeof(*info));
4770 info->type = type;
4771
4772 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4773 do {
4774 info->buf = malloc(info->buflen);
4775 } while (!info->buf);
4776 memset(info->buf, 0, info->buflen);
4777 /* set actual size to buffer size */
4778 info->size = info->buflen;
4779
4780 fi->fh = (unsigned long)info;
4781 return 0;
4782}
4783
bddbb106
SH
/*
 * access(2) handler for the emulated /proc tree.
 *
 * "/proc" itself is accepted for any @mask as long as the real /proc is
 * readable. Everything else is read-only: a @mask requesting anything beyond
 * R_OK yields -EACCES, otherwise 0 is returned.
 */
int proc_access(const char *path, int mask)
{
	const int is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
4794
237e200e
SH
/*
 * Release handler for the emulated /proc files: hands @fi to
 * do_release_file_info() and always reports success. @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4800
4801int proc_read(const char *path, char *buf, size_t size, off_t offset,
4802 struct fuse_file_info *fi)
4803{
4804 struct file_info *f = (struct file_info *) fi->fh;
4805
4806 switch (f->type) {
4807 case LXC_TYPE_PROC_MEMINFO:
4808 return proc_meminfo_read(buf, size, offset, fi);
4809 case LXC_TYPE_PROC_CPUINFO:
4810 return proc_cpuinfo_read(buf, size, offset, fi);
4811 case LXC_TYPE_PROC_UPTIME:
4812 return proc_uptime_read(buf, size, offset, fi);
4813 case LXC_TYPE_PROC_STAT:
4814 return proc_stat_read(buf, size, offset, fi);
4815 case LXC_TYPE_PROC_DISKSTATS:
4816 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
4817 case LXC_TYPE_PROC_SWAPS:
4818 return proc_swaps_read(buf, size, offset, fi);
46be8eed 4819 case LXC_TYPE_PROC_LOADAVG:
4820 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
4821 default:
4822 return -EINVAL;
4823 }
4824}
4825
29a73c2f
CB
4826/*
4827 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
4828 */
4829
/*
 * Create @dir and all of its missing parent directories with mode @mode.
 *
 * The path is walked component by component; on every step the prefix that
 * ends right before the current component is created, and the walk ends once
 * no further component is found (at which point the prefix is the whole
 * path). Directories that already exist are not an error.
 *
 * Returns true on success, false if a prefix cannot be duplicated or created.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *whole = dir;
	const char *component = dir;	/* start of the current component */
	const char *rest = dir;		/* first char after that component */

	for (;;) {
		char *prefix;

		component = rest + strspn(rest, "/");
		rest = component + strcspn(component, "/");

		prefix = strndup(whole, component - whole);
		if (prefix == NULL)
			return false;

		if (mkdir(prefix, mode) != 0 && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				    prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);

		/* An empty component means the end of the path was reached. */
		if (rest == component)
			break;
	}

	return true;
}
4853
4854static bool umount_if_mounted(void)
4855{
4856 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 4857 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
4858 return false;
4859 }
4860 return true;
4861}
4862
2283e240
CB
/*
 * fs_type_magic is whatever type struct statfs uses for f_type on this
 * platform. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
4869
0a4dea41
CB
/*
 * Tell whether "/" is the kernel's initial rootfs.
 *
 * Scans /proc/self/mountinfo for an entry whose mount point (fifth field) is
 * "/" and whose remainder contains " - rootfs rootfs ". Looking at
 * fs/proc_namespace.c, the rootfs entry can be expected to contain exactly
 * that string, and as long as we have chrooted so that rootfs is not our
 * root, the rootfs entry should always be skipped in the mountinfo contents.
 *
 * Returns false when the file cannot be opened or no such entry exists.
 */
static bool is_on_ramfs(void)
{
	FILE *mounts;
	char *entry = NULL;
	size_t capacity = 0;
	bool on_ramfs = false;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (mounts == NULL)
		return false;

	while (!on_ramfs && getline(&entry, &capacity, mounts) != -1) {
		char *field = entry;
		char *field_end, *dash;
		int skipped;

		/* Advance to the blank in front of the mount point field. */
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (field == NULL)
			continue;

		field_end = strchr(field + 1, ' ');
		if (field_end == NULL)
			continue;
		*field_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* this is '/'. is it the ramfs? */
		dash = strchr(field_end + 1, '-');
		if (dash != NULL && strncmp(dash, "- rootfs rootfs ", 16) == 0)
			on_ramfs = true;
	}

	free(entry);
	fclose(mounts);
	return on_ramfs;
}
4912
cc309f33 4913static int pivot_enter()
0a4dea41 4914{
cc309f33
CB
4915 int ret = -1, oldroot = -1, newroot = -1;
4916
4917 oldroot = open("/", O_DIRECTORY | O_RDONLY);
4918 if (oldroot < 0) {
4919 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4920 return ret;
4921 }
4922
4923 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4924 if (newroot < 0) {
4925 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4926 goto err;
4927 }
4928
4929 /* change into new root fs */
4930 if (fchdir(newroot) < 0) {
4931 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4932 goto err;
4933 }
4934
0a4dea41
CB
4935 /* pivot_root into our new root fs */
4936 if (pivot_root(".", ".") < 0) {
4937 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 4938 goto err;
0a4dea41
CB
4939 }
4940
4941 /*
4942 * At this point the old-root is mounted on top of our new-root.
4943 * To unmounted it we must not be chdir'd into it, so escape back
4944 * to the old-root.
4945 */
4946 if (fchdir(oldroot) < 0) {
4947 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 4948 goto err;
0a4dea41
CB
4949 }
4950
4951 if (umount2(".", MNT_DETACH) < 0) {
4952 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 4953 goto err;
0a4dea41
CB
4954 }
4955
4956 if (fchdir(newroot) < 0) {
4957 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 4958 goto err;
0a4dea41
CB
4959 }
4960
cc309f33
CB
4961 ret = 0;
4962
4963err:
4964 if (oldroot > 0)
4965 close(oldroot);
4966 if (newroot > 0)
4967 close(newroot);
4968
4969 return ret;
0a4dea41
CB
4970}
4971
4972static int chroot_enter()
4973{
4974 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4975 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4976 return -1;
4977 }
4978
4979 if (chroot(".") < 0) {
4980 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4981 return -1;
4982 }
4983
4984 if (chdir("/") < 0) {
4985 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4986 return -1;
4987 }
4988
4989 return 0;
4990}
4991
/*
 * Enter the prepared root: via chroot_enter() when "/" is a ramfs, via
 * pivot_enter() otherwise.
 *
 * Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs rootfs;

	if (statfs("/", &rootfs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/*
	 * has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still
	 * consult the mountinfo file through is_on_ramfs().
	 */
	if (has_fs_type(&rootfs, RAMFS_MAGIC) || is_on_ramfs())
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
5014
5015/* Prepare our new clean root. */
0232cbac 5016static int permute_prepare(void)
29a73c2f
CB
5017{
5018 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 5019 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
5020 return -1;
5021 }
5022
5023 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 5024 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
5025 return -1;
5026 }
5027
5028 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 5029 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
5030 return -1;
5031 }
5032
5033 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 5034 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
5035 return -1;
5036 }
5037
5038 return 0;
5039}
5040
0232cbac
CB
/*
 * Switch to a fresh root: permute_prepare() sets the new root up and
 * permute_and_enter() enters it (chroot() on ramfs, pivot_root() in all
 * other cases). The second step only runs if the first one succeeded.
 *
 * Returns true only if both steps succeed.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
5054
a257a8ee
CB
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC.
 *
 * Returns the descriptor from open() (which is -1 on failure), or -1 if the
 * path could not be formatted.
 */
static int preserve_mnt_ns(int pid)
{
	/* Room for both fixed parts plus up to 21 characters for the pid. */
	char path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
	if (written < 0)
		return -1;
	if ((size_t)written >= sizeof(path))
		return -1;

	return open(path, O_RDONLY | O_CLOEXEC);
}
5067
0a4dea41 5068static bool cgfs_prepare_mounts(void)
29a73c2f
CB
5069{
5070 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 5071 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
5072 return false;
5073 }
480262c9 5074
29a73c2f 5075 if (!umount_if_mounted()) {
b8defc3d 5076 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
5077 return false;
5078 }
5079
5080 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 5081 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
5082 return false;
5083 }
5084
a257a8ee
CB
5085 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5086 if (cgroup_mount_ns_fd < 0) {
5087 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5088 return false;
5089 }
5090
480262c9 5091 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 5092 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
5093 return false;
5094 }
480262c9 5095
29a73c2f 5096 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 5097 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
5098 return false;
5099 }
480262c9 5100
29a73c2f
CB
5101 return true;
5102}
5103
0a4dea41 5104static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
5105{
5106 char *target;
5107 size_t clen, len;
5108 int i, ret;
5109
5110 for (i = 0; i < num_hierarchies; i++) {
5111 char *controller = hierarchies[i];
51c7ca35 5112
29a73c2f
CB
5113 clen = strlen(controller);
5114 len = strlen(BASEDIR) + clen + 2;
5115 target = malloc(len);
5116 if (!target)
5117 return false;
51c7ca35 5118
29a73c2f
CB
5119 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5120 if (ret < 0 || ret >= len) {
5121 free(target);
5122 return false;
5123 }
5124 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5125 free(target);
5126 return false;
5127 }
51c7ca35
CB
5128 if (!strcmp(controller, "unified"))
5129 ret = mount("none", target, "cgroup2", 0, NULL);
5130 else
5131 ret = mount(controller, target, "cgroup", 0, controller);
5132 if (ret < 0) {
5133 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
5134 free(target);
5135 return false;
5136 }
5137
5138 fd_hierarchies[i] = open(target, O_DIRECTORY);
5139 if (fd_hierarchies[i] < 0) {
5140 free(target);
5141 return false;
5142 }
5143 free(target);
5144 }
5145 return true;
5146}
5147
/*
 * Run the three setup stages in order: prepare the private mount point,
 * mount all hierarchies, then switch to the new root. A failing stage stops
 * the sequence.
 *
 * Returns true only if every stage succeeds.
 */
static bool cgfs_setup_controllers(void)
{
	bool ready = cgfs_prepare_mounts();

	if (!ready)
		return false;

	ready = cgfs_mount_hierarchies();
	if (!ready) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
5163
5164static void __attribute__((constructor)) collect_and_mount_subsystems(void)
237e200e
SH
5165{
5166 FILE *f;
e58dab00
CB
5167 char *cret, *line = NULL;
5168 char cwd[MAXPATHLEN];
237e200e 5169 size_t len = 0;
480262c9 5170 int i, init_ns = -1;
51c7ca35 5171 bool found_unified = false;
237e200e
SH
5172
5173 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
b8defc3d 5174 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
237e200e
SH
5175 return;
5176 }
e58dab00 5177
237e200e 5178 while (getline(&line, &len, f) != -1) {
51c7ca35 5179 char *idx, *p, *p2;
237e200e
SH
5180
5181 p = strchr(line, ':');
5182 if (!p)
5183 goto out;
51c7ca35 5184 idx = line;
237e200e
SH
5185 *(p++) = '\0';
5186
5187 p2 = strrchr(p, ':');
5188 if (!p2)
5189 goto out;
5190 *p2 = '\0';
5191
a67719f6
CB
5192 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5193 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5194 * because it parses out the empty string "" and later on passes
5195 * it to mount(). Let's skip such entries.
5196 */
51c7ca35
CB
5197 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5198 found_unified = true;
5199 p = "unified";
5200 }
a67719f6 5201
237e200e
SH
5202 if (!store_hierarchy(line, p))
5203 goto out;
5204 }
5205
480262c9 5206 /* Preserve initial namespace. */
a257a8ee 5207 init_ns = preserve_mnt_ns(getpid());
b8defc3d
CB
5208 if (init_ns < 0) {
5209 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
480262c9 5210 goto out;
b8defc3d 5211 }
480262c9 5212
92c3ee11 5213 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
b8defc3d
CB
5214 if (!fd_hierarchies) {
5215 lxcfs_error("%s\n", strerror(errno));
29a73c2f 5216 goto out;
b8defc3d 5217 }
29a73c2f 5218
480262c9
CB
5219 for (i = 0; i < num_hierarchies; i++)
5220 fd_hierarchies[i] = -1;
5221
e58dab00
CB
5222 cret = getcwd(cwd, MAXPATHLEN);
5223 if (!cret)
5224 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5225
480262c9
CB
5226 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5227 * to privately mount lxcfs cgroups. */
b8defc3d
CB
5228 if (!cgfs_setup_controllers()) {
5229 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
29a73c2f 5230 goto out;
b8defc3d 5231 }
480262c9 5232
b8defc3d
CB
5233 if (setns(init_ns, 0) < 0) {
5234 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
29a73c2f 5235 goto out;
b8defc3d 5236 }
29a73c2f 5237
e58dab00
CB
5238 if (!cret || chdir(cwd) < 0)
5239 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5240
237e200e
SH
5241 print_subsystems();
5242
5243out:
5244 free(line);
5245 fclose(f);
480262c9
CB
5246 if (init_ns >= 0)
5247 close(init_ns);
237e200e
SH
5248}
5249
5250static void __attribute__((destructor)) free_subsystems(void)
5251{
5252 int i;
5253
b8defc3d
CB
5254 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5255
29a73c2f 5256 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
5257 if (hierarchies[i])
5258 free(hierarchies[i]);
480262c9 5259 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
5260 close(fd_hierarchies[i]);
5261 }
237e200e 5262 free(hierarchies);
480262c9 5263 free(fd_hierarchies);
a257a8ee
CB
5264
5265 if (cgroup_mount_ns_fd >= 0)
5266 close(cgroup_mount_ns_fd);
237e200e 5267}