]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
bindings: fix memory leak in proc_loadavg_read()
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f
CB
19#include <sched.h>
20#include <stdbool.h>
0ecddf02 21#include <stdint.h>
29a73c2f
CB
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <time.h>
26#include <unistd.h>
27#include <wait.h>
d89504c4 28#include <linux/magic.h>
237e200e 29#include <linux/sched.h>
29a73c2f
CB
30#include <sys/epoll.h>
31#include <sys/mman.h>
32#include <sys/mount.h>
237e200e
SH
33#include <sys/param.h>
34#include <sys/socket.h>
29a73c2f 35#include <sys/syscall.h>
0ecddf02 36#include <sys/sysinfo.h>
d89504c4 37#include <sys/vfs.h>
237e200e 38
237e200e 39#include "bindings.h"
237e200e
SH
40#include "config.h" // for VERSION
41
0ecddf02
CB
42/* Length reserved for a 64 bit integer printed in decimal: 2^64 - 1 has 20 digits, plus one extra byte (e.g. for a terminating NUL) = 21 */
43#define LXCFS_NUMSTRLEN64 21
44
29a73c2f
CB
/*
 * pivot_root() is not exported by every C library. When the build did not
 * detect it (HAVE_PIVOT_ROOT unset) provide a thin wrapper around the raw
 * syscall; if even the syscall number is unknown, fail with ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
59
237e200e
SH
/* Kinds of virtual files; stored in struct file_info's `type` field. */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
71
/* Per-file bookkeeping record. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* one of the LXC_TYPE_* values */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
82
/* Parameters of the loadavg hash table. */
#define LOAD_SIZE 100 /* number of buckets in the hash table */
#define FLUSH_TIME 5 /* the flush rate */
#define DEPTH_DIR 3 /* the depth of per cgroup */
/* Fixed-point helpers used to calculate loadavg. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;

/*
 * Map @name to a bucket index in [0, LOAD_SIZE) using the ELF hash.
 */
static int calc_hash(char *name)
{
	unsigned int hash = 0;

	/* ELFHash algorithm. */
	for (; *name != '\0'; name++) {
		unsigned int high;

		hash = (hash << 4) + *name;
		/* Fold the top nibble back into the low bits, then clear it. */
		high = hash & 0xf0000000;
		if (high)
			hash ^= high >> 24;
		hash &= ~high;
	}

	return (hash & 0x7fffffff) % LOAD_SIZE;
}
114
/* One entry of a load_hash bucket list. */
struct load_node {
	char *cg;			/* key string; compared with strcmp() and freed with the node */
	unsigned long avenrun[3];	/* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;		/* following node in the bucket, or NULL */
	struct load_node **pre;		/* address of the pointer that refers to this node */
};
125
/* Head of one hash bucket together with the locks that guard it. */
struct load_head {
	/*
	 * Serializes inserting and refreshing load_nodes: for the first
	 * load_node of a bucket, insert and refresh are mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * List-level lock between reading loadavg and deleting a load_node:
	 * read and delete exclude each other, concurrent reads are allowed.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * Lock between reading loadavg and inserting a load_node at the front
	 * of the bucket: read and insert exclude each other, concurrent reads
	 * are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;	/* first node of the bucket, or NULL */
};
147
148static struct load_head load_hash[LOAD_SIZE]; /* hash table */
149/*
150 * init_load initialize the hash table.
151 * Return 0 on success, return -1 on failure.
152 */
153static int init_load(void)
154{
155 int i;
156 int ret;
157
158 for (i = 0; i < LOAD_SIZE; i++) {
159 load_hash[i].next = NULL;
160 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
161 if (ret != 0) {
162 lxcfs_error("%s\n", "Failed to initialize lock");
163 goto out3;
164 }
165 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
166 if (ret != 0) {
167 lxcfs_error("%s\n", "Failed to initialize rdlock");
168 goto out2;
169 }
170 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
171 if (ret != 0) {
172 lxcfs_error("%s\n", "Failed to initialize rilock");
173 goto out1;
174 }
175 }
176 return 0;
177out1:
178 pthread_rwlock_destroy(&load_hash[i].rdlock);
179out2:
180 pthread_mutex_destroy(&load_hash[i].lock);
181out3:
182 while (i > 0) {
183 i--;
184 pthread_mutex_destroy(&load_hash[i].lock);
185 pthread_rwlock_destroy(&load_hash[i].rdlock);
186 pthread_rwlock_destroy(&load_hash[i].rilock);
187 }
188 return -1;
189}
190
191static void insert_node(struct load_node **n, int locate)
192{
193 struct load_node *f;
194
195 pthread_mutex_lock(&load_hash[locate].lock);
196 pthread_rwlock_wrlock(&load_hash[locate].rilock);
197 f = load_hash[locate].next;
198 load_hash[locate].next = *n;
199
200 (*n)->pre = &(load_hash[locate].next);
201 if (f)
202 f->pre = &((*n)->next);
203 (*n)->next = f;
204 pthread_mutex_unlock(&load_hash[locate].lock);
205 pthread_rwlock_unlock(&load_hash[locate].rilock);
206}
207/*
208 * locate_node() finds special node. Not return NULL means success.
209 * It should be noted that rdlock isn't unlocked at the end of code
210 * because this function is used to read special node. Delete is not
211 * allowed before read has ended.
212 * unlock rdlock only in proc_loadavg_read().
213 */
214static struct load_node *locate_node(char *cg, int locate)
215{
216 struct load_node *f = NULL;
217 int i = 0;
218
219 pthread_rwlock_rdlock(&load_hash[locate].rilock);
220 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
221 if (load_hash[locate].next == NULL) {
222 pthread_rwlock_unlock(&load_hash[locate].rilock);
223 return f;
224 }
225 f = load_hash[locate].next;
226 pthread_rwlock_unlock(&load_hash[locate].rilock);
227 while (f && ((i = strcmp(f->cg, cg)) != 0))
228 f = f->next;
229 return f;
230}
231/* Delete the load_node n and return the next node of it. */
232static struct load_node *del_node(struct load_node *n, int locate)
233{
234 struct load_node *g;
235
236 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
237 if (n->next == NULL) {
238 *(n->pre) = NULL;
239 } else {
240 *(n->pre) = n->next;
241 n->next->pre = n->pre;
242 }
243 g = n->next;
244 free(n->cg);
245 free(n);
246 pthread_rwlock_unlock(&load_hash[locate].rdlock);
247 return g;
248}
249
9c480eb7 250void load_free(void)
251{
252 int i;
253 struct load_node *f, *p;
254
255 for (i = 0; i < LOAD_SIZE; i++) {
256 pthread_mutex_lock(&load_hash[i].lock);
257 pthread_rwlock_wrlock(&load_hash[i].rilock);
258 pthread_rwlock_wrlock(&load_hash[i].rdlock);
259 if (load_hash[i].next == NULL) {
260 pthread_mutex_unlock(&load_hash[i].lock);
261 pthread_mutex_destroy(&load_hash[i].lock);
262 pthread_rwlock_unlock(&load_hash[i].rilock);
263 pthread_rwlock_destroy(&load_hash[i].rilock);
264 pthread_rwlock_unlock(&load_hash[i].rdlock);
265 pthread_rwlock_destroy(&load_hash[i].rdlock);
266 continue;
267 }
268 for (f = load_hash[i].next; f; ) {
269 free(f->cg);
270 p = f->next;
271 free(f);
272 f = p;
273 }
274 pthread_mutex_unlock(&load_hash[i].lock);
275 pthread_mutex_destroy(&load_hash[i].lock);
276 pthread_rwlock_unlock(&load_hash[i].rilock);
277 pthread_rwlock_destroy(&load_hash[i].rilock);
278 pthread_rwlock_unlock(&load_hash[i].rdlock);
279 pthread_rwlock_destroy(&load_hash[i].rdlock);
280 }
281}
f34de69a
CB
282/* Reserve buffer size to account for file size changes. */
283#define BUF_RESERVE_SIZE 512
237e200e
SH
284
285/*
286 * A table caching which pid is init for a pid namespace.
287 * When looking up which pid is init for $qpid, we first
288 * 1. Stat /proc/$qpid/ns/pid.
289 * 2. Check whether the ino_t is in our store.
290 * a. if not, fork a child in qpid's ns to send us
291 * ucred.pid = 1, and read the initpid. Cache
292 * initpid and creation time for /proc/initpid
293 * in a new store entry.
294 * b. if so, verify that /proc/initpid still matches
295 * what we have saved. If not, clear the store
296 * entry and go back to a. If so, return the
297 * cached initpid.
298 */
/* One cached "which pid is init of this pid namespace" answer. */
struct pidns_init_store {
	ino_t ino;			/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that ns */
	long int ctime;			/* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;	/* next entry in the same hash chain */
	long int lastcheck;		/* time(NULL) when the entry was stored or last validated */
};
306
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Bucket index for an inode number. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Chained hash table of cached init pids, guarded by pidns_store_mutex. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Lock @l; a failure is logged and terminates the process with status 1. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
322
29a73c2f
CB
323/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
324 * Number of hierarchies mounted. */
325static int num_hierarchies;
326
327/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
328 * Hierarchies mounted {cpuset, blkio, ...}:
329 * Initialized via __constructor__ collect_and_mount_subsystems(). */
330static char **hierarchies;
331
332/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
333 * Open file descriptors:
334 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
335 * private mount namespace.
336 * Initialized via __constructor__ collect_and_mount_subsystems().
337 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
338 * mounts and respective files in the private namespace even when located in
339 * another namespace using the *at() family of functions
340 * {openat(), fchownat(), ...}. */
341static int *fd_hierarchies;
a257a8ee 342static int cgroup_mount_ns_fd = -1;
29a73c2f 343
237e200e
SH
/* Unlock @l; a failure is logged and terminates the process with status 1. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret == 0)
		return;

	lxcfs_error("returned:%d %s\n", ret, strerror(ret));
	exit(1);
}
353
354static void store_lock(void)
355{
356 lock_mutex(&pidns_store_mutex);
357}
358
359static void store_unlock(void)
360{
361 unlock_mutex(&pidns_store_mutex);
362}
363
364/* Must be called under store_lock */
365static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
366{
367 struct stat initsb;
368 char fnam[100];
369
370 snprintf(fnam, 100, "/proc/%d", e->initpid);
371 if (stat(fnam, &initsb) < 0)
372 return false;
7dd6560a
CB
373
374 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
375 initsb.st_ctime, e->initpid);
376
237e200e
SH
377 if (e->ctime != initsb.st_ctime)
378 return false;
379 return true;
380}
381
382/* Must be called under store_lock */
383static void remove_initpid(struct pidns_init_store *e)
384{
385 struct pidns_init_store *tmp;
386 int h;
387
7dd6560a
CB
388 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
389
237e200e
SH
390 h = HASH(e->ino);
391 if (pidns_hash_table[h] == e) {
392 pidns_hash_table[h] = e->next;
393 free(e);
394 return;
395 }
396
397 tmp = pidns_hash_table[h];
398 while (tmp) {
399 if (tmp->next == e) {
400 tmp->next = e->next;
401 free(e);
402 return;
403 }
404 tmp = tmp->next;
405 }
406}
407
408#define PURGE_SECS 5
409/* Must be called under store_lock */
410static void prune_initpid_store(void)
411{
412 static long int last_prune = 0;
413 struct pidns_init_store *e, *prev, *delme;
414 long int now, threshold;
415 int i;
416
417 if (!last_prune) {
418 last_prune = time(NULL);
419 return;
420 }
421 now = time(NULL);
422 if (now < last_prune + PURGE_SECS)
423 return;
7dd6560a
CB
424
425 lxcfs_debug("%s\n", "Pruning.");
426
237e200e
SH
427 last_prune = now;
428 threshold = now - 2 * PURGE_SECS;
429
430 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
431 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
432 if (e->lastcheck < threshold) {
7dd6560a
CB
433
434 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
435
237e200e
SH
436 delme = e;
437 if (prev)
438 prev->next = e->next;
439 else
440 pidns_hash_table[i] = e->next;
441 e = e->next;
442 free(delme);
443 } else {
444 prev = e;
445 e = e->next;
446 }
447 }
448 }
449}
450
451/* Must be called under store_lock */
452static void save_initpid(struct stat *sb, pid_t pid)
453{
454 struct pidns_init_store *e;
455 char fpath[100];
456 struct stat procsb;
457 int h;
458
7dd6560a
CB
459 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
460
237e200e
SH
461 snprintf(fpath, 100, "/proc/%d", pid);
462 if (stat(fpath, &procsb) < 0)
463 return;
464 do {
465 e = malloc(sizeof(*e));
466 } while (!e);
467 e->ino = sb->st_ino;
468 e->initpid = pid;
469 e->ctime = procsb.st_ctime;
470 h = HASH(e->ino);
471 e->next = pidns_hash_table[h];
472 e->lastcheck = time(NULL);
473 pidns_hash_table[h] = e;
474}
475
476/*
477 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
478 * entry for the inode number and creation time. Verify that the init pid
479 * is still valid. If not, remove it. Return the entry if valid, NULL
480 * otherwise.
481 * Must be called under store_lock
482 */
483static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
484{
485 int h = HASH(sb->st_ino);
486 struct pidns_init_store *e = pidns_hash_table[h];
487
488 while (e) {
489 if (e->ino == sb->st_ino) {
490 if (initpid_still_valid(e, sb)) {
491 e->lastcheck = time(NULL);
492 return e;
493 }
494 remove_initpid(e);
495 return NULL;
496 }
497 e = e->next;
498 }
499
500 return NULL;
501}
502
/*
 * Return 1 if @path, resolved relative to the directory file descriptor @fd,
 * names a directory; return 0 otherwise (including when it cannot be
 * stat()ed).
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	/*
	 * The last argument of fstatat() is a flags bitmask, not a file
	 * descriptor. Passing @fd there made the call fail with EINVAL for
	 * most descriptor values, so directories were reported as
	 * non-directories.
	 */
	int ret = fstatat(fd, path, &statbuf, 0);

	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
511
/*
 * Duplicate @str on the heap, retrying strdup() until it succeeds.
 * Returns NULL only when @str itself is NULL. The caller frees the copy.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	for (;;) {
		copy = strdup(str);
		if (copy)
			return copy;
	}
}
523
/* Strip every '\n' at the end of @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n')
		s[--end] = '\0';
}
531
#define BATCH_SIZE 50
/*
 * Make sure *mem can hold @newlen bytes. Memory is grown in multiples of
 * BATCH_SIZE; nothing is done when *mem is already allocated and @newlen
 * falls into the same or a smaller batch count than @oldlen. realloc() is
 * retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);
	*mem = grown;
}
/*
 * Append the @linelen bytes of @line plus its terminating NUL to *contents,
 * growing the buffer as needed, and advance *len by @linelen.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t old = *len;
	size_t grown = old + linelen;

	dorealloc(contents, old, grown + 1);
	/* linelen + 1 so the NUL terminator is copied too. */
	memcpy(*contents + old, line, linelen + 1);
	*len = grown;
}
553
/*
 * Read everything from @fd into one heap string with trailing newlines
 * removed.
 *
 * @from is not used by this function.
 * @fd   is consumed: it is closed on every path, including failure.
 *
 * Returns the contents (caller frees), or NULL when the stream could not be
 * opened or nothing could be read.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/*
		 * fdopen() did not take ownership, and the success path
		 * closes the descriptor via fclose(); close it here as well
		 * so it is not leaked.
		 */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
575
/*
 * Write @string to @fd through a stdio stream.
 *
 * @fnam is not used by this function.
 * @fd   is consumed: it is closed on every path, including failure.
 *
 * Returns true when the whole string was written and the stream was closed
 * without error, false otherwise.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	if (!(f = fdopen(fd, "w"))) {
		/*
		 * fdopen() did not take ownership, and every other path
		 * closes the descriptor via fclose(); close it here as well
		 * so it is not leaked.
		 */
		close(fd);
		return false;
	}
	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}
	/* Buffered data is flushed here, so write errors can surface now. */
	if (fclose(f) < 0) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
596
237e200e
SH
/* Name plus ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;	/* heap string, released by free_key() */
	uint32_t uid, gid;
	uint32_t mode;
};
602
0619767c 603#define ALLOC_NUM 20
237e200e
SH
604static bool store_hierarchy(char *stridx, char *h)
605{
0619767c
SH
606 if (num_hierarchies % ALLOC_NUM == 0) {
607 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
608 n *= ALLOC_NUM;
609 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c 610 if (!tmp) {
b8defc3d 611 lxcfs_error("%s\n", strerror(errno));
0619767c
SH
612 exit(1);
613 }
237e200e 614 hierarchies = tmp;
237e200e 615 }
f676eb79 616
0619767c 617 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
618 return true;
619}
620
621static void print_subsystems(void)
622{
623 int i;
624
a257a8ee 625 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
cc97d34c 626 fprintf(stderr, "hierarchies:\n");
237e200e
SH
627 for (i = 0; i < num_hierarchies; i++) {
628 if (hierarchies[i])
b8defc3d
CB
629 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
630 fd_hierarchies[i], hierarchies[i]);
237e200e
SH
631 }
632}
633
/*
 * Return true when @needle is exactly one of the comma-separated elements
 * of @haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	size_t nlen = strlen(needle);

	/* Every element that is followed by a comma. */
	while (*cur) {
		const char *comma = strchr(cur, ',');

		if (!comma)
			break;

		if ((size_t)(comma - cur) == nlen && strncmp(needle, cur, nlen) == 0)
			return true;

		cur = comma + 1;
	}

	/* Whatever is left: the final element, or the empty string. */
	return strcmp(needle, cur) == 0;
}
652
653/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
654/* Return the mounted controller and store the corresponding open file descriptor
655 * referring to the controller mountpoint in the private lxcfs namespace in
656 * @cfd.
657 */
658static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
659{
660 int i;
661
662 for (i = 0; i < num_hierarchies; i++) {
663 if (!hierarchies[i])
664 continue;
5dd3e6fd
CB
665 if (strcmp(hierarchies[i], controller) == 0) {
666 *cfd = fd_hierarchies[i];
237e200e 667 return hierarchies[i];
5dd3e6fd
CB
668 }
669 if (in_comma_list(controller, hierarchies[i])) {
670 *cfd = fd_hierarchies[i];
237e200e 671 return hierarchies[i];
5dd3e6fd 672 }
237e200e
SH
673 }
674
675 return NULL;
676}
677
/*
 * Write @value into @file of @cgroup below the mounted @controller.
 * Returns true on success; false when the controller is not mounted, the
 * path cannot be built or opened, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int cfd, fd, written;
	size_t size;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	size = strlen(cgroup) + strlen(file) + 3;
	path = alloca(size);
	written = snprintf(path, size, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= size)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	/* fd is handed to write_string(). */
	return write_string(path, value, fd);
}
704
// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
//
// @dirname is resolved relative to the directory file descriptor @fd. Each
// entry except "." and ".." is chowned to @uid:@gid; failures on individual
// entries are logged and skipped.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() did not take ownership; do not leak fd1. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		/* path is relative to @fd, like dirname itself. */
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	/* closedir() also closes fd1. */
	closedir(d);
}
744
/*
 * Create the cgroup @cg below the mounted @controller and, unless the
 * requested owner is 0:0, chown the new directory and the files in it to
 * @uid:@gid.
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno from the failing mkdirat()/fchownat() call.
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t size;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	size = strlen(cg) + 2;
	path = alloca(size);
	snprintf(path, size, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, path, 0755) < 0)
		return -errno;

	/* Ownership only needs changing when the owner is not 0:0. */
	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, path, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(path, uid, gid, cfd);
	}

	return 0;
}
775
/*
 * Remove the directory @dirname (a path relative to @cfd) after descending
 * into its sub-directories.
 *
 * @dirname path of the directory, relative to @cfd
 * @fd      open descriptor that is duplicated and read with readdir()
 * @cfd     directory descriptor all paths are resolved against
 *
 * Returns true when the directory stream was closed cleanly and the
 * directory itself was removed, false otherwise.
 *
 * NOTE(review): the recursive call passes the same @fd rather than a
 * descriptor for the sub-directory, and dup() shares the directory offset
 * with it - confirm that nested directories are really traversed as intended.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}
	/* From here on dupfd belongs to dir and is released by closedir(). */

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	/*
	 * No close(dupfd) here: closedir() above already closed it. Closing
	 * it a second time could hit an unrelated descriptor that another
	 * thread opened in the meantime.
	 */
	return ret;
}
834
/*
 * Remove the cgroup @cg below the mounted @controller via recursive_rmdir().
 * Returns false when the controller is not mounted, the directory cannot be
 * opened, or the removal fails.
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int cfd, dirfd;
	size_t size;
	char *path;
	bool removed;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	size = strlen(cg) + 2;
	path = alloca(size);
	snprintf(path, size, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd = openat(cfd, path, O_DIRECTORY);
	if (dirfd < 0)
		return false;

	removed = recursive_rmdir(path, dirfd, cfd);
	close(dirfd);
	return removed;
}
861
/*
 * Change the mode of @file below the mounted @controller to @mode.
 * Returns true on success, false when the controller is not mounted or
 * fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t size;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	size = strlen(file) + 2;
	path = alloca(size);
	snprintf(path, size, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, path, mode, 0) >= 0;
}
882
/*
 * Chown "<dirname>/tasks" and then "<dirname>/cgroup.procs" (both relative
 * to @fd) to @uid:@gid. Stops at the first failure and returns -errno;
 * returns 0 when both succeed.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two names; sizeof counts the NUL. */
	size_t size = strlen(dirname) + sizeof("/cgroup.procs");
	char *path = alloca(size);

	snprintf(path, size, "%s/tasks", dirname);
	if (fchownat(fd, path, uid, gid, 0) != 0)
		return -errno;

	snprintf(path, size, "%s/cgroup.procs", dirname);
	if (fchownat(fd, path, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
898
/*
 * Chown @file below the mounted @controller to @uid:@gid. When @file is a
 * directory its tasks and cgroup.procs files are chowned as well.
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno from a failing fchownat().
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t size;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	size = strlen(file) + 2;
	path = alloca(size);
	snprintf(path, size, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(path, cfd))
		return 0;

	// like cgmanager did, we want to chown the tasks file as well
	return chown_tasks_files(path, uid, gid, cfd);
}
924
/*
 * Open "<cgroup>/cgroup.procs" below the mounted @controller for writing.
 * Returns a stdio stream the caller must fclose(), or NULL when the
 * controller is not mounted or the file cannot be opened.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() did not take ownership; do not leak the descriptor. */
		int saved_errno = errno;

		close(fd);
		errno = saved_errno;
	}
	return f;
}
948
f366da65
WB
949static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
950 void ***list, size_t typesize,
951 void* (*iterator)(const char*, const char*, const char*))
237e200e 952{
4ea38a4c 953 int cfd, fd, ret;
237e200e 954 size_t len;
4ea38a4c 955 char *cg, *tmpc;
237e200e 956 char pathname[MAXPATHLEN];
f366da65 957 size_t sz = 0, asz = 0;
4ea38a4c 958 struct dirent *dirent;
237e200e 959 DIR *dir;
237e200e 960
4ea38a4c 961 tmpc = find_mounted_controller(controller, &cfd);
f366da65 962 *list = NULL;
237e200e 963 if (!tmpc)
e97c834b 964 return false;
237e200e 965
f5a6d92e 966 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
967 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
968 cg = alloca(len);
969 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
970 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 971 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
972 return false;
973 }
237e200e 974
4ea38a4c
CB
975 fd = openat(cfd, cg, O_DIRECTORY);
976 if (fd < 0)
977 return false;
978
979 dir = fdopendir(fd);
237e200e
SH
980 if (!dir)
981 return false;
982
4ea38a4c 983 while ((dirent = readdir(dir))) {
237e200e 984 struct stat mystat;
237e200e 985
4ea38a4c
CB
986 if (!strcmp(dirent->d_name, ".") ||
987 !strcmp(dirent->d_name, ".."))
237e200e
SH
988 continue;
989
4ea38a4c
CB
990 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
991 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 992 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
993 continue;
994 }
995
4ea38a4c 996 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 997 if (ret) {
b8defc3d 998 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
999 continue;
1000 }
f366da65
WB
1001 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1002 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1003 continue;
1004
1005 if (sz+2 >= asz) {
f366da65 1006 void **tmp;
237e200e
SH
1007 asz += BATCH_SIZE;
1008 do {
f366da65 1009 tmp = realloc(*list, asz * typesize);
237e200e
SH
1010 } while (!tmp);
1011 *list = tmp;
1012 }
4ea38a4c 1013 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1014 (*list)[sz+1] = NULL;
1015 sz++;
1016 }
1017 if (closedir(dir) < 0) {
b8defc3d 1018 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1019 return false;
1020 }
1021 return true;
1022}
1023
f366da65
WB
/*
 * Iterator callback for cgfs_list_children(): returns a heap copy of
 * @dir_entry, retrying strdup() until it succeeds. @controller and @cgroup
 * are ignored.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy = NULL;

	while (!copy)
		copy = strdup(dir_entry);

	return copy;
}
1032
1033bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1034{
1035 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1036}
1037
237e200e
SH
1038void free_key(struct cgfs_files *k)
1039{
1040 if (!k)
1041 return;
1042 free(k->name);
1043 free(k);
1044}
1045
/* Release a NULL-terminated array of keys and the array itself. A NULL @keys is accepted. */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cursor;

	if (!keys)
		return;

	cursor = keys;
	while (*cursor) {
		free_key(*cursor);
		cursor++;
	}
	free(keys);
}
1057
/*
 * Read @file of @cgroup below the mounted @controller into a heap string
 * stored in *@value (the caller frees it).
 * Returns false when the controller is not mounted, the path cannot be built
 * or opened, or slurp_file() returns NULL; *@value is only written once the
 * file has been opened.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int cfd, fd, written;
	size_t size;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	size = strlen(cgroup) + strlen(file) + 3;
	path = alloca(size);
	written = snprintf(path, size, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= size)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	/* fd is handed to slurp_file(). */
	*value = slurp_file(path, fd);
	return *value != NULL;
}
1084
1085struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1086{
4ea38a4c 1087 int ret, cfd;
237e200e 1088 size_t len;
f5a6d92e 1089 char *fnam, *tmpc;
237e200e
SH
1090 struct stat sb;
1091 struct cgfs_files *newkey;
237e200e 1092
f5a6d92e 1093 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1094 if (!tmpc)
1095 return false;
1096
1097 if (file && *file == '/')
1098 file++;
1099
06081b29 1100 if (file && strchr(file, '/'))
237e200e
SH
1101 return NULL;
1102
f5a6d92e
CB
1103 /* Make sure we pass a relative path to *at() family of functions.
1104 * . + /cgroup + / + file + \0
1105 */
4ea38a4c 1106 len = strlen(cgroup) + 3;
237e200e
SH
1107 if (file)
1108 len += strlen(file) + 1;
1109 fnam = alloca(len);
4ea38a4c
CB
1110 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1111 file ? "/" : "", file ? file : "");
237e200e 1112
4ea38a4c 1113 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1114 if (ret < 0)
1115 return NULL;
1116
1117 do {
1118 newkey = malloc(sizeof(struct cgfs_files));
1119 } while (!newkey);
1120 if (file)
1121 newkey->name = must_copy_string(file);
06081b29
CB
1122 else if (strrchr(cgroup, '/'))
1123 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1124 else
1125 newkey->name = must_copy_string(cgroup);
1126 newkey->uid = sb.st_uid;
1127 newkey->gid = sb.st_gid;
1128 newkey->mode = sb.st_mode;
1129
1130 return newkey;
1131}
1132
/*
 * Entry constructor handed to cgfs_iterate_cgroup() by cgfs_list_keys().
 *
 * Returns the key cgfs_get_key() builds for @dir_entry in @cgroup. On
 * failure an error is logged and NULL is returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key;

	key = cgfs_get_key(controller, cgroup, dir_entry);
	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		cgroup);
	return NULL;
}
1142
1143bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1144{
1145 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
1146}
1147
/*
 * Report whether @f is a directory directly inside @cgroup of @controller.
 *
 * The path is stat'ed relative to the directory fd that
 * find_mounted_controller() reports. Returns false if the controller is not
 * mounted, the path cannot be built, the stat fails, or the entry is not a
 * directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, n;
	size_t pathlen;
	char *path;
	struct stat st;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	if (fstatat(cfd, path, &st, 0) < 0)
		return false;

	return S_ISDIR(st.st_mode) ? true : false;
}
1175
1176#define SEND_CREDS_OK 0
1177#define SEND_CREDS_NOTSK 1
1178#define SEND_CREDS_FAIL 2
1179static bool recv_creds(int sock, struct ucred *cred, char *v);
1180static int wait_for_pid(pid_t pid);
1181static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 1182static int send_creds_clone_wrapper(void *arg);
237e200e
SH
1183
1184/*
b10bdd6c 1185 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
1186 * over a unix sock so we can read the task's reaper's pid in our
1187 * namespace
b10bdd6c
FG
1188 *
1189 * Note: glibc's fork() does not respect pidns, which can lead to failed
1190 * assertions inside glibc (and thus failed forks) if the child's pid in
1191 * the pidns and the parent pid outside are identical. Using clone prevents
1192 * this issue.
237e200e
SH
1193 */
1194static void write_task_init_pid_exit(int sock, pid_t target)
1195{
237e200e
SH
1196 char fnam[100];
1197 pid_t pid;
237e200e 1198 int fd, ret;
b10bdd6c
FG
1199 size_t stack_size = sysconf(_SC_PAGESIZE);
1200 void *stack = alloca(stack_size);
237e200e
SH
1201
1202 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1203 if (ret < 0 || ret >= sizeof(fnam))
1204 _exit(1);
1205
1206 fd = open(fnam, O_RDONLY);
1207 if (fd < 0) {
1208 perror("write_task_init_pid_exit open of ns/pid");
1209 _exit(1);
1210 }
1211 if (setns(fd, 0)) {
1212 perror("write_task_init_pid_exit setns 1");
1213 close(fd);
1214 _exit(1);
1215 }
b10bdd6c 1216 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
1217 if (pid < 0)
1218 _exit(1);
1219 if (pid != 0) {
1220 if (!wait_for_pid(pid))
1221 _exit(1);
1222 _exit(0);
1223 }
b10bdd6c
FG
1224}
1225
1226static int send_creds_clone_wrapper(void *arg) {
1227 struct ucred cred;
1228 char v;
1229 int sock = *(int *)arg;
237e200e
SH
1230
1231 /* we are the child */
1232 cred.uid = 0;
1233 cred.gid = 0;
1234 cred.pid = 1;
1235 v = '1';
1236 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1237 return 1;
1238 return 0;
237e200e
SH
1239}
1240
1241static pid_t get_init_pid_for_task(pid_t task)
1242{
1243 int sock[2];
1244 pid_t pid;
1245 pid_t ret = -1;
1246 char v = '0';
1247 struct ucred cred;
1248
1249 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1250 perror("socketpair");
1251 return -1;
1252 }
1253
1254 pid = fork();
1255 if (pid < 0)
1256 goto out;
1257 if (!pid) {
1258 close(sock[1]);
1259 write_task_init_pid_exit(sock[0], task);
1260 _exit(0);
1261 }
1262
1263 if (!recv_creds(sock[1], &cred, &v))
1264 goto out;
1265 ret = cred.pid;
1266
1267out:
1268 close(sock[0]);
1269 close(sock[1]);
1270 if (pid > 0)
1271 wait_for_pid(pid);
1272 return ret;
1273}
1274
1275static pid_t lookup_initpid_in_store(pid_t qpid)
1276{
1277 pid_t answer = 0;
1278 struct stat sb;
1279 struct pidns_init_store *e;
1280 char fnam[100];
1281
1282 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1283 store_lock();
1284 if (stat(fnam, &sb) < 0)
1285 goto out;
1286 e = lookup_verify_initpid(&sb);
1287 if (e) {
1288 answer = e->initpid;
1289 goto out;
1290 }
1291 answer = get_init_pid_for_task(qpid);
1292 if (answer > 0)
1293 save_initpid(&sb, answer);
1294
1295out:
1296 /* we prune at end in case we are returning
1297 * the value we were about to return */
1298 prune_initpid_store();
1299 store_unlock();
1300 return answer;
1301}
1302
/*
 * Wait for child @pid to terminate.
 *
 * Returns 0 only if the child exited normally with status 0. Returns -1 for
 * pid <= 0, for a waitpid() failure other than EINTR, or for any other kind
 * of termination. Interrupted waits are restarted.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1323
1324
1325/*
1326 * append pid to *src.
1327 * src: a pointer to a char* in which ot append the pid.
1328 * sz: the number of characters printed so far, minus trailing \0.
1329 * asz: the allocated size so far
1330 * pid: the pid to append
1331 */
1332static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1333{
1334 char tmp[30];
1335
1336 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1337
1338 if (!*src || tmplen + *sz + 1 >= *asz) {
1339 char *tmp;
1340 do {
1341 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1342 } while (!tmp);
1343 *src = tmp;
1344 *asz += BUF_RESERVE_SIZE;
1345 }
bbfd0e33 1346 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1347 *sz += tmplen;
237e200e
SH
1348}
1349
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not parse as
 * three unsigned numbers are skipped. Returns the mapped id, or -1
 * (i.e. UINT_MAX) if no range contains @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,   // base id for a range in the idfile's namespace
		     host_base, // base id for a range in the caller's namespace
		     range;     // number of ids in this range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		/* Not inside [host_base, host_base + range): try the next line. */
		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * Neither range wraps and in_id - host_base < range, so
		 * ns_base + (in_id - host_base) cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
1393
1394/*
1395 * for is_privileged_over,
1396 * specify whether we require the calling uid to be root in his
1397 * namespace
1398 */
1399#define NS_ROOT_REQD true
1400#define NS_ROOT_OPT false
1401
1402#define PROCLEN 100
1403
1404static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1405{
1406 char fpath[PROCLEN];
1407 int ret;
1408 bool answer = false;
1409 uid_t nsuid;
1410
1411 if (victim == -1 || uid == -1)
1412 return false;
1413
1414 /*
1415 * If the request is one not requiring root in the namespace,
1416 * then having the same uid suffices. (i.e. uid 1000 has write
1417 * access to files owned by uid 1000
1418 */
1419 if (!req_ns_root && uid == victim)
1420 return true;
1421
1422 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1423 if (ret < 0 || ret >= PROCLEN)
1424 return false;
1425 FILE *f = fopen(fpath, "r");
1426 if (!f)
1427 return false;
1428
1429 /* if caller's not root in his namespace, reject */
1430 nsuid = convert_id_to_ns(f, uid);
1431 if (nsuid)
1432 goto out;
1433
1434 /*
1435 * If victim is not mapped into caller's ns, reject.
1436 * XXX I'm not sure this check is needed given that fuse
1437 * will be sending requests where the vfs has converted
1438 */
1439 nsuid = convert_id_to_ns(f, victim);
1440 if (nsuid == -1)
1441 goto out;
1442
1443 answer = true;
1444
1445out:
1446 fclose(f);
1447 return answer;
1448}
1449
/*
 * Check whether the "other" permission bits of @fmode grant the access mode
 * requested in @req_mode (an open(2)-style flag word; only its O_ACCMODE
 * part is examined). Callers shift @fmode so that the triplet of interest
 * sits in the "other" position. Returns false for an unknown access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	mode_t accmode = req_mode & O_ACCMODE;

	if (accmode == O_RDONLY)
		needed = S_IROTH;
	else if (accmode == O_WRONLY)
		needed = S_IWOTH;
	else if (accmode == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1469
1470
/*
 * Return the first path component of @taskcg that lies below @querycg.
 *
 * @taskcg must be strictly longer than @querycg, e.g.
 *   taskcg  /a/b/c/d/e
 *   querycg /a/b/c
 *   result  "d"
 * For a @querycg of "/" or "./" only the first character of @taskcg is
 * skipped. The result is heap-allocated and must be freed by the caller;
 * NULL is returned on bad input or if strdup() fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *next, *slash;
	size_t skip;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	/* Skip the query prefix plus the separating '/'. */
	skip = strlen(querycg) + 1;
	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1496
/* Remove one trailing '\n' from @x in place, if present. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1503
1504static char *get_pid_cgroup(pid_t pid, const char *contrl)
1505{
5dd3e6fd 1506 int cfd;
237e200e
SH
1507 char fnam[PROCLEN];
1508 FILE *f;
1509 char *answer = NULL;
1510 char *line = NULL;
1511 size_t len = 0;
1512 int ret;
5dd3e6fd 1513 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1514 if (!h)
1515 return NULL;
1516
1517 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1518 if (ret < 0 || ret >= PROCLEN)
1519 return NULL;
1520 if (!(f = fopen(fnam, "r")))
1521 return NULL;
1522
1523 while (getline(&line, &len, f) != -1) {
1524 char *c1, *c2;
1525 if (!line[0])
1526 continue;
1527 c1 = strchr(line, ':');
1528 if (!c1)
1529 goto out;
1530 c1++;
1531 c2 = strchr(c1, ':');
1532 if (!c2)
1533 goto out;
1534 *c2 = '\0';
1535 if (strcmp(c1, h) != 0)
1536 continue;
1537 c2++;
1538 stripnewline(c2);
1539 do {
1540 answer = strdup(c2);
1541 } while (!answer);
1542 break;
1543 }
1544
1545out:
1546 fclose(f);
1547 free(line);
1548 return answer;
1549}
1550
1551/*
1552 * check whether a fuse context may access a cgroup dir or file
1553 *
1554 * If file is not null, it is a cgroup file to check under cg.
1555 * If file is null, then we are checking perms on cg itself.
1556 *
1557 * For files we can check the mode of the list_keys result.
1558 * For cgroups, we must make assumptions based on the files under the
1559 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1560 * yet.
1561 */
1562static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1563{
1564 struct cgfs_files *k = NULL;
1565 bool ret = false;
1566
1567 k = cgfs_get_key(contrl, cg, file);
1568 if (!k)
1569 return false;
1570
1571 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1572 if (perms_include(k->mode >> 6, mode)) {
1573 ret = true;
1574 goto out;
1575 }
1576 }
1577 if (fc->gid == k->gid) {
1578 if (perms_include(k->mode >> 3, mode)) {
1579 ret = true;
1580 goto out;
1581 }
1582 }
1583 ret = perms_include(k->mode, mode);
1584
1585out:
1586 free_key(k);
1587 return ret;
1588}
1589
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place. If @cg is exactly
 * "/init.scope" it becomes "/"; otherwise the suffix is cut off entirely.
 * Strings without that suffix are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + (cg_len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		tail[1] = '\0'; /* keep the leading '/' */
	else
		tail[0] = '\0';
}
1607
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * If the answer is false and nextcg is not NULL, then *nextcg is set to the
 * result of get_next_cgroup_dir() - the next cgroup directory under cg on
 * the way to the caller's cgroup (possibly NULL) - which must be freed by
 * the caller.
 *
 * Returns false as well when the caller's cgroup cannot be determined.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	const char *linecmp;
	bool answer;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 *       Serge, do you know?
	 */
	linecmp = (*cg == '/' || strncmp(cg, "./", 2) == 0) ? taskcg : taskcg + 1;

	/* The caller's cgroup must be a prefix of @cg. */
	answer = strncmp(linecmp, cg, strlen(linecmp)) == 0;
	if (!answer && nextcg)
		*nextcg = get_next_cgroup_dir(linecmp, cg);

	free(taskcg);
	return answer;
}
1650
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is always visible. Otherwise @cg must equal the
 * caller's cgroup, be one of its ancestors, or be one of its descendants.
 * Returns false if the caller's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (!strcmp(cg, "/") || !strcmp(cg, "./"))
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* Drop the first character of the task's cgroup to compare with @cg. */
	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0 || strcmp(cg, task_cg) == 0) {
		/*
		 * An empty task_cg means the task is in the root cg and can
		 * see everything; that case is not covered by the prefix
		 * comparisons below because the leading '/' was chopped off.
		 */
		answer = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		answer = strncmp(task_cg, cg, target_len) == 0 &&
			 task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		answer = strncmp(task_cg, cg, task_len) == 0 &&
			 cg[task_len] == '/';
	}

	free(c2);
	return answer;
}
1701
1702/*
1703 * given /cgroup/freezer/a/b, return "freezer".
1704 * the returned char* should NOT be freed.
1705 */
1706static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1707{
1708 const char *p1;
1709 char *contr, *slash;
1710
99142521 1711 if (strlen(path) < 9) {
e254948f 1712 errno = EACCES;
237e200e 1713 return NULL;
99142521
CB
1714 }
1715 if (*(path + 7) != '/') {
1716 errno = EINVAL;
237e200e 1717 return NULL;
99142521 1718 }
3adc421c 1719 p1 = path + 8;
237e200e 1720 contr = strdupa(p1);
99142521
CB
1721 if (!contr) {
1722 errno = ENOMEM;
237e200e 1723 return NULL;
99142521 1724 }
237e200e
SH
1725 slash = strstr(contr, "/");
1726 if (slash)
1727 *slash = '\0';
1728
1729 int i;
3adc421c 1730 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
1731 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1732 return hierarchies[i];
1733 }
99142521 1734 errno = ENOENT;
237e200e
SH
1735 return NULL;
1736}
1737
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, with errno cleared. Returns NULL with errno EACCES if @path is
 * shorter than 9 characters, or with errno EINVAL if there is no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strchr(path + 8, '/');
	if (slash) {
		errno = 0;
		return slash + 1;
	}

	errno = EINVAL;
	return NULL;
}
1758
/*
 * split the last path element from the path in @cg.
 * @dir is newly allocated and should be freed, @last not
 *
 * *@last points into @cg at its final '/' (the slash is included), or is
 * NULL if @cg has no '/'. *@dir is a copy of @cg cut at that same slash, or
 * a full copy when there is none. The copy is retried until it succeeds.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy = NULL;

	while (!copy)
		copy = strdup(cg);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last)
		*strrchr(copy, '/') = '\0';
}
1778
1779/*
1780 * FUSE ops for /cgroup
1781 */
1782
1783int cg_getattr(const char *path, struct stat *sb)
1784{
1785 struct timespec now;
1786 struct fuse_context *fc = fuse_get_context();
1787 char * cgdir = NULL;
1788 char *last = NULL, *path1, *path2;
1789 struct cgfs_files *k = NULL;
1790 const char *cgroup;
1791 const char *controller = NULL;
1792 int ret = -ENOENT;
1793
1794
1795 if (!fc)
1796 return -EIO;
1797
1798 memset(sb, 0, sizeof(struct stat));
1799
1800 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1801 return -EINVAL;
1802
1803 sb->st_uid = sb->st_gid = 0;
1804 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1805 sb->st_size = 0;
1806
1807 if (strcmp(path, "/cgroup") == 0) {
1808 sb->st_mode = S_IFDIR | 00755;
1809 sb->st_nlink = 2;
1810 return 0;
1811 }
1812
1813 controller = pick_controller_from_path(fc, path);
1814 if (!controller)
2f7036d0 1815 return -errno;
237e200e
SH
1816 cgroup = find_cgroup_in_path(path);
1817 if (!cgroup) {
1818 /* this is just /cgroup/controller, return it as a dir */
1819 sb->st_mode = S_IFDIR | 00755;
1820 sb->st_nlink = 2;
1821 return 0;
1822 }
1823
1824 get_cgdir_and_path(cgroup, &cgdir, &last);
1825
1826 if (!last) {
1827 path1 = "/";
1828 path2 = cgdir;
1829 } else {
1830 path1 = cgdir;
1831 path2 = last;
1832 }
1833
1834 pid_t initpid = lookup_initpid_in_store(fc->pid);
1835 if (initpid <= 0)
1836 initpid = fc->pid;
1837 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1838 * Then check that caller's cgroup is under path if last is a child
1839 * cgroup, or cgdir if last is a file */
1840
1841 if (is_child_cgroup(controller, path1, path2)) {
1842 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1843 ret = -ENOENT;
1844 goto out;
1845 }
1846 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1847 /* this is just /cgroup/controller, return it as a dir */
1848 sb->st_mode = S_IFDIR | 00555;
1849 sb->st_nlink = 2;
1850 ret = 0;
1851 goto out;
1852 }
1853 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1854 ret = -EACCES;
1855 goto out;
1856 }
1857
1858 // get uid, gid, from '/tasks' file and make up a mode
1859 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1860 sb->st_mode = S_IFDIR | 00755;
1861 k = cgfs_get_key(controller, cgroup, NULL);
1862 if (!k) {
1863 sb->st_uid = sb->st_gid = 0;
1864 } else {
1865 sb->st_uid = k->uid;
1866 sb->st_gid = k->gid;
1867 }
1868 free_key(k);
1869 sb->st_nlink = 2;
1870 ret = 0;
1871 goto out;
1872 }
1873
1874 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1875 sb->st_mode = S_IFREG | k->mode;
1876 sb->st_nlink = 1;
1877 sb->st_uid = k->uid;
1878 sb->st_gid = k->gid;
1879 sb->st_size = 0;
1880 free_key(k);
1881 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1882 ret = -ENOENT;
1883 goto out;
1884 }
237e200e
SH
1885 ret = 0;
1886 }
1887
1888out:
1889 free(cgdir);
1890 return ret;
1891}
1892
1893int cg_opendir(const char *path, struct fuse_file_info *fi)
1894{
1895 struct fuse_context *fc = fuse_get_context();
1896 const char *cgroup;
1897 struct file_info *dir_info;
1898 char *controller = NULL;
1899
1900 if (!fc)
1901 return -EIO;
1902
1903 if (strcmp(path, "/cgroup") == 0) {
1904 cgroup = NULL;
1905 controller = NULL;
1906 } else {
1907 // return list of keys for the controller, and list of child cgroups
1908 controller = pick_controller_from_path(fc, path);
1909 if (!controller)
2f7036d0 1910 return -errno;
237e200e
SH
1911
1912 cgroup = find_cgroup_in_path(path);
1913 if (!cgroup) {
1914 /* this is just /cgroup/controller, return its contents */
1915 cgroup = "/";
1916 }
1917 }
1918
1919 pid_t initpid = lookup_initpid_in_store(fc->pid);
1920 if (initpid <= 0)
1921 initpid = fc->pid;
1922 if (cgroup) {
1923 if (!caller_may_see_dir(initpid, controller, cgroup))
1924 return -ENOENT;
1925 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1926 return -EACCES;
1927 }
1928
1929 /* we'll free this at cg_releasedir */
1930 dir_info = malloc(sizeof(*dir_info));
1931 if (!dir_info)
1932 return -ENOMEM;
1933 dir_info->controller = must_copy_string(controller);
1934 dir_info->cgroup = must_copy_string(cgroup);
1935 dir_info->type = LXC_TYPE_CGDIR;
1936 dir_info->buf = NULL;
1937 dir_info->file = NULL;
1938 dir_info->buflen = 0;
1939
1940 fi->fh = (unsigned long)dir_info;
1941 return 0;
1942}
1943
1944int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1945 struct fuse_file_info *fi)
1946{
1947 struct file_info *d = (struct file_info *)fi->fh;
1948 struct cgfs_files **list = NULL;
1949 int i, ret;
1950 char *nextcg = NULL;
1951 struct fuse_context *fc = fuse_get_context();
1952 char **clist = NULL;
1953
d639f863
CB
1954 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1955 return -EIO;
1956
237e200e 1957 if (d->type != LXC_TYPE_CGDIR) {
b8defc3d 1958 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
237e200e
SH
1959 return -EIO;
1960 }
1961 if (!d->cgroup && !d->controller) {
1962 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1963 int i;
1964
1965 for (i = 0; i < num_hierarchies; i++) {
1966 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1967 return -EIO;
1968 }
1969 }
1970 return 0;
1971 }
1972
1973 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1974 // not a valid cgroup
1975 ret = -EINVAL;
1976 goto out;
1977 }
1978
1979 pid_t initpid = lookup_initpid_in_store(fc->pid);
1980 if (initpid <= 0)
1981 initpid = fc->pid;
1982 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1983 if (nextcg) {
1984 ret = filler(buf, nextcg, NULL, 0);
1985 free(nextcg);
1986 if (ret != 0) {
1987 ret = -EIO;
1988 goto out;
1989 }
1990 }
1991 ret = 0;
1992 goto out;
1993 }
1994
1995 for (i = 0; list[i]; i++) {
1996 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1997 ret = -EIO;
1998 goto out;
1999 }
2000 }
2001
2002 // now get the list of child cgroups
2003
2004 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2005 ret = 0;
2006 goto out;
2007 }
f366da65
WB
2008 if (clist) {
2009 for (i = 0; clist[i]; i++) {
2010 if (filler(buf, clist[i], NULL, 0) != 0) {
2011 ret = -EIO;
2012 goto out;
2013 }
237e200e
SH
2014 }
2015 }
2016 ret = 0;
2017
2018out:
2019 free_keys(list);
2020 if (clist) {
2021 for (i = 0; clist[i]; i++)
2022 free(clist[i]);
2023 free(clist);
2024 }
2025 return ret;
2026}
2027
43215927 2028static void do_release_file_info(struct fuse_file_info *fi)
237e200e 2029{
43215927
SH
2030 struct file_info *f = (struct file_info *)fi->fh;
2031
237e200e
SH
2032 if (!f)
2033 return;
43215927
SH
2034
2035 fi->fh = 0;
2036
237e200e 2037 free(f->controller);
43215927 2038 f->controller = NULL;
237e200e 2039 free(f->cgroup);
43215927 2040 f->cgroup = NULL;
237e200e 2041 free(f->file);
43215927 2042 f->file = NULL;
237e200e 2043 free(f->buf);
43215927 2044 f->buf = NULL;
237e200e
SH
2045 free(f);
2046}
2047
/*
 * FUSE releasedir for /cgroup directories: frees the file_info attached to
 * @fi by cg_opendir(). @path is not used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2053
2054int cg_open(const char *path, struct fuse_file_info *fi)
2055{
2056 const char *cgroup;
2057 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2058 struct cgfs_files *k = NULL;
2059 struct file_info *file_info;
2060 struct fuse_context *fc = fuse_get_context();
2061 int ret;
2062
2063 if (!fc)
2064 return -EIO;
2065
2066 controller = pick_controller_from_path(fc, path);
2067 if (!controller)
2f7036d0 2068 return -errno;
237e200e
SH
2069 cgroup = find_cgroup_in_path(path);
2070 if (!cgroup)
bc70ba9b 2071 return -errno;
237e200e
SH
2072
2073 get_cgdir_and_path(cgroup, &cgdir, &last);
2074 if (!last) {
2075 path1 = "/";
2076 path2 = cgdir;
2077 } else {
2078 path1 = cgdir;
2079 path2 = last;
2080 }
2081
2082 k = cgfs_get_key(controller, path1, path2);
2083 if (!k) {
2084 ret = -EINVAL;
2085 goto out;
2086 }
2087 free_key(k);
2088
2089 pid_t initpid = lookup_initpid_in_store(fc->pid);
2090 if (initpid <= 0)
2091 initpid = fc->pid;
2092 if (!caller_may_see_dir(initpid, controller, path1)) {
2093 ret = -ENOENT;
2094 goto out;
2095 }
2096 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
2097 ret = -EACCES;
2098 goto out;
2099 }
2100
2101 /* we'll free this at cg_release */
2102 file_info = malloc(sizeof(*file_info));
2103 if (!file_info) {
2104 ret = -ENOMEM;
2105 goto out;
2106 }
2107 file_info->controller = must_copy_string(controller);
2108 file_info->cgroup = must_copy_string(path1);
2109 file_info->file = must_copy_string(path2);
2110 file_info->type = LXC_TYPE_CGFILE;
2111 file_info->buf = NULL;
2112 file_info->buflen = 0;
2113
2114 fi->fh = (unsigned long)file_info;
2115 ret = 0;
2116
2117out:
2118 free(cgdir);
2119 return ret;
2120}
2121
bddbb106
SH
2122int cg_access(const char *path, int mode)
2123{
6f0f6b83 2124 int ret;
bddbb106 2125 const char *cgroup;
6f0f6b83
CB
2126 char *path1, *path2, *controller;
2127 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2128 struct cgfs_files *k = NULL;
2129 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2130
9873c5e8 2131 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2132 return 0;
bddbb106
SH
2133
2134 if (!fc)
2135 return -EIO;
2136
2137 controller = pick_controller_from_path(fc, path);
2138 if (!controller)
2f7036d0 2139 return -errno;
bddbb106 2140 cgroup = find_cgroup_in_path(path);
575316c4
SH
2141 if (!cgroup) {
2142 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2143 if ((mode & W_OK) == 0)
2144 return 0;
2145 return -EACCES;
575316c4 2146 }
bddbb106
SH
2147
2148 get_cgdir_and_path(cgroup, &cgdir, &last);
2149 if (!last) {
2150 path1 = "/";
2151 path2 = cgdir;
2152 } else {
2153 path1 = cgdir;
2154 path2 = last;
2155 }
2156
2157 k = cgfs_get_key(controller, path1, path2);
2158 if (!k) {
3f441bc7
SH
2159 if ((mode & W_OK) == 0)
2160 ret = 0;
2161 else
2162 ret = -EACCES;
bddbb106
SH
2163 goto out;
2164 }
2165 free_key(k);
2166
2167 pid_t initpid = lookup_initpid_in_store(fc->pid);
2168 if (initpid <= 0)
2169 initpid = fc->pid;
2170 if (!caller_may_see_dir(initpid, controller, path1)) {
2171 ret = -ENOENT;
2172 goto out;
2173 }
2174 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2175 ret = -EACCES;
2176 goto out;
2177 }
2178
2179 ret = 0;
2180
2181out:
2182 free(cgdir);
2183 return ret;
2184}
2185
237e200e
SH
/*
 * FUSE release for cgroup files: frees the file_info attached to @fi by
 * cg_open(). @path is not used. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2191
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events or @timeout
 * seconds have elapsed since the call started.
 *
 * Returns true if epoll_wait() reported an event. Returns false on timeout
 * (errno is 0 if the deadline had already passed before waiting) or on
 * error (errno is that of the failing epoll_wait()). Waits interrupted by a
 * signal are restarted with the remaining time.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/*
		 * The message used to contain "%m" inside a "%s" argument,
		 * which printed the two characters literally instead of the
		 * error text.
		 */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	/* Remaining whole seconds, in milliseconds, plus one. */
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may change errno; keep epoll_wait()'s value for the caller. */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2239
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most 2
 * seconds for the socket to become readable. Returns -1 if the wait fails
 * or times out, otherwise the result of the non-blocking recv().
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	return ready ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
}
2246
2247static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2248{
2249 struct msghdr msg = { 0 };
2250 struct iovec iov;
2251 struct cmsghdr *cmsg;
2252 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2253 char buf[1];
2254 buf[0] = 'p';
2255
2256 if (pingfirst) {
2257 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2258 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2259 return SEND_CREDS_FAIL;
2260 }
2261 }
2262
2263 msg.msg_control = cmsgbuf;
2264 msg.msg_controllen = sizeof(cmsgbuf);
2265
2266 cmsg = CMSG_FIRSTHDR(&msg);
2267 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2268 cmsg->cmsg_level = SOL_SOCKET;
2269 cmsg->cmsg_type = SCM_CREDENTIALS;
2270 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2271
2272 msg.msg_name = NULL;
2273 msg.msg_namelen = 0;
2274
2275 buf[0] = v;
2276 iov.iov_base = buf;
2277 iov.iov_len = sizeof(buf);
2278 msg.msg_iov = &iov;
2279 msg.msg_iovlen = 1;
2280
2281 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2282 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2283 if (errno == 3)
2284 return SEND_CREDS_NOTSK;
2285 return SEND_CREDS_FAIL;
2286 }
2287
2288 return SEND_CREDS_OK;
2289}
2290
2291static bool recv_creds(int sock, struct ucred *cred, char *v)
2292{
2293 struct msghdr msg = { 0 };
2294 struct iovec iov;
2295 struct cmsghdr *cmsg;
2296 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2297 char buf[1];
2298 int ret;
2299 int optval = 1;
2300
2301 *v = '1';
2302
2303 cred->pid = -1;
2304 cred->uid = -1;
2305 cred->gid = -1;
2306
2307 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
b8defc3d 2308 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
237e200e
SH
2309 return false;
2310 }
2311 buf[0] = '1';
2312 if (write(sock, buf, 1) != 1) {
b8defc3d 2313 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
237e200e
SH
2314 return false;
2315 }
2316
2317 msg.msg_name = NULL;
2318 msg.msg_namelen = 0;
2319 msg.msg_control = cmsgbuf;
2320 msg.msg_controllen = sizeof(cmsgbuf);
2321
2322 iov.iov_base = buf;
2323 iov.iov_len = sizeof(buf);
2324 msg.msg_iov = &iov;
2325 msg.msg_iovlen = 1;
2326
2327 if (!wait_for_sock(sock, 2)) {
b8defc3d 2328 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
237e200e
SH
2329 return false;
2330 }
2331 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2332 if (ret < 0) {
b8defc3d 2333 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
237e200e
SH
2334 return false;
2335 }
2336
2337 cmsg = CMSG_FIRSTHDR(&msg);
2338
2339 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2340 cmsg->cmsg_level == SOL_SOCKET &&
2341 cmsg->cmsg_type == SCM_CREDENTIALS) {
2342 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2343 }
2344 *v = buf[0];
2345
2346 return true;
2347}
2348
35174b0f
FG
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* Pipe pair: the wrapper closes [0] and writes its ACK to [1]. */
	int *cpipe;
	/* First argument forwarded to @wrapped. */
	int sock;
	/* Second argument forwarded to @wrapped. */
	pid_t tpid;
	/* Function run after the ACK: pid_from_ns or pid_to_ns. */
	int (*wrapped) (int, pid_t);
};
2355
2356/*
2357 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2358 * with clone(). This simply writes '1' as ACK back to the parent
2359 * before calling the actual wrapped function.
2360 */
2361static int pid_ns_clone_wrapper(void *arg) {
2362 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2363 char b = '1';
2364
2365 close(args->cpipe[0]);
b8defc3d
CB
2366 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2367 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2368 close(args->cpipe[1]);
2369 return args->wrapped(args->sock, args->tpid);
2370}
237e200e
SH
2371
2372/*
2373 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2374 * int value back over the socket. This shifts the pid from the
2375 * sender's pidns into tpid's pidns.
2376 */
35174b0f 2377static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2378{
2379 char v = '0';
2380 struct ucred cred;
2381
2382 while (recv_creds(sock, &cred, &v)) {
2383 if (v == '1')
35174b0f 2384 return 0;
237e200e 2385 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2386 return 1;
237e200e 2387 }
35174b0f 2388 return 0;
237e200e
SH
2389}
2390
35174b0f 2391
237e200e
SH
2392/*
2393 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2394 * in your old pidns. Only children which you clone will be in the target
2395 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2396 * actually convert pids.
2397 *
2398 * Note: glibc's fork() does not respect pidns, which can lead to failed
2399 * assertions inside glibc (and thus failed forks) if the child's pid in
2400 * the pidns and the parent pid outside are identical. Using clone prevents
2401 * this issue.
237e200e
SH
2402 */
2403static void pid_to_ns_wrapper(int sock, pid_t tpid)
2404{
2405 int newnsfd = -1, ret, cpipe[2];
2406 char fnam[100];
2407 pid_t cpid;
2408 char v;
2409
2410 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2411 if (ret < 0 || ret >= sizeof(fnam))
2412 _exit(1);
2413 newnsfd = open(fnam, O_RDONLY);
2414 if (newnsfd < 0)
2415 _exit(1);
2416 if (setns(newnsfd, 0) < 0)
2417 _exit(1);
2418 close(newnsfd);
2419
2420 if (pipe(cpipe) < 0)
2421 _exit(1);
2422
35174b0f
FG
2423 struct pid_ns_clone_args args = {
2424 .cpipe = cpipe,
2425 .sock = sock,
2426 .tpid = tpid,
2427 .wrapped = &pid_to_ns
2428 };
2429 size_t stack_size = sysconf(_SC_PAGESIZE);
2430 void *stack = alloca(stack_size);
2431
2432 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2433 if (cpid < 0)
2434 _exit(1);
2435
237e200e
SH
2436 // give the child 1 second to be done forking and
2437 // write its ack
2438 if (!wait_for_sock(cpipe[0], 1))
2439 _exit(1);
2440 ret = read(cpipe[0], &v, 1);
2441 if (ret != sizeof(char) || v != '1')
2442 _exit(1);
2443
2444 if (!wait_for_pid(cpid))
2445 _exit(1);
2446 _exit(0);
2447}
2448
2449/*
2450 * To read cgroup files with a particular pid, we will setns into the child
2451 * pidns, open a pipe, fork a child - which will be the first to really be in
2452 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2453 */
2454bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2455{
2456 int sock[2] = {-1, -1};
2457 char *tmpdata = NULL;
2458 int ret;
2459 pid_t qpid, cpid = -1;
2460 bool answer = false;
2461 char v = '0';
2462 struct ucred cred;
2463 size_t sz = 0, asz = 0;
2464
2465 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2466 return false;
2467
2468 /*
2469 * Now we read the pids from returned data one by one, pass
2470 * them into a child in the target namespace, read back the
2471 * translated pids, and put them into our to-return data
2472 */
2473
2474 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2475 perror("socketpair");
2476 free(tmpdata);
2477 return false;
2478 }
2479
2480 cpid = fork();
2481 if (cpid == -1)
2482 goto out;
2483
2484 if (!cpid) // child - exits when done
2485 pid_to_ns_wrapper(sock[1], tpid);
2486
2487 char *ptr = tmpdata;
2488 cred.uid = 0;
2489 cred.gid = 0;
2490 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2491 cred.pid = qpid;
2492 ret = send_creds(sock[0], &cred, v, true);
2493
2494 if (ret == SEND_CREDS_NOTSK)
2495 goto next;
2496 if (ret == SEND_CREDS_FAIL)
2497 goto out;
2498
2499 // read converted results
2500 if (!wait_for_sock(sock[0], 2)) {
b8defc3d 2501 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
237e200e
SH
2502 goto out;
2503 }
2504 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2505 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
237e200e
SH
2506 goto out;
2507 }
2508 must_strcat_pid(d, &sz, &asz, qpid);
2509next:
2510 ptr = strchr(ptr, '\n');
2511 if (!ptr)
2512 break;
2513 ptr++;
2514 }
2515
2516 cred.pid = getpid();
2517 v = '1';
2518 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2519 // failed to ask child to exit
b8defc3d 2520 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
237e200e
SH
2521 goto out;
2522 }
2523
2524 answer = true;
2525
2526out:
2527 free(tmpdata);
2528 if (cpid != -1)
2529 wait_for_pid(cpid);
2530 if (sock[0] != -1) {
2531 close(sock[0]);
2532 close(sock[1]);
2533 }
2534 return answer;
2535}
2536
2537int cg_read(const char *path, char *buf, size_t size, off_t offset,
2538 struct fuse_file_info *fi)
2539{
2540 struct fuse_context *fc = fuse_get_context();
2541 struct file_info *f = (struct file_info *)fi->fh;
2542 struct cgfs_files *k = NULL;
2543 char *data = NULL;
2544 int ret, s;
2545 bool r;
2546
2547 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2548 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
237e200e
SH
2549 return -EIO;
2550 }
2551
2552 if (offset)
2553 return 0;
2554
2555 if (!fc)
2556 return -EIO;
2557
2558 if (!f->controller)
2559 return -EINVAL;
2560
2561 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2562 return -EINVAL;
2563 }
2564 free_key(k);
2565
2566
888f8f3c 2567 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2568 ret = -EACCES;
2569 goto out;
2570 }
2571
2572 if (strcmp(f->file, "tasks") == 0 ||
2573 strcmp(f->file, "/tasks") == 0 ||
2574 strcmp(f->file, "/cgroup.procs") == 0 ||
2575 strcmp(f->file, "cgroup.procs") == 0)
2576 // special case - we have to translate the pids
2577 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2578 else
2579 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2580
2581 if (!r) {
2582 ret = -EINVAL;
2583 goto out;
2584 }
2585
2586 if (!data) {
2587 ret = 0;
2588 goto out;
2589 }
2590 s = strlen(data);
2591 if (s > size)
2592 s = size;
2593 memcpy(buf, data, s);
2594 if (s > 0 && s < size && data[s-1] != '\n')
2595 buf[s++] = '\n';
2596
2597 ret = s;
2598
2599out:
2600 free(data);
2601 return ret;
2602}
2603
35174b0f 2604static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2605{
2606 pid_t vpid;
2607 struct ucred cred;
2608 char v;
2609 int ret;
2610
2611 cred.uid = 0;
2612 cred.gid = 0;
2613 while (1) {
2614 if (!wait_for_sock(sock, 2)) {
b8defc3d 2615 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2616 return 1;
237e200e
SH
2617 }
2618 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2619 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2620 return 1;
237e200e
SH
2621 }
2622 if (vpid == -1) // done
2623 break;
2624 v = '0';
2625 cred.pid = vpid;
2626 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2627 v = '1';
2628 cred.pid = getpid();
2629 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2630 return 1;
237e200e
SH
2631 }
2632 }
35174b0f 2633 return 0;
237e200e
SH
2634}
2635
2636static void pid_from_ns_wrapper(int sock, pid_t tpid)
2637{
2638 int newnsfd = -1, ret, cpipe[2];
2639 char fnam[100];
2640 pid_t cpid;
2641 char v;
2642
2643 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2644 if (ret < 0 || ret >= sizeof(fnam))
2645 _exit(1);
2646 newnsfd = open(fnam, O_RDONLY);
2647 if (newnsfd < 0)
2648 _exit(1);
2649 if (setns(newnsfd, 0) < 0)
2650 _exit(1);
2651 close(newnsfd);
2652
2653 if (pipe(cpipe) < 0)
2654 _exit(1);
2655
35174b0f
FG
2656 struct pid_ns_clone_args args = {
2657 .cpipe = cpipe,
2658 .sock = sock,
2659 .tpid = tpid,
2660 .wrapped = &pid_from_ns
2661 };
f0f8b851
SH
2662 size_t stack_size = sysconf(_SC_PAGESIZE);
2663 void *stack = alloca(stack_size);
35174b0f
FG
2664
2665 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2666 if (cpid < 0)
2667 _exit(1);
2668
237e200e
SH
2669 // give the child 1 second to be done forking and
2670 // write its ack
2671 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2672 _exit(1);
237e200e 2673 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2674 if (ret != sizeof(char) || v != '1')
2675 _exit(1);
237e200e
SH
2676
2677 if (!wait_for_pid(cpid))
2678 _exit(1);
2679 _exit(0);
237e200e
SH
2680}
2681
/*
 * hostuid_to_ns - map host @uid into @pid's user namespace.
 *
 * Opens /proc/@pid/uid_map and stores the result of convert_id_to_ns() in
 * *@answer.
 *
 * Returns true when a mapping was found. Returns false when the map file
 * cannot be opened (*@answer untouched) or when the conversion yields -1
 * (*@answer then holds that -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);
	map = fopen(map_path, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
2703
/*
 * get_pid_creds: get the uid and gid of @pid from /proc/@pid/status.
 * The first number on the "Uid:" and "Gid:" lines is used.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are set to -1 up front and stay -1 when the status file
 * cannot be opened or the matching line is not found. A malformed line is
 * logged and stops the scan, leaving values parsed so far in place.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (!status) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			uid_t parsed;

			if (sscanf(line + 4, "%u", &parsed) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			gid_t parsed;

			if (sscanf(line + 4, "%u", &parsed) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed;
		}
	}

	fclose(status);
}
2742
/*
 * May the requestor @r (with host uid @r_uid) move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r_uid maps to root in @r's user namespace and @v's uid is mapped
 *     into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* The requestor has to be root inside its own namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid has to be visible from there. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2768
2769static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2770 const char *file, const char *buf)
2771{
2772 int sock[2] = {-1, -1};
2773 pid_t qpid, cpid = -1;
2774 FILE *pids_file = NULL;
2775 bool answer = false, fail = false;
2776
2777 pids_file = open_pids_file(contrl, cg);
2778 if (!pids_file)
2779 return false;
2780
2781 /*
2782 * write the pids to a socket, have helper in writer's pidns
2783 * call movepid for us
2784 */
2785 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2786 perror("socketpair");
2787 goto out;
2788 }
2789
2790 cpid = fork();
2791 if (cpid == -1)
2792 goto out;
2793
2794 if (!cpid) { // child
2795 fclose(pids_file);
2796 pid_from_ns_wrapper(sock[1], tpid);
2797 }
2798
2799 const char *ptr = buf;
2800 while (sscanf(ptr, "%d", &qpid) == 1) {
2801 struct ucred cred;
2802 char v;
2803
2804 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2805 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
237e200e
SH
2806 goto out;
2807 }
2808
2809 if (recv_creds(sock[0], &cred, &v)) {
2810 if (v == '0') {
2811 if (!may_move_pid(tpid, tuid, cred.pid)) {
2812 fail = true;
2813 break;
2814 }
2815 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2816 fail = true;
2817 }
2818 }
2819
2820 ptr = strchr(ptr, '\n');
2821 if (!ptr)
2822 break;
2823 ptr++;
2824 }
2825
2826 /* All good, write the value */
2827 qpid = -1;
2828 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
b8defc3d 2829 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
237e200e
SH
2830
2831 if (!fail)
2832 answer = true;
2833
2834out:
2835 if (cpid != -1)
2836 wait_for_pid(cpid);
2837 if (sock[0] != -1) {
2838 close(sock[0]);
2839 close(sock[1]);
2840 }
2841 if (pids_file) {
2842 if (fclose(pids_file) != 0)
2843 answer = false;
2844 }
2845 return answer;
2846}
2847
2848int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2849 struct fuse_file_info *fi)
2850{
2851 struct fuse_context *fc = fuse_get_context();
2852 char *localbuf = NULL;
2853 struct cgfs_files *k = NULL;
2854 struct file_info *f = (struct file_info *)fi->fh;
2855 bool r;
2856
2857 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2858 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
2859 return -EIO;
2860 }
2861
2862 if (offset)
2863 return 0;
2864
2865 if (!fc)
2866 return -EIO;
2867
2868 localbuf = alloca(size+1);
2869 localbuf[size] = '\0';
2870 memcpy(localbuf, buf, size);
2871
2872 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2873 size = -EINVAL;
2874 goto out;
2875 }
2876
2877 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2878 size = -EACCES;
2879 goto out;
2880 }
2881
2882 if (strcmp(f->file, "tasks") == 0 ||
2883 strcmp(f->file, "/tasks") == 0 ||
2884 strcmp(f->file, "/cgroup.procs") == 0 ||
2885 strcmp(f->file, "cgroup.procs") == 0)
2886 // special case - we have to translate the pids
2887 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2888 else
2889 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2890
2891 if (!r)
2892 size = -EINVAL;
2893
2894out:
2895 free_key(k);
2896 return size;
2897}
2898
2899int cg_chown(const char *path, uid_t uid, gid_t gid)
2900{
2901 struct fuse_context *fc = fuse_get_context();
2902 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2903 struct cgfs_files *k = NULL;
2904 const char *cgroup;
2905 int ret;
2906
2907 if (!fc)
2908 return -EIO;
2909
2910 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2911 return -EPERM;
237e200e
SH
2912
2913 controller = pick_controller_from_path(fc, path);
2914 if (!controller)
bc70ba9b
CB
2915 return errno == ENOENT ? -EPERM : -errno;
2916
237e200e
SH
2917 cgroup = find_cgroup_in_path(path);
2918 if (!cgroup)
2919 /* this is just /cgroup/controller */
bc70ba9b 2920 return -EPERM;
237e200e
SH
2921
2922 get_cgdir_and_path(cgroup, &cgdir, &last);
2923
2924 if (!last) {
2925 path1 = "/";
2926 path2 = cgdir;
2927 } else {
2928 path1 = cgdir;
2929 path2 = last;
2930 }
2931
2932 if (is_child_cgroup(controller, path1, path2)) {
2933 // get uid, gid, from '/tasks' file and make up a mode
2934 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2935 k = cgfs_get_key(controller, cgroup, "tasks");
2936
2937 } else
2938 k = cgfs_get_key(controller, path1, path2);
2939
2940 if (!k) {
2941 ret = -EINVAL;
2942 goto out;
2943 }
2944
2945 /*
2946 * This being a fuse request, the uid and gid must be valid
2947 * in the caller's namespace. So we can just check to make
2948 * sure that the caller is root in his uid, and privileged
2949 * over the file's current owner.
2950 */
2951 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2952 ret = -EACCES;
2953 goto out;
2954 }
2955
2956 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2957
2958out:
2959 free_key(k);
2960 free(cgdir);
2961
2962 return ret;
2963}
2964
2965int cg_chmod(const char *path, mode_t mode)
2966{
2967 struct fuse_context *fc = fuse_get_context();
2968 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2969 struct cgfs_files *k = NULL;
2970 const char *cgroup;
2971 int ret;
2972
2973 if (!fc)
2974 return -EIO;
2975
2976 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2977 return -EPERM;
237e200e
SH
2978
2979 controller = pick_controller_from_path(fc, path);
2980 if (!controller)
bc70ba9b
CB
2981 return errno == ENOENT ? -EPERM : -errno;
2982
237e200e
SH
2983 cgroup = find_cgroup_in_path(path);
2984 if (!cgroup)
2985 /* this is just /cgroup/controller */
bc70ba9b 2986 return -EPERM;
237e200e
SH
2987
2988 get_cgdir_and_path(cgroup, &cgdir, &last);
2989
2990 if (!last) {
2991 path1 = "/";
2992 path2 = cgdir;
2993 } else {
2994 path1 = cgdir;
2995 path2 = last;
2996 }
2997
2998 if (is_child_cgroup(controller, path1, path2)) {
2999 // get uid, gid, from '/tasks' file and make up a mode
3000 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3001 k = cgfs_get_key(controller, cgroup, "tasks");
3002
3003 } else
3004 k = cgfs_get_key(controller, path1, path2);
3005
3006 if (!k) {
3007 ret = -EINVAL;
3008 goto out;
3009 }
3010
3011 /*
3012 * This being a fuse request, the uid and gid must be valid
3013 * in the caller's namespace. So we can just check to make
3014 * sure that the caller is root in his uid, and privileged
3015 * over the file's current owner.
3016 */
3017 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3018 ret = -EPERM;
3019 goto out;
3020 }
3021
3022 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3023 ret = -EINVAL;
3024 goto out;
3025 }
3026
3027 ret = 0;
3028out:
3029 free_key(k);
3030 free(cgdir);
3031 return ret;
3032}
3033
3034int cg_mkdir(const char *path, mode_t mode)
3035{
3036 struct fuse_context *fc = fuse_get_context();
3037 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3038 const char *cgroup;
3039 int ret;
3040
3041 if (!fc)
3042 return -EIO;
3043
237e200e
SH
3044 controller = pick_controller_from_path(fc, path);
3045 if (!controller)
2f7036d0 3046 return errno == ENOENT ? -EPERM : -errno;
237e200e
SH
3047
3048 cgroup = find_cgroup_in_path(path);
3049 if (!cgroup)
bc70ba9b 3050 return -errno;
237e200e
SH
3051
3052 get_cgdir_and_path(cgroup, &cgdir, &last);
3053 if (!last)
3054 path1 = "/";
3055 else
3056 path1 = cgdir;
3057
3058 pid_t initpid = lookup_initpid_in_store(fc->pid);
3059 if (initpid <= 0)
3060 initpid = fc->pid;
3061 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3062 if (!next)
3063 ret = -EINVAL;
3064 else if (last && strcmp(next, last) == 0)
3065 ret = -EEXIST;
3066 else
2f7036d0 3067 ret = -EPERM;
237e200e
SH
3068 goto out;
3069 }
3070
3071 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3072 ret = -EACCES;
3073 goto out;
3074 }
3075 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3076 ret = -EACCES;
3077 goto out;
3078 }
3079
3080 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3081
3082out:
3083 free(cgdir);
3084 free(next);
3085 return ret;
3086}
3087
3088int cg_rmdir(const char *path)
3089{
3090 struct fuse_context *fc = fuse_get_context();
3091 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3092 const char *cgroup;
3093 int ret;
3094
3095 if (!fc)
3096 return -EIO;
3097
3098 controller = pick_controller_from_path(fc, path);
e254948f
CB
3099 if (!controller) /* Someone's trying to delete "/cgroup". */
3100 return -EPERM;
237e200e
SH
3101
3102 cgroup = find_cgroup_in_path(path);
e254948f
CB
3103 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3104 return -EPERM;
237e200e
SH
3105
3106 get_cgdir_and_path(cgroup, &cgdir, &last);
3107 if (!last) {
e254948f
CB
3108 /* Someone's trying to delete a cgroup on the same level as the
3109 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3110 * rmdir "/cgroup/blkio/init.slice".
3111 */
3112 ret = -EPERM;
237e200e
SH
3113 goto out;
3114 }
3115
3116 pid_t initpid = lookup_initpid_in_store(fc->pid);
3117 if (initpid <= 0)
3118 initpid = fc->pid;
3119 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
de77249b 3120 if (!last || (next && (strcmp(next, last) == 0)))
237e200e
SH
3121 ret = -EBUSY;
3122 else
3123 ret = -ENOENT;
3124 goto out;
3125 }
3126
3127 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3128 ret = -EACCES;
3129 goto out;
3130 }
3131 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3132 ret = -EACCES;
3133 goto out;
3134 }
3135
3136 if (!cgfs_remove(controller, cgroup)) {
3137 ret = -EINVAL;
3138 goto out;
3139 }
3140
3141 ret = 0;
3142
3143out:
3144 free(cgdir);
3145 free(next);
3146 return ret;
3147}
3148
/* Return true when @line begins with @pref (an empty @pref always matches). */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3155
c6095b08
SH
/*
 * parse_memstat - pick selected "total_*" counters out of @memstat.
 *
 * @memstat is scanned line by line. For each line starting with one of the
 * known keys, the number following the key is parsed into the matching
 * output argument and then divided by 1024. Outputs whose key never appears
 * are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable)
{
	/* Checked in this order; the first matching key wins. */
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached },
		{ "total_active_anon",   active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);

	while (*memstat) {
		char *eol;

		for (size_t i = 0; i < nfields; i++) {
			size_t keylen = strlen(fields[i].key);

			if (strncmp(memstat, fields[i].key, keylen) != 0)
				continue;

			sscanf(memstat + keylen, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3189
/*
 * get_blkio_io_value - look up one counter in blkio-style text.
 *
 * Builds the key "<major>:<minor> <iotype>" and scans @str line by line for
 * the first line starting with it. The number following the key is stored in
 * *@v; *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (const char *line = str; *line; ) {
		const char *eol;

		if (strncmp(line, key, keylen) == 0) {
			sscanf(line + keylen, "%lu", v);
			return;
		}

		eol = strchr(line, '\n');
		if (!eol)
			return;
		line = eol + 1;
	}
}
3212
3213static int read_file(const char *path, char *buf, size_t size,
3214 struct file_info *d)
3215{
3216 size_t linelen = 0, total_len = 0, rv = 0;
3217 char *line = NULL;
3218 char *cache = d->buf;
3219 size_t cache_size = d->buflen;
3220 FILE *f = fopen(path, "r");
3221 if (!f)
3222 return 0;
3223
3224 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3225 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3226 if (l < 0) {
3227 perror("Error writing to cache");
3228 rv = 0;
3229 goto err;
3230 }
3231 if (l >= cache_size) {
b8defc3d 3232 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3233 rv = 0;
3234 goto err;
3235 }
3236 cache += l;
3237 cache_size -= l;
3238 total_len += l;
3239 }
3240
3241 d->size = total_len;
a262ddb7
CB
3242 if (total_len > size)
3243 total_len = size;
237e200e
SH
3244
3245 /* read from off 0 */
3246 memcpy(buf, d->buf, total_len);
3247 rv = total_len;
3248 err:
3249 fclose(f);
3250 free(line);
3251 return rv;
3252}
3253
3254/*
3255 * FUSE ops for /proc
3256 */
3257
/*
 * get_memlimit - read @file of the "memory" controller for @cgroup and parse
 * it as a base-10 unsigned long.
 *
 * Returns the parsed value, or (unsigned long)-1 when the file could not be
 * read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *text = NULL;
	unsigned long limit = (unsigned long)-1;

	if (cgfs_get_value("memory", cgroup, file, &text))
		limit = strtoul(text, NULL, 10);

	free(text);
	return limit;
}
3270
/*
 * get_min_memlimit - smallest value of @file found for @cgroup and its
 * ancestors.
 *
 * Starts with get_memlimit() for @cgroup itself, then walks up with
 * dirname() until "/" has been visited, keeping the lowest value seen.
 * Ancestors for which get_memlimit() reports -1 are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	/* dirname() may modify its argument, so work on a stack copy. */
	char *dir = strdupa(cgroup);
	unsigned long lowest = get_memlimit(dir, file);

	while (strcmp(dir, "/") != 0) {
		unsigned long limit;

		dir = dirname(dir);
		limit = get_memlimit(dir, file);
		if (limit != -1 && limit < lowest)
			lowest = limit;
	}

	return lowest;
}
3287
3288static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3289 struct fuse_file_info *fi)
3290{
3291 struct fuse_context *fc = fuse_get_context();
3292 struct file_info *d = (struct file_info *)fi->fh;
3293 char *cg;
3294 char *memusage_str = NULL, *memstat_str = NULL,
018246ff 3295 *memswlimit_str = NULL, *memswusage_str = NULL;
237e200e 3296 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08 3297 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
594a10e6
WB
3298 active_file = 0, inactive_file = 0, unevictable = 0,
3299 hostswtotal = 0;
237e200e
SH
3300 char *line = NULL;
3301 size_t linelen = 0, total_len = 0, rv = 0;
3302 char *cache = d->buf;
3303 size_t cache_size = d->buflen;
3304 FILE *f = NULL;
3305
3306 if (offset){
3307 if (offset > d->size)
3308 return -EINVAL;
3309 if (!d->cached)
3310 return 0;
3311 int left = d->size - offset;
3312 total_len = left > size ? size: left;
3313 memcpy(buf, cache + offset, total_len);
3314 return total_len;
3315 }
3316
3317 pid_t initpid = lookup_initpid_in_store(fc->pid);
3318 if (initpid <= 0)
3319 initpid = fc->pid;
3320 cg = get_pid_cgroup(initpid, "memory");
3321 if (!cg)
3322 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3323 prune_init_slice(cg);
237e200e 3324
018246ff 3325 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
237e200e
SH
3326 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3327 goto err;
3328 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3329 goto err;
3330
3331 // Following values are allowed to fail, because swapaccount might be turned
3332 // off for current kernel
3333 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3334 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3335 {
018246ff 3336 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
237e200e
SH
3337 memswusage = strtoul(memswusage_str, NULL, 10);
3338
237e200e
SH
3339 memswlimit = memswlimit / 1024;
3340 memswusage = memswusage / 1024;
3341 }
3342
3343 memusage = strtoul(memusage_str, NULL, 10);
3344 memlimit /= 1024;
3345 memusage /= 1024;
3346
c6095b08
SH
3347 parse_memstat(memstat_str, &cached, &active_anon,
3348 &inactive_anon, &active_file, &inactive_file,
3349 &unevictable);
237e200e
SH
3350
3351 f = fopen("/proc/meminfo", "r");
3352 if (!f)
3353 goto err;
3354
3355 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3356 ssize_t l;
237e200e
SH
3357 char *printme, lbuf[100];
3358
3359 memset(lbuf, 0, 100);
3360 if (startswith(line, "MemTotal:")) {
594a10e6 3361 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3362 if (hosttotal < memlimit)
3363 memlimit = hosttotal;
3364 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3365 printme = lbuf;
3366 } else if (startswith(line, "MemFree:")) {
3367 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3368 printme = lbuf;
3369 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3370 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e
SH
3371 printme = lbuf;
3372 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
594a10e6 3373 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3374 if (hostswtotal < memswlimit)
3375 memswlimit = hostswtotal;
3376 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e
SH
3377 printme = lbuf;
3378 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
4127e51b 3379 unsigned long swaptotal = memswlimit,
b4665ce0
SH
3380 swapusage = memswusage - memusage,
3381 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3382 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3383 printme = lbuf;
da35d72a
SH
3384 } else if (startswith(line, "Slab:")) {
3385 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3386 printme = lbuf;
237e200e
SH
3387 } else if (startswith(line, "Buffers:")) {
3388 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3389 printme = lbuf;
3390 } else if (startswith(line, "Cached:")) {
3391 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3392 printme = lbuf;
3393 } else if (startswith(line, "SwapCached:")) {
3394 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3395 printme = lbuf;
2f306ad3 3396 } else if (startswith(line, "Active:")) {
c6095b08
SH
3397 snprintf(lbuf, 100, "Active: %8lu kB\n",
3398 active_anon + active_file);
3399 printme = lbuf;
2f306ad3 3400 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3401 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3402 inactive_anon + inactive_file);
3403 printme = lbuf;
3404 } else if (startswith(line, "Active(anon)")) {
3405 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3406 printme = lbuf;
3407 } else if (startswith(line, "Inactive(anon)")) {
3408 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3409 printme = lbuf;
3410 } else if (startswith(line, "Active(file)")) {
3411 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3412 printme = lbuf;
3413 } else if (startswith(line, "Inactive(file)")) {
3414 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3415 printme = lbuf;
3416 } else if (startswith(line, "Unevictable")) {
3417 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3418 printme = lbuf;
3419 } else if (startswith(line, "SReclaimable")) {
3420 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3421 printme = lbuf;
3422 } else if (startswith(line, "SUnreclaim")) {
3423 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3424 printme = lbuf;
237e200e
SH
3425 } else
3426 printme = line;
3427
3428 l = snprintf(cache, cache_size, "%s", printme);
3429 if (l < 0) {
3430 perror("Error writing to cache");
3431 rv = 0;
3432 goto err;
3433
3434 }
3435 if (l >= cache_size) {
b8defc3d 3436 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3437 rv = 0;
3438 goto err;
3439 }
3440
3441 cache += l;
3442 cache_size -= l;
3443 total_len += l;
3444 }
3445
3446 d->cached = 1;
3447 d->size = total_len;
3448 if (total_len > size ) total_len = size;
3449 memcpy(buf, d->buf, total_len);
3450
3451 rv = total_len;
3452err:
3453 if (f)
3454 fclose(f);
3455 free(line);
3456 free(cg);
3457 free(memusage_str);
3458 free(memswlimit_str);
3459 free(memswusage_str);
3460 free(memstat_str);
237e200e
SH
3461 return rv;
3462}
3463
/*
 * Look up the "cpuset.cpus" value of cgroup @cg in the cpuset controller.
 *
 * On success the string produced by cgfs_get_value() is returned; it is
 * newly allocated and the caller must free() it. NULL is returned when the
 * lookup fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3476
3477bool cpu_in_cpuset(int cpu, const char *cpuset);
3478
/*
 * Parse the CPU number out of a "processor : N" line and report whether that
 * CPU is a member of @cpuset. A line that does not match yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_nr;

	if (sscanf(line, "processor : %d", &cpu_nr) == 1)
		return cpu_in_cpuset(cpu_nr, cpuset);

	return false;
}
3487
/*
 * Tell whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. whether an integer can be parsed after the "processor :" prefix.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
3499
3500static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3501 struct fuse_file_info *fi)
3502{
3503 struct fuse_context *fc = fuse_get_context();
3504 struct file_info *d = (struct file_info *)fi->fh;
3505 char *cg;
3506 char *cpuset = NULL;
3507 char *line = NULL;
3508 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79
SH
3509 bool am_printing = false, firstline = true, is_s390x = false;
3510 int curcpu = -1, cpu;
237e200e
SH
3511 char *cache = d->buf;
3512 size_t cache_size = d->buflen;
3513 FILE *f = NULL;
3514
3515 if (offset){
3516 if (offset > d->size)
3517 return -EINVAL;
3518 if (!d->cached)
3519 return 0;
3520 int left = d->size - offset;
3521 total_len = left > size ? size: left;
3522 memcpy(buf, cache + offset, total_len);
3523 return total_len;
3524 }
3525
3526 pid_t initpid = lookup_initpid_in_store(fc->pid);
3527 if (initpid <= 0)
3528 initpid = fc->pid;
3529 cg = get_pid_cgroup(initpid, "cpuset");
3530 if (!cg)
3531 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3532 prune_init_slice(cg);
237e200e
SH
3533
3534 cpuset = get_cpuset(cg);
3535 if (!cpuset)
3536 goto err;
3537
3538 f = fopen("/proc/cpuinfo", "r");
3539 if (!f)
3540 goto err;
3541
3542 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3543 ssize_t l;
f676eb79
SH
3544 if (firstline) {
3545 firstline = false;
3546 if (strstr(line, "IBM/S390") != NULL) {
3547 is_s390x = true;
3548 am_printing = true;
5ed9d4e2 3549 continue;
f676eb79
SH
3550 }
3551 }
5ed9d4e2
SH
3552 if (strncmp(line, "# processors:", 12) == 0)
3553 continue;
237e200e
SH
3554 if (is_processor_line(line)) {
3555 am_printing = cpuline_in_cpuset(line, cpuset);
3556 if (am_printing) {
3557 curcpu ++;
3558 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3559 if (l < 0) {
3560 perror("Error writing to cache");
3561 rv = 0;
3562 goto err;
3563 }
3564 if (l >= cache_size) {
b8defc3d 3565 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3566 rv = 0;
3567 goto err;
3568 }
3569 cache += l;
3570 cache_size -= l;
3571 total_len += l;
3572 }
3573 continue;
f676eb79
SH
3574 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3575 char *p;
3576 if (!cpu_in_cpuset(cpu, cpuset))
3577 continue;
3578 curcpu ++;
3579 p = strchr(line, ':');
3580 if (!p || !*p)
3581 goto err;
3582 p++;
5ed9d4e2 3583 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3584 if (l < 0) {
3585 perror("Error writing to cache");
3586 rv = 0;
3587 goto err;
3588 }
3589 if (l >= cache_size) {
b8defc3d 3590 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
f676eb79
SH
3591 rv = 0;
3592 goto err;
3593 }
3594 cache += l;
3595 cache_size -= l;
3596 total_len += l;
3597 continue;
3598
237e200e
SH
3599 }
3600 if (am_printing) {
3601 l = snprintf(cache, cache_size, "%s", line);
3602 if (l < 0) {
3603 perror("Error writing to cache");
3604 rv = 0;
3605 goto err;
3606 }
3607 if (l >= cache_size) {
b8defc3d 3608 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3609 rv = 0;
3610 goto err;
3611 }
3612 cache += l;
3613 cache_size -= l;
3614 total_len += l;
3615 }
3616 }
3617
5ed9d4e2
SH
3618 if (is_s390x) {
3619 char *origcache = d->buf;
a262ddb7 3620 ssize_t l;
5ed9d4e2
SH
3621 do {
3622 d->buf = malloc(d->buflen);
3623 } while (!d->buf);
3624 cache = d->buf;
3625 cache_size = d->buflen;
3626 total_len = 0;
3627 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3628 if (l < 0 || l >= cache_size) {
3629 free(origcache);
3630 goto err;
3631 }
3632 cache_size -= l;
3633 cache += l;
3634 total_len += l;
3635 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3636 if (l < 0 || l >= cache_size) {
3637 free(origcache);
3638 goto err;
3639 }
3640 cache_size -= l;
3641 cache += l;
3642 total_len += l;
3643 l = snprintf(cache, cache_size, "%s", origcache);
3644 free(origcache);
3645 if (l < 0 || l >= cache_size)
3646 goto err;
3647 total_len += l;
3648 }
3649
237e200e
SH
3650 d->cached = 1;
3651 d->size = total_len;
3652 if (total_len > size ) total_len = size;
3653
3654 /* read from off 0 */
3655 memcpy(buf, d->buf, total_len);
3656 rv = total_len;
3657err:
3658 if (f)
3659 fclose(f);
3660 free(line);
3661 free(cpuset);
3662 free(cg);
3663 return rv;
3664}
3665
0ecddf02 3666static uint64_t get_reaper_start_time(pid_t pid)
9ac264cf 3667{
9ac264cf 3668 int ret;
0ecddf02
CB
3669 FILE *f;
3670 uint64_t starttime;
3671 /* strlen("/proc/") = 6
3672 * +
3673 * LXCFS_NUMSTRLEN64
3674 * +
3675 * strlen("/stat") = 5
3676 * +
3677 * \0 = 1
3678 * */
3679#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3680 char path[__PROC_PID_STAT_LEN];
9ac264cf
JB
3681 pid_t qpid;
3682
3683 qpid = lookup_initpid_in_store(pid);
0ecddf02
CB
3684 if (qpid <= 0) {
3685 /* Caller can check for EINVAL on 0. */
3686 errno = EINVAL;
9ac264cf 3687 return 0;
0ecddf02 3688 }
9ac264cf 3689
0ecddf02
CB
3690 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3691 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3692 /* Caller can check for EINVAL on 0. */
3693 errno = EINVAL;
9ac264cf 3694 return 0;
0ecddf02 3695 }
9ac264cf 3696
0ecddf02
CB
3697 f = fopen(path, "r");
3698 if (!f) {
3699 /* Caller can check for EINVAL on 0. */
3700 errno = EINVAL;
9ac264cf 3701 return 0;
0ecddf02 3702 }
9ac264cf 3703
0ecddf02
CB
3704 /* Note that the *scanf() argument supression requires that length
3705 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3706 * at us. It's like telling someone you're not married and then asking
3707 * if you can bring your wife to the party.
3708 */
3709 ret = fscanf(f, "%*d " /* (1) pid %d */
3710 "%*s " /* (2) comm %s */
3711 "%*c " /* (3) state %c */
3712 "%*d " /* (4) ppid %d */
3713 "%*d " /* (5) pgrp %d */
3714 "%*d " /* (6) session %d */
3715 "%*d " /* (7) tty_nr %d */
3716 "%*d " /* (8) tpgid %d */
3717 "%*u " /* (9) flags %u */
3718 "%*u " /* (10) minflt %lu */
3719 "%*u " /* (11) cminflt %lu */
3720 "%*u " /* (12) majflt %lu */
3721 "%*u " /* (13) cmajflt %lu */
3722 "%*u " /* (14) utime %lu */
3723 "%*u " /* (15) stime %lu */
3724 "%*d " /* (16) cutime %ld */
3725 "%*d " /* (17) cstime %ld */
3726 "%*d " /* (18) priority %ld */
3727 "%*d " /* (19) nice %ld */
3728 "%*d " /* (20) num_threads %ld */
3729 "%*d " /* (21) itrealvalue %ld */
3730 "%" PRIu64, /* (22) starttime %llu */
3731 &starttime);
3732 if (ret != 1) {
3733 fclose(f);
3734 /* Caller can check for EINVAL on 0. */
3735 errno = EINVAL;
3736 return 0;
3737 }
3738
3739 fclose(f);
3740
3741 errno = 0;
3742 return starttime;
3743}
3744
/*
 * Return the start time of the reaper of @pid converted from clock ticks to
 * whole seconds, or 0 when either the start time or the tick rate cannot be
 * determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/*
	 * Reject every non-positive value, not only the errno == EINVAL
	 * case: a zero or negative tick rate must never reach the division.
	 */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3766
3767static uint64_t get_reaper_age(pid_t pid)
3768{
3769 uint64_t procstart, uptime, procage;
3770
3771 /* We need to substract the time the process has started since system
3772 * boot minus the time when the system has started to get the actual
3773 * reaper age.
3774 */
3775 procstart = get_reaper_start_time_in_sec(pid);
3776 procage = procstart;
3777 if (procstart > 0) {
3778 int ret;
3779 struct timespec spec;
3780
3781 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3782 if (ret < 0)
3783 return 0;
3784 /* We could make this more precise here by using the tv_nsec
3785 * field in the timespec struct and convert it to milliseconds
3786 * and then create a double for the seconds and milliseconds but
3787 * that seems more work than it is worth.
3788 */
3789 uptime = spec.tv_sec;
3790 procage = uptime - procstart;
3791 }
3792
3793 return procage;
3794}
3795
f34de69a 3796#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
3797static int proc_stat_read(char *buf, size_t size, off_t offset,
3798 struct fuse_file_info *fi)
3799{
3800 struct fuse_context *fc = fuse_get_context();
3801 struct file_info *d = (struct file_info *)fi->fh;
3802 char *cg;
3803 char *cpuset = NULL;
3804 char *line = NULL;
3805 size_t linelen = 0, total_len = 0, rv = 0;
3806 int curcpu = -1; /* cpu numbering starts at 0 */
7144f069 3807 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
237e200e 3808 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
7144f069 3809 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
237e200e
SH
3810 char cpuall[CPUALL_MAX_SIZE];
3811 /* reserve for cpu all */
3812 char *cache = d->buf + CPUALL_MAX_SIZE;
3813 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3814 FILE *f = NULL;
3815
3816 if (offset){
3817 if (offset > d->size)
3818 return -EINVAL;
3819 if (!d->cached)
3820 return 0;
3821 int left = d->size - offset;
3822 total_len = left > size ? size: left;
3823 memcpy(buf, d->buf + offset, total_len);
3824 return total_len;
3825 }
3826
3827 pid_t initpid = lookup_initpid_in_store(fc->pid);
3828 if (initpid <= 0)
3829 initpid = fc->pid;
3830 cg = get_pid_cgroup(initpid, "cpuset");
3831 if (!cg)
3832 return read_file("/proc/stat", buf, size, d);
6d2f6996 3833 prune_init_slice(cg);
237e200e
SH
3834
3835 cpuset = get_cpuset(cg);
3836 if (!cpuset)
3837 goto err;
3838
3839 f = fopen("/proc/stat", "r");
3840 if (!f)
3841 goto err;
3842
3843 //skip first line
3844 if (getline(&line, &linelen, f) < 0) {
b8defc3d 3845 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
237e200e
SH
3846 goto err;
3847 }
3848
3849 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3850 ssize_t l;
237e200e
SH
3851 int cpu;
3852 char cpu_char[10]; /* That's a lot of cores */
3853 char *c;
3854
b4665ce0
SH
3855 if (strlen(line) == 0)
3856 continue;
237e200e
SH
3857 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3858 /* not a ^cpuN line containing a number N, just print it */
9502bae2 3859 l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3860 if (l < 0) {
3861 perror("Error writing to cache");
3862 rv = 0;
3863 goto err;
3864 }
3865 if (l >= cache_size) {
b8defc3d 3866 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3867 rv = 0;
3868 goto err;
3869 }
3870 cache += l;
3871 cache_size -= l;
3872 total_len += l;
3873 continue;
3874 }
3875
3876 if (sscanf(cpu_char, "%d", &cpu) != 1)
3877 continue;
3878 if (!cpu_in_cpuset(cpu, cpuset))
3879 continue;
3880 curcpu ++;
3881
3882 c = strchr(line, ' ');
3883 if (!c)
3884 continue;
3885 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3886 if (l < 0) {
3887 perror("Error writing to cache");
3888 rv = 0;
3889 goto err;
3890
3891 }
3892 if (l >= cache_size) {
b8defc3d 3893 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3894 rv = 0;
3895 goto err;
3896 }
3897
3898 cache += l;
3899 cache_size -= l;
3900 total_len += l;
3901
7144f069
CB
3902 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3903 &user,
3904 &nice,
3905 &system,
3906 &idle,
3907 &iowait,
3908 &irq,
3909 &softirq,
3910 &steal,
3911 &guest,
3912 &guest_nice) != 10)
237e200e
SH
3913 continue;
3914 user_sum += user;
3915 nice_sum += nice;
3916 system_sum += system;
3917 idle_sum += idle;
3918 iowait_sum += iowait;
3919 irq_sum += irq;
3920 softirq_sum += softirq;
3921 steal_sum += steal;
3922 guest_sum += guest;
7144f069 3923 guest_nice_sum += guest_nice;
237e200e
SH
3924 }
3925
3926 cache = d->buf;
3927
7144f069
CB
3928 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3929 user_sum,
3930 nice_sum,
3931 system_sum,
3932 idle_sum,
3933 iowait_sum,
3934 irq_sum,
3935 softirq_sum,
3936 steal_sum,
3937 guest_sum,
3938 guest_nice_sum);
3939 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
237e200e
SH
3940 memcpy(cache, cpuall, cpuall_len);
3941 cache += cpuall_len;
7144f069 3942 } else {
237e200e 3943 /* shouldn't happen */
b8defc3d 3944 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
237e200e
SH
3945 cpuall_len = 0;
3946 }
3947
3948 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3949 total_len += cpuall_len;
3950 d->cached = 1;
3951 d->size = total_len;
7144f069
CB
3952 if (total_len > size)
3953 total_len = size;
237e200e
SH
3954
3955 memcpy(buf, d->buf, total_len);
3956 rv = total_len;
3957
3958err:
3959 if (f)
3960 fclose(f);
3961 free(line);
3962 free(cpuset);
3963 free(cg);
3964 return rv;
3965}
3966
0ecddf02
CB
/*
 * Return the busy time of the group of tasks @task belongs to, derived from
 * the cpuacct.usage value of the cgroup of its init pid divided by
 * 1000000000 (presumably nanoseconds to seconds -- TODO confirm the unit).
 *
 * This only gives a meaningful answer when the container has been given its
 * own cpuacct cgroup. Otherwise the busy time of tasks that do not belong to
 * the container is counted as well. If someone has a clever solution for
 * this, please send a patch!
 *
 * Returns 0 when the init pid, the cgroup or the usage value is unavailable.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	unsigned long busy = 0;
	char *cg_path, *raw_usage = NULL;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cg_path = get_pid_cgroup(reaper, "cpuacct");
	if (cg_path) {
		prune_init_slice(cg_path);
		if (cgfs_get_value("cpuacct", cg_path, "cpuacct.usage", &raw_usage))
			busy = strtoul(raw_usage, NULL, 10) / 1000000000;
	}

	free(cg_path);
	free(raw_usage);
	return busy;
}
3997
#if RELOADTEST
/*
 * Create (or truncate) the marker file /tmp/lxcfs-iwashere. Only built when
 * RELOADTEST is set; failures are silently ignored.
 */
void iwashere(void)
{
	/* Same flags creat() uses: write-only, create, truncate. */
	int marker = open("/tmp/lxcfs-iwashere", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (marker < 0)
		return;

	close(marker);
}
#endif
4008
4009/*
4010 * We read /proc/uptime and reuse its second field.
4011 * For the first field, we use the mtime for the reaper for
4012 * the calling pid as returned by getreaperage
4013 */
4014static int proc_uptime_read(char *buf, size_t size, off_t offset,
4015 struct fuse_file_info *fi)
4016{
4017 struct fuse_context *fc = fuse_get_context();
4018 struct file_info *d = (struct file_info *)fi->fh;
0ecddf02 4019 unsigned long int busytime = get_reaper_busy(fc->pid);
237e200e 4020 char *cache = d->buf;
a262ddb7 4021 ssize_t total_len = 0;
0ecddf02 4022 uint64_t idletime, reaperage;
237e200e
SH
4023
4024#if RELOADTEST
4025 iwashere();
4026#endif
4027
4028 if (offset){
237e200e
SH
4029 if (!d->cached)
4030 return 0;
bbdf646b
BM
4031 if (offset > d->size)
4032 return -EINVAL;
237e200e
SH
4033 int left = d->size - offset;
4034 total_len = left > size ? size: left;
4035 memcpy(buf, cache + offset, total_len);
4036 return total_len;
4037 }
4038
0ecddf02
CB
4039 reaperage = get_reaper_age(fc->pid);
4040 /* To understand why this is done, please read the comment to the
4041 * get_reaper_busy() function.
4042 */
4043 idletime = reaperage;
4044 if (reaperage >= busytime)
4045 idletime = reaperage - busytime;
237e200e 4046
bbdf646b
BM
4047 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4048 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 4049 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
4050 return 0;
4051 }
4052
4053 d->size = (int)total_len;
4054 d->cached = 1;
4055
4056 if (total_len > size) total_len = size;
4057
4058 memcpy(buf, d->buf, total_len);
4059 return total_len;
4060}
4061
4062static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4063 struct fuse_file_info *fi)
4064{
4065 char dev_name[72];
4066 struct fuse_context *fc = fuse_get_context();
4067 struct file_info *d = (struct file_info *)fi->fh;
4068 char *cg;
4069 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4070 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4071 unsigned long read = 0, write = 0;
4072 unsigned long read_merged = 0, write_merged = 0;
4073 unsigned long read_sectors = 0, write_sectors = 0;
4074 unsigned long read_ticks = 0, write_ticks = 0;
4075 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4076 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4077 char *cache = d->buf;
4078 size_t cache_size = d->buflen;
4079 char *line = NULL;
4080 size_t linelen = 0, total_len = 0, rv = 0;
4081 unsigned int major = 0, minor = 0;
4082 int i = 0;
4083 FILE *f = NULL;
4084
4085 if (offset){
4086 if (offset > d->size)
4087 return -EINVAL;
4088 if (!d->cached)
4089 return 0;
4090 int left = d->size - offset;
4091 total_len = left > size ? size: left;
4092 memcpy(buf, cache + offset, total_len);
4093 return total_len;
4094 }
4095
4096 pid_t initpid = lookup_initpid_in_store(fc->pid);
4097 if (initpid <= 0)
4098 initpid = fc->pid;
4099 cg = get_pid_cgroup(initpid, "blkio");
4100 if (!cg)
4101 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 4102 prune_init_slice(cg);
237e200e 4103
2209fe50 4104 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 4105 goto err;
2209fe50 4106 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 4107 goto err;
2209fe50 4108 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 4109 goto err;
2209fe50 4110 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 4111 goto err;
2209fe50 4112 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
4113 goto err;
4114
4115
4116 f = fopen("/proc/diskstats", "r");
4117 if (!f)
4118 goto err;
4119
4120 while (getline(&line, &linelen, f) != -1) {
a262ddb7 4121 ssize_t l;
2209fe50 4122 char lbuf[256];
237e200e
SH
4123
4124 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 4125 if (i != 3)
237e200e 4126 continue;
2209fe50
SH
4127
4128 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4129 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4130 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4131 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4132 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4133 read_sectors = read_sectors/512;
4134 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4135 write_sectors = write_sectors/512;
4136
4137 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4138 rd_svctm = rd_svctm/1000000;
4139 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4140 rd_wait = rd_wait/1000000;
4141 read_ticks = rd_svctm + rd_wait;
4142
4143 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4144 wr_svctm = wr_svctm/1000000;
4145 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4146 wr_wait = wr_wait/1000000;
4147 write_ticks = wr_svctm + wr_wait;
4148
4149 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4150 tot_ticks = tot_ticks/1000000;
237e200e
SH
4151
4152 memset(lbuf, 0, 256);
2db31eb6
SH
4153 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4154 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4155 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4156 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4157 else
4158 continue;
237e200e 4159
2209fe50 4160 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
4161 if (l < 0) {
4162 perror("Error writing to fuse buf");
4163 rv = 0;
4164 goto err;
4165 }
4166 if (l >= cache_size) {
b8defc3d 4167 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
4168 rv = 0;
4169 goto err;
4170 }
4171 cache += l;
4172 cache_size -= l;
4173 total_len += l;
4174 }
4175
4176 d->cached = 1;
4177 d->size = total_len;
4178 if (total_len > size ) total_len = size;
4179 memcpy(buf, d->buf, total_len);
4180
4181 rv = total_len;
4182err:
4183 free(cg);
4184 if (f)
4185 fclose(f);
4186 free(line);
4187 free(io_serviced_str);
4188 free(io_merged_str);
4189 free(io_service_bytes_str);
4190 free(io_wait_time_str);
4191 free(io_service_time_str);
4192 return rv;
4193}
4194
70dcc12e
SH
4195static int proc_swaps_read(char *buf, size_t size, off_t offset,
4196 struct fuse_file_info *fi)
4197{
4198 struct fuse_context *fc = fuse_get_context();
4199 struct file_info *d = (struct file_info *)fi->fh;
4200 char *cg = NULL;
018246ff 4201 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 4202 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
4203 ssize_t total_len = 0, rv = 0;
4204 ssize_t l = 0;
70dcc12e
SH
4205 char *cache = d->buf;
4206
4207 if (offset) {
4208 if (offset > d->size)
4209 return -EINVAL;
4210 if (!d->cached)
4211 return 0;
4212 int left = d->size - offset;
4213 total_len = left > size ? size: left;
4214 memcpy(buf, cache + offset, total_len);
4215 return total_len;
4216 }
4217
4218 pid_t initpid = lookup_initpid_in_store(fc->pid);
4219 if (initpid <= 0)
4220 initpid = fc->pid;
4221 cg = get_pid_cgroup(initpid, "memory");
4222 if (!cg)
4223 return read_file("/proc/swaps", buf, size, d);
6d2f6996 4224 prune_init_slice(cg);
70dcc12e 4225
018246ff 4226 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
4227
4228 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4229 goto err;
4230
70dcc12e
SH
4231 memusage = strtoul(memusage_str, NULL, 10);
4232
4233 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4234 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4235
018246ff 4236 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
4237 memswusage = strtoul(memswusage_str, NULL, 10);
4238
70dcc12e
SH
4239 swap_total = (memswlimit - memlimit) / 1024;
4240 swap_free = (memswusage - memusage) / 1024;
4241 }
4242
4243 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4244
4245 /* When no mem + swap limit is specified or swapaccount=0*/
4246 if (!memswlimit) {
4247 char *line = NULL;
4248 size_t linelen = 0;
4249 FILE *f = fopen("/proc/meminfo", "r");
4250
4251 if (!f)
4252 goto err;
4253
4254 while (getline(&line, &linelen, f) != -1) {
4255 if (startswith(line, "SwapTotal:")) {
4256 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4257 } else if (startswith(line, "SwapFree:")) {
4258 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4259 }
4260 }
4261
4262 free(line);
4263 fclose(f);
4264 }
4265
4266 if (swap_total > 0) {
a262ddb7
CB
4267 l = snprintf(d->buf + total_len, d->size - total_len,
4268 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4269 swap_total, swap_free);
4270 total_len += l;
70dcc12e
SH
4271 }
4272
a262ddb7 4273 if (total_len < 0 || l < 0) {
70dcc12e
SH
4274 perror("Error writing to cache");
4275 rv = 0;
4276 goto err;
4277 }
4278
4279 d->cached = 1;
4280 d->size = (int)total_len;
4281
4282 if (total_len > size) total_len = size;
4283 memcpy(buf, d->buf, total_len);
4284 rv = total_len;
4285
4286err:
4287 free(cg);
4288 free(memswlimit_str);
4289 free(memlimit_str);
4290 free(memusage_str);
4291 free(memswusage_str);
70dcc12e
SH
4292 return rv;
4293}
6db4f7a3 4294/*
4295 * Find the process pid from cgroup path.
4296 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
4297 * @pid_buf : put pid to pid_buf.
4298 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
4299 * @depth : the depth of cgroup in container.
4300 * @sum : return the number of pid.
4301 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
4302 */
4303static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4304{
4305 DIR *dir;
4306 int fd;
4307 struct dirent *file;
4308 FILE *f = NULL;
4309 size_t linelen = 0;
4310 char *line = NULL;
4311 int pd;
4312 char *path_dir, *path;
4313 char **pid;
4314
4315 /* path = dpath + "/cgroup.procs" + /0 */
4316 do {
4317 path = malloc(strlen(dpath) + 20);
4318 } while (!path);
4319
4320 strcpy(path, dpath);
4321 fd = openat(cfd, path, O_RDONLY);
4322 if (fd < 0)
4323 goto out;
4324
4325 dir = fdopendir(fd);
4326 if (dir == NULL) {
4327 close(fd);
4328 goto out;
4329 }
4330
4331 while (((file = readdir(dir)) != NULL) && depth > 0) {
4332 if (strncmp(file->d_name, ".", 1) == 0)
4333 continue;
4334 if (strncmp(file->d_name, "..", 1) == 0)
4335 continue;
4336 if (file->d_type == DT_DIR) {
4337 /* path + '/' + d_name +/0 */
4338 do {
4339 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4340 } while (!path_dir);
4341 strcpy(path_dir, path);
4342 strcat(path_dir, "/");
4343 strcat(path_dir, file->d_name);
4344 pd = depth - 1;
4345 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4346 free(path_dir);
4347 }
4348 }
4349 closedir(dir);
4350
4351 strcat(path, "/cgroup.procs");
4352 fd = openat(cfd, path, O_RDONLY);
4353 if (fd < 0)
4354 goto out;
4355
4356 f = fdopen(fd, "r");
4357 if (!f) {
4358 close(fd);
4359 goto out;
4360 }
4361
4362 while (getline(&line, &linelen, f) != -1) {
4363 do {
4364 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4365 } while (!pid);
4366 *pid_buf = pid;
4367 do {
4368 *(*pid_buf + sum) = malloc(strlen(line) + 1);
4369 } while (*(*pid_buf + sum) == NULL);
4370 strcpy(*(*pid_buf + sum), line);
4371 sum++;
4372 }
4373 fclose(f);
4374out:
4375 free(path);
4376 return sum;
4377}
4378/*
4379 * calc_load calculates the load according to the following formula:
4380 * load1 = load0 * exp + active * (1 - exp)
4381 *
4382 * @load1: the new loadavg.
4383 * @load0: the former loadavg.
4384 * @active: the total number of running pid at this moment.
4385 * @exp: the fixed-point defined in the beginning.
4386 */
4387static unsigned long
4388calc_load(unsigned long load, unsigned long exp, unsigned long active)
4389{
4390 unsigned long newload;
4391
4392 active = active > 0 ? active * FIXED_1 : 0;
4393 newload = load * exp + active * (FIXED_1 - exp);
4394 if (active >= load)
4395 newload += FIXED_1 - 1;
4396
4397 return newload / FIXED_1;
4398}
4399
4400/*
4401 * Return 0 means that container p->cg is closed.
4402 * Return -1 means that error occurred in refresh.
4403 * Positive num equals the total number of pid.
4404 */
4405static int refresh_load(struct load_node *p, char *path)
4406{
4407 FILE *f = NULL;
4408 char **idbuf;
4409 char proc_path[256];
4410 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4411 char *line = NULL;
4412 size_t linelen = 0;
4413 int sum, length;
4414 DIR *dp;
4415 struct dirent *file;
4416
4417 do {
4418 idbuf = malloc(sizeof(char *));
4419 } while (!idbuf);
4420 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4421 /* normal exit */
4422 if (sum == 0)
4423 goto out;
4424
4425 for (i = 0; i < sum; i++) {
4426 /*clean up '\n' */
4427 length = strlen(idbuf[i])-1;
4428 idbuf[i][length] = '\0';
4429 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4430 if (ret < 0 || ret > 255) {
4431 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4432 i = sum;
4433 sum = -1;
4434 goto err_out;
4435 }
4436
4437 dp = opendir(proc_path);
4438 if (!dp) {
4439 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4440 continue;
4441 }
4442 while ((file = readdir(dp)) != NULL) {
4443 if (strncmp(file->d_name, ".", 1) == 0)
4444 continue;
4445 if (strncmp(file->d_name, "..", 1) == 0)
4446 continue;
4447 total_pid++;
4448 /* We make the biggest pid become last_pid.*/
4449 ret = atof(file->d_name);
4450 last_pid = (ret > last_pid) ? ret : last_pid;
4451
4452 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4453 if (ret < 0 || ret > 255) {
4454 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4455 i = sum;
4456 sum = -1;
4457 closedir(dp);
4458 goto err_out;
4459 }
4460 f = fopen(proc_path, "r");
4461 if (f != NULL) {
4462 while (getline(&line, &linelen, f) != -1) {
4463 /* Find State */
4464 if ((line[0] == 'S') && (line[1] == 't'))
4465 break;
4466 }
4467 if ((line[7] == 'R') || (line[7] == 'D'))
4468 run_pid++;
4469 fclose(f);
4470 }
4471 }
4472 closedir(dp);
4473 }
4474 /*Calculate the loadavg.*/
4475 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4476 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4477 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4478 p->run_pid = run_pid;
4479 p->total_pid = total_pid;
4480 p->last_pid = last_pid;
4481
4482 free(line);
4483err_out:
4484 for (; i > 0; i--)
4485 free(idbuf[i-1]);
4486out:
4487 free(idbuf);
4488 return sum;
4489}
4490/*
4491 * Traverse the hash table and update it.
4492 */
4493void *load_begin(void *arg)
4494{
4495
4496 char *path = NULL;
4497 int i, sum, length, ret;
4498 struct load_node *f;
4499 int first_node;
4500 clock_t time1, time2;
4501
4502 while (1) {
4503 time1 = clock();
4504 for (i = 0; i < LOAD_SIZE; i++) {
4505 pthread_mutex_lock(&load_hash[i].lock);
4506 if (load_hash[i].next == NULL) {
4507 pthread_mutex_unlock(&load_hash[i].lock);
4508 continue;
4509 }
4510 f = load_hash[i].next;
4511 first_node = 1;
4512 while (f) {
4513 length = strlen(f->cg) + 2;
4514 do {
4515 /* strlen(f->cg) + '.' or '' + \0 */
4516 path = malloc(length);
4517 } while (!path);
4518
4519 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4520 if (ret < 0 || ret > length - 1) {
4521 /* snprintf failed, ignore the node.*/
4522 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4523 goto out;
4524 }
4525 sum = refresh_load(f, path);
4526 if (sum == 0) {
4527 f = del_node(f, i);
4528 } else {
4529out: f = f->next;
4530 }
4531 free(path);
4532 /* load_hash[i].lock locks only on the first node.*/
4533 if (first_node == 1) {
4534 first_node = 0;
4535 pthread_mutex_unlock(&load_hash[i].lock);
4536 }
4537 }
4538 }
4539 time2 = clock();
4540 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4541 }
4542}
4543
4544static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4545 struct fuse_file_info *fi)
4546{
4547 struct fuse_context *fc = fuse_get_context();
4548 struct file_info *d = (struct file_info *)fi->fh;
4549 pid_t initpid;
4550 char *cg;
4551 size_t total_len = 0;
4552 char *cache = d->buf;
4553 struct load_node *n;
4554 int hash;
01d88ede 4555 int cfd, rv = 0;
6db4f7a3 4556 unsigned long a, b, c;
4557
4558 if (offset) {
4559 if (offset > d->size)
4560 return -EINVAL;
4561 if (!d->cached)
4562 return 0;
4563 int left = d->size - offset;
4564 total_len = left > size ? size : left;
4565 memcpy(buf, cache + offset, total_len);
4566 return total_len;
4567 }
4568 if (!loadavg)
4569 return read_file("/proc/loadavg", buf, size, d);
4570
4571 initpid = lookup_initpid_in_store(fc->pid);
4572 if (initpid <= 0)
4573 initpid = fc->pid;
4574 cg = get_pid_cgroup(initpid, "cpu");
4575 if (!cg)
4576 return read_file("/proc/loadavg", buf, size, d);
4577
4578 prune_init_slice(cg);
4579 hash = calc_hash(cg);
4580 n = locate_node(cg, hash);
4581
4582 /* First time */
4583 if (n == NULL) {
4584 if (!find_mounted_controller("cpu", &cfd)) {
4585 /*
4586 * In locate_node() above, pthread_rwlock_unlock() isn't used
4587 * because delete is not allowed before read has ended.
4588 */
4589 pthread_rwlock_unlock(&load_hash[hash].rdlock);
01d88ede
JS
4590 rv = 0;
4591 goto err;
6db4f7a3 4592 }
4593 do {
4594 n = malloc(sizeof(struct load_node));
4595 } while (!n);
4596
4597 do {
4598 n->cg = malloc(strlen(cg)+1);
4599 } while (!n->cg);
4600 strcpy(n->cg, cg);
4601 n->avenrun[0] = 0;
4602 n->avenrun[1] = 0;
4603 n->avenrun[2] = 0;
4604 n->run_pid = 0;
4605 n->total_pid = 1;
4606 n->last_pid = initpid;
4607 n->cfd = cfd;
4608 insert_node(&n, hash);
4609 }
4610 a = n->avenrun[0] + (FIXED_1/200);
4611 b = n->avenrun[1] + (FIXED_1/200);
4612 c = n->avenrun[2] + (FIXED_1/200);
4613 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4614 LOAD_INT(a), LOAD_FRAC(a),
4615 LOAD_INT(b), LOAD_FRAC(b),
4616 LOAD_INT(c), LOAD_FRAC(c),
4617 n->run_pid, n->total_pid, n->last_pid);
4618 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4619 if (total_len < 0 || total_len >= d->buflen) {
4620 lxcfs_error("%s\n", "Failed to write to cache");
01d88ede
JS
4621 rv = 0;
4622 goto err;
6db4f7a3 4623 }
4624 d->size = (int)total_len;
4625 d->cached = 1;
4626
4627 if (total_len > size)
4628 total_len = size;
4629 memcpy(buf, d->buf, total_len);
01d88ede
JS
4630 rv = total_len;
4631
4632err:
4633 free(cg);
4634 return rv;
6db4f7a3 4635}
4636/* Return a positive number on success, return 0 on failure.*/
4637pthread_t load_daemon(int load_use)
4638{
4639 int ret;
4640 pthread_t pid;
4641
4642 ret = init_load();
4643 if (ret == -1) {
4644 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4645 return 0;
4646 }
4647 ret = pthread_create(&pid, NULL, load_begin, NULL);
4648 if (ret != 0) {
4649 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4650 load_free();
4651 return 0;
4652 }
4653 /* use loadavg, here loadavg = 1*/
4654 loadavg = load_use;
4655 return pid;
4656}
70dcc12e 4657
237e200e
SH
/*
 * Return the number of bytes in the file @which, obtained by reading it
 * line by line and summing the line lengths. Returns 0 if the file cannot
 * be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	for (;;) {
		nread = getline(&linebuf, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	fclose (fp);
	free(linebuf);

	return total;
}
4674
/*
 * Fill @sb for an entry of the virtual proc directory.
 *
 * "/proc" is reported as a read/execute-only directory, each supported
 * file as a read-only regular file of size 0. All timestamps are set to
 * the current time and owner/group to 0.
 *
 * Returns 0 on success, -EINVAL if the current time cannot be obtained
 * and -ENOENT for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t idx;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(regular_files) / sizeof(regular_files[0]); idx++) {
		if (strcmp(path, regular_files[idx]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4704
4705int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4706 struct fuse_file_info *fi)
4707{
d639f863
CB
4708 if (filler(buf, ".", NULL, 0) != 0 ||
4709 filler(buf, "..", NULL, 0) != 0 ||
4710 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4711 filler(buf, "meminfo", NULL, 0) != 0 ||
4712 filler(buf, "stat", NULL, 0) != 0 ||
4713 filler(buf, "uptime", NULL, 0) != 0 ||
4714 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 4715 filler(buf, "swaps", NULL, 0) != 0 ||
4716 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
4717 return -EINVAL;
4718 return 0;
4719}
4720
4721int proc_open(const char *path, struct fuse_file_info *fi)
4722{
4723 int type = -1;
4724 struct file_info *info;
4725
4726 if (strcmp(path, "/proc/meminfo") == 0)
4727 type = LXC_TYPE_PROC_MEMINFO;
4728 else if (strcmp(path, "/proc/cpuinfo") == 0)
4729 type = LXC_TYPE_PROC_CPUINFO;
4730 else if (strcmp(path, "/proc/uptime") == 0)
4731 type = LXC_TYPE_PROC_UPTIME;
4732 else if (strcmp(path, "/proc/stat") == 0)
4733 type = LXC_TYPE_PROC_STAT;
4734 else if (strcmp(path, "/proc/diskstats") == 0)
4735 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
4736 else if (strcmp(path, "/proc/swaps") == 0)
4737 type = LXC_TYPE_PROC_SWAPS;
46be8eed 4738 else if (strcmp(path, "/proc/loadavg") == 0)
4739 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
4740 if (type == -1)
4741 return -ENOENT;
4742
4743 info = malloc(sizeof(*info));
4744 if (!info)
4745 return -ENOMEM;
4746
4747 memset(info, 0, sizeof(*info));
4748 info->type = type;
4749
4750 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4751 do {
4752 info->buf = malloc(info->buflen);
4753 } while (!info->buf);
4754 memset(info->buf, 0, info->buflen);
4755 /* set actual size to buffer size */
4756 info->size = info->buflen;
4757
4758 fi->fh = (unsigned long)info;
4759 return 0;
4760}
4761
bddbb106
SH
/*
 * Access check for the virtual proc tree.
 *
 * "/proc" itself is granted whenever access() reports that path as
 * readable. Everything else is read-only: any bit in @mask other than R_OK
 * yields -EACCES.
 *
 * Returns 0 when access is allowed, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	int is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) != 0 ? -EACCES : 0;
}
4772
237e200e
SH
/*
 * Release handler for the virtual proc files: hands @fi to
 * do_release_file_info(). @path is not used. Always returns 0.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4778
4779int proc_read(const char *path, char *buf, size_t size, off_t offset,
4780 struct fuse_file_info *fi)
4781{
4782 struct file_info *f = (struct file_info *) fi->fh;
4783
4784 switch (f->type) {
4785 case LXC_TYPE_PROC_MEMINFO:
4786 return proc_meminfo_read(buf, size, offset, fi);
4787 case LXC_TYPE_PROC_CPUINFO:
4788 return proc_cpuinfo_read(buf, size, offset, fi);
4789 case LXC_TYPE_PROC_UPTIME:
4790 return proc_uptime_read(buf, size, offset, fi);
4791 case LXC_TYPE_PROC_STAT:
4792 return proc_stat_read(buf, size, offset, fi);
4793 case LXC_TYPE_PROC_DISKSTATS:
4794 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
4795 case LXC_TYPE_PROC_SWAPS:
4796 return proc_swaps_read(buf, size, offset, fi);
46be8eed 4797 case LXC_TYPE_PROC_LOADAVG:
4798 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
4799 default:
4800 return -EINVAL;
4801 }
4802}
4803
29a73c2f
CB
4804/*
4805 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
4806 */
4807
/*
 * Create directory @dir including all missing parents, each with @mode.
 *
 * The path is walked component by component; on every step the prefix of
 * the path that precedes the current component is created. Directories
 * that already exist are not an error.
 *
 * Returns true on success, false if a prefix cannot be duplicated or
 * created.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *path = dir;
	const char *component = dir;
	const char *rest = dir;

	for (;;) {
		char *prefix;

		/* Skip separators, then find the end of the next component. */
		component = rest + strspn(rest, "/");
		rest = component + strcspn(component, "/");

		prefix = strndup(path, component - path);
		if (!prefix)
			return false;

		if (mkdir(prefix, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				prefix, strerror(errno));
			free(prefix);
			return false;
		}
		free(prefix);

		/* An empty component means the end of the path was reached. */
		if (rest == component)
			break;
	}

	return true;
}
4831
4832static bool umount_if_mounted(void)
4833{
4834 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 4835 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
4836 return false;
4837 }
4838 return true;
4839}
4840
2283e240
CB
/*
 * fs_type_magic is whatever type struct statfs uses for f_type.
 * __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	if (fs->f_type == magic_val)
		return true;

	return false;
}
4847
0a4dea41
CB
/*
 * Check /proc/self/mountinfo for a "/" mount point backed by rootfs.
 *
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Returns true if such an entry is found, false otherwise (including when
 * the file cannot be opened).
 */
static bool is_on_ramfs(void)
{
	bool found = false;
	char *line = NULL;
	size_t cap = 0;
	FILE *fp;

	fp = fopen("/proc/self/mountinfo", "r");
	if (!fp)
		return false;

	while (getline(&line, &cap, fp) != -1) {
		char *field = line;
		char *field_end;
		int skipped;

		/* Advance to the blank in front of the fifth field (the mount point). */
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		field_end = strchr(field + 1, ' ');
		if (!field_end)
			continue;
		*field_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* this is '/'. is it the ramfs? */
		field = strchr(field_end + 1, '-');
		if (field && strncmp(field, "- rootfs rootfs ", 16) == 0) {
			found = true;
			break;
		}
	}

	free(line);
	fclose(fp);
	return found;
}
4890
cc309f33 4891static int pivot_enter()
0a4dea41 4892{
cc309f33
CB
4893 int ret = -1, oldroot = -1, newroot = -1;
4894
4895 oldroot = open("/", O_DIRECTORY | O_RDONLY);
4896 if (oldroot < 0) {
4897 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4898 return ret;
4899 }
4900
4901 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4902 if (newroot < 0) {
4903 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4904 goto err;
4905 }
4906
4907 /* change into new root fs */
4908 if (fchdir(newroot) < 0) {
4909 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4910 goto err;
4911 }
4912
0a4dea41
CB
4913 /* pivot_root into our new root fs */
4914 if (pivot_root(".", ".") < 0) {
4915 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 4916 goto err;
0a4dea41
CB
4917 }
4918
4919 /*
4920 * At this point the old-root is mounted on top of our new-root.
4921 * To unmounted it we must not be chdir'd into it, so escape back
4922 * to the old-root.
4923 */
4924 if (fchdir(oldroot) < 0) {
4925 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 4926 goto err;
0a4dea41
CB
4927 }
4928
4929 if (umount2(".", MNT_DETACH) < 0) {
4930 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 4931 goto err;
0a4dea41
CB
4932 }
4933
4934 if (fchdir(newroot) < 0) {
4935 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 4936 goto err;
0a4dea41
CB
4937 }
4938
cc309f33
CB
4939 ret = 0;
4940
4941err:
4942 if (oldroot > 0)
4943 close(oldroot);
4944 if (newroot > 0)
4945 close(newroot);
4946
4947 return ret;
0a4dea41
CB
4948}
4949
4950static int chroot_enter()
4951{
4952 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4953 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4954 return -1;
4955 }
4956
4957 if (chroot(".") < 0) {
4958 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4959 return -1;
4960 }
4961
4962 if (chdir("/") < 0) {
4963 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4964 return -1;
4965 }
4966
4967 return 0;
4968}
4969
/*
 * Enter the prepared new root: with chroot_enter() when "/" is a ramfs,
 * with pivot_enter() in all other cases.
 *
 * Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;
	bool on_ramfs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still let
	 * is_on_ramfs() inspect /proc/self/mountinfo. */
	on_ramfs = has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
4992
4993/* Prepare our new clean root. */
0232cbac 4994static int permute_prepare(void)
29a73c2f
CB
4995{
4996 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 4997 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
4998 return -1;
4999 }
5000
5001 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 5002 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
5003 return -1;
5004 }
5005
5006 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 5007 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
5008 return -1;
5009 }
5010
5011 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 5012 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
5013 return -1;
5014 }
5015
5016 return 0;
5017}
5018
0232cbac
CB
/*
 * Prepare the new root and enter it.
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 *
 * Returns true when both steps succeed; the second step is skipped when
 * the first one fails.
 */
static bool permute_root(void)
{
	/* Prepare new root, then pivot into it. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
5032
a257a8ee
CB
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC.
 *
 * Returns the open file descriptor, or -1 if the path cannot be formatted
 * or opened.
 */
static int preserve_mnt_ns(int pid)
{
	/* Room for both literals plus 21 characters for the pid. */
	char ns_path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/mnt", pid);
	if (written < 0 || (size_t)written >= sizeof(ns_path))
		return -1;

	return open(ns_path, O_RDONLY | O_CLOEXEC);
}
5045
0a4dea41 5046static bool cgfs_prepare_mounts(void)
29a73c2f
CB
5047{
5048 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 5049 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
5050 return false;
5051 }
480262c9 5052
29a73c2f 5053 if (!umount_if_mounted()) {
b8defc3d 5054 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
5055 return false;
5056 }
5057
5058 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 5059 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
5060 return false;
5061 }
5062
a257a8ee
CB
5063 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5064 if (cgroup_mount_ns_fd < 0) {
5065 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5066 return false;
5067 }
5068
480262c9 5069 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 5070 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
5071 return false;
5072 }
480262c9 5073
29a73c2f 5074 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 5075 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
5076 return false;
5077 }
480262c9 5078
29a73c2f
CB
5079 return true;
5080}
5081
0a4dea41 5082static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
5083{
5084 char *target;
5085 size_t clen, len;
5086 int i, ret;
5087
5088 for (i = 0; i < num_hierarchies; i++) {
5089 char *controller = hierarchies[i];
51c7ca35 5090
29a73c2f
CB
5091 clen = strlen(controller);
5092 len = strlen(BASEDIR) + clen + 2;
5093 target = malloc(len);
5094 if (!target)
5095 return false;
51c7ca35 5096
29a73c2f
CB
5097 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5098 if (ret < 0 || ret >= len) {
5099 free(target);
5100 return false;
5101 }
5102 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5103 free(target);
5104 return false;
5105 }
51c7ca35
CB
5106 if (!strcmp(controller, "unified"))
5107 ret = mount("none", target, "cgroup2", 0, NULL);
5108 else
5109 ret = mount(controller, target, "cgroup", 0, controller);
5110 if (ret < 0) {
5111 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
5112 free(target);
5113 return false;
5114 }
5115
5116 fd_hierarchies[i] = open(target, O_DIRECTORY);
5117 if (fd_hierarchies[i] < 0) {
5118 free(target);
5119 return false;
5120 }
5121 free(target);
5122 }
5123 return true;
5124}
5125
/*
 * Build lxcfs' private cgroup view: prepare the mount namespace, mount all
 * hierarchies into it and finally switch the root.
 *
 * Returns true only if all three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
5141
5142static void __attribute__((constructor)) collect_and_mount_subsystems(void)
237e200e
SH
5143{
5144 FILE *f;
e58dab00
CB
5145 char *cret, *line = NULL;
5146 char cwd[MAXPATHLEN];
237e200e 5147 size_t len = 0;
480262c9 5148 int i, init_ns = -1;
51c7ca35 5149 bool found_unified = false;
237e200e
SH
5150
5151 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
b8defc3d 5152 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
237e200e
SH
5153 return;
5154 }
e58dab00 5155
237e200e 5156 while (getline(&line, &len, f) != -1) {
51c7ca35 5157 char *idx, *p, *p2;
237e200e
SH
5158
5159 p = strchr(line, ':');
5160 if (!p)
5161 goto out;
51c7ca35 5162 idx = line;
237e200e
SH
5163 *(p++) = '\0';
5164
5165 p2 = strrchr(p, ':');
5166 if (!p2)
5167 goto out;
5168 *p2 = '\0';
5169
a67719f6
CB
5170 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5171 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5172 * because it parses out the empty string "" and later on passes
5173 * it to mount(). Let's skip such entries.
5174 */
51c7ca35
CB
5175 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5176 found_unified = true;
5177 p = "unified";
5178 }
a67719f6 5179
237e200e
SH
5180 if (!store_hierarchy(line, p))
5181 goto out;
5182 }
5183
480262c9 5184 /* Preserve initial namespace. */
a257a8ee 5185 init_ns = preserve_mnt_ns(getpid());
b8defc3d
CB
5186 if (init_ns < 0) {
5187 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
480262c9 5188 goto out;
b8defc3d 5189 }
480262c9 5190
92c3ee11 5191 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
b8defc3d
CB
5192 if (!fd_hierarchies) {
5193 lxcfs_error("%s\n", strerror(errno));
29a73c2f 5194 goto out;
b8defc3d 5195 }
29a73c2f 5196
480262c9
CB
5197 for (i = 0; i < num_hierarchies; i++)
5198 fd_hierarchies[i] = -1;
5199
e58dab00
CB
5200 cret = getcwd(cwd, MAXPATHLEN);
5201 if (!cret)
5202 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5203
480262c9
CB
5204 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5205 * to privately mount lxcfs cgroups. */
b8defc3d
CB
5206 if (!cgfs_setup_controllers()) {
5207 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
29a73c2f 5208 goto out;
b8defc3d 5209 }
480262c9 5210
b8defc3d
CB
5211 if (setns(init_ns, 0) < 0) {
5212 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
29a73c2f 5213 goto out;
b8defc3d 5214 }
29a73c2f 5215
e58dab00
CB
5216 if (!cret || chdir(cwd) < 0)
5217 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5218
237e200e
SH
5219 print_subsystems();
5220
5221out:
5222 free(line);
5223 fclose(f);
480262c9
CB
5224 if (init_ns >= 0)
5225 close(init_ns);
237e200e
SH
5226}
5227
5228static void __attribute__((destructor)) free_subsystems(void)
5229{
5230 int i;
5231
b8defc3d
CB
5232 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5233
29a73c2f 5234 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
5235 if (hierarchies[i])
5236 free(hierarchies[i]);
480262c9 5237 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
5238 close(fd_hierarchies[i]);
5239 }
237e200e 5240 free(hierarchies);
480262c9 5241 free(fd_hierarchies);
a257a8ee
CB
5242
5243 if (cgroup_mount_ns_fd >= 0)
5244 close(cgroup_mount_ns_fd);
237e200e 5245}