]> git.proxmox.com Git - mirror_lxcfs.git/blame - bindings.c
meminfo: read shmem from cgroup parameter memory.stat
[mirror_lxcfs.git] / bindings.c
CommitLineData
237e200e
SH
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
0ecddf02 11#define __STDC_FORMAT_MACROS
237e200e 12#include <dirent.h>
29a73c2f 13#include <errno.h>
237e200e
SH
14#include <fcntl.h>
15#include <fuse.h>
0ecddf02 16#include <inttypes.h>
237e200e 17#include <libgen.h>
237e200e 18#include <pthread.h>
29a73c2f
CB
19#include <sched.h>
20#include <stdbool.h>
0ecddf02 21#include <stdint.h>
29a73c2f
CB
22#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <time.h>
26#include <unistd.h>
27#include <wait.h>
d89504c4 28#include <linux/magic.h>
237e200e 29#include <linux/sched.h>
29a73c2f
CB
30#include <sys/epoll.h>
31#include <sys/mman.h>
32#include <sys/mount.h>
237e200e
SH
33#include <sys/param.h>
34#include <sys/socket.h>
29a73c2f 35#include <sys/syscall.h>
0ecddf02 36#include <sys/sysinfo.h>
d89504c4 37#include <sys/vfs.h>
237e200e 38
237e200e 39#include "bindings.h"
237e200e
SH
40#include "config.h" // for VERSION
41
0ecddf02
CB
/*
 * Buffer length for a 64-bit integer rendered in decimal: 2^64 - 1 has 20
 * digits, so 21 bytes presumably leaves room for the terminating NUL.
 */
#define LXCFS_NUMSTRLEN64 21
44
29a73c2f
CB
/*
 * pivot_root(): when the C library provides it (HAVE_PIVOT_ROOT) we only
 * declare it. Otherwise fall back to the raw system call, or fail with
 * ENOSYS when the syscall number is not known at build time.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
59
237e200e
SH
/* Kinds of virtual files; stored in file_info.type (presumably - confirm). */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};
71
/* Per-open-file bookkeeping. */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* presumably one of the LXC_TYPE_* values - confirm */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
82
8be92dd1
JS
/* User and system CPU usage counters (units not visible here). */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
};
87
/* Parameters of the load average hash table. */
#define LOAD_SIZE 100 /* the size of hash_table */
#define FLUSH_TIME 5 /* the flush rate */
#define DEPTH_DIR 3 /* the depth of per cgroup */

/* Fixed-point helpers used to calculate the load average. */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
static volatile sig_atomic_t loadavg_stop = 0;

/*
 * calc_hash - map a NUL-terminated name to a bucket index.
 *
 * Uses the ELF hash: each byte is folded into the accumulator and the top
 * nibble is mixed back into the lower bits before being cleared.
 *
 * Returns a value in [0, LOAD_SIZE).
 */
static int calc_hash(char *name)
{
	unsigned int h = 0;

	for (; *name; name++) {
		unsigned int top;

		h = (h << 4) + *name;
		top = h & 0xf0000000;
		if (top)
			h ^= top >> 24;
		h &= ~top;
	}

	return (h & 0x7fffffff) % LOAD_SIZE;
}
120
/* One entry of a load average hash bucket. */
struct load_node {
	char *cg;			/* cg */
	unsigned long avenrun[3];	/* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;		/* following node in the bucket */
	struct load_node **pre;		/* address of the pointer that points at this node */
};
131
132struct load_head {
133 /*
134 * The lock is about insert load_node and refresh load_node.To the first
135 * load_node of each hash bucket, insert and refresh in this hash bucket is
136 * mutually exclusive.
137 */
138 pthread_mutex_t lock;
139 /*
140 * The rdlock is about read loadavg and delete load_node.To each hash
141 * bucket, read and delete is mutually exclusive. But at the same time, we
142 * allow paratactic read operation. This rdlock is at list level.
143 */
144 pthread_rwlock_t rdlock;
145 /*
146 * The rilock is about read loadavg and insert load_node.To the first
147 * load_node of each hash bucket, read and insert is mutually exclusive.
148 * But at the same time, we allow paratactic read operation.
149 */
150 pthread_rwlock_t rilock;
151 struct load_node *next;
152};
153
154static struct load_head load_hash[LOAD_SIZE]; /* hash table */
155/*
156 * init_load initialize the hash table.
157 * Return 0 on success, return -1 on failure.
158 */
159static int init_load(void)
160{
161 int i;
162 int ret;
163
164 for (i = 0; i < LOAD_SIZE; i++) {
165 load_hash[i].next = NULL;
166 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize lock");
169 goto out3;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rdlock");
174 goto out2;
175 }
176 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
177 if (ret != 0) {
178 lxcfs_error("%s\n", "Failed to initialize rilock");
179 goto out1;
180 }
181 }
182 return 0;
183out1:
184 pthread_rwlock_destroy(&load_hash[i].rdlock);
185out2:
186 pthread_mutex_destroy(&load_hash[i].lock);
187out3:
188 while (i > 0) {
189 i--;
190 pthread_mutex_destroy(&load_hash[i].lock);
191 pthread_rwlock_destroy(&load_hash[i].rdlock);
192 pthread_rwlock_destroy(&load_hash[i].rilock);
193 }
194 return -1;
195}
196
197static void insert_node(struct load_node **n, int locate)
198{
199 struct load_node *f;
200
201 pthread_mutex_lock(&load_hash[locate].lock);
202 pthread_rwlock_wrlock(&load_hash[locate].rilock);
203 f = load_hash[locate].next;
204 load_hash[locate].next = *n;
205
206 (*n)->pre = &(load_hash[locate].next);
207 if (f)
208 f->pre = &((*n)->next);
209 (*n)->next = f;
210 pthread_mutex_unlock(&load_hash[locate].lock);
211 pthread_rwlock_unlock(&load_hash[locate].rilock);
212}
213/*
214 * locate_node() finds special node. Not return NULL means success.
215 * It should be noted that rdlock isn't unlocked at the end of code
216 * because this function is used to read special node. Delete is not
217 * allowed before read has ended.
218 * unlock rdlock only in proc_loadavg_read().
219 */
220static struct load_node *locate_node(char *cg, int locate)
221{
222 struct load_node *f = NULL;
223 int i = 0;
224
225 pthread_rwlock_rdlock(&load_hash[locate].rilock);
226 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
227 if (load_hash[locate].next == NULL) {
228 pthread_rwlock_unlock(&load_hash[locate].rilock);
229 return f;
230 }
231 f = load_hash[locate].next;
232 pthread_rwlock_unlock(&load_hash[locate].rilock);
233 while (f && ((i = strcmp(f->cg, cg)) != 0))
234 f = f->next;
235 return f;
236}
237/* Delete the load_node n and return the next node of it. */
238static struct load_node *del_node(struct load_node *n, int locate)
239{
240 struct load_node *g;
241
242 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
243 if (n->next == NULL) {
244 *(n->pre) = NULL;
245 } else {
246 *(n->pre) = n->next;
247 n->next->pre = n->pre;
248 }
249 g = n->next;
250 free(n->cg);
251 free(n);
252 pthread_rwlock_unlock(&load_hash[locate].rdlock);
253 return g;
254}
255
a83618e2 256static void load_free(void)
9c480eb7 257{
258 int i;
259 struct load_node *f, *p;
260
261 for (i = 0; i < LOAD_SIZE; i++) {
262 pthread_mutex_lock(&load_hash[i].lock);
263 pthread_rwlock_wrlock(&load_hash[i].rilock);
264 pthread_rwlock_wrlock(&load_hash[i].rdlock);
265 if (load_hash[i].next == NULL) {
266 pthread_mutex_unlock(&load_hash[i].lock);
267 pthread_mutex_destroy(&load_hash[i].lock);
268 pthread_rwlock_unlock(&load_hash[i].rilock);
269 pthread_rwlock_destroy(&load_hash[i].rilock);
270 pthread_rwlock_unlock(&load_hash[i].rdlock);
271 pthread_rwlock_destroy(&load_hash[i].rdlock);
272 continue;
273 }
274 for (f = load_hash[i].next; f; ) {
275 free(f->cg);
276 p = f->next;
277 free(f);
278 f = p;
279 }
280 pthread_mutex_unlock(&load_hash[i].lock);
281 pthread_mutex_destroy(&load_hash[i].lock);
282 pthread_rwlock_unlock(&load_hash[i].rilock);
283 pthread_rwlock_destroy(&load_hash[i].rilock);
284 pthread_rwlock_unlock(&load_hash[i].rdlock);
285 pthread_rwlock_destroy(&load_hash[i].rdlock);
286 }
287}
f34de69a
CB
/* Extra buffer space reserved so that file size changes can be absorbed. */
#define BUF_RESERVE_SIZE 512
237e200e
SH
290
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino;	/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;	/* the pid of init in that ns */
	long int ctime;	/* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;
	long int lastcheck;	/* time of the last successful lookup */
};
312
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of cached init pids, guarded by pidns_store_mutex. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * lock_mutex - lock @l; on failure log the error and terminate the process
 * with exit status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
328
29a73c2f
CB
/*
 * READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted.
 */
static int num_hierarchies;

/*
 * READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems().
 */
static char **hierarchies;

/*
 * READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}.
 */
static int *fd_hierarchies;
static int cgroup_mount_ns_fd = -1;
29a73c2f 349
237e200e
SH
/*
 * unlock_mutex - unlock @l; on failure log the error and terminate the
 * process with exit status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
359
360static void store_lock(void)
361{
362 lock_mutex(&pidns_store_mutex);
363}
364
365static void store_unlock(void)
366{
367 unlock_mutex(&pidns_store_mutex);
368}
369
370/* Must be called under store_lock */
371static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
372{
373 struct stat initsb;
374 char fnam[100];
375
376 snprintf(fnam, 100, "/proc/%d", e->initpid);
377 if (stat(fnam, &initsb) < 0)
378 return false;
7dd6560a
CB
379
380 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
381 initsb.st_ctime, e->initpid);
382
237e200e
SH
383 if (e->ctime != initsb.st_ctime)
384 return false;
385 return true;
386}
387
388/* Must be called under store_lock */
389static void remove_initpid(struct pidns_init_store *e)
390{
391 struct pidns_init_store *tmp;
392 int h;
393
7dd6560a
CB
394 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
395
237e200e
SH
396 h = HASH(e->ino);
397 if (pidns_hash_table[h] == e) {
398 pidns_hash_table[h] = e->next;
399 free(e);
400 return;
401 }
402
403 tmp = pidns_hash_table[h];
404 while (tmp) {
405 if (tmp->next == e) {
406 tmp->next = e->next;
407 free(e);
408 return;
409 }
410 tmp = tmp->next;
411 }
412}
413
414#define PURGE_SECS 5
415/* Must be called under store_lock */
416static void prune_initpid_store(void)
417{
418 static long int last_prune = 0;
419 struct pidns_init_store *e, *prev, *delme;
420 long int now, threshold;
421 int i;
422
423 if (!last_prune) {
424 last_prune = time(NULL);
425 return;
426 }
427 now = time(NULL);
428 if (now < last_prune + PURGE_SECS)
429 return;
7dd6560a
CB
430
431 lxcfs_debug("%s\n", "Pruning.");
432
237e200e
SH
433 last_prune = now;
434 threshold = now - 2 * PURGE_SECS;
435
436 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
437 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
438 if (e->lastcheck < threshold) {
7dd6560a
CB
439
440 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
441
237e200e
SH
442 delme = e;
443 if (prev)
444 prev->next = e->next;
445 else
446 pidns_hash_table[i] = e->next;
447 e = e->next;
448 free(delme);
449 } else {
450 prev = e;
451 e = e->next;
452 }
453 }
454 }
455}
456
457/* Must be called under store_lock */
458static void save_initpid(struct stat *sb, pid_t pid)
459{
460 struct pidns_init_store *e;
461 char fpath[100];
462 struct stat procsb;
463 int h;
464
7dd6560a
CB
465 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
466
237e200e
SH
467 snprintf(fpath, 100, "/proc/%d", pid);
468 if (stat(fpath, &procsb) < 0)
469 return;
470 do {
471 e = malloc(sizeof(*e));
472 } while (!e);
473 e->ino = sb->st_ino;
474 e->initpid = pid;
475 e->ctime = procsb.st_ctime;
476 h = HASH(e->ino);
477 e->next = pidns_hash_table[h];
478 e->lastcheck = time(NULL);
479 pidns_hash_table[h] = e;
480}
481
482/*
483 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
484 * entry for the inode number and creation time. Verify that the init pid
485 * is still valid. If not, remove it. Return the entry if valid, NULL
486 * otherwise.
487 * Must be called under store_lock
488 */
489static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
490{
491 int h = HASH(sb->st_ino);
492 struct pidns_init_store *e = pidns_hash_table[h];
493
494 while (e) {
495 if (e->ino == sb->st_ino) {
496 if (initpid_still_valid(e, sb)) {
497 e->lastcheck = time(NULL);
498 return e;
499 }
500 remove_initpid(e);
501 return NULL;
502 }
503 e = e->next;
504 }
505
506 return NULL;
507}
508
/*
 * is_dir - tell whether @path, resolved relative to directory descriptor
 * @fd, is a directory.
 *
 * Returns 1 if it is a directory, 0 if it is not or if it cannot be stat'ed.
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;

	/*
	 * The last fstatat() argument is a flags mask. The descriptor must not
	 * be passed there: an arbitrary fd value is an invalid flag set and
	 * makes the call fail with EINVAL.
	 */
	if (fstatat(fd, path, &statbuf, 0) != 0)
		return 0;

	return S_ISDIR(statbuf.st_mode) ? 1 : 0;
}
517
/*
 * must_copy_string - duplicate @str, retrying until the allocation succeeds.
 *
 * Returns NULL only when @str is NULL. The caller owns the returned copy.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	while (!(copy = strdup(str)))
		;

	return copy;
}
529
/* Strip every '\n' at the end of @s, in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		s[end - 1] = '\0';
		end--;
	}
}
537
#define BATCH_SIZE 50
/*
 * dorealloc - make sure *mem can hold @newlen bytes.
 *
 * Memory is grown in units of BATCH_SIZE bytes. The buffer is (re)allocated
 * when it does not exist yet or when @newlen needs more batches than @oldlen
 * did; the allocation is retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);

	*mem = grown;
}
/*
 * append_line - append @line (@linelen bytes plus its terminating NUL) to
 * the buffer *contents, whose current string length is *len, growing the
 * buffer as needed. *len is updated to the new string length.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	/* linelen + 1 also copies the terminator. */
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
559
/*
 * slurp_file - read everything from descriptor @fd into a newly allocated
 * string, with trailing newlines removed.
 *
 * @from is not used. Ownership of @fd is taken: it is closed on every path,
 * including when the stream cannot be created.
 *
 * Returns the contents (to be freed by the caller), or NULL when the stream
 * cannot be opened or no line could be read.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		/* fdopen() failed, so nobody else will close the descriptor. */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1)
		append_line(&contents, &fulllen, line, linelen);
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
581
/*
 * write_string - write @string to descriptor @fd.
 *
 * @fnam is not used. Ownership of @fd is taken: it is closed on every path,
 * including when the stream cannot be created.
 *
 * Returns true when the whole string was written and the stream was closed
 * without error, false otherwise.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() failed, so nobody else will close the descriptor. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}

	/* fclose() flushes, so a failure here is still a write error. */
	if (fclose(f) < 0) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		return false;
	}

	return true;
}
602
237e200e
SH
/* Name, ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
608
0619767c 609#define ALLOC_NUM 20
237e200e
SH
610static bool store_hierarchy(char *stridx, char *h)
611{
0619767c
SH
612 if (num_hierarchies % ALLOC_NUM == 0) {
613 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
614 n *= ALLOC_NUM;
615 char **tmp = realloc(hierarchies, n * sizeof(char *));
0619767c 616 if (!tmp) {
b8defc3d 617 lxcfs_error("%s\n", strerror(errno));
0619767c
SH
618 exit(1);
619 }
237e200e 620 hierarchies = tmp;
237e200e 621 }
f676eb79 622
0619767c 623 hierarchies[num_hierarchies++] = must_copy_string(h);
237e200e
SH
624 return true;
625}
626
627static void print_subsystems(void)
628{
629 int i;
630
a257a8ee 631 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
cc97d34c 632 fprintf(stderr, "hierarchies:\n");
237e200e
SH
633 for (i = 0; i < num_hierarchies; i++) {
634 if (hierarchies[i])
b8defc3d
CB
635 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
636 fd_hierarchies[i], hierarchies[i]);
237e200e
SH
637 }
638}
639
/*
 * in_comma_list - tell whether @needle is one of the comma-separated
 * elements of @haystack. Elements must match exactly.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	size_t want = strlen(needle);
	const char *tok = haystack;
	const char *comma;

	for (;;) {
		if (*tok == '\0')
			break;

		comma = strchr(tok, ',');
		if (!comma)
			break;

		if ((size_t)(comma - tok) == want && strncmp(needle, tok, want) == 0)
			return true;

		tok = comma + 1;
	}

	/* What is left is the last (or only) element. */
	return strcmp(needle, tok) == 0;
}
658
659/* do we need to do any massaging here? I'm not sure... */
5dd3e6fd
CB
660/* Return the mounted controller and store the corresponding open file descriptor
661 * referring to the controller mountpoint in the private lxcfs namespace in
662 * @cfd.
663 */
664static char *find_mounted_controller(const char *controller, int *cfd)
237e200e
SH
665{
666 int i;
667
668 for (i = 0; i < num_hierarchies; i++) {
669 if (!hierarchies[i])
670 continue;
5dd3e6fd
CB
671 if (strcmp(hierarchies[i], controller) == 0) {
672 *cfd = fd_hierarchies[i];
237e200e 673 return hierarchies[i];
5dd3e6fd
CB
674 }
675 if (in_comma_list(controller, hierarchies[i])) {
676 *cfd = fd_hierarchies[i];
237e200e 677 return hierarchies[i];
5dd3e6fd 678 }
237e200e
SH
679 }
680
681 return NULL;
682}
683
/*
 * cgfs_set_value - write @value to @file inside @cgroup of @controller.
 *
 * Returns true on success; false when the controller is not mounted, the
 * path cannot be built, the file cannot be opened for writing, or the write
 * fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	int cfd, fd, written;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	written = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= pathlen)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
710
/*
 * chown_all_cgroup_files - chown every entry of the cgroup directory
 * @dirname (relative to @fd) to @uid:@gid. We do this when we create a
 * cgroup on behalf of a user.
 *
 * Best effort: entries that cannot be chowned, or whose path is too long,
 * are reported and skipped.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() did not take over fd1, so release it here. */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;

		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}

		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}

	/* closedir() also closes fd1. */
	closedir(d);
}
750
/*
 * cgfs_create - create cgroup @cg under @controller with mode 0755 and,
 * unless both @uid and @gid are 0, hand the directory and the files in it
 * over to @uid:@gid.
 *
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno from mkdirat()/fchownat().
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathlen;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	pathlen = strlen(cg) + 2;
	relpath = alloca(pathlen);
	snprintf(relpath, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, relpath, 0755) < 0)
		return -errno;

	if (uid != 0 || gid != 0) {
		if (fchownat(cfd, relpath, uid, gid, 0) < 0)
			return -errno;

		chown_all_cgroup_files(relpath, uid, gid, cfd);
	}

	return 0;
}
781
/*
 * recursive_rmdir - remove directory @dirname and the directories below it.
 *
 * @dirname is the path relative to @cfd, @fd is an open descriptor for that
 * same directory, @cfd is the descriptor of the hierarchy mountpoint.
 * @fd itself is left open for the caller.
 *
 * Subdirectories are handled on a best-effort basis; their failures are
 * only logged. Returns true when the directory stream could be closed and
 * @dirname itself could be removed.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = true;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, subfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}

		if (!S_ISDIR(mystat.st_mode))
			continue;

		/*
		 * Recurse on a descriptor for the subdirectory itself. Reusing
		 * @fd would make the nested call iterate over this directory
		 * again (a dup shares the file offset) instead of its child.
		 */
		subfd = openat(cfd, pathname, O_RDONLY | O_DIRECTORY);
		if (subfd < 0) {
			lxcfs_debug("Failed to open %s: %s.\n", pathname, strerror(errno));
			continue;
		}

		if (!recursive_rmdir(pathname, subfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
		close(subfd);
	}

	/*
	 * closedir() also closes dupfd. It must not be closed a second time:
	 * the number may already belong to a descriptor opened elsewhere.
	 */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
840
/*
 * cgfs_remove - remove cgroup @cg (and the directories below it) from
 * @controller.
 *
 * Returns false when the controller is not mounted or the directory cannot
 * be opened; otherwise the result of recursive_rmdir().
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int cfd, dirfd;
	size_t pathlen;
	char *relpath;
	bool removed;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cg + \0
	 */
	pathlen = strlen(cg) + 2;
	relpath = alloca(pathlen);
	snprintf(relpath, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd = openat(cfd, relpath, O_DIRECTORY);
	if (dirfd < 0)
		return false;

	removed = recursive_rmdir(relpath, dirfd, cfd);
	close(dirfd);

	return removed;
}
867
/*
 * cgfs_chmod_file - set the mode of @file under @controller to @mode.
 *
 * Returns true on success, false when the controller is not mounted or
 * fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t pathlen;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	pathlen = strlen(file) + 2;
	relpath = alloca(pathlen);
	snprintf(relpath, pathlen, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, relpath, mode, 0) >= 0;
}
888
/*
 * chown_tasks_files - chown the "tasks" and "cgroup.procs" files inside
 * @dirname (relative to @fd) to @uid:@gid.
 *
 * Returns 0 on success or -errno of the first fchownat() that fails.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* Sized for the longer of the two names. */
	size_t buflen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *target = alloca(buflen);

	snprintf(target, buflen, "%s/tasks", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	snprintf(target, buflen, "%s/cgroup.procs", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
904
/*
 * cgfs_chown_file - chown @file under @controller to @uid:@gid. When @file
 * is a directory its tasks files are chowned as well.
 *
 * Returns 0 on success, -EINVAL when the controller is not mounted, or
 * -errno from the failing chown.
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathlen;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /file + \0
	 */
	pathlen = strlen(file) + 2;
	relpath = alloca(pathlen);
	snprintf(relpath, pathlen, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, relpath, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(relpath, cfd))
		return 0;

	// like cgmanager did, we want to chown the tasks file as well
	return chown_tasks_files(relpath, uid, gid, cfd);
}
930
/*
 * open_pids_file - open the cgroup.procs file of @cgroup under @controller
 * for writing.
 *
 * Returns a stream the caller must fclose(), or NULL when the controller is
 * not mounted or the file cannot be opened.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f) {
		/* No stream owns the descriptor, so close it ourselves. */
		int saved_errno = errno;

		close(fd);
		errno = saved_errno;
	}

	return f;
}
954
f366da65
WB
955static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
956 void ***list, size_t typesize,
957 void* (*iterator)(const char*, const char*, const char*))
237e200e 958{
4ea38a4c 959 int cfd, fd, ret;
237e200e 960 size_t len;
4ea38a4c 961 char *cg, *tmpc;
237e200e 962 char pathname[MAXPATHLEN];
f366da65 963 size_t sz = 0, asz = 0;
4ea38a4c 964 struct dirent *dirent;
237e200e 965 DIR *dir;
237e200e 966
4ea38a4c 967 tmpc = find_mounted_controller(controller, &cfd);
f366da65 968 *list = NULL;
237e200e 969 if (!tmpc)
e97c834b 970 return false;
237e200e 971
f5a6d92e 972 /* Make sure we pass a relative path to *at() family of functions. */
4ea38a4c
CB
973 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
974 cg = alloca(len);
975 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
976 if (ret < 0 || (size_t)ret >= len) {
b8defc3d 977 lxcfs_error("Pathname too long under %s\n", cgroup);
4ea38a4c
CB
978 return false;
979 }
237e200e 980
4ea38a4c
CB
981 fd = openat(cfd, cg, O_DIRECTORY);
982 if (fd < 0)
983 return false;
984
985 dir = fdopendir(fd);
237e200e
SH
986 if (!dir)
987 return false;
988
4ea38a4c 989 while ((dirent = readdir(dir))) {
237e200e 990 struct stat mystat;
237e200e 991
4ea38a4c
CB
992 if (!strcmp(dirent->d_name, ".") ||
993 !strcmp(dirent->d_name, ".."))
237e200e
SH
994 continue;
995
4ea38a4c
CB
996 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
997 if (ret < 0 || ret >= MAXPATHLEN) {
b8defc3d 998 lxcfs_error("Pathname too long under %s\n", cg);
237e200e
SH
999 continue;
1000 }
1001
4ea38a4c 1002 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
237e200e 1003 if (ret) {
b8defc3d 1004 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
237e200e
SH
1005 continue;
1006 }
f366da65
WB
1007 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1008 (directories && !S_ISDIR(mystat.st_mode)))
237e200e
SH
1009 continue;
1010
1011 if (sz+2 >= asz) {
f366da65 1012 void **tmp;
237e200e
SH
1013 asz += BATCH_SIZE;
1014 do {
f366da65 1015 tmp = realloc(*list, asz * typesize);
237e200e
SH
1016 } while (!tmp);
1017 *list = tmp;
1018 }
4ea38a4c 1019 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
237e200e
SH
1020 (*list)[sz+1] = NULL;
1021 sz++;
1022 }
1023 if (closedir(dir) < 0) {
b8defc3d 1024 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
237e200e
SH
1025 return false;
1026 }
1027 return true;
1028}
1029
f366da65
WB
/*
 * make_children_list_entry - iterator callback producing a heap copy of
 * @dir_entry; @controller and @cgroup are ignored. The allocation is retried
 * until it succeeds.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while (!(copy = strdup(dir_entry)))
		;

	return copy;
}
1038
1039bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1040{
1041 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1042}
1043
237e200e
SH
1044void free_key(struct cgfs_files *k)
1045{
1046 if (!k)
1047 return;
1048 free(k->name);
1049 free(k);
1050}
1051
/* Free a NULL-terminated array of keys and the array itself; NULL is ignored. */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cursor;

	if (!keys)
		return;

	for (cursor = keys; *cursor; cursor++)
		free_key(*cursor);

	free(keys);
}
1063
/*
 * cgfs_get_value - read @file inside @cgroup of @controller into a newly
 * allocated string stored in *value.
 *
 * Returns true when a value was read. Returns false when the controller is
 * not mounted, the path cannot be built, the file cannot be opened (in these
 * cases *value is left untouched), or nothing could be read (*value is NULL).
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int cfd, fd, written;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	written = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (written < 0 || (size_t)written >= pathlen)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(path, fd);
	if (*value == NULL)
		return false;

	return true;
}
1090
1091struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1092{
4ea38a4c 1093 int ret, cfd;
237e200e 1094 size_t len;
f5a6d92e 1095 char *fnam, *tmpc;
237e200e
SH
1096 struct stat sb;
1097 struct cgfs_files *newkey;
237e200e 1098
f5a6d92e 1099 tmpc = find_mounted_controller(controller, &cfd);
237e200e
SH
1100 if (!tmpc)
1101 return false;
1102
1103 if (file && *file == '/')
1104 file++;
1105
06081b29 1106 if (file && strchr(file, '/'))
237e200e
SH
1107 return NULL;
1108
f5a6d92e
CB
1109 /* Make sure we pass a relative path to *at() family of functions.
1110 * . + /cgroup + / + file + \0
1111 */
4ea38a4c 1112 len = strlen(cgroup) + 3;
237e200e
SH
1113 if (file)
1114 len += strlen(file) + 1;
1115 fnam = alloca(len);
4ea38a4c
CB
1116 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1117 file ? "/" : "", file ? file : "");
237e200e 1118
4ea38a4c 1119 ret = fstatat(cfd, fnam, &sb, 0);
237e200e
SH
1120 if (ret < 0)
1121 return NULL;
1122
1123 do {
1124 newkey = malloc(sizeof(struct cgfs_files));
1125 } while (!newkey);
1126 if (file)
1127 newkey->name = must_copy_string(file);
06081b29
CB
1128 else if (strrchr(cgroup, '/'))
1129 newkey->name = must_copy_string(strrchr(cgroup, '/'));
237e200e
SH
1130 else
1131 newkey->name = must_copy_string(cgroup);
1132 newkey->uid = sb.st_uid;
1133 newkey->gid = sb.st_gid;
1134 newkey->mode = sb.st_mode;
1135
1136 return newkey;
1137}
1138
/*
 * Iterator callback handed to cgfs_iterate_cgroup() by cgfs_list_keys().
 * Looks up @dir_entry in @cgroup via cgfs_get_key(); logs an error and
 * returns NULL when the lookup fails.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller, cgroup);
	return NULL;
}
1148
1149bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1150{
1151 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
237e200e
SH
1152}
1153
/*
 * Report whether @f is a directory inside @cgroup of @controller.
 * Returns false when the controller is not mounted, the path cannot be
 * built, fstatat() fails, or the entry is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, rc;
	size_t bufsz;
	char *relpath;
	struct stat st;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* The *at() calls need a relative path:
	 * . + /cgroup + / + f + \0
	 */
	bufsz = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(bufsz);
	rc = snprintf(relpath, bufsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (rc < 0 || (size_t)rc >= bufsz)
		return false;

	if (fstatat(cfd, relpath, &st, 0) < 0)
		return false;

	return S_ISDIR(st.st_mode) ? true : false;
}
1181
1182#define SEND_CREDS_OK 0
1183#define SEND_CREDS_NOTSK 1
1184#define SEND_CREDS_FAIL 2
1185static bool recv_creds(int sock, struct ucred *cred, char *v);
1186static int wait_for_pid(pid_t pid);
1187static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
b10bdd6c 1188static int send_creds_clone_wrapper(void *arg);
237e200e
SH
1189
1190/*
b10bdd6c 1191 * clone a task which switches to @task's namespace and writes '1'.
237e200e
SH
1192 * over a unix sock so we can read the task's reaper's pid in our
1193 * namespace
b10bdd6c
FG
1194 *
1195 * Note: glibc's fork() does not respect pidns, which can lead to failed
1196 * assertions inside glibc (and thus failed forks) if the child's pid in
1197 * the pidns and the parent pid outside are identical. Using clone prevents
1198 * this issue.
237e200e
SH
1199 */
1200static void write_task_init_pid_exit(int sock, pid_t target)
1201{
237e200e
SH
1202 char fnam[100];
1203 pid_t pid;
237e200e 1204 int fd, ret;
b10bdd6c
FG
1205 size_t stack_size = sysconf(_SC_PAGESIZE);
1206 void *stack = alloca(stack_size);
237e200e
SH
1207
1208 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1209 if (ret < 0 || ret >= sizeof(fnam))
1210 _exit(1);
1211
1212 fd = open(fnam, O_RDONLY);
1213 if (fd < 0) {
1214 perror("write_task_init_pid_exit open of ns/pid");
1215 _exit(1);
1216 }
1217 if (setns(fd, 0)) {
1218 perror("write_task_init_pid_exit setns 1");
1219 close(fd);
1220 _exit(1);
1221 }
b10bdd6c 1222 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
237e200e
SH
1223 if (pid < 0)
1224 _exit(1);
1225 if (pid != 0) {
1226 if (!wait_for_pid(pid))
1227 _exit(1);
1228 _exit(0);
1229 }
b10bdd6c
FG
1230}
1231
1232static int send_creds_clone_wrapper(void *arg) {
1233 struct ucred cred;
1234 char v;
1235 int sock = *(int *)arg;
237e200e
SH
1236
1237 /* we are the child */
1238 cred.uid = 0;
1239 cred.gid = 0;
1240 cred.pid = 1;
1241 v = '1';
1242 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
b10bdd6c
FG
1243 return 1;
1244 return 0;
237e200e
SH
1245}
1246
/*
 * Determine the pid received via SCM credentials from a helper that enters
 * @task's pid namespace.
 *
 * A socketpair is created and a child is forked which runs
 * write_task_init_pid_exit() on one end; the parent reads the credentials
 * from the other end with recv_creds(). Returns the received pid, or -1 if
 * the socketpair, the fork or the receive fails. The forked child is always
 * reaped before returning.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t child, initpid = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	child = fork();
	if (child == 0) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* only attempt the receive when the fork actually produced a child */
	if (child > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (child > 0)
		wait_for_pid(child);
	return initpid;
}
1280
1281static pid_t lookup_initpid_in_store(pid_t qpid)
1282{
1283 pid_t answer = 0;
1284 struct stat sb;
1285 struct pidns_init_store *e;
1286 char fnam[100];
1287
1288 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1289 store_lock();
1290 if (stat(fnam, &sb) < 0)
1291 goto out;
1292 e = lookup_verify_initpid(&sb);
1293 if (e) {
1294 answer = e->initpid;
1295 goto out;
1296 }
1297 answer = get_init_pid_for_task(qpid);
1298 if (answer > 0)
1299 save_initpid(&sb, answer);
1300
1301out:
1302 /* we prune at end in case we are returning
1303 * the value we were about to return */
1304 prune_initpid_store();
1305 store_unlock();
1306 return answer;
1307}
1308
/*
 * Reap child @pid, retrying when waitpid() is interrupted (EINTR) or
 * returns something other than @pid.
 *
 * Returns 0 only if the child exited normally with status 0; returns -1 for
 * a non-positive @pid, a waitpid() failure, or any other exit condition.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1329
1330
1331/*
1332 * append pid to *src.
1333 * src: a pointer to a char* in which ot append the pid.
1334 * sz: the number of characters printed so far, minus trailing \0.
1335 * asz: the allocated size so far
1336 * pid: the pid to append
1337 */
1338static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1339{
1340 char tmp[30];
1341
1342 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1343
1344 if (!*src || tmplen + *sz + 1 >= *asz) {
1345 char *tmp;
1346 do {
1347 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1348 } while (!tmp);
1349 *src = tmp;
1350 *asz += BUF_RESERVE_SIZE;
1351 }
bbfd0e33 1352 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
237e200e 1353 *sz += tmplen;
237e200e
SH
1354}
1355
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line for "<ns> <host> <count>"
 * triples; lines that do not parse are skipped. Returns the mapped id, or
 * -1 (as unsigned int) if no range contains @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];
	unsigned int ns_base,   /* base id of the range in the idfile's namespace */
		     host_base, /* base id of the range in the caller's namespace */
		     range;     /* number of ids in the range */

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/* ids wrapped around - unexpected for a procfile, bail */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither sum
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	/* no range matched */
	return -1;
}
1399
1400/*
1401 * for is_privileged_over,
1402 * specify whether we require the calling uid to be root in his
1403 * namespace
1404 */
1405#define NS_ROOT_REQD true
1406#define NS_ROOT_OPT false
1407
1408#define PROCLEN 100
1409
1410static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1411{
1412 char fpath[PROCLEN];
1413 int ret;
1414 bool answer = false;
1415 uid_t nsuid;
1416
1417 if (victim == -1 || uid == -1)
1418 return false;
1419
1420 /*
1421 * If the request is one not requiring root in the namespace,
1422 * then having the same uid suffices. (i.e. uid 1000 has write
1423 * access to files owned by uid 1000
1424 */
1425 if (!req_ns_root && uid == victim)
1426 return true;
1427
1428 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1429 if (ret < 0 || ret >= PROCLEN)
1430 return false;
1431 FILE *f = fopen(fpath, "r");
1432 if (!f)
1433 return false;
1434
1435 /* if caller's not root in his namespace, reject */
1436 nsuid = convert_id_to_ns(f, uid);
1437 if (nsuid)
1438 goto out;
1439
1440 /*
1441 * If victim is not mapped into caller's ns, reject.
1442 * XXX I'm not sure this check is needed given that fuse
1443 * will be sending requests where the vfs has converted
1444 */
1445 nsuid = convert_id_to_ns(f, victim);
1446 if (nsuid == -1)
1447 goto out;
1448
1449 answer = true;
1450
1451out:
1452 fclose(f);
1453 return answer;
1454}
1455
/*
 * Check whether the "other" permission bits of @fmode allow the access mode
 * requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR after masking with
 * O_ACCMODE). Callers shift @fmode beforehand to test owner or group bits.
 * Any other access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t need;

	if (access == O_RDONLY)
		need = S_IROTH;
	else if (access == O_WRONLY)
		need = S_IWOTH;
	else if (access == O_RDWR)
		need = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & need) == need;
}
1475
1476
/*
 * Return the first path component of @taskcg that follows @querycg.
 *
 * @taskcg must be strictly longer than @querycg, e.g.
 *   taskcg  is a/b/c/d/e
 *   querycg is a/b/c
 *   we return 'd'
 * When @querycg is "/" or "./" only the first character of @taskcg is
 * skipped. The result is newly allocated and must be freed by the caller;
 * NULL is returned on bad input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (!strcmp(querycg, "/") || !strcmp(querycg, "./"))
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* cut at the next separator so only one component remains */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1502
/*
 * Remove a single trailing '\n' from @x in place, if there is one.
 */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1509
1510static char *get_pid_cgroup(pid_t pid, const char *contrl)
1511{
5dd3e6fd 1512 int cfd;
237e200e
SH
1513 char fnam[PROCLEN];
1514 FILE *f;
1515 char *answer = NULL;
1516 char *line = NULL;
1517 size_t len = 0;
1518 int ret;
5dd3e6fd 1519 const char *h = find_mounted_controller(contrl, &cfd);
237e200e
SH
1520 if (!h)
1521 return NULL;
1522
1523 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1524 if (ret < 0 || ret >= PROCLEN)
1525 return NULL;
1526 if (!(f = fopen(fnam, "r")))
1527 return NULL;
1528
1529 while (getline(&line, &len, f) != -1) {
1530 char *c1, *c2;
1531 if (!line[0])
1532 continue;
1533 c1 = strchr(line, ':');
1534 if (!c1)
1535 goto out;
1536 c1++;
1537 c2 = strchr(c1, ':');
1538 if (!c2)
1539 goto out;
1540 *c2 = '\0';
1541 if (strcmp(c1, h) != 0)
1542 continue;
1543 c2++;
1544 stripnewline(c2);
1545 do {
1546 answer = strdup(c2);
1547 } while (!answer);
1548 break;
1549 }
1550
1551out:
1552 fclose(f);
1553 free(line);
1554 return answer;
1555}
1556
1557/*
1558 * check whether a fuse context may access a cgroup dir or file
1559 *
1560 * If file is not null, it is a cgroup file to check under cg.
1561 * If file is null, then we are checking perms on cg itself.
1562 *
1563 * For files we can check the mode of the list_keys result.
1564 * For cgroups, we must make assumptions based on the files under the
1565 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1566 * yet.
1567 */
1568static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1569{
1570 struct cgfs_files *k = NULL;
1571 bool ret = false;
1572
1573 k = cgfs_get_key(contrl, cg, file);
1574 if (!k)
1575 return false;
1576
1577 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1578 if (perms_include(k->mode >> 6, mode)) {
1579 ret = true;
1580 goto out;
1581 }
1582 }
1583 if (fc->gid == k->gid) {
1584 if (perms_include(k->mode >> 3, mode)) {
1585 ret = true;
1586 goto out;
1587 }
1588 }
1589 ret = perms_include(k->mode, mode);
1590
1591out:
1592 free_key(k);
1593 return ret;
1594}
1595
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place. If the whole string is
 * "/init.scope", the leading '/' is kept so the result is "/".
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + (cg_len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	/* keep the root slash when nothing precedes the suffix */
	if (suffix == cg)
		suffix++;
	*suffix = '\0';
}
1613
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true when the task's cgroup path (after prune_init_slice()) is a
 * prefix of @cg. If the answer is false and @nextcg is not NULL, *@nextcg
 * receives the result of get_next_cgroup_dir(), i.e. the next cgroup
 * directory under @cg, which must be freed by the caller. Returns false if
 * the task's cgroup cannot be determined.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool in_ancestor;
	const char *task_path;
	char *task_cg = get_pid_cgroup(pid, contrl);

	if (!task_cg)
		return false;
	prune_init_slice(task_cg);

	/*
	 * Callers pass '/' or './' (openat()) for the root cgroup, otherwise
	 * a cgroup without leading '/'; in the latter case the first
	 * character of the task's path is skipped before comparing.
	 * NOTE(review): an earlier TODO in this spot questioned why the
	 * skip is wanted when *cg != '/' - still unanswered.
	 */
	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		task_path = task_cg;
	else
		task_path = task_cg + 1;

	in_ancestor = strncmp(task_path, cg, strlen(task_path)) == 0;
	if (!in_ancestor && nextcg)
		*nextcg = get_next_cgroup_dir(task_path, cg);

	free(task_cg);
	return in_ancestor;
}
1656
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is always visible. Otherwise @cg is visible when
 * the task is in the root cgroup, when @cg equals the task's cgroup, or
 * when one of the two is a '/'-delimited prefix of the other. Returns false
 * if the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible = false;
	char *full, *task_cg;
	size_t target_len, task_len;

	if (!strcmp(cg, "/") || !strcmp(cg, "./"))
		return true;

	full = get_pid_cgroup(pid, contrl);
	if (!full)
		return false;
	prune_init_slice(full);

	/* compare without the first character of the task's path */
	task_cg = full + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0 || strcmp(cg, task_cg) == 0) {
		/* task is in the root cg (sees everything) or exactly in cg */
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	}

	free(full);
	return visible;
}
1707
1708/*
1709 * given /cgroup/freezer/a/b, return "freezer".
1710 * the returned char* should NOT be freed.
1711 */
1712static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1713{
1714 const char *p1;
1715 char *contr, *slash;
1716
99142521 1717 if (strlen(path) < 9) {
e254948f 1718 errno = EACCES;
237e200e 1719 return NULL;
99142521
CB
1720 }
1721 if (*(path + 7) != '/') {
1722 errno = EINVAL;
237e200e 1723 return NULL;
99142521 1724 }
3adc421c 1725 p1 = path + 8;
237e200e 1726 contr = strdupa(p1);
99142521
CB
1727 if (!contr) {
1728 errno = ENOMEM;
237e200e 1729 return NULL;
99142521 1730 }
237e200e
SH
1731 slash = strstr(contr, "/");
1732 if (slash)
1733 *slash = '\0';
1734
1735 int i;
3adc421c 1736 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
1737 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1738 return hierarchies[i];
1739 }
99142521 1740 errno = ENOENT;
237e200e
SH
1741 return NULL;
1742}
1743
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, with errno cleared. Returns NULL with errno = EACCES when @path
 * is shorter than 9 characters, or errno = EINVAL when there is no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strstr(path + 8, "/");
	if (slash) {
		errno = 0;
		return slash + 1;
	}

	errno = EINVAL;
	return NULL;
}
1764
/*
 * Split the last path element from the path in @cg.
 *
 * *@dir receives a newly allocated copy of @cg cut at its last '/' (the
 * whole of @cg if it has no '/') and should be freed by the caller.
 * *@last points into @cg at its last '/' (slash included) and must not be
 * freed; it is NULL when @cg has no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy, *sep;

	/* the copy is retried until it succeeds */
	do {
		copy = strdup(cg);
	} while (!copy);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (!*last)
		return;

	sep = strrchr(copy, '/');
	*sep = '\0';
}
1784
1785/*
1786 * FUSE ops for /cgroup
1787 */
1788
1789int cg_getattr(const char *path, struct stat *sb)
1790{
1791 struct timespec now;
1792 struct fuse_context *fc = fuse_get_context();
1793 char * cgdir = NULL;
1794 char *last = NULL, *path1, *path2;
1795 struct cgfs_files *k = NULL;
1796 const char *cgroup;
1797 const char *controller = NULL;
1798 int ret = -ENOENT;
1799
1800
1801 if (!fc)
1802 return -EIO;
1803
1804 memset(sb, 0, sizeof(struct stat));
1805
1806 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1807 return -EINVAL;
1808
1809 sb->st_uid = sb->st_gid = 0;
1810 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1811 sb->st_size = 0;
1812
1813 if (strcmp(path, "/cgroup") == 0) {
1814 sb->st_mode = S_IFDIR | 00755;
1815 sb->st_nlink = 2;
1816 return 0;
1817 }
1818
1819 controller = pick_controller_from_path(fc, path);
1820 if (!controller)
2f7036d0 1821 return -errno;
237e200e
SH
1822 cgroup = find_cgroup_in_path(path);
1823 if (!cgroup) {
1824 /* this is just /cgroup/controller, return it as a dir */
1825 sb->st_mode = S_IFDIR | 00755;
1826 sb->st_nlink = 2;
1827 return 0;
1828 }
1829
1830 get_cgdir_and_path(cgroup, &cgdir, &last);
1831
1832 if (!last) {
1833 path1 = "/";
1834 path2 = cgdir;
1835 } else {
1836 path1 = cgdir;
1837 path2 = last;
1838 }
1839
1840 pid_t initpid = lookup_initpid_in_store(fc->pid);
1841 if (initpid <= 0)
1842 initpid = fc->pid;
1843 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1844 * Then check that caller's cgroup is under path if last is a child
1845 * cgroup, or cgdir if last is a file */
1846
1847 if (is_child_cgroup(controller, path1, path2)) {
1848 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1849 ret = -ENOENT;
1850 goto out;
1851 }
1852 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1853 /* this is just /cgroup/controller, return it as a dir */
1854 sb->st_mode = S_IFDIR | 00555;
1855 sb->st_nlink = 2;
1856 ret = 0;
1857 goto out;
1858 }
1859 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1860 ret = -EACCES;
1861 goto out;
1862 }
1863
1864 // get uid, gid, from '/tasks' file and make up a mode
1865 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1866 sb->st_mode = S_IFDIR | 00755;
1867 k = cgfs_get_key(controller, cgroup, NULL);
1868 if (!k) {
1869 sb->st_uid = sb->st_gid = 0;
1870 } else {
1871 sb->st_uid = k->uid;
1872 sb->st_gid = k->gid;
1873 }
1874 free_key(k);
1875 sb->st_nlink = 2;
1876 ret = 0;
1877 goto out;
1878 }
1879
1880 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1881 sb->st_mode = S_IFREG | k->mode;
1882 sb->st_nlink = 1;
1883 sb->st_uid = k->uid;
1884 sb->st_gid = k->gid;
1885 sb->st_size = 0;
1886 free_key(k);
1887 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1888 ret = -ENOENT;
1889 goto out;
1890 }
237e200e
SH
1891 ret = 0;
1892 }
1893
1894out:
1895 free(cgdir);
1896 return ret;
1897}
1898
1899int cg_opendir(const char *path, struct fuse_file_info *fi)
1900{
1901 struct fuse_context *fc = fuse_get_context();
1902 const char *cgroup;
1903 struct file_info *dir_info;
1904 char *controller = NULL;
1905
1906 if (!fc)
1907 return -EIO;
1908
1909 if (strcmp(path, "/cgroup") == 0) {
1910 cgroup = NULL;
1911 controller = NULL;
1912 } else {
1913 // return list of keys for the controller, and list of child cgroups
1914 controller = pick_controller_from_path(fc, path);
1915 if (!controller)
2f7036d0 1916 return -errno;
237e200e
SH
1917
1918 cgroup = find_cgroup_in_path(path);
1919 if (!cgroup) {
1920 /* this is just /cgroup/controller, return its contents */
1921 cgroup = "/";
1922 }
1923 }
1924
1925 pid_t initpid = lookup_initpid_in_store(fc->pid);
1926 if (initpid <= 0)
1927 initpid = fc->pid;
1928 if (cgroup) {
1929 if (!caller_may_see_dir(initpid, controller, cgroup))
1930 return -ENOENT;
1931 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1932 return -EACCES;
1933 }
1934
1935 /* we'll free this at cg_releasedir */
1936 dir_info = malloc(sizeof(*dir_info));
1937 if (!dir_info)
1938 return -ENOMEM;
1939 dir_info->controller = must_copy_string(controller);
1940 dir_info->cgroup = must_copy_string(cgroup);
1941 dir_info->type = LXC_TYPE_CGDIR;
1942 dir_info->buf = NULL;
1943 dir_info->file = NULL;
1944 dir_info->buflen = 0;
1945
1946 fi->fh = (unsigned long)dir_info;
1947 return 0;
1948}
1949
1950int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1951 struct fuse_file_info *fi)
1952{
1953 struct file_info *d = (struct file_info *)fi->fh;
1954 struct cgfs_files **list = NULL;
1955 int i, ret;
1956 char *nextcg = NULL;
1957 struct fuse_context *fc = fuse_get_context();
1958 char **clist = NULL;
1959
d639f863
CB
1960 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1961 return -EIO;
1962
237e200e 1963 if (d->type != LXC_TYPE_CGDIR) {
b8defc3d 1964 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
237e200e
SH
1965 return -EIO;
1966 }
1967 if (!d->cgroup && !d->controller) {
1968 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1969 int i;
1970
1971 for (i = 0; i < num_hierarchies; i++) {
1972 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1973 return -EIO;
1974 }
1975 }
1976 return 0;
1977 }
1978
1979 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1980 // not a valid cgroup
1981 ret = -EINVAL;
1982 goto out;
1983 }
1984
1985 pid_t initpid = lookup_initpid_in_store(fc->pid);
1986 if (initpid <= 0)
1987 initpid = fc->pid;
1988 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1989 if (nextcg) {
1990 ret = filler(buf, nextcg, NULL, 0);
1991 free(nextcg);
1992 if (ret != 0) {
1993 ret = -EIO;
1994 goto out;
1995 }
1996 }
1997 ret = 0;
1998 goto out;
1999 }
2000
2001 for (i = 0; list[i]; i++) {
2002 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2003 ret = -EIO;
2004 goto out;
2005 }
2006 }
2007
2008 // now get the list of child cgroups
2009
2010 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2011 ret = 0;
2012 goto out;
2013 }
f366da65
WB
2014 if (clist) {
2015 for (i = 0; clist[i]; i++) {
2016 if (filler(buf, clist[i], NULL, 0) != 0) {
2017 ret = -EIO;
2018 goto out;
2019 }
237e200e
SH
2020 }
2021 }
2022 ret = 0;
2023
2024out:
2025 free_keys(list);
2026 if (clist) {
2027 for (i = 0; clist[i]; i++)
2028 free(clist[i]);
2029 free(clist);
2030 }
2031 return ret;
2032}
2033
43215927 2034static void do_release_file_info(struct fuse_file_info *fi)
237e200e 2035{
43215927
SH
2036 struct file_info *f = (struct file_info *)fi->fh;
2037
237e200e
SH
2038 if (!f)
2039 return;
43215927
SH
2040
2041 fi->fh = 0;
2042
237e200e 2043 free(f->controller);
43215927 2044 f->controller = NULL;
237e200e 2045 free(f->cgroup);
43215927 2046 f->cgroup = NULL;
237e200e 2047 free(f->file);
43215927 2048 f->file = NULL;
237e200e 2049 free(f->buf);
43215927 2050 f->buf = NULL;
237e200e
SH
2051 free(f);
2052}
2053
/*
 * FUSE releasedir: drop the file_info attached by cg_opendir().
 * @path is not used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2059
2060int cg_open(const char *path, struct fuse_file_info *fi)
2061{
2062 const char *cgroup;
2063 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2064 struct cgfs_files *k = NULL;
2065 struct file_info *file_info;
2066 struct fuse_context *fc = fuse_get_context();
2067 int ret;
2068
2069 if (!fc)
2070 return -EIO;
2071
2072 controller = pick_controller_from_path(fc, path);
2073 if (!controller)
2f7036d0 2074 return -errno;
237e200e
SH
2075 cgroup = find_cgroup_in_path(path);
2076 if (!cgroup)
bc70ba9b 2077 return -errno;
237e200e
SH
2078
2079 get_cgdir_and_path(cgroup, &cgdir, &last);
2080 if (!last) {
2081 path1 = "/";
2082 path2 = cgdir;
2083 } else {
2084 path1 = cgdir;
2085 path2 = last;
2086 }
2087
2088 k = cgfs_get_key(controller, path1, path2);
2089 if (!k) {
2090 ret = -EINVAL;
2091 goto out;
2092 }
2093 free_key(k);
2094
2095 pid_t initpid = lookup_initpid_in_store(fc->pid);
2096 if (initpid <= 0)
2097 initpid = fc->pid;
2098 if (!caller_may_see_dir(initpid, controller, path1)) {
2099 ret = -ENOENT;
2100 goto out;
2101 }
2102 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
237e200e
SH
2103 ret = -EACCES;
2104 goto out;
2105 }
2106
2107 /* we'll free this at cg_release */
2108 file_info = malloc(sizeof(*file_info));
2109 if (!file_info) {
2110 ret = -ENOMEM;
2111 goto out;
2112 }
2113 file_info->controller = must_copy_string(controller);
2114 file_info->cgroup = must_copy_string(path1);
2115 file_info->file = must_copy_string(path2);
2116 file_info->type = LXC_TYPE_CGFILE;
2117 file_info->buf = NULL;
2118 file_info->buflen = 0;
2119
2120 fi->fh = (unsigned long)file_info;
2121 ret = 0;
2122
2123out:
2124 free(cgdir);
2125 return ret;
2126}
2127
bddbb106
SH
2128int cg_access(const char *path, int mode)
2129{
6f0f6b83 2130 int ret;
bddbb106 2131 const char *cgroup;
6f0f6b83
CB
2132 char *path1, *path2, *controller;
2133 char *last = NULL, *cgdir = NULL;
bddbb106
SH
2134 struct cgfs_files *k = NULL;
2135 struct fuse_context *fc = fuse_get_context();
6f0f6b83 2136
9873c5e8 2137 if (strcmp(path, "/cgroup") == 0)
6f0f6b83 2138 return 0;
bddbb106
SH
2139
2140 if (!fc)
2141 return -EIO;
2142
2143 controller = pick_controller_from_path(fc, path);
2144 if (!controller)
2f7036d0 2145 return -errno;
bddbb106 2146 cgroup = find_cgroup_in_path(path);
575316c4
SH
2147 if (!cgroup) {
2148 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
3f441bc7
SH
2149 if ((mode & W_OK) == 0)
2150 return 0;
2151 return -EACCES;
575316c4 2152 }
bddbb106
SH
2153
2154 get_cgdir_and_path(cgroup, &cgdir, &last);
2155 if (!last) {
2156 path1 = "/";
2157 path2 = cgdir;
2158 } else {
2159 path1 = cgdir;
2160 path2 = last;
2161 }
2162
2163 k = cgfs_get_key(controller, path1, path2);
2164 if (!k) {
3f441bc7
SH
2165 if ((mode & W_OK) == 0)
2166 ret = 0;
2167 else
2168 ret = -EACCES;
bddbb106
SH
2169 goto out;
2170 }
2171 free_key(k);
2172
2173 pid_t initpid = lookup_initpid_in_store(fc->pid);
2174 if (initpid <= 0)
2175 initpid = fc->pid;
2176 if (!caller_may_see_dir(initpid, controller, path1)) {
2177 ret = -ENOENT;
2178 goto out;
2179 }
2180 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2181 ret = -EACCES;
2182 goto out;
2183 }
2184
2185 ret = 0;
2186
2187out:
2188 free(cgdir);
2189 return ret;
2190}
2191
237e200e
SH
/*
 * FUSE release: drop the file_info attached by cg_open().
 * @path is not used. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
2197
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events, for at most
 * roughly @timeout seconds measured with time().
 *
 * Returns true when an event was reported. Returns false on timeout
 * (errno is set to 0 when the deadline has already passed) or on error;
 * for an epoll_wait() failure or expiry errno holds the value left by
 * epoll_wait(). Interrupted waits (EINTR) are restarted with the remaining
 * time.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, deltatime, saved_errno;
	time_t now, starttime, remaining;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	remaining = (starttime + timeout) - now;
	if (remaining < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	/* a clock stepped backwards must not extend the wait (or overflow
	 * the millisecond computation below) */
	if (remaining > timeout)
		remaining = timeout;
	deltatime = (int)remaining;

	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may change errno; keep the value from epoll_wait() */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2245
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most two
 * seconds (via wait_for_sock()) for data to arrive. Returns the result of
 * the non-blocking recv(), or -1 if the wait fails.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool readable = wait_for_sock(sockfd, 2);

	if (!readable)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2252
2253static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2254{
2255 struct msghdr msg = { 0 };
2256 struct iovec iov;
2257 struct cmsghdr *cmsg;
2258 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2259 char buf[1];
2260 buf[0] = 'p';
2261
2262 if (pingfirst) {
2263 if (msgrecv(sock, buf, 1) != 1) {
b8defc3d 2264 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
237e200e
SH
2265 return SEND_CREDS_FAIL;
2266 }
2267 }
2268
2269 msg.msg_control = cmsgbuf;
2270 msg.msg_controllen = sizeof(cmsgbuf);
2271
2272 cmsg = CMSG_FIRSTHDR(&msg);
2273 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2274 cmsg->cmsg_level = SOL_SOCKET;
2275 cmsg->cmsg_type = SCM_CREDENTIALS;
2276 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2277
2278 msg.msg_name = NULL;
2279 msg.msg_namelen = 0;
2280
2281 buf[0] = v;
2282 iov.iov_base = buf;
2283 iov.iov_len = sizeof(buf);
2284 msg.msg_iov = &iov;
2285 msg.msg_iovlen = 1;
2286
2287 if (sendmsg(sock, &msg, 0) < 0) {
b8defc3d 2288 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
237e200e
SH
2289 if (errno == 3)
2290 return SEND_CREDS_NOTSK;
2291 return SEND_CREDS_FAIL;
2292 }
2293
2294 return SEND_CREDS_OK;
2295}
2296
/*
 * Receive credentials sent by send_creds() on @sock.
 *
 * Enables SO_PASSCRED, writes a one-byte '1' ping to the peer, waits up to
 * two seconds for the answer and reads it with recvmsg(). *@cred is preset
 * to -1/-1/-1 and only overwritten when the message carries a well-formed
 * SCM_CREDENTIALS control message; *@v is preset to '1' and replaced by the
 * received payload byte.
 *
 * Returns true once a message was received (even without credentials),
 * false if any socket operation fails or times out.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	int enable = 1;
	int received;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* ping the sender so it starts transmitting */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}

	received = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (received < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
2354
35174b0f
FG
/*
 * Argument bundle handed to pid_ns_clone_wrapper() through clone().
 */
struct pid_ns_clone_args {
	/* Pipe pair: the wrapper closes [0] and writes its ACK byte to [1]. */
	int *cpipe;
	/* Forwarded as the first argument of @wrapped. */
	int sock;
	/* Forwarded as the second argument of @wrapped. */
	pid_t tpid;
	/* The real worker: pid_from_ns or pid_to_ns. */
	int (*wrapped) (int, pid_t);
};
2361
2362/*
2363 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2364 * with clone(). This simply writes '1' as ACK back to the parent
2365 * before calling the actual wrapped function.
2366 */
2367static int pid_ns_clone_wrapper(void *arg) {
2368 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2369 char b = '1';
2370
2371 close(args->cpipe[0]);
b8defc3d
CB
2372 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2373 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
35174b0f
FG
2374 close(args->cpipe[1]);
2375 return args->wrapped(args->sock, args->tpid);
2376}
237e200e
SH
2377
2378/*
2379 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2380 * int value back over the socket. This shifts the pid from the
2381 * sender's pidns into tpid's pidns.
2382 */
35174b0f 2383static int pid_to_ns(int sock, pid_t tpid)
237e200e
SH
2384{
2385 char v = '0';
2386 struct ucred cred;
2387
2388 while (recv_creds(sock, &cred, &v)) {
2389 if (v == '1')
35174b0f 2390 return 0;
237e200e 2391 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
35174b0f 2392 return 1;
237e200e 2393 }
35174b0f 2394 return 0;
237e200e
SH
2395}
2396
35174b0f 2397
237e200e
SH
2398/*
2399 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
35174b0f
FG
2400 * in your old pidns. Only children which you clone will be in the target
2401 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2402 * actually convert pids.
2403 *
2404 * Note: glibc's fork() does not respect pidns, which can lead to failed
2405 * assertions inside glibc (and thus failed forks) if the child's pid in
2406 * the pidns and the parent pid outside are identical. Using clone prevents
2407 * this issue.
237e200e
SH
2408 */
2409static void pid_to_ns_wrapper(int sock, pid_t tpid)
2410{
2411 int newnsfd = -1, ret, cpipe[2];
2412 char fnam[100];
2413 pid_t cpid;
2414 char v;
2415
2416 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2417 if (ret < 0 || ret >= sizeof(fnam))
2418 _exit(1);
2419 newnsfd = open(fnam, O_RDONLY);
2420 if (newnsfd < 0)
2421 _exit(1);
2422 if (setns(newnsfd, 0) < 0)
2423 _exit(1);
2424 close(newnsfd);
2425
2426 if (pipe(cpipe) < 0)
2427 _exit(1);
2428
35174b0f
FG
2429 struct pid_ns_clone_args args = {
2430 .cpipe = cpipe,
2431 .sock = sock,
2432 .tpid = tpid,
2433 .wrapped = &pid_to_ns
2434 };
2435 size_t stack_size = sysconf(_SC_PAGESIZE);
2436 void *stack = alloca(stack_size);
2437
2438 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2439 if (cpid < 0)
2440 _exit(1);
2441
237e200e
SH
2442 // give the child 1 second to be done forking and
2443 // write its ack
2444 if (!wait_for_sock(cpipe[0], 1))
2445 _exit(1);
2446 ret = read(cpipe[0], &v, 1);
2447 if (ret != sizeof(char) || v != '1')
2448 _exit(1);
2449
2450 if (!wait_for_pid(cpid))
2451 _exit(1);
2452 _exit(0);
2453}
2454
2455/*
2456 * To read cgroup files with a particular pid, we will setns into the child
2457 * pidns, open a pipe, fork a child - which will be the first to really be in
2458 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2459 */
2460bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2461{
2462 int sock[2] = {-1, -1};
2463 char *tmpdata = NULL;
2464 int ret;
2465 pid_t qpid, cpid = -1;
2466 bool answer = false;
2467 char v = '0';
2468 struct ucred cred;
2469 size_t sz = 0, asz = 0;
2470
2471 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2472 return false;
2473
2474 /*
2475 * Now we read the pids from returned data one by one, pass
2476 * them into a child in the target namespace, read back the
2477 * translated pids, and put them into our to-return data
2478 */
2479
2480 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2481 perror("socketpair");
2482 free(tmpdata);
2483 return false;
2484 }
2485
2486 cpid = fork();
2487 if (cpid == -1)
2488 goto out;
2489
2490 if (!cpid) // child - exits when done
2491 pid_to_ns_wrapper(sock[1], tpid);
2492
2493 char *ptr = tmpdata;
2494 cred.uid = 0;
2495 cred.gid = 0;
2496 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2497 cred.pid = qpid;
2498 ret = send_creds(sock[0], &cred, v, true);
2499
2500 if (ret == SEND_CREDS_NOTSK)
2501 goto next;
2502 if (ret == SEND_CREDS_FAIL)
2503 goto out;
2504
2505 // read converted results
2506 if (!wait_for_sock(sock[0], 2)) {
b8defc3d 2507 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
237e200e
SH
2508 goto out;
2509 }
2510 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2511 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
237e200e
SH
2512 goto out;
2513 }
2514 must_strcat_pid(d, &sz, &asz, qpid);
2515next:
2516 ptr = strchr(ptr, '\n');
2517 if (!ptr)
2518 break;
2519 ptr++;
2520 }
2521
2522 cred.pid = getpid();
2523 v = '1';
2524 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2525 // failed to ask child to exit
b8defc3d 2526 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
237e200e
SH
2527 goto out;
2528 }
2529
2530 answer = true;
2531
2532out:
2533 free(tmpdata);
2534 if (cpid != -1)
2535 wait_for_pid(cpid);
2536 if (sock[0] != -1) {
2537 close(sock[0]);
2538 close(sock[1]);
2539 }
2540 return answer;
2541}
2542
2543int cg_read(const char *path, char *buf, size_t size, off_t offset,
2544 struct fuse_file_info *fi)
2545{
2546 struct fuse_context *fc = fuse_get_context();
2547 struct file_info *f = (struct file_info *)fi->fh;
2548 struct cgfs_files *k = NULL;
2549 char *data = NULL;
2550 int ret, s;
2551 bool r;
2552
2553 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2554 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
237e200e
SH
2555 return -EIO;
2556 }
2557
2558 if (offset)
2559 return 0;
2560
2561 if (!fc)
2562 return -EIO;
2563
2564 if (!f->controller)
2565 return -EINVAL;
2566
2567 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2568 return -EINVAL;
2569 }
2570 free_key(k);
2571
2572
888f8f3c 2573 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
237e200e
SH
2574 ret = -EACCES;
2575 goto out;
2576 }
2577
2578 if (strcmp(f->file, "tasks") == 0 ||
2579 strcmp(f->file, "/tasks") == 0 ||
2580 strcmp(f->file, "/cgroup.procs") == 0 ||
2581 strcmp(f->file, "cgroup.procs") == 0)
2582 // special case - we have to translate the pids
2583 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2584 else
2585 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2586
2587 if (!r) {
2588 ret = -EINVAL;
2589 goto out;
2590 }
2591
2592 if (!data) {
2593 ret = 0;
2594 goto out;
2595 }
2596 s = strlen(data);
2597 if (s > size)
2598 s = size;
2599 memcpy(buf, data, s);
2600 if (s > 0 && s < size && data[s-1] != '\n')
2601 buf[s++] = '\n';
2602
2603 ret = s;
2604
2605out:
2606 free(data);
2607 return ret;
2608}
2609
35174b0f 2610static int pid_from_ns(int sock, pid_t tpid)
237e200e
SH
2611{
2612 pid_t vpid;
2613 struct ucred cred;
2614 char v;
2615 int ret;
2616
2617 cred.uid = 0;
2618 cred.gid = 0;
2619 while (1) {
2620 if (!wait_for_sock(sock, 2)) {
b8defc3d 2621 lxcfs_error("%s\n", "Timeout reading from parent.");
35174b0f 2622 return 1;
237e200e
SH
2623 }
2624 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
b8defc3d 2625 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
35174b0f 2626 return 1;
237e200e
SH
2627 }
2628 if (vpid == -1) // done
2629 break;
2630 v = '0';
2631 cred.pid = vpid;
2632 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2633 v = '1';
2634 cred.pid = getpid();
2635 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
35174b0f 2636 return 1;
237e200e
SH
2637 }
2638 }
35174b0f 2639 return 0;
237e200e
SH
2640}
2641
2642static void pid_from_ns_wrapper(int sock, pid_t tpid)
2643{
2644 int newnsfd = -1, ret, cpipe[2];
2645 char fnam[100];
2646 pid_t cpid;
2647 char v;
2648
2649 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2650 if (ret < 0 || ret >= sizeof(fnam))
2651 _exit(1);
2652 newnsfd = open(fnam, O_RDONLY);
2653 if (newnsfd < 0)
2654 _exit(1);
2655 if (setns(newnsfd, 0) < 0)
2656 _exit(1);
2657 close(newnsfd);
2658
2659 if (pipe(cpipe) < 0)
2660 _exit(1);
2661
35174b0f
FG
2662 struct pid_ns_clone_args args = {
2663 .cpipe = cpipe,
2664 .sock = sock,
2665 .tpid = tpid,
2666 .wrapped = &pid_from_ns
2667 };
f0f8b851
SH
2668 size_t stack_size = sysconf(_SC_PAGESIZE);
2669 void *stack = alloca(stack_size);
35174b0f
FG
2670
2671 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
237e200e
SH
2672 if (cpid < 0)
2673 _exit(1);
2674
237e200e
SH
2675 // give the child 1 second to be done forking and
2676 // write its ack
2677 if (!wait_for_sock(cpipe[0], 1))
f0f8b851 2678 _exit(1);
237e200e 2679 ret = read(cpipe[0], &v, 1);
f0f8b851
SH
2680 if (ret != sizeof(char) || v != '1')
2681 _exit(1);
237e200e
SH
2682
2683 if (!wait_for_pid(cpid))
2684 _exit(1);
2685 _exit(0);
237e200e
SH
2686}
2687
/*
 * Given host @uid, look up the uid it maps to in @pid's user namespace
 * using /proc/@pid/uid_map and store it in *@answer.
 *
 * Returns true if a mapping was found. Returns false if the map file
 * cannot be opened or convert_id_to_ns() yields -1 (in which case *@answer
 * holds that -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);

	map = fopen(map_path, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2709
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/@pid/status
 * (XXX should we use euid here?)
 *
 * The first number on the "Uid:" and "Gid:" lines is used. *@uid and
 * *@gid are preset to -1 and keep that value when the status file cannot
 * be opened or the corresponding line cannot be parsed. Parsing stops at
 * the first malformed Uid:/Gid: line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;
	snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				/* pid_t is signed: print it with %d, not %u. */
				lxcfs_error("bad uid line for pid %d\n", (int)pid);
				break;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %d\n", (int)pid);
				break;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2748
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r's uid maps to root in @r's user namespace and @v's uid is
 *     mapped into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (victim_uid == r_uid)
		return true;

	/* The requestor must be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped into that namespace. */
	return hostuid_to_ns(victim_uid, r, &mapped);
}
2774
2775static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2776 const char *file, const char *buf)
2777{
2778 int sock[2] = {-1, -1};
2779 pid_t qpid, cpid = -1;
2780 FILE *pids_file = NULL;
2781 bool answer = false, fail = false;
2782
2783 pids_file = open_pids_file(contrl, cg);
2784 if (!pids_file)
2785 return false;
2786
2787 /*
2788 * write the pids to a socket, have helper in writer's pidns
2789 * call movepid for us
2790 */
2791 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2792 perror("socketpair");
2793 goto out;
2794 }
2795
2796 cpid = fork();
2797 if (cpid == -1)
2798 goto out;
2799
2800 if (!cpid) { // child
2801 fclose(pids_file);
2802 pid_from_ns_wrapper(sock[1], tpid);
2803 }
2804
2805 const char *ptr = buf;
2806 while (sscanf(ptr, "%d", &qpid) == 1) {
2807 struct ucred cred;
2808 char v;
2809
2810 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
b8defc3d 2811 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
237e200e
SH
2812 goto out;
2813 }
2814
2815 if (recv_creds(sock[0], &cred, &v)) {
2816 if (v == '0') {
2817 if (!may_move_pid(tpid, tuid, cred.pid)) {
2818 fail = true;
2819 break;
2820 }
2821 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2822 fail = true;
2823 }
2824 }
2825
2826 ptr = strchr(ptr, '\n');
2827 if (!ptr)
2828 break;
2829 ptr++;
2830 }
2831
2832 /* All good, write the value */
2833 qpid = -1;
2834 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
b8defc3d 2835 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
237e200e
SH
2836
2837 if (!fail)
2838 answer = true;
2839
2840out:
2841 if (cpid != -1)
2842 wait_for_pid(cpid);
2843 if (sock[0] != -1) {
2844 close(sock[0]);
2845 close(sock[1]);
2846 }
2847 if (pids_file) {
2848 if (fclose(pids_file) != 0)
2849 answer = false;
2850 }
2851 return answer;
2852}
2853
2854int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2855 struct fuse_file_info *fi)
2856{
2857 struct fuse_context *fc = fuse_get_context();
2858 char *localbuf = NULL;
2859 struct cgfs_files *k = NULL;
2860 struct file_info *f = (struct file_info *)fi->fh;
2861 bool r;
2862
2863 if (f->type != LXC_TYPE_CGFILE) {
b8defc3d 2864 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
237e200e
SH
2865 return -EIO;
2866 }
2867
2868 if (offset)
2869 return 0;
2870
2871 if (!fc)
2872 return -EIO;
2873
2874 localbuf = alloca(size+1);
2875 localbuf[size] = '\0';
2876 memcpy(localbuf, buf, size);
2877
2878 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2879 size = -EINVAL;
2880 goto out;
2881 }
2882
2883 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2884 size = -EACCES;
2885 goto out;
2886 }
2887
2888 if (strcmp(f->file, "tasks") == 0 ||
2889 strcmp(f->file, "/tasks") == 0 ||
2890 strcmp(f->file, "/cgroup.procs") == 0 ||
2891 strcmp(f->file, "cgroup.procs") == 0)
2892 // special case - we have to translate the pids
2893 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2894 else
2895 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2896
2897 if (!r)
2898 size = -EINVAL;
2899
2900out:
2901 free_key(k);
2902 return size;
2903}
2904
2905int cg_chown(const char *path, uid_t uid, gid_t gid)
2906{
2907 struct fuse_context *fc = fuse_get_context();
2908 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2909 struct cgfs_files *k = NULL;
2910 const char *cgroup;
2911 int ret;
2912
2913 if (!fc)
2914 return -EIO;
2915
2916 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2917 return -EPERM;
237e200e
SH
2918
2919 controller = pick_controller_from_path(fc, path);
2920 if (!controller)
bc70ba9b
CB
2921 return errno == ENOENT ? -EPERM : -errno;
2922
237e200e
SH
2923 cgroup = find_cgroup_in_path(path);
2924 if (!cgroup)
2925 /* this is just /cgroup/controller */
bc70ba9b 2926 return -EPERM;
237e200e
SH
2927
2928 get_cgdir_and_path(cgroup, &cgdir, &last);
2929
2930 if (!last) {
2931 path1 = "/";
2932 path2 = cgdir;
2933 } else {
2934 path1 = cgdir;
2935 path2 = last;
2936 }
2937
2938 if (is_child_cgroup(controller, path1, path2)) {
2939 // get uid, gid, from '/tasks' file and make up a mode
2940 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2941 k = cgfs_get_key(controller, cgroup, "tasks");
2942
2943 } else
2944 k = cgfs_get_key(controller, path1, path2);
2945
2946 if (!k) {
2947 ret = -EINVAL;
2948 goto out;
2949 }
2950
2951 /*
2952 * This being a fuse request, the uid and gid must be valid
2953 * in the caller's namespace. So we can just check to make
2954 * sure that the caller is root in his uid, and privileged
2955 * over the file's current owner.
2956 */
2957 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2958 ret = -EACCES;
2959 goto out;
2960 }
2961
2962 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2963
2964out:
2965 free_key(k);
2966 free(cgdir);
2967
2968 return ret;
2969}
2970
2971int cg_chmod(const char *path, mode_t mode)
2972{
2973 struct fuse_context *fc = fuse_get_context();
2974 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2975 struct cgfs_files *k = NULL;
2976 const char *cgroup;
2977 int ret;
2978
2979 if (!fc)
2980 return -EIO;
2981
2982 if (strcmp(path, "/cgroup") == 0)
bc70ba9b 2983 return -EPERM;
237e200e
SH
2984
2985 controller = pick_controller_from_path(fc, path);
2986 if (!controller)
bc70ba9b
CB
2987 return errno == ENOENT ? -EPERM : -errno;
2988
237e200e
SH
2989 cgroup = find_cgroup_in_path(path);
2990 if (!cgroup)
2991 /* this is just /cgroup/controller */
bc70ba9b 2992 return -EPERM;
237e200e
SH
2993
2994 get_cgdir_and_path(cgroup, &cgdir, &last);
2995
2996 if (!last) {
2997 path1 = "/";
2998 path2 = cgdir;
2999 } else {
3000 path1 = cgdir;
3001 path2 = last;
3002 }
3003
3004 if (is_child_cgroup(controller, path1, path2)) {
3005 // get uid, gid, from '/tasks' file and make up a mode
3006 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3007 k = cgfs_get_key(controller, cgroup, "tasks");
3008
3009 } else
3010 k = cgfs_get_key(controller, path1, path2);
3011
3012 if (!k) {
3013 ret = -EINVAL;
3014 goto out;
3015 }
3016
3017 /*
3018 * This being a fuse request, the uid and gid must be valid
3019 * in the caller's namespace. So we can just check to make
3020 * sure that the caller is root in his uid, and privileged
3021 * over the file's current owner.
3022 */
3023 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3024 ret = -EPERM;
3025 goto out;
3026 }
3027
3028 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3029 ret = -EINVAL;
3030 goto out;
3031 }
3032
3033 ret = 0;
3034out:
3035 free_key(k);
3036 free(cgdir);
3037 return ret;
3038}
3039
3040int cg_mkdir(const char *path, mode_t mode)
3041{
3042 struct fuse_context *fc = fuse_get_context();
3043 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3044 const char *cgroup;
3045 int ret;
3046
3047 if (!fc)
3048 return -EIO;
3049
237e200e
SH
3050 controller = pick_controller_from_path(fc, path);
3051 if (!controller)
2f7036d0 3052 return errno == ENOENT ? -EPERM : -errno;
237e200e
SH
3053
3054 cgroup = find_cgroup_in_path(path);
3055 if (!cgroup)
bc70ba9b 3056 return -errno;
237e200e
SH
3057
3058 get_cgdir_and_path(cgroup, &cgdir, &last);
3059 if (!last)
3060 path1 = "/";
3061 else
3062 path1 = cgdir;
3063
3064 pid_t initpid = lookup_initpid_in_store(fc->pid);
3065 if (initpid <= 0)
3066 initpid = fc->pid;
3067 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3068 if (!next)
3069 ret = -EINVAL;
3070 else if (last && strcmp(next, last) == 0)
3071 ret = -EEXIST;
3072 else
2f7036d0 3073 ret = -EPERM;
237e200e
SH
3074 goto out;
3075 }
3076
3077 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3078 ret = -EACCES;
3079 goto out;
3080 }
3081 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3082 ret = -EACCES;
3083 goto out;
3084 }
3085
3086 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3087
3088out:
3089 free(cgdir);
3090 free(next);
3091 return ret;
3092}
3093
3094int cg_rmdir(const char *path)
3095{
3096 struct fuse_context *fc = fuse_get_context();
3097 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3098 const char *cgroup;
3099 int ret;
3100
3101 if (!fc)
3102 return -EIO;
3103
3104 controller = pick_controller_from_path(fc, path);
e254948f
CB
3105 if (!controller) /* Someone's trying to delete "/cgroup". */
3106 return -EPERM;
237e200e
SH
3107
3108 cgroup = find_cgroup_in_path(path);
e254948f
CB
3109 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3110 return -EPERM;
237e200e
SH
3111
3112 get_cgdir_and_path(cgroup, &cgdir, &last);
3113 if (!last) {
e254948f
CB
3114 /* Someone's trying to delete a cgroup on the same level as the
3115 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3116 * rmdir "/cgroup/blkio/init.slice".
3117 */
3118 ret = -EPERM;
237e200e
SH
3119 goto out;
3120 }
3121
3122 pid_t initpid = lookup_initpid_in_store(fc->pid);
3123 if (initpid <= 0)
3124 initpid = fc->pid;
3125 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
de77249b 3126 if (!last || (next && (strcmp(next, last) == 0)))
237e200e
SH
3127 ret = -EBUSY;
3128 else
3129 ret = -ENOENT;
3130 goto out;
3131 }
3132
3133 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3134 ret = -EACCES;
3135 goto out;
3136 }
3137 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3138 ret = -EACCES;
3139 goto out;
3140 }
3141
3142 if (!cgfs_remove(controller, cgroup)) {
3143 ret = -EINVAL;
3144 goto out;
3145 }
3146
3147 ret = 0;
3148
3149out:
3150 free(cgdir);
3151 free(next);
3152 return ret;
3153}
3154
/*
 * Return true if @line begins with the string @pref (an empty @pref
 * matches everything).
 */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
3161
c6095b08
SH
/*
 * parse_memstat - pick selected "total_*" counters out of the text of a
 * memory.stat file.
 *
 * @memstat is scanned line by line. For the first table entry whose key is
 * a prefix of the line, the number following the key is parsed into the
 * matching output and that output is divided by 1024. Outputs whose key
 * never appears are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached },
		{ "total_active_anon",   active_anon },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable },
		{ "total_shmem",         shmem },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *cur = memstat;

	while (*cur) {
		char *newline;
		size_t i;

		for (i = 0; i < nfields; i++) {
			size_t keylen = strlen(fields[i].key);

			if (strncmp(cur, fields[i].key, keylen) != 0)
				continue;

			sscanf(cur + keylen, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}

		newline = strchr(cur, '\n');
		if (!newline)
			return;
		cur = newline + 1;
	}
}
3198
/*
 * get_blkio_io_value - find the line in @str that starts with
 * "<major>:<minor> <iotype>" and parse the number following it into *@v.
 * *@v is set to 0 first and stays 0 if no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (cur = str; *cur; cur++) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		/* Jump to the newline; the loop increment steps past it. */
		cur = strchr(cur, '\n');
		if (!cur)
			return;
	}
}
3221
3222static int read_file(const char *path, char *buf, size_t size,
3223 struct file_info *d)
3224{
3225 size_t linelen = 0, total_len = 0, rv = 0;
3226 char *line = NULL;
3227 char *cache = d->buf;
3228 size_t cache_size = d->buflen;
3229 FILE *f = fopen(path, "r");
3230 if (!f)
3231 return 0;
3232
3233 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3234 ssize_t l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3235 if (l < 0) {
3236 perror("Error writing to cache");
3237 rv = 0;
3238 goto err;
3239 }
3240 if (l >= cache_size) {
b8defc3d 3241 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3242 rv = 0;
3243 goto err;
3244 }
3245 cache += l;
3246 cache_size -= l;
3247 total_len += l;
3248 }
3249
3250 d->size = total_len;
a262ddb7
CB
3251 if (total_len > size)
3252 total_len = size;
237e200e
SH
3253
3254 /* read from off 0 */
3255 memcpy(buf, d->buf, total_len);
3256 rv = total_len;
3257 err:
3258 fclose(f);
3259 free(line);
3260 return rv;
3261}
3262
3263/*
3264 * FUSE ops for /proc
3265 */
3266
/*
 * get_memlimit - read @file of the "memory" controller for @cgroup and
 * return its content parsed as an unsigned decimal number. Returns
 * (unsigned long)-1 if the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	char *raw = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, file, &raw))
		limit = strtoul(raw, NULL, 10);
	else
		limit = -1;

	free(raw);
	return limit;
}
3279
/*
 * get_min_memlimit - return the smallest value of @file found in @cgroup
 * or any of its ancestors, walking up with dirname() until "/" is reached.
 *
 * Ancestors whose value cannot be read (get_memlimit() returns -1) are
 * ignored. The walk also stops as soon as dirname() no longer shortens the
 * path (e.g. "." for a relative path, or "//"), so a path that never
 * reduces to "/" cannot loop forever.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy, file);

	while (strcmp(copy, "/") != 0) {
		size_t prev_len = strlen(copy);

		copy = dirname(copy);
		/* No progress means we would spin on the same path forever. */
		if (strlen(copy) >= prev_len)
			break;

		memlimit = get_memlimit(copy, file);
		if (memlimit != (unsigned long)-1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
3296
3297static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3298 struct fuse_file_info *fi)
3299{
3300 struct fuse_context *fc = fuse_get_context();
3301 struct file_info *d = (struct file_info *)fi->fh;
3302 char *cg;
3303 char *memusage_str = NULL, *memstat_str = NULL,
018246ff 3304 *memswlimit_str = NULL, *memswusage_str = NULL;
237e200e 3305 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
c6095b08 3306 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
559eaa8f 3307 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
594a10e6 3308 hostswtotal = 0;
237e200e
SH
3309 char *line = NULL;
3310 size_t linelen = 0, total_len = 0, rv = 0;
3311 char *cache = d->buf;
3312 size_t cache_size = d->buflen;
3313 FILE *f = NULL;
3314
3315 if (offset){
3316 if (offset > d->size)
3317 return -EINVAL;
3318 if (!d->cached)
3319 return 0;
3320 int left = d->size - offset;
3321 total_len = left > size ? size: left;
3322 memcpy(buf, cache + offset, total_len);
3323 return total_len;
3324 }
3325
3326 pid_t initpid = lookup_initpid_in_store(fc->pid);
3327 if (initpid <= 0)
3328 initpid = fc->pid;
3329 cg = get_pid_cgroup(initpid, "memory");
3330 if (!cg)
3331 return read_file("/proc/meminfo", buf, size, d);
6d2f6996 3332 prune_init_slice(cg);
237e200e 3333
018246ff 3334 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
237e200e
SH
3335 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3336 goto err;
3337 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3338 goto err;
3339
3340 // Following values are allowed to fail, because swapaccount might be turned
3341 // off for current kernel
3342 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3343 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3344 {
018246ff 3345 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
237e200e
SH
3346 memswusage = strtoul(memswusage_str, NULL, 10);
3347
237e200e
SH
3348 memswlimit = memswlimit / 1024;
3349 memswusage = memswusage / 1024;
3350 }
3351
3352 memusage = strtoul(memusage_str, NULL, 10);
3353 memlimit /= 1024;
3354 memusage /= 1024;
3355
c6095b08
SH
3356 parse_memstat(memstat_str, &cached, &active_anon,
3357 &inactive_anon, &active_file, &inactive_file,
559eaa8f 3358 &unevictable, &shmem);
237e200e
SH
3359
3360 f = fopen("/proc/meminfo", "r");
3361 if (!f)
3362 goto err;
3363
3364 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3365 ssize_t l;
237e200e
SH
3366 char *printme, lbuf[100];
3367
3368 memset(lbuf, 0, 100);
3369 if (startswith(line, "MemTotal:")) {
594a10e6 3370 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
237e200e
SH
3371 if (hosttotal < memlimit)
3372 memlimit = hosttotal;
3373 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3374 printme = lbuf;
3375 } else if (startswith(line, "MemFree:")) {
3376 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3377 printme = lbuf;
3378 } else if (startswith(line, "MemAvailable:")) {
ad19b86d 3379 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
237e200e
SH
3380 printme = lbuf;
3381 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
594a10e6 3382 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
4127e51b 3383 if (hostswtotal < memswlimit)
3384 memswlimit = hostswtotal;
3385 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
237e200e
SH
3386 printme = lbuf;
3387 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
4127e51b 3388 unsigned long swaptotal = memswlimit,
b4665ce0
SH
3389 swapusage = memswusage - memusage,
3390 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3391 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
237e200e 3392 printme = lbuf;
da35d72a
SH
3393 } else if (startswith(line, "Slab:")) {
3394 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3395 printme = lbuf;
237e200e
SH
3396 } else if (startswith(line, "Buffers:")) {
3397 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3398 printme = lbuf;
3399 } else if (startswith(line, "Cached:")) {
3400 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3401 printme = lbuf;
3402 } else if (startswith(line, "SwapCached:")) {
3403 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3404 printme = lbuf;
2f306ad3 3405 } else if (startswith(line, "Active:")) {
c6095b08
SH
3406 snprintf(lbuf, 100, "Active: %8lu kB\n",
3407 active_anon + active_file);
3408 printme = lbuf;
2f306ad3 3409 } else if (startswith(line, "Inactive:")) {
c6095b08
SH
3410 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3411 inactive_anon + inactive_file);
3412 printme = lbuf;
3413 } else if (startswith(line, "Active(anon)")) {
3414 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3415 printme = lbuf;
3416 } else if (startswith(line, "Inactive(anon)")) {
3417 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3418 printme = lbuf;
3419 } else if (startswith(line, "Active(file)")) {
3420 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3421 printme = lbuf;
3422 } else if (startswith(line, "Inactive(file)")) {
3423 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3424 printme = lbuf;
3425 } else if (startswith(line, "Unevictable")) {
3426 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3427 printme = lbuf;
3428 } else if (startswith(line, "SReclaimable")) {
3429 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3430 printme = lbuf;
3431 } else if (startswith(line, "SUnreclaim")) {
3432 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3433 printme = lbuf;
559eaa8f
JS
3434 } else if (startswith(line, "Shmem:")) {
3435 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3436 printme = lbuf;
237e200e
SH
3437 } else
3438 printme = line;
3439
3440 l = snprintf(cache, cache_size, "%s", printme);
3441 if (l < 0) {
3442 perror("Error writing to cache");
3443 rv = 0;
3444 goto err;
3445
3446 }
3447 if (l >= cache_size) {
b8defc3d 3448 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3449 rv = 0;
3450 goto err;
3451 }
3452
3453 cache += l;
3454 cache_size -= l;
3455 total_len += l;
3456 }
3457
3458 d->cached = 1;
3459 d->size = total_len;
3460 if (total_len > size ) total_len = size;
3461 memcpy(buf, d->buf, total_len);
3462
3463 rv = total_len;
3464err:
3465 if (f)
3466 fclose(f);
3467 free(line);
3468 free(cg);
3469 free(memusage_str);
3470 free(memswlimit_str);
3471 free(memswusage_str);
3472 free(memstat_str);
237e200e
SH
3473 return rv;
3474}
3475
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg.
 * On success the returned string is newly allocated and must be freed by the
 * caller; NULL is returned when the value could not be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3488
3489bool cpu_in_cpuset(int cpu, const char *cpuset);
3490
/*
 * Parse the cpu number out of a "processor : N" line and report whether
 * cpu_in_cpuset() accepts that number for @cpuset. A line that does not
 * parse yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_nr;

	return sscanf(line, "processor : %d", &cpu_nr) == 1 &&
	       cpu_in_cpuset(cpu_nr, cpuset);
}
3499
/*
 * Tell whether @line is a "processor : N" line of /proc/cpuinfo, i.e.
 * whether a cpu number can be parsed from it.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
3511
3512static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3513 struct fuse_file_info *fi)
3514{
3515 struct fuse_context *fc = fuse_get_context();
3516 struct file_info *d = (struct file_info *)fi->fh;
3517 char *cg;
3518 char *cpuset = NULL;
3519 char *line = NULL;
3520 size_t linelen = 0, total_len = 0, rv = 0;
f676eb79
SH
3521 bool am_printing = false, firstline = true, is_s390x = false;
3522 int curcpu = -1, cpu;
237e200e
SH
3523 char *cache = d->buf;
3524 size_t cache_size = d->buflen;
3525 FILE *f = NULL;
3526
3527 if (offset){
3528 if (offset > d->size)
3529 return -EINVAL;
3530 if (!d->cached)
3531 return 0;
3532 int left = d->size - offset;
3533 total_len = left > size ? size: left;
3534 memcpy(buf, cache + offset, total_len);
3535 return total_len;
3536 }
3537
3538 pid_t initpid = lookup_initpid_in_store(fc->pid);
3539 if (initpid <= 0)
3540 initpid = fc->pid;
3541 cg = get_pid_cgroup(initpid, "cpuset");
3542 if (!cg)
3543 return read_file("proc/cpuinfo", buf, size, d);
6d2f6996 3544 prune_init_slice(cg);
237e200e
SH
3545
3546 cpuset = get_cpuset(cg);
3547 if (!cpuset)
3548 goto err;
3549
3550 f = fopen("/proc/cpuinfo", "r");
3551 if (!f)
3552 goto err;
3553
3554 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3555 ssize_t l;
f676eb79
SH
3556 if (firstline) {
3557 firstline = false;
3558 if (strstr(line, "IBM/S390") != NULL) {
3559 is_s390x = true;
3560 am_printing = true;
5ed9d4e2 3561 continue;
f676eb79
SH
3562 }
3563 }
5ed9d4e2
SH
3564 if (strncmp(line, "# processors:", 12) == 0)
3565 continue;
237e200e
SH
3566 if (is_processor_line(line)) {
3567 am_printing = cpuline_in_cpuset(line, cpuset);
3568 if (am_printing) {
3569 curcpu ++;
3570 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3571 if (l < 0) {
3572 perror("Error writing to cache");
3573 rv = 0;
3574 goto err;
3575 }
3576 if (l >= cache_size) {
b8defc3d 3577 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3578 rv = 0;
3579 goto err;
3580 }
3581 cache += l;
3582 cache_size -= l;
3583 total_len += l;
3584 }
3585 continue;
f676eb79
SH
3586 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3587 char *p;
3588 if (!cpu_in_cpuset(cpu, cpuset))
3589 continue;
3590 curcpu ++;
3591 p = strchr(line, ':');
3592 if (!p || !*p)
3593 goto err;
3594 p++;
5ed9d4e2 3595 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
f676eb79
SH
3596 if (l < 0) {
3597 perror("Error writing to cache");
3598 rv = 0;
3599 goto err;
3600 }
3601 if (l >= cache_size) {
b8defc3d 3602 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
f676eb79
SH
3603 rv = 0;
3604 goto err;
3605 }
3606 cache += l;
3607 cache_size -= l;
3608 total_len += l;
3609 continue;
3610
237e200e
SH
3611 }
3612 if (am_printing) {
3613 l = snprintf(cache, cache_size, "%s", line);
3614 if (l < 0) {
3615 perror("Error writing to cache");
3616 rv = 0;
3617 goto err;
3618 }
3619 if (l >= cache_size) {
b8defc3d 3620 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3621 rv = 0;
3622 goto err;
3623 }
3624 cache += l;
3625 cache_size -= l;
3626 total_len += l;
3627 }
3628 }
3629
5ed9d4e2
SH
3630 if (is_s390x) {
3631 char *origcache = d->buf;
a262ddb7 3632 ssize_t l;
5ed9d4e2
SH
3633 do {
3634 d->buf = malloc(d->buflen);
3635 } while (!d->buf);
3636 cache = d->buf;
3637 cache_size = d->buflen;
3638 total_len = 0;
3639 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3640 if (l < 0 || l >= cache_size) {
3641 free(origcache);
3642 goto err;
3643 }
3644 cache_size -= l;
3645 cache += l;
3646 total_len += l;
3647 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3648 if (l < 0 || l >= cache_size) {
3649 free(origcache);
3650 goto err;
3651 }
3652 cache_size -= l;
3653 cache += l;
3654 total_len += l;
3655 l = snprintf(cache, cache_size, "%s", origcache);
3656 free(origcache);
3657 if (l < 0 || l >= cache_size)
3658 goto err;
3659 total_len += l;
3660 }
3661
237e200e
SH
3662 d->cached = 1;
3663 d->size = total_len;
3664 if (total_len > size ) total_len = size;
3665
3666 /* read from off 0 */
3667 memcpy(buf, d->buf, total_len);
3668 rv = total_len;
3669err:
3670 if (f)
3671 fclose(f);
3672 free(line);
3673 free(cpuset);
3674 free(cg);
3675 return rv;
3676}
3677
0ecddf02 3678static uint64_t get_reaper_start_time(pid_t pid)
9ac264cf 3679{
9ac264cf 3680 int ret;
0ecddf02
CB
3681 FILE *f;
3682 uint64_t starttime;
3683 /* strlen("/proc/") = 6
3684 * +
3685 * LXCFS_NUMSTRLEN64
3686 * +
3687 * strlen("/stat") = 5
3688 * +
3689 * \0 = 1
3690 * */
3691#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3692 char path[__PROC_PID_STAT_LEN];
9ac264cf
JB
3693 pid_t qpid;
3694
3695 qpid = lookup_initpid_in_store(pid);
0ecddf02
CB
3696 if (qpid <= 0) {
3697 /* Caller can check for EINVAL on 0. */
3698 errno = EINVAL;
9ac264cf 3699 return 0;
0ecddf02 3700 }
9ac264cf 3701
0ecddf02
CB
3702 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3703 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3704 /* Caller can check for EINVAL on 0. */
3705 errno = EINVAL;
9ac264cf 3706 return 0;
0ecddf02 3707 }
9ac264cf 3708
0ecddf02
CB
3709 f = fopen(path, "r");
3710 if (!f) {
3711 /* Caller can check for EINVAL on 0. */
3712 errno = EINVAL;
9ac264cf 3713 return 0;
0ecddf02 3714 }
9ac264cf 3715
0ecddf02
CB
3716 /* Note that the *scanf() argument supression requires that length
3717 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3718 * at us. It's like telling someone you're not married and then asking
3719 * if you can bring your wife to the party.
3720 */
3721 ret = fscanf(f, "%*d " /* (1) pid %d */
3722 "%*s " /* (2) comm %s */
3723 "%*c " /* (3) state %c */
3724 "%*d " /* (4) ppid %d */
3725 "%*d " /* (5) pgrp %d */
3726 "%*d " /* (6) session %d */
3727 "%*d " /* (7) tty_nr %d */
3728 "%*d " /* (8) tpgid %d */
3729 "%*u " /* (9) flags %u */
3730 "%*u " /* (10) minflt %lu */
3731 "%*u " /* (11) cminflt %lu */
3732 "%*u " /* (12) majflt %lu */
3733 "%*u " /* (13) cmajflt %lu */
3734 "%*u " /* (14) utime %lu */
3735 "%*u " /* (15) stime %lu */
3736 "%*d " /* (16) cutime %ld */
3737 "%*d " /* (17) cstime %ld */
3738 "%*d " /* (18) priority %ld */
3739 "%*d " /* (19) nice %ld */
3740 "%*d " /* (20) num_threads %ld */
3741 "%*d " /* (21) itrealvalue %ld */
3742 "%" PRIu64, /* (22) starttime %llu */
3743 &starttime);
3744 if (ret != 1) {
3745 fclose(f);
3746 /* Caller can check for EINVAL on 0. */
3747 errno = EINVAL;
3748 return 0;
3749 }
3750
3751 fclose(f);
3752
3753 errno = 0;
3754 return starttime;
3755}
3756
/*
 * Convert the value returned by get_reaper_start_time() from clock ticks to
 * seconds using sysconf(_SC_CLK_TCK).
 *
 * Returns 0 when the start time cannot be retrieved or when no usable
 * (strictly positive) tick rate is available.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* Reject every non-positive result, not only the -1/EINVAL case:
	 * 0 would divide by zero and a negative value would be converted
	 * to a huge unsigned divisor.
	 */
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3778
3779static uint64_t get_reaper_age(pid_t pid)
3780{
3781 uint64_t procstart, uptime, procage;
3782
3783 /* We need to substract the time the process has started since system
3784 * boot minus the time when the system has started to get the actual
3785 * reaper age.
3786 */
3787 procstart = get_reaper_start_time_in_sec(pid);
3788 procage = procstart;
3789 if (procstart > 0) {
3790 int ret;
3791 struct timespec spec;
3792
3793 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3794 if (ret < 0)
3795 return 0;
3796 /* We could make this more precise here by using the tv_nsec
3797 * field in the timespec struct and convert it to milliseconds
3798 * and then create a double for the seconds and milliseconds but
3799 * that seems more work than it is worth.
3800 */
3801 uptime = spec.tv_sec;
3802 procage = uptime - procstart;
3803 }
3804
3805 return procage;
3806}
3807
8be92dd1
JS
3808/*
3809 * Returns 0 on success.
3810 * It is the caller's responsibility to free `return_usage`, unless this
3811 * function returns an error.
3812 */
3813static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
3814{
3815 int cpucount = get_nprocs();
3816 struct cpuacct_usage *cpu_usage;
3817 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
3818 int cg_cpu;
3819 uint64_t cg_user, cg_system;
3820 int64_t ticks_per_sec;
3821 char *usage_str = NULL;
3822
3823 ticks_per_sec = sysconf(_SC_CLK_TCK);
3824
3825 if (ticks_per_sec < 0 && errno == EINVAL) {
3826 lxcfs_debug(
3827 "%s\n",
3828 "read_cpuacct_usage_all failed to determine number of clock ticks "
3829 "in a second");
3830 return -1;
3831 }
3832
3833 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3834 if (!cpu_usage)
3835 return -ENOMEM;
3836
3837 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3838 rv = -1;
3839 goto err;
3840 }
3841
3842 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3843 lxcfs_error("read_cpuacct_usage_all reading first line from "
3844 "%s/cpuacct.usage_all failed.\n", cg);
3845 rv = -1;
3846 goto err;
3847 }
3848
3849 read_pos += read_cnt;
3850
3851 for (i = 0, j = 0; i < cpucount; i++) {
3852 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3853 &cg_system, &read_cnt);
3854
3855 if (ret == EOF)
3856 break;
3857
3858 if (ret != 3) {
3859 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3860 "failed.\n", cg);
3861 rv = -1;
3862 goto err;
3863 }
3864
3865 read_pos += read_cnt;
3866
3867 if (!cpu_in_cpuset(i, cpuset))
3868 continue;
3869
3870 /* Convert the time from nanoseconds to USER_HZ */
3871 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3872 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3873 j++;
3874 }
3875
3876 rv = 0;
3877 *return_usage = cpu_usage;
3878
3879err:
3880 if (usage_str)
3881 free(usage_str);
3882
3883 if (rv != 0) {
3884 free(cpu_usage);
3885 *return_usage = NULL;
3886 }
3887
3888 return rv;
3889}
3890
f34de69a 3891#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
237e200e
SH
3892static int proc_stat_read(char *buf, size_t size, off_t offset,
3893 struct fuse_file_info *fi)
3894{
3895 struct fuse_context *fc = fuse_get_context();
3896 struct file_info *d = (struct file_info *)fi->fh;
3897 char *cg;
3898 char *cpuset = NULL;
3899 char *line = NULL;
3900 size_t linelen = 0, total_len = 0, rv = 0;
3901 int curcpu = -1; /* cpu numbering starts at 0 */
7144f069 3902 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
237e200e 3903 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
7144f069 3904 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
237e200e
SH
3905 char cpuall[CPUALL_MAX_SIZE];
3906 /* reserve for cpu all */
3907 char *cache = d->buf + CPUALL_MAX_SIZE;
3908 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3909 FILE *f = NULL;
8be92dd1 3910 struct cpuacct_usage *cg_cpu_usage = NULL;
237e200e
SH
3911
3912 if (offset){
3913 if (offset > d->size)
3914 return -EINVAL;
3915 if (!d->cached)
3916 return 0;
3917 int left = d->size - offset;
3918 total_len = left > size ? size: left;
3919 memcpy(buf, d->buf + offset, total_len);
3920 return total_len;
3921 }
3922
3923 pid_t initpid = lookup_initpid_in_store(fc->pid);
3924 if (initpid <= 0)
3925 initpid = fc->pid;
3926 cg = get_pid_cgroup(initpid, "cpuset");
3927 if (!cg)
3928 return read_file("/proc/stat", buf, size, d);
6d2f6996 3929 prune_init_slice(cg);
237e200e
SH
3930
3931 cpuset = get_cpuset(cg);
3932 if (!cpuset)
3933 goto err;
3934
8be92dd1
JS
3935 /*
3936 * Read cpuacct.usage_all for all CPUs.
3937 * If the cpuacct cgroup is present, it is used to calculate the container's
3938 * CPU usage. If not, values from the host's /proc/stat are used.
3939 */
3940 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
3941 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
3942 "falling back to the host's /proc/stat");
3943 }
3944
237e200e
SH
3945 f = fopen("/proc/stat", "r");
3946 if (!f)
3947 goto err;
3948
3949 //skip first line
3950 if (getline(&line, &linelen, f) < 0) {
b8defc3d 3951 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
237e200e
SH
3952 goto err;
3953 }
3954
3955 while (getline(&line, &linelen, f) != -1) {
a262ddb7 3956 ssize_t l;
237e200e
SH
3957 int cpu;
3958 char cpu_char[10]; /* That's a lot of cores */
3959 char *c;
8be92dd1
JS
3960 uint64_t all_used, cg_used, new_idle;
3961 int ret;
237e200e 3962
b4665ce0
SH
3963 if (strlen(line) == 0)
3964 continue;
237e200e
SH
3965 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3966 /* not a ^cpuN line containing a number N, just print it */
9502bae2 3967 l = snprintf(cache, cache_size, "%s", line);
237e200e
SH
3968 if (l < 0) {
3969 perror("Error writing to cache");
3970 rv = 0;
3971 goto err;
3972 }
3973 if (l >= cache_size) {
b8defc3d 3974 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
3975 rv = 0;
3976 goto err;
3977 }
3978 cache += l;
3979 cache_size -= l;
3980 total_len += l;
3981 continue;
3982 }
3983
3984 if (sscanf(cpu_char, "%d", &cpu) != 1)
3985 continue;
3986 if (!cpu_in_cpuset(cpu, cpuset))
3987 continue;
3988 curcpu ++;
3989
8be92dd1 3990 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
7144f069
CB
3991 &user,
3992 &nice,
3993 &system,
3994 &idle,
3995 &iowait,
3996 &irq,
3997 &softirq,
3998 &steal,
3999 &guest,
8be92dd1
JS
4000 &guest_nice);
4001
4002 if (ret != 10 || !cg_cpu_usage) {
4003 c = strchr(line, ' ');
4004 if (!c)
4005 continue;
4006 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4007 if (l < 0) {
4008 perror("Error writing to cache");
4009 rv = 0;
4010 goto err;
4011
4012 }
4013 if (l >= cache_size) {
4014 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4015 rv = 0;
4016 goto err;
4017 }
4018
4019 cache += l;
4020 cache_size -= l;
4021 total_len += l;
4022
4023 if (ret != 10)
4024 continue;
4025 }
4026
4027 if (cg_cpu_usage) {
4028 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4029 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4030
4031 if (all_used >= cg_used) {
4032 new_idle = idle + (all_used - cg_used);
4033
4034 } else {
4035 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4036 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4037 curcpu, cg, all_used, cg_used);
4038 new_idle = idle;
4039 }
4040
4041 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4042 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4043 new_idle);
4044
4045 if (l < 0) {
4046 perror("Error writing to cache");
4047 rv = 0;
4048 goto err;
4049
4050 }
4051 if (l >= cache_size) {
4052 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4053 rv = 0;
4054 goto err;
4055 }
4056
4057 cache += l;
4058 cache_size -= l;
4059 total_len += l;
4060
4061 user_sum += cg_cpu_usage[curcpu].user;
4062 system_sum += cg_cpu_usage[curcpu].system;
4063 idle_sum += new_idle;
4064
4065 } else {
4066 user_sum += user;
4067 nice_sum += nice;
4068 system_sum += system;
4069 idle_sum += idle;
4070 iowait_sum += iowait;
4071 irq_sum += irq;
4072 softirq_sum += softirq;
4073 steal_sum += steal;
4074 guest_sum += guest;
4075 guest_nice_sum += guest_nice;
4076 }
237e200e
SH
4077 }
4078
4079 cache = d->buf;
4080
7144f069
CB
4081 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4082 user_sum,
4083 nice_sum,
4084 system_sum,
4085 idle_sum,
4086 iowait_sum,
4087 irq_sum,
4088 softirq_sum,
4089 steal_sum,
4090 guest_sum,
4091 guest_nice_sum);
4092 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
237e200e
SH
4093 memcpy(cache, cpuall, cpuall_len);
4094 cache += cpuall_len;
7144f069 4095 } else {
237e200e 4096 /* shouldn't happen */
b8defc3d 4097 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
237e200e
SH
4098 cpuall_len = 0;
4099 }
4100
4101 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4102 total_len += cpuall_len;
4103 d->cached = 1;
4104 d->size = total_len;
7144f069
CB
4105 if (total_len > size)
4106 total_len = size;
237e200e
SH
4107
4108 memcpy(buf, d->buf, total_len);
4109 rv = total_len;
4110
4111err:
4112 if (f)
4113 fclose(f);
8be92dd1
JS
4114 if (cg_cpu_usage)
4115 free(cg_cpu_usage);
237e200e
SH
4116 free(line);
4117 free(cpuset);
4118 free(cg);
4119 return rv;
4120}
4121
0ecddf02
CB
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 *
 * Returns the value of cpuacct.usage divided by 10^9 (the divisor implies the
 * counter is in nanoseconds), or 0 when it cannot be determined.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (!cgroup)
		goto out;
	prune_init_slice(cgroup);
	if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		goto out;
	/* Parse with strtoull() and scale before narrowing: where unsigned
	 * long is 32 bits, strtoul() would saturate at ULONG_MAX after about
	 * four seconds' worth of nanoseconds.
	 */
	usage = (unsigned long)(strtoull(usage_str, NULL, 10) / 1000000000ULL);

out:
	free(cgroup);
	free(usage_str);
	return usage;
}
4152
#if RELOADTEST
/*
 * Reload-test helper: create (or truncate) the marker file
 * /tmp/lxcfs-iwashere. Failure to create it is silently ignored.
 */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;
	close(marker);
}
#endif
4163
4164/*
4165 * We read /proc/uptime and reuse its second field.
4166 * For the first field, we use the mtime for the reaper for
4167 * the calling pid as returned by getreaperage
4168 */
4169static int proc_uptime_read(char *buf, size_t size, off_t offset,
4170 struct fuse_file_info *fi)
4171{
4172 struct fuse_context *fc = fuse_get_context();
4173 struct file_info *d = (struct file_info *)fi->fh;
0ecddf02 4174 unsigned long int busytime = get_reaper_busy(fc->pid);
237e200e 4175 char *cache = d->buf;
a262ddb7 4176 ssize_t total_len = 0;
0ecddf02 4177 uint64_t idletime, reaperage;
237e200e
SH
4178
4179#if RELOADTEST
4180 iwashere();
4181#endif
4182
4183 if (offset){
237e200e
SH
4184 if (!d->cached)
4185 return 0;
bbdf646b
BM
4186 if (offset > d->size)
4187 return -EINVAL;
237e200e
SH
4188 int left = d->size - offset;
4189 total_len = left > size ? size: left;
4190 memcpy(buf, cache + offset, total_len);
4191 return total_len;
4192 }
4193
0ecddf02
CB
4194 reaperage = get_reaper_age(fc->pid);
4195 /* To understand why this is done, please read the comment to the
4196 * get_reaper_busy() function.
4197 */
4198 idletime = reaperage;
4199 if (reaperage >= busytime)
4200 idletime = reaperage - busytime;
237e200e 4201
bbdf646b
BM
4202 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4203 if (total_len < 0 || total_len >= d->buflen){
0ecddf02 4204 lxcfs_error("%s\n", "failed to write to cache");
237e200e
SH
4205 return 0;
4206 }
4207
4208 d->size = (int)total_len;
4209 d->cached = 1;
4210
4211 if (total_len > size) total_len = size;
4212
4213 memcpy(buf, d->buf, total_len);
4214 return total_len;
4215}
4216
4217static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4218 struct fuse_file_info *fi)
4219{
4220 char dev_name[72];
4221 struct fuse_context *fc = fuse_get_context();
4222 struct file_info *d = (struct file_info *)fi->fh;
4223 char *cg;
4224 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4225 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4226 unsigned long read = 0, write = 0;
4227 unsigned long read_merged = 0, write_merged = 0;
4228 unsigned long read_sectors = 0, write_sectors = 0;
4229 unsigned long read_ticks = 0, write_ticks = 0;
4230 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4231 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4232 char *cache = d->buf;
4233 size_t cache_size = d->buflen;
4234 char *line = NULL;
4235 size_t linelen = 0, total_len = 0, rv = 0;
4236 unsigned int major = 0, minor = 0;
4237 int i = 0;
4238 FILE *f = NULL;
4239
4240 if (offset){
4241 if (offset > d->size)
4242 return -EINVAL;
4243 if (!d->cached)
4244 return 0;
4245 int left = d->size - offset;
4246 total_len = left > size ? size: left;
4247 memcpy(buf, cache + offset, total_len);
4248 return total_len;
4249 }
4250
4251 pid_t initpid = lookup_initpid_in_store(fc->pid);
4252 if (initpid <= 0)
4253 initpid = fc->pid;
4254 cg = get_pid_cgroup(initpid, "blkio");
4255 if (!cg)
4256 return read_file("/proc/diskstats", buf, size, d);
6d2f6996 4257 prune_init_slice(cg);
237e200e 4258
2209fe50 4259 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
237e200e 4260 goto err;
2209fe50 4261 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
237e200e 4262 goto err;
2209fe50 4263 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
237e200e 4264 goto err;
2209fe50 4265 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
237e200e 4266 goto err;
2209fe50 4267 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
237e200e
SH
4268 goto err;
4269
4270
4271 f = fopen("/proc/diskstats", "r");
4272 if (!f)
4273 goto err;
4274
4275 while (getline(&line, &linelen, f) != -1) {
a262ddb7 4276 ssize_t l;
2209fe50 4277 char lbuf[256];
237e200e
SH
4278
4279 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2209fe50 4280 if (i != 3)
237e200e 4281 continue;
2209fe50
SH
4282
4283 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4284 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4285 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4286 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4287 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4288 read_sectors = read_sectors/512;
4289 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4290 write_sectors = write_sectors/512;
4291
4292 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4293 rd_svctm = rd_svctm/1000000;
4294 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4295 rd_wait = rd_wait/1000000;
4296 read_ticks = rd_svctm + rd_wait;
4297
4298 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4299 wr_svctm = wr_svctm/1000000;
4300 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4301 wr_wait = wr_wait/1000000;
4302 write_ticks = wr_svctm + wr_wait;
4303
4304 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4305 tot_ticks = tot_ticks/1000000;
237e200e
SH
4306
4307 memset(lbuf, 0, 256);
2db31eb6
SH
4308 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4309 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4310 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4311 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4312 else
4313 continue;
237e200e 4314
2209fe50 4315 l = snprintf(cache, cache_size, "%s", lbuf);
237e200e
SH
4316 if (l < 0) {
4317 perror("Error writing to fuse buf");
4318 rv = 0;
4319 goto err;
4320 }
4321 if (l >= cache_size) {
b8defc3d 4322 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
237e200e
SH
4323 rv = 0;
4324 goto err;
4325 }
4326 cache += l;
4327 cache_size -= l;
4328 total_len += l;
4329 }
4330
4331 d->cached = 1;
4332 d->size = total_len;
4333 if (total_len > size ) total_len = size;
4334 memcpy(buf, d->buf, total_len);
4335
4336 rv = total_len;
4337err:
4338 free(cg);
4339 if (f)
4340 fclose(f);
4341 free(line);
4342 free(io_serviced_str);
4343 free(io_merged_str);
4344 free(io_service_bytes_str);
4345 free(io_wait_time_str);
4346 free(io_service_time_str);
4347 return rv;
4348}
4349
70dcc12e
SH
4350static int proc_swaps_read(char *buf, size_t size, off_t offset,
4351 struct fuse_file_info *fi)
4352{
4353 struct fuse_context *fc = fuse_get_context();
4354 struct file_info *d = (struct file_info *)fi->fh;
4355 char *cg = NULL;
018246ff 4356 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
70dcc12e 4357 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
a262ddb7
CB
4358 ssize_t total_len = 0, rv = 0;
4359 ssize_t l = 0;
70dcc12e
SH
4360 char *cache = d->buf;
4361
4362 if (offset) {
4363 if (offset > d->size)
4364 return -EINVAL;
4365 if (!d->cached)
4366 return 0;
4367 int left = d->size - offset;
4368 total_len = left > size ? size: left;
4369 memcpy(buf, cache + offset, total_len);
4370 return total_len;
4371 }
4372
4373 pid_t initpid = lookup_initpid_in_store(fc->pid);
4374 if (initpid <= 0)
4375 initpid = fc->pid;
4376 cg = get_pid_cgroup(initpid, "memory");
4377 if (!cg)
4378 return read_file("/proc/swaps", buf, size, d);
6d2f6996 4379 prune_init_slice(cg);
70dcc12e 4380
018246ff 4381 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
70dcc12e
SH
4382
4383 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4384 goto err;
4385
70dcc12e
SH
4386 memusage = strtoul(memusage_str, NULL, 10);
4387
4388 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4389 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4390
018246ff 4391 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
70dcc12e
SH
4392 memswusage = strtoul(memswusage_str, NULL, 10);
4393
70dcc12e
SH
4394 swap_total = (memswlimit - memlimit) / 1024;
4395 swap_free = (memswusage - memusage) / 1024;
4396 }
4397
4398 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4399
4400 /* When no mem + swap limit is specified or swapaccount=0*/
4401 if (!memswlimit) {
4402 char *line = NULL;
4403 size_t linelen = 0;
4404 FILE *f = fopen("/proc/meminfo", "r");
4405
4406 if (!f)
4407 goto err;
4408
4409 while (getline(&line, &linelen, f) != -1) {
4410 if (startswith(line, "SwapTotal:")) {
4411 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4412 } else if (startswith(line, "SwapFree:")) {
4413 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4414 }
4415 }
4416
4417 free(line);
4418 fclose(f);
4419 }
4420
4421 if (swap_total > 0) {
a262ddb7
CB
4422 l = snprintf(d->buf + total_len, d->size - total_len,
4423 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4424 swap_total, swap_free);
4425 total_len += l;
70dcc12e
SH
4426 }
4427
a262ddb7 4428 if (total_len < 0 || l < 0) {
70dcc12e
SH
4429 perror("Error writing to cache");
4430 rv = 0;
4431 goto err;
4432 }
4433
4434 d->cached = 1;
4435 d->size = (int)total_len;
4436
4437 if (total_len > size) total_len = size;
4438 memcpy(buf, d->buf, total_len);
4439 rv = total_len;
4440
4441err:
4442 free(cg);
4443 free(memswlimit_str);
4444 free(memlimit_str);
4445 free(memusage_str);
4446 free(memswusage_str);
70dcc12e
SH
4447 return rv;
4448}
6db4f7a3 4449/*
4450 * Find the process pid from cgroup path.
4451 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
4452 * @pid_buf : put pid to pid_buf.
4453 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
4454 * @depth : the depth of cgroup in container.
4455 * @sum : return the number of pid.
4456 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
4457 */
4458static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4459{
4460 DIR *dir;
4461 int fd;
4462 struct dirent *file;
4463 FILE *f = NULL;
4464 size_t linelen = 0;
4465 char *line = NULL;
4466 int pd;
4467 char *path_dir, *path;
4468 char **pid;
4469
4470 /* path = dpath + "/cgroup.procs" + /0 */
4471 do {
4472 path = malloc(strlen(dpath) + 20);
4473 } while (!path);
4474
4475 strcpy(path, dpath);
4476 fd = openat(cfd, path, O_RDONLY);
4477 if (fd < 0)
4478 goto out;
4479
4480 dir = fdopendir(fd);
4481 if (dir == NULL) {
4482 close(fd);
4483 goto out;
4484 }
4485
4486 while (((file = readdir(dir)) != NULL) && depth > 0) {
4487 if (strncmp(file->d_name, ".", 1) == 0)
4488 continue;
4489 if (strncmp(file->d_name, "..", 1) == 0)
4490 continue;
4491 if (file->d_type == DT_DIR) {
4492 /* path + '/' + d_name +/0 */
4493 do {
4494 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4495 } while (!path_dir);
4496 strcpy(path_dir, path);
4497 strcat(path_dir, "/");
4498 strcat(path_dir, file->d_name);
4499 pd = depth - 1;
4500 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4501 free(path_dir);
4502 }
4503 }
4504 closedir(dir);
4505
4506 strcat(path, "/cgroup.procs");
4507 fd = openat(cfd, path, O_RDONLY);
4508 if (fd < 0)
4509 goto out;
4510
4511 f = fdopen(fd, "r");
4512 if (!f) {
4513 close(fd);
4514 goto out;
4515 }
4516
4517 while (getline(&line, &linelen, f) != -1) {
4518 do {
4519 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4520 } while (!pid);
4521 *pid_buf = pid;
4522 do {
4523 *(*pid_buf + sum) = malloc(strlen(line) + 1);
4524 } while (*(*pid_buf + sum) == NULL);
4525 strcpy(*(*pid_buf + sum), line);
4526 sum++;
4527 }
4528 fclose(f);
4529out:
832904c1
JS
4530 if (line)
4531 free(line);
6db4f7a3 4532 free(path);
4533 return sum;
4534}
4535/*
4536 * calc_load calculates the load according to the following formula:
4537 * load1 = load0 * exp + active * (1 - exp)
4538 *
4539 * @load1: the new loadavg.
4540 * @load0: the former loadavg.
4541 * @active: the total number of running pid at this moment.
4542 * @exp: the fixed-point defined in the beginning.
4543 */
4544static unsigned long
4545calc_load(unsigned long load, unsigned long exp, unsigned long active)
4546{
4547 unsigned long newload;
4548
4549 active = active > 0 ? active * FIXED_1 : 0;
4550 newload = load * exp + active * (FIXED_1 - exp);
4551 if (active >= load)
4552 newload += FIXED_1 - 1;
4553
4554 return newload / FIXED_1;
4555}
4556
4557/*
4558 * Return 0 means that container p->cg is closed.
4559 * Return -1 means that error occurred in refresh.
4560 * Positive num equals the total number of pid.
4561 */
4562static int refresh_load(struct load_node *p, char *path)
4563{
4564 FILE *f = NULL;
4565 char **idbuf;
4566 char proc_path[256];
4567 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4568 char *line = NULL;
4569 size_t linelen = 0;
4570 int sum, length;
4571 DIR *dp;
4572 struct dirent *file;
4573
4574 do {
4575 idbuf = malloc(sizeof(char *));
4576 } while (!idbuf);
4577 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4578 /* normal exit */
4579 if (sum == 0)
4580 goto out;
4581
4582 for (i = 0; i < sum; i++) {
4583 /*clean up '\n' */
4584 length = strlen(idbuf[i])-1;
4585 idbuf[i][length] = '\0';
4586 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4587 if (ret < 0 || ret > 255) {
4588 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4589 i = sum;
4590 sum = -1;
4591 goto err_out;
4592 }
4593
4594 dp = opendir(proc_path);
4595 if (!dp) {
4596 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4597 continue;
4598 }
4599 while ((file = readdir(dp)) != NULL) {
4600 if (strncmp(file->d_name, ".", 1) == 0)
4601 continue;
4602 if (strncmp(file->d_name, "..", 1) == 0)
4603 continue;
4604 total_pid++;
4605 /* We make the biggest pid become last_pid.*/
4606 ret = atof(file->d_name);
4607 last_pid = (ret > last_pid) ? ret : last_pid;
4608
4609 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4610 if (ret < 0 || ret > 255) {
4611 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4612 i = sum;
4613 sum = -1;
4614 closedir(dp);
4615 goto err_out;
4616 }
4617 f = fopen(proc_path, "r");
4618 if (f != NULL) {
4619 while (getline(&line, &linelen, f) != -1) {
4620 /* Find State */
4621 if ((line[0] == 'S') && (line[1] == 't'))
4622 break;
4623 }
4624 if ((line[7] == 'R') || (line[7] == 'D'))
4625 run_pid++;
4626 fclose(f);
4627 }
4628 }
4629 closedir(dp);
4630 }
4631 /*Calculate the loadavg.*/
4632 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4633 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4634 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4635 p->run_pid = run_pid;
4636 p->total_pid = total_pid;
4637 p->last_pid = last_pid;
4638
4639 free(line);
4640err_out:
4641 for (; i > 0; i--)
4642 free(idbuf[i-1]);
4643out:
4644 free(idbuf);
4645 return sum;
4646}
4647/*
4648 * Traverse the hash table and update it.
4649 */
4650void *load_begin(void *arg)
4651{
4652
4653 char *path = NULL;
4654 int i, sum, length, ret;
4655 struct load_node *f;
4656 int first_node;
4657 clock_t time1, time2;
4658
4659 while (1) {
a83618e2
JS
4660 if (loadavg_stop == 1)
4661 return NULL;
4662
6db4f7a3 4663 time1 = clock();
4664 for (i = 0; i < LOAD_SIZE; i++) {
4665 pthread_mutex_lock(&load_hash[i].lock);
4666 if (load_hash[i].next == NULL) {
4667 pthread_mutex_unlock(&load_hash[i].lock);
4668 continue;
4669 }
4670 f = load_hash[i].next;
4671 first_node = 1;
4672 while (f) {
4673 length = strlen(f->cg) + 2;
4674 do {
4675 /* strlen(f->cg) + '.' or '' + \0 */
4676 path = malloc(length);
4677 } while (!path);
4678
4679 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4680 if (ret < 0 || ret > length - 1) {
4681 /* snprintf failed, ignore the node.*/
4682 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4683 goto out;
4684 }
4685 sum = refresh_load(f, path);
4686 if (sum == 0) {
4687 f = del_node(f, i);
4688 } else {
4689out: f = f->next;
4690 }
4691 free(path);
4692 /* load_hash[i].lock locks only on the first node.*/
4693 if (first_node == 1) {
4694 first_node = 0;
4695 pthread_mutex_unlock(&load_hash[i].lock);
4696 }
4697 }
4698 }
a83618e2
JS
4699
4700 if (loadavg_stop == 1)
4701 return NULL;
4702
6db4f7a3 4703 time2 = clock();
4704 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4705 }
4706}
4707
4708static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4709 struct fuse_file_info *fi)
4710{
4711 struct fuse_context *fc = fuse_get_context();
4712 struct file_info *d = (struct file_info *)fi->fh;
4713 pid_t initpid;
4714 char *cg;
4715 size_t total_len = 0;
4716 char *cache = d->buf;
4717 struct load_node *n;
4718 int hash;
01d88ede 4719 int cfd, rv = 0;
6db4f7a3 4720 unsigned long a, b, c;
4721
4722 if (offset) {
4723 if (offset > d->size)
4724 return -EINVAL;
4725 if (!d->cached)
4726 return 0;
4727 int left = d->size - offset;
4728 total_len = left > size ? size : left;
4729 memcpy(buf, cache + offset, total_len);
4730 return total_len;
4731 }
4732 if (!loadavg)
4733 return read_file("/proc/loadavg", buf, size, d);
4734
4735 initpid = lookup_initpid_in_store(fc->pid);
4736 if (initpid <= 0)
4737 initpid = fc->pid;
4738 cg = get_pid_cgroup(initpid, "cpu");
4739 if (!cg)
4740 return read_file("/proc/loadavg", buf, size, d);
4741
4742 prune_init_slice(cg);
4743 hash = calc_hash(cg);
4744 n = locate_node(cg, hash);
4745
4746 /* First time */
4747 if (n == NULL) {
4748 if (!find_mounted_controller("cpu", &cfd)) {
4749 /*
4750 * In locate_node() above, pthread_rwlock_unlock() isn't used
4751 * because delete is not allowed before read has ended.
4752 */
4753 pthread_rwlock_unlock(&load_hash[hash].rdlock);
01d88ede
JS
4754 rv = 0;
4755 goto err;
6db4f7a3 4756 }
4757 do {
4758 n = malloc(sizeof(struct load_node));
4759 } while (!n);
4760
4761 do {
4762 n->cg = malloc(strlen(cg)+1);
4763 } while (!n->cg);
4764 strcpy(n->cg, cg);
4765 n->avenrun[0] = 0;
4766 n->avenrun[1] = 0;
4767 n->avenrun[2] = 0;
4768 n->run_pid = 0;
4769 n->total_pid = 1;
4770 n->last_pid = initpid;
4771 n->cfd = cfd;
4772 insert_node(&n, hash);
4773 }
4774 a = n->avenrun[0] + (FIXED_1/200);
4775 b = n->avenrun[1] + (FIXED_1/200);
4776 c = n->avenrun[2] + (FIXED_1/200);
4777 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4778 LOAD_INT(a), LOAD_FRAC(a),
4779 LOAD_INT(b), LOAD_FRAC(b),
4780 LOAD_INT(c), LOAD_FRAC(c),
4781 n->run_pid, n->total_pid, n->last_pid);
4782 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4783 if (total_len < 0 || total_len >= d->buflen) {
4784 lxcfs_error("%s\n", "Failed to write to cache");
01d88ede
JS
4785 rv = 0;
4786 goto err;
6db4f7a3 4787 }
4788 d->size = (int)total_len;
4789 d->cached = 1;
4790
4791 if (total_len > size)
4792 total_len = size;
4793 memcpy(buf, d->buf, total_len);
01d88ede
JS
4794 rv = total_len;
4795
4796err:
4797 free(cg);
4798 return rv;
6db4f7a3 4799}
4800/* Return a positive number on success, return 0 on failure.*/
4801pthread_t load_daemon(int load_use)
4802{
4803 int ret;
4804 pthread_t pid;
4805
4806 ret = init_load();
4807 if (ret == -1) {
4808 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4809 return 0;
4810 }
4811 ret = pthread_create(&pid, NULL, load_begin, NULL);
4812 if (ret != 0) {
4813 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4814 load_free();
4815 return 0;
4816 }
4817 /* use loadavg, here loadavg = 1*/
4818 loadavg = load_use;
4819 return pid;
4820}
70dcc12e 4821
a83618e2
JS
4822/* Returns 0 on success. */
4823int stop_load_daemon(pthread_t pid)
4824{
4825 int s;
4826
4827 /* Signal the thread to gracefully stop */
4828 loadavg_stop = 1;
4829
4830 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
4831 if (s != 0) {
4832 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4833 return -1;
4834 }
4835
4836 load_free();
4837 loadavg_stop = 0;
4838
4839 return 0;
4840}
4841
237e200e
SH
/*
 * get_procfile_size returns the number of bytes obtained by reading the file
 * @which line by line until getline() fails. procfs files do not report a
 * usable st_size, so the content has to be read to learn its length.
 *
 * Returns 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&linebuf, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
4858
/*
 * proc_getattr fills @sb for the entries of the virtual /proc tree.
 *
 * "/proc" is reported as a read/execute-only directory, each of the
 * virtualised files as a read-only regular file of size 0. Owner and group
 * are 0 and all three timestamps are set to the current time.
 *
 * Returns 0 on success, -EINVAL if the current time cannot be obtained and
 * -ENOENT for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t k;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (k = 0; k < sizeof(regular_files) / sizeof(regular_files[0]); k++) {
		if (strcmp(path, regular_files[k]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4888
4889int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4890 struct fuse_file_info *fi)
4891{
d639f863
CB
4892 if (filler(buf, ".", NULL, 0) != 0 ||
4893 filler(buf, "..", NULL, 0) != 0 ||
4894 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4895 filler(buf, "meminfo", NULL, 0) != 0 ||
4896 filler(buf, "stat", NULL, 0) != 0 ||
4897 filler(buf, "uptime", NULL, 0) != 0 ||
4898 filler(buf, "diskstats", NULL, 0) != 0 ||
46be8eed 4899 filler(buf, "swaps", NULL, 0) != 0 ||
4900 filler(buf, "loadavg", NULL, 0) != 0)
237e200e
SH
4901 return -EINVAL;
4902 return 0;
4903}
4904
4905int proc_open(const char *path, struct fuse_file_info *fi)
4906{
4907 int type = -1;
4908 struct file_info *info;
4909
4910 if (strcmp(path, "/proc/meminfo") == 0)
4911 type = LXC_TYPE_PROC_MEMINFO;
4912 else if (strcmp(path, "/proc/cpuinfo") == 0)
4913 type = LXC_TYPE_PROC_CPUINFO;
4914 else if (strcmp(path, "/proc/uptime") == 0)
4915 type = LXC_TYPE_PROC_UPTIME;
4916 else if (strcmp(path, "/proc/stat") == 0)
4917 type = LXC_TYPE_PROC_STAT;
4918 else if (strcmp(path, "/proc/diskstats") == 0)
4919 type = LXC_TYPE_PROC_DISKSTATS;
70dcc12e
SH
4920 else if (strcmp(path, "/proc/swaps") == 0)
4921 type = LXC_TYPE_PROC_SWAPS;
46be8eed 4922 else if (strcmp(path, "/proc/loadavg") == 0)
4923 type = LXC_TYPE_PROC_LOADAVG;
237e200e
SH
4924 if (type == -1)
4925 return -ENOENT;
4926
4927 info = malloc(sizeof(*info));
4928 if (!info)
4929 return -ENOMEM;
4930
4931 memset(info, 0, sizeof(*info));
4932 info->type = type;
4933
4934 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4935 do {
4936 info->buf = malloc(info->buflen);
4937 } while (!info->buf);
4938 memset(info->buf, 0, info->buflen);
4939 /* set actual size to buffer size */
4940 info->size = info->buflen;
4941
4942 fi->fh = (unsigned long)info;
4943 return 0;
4944}
4945
bddbb106
SH
/*
 * proc_access implements the access check for the virtual /proc tree.
 *
 * "/proc" itself is granted whenever the real /proc is readable by this
 * process, whatever @mask asks for. Otherwise only read access (or a pure
 * existence check) is granted.
 *
 * Returns 0 if access is allowed, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	int is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
4956
237e200e
SH
/*
 * proc_release is the release handler for the virtual /proc files. It hands
 * @fi to do_release_file_info() and always returns 0. @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4962
4963int proc_read(const char *path, char *buf, size_t size, off_t offset,
4964 struct fuse_file_info *fi)
4965{
4966 struct file_info *f = (struct file_info *) fi->fh;
4967
4968 switch (f->type) {
4969 case LXC_TYPE_PROC_MEMINFO:
4970 return proc_meminfo_read(buf, size, offset, fi);
4971 case LXC_TYPE_PROC_CPUINFO:
4972 return proc_cpuinfo_read(buf, size, offset, fi);
4973 case LXC_TYPE_PROC_UPTIME:
4974 return proc_uptime_read(buf, size, offset, fi);
4975 case LXC_TYPE_PROC_STAT:
4976 return proc_stat_read(buf, size, offset, fi);
4977 case LXC_TYPE_PROC_DISKSTATS:
4978 return proc_diskstats_read(buf, size, offset, fi);
70dcc12e
SH
4979 case LXC_TYPE_PROC_SWAPS:
4980 return proc_swaps_read(buf, size, offset, fi);
46be8eed 4981 case LXC_TYPE_PROC_LOADAVG:
4982 return proc_loadavg_read(buf, size, offset, fi);
237e200e
SH
4983 default:
4984 return -EINVAL;
4985 }
4986}
4987
29a73c2f
CB
4988/*
4989 * Functions needed to setup cgroups in the __constructor__.
29a73c2f
CB
4990 */
4991
/*
 * mkdir_p creates @dir and all of its missing parent directories, like
 * "mkdir -p". Each successively longer prefix of @dir is created with
 * @mode; prefixes that already exist are accepted.
 *
 * Returns true on success, false if a prefix could not be duplicated or
 * created.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* dir: start of the next component, tmp: its end. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		/*
		 * For a relative path the first prefix is the empty string;
		 * mkdir("") would fail with ENOENT, so skip empty prefixes.
		 */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
5015
5016static bool umount_if_mounted(void)
5017{
5018 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
b8defc3d 5019 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
29a73c2f
CB
5020 return false;
5021 }
5022 return true;
5023}
5024
2283e240
CB
/*
 * The type of statfs.f_type differs between platforms, so derive it from the
 * structure itself. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Returns true if the filesystem described by @fs has the magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic wanted = (fs_type_magic)magic_val;

	return fs->f_type == wanted;
}
5031
0a4dea41
CB
/*
 * is_on_ramfs reports whether "/" is the kernel's initial rootfs.
 *
 * Looking at fs/proc_namespace.c, it appears we can actually expect the
 * rootfs entry to very specifically contain
 *	" - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Every line of /proc/self/mountinfo whose mount point (fifth field) is "/"
 * is checked for that marker. Returns false if the file cannot be opened.
 */
static bool is_on_ramfs(void)
{
	static const char rootfs_tag[] = "- rootfs rootfs ";
	FILE *mountinfo;
	char *buf = NULL;
	size_t cap = 0;
	bool on_ramfs = false;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo)
		return false;

	while (!on_ramfs && getline(&buf, &cap, mountinfo) != -1) {
		char *field = buf;
		char *field_end, *dash;
		int skipped;

		/* Advance to the blank in front of the fifth field. */
		for (skipped = 0; field && skipped < 4; skipped++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		field_end = strchr(field + 1, ' ');
		if (!field_end)
			continue;
		*field_end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* this is '/'. is it the ramfs? */
		dash = strchr(field_end + 1, '-');
		if (dash && strncmp(dash, rootfs_tag, sizeof(rootfs_tag) - 1) == 0)
			on_ramfs = true;
	}

	free(buf);
	fclose(mountinfo);
	return on_ramfs;
}
cc309f33 5075static int pivot_enter()
0a4dea41 5076{
cc309f33
CB
5077 int ret = -1, oldroot = -1, newroot = -1;
5078
5079 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5080 if (oldroot < 0) {
5081 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5082 return ret;
5083 }
5084
5085 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5086 if (newroot < 0) {
5087 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5088 goto err;
5089 }
5090
5091 /* change into new root fs */
5092 if (fchdir(newroot) < 0) {
5093 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5094 goto err;
5095 }
5096
0a4dea41
CB
5097 /* pivot_root into our new root fs */
5098 if (pivot_root(".", ".") < 0) {
5099 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
cc309f33 5100 goto err;
0a4dea41
CB
5101 }
5102
5103 /*
5104 * At this point the old-root is mounted on top of our new-root.
5105 * To unmounted it we must not be chdir'd into it, so escape back
5106 * to the old-root.
5107 */
5108 if (fchdir(oldroot) < 0) {
5109 lxcfs_error("%s\n", "Failed to enter old root.");
cc309f33 5110 goto err;
0a4dea41
CB
5111 }
5112
5113 if (umount2(".", MNT_DETACH) < 0) {
5114 lxcfs_error("%s\n", "Failed to detach old root.");
cc309f33 5115 goto err;
0a4dea41
CB
5116 }
5117
5118 if (fchdir(newroot) < 0) {
5119 lxcfs_error("%s\n", "Failed to re-enter new root.");
cc309f33 5120 goto err;
0a4dea41
CB
5121 }
5122
cc309f33
CB
5123 ret = 0;
5124
5125err:
5126 if (oldroot > 0)
5127 close(oldroot);
5128 if (newroot > 0)
5129 close(newroot);
5130
5131 return ret;
0a4dea41
CB
5132}
5133
5134static int chroot_enter()
5135{
5136 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5137 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5138 return -1;
5139 }
5140
5141 if (chroot(".") < 0) {
5142 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5143 return -1;
5144 }
5145
5146 if (chdir("/") < 0) {
5147 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5148 return -1;
5149 }
5150
5151 return 0;
5152}
5153
0232cbac 5154static int permute_and_enter(void)
29a73c2f 5155{
0a4dea41
CB
5156 struct statfs sb;
5157
5158 if (statfs("/", &sb) < 0) {
5159 lxcfs_error("%s\n", "Could not stat / mountpoint.");
cc309f33 5160 return -1;
0a4dea41
CB
5161 }
5162
5163 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5164 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5165 * /proc/1/mountinfo. */
5166 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5167 return chroot_enter();
29a73c2f 5168
cc309f33 5169 if (pivot_enter() < 0) {
0a4dea41 5170 lxcfs_error("%s\n", "Could not perform pivot root.");
cc309f33 5171 return -1;
29a73c2f
CB
5172 }
5173
cc309f33 5174 return 0;
29a73c2f
CB
5175}
5176
5177/* Prepare our new clean root. */
0232cbac 5178static int permute_prepare(void)
29a73c2f
CB
5179{
5180 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
b8defc3d 5181 lxcfs_error("%s\n", "Failed to create directory for new root.");
29a73c2f
CB
5182 return -1;
5183 }
5184
5185 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
b8defc3d 5186 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
29a73c2f
CB
5187 return -1;
5188 }
5189
5190 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
b8defc3d 5191 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
29a73c2f
CB
5192 return -1;
5193 }
5194
5195 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
b8defc3d 5196 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
29a73c2f
CB
5197 return -1;
5198 }
5199
5200 return 0;
5201}
5202
0232cbac
CB
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 *
 * Returns true once the new root has been prepared and entered, false if
 * either step fails. The new root is only entered if preparing it succeeded.
 */
static bool permute_root(void)
{
	/* Prepare new root, then pivot into it. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
5216
a257a8ee
CB
/*
 * preserve_mnt_ns opens /proc/<pid>/ns/mnt read-only with O_CLOEXEC so that
 * the mount namespace of @pid can be referred to later through the file
 * descriptor.
 *
 * Returns the file descriptor, or -1 if the path cannot be built or opened.
 */
static int preserve_mnt_ns(int pid)
{
	/* "/proc/" + up to 21 digits + "/ns/mnt"; both sizeof count a '\0'. */
	char path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
	if (written >= 0 && (size_t)written < sizeof(path))
		return open(path, O_RDONLY | O_CLOEXEC);

	return -1;
}
5229
0a4dea41 5230static bool cgfs_prepare_mounts(void)
29a73c2f
CB
5231{
5232 if (!mkdir_p(BASEDIR, 0700)) {
b8defc3d 5233 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
29a73c2f
CB
5234 return false;
5235 }
480262c9 5236
29a73c2f 5237 if (!umount_if_mounted()) {
b8defc3d 5238 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
480262c9
CB
5239 return false;
5240 }
5241
5242 if (unshare(CLONE_NEWNS) < 0) {
b8defc3d 5243 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
480262c9
CB
5244 return false;
5245 }
5246
a257a8ee
CB
5247 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5248 if (cgroup_mount_ns_fd < 0) {
5249 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5250 return false;
5251 }
5252
480262c9 5253 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
b8defc3d 5254 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
29a73c2f
CB
5255 return false;
5256 }
480262c9 5257
29a73c2f 5258 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
b8defc3d 5259 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
29a73c2f
CB
5260 return false;
5261 }
480262c9 5262
29a73c2f
CB
5263 return true;
5264}
5265
0a4dea41 5266static bool cgfs_mount_hierarchies(void)
29a73c2f
CB
5267{
5268 char *target;
5269 size_t clen, len;
5270 int i, ret;
5271
5272 for (i = 0; i < num_hierarchies; i++) {
5273 char *controller = hierarchies[i];
51c7ca35 5274
29a73c2f
CB
5275 clen = strlen(controller);
5276 len = strlen(BASEDIR) + clen + 2;
5277 target = malloc(len);
5278 if (!target)
5279 return false;
51c7ca35 5280
29a73c2f
CB
5281 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5282 if (ret < 0 || ret >= len) {
5283 free(target);
5284 return false;
5285 }
5286 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5287 free(target);
5288 return false;
5289 }
51c7ca35
CB
5290 if (!strcmp(controller, "unified"))
5291 ret = mount("none", target, "cgroup2", 0, NULL);
5292 else
5293 ret = mount(controller, target, "cgroup", 0, controller);
5294 if (ret < 0) {
5295 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
29a73c2f
CB
5296 free(target);
5297 return false;
5298 }
5299
5300 fd_hierarchies[i] = open(target, O_DIRECTORY);
5301 if (fd_hierarchies[i] < 0) {
5302 free(target);
5303 return false;
5304 }
5305 free(target);
5306 }
5307 return true;
5308}
5309
/*
 * cgfs_setup_controllers runs the three setup stages in order: prepare the
 * private mount point, mount the cgroup hierarchies, and switch the root.
 * A later stage only runs if the previous one succeeded.
 *
 * Returns true if every stage succeeded.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (cgfs_mount_hierarchies())
		return permute_root();

	lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
	return false;
}
5325
5326static void __attribute__((constructor)) collect_and_mount_subsystems(void)
237e200e
SH
5327{
5328 FILE *f;
e58dab00
CB
5329 char *cret, *line = NULL;
5330 char cwd[MAXPATHLEN];
237e200e 5331 size_t len = 0;
480262c9 5332 int i, init_ns = -1;
51c7ca35 5333 bool found_unified = false;
237e200e
SH
5334
5335 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
b8defc3d 5336 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
237e200e
SH
5337 return;
5338 }
e58dab00 5339
237e200e 5340 while (getline(&line, &len, f) != -1) {
51c7ca35 5341 char *idx, *p, *p2;
237e200e
SH
5342
5343 p = strchr(line, ':');
5344 if (!p)
5345 goto out;
51c7ca35 5346 idx = line;
237e200e
SH
5347 *(p++) = '\0';
5348
5349 p2 = strrchr(p, ':');
5350 if (!p2)
5351 goto out;
5352 *p2 = '\0';
5353
a67719f6
CB
5354 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5355 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5356 * because it parses out the empty string "" and later on passes
5357 * it to mount(). Let's skip such entries.
5358 */
51c7ca35
CB
5359 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5360 found_unified = true;
5361 p = "unified";
5362 }
a67719f6 5363
237e200e
SH
5364 if (!store_hierarchy(line, p))
5365 goto out;
5366 }
5367
480262c9 5368 /* Preserve initial namespace. */
a257a8ee 5369 init_ns = preserve_mnt_ns(getpid());
b8defc3d
CB
5370 if (init_ns < 0) {
5371 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
480262c9 5372 goto out;
b8defc3d 5373 }
480262c9 5374
92c3ee11 5375 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
b8defc3d
CB
5376 if (!fd_hierarchies) {
5377 lxcfs_error("%s\n", strerror(errno));
29a73c2f 5378 goto out;
b8defc3d 5379 }
29a73c2f 5380
480262c9
CB
5381 for (i = 0; i < num_hierarchies; i++)
5382 fd_hierarchies[i] = -1;
5383
e58dab00
CB
5384 cret = getcwd(cwd, MAXPATHLEN);
5385 if (!cret)
5386 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5387
480262c9
CB
5388 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5389 * to privately mount lxcfs cgroups. */
b8defc3d
CB
5390 if (!cgfs_setup_controllers()) {
5391 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
29a73c2f 5392 goto out;
b8defc3d 5393 }
480262c9 5394
b8defc3d
CB
5395 if (setns(init_ns, 0) < 0) {
5396 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
29a73c2f 5397 goto out;
b8defc3d 5398 }
29a73c2f 5399
e58dab00
CB
5400 if (!cret || chdir(cwd) < 0)
5401 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5402
237e200e
SH
5403 print_subsystems();
5404
5405out:
5406 free(line);
5407 fclose(f);
480262c9
CB
5408 if (init_ns >= 0)
5409 close(init_ns);
237e200e
SH
5410}
5411
5412static void __attribute__((destructor)) free_subsystems(void)
5413{
5414 int i;
5415
b8defc3d
CB
5416 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5417
29a73c2f 5418 for (i = 0; i < num_hierarchies; i++) {
237e200e
SH
5419 if (hierarchies[i])
5420 free(hierarchies[i]);
480262c9 5421 if (fd_hierarchies && fd_hierarchies[i] >= 0)
29a73c2f
CB
5422 close(fd_hierarchies[i]);
5423 }
237e200e 5424 free(hierarchies);
480262c9 5425 free(fd_hierarchies);
a257a8ee
CB
5426
5427 if (cgroup_mount_ns_fd >= 0)
5428 close(cgroup_mount_ns_fd);
237e200e 5429}