bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdarg.h>
  21 #include <stdbool.h>
  22 #include <stdint.h>
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <time.h>
  27 #include <unistd.h>
  28 #include <wait.h>
  29 #include <linux/magic.h>
  30 #include <linux/sched.h>
  31 #include <sys/epoll.h>
  32 #include <sys/mman.h>
  33 #include <sys/mount.h>
  34 #include <sys/param.h>
  35 #include <sys/socket.h>
  36 #include <sys/syscall.h>
  37 #include <sys/sysinfo.h>
  38 #include <sys/vfs.h>
  39
  40 #include "bindings.h"
  41 #include "cgroups/cgroup.h"
  42 #include "cgroups/cgroup_utils.h"
  43 #include "memory_utils.h"
  44 #include "config.h"
  45
  46 /* Define pivot_root() if missing from the C library */
  47 #ifndef HAVE_PIVOT_ROOT
  48 static int pivot_root(const char * new_root, const char * put_old)
  49 {
  50 #ifdef __NR_pivot_root
  51 return syscall(__NR_pivot_root, new_root, put_old);
  52 #else
  53 errno = ENOSYS;
  54 return -1;
  55 #endif
  56 }
  57 #else
  58 extern int pivot_root(const char * new_root, const char * put_old);
  59 #endif
  60
  61 struct cpuacct_usage {
  62         uint64_t user;
  63         uint64_t system;
  64         uint64_t idle;
  65         bool online;
  66 };
  67
  68 /* The function of hash table.*/
  69 #define LOAD_SIZE 100 /*the size of hash_table */
  70 #define FLUSH_TIME 5  /*the flush rate */
  71 #define DEPTH_DIR 3   /*the depth of per cgroup */
  72 /* The function of calculate loadavg .*/
  73 #define FSHIFT          11              /* nr of bits of precision */
  74 #define FIXED_1         (1<<FSHIFT)     /* 1.0 as fixed-point */
  75 #define EXP_1           1884            /* 1/exp(5sec/1min) as fixed-point */
  76 #define EXP_5           2014            /* 1/exp(5sec/5min) */
  77 #define EXP_15          2037            /* 1/exp(5sec/15min) */
  78 #define LOAD_INT(x) ((x) >> FSHIFT)
  79 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
  80 /*
  81  * This parameter is used for proc_loadavg_read().
  82  * 1 means use loadavg, 0 means not use.
  83  */
  84 static int loadavg = 0;
  85 static volatile sig_atomic_t loadavg_stop = 0;
  86 static int calc_hash(const char *name)
  87 {
  88         unsigned int hash = 0;
  89         unsigned int x = 0;
  90         /* ELFHash algorithm. */
  91         while (*name) {
  92                 hash = (hash << 4) + *name++;
  93                 x = hash & 0xf0000000;
  94                 if (x != 0)
  95                         hash ^= (x >> 24);
  96                 hash &= ~x;
  97         }
  98         return (hash & 0x7fffffff);
  99 }
 100
 101 struct load_node {
 102         char *cg;  /*cg */
 103         unsigned long avenrun[3];               /* Load averages */
 104         unsigned int run_pid;
 105         unsigned int total_pid;
 106         unsigned int last_pid;
 107         int cfd; /* The file descriptor of the mounted cgroup */
 108         struct  load_node *next;
 109         struct  load_node **pre;
 110 };
 111
 112 struct load_head {
 113         /*
 114          * The lock is about insert load_node and refresh load_node.To the first
 115          * load_node of each hash bucket, insert and refresh in this hash bucket is
 116          * mutually exclusive.
 117          */
 118         pthread_mutex_t lock;
 119         /*
 120          * The rdlock is about read loadavg and delete load_node.To each hash
 121          * bucket, read and delete is mutually exclusive. But at the same time, we
 122          * allow paratactic read operation. This rdlock is at list level.
 123          */
 124         pthread_rwlock_t rdlock;
 125         /*
 126          * The rilock is about read loadavg and insert load_node.To the first
 127          * load_node of each hash bucket, read and insert is mutually exclusive.
 128          * But at the same time, we allow paratactic read operation.
 129          */
 130         pthread_rwlock_t rilock;
 131         struct load_node *next;
 132 };
 133
 134 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 135 /*
 136  * init_load initialize the hash table.
 137  * Return 0 on success, return -1 on failure.
 138  */
 139 static int init_load(void)
 140 {
 141         int i;
 142         int ret;
 143
 144         for (i = 0; i < LOAD_SIZE; i++) {
 145                 load_hash[i].next = NULL;
 146                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 147                 if (ret != 0) {
 148                         lxcfs_error("%s\n", "Failed to initialize lock");
 149                         goto out3;
 150                 }
 151                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 152                 if (ret != 0) {
 153                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 154                         goto out2;
 155                 }
 156                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 157                 if (ret != 0) {
 158                         lxcfs_error("%s\n", "Failed to initialize rilock");
 159                         goto out1;
 160                 }
 161         }
 162         return 0;
 163 out1:
 164         pthread_rwlock_destroy(&load_hash[i].rdlock);
 165 out2:
 166         pthread_mutex_destroy(&load_hash[i].lock);
 167 out3:
 168         while (i > 0) {
 169                 i--;
 170                 pthread_mutex_destroy(&load_hash[i].lock);
 171                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 172                 pthread_rwlock_destroy(&load_hash[i].rilock);
 173         }
 174         return -1;
 175 }
 176
 177 static void insert_node(struct load_node **n, int locate)
 178 {
 179         struct load_node *f;
 180
 181         pthread_mutex_lock(&load_hash[locate].lock);
 182         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 183         f = load_hash[locate].next;
 184         load_hash[locate].next = *n;
 185
 186         (*n)->pre = &(load_hash[locate].next);
 187         if (f)
 188                 f->pre = &((*n)->next);
 189         (*n)->next = f;
 190         pthread_mutex_unlock(&load_hash[locate].lock);
 191         pthread_rwlock_unlock(&load_hash[locate].rilock);
 192 }
 193 /*
 194  * locate_node() finds special node. Not return NULL means success.
 195  * It should be noted that rdlock isn't unlocked at the end of code
 196  * because this function is used to read special node. Delete is not
 197  * allowed before read has ended.
 198  * unlock rdlock only in proc_loadavg_read().
 199  */
 200 static struct load_node *locate_node(char *cg, int locate)
 201 {
 202         struct load_node *f = NULL;
 203         int i = 0;
 204
 205         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 206         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 207         if (load_hash[locate].next == NULL) {
 208                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 209                 return f;
 210         }
 211         f = load_hash[locate].next;
 212         pthread_rwlock_unlock(&load_hash[locate].rilock);
 213         while (f && ((i = strcmp(f->cg, cg)) != 0))
 214                 f = f->next;
 215         return f;
 216 }
 217 /* Delete the load_node n and return the next node of it. */
 218 static struct load_node *del_node(struct load_node *n, int locate)
 219 {
 220         struct load_node *g;
 221
 222         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 223         if (n->next == NULL) {
 224                 *(n->pre) = NULL;
 225         } else {
 226                 *(n->pre) = n->next;
 227                 n->next->pre = n->pre;
 228         }
 229         g = n->next;
 230         free(n->cg);
 231         free(n);
 232         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 233         return g;
 234 }
 235
 236 static void load_free(void)
 237 {
 238         int i;
 239         struct load_node *f, *p;
 240
 241         for (i = 0; i < LOAD_SIZE; i++) {
 242                 pthread_mutex_lock(&load_hash[i].lock);
 243                 pthread_rwlock_wrlock(&load_hash[i].rilock);
 244                 pthread_rwlock_wrlock(&load_hash[i].rdlock);
 245                 if (load_hash[i].next == NULL) {
 246                         pthread_mutex_unlock(&load_hash[i].lock);
 247                         pthread_mutex_destroy(&load_hash[i].lock);
 248                         pthread_rwlock_unlock(&load_hash[i].rilock);
 249                         pthread_rwlock_destroy(&load_hash[i].rilock);
 250                         pthread_rwlock_unlock(&load_hash[i].rdlock);
 251                         pthread_rwlock_destroy(&load_hash[i].rdlock);
 252                         continue;
 253                 }
 254                 for (f = load_hash[i].next; f; ) {
 255                         free(f->cg);
 256                         p = f->next;
 257                         free(f);
 258                         f = p;
 259                 }
 260                 pthread_mutex_unlock(&load_hash[i].lock);
 261                 pthread_mutex_destroy(&load_hash[i].lock);
 262                 pthread_rwlock_unlock(&load_hash[i].rilock);
 263                 pthread_rwlock_destroy(&load_hash[i].rilock);
 264                 pthread_rwlock_unlock(&load_hash[i].rdlock);
 265                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 266         }
 267 }
 268
 269 /* Data for CPU view */
 270 struct cg_proc_stat {
 271         char *cg;
 272         struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
 273         struct cpuacct_usage *view; // Usage stats reported to the container
 274         int cpu_count;
 275         pthread_mutex_t lock; // For node manipulation
 276         struct cg_proc_stat *next;
 277 };
 278
 279 struct cg_proc_stat_head {
 280         struct cg_proc_stat *next;
 281         time_t lastcheck;
 282
 283         /*
 284          * For access to the list. Reading can be parallel, pruning is exclusive.
 285          */
 286         pthread_rwlock_t lock;
 287 };
 288
 289 #define CPUVIEW_HASH_SIZE 100
 290 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
 291
 292 static bool cpuview_init_head(struct cg_proc_stat_head **head)
 293 {
 294         *head = malloc(sizeof(struct cg_proc_stat_head));
 295         if (!(*head)) {
 296                 lxcfs_error("%s\n", strerror(errno));
 297                 return false;
 298         }
 299
 300         (*head)->lastcheck = time(NULL);
 301         (*head)->next = NULL;
 302
 303         if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
 304                 lxcfs_error("%s\n", "Failed to initialize list lock");
 305                 free(*head);
 306                 return false;
 307         }
 308
 309         return true;
 310 }
 311
 312 static bool init_cpuview()
 313 {
 314         int i;
 315
 316         for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
 317                 proc_stat_history[i] = NULL;
 318
 319         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 320                 if (!cpuview_init_head(&proc_stat_history[i]))
 321                         goto err;
 322         }
 323
 324         return true;
 325
 326 err:
 327         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 328                 if (proc_stat_history[i]) {
 329                         free(proc_stat_history[i]);
 330                         proc_stat_history[i] = NULL;
 331                 }
 332         }
 333
 334         return false;
 335 }
 336
 337 static void free_proc_stat_node(struct cg_proc_stat *node)
 338 {
 339         pthread_mutex_destroy(&node->lock);
 340         free(node->cg);
 341         free(node->usage);
 342         free(node->view);
 343         free(node);
 344 }
 345
 346 static void cpuview_free_head(struct cg_proc_stat_head *head)
 347 {
 348         struct cg_proc_stat *node, *tmp;
 349
 350         if (head->next) {
 351                 node = head->next;
 352
 353                 for (;;) {
 354                         tmp = node;
 355                         node = node->next;
 356                         free_proc_stat_node(tmp);
 357
 358                         if (!node)
 359                                 break;
 360                 }
 361         }
 362
 363         pthread_rwlock_destroy(&head->lock);
 364         free(head);
 365 }
 366
 367 static void free_cpuview()
 368 {
 369         int i;
 370
 371         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 372                 if (proc_stat_history[i])
 373                         cpuview_free_head(proc_stat_history[i]);
 374         }
 375 }
 376
 377 /*
 378  * A table caching which pid is init for a pid namespace.
 379  * When looking up which pid is init for $qpid, we first
 380  * 1. Stat /proc/$qpid/ns/pid.
 381  * 2. Check whether the ino_t is in our store.
 382  *   a. if not, fork a child in qpid's ns to send us
 383  *       ucred.pid = 1, and read the initpid.  Cache
 384  *       initpid and creation time for /proc/initpid
 385  *       in a new store entry.
 386  *   b. if so, verify that /proc/initpid still matches
 387  *       what we have saved.  If not, clear the store
 388  *       entry and go back to a.  If so, return the
 389  *       cached initpid.
 390  */
 391 struct pidns_init_store {
 392         ino_t ino;          // inode number for /proc/$pid/ns/pid
 393         pid_t initpid;      // the pid of nit in that ns
 394         long int ctime;     // the time at which /proc/$initpid was created
 395         struct pidns_init_store *next;
 396         long int lastcheck;
 397 };
 398
 399 /* lol - look at how they are allocated in the kernel */
 400 #define PIDNS_HASH_SIZE 4096
 401 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 402
 403 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 404 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 405 static void lock_mutex(pthread_mutex_t *l)
 406 {
 407         int ret;
 408
 409         if ((ret = pthread_mutex_lock(l)) != 0) {
 410                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 411                 exit(1);
 412         }
 413 }
 414
 415 static struct cgroup_ops *cgroup_ops;
 416
 417 static int cgroup_mount_ns_fd = -1;
 418
 419 static void unlock_mutex(pthread_mutex_t *l)
 420 {
 421         int ret;
 422
 423         if ((ret = pthread_mutex_unlock(l)) != 0) {
 424                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 425                 exit(1);
 426         }
 427 }
 428
 429 static void store_lock(void)
 430 {
 431         lock_mutex(&pidns_store_mutex);
 432 }
 433
 434 static void store_unlock(void)
 435 {
 436         unlock_mutex(&pidns_store_mutex);
 437 }
 438
 439 /* Must be called under store_lock */
 440 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 441 {
 442         struct stat initsb;
 443         char fnam[100];
 444
 445         snprintf(fnam, 100, "/proc/%d", e->initpid);
 446         if (stat(fnam, &initsb) < 0)
 447                 return false;
 448
 449         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 450                     initsb.st_ctime, e->initpid);
 451
 452         if (e->ctime != initsb.st_ctime)
 453                 return false;
 454         return true;
 455 }
 456
 457 /* Must be called under store_lock */
 458 static void remove_initpid(struct pidns_init_store *e)
 459 {
 460         struct pidns_init_store *tmp;
 461         int h;
 462
 463         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 464
 465         h = HASH(e->ino);
 466         if (pidns_hash_table[h] == e) {
 467                 pidns_hash_table[h] = e->next;
 468                 free(e);
 469                 return;
 470         }
 471
 472         tmp = pidns_hash_table[h];
 473         while (tmp) {
 474                 if (tmp->next == e) {
 475                         tmp->next = e->next;
 476                         free(e);
 477                         return;
 478                 }
 479                 tmp = tmp->next;
 480         }
 481 }
 482
 483 #define PURGE_SECS 5
 484 /* Must be called under store_lock */
 485 static void prune_initpid_store(void)
 486 {
 487         static long int last_prune = 0;
 488         struct pidns_init_store *e, *prev, *delme;
 489         long int now, threshold;
 490         int i;
 491
 492         if (!last_prune) {
 493                 last_prune = time(NULL);
 494                 return;
 495         }
 496         now = time(NULL);
 497         if (now < last_prune + PURGE_SECS)
 498                 return;
 499
 500         lxcfs_debug("%s\n", "Pruning.");
 501
 502         last_prune = now;
 503         threshold = now - 2 * PURGE_SECS;
 504
 505         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 506                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 507                         if (e->lastcheck < threshold) {
 508
 509                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 510
 511                                 delme = e;
 512                                 if (prev)
 513                                         prev->next = e->next;
 514                                 else
 515                                         pidns_hash_table[i] = e->next;
 516                                 e = e->next;
 517                                 free(delme);
 518                         } else {
 519                                 prev = e;
 520                                 e = e->next;
 521                         }
 522                 }
 523         }
 524 }
 525
 526 /* Must be called under store_lock */
 527 static void save_initpid(struct stat *sb, pid_t pid)
 528 {
 529         struct pidns_init_store *e;
 530         char fpath[100];
 531         struct stat procsb;
 532         int h;
 533
 534         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 535
 536         snprintf(fpath, 100, "/proc/%d", pid);
 537         if (stat(fpath, &procsb) < 0)
 538                 return;
 539         do {
 540                 e = malloc(sizeof(*e));
 541         } while (!e);
 542         e->ino = sb->st_ino;
 543         e->initpid = pid;
 544         e->ctime = procsb.st_ctime;
 545         h = HASH(e->ino);
 546         e->next = pidns_hash_table[h];
 547         e->lastcheck = time(NULL);
 548         pidns_hash_table[h] = e;
 549 }
 550
 551 /*
 552  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 553  * entry for the inode number and creation time.  Verify that the init pid
 554  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 555  * otherwise.
 556  * Must be called under store_lock
 557  */
 558 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 559 {
 560         int h = HASH(sb->st_ino);
 561         struct pidns_init_store *e = pidns_hash_table[h];
 562
 563         while (e) {
 564                 if (e->ino == sb->st_ino) {
 565                         if (initpid_still_valid(e, sb)) {
 566                                 e->lastcheck = time(NULL);
 567                                 return e;
 568                         }
 569                         remove_initpid(e);
 570                         return NULL;
 571                 }
 572                 e = e->next;
 573         }
 574
 575         return NULL;
 576 }
 577
 578 static int is_dir(const char *path, int fd)
 579 {
 580         struct stat statbuf;
 581         int ret = fstatat(fd, path, &statbuf, fd);
 582         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 583                 return 1;
 584         return 0;
 585 }
 586
 587 static int preserve_ns(const int pid, const char *ns)
 588 {
 589         int ret;
 590 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
 591 #define __NS_PATH_LEN 50
 592         char path[__NS_PATH_LEN];
 593
 594         /* This way we can use this function to also check whether namespaces
 595          * are supported by the kernel by passing in the NULL or the empty
 596          * string.
 597          */
 598         ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
 599                        !ns || strcmp(ns, "") == 0 ? "" : "/",
 600                        !ns || strcmp(ns, "") == 0 ? "" : ns);
 601         if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
 602                 errno = EFBIG;
 603                 return -1;
 604         }
 605
 606         return open(path, O_RDONLY | O_CLOEXEC);
 607 }
 608
 609 /**
 610  * in_same_namespace - Check whether two processes are in the same namespace.
 611  * @pid1 - PID of the first process.
 612  * @pid2 - PID of the second process.
 613  * @ns   - Name of the namespace to check. Must correspond to one of the names
 614  *         for the namespaces as shown in /proc/<pid/ns/
 615  *
 616  * If the two processes are not in the same namespace returns an fd to the
 617  * namespace of the second process identified by @pid2. If the two processes are
 618  * in the same namespace returns -EINVAL, -1 if an error occurred.
 619  */
 620 static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
 621 {
 622         __do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
 623         int ret = -1;
 624         struct stat ns_st1, ns_st2;
 625
 626         ns_fd1 = preserve_ns(pid1, ns);
 627         if (ns_fd1 < 0) {
 628                 /* The kernel does not support this namespace. This is not an
 629                  * error.
 630                  */
 631                 if (errno == ENOENT)
 632                         return -EINVAL;
 633
 634                 return -1;
 635         }
 636
 637         ns_fd2 = preserve_ns(pid2, ns);
 638         if (ns_fd2 < 0)
 639                 return -1;
 640
 641         ret = fstat(ns_fd1, &ns_st1);
 642         if (ret < 0)
 643                 return -1;
 644
 645         ret = fstat(ns_fd2, &ns_st2);
 646         if (ret < 0)
 647                 return -1;
 648
 649         /* processes are in the same namespace */
 650         if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
 651                 return -EINVAL;
 652
 653         /* processes are in different namespaces */
 654         return move_fd(ns_fd2);
 655 }
 656
 657 static bool is_shared_pidns(pid_t pid)
 658 {
 659         if (pid != 1)
 660                 return false;
 661
 662         if (in_same_namespace(pid, getpid(), "pid") == -EINVAL)
 663                 return true;
 664
 665         return false;
 666 }
 667
 668 static bool write_string(const char *fnam, const char *string, int fd)
 669 {
 670         FILE *f;
 671         size_t len, ret;
 672
 673         f = fdopen(fd, "w");
 674         if (!f)
 675                 return false;
 676
 677         len = strlen(string);
 678         ret = fwrite(string, 1, len, f);
 679         if (ret != len) {
 680                 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
 681                             strerror(errno), string, fnam);
 682                 fclose(f);
 683                 return false;
 684         }
 685
 686         if (fclose(f) < 0) {
 687                 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
 688                 return false;
 689         }
 690
 691         return true;
 692 }
 693
 694 struct cgfs_files {
 695         char *name;
 696         uint32_t uid, gid;
 697         uint32_t mode;
 698 };
 699
 700 static void print_subsystems(void)
 701 {
 702         int i = 0;
 703
 704         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 705         fprintf(stderr, "hierarchies:\n");
 706         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
 707                 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
 708                 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
 709         }
 710 }
 711
 712 /* do we need to do any massaging here?  I'm not sure... */
 713 /* Return the mounted controller and store the corresponding open file descriptor
 714  * referring to the controller mountpoint in the private lxcfs namespace in
 715  * @cfd.
 716  */
 717 static int find_mounted_controller(const char *controller)
 718 {
 719         struct hierarchy *h;
 720
 721         h = cgroup_ops->get_hierarchy(cgroup_ops, controller);
 722         return h ? h->fd : -EBADF;
 723 }
 724
 725 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 726                 const char *value)
 727 {
 728         int ret, fd, cfd;
 729         size_t len;
 730         char *fnam;
 731
 732         cfd = find_mounted_controller(controller);
 733         if (cfd < 0)
 734                 return false;
 735
 736         /* Make sure we pass a relative path to *at() family of functions.
 737          * . + /cgroup + / + file + \0
 738          */
 739         len = strlen(cgroup) + strlen(file) + 3;
 740         fnam = alloca(len);
 741         ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
 742         if (ret < 0 || (size_t)ret >= len)
 743                 return false;
 744
 745         fd = openat(cfd, fnam, O_WRONLY);
 746         if (fd < 0)
 747                 return false;
 748
 749         return write_string(fnam, value, fd);
 750 }
 751
 752 // Chown all the files in the cgroup directory.  We do this when we create
 753 // a cgroup on behalf of a user.
 754 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 755 {
 756         struct dirent *direntp;
 757         char path[MAXPATHLEN];
 758         size_t len;
 759         DIR *d;
 760         int fd1, ret;
 761
 762         len = strlen(dirname);
 763         if (len >= MAXPATHLEN) {
 764                 lxcfs_error("Pathname too long: %s\n", dirname);
 765                 return;
 766         }
 767
 768         fd1 = openat(fd, dirname, O_DIRECTORY);
 769         if (fd1 < 0)
 770                 return;
 771
 772         d = fdopendir(fd1);
 773         if (!d) {
 774                 lxcfs_error("Failed to open %s\n", dirname);
 775                 return;
 776         }
 777
 778         while ((direntp = readdir(d))) {
 779                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 780                         continue;
 781                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 782                 if (ret < 0 || ret >= MAXPATHLEN) {
 783                         lxcfs_error("Pathname too long under %s\n", dirname);
 784                         continue;
 785                 }
 786                 if (fchownat(fd, path, uid, gid, 0) < 0)
 787                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 788         }
 789         closedir(d);
 790 }
 791
 792 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 793 {
 794         int cfd;
 795         size_t len;
 796         char *dirnam;
 797
 798         cfd = find_mounted_controller(controller);
 799         if (cfd < 0)
 800                 return -EINVAL;
 801
 802         /* Make sure we pass a relative path to *at() family of functions.
 803          * . + /cg + \0
 804          */
 805         len = strlen(cg) + 2;
 806         dirnam = alloca(len);
 807         snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
 808
 809         if (mkdirat(cfd, dirnam, 0755) < 0)
 810                 return -errno;
 811
 812         if (uid == 0 && gid == 0)
 813                 return 0;
 814
 815         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 816                 return -errno;
 817
 818         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 819
 820         return 0;
 821 }
 822
 823 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 824 {
 825         struct dirent *direntp;
 826         DIR *dir;
 827         bool ret = false;
 828         char pathname[MAXPATHLEN];
 829         int dupfd;
 830
 831         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 832         if (dupfd < 0)
 833                 return false;
 834
 835         dir = fdopendir(dupfd);
 836         if (!dir) {
 837                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 838                 close(dupfd);
 839                 return false;
 840         }
 841
 842         while ((direntp = readdir(dir))) {
 843                 struct stat mystat;
 844                 int rc;
 845
 846                 if (!strcmp(direntp->d_name, ".") ||
 847                     !strcmp(direntp->d_name, ".."))
 848                         continue;
 849
 850                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 851                 if (rc < 0 || rc >= MAXPATHLEN) {
 852                         lxcfs_error("%s\n", "Pathname too long.");
 853                         continue;
 854                 }
 855
 856                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 857                 if (rc) {
 858                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 859                         continue;
 860                 }
 861                 if (S_ISDIR(mystat.st_mode))
 862                         if (!recursive_rmdir(pathname, fd, cfd))
 863                                 lxcfs_debug("Error removing %s.\n", pathname);
 864         }
 865
 866         ret = true;
 867         if (closedir(dir) < 0) {
 868                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 869                 ret = false;
 870         }
 871
 872         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 873                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 874                 ret = false;
 875         }
 876
 877         close(dupfd);
 878
 879         return ret;
 880 }
 881
 882 bool cgfs_remove(const char *controller, const char *cg)
 883 {
 884         int fd, cfd;
 885         size_t len;
 886         char *dirnam;
 887         bool bret;
 888
 889         cfd = find_mounted_controller(controller);
 890         if (cfd < 0)
 891                 return false;
 892
 893         /* Make sure we pass a relative path to *at() family of functions.
 894          * . +  /cg + \0
 895          */
 896         len = strlen(cg) + 2;
 897         dirnam = alloca(len);
 898         snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
 899
 900         fd = openat(cfd, dirnam, O_DIRECTORY);
 901         if (fd < 0)
 902                 return false;
 903
 904         bret = recursive_rmdir(dirnam, fd, cfd);
 905         close(fd);
 906         return bret;
 907 }
 908
 909 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 910 {
 911         int cfd;
 912         size_t len;
 913         char *pathname;
 914
 915         cfd = find_mounted_controller(controller);
 916         if (cfd < 0)
 917                 return false;
 918
 919         /* Make sure we pass a relative path to *at() family of functions.
 920          * . + /file + \0
 921          */
 922         len = strlen(file) + 2;
 923         pathname = alloca(len);
 924         snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
 925         if (fchmodat(cfd, pathname, mode, 0) < 0)
 926                 return false;
 927         return true;
 928 }
 929
 930 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 931 {
 932         size_t len;
 933         char *fname;
 934
 935         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 936         fname = alloca(len);
 937         snprintf(fname, len, "%s/tasks", dirname);
 938         if (fchownat(fd, fname, uid, gid, 0) != 0)
 939                 return -errno;
 940         snprintf(fname, len, "%s/cgroup.procs", dirname);
 941         if (fchownat(fd, fname, uid, gid, 0) != 0)
 942                 return -errno;
 943         return 0;
 944 }
 945
 946 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 947 {
 948         int cfd;
 949         size_t len;
 950         char *pathname;
 951
 952         cfd = find_mounted_controller(controller);
 953         if (cfd < 0)
 954                 return false;
 955
 956         /* Make sure we pass a relative path to *at() family of functions.
 957          * . + /file + \0
 958          */
 959         len = strlen(file) + 2;
 960         pathname = alloca(len);
 961         snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
 962         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 963                 return -errno;
 964
 965         if (is_dir(pathname, cfd))
 966                 // like cgmanager did, we want to chown the tasks file as well
 967                 return chown_tasks_files(pathname, uid, gid, cfd);
 968
 969         return 0;
 970 }
 971
 972 FILE *open_pids_file(const char *controller, const char *cgroup)
 973 {
 974         int fd, cfd;
 975         size_t len;
 976         char *pathname;
 977
 978         cfd = find_mounted_controller(controller);
 979         if (cfd < 0)
 980                 return false;
 981
 982         /* Make sure we pass a relative path to *at() family of functions.
 983          * . + /cgroup + / "cgroup.procs" + \0
 984          */
 985         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 986         pathname = alloca(len);
 987         snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
 988
 989         fd = openat(cfd, pathname, O_WRONLY);
 990         if (fd < 0)
 991                 return NULL;
 992
 993         return fdopen(fd, "w");
 994 }
 995
 996 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 997                                 void ***list, size_t typesize,
 998                                 void* (*iterator)(const char*, const char*, const char*))
 999 {
1000         int cfd, fd, ret;
1001         size_t len;
1002         char *cg;
1003         char pathname[MAXPATHLEN];
1004         size_t sz = 0, asz = 0;
1005         struct dirent *dirent;
1006         DIR *dir;
1007
1008         cfd = find_mounted_controller(controller);
1009         *list = NULL;
1010         if (cfd < 0)
1011                 return false;
1012
1013         /* Make sure we pass a relative path to *at() family of functions. */
1014         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1015         cg = alloca(len);
1016         ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
1017         if (ret < 0 || (size_t)ret >= len) {
1018                 lxcfs_error("Pathname too long under %s\n", cgroup);
1019                 return false;
1020         }
1021
1022         fd = openat(cfd, cg, O_DIRECTORY);
1023         if (fd < 0)
1024                 return false;
1025
1026         dir = fdopendir(fd);
1027         if (!dir)
1028                 return false;
1029
1030         while ((dirent = readdir(dir))) {
1031                 struct stat mystat;
1032
1033                 if (!strcmp(dirent->d_name, ".") ||
1034                     !strcmp(dirent->d_name, ".."))
1035                         continue;
1036
1037                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1038                 if (ret < 0 || ret >= MAXPATHLEN) {
1039                         lxcfs_error("Pathname too long under %s\n", cg);
1040                         continue;
1041                 }
1042
1043                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1044                 if (ret) {
1045                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1046                         continue;
1047                 }
1048                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1049                     (directories && !S_ISDIR(mystat.st_mode)))
1050                         continue;
1051
1052                 if (sz+2 >= asz) {
1053                         void **tmp;
1054                         asz += BATCH_SIZE;
1055                         do {
1056                                 tmp = realloc(*list, asz * typesize);
1057                         } while  (!tmp);
1058                         *list = tmp;
1059                 }
1060                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1061                 (*list)[sz+1] = NULL;
1062                 sz++;
1063         }
1064         if (closedir(dir) < 0) {
1065                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1066                 return false;
1067         }
1068         return true;
1069 }
1070
1071 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1072 {
1073         char *dup;
1074         do {
1075                 dup = strdup(dir_entry);
1076         } while (!dup);
1077         return dup;
1078 }
1079
1080 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1081 {
1082         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1083 }
1084
1085 void free_key(struct cgfs_files *k)
1086 {
1087         if (!k)
1088                 return;
1089         free(k->name);
1090         free(k);
1091 }
1092
1093 void free_keys(struct cgfs_files **keys)
1094 {
1095         int i;
1096
1097         if (!keys)
1098                 return;
1099         for (i = 0; keys[i]; i++) {
1100                 free_key(keys[i]);
1101         }
1102         free(keys);
1103 }
1104
1105 bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1106 {
1107         int ret, cfd;
1108         size_t len;
1109         char *fnam;
1110
1111         cfd = find_mounted_controller(controller);
1112         if (cfd < 0)
1113                 return false;
1114
1115         /* Make sure we pass a relative path to *at() family of functions.
1116          * . + /cgroup + / + file + \0
1117          */
1118         len = strlen(cgroup) + strlen(file) + 3;
1119         fnam = alloca(len);
1120         ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
1121         if (ret < 0 || (size_t)ret >= len)
1122                 return false;
1123
1124         return (faccessat(cfd, fnam, F_OK, 0) == 0);
1125 }
1126
1127 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1128 {
1129         int ret, cfd;
1130         size_t len;
1131         char *fnam;
1132         struct stat sb;
1133         struct cgfs_files *newkey;
1134
1135         cfd = find_mounted_controller(controller);
1136         if (cfd < 0)
1137                 return false;
1138
1139         if (file && *file == '/')
1140                 file++;
1141
1142         if (file && strchr(file, '/'))
1143                 return NULL;
1144
1145         /* Make sure we pass a relative path to *at() family of functions.
1146          * . + /cgroup + / + file + \0
1147          */
1148         len = strlen(cgroup) + 3;
1149         if (file)
1150                 len += strlen(file) + 1;
1151         fnam = alloca(len);
1152         snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
1153                  file ? "/" : "", file ? file : "");
1154
1155         ret = fstatat(cfd, fnam, &sb, 0);
1156         if (ret < 0)
1157                 return NULL;
1158
1159         do {
1160                 newkey = malloc(sizeof(struct cgfs_files));
1161         } while (!newkey);
1162         if (file)
1163                 newkey->name = must_copy_string(file);
1164         else if (strrchr(cgroup, '/'))
1165                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1166         else
1167                 newkey->name = must_copy_string(cgroup);
1168         newkey->uid = sb.st_uid;
1169         newkey->gid = sb.st_gid;
1170         newkey->mode = sb.st_mode;
1171
1172         return newkey;
1173 }
1174
1175 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1176 {
1177         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1178         if (!entry) {
1179                 lxcfs_error("Error getting files under %s:%s\n", controller,
1180                              cgroup);
1181         }
1182         return entry;
1183 }
1184
1185 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1186 {
1187         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1188 }
1189
1190 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1191 {
1192         int cfd;
1193         size_t len;
1194         char *fnam;
1195         int ret;
1196         struct stat sb;
1197
1198         cfd = find_mounted_controller(controller);
1199         if (cfd < 0)
1200                 return false;
1201
1202         /* Make sure we pass a relative path to *at() family of functions.
1203          * . + /cgroup + / + f + \0
1204          */
1205         len = strlen(cgroup) + strlen(f) + 3;
1206         fnam = alloca(len);
1207         ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
1208         if (ret < 0 || (size_t)ret >= len)
1209                 return false;
1210
1211         ret = fstatat(cfd, fnam, &sb, 0);
1212         if (ret < 0 || !S_ISDIR(sb.st_mode))
1213                 return false;
1214
1215         return true;
1216 }
1217
1218 #define SEND_CREDS_OK 0
1219 #define SEND_CREDS_NOTSK 1
1220 #define SEND_CREDS_FAIL 2
1221 static bool recv_creds(int sock, struct ucred *cred, char *v);
1222 static int wait_for_pid(pid_t pid);
1223 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1224 static int send_creds_clone_wrapper(void *arg);
1225
1226 /*
1227  * clone a task which switches to @task's namespace and writes '1'.
1228  * over a unix sock so we can read the task's reaper's pid in our
1229  * namespace
1230  *
1231  * Note: glibc's fork() does not respect pidns, which can lead to failed
1232  * assertions inside glibc (and thus failed forks) if the child's pid in
1233  * the pidns and the parent pid outside are identical. Using clone prevents
1234  * this issue.
1235  */
1236 static void write_task_init_pid_exit(int sock, pid_t target)
1237 {
1238         char fnam[100];
1239         pid_t pid;
1240         int fd, ret;
1241         size_t stack_size = sysconf(_SC_PAGESIZE);
1242         void *stack = alloca(stack_size);
1243
1244         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1245         if (ret < 0 || ret >= sizeof(fnam))
1246                 _exit(1);
1247
1248         fd = open(fnam, O_RDONLY);
1249         if (fd < 0) {
1250                 perror("write_task_init_pid_exit open of ns/pid");
1251                 _exit(1);
1252         }
1253         if (setns(fd, 0)) {
1254                 perror("write_task_init_pid_exit setns 1");
1255                 close(fd);
1256                 _exit(1);
1257         }
1258         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1259         if (pid < 0)
1260                 _exit(1);
1261         if (pid != 0) {
1262                 if (!wait_for_pid(pid))
1263                         _exit(1);
1264                 _exit(0);
1265         }
1266 }
1267
1268 static int send_creds_clone_wrapper(void *arg) {
1269         struct ucred cred;
1270         char v;
1271         int sock = *(int *)arg;
1272
1273         /* we are the child */
1274         cred.uid = 0;
1275         cred.gid = 0;
1276         cred.pid = 1;
1277         v = '1';
1278         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1279                 return 1;
1280         return 0;
1281 }
1282
1283 static pid_t get_init_pid_for_task(pid_t task)
1284 {
1285         int sock[2];
1286         pid_t pid;
1287         pid_t ret = -1;
1288         char v = '0';
1289         struct ucred cred;
1290
1291         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1292                 perror("socketpair");
1293                 return -1;
1294         }
1295
1296         pid = fork();
1297         if (pid < 0)
1298                 goto out;
1299         if (!pid) {
1300                 close(sock[1]);
1301                 write_task_init_pid_exit(sock[0], task);
1302                 _exit(0);
1303         }
1304
1305         if (!recv_creds(sock[1], &cred, &v))
1306                 goto out;
1307         ret = cred.pid;
1308
1309 out:
1310         close(sock[0]);
1311         close(sock[1]);
1312         if (pid > 0)
1313                 wait_for_pid(pid);
1314         return ret;
1315 }
1316
1317 pid_t lookup_initpid_in_store(pid_t qpid)
1318 {
1319         pid_t answer = 0;
1320         struct stat sb;
1321         struct pidns_init_store *e;
1322         char fnam[100];
1323
1324         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1325         store_lock();
1326         if (stat(fnam, &sb) < 0)
1327                 goto out;
1328         e = lookup_verify_initpid(&sb);
1329         if (e) {
1330                 answer = e->initpid;
1331                 goto out;
1332         }
1333         answer = get_init_pid_for_task(qpid);
1334         if (answer > 0)
1335                 save_initpid(&sb, answer);
1336
1337 out:
1338         /* we prune at end in case we are returning
1339          * the value we were about to return */
1340         prune_initpid_store();
1341         store_unlock();
1342         return answer;
1343 }
1344
1345 static int wait_for_pid(pid_t pid)
1346 {
1347         int status, ret;
1348
1349         if (pid <= 0)
1350                 return -1;
1351
1352 again:
1353         ret = waitpid(pid, &status, 0);
1354         if (ret == -1) {
1355                 if (errno == EINTR)
1356                         goto again;
1357                 return -1;
1358         }
1359         if (ret != pid)
1360                 goto again;
1361         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1362                 return -1;
1363         return 0;
1364 }
1365
1366 /*
1367  * append the given formatted string to *src.
1368  * src: a pointer to a char* in which to append the formatted string.
1369  * sz: the number of characters printed so far, minus trailing \0.
1370  * asz: the allocated size so far
1371  * format: string format. See printf for details.
1372  * ...: varargs. See printf for details.
1373  */
1374 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1375 {
1376         char tmp[BUF_RESERVE_SIZE];
1377         va_list         args;
1378
1379         va_start (args, format);
1380         int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1381         va_end(args);
1382
1383         if (!*src || tmplen + *sz + 1 >= *asz) {
1384                 char *tmp;
1385                 do {
1386                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1387                 } while (!tmp);
1388                 *src = tmp;
1389                 *asz += BUF_RESERVE_SIZE;
1390         }
1391         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1392         *sz += tmplen;
1393 }
1394
1395 /*
1396  * append pid to *src.
1397  * src: a pointer to a char* in which ot append the pid.
1398  * sz: the number of characters printed so far, minus trailing \0.
1399  * asz: the allocated size so far
1400  * pid: the pid to append
1401  */
1402 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1403 {
1404         must_strcat(src, sz, asz, "%d\n", (int)pid);
1405 }
1406
1407 /*
1408  * Given a open file * to /proc/pid/{u,g}id_map, and an id
1409  * valid in the caller's namespace, return the id mapped into
1410  * pid's namespace.
1411  * Returns the mapped id, or -1 on error.
1412  */
1413 unsigned int
1414 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1415 {
1416         unsigned int nsuid,   // base id for a range in the idfile's namespace
1417                      hostuid, // base id for a range in the caller's namespace
1418                      count;   // number of ids in this range
1419         char line[400];
1420         int ret;
1421
1422         fseek(idfile, 0L, SEEK_SET);
1423         while (fgets(line, 400, idfile)) {
1424                 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1425                 if (ret != 3)
1426                         continue;
1427                 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1428                         /*
1429                          * uids wrapped around - unexpected as this is a procfile,
1430                          * so just bail.
1431                          */
1432                         lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1433                                 nsuid, hostuid, count, line);
1434                         return -1;
1435                 }
1436                 if (hostuid <= in_id && hostuid+count > in_id) {
1437                         /*
1438                          * now since hostuid <= in_id < hostuid+count, and
1439                          * hostuid+count and nsuid+count do not wrap around,
1440                          * we know that nsuid+(in_id-hostuid) which must be
1441                          * less that nsuid+(count) must not wrap around
1442                          */
1443                         return (in_id - hostuid) + nsuid;
1444                 }
1445         }
1446
1447         // no answer found
1448         return -1;
1449 }
1450
1451 /*
1452  * for is_privileged_over,
1453  * specify whether we require the calling uid to be root in his
1454  * namespace
1455  */
1456 #define NS_ROOT_REQD true
1457 #define NS_ROOT_OPT false
1458
1459 #define PROCLEN 100
1460
1461 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1462 {
1463         char fpath[PROCLEN];
1464         int ret;
1465         bool answer = false;
1466         uid_t nsuid;
1467
1468         if (victim == -1 || uid == -1)
1469                 return false;
1470
1471         /*
1472          * If the request is one not requiring root in the namespace,
1473          * then having the same uid suffices.  (i.e. uid 1000 has write
1474          * access to files owned by uid 1000
1475          */
1476         if (!req_ns_root && uid == victim)
1477                 return true;
1478
1479         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1480         if (ret < 0 || ret >= PROCLEN)
1481                 return false;
1482         FILE *f = fopen(fpath, "r");
1483         if (!f)
1484                 return false;
1485
1486         /* if caller's not root in his namespace, reject */
1487         nsuid = convert_id_to_ns(f, uid);
1488         if (nsuid)
1489                 goto out;
1490
1491         /*
1492          * If victim is not mapped into caller's ns, reject.
1493          * XXX I'm not sure this check is needed given that fuse
1494          * will be sending requests where the vfs has converted
1495          */
1496         nsuid = convert_id_to_ns(f, victim);
1497         if (nsuid == -1)
1498                 goto out;
1499
1500         answer = true;
1501
1502 out:
1503         fclose(f);
1504         return answer;
1505 }
1506
1507 static bool perms_include(int fmode, mode_t req_mode)
1508 {
1509         mode_t r;
1510
1511         switch (req_mode & O_ACCMODE) {
1512         case O_RDONLY:
1513                 r = S_IROTH;
1514                 break;
1515         case O_WRONLY:
1516                 r = S_IWOTH;
1517                 break;
1518         case O_RDWR:
1519                 r = S_IROTH | S_IWOTH;
1520                 break;
1521         default:
1522                 return false;
1523         }
1524         return ((fmode & r) == r);
1525 }
1526
1527
1528 /*
1529  * taskcg is  a/b/c
1530  * querycg is /a/b/c/d/e
1531  * we return 'd'
1532  */
1533 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1534 {
1535         char *start, *end;
1536
1537         if (strlen(taskcg) <= strlen(querycg)) {
1538                 lxcfs_error("%s\n", "I was fed bad input.");
1539                 return NULL;
1540         }
1541
1542         if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1543                 start =  strdup(taskcg + 1);
1544         else
1545                 start = strdup(taskcg + strlen(querycg) + 1);
1546         if (!start)
1547                 return NULL;
1548         end = strchr(start, '/');
1549         if (end)
1550                 *end = '\0';
1551         return start;
1552 }
1553
1554 char *get_pid_cgroup(pid_t pid, const char *contrl)
1555 {
1556         int cfd;
1557
1558         cfd = find_mounted_controller(contrl);
1559         if (cfd < 0)
1560                 return false;
1561
1562         if (pure_unified_layout(cgroup_ops))
1563                 return cg_unified_get_current_cgroup(pid);
1564
1565         return cg_legacy_get_current_cgroup(pid, contrl);
1566 }
1567
1568 /*
1569  * check whether a fuse context may access a cgroup dir or file
1570  *
1571  * If file is not null, it is a cgroup file to check under cg.
1572  * If file is null, then we are checking perms on cg itself.
1573  *
1574  * For files we can check the mode of the list_keys result.
1575  * For cgroups, we must make assumptions based on the files under the
1576  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1577  * yet.
1578  */
1579 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1580 {
1581         struct cgfs_files *k = NULL;
1582         bool ret = false;
1583
1584         k = cgfs_get_key(contrl, cg, file);
1585         if (!k)
1586                 return false;
1587
1588         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1589                 if (perms_include(k->mode >> 6, mode)) {
1590                         ret = true;
1591                         goto out;
1592                 }
1593         }
1594         if (fc->gid == k->gid) {
1595                 if (perms_include(k->mode >> 3, mode)) {
1596                         ret = true;
1597                         goto out;
1598                 }
1599         }
1600         ret = perms_include(k->mode, mode);
1601
1602 out:
1603         free_key(k);
1604         return ret;
1605 }
1606
1607 #define INITSCOPE "/init.scope"
1608 void prune_init_slice(char *cg)
1609 {
1610         char *point;
1611         size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1612
1613         if (cg_len < initscope_len)
1614                 return;
1615
1616         point = cg + cg_len - initscope_len;
1617         if (strcmp(point, INITSCOPE) == 0) {
1618                 if (point == cg)
1619                         *(point+1) = '\0';
1620                 else
1621                         *point = '\0';
1622         }
1623 }
1624
1625 /*
1626  * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1627  * If pid is in /a, he may act on /a/b, but not on /b.
1628  * if the answer is false and nextcg is not NULL, then *nextcg will point
1629  * to a string containing the next cgroup directory under cg, which must be
1630  * freed by the caller.
1631  */
1632 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1633 {
1634         bool answer = false;
1635         char *c2 = get_pid_cgroup(pid, contrl);
1636         char *linecmp;
1637
1638         if (!c2)
1639                 return false;
1640         prune_init_slice(c2);
1641
1642         /*
1643          * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1644          * they pass in a cgroup without leading '/'
1645          *
1646          * The original line here was:
1647          *      linecmp = *cg == '/' ? c2 : c2+1;
1648          * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1649          *       Serge, do you know?
1650          */
1651         if (*cg == '/' || !strncmp(cg, "./", 2))
1652                 linecmp = c2;
1653         else
1654                 linecmp = c2 + 1;
1655         if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1656                 if (nextcg) {
1657                         *nextcg = get_next_cgroup_dir(linecmp, cg);
1658                 }
1659                 goto out;
1660         }
1661         answer = true;
1662
1663 out:
1664         free(c2);
1665         return answer;
1666 }
1667
1668 /*
1669  * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1670  */
1671 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1672 {
1673         bool answer = false;
1674         char *c2, *task_cg;
1675         size_t target_len, task_len;
1676
1677         if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1678                 return true;
1679
1680         c2 = get_pid_cgroup(pid, contrl);
1681         if (!c2)
1682                 return false;
1683         prune_init_slice(c2);
1684
1685         task_cg = c2 + 1;
1686         target_len = strlen(cg);
1687         task_len = strlen(task_cg);
1688         if (task_len == 0) {
1689                 /* Task is in the root cg, it can see everything. This case is
1690                  * not handled by the strmcps below, since they test for the
1691                  * last /, but that is the first / that we've chopped off
1692                  * above.
1693                  */
1694                 answer = true;
1695                 goto out;
1696         }
1697         if (strcmp(cg, task_cg) == 0) {
1698                 answer = true;
1699                 goto out;
1700         }
1701         if (target_len < task_len) {
1702                 /* looking up a parent dir */
1703                 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1704                         answer = true;
1705                 goto out;
1706         }
1707         if (target_len > task_len) {
1708                 /* looking up a child dir */
1709                 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1710                         answer = true;
1711                 goto out;
1712         }
1713
1714 out:
1715         free(c2);
1716         return answer;
1717 }
1718
1719 /*
1720  * given /cgroup/freezer/a/b, return "freezer".
1721  * the returned char* should NOT be freed.
1722  */
1723 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1724 {
1725         const char *p1;
1726         char *contr, *slash;
1727
1728         if (strlen(path) < 9) {
1729                 errno = EACCES;
1730                 return NULL;
1731         }
1732         if (*(path + 7) != '/') {
1733                 errno = EINVAL;
1734                 return NULL;
1735         }
1736         p1 = path + 8;
1737         contr = strdupa(p1);
1738         if (!contr) {
1739                 errno = ENOMEM;
1740                 return NULL;
1741         }
1742         slash = strstr(contr, "/");
1743         if (slash)
1744                 *slash = '\0';
1745
1746         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1747                 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1748                         return (*h)->__controllers;
1749         }
1750         errno = ENOENT;
1751         return NULL;
1752 }
1753
1754 /*
1755  * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1756  * Note that the returned value may include files (keynames) etc
1757  */
1758 static const char *find_cgroup_in_path(const char *path)
1759 {
1760         const char *p1;
1761
1762         if (strlen(path) < 9) {
1763                 errno = EACCES;
1764                 return NULL;
1765         }
1766         p1 = strstr(path + 8, "/");
1767         if (!p1) {
1768                 errno = EINVAL;
1769                 return NULL;
1770         }
1771         errno = 0;
1772         return p1 + 1;
1773 }
1774
1775 /*
1776  * split the last path element from the path in @cg.
1777  * @dir is newly allocated and should be freed, @last not
1778 */
1779 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1780 {
1781         char *p;
1782
1783         do {
1784                 *dir = strdup(cg);
1785         } while (!*dir);
1786         *last = strrchr(cg, '/');
1787         if (!*last) {
1788                 *last = NULL;
1789                 return;
1790         }
1791         p = strrchr(*dir, '/');
1792         *p = '\0';
1793 }
1794
1795 /*
1796  * FUSE ops for /cgroup
1797  */
1798
1799 int cg_getattr(const char *path, struct stat *sb)
1800 {
1801         struct timespec now;
1802         struct fuse_context *fc = fuse_get_context();
1803         char * cgdir = NULL;
1804         char *last = NULL, *path1, *path2;
1805         struct cgfs_files *k = NULL;
1806         const char *cgroup;
1807         const char *controller = NULL;
1808         int ret = -ENOENT;
1809
1810
1811         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1812                 return -EIO;
1813
1814         memset(sb, 0, sizeof(struct stat));
1815
1816         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1817                 return -EINVAL;
1818
1819         sb->st_uid = sb->st_gid = 0;
1820         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1821         sb->st_size = 0;
1822
1823         if (strcmp(path, "/cgroup") == 0) {
1824                 sb->st_mode = S_IFDIR | 00755;
1825                 sb->st_nlink = 2;
1826                 return 0;
1827         }
1828
1829         controller = pick_controller_from_path(fc, path);
1830         if (!controller)
1831                 return -errno;
1832         cgroup = find_cgroup_in_path(path);
1833         if (!cgroup) {
1834                 /* this is just /cgroup/controller, return it as a dir */
1835                 sb->st_mode = S_IFDIR | 00755;
1836                 sb->st_nlink = 2;
1837                 return 0;
1838         }
1839
1840         get_cgdir_and_path(cgroup, &cgdir, &last);
1841
1842         if (!last) {
1843                 path1 = "/";
1844                 path2 = cgdir;
1845         } else {
1846                 path1 = cgdir;
1847                 path2 = last;
1848         }
1849
1850         pid_t initpid = lookup_initpid_in_store(fc->pid);
1851         if (initpid <= 1 || is_shared_pidns(initpid))
1852                 initpid = fc->pid;
1853         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1854          * Then check that caller's cgroup is under path if last is a child
1855          * cgroup, or cgdir if last is a file */
1856
1857         if (is_child_cgroup(controller, path1, path2)) {
1858                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1859                         ret = -ENOENT;
1860                         goto out;
1861                 }
1862                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1863                         /* this is just /cgroup/controller, return it as a dir */
1864                         sb->st_mode = S_IFDIR | 00555;
1865                         sb->st_nlink = 2;
1866                         ret = 0;
1867                         goto out;
1868                 }
1869                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1870                         ret = -EACCES;
1871                         goto out;
1872                 }
1873
1874                 // get uid, gid, from '/tasks' file and make up a mode
1875                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1876                 sb->st_mode = S_IFDIR | 00755;
1877                 k = cgfs_get_key(controller, cgroup, NULL);
1878                 if (!k) {
1879                         sb->st_uid = sb->st_gid = 0;
1880                 } else {
1881                         sb->st_uid = k->uid;
1882                         sb->st_gid = k->gid;
1883                 }
1884                 free_key(k);
1885                 sb->st_nlink = 2;
1886                 ret = 0;
1887                 goto out;
1888         }
1889
1890         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1891                 sb->st_mode = S_IFREG | k->mode;
1892                 sb->st_nlink = 1;
1893                 sb->st_uid = k->uid;
1894                 sb->st_gid = k->gid;
1895                 sb->st_size = 0;
1896                 free_key(k);
1897                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1898                         ret = -ENOENT;
1899                         goto out;
1900                 }
1901                 ret = 0;
1902         }
1903
1904 out:
1905         free(cgdir);
1906         return ret;
1907 }
1908
1909 int cg_opendir(const char *path, struct fuse_file_info *fi)
1910 {
1911         struct fuse_context *fc = fuse_get_context();
1912         const char *cgroup;
1913         struct file_info *dir_info;
1914         char *controller = NULL;
1915
1916         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1917                 return -EIO;
1918
1919         if (strcmp(path, "/cgroup") == 0) {
1920                 cgroup = NULL;
1921                 controller = NULL;
1922         } else {
1923                 // return list of keys for the controller, and list of child cgroups
1924                 controller = pick_controller_from_path(fc, path);
1925                 if (!controller)
1926                         return -errno;
1927
1928                 cgroup = find_cgroup_in_path(path);
1929                 if (!cgroup) {
1930                         /* this is just /cgroup/controller, return its contents */
1931                         cgroup = "/";
1932                 }
1933         }
1934
1935         pid_t initpid = lookup_initpid_in_store(fc->pid);
1936         if (initpid <= 1 || is_shared_pidns(initpid))
1937                 initpid = fc->pid;
1938         if (cgroup) {
1939                 if (!caller_may_see_dir(initpid, controller, cgroup))
1940                         return -ENOENT;
1941                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1942                         return -EACCES;
1943         }
1944
1945         /* we'll free this at cg_releasedir */
1946         dir_info = malloc(sizeof(*dir_info));
1947         if (!dir_info)
1948                 return -ENOMEM;
1949         dir_info->controller = must_copy_string(controller);
1950         dir_info->cgroup = must_copy_string(cgroup);
1951         dir_info->type = LXC_TYPE_CGDIR;
1952         dir_info->buf = NULL;
1953         dir_info->file = NULL;
1954         dir_info->buflen = 0;
1955
1956         fi->fh = (unsigned long)dir_info;
1957         return 0;
1958 }
1959
1960 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1961                 struct fuse_file_info *fi)
1962 {
1963         struct file_info *d = (struct file_info *)fi->fh;
1964         struct cgfs_files **list = NULL;
1965         int i, ret;
1966         char *nextcg = NULL;
1967         struct fuse_context *fc = fuse_get_context();
1968         char **clist = NULL;
1969
1970         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1971                 return -EIO;
1972
1973         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1974                 return -EIO;
1975
1976         if (d->type != LXC_TYPE_CGDIR) {
1977                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1978                 return -EIO;
1979         }
1980         if (!d->cgroup && !d->controller) {
1981                 /*
1982                  * ls /var/lib/lxcfs/cgroup - just show list of controllers.
1983                  * This only works with the legacy hierarchy.
1984                  */
1985                 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1986                         if (is_unified_hierarchy(*h))
1987                                 continue;
1988
1989                         if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
1990                                 return -EIO;
1991                 }
1992
1993                 return 0;
1994         }
1995
1996         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1997                 // not a valid cgroup
1998                 ret = -EINVAL;
1999                 goto out;
2000         }
2001
2002         pid_t initpid = lookup_initpid_in_store(fc->pid);
2003         if (initpid <= 1 || is_shared_pidns(initpid))
2004                 initpid = fc->pid;
2005         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2006                 if (nextcg) {
2007                         ret = filler(buf, nextcg,  NULL, 0);
2008                         free(nextcg);
2009                         if (ret != 0) {
2010                                 ret = -EIO;
2011                                 goto out;
2012                         }
2013                 }
2014                 ret = 0;
2015                 goto out;
2016         }
2017
2018         for (i = 0; list && list[i]; i++) {
2019                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2020                         ret = -EIO;
2021                         goto out;
2022                 }
2023         }
2024
2025         // now get the list of child cgroups
2026
2027         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2028                 ret = 0;
2029                 goto out;
2030         }
2031         if (clist) {
2032                 for (i = 0; clist[i]; i++) {
2033                         if (filler(buf, clist[i], NULL, 0) != 0) {
2034                                 ret = -EIO;
2035                                 goto out;
2036                         }
2037                 }
2038         }
2039         ret = 0;
2040
2041 out:
2042         free_keys(list);
2043         if (clist) {
2044                 for (i = 0; clist[i]; i++)
2045                         free(clist[i]);
2046                 free(clist);
2047         }
2048         return ret;
2049 }
2050
2051 void do_release_file_info(struct fuse_file_info *fi)
2052 {
2053         struct file_info *f = (struct file_info *)fi->fh;
2054
2055         if (!f)
2056                 return;
2057
2058         fi->fh = 0;
2059
2060         free(f->controller);
2061         f->controller = NULL;
2062         free(f->cgroup);
2063         f->cgroup = NULL;
2064         free(f->file);
2065         f->file = NULL;
2066         free(f->buf);
2067         f->buf = NULL;
2068         free(f);
2069         f = NULL;
2070 }
2071
2072 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2073 {
2074         do_release_file_info(fi);
2075         return 0;
2076 }
2077
2078 int cg_open(const char *path, struct fuse_file_info *fi)
2079 {
2080         const char *cgroup;
2081         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2082         struct cgfs_files *k = NULL;
2083         struct file_info *file_info;
2084         struct fuse_context *fc = fuse_get_context();
2085         int ret;
2086
2087         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2088                 return -EIO;
2089
2090         controller = pick_controller_from_path(fc, path);
2091         if (!controller)
2092                 return -errno;
2093         cgroup = find_cgroup_in_path(path);
2094         if (!cgroup)
2095                 return -errno;
2096
2097         get_cgdir_and_path(cgroup, &cgdir, &last);
2098         if (!last) {
2099                 path1 = "/";
2100                 path2 = cgdir;
2101         } else {
2102                 path1 = cgdir;
2103                 path2 = last;
2104         }
2105
2106         k = cgfs_get_key(controller, path1, path2);
2107         if (!k) {
2108                 ret = -EINVAL;
2109                 goto out;
2110         }
2111         free_key(k);
2112
2113         pid_t initpid = lookup_initpid_in_store(fc->pid);
2114         if (initpid <= 1 || is_shared_pidns(initpid))
2115                 initpid = fc->pid;
2116         if (!caller_may_see_dir(initpid, controller, path1)) {
2117                 ret = -ENOENT;
2118                 goto out;
2119         }
2120         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2121                 ret = -EACCES;
2122                 goto out;
2123         }
2124
2125         /* we'll free this at cg_release */
2126         file_info = malloc(sizeof(*file_info));
2127         if (!file_info) {
2128                 ret = -ENOMEM;
2129                 goto out;
2130         }
2131         file_info->controller = must_copy_string(controller);
2132         file_info->cgroup = must_copy_string(path1);
2133         file_info->file = must_copy_string(path2);
2134         file_info->type = LXC_TYPE_CGFILE;
2135         file_info->buf = NULL;
2136         file_info->buflen = 0;
2137
2138         fi->fh = (unsigned long)file_info;
2139         ret = 0;
2140
2141 out:
2142         free(cgdir);
2143         return ret;
2144 }
2145
2146 int cg_access(const char *path, int mode)
2147 {
2148         int ret;
2149         const char *cgroup;
2150         char *path1, *path2, *controller;
2151         char *last = NULL, *cgdir = NULL;
2152         struct cgfs_files *k = NULL;
2153         struct fuse_context *fc = fuse_get_context();
2154
2155         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2156                 return -EIO;
2157
2158         if (strcmp(path, "/cgroup") == 0)
2159                 return 0;
2160
2161         controller = pick_controller_from_path(fc, path);
2162         if (!controller)
2163                 return -errno;
2164         cgroup = find_cgroup_in_path(path);
2165         if (!cgroup) {
2166                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2167                 if ((mode & W_OK) == 0)
2168                         return 0;
2169                 return -EACCES;
2170         }
2171
2172         get_cgdir_and_path(cgroup, &cgdir, &last);
2173         if (!last) {
2174                 path1 = "/";
2175                 path2 = cgdir;
2176         } else {
2177                 path1 = cgdir;
2178                 path2 = last;
2179         }
2180
2181         k = cgfs_get_key(controller, path1, path2);
2182         if (!k) {
2183                 if ((mode & W_OK) == 0)
2184                         ret = 0;
2185                 else
2186                         ret = -EACCES;
2187                 goto out;
2188         }
2189         free_key(k);
2190
2191         pid_t initpid = lookup_initpid_in_store(fc->pid);
2192         if (initpid <= 1 || is_shared_pidns(initpid))
2193                 initpid = fc->pid;
2194         if (!caller_may_see_dir(initpid, controller, path1)) {
2195                 ret = -ENOENT;
2196                 goto out;
2197         }
2198         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2199                 ret = -EACCES;
2200                 goto out;
2201         }
2202
2203         ret = 0;
2204
2205 out:
2206         free(cgdir);
2207         return ret;
2208 }
2209
2210 int cg_release(const char *path, struct fuse_file_info *fi)
2211 {
2212         do_release_file_info(fi);
2213         return 0;
2214 }
2215
2216 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2217
2218 static bool wait_for_sock(int sock, int timeout)
2219 {
2220         struct epoll_event ev;
2221         int epfd, ret, now, starttime, deltatime, saved_errno;
2222
2223         if ((starttime = time(NULL)) < 0)
2224                 return false;
2225
2226         if ((epfd = epoll_create(1)) < 0) {
2227                 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2228                 return false;
2229         }
2230
2231         ev.events = POLLIN_SET;
2232         ev.data.fd = sock;
2233         if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2234                 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2235                 close(epfd);
2236                 return false;
2237         }
2238
2239 again:
2240         if ((now = time(NULL)) < 0) {
2241                 close(epfd);
2242                 return false;
2243         }
2244
2245         deltatime = (starttime + timeout) - now;
2246         if (deltatime < 0) { // timeout
2247                 errno = 0;
2248                 close(epfd);
2249                 return false;
2250         }
2251         ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2252         if (ret < 0 && errno == EINTR)
2253                 goto again;
2254         saved_errno = errno;
2255         close(epfd);
2256
2257         if (ret <= 0) {
2258                 errno = saved_errno;
2259                 return false;
2260         }
2261         return true;
2262 }
2263
2264 static int msgrecv(int sockfd, void *buf, size_t len)
2265 {
2266         if (!wait_for_sock(sockfd, 2))
2267                 return -1;
2268         return recv(sockfd, buf, len, MSG_DONTWAIT);
2269 }
2270
2271 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2272 {
2273         struct msghdr msg = { 0 };
2274         struct iovec iov;
2275         struct cmsghdr *cmsg;
2276         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2277         char buf[1];
2278         buf[0] = 'p';
2279
2280         if (pingfirst) {
2281                 if (msgrecv(sock, buf, 1) != 1) {
2282                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2283                         return SEND_CREDS_FAIL;
2284                 }
2285         }
2286
2287         msg.msg_control = cmsgbuf;
2288         msg.msg_controllen = sizeof(cmsgbuf);
2289
2290         cmsg = CMSG_FIRSTHDR(&msg);
2291         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2292         cmsg->cmsg_level = SOL_SOCKET;
2293         cmsg->cmsg_type = SCM_CREDENTIALS;
2294         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2295
2296         msg.msg_name = NULL;
2297         msg.msg_namelen = 0;
2298
2299         buf[0] = v;
2300         iov.iov_base = buf;
2301         iov.iov_len = sizeof(buf);
2302         msg.msg_iov = &iov;
2303         msg.msg_iovlen = 1;
2304
2305         if (sendmsg(sock, &msg, 0) < 0) {
2306                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2307                 if (errno == 3)
2308                         return SEND_CREDS_NOTSK;
2309                 return SEND_CREDS_FAIL;
2310         }
2311
2312         return SEND_CREDS_OK;
2313 }
2314
2315 static bool recv_creds(int sock, struct ucred *cred, char *v)
2316 {
2317         struct msghdr msg = { 0 };
2318         struct iovec iov;
2319         struct cmsghdr *cmsg;
2320         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2321         char buf[1];
2322         int ret;
2323         int optval = 1;
2324
2325         *v = '1';
2326
2327         cred->pid = -1;
2328         cred->uid = -1;
2329         cred->gid = -1;
2330
2331         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2332                 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2333                 return false;
2334         }
2335         buf[0] = '1';
2336         if (write(sock, buf, 1) != 1) {
2337                 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2338                 return false;
2339         }
2340
2341         msg.msg_name = NULL;
2342         msg.msg_namelen = 0;
2343         msg.msg_control = cmsgbuf;
2344         msg.msg_controllen = sizeof(cmsgbuf);
2345
2346         iov.iov_base = buf;
2347         iov.iov_len = sizeof(buf);
2348         msg.msg_iov = &iov;
2349         msg.msg_iovlen = 1;
2350
2351         if (!wait_for_sock(sock, 2)) {
2352                 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2353                 return false;
2354         }
2355         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2356         if (ret < 0) {
2357                 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2358                 return false;
2359         }
2360
2361         cmsg = CMSG_FIRSTHDR(&msg);
2362
2363         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2364                         cmsg->cmsg_level == SOL_SOCKET &&
2365                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2366                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2367         }
2368         *v = buf[0];
2369
2370         return true;
2371 }
2372
2373 struct pid_ns_clone_args {
2374         int *cpipe;
2375         int sock;
2376         pid_t tpid;
2377         int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2378 };
2379
2380 /*
2381  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2382  * with clone(). This simply writes '1' as ACK back to the parent
2383  * before calling the actual wrapped function.
2384  */
2385 static int pid_ns_clone_wrapper(void *arg) {
2386         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2387         char b = '1';
2388
2389         close(args->cpipe[0]);
2390         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2391                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2392         close(args->cpipe[1]);
2393         return args->wrapped(args->sock, args->tpid);
2394 }
2395
2396 /*
2397  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2398  * int value back over the socket.  This shifts the pid from the
2399  * sender's pidns into tpid's pidns.
2400  */
2401 static int pid_to_ns(int sock, pid_t tpid)
2402 {
2403         char v = '0';
2404         struct ucred cred;
2405
2406         while (recv_creds(sock, &cred, &v)) {
2407                 if (v == '1')
2408                         return 0;
2409                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2410                         return 1;
2411         }
2412         return 0;
2413 }
2414
2415
2416 /*
2417  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2418  * in your old pidns.  Only children which you clone will be in the target
2419  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2420  * actually convert pids.
2421  *
2422  * Note: glibc's fork() does not respect pidns, which can lead to failed
2423  * assertions inside glibc (and thus failed forks) if the child's pid in
2424  * the pidns and the parent pid outside are identical. Using clone prevents
2425  * this issue.
2426  */
2427 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2428 {
2429         int newnsfd = -1, ret, cpipe[2];
2430         char fnam[100];
2431         pid_t cpid;
2432         char v;
2433
2434         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2435         if (ret < 0 || ret >= sizeof(fnam))
2436                 _exit(1);
2437         newnsfd = open(fnam, O_RDONLY);
2438         if (newnsfd < 0)
2439                 _exit(1);
2440         if (setns(newnsfd, 0) < 0)
2441                 _exit(1);
2442         close(newnsfd);
2443
2444         if (pipe(cpipe) < 0)
2445                 _exit(1);
2446
2447         struct pid_ns_clone_args args = {
2448                 .cpipe = cpipe,
2449                 .sock = sock,
2450                 .tpid = tpid,
2451                 .wrapped = &pid_to_ns
2452         };
2453         size_t stack_size = sysconf(_SC_PAGESIZE);
2454         void *stack = alloca(stack_size);
2455
2456         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2457         if (cpid < 0)
2458                 _exit(1);
2459
2460         // give the child 1 second to be done forking and
2461         // write its ack
2462         if (!wait_for_sock(cpipe[0], 1))
2463                 _exit(1);
2464         ret = read(cpipe[0], &v, 1);
2465         if (ret != sizeof(char) || v != '1')
2466                 _exit(1);
2467
2468         if (!wait_for_pid(cpid))
2469                 _exit(1);
2470         _exit(0);
2471 }
2472
2473 /*
2474  * To read cgroup files with a particular pid, we will setns into the child
2475  * pidns, open a pipe, fork a child - which will be the first to really be in
2476  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2477  */
2478 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2479 {
2480         int sock[2] = {-1, -1};
2481         char *tmpdata = NULL;
2482         int ret;
2483         pid_t qpid, cpid = -1;
2484         bool answer = false;
2485         char v = '0';
2486         struct ucred cred;
2487         size_t sz = 0, asz = 0;
2488
2489         if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
2490                 return false;
2491
2492         /*
2493          * Now we read the pids from returned data one by one, pass
2494          * them into a child in the target namespace, read back the
2495          * translated pids, and put them into our to-return data
2496          */
2497
2498         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2499                 perror("socketpair");
2500                 free(tmpdata);
2501                 return false;
2502         }
2503
2504         cpid = fork();
2505         if (cpid == -1)
2506                 goto out;
2507
2508         if (!cpid) // child - exits when done
2509                 pid_to_ns_wrapper(sock[1], tpid);
2510
2511         char *ptr = tmpdata;
2512         cred.uid = 0;
2513         cred.gid = 0;
2514         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2515                 cred.pid = qpid;
2516                 ret = send_creds(sock[0], &cred, v, true);
2517
2518                 if (ret == SEND_CREDS_NOTSK)
2519                         goto next;
2520                 if (ret == SEND_CREDS_FAIL)
2521                         goto out;
2522
2523                 // read converted results
2524                 if (!wait_for_sock(sock[0], 2)) {
2525                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2526                         goto out;
2527                 }
2528                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2529                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2530                         goto out;
2531                 }
2532                 must_strcat_pid(d, &sz, &asz, qpid);
2533 next:
2534                 ptr = strchr(ptr, '\n');
2535                 if (!ptr)
2536                         break;
2537                 ptr++;
2538         }
2539
2540         cred.pid = getpid();
2541         v = '1';
2542         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2543                 // failed to ask child to exit
2544                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2545                 goto out;
2546         }
2547
2548         answer = true;
2549
2550 out:
2551         free(tmpdata);
2552         if (cpid != -1)
2553                 wait_for_pid(cpid);
2554         if (sock[0] != -1) {
2555                 close(sock[0]);
2556                 close(sock[1]);
2557         }
2558         return answer;
2559 }
2560
2561 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2562                 struct fuse_file_info *fi)
2563 {
2564         struct fuse_context *fc = fuse_get_context();
2565         struct file_info *f = (struct file_info *)fi->fh;
2566         struct cgfs_files *k = NULL;
2567         char *data = NULL;
2568         int ret, s;
2569         bool r;
2570
2571         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2572                 return -EIO;
2573
2574         if (f->type != LXC_TYPE_CGFILE) {
2575                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2576                 return -EIO;
2577         }
2578
2579         if (offset)
2580                 return 0;
2581
2582         if (!f->controller)
2583                 return -EINVAL;
2584
2585         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2586                 return -EINVAL;
2587         }
2588         free_key(k);
2589
2590
2591         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2592                 ret = -EACCES;
2593                 goto out;
2594         }
2595
2596         if (strcmp(f->file, "tasks") == 0 ||
2597                         strcmp(f->file, "/tasks") == 0 ||
2598                         strcmp(f->file, "/cgroup.procs") == 0 ||
2599                         strcmp(f->file, "cgroup.procs") == 0)
2600                 // special case - we have to translate the pids
2601                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2602         else
2603                 r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
2604
2605         if (!r) {
2606                 ret = -EINVAL;
2607                 goto out;
2608         }
2609
2610         if (!data) {
2611                 ret = 0;
2612                 goto out;
2613         }
2614         s = strlen(data);
2615         if (s > size)
2616                 s = size;
2617         memcpy(buf, data, s);
2618         if (s > 0 && s < size && data[s-1] != '\n')
2619                 buf[s++] = '\n';
2620
2621         ret = s;
2622
2623 out:
2624         free(data);
2625         return ret;
2626 }
2627
2628 static int pid_from_ns(int sock, pid_t tpid)
2629 {
2630         pid_t vpid;
2631         struct ucred cred;
2632         char v;
2633         int ret;
2634
2635         cred.uid = 0;
2636         cred.gid = 0;
2637         while (1) {
2638                 if (!wait_for_sock(sock, 2)) {
2639                         lxcfs_error("%s\n", "Timeout reading from parent.");
2640                         return 1;
2641                 }
2642                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2643                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2644                         return 1;
2645                 }
2646                 if (vpid == -1) // done
2647                         break;
2648                 v = '0';
2649                 cred.pid = vpid;
2650                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2651                         v = '1';
2652                         cred.pid = getpid();
2653                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2654                                 return 1;
2655                 }
2656         }
2657         return 0;
2658 }
2659
2660 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2661 {
2662         int newnsfd = -1, ret, cpipe[2];
2663         char fnam[100];
2664         pid_t cpid;
2665         char v;
2666
2667         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2668         if (ret < 0 || ret >= sizeof(fnam))
2669                 _exit(1);
2670         newnsfd = open(fnam, O_RDONLY);
2671         if (newnsfd < 0)
2672                 _exit(1);
2673         if (setns(newnsfd, 0) < 0)
2674                 _exit(1);
2675         close(newnsfd);
2676
2677         if (pipe(cpipe) < 0)
2678                 _exit(1);
2679
2680         struct pid_ns_clone_args args = {
2681                 .cpipe = cpipe,
2682                 .sock = sock,
2683                 .tpid = tpid,
2684                 .wrapped = &pid_from_ns
2685         };
2686         size_t stack_size = sysconf(_SC_PAGESIZE);
2687         void *stack = alloca(stack_size);
2688
2689         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2690         if (cpid < 0)
2691                 _exit(1);
2692
2693         // give the child 1 second to be done forking and
2694         // write its ack
2695         if (!wait_for_sock(cpipe[0], 1))
2696                 _exit(1);
2697         ret = read(cpipe[0], &v, 1);
2698         if (ret != sizeof(char) || v != '1')
2699                 _exit(1);
2700
2701         if (!wait_for_pid(cpid))
2702                 _exit(1);
2703         _exit(0);
2704 }
2705
2706 /*
2707  * Given host @uid, return the uid to which it maps in
2708  * @pid's user namespace, or -1 if none.
2709  */
2710 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2711 {
2712         FILE *f;
2713         char line[400];
2714
2715         sprintf(line, "/proc/%d/uid_map", pid);
2716         if ((f = fopen(line, "r")) == NULL) {
2717                 return false;
2718         }
2719
2720         *answer = convert_id_to_ns(f, uid);
2721         fclose(f);
2722
2723         if (*answer == -1)
2724                 return false;
2725         return true;
2726 }
2727
2728 /*
2729  * get_pid_creds: get the real uid and gid of @pid from
2730  * /proc/$$/status
2731  * (XXX should we use euid here?)
2732  */
2733 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2734 {
2735         char line[400];
2736         uid_t u;
2737         gid_t g;
2738         FILE *f;
2739
2740         *uid = -1;
2741         *gid = -1;
2742         sprintf(line, "/proc/%d/status", pid);
2743         if ((f = fopen(line, "r")) == NULL) {
2744                 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2745                 return;
2746         }
2747         while (fgets(line, 400, f)) {
2748                 if (strncmp(line, "Uid:", 4) == 0) {
2749                         if (sscanf(line+4, "%u", &u) != 1) {
2750                                 lxcfs_error("bad uid line for pid %u\n", pid);
2751                                 fclose(f);
2752                                 return;
2753                         }
2754                         *uid = u;
2755                 } else if (strncmp(line, "Gid:", 4) == 0) {
2756                         if (sscanf(line+4, "%u", &g) != 1) {
2757                                 lxcfs_error("bad gid line for pid %u\n", pid);
2758                                 fclose(f);
2759                                 return;
2760                         }
2761                         *gid = g;
2762                 }
2763         }
2764         fclose(f);
2765 }
2766
2767 /*
2768  * May the requestor @r move victim @v to a new cgroup?
2769  * This is allowed if
2770  *   . they are the same task
2771  *   . they are ownedy by the same uid
2772  *   . @r is root on the host, or
2773  *   . @v's uid is mapped into @r's where @r is root.
2774  */
2775 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2776 {
2777         uid_t v_uid, tmpuid;
2778         gid_t v_gid;
2779
2780         if (r == v)
2781                 return true;
2782         if (r_uid == 0)
2783                 return true;
2784         get_pid_creds(v, &v_uid, &v_gid);
2785         if (r_uid == v_uid)
2786                 return true;
2787         if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2788                         && hostuid_to_ns(v_uid, r, &tmpuid))
2789                 return true;
2790         return false;
2791 }
2792
2793 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2794                 const char *file, const char *buf)
2795 {
2796         int sock[2] = {-1, -1};
2797         pid_t qpid, cpid = -1;
2798         FILE *pids_file = NULL;
2799         bool answer = false, fail = false;
2800
2801         pids_file = open_pids_file(contrl, cg);
2802         if (!pids_file)
2803                 return false;
2804
2805         /*
2806          * write the pids to a socket, have helper in writer's pidns
2807          * call movepid for us
2808          */
2809         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2810                 perror("socketpair");
2811                 goto out;
2812         }
2813
2814         cpid = fork();
2815         if (cpid == -1)
2816                 goto out;
2817
2818         if (!cpid) { // child
2819                 fclose(pids_file);
2820                 pid_from_ns_wrapper(sock[1], tpid);
2821         }
2822
2823         const char *ptr = buf;
2824         while (sscanf(ptr, "%d", &qpid) == 1) {
2825                 struct ucred cred;
2826                 char v;
2827
2828                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2829                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2830                         goto out;
2831                 }
2832
2833                 if (recv_creds(sock[0], &cred, &v)) {
2834                         if (v == '0') {
2835                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2836                                         fail = true;
2837                                         break;
2838                                 }
2839                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2840                                         fail = true;
2841                         }
2842                 }
2843
2844                 ptr = strchr(ptr, '\n');
2845                 if (!ptr)
2846                         break;
2847                 ptr++;
2848         }
2849
2850         /* All good, write the value */
2851         qpid = -1;
2852         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2853                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2854
2855         if (!fail)
2856                 answer = true;
2857
2858 out:
2859         if (cpid != -1)
2860                 wait_for_pid(cpid);
2861         if (sock[0] != -1) {
2862                 close(sock[0]);
2863                 close(sock[1]);
2864         }
2865         if (pids_file) {
2866                 if (fclose(pids_file) != 0)
2867                         answer = false;
2868         }
2869         return answer;
2870 }
2871
2872 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2873              struct fuse_file_info *fi)
2874 {
2875         struct fuse_context *fc = fuse_get_context();
2876         char *localbuf = NULL;
2877         struct cgfs_files *k = NULL;
2878         struct file_info *f = (struct file_info *)fi->fh;
2879         bool r;
2880
2881         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2882                 return -EIO;
2883
2884         if (f->type != LXC_TYPE_CGFILE) {
2885                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2886                 return -EIO;
2887         }
2888
2889         if (offset)
2890                 return 0;
2891
2892         localbuf = alloca(size+1);
2893         localbuf[size] = '\0';
2894         memcpy(localbuf, buf, size);
2895
2896         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2897                 size = -EINVAL;
2898                 goto out;
2899         }
2900
2901         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2902                 size = -EACCES;
2903                 goto out;
2904         }
2905
2906         if (strcmp(f->file, "tasks") == 0 ||
2907                         strcmp(f->file, "/tasks") == 0 ||
2908                         strcmp(f->file, "/cgroup.procs") == 0 ||
2909                         strcmp(f->file, "cgroup.procs") == 0)
2910                 // special case - we have to translate the pids
2911                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2912         else
2913                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2914
2915         if (!r)
2916                 size = -EINVAL;
2917
2918 out:
2919         free_key(k);
2920         return size;
2921 }
2922
2923 int cg_chown(const char *path, uid_t uid, gid_t gid)
2924 {
2925         struct fuse_context *fc = fuse_get_context();
2926         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2927         struct cgfs_files *k = NULL;
2928         const char *cgroup;
2929         int ret;
2930
2931         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2932                 return -EIO;
2933
2934         if (strcmp(path, "/cgroup") == 0)
2935                 return -EPERM;
2936
2937         controller = pick_controller_from_path(fc, path);
2938         if (!controller)
2939                 return errno == ENOENT ? -EPERM : -errno;
2940
2941         cgroup = find_cgroup_in_path(path);
2942         if (!cgroup)
2943                 /* this is just /cgroup/controller */
2944                 return -EPERM;
2945
2946         get_cgdir_and_path(cgroup, &cgdir, &last);
2947
2948         if (!last) {
2949                 path1 = "/";
2950                 path2 = cgdir;
2951         } else {
2952                 path1 = cgdir;
2953                 path2 = last;
2954         }
2955
2956         if (is_child_cgroup(controller, path1, path2)) {
2957                 // get uid, gid, from '/tasks' file and make up a mode
2958                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2959                 k = cgfs_get_key(controller, cgroup, "tasks");
2960
2961         } else
2962                 k = cgfs_get_key(controller, path1, path2);
2963
2964         if (!k) {
2965                 ret = -EINVAL;
2966                 goto out;
2967         }
2968
2969         /*
2970          * This being a fuse request, the uid and gid must be valid
2971          * in the caller's namespace.  So we can just check to make
2972          * sure that the caller is root in his uid, and privileged
2973          * over the file's current owner.
2974          */
2975         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2976                 ret = -EACCES;
2977                 goto out;
2978         }
2979
2980         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2981
2982 out:
2983         free_key(k);
2984         free(cgdir);
2985
2986         return ret;
2987 }
2988
2989 int cg_chmod(const char *path, mode_t mode)
2990 {
2991         struct fuse_context *fc = fuse_get_context();
2992         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2993         struct cgfs_files *k = NULL;
2994         const char *cgroup;
2995         int ret;
2996
2997         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2998                 return -EIO;
2999
3000         if (strcmp(path, "/cgroup") == 0)
3001                 return -EPERM;
3002
3003         controller = pick_controller_from_path(fc, path);
3004         if (!controller)
3005                 return errno == ENOENT ? -EPERM : -errno;
3006
3007         cgroup = find_cgroup_in_path(path);
3008         if (!cgroup)
3009                 /* this is just /cgroup/controller */
3010                 return -EPERM;
3011
3012         get_cgdir_and_path(cgroup, &cgdir, &last);
3013
3014         if (!last) {
3015                 path1 = "/";
3016                 path2 = cgdir;
3017         } else {
3018                 path1 = cgdir;
3019                 path2 = last;
3020         }
3021
3022         if (is_child_cgroup(controller, path1, path2)) {
3023                 // get uid, gid, from '/tasks' file and make up a mode
3024                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3025                 k = cgfs_get_key(controller, cgroup, "tasks");
3026
3027         } else
3028                 k = cgfs_get_key(controller, path1, path2);
3029
3030         if (!k) {
3031                 ret = -EINVAL;
3032                 goto out;
3033         }
3034
3035         /*
3036          * This being a fuse request, the uid and gid must be valid
3037          * in the caller's namespace.  So we can just check to make
3038          * sure that the caller is root in his uid, and privileged
3039          * over the file's current owner.
3040          */
3041         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3042                 ret = -EPERM;
3043                 goto out;
3044         }
3045
3046         if (!cgfs_chmod_file(controller, cgroup, mode)) {
3047                 ret = -EINVAL;
3048                 goto out;
3049         }
3050
3051         ret = 0;
3052 out:
3053         free_key(k);
3054         free(cgdir);
3055         return ret;
3056 }
3057
3058 int cg_mkdir(const char *path, mode_t mode)
3059 {
3060         struct fuse_context *fc = fuse_get_context();
3061         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3062         const char *cgroup;
3063         int ret;
3064
3065         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3066                 return -EIO;
3067
3068         controller = pick_controller_from_path(fc, path);
3069         if (!controller)
3070                 return errno == ENOENT ? -EPERM : -errno;
3071
3072         cgroup = find_cgroup_in_path(path);
3073         if (!cgroup)
3074                 return -errno;
3075
3076         get_cgdir_and_path(cgroup, &cgdir, &last);
3077         if (!last)
3078                 path1 = "/";
3079         else
3080                 path1 = cgdir;
3081
3082         pid_t initpid = lookup_initpid_in_store(fc->pid);
3083         if (initpid <= 1 || is_shared_pidns(initpid))
3084                 initpid = fc->pid;
3085         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3086                 if (!next)
3087                         ret = -EINVAL;
3088                 else if (last && strcmp(next, last) == 0)
3089                         ret = -EEXIST;
3090                 else
3091                         ret = -EPERM;
3092                 goto out;
3093         }
3094
3095         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3096                 ret = -EACCES;
3097                 goto out;
3098         }
3099         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3100                 ret = -EACCES;
3101                 goto out;
3102         }
3103
3104         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3105
3106 out:
3107         free(cgdir);
3108         free(next);
3109         return ret;
3110 }
3111
3112 int cg_rmdir(const char *path)
3113 {
3114         struct fuse_context *fc = fuse_get_context();
3115         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3116         const char *cgroup;
3117         int ret;
3118
3119         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3120                 return -EIO;
3121
3122         controller = pick_controller_from_path(fc, path);
3123         if (!controller) /* Someone's trying to delete "/cgroup". */
3124                 return -EPERM;
3125
3126         cgroup = find_cgroup_in_path(path);
3127         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3128                 return -EPERM;
3129
3130         get_cgdir_and_path(cgroup, &cgdir, &last);
3131         if (!last) {
3132                 /* Someone's trying to delete a cgroup on the same level as the
3133                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3134                  * rmdir "/cgroup/blkio/init.slice".
3135                  */
3136                 ret = -EPERM;
3137                 goto out;
3138         }
3139
3140         pid_t initpid = lookup_initpid_in_store(fc->pid);
3141         if (initpid <= 1 || is_shared_pidns(initpid))
3142                 initpid = fc->pid;
3143         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3144                 if (!last || (next && (strcmp(next, last) == 0)))
3145                         ret = -EBUSY;
3146                 else
3147                         ret = -ENOENT;
3148                 goto out;
3149         }
3150
3151         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3152                 ret = -EACCES;
3153                 goto out;
3154         }
3155         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3156                 ret = -EACCES;
3157                 goto out;
3158         }
3159
3160         if (!cgfs_remove(controller, cgroup)) {
3161                 ret = -EINVAL;
3162                 goto out;
3163         }
3164
3165         ret = 0;
3166
3167 out:
3168         free(cgdir);
3169         free(next);
3170         return ret;
3171 }
3172
3173 static bool startswith(const char *line, const char *pref)
3174 {
3175         if (strncmp(line, pref, strlen(pref)) == 0)
3176                 return true;
3177         return false;
3178 }
3179
3180 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
3181 static void parse_memstat(int version,
3182                           char *memstat,
3183                           unsigned long *cached,
3184                           unsigned long *active_anon,
3185                           unsigned long *inactive_anon,
3186                           unsigned long *active_file,
3187                           unsigned long *inactive_file,
3188                           unsigned long *unevictable,
3189                           unsigned long *shmem)
3190 {
3191         char *eol;
3192
3193         while (*memstat) {
3194                 if (startswith(memstat, is_unified_controller(version)
3195                                             ? "cache"
3196                                             : "total_cache")) {
3197                         sscanf(memstat + 11, "%lu", cached);
3198                         *cached /= 1024;
3199                 } else if (startswith(memstat, is_unified_controller(version)
3200                                                    ? "active_anon"
3201                                                    : "total_active_anon")) {
3202                         sscanf(memstat + 17, "%lu", active_anon);
3203                         *active_anon /= 1024;
3204                 } else if (startswith(memstat, is_unified_controller(version)
3205                                                    ? "inactive_anon"
3206                                                    : "total_inactive_anon")) {
3207                         sscanf(memstat + 19, "%lu", inactive_anon);
3208                         *inactive_anon /= 1024;
3209                 } else if (startswith(memstat, is_unified_controller(version)
3210                                                    ? "active_file"
3211                                                    : "total_active_file")) {
3212                         sscanf(memstat + 17, "%lu", active_file);
3213                         *active_file /= 1024;
3214                 } else if (startswith(memstat, is_unified_controller(version)
3215                                                    ? "inactive_file"
3216                                                    : "total_inactive_file")) {
3217                         sscanf(memstat + 19, "%lu", inactive_file);
3218                         *inactive_file /= 1024;
3219                 } else if (startswith(memstat, is_unified_controller(version)
3220                                                    ? "unevictable"
3221                                                    : "total_unevictable")) {
3222                         sscanf(memstat + 17, "%lu", unevictable);
3223                         *unevictable /= 1024;
3224                 } else if (startswith(memstat, is_unified_controller(version)
3225                                                    ? "shmem"
3226                                                    : "total_shmem")) {
3227                         sscanf(memstat + 11, "%lu", shmem);
3228                         *shmem /= 1024;
3229                 }
3230                 eol = strchr(memstat, '\n');
3231                 if (!eol)
3232                         return;
3233                 memstat = eol+1;
3234         }
3235 }
3236
3237 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3238 {
3239         char *eol;
3240         char key[32];
3241
3242         memset(key, 0, 32);
3243         snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3244
3245         size_t len = strlen(key);
3246         *v = 0;
3247
3248         while (*str) {
3249                 if (startswith(str, key)) {
3250                         sscanf(str + len, "%lu", v);
3251                         return;
3252                 }
3253                 eol = strchr(str, '\n');
3254                 if (!eol)
3255                         return;
3256                 str = eol+1;
3257         }
3258 }
3259
3260 int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
3261 {
3262         size_t linelen = 0, total_len = 0, rv = 0;
3263         char *line = NULL;
3264         char *cache = d->buf;
3265         size_t cache_size = d->buflen;
3266         FILE *f = fopen(path, "r");
3267         if (!f)
3268                 return 0;
3269
3270         while (getline(&line, &linelen, f) != -1) {
3271                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3272                 if (l < 0) {
3273                         perror("Error writing to cache");
3274                         rv = 0;
3275                         goto err;
3276                 }
3277                 if (l >= cache_size) {
3278                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3279                         rv = 0;
3280                         goto err;
3281                 }
3282                 cache += l;
3283                 cache_size -= l;
3284                 total_len += l;
3285         }
3286
3287         d->size = total_len;
3288         if (total_len > size)
3289                 total_len = size;
3290
3291         /* read from off 0 */
3292         memcpy(buf, d->buf, total_len);
3293         rv = total_len;
3294   err:
3295         fclose(f);
3296         free(line);
3297         if (d->size > rv)
3298                 d->cached = d->size - rv;
3299         return rv;
3300 }
3301
3302 /*
3303  * FUSE ops for /proc
3304  */
3305
3306 static unsigned long get_memlimit(const char *cgroup, bool swap)
3307 {
3308         int ret;
3309         __do_free char *memlimit_str = NULL;
3310         unsigned long memlimit = -1;
3311
3312         if (swap)
3313                 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3314         else
3315                 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3316         if (ret > 0)
3317                 memlimit = strtoul(memlimit_str, NULL, 10);
3318
3319         return memlimit;
3320 }
3321
3322 static unsigned long get_min_memlimit(const char *cgroup, bool swap)
3323 {
3324         __do_free char *copy = NULL;
3325         unsigned long memlimit = 0;
3326         unsigned long retlimit;
3327
3328         copy = strdup(cgroup);
3329         retlimit = get_memlimit(copy, swap);
3330
3331         while (strcmp(copy, "/") != 0) {
3332                 char *it = copy;
3333
3334                 it = dirname(it);
3335                 memlimit = get_memlimit(it, swap);
3336                 if (memlimit != -1 && memlimit < retlimit)
3337                         retlimit = memlimit;
3338         };
3339
3340         return retlimit;
3341 }
3342
3343 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3344                              struct fuse_file_info *fi)
3345 {
3346         __do_free char *cgroup = NULL, *line = NULL,
3347                        *memusage_str = NULL, *memstat_str = NULL,
3348                        *memswlimit_str = NULL, *memswusage_str = NULL;
3349         __do_fclose FILE *f = NULL;
3350         struct fuse_context *fc = fuse_get_context();
3351         struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
3352         struct file_info *d = (struct file_info *)fi->fh;
3353         unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
3354                       memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
3355                       inactive_anon = 0, active_file = 0, inactive_file = 0,
3356                       unevictable = 0, shmem = 0, hostswtotal = 0;
3357         size_t linelen = 0, total_len = 0;
3358         char *cache = d->buf;
3359         size_t cache_size = d->buflen;
3360         int ret;
3361
3362         if (offset) {
3363                 int left;
3364
3365                 if (offset > d->size)
3366                         return -EINVAL;
3367
3368                 if (!d->cached)
3369                         return 0;
3370
3371                 left = d->size - offset;
3372                 total_len = left > size ? size : left;
3373                 memcpy(buf, cache + offset, total_len);
3374
3375                 return total_len;
3376         }
3377
3378         pid_t initpid = lookup_initpid_in_store(fc->pid);
3379         if (initpid <= 1 || is_shared_pidns(initpid))
3380                 initpid = fc->pid;
3381
3382         cgroup = get_pid_cgroup(initpid, "memory");
3383         if (!cgroup)
3384                 return read_file_fuse("/proc/meminfo", buf, size, d);
3385
3386         prune_init_slice(cgroup);
3387
3388         memlimit = get_min_memlimit(cgroup, false);
3389
3390         ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
3391         if (ret < 0)
3392                 return 0;
3393
3394         ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
3395         if (ret < 0)
3396                 return 0;
3397         parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
3398                       &active_file, &inactive_file, &unevictable, &shmem);
3399
3400         /*
3401          * Following values are allowed to fail, because swapaccount might be
3402          * turned off for current kernel.
3403          */
3404         ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
3405         if (ret >= 0)
3406                 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
3407         if (ret >= 0) {
3408                 memswlimit = get_min_memlimit(cgroup, true);
3409                 memswusage = strtoul(memswusage_str, NULL, 10);
3410                 memswlimit = memswlimit / 1024;
3411                 memswusage = memswusage / 1024;
3412         }
3413
3414         memusage = strtoul(memusage_str, NULL, 10);
3415         memlimit /= 1024;
3416         memusage /= 1024;
3417
3418         f = fopen("/proc/meminfo", "r");
3419         if (!f)
3420                 return 0;
3421
3422         while (getline(&line, &linelen, f) != -1) {
3423                 ssize_t l;
3424                 char *printme, lbuf[100];
3425
3426                 memset(lbuf, 0, 100);
3427                 if (startswith(line, "MemTotal:")) {
3428                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3429                         if (hosttotal < memlimit)
3430                                 memlimit = hosttotal;
3431                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3432                         printme = lbuf;
3433                 } else if (startswith(line, "MemFree:")) {
3434                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3435                         printme = lbuf;
3436                 } else if (startswith(line, "MemAvailable:")) {
3437                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3438                         printme = lbuf;
3439                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
3440                            opts && opts->swap_off == false) {
3441                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3442                         if (hostswtotal < memswlimit)
3443                                 memswlimit = hostswtotal;
3444                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3445                         printme = lbuf;
3446                 } else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
3447                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", 0UL);
3448                         printme = lbuf;
3449                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
3450                            memswusage > 0 && opts && opts->swap_off == false) {
3451                         unsigned long swaptotal = memswlimit,
3452                                       swapusage = memusage > memswusage
3453                                                       ? 0
3454                                                       : memswusage - memusage,
3455                                       swapfree = swapusage < swaptotal
3456                                                      ? swaptotal - swapusage
3457                                                      : 0;
3458                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3459                         printme = lbuf;
3460                 } else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
3461                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", 0UL);
3462                         printme = lbuf;
3463                 } else if (startswith(line, "Slab:")) {
3464                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3465                         printme = lbuf;
3466                 } else if (startswith(line, "Buffers:")) {
3467                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3468                         printme = lbuf;
3469                 } else if (startswith(line, "Cached:")) {
3470                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3471                         printme = lbuf;
3472                 } else if (startswith(line, "SwapCached:")) {
3473                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3474                         printme = lbuf;
3475                 } else if (startswith(line, "Active:")) {
3476                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3477                                         active_anon + active_file);
3478                         printme = lbuf;
3479                 } else if (startswith(line, "Inactive:")) {
3480                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3481                                         inactive_anon + inactive_file);
3482                         printme = lbuf;
3483                 } else if (startswith(line, "Active(anon)")) {
3484                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3485                         printme = lbuf;
3486                 } else if (startswith(line, "Inactive(anon)")) {
3487                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3488                         printme = lbuf;
3489                 } else if (startswith(line, "Active(file)")) {
3490                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3491                         printme = lbuf;
3492                 } else if (startswith(line, "Inactive(file)")) {
3493                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3494                         printme = lbuf;
3495                 } else if (startswith(line, "Unevictable")) {
3496                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3497                         printme = lbuf;
3498                 } else if (startswith(line, "SReclaimable")) {
3499                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3500                         printme = lbuf;
3501                 } else if (startswith(line, "SUnreclaim")) {
3502                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3503                         printme = lbuf;
3504                 } else if (startswith(line, "Shmem:")) {
3505                         snprintf(lbuf, 100, "Shmem:          %8lu kB\n", shmem);
3506                         printme = lbuf;
3507                 } else if (startswith(line, "ShmemHugePages")) {
3508                         snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3509                         printme = lbuf;
3510                 } else if (startswith(line, "ShmemPmdMapped")) {
3511                         snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3512                         printme = lbuf;
3513                 } else
3514                         printme = line;
3515
3516                 l = snprintf(cache, cache_size, "%s", printme);
3517                 if (l < 0) {
3518                         perror("Error writing to cache");
3519                         return 0;
3520
3521                 }
3522                 if (l >= cache_size) {
3523                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3524                         return 0;
3525                 }
3526
3527                 cache += l;
3528                 cache_size -= l;
3529                 total_len += l;
3530         }
3531
3532         d->cached = 1;
3533         d->size = total_len;
3534         if (total_len > size ) total_len = size;
3535         memcpy(buf, d->buf, total_len);
3536
3537         return total_len;
3538 }
3539
3540 /*
3541  * Read the cpuset.cpus for cg
3542  * Return the answer in a newly allocated string which must be freed
3543  */
3544 char *get_cpuset(const char *cg)
3545 {
3546         char *value = NULL;
3547         int ret;
3548
3549         ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3550         if (ret < 0)
3551                 return NULL;
3552
3553         return value;
3554 }
3555
3556 bool cpu_in_cpuset(int cpu, const char *cpuset);
3557
3558 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3559 {
3560         int cpu;
3561
3562         if (sscanf(line, "processor       : %d", &cpu) != 1)
3563                 return false;
3564         return cpu_in_cpuset(cpu, cpuset);
3565 }
3566
3567 /*
3568  * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3569  * depending on `param`. Parameter value is returned throuh `value`.
3570  */
3571 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3572 {
3573         bool rv = false;
3574         char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3575         char *str = NULL;
3576
3577         sprintf(file, "cpu.cfs_%s_us", param);
3578
3579         if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
3580                 goto err;
3581
3582         if (sscanf(str, "%ld", value) != 1)
3583                 goto err;
3584
3585         rv = true;
3586
3587 err:
3588         if (str)
3589                 free(str);
3590         return rv;
3591 }
3592
3593 /*
3594  * Return the maximum number of visible CPUs based on CPU quotas.
3595  * If there is no quota set, zero is returned.
3596  */
3597 int max_cpu_count(const char *cg)
3598 {
3599         int rv, nprocs;
3600         int64_t cfs_quota, cfs_period;
3601         int nr_cpus_in_cpuset = 0;
3602         char *cpuset = NULL;
3603
3604         if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3605                 return 0;
3606
3607         if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3608                 return 0;
3609
3610         cpuset = get_cpuset(cg);
3611         if (cpuset)
3612                 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3613
3614         if (cfs_quota <= 0 || cfs_period <= 0){
3615                 if (nr_cpus_in_cpuset > 0)
3616                         return nr_cpus_in_cpuset;
3617
3618                 return 0;
3619         }
3620
3621         rv = cfs_quota / cfs_period;
3622
3623         /* In case quota/period does not yield a whole number, add one CPU for
3624          * the remainder.
3625          */
3626         if ((cfs_quota % cfs_period) > 0)
3627                 rv += 1;
3628
3629         nprocs = get_nprocs();
3630
3631         if (rv > nprocs)
3632                 rv = nprocs;
3633
3634         /* use min value in cpu quota and cpuset */
3635         if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3636                 rv = nr_cpus_in_cpuset;
3637
3638         return rv;
3639 }
3640
3641 /*
3642  * Return the exact number of visible CPUs based on CPU quotas.
3643  * If there is no quota set, zero is returned.
3644  */
3645 static double exact_cpu_count(const char *cg)
3646 {
3647         double rv;
3648         int nprocs;
3649         int64_t cfs_quota, cfs_period;
3650
3651         if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3652                 return 0;
3653
3654         if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3655                 return 0;
3656
3657         if (cfs_quota <= 0 || cfs_period <= 0)
3658                 return 0;
3659
3660         rv = (double)cfs_quota / (double)cfs_period;
3661
3662         nprocs = get_nprocs();
3663
3664         if (rv > nprocs)
3665                 rv = nprocs;
3666
3667         return rv;
3668 }
3669
3670 /*
3671  * Determine whether CPU views should be used or not.
3672  */
3673 bool use_cpuview(const char *cg)
3674 {
3675         int cfd;
3676
3677         cfd = find_mounted_controller("cpu");
3678         if (cfd < 0)
3679                 return false;
3680
3681         cfd = find_mounted_controller("cpuacct");
3682         if (cfd < 0)
3683                 return false;
3684
3685         return true;
3686 }
3687
3688 /*
3689  * check whether this is a '^processor" line in /proc/cpuinfo
3690  */
3691 static bool is_processor_line(const char *line)
3692 {
3693         int cpu;
3694
3695         if (sscanf(line, "processor       : %d", &cpu) == 1)
3696                 return true;
3697         return false;
3698 }
3699
3700 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3701                 struct fuse_file_info *fi)
3702 {
3703         struct fuse_context *fc = fuse_get_context();
3704         struct file_info *d = (struct file_info *)fi->fh;
3705         char *cg;
3706         char *cpuset = NULL;
3707         char *line = NULL;
3708         size_t linelen = 0, total_len = 0, rv = 0;
3709         bool am_printing = false, firstline = true, is_s390x = false;
3710         int curcpu = -1, cpu, max_cpus = 0;
3711         bool use_view;
3712         char *cache = d->buf;
3713         size_t cache_size = d->buflen;
3714         FILE *f = NULL;
3715
3716         if (offset){
3717                 if (offset > d->size)
3718                         return -EINVAL;
3719                 if (!d->cached)
3720                         return 0;
3721                 int left = d->size - offset;
3722                 total_len = left > size ? size: left;
3723                 memcpy(buf, cache + offset, total_len);
3724                 return total_len;
3725         }
3726
3727         pid_t initpid = lookup_initpid_in_store(fc->pid);
3728         if (initpid <= 1 || is_shared_pidns(initpid))
3729                 initpid = fc->pid;
3730         cg = get_pid_cgroup(initpid, "cpuset");
3731         if (!cg)
3732                 return read_file_fuse("proc/cpuinfo", buf, size, d);
3733         prune_init_slice(cg);
3734
3735         cpuset = get_cpuset(cg);
3736         if (!cpuset)
3737                 goto err;
3738
3739         use_view = use_cpuview(cg);
3740
3741         if (use_view)
3742                 max_cpus = max_cpu_count(cg);
3743
3744         f = fopen("/proc/cpuinfo", "r");
3745         if (!f)
3746                 goto err;
3747
3748         while (getline(&line, &linelen, f) != -1) {
3749                 ssize_t l;
3750                 if (firstline) {
3751                         firstline = false;
3752                         if (strstr(line, "IBM/S390") != NULL) {
3753                                 is_s390x = true;
3754                                 am_printing = true;
3755                                 continue;
3756                         }
3757                 }
3758                 if (strncmp(line, "# processors:", 12) == 0)
3759                         continue;
3760                 if (is_processor_line(line)) {
3761                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3762                                 break;
3763                         am_printing = cpuline_in_cpuset(line, cpuset);
3764                         if (am_printing) {
3765                                 curcpu ++;
3766                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3767                                 if (l < 0) {
3768                                         perror("Error writing to cache");
3769                                         rv = 0;
3770                                         goto err;
3771                                 }
3772                                 if (l >= cache_size) {
3773                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3774                                         rv = 0;
3775                                         goto err;
3776                                 }
3777                                 cache += l;
3778                                 cache_size -= l;
3779                                 total_len += l;
3780                         }
3781                         continue;
3782                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3783                         char *p;
3784                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3785                                 break;
3786                         if (!cpu_in_cpuset(cpu, cpuset))
3787                                 continue;
3788                         curcpu ++;
3789                         p = strchr(line, ':');
3790                         if (!p || !*p)
3791                                 goto err;
3792                         p++;
3793                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3794                         if (l < 0) {
3795                                 perror("Error writing to cache");
3796                                 rv = 0;
3797                                 goto err;
3798                         }
3799                         if (l >= cache_size) {
3800                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3801                                 rv = 0;
3802                                 goto err;
3803                         }
3804                         cache += l;
3805                         cache_size -= l;
3806                         total_len += l;
3807                         continue;
3808
3809                 }
3810                 if (am_printing) {
3811                         l = snprintf(cache, cache_size, "%s", line);
3812                         if (l < 0) {
3813                                 perror("Error writing to cache");
3814                                 rv = 0;
3815                                 goto err;
3816                         }
3817                         if (l >= cache_size) {
3818                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3819                                 rv = 0;
3820                                 goto err;
3821                         }
3822                         cache += l;
3823                         cache_size -= l;
3824                         total_len += l;
3825                 }
3826         }
3827
3828         if (is_s390x) {
3829                 char *origcache = d->buf;
3830                 ssize_t l;
3831                 do {
3832                         d->buf = malloc(d->buflen);
3833                 } while (!d->buf);
3834                 cache = d->buf;
3835                 cache_size = d->buflen;
3836                 total_len = 0;
3837                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3838                 if (l < 0 || l >= cache_size) {
3839                         free(origcache);
3840                         goto err;
3841                 }
3842                 cache_size -= l;
3843                 cache += l;
3844                 total_len += l;
3845                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3846                 if (l < 0 || l >= cache_size) {
3847                         free(origcache);
3848                         goto err;
3849                 }
3850                 cache_size -= l;
3851                 cache += l;
3852                 total_len += l;
3853                 l = snprintf(cache, cache_size, "%s", origcache);
3854                 free(origcache);
3855                 if (l < 0 || l >= cache_size)
3856                         goto err;
3857                 total_len += l;
3858         }
3859
3860         d->cached = 1;
3861         d->size = total_len;
3862         if (total_len > size ) total_len = size;
3863
3864         /* read from off 0 */
3865         memcpy(buf, d->buf, total_len);
3866         rv = total_len;
3867 err:
3868         if (f)
3869                 fclose(f);
3870         free(line);
3871         free(cpuset);
3872         free(cg);
3873         return rv;
3874 }
3875
3876 static uint64_t get_reaper_start_time(pid_t pid)
3877 {
3878         int ret;
3879         FILE *f;
3880         uint64_t starttime;
3881         /* strlen("/proc/") = 6
3882          * +
3883          * LXCFS_NUMSTRLEN64
3884          * +
3885          * strlen("/stat") = 5
3886          * +
3887          * \0 = 1
3888          * */
3889 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3890         char path[__PROC_PID_STAT_LEN];
3891         pid_t qpid;
3892
3893         qpid = lookup_initpid_in_store(pid);
3894         if (qpid <= 0) {
3895                 /* Caller can check for EINVAL on 0. */
3896                 errno = EINVAL;
3897                 return 0;
3898         }
3899
3900         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3901         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3902                 /* Caller can check for EINVAL on 0. */
3903                 errno = EINVAL;
3904                 return 0;
3905         }
3906
3907         f = fopen(path, "r");
3908         if (!f) {
3909                 /* Caller can check for EINVAL on 0. */
3910                 errno = EINVAL;
3911                 return 0;
3912         }
3913
3914         /* Note that the *scanf() argument supression requires that length
3915          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3916          * at us. It's like telling someone you're not married and then asking
3917          * if you can bring your wife to the party.
3918          */
3919         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3920                         "%*s "      /* (2)  comm        %s   */
3921                         "%*c "      /* (3)  state       %c   */
3922                         "%*d "      /* (4)  ppid        %d   */
3923                         "%*d "      /* (5)  pgrp        %d   */
3924                         "%*d "      /* (6)  session     %d   */
3925                         "%*d "      /* (7)  tty_nr      %d   */
3926                         "%*d "      /* (8)  tpgid       %d   */
3927                         "%*u "      /* (9)  flags       %u   */
3928                         "%*u "      /* (10) minflt      %lu  */
3929                         "%*u "      /* (11) cminflt     %lu  */
3930                         "%*u "      /* (12) majflt      %lu  */
3931                         "%*u "      /* (13) cmajflt     %lu  */
3932                         "%*u "      /* (14) utime       %lu  */
3933                         "%*u "      /* (15) stime       %lu  */
3934                         "%*d "      /* (16) cutime      %ld  */
3935                         "%*d "      /* (17) cstime      %ld  */
3936                         "%*d "      /* (18) priority    %ld  */
3937                         "%*d "      /* (19) nice        %ld  */
3938                         "%*d "      /* (20) num_threads %ld  */
3939                         "%*d "      /* (21) itrealvalue %ld  */
3940                         "%" PRIu64, /* (22) starttime   %llu */
3941                      &starttime);
3942         if (ret != 1) {
3943                 fclose(f);
3944                 /* Caller can check for EINVAL on 0. */
3945                 errno = EINVAL;
3946                 return 0;
3947         }
3948
3949         fclose(f);
3950
3951         errno = 0;
3952         return starttime;
3953 }
3954
3955 static double get_reaper_start_time_in_sec(pid_t pid)
3956 {
3957         uint64_t clockticks, ticks_per_sec;
3958         int64_t ret;
3959         double res = 0;
3960
3961         clockticks = get_reaper_start_time(pid);
3962         if (clockticks == 0 && errno == EINVAL) {
3963                 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3964                 return 0;
3965         }
3966
3967         ret = sysconf(_SC_CLK_TCK);
3968         if (ret < 0 && errno == EINVAL) {
3969                 lxcfs_debug(
3970                     "%s\n",
3971                     "failed to determine number of clock ticks in a second");
3972                 return 0;
3973         }
3974
3975         ticks_per_sec = (uint64_t)ret;
3976         res = (double)clockticks / ticks_per_sec;
3977         return res;
3978 }
3979
3980 static double get_reaper_age(pid_t pid)
3981 {
3982         uint64_t uptime_ms;
3983         double procstart, procage;
3984
3985         /* We need to substract the time the process has started since system
3986          * boot minus the time when the system has started to get the actual
3987          * reaper age.
3988          */
3989         procstart = get_reaper_start_time_in_sec(pid);
3990         procage = procstart;
3991         if (procstart > 0) {
3992                 int ret;
3993                 struct timespec spec;
3994
3995                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3996                 if (ret < 0)
3997                         return 0;
3998
3999                 /* We could make this more precise here by using the tv_nsec
4000                  * field in the timespec struct and convert it to milliseconds
4001                  * and then create a double for the seconds and milliseconds but
4002                  * that seems more work than it is worth.
4003                  */
4004                 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
4005                 procage = (uptime_ms - (procstart * 1000)) / 1000;
4006         }
4007
4008         return procage;
4009 }
4010
4011 /*
4012  * Returns 0 on success.
4013  * It is the caller's responsibility to free `return_usage`, unless this
4014  * function returns an error.
4015  */
4016 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4017 {
4018         int cpucount = get_nprocs_conf();
4019         struct cpuacct_usage *cpu_usage;
4020         int rv = 0, i, j, ret;
4021         int cg_cpu;
4022         uint64_t cg_user, cg_system;
4023         int64_t ticks_per_sec;
4024         char *usage_str = NULL;
4025
4026         ticks_per_sec = sysconf(_SC_CLK_TCK);
4027
4028         if (ticks_per_sec < 0 && errno == EINVAL) {
4029                 lxcfs_v(
4030                         "%s\n",
4031                         "read_cpuacct_usage_all failed to determine number of clock ticks "
4032                         "in a second");
4033                 return -1;
4034         }
4035
4036         cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4037         if (!cpu_usage)
4038                 return -ENOMEM;
4039
4040         memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
4041         if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4042                 // read cpuacct.usage_percpu instead
4043                 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
4044                 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) {
4045                         rv = -1;
4046                         goto err;
4047                 }
4048                 lxcfs_v("usage_str: %s\n", usage_str);
4049
4050                 // convert cpuacct.usage_percpu into cpuacct.usage_all
4051                 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
4052
4053                 char *data = NULL;
4054                 size_t sz = 0, asz = 0;
4055
4056                 must_strcat(&data, &sz, &asz, "cpu user system\n");
4057
4058                 int i = 0, read_pos = 0, read_cnt=0;
4059                 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
4060                         lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4061                         must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
4062                         i++;
4063                         read_pos += read_cnt;
4064                 }
4065
4066                 free(usage_str);
4067                 usage_str = data;
4068
4069                 lxcfs_v("usage_str: %s\n", usage_str);
4070         }
4071
4072         int read_pos = 0, read_cnt=0;
4073         if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4074                 lxcfs_error("read_cpuacct_usage_all reading first line from "
4075                                 "%s/cpuacct.usage_all failed.\n", cg);
4076                 rv = -1;
4077                 goto err;
4078         }
4079
4080         read_pos += read_cnt;
4081
4082         for (i = 0, j = 0; i < cpucount; i++) {
4083                 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4084                                 &cg_system, &read_cnt);
4085
4086                 if (ret == EOF)
4087                         break;
4088
4089                 if (ret != 3) {
4090                         lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4091                                         "failed.\n", cg);
4092                         rv = -1;
4093                         goto err;
4094                 }
4095
4096                 read_pos += read_cnt;
4097
4098                 /* Convert the time from nanoseconds to USER_HZ */
4099                 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4100                 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4101                 j++;
4102         }
4103
4104         rv = 0;
4105         *return_usage = cpu_usage;
4106         *size = cpucount;
4107
4108 err:
4109         if (usage_str)
4110                 free(usage_str);
4111
4112         if (rv != 0) {
4113                 free(cpu_usage);
4114                 *return_usage = NULL;
4115         }
4116
4117         return rv;
4118 }
4119
4120 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4121 {
4122         int i;
4123         unsigned long sum = 0;
4124
4125         for (i = 0; i < cpu_count; i++) {
4126                 if (!newer[i].online)
4127                         continue;
4128
4129                 /* When cpuset is changed on the fly, the CPUs might get reordered.
4130                  * We could either reset all counters, or check that the substractions
4131                  * below will return expected results.
4132                  */
4133                 if (newer[i].user > older[i].user)
4134                         diff[i].user = newer[i].user - older[i].user;
4135                 else
4136                         diff[i].user = 0;
4137
4138                 if (newer[i].system > older[i].system)
4139                         diff[i].system = newer[i].system - older[i].system;
4140                 else
4141                         diff[i].system = 0;
4142
4143                 if (newer[i].idle > older[i].idle)
4144                         diff[i].idle = newer[i].idle - older[i].idle;
4145                 else
4146                         diff[i].idle = 0;
4147
4148                 sum += diff[i].user;
4149                 sum += diff[i].system;
4150                 sum += diff[i].idle;
4151         }
4152
4153         return sum;
4154 }
4155
4156 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4157 {
4158         unsigned long free_space, to_add;
4159
4160         free_space = threshold - usage->user - usage->system;
4161
4162         if (free_space > usage->idle)
4163                 free_space = usage->idle;
4164
4165         to_add = free_space > *surplus ? *surplus : free_space;
4166
4167         *counter += to_add;
4168         usage->idle -= to_add;
4169         *surplus -= to_add;
4170 }
4171
4172 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4173 {
4174         struct cg_proc_stat *first = NULL, *prev, *tmp;
4175
4176         for (prev = NULL; node; ) {
4177                 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4178                         tmp = node;
4179                         lxcfs_debug("Removing stat node for %s\n", node->cg);
4180
4181                         if (prev)
4182                                 prev->next = node->next;
4183                         else
4184                                 first = node->next;
4185
4186                         node = node->next;
4187                         free_proc_stat_node(tmp);
4188                 } else {
4189                         if (!first)
4190                                 first = node;
4191                         prev = node;
4192                         node = node->next;
4193                 }
4194         }
4195
4196         return first;
4197 }
4198
4199 #define PROC_STAT_PRUNE_INTERVAL 10
4200 static void prune_proc_stat_history(void)
4201 {
4202         int i;
4203         time_t now = time(NULL);
4204
4205         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4206                 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4207
4208                 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4209                         pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4210                         return;
4211                 }
4212
4213                 if (proc_stat_history[i]->next) {
4214                         proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4215                         proc_stat_history[i]->lastcheck = now;
4216                 }
4217
4218                 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4219         }
4220 }
4221
4222 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4223 {
4224         struct cg_proc_stat *node;
4225
4226         pthread_rwlock_rdlock(&head->lock);
4227
4228         if (!head->next) {
4229                 pthread_rwlock_unlock(&head->lock);
4230                 return NULL;
4231         }
4232
4233         node = head->next;
4234
4235         do {
4236                 if (strcmp(cg, node->cg) == 0)
4237                         goto out;
4238         } while ((node = node->next));
4239
4240         node = NULL;
4241
4242 out:
4243         pthread_rwlock_unlock(&head->lock);
4244         prune_proc_stat_history();
4245         return node;
4246 }
4247
4248 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4249 {
4250         struct cg_proc_stat *node;
4251         int i;
4252
4253         node = malloc(sizeof(struct cg_proc_stat));
4254         if (!node)
4255                 goto err;
4256
4257         node->cg = NULL;
4258         node->usage = NULL;
4259         node->view = NULL;
4260
4261         node->cg = malloc(strlen(cg) + 1);
4262         if (!node->cg)
4263                 goto err;
4264
4265         strcpy(node->cg, cg);
4266
4267         node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4268         if (!node->usage)
4269                 goto err;
4270
4271         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4272
4273         node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4274         if (!node->view)
4275                 goto err;
4276
4277         node->cpu_count = cpu_count;
4278         node->next = NULL;
4279
4280         if (pthread_mutex_init(&node->lock, NULL) != 0) {
4281                 lxcfs_error("%s\n", "Failed to initialize node lock");
4282                 goto err;
4283         }
4284
4285         for (i = 0; i < cpu_count; i++) {
4286                 node->view[i].user = 0;
4287                 node->view[i].system = 0;
4288                 node->view[i].idle = 0;
4289         }
4290
4291         return node;
4292
4293 err:
4294         if (node && node->cg)
4295                 free(node->cg);
4296         if (node && node->usage)
4297                 free(node->usage);
4298         if (node && node->view)
4299                 free(node->view);
4300         if (node)
4301                 free(node);
4302
4303         return NULL;
4304 }
4305
4306 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4307 {
4308         int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4309         struct cg_proc_stat_head *head = proc_stat_history[hash];
4310         struct cg_proc_stat *node, *rv = new_node;
4311
4312         pthread_rwlock_wrlock(&head->lock);
4313
4314         if (!head->next) {
4315                 head->next = new_node;
4316                 goto out;
4317         }
4318
4319         node = head->next;
4320
4321         for (;;) {
4322                 if (strcmp(node->cg, new_node->cg) == 0) {
4323                         /* The node is already present, return it */
4324                         free_proc_stat_node(new_node);
4325                         rv = node;
4326                         goto out;
4327                 }
4328
4329                 if (node->next) {
4330                         node = node->next;
4331                         continue;
4332                 }
4333
4334                 node->next = new_node;
4335                 goto out;
4336         }
4337
4338 out:
4339         pthread_rwlock_unlock(&head->lock);
4340         return rv;
4341 }
4342
4343 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4344 {
4345         struct cpuacct_usage *new_usage, *new_view;
4346         int i;
4347
4348         /* Allocate new memory */
4349         new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4350         if (!new_usage)
4351                 return false;
4352
4353         new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4354         if (!new_view) {
4355                 free(new_usage);
4356                 return false;
4357         }
4358
4359         /* Copy existing data & initialize new elements */
4360         for (i = 0; i < cpu_count; i++) {
4361                 if (i < node->cpu_count) {
4362                         new_usage[i].user = node->usage[i].user;
4363                         new_usage[i].system = node->usage[i].system;
4364                         new_usage[i].idle = node->usage[i].idle;
4365
4366                         new_view[i].user = node->view[i].user;
4367                         new_view[i].system = node->view[i].system;
4368                         new_view[i].idle = node->view[i].idle;
4369                 } else {
4370                         new_usage[i].user = 0;
4371                         new_usage[i].system = 0;
4372                         new_usage[i].idle = 0;
4373
4374                         new_view[i].user = 0;
4375                         new_view[i].system = 0;
4376                         new_view[i].idle = 0;
4377                 }
4378         }
4379
4380         free(node->usage);
4381         free(node->view);
4382
4383         node->usage = new_usage;
4384         node->view = new_view;
4385         node->cpu_count = cpu_count;
4386
4387         return true;
4388 }
4389
4390 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4391 {
4392         int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4393         struct cg_proc_stat_head *head = proc_stat_history[hash];
4394         struct cg_proc_stat *node;
4395
4396         node = find_proc_stat_node(head, cg);
4397
4398         if (!node) {
4399                 node = new_proc_stat_node(usage, cpu_count, cg);
4400                 if (!node)
4401                         return NULL;
4402
4403                 node = add_proc_stat_node(node);
4404                 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4405         }
4406
4407         pthread_mutex_lock(&node->lock);
4408
4409         /* If additional CPUs on the host have been enabled, CPU usage counter
4410          * arrays have to be expanded */
4411         if (node->cpu_count < cpu_count) {
4412                 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4413                                 node->cpu_count, cpu_count, cg);
4414
4415                 if (!expand_proc_stat_node(node, cpu_count)) {
4416                         pthread_mutex_unlock(&node->lock);
4417                         lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4418                                         node->cpu_count, cpu_count, cg);
4419                         return NULL;
4420                 }
4421         }
4422
4423         return node;
4424 }
4425
4426 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4427 {
4428         int i;
4429
4430         lxcfs_debug("Resetting stat node for %s\n", node->cg);
4431         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4432
4433         for (i = 0; i < cpu_count; i++) {
4434                 node->view[i].user = 0;
4435                 node->view[i].system = 0;
4436                 node->view[i].idle = 0;
4437         }
4438
4439         node->cpu_count = cpu_count;
4440 }
4441
4442 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4443 {
4444         char *line = NULL;
4445         size_t linelen = 0, total_len = 0, rv = 0, l;
4446         int curcpu = -1; /* cpu numbering starts at 0 */
4447         int physcpu, i;
4448         int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4449         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4450         unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4451         unsigned long user_surplus = 0, system_surplus = 0;
4452         unsigned long total_sum, threshold;
4453         struct cg_proc_stat *stat_node;
4454         struct cpuacct_usage *diff = NULL;
4455         int nprocs = get_nprocs_conf();
4456
4457         if (cg_cpu_usage_size < nprocs)
4458                 nprocs = cg_cpu_usage_size;
4459
4460         /* Read all CPU stats and stop when we've encountered other lines */
4461         while (getline(&line, &linelen, f) != -1) {
4462                 int ret;
4463                 char cpu_char[10]; /* That's a lot of cores */
4464                 uint64_t all_used, cg_used;
4465
4466                 if (strlen(line) == 0)
4467                         continue;
4468                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4469                         /* not a ^cpuN line containing a number N */
4470                         break;
4471                 }
4472
4473                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4474                         continue;
4475
4476                 if (physcpu >= cg_cpu_usage_size)
4477                         continue;
4478
4479                 curcpu ++;
4480                 cpu_cnt ++;
4481
4482                 if (!cpu_in_cpuset(physcpu, cpuset)) {
4483                         for (i = curcpu; i <= physcpu; i++) {
4484                                 cg_cpu_usage[i].online = false;
4485                         }
4486                         continue;
4487                 }
4488
4489                 if (curcpu < physcpu) {
4490                         /* Some CPUs may be disabled */
4491                         for (i = curcpu; i < physcpu; i++)
4492                                 cg_cpu_usage[i].online = false;
4493
4494                         curcpu = physcpu;
4495                 }
4496
4497                 cg_cpu_usage[curcpu].online = true;
4498
4499                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4500                            &user,
4501                            &nice,
4502                            &system,
4503                            &idle,
4504                            &iowait,
4505                            &irq,
4506                            &softirq,
4507                            &steal,
4508                            &guest,
4509                            &guest_nice);
4510
4511                 if (ret != 10)
4512                         continue;
4513
4514                 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4515                 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4516
4517                 if (all_used >= cg_used) {
4518                         cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4519
4520                 } else {
4521                         lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4522                                         "%lu in cpuacct.usage_all; unable to determine idle time\n",
4523                                         curcpu, cg, all_used, cg_used);
4524                         cg_cpu_usage[curcpu].idle = idle;
4525                 }
4526         }
4527
4528         /* Cannot use more CPUs than is available due to cpuset */
4529         if (max_cpus > cpu_cnt)
4530                 max_cpus = cpu_cnt;
4531
4532         stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4533
4534         if (!stat_node) {
4535                 lxcfs_error("unable to find/create stat node for %s\n", cg);
4536                 rv = 0;
4537                 goto err;
4538         }
4539
4540         diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4541         if (!diff) {
4542                 rv = 0;
4543                 goto err;
4544         }
4545
4546         /*
4547          * If the new values are LOWER than values stored in memory, it means
4548          * the cgroup has been reset/recreated and we should reset too.
4549          */
4550         for (curcpu = 0; curcpu < nprocs; curcpu++) {
4551                 if (!cg_cpu_usage[curcpu].online)
4552                         continue;
4553
4554                 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4555                         reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4556
4557                 break;
4558         }
4559
4560         total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4561
4562         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4563                 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4564
4565                 if (!stat_node->usage[curcpu].online)
4566                         continue;
4567
4568                 i++;
4569
4570                 stat_node->usage[curcpu].user += diff[curcpu].user;
4571                 stat_node->usage[curcpu].system += diff[curcpu].system;
4572                 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4573
4574                 if (max_cpus > 0 && i >= max_cpus) {
4575                         user_surplus += diff[curcpu].user;
4576                         system_surplus += diff[curcpu].system;
4577                 }
4578         }
4579
4580         /* Calculate usage counters of visible CPUs */
4581         if (max_cpus > 0) {
4582                 /* threshold = maximum usage per cpu, including idle */
4583                 threshold = total_sum / cpu_cnt * max_cpus;
4584
4585                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4586                         if (!stat_node->usage[curcpu].online)
4587                                 continue;
4588
4589                         i++;
4590
4591                         if (i == max_cpus)
4592                                 break;
4593
4594                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4595                                 continue;
4596
4597                         /* Add user */
4598                         add_cpu_usage(
4599                                         &user_surplus,
4600                                         &diff[curcpu],
4601                                         &diff[curcpu].user,
4602                                         threshold);
4603
4604                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4605                                 continue;
4606
4607                         /* If there is still room, add system */
4608                         add_cpu_usage(
4609                                         &system_surplus,
4610                                         &diff[curcpu],
4611                                         &diff[curcpu].system,
4612                                         threshold);
4613                 }
4614
4615                 if (user_surplus > 0)
4616                         lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4617                 if (system_surplus > 0)
4618                         lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4619
4620                 unsigned long diff_user = 0;
4621                 unsigned long diff_system = 0;
4622                 unsigned long diff_idle = 0;
4623                 unsigned long max_diff_idle = 0;
4624                 unsigned long max_diff_idle_index = 0;
4625                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4626                         if (!stat_node->usage[curcpu].online)
4627                                 continue;
4628
4629                         i++;
4630
4631                         if (i == max_cpus)
4632                                 break;
4633
4634                         stat_node->view[curcpu].user += diff[curcpu].user;
4635                         stat_node->view[curcpu].system += diff[curcpu].system;
4636                         stat_node->view[curcpu].idle += diff[curcpu].idle;
4637
4638                         user_sum += stat_node->view[curcpu].user;
4639                         system_sum += stat_node->view[curcpu].system;
4640                         idle_sum += stat_node->view[curcpu].idle;
4641
4642                         diff_user += diff[curcpu].user;
4643                         diff_system += diff[curcpu].system;
4644                         diff_idle += diff[curcpu].idle;
4645                         if (diff[curcpu].idle > max_diff_idle) {
4646                                 max_diff_idle = diff[curcpu].idle;
4647                                 max_diff_idle_index = curcpu;
4648                         }
4649
4650                         lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4651                 }
4652                 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4653
4654                 // revise cpu usage view to support partial cpu case
4655                 double exact_cpus = exact_cpu_count(cg);
4656                 if (exact_cpus < (double)max_cpus){
4657                         lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4658                         unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4659                         lxcfs_v("delta: %lu\n", delta);
4660                         lxcfs_v("idle_sum before: %lu\n", idle_sum);
4661                         idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4662                         lxcfs_v("idle_sum after: %lu\n", idle_sum);
4663
4664                         curcpu = max_diff_idle_index;
4665                         lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4666                         stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4667                         lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4668                 }
4669         } else {
4670                 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4671                         if (!stat_node->usage[curcpu].online)
4672                                 continue;
4673
4674                         stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4675                         stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4676                         stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4677
4678                         user_sum += stat_node->view[curcpu].user;
4679                         system_sum += stat_node->view[curcpu].system;
4680                         idle_sum += stat_node->view[curcpu].idle;
4681                 }
4682         }
4683
4684         /* Render the file */
4685         /* cpu-all */
4686         l = snprintf(buf, buf_size, "cpu  %lu 0 %lu %lu 0 0 0 0 0 0\n",
4687                         user_sum,
4688                         system_sum,
4689                         idle_sum);
4690         lxcfs_v("cpu-all: %s\n", buf);
4691
4692         if (l < 0) {
4693                 perror("Error writing to cache");
4694                 rv = 0;
4695                 goto err;
4696         }
4697         if (l >= buf_size) {
4698                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4699                 rv = 0;
4700                 goto err;
4701         }
4702
4703         buf += l;
4704         buf_size -= l;
4705         total_len += l;
4706
4707         /* Render visible CPUs */
4708         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4709                 if (!stat_node->usage[curcpu].online)
4710                         continue;
4711
4712                 i++;
4713
4714                 if (max_cpus > 0 && i == max_cpus)
4715                         break;
4716
4717                 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4718                                 i,
4719                                 stat_node->view[curcpu].user,
4720                                 stat_node->view[curcpu].system,
4721                                 stat_node->view[curcpu].idle);
4722                 lxcfs_v("cpu: %s\n", buf);
4723
4724                 if (l < 0) {
4725                         perror("Error writing to cache");
4726                         rv = 0;
4727                         goto err;
4728
4729                 }
4730                 if (l >= buf_size) {
4731                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4732                         rv = 0;
4733                         goto err;
4734                 }
4735
4736                 buf += l;
4737                 buf_size -= l;
4738                 total_len += l;
4739         }
4740
4741         /* Pass the rest of /proc/stat, start with the last line read */
4742         l = snprintf(buf, buf_size, "%s", line);
4743
4744         if (l < 0) {
4745                 perror("Error writing to cache");
4746                 rv = 0;
4747                 goto err;
4748
4749         }
4750         if (l >= buf_size) {
4751                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4752                 rv = 0;
4753                 goto err;
4754         }
4755
4756         buf += l;
4757         buf_size -= l;
4758         total_len += l;
4759
4760         /* Pass the rest of the host's /proc/stat */
4761         while (getline(&line, &linelen, f) != -1) {
4762                 l = snprintf(buf, buf_size, "%s", line);
4763                 if (l < 0) {
4764                         perror("Error writing to cache");
4765                         rv = 0;
4766                         goto err;
4767                 }
4768                 if (l >= buf_size) {
4769                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4770                         rv = 0;
4771                         goto err;
4772                 }
4773                 buf += l;
4774                 buf_size -= l;
4775                 total_len += l;
4776         }
4777
4778         rv = total_len;
4779
4780 err:
4781         if (stat_node)
4782                 pthread_mutex_unlock(&stat_node->lock);
4783         if (line)
4784                 free(line);
4785         if (diff)
4786                 free(diff);
4787         return rv;
4788 }
4789
4790 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4791 static int proc_stat_read(char *buf, size_t size, off_t offset,
4792                 struct fuse_file_info *fi)
4793 {
4794         struct fuse_context *fc = fuse_get_context();
4795         struct file_info *d = (struct file_info *)fi->fh;
4796         char *cg;
4797         char *cpuset = NULL;
4798         char *line = NULL;
4799         size_t linelen = 0, total_len = 0, rv = 0;
4800         int curcpu = -1; /* cpu numbering starts at 0 */
4801         int physcpu = 0;
4802         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4803         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4804                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4805         char cpuall[CPUALL_MAX_SIZE];
4806         /* reserve for cpu all */
4807         char *cache = d->buf + CPUALL_MAX_SIZE;
4808         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4809         FILE *f = NULL;
4810         struct cpuacct_usage *cg_cpu_usage = NULL;
4811         int cg_cpu_usage_size = 0;
4812
4813         if (offset){
4814                 if (offset > d->size)
4815                         return -EINVAL;
4816                 if (!d->cached)
4817                         return 0;
4818                 int left = d->size - offset;
4819                 total_len = left > size ? size: left;
4820                 memcpy(buf, d->buf + offset, total_len);
4821                 return total_len;
4822         }
4823
4824         pid_t initpid = lookup_initpid_in_store(fc->pid);
4825         lxcfs_v("initpid: %d\n", initpid);
4826         if (initpid <= 0)
4827                 initpid = fc->pid;
4828
4829         /*
4830          * when container run with host pid namespace initpid == 1, cgroup will "/"
4831          * we should return host os's /proc contents.
4832          * in some case cpuacct_usage.all in "/" will larger then /proc/stat
4833          */
4834         if (initpid == 1) {
4835             return read_file_fuse("/proc/stat", buf, size, d);
4836         }
4837
4838         cg = get_pid_cgroup(initpid, "cpuset");
4839         lxcfs_v("cg: %s\n", cg);
4840         if (!cg)
4841                 return read_file_fuse("/proc/stat", buf, size, d);
4842         prune_init_slice(cg);
4843
4844         cpuset = get_cpuset(cg);
4845         if (!cpuset)
4846                 goto err;
4847
4848         /*
4849          * Read cpuacct.usage_all for all CPUs.
4850          * If the cpuacct cgroup is present, it is used to calculate the container's
4851          * CPU usage. If not, values from the host's /proc/stat are used.
4852          */
4853         if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4854                 lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
4855                                 "falling back to the host's /proc/stat");
4856         }
4857
4858         f = fopen("/proc/stat", "r");
4859         if (!f)
4860                 goto err;
4861
4862         //skip first line
4863         if (getline(&line, &linelen, f) < 0) {
4864                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4865                 goto err;
4866         }
4867
4868         if (use_cpuview(cg) && cg_cpu_usage) {
4869                 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4870                                 f, d->buf, d->buflen);
4871                 goto out;
4872         }
4873
4874         while (getline(&line, &linelen, f) != -1) {
4875                 ssize_t l;
4876                 char cpu_char[10]; /* That's a lot of cores */
4877                 char *c;
4878                 uint64_t all_used, cg_used, new_idle;
4879                 int ret;
4880
4881                 if (strlen(line) == 0)
4882                         continue;
4883                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4884                         /* not a ^cpuN line containing a number N, just print it */
4885                         l = snprintf(cache, cache_size, "%s", line);
4886                         if (l < 0) {
4887                                 perror("Error writing to cache");
4888                                 rv = 0;
4889                                 goto err;
4890                         }
4891                         if (l >= cache_size) {
4892                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4893                                 rv = 0;
4894                                 goto err;
4895                         }
4896                         cache += l;
4897                         cache_size -= l;
4898                         total_len += l;
4899                         continue;
4900                 }
4901
4902                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4903                         continue;
4904                 if (!cpu_in_cpuset(physcpu, cpuset))
4905                         continue;
4906                 curcpu ++;
4907
4908                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4909                            &user,
4910                            &nice,
4911                            &system,
4912                            &idle,
4913                            &iowait,
4914                            &irq,
4915                            &softirq,
4916                            &steal,
4917                            &guest,
4918                            &guest_nice);
4919
4920                 if (ret != 10 || !cg_cpu_usage) {
4921                         c = strchr(line, ' ');
4922                         if (!c)
4923                                 continue;
4924                         l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4925                         if (l < 0) {
4926                                 perror("Error writing to cache");
4927                                 rv = 0;
4928                                 goto err;
4929
4930                         }
4931                         if (l >= cache_size) {
4932                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4933                                 rv = 0;
4934                                 goto err;
4935                         }
4936
4937                         cache += l;
4938                         cache_size -= l;
4939                         total_len += l;
4940
4941                         if (ret != 10)
4942                                 continue;
4943                 }
4944
4945                 if (cg_cpu_usage) {
4946                         if (physcpu >= cg_cpu_usage_size)
4947                                 break;
4948
4949                         all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4950                         cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4951
4952                         if (all_used >= cg_used) {
4953                                 new_idle = idle + (all_used - cg_used);
4954
4955                         } else {
4956                                 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4957                                                 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4958                                                 curcpu, cg, all_used, cg_used);
4959                                 new_idle = idle;
4960                         }
4961
4962                         l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4963                                         curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4964                                         new_idle);
4965
4966                         if (l < 0) {
4967                                 perror("Error writing to cache");
4968                                 rv = 0;
4969                                 goto err;
4970
4971                         }
4972                         if (l >= cache_size) {
4973                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4974                                 rv = 0;
4975                                 goto err;
4976                         }
4977
4978                         cache += l;
4979                         cache_size -= l;
4980                         total_len += l;
4981
4982                         user_sum += cg_cpu_usage[physcpu].user;
4983                         system_sum += cg_cpu_usage[physcpu].system;
4984                         idle_sum += new_idle;
4985
4986                 } else {
4987                         user_sum += user;
4988                         nice_sum += nice;
4989                         system_sum += system;
4990                         idle_sum += idle;
4991                         iowait_sum += iowait;
4992                         irq_sum += irq;
4993                         softirq_sum += softirq;
4994                         steal_sum += steal;
4995                         guest_sum += guest;
4996                         guest_nice_sum += guest_nice;
4997                 }
4998         }
4999
5000         cache = d->buf;
5001
5002         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5003                         user_sum,
5004                         nice_sum,
5005                         system_sum,
5006                         idle_sum,
5007                         iowait_sum,
5008                         irq_sum,
5009                         softirq_sum,
5010                         steal_sum,
5011                         guest_sum,
5012                         guest_nice_sum);
5013         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
5014                 memcpy(cache, cpuall, cpuall_len);
5015                 cache += cpuall_len;
5016         } else {
5017                 /* shouldn't happen */
5018                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
5019                 cpuall_len = 0;
5020         }
5021
5022         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
5023         total_len += cpuall_len;
5024
5025 out:
5026         d->cached = 1;
5027         d->size = total_len;
5028         if (total_len > size)
5029                 total_len = size;
5030
5031         memcpy(buf, d->buf, total_len);
5032         rv = total_len;
5033
5034 err:
5035         if (f)
5036                 fclose(f);
5037         if (cg_cpu_usage)
5038                 free(cg_cpu_usage);
5039         free(line);
5040         free(cpuset);
5041         free(cg);
5042         return rv;
5043 }
5044
5045 /* This function retrieves the busy time of a group of tasks by looking at
5046  * cpuacct.usage. Unfortunately, this only makes sense when the container has
5047  * been given it's own cpuacct cgroup. If not, this function will take the busy
5048  * time of all other taks that do not actually belong to the container into
5049  * account as well. If someone has a clever solution for this please send a
5050  * patch!
5051  */
5052 static double get_reaper_busy(pid_t task)
5053 {
5054         pid_t initpid = lookup_initpid_in_store(task);
5055         char *cgroup = NULL, *usage_str = NULL;
5056         unsigned long usage = 0;
5057         double res = 0;
5058
5059         if (initpid <= 0)
5060                 return 0;
5061
5062         cgroup = get_pid_cgroup(initpid, "cpuacct");
5063         if (!cgroup)
5064                 goto out;
5065         prune_init_slice(cgroup);
5066         if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str))
5067                 goto out;
5068         usage = strtoul(usage_str, NULL, 10);
5069         res = (double)usage / 1000000000;
5070
5071 out:
5072         free(cgroup);
5073         free(usage_str);
5074         return res;
5075 }
5076
5077 #if RELOADTEST
5078 void iwashere(void)
5079 {
5080         int fd;
5081
5082         fd = creat("/tmp/lxcfs-iwashere", 0644);
5083         if (fd >= 0)
5084                 close(fd);
5085 }
5086 #endif
5087
5088 /*
5089  * We read /proc/uptime and reuse its second field.
5090  * For the first field, we use the mtime for the reaper for
5091  * the calling pid as returned by getreaperage
5092  */
5093 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5094                 struct fuse_file_info *fi)
5095 {
5096         struct fuse_context *fc = fuse_get_context();
5097         struct file_info *d = (struct file_info *)fi->fh;
5098         double busytime = get_reaper_busy(fc->pid);
5099         char *cache = d->buf;
5100         ssize_t total_len = 0;
5101         double idletime, reaperage;
5102
5103 #if RELOADTEST
5104         iwashere();
5105 #endif
5106
5107         if (offset){
5108                 if (!d->cached)
5109                         return 0;
5110                 if (offset > d->size)
5111                         return -EINVAL;
5112                 int left = d->size - offset;
5113                 total_len = left > size ? size: left;
5114                 memcpy(buf, cache + offset, total_len);
5115                 return total_len;
5116         }
5117
5118         reaperage = get_reaper_age(fc->pid);
5119         /* To understand why this is done, please read the comment to the
5120          * get_reaper_busy() function.
5121          */
5122         idletime = reaperage;
5123         if (reaperage >= busytime)
5124                 idletime = reaperage - busytime;
5125
5126         total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
5127         if (total_len < 0 || total_len >=  d->buflen){
5128                 lxcfs_error("%s\n", "failed to write to cache");
5129                 return 0;
5130         }
5131
5132         d->size = (int)total_len;
5133         d->cached = 1;
5134
5135         if (total_len > size) total_len = size;
5136
5137         memcpy(buf, d->buf, total_len);
5138         return total_len;
5139 }
5140
5141 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5142                                struct fuse_file_info *fi)
5143 {
5144         __do_free char *cg = NULL, *io_serviced_str = NULL,
5145                        *io_merged_str = NULL, *io_service_bytes_str = NULL,
5146                        *io_wait_time_str = NULL, *io_service_time_str = NULL,
5147                        *line = NULL;
5148         __do_fclose FILE *f = NULL;
5149         struct fuse_context *fc = fuse_get_context();
5150         struct file_info *d = (struct file_info *)fi->fh;
5151         unsigned long read = 0, write = 0;
5152         unsigned long read_merged = 0, write_merged = 0;
5153         unsigned long read_sectors = 0, write_sectors = 0;
5154         unsigned long read_ticks = 0, write_ticks = 0;
5155         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5156         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5157         char *cache = d->buf;
5158         size_t cache_size = d->buflen;
5159         size_t linelen = 0, total_len = 0;
5160         unsigned int major = 0, minor = 0;
5161         int i = 0;
5162         int ret;
5163         char dev_name[72];
5164
5165         if (offset){
5166                 int left;
5167
5168                 if (offset > d->size)
5169                         return -EINVAL;
5170
5171                 if (!d->cached)
5172                         return 0;
5173
5174                 left = d->size - offset;
5175                 total_len = left > size ? size: left;
5176                 memcpy(buf, cache + offset, total_len);
5177
5178                 return total_len;
5179         }
5180
5181         pid_t initpid = lookup_initpid_in_store(fc->pid);
5182         if (initpid <= 1 || is_shared_pidns(initpid))
5183                 initpid = fc->pid;
5184         cg = get_pid_cgroup(initpid, "blkio");
5185         if (!cg)
5186                 return read_file_fuse("/proc/diskstats", buf, size, d);
5187         prune_init_slice(cg);
5188
5189         ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
5190         if (ret < 0) {
5191                 if (ret == -EOPNOTSUPP)
5192                         return read_file_fuse("/proc/diskstats", buf, size, d);
5193         }
5194
5195         ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
5196         if (ret < 0) {
5197                 if (ret == -EOPNOTSUPP)
5198                         return read_file_fuse("/proc/diskstats", buf, size, d);
5199         }
5200
5201         ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
5202         if (ret < 0) {
5203                 if (ret == -EOPNOTSUPP)
5204                         return read_file_fuse("/proc/diskstats", buf, size, d);
5205         }
5206
5207         ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
5208         if (ret < 0) {
5209                 if (ret == -EOPNOTSUPP)
5210                         return read_file_fuse("/proc/diskstats", buf, size, d);
5211         }
5212
5213         ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
5214         if (ret < 0) {
5215                 if (ret == -EOPNOTSUPP)
5216                         return read_file_fuse("/proc/diskstats", buf, size, d);
5217         }
5218
5219         f = fopen("/proc/diskstats", "r");
5220         if (!f)
5221                 return 0;
5222
5223         while (getline(&line, &linelen, f) != -1) {
5224                 ssize_t l;
5225                 char lbuf[256];
5226
5227                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5228                 if (i != 3)
5229                         continue;
5230
5231                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5232                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5233                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5234                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5235                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5236                 read_sectors = read_sectors/512;
5237                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5238                 write_sectors = write_sectors/512;
5239
5240                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5241                 rd_svctm = rd_svctm/1000000;
5242                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5243                 rd_wait = rd_wait/1000000;
5244                 read_ticks = rd_svctm + rd_wait;
5245
5246                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5247                 wr_svctm =  wr_svctm/1000000;
5248                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5249                 wr_wait =  wr_wait/1000000;
5250                 write_ticks = wr_svctm + wr_wait;
5251
5252                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5253                 tot_ticks =  tot_ticks/1000000;
5254
5255                 memset(lbuf, 0, 256);
5256                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5257                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5258                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5259                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5260                 else
5261                         continue;
5262
5263                 l = snprintf(cache, cache_size, "%s", lbuf);
5264                 if (l < 0) {
5265                         perror("Error writing to fuse buf");
5266                         return 0;
5267                 }
5268                 if (l >= cache_size) {
5269                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5270                         return 0;
5271                 }
5272                 cache += l;
5273                 cache_size -= l;
5274                 total_len += l;
5275         }
5276
5277         d->cached = 1;
5278         d->size = total_len;
5279         if (total_len > size ) total_len = size;
5280         memcpy(buf, d->buf, total_len);
5281
5282         return total_len;
5283 }
5284
5285 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5286                            struct fuse_file_info *fi)
5287 {
5288         __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5289                        *memswusage_str = NULL;
5290         struct fuse_context *fc = fuse_get_context();
5291         struct file_info *d = (struct file_info *)fi->fh;
5292         unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5293                       memswusage = 0, swap_total = 0, swap_free = 0;
5294         ssize_t total_len = 0;
5295         ssize_t l = 0;
5296         char *cache = d->buf;
5297         int ret;
5298
5299         if (offset) {
5300                 int left;
5301
5302                 if (offset > d->size)
5303                         return -EINVAL;
5304
5305                 if (!d->cached)
5306                         return 0;
5307
5308                 left = d->size - offset;
5309                 total_len = left > size ? size: left;
5310                 memcpy(buf, cache + offset, total_len);
5311
5312                 return total_len;
5313         }
5314
5315         pid_t initpid = lookup_initpid_in_store(fc->pid);
5316         if (initpid <= 1 || is_shared_pidns(initpid))
5317                 initpid = fc->pid;
5318         cg = get_pid_cgroup(initpid, "memory");
5319         if (!cg)
5320                 return read_file_fuse("/proc/swaps", buf, size, d);
5321         prune_init_slice(cg);
5322
5323         memlimit = get_min_memlimit(cg, false);
5324
5325         ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5326         if (ret < 0)
5327                 return 0;
5328
5329         memusage = strtoul(memusage_str, NULL, 10);
5330
5331         ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5332         if (ret >= 0)
5333                 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5334         if (ret >= 0) {
5335                 memswlimit = get_min_memlimit(cg, true);
5336                 memswusage = strtoul(memswusage_str, NULL, 10);
5337                 swap_total = (memswlimit - memlimit) / 1024;
5338                 swap_free = (memswusage - memusage) / 1024;
5339         }
5340
5341         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5342
5343         /* When no mem + swap limit is specified or swapaccount=0*/
5344         if (!memswlimit) {
5345                 __do_free char *line = NULL;
5346                 __do_fclose FILE *f = NULL;
5347                 size_t linelen = 0;
5348
5349                 f = fopen("/proc/meminfo", "r");
5350                 if (!f)
5351                         return 0;
5352
5353                 while (getline(&line, &linelen, f) != -1) {
5354                         if (startswith(line, "SwapTotal:"))
5355                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
5356                         else if (startswith(line, "SwapFree:"))
5357                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
5358                 }
5359         }
5360
5361         if (swap_total > 0) {
5362                 l = snprintf(d->buf + total_len, d->size - total_len,
5363                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5364                                 swap_total, swap_free);
5365                 total_len += l;
5366         }
5367
5368         if (total_len < 0 || l < 0) {
5369                 perror("Error writing to cache");
5370                 return 0;
5371         }
5372
5373         d->cached = 1;
5374         d->size = (int)total_len;
5375
5376         if (total_len > size) total_len = size;
5377         memcpy(buf, d->buf, total_len);
5378         return total_len;
5379 }
5380
5381 /*
5382  * Find the process pid from cgroup path.
5383  * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5384  * @pid_buf : put pid to pid_buf.
5385  * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5386  * @depth : the depth of cgroup in container.
5387  * @sum : return the number of pid.
5388  * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5389  */
5390 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5391 {
5392         DIR *dir;
5393         int fd;
5394         struct dirent *file;
5395         FILE *f = NULL;
5396         size_t linelen = 0;
5397         char *line = NULL;
5398         int pd;
5399         char *path_dir, *path;
5400         char **pid;
5401
5402         /* path = dpath + "/cgroup.procs" + /0 */
5403         do {
5404                 path = malloc(strlen(dpath) + 20);
5405         } while (!path);
5406
5407         strcpy(path, dpath);
5408         fd = openat(cfd, path, O_RDONLY);
5409         if (fd < 0)
5410                 goto out;
5411
5412         dir = fdopendir(fd);
5413         if (dir == NULL) {
5414                 close(fd);
5415                 goto out;
5416         }
5417
5418         while (((file = readdir(dir)) != NULL) && depth > 0) {
5419                 if (strncmp(file->d_name, ".", 1) == 0)
5420                         continue;
5421                 if (strncmp(file->d_name, "..", 1) == 0)
5422                         continue;
5423                 if (file->d_type == DT_DIR) {
5424                         /* path + '/' + d_name +/0 */
5425                         do {
5426                                 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5427                         } while (!path_dir);
5428                         strcpy(path_dir, path);
5429                         strcat(path_dir, "/");
5430                         strcat(path_dir, file->d_name);
5431                         pd = depth - 1;
5432                         sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5433                         free(path_dir);
5434                 }
5435         }
5436         closedir(dir);
5437
5438         strcat(path, "/cgroup.procs");
5439         fd = openat(cfd, path, O_RDONLY);
5440         if (fd < 0)
5441                 goto out;
5442
5443         f = fdopen(fd, "r");
5444         if (!f) {
5445                 close(fd);
5446                 goto out;
5447         }
5448
5449         while (getline(&line, &linelen, f) != -1) {
5450                 do {
5451                         pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5452                 } while (!pid);
5453                 *pid_buf = pid;
5454                 do {
5455                         *(*pid_buf + sum) = malloc(strlen(line) + 1);
5456                 } while (*(*pid_buf + sum) == NULL);
5457                 strcpy(*(*pid_buf + sum), line);
5458                 sum++;
5459         }
5460         fclose(f);
5461 out:
5462         if (line)
5463                 free(line);
5464         free(path);
5465         return sum;
5466 }
5467 /*
5468  * calc_load calculates the load according to the following formula:
5469  * load1 = load0 * exp + active * (1 - exp)
5470  *
5471  * @load1: the new loadavg.
5472  * @load0: the former loadavg.
5473  * @active: the total number of running pid at this moment.
5474  * @exp: the fixed-point defined in the beginning.
5475  */
5476 static unsigned long
5477 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5478 {
5479         unsigned long newload;
5480
5481         active = active > 0 ? active * FIXED_1 : 0;
5482         newload = load * exp + active * (FIXED_1 - exp);
5483         if (active >= load)
5484                 newload += FIXED_1 - 1;
5485
5486         return newload / FIXED_1;
5487 }
5488
5489 /*
5490  * Return 0 means that container p->cg is closed.
5491  * Return -1 means that error occurred in refresh.
5492  * Positive num equals the total number of pid.
5493  */
5494 static int refresh_load(struct load_node *p, char *path)
5495 {
5496         FILE *f = NULL;
5497         char **idbuf;
5498         char proc_path[256];
5499         int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5500         char *line = NULL;
5501         size_t linelen = 0;
5502         int sum, length;
5503         DIR *dp;
5504         struct dirent *file;
5505
5506         do {
5507                 idbuf = malloc(sizeof(char *));
5508         } while (!idbuf);
5509         sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5510         /*  normal exit  */
5511         if (sum == 0)
5512                 goto out;
5513
5514         for (i = 0; i < sum; i++) {
5515                 /*clean up '\n' */
5516                 length = strlen(idbuf[i])-1;
5517                 idbuf[i][length] = '\0';
5518                 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5519                 if (ret < 0 || ret > 255) {
5520                         lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5521                         i = sum;
5522                         sum = -1;
5523                         goto err_out;
5524                 }
5525
5526                 dp = opendir(proc_path);
5527                 if (!dp) {
5528                         lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5529                         continue;
5530                 }
5531                 while ((file = readdir(dp)) != NULL) {
5532                         if (strncmp(file->d_name, ".", 1) == 0)
5533                                 continue;
5534                         if (strncmp(file->d_name, "..", 1) == 0)
5535                                 continue;
5536                         total_pid++;
5537                         /* We make the biggest pid become last_pid.*/
5538                         ret = atof(file->d_name);
5539                         last_pid = (ret > last_pid) ? ret : last_pid;
5540
5541                         ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5542                         if (ret < 0 || ret > 255) {
5543                                 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5544                                 i = sum;
5545                                 sum = -1;
5546                                 closedir(dp);
5547                                 goto err_out;
5548                         }
5549                         f = fopen(proc_path, "r");
5550                         if (f != NULL) {
5551                                 while (getline(&line, &linelen, f) != -1) {
5552                                         /* Find State */
5553                                         if ((line[0] == 'S') && (line[1] == 't'))
5554                                                 break;
5555                                 }
5556                         if ((line[7] == 'R') || (line[7] == 'D'))
5557                                 run_pid++;
5558                         fclose(f);
5559                         }
5560                 }
5561                 closedir(dp);
5562         }
5563         /*Calculate the loadavg.*/
5564         p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5565         p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5566         p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5567         p->run_pid = run_pid;
5568         p->total_pid = total_pid;
5569         p->last_pid = last_pid;
5570
5571         free(line);
5572 err_out:
5573         for (; i > 0; i--)
5574                 free(idbuf[i-1]);
5575 out:
5576         free(idbuf);
5577         return sum;
5578 }
5579 /*
5580  * Traverse the hash table and update it.
5581  */
5582 void *load_begin(void *arg)
5583 {
5584
5585         char *path = NULL;
5586         int i, sum, length, ret;
5587         struct load_node *f;
5588         int first_node;
5589         clock_t time1, time2;
5590
5591         while (1) {
5592                 if (loadavg_stop == 1)
5593                         return NULL;
5594
5595                 time1 = clock();
5596                 for (i = 0; i < LOAD_SIZE; i++) {
5597                         pthread_mutex_lock(&load_hash[i].lock);
5598                         if (load_hash[i].next == NULL) {
5599                                 pthread_mutex_unlock(&load_hash[i].lock);
5600                                 continue;
5601                         }
5602                         f = load_hash[i].next;
5603                         first_node = 1;
5604                         while (f) {
5605                                 length = strlen(f->cg) + 2;
5606                                 do {
5607                                         /* strlen(f->cg) + '.' or '' + \0 */
5608                                         path = malloc(length);
5609                                 } while (!path);
5610
5611                                 ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
5612                                 if (ret < 0 || ret > length - 1) {
5613                                         /* snprintf failed, ignore the node.*/
5614                                         lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5615                                         goto out;
5616                                 }
5617                                 sum = refresh_load(f, path);
5618                                 if (sum == 0) {
5619                                         f = del_node(f, i);
5620                                 } else {
5621 out:                                    f = f->next;
5622                                 }
5623                                 free(path);
5624                                 /* load_hash[i].lock locks only on the first node.*/
5625                                 if (first_node == 1) {
5626                                         first_node = 0;
5627                                         pthread_mutex_unlock(&load_hash[i].lock);
5628                                 }
5629                         }
5630                 }
5631
5632                 if (loadavg_stop == 1)
5633                         return NULL;
5634
5635                 time2 = clock();
5636                 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5637         }
5638 }
5639
5640 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5641                 struct fuse_file_info *fi)
5642 {
5643         struct fuse_context *fc = fuse_get_context();
5644         struct file_info *d = (struct file_info *)fi->fh;
5645         pid_t initpid;
5646         char *cg;
5647         size_t total_len = 0;
5648         char *cache = d->buf;
5649         struct load_node *n;
5650         int hash;
5651         int cfd, rv = 0;
5652         unsigned long a, b, c;
5653
5654         if (offset) {
5655                 if (offset > d->size)
5656                         return -EINVAL;
5657                 if (!d->cached)
5658                         return 0;
5659                 int left = d->size - offset;
5660                 total_len = left > size ? size : left;
5661                 memcpy(buf, cache + offset, total_len);
5662                 return total_len;
5663         }
5664         if (!loadavg)
5665                 return read_file_fuse("/proc/loadavg", buf, size, d);
5666
5667         initpid = lookup_initpid_in_store(fc->pid);
5668         if (initpid <= 1 || is_shared_pidns(initpid))
5669                 initpid = fc->pid;
5670         cg = get_pid_cgroup(initpid, "cpu");
5671         if (!cg)
5672                 return read_file_fuse("/proc/loadavg", buf, size, d);
5673
5674         prune_init_slice(cg);
5675         hash = calc_hash(cg) % LOAD_SIZE;
5676         n = locate_node(cg, hash);
5677
5678         /* First time */
5679         if (n == NULL) {
5680                 cfd = find_mounted_controller("cpu");
5681                 if (cfd >= 0) {
5682                         /*
5683                          * In locate_node() above, pthread_rwlock_unlock() isn't used
5684                          * because delete is not allowed before read has ended.
5685                          */
5686                         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5687                         rv = 0;
5688                         goto err;
5689                 }
5690                 do {
5691                         n = malloc(sizeof(struct load_node));
5692                 } while (!n);
5693
5694                 do {
5695                         n->cg = malloc(strlen(cg)+1);
5696                 } while (!n->cg);
5697                 strcpy(n->cg, cg);
5698                 n->avenrun[0] = 0;
5699                 n->avenrun[1] = 0;
5700                 n->avenrun[2] = 0;
5701                 n->run_pid = 0;
5702                 n->total_pid = 1;
5703                 n->last_pid = initpid;
5704                 n->cfd = cfd;
5705                 insert_node(&n, hash);
5706         }
5707         a = n->avenrun[0] + (FIXED_1/200);
5708         b = n->avenrun[1] + (FIXED_1/200);
5709         c = n->avenrun[2] + (FIXED_1/200);
5710         total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5711                 LOAD_INT(a), LOAD_FRAC(a),
5712                 LOAD_INT(b), LOAD_FRAC(b),
5713                 LOAD_INT(c), LOAD_FRAC(c),
5714                 n->run_pid, n->total_pid, n->last_pid);
5715         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5716         if (total_len < 0 || total_len >=  d->buflen) {
5717                 lxcfs_error("%s\n", "Failed to write to cache");
5718                 rv = 0;
5719                 goto err;
5720         }
5721         d->size = (int)total_len;
5722         d->cached = 1;
5723
5724         if (total_len > size)
5725                 total_len = size;
5726         memcpy(buf, d->buf, total_len);
5727         rv = total_len;
5728
5729 err:
5730         free(cg);
5731         return rv;
5732 }
5733 /* Return a positive number on success, return 0 on failure.*/
5734 pthread_t load_daemon(int load_use)
5735 {
5736         int ret;
5737         pthread_t pid;
5738
5739         ret = init_load();
5740         if (ret == -1) {
5741                 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5742                 return 0;
5743         }
5744         ret = pthread_create(&pid, NULL, load_begin, NULL);
5745         if (ret != 0) {
5746                 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5747                 load_free();
5748                 return 0;
5749         }
5750         /* use loadavg, here loadavg = 1*/
5751         loadavg = load_use;
5752         return pid;
5753 }
5754
5755 /* Returns 0 on success. */
5756 int stop_load_daemon(pthread_t pid)
5757 {
5758         int s;
5759
5760         /* Signal the thread to gracefully stop */
5761         loadavg_stop = 1;
5762
5763         s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5764         if (s != 0) {
5765                 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5766                 return -1;
5767         }
5768
5769         load_free();
5770         loadavg_stop = 0;
5771
5772         return 0;
5773 }
5774
5775 static off_t get_procfile_size(const char *which)
5776 {
5777         FILE *f = fopen(which, "r");
5778         char *line = NULL;
5779         size_t len = 0;
5780         ssize_t sz, answer = 0;
5781         if (!f)
5782                 return 0;
5783
5784         while ((sz = getline(&line, &len, f)) != -1)
5785                 answer += sz;
5786         fclose (f);
5787         free(line);
5788
5789         return answer;
5790 }
5791
5792 int proc_getattr(const char *path, struct stat *sb)
5793 {
5794         struct timespec now;
5795
5796         memset(sb, 0, sizeof(struct stat));
5797         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5798                 return -EINVAL;
5799         sb->st_uid = sb->st_gid = 0;
5800         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5801         if (strcmp(path, "/proc") == 0) {
5802                 sb->st_mode = S_IFDIR | 00555;
5803                 sb->st_nlink = 2;
5804                 return 0;
5805         }
5806         if (strcmp(path, "/proc/meminfo") == 0 ||
5807                         strcmp(path, "/proc/cpuinfo") == 0 ||
5808                         strcmp(path, "/proc/uptime") == 0 ||
5809                         strcmp(path, "/proc/stat") == 0 ||
5810                         strcmp(path, "/proc/diskstats") == 0 ||
5811                         strcmp(path, "/proc/swaps") == 0 ||
5812                         strcmp(path, "/proc/loadavg") == 0) {
5813                 sb->st_size = 0;
5814                 sb->st_mode = S_IFREG | 00444;
5815                 sb->st_nlink = 1;
5816                 return 0;
5817         }
5818
5819         return -ENOENT;
5820 }
5821
5822 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5823                 struct fuse_file_info *fi)
5824 {
5825         if (filler(buf, ".", NULL, 0) != 0 ||
5826             filler(buf, "..", NULL, 0) != 0 ||
5827             filler(buf, "cpuinfo", NULL, 0) != 0 ||
5828             filler(buf, "meminfo", NULL, 0) != 0 ||
5829             filler(buf, "stat", NULL, 0) != 0 ||
5830             filler(buf, "uptime", NULL, 0) != 0 ||
5831             filler(buf, "diskstats", NULL, 0) != 0 ||
5832             filler(buf, "swaps", NULL, 0) != 0   ||
5833             filler(buf, "loadavg", NULL, 0) != 0)
5834                 return -EINVAL;
5835         return 0;
5836 }
5837
5838 int proc_open(const char *path, struct fuse_file_info *fi)
5839 {
5840         int type = -1;
5841         struct file_info *info;
5842
5843         if (strcmp(path, "/proc/meminfo") == 0)
5844                 type = LXC_TYPE_PROC_MEMINFO;
5845         else if (strcmp(path, "/proc/cpuinfo") == 0)
5846                 type = LXC_TYPE_PROC_CPUINFO;
5847         else if (strcmp(path, "/proc/uptime") == 0)
5848                 type = LXC_TYPE_PROC_UPTIME;
5849         else if (strcmp(path, "/proc/stat") == 0)
5850                 type = LXC_TYPE_PROC_STAT;
5851         else if (strcmp(path, "/proc/diskstats") == 0)
5852                 type = LXC_TYPE_PROC_DISKSTATS;
5853         else if (strcmp(path, "/proc/swaps") == 0)
5854                 type = LXC_TYPE_PROC_SWAPS;
5855         else if (strcmp(path, "/proc/loadavg") == 0)
5856                 type = LXC_TYPE_PROC_LOADAVG;
5857         if (type == -1)
5858                 return -ENOENT;
5859
5860         info = malloc(sizeof(*info));
5861         if (!info)
5862                 return -ENOMEM;
5863
5864         memset(info, 0, sizeof(*info));
5865         info->type = type;
5866
5867         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5868         do {
5869                 info->buf = malloc(info->buflen);
5870         } while (!info->buf);
5871         memset(info->buf, 0, info->buflen);
5872         /* set actual size to buffer size */
5873         info->size = info->buflen;
5874
5875         fi->fh = (unsigned long)info;
5876         return 0;
5877 }
5878
5879 int proc_access(const char *path, int mask)
5880 {
5881         if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5882                 return 0;
5883
5884         /* these are all read-only */
5885         if ((mask & ~R_OK) != 0)
5886                 return -EACCES;
5887         return 0;
5888 }
5889
5890 int proc_release(const char *path, struct fuse_file_info *fi)
5891 {
5892         do_release_file_info(fi);
5893         return 0;
5894 }
5895
5896 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5897                 struct fuse_file_info *fi)
5898 {
5899         struct file_info *f = (struct file_info *) fi->fh;
5900
5901         switch (f->type) {
5902         case LXC_TYPE_PROC_MEMINFO:
5903                 return proc_meminfo_read(buf, size, offset, fi);
5904         case LXC_TYPE_PROC_CPUINFO:
5905                 return proc_cpuinfo_read(buf, size, offset, fi);
5906         case LXC_TYPE_PROC_UPTIME:
5907                 return proc_uptime_read(buf, size, offset, fi);
5908         case LXC_TYPE_PROC_STAT:
5909                 return proc_stat_read(buf, size, offset, fi);
5910         case LXC_TYPE_PROC_DISKSTATS:
5911                 return proc_diskstats_read(buf, size, offset, fi);
5912         case LXC_TYPE_PROC_SWAPS:
5913                 return proc_swaps_read(buf, size, offset, fi);
5914         case LXC_TYPE_PROC_LOADAVG:
5915                 return proc_loadavg_read(buf, size, offset, fi);
5916         default:
5917                 return -EINVAL;
5918         }
5919 }
5920
5921 /*
5922  * Functions needed to setup cgroups in the __constructor__.
5923  */
5924
5925 static bool umount_if_mounted(void)
5926 {
5927         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5928                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5929                 return false;
5930         }
5931         return true;
5932 }
5933
5934 /* __typeof__ should be safe to use with all compilers. */
5935 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5936 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5937 {
5938         return (fs->f_type == (fs_type_magic)magic_val);
5939 }
5940
5941 /*
5942  * looking at fs/proc_namespace.c, it appears we can
5943  * actually expect the rootfs entry to very specifically contain
5944  * " - rootfs rootfs "
5945  * IIUC, so long as we've chrooted so that rootfs is not our root,
5946  * the rootfs entry should always be skipped in mountinfo contents.
5947  */
5948 static bool is_on_ramfs(void)
5949 {
5950         FILE *f;
5951         char *p, *p2;
5952         char *line = NULL;
5953         size_t len = 0;
5954         int i;
5955
5956         f = fopen("/proc/self/mountinfo", "r");
5957         if (!f)
5958                 return false;
5959
5960         while (getline(&line, &len, f) != -1) {
5961                 for (p = line, i = 0; p && i < 4; i++)
5962                         p = strchr(p + 1, ' ');
5963                 if (!p)
5964                         continue;
5965                 p2 = strchr(p + 1, ' ');
5966                 if (!p2)
5967                         continue;
5968                 *p2 = '\0';
5969                 if (strcmp(p + 1, "/") == 0) {
5970                         // this is '/'.  is it the ramfs?
5971                         p = strchr(p2 + 1, '-');
5972                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5973                                 free(line);
5974                                 fclose(f);
5975                                 return true;
5976                         }
5977                 }
5978         }
5979         free(line);
5980         fclose(f);
5981         return false;
5982 }
5983
5984 static int pivot_enter()
5985 {
5986         int ret = -1, oldroot = -1, newroot = -1;
5987
5988         oldroot = open("/", O_DIRECTORY | O_RDONLY);
5989         if (oldroot < 0) {
5990                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5991                 return ret;
5992         }
5993
5994         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5995         if (newroot < 0) {
5996                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5997                 goto err;
5998         }
5999
6000         /* change into new root fs */
6001         if (fchdir(newroot) < 0) {
6002                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
6003                 goto err;
6004         }
6005
6006         /* pivot_root into our new root fs */
6007         if (pivot_root(".", ".") < 0) {
6008                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
6009                 goto err;
6010         }
6011
6012         /*
6013          * At this point the old-root is mounted on top of our new-root.
6014          * To unmounted it we must not be chdir'd into it, so escape back
6015          * to the old-root.
6016          */
6017         if (fchdir(oldroot) < 0) {
6018                 lxcfs_error("%s\n", "Failed to enter old root.");
6019                 goto err;
6020         }
6021
6022         if (umount2(".", MNT_DETACH) < 0) {
6023                 lxcfs_error("%s\n", "Failed to detach old root.");
6024                 goto err;
6025         }
6026
6027         if (fchdir(newroot) < 0) {
6028                 lxcfs_error("%s\n", "Failed to re-enter new root.");
6029                 goto err;
6030         }
6031
6032         ret = 0;
6033
6034 err:
6035         if (oldroot > 0)
6036                 close(oldroot);
6037         if (newroot > 0)
6038                 close(newroot);
6039
6040         return ret;
6041 }
6042
6043 static int chroot_enter()
6044 {
6045         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6046                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6047                 return -1;
6048         }
6049
6050         if (chroot(".") < 0) {
6051                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6052                 return -1;
6053         }
6054
6055         if (chdir("/") < 0) {
6056                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6057                 return -1;
6058         }
6059
6060         return 0;
6061 }
6062
6063 static int permute_and_enter(void)
6064 {
6065         struct statfs sb;
6066
6067         if (statfs("/", &sb) < 0) {
6068                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6069                 return -1;
6070         }
6071
6072         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6073          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6074          * /proc/1/mountinfo. */
6075         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6076                 return chroot_enter();
6077
6078         if (pivot_enter() < 0) {
6079                 lxcfs_error("%s\n", "Could not perform pivot root.");
6080                 return -1;
6081         }
6082
6083         return 0;
6084 }
6085
6086 /* Prepare our new clean root. */
6087 static int permute_prepare(void)
6088 {
6089         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6090                 lxcfs_error("%s\n", "Failed to create directory for new root.");
6091                 return -1;
6092         }
6093
6094         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6095                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6096                 return -1;
6097         }
6098
6099         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6100                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6101                 return -1;
6102         }
6103
6104         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6105                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6106                 return -1;
6107         }
6108
6109         return 0;
6110 }
6111
6112 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6113 static bool permute_root(void)
6114 {
6115         /* Prepare new root. */
6116         if (permute_prepare() < 0)
6117                 return false;
6118
6119         /* Pivot into new root. */
6120         if (permute_and_enter() < 0)
6121                 return false;
6122
6123         return true;
6124 }
6125
6126 static int preserve_mnt_ns(int pid)
6127 {
6128         int ret;
6129         size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6130         char path[len];
6131
6132         ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6133         if (ret < 0 || (size_t)ret >= len)
6134                 return -1;
6135
6136         return open(path, O_RDONLY | O_CLOEXEC);
6137 }
6138
6139 static bool cgfs_prepare_mounts(void)
6140 {
6141         if (!mkdir_p(BASEDIR, 0700)) {
6142                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6143                 return false;
6144         }
6145
6146         if (!umount_if_mounted()) {
6147                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6148                 return false;
6149         }
6150
6151         if (unshare(CLONE_NEWNS) < 0) {
6152                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6153                 return false;
6154         }
6155
6156         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6157         if (cgroup_mount_ns_fd < 0) {
6158                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6159                 return false;
6160         }
6161
6162         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6163                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6164                 return false;
6165         }
6166
6167         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6168                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6169                 return false;
6170         }
6171
6172         return true;
6173 }
6174
6175 static bool cgfs_mount_hierarchies(void)
6176 {
6177         if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
6178                 return false;
6179
6180         if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
6181                 return false;
6182
6183         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
6184                 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
6185                 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
6186                 if ((*h)->fd < 0)
6187                         return false;
6188         }
6189
6190         return true;
6191 }
6192
6193 static bool cgfs_setup_controllers(void)
6194 {
6195         if (!cgfs_prepare_mounts())
6196                 return false;
6197
6198         if (!cgfs_mount_hierarchies()) {
6199                 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6200                 return false;
6201         }
6202
6203         if (!permute_root())
6204                 return false;
6205
6206         return true;
6207 }
6208
6209 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6210 {
6211         char *cret;
6212         char cwd[MAXPATHLEN];
6213         int init_ns = -1;
6214
6215         cgroup_ops = cgroup_init();
6216         if (!cgroup_ops)
6217                 return;
6218
6219         /* Preserve initial namespace. */
6220         init_ns = preserve_mnt_ns(getpid());
6221         if (init_ns < 0) {
6222                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6223                 goto out;
6224         }
6225
6226         cret = getcwd(cwd, MAXPATHLEN);
6227         if (!cret)
6228                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6229
6230         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6231          * to privately mount lxcfs cgroups. */
6232         if (!cgfs_setup_controllers()) {
6233                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6234                 goto out;
6235         }
6236
6237         if (setns(init_ns, 0) < 0) {
6238                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6239                 goto out;
6240         }
6241
6242         if (!cret || chdir(cwd) < 0)
6243                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6244
6245         if (!init_cpuview()) {
6246                 lxcfs_error("%s\n", "failed to init CPU view");
6247                 goto out;
6248         }
6249
6250         print_subsystems();
6251
6252 out:
6253         if (init_ns >= 0)
6254                 close(init_ns);
6255 }
6256
6257 static void __attribute__((destructor)) free_subsystems(void)
6258 {
6259         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6260
6261         cgroup_exit(cgroup_ops);
6262         free_cpuview();
6263
6264         if (cgroup_mount_ns_fd >= 0)
6265                 close(cgroup_mount_ns_fd);
6266 }