/* lxcfs
 *
 * Copyright © 2014-2016 Canonical, Inc
 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
 *
 * See COPYING file for details.
 */

#define FUSE_USE_VERSION 26

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h" // for VERSION

/* A 64-bit integer formatted as a string fits in 21 characters:
 * 20 digits for 2^64 - 1, plus the terminating NUL. */
#define LXCFS_NUMSTRLEN64 21

/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char *new_root, const char *put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char *new_root, const char *put_old);
#endif

enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
	LXC_TYPE_PROC_LOADAVG,
};

struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;
	char *buf;	// unused as of yet
	int buflen;
	int size;	// actual data size
	int cached;
};

struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
};

/* Constants for the loadavg hash table. */
#define LOAD_SIZE 100	/* the size of the hash table */
#define FLUSH_TIME 5	/* the flush rate */
#define DEPTH_DIR 3	/* the depth of each cgroup */
/* Constants for calculating loadavg. */
#define FSHIFT 11		/* nr of bits of precision */
#define FIXED_1 (1 << FSHIFT)	/* 1.0 as fixed-point */
#define EXP_1 1884		/* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014		/* 1/exp(5sec/5min) */
#define EXP_15 2037		/* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)
/*
 * Controls proc_loadavg_read(): 1 means the virtualized loadavg is used,
 * 0 means it is not.
 */
static int loadavg = 0;
static volatile sig_atomic_t loadavg_stop = 0;

static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int x = 0;
	/* ELF hash algorithm. */
	while (*name) {
		hash = (hash << 4) + *name++;
		x = hash & 0xf0000000;
		if (x != 0)
			hash ^= (x >> 24);
		hash &= ~x;
	}
	return (hash & 0x7fffffff);
}

struct load_node {
	char *cg;			/* cgroup name */
	unsigned long avenrun[3];	/* Load averages */
	unsigned int run_pid;
	unsigned int total_pid;
	unsigned int last_pid;
	int cfd;			/* The file descriptor of the mounted cgroup */
	struct load_node *next;
	struct load_node **pre;		/* points to the previous node's next field, or the list head */
};

struct load_head {
	/*
	 * Protects inserting and refreshing load_nodes. For the first
	 * load_node of each hash bucket, insert and refresh are mutually
	 * exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * Protects reading loadavg against deleting load_nodes. Within each
	 * hash bucket, read and delete are mutually exclusive, but concurrent
	 * reads are allowed. This rwlock works at the list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * Protects reading loadavg against inserting load_nodes. For the
	 * first load_node of each hash bucket, read and insert are mutually
	 * exclusive, but concurrent reads are allowed.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};

static struct load_head load_hash[LOAD_SIZE];	/* hash table */
/*
 * init_load() initializes the hash table.
 * Returns 0 on success, -1 on failure.
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}

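/* Insert *n at the head of hash bucket @locate, fixing up the doubly linked
 * list, under the bucket's mutex and insert rwlock. */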
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node() finds a specific node; a non-NULL return value means success.
 * Note that rdlock is deliberately not unlocked before returning, because
 * this function is used to read a specific node and deletion must not happen
 * before the read has finished. rdlock is unlocked only in
 * proc_loadavg_read().
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
/* Delete load_node n and return the node that followed it. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free(n->cg);
	free(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}

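/* Tear down the loadavg hash table: free all remaining load_nodes and
 * destroy each bucket's locks. */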
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}

/* Data for CPU view */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage;	// Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view;	// Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock;		// For node manipulation
	struct cg_proc_stat *next;
};

struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];

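/* Allocate and initialize a single bucket head for the CPU-view hash table.
 * Returns false on allocation or lock-init failure. */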
static bool cpuview_init_head(struct cg_proc_stat_head **head)
{
	*head = malloc(sizeof(struct cg_proc_stat_head));
	if (!(*head)) {
		lxcfs_error("%s\n", strerror(errno));
		return false;
	}

	(*head)->lastcheck = time(NULL);
	(*head)->next = NULL;

	if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
		lxcfs_error("%s\n", "Failed to initialize list lock");
		free(*head);
		return false;
	}

	return true;
}

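/* Initialize every bucket of the CPU-view hash table; on failure, free any
 * bucket heads that were already allocated. */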
static bool init_cpuview()
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
		proc_stat_history[i] = NULL;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (!cpuview_init_head(&proc_stat_history[i]))
			goto err;
	}

	return true;

err:
	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i]) {
			free(proc_stat_history[i]);
			proc_stat_history[i] = NULL;
		}
	}

	return false;
}

static void free_proc_stat_node(struct cg_proc_stat *node)
{
	pthread_mutex_destroy(&node->lock);
	free(node->cg);
	free(node->usage);
	free(node->view);
	free(node);
}

static void cpuview_free_head(struct cg_proc_stat_head *head)
{
	struct cg_proc_stat *node, *tmp;

	if (head->next) {
		node = head->next;

		for (;;) {
			tmp = node;
			node = node->next;
			free_proc_stat_node(tmp);

			if (!node)
				break;
		}
	}

	pthread_rwlock_destroy(&head->lock);
	free(head);
}

static void free_cpuview()
{
	int i;

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		if (proc_stat_history[i])
			cpuview_free_head(proc_stat_history[i]);
	}
}

/* Reserve buffer size to account for file size changes. */
#define BUF_RESERVE_SIZE 512

/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *    a. if not, fork a child in qpid's ns to send us
 *       ucred.pid = 1, and read the initpid. Cache
 *       initpid and creation time for /proc/initpid
 *       in a new store entry.
 *    b. if so, verify that /proc/initpid still matches
 *       what we have saved. If not, clear the store
 *       entry and go back to a. If so, return the
 *       cached initpid.
 */
struct pidns_init_store {
	ino_t ino;	// inode number for /proc/$pid/ns/pid
	pid_t initpid;	// the pid of init in that ns
	long int ctime;	// the time at which /proc/$initpid was created
	struct pidns_init_store *next;
	long int lastcheck;
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;

static void lock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_lock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
static int cgroup_mount_ns_fd = -1;

static void unlock_mutex(pthread_mutex_t *l)
{
	int ret;

	if ((ret = pthread_mutex_unlock(l)) != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}

static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}

/* Must be called under store_lock */
static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
{
	struct stat initsb;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d", e->initpid);
	if (stat(fnam, &initsb) < 0)
		return false;

	lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
		    initsb.st_ctime, e->initpid);

	if (e->ctime != initsb.st_ctime)
		return false;
	return true;
}

/* Must be called under store_lock */
static void remove_initpid(struct pidns_init_store *e)
{
	struct pidns_init_store *tmp;
	int h;

	lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);

	h = HASH(e->ino);
	if (pidns_hash_table[h] == e) {
		pidns_hash_table[h] = e->next;
		free(e);
		return;
	}

	tmp = pidns_hash_table[h];
	while (tmp) {
		if (tmp->next == e) {
			tmp->next = e->next;
			free(e);
			return;
		}
		tmp = tmp->next;
	}
}

#define PURGE_SECS 5
/* Must be called under store_lock */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {
				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}

/* Must be called under store_lock */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}

/*
 * Given the stat(2) info for a nsfd pid inode, look up the pidns_init_store
 * entry for the inode number and creation time. Verify that the init pid
 * is still valid. If not, remove it. Return the entry if valid, NULL
 * otherwise.
 * Must be called under store_lock.
 */
static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
{
	int h = HASH(sb->st_ino);
	struct pidns_init_store *e = pidns_hash_table[h];

	while (e) {
		if (e->ino == sb->st_ino) {
			if (initpid_still_valid(e, sb)) {
				e->lastcheck = time(NULL);
				return e;
			}
			remove_initpid(e);
			return NULL;
		}
		e = e->next;
	}

	return NULL;
}

static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}

static char *must_copy_string(const char *str)
{
	char *dup = NULL;
	if (!str)
		return NULL;
	do {
		dup = strdup(str);
	} while (!dup);

	return dup;
}

static inline void drop_trailing_newlines(char *s)
{
	int l;

	for (l = strlen(s); l > 0 && s[l - 1] == '\n'; l--)
		s[l - 1] = '\0';
}

#define BATCH_SIZE 50
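/* Grow *mem so it can hold newlen bytes, allocating in BATCH_SIZE chunks so
 * repeated small appends do not realloc every time. Loops until realloc
 * succeeds. */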
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		char *tmp;
		do {
			tmp = realloc(*mem, newbatches * BATCH_SIZE);
		} while (!tmp);
		*mem = tmp;
	}
}
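
/* Append @line (linelen bytes plus its terminating NUL) to *contents, growing
 * the buffer as needed and updating *len. */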
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t newlen = *len + linelen;
	dorealloc(contents, *len, newlen + 1);
	memcpy(*contents + *len, line, linelen + 1);
	*len = newlen;
}

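/* Read the whole contents of @fd (opened for @from) into a newly allocated
 * string, stripping trailing newlines. Returns NULL on failure; the fd is
 * consumed via fdopen()/fclose(). */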
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}

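/* Write @string to @fd (opened for @fnam), logging and returning false on a
 * short write or close failure. The fd is closed via fclose(). */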
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f)
		return false;

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}

struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};

#define ALLOC_NUM 20
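/* Record hierarchy name @h in the global hierarchies array, growing the
 * array in ALLOC_NUM-sized steps. Exits on allocation failure. */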
static bool store_hierarchy(char *stridx, char *h)
{
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}

static void print_subsystems(void)
{
	int i;

	fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
	fprintf(stderr, "hierarchies:\n");
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i])
			fprintf(stderr, " %2d: fd: %3d: %s\n", i,
				fd_hierarchies[i], hierarchies[i]);
	}
}

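/* Return true if @needle appears as a full entry in the comma-separated list
 * @haystack, e.g. "cpu" matches "cpu,cpuacct" but not "cpuset". */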
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *s = haystack, *e;
	size_t nlen = strlen(needle);

	while (*s && (e = strchr(s, ','))) {
		if (nlen != e - s) {
			s = e + 1;
			continue;
		}
		if (strncmp(needle, s, nlen) == 0)
			return true;
		s = e + 1;
	}
	if (strcmp(needle, s) == 0)
		return true;
	return false;
}

/* do we need to do any massaging here?  I'm not sure... */
/* Return the mounted controller and store the corresponding open file
 * descriptor referring to the controller mountpoint in the private lxcfs
 * namespace in @cfd.
 */
static char *find_mounted_controller(const char *controller, int *cfd)
{
	int i;

	for (i = 0; i < num_hierarchies; i++) {
		if (!hierarchies[i])
			continue;
		if (strcmp(hierarchies[i], controller) == 0) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
		if (in_comma_list(controller, hierarchies[i])) {
			*cfd = fd_hierarchies[i];
			return hierarchies[i];
		}
	}

	return NULL;
}

bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		    const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}

// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	closedir(d);
}

int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *dirnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, dirnam, 0755) < 0)
		return -errno;

	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(dirnam, uid, gid, cfd);

	return 0;
}

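/* Recursively remove the cgroup directory @dirname. @fd is an open descriptor
 * for the directory being listed, @cfd the descriptor of the controller
 * mountpoint against which relative paths are resolved. Returns false if
 * anything could not be removed. */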
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() takes ownership of the fd it is handed, so work on a duplicate.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}

bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *dirnam, *tmpc;
	bool bret;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cg + \0
	 */
	len = strlen(cg) + 2;
	dirnam = alloca(len);
	snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, dirnam, O_DIRECTORY);
	if (fd < 0)
		return false;

	bret = recursive_rmdir(dirnam, fd, cfd);
	close(fd);
	return bret;
}

bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchmodat(cfd, pathname, mode, 0) < 0)
		return false;
	return true;
}

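/* Chown the tasks and cgroup.procs files under @dirname to @uid:@gid.
 * Returns 0 on success, -errno on failure. */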
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	size_t len;
	char *fname;

	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	fname = alloca(len);
	snprintf(fname, len, "%s/tasks", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	snprintf(fname, len, "%s/cgroup.procs", dirname);
	if (fchownat(fd, fname, uid, gid, 0) != 0)
		return -errno;
	return 0;
}

int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return -EINVAL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /file + \0
	 */
	len = strlen(file) + 2;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
	if (fchownat(cfd, pathname, uid, gid, 0) < 0)
		return -errno;

	if (is_dir(pathname, cfd))
		// like cgmanager did, we want to chown the tasks file as well
		return chown_tasks_files(pathname, uid, gid, cfd);

	return 0;
}

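/* Open the cgroup.procs file of @cgroup under @controller for writing.
 * Returns a FILE * the caller must fclose(), or NULL on error. */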
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}

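/* Walk the entries of @cgroup under @controller and build a NULL-terminated
 * array in *list, calling @iterator on each directory (if @directories is
 * true) or regular file (if false) to produce the array elements. */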
static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
				void ***list, size_t typesize,
				void *(*iterator)(const char *, const char *, const char *))
{
	int cfd, fd, ret;
	size_t len;
	char *cg, *tmpc;
	char pathname[MAXPATHLEN];
	size_t sz = 0, asz = 0;
	struct dirent *dirent;
	DIR *dir;

	tmpc = find_mounted_controller(controller, &cfd);
	*list = NULL;
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions. */
	len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
	cg = alloca(len);
	ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
	if (ret < 0 || (size_t)ret >= len) {
		lxcfs_error("Pathname too long under %s\n", cgroup);
		return false;
	}

	fd = openat(cfd, cg, O_DIRECTORY);
	if (fd < 0)
		return false;

	dir = fdopendir(fd);
	if (!dir)
		return false;

	while ((dirent = readdir(dir))) {
		struct stat mystat;

		if (!strcmp(dirent->d_name, ".") ||
		    !strcmp(dirent->d_name, ".."))
			continue;

		ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", cg);
			continue;
		}

		ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (ret) {
			lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
			continue;
		}
		if ((!directories && !S_ISREG(mystat.st_mode)) ||
		    (directories && !S_ISDIR(mystat.st_mode)))
			continue;

		if (sz + 2 >= asz) {
			void **tmp;
			asz += BATCH_SIZE;
			do {
				tmp = realloc(*list, asz * typesize);
			} while (!tmp);
			*list = tmp;
		}
		(*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
		(*list)[sz + 1] = NULL;
		sz++;
	}
	if (closedir(dir) < 0) {
		lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
		return false;
	}
	return true;
}

static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *dup;
	do {
		dup = strdup(dir_entry);
	} while (!dup);
	return dup;
}

bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list, sizeof(*list), &make_children_list_entry);
}

void free_key(struct cgfs_files *k)
{
	if (!k)
		return;
	free(k->name);
	free(k);
}

void free_keys(struct cgfs_files **keys)
{
	int i;

	if (!keys)
		return;
	for (i = 0; keys[i]; i++) {
		free_key(keys[i]);
	}
	free(keys);
}

bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}

bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return (faccessat(cfd, fnam, F_OK, 0) == 0);
}

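/* Stat @file (or @cgroup itself when @file is NULL) and return a freshly
 * allocated cgfs_files entry carrying its name, uid, gid and mode. Returns
 * NULL if the path cannot be stat()ed; the caller frees via free_key(). */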
struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *fnam, *tmpc;
	struct stat sb;
	struct cgfs_files *newkey;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	if (file && *file == '/')
		file++;

	if (file && strchr(file, '/'))
		return NULL;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + 3;
	if (file)
		len += strlen(file) + 1;
	fnam = alloca(len);
	snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
		 file ? "/" : "", file ? file : "");

	ret = fstatat(cfd, fnam, &sb, 0);
	if (ret < 0)
		return NULL;

	do {
		newkey = malloc(sizeof(struct cgfs_files));
	} while (!newkey);
	if (file)
		newkey->name = must_copy_string(file);
	else if (strrchr(cgroup, '/'))
		newkey->name = must_copy_string(strrchr(cgroup, '/'));
	else
		newkey->name = must_copy_string(cgroup);
	newkey->uid = sb.st_uid;
	newkey->gid = sb.st_gid;
	newkey->mode = sb.st_mode;

	return newkey;
}

static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
	if (!entry) {
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);
	}
	return entry;
}

bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys, sizeof(*keys), &make_key_list_entry);
}

bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd;
	size_t len;
	char *fnam, *tmpc;
	int ret;
	struct stat sb;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to the *at() family of functions.
	 * . + /cgroup + / + f + \0
	 */
	len = strlen(cgroup) + strlen(f) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	ret = fstatat(cfd, fnam, &sb, 0);
	if (ret < 0 || !S_ISDIR(sb.st_mode))
		return false;

	return true;
}

#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);

/*
 * Clone a task which switches to @task's namespace and writes '1'
 * over a unix socket so we can read the task's reaper's pid in our
 * namespace.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		if (wait_for_pid(pid) < 0)
			_exit(1);
		_exit(0);
	}
}

static int send_creds_clone_wrapper(void *arg)
{
	struct ucred cred;
	char v;
	int sock = *(int *)arg;

	/* we are the child */
	cred.uid = 0;
	cred.gid = 0;
	cred.pid = 1;
	v = '1';
	if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
		return 1;
	return 0;
}

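/* Learn the pid (in our namespace) of init in @task's pid namespace by
 * forking a helper that joins that namespace and sends back its credentials
 * over a socketpair. Returns -1 on failure. */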
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}

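/* Return the cached init pid for @qpid's pid namespace, querying and caching
 * it if needed. Returns 0 if it cannot be determined. */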
static pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	e = lookup_verify_initpid(&sb);
	if (e) {
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* we prune at the end in case we are returning
	 * the value we were about to return */
	prune_initpid_store();
	store_unlock();
	return answer;
}

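/* Reap @pid, retrying on EINTR. Returns 0 if the child exited cleanly with
 * status 0, -1 otherwise. */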
static int wait_for_pid(pid_t pid)
{
	int status, ret;

	if (pid <= 0)
		return -1;

again:
	ret = waitpid(pid, &status, 0);
	if (ret == -1) {
		if (errno == EINTR)
			goto again;
		return -1;
	}
	if (ret != pid)
		goto again;
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
		return -1;
	return 0;
}

/*
 * Append pid to *src.
 * src: a pointer to a char* to which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far.
 * pid: the pid to append.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char tmp[30];

	int tmplen = sprintf(tmp, "%d\n", (int)pid);

	if (!*src || tmplen + *sz + 1 >= *asz) {
		char *tmp;
		do {
			tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
		} while (!tmp);
		*src = tmp;
		*asz += BUF_RESERVE_SIZE;
	}
	memcpy((*src) + *sz, tmp, tmplen + 1); /* include the \0 */
	*sz += tmplen;
}

/*
 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 * Returns the mapped id, or -1 on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,	// base id for a range in the idfile's namespace
		hostuid,	// base id for a range in the caller's namespace
		count;		// number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("uid wraparound at entry %u %u %u in %s\n",
				    nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid), which must be
			 * less than nsuid+count, must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}

/*
 * For is_privileged_over, specify whether we require the calling uid to be
 * root in its namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices. (i.e. uid 1000 has write
	 * access to files owned by uid 1000.)
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	FILE *f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if the caller is not root in its namespace, reject */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into the caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == -1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}

static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t r;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		r = S_IROTH;
		break;
	case O_WRONLY:
		r = S_IWOTH;
		break;
	case O_RDWR:
		r = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}
	return ((fmode & r) == r);
}

/*
 * querycg is /a/b/c
 * taskcg is /a/b/c/d/e
 * we return 'd'
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *start, *end;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
		start = strdup(taskcg + 1);
	else
		start = strdup(taskcg + strlen(querycg) + 1);
	if (!start)
		return NULL;
	end = strchr(start, '/');
	if (end)
		*end = '\0';
	return start;
}

static void stripnewline(char *x)
{
	size_t l = strlen(x);
	if (l && x[l - 1] == '\n')
		x[l - 1] = '\0';
}

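/* Parse /proc/@pid/cgroup and return a strdup'd copy of the cgroup path for
 * controller @contrl, or NULL if it cannot be found. */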
static char *get_pid_cgroup(pid_t pid, const char *contrl)
{
	int cfd;
	char fnam[PROCLEN];
	FILE *f;
	char *answer = NULL;
	char *line = NULL;
	size_t len = 0;
	int ret;
	const char *h = find_mounted_controller(contrl, &cfd);
	if (!h)
		return NULL;

	ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
	if (ret < 0 || ret >= PROCLEN)
		return NULL;
	if (!(f = fopen(fnam, "r")))
		return NULL;

	while (getline(&line, &len, f) != -1) {
		char *c1, *c2;
		if (!line[0])
			continue;
		c1 = strchr(line, ':');
		if (!c1)
			goto out;
		c1++;
		c2 = strchr(c1, ':');
		if (!c2)
			goto out;
		*c2 = '\0';
		if (strcmp(c1, h) != 0)
			continue;
		c2++;
		stripnewline(c2);
		do {
			answer = strdup(c2);
		} while (!answer);
		break;
	}

out:
	fclose(f);
	free(line);
	return answer;
}

/*
 * Check whether a fuse context may access a cgroup dir or file.
 *
 * If file is not NULL, it is a cgroup file to check under cg.
 * If file is NULL, then we are checking perms on cg itself.
 *
 * For files we can check the mode of the list_keys result.
 * For cgroups, we must make assumptions based on the files under the
 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
 * yet.
 */
static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
{
	struct cgfs_files *k = NULL;
	bool ret = false;

	k = cgfs_get_key(contrl, cg, file);
	if (!k)
		return false;

	if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		if (perms_include(k->mode >> 6, mode)) {
			ret = true;
			goto out;
		}
	}
	if (fc->gid == k->gid) {
		if (perms_include(k->mode >> 3, mode)) {
			ret = true;
			goto out;
		}
	}
	ret = perms_include(k->mode, mode);

out:
	free_key(k);
	return ret;
}

#define INITSCOPE "/init.scope"
static void prune_init_slice(char *cg)
{
	char *point;
	size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);

	if (cg_len < initscope_len)
		return;

	point = cg + cg_len - initscope_len;
	if (strcmp(point, INITSCOPE) == 0) {
		if (point == cg)
			*(point + 1) = '\0';
		else
			*point = '\0';
	}
}

/*
 * If pid is in /a/b/c/d, it may only act on things under cg=/a/b/c/d.
 * If pid is in /a, it may act on /a/b, but not on /b.
 * If the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise they pass in a cgroup without a leading '/'.
	 *
	 * The original line here was:
	 *	linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || !strncmp(cg, "./", 2))
		linecmp = c2;
	else
		linecmp = c2 + 1;
	if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
		if (nextcg) {
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		}
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}

/*
 * If pid is in /a/b/c, it may see that /a exists, but not /b or /a/c.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	task_cg = c2 + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);
	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strcmps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}

/*
 * Given /cgroup/freezer/a/b, return "freezer".
 * The returned char* should NOT be freed.
 */
static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
{
	const char *p1;
	char *contr, *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	if (*(path + 7) != '/') {
		errno = EINVAL;
		return NULL;
	}
	p1 = path + 8;
	contr = strdupa(p1);
	if (!contr) {
		errno = ENOMEM;
		return NULL;
	}
	slash = strstr(contr, "/");
	if (slash)
		*slash = '\0';

	int i;
	for (i = 0; i < num_hierarchies; i++) {
		if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
			return hierarchies[i];
	}
	errno = ENOENT;
	return NULL;
}

/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *p1;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}
	p1 = strstr(path + 8, "/");
	if (!p1) {
		errno = EINVAL;
		return NULL;
	}
	errno = 0;
	return p1 + 1;
}

/*
 * Split the last path element from the path in @cg.
 * @dir is newly allocated and should be freed, @last is not.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *p;

	do {
		*dir = strdup(cg);
	} while (!*dir);
	*last = strrchr(cg, '/');
	if (!*last) {
		*last = NULL;
		return;
	}
	p = strrchr(*dir, '/');
	*p = '\0';
}

/*
 * FUSE ops for /cgroup
 */

int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;

	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* Check that path2 is either a child cgroup of cgdir, or listed in
	 * its keys. Then check that the caller's cgroup is under path if
	 * last is a child cgroup, or under cgdir if last is a file. */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* caller is not in this cgroup's ancestry; present a
			 * read-only directory */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// Get uid, gid from the '/tasks' file and make up a mode.
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}

int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0) {
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
			return -EACCES;
	}

	/* we'll free this at cg_releasedir */
	dir_info = malloc(sizeof(*dir_info));
	if (!dir_info)
		return -ENOMEM;
	dir_info->controller = must_copy_string(controller);
	dir_info->cgroup = must_copy_string(cgroup);
	dir_info->type = LXC_TYPE_CGDIR;
	dir_info->buf = NULL;
	dir_info->file = NULL;
	dir_info->buflen = 0;

	fi->fh = (unsigned long)dir_info;
	return 0;
}

int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
	       struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show the list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	for (i = 0; list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups
	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}

static void do_release_file_info(struct fuse_file_info *fi)
{
	struct file_info *f = (struct file_info *)fi->fh;

	if (!f)
		return;

	fi->fh = 0;

	free(f->controller);
	f->controller = NULL;
	free(f->cgroup);
	f->cgroup = NULL;
	free(f->file);
	f->file = NULL;
	free(f->buf);
	f->buf = NULL;
	free(f);
	f = NULL;
}

int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}

int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, *cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	free(cgdir);
	return ret;
}

2266 int cg_access(const char *path, int mode)
2267 {
2268 int ret;
2269 const char *cgroup;
2270 char *path1, *path2, *controller;
2271 char *last = NULL, *cgdir = NULL;
2272 struct cgfs_files *k = NULL;
2273 struct fuse_context *fc = fuse_get_context();
2274
2275 if (strcmp(path, "/cgroup") == 0)
2276 return 0;
2277
2278 if (!fc)
2279 return -EIO;
2280
2281 controller = pick_controller_from_path(fc, path);
2282 if (!controller)
2283 return -errno;
2284 cgroup = find_cgroup_in_path(path);
2285 if (!cgroup) {
2286 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2287 if ((mode & W_OK) == 0)
2288 return 0;
2289 return -EACCES;
2290 }
2291
2292 get_cgdir_and_path(cgroup, &cgdir, &last);
2293 if (!last) {
2294 path1 = "/";
2295 path2 = cgdir;
2296 } else {
2297 path1 = cgdir;
2298 path2 = last;
2299 }
2300
2301 k = cgfs_get_key(controller, path1, path2);
2302 if (!k) {
2303 if ((mode & W_OK) == 0)
2304 ret = 0;
2305 else
2306 ret = -EACCES;
2307 goto out;
2308 }
2309 free_key(k);
2310
2311 pid_t initpid = lookup_initpid_in_store(fc->pid);
2312 if (initpid <= 0)
2313 initpid = fc->pid;
2314 if (!caller_may_see_dir(initpid, controller, path1)) {
2315 ret = -ENOENT;
2316 goto out;
2317 }
2318 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2319 ret = -EACCES;
2320 goto out;
2321 }
2322
2323 ret = 0;
2324
2325 out:
2326 free(cgdir);
2327 return ret;
2328 }
2329
2330 int cg_release(const char *path, struct fuse_file_info *fi)
2331 {
2332 do_release_file_info(fi);
2333 return 0;
2334 }
2335
2336 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2337
2338 static bool wait_for_sock(int sock, int timeout)
2339 {
2340 struct epoll_event ev;
2341 int epfd, ret, now, starttime, deltatime, saved_errno;
2342
2343 if ((starttime = time(NULL)) < 0)
2344 return false;
2345
2346 if ((epfd = epoll_create(1)) < 0) {
2347 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2348 return false;
2349 }
2350
2351 ev.events = POLLIN_SET;
2352 ev.data.fd = sock;
2353 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2354 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2355 close(epfd);
2356 return false;
2357 }
2358
2359 again:
2360 if ((now = time(NULL)) < 0) {
2361 close(epfd);
2362 return false;
2363 }
2364
2365 deltatime = (starttime + timeout) - now;
2366 if (deltatime < 0) { // timeout
2367 errno = 0;
2368 close(epfd);
2369 return false;
2370 }
2371 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2372 if (ret < 0 && errno == EINTR)
2373 goto again;
2374 saved_errno = errno;
2375 close(epfd);
2376
2377 if (ret <= 0) {
2378 errno = saved_errno;
2379 return false;
2380 }
2381 return true;
2382 }
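/*
 * Minimal usage sketch for wait_for_sock(), kept under #if 0 so it is
 * never compiled; `fd` and `val` are hypothetical names. This is the
 * pattern the helpers below use to bound a blocking read by a deadline
 * given in seconds.
 */
#if 0
if (!wait_for_sock(fd, 2)) {
lxcfs_error("Timed out waiting for peer: %s.\n", strerror(errno));
return false;
}
if (read(fd, &val, sizeof(val)) != sizeof(val))
return false;
#endif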
2383
2384 static int msgrecv(int sockfd, void *buf, size_t len)
2385 {
2386 if (!wait_for_sock(sockfd, 2))
2387 return -1;
2388 return recv(sockfd, buf, len, MSG_DONTWAIT);
2389 }
2390
2391 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2392 {
2393 struct msghdr msg = { 0 };
2394 struct iovec iov;
2395 struct cmsghdr *cmsg;
2396 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2397 char buf[1];
2398 buf[0] = 'p';
2399
2400 if (pingfirst) {
2401 if (msgrecv(sock, buf, 1) != 1) {
2402 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2403 return SEND_CREDS_FAIL;
2404 }
2405 }
2406
2407 msg.msg_control = cmsgbuf;
2408 msg.msg_controllen = sizeof(cmsgbuf);
2409
2410 cmsg = CMSG_FIRSTHDR(&msg);
2411 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2412 cmsg->cmsg_level = SOL_SOCKET;
2413 cmsg->cmsg_type = SCM_CREDENTIALS;
2414 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2415
2416 msg.msg_name = NULL;
2417 msg.msg_namelen = 0;
2418
2419 buf[0] = v;
2420 iov.iov_base = buf;
2421 iov.iov_len = sizeof(buf);
2422 msg.msg_iov = &iov;
2423 msg.msg_iovlen = 1;
2424
2425 if (sendmsg(sock, &msg, 0) < 0) {
2426 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2427 if (errno == ESRCH)
2428 return SEND_CREDS_NOTSK;
2429 return SEND_CREDS_FAIL;
2430 }
2431
2432 return SEND_CREDS_OK;
2433 }
2434
2435 static bool recv_creds(int sock, struct ucred *cred, char *v)
2436 {
2437 struct msghdr msg = { 0 };
2438 struct iovec iov;
2439 struct cmsghdr *cmsg;
2440 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2441 char buf[1];
2442 int ret;
2443 int optval = 1;
2444
2445 *v = '1';
2446
2447 cred->pid = -1;
2448 cred->uid = -1;
2449 cred->gid = -1;
2450
2451 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2452 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2453 return false;
2454 }
2455 buf[0] = '1';
2456 if (write(sock, buf, 1) != 1) {
2457 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2458 return false;
2459 }
2460
2461 msg.msg_name = NULL;
2462 msg.msg_namelen = 0;
2463 msg.msg_control = cmsgbuf;
2464 msg.msg_controllen = sizeof(cmsgbuf);
2465
2466 iov.iov_base = buf;
2467 iov.iov_len = sizeof(buf);
2468 msg.msg_iov = &iov;
2469 msg.msg_iovlen = 1;
2470
2471 if (!wait_for_sock(sock, 2)) {
2472 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2473 return false;
2474 }
2475 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2476 if (ret < 0) {
2477 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2478 return false;
2479 }
2480
2481 cmsg = CMSG_FIRSTHDR(&msg);
2482
2483 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2484 cmsg->cmsg_level == SOL_SOCKET &&
2485 cmsg->cmsg_type == SCM_CREDENTIALS) {
2486 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2487 }
2488 *v = buf[0];
2489
2490 return true;
2491 }
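/*
 * How send_creds() and recv_creds() pair over a socketpair, as a sketch
 * (never compiled; the two halves run in different processes, and
 * `some_pid` is a hypothetical name). The receiver enables SO_PASSCRED
 * and pings first; the sender waits for that ping, then sends one byte
 * with SCM_CREDENTIALS attached, which the kernel translates into the
 * receiver's pid namespace.
 */
#if 0
int sv[2];
socketpair(AF_UNIX, SOCK_DGRAM, 0, sv);

/* receiver process: */
struct ucred seen; char v;
recv_creds(sv[1], &seen, &v); /* seen.pid arrives in the receiver's pidns */

/* sender process: */
struct ucred cred = { .pid = some_pid, .uid = 0, .gid = 0 };
send_creds(sv[0], &cred, '0', true); /* pingfirst: wait for the receiver's ping */
#endif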
2492
2493 struct pid_ns_clone_args {
2494 int *cpipe;
2495 int sock;
2496 pid_t tpid;
2497 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2498 };
2499
2500 /*
2501 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2502 * with clone(). This simply writes '1' as ACK back to the parent
2503 * before calling the actual wrapped function.
2504 */
2505 static int pid_ns_clone_wrapper(void *arg) {
2506 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2507 char b = '1';
2508
2509 close(args->cpipe[0]);
2510 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2511 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2512 close(args->cpipe[1]);
2513 return args->wrapped(args->sock, args->tpid);
2514 }
2515
2516 /*
2517 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2518 * int value back over the socket. This shifts the pid from the
2519 * sender's pidns into tpid's pidns.
2520 */
2521 static int pid_to_ns(int sock, pid_t tpid)
2522 {
2523 char v = '0';
2524 struct ucred cred;
2525
2526 while (recv_creds(sock, &cred, &v)) {
2527 if (v == '1')
2528 return 0;
2529 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2530 return 1;
2531 }
2532 return 0;
2533 }
2534
2535
2536 /*
2537 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2538 * in your old pidns. Only children which you clone will be in the target
2539 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2540 * actually convert pids.
2541 *
2542 * Note: glibc's fork() does not respect pidns, which can lead to failed
2543 * assertions inside glibc (and thus failed forks) if the child's pid in
2544 * the pidns and the parent pid outside are identical. Using clone prevents
2545 * this issue.
2546 */
2547 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2548 {
2549 int newnsfd = -1, ret, cpipe[2];
2550 char fnam[100];
2551 pid_t cpid;
2552 char v;
2553
2554 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2555 if (ret < 0 || ret >= sizeof(fnam))
2556 _exit(1);
2557 newnsfd = open(fnam, O_RDONLY);
2558 if (newnsfd < 0)
2559 _exit(1);
2560 if (setns(newnsfd, 0) < 0)
2561 _exit(1);
2562 close(newnsfd);
2563
2564 if (pipe(cpipe) < 0)
2565 _exit(1);
2566
2567 struct pid_ns_clone_args args = {
2568 .cpipe = cpipe,
2569 .sock = sock,
2570 .tpid = tpid,
2571 .wrapped = &pid_to_ns
2572 };
2573 size_t stack_size = sysconf(_SC_PAGESIZE);
2574 void *stack = alloca(stack_size);
2575
2576 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2577 if (cpid < 0)
2578 _exit(1);
2579
2580 // give the child 1 second to be done forking and
2581 // write its ack
2582 if (!wait_for_sock(cpipe[0], 1))
2583 _exit(1);
2584 ret = read(cpipe[0], &v, 1);
2585 if (ret != sizeof(char) || v != '1')
2586 _exit(1);
2587
2588 if (!wait_for_pid(cpid))
2589 _exit(1);
2590 _exit(0);
2591 }
2592
2593 /*
2594 * To read pid-valued cgroup files for a given task, we read the file
2595 * here, then have a forked child setns into that task's pidns and clone()
2596 * a child - the first process truly in the new ns - to translate each pid back over a socketpair.
2597 */
2598 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2599 {
2600 int sock[2] = {-1, -1};
2601 char *tmpdata = NULL;
2602 int ret;
2603 pid_t qpid, cpid = -1;
2604 bool answer = false;
2605 char v = '0';
2606 struct ucred cred;
2607 size_t sz = 0, asz = 0;
2608
2609 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2610 return false;
2611
2612 /*
2613 * Now we read the pids from returned data one by one, pass
2614 * them into a child in the target namespace, read back the
2615 * translated pids, and put them into our to-return data
2616 */
2617
2618 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2619 perror("socketpair");
2620 free(tmpdata);
2621 return false;
2622 }
2623
2624 cpid = fork();
2625 if (cpid == -1)
2626 goto out;
2627
2628 if (!cpid) // child - exits when done
2629 pid_to_ns_wrapper(sock[1], tpid);
2630
2631 char *ptr = tmpdata;
2632 cred.uid = 0;
2633 cred.gid = 0;
2634 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2635 cred.pid = qpid;
2636 ret = send_creds(sock[0], &cred, v, true);
2637
2638 if (ret == SEND_CREDS_NOTSK)
2639 goto next;
2640 if (ret == SEND_CREDS_FAIL)
2641 goto out;
2642
2643 // read converted results
2644 if (!wait_for_sock(sock[0], 2)) {
2645 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2646 goto out;
2647 }
2648 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2649 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2650 goto out;
2651 }
2652 must_strcat_pid(d, &sz, &asz, qpid);
2653 next:
2654 ptr = strchr(ptr, '\n');
2655 if (!ptr)
2656 break;
2657 ptr++;
2658 }
2659
2660 cred.pid = getpid();
2661 v = '1';
2662 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2663 // failed to ask child to exit
2664 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2665 goto out;
2666 }
2667
2668 answer = true;
2669
2670 out:
2671 free(tmpdata);
2672 if (cpid != -1)
2673 wait_for_pid(cpid);
2674 if (sock[0] != -1) {
2675 close(sock[0]);
2676 close(sock[1]);
2677 }
2678 return answer;
2679 }
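/*
 * Hypothetical caller of do_read_pids(), as a sketch (never compiled;
 * the controller and cgroup names are made up): translate the pids in
 * a container's "tasks" file into the requesting task's pid namespace.
 */
#if 0
char *data = NULL;
if (do_read_pids(fc->pid, "freezer", "/lxc/c1", "tasks", &data)) {
fputs(data, stdout);
free(data);
}
#endif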
2680
2681 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2682 struct fuse_file_info *fi)
2683 {
2684 struct fuse_context *fc = fuse_get_context();
2685 struct file_info *f = (struct file_info *)fi->fh;
2686 struct cgfs_files *k = NULL;
2687 char *data = NULL;
2688 int ret, s;
2689 bool r;
2690
2691 if (f->type != LXC_TYPE_CGFILE) {
2692 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2693 return -EIO;
2694 }
2695
2696 if (offset)
2697 return 0;
2698
2699 if (!fc)
2700 return -EIO;
2701
2702 if (!f->controller)
2703 return -EINVAL;
2704
2705 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2706 return -EINVAL;
2707 }
2708 free_key(k);
2709
2710
2711 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2712 ret = -EACCES;
2713 goto out;
2714 }
2715
2716 if (strcmp(f->file, "tasks") == 0 ||
2717 strcmp(f->file, "/tasks") == 0 ||
2718 strcmp(f->file, "/cgroup.procs") == 0 ||
2719 strcmp(f->file, "cgroup.procs") == 0)
2720 // special case - we have to translate the pids
2721 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2722 else
2723 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2724
2725 if (!r) {
2726 ret = -EINVAL;
2727 goto out;
2728 }
2729
2730 if (!data) {
2731 ret = 0;
2732 goto out;
2733 }
2734 s = strlen(data);
2735 if (s > size)
2736 s = size;
2737 memcpy(buf, data, s);
2738 if (s > 0 && s < size && data[s-1] != '\n')
2739 buf[s++] = '\n';
2740
2741 ret = s;
2742
2743 out:
2744 free(data);
2745 return ret;
2746 }
2747
2748 static int pid_from_ns(int sock, pid_t tpid)
2749 {
2750 pid_t vpid;
2751 struct ucred cred;
2752 char v;
2753 int ret;
2754
2755 cred.uid = 0;
2756 cred.gid = 0;
2757 while (1) {
2758 if (!wait_for_sock(sock, 2)) {
2759 lxcfs_error("%s\n", "Timeout reading from parent.");
2760 return 1;
2761 }
2762 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2763 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2764 return 1;
2765 }
2766 if (vpid == -1) // done
2767 break;
2768 v = '0';
2769 cred.pid = vpid;
2770 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2771 v = '1';
2772 cred.pid = getpid();
2773 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2774 return 1;
2775 }
2776 }
2777 return 0;
2778 }
2779
2780 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2781 {
2782 int newnsfd = -1, ret, cpipe[2];
2783 char fnam[100];
2784 pid_t cpid;
2785 char v;
2786
2787 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2788 if (ret < 0 || ret >= sizeof(fnam))
2789 _exit(1);
2790 newnsfd = open(fnam, O_RDONLY);
2791 if (newnsfd < 0)
2792 _exit(1);
2793 if (setns(newnsfd, 0) < 0)
2794 _exit(1);
2795 close(newnsfd);
2796
2797 if (pipe(cpipe) < 0)
2798 _exit(1);
2799
2800 struct pid_ns_clone_args args = {
2801 .cpipe = cpipe,
2802 .sock = sock,
2803 .tpid = tpid,
2804 .wrapped = &pid_from_ns
2805 };
2806 size_t stack_size = sysconf(_SC_PAGESIZE);
2807 void *stack = alloca(stack_size);
2808
2809 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2810 if (cpid < 0)
2811 _exit(1);
2812
2813 // give the child 1 second to be done forking and
2814 // write its ack
2815 if (!wait_for_sock(cpipe[0], 1))
2816 _exit(1);
2817 ret = read(cpipe[0], &v, 1);
2818 if (ret != sizeof(char) || v != '1')
2819 _exit(1);
2820
2821 if (!wait_for_pid(cpid))
2822 _exit(1);
2823 _exit(0);
2824 }
2825
2826 /*
2827 * Given host @uid, return the uid to which it maps in
2828 * @pid's user namespace, or -1 if none.
2829 */
2830 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2831 {
2832 FILE *f;
2833 char line[400];
2834
2835 sprintf(line, "/proc/%d/uid_map", pid);
2836 if ((f = fopen(line, "r")) == NULL) {
2837 return false;
2838 }
2839
2840 *answer = convert_id_to_ns(f, uid);
2841 fclose(f);
2842
2843 if (*answer == -1)
2844 return false;
2845 return true;
2846 }
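/*
 * Sketch of the uid_map translation that convert_id_to_ns() (defined
 * elsewhere in this file) performs; never compiled, and map_host_uid is
 * a made-up name. Each /proc/<pid>/uid_map line reads
 * "ns_id host_id range", and a host uid maps into the namespace iff it
 * falls inside [host_id, host_id + range).
 */
#if 0
static uid_t map_host_uid(FILE *f, uid_t host)
{
uid_t ns_id, host_id, range;

while (fscanf(f, "%u %u %u", &ns_id, &host_id, &range) == 3)
if (host >= host_id && host < host_id + range)
return ns_id + (host - host_id);
return (uid_t)-1;
}
#endif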
2847
2848 /*
2849 * get_pid_creds: get the real uid and gid of @pid from
2850 * /proc/<pid>/status
2851 * (XXX should we use euid here?)
2852 */
2853 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2854 {
2855 char line[400];
2856 uid_t u;
2857 gid_t g;
2858 FILE *f;
2859
2860 *uid = -1;
2861 *gid = -1;
2862 sprintf(line, "/proc/%d/status", pid);
2863 if ((f = fopen(line, "r")) == NULL) {
2864 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2865 return;
2866 }
2867 while (fgets(line, 400, f)) {
2868 if (strncmp(line, "Uid:", 4) == 0) {
2869 if (sscanf(line+4, "%u", &u) != 1) {
2870 lxcfs_error("bad uid line for pid %u\n", pid);
2871 fclose(f);
2872 return;
2873 }
2874 *uid = u;
2875 } else if (strncmp(line, "Gid:", 4) == 0) {
2876 if (sscanf(line+4, "%u", &g) != 1) {
2877 lxcfs_error("bad gid line for pid %u\n", pid);
2878 fclose(f);
2879 return;
2880 }
2881 *gid = g;
2882 }
2883 }
2884 fclose(f);
2885 }
2886
2887 /*
2888 * May the requestor @r move victim @v to a new cgroup?
2889 * This is allowed if
2890 * . they are the same task
2891 * . they are owned by the same uid
2892 * . @r is root on the host, or
2893 * . @v's uid is mapped into @r's where @r is root.
2894 */
2895 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2896 {
2897 uid_t v_uid, tmpuid;
2898 gid_t v_gid;
2899
2900 if (r == v)
2901 return true;
2902 if (r_uid == 0)
2903 return true;
2904 get_pid_creds(v, &v_uid, &v_gid);
2905 if (r_uid == v_uid)
2906 return true;
2907 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2908 && hostuid_to_ns(v_uid, r, &tmpuid))
2909 return true;
2910 return false;
2911 }
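/*
 * Example: uid 1000 may move its own tasks; host root may move anything;
 * and a container root whose user namespace maps both its own uid and
 * the victim's uid may move the victim even though neither task is
 * owned by host root.
 */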
2912
2913 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2914 const char *file, const char *buf)
2915 {
2916 int sock[2] = {-1, -1};
2917 pid_t qpid, cpid = -1;
2918 FILE *pids_file = NULL;
2919 bool answer = false, fail = false;
2920
2921 pids_file = open_pids_file(contrl, cg);
2922 if (!pids_file)
2923 return false;
2924
2925 /*
2926 * write the pids to a socket, have helper in writer's pidns
2927 * call movepid for us
2928 */
2929 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2930 perror("socketpair");
2931 goto out;
2932 }
2933
2934 cpid = fork();
2935 if (cpid == -1)
2936 goto out;
2937
2938 if (!cpid) { // child
2939 fclose(pids_file);
2940 pid_from_ns_wrapper(sock[1], tpid);
2941 }
2942
2943 const char *ptr = buf;
2944 while (sscanf(ptr, "%d", &qpid) == 1) {
2945 struct ucred cred;
2946 char v;
2947
2948 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2949 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2950 goto out;
2951 }
2952
2953 if (recv_creds(sock[0], &cred, &v)) {
2954 if (v == '0') {
2955 if (!may_move_pid(tpid, tuid, cred.pid)) {
2956 fail = true;
2957 break;
2958 }
2959 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2960 fail = true;
2961 }
2962 }
2963
2964 ptr = strchr(ptr, '\n');
2965 if (!ptr)
2966 break;
2967 ptr++;
2968 }
2969
2970 /* Done; send the -1 sentinel to tell the child to exit. */
2971 qpid = -1;
2972 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid))
2973 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2974
2975 if (!fail)
2976 answer = true;
2977
2978 out:
2979 if (cpid != -1)
2980 wait_for_pid(cpid);
2981 if (sock[0] != -1) {
2982 close(sock[0]);
2983 close(sock[1]);
2984 }
2985 if (pids_file) {
2986 if (fclose(pids_file) != 0)
2987 answer = false;
2988 }
2989 return answer;
2990 }
2991
2992 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2993 struct fuse_file_info *fi)
2994 {
2995 struct fuse_context *fc = fuse_get_context();
2996 char *localbuf = NULL;
2997 struct cgfs_files *k = NULL;
2998 struct file_info *f = (struct file_info *)fi->fh;
2999 bool r;
3000
3001 if (f->type != LXC_TYPE_CGFILE) {
3002 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3003 return -EIO;
3004 }
3005
3006 if (offset)
3007 return 0;
3008
3009 if (!fc)
3010 return -EIO;
3011
3012 localbuf = alloca(size+1);
3013 localbuf[size] = '\0';
3014 memcpy(localbuf, buf, size);
3015
3016 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3017 size = -EINVAL;
3018 goto out;
3019 }
3020
3021 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3022 size = -EACCES;
3023 goto out;
3024 }
3025
3026 if (strcmp(f->file, "tasks") == 0 ||
3027 strcmp(f->file, "/tasks") == 0 ||
3028 strcmp(f->file, "/cgroup.procs") == 0 ||
3029 strcmp(f->file, "cgroup.procs") == 0)
3030 // special case - we have to translate the pids
3031 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3032 else
3033 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3034
3035 if (!r)
3036 size = -EINVAL;
3037
3038 out:
3039 free_key(k);
3040 return size;
3041 }
3042
3043 int cg_chown(const char *path, uid_t uid, gid_t gid)
3044 {
3045 struct fuse_context *fc = fuse_get_context();
3046 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3047 struct cgfs_files *k = NULL;
3048 const char *cgroup;
3049 int ret;
3050
3051 if (!fc)
3052 return -EIO;
3053
3054 if (strcmp(path, "/cgroup") == 0)
3055 return -EPERM;
3056
3057 controller = pick_controller_from_path(fc, path);
3058 if (!controller)
3059 return errno == ENOENT ? -EPERM : -errno;
3060
3061 cgroup = find_cgroup_in_path(path);
3062 if (!cgroup)
3063 /* this is just /cgroup/controller */
3064 return -EPERM;
3065
3066 get_cgdir_and_path(cgroup, &cgdir, &last);
3067
3068 if (!last) {
3069 path1 = "/";
3070 path2 = cgdir;
3071 } else {
3072 path1 = cgdir;
3073 path2 = last;
3074 }
3075
3076 if (is_child_cgroup(controller, path1, path2)) {
3077 // get uid and gid from the 'tasks' file and make up a mode.
3078 // That is a hack until cgmanager gains a GetCgroupPerms fn.
3079 k = cgfs_get_key(controller, cgroup, "tasks");
3080
3081 } else
3082 k = cgfs_get_key(controller, path1, path2);
3083
3084 if (!k) {
3085 ret = -EINVAL;
3086 goto out;
3087 }
3088
3089 /*
3090 * This being a fuse request, the uid and gid must be valid
3091 * in the caller's namespace. So we can just check to make
3092 * sure that the caller is root in his uid, and privileged
3093 * over the file's current owner.
3094 */
3095 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3096 ret = -EACCES;
3097 goto out;
3098 }
3099
3100 ret = cgfs_chown_file(controller, cgroup, uid, gid);
3101
3102 out:
3103 free_key(k);
3104 free(cgdir);
3105
3106 return ret;
3107 }
3108
3109 int cg_chmod(const char *path, mode_t mode)
3110 {
3111 struct fuse_context *fc = fuse_get_context();
3112 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3113 struct cgfs_files *k = NULL;
3114 const char *cgroup;
3115 int ret;
3116
3117 if (!fc)
3118 return -EIO;
3119
3120 if (strcmp(path, "/cgroup") == 0)
3121 return -EPERM;
3122
3123 controller = pick_controller_from_path(fc, path);
3124 if (!controller)
3125 return errno == ENOENT ? -EPERM : -errno;
3126
3127 cgroup = find_cgroup_in_path(path);
3128 if (!cgroup)
3129 /* this is just /cgroup/controller */
3130 return -EPERM;
3131
3132 get_cgdir_and_path(cgroup, &cgdir, &last);
3133
3134 if (!last) {
3135 path1 = "/";
3136 path2 = cgdir;
3137 } else {
3138 path1 = cgdir;
3139 path2 = last;
3140 }
3141
3142 if (is_child_cgroup(controller, path1, path2)) {
3143 // get uid and gid from the 'tasks' file and make up a mode.
3144 // That is a hack until cgmanager gains a GetCgroupPerms fn.
3145 k = cgfs_get_key(controller, cgroup, "tasks");
3146
3147 } else
3148 k = cgfs_get_key(controller, path1, path2);
3149
3150 if (!k) {
3151 ret = -EINVAL;
3152 goto out;
3153 }
3154
3155 /*
3156 * This being a fuse request, the uid and gid must be valid
3157 * in the caller's namespace. So we can just check to make
3158 * sure that the caller is root in his uid, and privileged
3159 * over the file's current owner.
3160 */
3161 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3162 ret = -EPERM;
3163 goto out;
3164 }
3165
3166 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3167 ret = -EINVAL;
3168 goto out;
3169 }
3170
3171 ret = 0;
3172 out:
3173 free_key(k);
3174 free(cgdir);
3175 return ret;
3176 }
3177
3178 int cg_mkdir(const char *path, mode_t mode)
3179 {
3180 struct fuse_context *fc = fuse_get_context();
3181 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3182 const char *cgroup;
3183 int ret;
3184
3185 if (!fc)
3186 return -EIO;
3187
3188 controller = pick_controller_from_path(fc, path);
3189 if (!controller)
3190 return errno == ENOENT ? -EPERM : -errno;
3191
3192 cgroup = find_cgroup_in_path(path);
3193 if (!cgroup)
3194 return -errno;
3195
3196 get_cgdir_and_path(cgroup, &cgdir, &last);
3197 if (!last)
3198 path1 = "/";
3199 else
3200 path1 = cgdir;
3201
3202 pid_t initpid = lookup_initpid_in_store(fc->pid);
3203 if (initpid <= 0)
3204 initpid = fc->pid;
3205 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3206 if (!next)
3207 ret = -EINVAL;
3208 else if (last && strcmp(next, last) == 0)
3209 ret = -EEXIST;
3210 else
3211 ret = -EPERM;
3212 goto out;
3213 }
3214
3215 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3216 ret = -EACCES;
3217 goto out;
3218 }
3219 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3220 ret = -EACCES;
3221 goto out;
3222 }
3223
3224 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3225
3226 out:
3227 free(cgdir);
3228 free(next);
3229 return ret;
3230 }
3231
3232 int cg_rmdir(const char *path)
3233 {
3234 struct fuse_context *fc = fuse_get_context();
3235 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3236 const char *cgroup;
3237 int ret;
3238
3239 if (!fc)
3240 return -EIO;
3241
3242 controller = pick_controller_from_path(fc, path);
3243 if (!controller) /* Someone's trying to delete "/cgroup". */
3244 return -EPERM;
3245
3246 cgroup = find_cgroup_in_path(path);
3247 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3248 return -EPERM;
3249
3250 get_cgdir_and_path(cgroup, &cgdir, &last);
3251 if (!last) {
3252 /* Someone's trying to delete a cgroup on the same level as the
3253 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3254 * rmdir "/cgroup/blkio/init.slice".
3255 */
3256 ret = -EPERM;
3257 goto out;
3258 }
3259
3260 pid_t initpid = lookup_initpid_in_store(fc->pid);
3261 if (initpid <= 0)
3262 initpid = fc->pid;
3263 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3264 if (!last || (next && (strcmp(next, last) == 0)))
3265 ret = -EBUSY;
3266 else
3267 ret = -ENOENT;
3268 goto out;
3269 }
3270
3271 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3272 ret = -EACCES;
3273 goto out;
3274 }
3275 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3276 ret = -EACCES;
3277 goto out;
3278 }
3279
3280 if (!cgfs_remove(controller, cgroup)) {
3281 ret = -EINVAL;
3282 goto out;
3283 }
3284
3285 ret = 0;
3286
3287 out:
3288 free(cgdir);
3289 free(next);
3290 return ret;
3291 }
3292
3293 static bool startswith(const char *line, const char *pref)
3294 {
3295 if (strncmp(line, pref, strlen(pref)) == 0)
3296 return true;
3297 return false;
3298 }
3299
3300 static void parse_memstat(char *memstat, unsigned long *cached,
3301 unsigned long *active_anon, unsigned long *inactive_anon,
3302 unsigned long *active_file, unsigned long *inactive_file,
3303 unsigned long *unevictable, unsigned long *shmem)
3304 {
3305 char *eol;
3306
3307 while (*memstat) {
3308 if (startswith(memstat, "total_cache")) {
3309 sscanf(memstat + 11, "%lu", cached);
3310 *cached /= 1024;
3311 } else if (startswith(memstat, "total_active_anon")) {
3312 sscanf(memstat + 17, "%lu", active_anon);
3313 *active_anon /= 1024;
3314 } else if (startswith(memstat, "total_inactive_anon")) {
3315 sscanf(memstat + 19, "%lu", inactive_anon);
3316 *inactive_anon /= 1024;
3317 } else if (startswith(memstat, "total_active_file")) {
3318 sscanf(memstat + 17, "%lu", active_file);
3319 *active_file /= 1024;
3320 } else if (startswith(memstat, "total_inactive_file")) {
3321 sscanf(memstat + 19, "%lu", inactive_file);
3322 *inactive_file /= 1024;
3323 } else if (startswith(memstat, "total_unevictable")) {
3324 sscanf(memstat + 17, "%lu", unevictable);
3325 *unevictable /= 1024;
3326 } else if (startswith(memstat, "total_shmem")) {
3327 sscanf(memstat + 11, "%lu", shmem);
3328 *shmem /= 1024;
3329 }
3330 eol = strchr(memstat, '\n');
3331 if (!eol)
3332 return;
3333 memstat = eol+1;
3334 }
3335 }
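/*
 * Worked example for parse_memstat(), never compiled: memory.stat
 * carries byte counters which are converted to kB above, so this input
 * yields cached == 1024 and a_anon == 2048.
 */
#if 0
char stat[] = "total_cache 1048576\ntotal_active_anon 2097152\n";
unsigned long cached = 0, a_anon = 0, i_anon = 0, a_file = 0, i_file = 0, unevict = 0, shm = 0;
parse_memstat(stat, &cached, &a_anon, &i_anon, &a_file, &i_file, &unevict, &shm);
#endif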
3336
3337 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3338 {
3339 char *eol;
3340 char key[32];
3341
3342 memset(key, 0, 32);
3343 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3344
3345 size_t len = strlen(key);
3346 *v = 0;
3347
3348 while (*str) {
3349 if (startswith(str, key)) {
3350 sscanf(str + len, "%lu", v);
3351 return;
3352 }
3353 eol = strchr(str, '\n');
3354 if (!eol)
3355 return;
3356 str = eol+1;
3357 }
3358 }
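/*
 * Sketch for get_blkio_io_value(), never compiled; `str` is assumed to
 * hold the contents of e.g. blkio.io_service_bytes, whose per-device
 * lines look like "8:0 Read 4096". This extracts the Read counter for
 * device 8:0.
 */
#if 0
unsigned long rd = 0;
get_blkio_io_value(str, 8, 0, "Read", &rd);
#endif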
3359
3360 static int read_file(const char *path, char *buf, size_t size,
3361 struct file_info *d)
3362 {
3363 size_t linelen = 0, total_len = 0, rv = 0;
3364 char *line = NULL;
3365 char *cache = d->buf;
3366 size_t cache_size = d->buflen;
3367 FILE *f = fopen(path, "r");
3368 if (!f)
3369 return 0;
3370
3371 while (getline(&line, &linelen, f) != -1) {
3372 ssize_t l = snprintf(cache, cache_size, "%s", line);
3373 if (l < 0) {
3374 perror("Error writing to cache");
3375 rv = 0;
3376 goto err;
3377 }
3378 if (l >= cache_size) {
3379 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3380 rv = 0;
3381 goto err;
3382 }
3383 cache += l;
3384 cache_size -= l;
3385 total_len += l;
3386 }
3387
3388 d->size = total_len;
3389 if (total_len > size)
3390 total_len = size;
3391
3392 /* read from off 0 */
3393 memcpy(buf, d->buf, total_len);
3394 rv = total_len;
3395 err:
3396 fclose(f);
3397 free(line);
3398 return rv;
3399 }
3400
3401 /*
3402 * FUSE ops for /proc
3403 */
3404
3405 static unsigned long get_memlimit(const char *cgroup, const char *file)
3406 {
3407 char *memlimit_str = NULL;
3408 unsigned long memlimit = -1;
3409
3410 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3411 memlimit = strtoul(memlimit_str, NULL, 10);
3412
3413 free(memlimit_str);
3414
3415 return memlimit;
3416 }
3417
3418 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3419 {
3420 char *copy = strdupa(cgroup);
3421 unsigned long memlimit = 0, retlimit;
3422
3423 retlimit = get_memlimit(copy, file);
3424
3425 while (strcmp(copy, "/") != 0) {
3426 copy = dirname(copy);
3427 memlimit = get_memlimit(copy, file);
3428 if (memlimit != -1 && memlimit < retlimit)
3429 retlimit = memlimit;
3430 }
3431
3432 return retlimit;
3433 }
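/*
 * Example: for cg == "/lxc/c1" the limit is read at "/lxc/c1", then
 * "/lxc", then "/", and the smallest value wins, so a tighter
 * memory.limit_in_bytes on a parent cgroup caps the child.
 */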
3434
3435 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3436 struct fuse_file_info *fi)
3437 {
3438 struct fuse_context *fc = fuse_get_context();
3439 struct file_info *d = (struct file_info *)fi->fh;
3440 char *cg;
3441 char *memusage_str = NULL, *memstat_str = NULL,
3442 *memswlimit_str = NULL, *memswusage_str = NULL;
3443 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3444 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3445 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3446 hostswtotal = 0;
3447 char *line = NULL;
3448 size_t linelen = 0, total_len = 0, rv = 0;
3449 char *cache = d->buf;
3450 size_t cache_size = d->buflen;
3451 FILE *f = NULL;
3452
3453 if (offset){
3454 if (offset > d->size)
3455 return -EINVAL;
3456 if (!d->cached)
3457 return 0;
3458 int left = d->size - offset;
3459 total_len = left > size ? size: left;
3460 memcpy(buf, cache + offset, total_len);
3461 return total_len;
3462 }
3463
3464 pid_t initpid = lookup_initpid_in_store(fc->pid);
3465 if (initpid <= 0)
3466 initpid = fc->pid;
3467 cg = get_pid_cgroup(initpid, "memory");
3468 if (!cg)
3469 return read_file("/proc/meminfo", buf, size, d);
3470 prune_init_slice(cg);
3471
3472 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3473 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3474 goto err;
3475 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3476 goto err;
3477
3478 // The following values are allowed to fail, because swap accounting
3479 // may be disabled in the current kernel.
3480 if (cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3481 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3482 {
3483 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3484 memswusage = strtoul(memswusage_str, NULL, 10);
3485
3486 memswlimit = memswlimit / 1024;
3487 memswusage = memswusage / 1024;
3488 }
3489
3490 memusage = strtoul(memusage_str, NULL, 10);
3491 memlimit /= 1024;
3492 memusage /= 1024;
3493
3494 parse_memstat(memstat_str, &cached, &active_anon,
3495 &inactive_anon, &active_file, &inactive_file,
3496 &unevictable, &shmem);
3497
3498 f = fopen("/proc/meminfo", "r");
3499 if (!f)
3500 goto err;
3501
3502 while (getline(&line, &linelen, f) != -1) {
3503 ssize_t l;
3504 char *printme, lbuf[100];
3505
3506 memset(lbuf, 0, 100);
3507 if (startswith(line, "MemTotal:")) {
3508 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3509 if (hosttotal < memlimit)
3510 memlimit = hosttotal;
3511 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3512 printme = lbuf;
3513 } else if (startswith(line, "MemFree:")) {
3514 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3515 printme = lbuf;
3516 } else if (startswith(line, "MemAvailable:")) {
3517 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3518 printme = lbuf;
3519 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3520 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3521 if (hostswtotal < memswlimit)
3522 memswlimit = hostswtotal;
3523 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3524 printme = lbuf;
3525 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3526 unsigned long swaptotal = memswlimit,
3527 swapusage = memswusage - memusage,
3528 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3529 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3530 printme = lbuf;
3531 } else if (startswith(line, "Slab:")) {
3532 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3533 printme = lbuf;
3534 } else if (startswith(line, "Buffers:")) {
3535 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3536 printme = lbuf;
3537 } else if (startswith(line, "Cached:")) {
3538 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3539 printme = lbuf;
3540 } else if (startswith(line, "SwapCached:")) {
3541 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3542 printme = lbuf;
3543 } else if (startswith(line, "Active:")) {
3544 snprintf(lbuf, 100, "Active: %8lu kB\n",
3545 active_anon + active_file);
3546 printme = lbuf;
3547 } else if (startswith(line, "Inactive:")) {
3548 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3549 inactive_anon + inactive_file);
3550 printme = lbuf;
3551 } else if (startswith(line, "Active(anon)")) {
3552 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3553 printme = lbuf;
3554 } else if (startswith(line, "Inactive(anon)")) {
3555 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3556 printme = lbuf;
3557 } else if (startswith(line, "Active(file)")) {
3558 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3559 printme = lbuf;
3560 } else if (startswith(line, "Inactive(file)")) {
3561 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3562 printme = lbuf;
3563 } else if (startswith(line, "Unevictable")) {
3564 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3565 printme = lbuf;
3566 } else if (startswith(line, "SReclaimable")) {
3567 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3568 printme = lbuf;
3569 } else if (startswith(line, "SUnreclaim")) {
3570 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3571 printme = lbuf;
3572 } else if (startswith(line, "Shmem:")) {
3573 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3574 printme = lbuf;
3575 } else if (startswith(line, "ShmemHugePages")) {
3576 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3577 printme = lbuf;
3578 } else if (startswith(line, "ShmemPmdMapped")) {
3579 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3580 printme = lbuf;
3581 } else
3582 printme = line;
3583
3584 l = snprintf(cache, cache_size, "%s", printme);
3585 if (l < 0) {
3586 perror("Error writing to cache");
3587 rv = 0;
3588 goto err;
3589
3590 }
3591 if (l >= cache_size) {
3592 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3593 rv = 0;
3594 goto err;
3595 }
3596
3597 cache += l;
3598 cache_size -= l;
3599 total_len += l;
3600 }
3601
3602 d->cached = 1;
3603 d->size = total_len;
3604 if (total_len > size ) total_len = size;
3605 memcpy(buf, d->buf, total_len);
3606
3607 rv = total_len;
3608 err:
3609 if (f)
3610 fclose(f);
3611 free(line);
3612 free(cg);
3613 free(memusage_str);
3614 free(memswlimit_str);
3615 free(memswusage_str);
3616 free(memstat_str);
3617 return rv;
3618 }
3619
3620 /*
3621 * Read the cpuset.cpus for cg
3622 * Return the answer in a newly allocated string which must be freed
3623 */
3624 static char *get_cpuset(const char *cg)
3625 {
3626 char *answer;
3627
3628 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3629 return NULL;
3630 return answer;
3631 }
3632
3633 bool cpu_in_cpuset(int cpu, const char *cpuset);
3634
3635 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3636 {
3637 int cpu;
3638
3639 if (sscanf(line, "processor : %d", &cpu) != 1)
3640 return false;
3641 return cpu_in_cpuset(cpu, cpuset);
3642 }
3643
3644 /*
3645 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3646 * depending on `param`. The parameter's value is returned through `value`.
3647 */
3648 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3649 {
3650 bool rv = false;
3651 char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3652 char *str = NULL;
3653
3654 sprintf(file, "cpu.cfs_%s_us", param);
3655
3656 if (!cgfs_get_value("cpu", cg, file, &str))
3657 goto err;
3658
3659 if (sscanf(str, "%" SCNd64, value) != 1)
3660 goto err;
3661
3662 rv = true;
3663
3664 err:
3665 if (str)
3666 free(str);
3667 return rv;
3668 }
3669
3670 /*
3671 * Return the maximum number of visible CPUs based on CPU quotas.
3672 * If there is no quota set, zero is returned.
3673 */
3674 int max_cpu_count(const char *cg)
3675 {
3676 int rv, nprocs;
3677 int64_t cfs_quota, cfs_period;
3678
3679 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3680 return 0;
3681
3682 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3683 return 0;
3684
3685 if (cfs_quota <= 0 || cfs_period <= 0)
3686 return 0;
3687
3688 rv = cfs_quota / cfs_period;
3689
3690 /* In case quota/period does not yield a whole number, add one CPU for
3691 * the remainder.
3692 */
3693 if ((cfs_quota % cfs_period) > 0)
3694 rv += 1;
3695
3696 nprocs = get_nprocs();
3697
3698 if (rv > nprocs)
3699 rv = nprocs;
3700
3701 return rv;
3702 }
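/*
 * Example: cpu.cfs_quota_us == 150000 with cpu.cfs_period_us == 100000
 * gives 150000 / 100000 == 1 with a remainder, so two CPUs are
 * reported, but never more than get_nprocs() says the host has.
 */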
3703
3704 /*
3705 * Determine whether CPU views should be used or not.
3706 */
3707 bool use_cpuview(const char *cg)
3708 {
3709 int cfd;
3710 char *tmpc;
3711
3712 tmpc = find_mounted_controller("cpu", &cfd);
3713 if (!tmpc)
3714 return false;
3715
3716 tmpc = find_mounted_controller("cpuacct", &cfd);
3717 if (!tmpc)
3718 return false;
3719
3720 return true;
3721 }
3722
3723 /*
3724 * check whether this is a "^processor" line in /proc/cpuinfo
3725 */
3726 static bool is_processor_line(const char *line)
3727 {
3728 int cpu;
3729
3730 if (sscanf(line, "processor : %d", &cpu) == 1)
3731 return true;
3732 return false;
3733 }
3734
3735 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3736 struct fuse_file_info *fi)
3737 {
3738 struct fuse_context *fc = fuse_get_context();
3739 struct file_info *d = (struct file_info *)fi->fh;
3740 char *cg;
3741 char *cpuset = NULL;
3742 char *line = NULL;
3743 size_t linelen = 0, total_len = 0, rv = 0;
3744 bool am_printing = false, firstline = true, is_s390x = false;
3745 int curcpu = -1, cpu, max_cpus = 0;
3746 bool use_view;
3747 char *cache = d->buf;
3748 size_t cache_size = d->buflen;
3749 FILE *f = NULL;
3750
3751 if (offset){
3752 if (offset > d->size)
3753 return -EINVAL;
3754 if (!d->cached)
3755 return 0;
3756 int left = d->size - offset;
3757 total_len = left > size ? size: left;
3758 memcpy(buf, cache + offset, total_len);
3759 return total_len;
3760 }
3761
3762 pid_t initpid = lookup_initpid_in_store(fc->pid);
3763 if (initpid <= 0)
3764 initpid = fc->pid;
3765 cg = get_pid_cgroup(initpid, "cpuset");
3766 if (!cg)
3767 return read_file("/proc/cpuinfo", buf, size, d);
3768 prune_init_slice(cg);
3769
3770 cpuset = get_cpuset(cg);
3771 if (!cpuset)
3772 goto err;
3773
3774 use_view = use_cpuview(cg);
3775
3776 if (use_view)
3777 max_cpus = max_cpu_count(cg);
3778
3779 f = fopen("/proc/cpuinfo", "r");
3780 if (!f)
3781 goto err;
3782
3783 while (getline(&line, &linelen, f) != -1) {
3784 ssize_t l;
3785 if (firstline) {
3786 firstline = false;
3787 if (strstr(line, "IBM/S390") != NULL) {
3788 is_s390x = true;
3789 am_printing = true;
3790 continue;
3791 }
3792 }
3793 if (strncmp(line, "# processors:", 12) == 0)
3794 continue;
3795 if (is_processor_line(line)) {
3796 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3797 break;
3798 am_printing = cpuline_in_cpuset(line, cpuset);
3799 if (am_printing) {
3800 curcpu ++;
3801 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3802 if (l < 0) {
3803 perror("Error writing to cache");
3804 rv = 0;
3805 goto err;
3806 }
3807 if (l >= cache_size) {
3808 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3809 rv = 0;
3810 goto err;
3811 }
3812 cache += l;
3813 cache_size -= l;
3814 total_len += l;
3815 }
3816 continue;
3817 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3818 char *p;
3819 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3820 break;
3821 if (!cpu_in_cpuset(cpu, cpuset))
3822 continue;
3823 curcpu ++;
3824 p = strchr(line, ':');
3825 if (!p || !*p)
3826 goto err;
3827 p++;
3828 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3829 if (l < 0) {
3830 perror("Error writing to cache");
3831 rv = 0;
3832 goto err;
3833 }
3834 if (l >= cache_size) {
3835 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3836 rv = 0;
3837 goto err;
3838 }
3839 cache += l;
3840 cache_size -= l;
3841 total_len += l;
3842 continue;
3843
3844 }
3845 if (am_printing) {
3846 l = snprintf(cache, cache_size, "%s", line);
3847 if (l < 0) {
3848 perror("Error writing to cache");
3849 rv = 0;
3850 goto err;
3851 }
3852 if (l >= cache_size) {
3853 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3854 rv = 0;
3855 goto err;
3856 }
3857 cache += l;
3858 cache_size -= l;
3859 total_len += l;
3860 }
3861 }
3862
3863 if (is_s390x) {
3864 char *origcache = d->buf;
3865 ssize_t l;
3866 do {
3867 d->buf = malloc(d->buflen);
3868 } while (!d->buf);
3869 cache = d->buf;
3870 cache_size = d->buflen;
3871 total_len = 0;
3872 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3873 if (l < 0 || l >= cache_size) {
3874 free(origcache);
3875 goto err;
3876 }
3877 cache_size -= l;
3878 cache += l;
3879 total_len += l;
3880 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3881 if (l < 0 || l >= cache_size) {
3882 free(origcache);
3883 goto err;
3884 }
3885 cache_size -= l;
3886 cache += l;
3887 total_len += l;
3888 l = snprintf(cache, cache_size, "%s", origcache);
3889 free(origcache);
3890 if (l < 0 || l >= cache_size)
3891 goto err;
3892 total_len += l;
3893 }
3894
3895 d->cached = 1;
3896 d->size = total_len;
3897 if (total_len > size ) total_len = size;
3898
3899 /* read from off 0 */
3900 memcpy(buf, d->buf, total_len);
3901 rv = total_len;
3902 err:
3903 if (f)
3904 fclose(f);
3905 free(line);
3906 free(cpuset);
3907 free(cg);
3908 return rv;
3909 }
3910
3911 static uint64_t get_reaper_start_time(pid_t pid)
3912 {
3913 int ret;
3914 FILE *f;
3915 uint64_t starttime;
3916 /* strlen("/proc/") = 6
3917 * +
3918 * LXCFS_NUMSTRLEN64
3919 * +
3920 * strlen("/stat") = 5
3921 * +
3922 * \0 = 1
3923 */
3924 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3925 char path[__PROC_PID_STAT_LEN];
3926 pid_t qpid;
3927
3928 qpid = lookup_initpid_in_store(pid);
3929 if (qpid <= 0) {
3930 /* Caller can check for EINVAL on 0. */
3931 errno = EINVAL;
3932 return 0;
3933 }
3934
3935 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3936 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3937 /* Caller can check for EINVAL on 0. */
3938 errno = EINVAL;
3939 return 0;
3940 }
3941
3942 f = fopen(path, "r");
3943 if (!f) {
3944 /* Caller can check for EINVAL on 0. */
3945 errno = EINVAL;
3946 return 0;
3947 }
3948
3949 /* Note that the *scanf() argument suppression requires that length
3950 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3951 * at us. It's like telling someone you're not married and then asking
3952 * if you can bring your wife to the party.
3953 */
3954 ret = fscanf(f, "%*d " /* (1) pid %d */
3955 "%*s " /* (2) comm %s */
3956 "%*c " /* (3) state %c */
3957 "%*d " /* (4) ppid %d */
3958 "%*d " /* (5) pgrp %d */
3959 "%*d " /* (6) session %d */
3960 "%*d " /* (7) tty_nr %d */
3961 "%*d " /* (8) tpgid %d */
3962 "%*u " /* (9) flags %u */
3963 "%*u " /* (10) minflt %lu */
3964 "%*u " /* (11) cminflt %lu */
3965 "%*u " /* (12) majflt %lu */
3966 "%*u " /* (13) cmajflt %lu */
3967 "%*u " /* (14) utime %lu */
3968 "%*u " /* (15) stime %lu */
3969 "%*d " /* (16) cutime %ld */
3970 "%*d " /* (17) cstime %ld */
3971 "%*d " /* (18) priority %ld */
3972 "%*d " /* (19) nice %ld */
3973 "%*d " /* (20) num_threads %ld */
3974 "%*d " /* (21) itrealvalue %ld */
3975 "%" PRIu64, /* (22) starttime %llu */
3976 &starttime);
3977 if (ret != 1) {
3978 fclose(f);
3979 /* Caller can check for EINVAL on 0. */
3980 errno = EINVAL;
3981 return 0;
3982 }
3983
3984 fclose(f);
3985
3986 errno = 0;
3987 return starttime;
3988 }
3989
3990 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3991 {
3992 uint64_t clockticks;
3993 int64_t ticks_per_sec;
3994
3995 clockticks = get_reaper_start_time(pid);
3996 if (clockticks == 0 && errno == EINVAL) {
3997 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3998 return 0;
3999 }
4000
4001 ticks_per_sec = sysconf(_SC_CLK_TCK);
4002 if (ticks_per_sec < 0 && errno == EINVAL) {
4003 lxcfs_debug(
4004 "%s\n",
4005 "failed to determine number of clock ticks in a second");
4006 return 0;
4007 }
4008
4009 return clockticks / ticks_per_sec;
4010 }
4011
4012 static uint64_t get_reaper_age(pid_t pid)
4013 {
4014 uint64_t procstart, uptime, procage;
4015
4016 /* To get the actual reaper age, subtract the time at which the reaper
4017 * started (measured relative to system boot) from the current time
4018 * since boot.
4019 */
4020 procstart = get_reaper_start_time_in_sec(pid);
4021 procage = procstart;
4022 if (procstart > 0) {
4023 int ret;
4024 struct timespec spec;
4025
4026 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4027 if (ret < 0)
4028 return 0;
4029 /* We could make this more precise here by using the tv_nsec
4030 * field in the timespec struct and convert it to milliseconds
4031 * and then create a double for the seconds and milliseconds but
4032 * that seems more work than it is worth.
4033 */
4034 uptime = spec.tv_sec;
4035 procage = uptime - procstart;
4036 }
4037
4038 return procage;
4039 }
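/*
 * Example: with CLOCK_BOOTTIME at 5000s and a reaper that started 4990s
 * after boot, the age returned is 10s, i.e. roughly the uptime such a
 * container ought to report.
 */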
4040
4041 /*
4042 * Returns 0 on success.
4043 * It is the caller's responsibility to free `return_usage`, unless this
4044 * function returns an error.
4045 */
4046 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
4047 {
4048 int cpucount = get_nprocs();
4049 struct cpuacct_usage *cpu_usage;
4050 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4051 int cg_cpu;
4052 uint64_t cg_user, cg_system;
4053 int64_t ticks_per_sec;
4054 char *usage_str = NULL;
4055
4056 ticks_per_sec = sysconf(_SC_CLK_TCK);
4057
4058 if (ticks_per_sec < 0 && errno == EINVAL) {
4059 lxcfs_debug(
4060 "%s\n",
4061 "read_cpuacct_usage_all failed to determine number of clock ticks "
4062 "in a second");
4063 return -1;
4064 }
4065
4066 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4067 if (!cpu_usage)
4068 return -ENOMEM;
4069
4070 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4071 rv = -1;
4072 goto err;
4073 }
4074
4075 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4076 lxcfs_error("read_cpuacct_usage_all reading first line from "
4077 "%s/cpuacct.usage_all failed.\n", cg);
4078 rv = -1;
4079 goto err;
4080 }
4081
4082 read_pos += read_cnt;
4083
4084 for (i = 0, j = 0; i < cpucount; i++) {
4085 ret = sscanf(usage_str + read_pos, "%d %" SCNu64 " %" SCNu64 "\n%n", &cg_cpu, &cg_user,
4086 &cg_system, &read_cnt);
4087
4088 if (ret == EOF)
4089 break;
4090
4091 if (ret != 3) {
4092 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4093 "failed.\n", cg);
4094 rv = -1;
4095 goto err;
4096 }
4097
4098 read_pos += read_cnt;
4099
4100 if (!cpu_in_cpuset(i, cpuset))
4101 continue;
4102
4103 /* Convert the time from nanoseconds to USER_HZ */
4104 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4105 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4106 j++;
4107 }
4108
4109 rv = 0;
4110 *return_usage = cpu_usage;
4111
4112 err:
4113 if (usage_str)
4114 free(usage_str);
4115
4116 if (rv != 0) {
4117 free(cpu_usage);
4118 *return_usage = NULL;
4119 }
4120
4121 return rv;
4122 }
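/*
 * The cpuacct.usage_all format parsed above, for reference (values in
 * nanoseconds, converted to USER_HZ ticks per cpu):
 *
 * cpu user system
 * 0 6000000000 2000000000
 * 1 1000000000 0
 *
 * With sysconf(_SC_CLK_TCK) == 100, cpu 0 becomes user == 600 and
 * system == 200 ticks.
 */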
4123
4124 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4125 {
4126 int i;
4127 unsigned long sum = 0;
4128
4129 for (i = 0; i < cpu_count; i++) {
4130 /* When cpuset is changed on the fly, the CPUs might get reordered.
4131 * We could either reset all counters, or check that the subtractions
4132 * below will return expected results.
4133 */
4134 if (newer[i].user > older[i].user)
4135 diff[i].user = newer[i].user - older[i].user;
4136 else
4137 diff[i].user = 0;
4138
4139 if (newer[i].system > older[i].system)
4140 diff[i].system = newer[i].system - older[i].system;
4141 else
4142 diff[i].system = 0;
4143
4144 if (newer[i].idle > older[i].idle)
4145 diff[i].idle = newer[i].idle - older[i].idle;
4146 else
4147 diff[i].idle = 0;
4148
4149 sum += diff[i].user;
4150 sum += diff[i].system;
4151 sum += diff[i].idle;
4152 }
4153
4154 return sum;
4155 }
4156
4157 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4158 {
4159 unsigned long free_space, to_add;
4160
4161 free_space = threshold - usage->user - usage->system;
4162
4163 if (free_space > usage->idle)
4164 free_space = usage->idle;
4165
4166 to_add = free_space > *surplus ? *surplus : free_space;
4167
4168 *counter += to_add;
4169 usage->idle -= to_add;
4170 *surplus -= to_add;
4171 }
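/*
 * Worked example for add_cpu_usage(): with threshold == 100 and
 * usage->user + usage->system == 70, free_space starts at 30; given
 * usage->idle == 40 and *surplus == 50, to_add == 30, so the counter
 * gains 30 ticks, idle drops to 10, and 20 surplus ticks remain.
 */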
4172
4173 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4174 {
4175 struct cg_proc_stat *first = NULL, *prev, *tmp;
4176
4177 for (prev = NULL; node; ) {
4178 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4179 tmp = node;
4180 lxcfs_debug("Removing stat node for %s\n", node->cg);
4181
4182 if (prev)
4183 prev->next = node->next;
4184 else
4185 first = node->next;
4186
4187 node = node->next;
4188 free_proc_stat_node(tmp);
4189 } else {
4190 if (!first)
4191 first = node;
4192 prev = node;
4193 node = node->next;
4194 }
4195 }
4196
4197 return first;
4198 }
4199
4200 #define PROC_STAT_PRUNE_INTERVAL 10
4201 static void prune_proc_stat_history(void)
4202 {
4203 int i;
4204 time_t now = time(NULL);
4205
4206 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4207 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4208
4209 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4210 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4211 return;
4212 }
4213
4214 if (proc_stat_history[i]->next) {
4215 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4216 proc_stat_history[i]->lastcheck = now;
4217 }
4218
4219 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4220 }
4221 }
4222
4223 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4224 {
4225 struct cg_proc_stat *node;
4226
4227 pthread_rwlock_rdlock(&head->lock);
4228
4229 if (!head->next) {
4230 pthread_rwlock_unlock(&head->lock);
4231 return NULL;
4232 }
4233
4234 node = head->next;
4235
4236 do {
4237 if (strcmp(cg, node->cg) == 0)
4238 goto out;
4239 } while ((node = node->next));
4240
4241 node = NULL;
4242
4243 out:
4244 pthread_rwlock_unlock(&head->lock);
4245 prune_proc_stat_history();
4246 return node;
4247 }
4248
4249 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4250 {
4251 struct cg_proc_stat *node;
4252 int i;
4253
4254 node = malloc(sizeof(struct cg_proc_stat));
4255 if (!node)
4256 goto err;
4257
4258 node->cg = NULL;
4259 node->usage = NULL;
4260 node->view = NULL;
4261
4262 node->cg = malloc(strlen(cg) + 1);
4263 if (!node->cg)
4264 goto err;
4265
4266 strcpy(node->cg, cg);
4267
4268 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4269 if (!node->usage)
4270 goto err;
4271
4272 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4273
4274 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4275 if (!node->view)
4276 goto err;
4277
4278 node->cpu_count = cpu_count;
4279 node->next = NULL;
4280
4281 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4282 lxcfs_error("%s\n", "Failed to initialize node lock");
4283 goto err;
4284 }
4285
4286 for (i = 0; i < cpu_count; i++) {
4287 node->view[i].user = 0;
4288 node->view[i].system = 0;
4289 node->view[i].idle = 0;
4290 }
4291
4292 return node;
4293
4294 err:
4295 if (node && node->cg)
4296 free(node->cg);
4297 if (node && node->usage)
4298 free(node->usage);
4299 if (node && node->view)
4300 free(node->view);
4301 if (node)
4302 free(node);
4303
4304 return NULL;
4305 }
4306
4307 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4308 {
4309 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4310 struct cg_proc_stat_head *head = proc_stat_history[hash];
4311 struct cg_proc_stat *node, *rv = new_node;
4312
4313 pthread_rwlock_wrlock(&head->lock);
4314
4315 if (!head->next) {
4316 head->next = new_node;
4317 goto out;
4318 }
4319
4320 node = head->next;
4321
4322 for (;;) {
4323 if (strcmp(node->cg, new_node->cg) == 0) {
4324 /* The node is already present, return it */
4325 free_proc_stat_node(new_node);
4326 rv = node;
4327 goto out;
4328 }
4329
4330 if (node->next) {
4331 node = node->next;
4332 continue;
4333 }
4334
4335 node->next = new_node;
4336 goto out;
4337 }
4338
4339 out:
4340 pthread_rwlock_unlock(&head->lock);
4341 return rv;
4342 }
4343
4344 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4345 {
4346 struct cpuacct_usage *new_usage, *new_view;
4347 int i;
4348
4349 /* Allocate new memory */
4350 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4351 if (!new_usage)
4352 return false;
4353
4354 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4355 if (!new_view) {
4356 free(new_usage);
4357 return false;
4358 }
4359
4360 /* Copy existing data & initialize new elements */
4361 for (i = 0; i < cpu_count; i++) {
4362 if (i < node->cpu_count) {
4363 new_usage[i].user = node->usage[i].user;
4364 new_usage[i].system = node->usage[i].system;
4365 new_usage[i].idle = node->usage[i].idle;
4366
4367 new_view[i].user = node->view[i].user;
4368 new_view[i].system = node->view[i].system;
4369 new_view[i].idle = node->view[i].idle;
4370 } else {
4371 new_usage[i].user = 0;
4372 new_usage[i].system = 0;
4373 new_usage[i].idle = 0;
4374
4375 new_view[i].user = 0;
4376 new_view[i].system = 0;
4377 new_view[i].idle = 0;
4378 }
4379 }
4380
4381 free(node->usage);
4382 free(node->view);
4383
4384 node->usage = new_usage;
4385 node->view = new_view;
4386 node->cpu_count = cpu_count;
4387
4388 return true;
4389 }
4390
4391 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4392 {
4393 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4394 struct cg_proc_stat_head *head = proc_stat_history[hash];
4395 struct cg_proc_stat *node;
4396
4397 node = find_proc_stat_node(head, cg);
4398
4399 if (!node) {
4400 node = new_proc_stat_node(usage, cpu_count, cg);
4401 if (!node)
4402 return NULL;
4403
4404 node = add_proc_stat_node(node);
4405 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4406 }
4407
4408 pthread_mutex_lock(&node->lock);
4409
4410 /* If additional CPUs on the host have been enabled, CPU usage counter
4411 * arrays have to be expanded */
4412 if (node->cpu_count < cpu_count) {
4413 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4414 node->cpu_count, cpu_count, cg);
4415
4416 if (!expand_proc_stat_node(node, cpu_count)) {
4417 pthread_mutex_unlock(&node->lock);
4418 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4419 node->cpu_count, cpu_count, cg);
4420 return NULL;
4421 }
4422 }
4423
4424 return node;
4425 }
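/*
 * NOTE: on success this function returns with node->lock held; the caller
 * (cpuview_proc_stat() below) must drop it, see its err: label. When
 * expand_proc_stat_node() fails, the lock has already been released here.
 */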
4426
4427 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4428 {
4429 int i;
4430
4431 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4432 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4433
4434 for (i = 0; i < cpu_count; i++) {
4435 node->view[i].user = 0;
4436 node->view[i].system = 0;
4437 node->view[i].idle = 0;
4438 }
4439
4440 node->cpu_count = cpu_count;
4441 }
4442
4443 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, FILE *f, char *buf, size_t buf_size)
4444 {
4445 char *line = NULL;
4446 size_t linelen = 0, total_len = 0, rv = 0; ssize_t l; /* l must be signed: the l < 0 checks below can never fire for a size_t */
4447 int curcpu = -1; /* cpu numbering starts at 0 */
4448 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4449 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4450 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4451 unsigned long user_surplus = 0, system_surplus = 0;
4452 unsigned long total_sum, threshold;
4453 struct cg_proc_stat *stat_node = NULL; /* NULL so the err: path can test it safely */
4454 struct cpuacct_usage *diff = NULL;
4455 int nprocs = get_nprocs();
4456
4457 /* Read all CPU stats and stop when we've encountered other lines */
4458 while (getline(&line, &linelen, f) != -1) {
4459 int cpu, ret;
4460 char cpu_char[10]; /* That's a lot of cores */
4461 uint64_t all_used, cg_used;
4462
4463 if (strlen(line) == 0)
4464 continue;
4465 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4466 /* not a ^cpuN line containing a number N */
4467 break;
4468 }
4469
4470 if (sscanf(cpu_char, "%d", &cpu) != 1)
4471 continue;
4472 if (!cpu_in_cpuset(cpu, cpuset))
4473 continue;
4474 curcpu++;
4475 cpu_cnt++;
4476
4477 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4478 &user,
4479 &nice,
4480 &system,
4481 &idle,
4482 &iowait,
4483 &irq,
4484 &softirq,
4485 &steal,
4486 &guest,
4487 &guest_nice);
4488
4489 if (ret != 10)
4490 continue;
4491
4492 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4493 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4494
4495 if (all_used >= cg_used) {
4496 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4497
4498 } else {
4499 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4500 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4501 curcpu, cg, all_used, cg_used);
4502 cg_cpu_usage[curcpu].idle = idle;
4503 }
4504 }
4505
4506 /* Cannot use more CPUs than are available due to cpuset */
4507 if (max_cpus > cpu_cnt)
4508 max_cpus = cpu_cnt;
4509
4510 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4511
4512 if (!stat_node) {
4513 lxcfs_error("unable to find/create stat node for %s\n", cg);
4514 rv = 0;
4515 goto err;
4516 }
4517
4518 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4519 if (!diff) {
4520 rv = 0;
4521 goto err;
4522 }
4523
4524 /*
4525 * If the new values are LOWER than values stored in memory, it means
4526 * the cgroup has been reset/recreated and we should reset too.
4527 */
4528 if (cg_cpu_usage[0].user < stat_node->usage[0].user)
4529 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4530
4531 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, cpu_cnt);
4532
4533 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4534 stat_node->usage[curcpu].user += diff[curcpu].user;
4535 stat_node->usage[curcpu].system += diff[curcpu].system;
4536 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4537
4538 if (max_cpus > 0 && curcpu >= max_cpus) {
4539 user_surplus += diff[curcpu].user;
4540 system_surplus += diff[curcpu].system;
4541 }
4542 }
4543
4544 /* Calculate usage counters of visible CPUs */
4545 if (max_cpus > 0) {
4546 /* threshold = maximum usage per cpu, including idle */
4547 threshold = total_sum / cpu_cnt * max_cpus;
4548
4549 for (curcpu = 0; curcpu < max_cpus; curcpu++) {
4550 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4551 continue;
4552
4553 /* Add user */
4554 add_cpu_usage(
4555 &user_surplus,
4556 &diff[curcpu],
4557 &diff[curcpu].user,
4558 threshold);
4559
4560 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4561 continue;
4562
4563 /* If there is still room, add system */
4564 add_cpu_usage(
4565 &system_surplus,
4566 &diff[curcpu],
4567 &diff[curcpu].system,
4568 threshold);
4569 }
4570
4571 if (user_surplus > 0)
4572 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4573 if (system_surplus > 0)
4574 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4575
4576 for (curcpu = 0; curcpu < max_cpus; curcpu++) {
4577 stat_node->view[curcpu].user += diff[curcpu].user;
4578 stat_node->view[curcpu].system += diff[curcpu].system;
4579 stat_node->view[curcpu].idle += diff[curcpu].idle;
4580
4581 user_sum += stat_node->view[curcpu].user;
4582 system_sum += stat_node->view[curcpu].system;
4583 idle_sum += stat_node->view[curcpu].idle;
4584 }
4585
4586 } else {
4587 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4588 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4589 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4590 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4591
4592 user_sum += stat_node->view[curcpu].user;
4593 system_sum += stat_node->view[curcpu].system;
4594 idle_sum += stat_node->view[curcpu].idle;
4595 }
4596 }
4597
4598 /* Render the file */
4599 /* cpu-all */
4600 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4601 user_sum,
4602 system_sum,
4603 idle_sum);
4604
4605 if (l < 0) {
4606 perror("Error writing to cache");
4607 rv = 0;
4608 goto err;
4610 }
4611 if (l >= buf_size) {
4612 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4613 rv = 0;
4614 goto err;
4615 }
4616
4617 buf += l;
4618 buf_size -= l;
4619 total_len += l;
4620
4621 /* Render visible CPUs */
4622 for (curcpu = 0; curcpu < cpu_cnt; curcpu++) {
4623 if (max_cpus > 0 && curcpu == max_cpus)
4624 break;
4625
4626 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4627 curcpu,
4628 stat_node->view[curcpu].user,
4629 stat_node->view[curcpu].system,
4630 stat_node->view[curcpu].idle);
4631
4632 if (l < 0) {
4633 perror("Error writing to cache");
4634 rv = 0;
4635 goto err;
4637 }
4638 if (l >= buf_size) {
4639 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4640 rv = 0;
4641 goto err;
4642 }
4643
4644 buf += l;
4645 buf_size -= l;
4646 total_len += l;
4647 }
4648
4649 /* Pass the rest of /proc/stat, start with the last line read */
4650 l = snprintf(buf, buf_size, "%s", line);
4651
4652 if (l < 0) {
4653 perror("Error writing to cache");
4654 rv = 0;
4655 goto err;
4657 }
4658 if (l >= buf_size) {
4659 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4660 rv = 0;
4661 goto err;
4662 }
4663
4664 buf += l;
4665 buf_size -= l;
4666 total_len += l;
4667
4668 /* Pass the rest of the host's /proc/stat */
4669 while (getline(&line, &linelen, f) != -1) {
4670 l = snprintf(buf, buf_size, "%s", line);
4671 if (l < 0) {
4672 perror("Error writing to cache");
4673 rv = 0;
4674 goto err;
4675 }
4676 if (l >= buf_size) {
4677 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4678 rv = 0;
4679 goto err;
4680 }
4681 buf += l;
4682 buf_size -= l;
4683 total_len += l;
4684 }
4685
4686 rv = total_len;
4687
4688 err:
4689 if (stat_node)
4690 pthread_mutex_unlock(&stat_node->lock);
4691 if (line)
4692 free(line);
4693 if (diff)
4694 free(diff);
4695 return rv;
4696 }
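/*
 * Worked example (invented numbers, for illustration only): with
 * cpu_cnt = 4 host CPUs in the cpuset but max_cpus = 2 allowed by the
 * quota, and total_sum = 1000 ticks of delta over all CPUs, the cap is
 *
 *   threshold = total_sum / cpu_cnt * max_cpus = 1000 / 4 * 2 = 500
 *
 * Deltas accrued on cpus 2 and 3 become user/system surplus, and
 * add_cpu_usage() folds that surplus into cpus 0 and 1 until each of
 * them reaches the threshold. The container thus sees two busy CPUs
 * rather than a little time smeared across four.
 */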
4697
4698 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4699 static int proc_stat_read(char *buf, size_t size, off_t offset,
4700 struct fuse_file_info *fi)
4701 {
4702 struct fuse_context *fc = fuse_get_context();
4703 struct file_info *d = (struct file_info *)fi->fh;
4704 char *cg;
4705 char *cpuset = NULL;
4706 char *line = NULL;
4707 size_t linelen = 0, total_len = 0, rv = 0;
4708 int curcpu = -1; /* cpu numbering starts at 0 */
4709 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4710 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4711 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4712 char cpuall[CPUALL_MAX_SIZE];
4713 /* reserve for cpu all */
4714 char *cache = d->buf + CPUALL_MAX_SIZE;
4715 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4716 FILE *f = NULL;
4717 struct cpuacct_usage *cg_cpu_usage = NULL;
4718
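/*
 * Buffer layout sketch (for orientation): the first CPUALL_MAX_SIZE bytes
 * of d->buf are reserved for the aggregate "cpu ..." line, which can only
 * be produced once the per-CPU lines have been summed:
 *
 *   d->buf: [ reserved for "cpu ..." | per-cpu and passthrough lines ]
 *
 * After the read loop the aggregate is written at the front and the
 * cached region is memmove()d up directly behind it.
 */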
4719 if (offset) {
4720 if (offset > d->size)
4721 return -EINVAL;
4722 if (!d->cached)
4723 return 0;
4724 int left = d->size - offset;
4725 total_len = left > size ? size: left;
4726 memcpy(buf, d->buf + offset, total_len);
4727 return total_len;
4728 }
4729
4730 pid_t initpid = lookup_initpid_in_store(fc->pid);
4731 if (initpid <= 0)
4732 initpid = fc->pid;
4733 cg = get_pid_cgroup(initpid, "cpuset");
4734 if (!cg)
4735 return read_file("/proc/stat", buf, size, d);
4736 prune_init_slice(cg);
4737
4738 cpuset = get_cpuset(cg);
4739 if (!cpuset)
4740 goto err;
4741
4742 /*
4743 * Read cpuacct.usage_all for all CPUs.
4744 * If the cpuacct cgroup is present, it is used to calculate the container's
4745 * CPU usage. If not, values from the host's /proc/stat are used.
4746 */
4747 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
4748 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4749 "falling back to the host's /proc/stat");
4750 }
4751
4752 f = fopen("/proc/stat", "r");
4753 if (!f)
4754 goto err;
4755
4756 // Skip the first line (the "cpu" aggregate); it is recomputed per container below.
4757 if (getline(&line, &linelen, f) < 0) {
4758 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4759 goto err;
4760 }
4761
4762 if (use_cpuview(cg) && cg_cpu_usage) {
4763 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, f, d->buf, d->buflen);
4764 goto out;
4765 }
4766
4767 while (getline(&line, &linelen, f) != -1) {
4768 ssize_t l;
4769 int cpu;
4770 char cpu_char[10]; /* That's a lot of cores */
4771 char *c;
4772 uint64_t all_used, cg_used, new_idle;
4773 int ret;
4774
4775 if (strlen(line) == 0)
4776 continue;
4777 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4778 /* not a ^cpuN line containing a number N, just print it */
4779 l = snprintf(cache, cache_size, "%s", line);
4780 if (l < 0) {
4781 perror("Error writing to cache");
4782 rv = 0;
4783 goto err;
4784 }
4785 if (l >= cache_size) {
4786 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4787 rv = 0;
4788 goto err;
4789 }
4790 cache += l;
4791 cache_size -= l;
4792 total_len += l;
4793 continue;
4794 }
4795
4796 if (sscanf(cpu_char, "%d", &cpu) != 1)
4797 continue;
4798 if (!cpu_in_cpuset(cpu, cpuset))
4799 continue;
4800 curcpu ++;
4801
4802 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4803 &user,
4804 &nice,
4805 &system,
4806 &idle,
4807 &iowait,
4808 &irq,
4809 &softirq,
4810 &steal,
4811 &guest,
4812 &guest_nice);
4813
4814 if (ret != 10 || !cg_cpu_usage) {
4815 c = strchr(line, ' ');
4816 if (!c)
4817 continue;
4818 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4819 if (l < 0) {
4820 perror("Error writing to cache");
4821 rv = 0;
4822 goto err;
4824 }
4825 if (l >= cache_size) {
4826 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4827 rv = 0;
4828 goto err;
4829 }
4830
4831 cache += l;
4832 cache_size -= l;
4833 total_len += l;
4834
4835 if (ret != 10)
4836 continue;
4837 }
4838
4839 if (cg_cpu_usage) {
4840 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4841 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4842
4843 if (all_used >= cg_used) {
4844 new_idle = idle + (all_used - cg_used);
4845
4846 } else {
4847 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4848 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4849 curcpu, cg, all_used, cg_used);
4850 new_idle = idle;
4851 }
4852
4853 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4854 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4855 new_idle);
4856
4857 if (l < 0) {
4858 perror("Error writing to cache");
4859 rv = 0;
4860 goto err;
4862 }
4863 if (l >= cache_size) {
4864 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4865 rv = 0;
4866 goto err;
4867 }
4868
4869 cache += l;
4870 cache_size -= l;
4871 total_len += l;
4872
4873 user_sum += cg_cpu_usage[curcpu].user;
4874 system_sum += cg_cpu_usage[curcpu].system;
4875 idle_sum += new_idle;
4876
4877 } else {
4878 user_sum += user;
4879 nice_sum += nice;
4880 system_sum += system;
4881 idle_sum += idle;
4882 iowait_sum += iowait;
4883 irq_sum += irq;
4884 softirq_sum += softirq;
4885 steal_sum += steal;
4886 guest_sum += guest;
4887 guest_nice_sum += guest_nice;
4888 }
4889 }
4890
4891 cache = d->buf;
4892
4893 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4894 user_sum,
4895 nice_sum,
4896 system_sum,
4897 idle_sum,
4898 iowait_sum,
4899 irq_sum,
4900 softirq_sum,
4901 steal_sum,
4902 guest_sum,
4903 guest_nice_sum);
4904 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4905 memcpy(cache, cpuall, cpuall_len);
4906 cache += cpuall_len;
4907 } else {
4908 /* shouldn't happen */
4909 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.\n", cpuall_len);
4910 cpuall_len = 0;
4911 }
4912
4913 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4914 total_len += cpuall_len;
4915
4916 out:
4917 d->cached = 1;
4918 d->size = total_len;
4919 if (total_len > size)
4920 total_len = size;
4921
4922 memcpy(buf, d->buf, total_len);
4923 rv = total_len;
4924
4925 err:
4926 if (f)
4927 fclose(f);
4928 if (cg_cpu_usage)
4929 free(cg_cpu_usage);
4930 free(line);
4931 free(cpuset);
4932 free(cg);
4933 return rv;
4934 }
4935
4936 /* This function retrieves the busy time of a group of tasks by looking at
4937 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4938 * been given its own cpuacct cgroup. If not, this function will also count
4939 * the busy time of all other tasks that do not actually belong to the
4940 * container. If someone has a clever solution for this please send a
4941 * patch!
4942 */
4943 static unsigned long get_reaper_busy(pid_t task)
4944 {
4945 pid_t initpid = lookup_initpid_in_store(task);
4946 char *cgroup = NULL, *usage_str = NULL;
4947 unsigned long usage = 0;
4948
4949 if (initpid <= 0)
4950 return 0;
4951
4952 cgroup = get_pid_cgroup(initpid, "cpuacct");
4953 if (!cgroup)
4954 goto out;
4955 prune_init_slice(cgroup);
4956 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4957 goto out;
4958 usage = strtoul(usage_str, NULL, 10);
4959 usage /= 1000000000; /* cpuacct.usage is cumulative nanoseconds; convert to seconds */
4960
4961 out:
4962 free(cgroup);
4963 free(usage_str);
4964 return usage;
4965 }
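/*
 * Example (illustrative): cpuacct.usage holds cumulative nanoseconds, so
 * a container that burned one CPU-hour reads 3600000000000, which the
 * division above turns into 3600 seconds of busy time.
 */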
4966
4967 #if RELOADTEST
4968 void iwashere(void)
4969 {
4970 int fd;
4971
4972 fd = creat("/tmp/lxcfs-iwashere", 0644);
4973 if (fd >= 0)
4974 close(fd);
4975 }
4976 #endif
4977
4978 /*
4979 * The first field is the age of the calling pid's reaper as returned by
4980 * get_reaper_age(). The second (idle) field is that age minus the busy
4981 * time from get_reaper_busy(), clamped so it can never go negative.
4982 */
4983 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4984 struct fuse_file_info *fi)
4985 {
4986 struct fuse_context *fc = fuse_get_context();
4987 struct file_info *d = (struct file_info *)fi->fh;
4988 unsigned long int busytime = get_reaper_busy(fc->pid);
4989 char *cache = d->buf;
4990 ssize_t total_len = 0;
4991 uint64_t idletime, reaperage;
4992
4993 #if RELOADTEST
4994 iwashere();
4995 #endif
4996
4997 if (offset) {
4998 if (!d->cached)
4999 return 0;
5000 if (offset > d->size)
5001 return -EINVAL;
5002 int left = d->size - offset;
5003 total_len = left > size ? size: left;
5004 memcpy(buf, cache + offset, total_len);
5005 return total_len;
5006 }
5007
5008 reaperage = get_reaper_age(fc->pid);
5009 /* To understand why busytime is subtracted here, please read the
5010 * comment above the get_reaper_busy() function.
5011 */
5012 idletime = reaperage;
5013 if (reaperage >= busytime)
5014 idletime = reaperage - busytime;
5015
5016 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5017 if (total_len < 0 || total_len >= d->buflen){
5018 lxcfs_error("%s\n", "failed to write to cache");
5019 return 0;
5020 }
5021
5022 d->size = (int)total_len;
5023 d->cached = 1;
5024
5025 if (total_len > size) total_len = size;
5026
5027 memcpy(buf, d->buf, total_len);
5028 return total_len;
5029 }
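/*
 * Example of the rendered file (hypothetical values): a reaper that is
 * 4000 seconds old with 120 seconds of cpuacct busy time yields
 *
 *   4000.00 3880.00
 *
 * i.e. both fields are whole seconds with a fixed ".00" fraction.
 */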
5030
5031 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5032 struct fuse_file_info *fi)
5033 {
5034 char dev_name[72];
5035 struct fuse_context *fc = fuse_get_context();
5036 struct file_info *d = (struct file_info *)fi->fh;
5037 char *cg;
5038 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5039 *io_wait_time_str = NULL, *io_service_time_str = NULL;
5040 unsigned long read = 0, write = 0;
5041 unsigned long read_merged = 0, write_merged = 0;
5042 unsigned long read_sectors = 0, write_sectors = 0;
5043 unsigned long read_ticks = 0, write_ticks = 0;
5044 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5045 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5046 char *cache = d->buf;
5047 size_t cache_size = d->buflen;
5048 char *line = NULL;
5049 size_t linelen = 0, total_len = 0, rv = 0;
5050 unsigned int major = 0, minor = 0;
5051 int i = 0;
5052 FILE *f = NULL;
5053
5054 if (offset) {
5055 if (offset > d->size)
5056 return -EINVAL;
5057 if (!d->cached)
5058 return 0;
5059 int left = d->size - offset;
5060 total_len = left > size ? size: left;
5061 memcpy(buf, cache + offset, total_len);
5062 return total_len;
5063 }
5064
5065 pid_t initpid = lookup_initpid_in_store(fc->pid);
5066 if (initpid <= 0)
5067 initpid = fc->pid;
5068 cg = get_pid_cgroup(initpid, "blkio");
5069 if (!cg)
5070 return read_file("/proc/diskstats", buf, size, d);
5071 prune_init_slice(cg);
5072
5073 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
5074 goto err;
5075 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
5076 goto err;
5077 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
5078 goto err;
5079 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
5080 goto err;
5081 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
5082 goto err;
5083
5084
5085 f = fopen("/proc/diskstats", "r");
5086 if (!f)
5087 goto err;
5088
5089 while (getline(&line, &linelen, f) != -1) {
5090 ssize_t l;
5091 char lbuf[256];
5092
5093 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5094 if (i != 3)
5095 continue;
5096
5097 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5098 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5099 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5100 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5101 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5102 read_sectors = read_sectors/512;
5103 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5104 write_sectors = write_sectors/512;
5105
5106 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5107 rd_svctm = rd_svctm/1000000;
5108 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5109 rd_wait = rd_wait/1000000;
5110 read_ticks = rd_svctm + rd_wait;
5111
5112 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5113 wr_svctm = wr_svctm/1000000;
5114 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5115 wr_wait = wr_wait/1000000;
5116 write_ticks = wr_svctm + wr_wait;
5117
5118 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5119 tot_ticks = tot_ticks/1000000;
5120
5121 memset(lbuf, 0, 256);
5122 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5123 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5124 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5125 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5126 else
5127 continue;
5128
5129 l = snprintf(cache, cache_size, "%s", lbuf);
5130 if (l < 0) {
5131 perror("Error writing to fuse buf");
5132 rv = 0;
5133 goto err;
5134 }
5135 if (l >= cache_size) {
5136 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5137 rv = 0;
5138 goto err;
5139 }
5140 cache += l;
5141 cache_size -= l;
5142 total_len += l;
5143 }
5144
5145 d->cached = 1;
5146 d->size = total_len;
5147 if (total_len > size) total_len = size;
5148 memcpy(buf, d->buf, total_len);
5149
5150 rv = total_len;
5151 err:
5152 free(cg);
5153 if (f)
5154 fclose(f);
5155 free(line);
5156 free(io_serviced_str);
5157 free(io_merged_str);
5158 free(io_service_bytes_str);
5159 free(io_wait_time_str);
5160 free(io_service_time_str);
5161 return rv;
5162 }
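/*
 * Mapping sketch: the emitted line mirrors the kernel's /proc/diskstats
 * field order, with the blkio counters slotted in as follows:
 *
 *   major minor name reads read_merged read_sectors read_ticks
 *   writes write_merged write_sectors write_ticks ios_pgr tot_ticks rq_ticks
 *
 * ios_pgr and rq_ticks have no blkio source here and stay 0; sectors are
 * derived from byte counts (/512) and ticks from nanoseconds (/1000000).
 */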
5163
5164 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5165 struct fuse_file_info *fi)
5166 {
5167 struct fuse_context *fc = fuse_get_context();
5168 struct file_info *d = (struct file_info *)fi->fh;
5169 char *cg = NULL;
5170 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5171 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5172 ssize_t total_len = 0, rv = 0;
5173 ssize_t l = 0;
5174 char *cache = d->buf;
5175
5176 if (offset) {
5177 if (offset > d->size)
5178 return -EINVAL;
5179 if (!d->cached)
5180 return 0;
5181 int left = d->size - offset;
5182 total_len = left > size ? size: left;
5183 memcpy(buf, cache + offset, total_len);
5184 return total_len;
5185 }
5186
5187 pid_t initpid = lookup_initpid_in_store(fc->pid);
5188 if (initpid <= 0)
5189 initpid = fc->pid;
5190 cg = get_pid_cgroup(initpid, "memory");
5191 if (!cg)
5192 return read_file("/proc/swaps", buf, size, d);
5193 prune_init_slice(cg);
5194
5195 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5196
5197 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5198 goto err;
5199
5200 memusage = strtoul(memusage_str, NULL, 10);
5201
5202 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5203 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5204
5205 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5206 memswusage = strtoul(memswusage_str, NULL, 10);
5207
5208 swap_total = (memswlimit - memlimit) / 1024;
5209 swap_free = (memswusage - memusage) / 1024;
5210 }
5211
5212 total_len = snprintf(d->buf, d->buflen, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); /* d->buflen, not d->size: once cached, d->size is only the content length */
5213
5214 /* When no mem + swap limit is specified or swapaccount=0 */
5215 if (!memswlimit) {
5216 char *line = NULL;
5217 size_t linelen = 0;
5218 FILE *f = fopen("/proc/meminfo", "r");
5219
5220 if (!f)
5221 goto err;
5222
5223 while (getline(&line, &linelen, f) != -1) {
5224 if (startswith(line, "SwapTotal:")) {
5225 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5226 } else if (startswith(line, "SwapFree:")) {
5227 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5228 }
5229 }
5230
5231 free(line);
5232 fclose(f);
5233 }
5234
5235 if (swap_total > 0) {
5236 l = snprintf(d->buf + total_len, d->buflen - total_len,
5237 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5238 swap_total, swap_free);
5239 total_len += l;
5240 }
5241
5242 if (total_len < 0 || l < 0) {
5243 perror("Error writing to cache");
5244 rv = 0;
5245 goto err;
5246 }
5247
5248 d->cached = 1;
5249 d->size = (int)total_len;
5250
5251 if (total_len > size) total_len = size;
5252 memcpy(buf, d->buf, total_len);
5253 rv = total_len;
5254
5255 err:
5256 free(cg);
5257 free(memswlimit_str);
5258 free(memlimit_str);
5259 free(memusage_str);
5260 free(memswusage_str);
5261 return rv;
5262 }
5263 /*
5264 * Recursively collect the PIDs that live under a cgroup path,
5265 * e.g. from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
5266 * @pid_buf : array the PID strings are appended to.
5267 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5268 * @depth : how many directory levels below @dpath are still descended.
5269 * @sum : the number of PIDs collected so far; the new total is returned.
5270 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5271 */
5272 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5273 {
5274 DIR *dir;
5275 int fd;
5276 struct dirent *file;
5277 FILE *f = NULL;
5278 size_t linelen = 0;
5279 char *line = NULL;
5280 int pd;
5281 char *path_dir, *path;
5282 char **pid;
5283
5284 /* path = dpath + "/cgroup.procs" + '\0' */
5285 do {
5286 path = malloc(strlen(dpath) + 20);
5287 } while (!path);
5288
5289 strcpy(path, dpath);
5290 fd = openat(cfd, path, O_RDONLY);
5291 if (fd < 0)
5292 goto out;
5293
5294 dir = fdopendir(fd);
5295 if (dir == NULL) {
5296 close(fd);
5297 goto out;
5298 }
5299
5300 while (((file = readdir(dir)) != NULL) && depth > 0) {
5301 if (strcmp(file->d_name, ".") == 0) /* strcmp(): the old strncmp(..., 1) also matched ".." and left the next check dead */
5302 continue;
5303 if (strcmp(file->d_name, "..") == 0)
5304 continue;
5305 if (file->d_type == DT_DIR) {
5306 /* path + '/' + d_name + '\0' */
5307 do {
5308 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5309 } while (!path_dir);
5310 strcpy(path_dir, path);
5311 strcat(path_dir, "/");
5312 strcat(path_dir, file->d_name);
5313 pd = depth - 1;
5314 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5315 free(path_dir);
5316 }
5317 }
5318 closedir(dir);
5319
5320 strcat(path, "/cgroup.procs");
5321 fd = openat(cfd, path, O_RDONLY);
5322 if (fd < 0)
5323 goto out;
5324
5325 f = fdopen(fd, "r");
5326 if (!f) {
5327 close(fd);
5328 goto out;
5329 }
5330
5331 while (getline(&line, &linelen, f) != -1) {
5332 do {
5333 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5334 } while (!pid);
5335 *pid_buf = pid;
5336 do {
5337 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5338 } while (*(*pid_buf + sum) == NULL);
5339 strcpy(*(*pid_buf + sum), line);
5340 sum++;
5341 }
5342 fclose(f);
5343 out:
5344 if (line)
5345 free(line);
5346 free(path);
5347 return sum;
5348 }
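/*
 * Usage sketch (hypothetical cgroup name): collect every PID up to
 * DEPTH_DIR levels below a container cgroup:
 *
 *   char **pids = malloc(sizeof(char *));
 *   int n = calc_pid(&pids, "./docker/abc123", DEPTH_DIR, 0, cfd);
 *   // pids[0 .. n-1] now hold newline-terminated PID strings
 *
 * The caller owns the array as well as each string (see refresh_load()).
 */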
5349 /*
5350 * calc_load calculates the load according to the following formula:
5351 * load1 = load0 * exp + active * (1 - exp)
5352 *
5353 * @load1: the new loadavg.
5354 * @load0: the former loadavg.
5355 * @active: the number of running PIDs at this moment.
5356 * @exp: the fixed-point decay constant defined at the top of this file.
5357 */
5358 static unsigned long
5359 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5360 {
5361 unsigned long newload;
5362
5363 active = active > 0 ? active * FIXED_1 : 0;
5364 newload = load * exp + active * (FIXED_1 - exp);
5365 if (active >= load)
5366 newload += FIXED_1 - 1;
5367
5368 return newload / FIXED_1;
5369 }
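/*
 * Worked example (illustrative): starting from load0 = 0 with active = 2
 * runnable tasks, exp = EXP_1 (1884) and FIXED_1 = 2048:
 *
 *   active  = 2 * 2048 = 4096
 *   newload = 0 * 1884 + 4096 * (2048 - 1884) = 671744
 *   newload += 2047 (rounded up because active >= load)
 *   result  = 673791 / 2048 = 329
 *
 * and LOAD_INT(329) = 0, LOAD_FRAC(329) = 16, so the 1-minute average
 * reported by proc_loadavg_read() starts climbing at "0.16".
 */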
5370
5371 /*
5372 * A return value of 0 means that the container p->cg is gone.
5373 * A return value of -1 means that an error occurred during the refresh.
5374 * A positive return value is the total number of PIDs found.
5375 */
5376 static int refresh_load(struct load_node *p, char *path)
5377 {
5378 FILE *f = NULL;
5379 char **idbuf;
5380 char proc_path[256];
5381 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5382 char *line = NULL;
5383 size_t linelen = 0;
5384 int sum, length;
5385 DIR *dp;
5386 struct dirent *file;
5387
5388 do {
5389 idbuf = malloc(sizeof(char *));
5390 } while (!idbuf);
5391 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5392 /* normal exit */
5393 if (sum == 0)
5394 goto out;
5395
5396 for (i = 0; i < sum; i++) {
5397 /* strip the trailing '\n' */
5398 length = strlen(idbuf[i])-1;
5399 idbuf[i][length] = '\0';
5400 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5401 if (ret < 0 || ret > 255) {
5402 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5403 i = sum;
5404 sum = -1;
5405 goto err_out;
5406 }
5407
5408 dp = opendir(proc_path);
5409 if (!dp) {
5410 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5411 continue;
5412 }
5413 while ((file = readdir(dp)) != NULL) {
5414 if (strcmp(file->d_name, ".") == 0)
5415 continue;
5416 if (strcmp(file->d_name, "..") == 0)
5417 continue;
5418 total_pid++;
5419 /* We make the biggest pid become last_pid. */
5420 ret = atoi(file->d_name); /* atoi(), not atof(): d_name is a pid and ret is an int */
5421 last_pid = (ret > last_pid) ? ret : last_pid;
5422
5423 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5424 if (ret < 0 || ret > 255) {
5425 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5426 i = sum;
5427 sum = -1;
5428 closedir(dp);
5429 goto err_out;
5430 }
5431 f = fopen(proc_path, "r");
5432 if (f != NULL) {
5433 while (getline(&line, &linelen, f) != -1) {
5434 /* Find State */
5435 if ((line[0] == 'S') && (line[1] == 't'))
5436 break;
5437 }
5438 if ((line[7] == 'R') || (line[7] == 'D'))
5439 run_pid++;
5440 fclose(f);
5441 }
5442 }
5443 closedir(dp);
5444 }
5445 /* Calculate the loadavg. */
5446 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5447 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5448 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5449 p->run_pid = run_pid;
5450 p->total_pid = total_pid;
5451 p->last_pid = last_pid;
5452
5453 err_out:
5454 free(line); /* also free the getline() buffer when bailing out early */
5455 for (; i > 0; i--)
5456 free(idbuf[i-1]);
5457 out:
5458 free(idbuf);
5459 return sum;
5460 }
5461 /*
5462 * Traverse the hash table and update it.
5463 */
5464 void *load_begin(void *arg)
5465 {
5466
5467 char *path = NULL;
5468 int i, sum, length, ret;
5469 struct load_node *f;
5470 int first_node;
5471 clock_t time1, time2;
5472
5473 while (1) {
5474 if (loadavg_stop == 1)
5475 return NULL;
5476
5477 time1 = clock();
5478 for (i = 0; i < LOAD_SIZE; i++) {
5479 pthread_mutex_lock(&load_hash[i].lock);
5480 if (load_hash[i].next == NULL) {
5481 pthread_mutex_unlock(&load_hash[i].lock);
5482 continue;
5483 }
5484 f = load_hash[i].next;
5485 first_node = 1;
5486 while (f) {
5487 length = strlen(f->cg) + 2;
5488 do {
5489 /* strlen(f->cg) + '.' or '' + \0 */
5490 path = malloc(length);
5491 } while (!path);
5492
5493 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5494 if (ret < 0 || ret > length - 1) {
5495 /* snprintf failed, ignore the node.*/
5496 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5497 goto out;
5498 }
5499 sum = refresh_load(f, path);
5500 if (sum == 0) {
5501 f = del_node(f, i);
5502 } else {
5503 out: f = f->next;
5504 }
5505 free(path);
5506 /* load_hash[i].lock is only held while the first node is processed. */
5507 if (first_node == 1) {
5508 first_node = 0;
5509 pthread_mutex_unlock(&load_hash[i].lock);
5510 }
5511 }
5512 }
5513
5514 if (loadavg_stop == 1)
5515 return NULL;
5516
5517 time2 = clock();
5518 if (FLUSH_TIME * 1000000 > (time2 - time1) * 1000000 / CLOCKS_PER_SEC) usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC)); /* usleep() takes an unsigned argument; skip the sleep entirely if a refresh pass overran FLUSH_TIME */
5519 }
5520 }
5521
5522 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5523 struct fuse_file_info *fi)
5524 {
5525 struct fuse_context *fc = fuse_get_context();
5526 struct file_info *d = (struct file_info *)fi->fh;
5527 pid_t initpid;
5528 char *cg;
5529 size_t total_len = 0;
5530 char *cache = d->buf;
5531 struct load_node *n;
5532 int hash;
5533 int cfd, rv = 0;
5534 unsigned long a, b, c;
5535
5536 if (offset) {
5537 if (offset > d->size)
5538 return -EINVAL;
5539 if (!d->cached)
5540 return 0;
5541 int left = d->size - offset;
5542 total_len = left > size ? size : left;
5543 memcpy(buf, cache + offset, total_len);
5544 return total_len;
5545 }
5546 if (!loadavg)
5547 return read_file("/proc/loadavg", buf, size, d);
5548
5549 initpid = lookup_initpid_in_store(fc->pid);
5550 if (initpid <= 0)
5551 initpid = fc->pid;
5552 cg = get_pid_cgroup(initpid, "cpu");
5553 if (!cg)
5554 return read_file("/proc/loadavg", buf, size, d);
5555
5556 prune_init_slice(cg);
5557 hash = calc_hash(cg) % LOAD_SIZE;
5558 n = locate_node(cg, hash);
5559
5560 /* First time */
5561 if (n == NULL) {
5562 if (!find_mounted_controller("cpu", &cfd)) {
5563 /*
5564 * locate_node() above intentionally leaves the read lock held, so
5565 * that the node cannot be deleted before we are done reading it.
5566 */
5567 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5568 rv = 0;
5569 goto err;
5570 }
5571 do {
5572 n = malloc(sizeof(struct load_node));
5573 } while (!n);
5574
5575 do {
5576 n->cg = malloc(strlen(cg)+1);
5577 } while (!n->cg);
5578 strcpy(n->cg, cg);
5579 n->avenrun[0] = 0;
5580 n->avenrun[1] = 0;
5581 n->avenrun[2] = 0;
5582 n->run_pid = 0;
5583 n->total_pid = 1;
5584 n->last_pid = initpid;
5585 n->cfd = cfd;
5586 insert_node(&n, hash);
5587 }
5588 a = n->avenrun[0] + (FIXED_1/200); /* + FIXED_1/200 rounds the printed value to two decimals, as the kernel does */
5589 b = n->avenrun[1] + (FIXED_1/200);
5590 c = n->avenrun[2] + (FIXED_1/200);
5591 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5592 LOAD_INT(a), LOAD_FRAC(a),
5593 LOAD_INT(b), LOAD_FRAC(b),
5594 LOAD_INT(c), LOAD_FRAC(c),
5595 n->run_pid, n->total_pid, n->last_pid);
5596 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5597 if (total_len < 0 || total_len >= d->buflen) {
5598 lxcfs_error("%s\n", "Failed to write to cache");
5599 rv = 0;
5600 goto err;
5601 }
5602 d->size = (int)total_len;
5603 d->cached = 1;
5604
5605 if (total_len > size)
5606 total_len = size;
5607 memcpy(buf, d->buf, total_len);
5608 rv = total_len;
5609
5610 err:
5611 free(cg);
5612 return rv;
5613 }
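/*
 * Example of the rendered file (hypothetical values), matching the host
 * format:
 *
 *   0.16 0.03 0.01 1/52 3842
 *
 * i.e. the 1/5/15-minute averages, running vs. total PIDs in the
 * container's cgroup, and the largest PID the refresh thread has seen.
 */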
5614 /* Return a positive number on success, return 0 on failure. */
5615 pthread_t load_daemon(int load_use)
5616 {
5617 int ret;
5618 pthread_t pid;
5619
5620 ret = init_load();
5621 if (ret == -1) {
5622 lxcfs_error("%s\n", "Failed to initialize the hash table in load_daemon!");
5623 return 0;
5624 }
5625 ret = pthread_create(&pid, NULL, load_begin, NULL);
5626 if (ret != 0) {
5627 lxcfs_error("%s\n", "Failed to create thread in load_daemon!");
5628 load_free();
5629 return 0;
5630 }
5631 /* Enable the loadavg calculation; callers pass load_use = 1. */
5632 loadavg = load_use;
5633 return pid;
5634 }
5635
5636 /* Returns 0 on success. */
5637 int stop_load_daemon(pthread_t pid)
5638 {
5639 int s;
5640
5641 /* Signal the thread to gracefully stop */
5642 loadavg_stop = 1;
5643
5644 s = pthread_join(pid, NULL); /* Wait for the refresh thread to exit. */
5645 if (s != 0) {
5646 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5647 return -1;
5648 }
5649
5650 load_free();
5651 loadavg_stop = 0;
5652
5653 return 0;
5654 }
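/*
 * Usage sketch: the pair is meant to bracket the lifetime of the loadavg
 * feature, e.g.:
 *
 *   pthread_t tid = load_daemon(1); // start the refresh thread, loadavg = 1
 *   ...
 *   if (tid != 0)
 *           stop_load_daemon(tid); // signal, join and free the hash table
 */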
5655
5656 static off_t get_procfile_size(const char *which)
5657 {
5658 FILE *f = fopen(which, "r");
5659 char *line = NULL;
5660 size_t len = 0;
5661 ssize_t sz, answer = 0;
5662 if (!f)
5663 return 0;
5664
5665 while ((sz = getline(&line, &len, f)) != -1)
5666 answer += sz;
5667 fclose(f);
5668 free(line);
5669
5670 return answer;
5671 }
5672
5673 int proc_getattr(const char *path, struct stat *sb)
5674 {
5675 struct timespec now;
5676
5677 memset(sb, 0, sizeof(struct stat));
5678 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5679 return -EINVAL;
5680 sb->st_uid = sb->st_gid = 0;
5681 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5682 if (strcmp(path, "/proc") == 0) {
5683 sb->st_mode = S_IFDIR | 00555;
5684 sb->st_nlink = 2;
5685 return 0;
5686 }
5687 if (strcmp(path, "/proc/meminfo") == 0 ||
5688 strcmp(path, "/proc/cpuinfo") == 0 ||
5689 strcmp(path, "/proc/uptime") == 0 ||
5690 strcmp(path, "/proc/stat") == 0 ||
5691 strcmp(path, "/proc/diskstats") == 0 ||
5692 strcmp(path, "/proc/swaps") == 0 ||
5693 strcmp(path, "/proc/loadavg") == 0) {
5694 sb->st_size = 0;
5695 sb->st_mode = S_IFREG | 00444;
5696 sb->st_nlink = 1;
5697 return 0;
5698 }
5699
5700 return -ENOENT;
5701 }
5702
5703 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5704 struct fuse_file_info *fi)
5705 {
5706 if (filler(buf, ".", NULL, 0) != 0 ||
5707 filler(buf, "..", NULL, 0) != 0 ||
5708 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5709 filler(buf, "meminfo", NULL, 0) != 0 ||
5710 filler(buf, "stat", NULL, 0) != 0 ||
5711 filler(buf, "uptime", NULL, 0) != 0 ||
5712 filler(buf, "diskstats", NULL, 0) != 0 ||
5713 filler(buf, "swaps", NULL, 0) != 0 ||
5714 filler(buf, "loadavg", NULL, 0) != 0)
5715 return -EINVAL;
5716 return 0;
5717 }
5718
5719 int proc_open(const char *path, struct fuse_file_info *fi)
5720 {
5721 int type = -1;
5722 struct file_info *info;
5723
5724 if (strcmp(path, "/proc/meminfo") == 0)
5725 type = LXC_TYPE_PROC_MEMINFO;
5726 else if (strcmp(path, "/proc/cpuinfo") == 0)
5727 type = LXC_TYPE_PROC_CPUINFO;
5728 else if (strcmp(path, "/proc/uptime") == 0)
5729 type = LXC_TYPE_PROC_UPTIME;
5730 else if (strcmp(path, "/proc/stat") == 0)
5731 type = LXC_TYPE_PROC_STAT;
5732 else if (strcmp(path, "/proc/diskstats") == 0)
5733 type = LXC_TYPE_PROC_DISKSTATS;
5734 else if (strcmp(path, "/proc/swaps") == 0)
5735 type = LXC_TYPE_PROC_SWAPS;
5736 else if (strcmp(path, "/proc/loadavg") == 0)
5737 type = LXC_TYPE_PROC_LOADAVG;
5738 if (type == -1)
5739 return -ENOENT;
5740
5741 info = malloc(sizeof(*info));
5742 if (!info)
5743 return -ENOMEM;
5744
5745 memset(info, 0, sizeof(*info));
5746 info->type = type;
5747
5748 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5749 do {
5750 info->buf = malloc(info->buflen);
5751 } while (!info->buf);
5752 memset(info->buf, 0, info->buflen);
5753 /* set actual size to buffer size */
5754 info->size = info->buflen;
5755
5756 fi->fh = (unsigned long)info;
5757 return 0;
5758 }
5759
5760 int proc_access(const char *path, int mask)
5761 {
5762 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5763 return 0;
5764
5765 /* these are all read-only */
5766 if ((mask & ~R_OK) != 0)
5767 return -EACCES;
5768 return 0;
5769 }
5770
5771 int proc_release(const char *path, struct fuse_file_info *fi)
5772 {
5773 do_release_file_info(fi);
5774 return 0;
5775 }
5776
5777 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5778 struct fuse_file_info *fi)
5779 {
5780 struct file_info *f = (struct file_info *) fi->fh;
5781
5782 switch (f->type) {
5783 case LXC_TYPE_PROC_MEMINFO:
5784 return proc_meminfo_read(buf, size, offset, fi);
5785 case LXC_TYPE_PROC_CPUINFO:
5786 return proc_cpuinfo_read(buf, size, offset, fi);
5787 case LXC_TYPE_PROC_UPTIME:
5788 return proc_uptime_read(buf, size, offset, fi);
5789 case LXC_TYPE_PROC_STAT:
5790 return proc_stat_read(buf, size, offset, fi);
5791 case LXC_TYPE_PROC_DISKSTATS:
5792 return proc_diskstats_read(buf, size, offset, fi);
5793 case LXC_TYPE_PROC_SWAPS:
5794 return proc_swaps_read(buf, size, offset, fi);
5795 case LXC_TYPE_PROC_LOADAVG:
5796 return proc_loadavg_read(buf, size, offset, fi);
5797 default:
5798 return -EINVAL;
5799 }
5800 }
5801
5802 /*
5803 * Functions needed to setup cgroups in the __constructor__.
5804 */
5805
5806 static bool mkdir_p(const char *dir, mode_t mode)
5807 {
5808 const char *tmp = dir;
5809 const char *orig = dir;
5810 char *makeme;
5811
5812 do {
5813 dir = tmp + strspn(tmp, "/");
5814 tmp = dir + strcspn(dir, "/");
5815 makeme = strndup(orig, dir - orig);
5816 if (!makeme)
5817 return false;
5818 if (mkdir(makeme, mode) && errno != EEXIST) {
5819 lxcfs_error("Failed to create directory '%s': %s.\n",
5820 makeme, strerror(errno));
5821 free(makeme);
5822 return false;
5823 }
5824 free(makeme);
5825 } while(tmp != dir);
5826
5827 return true;
5828 }
5829
5830 static bool umount_if_mounted(void)
5831 {
5832 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5833 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5834 return false;
5835 }
5836 return true;
5837 }
5838
5839 /* __typeof__ should be safe to use with all compilers. */
5840 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5841 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5842 {
5843 return (fs->f_type == (fs_type_magic)magic_val);
5844 }
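/*
 * Usage sketch (mirrors permute_and_enter() below):
 *
 *   struct statfs sb;
 *   if (statfs("/", &sb) == 0 && has_fs_type(&sb, RAMFS_MAGIC))
 *           ... // on a ramfs root: chroot() instead of pivot_root()
 *
 * The cast through fs_type_magic matters because the width of f_type
 * differs between architectures.
 */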
5845
5846 /*
5847 * looking at fs/proc_namespace.c, it appears we can
5848 * actually expect the rootfs entry to very specifically contain
5849 * " - rootfs rootfs "
5850 * IIUC, so long as we've chrooted so that rootfs is not our root,
5851 * the rootfs entry should always be skipped in mountinfo contents.
5852 */
5853 static bool is_on_ramfs(void)
5854 {
5855 FILE *f;
5856 char *p, *p2;
5857 char *line = NULL;
5858 size_t len = 0;
5859 int i;
5860
5861 f = fopen("/proc/self/mountinfo", "r");
5862 if (!f)
5863 return false;
5864
5865 while (getline(&line, &len, f) != -1) {
5866 for (p = line, i = 0; p && i < 4; i++)
5867 p = strchr(p + 1, ' ');
5868 if (!p)
5869 continue;
5870 p2 = strchr(p + 1, ' ');
5871 if (!p2)
5872 continue;
5873 *p2 = '\0';
5874 if (strcmp(p + 1, "/") == 0) {
5875 // this is '/'. is it the ramfs?
5876 p = strchr(p2 + 1, '-');
5877 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5878 free(line);
5879 fclose(f);
5880 return true;
5881 }
5882 }
5883 }
5884 free(line);
5885 fclose(f);
5886 return false;
5887 }
5888
5889 static int pivot_enter()
5890 {
5891 int ret = -1, oldroot = -1, newroot = -1;
5892
5893 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5894 if (oldroot < 0) {
5895 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5896 return ret;
5897 }
5898
5899 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5900 if (newroot < 0) {
5901 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5902 goto err;
5903 }
5904
5905 /* change into new root fs */
5906 if (fchdir(newroot) < 0) {
5907 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5908 goto err;
5909 }
5910
5911 /* pivot_root into our new root fs */
5912 if (pivot_root(".", ".") < 0) {
5913 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5914 goto err;
5915 }
5916
5917 /*
5918 * At this point the old-root is mounted on top of our new-root.
5919 * To unmount it we must not be chdir'd into it, so escape back
5920 * to the old-root.
5921 */
5922 if (fchdir(oldroot) < 0) {
5923 lxcfs_error("%s\n", "Failed to enter old root.");
5924 goto err;
5925 }
5926
5927 if (umount2(".", MNT_DETACH) < 0) {
5928 lxcfs_error("%s\n", "Failed to detach old root.");
5929 goto err;
5930 }
5931
5932 if (fchdir(newroot) < 0) {
5933 lxcfs_error("%s\n", "Failed to re-enter new root.");
5934 goto err;
5935 }
5936
5937 ret = 0;
5938
5939 err:
5940 if (oldroot >= 0)
5941 close(oldroot);
5942 if (newroot >= 0)
5943 close(newroot);
5944
5945 return ret;
5946 }
5947
5948 static int chroot_enter()
5949 {
5950 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5951 lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5952 return -1;
5953 }
5954
5955 if (chroot(".") < 0) {
5956 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5957 return -1;
5958 }
5959
5960 if (chdir("/") < 0) {
5961 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5962 return -1;
5963 }
5964
5965 return 0;
5966 }
5967
5968 static int permute_and_enter(void)
5969 {
5970 struct statfs sb;
5971
5972 if (statfs("/", &sb) < 0) {
5973 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5974 return -1;
5975 }
5976
5977 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5978 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5979 * /proc/1/mountinfo. */
5980 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5981 return chroot_enter();
5982
5983 if (pivot_enter() < 0) {
5984 lxcfs_error("%s\n", "Could not perform pivot root.");
5985 return -1;
5986 }
5987
5988 return 0;
5989 }
5990
5991 /* Prepare our new clean root. */
5992 static int permute_prepare(void)
5993 {
5994 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5995 lxcfs_error("%s\n", "Failed to create directory for new root.");
5996 return -1;
5997 }
5998
5999 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6000 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6001 return -1;
6002 }
6003
6004 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6005 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6006 return -1;
6007 }
6008
6009 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6010 lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6011 return -1;
6012 }
6013
6014 return 0;
6015 }
6016
6017 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6018 static bool permute_root(void)
6019 {
6020 /* Prepare new root. */
6021 if (permute_prepare() < 0)
6022 return false;
6023
6024 /* Pivot into new root. */
6025 if (permute_and_enter() < 0)
6026 return false;
6027
6028 return true;
6029 }
6030
6031 static int preserve_mnt_ns(int pid)
6032 {
6033 int ret;
6034 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6035 char path[len];
6036
6037 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6038 if (ret < 0 || (size_t)ret >= len)
6039 return -1;
6040
6041 return open(path, O_RDONLY | O_CLOEXEC);
6042 }
6043
6044 static bool cgfs_prepare_mounts(void)
6045 {
6046 if (!mkdir_p(BASEDIR, 0700)) {
6047 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6048 return false;
6049 }
6050
6051 if (!umount_if_mounted()) {
6052 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6053 return false;
6054 }
6055
6056 if (unshare(CLONE_NEWNS) < 0) {
6057 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6058 return false;
6059 }
6060
6061 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6062 if (cgroup_mount_ns_fd < 0) {
6063 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6064 return false;
6065 }
6066
6067 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6068 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6069 return false;
6070 }
6071
6072 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6073 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6074 return false;
6075 }
6076
6077 return true;
6078 }
6079
6080 static bool cgfs_mount_hierarchies(void)
6081 {
6082 char *target;
6083 size_t clen, len;
6084 int i, ret;
6085
6086 for (i = 0; i < num_hierarchies; i++) {
6087 char *controller = hierarchies[i];
6088
6089 clen = strlen(controller);
6090 len = strlen(BASEDIR) + clen + 2;
6091 target = malloc(len);
6092 if (!target)
6093 return false;
6094
6095 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6096 if (ret < 0 || ret >= len) {
6097 free(target);
6098 return false;
6099 }
6100 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6101 free(target);
6102 return false;
6103 }
6104 if (!strcmp(controller, "unified"))
6105 ret = mount("none", target, "cgroup2", 0, NULL);
6106 else
6107 ret = mount(controller, target, "cgroup", 0, controller);
6108 if (ret < 0) {
6109 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6110 free(target);
6111 return false;
6112 }
6113
6114 fd_hierarchies[i] = open(target, O_DIRECTORY);
6115 if (fd_hierarchies[i] < 0) {
6116 free(target);
6117 return false;
6118 }
6119 free(target);
6120 }
6121 return true;
6122 }
6123
6124 static bool cgfs_setup_controllers(void)
6125 {
6126 if (!cgfs_prepare_mounts())
6127 return false;
6128
6129 if (!cgfs_mount_hierarchies()) {
6130 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6131 return false;
6132 }
6133
6134 if (!permute_root())
6135 return false;
6136
6137 return true;
6138 }
6139
6140 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6141 {
6142 FILE *f;
6143 char *cret, *line = NULL;
6144 char cwd[MAXPATHLEN];
6145 size_t len = 0;
6146 int i, init_ns = -1;
6147 bool found_unified = false;
6148
6149 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
6150 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
6151 return;
6152 }
6153
6154 while (getline(&line, &len, f) != -1) {
6155 char *idx, *p, *p2;
6156
6157 p = strchr(line, ':');
6158 if (!p)
6159 goto out;
6160 idx = line;
6161 *(p++) = '\0';
6162
6163 p2 = strrchr(p, ':');
6164 if (!p2)
6165 goto out;
6166 *p2 = '\0';
6167
6168 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6169 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6170 * because it parses out the empty string "" and later on passes
6171 * it to mount(). Let's skip such entries.
6172 */
6173 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6174 found_unified = true;
6175 p = "unified";
6176 }
6177
6178 if (!store_hierarchy(line, p))
6179 goto out;
6180 }
6181
6182 /* Preserve initial namespace. */
6183 init_ns = preserve_mnt_ns(getpid());
6184 if (init_ns < 0) {
6185 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6186 goto out;
6187 }
6188
6189 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
6190 if (!fd_hierarchies) {
6191 lxcfs_error("%s\n", strerror(errno));
6192 goto out;
6193 }
6194
6195 for (i = 0; i < num_hierarchies; i++)
6196 fd_hierarchies[i] = -1;
6197
6198 cret = getcwd(cwd, MAXPATHLEN);
6199 if (!cret)
6200 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6201
6202 /* This function unshares (CLONE_NEWNS) from our initial mount namespace
6203 * so that lxcfs can mount its cgroup hierarchies privately. */
6204 if (!cgfs_setup_controllers()) {
6205 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6206 goto out;
6207 }
6208
6209 if (setns(init_ns, 0) < 0) {
6210 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6211 goto out;
6212 }
6213
6214 if (!cret || chdir(cwd) < 0)
6215 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6216
6217 if (!init_cpuview()) {
6218 lxcfs_error("%s\n", "failed to init CPU view");
6219 goto out;
6220 }
6221
6222 print_subsystems();
6223
6224 out:
6225 free(line);
6226 fclose(f);
6227 if (init_ns >= 0)
6228 close(init_ns);
6229 }
6230
6231 static void __attribute__((destructor)) free_subsystems(void)
6232 {
6233 int i;
6234
6235 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6236
6237 for (i = 0; i < num_hierarchies; i++) {
6238 if (hierarchies[i])
6239 free(hierarchies[i]);
6240 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6241 close(fd_hierarchies[i]);
6242 }
6243 free(hierarchies);
6244 free(fd_hierarchies);
6245 free_cpuview();
6246
6247 if (cgroup_mount_ns_fd >= 0)
6248 close(cgroup_mount_ns_fd);
6249 }