]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
support /sys/devices/system/cpu/online
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
#define FUSE_USE_VERSION 26

#define __STDC_FORMAT_MACROS
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <inttypes.h>
#include <libgen.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/vfs.h>

#include "bindings.h"
#include "config.h" // for VERSION
41
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/* Thin wrapper around the raw syscall: make @new_root the root mount and
 * move the old root to @put_old. Returns -1 with errno = ENOSYS on
 * kernels/arches that do not define __NR_pivot_root. */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
56
/* One CPU's usage counters for the cpuview cache.
 * NOTE(review): units depend on where these are filled in (elsewhere in
 * the file) — presumably cpuacct/proc-stat time units; confirm there. */
struct cpuacct_usage {
	uint64_t user;   /* time spent in user mode */
	uint64_t system; /* time spent in kernel mode */
	uint64_t idle;   /* time spent idle */
	bool online;     /* is this CPU currently online? */
};
63
/* Parameters of the loadavg hash table. */
65 #define LOAD_SIZE 100 /*the size of hash_table */
66 #define FLUSH_TIME 5 /*the flush rate */
67 #define DEPTH_DIR 3 /*the depth of per cgroup */
/* Constants for the fixed-point load-average calculation. */
69 #define FSHIFT 11 /* nr of bits of precision */
70 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
71 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
72 #define EXP_5 2014 /* 1/exp(5sec/5min) */
73 #define EXP_15 2037 /* 1/exp(5sec/15min) */
74 #define LOAD_INT(x) ((x) >> FSHIFT)
75 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
76 /*
77 * This parameter is used for proc_loadavg_read().
78 * 1 means use loadavg, 0 means not use.
79 */
80 static int loadavg = 0;
81 static volatile sig_atomic_t loadavg_stop = 0;
/* ELF hash of @name, masked to a non-negative int.
 * Bytes are read as unsigned char so the result does not depend on the
 * platform's char signedness (plain char may be signed, which would feed
 * negative values into the hash for bytes > 0x7f). */
static int calc_hash(const char *name)
{
	unsigned int hash = 0;
	unsigned int x = 0;

	/* ELFHash algorithm. */
	while (*name) {
		hash = (hash << 4) + (unsigned char)*name++;
		x = hash & 0xf0000000;
		if (x != 0)
			hash ^= (x >> 24);
		hash &= ~x;
	}
	return (hash & 0x7fffffff);
}
96
/* Per-cgroup load-average state; nodes live in the load_hash chains. */
struct load_node {
	char *cg; /* cgroup path — the hash key */
	unsigned long avenrun[3]; /* Load averages (fixed-point, cf. FSHIFT/FIXED_1) */
	unsigned int run_pid;   /* NOTE(review): maintained elsewhere; looks like running-task count — confirm */
	unsigned int total_pid; /* NOTE(review): looks like total task count — confirm */
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next; /* next node in this hash chain */
	struct load_node **pre; /* address of the pointer that points at this
				 * node; allows O(1) unlink (see del_node()) */
};
107
/* One bucket of the loadavg hash table, with its three-lock scheme. */
struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node. To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node. To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic (concurrent) read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node. To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic (concurrent) read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next; /* first node of this bucket's chain */
};
129
130 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
131 /*
132 * init_load initialize the hash table.
133 * Return 0 on success, return -1 on failure.
134 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
	/* Unwind the partially-initialized bucket @i: each label destroys
	 * exactly the members that were set up before the failing init. */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
out3:
	/* Then fully tear down all earlier, completely-initialized buckets. */
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
172
173 static void insert_node(struct load_node **n, int locate)
174 {
175 struct load_node *f;
176
177 pthread_mutex_lock(&load_hash[locate].lock);
178 pthread_rwlock_wrlock(&load_hash[locate].rilock);
179 f = load_hash[locate].next;
180 load_hash[locate].next = *n;
181
182 (*n)->pre = &(load_hash[locate].next);
183 if (f)
184 f->pre = &((*n)->next);
185 (*n)->next = f;
186 pthread_mutex_unlock(&load_hash[locate].lock);
187 pthread_rwlock_unlock(&load_hash[locate].rilock);
188 }
189 /*
190 * locate_node() finds special node. Not return NULL means success.
191 * It should be noted that rdlock isn't unlocked at the end of code
192 * because this function is used to read special node. Delete is not
193 * allowed before read has ended.
194 * unlock rdlock only in proc_loadavg_read().
195 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	/* rilock guards the bucket head against concurrent insert; rdlock is
	 * deliberately NOT released here — per the comment above, the caller
	 * (proc_loadavg_read()) unlocks it once it is done with the node. */
	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	/* Walk the chain for an exact cgroup-path match. */
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
213 /* Delete the load_node n and return the next node of it. */
214 static struct load_node *del_node(struct load_node *n, int locate)
215 {
216 struct load_node *g;
217
218 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
219 if (n->next == NULL) {
220 *(n->pre) = NULL;
221 } else {
222 *(n->pre) = n->next;
223 n->next->pre = n->pre;
224 }
225 g = n->next;
226 free(n->cg);
227 free(n);
228 pthread_rwlock_unlock(&load_hash[locate].rdlock);
229 return g;
230 }
231
/* Tear down the whole loadavg hash table: free every node and destroy
 * each bucket's locks. All three locks are acquired first so no reader,
 * inserter or refresher is mid-flight when a bucket is destroyed. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		/* Free the chain; list invariants need not be maintained
		 * since the whole bucket is going away. */
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
264
/* Data for CPU view */
/* Cached per-cgroup CPU statistics, chained in proc_stat_history buckets. */
struct cg_proc_stat {
	char *cg; /* cgroup path this entry caches stats for */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* presumably the element count of @usage/@view — confirm at fill site */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next; /* next node in the hash bucket */
};
274
/* Head of one proc_stat_history hash bucket. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next; /* first node of this bucket's chain */
	time_t lastcheck; /* set at init; NOTE(review): presumably refreshed by pruning code elsewhere — confirm */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};
284
285 #define CPUVIEW_HASH_SIZE 100
286 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
287
288 static bool cpuview_init_head(struct cg_proc_stat_head **head)
289 {
290 *head = malloc(sizeof(struct cg_proc_stat_head));
291 if (!(*head)) {
292 lxcfs_error("%s\n", strerror(errno));
293 return false;
294 }
295
296 (*head)->lastcheck = time(NULL);
297 (*head)->next = NULL;
298
299 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
300 lxcfs_error("%s\n", "Failed to initialize list lock");
301 free(*head);
302 return false;
303 }
304
305 return true;
306 }
307
308 static bool init_cpuview()
309 {
310 int i;
311
312 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
313 proc_stat_history[i] = NULL;
314
315 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
316 if (!cpuview_init_head(&proc_stat_history[i]))
317 goto err;
318 }
319
320 return true;
321
322 err:
323 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
324 if (proc_stat_history[i]) {
325 free(proc_stat_history[i]);
326 proc_stat_history[i] = NULL;
327 }
328 }
329
330 return false;
331 }
332
333 static void free_proc_stat_node(struct cg_proc_stat *node)
334 {
335 pthread_mutex_destroy(&node->lock);
336 free(node->cg);
337 free(node->usage);
338 free(node->view);
339 free(node);
340 }
341
342 static void cpuview_free_head(struct cg_proc_stat_head *head)
343 {
344 struct cg_proc_stat *node, *tmp;
345
346 if (head->next) {
347 node = head->next;
348
349 for (;;) {
350 tmp = node;
351 node = node->next;
352 free_proc_stat_node(tmp);
353
354 if (!node)
355 break;
356 }
357 }
358
359 pthread_rwlock_destroy(&head->lock);
360 free(head);
361 }
362
363 static void free_cpuview()
364 {
365 int i;
366
367 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
368 if (proc_stat_history[i])
369 cpuview_free_head(proc_stat_history[i]);
370 }
371 }
372
373 /*
374 * A table caching which pid is init for a pid namespace.
375 * When looking up which pid is init for $qpid, we first
376 * 1. Stat /proc/$qpid/ns/pid.
377 * 2. Check whether the ino_t is in our store.
378 * a. if not, fork a child in qpid's ns to send us
379 * ucred.pid = 1, and read the initpid. Cache
380 * initpid and creation time for /proc/initpid
381 * in a new store entry.
382 * b. if so, verify that /proc/initpid still matches
383 * what we have saved. If not, clear the store
384 * entry and go back to a. If so, return the
385 * cached initpid.
386 */
struct pidns_init_store {
	ino_t ino;          // inode number for /proc/$pid/ns/pid
	pid_t initpid;      // the pid of init in that ns
	long int ctime;     // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // last time this entry was validated (used for pruning)
};
394
395 /* lol - look at how they are allocated in the kernel */
396 #define PIDNS_HASH_SIZE 4096
397 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
398
399 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
400 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Lock @l; a failure here indicates corrupted state, so abort. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
410
411 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
412 * Number of hierarchies mounted. */
413 static int num_hierarchies;
414
415 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
416 * Hierachies mounted {cpuset, blkio, ...}:
417 * Initialized via __constructor__ collect_and_mount_subsystems(). */
418 static char **hierarchies;
419
420 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
421 * Open file descriptors:
422 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
423 * private mount namespace.
424 * Initialized via __constructor__ collect_and_mount_subsystems().
425 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
426 * mounts and respective files in the private namespace even when located in
427 * another namespace using the *at() family of functions
428 * {openat(), fchownat(), ...}. */
429 static int *fd_hierarchies;
430 static int cgroup_mount_ns_fd = -1;
431
/* Unlock @l; a failure here indicates corrupted state, so abort. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
441
/* Acquire the global pidns_init_store lock (aborts on failure). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
446
/* Release the global pidns_init_store lock (aborts on failure). */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
451
/* Must be called under store_lock.
 * Return true if cache entry @e still describes a live init process:
 * compares the recorded ctime of /proc/$initpid against the current one,
 * so a recycled pid (new process => new ctime) invalidates the entry.
 * @nsfdsb is unused here. */
static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
{
	struct stat initsb;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d", e->initpid);
	if (stat(fnam, &initsb) < 0)
		return false;

	lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
		initsb.st_ctime, e->initpid);

	if (e->ctime != initsb.st_ctime)
		return false;
	return true;
}
469
470 /* Must be called under store_lock */
471 static void remove_initpid(struct pidns_init_store *e)
472 {
473 struct pidns_init_store *tmp;
474 int h;
475
476 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
477
478 h = HASH(e->ino);
479 if (pidns_hash_table[h] == e) {
480 pidns_hash_table[h] = e->next;
481 free(e);
482 return;
483 }
484
485 tmp = pidns_hash_table[h];
486 while (tmp) {
487 if (tmp->next == e) {
488 tmp->next = e->next;
489 free(e);
490 return;
491 }
492 tmp = tmp->next;
493 }
494 }
495
#define PURGE_SECS 5
/* Must be called under store_lock.
 * Drop cached entries not validated within the last 2*PURGE_SECS
 * seconds. The scan runs at most once every PURGE_SECS seconds; the
 * very first call only arms the timer. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
538
/* Must be called under store_lock.
 * Cache @pid as the init pid of the pid namespace whose ns inode is in
 * @sb. Records /proc/$pid's ctime so later lookups can detect pid reuse
 * (see initpid_still_valid()). Silently does nothing if /proc/$pid
 * cannot be stat'ed (process already gone). */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	/* Retry until the allocation succeeds (file-wide convention). */
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}
563
564 /*
565 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
566 * entry for the inode number and creation time. Verify that the init pid
567 * is still valid. If not, remove it. Return the entry if valid, NULL
568 * otherwise.
569 * Must be called under store_lock
570 */
571 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
572 {
573 int h = HASH(sb->st_ino);
574 struct pidns_init_store *e = pidns_hash_table[h];
575
576 while (e) {
577 if (e->ino == sb->st_ino) {
578 if (initpid_still_valid(e, sb)) {
579 e->lastcheck = time(NULL);
580 return e;
581 }
582 remove_initpid(e);
583 return NULL;
584 }
585 e = e->next;
586 }
587
588 return NULL;
589 }
590
/* Return 1 if @path, resolved relative to directory descriptor @fd (as
 * in the *at() family), is a directory; 0 otherwise. */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	/* Bug fix: the flags argument used to be @fd, which is not a valid
	 * fstatat() flag value and made the call fail with EINVAL. */
	int ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
599
/* Duplicate @str, retrying until the allocation succeeds; callers rely
 * on a non-NULL result for non-NULL input. NULL maps to NULL. */
static char *must_copy_string(const char *str)
{
	char *dup;

	if (!str)
		return NULL;

	do {
		dup = strdup(str);
	} while (!dup);

	return dup;
}
611
/* Strip every trailing '\n' from @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && s[len - 1] == '\n')
		s[--len] = '\0';
}
619
#define BATCH_SIZE 50
/* Grow *@mem (a BATCH_SIZE-granular buffer) so it can hold @newlen
 * bytes, retrying realloc until it succeeds. No-op when the current
 * batch count already suffices. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	size_t new_batches = (newlen / BATCH_SIZE) + 1;
	size_t old_batches = (oldlen / BATCH_SIZE) + 1;

	if (*mem && new_batches <= old_batches)
		return;

	for (;;) {
		char *tmp = realloc(*mem, new_batches * BATCH_SIZE);

		if (tmp) {
			*mem = tmp;
			return;
		}
	}
}
/* Append @linelen bytes of @line, plus its terminating NUL, to the
 * growable buffer *@contents, updating *@len. Growth is delegated to
 * dorealloc(), which cannot fail (it retries). */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t newlen = *len + linelen;
	dorealloc(contents, *len, newlen + 1);
	memcpy(*contents + *len, line, linelen+1);
	*len = newlen;
}
641
/* Read the whole of @fd into a newly allocated string with trailing
 * newlines stripped; returns NULL on failure. Consumes @fd in all cases:
 * on success fclose() closes it, and on fdopen() failure it is now
 * closed explicitly (previously it leaked, and callers such as
 * cgfs_get_value() never close it themselves). @from is the file's name;
 * it is unused here. */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
663
/* Write @string to @fd, reporting errors against the name @fnam.
 * Consumes @fd in all cases: fclose() closes it on the normal paths, and
 * on fdopen() failure it is now closed explicitly (previously leaked).
 * Returns true only if every byte was written and the close succeeded. */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a short write can surface here. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
689
/* Ownership and mode of one cgroup file, as produced by cgfs_get_key(). */
struct cgfs_files {
	char *name;        /* file (or cgroup) base name; heap-allocated */
	uint32_t uid, gid; /* owner, from stat() */
	uint32_t mode;     /* file mode bits, from stat() */
};
695
#define ALLOC_NUM 20
/* Append hierarchy name @h (copied) to the global hierarchies array,
 * growing it in ALLOC_NUM-sized batches. @stridx is unused here.
 * Always returns true; allocation failure aborts the process. */
static bool store_hierarchy(char *stridx, char *h)
{
	if (num_hierarchies % ALLOC_NUM == 0) {
		size_t n = (num_hierarchies / ALLOC_NUM) + 1;
		n *= ALLOC_NUM;
		char **tmp = realloc(hierarchies, n * sizeof(char *));
		if (!tmp) {
			lxcfs_error("%s\n", strerror(errno));
			exit(1);
		}
		hierarchies = tmp;
	}

	hierarchies[num_hierarchies++] = must_copy_string(h);
	return true;
}
713
714 static void print_subsystems(void)
715 {
716 int i;
717
718 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
719 fprintf(stderr, "hierarchies:\n");
720 for (i = 0; i < num_hierarchies; i++) {
721 if (hierarchies[i])
722 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
723 fd_hierarchies[i], hierarchies[i]);
724 }
725 }
726
/* Return true if @needle appears as a whole element of the
 * comma-separated list @haystack (e.g. "cpu" in "cpu,cpuacct"). */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	size_t needle_len = strlen(needle);

	for (;;) {
		const char *comma = strchr(cur, ',');

		if (!comma)
			break;
		if ((size_t)(comma - cur) == needle_len &&
		    strncmp(needle, cur, needle_len) == 0)
			return true;
		cur = comma + 1;
	}

	/* The final (or only) element has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
745
746 /* do we need to do any massaging here? I'm not sure... */
747 /* Return the mounted controller and store the corresponding open file descriptor
748 * referring to the controller mountpoint in the private lxcfs namespace in
749 * @cfd.
750 */
751 static char *find_mounted_controller(const char *controller, int *cfd)
752 {
753 int i;
754
755 for (i = 0; i < num_hierarchies; i++) {
756 if (!hierarchies[i])
757 continue;
758 if (strcmp(hierarchies[i], controller) == 0) {
759 *cfd = fd_hierarchies[i];
760 return hierarchies[i];
761 }
762 if (in_comma_list(controller, hierarchies[i])) {
763 *cfd = fd_hierarchies[i];
764 return hierarchies[i];
765 }
766 }
767
768 return NULL;
769 }
770
/* Write @value into @file of @cgroup under @controller. Returns true on
 * success. */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int cfd, fd, ret;
	size_t path_len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Build a relative path for the *at() family:
	 * "." + cgroup + "/" + file + NUL. */
	path_len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(path_len);
	ret = snprintf(path, path_len, "%s%s/%s",
		       *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= path_len)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	/* write_string() closes the stream wrapped around @fd. */
	return write_string(path, value, fd);
}
797
// Chown all the files in the cgroup directory. We do this when we create
// a cgroup on behalf of a user.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() failed, so it did not take ownership of @fd1;
		 * close it instead of leaking it (previous behavior). */
		close(fd1);
		return;
	}
	/* From here @fd1 belongs to @d and is closed by closedir(). */

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
	}
	closedir(d);
}
837
/* Create cgroup @cg under @controller, owned by @uid:@gid. Returns 0 on
 * success or a negative errno value. */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return -EINVAL;

	/* Relative path for the *at() family: "." + cg + NUL. */
	len = strlen(cg) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, path, 0755) < 0)
		return -errno;

	/* root:root requires no ownership fixup. */
	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(path, uid, gid, cfd);

	return 0;
}
868
/* Recursively delete cgroup directory @dirname (relative to @cfd).
 * @fd is dup()ed because fdopendir()/closedir() take ownership of the
 * descriptor they are given.
 * NOTE(review): the recursion passes the caller's @fd down unchanged for
 * subdirectories — verify nested directories are enumerated as intended.
 * Returns true only if everything was removed. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* Only directories need recursion; cgroup files vanish with
		 * their directory. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
927
/* Recursively remove cgroup @cg under @controller. */
bool cgfs_remove(const char *controller, const char *cg)
{
	int fd, cfd;
	size_t len;
	char *path, *mounted;
	bool result;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path for the *at() family: "." + cg + NUL. */
	len = strlen(cg) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *cg == '/' ? "." : "", cg);

	fd = openat(cfd, path, O_DIRECTORY);
	if (fd < 0)
		return false;

	result = recursive_rmdir(path, fd, cfd);
	close(fd);
	return result;
}
954
/* chmod @file under @controller to @mode. */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path for the *at() family: "." + file + NUL. */
	len = strlen(file) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, path, mode, 0) == 0;
}
975
/* chown the "tasks" and "cgroup.procs" files inside @dirname (relative
 * to @fd) to @uid:@gid. Returns 0 on success, -errno on failure. */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	static const char *suffixes[] = { "/tasks", "/cgroup.procs" };
	size_t i, len;
	char *fname;

	/* "/cgroup.procs" is the longer suffix, so the buffer fits both. */
	len = strlen(dirname) + strlen("/cgroup.procs") + 1;
	fname = alloca(len);

	for (i = 0; i < sizeof(suffixes) / sizeof(suffixes[0]); i++) {
		snprintf(fname, len, "%s%s", dirname, suffixes[i]);
		if (fchownat(fd, fname, uid, gid, 0) != 0)
			return -errno;
	}

	return 0;
}
991
/* chown @file under @controller to @uid:@gid; for directories the
 * "tasks" and "cgroup.procs" files inside are chowned as well. Returns
 * 0 on success, -errno on failure. */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return -EINVAL;

	/* Relative path for the *at() family: "." + file + NUL. */
	len = strlen(file) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	// like cgmanager did, we want to chown the tasks file as well
	if (is_dir(path, cfd))
		return chown_tasks_files(path, uid, gid, cfd);

	return 0;
}
1017
/* Open @cgroup's cgroup.procs file under @controller for writing.
 * Returns a FILE* (caller fclose()s it) or NULL on error. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return NULL;

	/* Relative path: "." + cgroup + "/cgroup.procs" + NUL. */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	path = alloca(len);
	snprintf(path, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
1041
1042 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1043 void ***list, size_t typesize,
1044 void* (*iterator)(const char*, const char*, const char*))
1045 {
1046 int cfd, fd, ret;
1047 size_t len;
1048 char *cg, *tmpc;
1049 char pathname[MAXPATHLEN];
1050 size_t sz = 0, asz = 0;
1051 struct dirent *dirent;
1052 DIR *dir;
1053
1054 tmpc = find_mounted_controller(controller, &cfd);
1055 *list = NULL;
1056 if (!tmpc)
1057 return false;
1058
1059 /* Make sure we pass a relative path to *at() family of functions. */
1060 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1061 cg = alloca(len);
1062 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1063 if (ret < 0 || (size_t)ret >= len) {
1064 lxcfs_error("Pathname too long under %s\n", cgroup);
1065 return false;
1066 }
1067
1068 fd = openat(cfd, cg, O_DIRECTORY);
1069 if (fd < 0)
1070 return false;
1071
1072 dir = fdopendir(fd);
1073 if (!dir)
1074 return false;
1075
1076 while ((dirent = readdir(dir))) {
1077 struct stat mystat;
1078
1079 if (!strcmp(dirent->d_name, ".") ||
1080 !strcmp(dirent->d_name, ".."))
1081 continue;
1082
1083 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1084 if (ret < 0 || ret >= MAXPATHLEN) {
1085 lxcfs_error("Pathname too long under %s\n", cg);
1086 continue;
1087 }
1088
1089 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1090 if (ret) {
1091 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1092 continue;
1093 }
1094 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1095 (directories && !S_ISDIR(mystat.st_mode)))
1096 continue;
1097
1098 if (sz+2 >= asz) {
1099 void **tmp;
1100 asz += BATCH_SIZE;
1101 do {
1102 tmp = realloc(*list, asz * typesize);
1103 } while (!tmp);
1104 *list = tmp;
1105 }
1106 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1107 (*list)[sz+1] = NULL;
1108 sz++;
1109 }
1110 if (closedir(dir) < 0) {
1111 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1112 return false;
1113 }
1114 return true;
1115 }
1116
/* Iterator for cgfs_list_children(): the list entry is simply a copy of
 * the directory name. @controller and @cgroup are unused. The strdup()
 * is retried until it succeeds (file-wide convention). */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	do {
		copy = strdup(dir_entry);
	} while (!copy);

	return copy;
}
1125
/* Fill *@list with the names of all child cgroups of @cgroup under
 * @controller, as a NULL-terminated array. Caller frees the entries and
 * the array. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1130
1131 void free_key(struct cgfs_files *k)
1132 {
1133 if (!k)
1134 return;
1135 free(k->name);
1136 free(k);
1137 }
1138
/* Free a NULL-terminated array of cgfs_files entries and the array
 * itself; NULL is tolerated. */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **k;

	if (!keys)
		return;

	for (k = keys; *k; k++)
		free_key(*k);
	free(keys);
}
1150
/* Read the whole of @file in @cgroup under @controller into a newly
 * allocated string stored in *@value (caller frees). Returns true on
 * success. */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t path_len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path: "." + cgroup + "/" + file + NUL. */
	path_len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(path_len);
	ret = snprintf(path, path_len, "%s%s/%s",
		       *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= path_len)
		return false;

	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		return false;

	/* slurp_file() wraps @fd in a stream and closes it. */
	*value = slurp_file(path, fd);
	return *value != NULL;
}
1177
/* Return true if @file exists in @cgroup under @controller. */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t path_len;
	char *path, *mounted;

	mounted = find_mounted_controller(controller, &cfd);
	if (!mounted)
		return false;

	/* Relative path: "." + cgroup + "/" + file + NUL. */
	path_len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(path_len);
	ret = snprintf(path, path_len, "%s%s/%s",
		       *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= path_len)
		return false;

	return faccessat(cfd, path, F_OK, 0) == 0;
}
1199
1200 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1201 {
1202 int ret, cfd;
1203 size_t len;
1204 char *fnam, *tmpc;
1205 struct stat sb;
1206 struct cgfs_files *newkey;
1207
1208 tmpc = find_mounted_controller(controller, &cfd);
1209 if (!tmpc)
1210 return false;
1211
1212 if (file && *file == '/')
1213 file++;
1214
1215 if (file && strchr(file, '/'))
1216 return NULL;
1217
1218 /* Make sure we pass a relative path to *at() family of functions.
1219 * . + /cgroup + / + file + \0
1220 */
1221 len = strlen(cgroup) + 3;
1222 if (file)
1223 len += strlen(file) + 1;
1224 fnam = alloca(len);
1225 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1226 file ? "/" : "", file ? file : "");
1227
1228 ret = fstatat(cfd, fnam, &sb, 0);
1229 if (ret < 0)
1230 return NULL;
1231
1232 do {
1233 newkey = malloc(sizeof(struct cgfs_files));
1234 } while (!newkey);
1235 if (file)
1236 newkey->name = must_copy_string(file);
1237 else if (strrchr(cgroup, '/'))
1238 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1239 else
1240 newkey->name = must_copy_string(cgroup);
1241 newkey->uid = sb.st_uid;
1242 newkey->gid = sb.st_gid;
1243 newkey->mode = sb.st_mode;
1244
1245 return newkey;
1246 }
1247
/* cgfs_iterate_cgroup() callback: build one key entry for @dir_entry. */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (!key)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			cgroup);

	return key;
}
1257
/*
 * List every key (file) of @cgroup under @controller into *keys, a
 * NULL-terminated array built by cgfs_iterate_cgroup(); each entry is
 * produced by make_key_list_entry().  Returns true on success.
 * NOTE(review): sizeof(*keys) is the size of one array slot (a
 * pointer) — presumably what cgfs_iterate_cgroup expects; confirm.
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1262
/*
 * Return true when @f names a sub-directory (i.e. a child cgroup) of
 * @cgroup inside @controller's hierarchy.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, written;
	size_t buflen;
	char *relpath, *mnt;
	struct stat sb;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* relative path for fstatat():  . + /cgroup + / + f + \0 */
	buflen = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(buflen);
	written = snprintf(relpath, buflen, "%s%s/%s",
			   *cgroup == '/' ? "." : "", cgroup, f);
	if (written < 0 || (size_t)written >= buflen)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1290
1291 #define SEND_CREDS_OK 0
1292 #define SEND_CREDS_NOTSK 1
1293 #define SEND_CREDS_FAIL 2
1294 static bool recv_creds(int sock, struct ucred *cred, char *v);
1295 static int wait_for_pid(pid_t pid);
1296 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1297 static int send_creds_clone_wrapper(void *arg);
1298
1299 /*
1300 * clone a task which switches to @task's namespace and writes '1'.
1301 * over a unix sock so we can read the task's reaper's pid in our
1302 * namespace
1303 *
1304 * Note: glibc's fork() does not respect pidns, which can lead to failed
1305 * assertions inside glibc (and thus failed forks) if the child's pid in
1306 * the pidns and the parent pid outside are identical. Using clone prevents
1307 * this issue.
1308 */
1309 static void write_task_init_pid_exit(int sock, pid_t target)
1310 {
1311 char fnam[100];
1312 pid_t pid;
1313 int fd, ret;
1314 size_t stack_size = sysconf(_SC_PAGESIZE);
1315 void *stack = alloca(stack_size);
1316
1317 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1318 if (ret < 0 || ret >= sizeof(fnam))
1319 _exit(1);
1320
1321 fd = open(fnam, O_RDONLY);
1322 if (fd < 0) {
1323 perror("write_task_init_pid_exit open of ns/pid");
1324 _exit(1);
1325 }
1326 if (setns(fd, 0)) {
1327 perror("write_task_init_pid_exit setns 1");
1328 close(fd);
1329 _exit(1);
1330 }
1331 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1332 if (pid < 0)
1333 _exit(1);
1334 if (pid != 0) {
1335 if (!wait_for_pid(pid))
1336 _exit(1);
1337 _exit(0);
1338 }
1339 }
1340
1341 static int send_creds_clone_wrapper(void *arg) {
1342 struct ucred cred;
1343 char v;
1344 int sock = *(int *)arg;
1345
1346 /* we are the child */
1347 cred.uid = 0;
1348 cred.gid = 0;
1349 cred.pid = 1;
1350 v = '1';
1351 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1352 return 1;
1353 return 0;
1354 }
1355
1356 static pid_t get_init_pid_for_task(pid_t task)
1357 {
1358 int sock[2];
1359 pid_t pid;
1360 pid_t ret = -1;
1361 char v = '0';
1362 struct ucred cred;
1363
1364 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1365 perror("socketpair");
1366 return -1;
1367 }
1368
1369 pid = fork();
1370 if (pid < 0)
1371 goto out;
1372 if (!pid) {
1373 close(sock[1]);
1374 write_task_init_pid_exit(sock[0], task);
1375 _exit(0);
1376 }
1377
1378 if (!recv_creds(sock[1], &cred, &v))
1379 goto out;
1380 ret = cred.pid;
1381
1382 out:
1383 close(sock[0]);
1384 close(sock[1]);
1385 if (pid > 0)
1386 wait_for_pid(pid);
1387 return ret;
1388 }
1389
1390 pid_t lookup_initpid_in_store(pid_t qpid)
1391 {
1392 pid_t answer = 0;
1393 struct stat sb;
1394 struct pidns_init_store *e;
1395 char fnam[100];
1396
1397 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1398 store_lock();
1399 if (stat(fnam, &sb) < 0)
1400 goto out;
1401 e = lookup_verify_initpid(&sb);
1402 if (e) {
1403 answer = e->initpid;
1404 goto out;
1405 }
1406 answer = get_init_pid_for_task(qpid);
1407 if (answer > 0)
1408 save_initpid(&sb, answer);
1409
1410 out:
1411 /* we prune at end in case we are returning
1412 * the value we were about to return */
1413 prune_initpid_store();
1414 store_unlock();
1415 return answer;
1416 }
1417
/*
 * Reap child @pid, retrying on EINTR.  Returns 0 when the child exited
 * normally with status 0, -1 otherwise (including pid <= 0).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w < 0 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1438
1439
1440 /*
1441 * append pid to *src.
1442 * src: a pointer to a char* in which ot append the pid.
1443 * sz: the number of characters printed so far, minus trailing \0.
1444 * asz: the allocated size so far
1445 * pid: the pid to append
1446 */
1447 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1448 {
1449 char tmp[30];
1450
1451 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1452
1453 if (!*src || tmplen + *sz + 1 >= *asz) {
1454 char *tmp;
1455 do {
1456 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1457 } while (!tmp);
1458 *src = tmp;
1459 *asz += BUF_RESERVE_SIZE;
1460 }
1461 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1462 *sz += tmplen;
1463 }
1464
1465 /*
1466 * Given a open file * to /proc/pid/{u,g}id_map, and an id
1467 * valid in the caller's namespace, return the id mapped into
1468 * pid's namespace.
1469 * Returns the mapped id, or -1 on error.
1470 */
1471 unsigned int
1472 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1473 {
1474 unsigned int nsuid, // base id for a range in the idfile's namespace
1475 hostuid, // base id for a range in the caller's namespace
1476 count; // number of ids in this range
1477 char line[400];
1478 int ret;
1479
1480 fseek(idfile, 0L, SEEK_SET);
1481 while (fgets(line, 400, idfile)) {
1482 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1483 if (ret != 3)
1484 continue;
1485 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1486 /*
1487 * uids wrapped around - unexpected as this is a procfile,
1488 * so just bail.
1489 */
1490 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1491 nsuid, hostuid, count, line);
1492 return -1;
1493 }
1494 if (hostuid <= in_id && hostuid+count > in_id) {
1495 /*
1496 * now since hostuid <= in_id < hostuid+count, and
1497 * hostuid+count and nsuid+count do not wrap around,
1498 * we know that nsuid+(in_id-hostuid) which must be
1499 * less that nsuid+(count) must not wrap around
1500 */
1501 return (in_id - hostuid) + nsuid;
1502 }
1503 }
1504
1505 // no answer found
1506 return -1;
1507 }
1508
1509 /*
1510 * for is_privileged_over,
1511 * specify whether we require the calling uid to be root in his
1512 * namespace
1513 */
1514 #define NS_ROOT_REQD true
1515 #define NS_ROOT_OPT false
1516
1517 #define PROCLEN 100
1518
1519 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1520 {
1521 char fpath[PROCLEN];
1522 int ret;
1523 bool answer = false;
1524 uid_t nsuid;
1525
1526 if (victim == -1 || uid == -1)
1527 return false;
1528
1529 /*
1530 * If the request is one not requiring root in the namespace,
1531 * then having the same uid suffices. (i.e. uid 1000 has write
1532 * access to files owned by uid 1000
1533 */
1534 if (!req_ns_root && uid == victim)
1535 return true;
1536
1537 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1538 if (ret < 0 || ret >= PROCLEN)
1539 return false;
1540 FILE *f = fopen(fpath, "r");
1541 if (!f)
1542 return false;
1543
1544 /* if caller's not root in his namespace, reject */
1545 nsuid = convert_id_to_ns(f, uid);
1546 if (nsuid)
1547 goto out;
1548
1549 /*
1550 * If victim is not mapped into caller's ns, reject.
1551 * XXX I'm not sure this check is needed given that fuse
1552 * will be sending requests where the vfs has converted
1553 */
1554 nsuid = convert_id_to_ns(f, victim);
1555 if (nsuid == -1)
1556 goto out;
1557
1558 answer = true;
1559
1560 out:
1561 fclose(f);
1562 return answer;
1563 }
1564
/*
 * Check whether file mode bits @fmode (a 3-bit rwx group, already
 * shifted down by the caller) grant the access requested in the
 * open-style flags @req_mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	mode_t acc = req_mode & O_ACCMODE;

	if (acc == O_RDONLY)
		needed = S_IROTH;
	else if (acc == O_WRONLY)
		needed = S_IWOTH;
	else if (acc == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1584
1585
1586 /*
1587 * taskcg is a/b/c
1588 * querycg is /a/b/c/d/e
1589 * we return 'd'
1590 */
1591 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1592 {
1593 char *start, *end;
1594
1595 if (strlen(taskcg) <= strlen(querycg)) {
1596 lxcfs_error("%s\n", "I was fed bad input.");
1597 return NULL;
1598 }
1599
1600 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1601 start = strdup(taskcg + 1);
1602 else
1603 start = strdup(taskcg + strlen(querycg) + 1);
1604 if (!start)
1605 return NULL;
1606 end = strchr(start, '/');
1607 if (end)
1608 *end = '\0';
1609 return start;
1610 }
1611
/* Trim a single trailing newline from @x, in place. */
static void stripnewline(char *x)
{
	size_t n = strlen(x);

	if (n > 0 && x[n - 1] == '\n')
		x[n - 1] = '\0';
}
1618
1619 char *get_pid_cgroup(pid_t pid, const char *contrl)
1620 {
1621 int cfd;
1622 char fnam[PROCLEN];
1623 FILE *f;
1624 char *answer = NULL;
1625 char *line = NULL;
1626 size_t len = 0;
1627 int ret;
1628 const char *h = find_mounted_controller(contrl, &cfd);
1629 if (!h)
1630 return NULL;
1631
1632 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1633 if (ret < 0 || ret >= PROCLEN)
1634 return NULL;
1635 if (!(f = fopen(fnam, "r")))
1636 return NULL;
1637
1638 while (getline(&line, &len, f) != -1) {
1639 char *c1, *c2;
1640 if (!line[0])
1641 continue;
1642 c1 = strchr(line, ':');
1643 if (!c1)
1644 goto out;
1645 c1++;
1646 c2 = strchr(c1, ':');
1647 if (!c2)
1648 goto out;
1649 *c2 = '\0';
1650 if (strcmp(c1, h) != 0)
1651 continue;
1652 c2++;
1653 stripnewline(c2);
1654 do {
1655 answer = strdup(c2);
1656 } while (!answer);
1657 break;
1658 }
1659
1660 out:
1661 fclose(f);
1662 free(line);
1663 return answer;
1664 }
1665
1666 /*
1667 * check whether a fuse context may access a cgroup dir or file
1668 *
1669 * If file is not null, it is a cgroup file to check under cg.
1670 * If file is null, then we are checking perms on cg itself.
1671 *
1672 * For files we can check the mode of the list_keys result.
1673 * For cgroups, we must make assumptions based on the files under the
1674 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1675 * yet.
1676 */
1677 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1678 {
1679 struct cgfs_files *k = NULL;
1680 bool ret = false;
1681
1682 k = cgfs_get_key(contrl, cg, file);
1683 if (!k)
1684 return false;
1685
1686 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1687 if (perms_include(k->mode >> 6, mode)) {
1688 ret = true;
1689 goto out;
1690 }
1691 }
1692 if (fc->gid == k->gid) {
1693 if (perms_include(k->mode >> 3, mode)) {
1694 ret = true;
1695 goto out;
1696 }
1697 }
1698 ret = perms_include(k->mode, mode);
1699
1700 out:
1701 free_key(k);
1702 return ret;
1703 }
1704
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from cgroup path @cg in place; when
 * the whole path is "/init.scope" the result is "/".  Paths without
 * the suffix are left untouched.
 */
void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *tail;

	if (cg_len < suffix_len)
		return;

	tail = cg + cg_len - suffix_len;
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	if (tail == cg)
		tail[1] = '\0'; /* whole path was "/init.scope": keep "/" */
	else
		*tail = '\0';
}
1722
1723 /*
1724 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1725 * If pid is in /a, he may act on /a/b, but not on /b.
1726 * if the answer is false and nextcg is not NULL, then *nextcg will point
1727 * to a string containing the next cgroup directory under cg, which must be
1728 * freed by the caller.
1729 */
1730 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1731 {
1732 bool answer = false;
1733 char *c2 = get_pid_cgroup(pid, contrl);
1734 char *linecmp;
1735
1736 if (!c2)
1737 return false;
1738 prune_init_slice(c2);
1739
1740 /*
1741 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1742 * they pass in a cgroup without leading '/'
1743 *
1744 * The original line here was:
1745 * linecmp = *cg == '/' ? c2 : c2+1;
1746 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1747 * Serge, do you know?
1748 */
1749 if (*cg == '/' || !strncmp(cg, "./", 2))
1750 linecmp = c2;
1751 else
1752 linecmp = c2 + 1;
1753 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1754 if (nextcg) {
1755 *nextcg = get_next_cgroup_dir(linecmp, cg);
1756 }
1757 goto out;
1758 }
1759 answer = true;
1760
1761 out:
1762 free(c2);
1763 return answer;
1764 }
1765
1766 /*
1767 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1768 */
1769 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1770 {
1771 bool answer = false;
1772 char *c2, *task_cg;
1773 size_t target_len, task_len;
1774
1775 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1776 return true;
1777
1778 c2 = get_pid_cgroup(pid, contrl);
1779 if (!c2)
1780 return false;
1781 prune_init_slice(c2);
1782
1783 task_cg = c2 + 1;
1784 target_len = strlen(cg);
1785 task_len = strlen(task_cg);
1786 if (task_len == 0) {
1787 /* Task is in the root cg, it can see everything. This case is
1788 * not handled by the strmcps below, since they test for the
1789 * last /, but that is the first / that we've chopped off
1790 * above.
1791 */
1792 answer = true;
1793 goto out;
1794 }
1795 if (strcmp(cg, task_cg) == 0) {
1796 answer = true;
1797 goto out;
1798 }
1799 if (target_len < task_len) {
1800 /* looking up a parent dir */
1801 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1802 answer = true;
1803 goto out;
1804 }
1805 if (target_len > task_len) {
1806 /* looking up a child dir */
1807 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1808 answer = true;
1809 goto out;
1810 }
1811
1812 out:
1813 free(c2);
1814 return answer;
1815 }
1816
1817 /*
1818 * given /cgroup/freezer/a/b, return "freezer".
1819 * the returned char* should NOT be freed.
1820 */
1821 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1822 {
1823 const char *p1;
1824 char *contr, *slash;
1825
1826 if (strlen(path) < 9) {
1827 errno = EACCES;
1828 return NULL;
1829 }
1830 if (*(path + 7) != '/') {
1831 errno = EINVAL;
1832 return NULL;
1833 }
1834 p1 = path + 8;
1835 contr = strdupa(p1);
1836 if (!contr) {
1837 errno = ENOMEM;
1838 return NULL;
1839 }
1840 slash = strstr(contr, "/");
1841 if (slash)
1842 *slash = '\0';
1843
1844 int i;
1845 for (i = 0; i < num_hierarchies; i++) {
1846 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1847 return hierarchies[i];
1848 }
1849 errno = ENOENT;
1850 return NULL;
1851 }
1852
1853 /*
1854 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1855 * Note that the returned value may include files (keynames) etc
1856 */
1857 static const char *find_cgroup_in_path(const char *path)
1858 {
1859 const char *p1;
1860
1861 if (strlen(path) < 9) {
1862 errno = EACCES;
1863 return NULL;
1864 }
1865 p1 = strstr(path + 8, "/");
1866 if (!p1) {
1867 errno = EINVAL;
1868 return NULL;
1869 }
1870 errno = 0;
1871 return p1 + 1;
1872 }
1873
1874 /*
1875 * split the last path element from the path in @cg.
1876 * @dir is newly allocated and should be freed, @last not
1877 */
1878 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1879 {
1880 char *p;
1881
1882 do {
1883 *dir = strdup(cg);
1884 } while (!*dir);
1885 *last = strrchr(cg, '/');
1886 if (!*last) {
1887 *last = NULL;
1888 return;
1889 }
1890 p = strrchr(*dir, '/');
1891 *p = '\0';
1892 }
1893
1894 /*
1895 * FUSE ops for /cgroup
1896 */
1897
/*
 * cg_getattr: FUSE getattr handler for everything under /cgroup.
 * Synthesizes stat data: the mount root and bare controllers are
 * world-readable dirs; child cgroups take uid/gid from their keys;
 * keys are regular files with their cgroupfs mode.  Visibility is
 * restricted to the caller's own cgroup subtree.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* all entries report "now" as their timestamps */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	/* the /cgroup mountpoint itself */
	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* path1/path2 = parent dir + last component (dir or key name) */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* resolve the caller's pidns init so visibility checks use the
	 * container's view, not the individual task's pid */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* not a child cgroup: try it as a key (regular file) of path1 */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2007
2008 int cg_opendir(const char *path, struct fuse_file_info *fi)
2009 {
2010 struct fuse_context *fc = fuse_get_context();
2011 const char *cgroup;
2012 struct file_info *dir_info;
2013 char *controller = NULL;
2014
2015 if (!fc)
2016 return -EIO;
2017
2018 if (strcmp(path, "/cgroup") == 0) {
2019 cgroup = NULL;
2020 controller = NULL;
2021 } else {
2022 // return list of keys for the controller, and list of child cgroups
2023 controller = pick_controller_from_path(fc, path);
2024 if (!controller)
2025 return -errno;
2026
2027 cgroup = find_cgroup_in_path(path);
2028 if (!cgroup) {
2029 /* this is just /cgroup/controller, return its contents */
2030 cgroup = "/";
2031 }
2032 }
2033
2034 pid_t initpid = lookup_initpid_in_store(fc->pid);
2035 if (initpid <= 0)
2036 initpid = fc->pid;
2037 if (cgroup) {
2038 if (!caller_may_see_dir(initpid, controller, cgroup))
2039 return -ENOENT;
2040 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2041 return -EACCES;
2042 }
2043
2044 /* we'll free this at cg_releasedir */
2045 dir_info = malloc(sizeof(*dir_info));
2046 if (!dir_info)
2047 return -ENOMEM;
2048 dir_info->controller = must_copy_string(controller);
2049 dir_info->cgroup = must_copy_string(cgroup);
2050 dir_info->type = LXC_TYPE_CGDIR;
2051 dir_info->buf = NULL;
2052 dir_info->file = NULL;
2053 dir_info->buflen = 0;
2054
2055 fi->fh = (unsigned long)dir_info;
2056 return 0;
2057 }
2058
/*
 * cg_readdir: FUSE readdir handler for /cgroup directories.  Emits
 * "." and "..", then either the mounted controllers (for the /cgroup
 * root) or the keys and child cgroups of the opened directory,
 * filtered by what the caller's pidns init is allowed to see.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	/* fi->fh must have been prepared by cg_opendir() */
	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	/* when the caller's cgroup lies below this dir, only the next
	 * path component on the way down is shown */
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* list the cgroup's keys (files) */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2142
2143 void do_release_file_info(struct fuse_file_info *fi)
2144 {
2145 struct file_info *f = (struct file_info *)fi->fh;
2146
2147 if (!f)
2148 return;
2149
2150 fi->fh = 0;
2151
2152 free(f->controller);
2153 f->controller = NULL;
2154 free(f->cgroup);
2155 f->cgroup = NULL;
2156 free(f->file);
2157 f->file = NULL;
2158 free(f->buf);
2159 f->buf = NULL;
2160 free(f);
2161 f = NULL;
2162 }
2163
/* FUSE releasedir handler: frees the file_info set up by cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2169
2170 int cg_open(const char *path, struct fuse_file_info *fi)
2171 {
2172 const char *cgroup;
2173 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2174 struct cgfs_files *k = NULL;
2175 struct file_info *file_info;
2176 struct fuse_context *fc = fuse_get_context();
2177 int ret;
2178
2179 if (!fc)
2180 return -EIO;
2181
2182 controller = pick_controller_from_path(fc, path);
2183 if (!controller)
2184 return -errno;
2185 cgroup = find_cgroup_in_path(path);
2186 if (!cgroup)
2187 return -errno;
2188
2189 get_cgdir_and_path(cgroup, &cgdir, &last);
2190 if (!last) {
2191 path1 = "/";
2192 path2 = cgdir;
2193 } else {
2194 path1 = cgdir;
2195 path2 = last;
2196 }
2197
2198 k = cgfs_get_key(controller, path1, path2);
2199 if (!k) {
2200 ret = -EINVAL;
2201 goto out;
2202 }
2203 free_key(k);
2204
2205 pid_t initpid = lookup_initpid_in_store(fc->pid);
2206 if (initpid <= 0)
2207 initpid = fc->pid;
2208 if (!caller_may_see_dir(initpid, controller, path1)) {
2209 ret = -ENOENT;
2210 goto out;
2211 }
2212 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2213 ret = -EACCES;
2214 goto out;
2215 }
2216
2217 /* we'll free this at cg_release */
2218 file_info = malloc(sizeof(*file_info));
2219 if (!file_info) {
2220 ret = -ENOMEM;
2221 goto out;
2222 }
2223 file_info->controller = must_copy_string(controller);
2224 file_info->cgroup = must_copy_string(path1);
2225 file_info->file = must_copy_string(path2);
2226 file_info->type = LXC_TYPE_CGFILE;
2227 file_info->buf = NULL;
2228 file_info->buflen = 0;
2229
2230 fi->fh = (unsigned long)file_info;
2231 ret = 0;
2232
2233 out:
2234 free(cgdir);
2235 return ret;
2236 }
2237
2238 int cg_access(const char *path, int mode)
2239 {
2240 int ret;
2241 const char *cgroup;
2242 char *path1, *path2, *controller;
2243 char *last = NULL, *cgdir = NULL;
2244 struct cgfs_files *k = NULL;
2245 struct fuse_context *fc = fuse_get_context();
2246
2247 if (strcmp(path, "/cgroup") == 0)
2248 return 0;
2249
2250 if (!fc)
2251 return -EIO;
2252
2253 controller = pick_controller_from_path(fc, path);
2254 if (!controller)
2255 return -errno;
2256 cgroup = find_cgroup_in_path(path);
2257 if (!cgroup) {
2258 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2259 if ((mode & W_OK) == 0)
2260 return 0;
2261 return -EACCES;
2262 }
2263
2264 get_cgdir_and_path(cgroup, &cgdir, &last);
2265 if (!last) {
2266 path1 = "/";
2267 path2 = cgdir;
2268 } else {
2269 path1 = cgdir;
2270 path2 = last;
2271 }
2272
2273 k = cgfs_get_key(controller, path1, path2);
2274 if (!k) {
2275 if ((mode & W_OK) == 0)
2276 ret = 0;
2277 else
2278 ret = -EACCES;
2279 goto out;
2280 }
2281 free_key(k);
2282
2283 pid_t initpid = lookup_initpid_in_store(fc->pid);
2284 if (initpid <= 0)
2285 initpid = fc->pid;
2286 if (!caller_may_see_dir(initpid, controller, path1)) {
2287 ret = -ENOENT;
2288 goto out;
2289 }
2290 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2291 ret = -EACCES;
2292 goto out;
2293 }
2294
2295 ret = 0;
2296
2297 out:
2298 free(cgdir);
2299 return ret;
2300 }
2301
/* FUSE release handler: frees the file_info set up by cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2307
2308 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2309
2310 static bool wait_for_sock(int sock, int timeout)
2311 {
2312 struct epoll_event ev;
2313 int epfd, ret, now, starttime, deltatime, saved_errno;
2314
2315 if ((starttime = time(NULL)) < 0)
2316 return false;
2317
2318 if ((epfd = epoll_create(1)) < 0) {
2319 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2320 return false;
2321 }
2322
2323 ev.events = POLLIN_SET;
2324 ev.data.fd = sock;
2325 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2326 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2327 close(epfd);
2328 return false;
2329 }
2330
2331 again:
2332 if ((now = time(NULL)) < 0) {
2333 close(epfd);
2334 return false;
2335 }
2336
2337 deltatime = (starttime + timeout) - now;
2338 if (deltatime < 0) { // timeout
2339 errno = 0;
2340 close(epfd);
2341 return false;
2342 }
2343 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2344 if (ret < 0 && errno == EINTR)
2345 goto again;
2346 saved_errno = errno;
2347 close(epfd);
2348
2349 if (ret <= 0) {
2350 errno = saved_errno;
2351 return false;
2352 }
2353 return true;
2354 }
2355
/* Receive up to @len bytes from @sockfd, waiting at most 2 seconds for
 * data; returns recv()'s result, or -1 on timeout/error. */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;
	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2362
2363 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2364 {
2365 struct msghdr msg = { 0 };
2366 struct iovec iov;
2367 struct cmsghdr *cmsg;
2368 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2369 char buf[1];
2370 buf[0] = 'p';
2371
2372 if (pingfirst) {
2373 if (msgrecv(sock, buf, 1) != 1) {
2374 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2375 return SEND_CREDS_FAIL;
2376 }
2377 }
2378
2379 msg.msg_control = cmsgbuf;
2380 msg.msg_controllen = sizeof(cmsgbuf);
2381
2382 cmsg = CMSG_FIRSTHDR(&msg);
2383 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2384 cmsg->cmsg_level = SOL_SOCKET;
2385 cmsg->cmsg_type = SCM_CREDENTIALS;
2386 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2387
2388 msg.msg_name = NULL;
2389 msg.msg_namelen = 0;
2390
2391 buf[0] = v;
2392 iov.iov_base = buf;
2393 iov.iov_len = sizeof(buf);
2394 msg.msg_iov = &iov;
2395 msg.msg_iovlen = 1;
2396
2397 if (sendmsg(sock, &msg, 0) < 0) {
2398 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2399 if (errno == 3)
2400 return SEND_CREDS_NOTSK;
2401 return SEND_CREDS_FAIL;
2402 }
2403
2404 return SEND_CREDS_OK;
2405 }
2406
/*
 * Receive one byte plus the sender's SCM_CREDENTIALS over @sock: the
 * counterpart of send_creds() with pingfirst=true.  We first enable
 * SO_PASSCRED and write a '1' ping byte so the peer knows we are
 * ready.  On success the payload byte lands in *v and the (kernel-
 * translated) creds in *cred; returns true.  On failure *v stays '1'
 * and the cred fields stay -1.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	/* SO_PASSCRED makes the kernel attach and pid-translate ucred data */
	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* only accept a control message that is exactly a ucred */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2464
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	// pipe the child uses to ACK the parent
	int sock;	// socket passed through to the wrapped function
	pid_t tpid;	// target pid whose namespace is involved
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2471
2472 /*
2473 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2474 * with clone(). This simply writes '1' as ACK back to the parent
2475 * before calling the actual wrapped function.
2476 */
2477 static int pid_ns_clone_wrapper(void *arg) {
2478 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2479 char b = '1';
2480
2481 close(args->cpipe[0]);
2482 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2483 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2484 close(args->cpipe[1]);
2485 return args->wrapped(args->sock, args->tpid);
2486 }
2487
2488 /*
2489 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2490 * int value back over the socket. This shifts the pid from the
2491 * sender's pidns into tpid's pidns.
2492 */
2493 static int pid_to_ns(int sock, pid_t tpid)
2494 {
2495 char v = '0';
2496 struct ucred cred;
2497
2498 while (recv_creds(sock, &cred, &v)) {
2499 if (v == '1')
2500 return 0;
2501 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2502 return 1;
2503 }
2504 return 0;
2505 }
2506
2507
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the pid namespace of the target process. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child acks that it has started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	/* One page of stack for the small helper; clone() is handed the
	 * top of the downward-growing stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	/* This function runs in a forked child; never return to caller. */
	_exit(0);
}
2564
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * On success, *d holds the newline-separated pids translated into @tpid's
 * pid namespace (caller frees).  Returns false on any error.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* Raw (untranslated) pid list as seen on the host. */
	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	/* Parent: feed each pid as SCM credentials, read back translation. */
	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next;
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Tell the child to exit by sending our own pid with marker '1'. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2652
/*
 * cg_read: FUSE read handler for cgroup files.  Looks the key up, checks
 * that the caller may read it, then returns the contents.  The "tasks"
 * and "cgroup.procs" files are special-cased so the pids are translated
 * into the reader's pid namespace.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	/* Whole contents are served at offset 0; later offsets mean EOF. */
	if (offset)
		return 0;

	if (!fc)
		return -EIO;

	if (!f->controller)
		return -EINVAL;

	/* Confirm the key exists; only the lookup result is needed. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* Append a trailing newline when there is room and none present. */
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2719
2720 static int pid_from_ns(int sock, pid_t tpid)
2721 {
2722 pid_t vpid;
2723 struct ucred cred;
2724 char v;
2725 int ret;
2726
2727 cred.uid = 0;
2728 cred.gid = 0;
2729 while (1) {
2730 if (!wait_for_sock(sock, 2)) {
2731 lxcfs_error("%s\n", "Timeout reading from parent.");
2732 return 1;
2733 }
2734 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2735 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2736 return 1;
2737 }
2738 if (vpid == -1) // done
2739 break;
2740 v = '0';
2741 cred.pid = vpid;
2742 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2743 v = '1';
2744 cred.pid = getpid();
2745 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2746 return 1;
2747 }
2748 }
2749 return 0;
2750 }
2751
/*
 * pid_from_ns_wrapper: setns() into @tpid's pid namespace, then clone a
 * helper child (the first process truly inside the target ns) that runs
 * pid_from_ns() to translate pids read from @sock.  Runs in a forked
 * child and always terminates via _exit().
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the pid namespace of the target process. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the cloned child acks that it has started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* One page of stack for the small helper; clone() is handed the
	 * top of the downward-growing stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2797
/*
 * Given host @uid, return the uid to which it maps in
 * @pid's user namespace, or -1 if none.
 *
 * Returns true and fills *answer on success, false otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char path[400];
	int ret;

	/* Bounded, checked path construction instead of sprintf(). */
	ret = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return false;

	f = fopen(path, "r");
	if (!f)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* convert_id_to_ns() signals "no mapping" with (uid_t)-1. */
	if (*answer == (uid_t)-1)
		return false;
	return true;
}
2819
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left as (uid_t)-1 / (gid_t)-1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;

	/* Bounded, checked path construction instead of sprintf(). */
	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	/* "Uid:" and "Gid:" lines carry real/effective/saved/fs ids; the
	 * first field (real id) is all we parse. */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2858
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . they are owned by the same uid,
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's userns where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v)
		return true;
	if (r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* Is @r root inside its own userns, and is @v's uid visible there? */
	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(v_uid, r, &mapped);
}
2884
/*
 * do_write_pids: write pids contained in @buf into the cgroup's pids
 * file, translating each from @tpid's pid namespace first.  A forked
 * helper (pid_from_ns_wrapper) does the translation over a socketpair.
 * Each move is permission-checked via may_move_pid().  Returns true only
 * if every pid was written (and the final fclose succeeded).
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	/* Parent: send each pid, receive the translation as credentials. */
	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		/* v == '1' marks an untranslatable pid; silently skip it. */
		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	/* fclose flushes the buffered fprintf()s; a failure there means the
	 * kernel rejected the write, so report it. */
	if (pids_file) {
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2963
2964 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2965 struct fuse_file_info *fi)
2966 {
2967 struct fuse_context *fc = fuse_get_context();
2968 char *localbuf = NULL;
2969 struct cgfs_files *k = NULL;
2970 struct file_info *f = (struct file_info *)fi->fh;
2971 bool r;
2972
2973 if (f->type != LXC_TYPE_CGFILE) {
2974 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2975 return -EIO;
2976 }
2977
2978 if (offset)
2979 return 0;
2980
2981 if (!fc)
2982 return -EIO;
2983
2984 localbuf = alloca(size+1);
2985 localbuf[size] = '\0';
2986 memcpy(localbuf, buf, size);
2987
2988 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2989 size = -EINVAL;
2990 goto out;
2991 }
2992
2993 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2994 size = -EACCES;
2995 goto out;
2996 }
2997
2998 if (strcmp(f->file, "tasks") == 0 ||
2999 strcmp(f->file, "/tasks") == 0 ||
3000 strcmp(f->file, "/cgroup.procs") == 0 ||
3001 strcmp(f->file, "cgroup.procs") == 0)
3002 // special case - we have to translate the pids
3003 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3004 else
3005 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3006
3007 if (!r)
3008 size = -EINVAL;
3009
3010 out:
3011 free_key(k);
3012 return size;
3013 }
3014
/*
 * cg_chown: FUSE chown handler for cgroup files and directories.  The
 * caller must be privileged over the current owner of the entry (looked
 * up via the "tasks" file for directories) in its own user namespace.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	/* Nobody may chown the top-level /cgroup directory. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
3080
/*
 * cg_chmod: FUSE chmod handler for cgroup files and directories.
 * Mirrors cg_chown(), but root in the caller's user namespace is
 * sufficient (NS_ROOT_OPT rather than NS_ROOT_REQD).
 */
int cg_chmod(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	/* Nobody may chmod the top-level /cgroup directory. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		ret = -EPERM;
		goto out;
	}

	if (!cgfs_chmod_file(controller, cgroup, mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;
out:
	free_key(k);
	free(cgdir);
	return ret;
}
3149
/*
 * cg_mkdir: FUSE mkdir handler; creates a new child cgroup owned by the
 * caller, provided the caller's init task sits in an ancestor cgroup and
 * the caller may write the parent directory.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Permission checks are relative to the container's init task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/* If the new directory name matches the caller's own next
		 * cgroup level it already exists from its point of view. */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3203
/*
 * cg_rmdir: FUSE rmdir handler; removes a child cgroup if the caller's
 * init task is in an ancestor cgroup and the caller may write the
 * parent directory.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Permission checks are relative to the container's init task. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Deleting the caller's own cgroup (or above) reports EBUSY. */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3264
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3271
/*
 * Parse a memory.stat buffer, extracting the "total_*" counters that are
 * surfaced through /proc/meminfo.  Each value is converted from bytes
 * to kB.  Unknown lines are skipped.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	const struct {
		const char *key;
		size_t len;		/* strlen(key) */
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         11, cached },
		{ "total_active_anon",   17, active_anon },
		{ "total_inactive_anon", 19, inactive_anon },
		{ "total_active_file",   17, active_file },
		{ "total_inactive_file", 19, inactive_file },
		{ "total_unevictable",   17, unevictable },
		{ "total_shmem",         11, shmem },
	};
	size_t i;
	char *eol;

	while (*memstat) {
		/* First matching prefix wins, as in an if/else-if chain. */
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			if (!startswith(memstat, fields[i].key))
				continue;
			sscanf(memstat + fields[i].len, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;
			break;
		}
		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3308
/*
 * Scan a blkio stat buffer for the line "<major>:<minor> <iotype>" and
 * store its value in *v.  *v is left at 0 if no matching line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *eol;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*str) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
		str = eol + 1;
	}
}
3331
/*
 * read_file: pass-through reader used when the caller is not in a cgroup
 * we track.  Copies @path line by line into @d's cache buffer, records
 * the full length in d->size for later offset reads, then copies at most
 * @size bytes into @buf.  Returns the number of bytes placed in @buf,
 * or 0 on any error.
 */
int read_file(const char *path, char *buf, size_t size, struct file_info *d)
{
	size_t linelen = 0, total_len = 0, rv = 0;
	char *line = NULL;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = fopen(path, "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		/* snprintf reports the would-be length; >= cache_size means
		 * the cache buffer is full and the copy was truncated. */
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	fclose(f);
	free(line);
	return rv;
}
3371
3372 /*
3373 * FUSE ops for /proc
3374 */
3375
/*
 * Read the given memory limit @file for @cgroup and return its value in
 * bytes, or (unsigned long)-1 when the file cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = -1;
	char *str = NULL;

	if (cgfs_get_value("memory", cgroup, file, &str))
		limit = strtoul(str, NULL, 10);
	free(str);

	return limit;
}
3388
/*
 * Walk from @cgroup up to the root and return the smallest memory limit
 * found along the way, since any ancestor may impose a tighter limit.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *cur = strdupa(cgroup);
	unsigned long best, limit;

	best = get_memlimit(cur, file);
	while (strcmp(cur, "/") != 0) {
		cur = dirname(cur);
		limit = get_memlimit(cur, file);
		/* (unsigned long)-1 marks an unreadable/unset limit. */
		if (limit != -1 && limit < best)
			best = limit;
	}

	return best;
}
3405
/*
 * proc_meminfo_read: build a containerized /proc/meminfo.  Host lines
 * are rewritten using the caller's memory cgroup limits and usage
 * (memory.limit_in_bytes, memory.usage_in_bytes, memory.stat, and the
 * memsw files when swap accounting is enabled).  The rendered text is
 * cached in @d so follow-up reads at non-zero offsets can be served
 * from memory.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offsets are served from the cached rendering. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		/* Not in a tracked cgroup: pass the host file through. */
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	/* All values below are reported in kB. */
	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable, &shmem);

	/* Rewrite the host's meminfo line by line. */
	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			/* Never report more memory than the host has. */
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts->swap_off == false) {
			unsigned long swaptotal = memswlimit,
				swapusage = memswusage - memusage,
				swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
				active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
				inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	/* Cache the rendering so offset reads can be satisfied later. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3597
/*
 * Read the cpuset.cpus for cg
 * Return the answer in a newly allocated string which must be freed
 * by the caller, or NULL on failure.
 */
char *get_cpuset(const char *cg)
{
	char *cpus = NULL;

	if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus))
		return NULL;

	return cpus;
}
3610
3611 bool cpu_in_cpuset(int cpu, const char *cpuset);
3612
/*
 * Return true if @line is a "processor : N" cpuinfo line whose cpu N is
 * part of @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3621
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param`. The parameter value is
 * returned through `value`.  Returns false if the file cannot be read
 * or parsed.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;
	int ret;

	/* Bounded print; reject unexpected parameter names instead of
	 * overflowing the tightly-sized buffer (sprintf did not check). */
	ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	if (ret < 0 || (size_t)ret >= sizeof(file))
		goto err;

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* %ld does not match int64_t on 32-bit platforms; SCNd64 does. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op; no guard needed */
	return rv;
}
3647
/*
 * Return the maximum number of visible CPUs based on CPU quotas.
 * If there is no quota set, zero is returned.
 */
int max_cpu_count(const char *cg)
{
	int64_t quota, period;
	int count, nprocs;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;
	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;
	if (quota <= 0 || period <= 0)
		return 0;

	/* Round up: a partial period's worth of quota buys one more CPU. */
	count = quota / period;
	if (quota % period > 0)
		count += 1;

	/* Never report more CPUs than the host actually has. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3681
/*
 * Determine whether CPU views should be used or not: both the "cpu" and
 * "cpuacct" controllers must be mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;
	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3700
3701 /*
3702 * check whether this is a '^processor" line in /proc/cpuinfo
3703 */
/*
 * check whether this is a '^processor' line in /proc/cpuinfo
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3712
/*
 * FUSE read handler for the emulated /proc/cpuinfo.  Only CPUs allowed by
 * the container's cpuset are rendered, renumbered from 0; when cpuview is
 * active the count is additionally capped at the CPU-quota limit.  The
 * rendered text is cached in d->buf for subsequent offset reads.
 * Returns the number of bytes copied into `buf`, 0, or -EINVAL.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offset: serve the remainder of the cached render; the
	 * file is only regenerated on a read from offset 0. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the requesting pid to its container's init; fall back to
	 * the raw host file when no cpuset cgroup can be determined. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	/* With cpuview, CPU quota limits how many processors we show. */
	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		/* s390x cpuinfo has a different layout ("processor N:" lines
		 * plus a header); detect it from the very first line. */
		if (firstline) {
			firstline = false;
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* Drop the host's processor-count header; it is rebuilt below
		 * for s390x with the container's count. */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Quota cap reached: stop emitting further CPUs. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			/* Print this CPU's stanza only if it is in the cpuset,
			 * renumbering it to the next container-visible index. */
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x variant of the processor line. */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			/* Keep the per-CPU details, substituting our index. */
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Non-"processor" lines are copied through while inside the
		 * stanza of a CPU we decided to print. */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	/* s390x: rebuild the buffer with the synthetic header (vendor line
	 * plus container-visible processor count) prepended. */
	if (is_s390x) {
		char *origcache = d->buf;
		ssize_t l;
		/* Busy-retry until malloc succeeds; d->buf must end up valid. */
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	/* Cache the full render, then serve at most `size` bytes from 0. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
3888
3889 static uint64_t get_reaper_start_time(pid_t pid)
3890 {
3891 int ret;
3892 FILE *f;
3893 uint64_t starttime;
3894 /* strlen("/proc/") = 6
3895 * +
3896 * LXCFS_NUMSTRLEN64
3897 * +
3898 * strlen("/stat") = 5
3899 * +
3900 * \0 = 1
3901 * */
3902 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3903 char path[__PROC_PID_STAT_LEN];
3904 pid_t qpid;
3905
3906 qpid = lookup_initpid_in_store(pid);
3907 if (qpid <= 0) {
3908 /* Caller can check for EINVAL on 0. */
3909 errno = EINVAL;
3910 return 0;
3911 }
3912
3913 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3914 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3915 /* Caller can check for EINVAL on 0. */
3916 errno = EINVAL;
3917 return 0;
3918 }
3919
3920 f = fopen(path, "r");
3921 if (!f) {
3922 /* Caller can check for EINVAL on 0. */
3923 errno = EINVAL;
3924 return 0;
3925 }
3926
3927 /* Note that the *scanf() argument supression requires that length
3928 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3929 * at us. It's like telling someone you're not married and then asking
3930 * if you can bring your wife to the party.
3931 */
3932 ret = fscanf(f, "%*d " /* (1) pid %d */
3933 "%*s " /* (2) comm %s */
3934 "%*c " /* (3) state %c */
3935 "%*d " /* (4) ppid %d */
3936 "%*d " /* (5) pgrp %d */
3937 "%*d " /* (6) session %d */
3938 "%*d " /* (7) tty_nr %d */
3939 "%*d " /* (8) tpgid %d */
3940 "%*u " /* (9) flags %u */
3941 "%*u " /* (10) minflt %lu */
3942 "%*u " /* (11) cminflt %lu */
3943 "%*u " /* (12) majflt %lu */
3944 "%*u " /* (13) cmajflt %lu */
3945 "%*u " /* (14) utime %lu */
3946 "%*u " /* (15) stime %lu */
3947 "%*d " /* (16) cutime %ld */
3948 "%*d " /* (17) cstime %ld */
3949 "%*d " /* (18) priority %ld */
3950 "%*d " /* (19) nice %ld */
3951 "%*d " /* (20) num_threads %ld */
3952 "%*d " /* (21) itrealvalue %ld */
3953 "%" PRIu64, /* (22) starttime %llu */
3954 &starttime);
3955 if (ret != 1) {
3956 fclose(f);
3957 /* Caller can check for EINVAL on 0. */
3958 errno = EINVAL;
3959 return 0;
3960 }
3961
3962 fclose(f);
3963
3964 errno = 0;
3965 return starttime;
3966 }
3967
/* Reaper start time converted from clock ticks to whole seconds since
 * boot; 0 on failure (with the underlying errno semantics preserved). */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t ticks;
	int64_t hz;

	ticks = get_reaper_start_time(pid);
	if (ticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	hz = sysconf(_SC_CLK_TCK);
	if (hz < 0 && errno == EINVAL) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return ticks / hz;
}
3989
3990 static uint64_t get_reaper_age(pid_t pid)
3991 {
3992 uint64_t procstart, uptime, procage;
3993
3994 /* We need to substract the time the process has started since system
3995 * boot minus the time when the system has started to get the actual
3996 * reaper age.
3997 */
3998 procstart = get_reaper_start_time_in_sec(pid);
3999 procage = procstart;
4000 if (procstart > 0) {
4001 int ret;
4002 struct timespec spec;
4003
4004 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4005 if (ret < 0)
4006 return 0;
4007 /* We could make this more precise here by using the tv_nsec
4008 * field in the timespec struct and convert it to milliseconds
4009 * and then create a double for the seconds and milliseconds but
4010 * that seems more work than it is worth.
4011 */
4012 uptime = spec.tv_sec;
4013 procage = uptime - procstart;
4014 }
4015
4016 return procage;
4017 }
4018
4019 /*
4020 * Returns 0 on success.
4021 * It is the caller's responsibility to free `return_usage`, unless this
4022 * function returns an error.
4023 */
4024 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4025 {
4026 int cpucount = get_nprocs_conf();
4027 struct cpuacct_usage *cpu_usage;
4028 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4029 int cg_cpu;
4030 uint64_t cg_user, cg_system;
4031 int64_t ticks_per_sec;
4032 char *usage_str = NULL;
4033
4034 ticks_per_sec = sysconf(_SC_CLK_TCK);
4035
4036 if (ticks_per_sec < 0 && errno == EINVAL) {
4037 lxcfs_debug(
4038 "%s\n",
4039 "read_cpuacct_usage_all failed to determine number of clock ticks "
4040 "in a second");
4041 return -1;
4042 }
4043
4044 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4045 if (!cpu_usage)
4046 return -ENOMEM;
4047
4048 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4049 rv = -1;
4050 goto err;
4051 }
4052
4053 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4054 lxcfs_error("read_cpuacct_usage_all reading first line from "
4055 "%s/cpuacct.usage_all failed.\n", cg);
4056 rv = -1;
4057 goto err;
4058 }
4059
4060 read_pos += read_cnt;
4061
4062 for (i = 0, j = 0; i < cpucount; i++) {
4063 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4064 &cg_system, &read_cnt);
4065
4066 if (ret == EOF)
4067 break;
4068
4069 if (ret != 3) {
4070 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4071 "failed.\n", cg);
4072 rv = -1;
4073 goto err;
4074 }
4075
4076 read_pos += read_cnt;
4077
4078 /* Convert the time from nanoseconds to USER_HZ */
4079 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4080 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4081 j++;
4082 }
4083
4084 rv = 0;
4085 *return_usage = cpu_usage;
4086 *size = cpucount;
4087
4088 err:
4089 if (usage_str)
4090 free(usage_str);
4091
4092 if (rv != 0) {
4093 free(cpu_usage);
4094 *return_usage = NULL;
4095 }
4096
4097 return rv;
4098 }
4099
4100 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4101 {
4102 int i;
4103 unsigned long sum = 0;
4104
4105 for (i = 0; i < cpu_count; i++) {
4106 if (!newer[i].online)
4107 continue;
4108
4109 /* When cpuset is changed on the fly, the CPUs might get reordered.
4110 * We could either reset all counters, or check that the substractions
4111 * below will return expected results.
4112 */
4113 if (newer[i].user > older[i].user)
4114 diff[i].user = newer[i].user - older[i].user;
4115 else
4116 diff[i].user = 0;
4117
4118 if (newer[i].system > older[i].system)
4119 diff[i].system = newer[i].system - older[i].system;
4120 else
4121 diff[i].system = 0;
4122
4123 if (newer[i].idle > older[i].idle)
4124 diff[i].idle = newer[i].idle - older[i].idle;
4125 else
4126 diff[i].idle = 0;
4127
4128 sum += diff[i].user;
4129 sum += diff[i].system;
4130 sum += diff[i].idle;
4131 }
4132
4133 return sum;
4134 }
4135
4136 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4137 {
4138 unsigned long free_space, to_add;
4139
4140 free_space = threshold - usage->user - usage->system;
4141
4142 if (free_space > usage->idle)
4143 free_space = usage->idle;
4144
4145 to_add = free_space > *surplus ? *surplus : free_space;
4146
4147 *counter += to_add;
4148 usage->idle -= to_add;
4149 *surplus -= to_add;
4150 }
4151
4152 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4153 {
4154 struct cg_proc_stat *first = NULL, *prev, *tmp;
4155
4156 for (prev = NULL; node; ) {
4157 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4158 tmp = node;
4159 lxcfs_debug("Removing stat node for %s\n", node->cg);
4160
4161 if (prev)
4162 prev->next = node->next;
4163 else
4164 first = node->next;
4165
4166 node = node->next;
4167 free_proc_stat_node(tmp);
4168 } else {
4169 if (!first)
4170 first = node;
4171 prev = node;
4172 node = node->next;
4173 }
4174 }
4175
4176 return first;
4177 }
4178
4179 #define PROC_STAT_PRUNE_INTERVAL 10
4180 static void prune_proc_stat_history(void)
4181 {
4182 int i;
4183 time_t now = time(NULL);
4184
4185 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4186 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4187
4188 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4189 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4190 return;
4191 }
4192
4193 if (proc_stat_history[i]->next) {
4194 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4195 proc_stat_history[i]->lastcheck = now;
4196 }
4197
4198 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4199 }
4200 }
4201
4202 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4203 {
4204 struct cg_proc_stat *node;
4205
4206 pthread_rwlock_rdlock(&head->lock);
4207
4208 if (!head->next) {
4209 pthread_rwlock_unlock(&head->lock);
4210 return NULL;
4211 }
4212
4213 node = head->next;
4214
4215 do {
4216 if (strcmp(cg, node->cg) == 0)
4217 goto out;
4218 } while ((node = node->next));
4219
4220 node = NULL;
4221
4222 out:
4223 pthread_rwlock_unlock(&head->lock);
4224 prune_proc_stat_history();
4225 return node;
4226 }
4227
4228 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4229 {
4230 struct cg_proc_stat *node;
4231 int i;
4232
4233 node = malloc(sizeof(struct cg_proc_stat));
4234 if (!node)
4235 goto err;
4236
4237 node->cg = NULL;
4238 node->usage = NULL;
4239 node->view = NULL;
4240
4241 node->cg = malloc(strlen(cg) + 1);
4242 if (!node->cg)
4243 goto err;
4244
4245 strcpy(node->cg, cg);
4246
4247 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4248 if (!node->usage)
4249 goto err;
4250
4251 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4252
4253 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4254 if (!node->view)
4255 goto err;
4256
4257 node->cpu_count = cpu_count;
4258 node->next = NULL;
4259
4260 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4261 lxcfs_error("%s\n", "Failed to initialize node lock");
4262 goto err;
4263 }
4264
4265 for (i = 0; i < cpu_count; i++) {
4266 node->view[i].user = 0;
4267 node->view[i].system = 0;
4268 node->view[i].idle = 0;
4269 }
4270
4271 return node;
4272
4273 err:
4274 if (node && node->cg)
4275 free(node->cg);
4276 if (node && node->usage)
4277 free(node->usage);
4278 if (node && node->view)
4279 free(node->view);
4280 if (node)
4281 free(node);
4282
4283 return NULL;
4284 }
4285
4286 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4287 {
4288 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4289 struct cg_proc_stat_head *head = proc_stat_history[hash];
4290 struct cg_proc_stat *node, *rv = new_node;
4291
4292 pthread_rwlock_wrlock(&head->lock);
4293
4294 if (!head->next) {
4295 head->next = new_node;
4296 goto out;
4297 }
4298
4299 node = head->next;
4300
4301 for (;;) {
4302 if (strcmp(node->cg, new_node->cg) == 0) {
4303 /* The node is already present, return it */
4304 free_proc_stat_node(new_node);
4305 rv = node;
4306 goto out;
4307 }
4308
4309 if (node->next) {
4310 node = node->next;
4311 continue;
4312 }
4313
4314 node->next = new_node;
4315 goto out;
4316 }
4317
4318 out:
4319 pthread_rwlock_unlock(&head->lock);
4320 return rv;
4321 }
4322
4323 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4324 {
4325 struct cpuacct_usage *new_usage, *new_view;
4326 int i;
4327
4328 /* Allocate new memory */
4329 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4330 if (!new_usage)
4331 return false;
4332
4333 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4334 if (!new_view) {
4335 free(new_usage);
4336 return false;
4337 }
4338
4339 /* Copy existing data & initialize new elements */
4340 for (i = 0; i < cpu_count; i++) {
4341 if (i < node->cpu_count) {
4342 new_usage[i].user = node->usage[i].user;
4343 new_usage[i].system = node->usage[i].system;
4344 new_usage[i].idle = node->usage[i].idle;
4345
4346 new_view[i].user = node->view[i].user;
4347 new_view[i].system = node->view[i].system;
4348 new_view[i].idle = node->view[i].idle;
4349 } else {
4350 new_usage[i].user = 0;
4351 new_usage[i].system = 0;
4352 new_usage[i].idle = 0;
4353
4354 new_view[i].user = 0;
4355 new_view[i].system = 0;
4356 new_view[i].idle = 0;
4357 }
4358 }
4359
4360 free(node->usage);
4361 free(node->view);
4362
4363 node->usage = new_usage;
4364 node->view = new_view;
4365 node->cpu_count = cpu_count;
4366
4367 return true;
4368 }
4369
4370 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4371 {
4372 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4373 struct cg_proc_stat_head *head = proc_stat_history[hash];
4374 struct cg_proc_stat *node;
4375
4376 node = find_proc_stat_node(head, cg);
4377
4378 if (!node) {
4379 node = new_proc_stat_node(usage, cpu_count, cg);
4380 if (!node)
4381 return NULL;
4382
4383 node = add_proc_stat_node(node);
4384 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4385 }
4386
4387 pthread_mutex_lock(&node->lock);
4388
4389 /* If additional CPUs on the host have been enabled, CPU usage counter
4390 * arrays have to be expanded */
4391 if (node->cpu_count < cpu_count) {
4392 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4393 node->cpu_count, cpu_count, cg);
4394
4395 if (!expand_proc_stat_node(node, cpu_count)) {
4396 pthread_mutex_unlock(&node->lock);
4397 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4398 node->cpu_count, cpu_count, cg);
4399 return NULL;
4400 }
4401 }
4402
4403 return node;
4404 }
4405
4406 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4407 {
4408 int i;
4409
4410 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4411 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4412
4413 for (i = 0; i < cpu_count; i++) {
4414 node->view[i].user = 0;
4415 node->view[i].system = 0;
4416 node->view[i].idle = 0;
4417 }
4418
4419 node->cpu_count = cpu_count;
4420 }
4421
/*
 * Render a cpuview-based /proc/stat into `buf`: per-CPU busy time comes
 * from the container's cpuacct counters (`cg_cpu_usage`), idle time is
 * derived from the host's /proc/stat (`f`, positioned after its first
 * "cpu " line), and the number of rendered CPUs is capped by the
 * cgroup's CPU quota.  Returns bytes written, or 0 on error.
 */
static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
{
	char *line = NULL;
	/* NOTE(review): `l` is size_t, so the `l < 0` checks below can never
	 * fire; a negative snprintf() return would wrap to a huge value and
	 * only be caught by the `l >= buf_size` checks. */
	size_t linelen = 0, total_len = 0, rv = 0, l;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu, i;
	int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
	unsigned long user_surplus = 0, system_surplus = 0;
	unsigned long total_sum, threshold;
	struct cg_proc_stat *stat_node;
	struct cpuacct_usage *diff = NULL;
	int nprocs = get_nprocs_conf();

	if (cg_cpu_usage_size < nprocs)
		nprocs = cg_cpu_usage_size;

	/* Read all CPU stats and stop when we've encountered other lines */
	while (getline(&line, &linelen, f) != -1) {
		int ret;
		char cpu_char[10]; /* That's a lot of cores */
		uint64_t all_used, cg_used;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N */
			break;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;

		/* Ignore CPUs beyond the cpuacct array we were given. */
		if (physcpu >= cg_cpu_usage_size)
			continue;

		curcpu ++;
		cpu_cnt ++;

		/* CPUs outside the container's cpuset are marked offline. */
		if (!cpu_in_cpuset(physcpu, cpuset)) {
			for (i = curcpu; i <= physcpu; i++) {
				cg_cpu_usage[i].online = false;
			}
			continue;
		}

		if (curcpu < physcpu) {
			/* Some CPUs may be disabled */
			for (i = curcpu; i < physcpu; i++)
				cg_cpu_usage[i].online = false;

			curcpu = physcpu;
		}

		cg_cpu_usage[curcpu].online = true;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10)
			continue;

		/* Idle as seen by the container: host idle plus whatever
		 * host time was NOT consumed by this cgroup. */
		all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
		cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;

		if (all_used >= cg_used) {
			cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);

		} else {
			lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
					"%lu in cpuacct.usage_all; unable to determine idle time\n",
					curcpu, cg, all_used, cg_used);
			cg_cpu_usage[curcpu].idle = idle;
		}
	}

	/* Cannot use more CPUs than is available due to cpuset */
	if (max_cpus > cpu_cnt)
		max_cpus = cpu_cnt;

	/* Returned locked; unlocked in the err path below. */
	stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);

	if (!stat_node) {
		lxcfs_error("unable to find/create stat node for %s\n", cg);
		rv = 0;
		goto err;
	}

	diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
	if (!diff) {
		rv = 0;
		goto err;
	}

	/*
	 * If the new values are LOWER than values stored in memory, it means
	 * the cgroup has been reset/recreated and we should reset too.
	 * Only the first online CPU is compared (note the unconditional
	 * break at the end of the loop body).
	 */
	for (curcpu = 0; curcpu < nprocs; curcpu++) {
		if (!cg_cpu_usage[curcpu].online)
			continue;

		if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
			reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);

		break;
	}

	total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);

	/* Accumulate deltas into the raw counters; time spent on CPUs past
	 * the quota limit becomes "surplus" to be redistributed below. */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;

		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		stat_node->usage[curcpu].user += diff[curcpu].user;
		stat_node->usage[curcpu].system += diff[curcpu].system;
		stat_node->usage[curcpu].idle += diff[curcpu].idle;

		if (max_cpus > 0 && i >= max_cpus) {
			user_surplus += diff[curcpu].user;
			system_surplus += diff[curcpu].system;
		}
	}

	/* Calculate usage counters of visible CPUs */
	if (max_cpus > 0) {
		/* threshold = maximum usage per cpu, including idle */
		threshold = total_sum / cpu_cnt * max_cpus;

		/* Spread the surplus over visible CPUs that still have room
		 * under the threshold: user time first, then system. */
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (i == max_cpus)
				break;

			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* Add user */
			add_cpu_usage(
					&user_surplus,
					&diff[curcpu],
					&diff[curcpu].user,
					threshold);

			if (diff[curcpu].user + diff[curcpu].system >= threshold)
				continue;

			/* If there is still room, add system */
			add_cpu_usage(
					&system_surplus,
					&diff[curcpu],
					&diff[curcpu].system,
					threshold);
		}

		if (user_surplus > 0)
			lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
		if (system_surplus > 0)
			lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);

		/* Fold the (surplus-adjusted) deltas into the view and sum
		 * over the visible CPUs. */
		for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
			if (i == max_cpus)
				break;

			if (!stat_node->usage[curcpu].online)
				continue;

			i++;

			stat_node->view[curcpu].user += diff[curcpu].user;
			stat_node->view[curcpu].system += diff[curcpu].system;
			stat_node->view[curcpu].idle += diff[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}

	} else {
		/* No quota: the view simply mirrors the raw counters. */
		for (curcpu = 0; curcpu < nprocs; curcpu++) {
			if (!stat_node->usage[curcpu].online)
				continue;

			stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
			stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
			stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;

			user_sum += stat_node->view[curcpu].user;
			system_sum += stat_node->view[curcpu].system;
			idle_sum += stat_node->view[curcpu].idle;
		}
	}

	/* Render the file */
	/* cpu-all */
	l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
			user_sum,
			system_sum,
			idle_sum);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Render visible CPUs */
	for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
		if (!stat_node->usage[curcpu].online)
			continue;

		i++;

		if (max_cpus > 0 && i == max_cpus)
			break;

		l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
				i,
				stat_node->view[curcpu].user,
				stat_node->view[curcpu].system,
				stat_node->view[curcpu].idle);

		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		buf += l;
		buf_size -= l;
		total_len += l;
	}

	/* Pass the rest of /proc/stat, start with the last line read */
	l = snprintf(buf, buf_size, "%s", line);

	if (l < 0) {
		perror("Error writing to cache");
		rv = 0;
		goto err;

	}
	if (l >= buf_size) {
		lxcfs_error("%s\n", "Internal error: truncated write to cache.");
		rv = 0;
		goto err;
	}

	buf += l;
	buf_size -= l;
	total_len += l;

	/* Pass the rest of the host's /proc/stat */
	while (getline(&line, &linelen, f) != -1) {
		l = snprintf(buf, buf_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		if (l >= buf_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		buf += l;
		buf_size -= l;
		total_len += l;
	}

	rv = total_len;

err:
	/* stat_node->lock was taken by find_or_create_proc_stat_node(). */
	if (stat_node)
		pthread_mutex_unlock(&stat_node->lock);
	if (line)
		free(line);
	if (diff)
		free(diff);
	return rv;
}
4737
4738 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/*
 * FUSE read handler for the emulated /proc/stat.  Per-CPU lines are
 * filtered by the container's cpuset and renumbered from 0; when
 * cpuacct.usage_all is readable, its counters replace the host's values.
 * The first CPUALL_MAX_SIZE bytes of d->buf are reserved for the
 * synthesized aggregate "cpu" line, which is filled in last.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
					irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	FILE *f = NULL;
	struct cpuacct_usage *cg_cpu_usage = NULL;
	int cg_cpu_usage_size = 0;

	/* Non-zero offset: serve from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	/* Resolve the caller to its container's init pid; without a cpuset
	 * cgroup, pass the host file through unmodified. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		goto err;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		goto err;
	}

	/* With both cpu and cpuacct controllers available, delegate to the
	 * cpuview renderer, which also honors CPU quotas. */
	if (use_cpuview(cg) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Skip CPUs the container's cpuset does not allow. */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		/* Without cpuacct data (or on a parse failure), pass the
		 * host's line through renumbered to the container index. */
		if (ret != 10 || !cg_cpu_usage) {
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			/* Idle as seen by the container: host idle plus host
			 * time not consumed by this cgroup. */
			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Synthesize the aggregate "cpu" line into the reserved prefix. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Slide the per-CPU text down so it directly follows the "cpu"
	 * summary line. */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	if (f)
		fclose(f);
	if (cg_cpu_usage)
		free(cg_cpu_usage);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
4980
4981 /* This function retrieves the busy time of a group of tasks by looking at
4982 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4983 * been given it's own cpuacct cgroup. If not, this function will take the busy
4984 * time of all other taks that do not actually belong to the container into
4985 * account as well. If someone has a clever solution for this please send a
4986 * patch!
4987 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *val = NULL;
	unsigned long seconds = 0;
	pid_t reaper;

	/* Resolve the container's reaper (init); without one there is
	 * nothing to account against.
	 */
	reaper = lookup_initpid_in_store(task);
	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (!cg)
		goto cleanup;
	prune_init_slice(cg);

	if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &val)) {
		/* cpuacct.usage is reported in nanoseconds; convert to seconds. */
		seconds = strtoul(val, NULL, 10) / 1000000000;
	}

cleanup:
	free(cg);
	free(val);
	return seconds;
}
5011
#if RELOADTEST
/* Test hook: drop a marker file so the reload test can observe that
 * this library was (re)loaded. Failures are deliberately ignored.
 */
void iwashere(void)
{
	int fd = open("/tmp/lxcfs-iwashere", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd >= 0)
		close(fd);
}
#endif
5022
5023 /*
5024 * We read /proc/uptime and reuse its second field.
5025 * For the first field, we use the mtime for the reaper for
5026 * the calling pid as returned by getreaperage
5027 */
5028 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5029 struct fuse_file_info *fi)
5030 {
5031 struct fuse_context *fc = fuse_get_context();
5032 struct file_info *d = (struct file_info *)fi->fh;
5033 unsigned long int busytime = get_reaper_busy(fc->pid);
5034 char *cache = d->buf;
5035 ssize_t total_len = 0;
5036 uint64_t idletime, reaperage;
5037
5038 #if RELOADTEST
5039 iwashere();
5040 #endif
5041
5042 if (offset){
5043 if (!d->cached)
5044 return 0;
5045 if (offset > d->size)
5046 return -EINVAL;
5047 int left = d->size - offset;
5048 total_len = left > size ? size: left;
5049 memcpy(buf, cache + offset, total_len);
5050 return total_len;
5051 }
5052
5053 reaperage = get_reaper_age(fc->pid);
5054 /* To understand why this is done, please read the comment to the
5055 * get_reaper_busy() function.
5056 */
5057 idletime = reaperage;
5058 if (reaperage >= busytime)
5059 idletime = reaperage - busytime;
5060
5061 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5062 if (total_len < 0 || total_len >= d->buflen){
5063 lxcfs_error("%s\n", "failed to write to cache");
5064 return 0;
5065 }
5066
5067 d->size = (int)total_len;
5068 d->cached = 1;
5069
5070 if (total_len > size) total_len = size;
5071
5072 memcpy(buf, d->buf, total_len);
5073 return total_len;
5074 }
5075
/* Read handler for /proc/diskstats: synthesize per-device I/O statistics
 * from the caller's blkio cgroup (the *_recursive counters). Devices with
 * all-zero counters are omitted. Falls back to the host's /proc/diskstats
 * when the caller has no blkio cgroup. Returns bytes copied into @buf,
 * or 0 on error.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	char dev_name[72];
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
		*io_wait_time_str = NULL, *io_service_time_str = NULL;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	FILE *f = NULL;

	/* Continuation read: serve from the cached buffer. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/* Fetch all recursive blkio counters for this cgroup up front. */
	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
		goto err;


	f = fopen("/proc/diskstats", "r");
	if (!f)
		goto err;

	/* Walk the host's device list, substituting cgroup-local values. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* Bytes -> 512-byte sectors, matching /proc/diskstats units. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* Nanoseconds -> milliseconds for the tick fields. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		memset(lbuf, 0, 256);
		/* ios_pgr and rq_ticks are not tracked per cgroup and stay 0. */
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	free(cg);
	if (f)
		fclose(f);
	free(line);
	free(io_serviced_str);
	free(io_merged_str);
	free(io_service_bytes_str);
	free(io_wait_time_str);
	free(io_service_time_str);
	return rv;
}
5208
5209 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5210 struct fuse_file_info *fi)
5211 {
5212 struct fuse_context *fc = fuse_get_context();
5213 struct file_info *d = (struct file_info *)fi->fh;
5214 char *cg = NULL;
5215 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5216 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5217 ssize_t total_len = 0, rv = 0;
5218 ssize_t l = 0;
5219 char *cache = d->buf;
5220
5221 if (offset) {
5222 if (offset > d->size)
5223 return -EINVAL;
5224 if (!d->cached)
5225 return 0;
5226 int left = d->size - offset;
5227 total_len = left > size ? size: left;
5228 memcpy(buf, cache + offset, total_len);
5229 return total_len;
5230 }
5231
5232 pid_t initpid = lookup_initpid_in_store(fc->pid);
5233 if (initpid <= 0)
5234 initpid = fc->pid;
5235 cg = get_pid_cgroup(initpid, "memory");
5236 if (!cg)
5237 return read_file("/proc/swaps", buf, size, d);
5238 prune_init_slice(cg);
5239
5240 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5241
5242 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5243 goto err;
5244
5245 memusage = strtoul(memusage_str, NULL, 10);
5246
5247 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5248 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5249
5250 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5251 memswusage = strtoul(memswusage_str, NULL, 10);
5252
5253 swap_total = (memswlimit - memlimit) / 1024;
5254 swap_free = (memswusage - memusage) / 1024;
5255 }
5256
5257 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5258
5259 /* When no mem + swap limit is specified or swapaccount=0*/
5260 if (!memswlimit) {
5261 char *line = NULL;
5262 size_t linelen = 0;
5263 FILE *f = fopen("/proc/meminfo", "r");
5264
5265 if (!f)
5266 goto err;
5267
5268 while (getline(&line, &linelen, f) != -1) {
5269 if (startswith(line, "SwapTotal:")) {
5270 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5271 } else if (startswith(line, "SwapFree:")) {
5272 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5273 }
5274 }
5275
5276 free(line);
5277 fclose(f);
5278 }
5279
5280 if (swap_total > 0) {
5281 l = snprintf(d->buf + total_len, d->size - total_len,
5282 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5283 swap_total, swap_free);
5284 total_len += l;
5285 }
5286
5287 if (total_len < 0 || l < 0) {
5288 perror("Error writing to cache");
5289 rv = 0;
5290 goto err;
5291 }
5292
5293 d->cached = 1;
5294 d->size = (int)total_len;
5295
5296 if (total_len > size) total_len = size;
5297 memcpy(buf, d->buf, total_len);
5298 rv = total_len;
5299
5300 err:
5301 free(cg);
5302 free(memswlimit_str);
5303 free(memlimit_str);
5304 free(memusage_str);
5305 free(memswusage_str);
5306 return rv;
5307 }
5308 /*
5309 * Find the process pid from cgroup path.
5310 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5311 * @pid_buf : put pid to pid_buf.
5312 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5313 * @depth : the depth of cgroup in container.
5314 * @sum : return the number of pid.
5315 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5316 */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + /0 */
	/* The do/while retry loop is this file's convention for "never fail
	 * on OOM": spin until malloc succeeds.
	 */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd; closedir() will close it. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	/* Recurse into child cgroup directories first, up to @depth levels. */
	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* NOTE(review): both checks use strncmp(..., 1), so any entry
		 * starting with '.' is skipped by the first one and the ".."
		 * check is unreachable — harmless here since cgroup dirs do
		 * not start with '.'.
		 */
		if (strncmp(file->d_name, ".", 1) == 0)
			continue;
		if (strncmp(file->d_name, "..", 1) == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name +/0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	/* Now collect the pids listed in this cgroup's own cgroup.procs. */
	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopen() takes ownership of fd; fclose() will close it. */
	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	/* Append each pid line (still newline-terminated; the caller strips
	 * it) to *pid_buf, growing the array one slot at a time.
	 */
	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(path);
	return sum;
}
5394 /*
5395 * calc_load calculates the load according to the following formula:
5396 * load1 = load0 * exp + active * (1 - exp)
5397 *
5398 * @load1: the new loadavg.
5399 * @load0: the former loadavg.
5400 * @active: the total number of running pid at this moment.
5401 * @exp: the fixed-point defined in the beginning.
5402 */
5403 static unsigned long
5404 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5405 {
5406 unsigned long newload;
5407
5408 active = active > 0 ? active * FIXED_1 : 0;
5409 newload = load * exp + active * (FIXED_1 - exp);
5410 if (active >= load)
5411 newload += FIXED_1 - 1;
5412
5413 return newload / FIXED_1;
5414 }
5415
5416 /*
5417 * Return 0 means that container p->cg is closed.
5418 * Return -1 means that error occurred in refresh.
5419 * Positive num equals the total number of pid.
5420 */
5421 static int refresh_load(struct load_node *p, char *path)
5422 {
5423 FILE *f = NULL;
5424 char **idbuf;
5425 char proc_path[256];
5426 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5427 char *line = NULL;
5428 size_t linelen = 0;
5429 int sum, length;
5430 DIR *dp;
5431 struct dirent *file;
5432
5433 do {
5434 idbuf = malloc(sizeof(char *));
5435 } while (!idbuf);
5436 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5437 /* normal exit */
5438 if (sum == 0)
5439 goto out;
5440
5441 for (i = 0; i < sum; i++) {
5442 /*clean up '\n' */
5443 length = strlen(idbuf[i])-1;
5444 idbuf[i][length] = '\0';
5445 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5446 if (ret < 0 || ret > 255) {
5447 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5448 i = sum;
5449 sum = -1;
5450 goto err_out;
5451 }
5452
5453 dp = opendir(proc_path);
5454 if (!dp) {
5455 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5456 continue;
5457 }
5458 while ((file = readdir(dp)) != NULL) {
5459 if (strncmp(file->d_name, ".", 1) == 0)
5460 continue;
5461 if (strncmp(file->d_name, "..", 1) == 0)
5462 continue;
5463 total_pid++;
5464 /* We make the biggest pid become last_pid.*/
5465 ret = atof(file->d_name);
5466 last_pid = (ret > last_pid) ? ret : last_pid;
5467
5468 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5469 if (ret < 0 || ret > 255) {
5470 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5471 i = sum;
5472 sum = -1;
5473 closedir(dp);
5474 goto err_out;
5475 }
5476 f = fopen(proc_path, "r");
5477 if (f != NULL) {
5478 while (getline(&line, &linelen, f) != -1) {
5479 /* Find State */
5480 if ((line[0] == 'S') && (line[1] == 't'))
5481 break;
5482 }
5483 if ((line[7] == 'R') || (line[7] == 'D'))
5484 run_pid++;
5485 fclose(f);
5486 }
5487 }
5488 closedir(dp);
5489 }
5490 /*Calculate the loadavg.*/
5491 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5492 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5493 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5494 p->run_pid = run_pid;
5495 p->total_pid = total_pid;
5496 p->last_pid = last_pid;
5497
5498 free(line);
5499 err_out:
5500 for (; i > 0; i--)
5501 free(idbuf[i-1]);
5502 out:
5503 free(idbuf);
5504 return sum;
5505 }
5506 /*
5507 * Traverse the hash table and update it.
5508 */
/* Worker thread: every FLUSH_TIME seconds walk the whole load hash table
 * and refresh (or delete) every tracked cgroup node. Exits when
 * loadavg_stop is set by stop_load_daemon().
 */
void *load_begin(void *arg)
{

	char *path = NULL;
	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				length = strlen(f->cg) + 2;
				do {
					/* strlen(f->cg) + '.' or '' + \0 */
					path = malloc(length);
				} while (!path);

				/* cgroup paths are opened relative to cfd, so
				 * a leading '/' needs a '.' prefix.
				 */
				ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					/* Jumps into the else below: keep the
					 * node and just advance to the next.
					 */
					goto out;
				}
				sum = refresh_load(f, path);
				if (sum == 0) {
					/* Container is gone: unlink the node. */
					f = del_node(f, i);
				} else {
out:					f = f->next;
				}
				free(path);
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		/* Sleep for whatever is left of the FLUSH_TIME window after
		 * accounting for the CPU time this pass consumed.
		 */
		time2 = clock();
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5566
/* Read handler for /proc/loadavg: report the per-container load average
 * maintained by the load_begin() worker, keyed by the caller's cpu
 * cgroup. Falls back to the host's /proc/loadavg when the feature is
 * disabled or no cgroup can be determined. Returns bytes copied, 0 on
 * error.
 */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continuation read: serve from the cached buffer. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 0)
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	/* NOTE: locate_node() returns with load_hash[hash].rdlock held; every
	 * path below must release it exactly once.
	 */
	n = locate_node(cg, hash);

	/* First time */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* FIXED_1/200 is the kernel's traditional rounding term for display. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5659 /* Return a positive number on success, return 0 on failure.*/
5660 pthread_t load_daemon(int load_use)
5661 {
5662 int ret;
5663 pthread_t pid;
5664
5665 ret = init_load();
5666 if (ret == -1) {
5667 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5668 return 0;
5669 }
5670 ret = pthread_create(&pid, NULL, load_begin, NULL);
5671 if (ret != 0) {
5672 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5673 load_free();
5674 return 0;
5675 }
5676 /* use loadavg, here loadavg = 1*/
5677 loadavg = load_use;
5678 return pid;
5679 }
5680
5681 /* Returns 0 on success. */
5682 int stop_load_daemon(pthread_t pid)
5683 {
5684 int s;
5685
5686 /* Signal the thread to gracefully stop */
5687 loadavg_stop = 1;
5688
5689 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5690 if (s != 0) {
5691 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5692 return -1;
5693 }
5694
5695 load_free();
5696 loadavg_stop = 0;
5697
5698 return 0;
5699 }
5700
/* Return the total number of bytes readable from @which, or 0 when the
 * file cannot be opened. Needed because procfs files report st_size 0.
 */
static off_t get_procfile_size(const char *which)
{
	off_t total = 0;
	int ch;
	FILE *f = fopen(which, "r");

	if (!f)
		return 0;

	/* Count bytes one at a time until EOF. */
	while ((ch = fgetc(f)) != EOF)
		total++;

	fclose(f);
	return total;
}
5717
5718 int proc_getattr(const char *path, struct stat *sb)
5719 {
5720 struct timespec now;
5721
5722 memset(sb, 0, sizeof(struct stat));
5723 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5724 return -EINVAL;
5725 sb->st_uid = sb->st_gid = 0;
5726 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5727 if (strcmp(path, "/proc") == 0) {
5728 sb->st_mode = S_IFDIR | 00555;
5729 sb->st_nlink = 2;
5730 return 0;
5731 }
5732 if (strcmp(path, "/proc/meminfo") == 0 ||
5733 strcmp(path, "/proc/cpuinfo") == 0 ||
5734 strcmp(path, "/proc/uptime") == 0 ||
5735 strcmp(path, "/proc/stat") == 0 ||
5736 strcmp(path, "/proc/diskstats") == 0 ||
5737 strcmp(path, "/proc/swaps") == 0 ||
5738 strcmp(path, "/proc/loadavg") == 0) {
5739 sb->st_size = 0;
5740 sb->st_mode = S_IFREG | 00444;
5741 sb->st_nlink = 1;
5742 return 0;
5743 }
5744
5745 return -ENOENT;
5746 }
5747
5748 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5749 struct fuse_file_info *fi)
5750 {
5751 if (filler(buf, ".", NULL, 0) != 0 ||
5752 filler(buf, "..", NULL, 0) != 0 ||
5753 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5754 filler(buf, "meminfo", NULL, 0) != 0 ||
5755 filler(buf, "stat", NULL, 0) != 0 ||
5756 filler(buf, "uptime", NULL, 0) != 0 ||
5757 filler(buf, "diskstats", NULL, 0) != 0 ||
5758 filler(buf, "swaps", NULL, 0) != 0 ||
5759 filler(buf, "loadavg", NULL, 0) != 0)
5760 return -EINVAL;
5761 return 0;
5762 }
5763
5764 int proc_open(const char *path, struct fuse_file_info *fi)
5765 {
5766 int type = -1;
5767 struct file_info *info;
5768
5769 if (strcmp(path, "/proc/meminfo") == 0)
5770 type = LXC_TYPE_PROC_MEMINFO;
5771 else if (strcmp(path, "/proc/cpuinfo") == 0)
5772 type = LXC_TYPE_PROC_CPUINFO;
5773 else if (strcmp(path, "/proc/uptime") == 0)
5774 type = LXC_TYPE_PROC_UPTIME;
5775 else if (strcmp(path, "/proc/stat") == 0)
5776 type = LXC_TYPE_PROC_STAT;
5777 else if (strcmp(path, "/proc/diskstats") == 0)
5778 type = LXC_TYPE_PROC_DISKSTATS;
5779 else if (strcmp(path, "/proc/swaps") == 0)
5780 type = LXC_TYPE_PROC_SWAPS;
5781 else if (strcmp(path, "/proc/loadavg") == 0)
5782 type = LXC_TYPE_PROC_LOADAVG;
5783 if (type == -1)
5784 return -ENOENT;
5785
5786 info = malloc(sizeof(*info));
5787 if (!info)
5788 return -ENOMEM;
5789
5790 memset(info, 0, sizeof(*info));
5791 info->type = type;
5792
5793 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5794 do {
5795 info->buf = malloc(info->buflen);
5796 } while (!info->buf);
5797 memset(info->buf, 0, info->buflen);
5798 /* set actual size to buffer size */
5799 info->size = info->buflen;
5800
5801 fi->fh = (unsigned long)info;
5802 return 0;
5803 }
5804
/* FUSE access for /proc: the directory follows the host's permissions;
 * everything below it is strictly read-only.
 */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
5815
/* FUSE release for /proc files: free the per-open file_info state that
 * was allocated by proc_open().
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
5821
5822 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5823 struct fuse_file_info *fi)
5824 {
5825 struct file_info *f = (struct file_info *) fi->fh;
5826
5827 switch (f->type) {
5828 case LXC_TYPE_PROC_MEMINFO:
5829 return proc_meminfo_read(buf, size, offset, fi);
5830 case LXC_TYPE_PROC_CPUINFO:
5831 return proc_cpuinfo_read(buf, size, offset, fi);
5832 case LXC_TYPE_PROC_UPTIME:
5833 return proc_uptime_read(buf, size, offset, fi);
5834 case LXC_TYPE_PROC_STAT:
5835 return proc_stat_read(buf, size, offset, fi);
5836 case LXC_TYPE_PROC_DISKSTATS:
5837 return proc_diskstats_read(buf, size, offset, fi);
5838 case LXC_TYPE_PROC_SWAPS:
5839 return proc_swaps_read(buf, size, offset, fi);
5840 case LXC_TYPE_PROC_LOADAVG:
5841 return proc_loadavg_read(buf, size, offset, fi);
5842 default:
5843 return -EINVAL;
5844 }
5845 }
5846
5847 /*
5848 * Functions needed to setup cgroups in the __constructor__.
5849 */
5850
/* Create @dir and any missing parent directories (like `mkdir -p`),
 * each with @mode. Already-existing components are fine (EEXIST is
 * ignored). Returns false on allocation failure or any other mkdir
 * error.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* Skip consecutive '/', then advance past the next path
		 * component; [orig, dir) is the prefix to create this round.
		 */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		if (mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
5874
5875 static bool umount_if_mounted(void)
5876 {
5877 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5878 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5879 return false;
5880 }
5881 return true;
5882 }
5883
/* __typeof__ should be safe to use with all compilers. */
/* The exact integer type of statfs.f_type differs between architectures,
 * so derive it instead of hard-coding one.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
/* True when the statfs result @fs reports filesystem magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	return (fs->f_type == (fs_type_magic)magic_val);
}
5890
5891 /*
5892 * looking at fs/proc_namespace.c, it appears we can
5893 * actually expect the rootfs entry to very specifically contain
5894 * " - rootfs rootfs "
5895 * IIUC, so long as we've chrooted so that rootfs is not our root,
5896 * the rootfs entry should always be skipped in mountinfo contents.
5897 */
static bool is_on_ramfs(void)
{
	bool found = false;
	char *line = NULL;
	size_t len = 0;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (!found && getline(&line, &len, f) != -1) {
		char *p, *p2;
		int field;

		/* Advance to the 5th space-separated field: the mount point. */
		for (p = line, field = 0; p && field < 4; field++)
			p = strchr(p + 1, ' ');
		if (!p)
			continue;
		p2 = strchr(p + 1, ' ');
		if (!p2)
			continue;
		*p2 = '\0';
		if (strcmp(p + 1, "/") != 0)
			continue;

		/* This is the '/' entry; check whether its fs type is rootfs. */
		p = strchr(p2 + 1, '-');
		if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
			found = true;
	}

	free(line);
	fclose(f);
	return found;
}
5933
5934 static int pivot_enter()
5935 {
5936 int ret = -1, oldroot = -1, newroot = -1;
5937
5938 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5939 if (oldroot < 0) {
5940 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5941 return ret;
5942 }
5943
5944 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5945 if (newroot < 0) {
5946 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5947 goto err;
5948 }
5949
5950 /* change into new root fs */
5951 if (fchdir(newroot) < 0) {
5952 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5953 goto err;
5954 }
5955
5956 /* pivot_root into our new root fs */
5957 if (pivot_root(".", ".") < 0) {
5958 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5959 goto err;
5960 }
5961
5962 /*
5963 * At this point the old-root is mounted on top of our new-root.
5964 * To unmounted it we must not be chdir'd into it, so escape back
5965 * to the old-root.
5966 */
5967 if (fchdir(oldroot) < 0) {
5968 lxcfs_error("%s\n", "Failed to enter old root.");
5969 goto err;
5970 }
5971
5972 if (umount2(".", MNT_DETACH) < 0) {
5973 lxcfs_error("%s\n", "Failed to detach old root.");
5974 goto err;
5975 }
5976
5977 if (fchdir(newroot) < 0) {
5978 lxcfs_error("%s\n", "Failed to re-enter new root.");
5979 goto err;
5980 }
5981
5982 ret = 0;
5983
5984 err:
5985 if (oldroot > 0)
5986 close(oldroot);
5987 if (newroot > 0)
5988 close(newroot);
5989
5990 return ret;
5991 }
5992
5993 static int chroot_enter()
5994 {
5995 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5996 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5997 return -1;
5998 }
5999
6000 if (chroot(".") < 0) {
6001 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6002 return -1;
6003 }
6004
6005 if (chdir("/") < 0) {
6006 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6007 return -1;
6008 }
6009
6010 return 0;
6011 }
6012
6013 static int permute_and_enter(void)
6014 {
6015 struct statfs sb;
6016
6017 if (statfs("/", &sb) < 0) {
6018 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6019 return -1;
6020 }
6021
6022 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6023 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6024 * /proc/1/mountinfo. */
6025 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6026 return chroot_enter();
6027
6028 if (pivot_enter() < 0) {
6029 lxcfs_error("%s\n", "Could not perform pivot root.");
6030 return -1;
6031 }
6032
6033 return 0;
6034 }
6035
6036 /* Prepare our new clean root. */
6037 static int permute_prepare(void)
6038 {
6039 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6040 lxcfs_error("%s\n", "Failed to create directory for new root.");
6041 return -1;
6042 }
6043
6044 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6045 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6046 return -1;
6047 }
6048
6049 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6050 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6051 return -1;
6052 }
6053
6054 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6055 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6056 return -1;
6057 }
6058
6059 return 0;
6060 }
6061
/* Build the new root and switch into it.
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 * Returns true on success, false on error.
 */
static bool permute_root(void)
{
	/* Stage the new root first; only enter it if staging succeeded. */
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
6075
6076 static int preserve_mnt_ns(int pid)
6077 {
6078 int ret;
6079 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6080 char path[len];
6081
6082 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6083 if (ret < 0 || (size_t)ret >= len)
6084 return -1;
6085
6086 return open(path, O_RDONLY | O_CLOEXEC);
6087 }
6088
/* Prepare a private mount namespace holding a tmpfs at BASEDIR, under which
 * cgfs_mount_hierarchies() will mount the individual cgroup hierarchies.
 *
 * The order of operations here is load-bearing: the namespace fd must be
 * preserved only after unshare(), and / must be made MS_PRIVATE before the
 * tmpfs mount so none of this propagates back to the host namespace.
 * Returns true on success, false on error.
 */
static bool cgfs_prepare_mounts(void)
{
	/* Create the staging mountpoint for the private cgroup mounts. */
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	/* Remove any stale mount left over from a previous instance. */
	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Detach into a fresh mount namespace; everything below is private. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep a handle on the new namespace so it outlives this thread. */
	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Stop mount events from propagating back to the parent namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	/* Small private tmpfs to hold the per-controller mountpoints. */
	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6124
6125 static bool cgfs_mount_hierarchies(void)
6126 {
6127 char *target;
6128 size_t clen, len;
6129 int i, ret;
6130
6131 for (i = 0; i < num_hierarchies; i++) {
6132 char *controller = hierarchies[i];
6133
6134 clen = strlen(controller);
6135 len = strlen(BASEDIR) + clen + 2;
6136 target = malloc(len);
6137 if (!target)
6138 return false;
6139
6140 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6141 if (ret < 0 || ret >= len) {
6142 free(target);
6143 return false;
6144 }
6145 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6146 free(target);
6147 return false;
6148 }
6149 if (!strcmp(controller, "unified"))
6150 ret = mount("none", target, "cgroup2", 0, NULL);
6151 else
6152 ret = mount(controller, target, "cgroup", 0, controller);
6153 if (ret < 0) {
6154 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6155 free(target);
6156 return false;
6157 }
6158
6159 fd_hierarchies[i] = open(target, O_DIRECTORY);
6160 if (fd_hierarchies[i] < 0) {
6161 free(target);
6162 return false;
6163 }
6164 free(target);
6165 }
6166 return true;
6167 }
6168
/* Top-level driver for the private cgroup setup: prepare the namespace and
 * tmpfs, mount each hierarchy, then pivot/chroot into the clean root.
 * Returns true on success, false on error.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	/* Final step: enter the staged root. */
	return permute_root();
}
6184
/* Library constructor: runs when liblxcfs is loaded.
 *
 * Parses /proc/self/cgroup to discover the available cgroup hierarchies,
 * records them via store_hierarchy(), then sets up the private per-hierarchy
 * mounts (cgfs_setup_controllers()) inside a dedicated mount namespace and
 * switches back to the initial namespace afterwards. Finally initializes the
 * per-container CPU view. On any failure it logs and bails out, leaving the
 * library partially initialized (callers see empty hierarchies).
 */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	/* Each line has the form "<idx>:<controllers>:<path>". */
	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		/* Split off the hierarchy index before the first ':'. */
		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		/* Drop the cgroup path after the last ':'; p now holds the
		 * controller list only. */
		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	/* One cached directory fd per discovered hierarchy; filled in by
	 * cgfs_mount_hierarchies(). */
	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember the cwd so it can be restored after the pivot/chroot dance
	 * changes it to "/". */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	/* Return to the original mount namespace; the private one stays alive
	 * via cgroup_mount_ns_fd. */
	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6275
6276 static void __attribute__((destructor)) free_subsystems(void)
6277 {
6278 int i;
6279
6280 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6281
6282 for (i = 0; i < num_hierarchies; i++) {
6283 if (hierarchies[i])
6284 free(hierarchies[i]);
6285 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6286 close(fd_hierarchies[i]);
6287 }
6288 free(hierarchies);
6289 free(fd_hierarchies);
6290 free_cpuview();
6291
6292 if (cgroup_mount_ns_fd >= 0)
6293 close(cgroup_mount_ns_fd);
6294 }