bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdbool.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <time.h>
  26 #include <unistd.h>
  27 #include <wait.h>
  28 #include <linux/magic.h>
  29 #include <linux/sched.h>
  30 #include <sys/epoll.h>
  31 #include <sys/mman.h>
  32 #include <sys/mount.h>
  33 #include <sys/param.h>
  34 #include <sys/socket.h>
  35 #include <sys/syscall.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/vfs.h>
  38
  39 #include "bindings.h"
  40 #include "config.h"
  41 #include "memory_utils.h"
  42
  43 /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
  44 #define LXCFS_NUMSTRLEN64 21
  45
  46 /* Define pivot_root() if missing from the C library */
  47 #ifndef HAVE_PIVOT_ROOT
  48 static int pivot_root(const char * new_root, const char * put_old)
  49 {
  50 #ifdef __NR_pivot_root
  51 return syscall(__NR_pivot_root, new_root, put_old);
  52 #else
  53 errno = ENOSYS;
  54 return -1;
  55 #endif
  56 }
  57 #else
  58 extern int pivot_root(const char * new_root, const char * put_old);
  59 #endif
  60
  61 enum {
  62         LXC_TYPE_CGDIR,
  63         LXC_TYPE_CGFILE,
  64         LXC_TYPE_PROC_MEMINFO,
  65         LXC_TYPE_PROC_CPUINFO,
  66         LXC_TYPE_PROC_UPTIME,
  67         LXC_TYPE_PROC_STAT,
  68         LXC_TYPE_PROC_DISKSTATS,
  69         LXC_TYPE_PROC_SWAPS,
  70         LXC_TYPE_PROC_LOADAVG,
  71 };
  72
  73 struct file_info {
  74         char *controller;
  75         char *cgroup;
  76         char *file;
  77         int type;
  78         char *buf;  // unused as of yet
  79         int buflen;
  80         int size; //actual data size
  81         int cached;
  82 };
  83
  84 struct cpuacct_usage {
  85         uint64_t user;
  86         uint64_t system;
  87         uint64_t idle;
  88         bool online;
  89 };
  90
  91 /* The function of hash table.*/
  92 #define LOAD_SIZE 100 /*the size of hash_table */
  93 #define FLUSH_TIME 5  /*the flush rate */
  94 #define DEPTH_DIR 3   /*the depth of per cgroup */
  95 /* The function of calculate loadavg .*/
  96 #define FSHIFT          11              /* nr of bits of precision */
  97 #define FIXED_1         (1<<FSHIFT)     /* 1.0 as fixed-point */
  98 #define EXP_1           1884            /* 1/exp(5sec/1min) as fixed-point */
  99 #define EXP_5           2014            /* 1/exp(5sec/5min) */
 100 #define EXP_15          2037            /* 1/exp(5sec/15min) */
 101 #define LOAD_INT(x) ((x) >> FSHIFT)
 102 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 103 /*
 104  * This parameter is used for proc_loadavg_read().
 105  * 1 means use loadavg, 0 means not use.
 106  */
 107 static int loadavg = 0;
 108 static volatile sig_atomic_t loadavg_stop = 0;
 109 static int calc_hash(const char *name)
 110 {
 111         unsigned int hash = 0;
 112         unsigned int x = 0;
 113         /* ELFHash algorithm. */
 114         while (*name) {
 115                 hash = (hash << 4) + *name++;
 116                 x = hash & 0xf0000000;
 117                 if (x != 0)
 118                         hash ^= (x >> 24);
 119                 hash &= ~x;
 120         }
 121         return (hash & 0x7fffffff);
 122 }
 123
 124 struct load_node {
 125         char *cg;  /*cg */
 126         unsigned long avenrun[3];               /* Load averages */
 127         unsigned int run_pid;
 128         unsigned int total_pid;
 129         unsigned int last_pid;
 130         int cfd; /* The file descriptor of the mounted cgroup */
 131         struct  load_node *next;
 132         struct  load_node **pre;
 133 };
 134
 135 struct load_head {
 136         /*
 137          * The lock is about insert load_node and refresh load_node.To the first
 138          * load_node of each hash bucket, insert and refresh in this hash bucket is
 139          * mutually exclusive.
 140          */
 141         pthread_mutex_t lock;
 142         /*
 143          * The rdlock is about read loadavg and delete load_node.To each hash
 144          * bucket, read and delete is mutually exclusive. But at the same time, we
 145          * allow paratactic read operation. This rdlock is at list level.
 146          */
 147         pthread_rwlock_t rdlock;
 148         /*
 149          * The rilock is about read loadavg and insert load_node.To the first
 150          * load_node of each hash bucket, read and insert is mutually exclusive.
 151          * But at the same time, we allow paratactic read operation.
 152          */
 153         pthread_rwlock_t rilock;
 154         struct load_node *next;
 155 };
 156
 157 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 158 /*
 159  * init_load initialize the hash table.
 160  * Return 0 on success, return -1 on failure.
 161  */
 162 static int init_load(void)
 163 {
 164         int i;
 165         int ret;
 166
 167         for (i = 0; i < LOAD_SIZE; i++) {
 168                 load_hash[i].next = NULL;
 169                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 170                 if (ret != 0) {
 171                         lxcfs_error("%s\n", "Failed to initialize lock");
 172                         goto out3;
 173                 }
 174                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 175                 if (ret != 0) {
 176                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 177                         goto out2;
 178                 }
 179                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 180                 if (ret != 0) {
 181                         lxcfs_error("%s\n", "Failed to initialize rilock");
 182                         goto out1;
 183                 }
 184         }
 185         return 0;
 186 out1:
 187         pthread_rwlock_destroy(&load_hash[i].rdlock);
 188 out2:
 189         pthread_mutex_destroy(&load_hash[i].lock);
 190 out3:
 191         while (i > 0) {
 192                 i--;
 193                 pthread_mutex_destroy(&load_hash[i].lock);
 194                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 195                 pthread_rwlock_destroy(&load_hash[i].rilock);
 196         }
 197         return -1;
 198 }
 199
 200 static void insert_node(struct load_node **n, int locate)
 201 {
 202         struct load_node *f;
 203
 204         pthread_mutex_lock(&load_hash[locate].lock);
 205         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 206         f = load_hash[locate].next;
 207         load_hash[locate].next = *n;
 208
 209         (*n)->pre = &(load_hash[locate].next);
 210         if (f)
 211                 f->pre = &((*n)->next);
 212         (*n)->next = f;
 213         pthread_mutex_unlock(&load_hash[locate].lock);
 214         pthread_rwlock_unlock(&load_hash[locate].rilock);
 215 }
 216 /*
 217  * locate_node() finds special node. Not return NULL means success.
 218  * It should be noted that rdlock isn't unlocked at the end of code
 219  * because this function is used to read special node. Delete is not
 220  * allowed before read has ended.
 221  * unlock rdlock only in proc_loadavg_read().
 222  */
 223 static struct load_node *locate_node(char *cg, int locate)
 224 {
 225         struct load_node *f = NULL;
 226         int i = 0;
 227
 228         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 229         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 230         if (load_hash[locate].next == NULL) {
 231                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 232                 return f;
 233         }
 234         f = load_hash[locate].next;
 235         pthread_rwlock_unlock(&load_hash[locate].rilock);
 236         while (f && ((i = strcmp(f->cg, cg)) != 0))
 237                 f = f->next;
 238         return f;
 239 }
 240 /* Delete the load_node n and return the next node of it. */
 241 static struct load_node *del_node(struct load_node *n, int locate)
 242 {
 243         struct load_node *g;
 244
 245         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 246         if (n->next == NULL) {
 247                 *(n->pre) = NULL;
 248         } else {
 249                 *(n->pre) = n->next;
 250                 n->next->pre = n->pre;
 251         }
 252         g = n->next;
 253         __free_move__(n->cg);
 254         __free_move__(n);
 255         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 256         return g;
 257 }
 258
 259 static void load_free(void)
 260 {
 261         int i;
 262         struct load_node *f, *p;
 263
 264         for (i = 0; i < LOAD_SIZE; i++) {
 265                 pthread_mutex_lock(&load_hash[i].lock);
 266                 pthread_rwlock_wrlock(&load_hash[i].rilock);
 267                 pthread_rwlock_wrlock(&load_hash[i].rdlock);
 268                 if (load_hash[i].next == NULL) {
 269                         pthread_mutex_unlock(&load_hash[i].lock);
 270                         pthread_mutex_destroy(&load_hash[i].lock);
 271                         pthread_rwlock_unlock(&load_hash[i].rilock);
 272                         pthread_rwlock_destroy(&load_hash[i].rilock);
 273                         pthread_rwlock_unlock(&load_hash[i].rdlock);
 274                         pthread_rwlock_destroy(&load_hash[i].rdlock);
 275                         continue;
 276                 }
 277                 for (f = load_hash[i].next; f; ) {
 278                         __free_move__(f->cg);
 279                         p = f->next;
 280                         __free_move__(f);
 281                         f = p;
 282                 }
 283                 pthread_mutex_unlock(&load_hash[i].lock);
 284                 pthread_mutex_destroy(&load_hash[i].lock);
 285                 pthread_rwlock_unlock(&load_hash[i].rilock);
 286                 pthread_rwlock_destroy(&load_hash[i].rilock);
 287                 pthread_rwlock_unlock(&load_hash[i].rdlock);
 288                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 289         }
 290 }
 291
 292 /* Data for CPU view */
 293 struct cg_proc_stat {
 294         char *cg;
 295         struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
 296         struct cpuacct_usage *view; // Usage stats reported to the container
 297         int cpu_count;
 298         pthread_mutex_t lock; // For node manipulation
 299         struct cg_proc_stat *next;
 300 };
 301
 302 struct cg_proc_stat_head {
 303         struct cg_proc_stat *next;
 304         time_t lastcheck;
 305
 306         /*
 307          * For access to the list. Reading can be parallel, pruning is exclusive.
 308          */
 309         pthread_rwlock_t lock;
 310 };
 311
 312 #define CPUVIEW_HASH_SIZE 100
 313 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
 314
 315 static bool cpuview_init_head(struct cg_proc_stat_head **head)
 316 {
 317         *head = malloc(sizeof(struct cg_proc_stat_head));
 318         if (!(*head)) {
 319                 lxcfs_error("%s\n", strerror(errno));
 320                 return false;
 321         }
 322
 323         (*head)->lastcheck = time(NULL);
 324         (*head)->next = NULL;
 325
 326         if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
 327                 lxcfs_error("%s\n", "Failed to initialize list lock");
 328                 __free_move__(*head);
 329                 return false;
 330         }
 331
 332         return true;
 333 }
 334
 335 static bool init_cpuview()
 336 {
 337         int i;
 338
 339         for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
 340                 proc_stat_history[i] = NULL;
 341
 342         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 343                 if (!cpuview_init_head(&proc_stat_history[i]))
 344                         goto err;
 345         }
 346
 347         return true;
 348
 349 err:
 350         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 351                 if (proc_stat_history[i]) {
 352                         __free_move__(proc_stat_history[i]);
 353                 }
 354         }
 355
 356         return false;
 357 }
 358
 359 static void free_proc_stat_node(struct cg_proc_stat *node)
 360 {
 361         pthread_mutex_destroy(&node->lock);
 362         __free_move__(node->cg);
 363         __free_move__(node->usage);
 364         __free_move__(node->view);
 365         __free_move__(node);
 366 }
 367
 368 static void cpuview_free_head(struct cg_proc_stat_head *head)
 369 {
 370         struct cg_proc_stat *node, *tmp;
 371
 372         if (head->next) {
 373                 node = head->next;
 374
 375                 for (;;) {
 376                         tmp = node;
 377                         node = node->next;
 378                         free_proc_stat_node(tmp);
 379
 380                         if (!node)
 381                                 break;
 382                 }
 383         }
 384
 385         pthread_rwlock_destroy(&head->lock);
 386         __free_move__(head);
 387 }
 388
 389 static void free_cpuview()
 390 {
 391         int i;
 392
 393         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 394                 if (proc_stat_history[i])
 395                         cpuview_free_head(proc_stat_history[i]);
 396         }
 397 }
 398
 399 /* Reserve buffer size to account for file size changes. */
 400 #define BUF_RESERVE_SIZE 512
 401
 402 /*
 403  * A table caching which pid is init for a pid namespace.
 404  * When looking up which pid is init for $qpid, we first
 405  * 1. Stat /proc/$qpid/ns/pid.
 406  * 2. Check whether the ino_t is in our store.
 407  *   a. if not, fork a child in qpid's ns to send us
 408  *       ucred.pid = 1, and read the initpid.  Cache
 409  *       initpid and creation time for /proc/initpid
 410  *       in a new store entry.
 411  *   b. if so, verify that /proc/initpid still matches
 412  *       what we have saved.  If not, clear the store
 413  *       entry and go back to a.  If so, return the
 414  *       cached initpid.
 415  */
 416 struct pidns_init_store {
 417         ino_t ino;          // inode number for /proc/$pid/ns/pid
 418         pid_t initpid;      // the pid of nit in that ns
 419         long int ctime;     // the time at which /proc/$initpid was created
 420         struct pidns_init_store *next;
 421         long int lastcheck;
 422 };
 423
 424 /* lol - look at how they are allocated in the kernel */
 425 #define PIDNS_HASH_SIZE 4096
 426 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 427
 428 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 429 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 430 static void lock_mutex(pthread_mutex_t *l)
 431 {
 432         int ret;
 433
 434         if ((ret = pthread_mutex_lock(l)) != 0) {
 435                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 436                 exit(1);
 437         }
 438 }
 439
 440 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 441  * Number of hierarchies mounted. */
 442 static int num_hierarchies;
 443
 444 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 445  * Hierachies mounted {cpuset, blkio, ...}:
 446  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 447 static char **hierarchies;
 448
 449 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 450  * Open file descriptors:
 451  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 452  * private mount namespace.
 453  * Initialized via __constructor__ collect_and_mount_subsystems().
 454  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 455  * mounts and respective files in the private namespace even when located in
 456  * another namespace using the *at() family of functions
 457  * {openat(), fchownat(), ...}. */
 458 static int *fd_hierarchies;
 459 static int cgroup_mount_ns_fd = -1;
 460
 461 static void unlock_mutex(pthread_mutex_t *l)
 462 {
 463         int ret;
 464
 465         if ((ret = pthread_mutex_unlock(l)) != 0) {
 466                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 467                 exit(1);
 468         }
 469 }
 470
 471 static void store_lock(void)
 472 {
 473         lock_mutex(&pidns_store_mutex);
 474 }
 475
 476 static void store_unlock(void)
 477 {
 478         unlock_mutex(&pidns_store_mutex);
 479 }
 480
 481 /* Must be called under store_lock */
 482 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 483 {
 484         struct stat initsb;
 485         char fnam[100];
 486
 487         snprintf(fnam, 100, "/proc/%d", e->initpid);
 488         if (stat(fnam, &initsb) < 0)
 489                 return false;
 490
 491         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 492                     initsb.st_ctime, e->initpid);
 493
 494         if (e->ctime != initsb.st_ctime)
 495                 return false;
 496         return true;
 497 }
 498
 499 /* Must be called under store_lock */
 500 static void remove_initpid(struct pidns_init_store *e)
 501 {
 502         struct pidns_init_store *tmp;
 503         int h;
 504
 505         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 506
 507         h = HASH(e->ino);
 508         if (pidns_hash_table[h] == e) {
 509                 pidns_hash_table[h] = e->next;
 510                 __free_move__(e);
 511                 return;
 512         }
 513
 514         tmp = pidns_hash_table[h];
 515         while (tmp) {
 516                 if (tmp->next == e) {
 517                         tmp->next = e->next;
 518                         __free_move__(e);
 519                         return;
 520                 }
 521                 tmp = tmp->next;
 522         }
 523 }
 524
 525 #define PURGE_SECS 5
 526 /* Must be called under store_lock */
 527 static void prune_initpid_store(void)
 528 {
 529         static long int last_prune = 0;
 530         long int now, threshold;
 531         int i;
 532
 533         if (!last_prune) {
 534                 last_prune = time(NULL);
 535                 return;
 536         }
 537         now = time(NULL);
 538         if (now < last_prune + PURGE_SECS)
 539                 return;
 540
 541         lxcfs_debug("%s\n", "Pruning.");
 542
 543         last_prune = now;
 544         threshold = now - 2 * PURGE_SECS;
 545
 546         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 547                 struct pidns_init_store *e, *prev;
 548
 549                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 550                         __do_free struct pidns_init_store *delme = NULL;
 551
 552                         if (e->lastcheck < threshold) {
 553
 554                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 555
 556                                 delme = e;
 557                                 if (prev)
 558                                         prev->next = e->next;
 559                                 else
 560                                         pidns_hash_table[i] = e->next;
 561                                 e = e->next;
 562                         } else {
 563                                 prev = e;
 564                                 e = e->next;
 565                         }
 566                 }
 567         }
 568 }
 569
 570 /* Must be called under store_lock */
 571 static void save_initpid(struct stat *sb, pid_t pid)
 572 {
 573         struct pidns_init_store *e;
 574         char fpath[100];
 575         struct stat procsb;
 576         int h;
 577
 578         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 579
 580         snprintf(fpath, 100, "/proc/%d", pid);
 581         if (stat(fpath, &procsb) < 0)
 582                 return;
 583         do {
 584                 e = malloc(sizeof(*e));
 585         } while (!e);
 586         e->ino = sb->st_ino;
 587         e->initpid = pid;
 588         e->ctime = procsb.st_ctime;
 589         h = HASH(e->ino);
 590         e->next = pidns_hash_table[h];
 591         e->lastcheck = time(NULL);
 592         pidns_hash_table[h] = e;
 593 }
 594
 595 /*
 596  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 597  * entry for the inode number and creation time.  Verify that the init pid
 598  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 599  * otherwise.
 600  * Must be called under store_lock
 601  */
 602 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 603 {
 604         int h = HASH(sb->st_ino);
 605         struct pidns_init_store *e = pidns_hash_table[h];
 606
 607         while (e) {
 608                 if (e->ino == sb->st_ino) {
 609                         if (initpid_still_valid(e, sb)) {
 610                                 e->lastcheck = time(NULL);
 611                                 return e;
 612                         }
 613                         remove_initpid(e);
 614                         return NULL;
 615                 }
 616                 e = e->next;
 617         }
 618
 619         return NULL;
 620 }
 621
 622 static int is_dir(const char *path, int fd)
 623 {
 624         struct stat statbuf;
 625         int ret = fstatat(fd, path, &statbuf, fd);
 626         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 627                 return 1;
 628         return 0;
 629 }
 630
 631 static char *must_copy_string(const char *str)
 632 {
 633         char *dup = NULL;
 634         if (!str)
 635                 return NULL;
 636         do {
 637                 dup = strdup(str);
 638         } while (!dup);
 639
 640         return dup;
 641 }
 642
 643 static inline void drop_trailing_newlines(char *s)
 644 {
 645         int l;
 646
 647         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 648                 s[l-1] = '\0';
 649 }
 650
 651 #define BATCH_SIZE 50
 652 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 653 {
 654         int newbatches = (newlen / BATCH_SIZE) + 1;
 655         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 656
 657         if (!*mem || newbatches > oldbatches) {
 658                 char *tmp;
 659                 do {
 660                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 661                 } while (!tmp);
 662                 *mem = tmp;
 663         }
 664 }
 665 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 666 {
 667         size_t newlen = *len + linelen;
 668         dorealloc(contents, *len, newlen + 1);
 669         memcpy(*contents + *len, line, linelen+1);
 670         *len = newlen;
 671 }
 672
 673 static char *slurp_file(const char *from, int fd)
 674 {
 675         __do_free char *line = NULL;
 676         __do_fclose FILE *f = NULL;
 677         char *contents = NULL;
 678         size_t len = 0, fulllen = 0;
 679         ssize_t linelen;
 680
 681         f = fdopen(fd, "r");
 682         if (!f)
 683                 return NULL;
 684
 685         while ((linelen = getline(&line, &len, f)) != -1)
 686                 append_line(&contents, &fulllen, line, linelen);
 687
 688         if (contents)
 689                 drop_trailing_newlines(contents);
 690
 691         return contents;
 692 }
 693
 694 static bool write_string(const char *fnam, const char *string, int fd)
 695 {
 696         FILE *f;
 697         size_t len, ret;
 698
 699         f = fdopen(fd, "w");
 700         if (!f)
 701                 return false;
 702
 703         len = strlen(string);
 704         ret = fwrite(string, 1, len, f);
 705         if (ret != len) {
 706                 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
 707                             strerror(errno), string, fnam);
 708                 fclose(f);
 709                 return false;
 710         }
 711
 712         if (fclose(f) < 0) {
 713                 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
 714                 return false;
 715         }
 716
 717         return true;
 718 }
 719
 720 struct cgfs_files {
 721         char *name;
 722         uint32_t uid, gid;
 723         uint32_t mode;
 724 };
 725
 726 #define ALLOC_NUM 20
 727 static bool store_hierarchy(char *stridx, char *h)
 728 {
 729         if (num_hierarchies % ALLOC_NUM == 0) {
 730                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 731                 n *= ALLOC_NUM;
 732                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 733                 if (!tmp) {
 734                         lxcfs_error("%s\n", strerror(errno));
 735                         exit(1);
 736                 }
 737                 hierarchies = tmp;
 738         }
 739
 740         hierarchies[num_hierarchies++] = must_copy_string(h);
 741         return true;
 742 }
 743
 744 static void print_subsystems(void)
 745 {
 746         int i;
 747
 748         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 749         fprintf(stderr, "hierarchies:\n");
 750         for (i = 0; i < num_hierarchies; i++) {
 751                 if (hierarchies[i])
 752                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 753                                 fd_hierarchies[i], hierarchies[i]);
 754         }
 755 }
 756
 757 static bool in_comma_list(const char *needle, const char *haystack)
 758 {
 759         const char *s = haystack, *e;
 760         size_t nlen = strlen(needle);
 761
 762         while (*s && (e = strchr(s, ','))) {
 763                 if (nlen != e - s) {
 764                         s = e + 1;
 765                         continue;
 766                 }
 767                 if (strncmp(needle, s, nlen) == 0)
 768                         return true;
 769                 s = e + 1;
 770         }
 771         if (strcmp(needle, s) == 0)
 772                 return true;
 773         return false;
 774 }
 775
 776 /* do we need to do any massaging here?  I'm not sure... */
 777 /* Return the mounted controller and store the corresponding open file descriptor
 778  * referring to the controller mountpoint in the private lxcfs namespace in
 779  * @cfd.
 780  */
 781 static char *find_mounted_controller(const char *controller, int *cfd)
 782 {
 783         int i;
 784
 785         for (i = 0; i < num_hierarchies; i++) {
 786                 if (!hierarchies[i])
 787                         continue;
 788                 if (strcmp(hierarchies[i], controller) == 0) {
 789                         *cfd = fd_hierarchies[i];
 790                         return hierarchies[i];
 791                 }
 792                 if (in_comma_list(controller, hierarchies[i])) {
 793                         *cfd = fd_hierarchies[i];
 794                         return hierarchies[i];
 795                 }
 796         }
 797
 798         return NULL;
 799 }
 800
 801 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 802                 const char *value)
 803 {
 804         int ret, fd, cfd;
 805         size_t len;
 806         char *fnam, *tmpc;
 807
 808         tmpc = find_mounted_controller(controller, &cfd);
 809         if (!tmpc)
 810                 return false;
 811
 812         /* Make sure we pass a relative path to *at() family of functions.
 813          * . + /cgroup + / + file + \0
 814          */
 815         len = strlen(cgroup) + strlen(file) + 3;
 816         fnam = alloca(len);
 817         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 818         if (ret < 0 || (size_t)ret >= len)
 819                 return false;
 820
 821         fd = openat(cfd, fnam, O_WRONLY);
 822         if (fd < 0)
 823                 return false;
 824
 825         return write_string(fnam, value, fd);
 826 }
 827
 828 // Chown all the files in the cgroup directory.  We do this when we create
 829 // a cgroup on behalf of a user.
 830 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 831 {
 832         struct dirent *direntp;
 833         char path[MAXPATHLEN];
 834         size_t len;
 835         DIR *d;
 836         int fd1, ret;
 837
 838         len = strlen(dirname);
 839         if (len >= MAXPATHLEN) {
 840                 lxcfs_error("Pathname too long: %s\n", dirname);
 841                 return;
 842         }
 843
 844         fd1 = openat(fd, dirname, O_DIRECTORY);
 845         if (fd1 < 0)
 846                 return;
 847
 848         d = fdopendir(fd1);
 849         if (!d) {
 850                 lxcfs_error("Failed to open %s\n", dirname);
 851                 return;
 852         }
 853
 854         while ((direntp = readdir(d))) {
 855                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 856                         continue;
 857                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 858                 if (ret < 0 || ret >= MAXPATHLEN) {
 859                         lxcfs_error("Pathname too long under %s\n", dirname);
 860                         continue;
 861                 }
 862                 if (fchownat(fd, path, uid, gid, 0) < 0)
 863                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 864         }
 865         closedir(d);
 866 }
 867
 868 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 869 {
 870         int cfd;
 871         size_t len;
 872         char *dirnam, *tmpc;
 873
 874         tmpc = find_mounted_controller(controller, &cfd);
 875         if (!tmpc)
 876                 return -EINVAL;
 877
 878         /* Make sure we pass a relative path to *at() family of functions.
 879          * . + /cg + \0
 880          */
 881         len = strlen(cg) + 2;
 882         dirnam = alloca(len);
 883         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 884
 885         if (mkdirat(cfd, dirnam, 0755) < 0)
 886                 return -errno;
 887
 888         if (uid == 0 && gid == 0)
 889                 return 0;
 890
 891         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 892                 return -errno;
 893
 894         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 895
 896         return 0;
 897 }
 898
 899 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 900 {
 901         struct dirent *direntp;
 902         DIR *dir;
 903         bool ret = false;
 904         char pathname[MAXPATHLEN];
 905         int dupfd;
 906
 907         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 908         if (dupfd < 0)
 909                 return false;
 910
 911         dir = fdopendir(dupfd);
 912         if (!dir) {
 913                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 914                 close(dupfd);
 915                 return false;
 916         }
 917
 918         while ((direntp = readdir(dir))) {
 919                 struct stat mystat;
 920                 int rc;
 921
 922                 if (!strcmp(direntp->d_name, ".") ||
 923                     !strcmp(direntp->d_name, ".."))
 924                         continue;
 925
 926                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 927                 if (rc < 0 || rc >= MAXPATHLEN) {
 928                         lxcfs_error("%s\n", "Pathname too long.");
 929                         continue;
 930                 }
 931
 932                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 933                 if (rc) {
 934                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 935                         continue;
 936                 }
 937                 if (S_ISDIR(mystat.st_mode))
 938                         if (!recursive_rmdir(pathname, fd, cfd))
 939                                 lxcfs_debug("Error removing %s.\n", pathname);
 940         }
 941
 942         ret = true;
 943         if (closedir(dir) < 0) {
 944                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 945                 ret = false;
 946         }
 947
 948         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 949                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 950                 ret = false;
 951         }
 952
 953         close(dupfd);
 954
 955         return ret;
 956 }
 957
 958 bool cgfs_remove(const char *controller, const char *cg)
 959 {
 960         int fd, cfd;
 961         size_t len;
 962         char *dirnam, *tmpc;
 963         bool bret;
 964
 965         tmpc = find_mounted_controller(controller, &cfd);
 966         if (!tmpc)
 967                 return false;
 968
 969         /* Make sure we pass a relative path to *at() family of functions.
 970          * . +  /cg + \0
 971          */
 972         len = strlen(cg) + 2;
 973         dirnam = alloca(len);
 974         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 975
 976         fd = openat(cfd, dirnam, O_DIRECTORY);
 977         if (fd < 0)
 978                 return false;
 979
 980         bret = recursive_rmdir(dirnam, fd, cfd);
 981         close(fd);
 982         return bret;
 983 }
 984
 985 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 986 {
 987         int cfd;
 988         size_t len;
 989         char *pathname, *tmpc;
 990
 991         tmpc = find_mounted_controller(controller, &cfd);
 992         if (!tmpc)
 993                 return false;
 994
 995         /* Make sure we pass a relative path to *at() family of functions.
 996          * . + /file + \0
 997          */
 998         len = strlen(file) + 2;
 999         pathname = alloca(len);
1000         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1001         if (fchmodat(cfd, pathname, mode, 0) < 0)
1002                 return false;
1003         return true;
1004 }
1005
1006 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
1007 {
1008         size_t len;
1009         char *fname;
1010
1011         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
1012         fname = alloca(len);
1013         snprintf(fname, len, "%s/tasks", dirname);
1014         if (fchownat(fd, fname, uid, gid, 0) != 0)
1015                 return -errno;
1016         snprintf(fname, len, "%s/cgroup.procs", dirname);
1017         if (fchownat(fd, fname, uid, gid, 0) != 0)
1018                 return -errno;
1019         return 0;
1020 }
1021
1022 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
1023 {
1024         int cfd;
1025         size_t len;
1026         char *pathname, *tmpc;
1027
1028         tmpc = find_mounted_controller(controller, &cfd);
1029         if (!tmpc)
1030                 return -EINVAL;
1031
1032         /* Make sure we pass a relative path to *at() family of functions.
1033          * . + /file + \0
1034          */
1035         len = strlen(file) + 2;
1036         pathname = alloca(len);
1037         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1038         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
1039                 return -errno;
1040
1041         if (is_dir(pathname, cfd))
1042                 // like cgmanager did, we want to chown the tasks file as well
1043                 return chown_tasks_files(pathname, uid, gid, cfd);
1044
1045         return 0;
1046 }
1047
1048 FILE *open_pids_file(const char *controller, const char *cgroup)
1049 {
1050         int fd, cfd;
1051         size_t len;
1052         char *pathname, *tmpc;
1053
1054         tmpc = find_mounted_controller(controller, &cfd);
1055         if (!tmpc)
1056                 return NULL;
1057
1058         /* Make sure we pass a relative path to *at() family of functions.
1059          * . + /cgroup + / "cgroup.procs" + \0
1060          */
1061         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
1062         pathname = alloca(len);
1063         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
1064
1065         fd = openat(cfd, pathname, O_WRONLY);
1066         if (fd < 0)
1067                 return NULL;
1068
1069         return fdopen(fd, "w");
1070 }
1071
1072 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1073                                 void ***list, size_t typesize,
1074                                 void* (*iterator)(const char*, const char*, const char*))
1075 {
1076         int cfd, fd, ret;
1077         size_t len;
1078         char *cg, *tmpc;
1079         char pathname[MAXPATHLEN];
1080         size_t sz = 0, asz = 0;
1081         struct dirent *dirent;
1082         DIR *dir;
1083
1084         tmpc = find_mounted_controller(controller, &cfd);
1085         *list = NULL;
1086         if (!tmpc)
1087                 return false;
1088
1089         /* Make sure we pass a relative path to *at() family of functions. */
1090         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1091         cg = alloca(len);
1092         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1093         if (ret < 0 || (size_t)ret >= len) {
1094                 lxcfs_error("Pathname too long under %s\n", cgroup);
1095                 return false;
1096         }
1097
1098         fd = openat(cfd, cg, O_DIRECTORY);
1099         if (fd < 0)
1100                 return false;
1101
1102         dir = fdopendir(fd);
1103         if (!dir)
1104                 return false;
1105
1106         while ((dirent = readdir(dir))) {
1107                 struct stat mystat;
1108
1109                 if (!strcmp(dirent->d_name, ".") ||
1110                     !strcmp(dirent->d_name, ".."))
1111                         continue;
1112
1113                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1114                 if (ret < 0 || ret >= MAXPATHLEN) {
1115                         lxcfs_error("Pathname too long under %s\n", cg);
1116                         continue;
1117                 }
1118
1119                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1120                 if (ret) {
1121                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1122                         continue;
1123                 }
1124                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1125                     (directories && !S_ISDIR(mystat.st_mode)))
1126                         continue;
1127
1128                 if (sz+2 >= asz) {
1129                         void **tmp;
1130                         asz += BATCH_SIZE;
1131                         do {
1132                                 tmp = realloc(*list, asz * typesize);
1133                         } while  (!tmp);
1134                         *list = tmp;
1135                 }
1136                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1137                 (*list)[sz+1] = NULL;
1138                 sz++;
1139         }
1140         if (closedir(dir) < 0) {
1141                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1142                 return false;
1143         }
1144         return true;
1145 }
1146
1147 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1148 {
1149         char *dup;
1150         do {
1151                 dup = strdup(dir_entry);
1152         } while (!dup);
1153         return dup;
1154 }
1155
1156 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1157 {
1158         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1159 }
1160
1161 void free_key(struct cgfs_files *k)
1162 {
1163         if (!k)
1164                 return;
1165
1166         __free_move__(k->name);
1167         __free_move__(k);
1168 }
1169
1170 void free_keys(struct cgfs_files **keys)
1171 {
1172         int i;
1173
1174         if (!keys)
1175                 return;
1176
1177         for (i = 0; keys[i]; i++)
1178                 free_key(keys[i]);
1179
1180         __free_move__(keys);
1181 }
1182
1183 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
1184 {
1185         int ret, fd, cfd;
1186         size_t len;
1187         char *fnam, *tmpc;
1188
1189         tmpc = find_mounted_controller(controller, &cfd);
1190         if (!tmpc)
1191                 return false;
1192
1193         /* Make sure we pass a relative path to *at() family of functions.
1194          * . + /cgroup + / + file + \0
1195          */
1196         len = strlen(cgroup) + strlen(file) + 3;
1197         fnam = alloca(len);
1198         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1199         if (ret < 0 || (size_t)ret >= len)
1200                 return false;
1201
1202         fd = openat(cfd, fnam, O_RDONLY);
1203         if (fd < 0)
1204                 return false;
1205
1206         *value = slurp_file(fnam, fd);
1207         return *value != NULL;
1208 }
1209
1210 bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1211 {
1212         int ret, cfd;
1213         size_t len;
1214         char *fnam, *tmpc;
1215
1216         tmpc = find_mounted_controller(controller, &cfd);
1217         if (!tmpc)
1218                 return false;
1219
1220         /* Make sure we pass a relative path to *at() family of functions.
1221          * . + /cgroup + / + file + \0
1222          */
1223         len = strlen(cgroup) + strlen(file) + 3;
1224         fnam = alloca(len);
1225         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1226         if (ret < 0 || (size_t)ret >= len)
1227                 return false;
1228
1229         return (faccessat(cfd, fnam, F_OK, 0) == 0);
1230 }
1231
1232 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1233 {
1234         int ret, cfd;
1235         size_t len;
1236         char *fnam, *tmpc;
1237         struct stat sb;
1238         struct cgfs_files *newkey;
1239
1240         tmpc = find_mounted_controller(controller, &cfd);
1241         if (!tmpc)
1242                 return false;
1243
1244         if (file && *file == '/')
1245                 file++;
1246
1247         if (file && strchr(file, '/'))
1248                 return NULL;
1249
1250         /* Make sure we pass a relative path to *at() family of functions.
1251          * . + /cgroup + / + file + \0
1252          */
1253         len = strlen(cgroup) + 3;
1254         if (file)
1255                 len += strlen(file) + 1;
1256         fnam = alloca(len);
1257         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1258                  file ? "/" : "", file ? file : "");
1259
1260         ret = fstatat(cfd, fnam, &sb, 0);
1261         if (ret < 0)
1262                 return NULL;
1263
1264         do {
1265                 newkey = malloc(sizeof(struct cgfs_files));
1266         } while (!newkey);
1267         if (file)
1268                 newkey->name = must_copy_string(file);
1269         else if (strrchr(cgroup, '/'))
1270                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1271         else
1272                 newkey->name = must_copy_string(cgroup);
1273         newkey->uid = sb.st_uid;
1274         newkey->gid = sb.st_gid;
1275         newkey->mode = sb.st_mode;
1276
1277         return newkey;
1278 }
1279
1280 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1281 {
1282         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1283         if (!entry) {
1284                 lxcfs_error("Error getting files under %s:%s\n", controller,
1285                              cgroup);
1286         }
1287         return entry;
1288 }
1289
1290 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1291 {
1292         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1293 }
1294
1295 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1296 {
1297         int cfd;
1298         size_t len;
1299         char *fnam, *tmpc;
1300         int ret;
1301         struct stat sb;
1302
1303         tmpc = find_mounted_controller(controller, &cfd);
1304         if (!tmpc)
1305                 return false;
1306
1307         /* Make sure we pass a relative path to *at() family of functions.
1308          * . + /cgroup + / + f + \0
1309          */
1310         len = strlen(cgroup) + strlen(f) + 3;
1311         fnam = alloca(len);
1312         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
1313         if (ret < 0 || (size_t)ret >= len)
1314                 return false;
1315
1316         ret = fstatat(cfd, fnam, &sb, 0);
1317         if (ret < 0 || !S_ISDIR(sb.st_mode))
1318                 return false;
1319
1320         return true;
1321 }
1322
1323 #define SEND_CREDS_OK 0
1324 #define SEND_CREDS_NOTSK 1
1325 #define SEND_CREDS_FAIL 2
1326 static bool recv_creds(int sock, struct ucred *cred, char *v);
1327 static int wait_for_pid(pid_t pid);
1328 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1329 static int send_creds_clone_wrapper(void *arg);
1330
1331 /*
1332  * clone a task which switches to @task's namespace and writes '1'.
1333  * over a unix sock so we can read the task's reaper's pid in our
1334  * namespace
1335  *
1336  * Note: glibc's fork() does not respect pidns, which can lead to failed
1337  * assertions inside glibc (and thus failed forks) if the child's pid in
1338  * the pidns and the parent pid outside are identical. Using clone prevents
1339  * this issue.
1340  */
1341 static void write_task_init_pid_exit(int sock, pid_t target)
1342 {
1343         char fnam[100];
1344         pid_t pid;
1345         int fd, ret;
1346         size_t stack_size = sysconf(_SC_PAGESIZE);
1347         void *stack = alloca(stack_size);
1348
1349         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1350         if (ret < 0 || ret >= sizeof(fnam))
1351                 _exit(1);
1352
1353         fd = open(fnam, O_RDONLY);
1354         if (fd < 0) {
1355                 perror("write_task_init_pid_exit open of ns/pid");
1356                 _exit(1);
1357         }
1358         if (setns(fd, 0)) {
1359                 perror("write_task_init_pid_exit setns 1");
1360                 close(fd);
1361                 _exit(1);
1362         }
1363         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1364         if (pid < 0)
1365                 _exit(1);
1366         if (pid != 0) {
1367                 if (!wait_for_pid(pid))
1368                         _exit(1);
1369                 _exit(0);
1370         }
1371 }
1372
1373 static int send_creds_clone_wrapper(void *arg) {
1374         struct ucred cred;
1375         char v;
1376         int sock = *(int *)arg;
1377
1378         /* we are the child */
1379         cred.uid = 0;
1380         cred.gid = 0;
1381         cred.pid = 1;
1382         v = '1';
1383         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1384                 return 1;
1385         return 0;
1386 }
1387
1388 static pid_t get_init_pid_for_task(pid_t task)
1389 {
1390         int sock[2];
1391         pid_t pid;
1392         pid_t ret = -1;
1393         char v = '0';
1394         struct ucred cred;
1395
1396         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1397                 perror("socketpair");
1398                 return -1;
1399         }
1400
1401         pid = fork();
1402         if (pid < 0)
1403                 goto out;
1404         if (!pid) {
1405                 close(sock[1]);
1406                 write_task_init_pid_exit(sock[0], task);
1407                 _exit(0);
1408         }
1409
1410         if (!recv_creds(sock[1], &cred, &v))
1411                 goto out;
1412         ret = cred.pid;
1413
1414 out:
1415         close(sock[0]);
1416         close(sock[1]);
1417         if (pid > 0)
1418                 wait_for_pid(pid);
1419         return ret;
1420 }
1421
1422 static pid_t lookup_initpid_in_store(pid_t qpid)
1423 {
1424         pid_t answer = 0;
1425         struct stat sb;
1426         struct pidns_init_store *e;
1427         char fnam[100];
1428
1429         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1430         store_lock();
1431         if (stat(fnam, &sb) < 0)
1432                 goto out;
1433         e = lookup_verify_initpid(&sb);
1434         if (e) {
1435                 answer = e->initpid;
1436                 goto out;
1437         }
1438         answer = get_init_pid_for_task(qpid);
1439         if (answer > 0)
1440                 save_initpid(&sb, answer);
1441
1442 out:
1443         /* we prune at end in case we are returning
1444          * the value we were about to return */
1445         prune_initpid_store();
1446         store_unlock();
1447         return answer;
1448 }
1449
1450 static int wait_for_pid(pid_t pid)
1451 {
1452         int status, ret;
1453
1454         if (pid <= 0)
1455                 return -1;
1456
1457 again:
1458         ret = waitpid(pid, &status, 0);
1459         if (ret == -1) {
1460                 if (errno == EINTR)
1461                         goto again;
1462                 return -1;
1463         }
1464         if (ret != pid)
1465                 goto again;
1466         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1467                 return -1;
1468         return 0;
1469 }
1470
1471
1472 /*
1473  * append pid to *src.
1474  * src: a pointer to a char* in which ot append the pid.
1475  * sz: the number of characters printed so far, minus trailing \0.
1476  * asz: the allocated size so far
1477  * pid: the pid to append
1478  */
1479 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1480 {
1481         char tmp[30];
1482
1483         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1484
1485         if (!*src || tmplen + *sz + 1 >= *asz) {
1486                 char *tmp;
1487                 do {
1488                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1489                 } while (!tmp);
1490                 *src = tmp;
1491                 *asz += BUF_RESERVE_SIZE;
1492         }
1493         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1494         *sz += tmplen;
1495 }
1496
1497 /*
1498  * Given a open file * to /proc/pid/{u,g}id_map, and an id
1499  * valid in the caller's namespace, return the id mapped into
1500  * pid's namespace.
1501  * Returns the mapped id, or -1 on error.
1502  */
1503 unsigned int
1504 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1505 {
1506         unsigned int nsuid,   // base id for a range in the idfile's namespace
1507                      hostuid, // base id for a range in the caller's namespace
1508                      count;   // number of ids in this range
1509         char line[400];
1510         int ret;
1511
1512         fseek(idfile, 0L, SEEK_SET);
1513         while (fgets(line, 400, idfile)) {
1514                 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1515                 if (ret != 3)
1516                         continue;
1517                 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1518                         /*
1519                          * uids wrapped around - unexpected as this is a procfile,
1520                          * so just bail.
1521                          */
1522                         lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1523                                 nsuid, hostuid, count, line);
1524                         return -1;
1525                 }
1526                 if (hostuid <= in_id && hostuid+count > in_id) {
1527                         /*
1528                          * now since hostuid <= in_id < hostuid+count, and
1529                          * hostuid+count and nsuid+count do not wrap around,
1530                          * we know that nsuid+(in_id-hostuid) which must be
1531                          * less that nsuid+(count) must not wrap around
1532                          */
1533                         return (in_id - hostuid) + nsuid;
1534                 }
1535         }
1536
1537         // no answer found
1538         return -1;
1539 }
1540
1541 /*
1542  * for is_privileged_over,
1543  * specify whether we require the calling uid to be root in his
1544  * namespace
1545  */
1546 #define NS_ROOT_REQD true
1547 #define NS_ROOT_OPT false
1548
1549 #define PROCLEN 100
1550
1551 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1552 {
1553         char fpath[PROCLEN];
1554         int ret;
1555         bool answer = false;
1556         uid_t nsuid;
1557
1558         if (victim == -1 || uid == -1)
1559                 return false;
1560
1561         /*
1562          * If the request is one not requiring root in the namespace,
1563          * then having the same uid suffices.  (i.e. uid 1000 has write
1564          * access to files owned by uid 1000
1565          */
1566         if (!req_ns_root && uid == victim)
1567                 return true;
1568
1569         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1570         if (ret < 0 || ret >= PROCLEN)
1571                 return false;
1572         FILE *f = fopen(fpath, "r");
1573         if (!f)
1574                 return false;
1575
1576         /* if caller's not root in his namespace, reject */
1577         nsuid = convert_id_to_ns(f, uid);
1578         if (nsuid)
1579                 goto out;
1580
1581         /*
1582          * If victim is not mapped into caller's ns, reject.
1583          * XXX I'm not sure this check is needed given that fuse
1584          * will be sending requests where the vfs has converted
1585          */
1586         nsuid = convert_id_to_ns(f, victim);
1587         if (nsuid == -1)
1588                 goto out;
1589
1590         answer = true;
1591
1592 out:
1593         fclose(f);
1594         return answer;
1595 }
1596
1597 static bool perms_include(int fmode, mode_t req_mode)
1598 {
1599         mode_t r;
1600
1601         switch (req_mode & O_ACCMODE) {
1602         case O_RDONLY:
1603                 r = S_IROTH;
1604                 break;
1605         case O_WRONLY:
1606                 r = S_IWOTH;
1607                 break;
1608         case O_RDWR:
1609                 r = S_IROTH | S_IWOTH;
1610                 break;
1611         default:
1612                 return false;
1613         }
1614         return ((fmode & r) == r);
1615 }
1616
1617
1618 /*
1619  * taskcg is  a/b/c
1620  * querycg is /a/b/c/d/e
1621  * we return 'd'
1622  */
1623 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1624 {
1625         char *start, *end;
1626
1627         if (strlen(taskcg) <= strlen(querycg)) {
1628                 lxcfs_error("%s\n", "I was fed bad input.");
1629                 return NULL;
1630         }
1631
1632         if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1633                 start =  strdup(taskcg + 1);
1634         else
1635                 start = strdup(taskcg + strlen(querycg) + 1);
1636         if (!start)
1637                 return NULL;
1638         end = strchr(start, '/');
1639         if (end)
1640                 *end = '\0';
1641         return start;
1642 }
1643
1644 static void stripnewline(char *x)
1645 {
1646         size_t l = strlen(x);
1647         if (l && x[l-1] == '\n')
1648                 x[l-1] = '\0';
1649 }
1650
1651 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1652 {
1653         __do_free char *line = NULL;
1654         __do_fclose FILE *f = NULL;
1655         int cfd;
1656         char fnam[PROCLEN];
1657         char *answer = NULL;
1658         size_t len = 0;
1659         int ret;
1660         const char *h = find_mounted_controller(contrl, &cfd);
1661         if (!h)
1662                 return NULL;
1663
1664         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1665         if (ret < 0 || ret >= PROCLEN)
1666                 return NULL;
1667         if (!(f = fopen(fnam, "r")))
1668                 return NULL;
1669
1670         while (getline(&line, &len, f) != -1) {
1671                 char *c1, *c2;
1672
1673                 if (!line[0])
1674                         continue;
1675
1676                 c1 = strchr(line, ':');
1677                 if (!c1)
1678                         return NULL;
1679
1680                 c1++;
1681
1682                 c2 = strchr(c1, ':');
1683                 if (!c2)
1684                         return NULL;
1685
1686                 *c2 = '\0';
1687
1688                 if (strcmp(c1, h) != 0)
1689                         continue;
1690
1691                 c2++;
1692
1693                 stripnewline(c2);
1694
1695                 do {
1696                         answer = strdup(c2);
1697                 } while (!answer);
1698
1699                 break;
1700         }
1701
1702         return answer;
1703 }
1704
1705 /*
1706  * check whether a fuse context may access a cgroup dir or file
1707  *
1708  * If file is not null, it is a cgroup file to check under cg.
1709  * If file is null, then we are checking perms on cg itself.
1710  *
1711  * For files we can check the mode of the list_keys result.
1712  * For cgroups, we must make assumptions based on the files under the
1713  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1714  * yet.
1715  */
1716 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1717 {
1718         struct cgfs_files *k = NULL;
1719         bool ret = false;
1720
1721         k = cgfs_get_key(contrl, cg, file);
1722         if (!k)
1723                 return false;
1724
1725         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1726                 if (perms_include(k->mode >> 6, mode)) {
1727                         ret = true;
1728                         goto out;
1729                 }
1730         }
1731         if (fc->gid == k->gid) {
1732                 if (perms_include(k->mode >> 3, mode)) {
1733                         ret = true;
1734                         goto out;
1735                 }
1736         }
1737         ret = perms_include(k->mode, mode);
1738
1739 out:
1740         free_key(k);
1741         return ret;
1742 }
1743
1744 #define INITSCOPE "/init.scope"
1745 static void prune_init_slice(char *cg)
1746 {
1747         char *point;
1748         size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1749
1750         if (cg_len < initscope_len)
1751                 return;
1752
1753         point = cg + cg_len - initscope_len;
1754         if (strcmp(point, INITSCOPE) == 0) {
1755                 if (point == cg)
1756                         *(point+1) = '\0';
1757                 else
1758                         *point = '\0';
1759         }
1760 }
1761
1762 /*
1763  * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1764  * If pid is in /a, he may act on /a/b, but not on /b.
1765  * if the answer is false and nextcg is not NULL, then *nextcg will point
1766  * to a string containing the next cgroup directory under cg, which must be
1767  * freed by the caller.
1768  */
1769 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1770 {
1771         __do_free char *c2 = NULL;
1772         bool answer = false;
1773         char *linecmp;
1774
1775         c2 = get_pid_cgroup(pid, contrl);
1776         if (!c2)
1777                 return false;
1778         prune_init_slice(c2);
1779
1780         /*
1781          * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1782          * they pass in a cgroup without leading '/'
1783          *
1784          * The original line here was:
1785          *      linecmp = *cg == '/' ? c2 : c2+1;
1786          * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1787          *       Serge, do you know?
1788          */
1789         if (*cg == '/' || !strncmp(cg, "./", 2))
1790                 linecmp = c2;
1791         else
1792                 linecmp = c2 + 1;
1793         if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1794                 if (nextcg)
1795                         *nextcg = get_next_cgroup_dir(linecmp, cg);
1796                 goto out;
1797         }
1798         answer = true;
1799
1800 out:
1801         return answer;
1802 }
1803
1804 /*
1805  * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1806  */
1807 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1808 {
1809         __do_free char *c2 = NULL;
1810         bool answer = false;
1811         char *task_cg;
1812         size_t target_len, task_len;
1813
1814         if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1815                 return true;
1816
1817         c2 = get_pid_cgroup(pid, contrl);
1818         if (!c2)
1819                 return false;
1820         prune_init_slice(c2);
1821
1822         task_cg = c2 + 1;
1823         target_len = strlen(cg);
1824         task_len = strlen(task_cg);
1825         if (task_len == 0) {
1826                 /* Task is in the root cg, it can see everything. This case is
1827                  * not handled by the strmcps below, since they test for the
1828                  * last /, but that is the first / that we've chopped off
1829                  * above.
1830                  */
1831                 answer = true;
1832                 goto out;
1833         }
1834         if (strcmp(cg, task_cg) == 0) {
1835                 answer = true;
1836                 goto out;
1837         }
1838         if (target_len < task_len) {
1839                 /* looking up a parent dir */
1840                 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1841                         answer = true;
1842                 goto out;
1843         }
1844         if (target_len > task_len) {
1845                 /* looking up a child dir */
1846                 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1847                         answer = true;
1848                 goto out;
1849         }
1850
1851 out:
1852         return answer;
1853 }
1854
1855 /*
1856  * given /cgroup/freezer/a/b, return "freezer".
1857  * the returned char* should NOT be freed.
1858  */
1859 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1860 {
1861         const char *p1;
1862         char *contr, *slash;
1863
1864         if (strlen(path) < 9) {
1865                 errno = EACCES;
1866                 return NULL;
1867         }
1868         if (*(path + 7) != '/') {
1869                 errno = EINVAL;
1870                 return NULL;
1871         }
1872         p1 = path + 8;
1873         contr = strdupa(p1);
1874         if (!contr) {
1875                 errno = ENOMEM;
1876                 return NULL;
1877         }
1878         slash = strstr(contr, "/");
1879         if (slash)
1880                 *slash = '\0';
1881
1882         int i;
1883         for (i = 0; i < num_hierarchies; i++) {
1884                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1885                         return hierarchies[i];
1886         }
1887         errno = ENOENT;
1888         return NULL;
1889 }
1890
1891 /*
1892  * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1893  * Note that the returned value may include files (keynames) etc
1894  */
1895 static const char *find_cgroup_in_path(const char *path)
1896 {
1897         const char *p1;
1898
1899         if (strlen(path) < 9) {
1900                 errno = EACCES;
1901                 return NULL;
1902         }
1903         p1 = strstr(path + 8, "/");
1904         if (!p1) {
1905                 errno = EINVAL;
1906                 return NULL;
1907         }
1908         errno = 0;
1909         return p1 + 1;
1910 }
1911
1912 /*
1913  * split the last path element from the path in @cg.
1914  * @dir is newly allocated and should be freed, @last not
1915 */
1916 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1917 {
1918         char *p;
1919
1920         do {
1921                 *dir = strdup(cg);
1922         } while (!*dir);
1923         *last = strrchr(cg, '/');
1924         if (!*last) {
1925                 *last = NULL;
1926                 return;
1927         }
1928         p = strrchr(*dir, '/');
1929         *p = '\0';
1930 }
1931
1932 /*
1933  * FUSE ops for /cgroup
1934  */
1935
1936 int cg_getattr(const char *path, struct stat *sb)
1937 {
1938         __do_free char * cgdir = NULL;
1939         struct timespec now;
1940         struct fuse_context *fc = fuse_get_context();
1941         char *last = NULL, *path1, *path2;
1942         struct cgfs_files *k = NULL;
1943         const char *cgroup;
1944         const char *controller = NULL;
1945         int ret = -ENOENT;
1946
1947
1948         if (!fc)
1949                 return -EIO;
1950
1951         memset(sb, 0, sizeof(struct stat));
1952
1953         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1954                 return -EINVAL;
1955
1956         sb->st_uid = sb->st_gid = 0;
1957         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1958         sb->st_size = 0;
1959
1960         if (strcmp(path, "/cgroup") == 0) {
1961                 sb->st_mode = S_IFDIR | 00755;
1962                 sb->st_nlink = 2;
1963                 return 0;
1964         }
1965
1966         controller = pick_controller_from_path(fc, path);
1967         if (!controller)
1968                 return -errno;
1969         cgroup = find_cgroup_in_path(path);
1970         if (!cgroup) {
1971                 /* this is just /cgroup/controller, return it as a dir */
1972                 sb->st_mode = S_IFDIR | 00755;
1973                 sb->st_nlink = 2;
1974                 return 0;
1975         }
1976
1977         get_cgdir_and_path(cgroup, &cgdir, &last);
1978
1979         if (!last) {
1980                 path1 = "/";
1981                 path2 = cgdir;
1982         } else {
1983                 path1 = cgdir;
1984                 path2 = last;
1985         }
1986
1987         pid_t initpid = lookup_initpid_in_store(fc->pid);
1988         if (initpid <= 0)
1989                 initpid = fc->pid;
1990         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1991          * Then check that caller's cgroup is under path if last is a child
1992          * cgroup, or cgdir if last is a file */
1993
1994         if (is_child_cgroup(controller, path1, path2)) {
1995                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1996                         ret = -ENOENT;
1997                         goto out;
1998                 }
1999                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2000                         /* this is just /cgroup/controller, return it as a dir */
2001                         sb->st_mode = S_IFDIR | 00555;
2002                         sb->st_nlink = 2;
2003                         ret = 0;
2004                         goto out;
2005                 }
2006                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
2007                         ret = -EACCES;
2008                         goto out;
2009                 }
2010
2011                 // get uid, gid, from '/tasks' file and make up a mode
2012                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2013                 sb->st_mode = S_IFDIR | 00755;
2014                 k = cgfs_get_key(controller, cgroup, NULL);
2015                 if (!k) {
2016                         sb->st_uid = sb->st_gid = 0;
2017                 } else {
2018                         sb->st_uid = k->uid;
2019                         sb->st_gid = k->gid;
2020                 }
2021                 free_key(k);
2022                 sb->st_nlink = 2;
2023                 ret = 0;
2024                 goto out;
2025         }
2026
2027         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
2028                 sb->st_mode = S_IFREG | k->mode;
2029                 sb->st_nlink = 1;
2030                 sb->st_uid = k->uid;
2031                 sb->st_gid = k->gid;
2032                 sb->st_size = 0;
2033                 free_key(k);
2034                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2035                         ret = -ENOENT;
2036                         goto out;
2037                 }
2038                 ret = 0;
2039         }
2040
2041 out:
2042         return ret;
2043 }
2044
2045 int cg_opendir(const char *path, struct fuse_file_info *fi)
2046 {
2047         struct fuse_context *fc = fuse_get_context();
2048         const char *cgroup;
2049         struct file_info *dir_info;
2050         char *controller = NULL;
2051
2052         if (!fc)
2053                 return -EIO;
2054
2055         if (strcmp(path, "/cgroup") == 0) {
2056                 cgroup = NULL;
2057                 controller = NULL;
2058         } else {
2059                 // return list of keys for the controller, and list of child cgroups
2060                 controller = pick_controller_from_path(fc, path);
2061                 if (!controller)
2062                         return -errno;
2063
2064                 cgroup = find_cgroup_in_path(path);
2065                 if (!cgroup) {
2066                         /* this is just /cgroup/controller, return its contents */
2067                         cgroup = "/";
2068                 }
2069         }
2070
2071         pid_t initpid = lookup_initpid_in_store(fc->pid);
2072         if (initpid <= 0)
2073                 initpid = fc->pid;
2074         if (cgroup) {
2075                 if (!caller_may_see_dir(initpid, controller, cgroup))
2076                         return -ENOENT;
2077                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2078                         return -EACCES;
2079         }
2080
2081         /* we'll free this at cg_releasedir */
2082         dir_info = malloc(sizeof(*dir_info));
2083         if (!dir_info)
2084                 return -ENOMEM;
2085         dir_info->controller = must_copy_string(controller);
2086         dir_info->cgroup = must_copy_string(cgroup);
2087         dir_info->type = LXC_TYPE_CGDIR;
2088         dir_info->buf = NULL;
2089         dir_info->file = NULL;
2090         dir_info->buflen = 0;
2091
2092         fi->fh = (unsigned long)dir_info;
2093         return 0;
2094 }
2095
2096 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2097                 struct fuse_file_info *fi)
2098 {
2099         __do_free char *nextcg = NULL;
2100         struct file_info *d = (struct file_info *)fi->fh;
2101         struct cgfs_files **list = NULL;
2102         int i, ret;
2103         struct fuse_context *fc = fuse_get_context();
2104         char **clist = NULL;
2105
2106         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
2107                 return -EIO;
2108
2109         if (d->type != LXC_TYPE_CGDIR) {
2110                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
2111                 return -EIO;
2112         }
2113         if (!d->cgroup && !d->controller) {
2114                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
2115                 int i;
2116
2117                 for (i = 0;  i < num_hierarchies; i++) {
2118                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
2119                                 return -EIO;
2120                         }
2121                 }
2122                 return 0;
2123         }
2124
2125         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
2126                 // not a valid cgroup
2127                 ret = -EINVAL;
2128                 goto out;
2129         }
2130
2131         pid_t initpid = lookup_initpid_in_store(fc->pid);
2132         if (initpid <= 0)
2133                 initpid = fc->pid;
2134         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2135                 if (nextcg) {
2136                         ret = filler(buf, nextcg,  NULL, 0);
2137                         if (ret != 0) {
2138                                 ret = -EIO;
2139                                 goto out;
2140                         }
2141                 }
2142                 ret = 0;
2143                 goto out;
2144         }
2145
2146         for (i = 0; list && list[i]; i++) {
2147                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2148                         ret = -EIO;
2149                         goto out;
2150                 }
2151         }
2152
2153         // now get the list of child cgroups
2154
2155         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2156                 ret = 0;
2157                 goto out;
2158         }
2159         if (clist) {
2160                 for (i = 0; clist[i]; i++) {
2161                         if (filler(buf, clist[i], NULL, 0) != 0) {
2162                                 ret = -EIO;
2163                                 goto out;
2164                         }
2165                 }
2166         }
2167         ret = 0;
2168
2169 out:
2170         free_keys(list);
2171         if (clist) {
2172                 for (i = 0; clist[i]; i++)
2173                         __free_move__(clist[i]);
2174                 __free_move__(clist);
2175         }
2176         return ret;
2177 }
2178
2179 static void do_release_file_info(struct fuse_file_info *fi)
2180 {
2181         struct file_info *f = (struct file_info *)fi->fh;
2182
2183         if (!f)
2184                 return;
2185
2186         fi->fh = 0;
2187
2188         __free_move__(f->controller);
2189         __free_move__(f->cgroup);
2190         __free_move__(f->file);
2191         __free_move__(f->buf);
2192         __free_move__(f);
2193 }
2194
2195 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2196 {
2197         do_release_file_info(fi);
2198         return 0;
2199 }
2200
2201 int cg_open(const char *path, struct fuse_file_info *fi)
2202 {
2203         __do_free char *cgdir = NULL;
2204         const char *cgroup;
2205         char *last = NULL, *path1, *path2, *controller;
2206         struct cgfs_files *k = NULL;
2207         struct file_info *file_info;
2208         struct fuse_context *fc = fuse_get_context();
2209         int ret;
2210
2211         if (!fc)
2212                 return -EIO;
2213
2214         controller = pick_controller_from_path(fc, path);
2215         if (!controller)
2216                 return -errno;
2217         cgroup = find_cgroup_in_path(path);
2218         if (!cgroup)
2219                 return -errno;
2220
2221         get_cgdir_and_path(cgroup, &cgdir, &last);
2222         if (!last) {
2223                 path1 = "/";
2224                 path2 = cgdir;
2225         } else {
2226                 path1 = cgdir;
2227                 path2 = last;
2228         }
2229
2230         k = cgfs_get_key(controller, path1, path2);
2231         if (!k) {
2232                 ret = -EINVAL;
2233                 goto out;
2234         }
2235         free_key(k);
2236
2237         pid_t initpid = lookup_initpid_in_store(fc->pid);
2238         if (initpid <= 0)
2239                 initpid = fc->pid;
2240         if (!caller_may_see_dir(initpid, controller, path1)) {
2241                 ret = -ENOENT;
2242                 goto out;
2243         }
2244         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2245                 ret = -EACCES;
2246                 goto out;
2247         }
2248
2249         /* we'll free this at cg_release */
2250         file_info = malloc(sizeof(*file_info));
2251         if (!file_info) {
2252                 ret = -ENOMEM;
2253                 goto out;
2254         }
2255         file_info->controller = must_copy_string(controller);
2256         file_info->cgroup = must_copy_string(path1);
2257         file_info->file = must_copy_string(path2);
2258         file_info->type = LXC_TYPE_CGFILE;
2259         file_info->buf = NULL;
2260         file_info->buflen = 0;
2261
2262         fi->fh = (unsigned long)file_info;
2263         ret = 0;
2264
2265 out:
2266         return ret;
2267 }
2268
2269 int cg_access(const char *path, int mode)
2270 {
2271         __do_free char *cgdir = NULL;
2272         int ret;
2273         const char *cgroup;
2274         char *path1, *path2, *controller;
2275         char *last = NULL;
2276         struct cgfs_files *k = NULL;
2277         struct fuse_context *fc = fuse_get_context();
2278
2279         if (strcmp(path, "/cgroup") == 0)
2280                 return 0;
2281
2282         if (!fc)
2283                 return -EIO;
2284
2285         controller = pick_controller_from_path(fc, path);
2286         if (!controller)
2287                 return -errno;
2288         cgroup = find_cgroup_in_path(path);
2289         if (!cgroup) {
2290                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2291                 if ((mode & W_OK) == 0)
2292                         return 0;
2293                 return -EACCES;
2294         }
2295
2296         get_cgdir_and_path(cgroup, &cgdir, &last);
2297         if (!last) {
2298                 path1 = "/";
2299                 path2 = cgdir;
2300         } else {
2301                 path1 = cgdir;
2302                 path2 = last;
2303         }
2304
2305         k = cgfs_get_key(controller, path1, path2);
2306         if (!k) {
2307                 if ((mode & W_OK) == 0)
2308                         ret = 0;
2309                 else
2310                         ret = -EACCES;
2311                 goto out;
2312         }
2313         free_key(k);
2314
2315         pid_t initpid = lookup_initpid_in_store(fc->pid);
2316         if (initpid <= 0)
2317                 initpid = fc->pid;
2318         if (!caller_may_see_dir(initpid, controller, path1)) {
2319                 ret = -ENOENT;
2320                 goto out;
2321         }
2322         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2323                 ret = -EACCES;
2324                 goto out;
2325         }
2326
2327         ret = 0;
2328
2329 out:
2330         return ret;
2331 }
2332
2333 int cg_release(const char *path, struct fuse_file_info *fi)
2334 {
2335         do_release_file_info(fi);
2336         return 0;
2337 }
2338
2339 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2340
2341 static bool wait_for_sock(int sock, int timeout)
2342 {
2343         struct epoll_event ev;
2344         int epfd, ret, now, starttime, deltatime, saved_errno;
2345
2346         if ((starttime = time(NULL)) < 0)
2347                 return false;
2348
2349         if ((epfd = epoll_create(1)) < 0) {
2350                 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2351                 return false;
2352         }
2353
2354         ev.events = POLLIN_SET;
2355         ev.data.fd = sock;
2356         if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2357                 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2358                 close(epfd);
2359                 return false;
2360         }
2361
2362 again:
2363         if ((now = time(NULL)) < 0) {
2364                 close(epfd);
2365                 return false;
2366         }
2367
2368         deltatime = (starttime + timeout) - now;
2369         if (deltatime < 0) { // timeout
2370                 errno = 0;
2371                 close(epfd);
2372                 return false;
2373         }
2374         ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2375         if (ret < 0 && errno == EINTR)
2376                 goto again;
2377         saved_errno = errno;
2378         close(epfd);
2379
2380         if (ret <= 0) {
2381                 errno = saved_errno;
2382                 return false;
2383         }
2384         return true;
2385 }
2386
2387 static int msgrecv(int sockfd, void *buf, size_t len)
2388 {
2389         if (!wait_for_sock(sockfd, 2))
2390                 return -1;
2391         return recv(sockfd, buf, len, MSG_DONTWAIT);
2392 }
2393
2394 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2395 {
2396         struct msghdr msg = { 0 };
2397         struct iovec iov;
2398         struct cmsghdr *cmsg;
2399         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2400         char buf[1];
2401         buf[0] = 'p';
2402
2403         if (pingfirst) {
2404                 if (msgrecv(sock, buf, 1) != 1) {
2405                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2406                         return SEND_CREDS_FAIL;
2407                 }
2408         }
2409
2410         msg.msg_control = cmsgbuf;
2411         msg.msg_controllen = sizeof(cmsgbuf);
2412
2413         cmsg = CMSG_FIRSTHDR(&msg);
2414         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2415         cmsg->cmsg_level = SOL_SOCKET;
2416         cmsg->cmsg_type = SCM_CREDENTIALS;
2417         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2418
2419         msg.msg_name = NULL;
2420         msg.msg_namelen = 0;
2421
2422         buf[0] = v;
2423         iov.iov_base = buf;
2424         iov.iov_len = sizeof(buf);
2425         msg.msg_iov = &iov;
2426         msg.msg_iovlen = 1;
2427
2428         if (sendmsg(sock, &msg, 0) < 0) {
2429                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2430                 if (errno == 3)
2431                         return SEND_CREDS_NOTSK;
2432                 return SEND_CREDS_FAIL;
2433         }
2434
2435         return SEND_CREDS_OK;
2436 }
2437
2438 static bool recv_creds(int sock, struct ucred *cred, char *v)
2439 {
2440         struct msghdr msg = { 0 };
2441         struct iovec iov;
2442         struct cmsghdr *cmsg;
2443         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2444         char buf[1];
2445         int ret;
2446         int optval = 1;
2447
2448         *v = '1';
2449
2450         cred->pid = -1;
2451         cred->uid = -1;
2452         cred->gid = -1;
2453
2454         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2455                 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2456                 return false;
2457         }
2458         buf[0] = '1';
2459         if (write(sock, buf, 1) != 1) {
2460                 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2461                 return false;
2462         }
2463
2464         msg.msg_name = NULL;
2465         msg.msg_namelen = 0;
2466         msg.msg_control = cmsgbuf;
2467         msg.msg_controllen = sizeof(cmsgbuf);
2468
2469         iov.iov_base = buf;
2470         iov.iov_len = sizeof(buf);
2471         msg.msg_iov = &iov;
2472         msg.msg_iovlen = 1;
2473
2474         if (!wait_for_sock(sock, 2)) {
2475                 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2476                 return false;
2477         }
2478         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2479         if (ret < 0) {
2480                 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2481                 return false;
2482         }
2483
2484         cmsg = CMSG_FIRSTHDR(&msg);
2485
2486         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2487                         cmsg->cmsg_level == SOL_SOCKET &&
2488                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2489                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2490         }
2491         *v = buf[0];
2492
2493         return true;
2494 }
2495
2496 struct pid_ns_clone_args {
2497         int *cpipe;
2498         int sock;
2499         pid_t tpid;
2500         int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2501 };
2502
2503 /*
2504  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2505  * with clone(). This simply writes '1' as ACK back to the parent
2506  * before calling the actual wrapped function.
2507  */
2508 static int pid_ns_clone_wrapper(void *arg) {
2509         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2510         char b = '1';
2511
2512         close(args->cpipe[0]);
2513         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2514                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2515         close(args->cpipe[1]);
2516         return args->wrapped(args->sock, args->tpid);
2517 }
2518
2519 /*
2520  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2521  * int value back over the socket.  This shifts the pid from the
2522  * sender's pidns into tpid's pidns.
2523  */
2524 static int pid_to_ns(int sock, pid_t tpid)
2525 {
2526         char v = '0';
2527         struct ucred cred;
2528
2529         while (recv_creds(sock, &cred, &v)) {
2530                 if (v == '1')
2531                         return 0;
2532                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2533                         return 1;
2534         }
2535         return 0;
2536 }
2537
2538
2539 /*
2540  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2541  * in your old pidns.  Only children which you clone will be in the target
2542  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2543  * actually convert pids.
2544  *
2545  * Note: glibc's fork() does not respect pidns, which can lead to failed
2546  * assertions inside glibc (and thus failed forks) if the child's pid in
2547  * the pidns and the parent pid outside are identical. Using clone prevents
2548  * this issue.
2549  */
2550 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2551 {
2552         int newnsfd = -1, ret, cpipe[2];
2553         char fnam[100];
2554         pid_t cpid;
2555         char v;
2556
2557         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2558         if (ret < 0 || ret >= sizeof(fnam))
2559                 _exit(1);
2560         newnsfd = open(fnam, O_RDONLY);
2561         if (newnsfd < 0)
2562                 _exit(1);
2563         if (setns(newnsfd, 0) < 0)
2564                 _exit(1);
2565         close(newnsfd);
2566
2567         if (pipe(cpipe) < 0)
2568                 _exit(1);
2569
2570         struct pid_ns_clone_args args = {
2571                 .cpipe = cpipe,
2572                 .sock = sock,
2573                 .tpid = tpid,
2574                 .wrapped = &pid_to_ns
2575         };
2576         size_t stack_size = sysconf(_SC_PAGESIZE);
2577         void *stack = alloca(stack_size);
2578
2579         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2580         if (cpid < 0)
2581                 _exit(1);
2582
2583         // give the child 1 second to be done forking and
2584         // write its ack
2585         if (!wait_for_sock(cpipe[0], 1))
2586                 _exit(1);
2587         ret = read(cpipe[0], &v, 1);
2588         if (ret != sizeof(char) || v != '1')
2589                 _exit(1);
2590
2591         if (!wait_for_pid(cpid))
2592                 _exit(1);
2593         _exit(0);
2594 }
2595
2596 /*
2597  * To read cgroup files with a particular pid, we will setns into the child
2598  * pidns, open a pipe, fork a child - which will be the first to really be in
2599  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2600  */
2601 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2602 {
2603         int sock[2] = {-1, -1};
2604         __do_free char *tmpdata = NULL;
2605         int ret;
2606         pid_t qpid, cpid = -1;
2607         bool answer = false;
2608         char v = '0';
2609         struct ucred cred;
2610         size_t sz = 0, asz = 0;
2611
2612         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2613                 return false;
2614
2615         /*
2616          * Now we read the pids from returned data one by one, pass
2617          * them into a child in the target namespace, read back the
2618          * translated pids, and put them into our to-return data
2619          */
2620
2621         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2622                 perror("socketpair");
2623                 return false;
2624         }
2625
2626         cpid = fork();
2627         if (cpid == -1)
2628                 goto out;
2629
2630         if (!cpid) // child - exits when done
2631                 pid_to_ns_wrapper(sock[1], tpid);
2632
2633         char *ptr = tmpdata;
2634         cred.uid = 0;
2635         cred.gid = 0;
2636         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2637                 cred.pid = qpid;
2638                 ret = send_creds(sock[0], &cred, v, true);
2639
2640                 if (ret == SEND_CREDS_NOTSK)
2641                         goto next;
2642                 if (ret == SEND_CREDS_FAIL)
2643                         goto out;
2644
2645                 // read converted results
2646                 if (!wait_for_sock(sock[0], 2)) {
2647                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2648                         goto out;
2649                 }
2650                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2651                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2652                         goto out;
2653                 }
2654                 must_strcat_pid(d, &sz, &asz, qpid);
2655 next:
2656                 ptr = strchr(ptr, '\n');
2657                 if (!ptr)
2658                         break;
2659                 ptr++;
2660         }
2661
2662         cred.pid = getpid();
2663         v = '1';
2664         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2665                 // failed to ask child to exit
2666                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2667                 goto out;
2668         }
2669
2670         answer = true;
2671
2672 out:
2673         if (cpid != -1)
2674                 wait_for_pid(cpid);
2675         if (sock[0] != -1) {
2676                 close(sock[0]);
2677                 close(sock[1]);
2678         }
2679         return answer;
2680 }
2681
2682 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2683                 struct fuse_file_info *fi)
2684 {
2685         __do_free char *data = NULL;
2686         struct fuse_context *fc = fuse_get_context();
2687         struct file_info *f = (struct file_info *)fi->fh;
2688         struct cgfs_files *k = NULL;
2689         int ret, s;
2690         bool r;
2691
2692         if (f->type != LXC_TYPE_CGFILE) {
2693                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2694                 return -EIO;
2695         }
2696
2697         if (offset)
2698                 return 0;
2699
2700         if (!fc)
2701                 return -EIO;
2702
2703         if (!f->controller)
2704                 return -EINVAL;
2705
2706         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2707                 return -EINVAL;
2708         }
2709         free_key(k);
2710
2711
2712         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2713                 ret = -EACCES;
2714                 goto out;
2715         }
2716
2717         if (strcmp(f->file, "tasks") == 0 ||
2718                         strcmp(f->file, "/tasks") == 0 ||
2719                         strcmp(f->file, "/cgroup.procs") == 0 ||
2720                         strcmp(f->file, "cgroup.procs") == 0)
2721                 // special case - we have to translate the pids
2722                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2723         else
2724                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2725
2726         if (!r) {
2727                 ret = -EINVAL;
2728                 goto out;
2729         }
2730
2731         if (!data) {
2732                 ret = 0;
2733                 goto out;
2734         }
2735         s = strlen(data);
2736         if (s > size)
2737                 s = size;
2738         memcpy(buf, data, s);
2739         if (s > 0 && s < size && data[s-1] != '\n')
2740                 buf[s++] = '\n';
2741
2742         ret = s;
2743
2744 out:
2745         return ret;
2746 }
2747
2748 static int pid_from_ns(int sock, pid_t tpid)
2749 {
2750         pid_t vpid;
2751         struct ucred cred;
2752         char v;
2753         int ret;
2754
2755         cred.uid = 0;
2756         cred.gid = 0;
2757         while (1) {
2758                 if (!wait_for_sock(sock, 2)) {
2759                         lxcfs_error("%s\n", "Timeout reading from parent.");
2760                         return 1;
2761                 }
2762                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2763                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2764                         return 1;
2765                 }
2766                 if (vpid == -1) // done
2767                         break;
2768                 v = '0';
2769                 cred.pid = vpid;
2770                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2771                         v = '1';
2772                         cred.pid = getpid();
2773                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2774                                 return 1;
2775                 }
2776         }
2777         return 0;
2778 }
2779
2780 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2781 {
2782         int newnsfd = -1, ret, cpipe[2];
2783         char fnam[100];
2784         pid_t cpid;
2785         char v;
2786
2787         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2788         if (ret < 0 || ret >= sizeof(fnam))
2789                 _exit(1);
2790         newnsfd = open(fnam, O_RDONLY);
2791         if (newnsfd < 0)
2792                 _exit(1);
2793         if (setns(newnsfd, 0) < 0)
2794                 _exit(1);
2795         close(newnsfd);
2796
2797         if (pipe(cpipe) < 0)
2798                 _exit(1);
2799
2800         struct pid_ns_clone_args args = {
2801                 .cpipe = cpipe,
2802                 .sock = sock,
2803                 .tpid = tpid,
2804                 .wrapped = &pid_from_ns
2805         };
2806         size_t stack_size = sysconf(_SC_PAGESIZE);
2807         void *stack = alloca(stack_size);
2808
2809         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2810         if (cpid < 0)
2811                 _exit(1);
2812
2813         // give the child 1 second to be done forking and
2814         // write its ack
2815         if (!wait_for_sock(cpipe[0], 1))
2816                 _exit(1);
2817         ret = read(cpipe[0], &v, 1);
2818         if (ret != sizeof(char) || v != '1')
2819                 _exit(1);
2820
2821         if (!wait_for_pid(cpid))
2822                 _exit(1);
2823         _exit(0);
2824 }
2825
2826 /*
2827  * Given host @uid, return the uid to which it maps in
2828  * @pid's user namespace, or -1 if none.
2829  */
2830 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2831 {
2832         FILE *f;
2833         char line[400];
2834
2835         sprintf(line, "/proc/%d/uid_map", pid);
2836         if ((f = fopen(line, "r")) == NULL) {
2837                 return false;
2838         }
2839
2840         *answer = convert_id_to_ns(f, uid);
2841         fclose(f);
2842
2843         if (*answer == -1)
2844                 return false;
2845         return true;
2846 }
2847
2848 /*
2849  * get_pid_creds: get the real uid and gid of @pid from
2850  * /proc/$$/status
2851  * (XXX should we use euid here?)
2852  */
2853 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2854 {
2855         char line[400];
2856         uid_t u;
2857         gid_t g;
2858         FILE *f;
2859
2860         *uid = -1;
2861         *gid = -1;
2862         sprintf(line, "/proc/%d/status", pid);
2863         if ((f = fopen(line, "r")) == NULL) {
2864                 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2865                 return;
2866         }
2867         while (fgets(line, 400, f)) {
2868                 if (strncmp(line, "Uid:", 4) == 0) {
2869                         if (sscanf(line+4, "%u", &u) != 1) {
2870                                 lxcfs_error("bad uid line for pid %u\n", pid);
2871                                 fclose(f);
2872                                 return;
2873                         }
2874                         *uid = u;
2875                 } else if (strncmp(line, "Gid:", 4) == 0) {
2876                         if (sscanf(line+4, "%u", &g) != 1) {
2877                                 lxcfs_error("bad gid line for pid %u\n", pid);
2878                                 fclose(f);
2879                                 return;
2880                         }
2881                         *gid = g;
2882                 }
2883         }
2884         fclose(f);
2885 }
2886
2887 /*
2888  * May the requestor @r move victim @v to a new cgroup?
2889  * This is allowed if
2890  *   . they are the same task
2891  *   . they are ownedy by the same uid
2892  *   . @r is root on the host, or
2893  *   . @v's uid is mapped into @r's where @r is root.
2894  */
2895 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2896 {
2897         uid_t v_uid, tmpuid;
2898         gid_t v_gid;
2899
2900         if (r == v)
2901                 return true;
2902         if (r_uid == 0)
2903                 return true;
2904         get_pid_creds(v, &v_uid, &v_gid);
2905         if (r_uid == v_uid)
2906                 return true;
2907         if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2908                         && hostuid_to_ns(v_uid, r, &tmpuid))
2909                 return true;
2910         return false;
2911 }
2912
2913 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2914                 const char *file, const char *buf)
2915 {
2916         int sock[2] = {-1, -1};
2917         pid_t qpid, cpid = -1;
2918         FILE *pids_file = NULL;
2919         bool answer = false, fail = false;
2920
2921         pids_file = open_pids_file(contrl, cg);
2922         if (!pids_file)
2923                 return false;
2924
2925         /*
2926          * write the pids to a socket, have helper in writer's pidns
2927          * call movepid for us
2928          */
2929         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2930                 perror("socketpair");
2931                 goto out;
2932         }
2933
2934         cpid = fork();
2935         if (cpid == -1)
2936                 goto out;
2937
2938         if (!cpid) { // child
2939                 fclose(pids_file);
2940                 pid_from_ns_wrapper(sock[1], tpid);
2941         }
2942
2943         const char *ptr = buf;
2944         while (sscanf(ptr, "%d", &qpid) == 1) {
2945                 struct ucred cred;
2946                 char v;
2947
2948                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2949                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2950                         goto out;
2951                 }
2952
2953                 if (recv_creds(sock[0], &cred, &v)) {
2954                         if (v == '0') {
2955                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2956                                         fail = true;
2957                                         break;
2958                                 }
2959                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2960                                         fail = true;
2961                         }
2962                 }
2963
2964                 ptr = strchr(ptr, '\n');
2965                 if (!ptr)
2966                         break;
2967                 ptr++;
2968         }
2969
2970         /* All good, write the value */
2971         qpid = -1;
2972         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2973                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2974
2975         if (!fail)
2976                 answer = true;
2977
2978 out:
2979         if (cpid != -1)
2980                 wait_for_pid(cpid);
2981         if (sock[0] != -1) {
2982                 close(sock[0]);
2983                 close(sock[1]);
2984         }
2985         if (pids_file) {
2986                 if (fclose(pids_file) != 0)
2987                         answer = false;
2988         }
2989         return answer;
2990 }
2991
2992 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2993              struct fuse_file_info *fi)
2994 {
2995         struct fuse_context *fc = fuse_get_context();
2996         char *localbuf = NULL;
2997         struct cgfs_files *k = NULL;
2998         struct file_info *f = (struct file_info *)fi->fh;
2999         bool r;
3000
3001         if (f->type != LXC_TYPE_CGFILE) {
3002                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3003                 return -EIO;
3004         }
3005
3006         if (offset)
3007                 return 0;
3008
3009         if (!fc)
3010                 return -EIO;
3011
3012         localbuf = alloca(size+1);
3013         localbuf[size] = '\0';
3014         memcpy(localbuf, buf, size);
3015
3016         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3017                 size = -EINVAL;
3018                 goto out;
3019         }
3020
3021         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3022                 size = -EACCES;
3023                 goto out;
3024         }
3025
3026         if (strcmp(f->file, "tasks") == 0 ||
3027                         strcmp(f->file, "/tasks") == 0 ||
3028                         strcmp(f->file, "/cgroup.procs") == 0 ||
3029                         strcmp(f->file, "cgroup.procs") == 0)
3030                 // special case - we have to translate the pids
3031                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3032         else
3033                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3034
3035         if (!r)
3036                 size = -EINVAL;
3037
3038 out:
3039         free_key(k);
3040         return size;
3041 }
3042
3043 int cg_chown(const char *path, uid_t uid, gid_t gid)
3044 {
3045         __do_free char *cgdir = NULL;
3046         struct fuse_context *fc = fuse_get_context();
3047         char *last = NULL, *path1, *path2, *controller;
3048         struct cgfs_files *k = NULL;
3049         const char *cgroup;
3050         int ret;
3051
3052         if (!fc)
3053                 return -EIO;
3054
3055         if (strcmp(path, "/cgroup") == 0)
3056                 return -EPERM;
3057
3058         controller = pick_controller_from_path(fc, path);
3059         if (!controller)
3060                 return errno == ENOENT ? -EPERM : -errno;
3061
3062         cgroup = find_cgroup_in_path(path);
3063         if (!cgroup)
3064                 /* this is just /cgroup/controller */
3065                 return -EPERM;
3066
3067         get_cgdir_and_path(cgroup, &cgdir, &last);
3068
3069         if (!last) {
3070                 path1 = "/";
3071                 path2 = cgdir;
3072         } else {
3073                 path1 = cgdir;
3074                 path2 = last;
3075         }
3076
3077         if (is_child_cgroup(controller, path1, path2)) {
3078                 // get uid, gid, from '/tasks' file and make up a mode
3079                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3080                 k = cgfs_get_key(controller, cgroup, "tasks");
3081
3082         } else
3083                 k = cgfs_get_key(controller, path1, path2);
3084
3085         if (!k) {
3086                 ret = -EINVAL;
3087                 goto out;
3088         }
3089
3090         /*
3091          * This being a fuse request, the uid and gid must be valid
3092          * in the caller's namespace.  So we can just check to make
3093          * sure that the caller is root in his uid, and privileged
3094          * over the file's current owner.
3095          */
3096         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3097                 ret = -EACCES;
3098                 goto out;
3099         }
3100
3101         ret = cgfs_chown_file(controller, cgroup, uid, gid);
3102
3103 out:
3104         free_key(k);
3105
3106         return ret;
3107 }
3108
3109 int cg_chmod(const char *path, mode_t mode)
3110 {
3111         __do_free char *cgdir = NULL;
3112         struct fuse_context *fc = fuse_get_context();
3113         char *last = NULL, *path1, *path2, *controller;
3114         struct cgfs_files *k = NULL;
3115         const char *cgroup;
3116         int ret;
3117
3118         if (!fc)
3119                 return -EIO;
3120
3121         if (strcmp(path, "/cgroup") == 0)
3122                 return -EPERM;
3123
3124         controller = pick_controller_from_path(fc, path);
3125         if (!controller)
3126                 return errno == ENOENT ? -EPERM : -errno;
3127
3128         cgroup = find_cgroup_in_path(path);
3129         if (!cgroup)
3130                 /* this is just /cgroup/controller */
3131                 return -EPERM;
3132
3133         get_cgdir_and_path(cgroup, &cgdir, &last);
3134
3135         if (!last) {
3136                 path1 = "/";
3137                 path2 = cgdir;
3138         } else {
3139                 path1 = cgdir;
3140                 path2 = last;
3141         }
3142
3143         if (is_child_cgroup(controller, path1, path2)) {
3144                 // get uid, gid, from '/tasks' file and make up a mode
3145                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3146                 k = cgfs_get_key(controller, cgroup, "tasks");
3147
3148         } else
3149                 k = cgfs_get_key(controller, path1, path2);
3150
3151         if (!k) {
3152                 ret = -EINVAL;
3153                 goto out;
3154         }
3155
3156         /*
3157          * This being a fuse request, the uid and gid must be valid
3158          * in the caller's namespace.  So we can just check to make
3159          * sure that the caller is root in his uid, and privileged
3160          * over the file's current owner.
3161          */
3162         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3163                 ret = -EPERM;
3164                 goto out;
3165         }
3166
3167         if (!cgfs_chmod_file(controller, cgroup, mode)) {
3168                 ret = -EINVAL;
3169                 goto out;
3170         }
3171
3172         ret = 0;
3173 out:
3174         free_key(k);
3175         return ret;
3176 }
3177
3178 int cg_mkdir(const char *path, mode_t mode)
3179 {
3180         __do_free char *cgdir = NULL, *next = NULL;
3181         struct fuse_context *fc = fuse_get_context();
3182         char *last = NULL, *path1, *controller;
3183         const char *cgroup;
3184         int ret;
3185
3186         if (!fc)
3187                 return -EIO;
3188
3189         controller = pick_controller_from_path(fc, path);
3190         if (!controller)
3191                 return errno == ENOENT ? -EPERM : -errno;
3192
3193         cgroup = find_cgroup_in_path(path);
3194         if (!cgroup)
3195                 return -errno;
3196
3197         get_cgdir_and_path(cgroup, &cgdir, &last);
3198         if (!last)
3199                 path1 = "/";
3200         else
3201                 path1 = cgdir;
3202
3203         pid_t initpid = lookup_initpid_in_store(fc->pid);
3204         if (initpid <= 0)
3205                 initpid = fc->pid;
3206         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3207                 if (!next)
3208                         ret = -EINVAL;
3209                 else if (last && strcmp(next, last) == 0)
3210                         ret = -EEXIST;
3211                 else
3212                         ret = -EPERM;
3213                 goto out;
3214         }
3215
3216         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3217                 ret = -EACCES;
3218                 goto out;
3219         }
3220         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3221                 ret = -EACCES;
3222                 goto out;
3223         }
3224
3225         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3226
3227 out:
3228         return ret;
3229 }
3230
3231 int cg_rmdir(const char *path)
3232 {
3233         __do_free char *cgdir = NULL, *next = NULL;
3234         struct fuse_context *fc = fuse_get_context();
3235         char *last = NULL, *controller;
3236         const char *cgroup;
3237         int ret;
3238
3239         if (!fc)
3240                 return -EIO;
3241
3242         controller = pick_controller_from_path(fc, path);
3243         if (!controller) /* Someone's trying to delete "/cgroup". */
3244                 return -EPERM;
3245
3246         cgroup = find_cgroup_in_path(path);
3247         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3248                 return -EPERM;
3249
3250         get_cgdir_and_path(cgroup, &cgdir, &last);
3251         if (!last) {
3252                 /* Someone's trying to delete a cgroup on the same level as the
3253                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3254                  * rmdir "/cgroup/blkio/init.slice".
3255                  */
3256                 ret = -EPERM;
3257                 goto out;
3258         }
3259
3260         pid_t initpid = lookup_initpid_in_store(fc->pid);
3261         if (initpid <= 0)
3262                 initpid = fc->pid;
3263         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3264                 if (!last || (next && (strcmp(next, last) == 0)))
3265                         ret = -EBUSY;
3266                 else
3267                         ret = -ENOENT;
3268                 goto out;
3269         }
3270
3271         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3272                 ret = -EACCES;
3273                 goto out;
3274         }
3275         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3276                 ret = -EACCES;
3277                 goto out;
3278         }
3279
3280         if (!cgfs_remove(controller, cgroup)) {
3281                 ret = -EINVAL;
3282                 goto out;
3283         }
3284
3285         ret = 0;
3286
3287 out:
3288         return ret;
3289 }
3290
3291 static bool startswith(const char *line, const char *pref)
3292 {
3293         if (strncmp(line, pref, strlen(pref)) == 0)
3294                 return true;
3295         return false;
3296 }
3297
3298 static void parse_memstat(char *memstat, unsigned long *cached,
3299                 unsigned long *active_anon, unsigned long *inactive_anon,
3300                 unsigned long *active_file, unsigned long *inactive_file,
3301                 unsigned long *unevictable, unsigned long *shmem)
3302 {
3303         char *eol;
3304
3305         while (*memstat) {
3306                 if (startswith(memstat, "total_cache")) {
3307                         sscanf(memstat + 11, "%lu", cached);
3308                         *cached /= 1024;
3309                 } else if (startswith(memstat, "total_active_anon")) {
3310                         sscanf(memstat + 17, "%lu", active_anon);
3311                         *active_anon /= 1024;
3312                 } else if (startswith(memstat, "total_inactive_anon")) {
3313                         sscanf(memstat + 19, "%lu", inactive_anon);
3314                         *inactive_anon /= 1024;
3315                 } else if (startswith(memstat, "total_active_file")) {
3316                         sscanf(memstat + 17, "%lu", active_file);
3317                         *active_file /= 1024;
3318                 } else if (startswith(memstat, "total_inactive_file")) {
3319                         sscanf(memstat + 19, "%lu", inactive_file);
3320                         *inactive_file /= 1024;
3321                 } else if (startswith(memstat, "total_unevictable")) {
3322                         sscanf(memstat + 17, "%lu", unevictable);
3323                         *unevictable /= 1024;
3324                 } else if (startswith(memstat, "total_shmem")) {
3325                         sscanf(memstat + 11, "%lu", shmem);
3326                         *shmem /= 1024;
3327                 }
3328                 eol = strchr(memstat, '\n');
3329                 if (!eol)
3330                         return;
3331                 memstat = eol+1;
3332         }
3333 }
3334
3335 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3336 {
3337         char *eol;
3338         char key[32];
3339
3340         memset(key, 0, 32);
3341         snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3342
3343         size_t len = strlen(key);
3344         *v = 0;
3345
3346         while (*str) {
3347                 if (startswith(str, key)) {
3348                         sscanf(str + len, "%lu", v);
3349                         return;
3350                 }
3351                 eol = strchr(str, '\n');
3352                 if (!eol)
3353                         return;
3354                 str = eol+1;
3355         }
3356 }
3357
3358 static int read_file(const char *path, char *buf, size_t size,
3359                      struct file_info *d)
3360 {
3361         __do_free char *line = NULL;
3362         __do_fclose FILE *f = NULL;
3363         size_t linelen = 0, total_len = 0;
3364         char *cache = d->buf;
3365         size_t cache_size = d->buflen;
3366
3367         f = fopen(path, "r");
3368         if (!f)
3369                 return 0;
3370
3371         while (getline(&line, &linelen, f) != -1) {
3372                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3373                 if (l < 0) {
3374                         perror("Error writing to cache");
3375                         return 0;
3376                 }
3377                 if (l >= cache_size) {
3378                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3379                         return 0;
3380                 }
3381                 cache += l;
3382                 cache_size -= l;
3383                 total_len += l;
3384         }
3385
3386         d->size = total_len;
3387         if (total_len > size)
3388                 total_len = size;
3389
3390         /* read from off 0 */
3391         memcpy(buf, d->buf, total_len);
3392         return total_len;
3393 }
3394
3395 /*
3396  * FUSE ops for /proc
3397  */
3398
3399 static unsigned long get_memlimit(const char *cgroup, const char *file)
3400 {
3401         __do_free char *memlimit_str = NULL;
3402         unsigned long memlimit = -1;
3403
3404         if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3405                 memlimit = strtoul(memlimit_str, NULL, 10);
3406
3407         return memlimit;
3408 }
3409
3410 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3411 {
3412         char *copy = strdupa(cgroup);
3413         unsigned long memlimit = 0, retlimit;
3414
3415         retlimit = get_memlimit(copy, file);
3416
3417         while (strcmp(copy, "/") != 0) {
3418                 copy = dirname(copy);
3419                 memlimit = get_memlimit(copy, file);
3420                 if (memlimit != -1 && memlimit < retlimit)
3421                         retlimit = memlimit;
3422         };
3423
3424         return retlimit;
3425 }
3426
3427 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3428                 struct fuse_file_info *fi)
3429 {
3430         __do_free char *cg = NULL, *line = NULL, *memusage_str = NULL,
3431                        *memstat_str = NULL, *memswlimit_str = NULL,
3432                        *memswusage_str = NULL;
3433         __do_fclose FILE *f = NULL;
3434         struct fuse_context *fc = fuse_get_context();
3435         struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
3436         struct file_info *d = (struct file_info *)fi->fh;
3437         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3438                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3439                 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3440                 hostswtotal = 0;
3441         size_t linelen = 0, total_len = 0, rv = 0;
3442         char *cache = d->buf;
3443         size_t cache_size = d->buflen;
3444
3445         if (offset){
3446                 if (offset > d->size)
3447                         return -EINVAL;
3448                 if (!d->cached)
3449                         return 0;
3450                 int left = d->size - offset;
3451                 total_len = left > size ? size: left;
3452                 memcpy(buf, cache + offset, total_len);
3453                 return total_len;
3454         }
3455
3456         pid_t initpid = lookup_initpid_in_store(fc->pid);
3457         if (initpid <= 0)
3458                 initpid = fc->pid;
3459         cg = get_pid_cgroup(initpid, "memory");
3460         if (!cg)
3461                 return read_file("/proc/meminfo", buf, size, d);
3462         prune_init_slice(cg);
3463
3464         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3465         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3466                 goto err;
3467         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3468                 goto err;
3469
3470         // Following values are allowed to fail, because swapaccount might be turned
3471         // off for current kernel
3472         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3473                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3474         {
3475                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3476                 memswusage = strtoul(memswusage_str, NULL, 10);
3477
3478                 memswlimit = memswlimit / 1024;
3479                 memswusage = memswusage / 1024;
3480         }
3481
3482         memusage = strtoul(memusage_str, NULL, 10);
3483         memlimit /= 1024;
3484         memusage /= 1024;
3485
3486         parse_memstat(memstat_str, &cached, &active_anon,
3487                         &inactive_anon, &active_file, &inactive_file,
3488                         &unevictable, &shmem);
3489
3490         f = fopen("/proc/meminfo", "r");
3491         if (!f)
3492                 goto err;
3493
3494         while (getline(&line, &linelen, f) != -1) {
3495                 ssize_t l;
3496                 char *printme, lbuf[100];
3497
3498                 memset(lbuf, 0, 100);
3499                 if (startswith(line, "MemTotal:")) {
3500                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3501                         if (hosttotal < memlimit)
3502                                 memlimit = hosttotal;
3503                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3504                         printme = lbuf;
3505                 } else if (startswith(line, "MemFree:")) {
3506                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3507                         printme = lbuf;
3508                 } else if (startswith(line, "MemAvailable:")) {
3509                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3510                         printme = lbuf;
3511                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts->swap_off == false) {
3512                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3513                         if (hostswtotal < memswlimit)
3514                                 memswlimit = hostswtotal;
3515                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3516                         printme = lbuf;
3517                 } else if (startswith(line, "SwapTotal:") && opts->swap_off == true) {
3518                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", 0UL);
3519                         printme = lbuf;
3520                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts->swap_off == false) {
3521                         unsigned long swaptotal = memswlimit,
3522                                         swapusage = memswusage - memusage,
3523                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3524                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3525                         printme = lbuf;
3526                 } else if (startswith(line, "SwapFree:") && opts->swap_off == true) {
3527                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", 0UL);
3528                         printme = lbuf;
3529                 } else if (startswith(line, "Slab:")) {
3530                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3531                         printme = lbuf;
3532                 } else if (startswith(line, "Buffers:")) {
3533                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3534                         printme = lbuf;
3535                 } else if (startswith(line, "Cached:")) {
3536                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3537                         printme = lbuf;
3538                 } else if (startswith(line, "SwapCached:")) {
3539                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3540                         printme = lbuf;
3541                 } else if (startswith(line, "Active:")) {
3542                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3543                                         active_anon + active_file);
3544                         printme = lbuf;
3545                 } else if (startswith(line, "Inactive:")) {
3546                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3547                                         inactive_anon + inactive_file);
3548                         printme = lbuf;
3549                 } else if (startswith(line, "Active(anon)")) {
3550                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3551                         printme = lbuf;
3552                 } else if (startswith(line, "Inactive(anon)")) {
3553                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3554                         printme = lbuf;
3555                 } else if (startswith(line, "Active(file)")) {
3556                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3557                         printme = lbuf;
3558                 } else if (startswith(line, "Inactive(file)")) {
3559                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3560                         printme = lbuf;
3561                 } else if (startswith(line, "Unevictable")) {
3562                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3563                         printme = lbuf;
3564                 } else if (startswith(line, "SReclaimable")) {
3565                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3566                         printme = lbuf;
3567                 } else if (startswith(line, "SUnreclaim")) {
3568                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3569                         printme = lbuf;
3570                 } else if (startswith(line, "Shmem:")) {
3571                         snprintf(lbuf, 100, "Shmem:          %8lu kB\n", shmem);
3572                         printme = lbuf;
3573                 } else if (startswith(line, "ShmemHugePages")) {
3574                         snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3575                         printme = lbuf;
3576                 } else if (startswith(line, "ShmemPmdMapped")) {
3577                         snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3578                         printme = lbuf;
3579                 } else
3580                         printme = line;
3581
3582                 l = snprintf(cache, cache_size, "%s", printme);
3583                 if (l < 0) {
3584                         perror("Error writing to cache");
3585                         rv = 0;
3586                         goto err;
3587
3588                 }
3589                 if (l >= cache_size) {
3590                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3591                         rv = 0;
3592                         goto err;
3593                 }
3594
3595                 cache += l;
3596                 cache_size -= l;
3597                 total_len += l;
3598         }
3599
3600         d->cached = 1;
3601         d->size = total_len;
3602         if (total_len > size ) total_len = size;
3603         memcpy(buf, d->buf, total_len);
3604
3605         rv = total_len;
3606 err:
3607         return rv;
3608 }
3609
3610 /*
3611  * Read the cpuset.cpus for cg
3612  * Return the answer in a newly allocated string which must be freed
3613  */
3614 static char *get_cpuset(const char *cg)
3615 {
3616         char *answer;
3617
3618         if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3619                 return NULL;
3620         return answer;
3621 }
3622
3623 bool cpu_in_cpuset(int cpu, const char *cpuset);
3624
3625 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3626 {
3627         int cpu;
3628
3629         if (sscanf(line, "processor       : %d", &cpu) != 1)
3630                 return false;
3631         return cpu_in_cpuset(cpu, cpuset);
3632 }
3633
3634 /*
3635  * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3636  * depending on `param`. Parameter value is returned throuh `value`.
3637  */
3638 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3639 {
3640         __do_free char *str = NULL;
3641         bool rv = false;
3642         char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
3643
3644         sprintf(file, "cpu.cfs_%s_us", param);
3645
3646         if (!cgfs_get_value("cpu", cg, file, &str))
3647                 goto err;
3648
3649         if (sscanf(str, "%ld", value) != 1)
3650                 goto err;
3651
3652         rv = true;
3653
3654 err:
3655         return rv;
3656 }
3657
3658 /*
3659  * Return the maximum number of visible CPUs based on CPU quotas.
3660  * If there is no quota set, zero is returned.
3661  */
3662 int max_cpu_count(const char *cg)
3663 {
3664         int rv, nprocs;
3665         int64_t cfs_quota, cfs_period;
3666
3667         if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3668                 return 0;
3669
3670         if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3671                 return 0;
3672
3673         if (cfs_quota <= 0 || cfs_period <= 0)
3674                 return 0;
3675
3676         rv = cfs_quota / cfs_period;
3677
3678         /* In case quota/period does not yield a whole number, add one CPU for
3679          * the remainder.
3680          */
3681         if ((cfs_quota % cfs_period) > 0)
3682                 rv += 1;
3683
3684         nprocs = get_nprocs();
3685
3686         if (rv > nprocs)
3687                 rv = nprocs;
3688
3689         return rv;
3690 }
3691
3692 /*
3693  * Determine whether CPU views should be used or not.
3694  */
3695 bool use_cpuview(const char *cg)
3696 {
3697         int cfd;
3698         char *tmpc;
3699
3700         tmpc = find_mounted_controller("cpu", &cfd);
3701         if (!tmpc)
3702                 return false;
3703
3704         tmpc = find_mounted_controller("cpuacct", &cfd);
3705         if (!tmpc)
3706                 return false;
3707
3708         return true;
3709 }
3710
3711 /*
3712  * check whether this is a '^processor" line in /proc/cpuinfo
3713  */
3714 static bool is_processor_line(const char *line)
3715 {
3716         int cpu;
3717
3718         if (sscanf(line, "processor       : %d", &cpu) == 1)
3719                 return true;
3720         return false;
3721 }
3722
3723 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3724                 struct fuse_file_info *fi)
3725 {
3726         __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
3727         __do_fclose FILE *f = NULL;
3728         struct fuse_context *fc = fuse_get_context();
3729         struct file_info *d = (struct file_info *)fi->fh;
3730         size_t linelen = 0, total_len = 0, rv = 0;
3731         bool am_printing = false, firstline = true, is_s390x = false;
3732         int curcpu = -1, cpu, max_cpus = 0;
3733         bool use_view;
3734         char *cache = d->buf;
3735         size_t cache_size = d->buflen;
3736
3737         if (offset){
3738                 if (offset > d->size)
3739                         return -EINVAL;
3740                 if (!d->cached)
3741                         return 0;
3742                 int left = d->size - offset;
3743                 total_len = left > size ? size: left;
3744                 memcpy(buf, cache + offset, total_len);
3745                 return total_len;
3746         }
3747
3748         pid_t initpid = lookup_initpid_in_store(fc->pid);
3749         if (initpid <= 0)
3750                 initpid = fc->pid;
3751         cg = get_pid_cgroup(initpid, "cpuset");
3752         if (!cg)
3753                 return read_file("proc/cpuinfo", buf, size, d);
3754         prune_init_slice(cg);
3755
3756         cpuset = get_cpuset(cg);
3757         if (!cpuset)
3758                 goto err;
3759
3760         use_view = use_cpuview(cg);
3761
3762         if (use_view)
3763                 max_cpus = max_cpu_count(cg);
3764
3765         f = fopen("/proc/cpuinfo", "r");
3766         if (!f)
3767                 goto err;
3768
3769         while (getline(&line, &linelen, f) != -1) {
3770                 ssize_t l;
3771                 if (firstline) {
3772                         firstline = false;
3773                         if (strstr(line, "IBM/S390") != NULL) {
3774                                 is_s390x = true;
3775                                 am_printing = true;
3776                                 continue;
3777                         }
3778                 }
3779                 if (strncmp(line, "# processors:", 12) == 0)
3780                         continue;
3781                 if (is_processor_line(line)) {
3782                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3783                                 break;
3784                         am_printing = cpuline_in_cpuset(line, cpuset);
3785                         if (am_printing) {
3786                                 curcpu ++;
3787                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3788                                 if (l < 0) {
3789                                         perror("Error writing to cache");
3790                                         rv = 0;
3791                                         goto err;
3792                                 }
3793                                 if (l >= cache_size) {
3794                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3795                                         rv = 0;
3796                                         goto err;
3797                                 }
3798                                 cache += l;
3799                                 cache_size -= l;
3800                                 total_len += l;
3801                         }
3802                         continue;
3803                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3804                         char *p;
3805                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3806                                 break;
3807                         if (!cpu_in_cpuset(cpu, cpuset))
3808                                 continue;
3809                         curcpu ++;
3810                         p = strchr(line, ':');
3811                         if (!p || !*p)
3812                                 goto err;
3813                         p++;
3814                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3815                         if (l < 0) {
3816                                 perror("Error writing to cache");
3817                                 rv = 0;
3818                                 goto err;
3819                         }
3820                         if (l >= cache_size) {
3821                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3822                                 rv = 0;
3823                                 goto err;
3824                         }
3825                         cache += l;
3826                         cache_size -= l;
3827                         total_len += l;
3828                         continue;
3829
3830                 }
3831                 if (am_printing) {
3832                         l = snprintf(cache, cache_size, "%s", line);
3833                         if (l < 0) {
3834                                 perror("Error writing to cache");
3835                                 rv = 0;
3836                                 goto err;
3837                         }
3838                         if (l >= cache_size) {
3839                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3840                                 rv = 0;
3841                                 goto err;
3842                         }
3843                         cache += l;
3844                         cache_size -= l;
3845                         total_len += l;
3846                 }
3847         }
3848
3849         if (is_s390x) {
3850                 __do_free char *origcache = d->buf;
3851                 ssize_t l;
3852                 do {
3853                         d->buf = malloc(d->buflen);
3854                 } while (!d->buf);
3855                 cache = d->buf;
3856                 cache_size = d->buflen;
3857                 total_len = 0;
3858                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3859                 if (l < 0 || l >= cache_size)
3860                         goto err;
3861                 cache_size -= l;
3862                 cache += l;
3863                 total_len += l;
3864                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3865                 if (l < 0 || l >= cache_size)
3866                         goto err;
3867                 cache_size -= l;
3868                 cache += l;
3869                 total_len += l;
3870                 l = snprintf(cache, cache_size, "%s", origcache);
3871                 if (l < 0 || l >= cache_size)
3872                         goto err;
3873                 total_len += l;
3874         }
3875
3876         d->cached = 1;
3877         d->size = total_len;
3878         if (total_len > size ) total_len = size;
3879
3880         /* read from off 0 */
3881         memcpy(buf, d->buf, total_len);
3882         rv = total_len;
3883 err:
3884         return rv;
3885 }
3886
3887 static uint64_t get_reaper_start_time(pid_t pid)
3888 {
3889         int ret;
3890         FILE *f;
3891         uint64_t starttime;
3892         /* strlen("/proc/") = 6
3893          * +
3894          * LXCFS_NUMSTRLEN64
3895          * +
3896          * strlen("/stat") = 5
3897          * +
3898          * \0 = 1
3899          * */
3900 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3901         char path[__PROC_PID_STAT_LEN];
3902         pid_t qpid;
3903
3904         qpid = lookup_initpid_in_store(pid);
3905         if (qpid <= 0) {
3906                 /* Caller can check for EINVAL on 0. */
3907                 errno = EINVAL;
3908                 return 0;
3909         }
3910
3911         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3912         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3913                 /* Caller can check for EINVAL on 0. */
3914                 errno = EINVAL;
3915                 return 0;
3916         }
3917
3918         f = fopen(path, "r");
3919         if (!f) {
3920                 /* Caller can check for EINVAL on 0. */
3921                 errno = EINVAL;
3922                 return 0;
3923         }
3924
3925         /* Note that the *scanf() argument supression requires that length
3926          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3927          * at us. It's like telling someone you're not married and then asking
3928          * if you can bring your wife to the party.
3929          */
3930         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3931                         "%*s "      /* (2)  comm        %s   */
3932                         "%*c "      /* (3)  state       %c   */
3933                         "%*d "      /* (4)  ppid        %d   */
3934                         "%*d "      /* (5)  pgrp        %d   */
3935                         "%*d "      /* (6)  session     %d   */
3936                         "%*d "      /* (7)  tty_nr      %d   */
3937                         "%*d "      /* (8)  tpgid       %d   */
3938                         "%*u "      /* (9)  flags       %u   */
3939                         "%*u "      /* (10) minflt      %lu  */
3940                         "%*u "      /* (11) cminflt     %lu  */
3941                         "%*u "      /* (12) majflt      %lu  */
3942                         "%*u "      /* (13) cmajflt     %lu  */
3943                         "%*u "      /* (14) utime       %lu  */
3944                         "%*u "      /* (15) stime       %lu  */
3945                         "%*d "      /* (16) cutime      %ld  */
3946                         "%*d "      /* (17) cstime      %ld  */
3947                         "%*d "      /* (18) priority    %ld  */
3948                         "%*d "      /* (19) nice        %ld  */
3949                         "%*d "      /* (20) num_threads %ld  */
3950                         "%*d "      /* (21) itrealvalue %ld  */
3951                         "%" PRIu64, /* (22) starttime   %llu */
3952                      &starttime);
3953         if (ret != 1) {
3954                 fclose(f);
3955                 /* Caller can check for EINVAL on 0. */
3956                 errno = EINVAL;
3957                 return 0;
3958         }
3959
3960         fclose(f);
3961
3962         errno = 0;
3963         return starttime;
3964 }
3965
3966 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3967 {
3968         uint64_t clockticks;
3969         int64_t ticks_per_sec;
3970
3971         clockticks = get_reaper_start_time(pid);
3972         if (clockticks == 0 && errno == EINVAL) {
3973                 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3974                 return 0;
3975         }
3976
3977         ticks_per_sec = sysconf(_SC_CLK_TCK);
3978         if (ticks_per_sec < 0 && errno == EINVAL) {
3979                 lxcfs_debug(
3980                     "%s\n",
3981                     "failed to determine number of clock ticks in a second");
3982                 return 0;
3983         }
3984
3985         return (clockticks /= ticks_per_sec);
3986 }
3987
3988 static uint64_t get_reaper_age(pid_t pid)
3989 {
3990         uint64_t procstart, uptime, procage;
3991
3992         /* We need to substract the time the process has started since system
3993          * boot minus the time when the system has started to get the actual
3994          * reaper age.
3995          */
3996         procstart = get_reaper_start_time_in_sec(pid);
3997         procage = procstart;
3998         if (procstart > 0) {
3999                 int ret;
4000                 struct timespec spec;
4001
4002                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4003                 if (ret < 0)
4004                         return 0;
4005                 /* We could make this more precise here by using the tv_nsec
4006                  * field in the timespec struct and convert it to milliseconds
4007                  * and then create a double for the seconds and milliseconds but
4008                  * that seems more work than it is worth.
4009                  */
4010                 uptime = spec.tv_sec;
4011                 procage = uptime - procstart;
4012         }
4013
4014         return procage;
4015 }
4016
4017 /*
4018  * Returns 0 on success.
4019  * It is the caller's responsibility to free `return_usage`, unless this
4020  * function returns an error.
4021  */
4022 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4023 {
4024         int cpucount = get_nprocs_conf();
4025         struct cpuacct_usage *cpu_usage;
4026         int rv = 0, i, j, ret, read_pos = 0, read_cnt;
4027         int cg_cpu;
4028         uint64_t cg_user, cg_system;
4029         int64_t ticks_per_sec;
4030         char *usage_str = NULL;
4031
4032         ticks_per_sec = sysconf(_SC_CLK_TCK);
4033
4034         if (ticks_per_sec < 0 && errno == EINVAL) {
4035                 lxcfs_debug(
4036                         "%s\n",
4037                         "read_cpuacct_usage_all failed to determine number of clock ticks "
4038                         "in a second");
4039                 return -1;
4040         }
4041
4042         cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4043         if (!cpu_usage)
4044                 return -ENOMEM;
4045
4046         if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4047                 rv = -1;
4048                 goto err;
4049         }
4050
4051         if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4052                 lxcfs_error("read_cpuacct_usage_all reading first line from "
4053                                 "%s/cpuacct.usage_all failed.\n", cg);
4054                 rv = -1;
4055                 goto err;
4056         }
4057
4058         read_pos += read_cnt;
4059
4060         for (i = 0, j = 0; i < cpucount; i++) {
4061                 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4062                                 &cg_system, &read_cnt);
4063
4064                 if (ret == EOF)
4065                         break;
4066
4067                 if (ret != 3) {
4068                         lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4069                                         "failed.\n", cg);
4070                         rv = -1;
4071                         goto err;
4072                 }
4073
4074                 read_pos += read_cnt;
4075
4076                 /* Convert the time from nanoseconds to USER_HZ */
4077                 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4078                 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4079                 j++;
4080         }
4081
4082         rv = 0;
4083         *return_usage = cpu_usage;
4084         *size = cpucount;
4085
4086 err:
4087         if (usage_str)
4088                 free(usage_str);
4089
4090         if (rv != 0) {
4091                 free(cpu_usage);
4092                 *return_usage = NULL;
4093         }
4094
4095         return rv;
4096 }
4097
4098 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4099 {
4100         int i;
4101         unsigned long sum = 0;
4102
4103         for (i = 0; i < cpu_count; i++) {
4104                 if (!newer[i].online)
4105                         continue;
4106
4107                 /* When cpuset is changed on the fly, the CPUs might get reordered.
4108                  * We could either reset all counters, or check that the substractions
4109                  * below will return expected results.
4110                  */
4111                 if (newer[i].user > older[i].user)
4112                         diff[i].user = newer[i].user - older[i].user;
4113                 else
4114                         diff[i].user = 0;
4115
4116                 if (newer[i].system > older[i].system)
4117                         diff[i].system = newer[i].system - older[i].system;
4118                 else
4119                         diff[i].system = 0;
4120
4121                 if (newer[i].idle > older[i].idle)
4122                         diff[i].idle = newer[i].idle - older[i].idle;
4123                 else
4124                         diff[i].idle = 0;
4125
4126                 sum += diff[i].user;
4127                 sum += diff[i].system;
4128                 sum += diff[i].idle;
4129         }
4130
4131         return sum;
4132 }
4133
4134 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4135 {
4136         unsigned long free_space, to_add;
4137
4138         free_space = threshold - usage->user - usage->system;
4139
4140         if (free_space > usage->idle)
4141                 free_space = usage->idle;
4142
4143         to_add = free_space > *surplus ? *surplus : free_space;
4144
4145         *counter += to_add;
4146         usage->idle -= to_add;
4147         *surplus -= to_add;
4148 }
4149
4150 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4151 {
4152         struct cg_proc_stat *first = NULL, *prev, *tmp;
4153
4154         for (prev = NULL; node; ) {
4155                 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4156                         tmp = node;
4157                         lxcfs_debug("Removing stat node for %s\n", node->cg);
4158
4159                         if (prev)
4160                                 prev->next = node->next;
4161                         else
4162                                 first = node->next;
4163
4164                         node = node->next;
4165                         free_proc_stat_node(tmp);
4166                 } else {
4167                         if (!first)
4168                                 first = node;
4169                         prev = node;
4170                         node = node->next;
4171                 }
4172         }
4173
4174         return first;
4175 }
4176
4177 #define PROC_STAT_PRUNE_INTERVAL 10
4178 static void prune_proc_stat_history(void)
4179 {
4180         int i;
4181         time_t now = time(NULL);
4182
4183         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4184                 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4185
4186                 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4187                         pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4188                         return;
4189                 }
4190
4191                 if (proc_stat_history[i]->next) {
4192                         proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4193                         proc_stat_history[i]->lastcheck = now;
4194                 }
4195
4196                 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4197         }
4198 }
4199
4200 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4201 {
4202         struct cg_proc_stat *node;
4203
4204         pthread_rwlock_rdlock(&head->lock);
4205
4206         if (!head->next) {
4207                 pthread_rwlock_unlock(&head->lock);
4208                 return NULL;
4209         }
4210
4211         node = head->next;
4212
4213         do {
4214                 if (strcmp(cg, node->cg) == 0)
4215                         goto out;
4216         } while ((node = node->next));
4217
4218         node = NULL;
4219
4220 out:
4221         pthread_rwlock_unlock(&head->lock);
4222         prune_proc_stat_history();
4223         return node;
4224 }
4225
4226 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4227 {
4228         struct cg_proc_stat *node;
4229         int i;
4230
4231         node = malloc(sizeof(struct cg_proc_stat));
4232         if (!node)
4233                 goto err;
4234
4235         node->cg = NULL;
4236         node->usage = NULL;
4237         node->view = NULL;
4238
4239         node->cg = malloc(strlen(cg) + 1);
4240         if (!node->cg)
4241                 goto err;
4242
4243         strcpy(node->cg, cg);
4244
4245         node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4246         if (!node->usage)
4247                 goto err;
4248
4249         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4250
4251         node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4252         if (!node->view)
4253                 goto err;
4254
4255         node->cpu_count = cpu_count;
4256         node->next = NULL;
4257
4258         if (pthread_mutex_init(&node->lock, NULL) != 0) {
4259                 lxcfs_error("%s\n", "Failed to initialize node lock");
4260                 goto err;
4261         }
4262
4263         for (i = 0; i < cpu_count; i++) {
4264                 node->view[i].user = 0;
4265                 node->view[i].system = 0;
4266                 node->view[i].idle = 0;
4267         }
4268
4269         return node;
4270
4271 err:
4272         if (node && node->cg)
4273                 free(node->cg);
4274         if (node && node->usage)
4275                 free(node->usage);
4276         if (node && node->view)
4277                 free(node->view);
4278         if (node)
4279                 free(node);
4280
4281         return NULL;
4282 }
4283
4284 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4285 {
4286         int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4287         struct cg_proc_stat_head *head = proc_stat_history[hash];
4288         struct cg_proc_stat *node, *rv = new_node;
4289
4290         pthread_rwlock_wrlock(&head->lock);
4291
4292         if (!head->next) {
4293                 head->next = new_node;
4294                 goto out;
4295         }
4296
4297         node = head->next;
4298
4299         for (;;) {
4300                 if (strcmp(node->cg, new_node->cg) == 0) {
4301                         /* The node is already present, return it */
4302                         free_proc_stat_node(new_node);
4303                         rv = node;
4304                         goto out;
4305                 }
4306
4307                 if (node->next) {
4308                         node = node->next;
4309                         continue;
4310                 }
4311
4312                 node->next = new_node;
4313                 goto out;
4314         }
4315
4316 out:
4317         pthread_rwlock_unlock(&head->lock);
4318         return rv;
4319 }
4320
4321 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4322 {
4323         struct cpuacct_usage *new_usage, *new_view;
4324         int i;
4325
4326         /* Allocate new memory */
4327         new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4328         if (!new_usage)
4329                 return false;
4330
4331         new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4332         if (!new_view) {
4333                 free(new_usage);
4334                 return false;
4335         }
4336
4337         /* Copy existing data & initialize new elements */
4338         for (i = 0; i < cpu_count; i++) {
4339                 if (i < node->cpu_count) {
4340                         new_usage[i].user = node->usage[i].user;
4341                         new_usage[i].system = node->usage[i].system;
4342                         new_usage[i].idle = node->usage[i].idle;
4343
4344                         new_view[i].user = node->view[i].user;
4345                         new_view[i].system = node->view[i].system;
4346                         new_view[i].idle = node->view[i].idle;
4347                 } else {
4348                         new_usage[i].user = 0;
4349                         new_usage[i].system = 0;
4350                         new_usage[i].idle = 0;
4351
4352                         new_view[i].user = 0;
4353                         new_view[i].system = 0;
4354                         new_view[i].idle = 0;
4355                 }
4356         }
4357
4358         free(node->usage);
4359         free(node->view);
4360
4361         node->usage = new_usage;
4362         node->view = new_view;
4363         node->cpu_count = cpu_count;
4364
4365         return true;
4366 }
4367
4368 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4369 {
4370         int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4371         struct cg_proc_stat_head *head = proc_stat_history[hash];
4372         struct cg_proc_stat *node;
4373
4374         node = find_proc_stat_node(head, cg);
4375
4376         if (!node) {
4377                 node = new_proc_stat_node(usage, cpu_count, cg);
4378                 if (!node)
4379                         return NULL;
4380
4381                 node = add_proc_stat_node(node);
4382                 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4383         }
4384
4385         pthread_mutex_lock(&node->lock);
4386
4387         /* If additional CPUs on the host have been enabled, CPU usage counter
4388          * arrays have to be expanded */
4389         if (node->cpu_count < cpu_count) {
4390                 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4391                                 node->cpu_count, cpu_count, cg);
4392
4393                 if (!expand_proc_stat_node(node, cpu_count)) {
4394                         pthread_mutex_unlock(&node->lock);
4395                         lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4396                                         node->cpu_count, cpu_count, cg);
4397                         return NULL;
4398                 }
4399         }
4400
4401         return node;
4402 }
4403
4404 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4405 {
4406         int i;
4407
4408         lxcfs_debug("Resetting stat node for %s\n", node->cg);
4409         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4410
4411         for (i = 0; i < cpu_count; i++) {
4412                 node->view[i].user = 0;
4413                 node->view[i].system = 0;
4414                 node->view[i].idle = 0;
4415         }
4416
4417         node->cpu_count = cpu_count;
4418 }
4419
4420 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4421 {
4422         char *line = NULL;
4423         size_t linelen = 0, total_len = 0, rv = 0, l;
4424         int curcpu = -1; /* cpu numbering starts at 0 */
4425         int physcpu, i;
4426         int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4427         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4428         unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4429         unsigned long user_surplus = 0, system_surplus = 0;
4430         unsigned long total_sum, threshold;
4431         struct cg_proc_stat *stat_node;
4432         struct cpuacct_usage *diff = NULL;
4433         int nprocs = get_nprocs_conf();
4434
4435         if (cg_cpu_usage_size < nprocs)
4436                 nprocs = cg_cpu_usage_size;
4437
4438         /* Read all CPU stats and stop when we've encountered other lines */
4439         while (getline(&line, &linelen, f) != -1) {
4440                 int ret;
4441                 char cpu_char[10]; /* That's a lot of cores */
4442                 uint64_t all_used, cg_used;
4443
4444                 if (strlen(line) == 0)
4445                         continue;
4446                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4447                         /* not a ^cpuN line containing a number N */
4448                         break;
4449                 }
4450
4451                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4452                         continue;
4453
4454                 if (physcpu >= cg_cpu_usage_size)
4455                         continue;
4456
4457                 curcpu ++;
4458                 cpu_cnt ++;
4459
4460                 if (!cpu_in_cpuset(physcpu, cpuset)) {
4461                         for (i = curcpu; i <= physcpu; i++) {
4462                                 cg_cpu_usage[i].online = false;
4463                         }
4464                         continue;
4465                 }
4466
4467                 if (curcpu < physcpu) {
4468                         /* Some CPUs may be disabled */
4469                         for (i = curcpu; i < physcpu; i++)
4470                                 cg_cpu_usage[i].online = false;
4471
4472                         curcpu = physcpu;
4473                 }
4474
4475                 cg_cpu_usage[curcpu].online = true;
4476
4477                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4478                            &user,
4479                            &nice,
4480                            &system,
4481                            &idle,
4482                            &iowait,
4483                            &irq,
4484                            &softirq,
4485                            &steal,
4486                            &guest,
4487                            &guest_nice);
4488
4489                 if (ret != 10)
4490                         continue;
4491
4492                 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4493                 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4494
4495                 if (all_used >= cg_used) {
4496                         cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4497
4498                 } else {
4499                         lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4500                                         "%lu in cpuacct.usage_all; unable to determine idle time\n",
4501                                         curcpu, cg, all_used, cg_used);
4502                         cg_cpu_usage[curcpu].idle = idle;
4503                 }
4504         }
4505
4506         /* Cannot use more CPUs than is available due to cpuset */
4507         if (max_cpus > cpu_cnt)
4508                 max_cpus = cpu_cnt;
4509
4510         stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4511
4512         if (!stat_node) {
4513                 lxcfs_error("unable to find/create stat node for %s\n", cg);
4514                 rv = 0;
4515                 goto err;
4516         }
4517
4518         diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4519         if (!diff) {
4520                 rv = 0;
4521                 goto err;
4522         }
4523
4524         /*
4525          * If the new values are LOWER than values stored in memory, it means
4526          * the cgroup has been reset/recreated and we should reset too.
4527          */
4528         for (curcpu = 0; curcpu < nprocs; curcpu++) {
4529                 if (!cg_cpu_usage[curcpu].online)
4530                         continue;
4531
4532                 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4533                         reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4534
4535                 break;
4536         }
4537
4538         total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4539
4540         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4541                 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4542
4543                 if (!stat_node->usage[curcpu].online)
4544                         continue;
4545
4546                 i++;
4547
4548                 stat_node->usage[curcpu].user += diff[curcpu].user;
4549                 stat_node->usage[curcpu].system += diff[curcpu].system;
4550                 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4551
4552                 if (max_cpus > 0 && i >= max_cpus) {
4553                         user_surplus += diff[curcpu].user;
4554                         system_surplus += diff[curcpu].system;
4555                 }
4556         }
4557
4558         /* Calculate usage counters of visible CPUs */
4559         if (max_cpus > 0) {
4560                 /* threshold = maximum usage per cpu, including idle */
4561                 threshold = total_sum / cpu_cnt * max_cpus;
4562
4563                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4564                         if (i == max_cpus)
4565                                 break;
4566
4567                         if (!stat_node->usage[curcpu].online)
4568                                 continue;
4569
4570                         i++;
4571
4572                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4573                                 continue;
4574
4575                         /* Add user */
4576                         add_cpu_usage(
4577                                         &user_surplus,
4578                                         &diff[curcpu],
4579                                         &diff[curcpu].user,
4580                                         threshold);
4581
4582                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4583                                 continue;
4584
4585                         /* If there is still room, add system */
4586                         add_cpu_usage(
4587                                         &system_surplus,
4588                                         &diff[curcpu],
4589                                         &diff[curcpu].system,
4590                                         threshold);
4591                 }
4592
4593                 if (user_surplus > 0)
4594                         lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4595                 if (system_surplus > 0)
4596                         lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4597
4598                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4599                         if (i == max_cpus)
4600                                 break;
4601
4602                         if (!stat_node->usage[curcpu].online)
4603                                 continue;
4604
4605                         i++;
4606
4607                         stat_node->view[curcpu].user += diff[curcpu].user;
4608                         stat_node->view[curcpu].system += diff[curcpu].system;
4609                         stat_node->view[curcpu].idle += diff[curcpu].idle;
4610
4611                         user_sum += stat_node->view[curcpu].user;
4612                         system_sum += stat_node->view[curcpu].system;
4613                         idle_sum += stat_node->view[curcpu].idle;
4614                 }
4615
4616         } else {
4617                 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4618                         if (!stat_node->usage[curcpu].online)
4619                                 continue;
4620
4621                         stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4622                         stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4623                         stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4624
4625                         user_sum += stat_node->view[curcpu].user;
4626                         system_sum += stat_node->view[curcpu].system;
4627                         idle_sum += stat_node->view[curcpu].idle;
4628                 }
4629         }
4630
4631         /* Render the file */
4632         /* cpu-all */
4633         l = snprintf(buf, buf_size, "cpu  %lu 0 %lu %lu 0 0 0 0 0 0\n",
4634                         user_sum,
4635                         system_sum,
4636                         idle_sum);
4637
4638         if (l < 0) {
4639                 perror("Error writing to cache");
4640                 rv = 0;
4641                 goto err;
4642
4643         }
4644         if (l >= buf_size) {
4645                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4646                 rv = 0;
4647                 goto err;
4648         }
4649
4650         buf += l;
4651         buf_size -= l;
4652         total_len += l;
4653
4654         /* Render visible CPUs */
4655         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4656                 if (!stat_node->usage[curcpu].online)
4657                         continue;
4658
4659                 i++;
4660
4661                 if (max_cpus > 0 && i == max_cpus)
4662                         break;
4663
4664                 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4665                                 i,
4666                                 stat_node->view[curcpu].user,
4667                                 stat_node->view[curcpu].system,
4668                                 stat_node->view[curcpu].idle);
4669
4670                 if (l < 0) {
4671                         perror("Error writing to cache");
4672                         rv = 0;
4673                         goto err;
4674
4675                 }
4676                 if (l >= buf_size) {
4677                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4678                         rv = 0;
4679                         goto err;
4680                 }
4681
4682                 buf += l;
4683                 buf_size -= l;
4684                 total_len += l;
4685         }
4686
4687         /* Pass the rest of /proc/stat, start with the last line read */
4688         l = snprintf(buf, buf_size, "%s", line);
4689
4690         if (l < 0) {
4691                 perror("Error writing to cache");
4692                 rv = 0;
4693                 goto err;
4694
4695         }
4696         if (l >= buf_size) {
4697                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4698                 rv = 0;
4699                 goto err;
4700         }
4701
4702         buf += l;
4703         buf_size -= l;
4704         total_len += l;
4705
4706         /* Pass the rest of the host's /proc/stat */
4707         while (getline(&line, &linelen, f) != -1) {
4708                 l = snprintf(buf, buf_size, "%s", line);
4709                 if (l < 0) {
4710                         perror("Error writing to cache");
4711                         rv = 0;
4712                         goto err;
4713                 }
4714                 if (l >= buf_size) {
4715                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4716                         rv = 0;
4717                         goto err;
4718                 }
4719                 buf += l;
4720                 buf_size -= l;
4721                 total_len += l;
4722         }
4723
4724         rv = total_len;
4725
4726 err:
4727         if (stat_node)
4728                 pthread_mutex_unlock(&stat_node->lock);
4729         if (line)
4730                 free(line);
4731         if (diff)
4732                 free(diff);
4733         return rv;
4734 }
4735
4736 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4737 static int proc_stat_read(char *buf, size_t size, off_t offset,
4738                 struct fuse_file_info *fi)
4739 {
4740         struct fuse_context *fc = fuse_get_context();
4741         struct file_info *d = (struct file_info *)fi->fh;
4742         char *cg;
4743         char *cpuset = NULL;
4744         char *line = NULL;
4745         size_t linelen = 0, total_len = 0, rv = 0;
4746         int curcpu = -1; /* cpu numbering starts at 0 */
4747         int physcpu = 0;
4748         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4749         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4750                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4751         char cpuall[CPUALL_MAX_SIZE];
4752         /* reserve for cpu all */
4753         char *cache = d->buf + CPUALL_MAX_SIZE;
4754         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4755         FILE *f = NULL;
4756         struct cpuacct_usage *cg_cpu_usage = NULL;
4757         int cg_cpu_usage_size = 0;
4758
4759         if (offset){
4760                 if (offset > d->size)
4761                         return -EINVAL;
4762                 if (!d->cached)
4763                         return 0;
4764                 int left = d->size - offset;
4765                 total_len = left > size ? size: left;
4766                 memcpy(buf, d->buf + offset, total_len);
4767                 return total_len;
4768         }
4769
4770         pid_t initpid = lookup_initpid_in_store(fc->pid);
4771         if (initpid <= 0)
4772                 initpid = fc->pid;
4773         cg = get_pid_cgroup(initpid, "cpuset");
4774         if (!cg)
4775                 return read_file("/proc/stat", buf, size, d);
4776         prune_init_slice(cg);
4777
4778         cpuset = get_cpuset(cg);
4779         if (!cpuset)
4780                 goto err;
4781
4782         /*
4783          * Read cpuacct.usage_all for all CPUs.
4784          * If the cpuacct cgroup is present, it is used to calculate the container's
4785          * CPU usage. If not, values from the host's /proc/stat are used.
4786          */
4787         if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4788                 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
4789                                 "falling back to the host's /proc/stat");
4790         }
4791
4792         f = fopen("/proc/stat", "r");
4793         if (!f)
4794                 goto err;
4795
4796         //skip first line
4797         if (getline(&line, &linelen, f) < 0) {
4798                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4799                 goto err;
4800         }
4801
4802         if (use_cpuview(cg) && cg_cpu_usage) {
4803                 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4804                                 f, d->buf, d->buflen);
4805                 goto out;
4806         }
4807
4808         while (getline(&line, &linelen, f) != -1) {
4809                 ssize_t l;
4810                 char cpu_char[10]; /* That's a lot of cores */
4811                 char *c;
4812                 uint64_t all_used, cg_used, new_idle;
4813                 int ret;
4814
4815                 if (strlen(line) == 0)
4816                         continue;
4817                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4818                         /* not a ^cpuN line containing a number N, just print it */
4819                         l = snprintf(cache, cache_size, "%s", line);
4820                         if (l < 0) {
4821                                 perror("Error writing to cache");
4822                                 rv = 0;
4823                                 goto err;
4824                         }
4825                         if (l >= cache_size) {
4826                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4827                                 rv = 0;
4828                                 goto err;
4829                         }
4830                         cache += l;
4831                         cache_size -= l;
4832                         total_len += l;
4833                         continue;
4834                 }
4835
4836                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4837                         continue;
4838                 if (!cpu_in_cpuset(physcpu, cpuset))
4839                         continue;
4840                 curcpu ++;
4841
4842                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4843                            &user,
4844                            &nice,
4845                            &system,
4846                            &idle,
4847                            &iowait,
4848                            &irq,
4849                            &softirq,
4850                            &steal,
4851                            &guest,
4852                            &guest_nice);
4853
4854                 if (ret != 10 || !cg_cpu_usage) {
4855                         c = strchr(line, ' ');
4856                         if (!c)
4857                                 continue;
4858                         l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4859                         if (l < 0) {
4860                                 perror("Error writing to cache");
4861                                 rv = 0;
4862                                 goto err;
4863
4864                         }
4865                         if (l >= cache_size) {
4866                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4867                                 rv = 0;
4868                                 goto err;
4869                         }
4870
4871                         cache += l;
4872                         cache_size -= l;
4873                         total_len += l;
4874
4875                         if (ret != 10)
4876                                 continue;
4877                 }
4878
4879                 if (cg_cpu_usage) {
4880                         if (physcpu >= cg_cpu_usage_size)
4881                                 break;
4882
4883                         all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4884                         cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4885
4886                         if (all_used >= cg_used) {
4887                                 new_idle = idle + (all_used - cg_used);
4888
4889                         } else {
4890                                 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4891                                                 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4892                                                 curcpu, cg, all_used, cg_used);
4893                                 new_idle = idle;
4894                         }
4895
4896                         l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4897                                         curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4898                                         new_idle);
4899
4900                         if (l < 0) {
4901                                 perror("Error writing to cache");
4902                                 rv = 0;
4903                                 goto err;
4904
4905                         }
4906                         if (l >= cache_size) {
4907                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4908                                 rv = 0;
4909                                 goto err;
4910                         }
4911
4912                         cache += l;
4913                         cache_size -= l;
4914                         total_len += l;
4915
4916                         user_sum += cg_cpu_usage[physcpu].user;
4917                         system_sum += cg_cpu_usage[physcpu].system;
4918                         idle_sum += new_idle;
4919
4920                 } else {
4921                         user_sum += user;
4922                         nice_sum += nice;
4923                         system_sum += system;
4924                         idle_sum += idle;
4925                         iowait_sum += iowait;
4926                         irq_sum += irq;
4927                         softirq_sum += softirq;
4928                         steal_sum += steal;
4929                         guest_sum += guest;
4930                         guest_nice_sum += guest_nice;
4931                 }
4932         }
4933
4934         cache = d->buf;
4935
4936         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4937                         user_sum,
4938                         nice_sum,
4939                         system_sum,
4940                         idle_sum,
4941                         iowait_sum,
4942                         irq_sum,
4943                         softirq_sum,
4944                         steal_sum,
4945                         guest_sum,
4946                         guest_nice_sum);
4947         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4948                 memcpy(cache, cpuall, cpuall_len);
4949                 cache += cpuall_len;
4950         } else {
4951                 /* shouldn't happen */
4952                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4953                 cpuall_len = 0;
4954         }
4955
4956         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4957         total_len += cpuall_len;
4958
4959 out:
4960         d->cached = 1;
4961         d->size = total_len;
4962         if (total_len > size)
4963                 total_len = size;
4964
4965         memcpy(buf, d->buf, total_len);
4966         rv = total_len;
4967
4968 err:
4969         if (f)
4970                 fclose(f);
4971         if (cg_cpu_usage)
4972                 free(cg_cpu_usage);
4973         free(line);
4974         free(cpuset);
4975         free(cg);
4976         return rv;
4977 }
4978
4979 /* This function retrieves the busy time of a group of tasks by looking at
4980  * cpuacct.usage. Unfortunately, this only makes sense when the container has
4981  * been given it's own cpuacct cgroup. If not, this function will take the busy
4982  * time of all other taks that do not actually belong to the container into
4983  * account as well. If someone has a clever solution for this please send a
4984  * patch!
4985  */
4986 static unsigned long get_reaper_busy(pid_t task)
4987 {
4988         pid_t initpid = lookup_initpid_in_store(task);
4989         char *cgroup = NULL, *usage_str = NULL;
4990         unsigned long usage = 0;
4991
4992         if (initpid <= 0)
4993                 return 0;
4994
4995         cgroup = get_pid_cgroup(initpid, "cpuacct");
4996         if (!cgroup)
4997                 goto out;
4998         prune_init_slice(cgroup);
4999         if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
5000                 goto out;
5001         usage = strtoul(usage_str, NULL, 10);
5002         usage /= 1000000000;
5003
5004 out:
5005         free(cgroup);
5006         free(usage_str);
5007         return usage;
5008 }
5009
5010 #if RELOADTEST
5011 void iwashere(void)
5012 {
5013         int fd;
5014
5015         fd = creat("/tmp/lxcfs-iwashere", 0644);
5016         if (fd >= 0)
5017                 close(fd);
5018 }
5019 #endif
5020
5021 /*
5022  * We read /proc/uptime and reuse its second field.
5023  * For the first field, we use the mtime for the reaper for
5024  * the calling pid as returned by getreaperage
5025  */
5026 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5027                 struct fuse_file_info *fi)
5028 {
5029         struct fuse_context *fc = fuse_get_context();
5030         struct file_info *d = (struct file_info *)fi->fh;
5031         unsigned long int busytime = get_reaper_busy(fc->pid);
5032         char *cache = d->buf;
5033         ssize_t total_len = 0;
5034         uint64_t idletime, reaperage;
5035
5036 #if RELOADTEST
5037         iwashere();
5038 #endif
5039
5040         if (offset){
5041                 if (!d->cached)
5042                         return 0;
5043                 if (offset > d->size)
5044                         return -EINVAL;
5045                 int left = d->size - offset;
5046                 total_len = left > size ? size: left;
5047                 memcpy(buf, cache + offset, total_len);
5048                 return total_len;
5049         }
5050
5051         reaperage = get_reaper_age(fc->pid);
5052         /* To understand why this is done, please read the comment to the
5053          * get_reaper_busy() function.
5054          */
5055         idletime = reaperage;
5056         if (reaperage >= busytime)
5057                 idletime = reaperage - busytime;
5058
5059         total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
5060         if (total_len < 0 || total_len >=  d->buflen){
5061                 lxcfs_error("%s\n", "failed to write to cache");
5062                 return 0;
5063         }
5064
5065         d->size = (int)total_len;
5066         d->cached = 1;
5067
5068         if (total_len > size) total_len = size;
5069
5070         memcpy(buf, d->buf, total_len);
5071         return total_len;
5072 }
5073
5074 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5075                 struct fuse_file_info *fi)
5076 {
5077         char dev_name[72];
5078         struct fuse_context *fc = fuse_get_context();
5079         struct file_info *d = (struct file_info *)fi->fh;
5080         char *cg;
5081         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5082                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
5083         unsigned long read = 0, write = 0;
5084         unsigned long read_merged = 0, write_merged = 0;
5085         unsigned long read_sectors = 0, write_sectors = 0;
5086         unsigned long read_ticks = 0, write_ticks = 0;
5087         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5088         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5089         char *cache = d->buf;
5090         size_t cache_size = d->buflen;
5091         char *line = NULL;
5092         size_t linelen = 0, total_len = 0, rv = 0;
5093         unsigned int major = 0, minor = 0;
5094         int i = 0;
5095         FILE *f = NULL;
5096
5097         if (offset){
5098                 if (offset > d->size)
5099                         return -EINVAL;
5100                 if (!d->cached)
5101                         return 0;
5102                 int left = d->size - offset;
5103                 total_len = left > size ? size: left;
5104                 memcpy(buf, cache + offset, total_len);
5105                 return total_len;
5106         }
5107
5108         pid_t initpid = lookup_initpid_in_store(fc->pid);
5109         if (initpid <= 0)
5110                 initpid = fc->pid;
5111         cg = get_pid_cgroup(initpid, "blkio");
5112         if (!cg)
5113                 return read_file("/proc/diskstats", buf, size, d);
5114         prune_init_slice(cg);
5115
5116         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
5117                 goto err;
5118         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
5119                 goto err;
5120         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
5121                 goto err;
5122         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
5123                 goto err;
5124         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
5125                 goto err;
5126
5127
5128         f = fopen("/proc/diskstats", "r");
5129         if (!f)
5130                 goto err;
5131
5132         while (getline(&line, &linelen, f) != -1) {
5133                 ssize_t l;
5134                 char lbuf[256];
5135
5136                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5137                 if (i != 3)
5138                         continue;
5139
5140                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5141                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5142                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5143                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5144                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5145                 read_sectors = read_sectors/512;
5146                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5147                 write_sectors = write_sectors/512;
5148
5149                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5150                 rd_svctm = rd_svctm/1000000;
5151                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5152                 rd_wait = rd_wait/1000000;
5153                 read_ticks = rd_svctm + rd_wait;
5154
5155                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5156                 wr_svctm =  wr_svctm/1000000;
5157                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5158                 wr_wait =  wr_wait/1000000;
5159                 write_ticks = wr_svctm + wr_wait;
5160
5161                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5162                 tot_ticks =  tot_ticks/1000000;
5163
5164                 memset(lbuf, 0, 256);
5165                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5166                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5167                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5168                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5169                 else
5170                         continue;
5171
5172                 l = snprintf(cache, cache_size, "%s", lbuf);
5173                 if (l < 0) {
5174                         perror("Error writing to fuse buf");
5175                         rv = 0;
5176                         goto err;
5177                 }
5178                 if (l >= cache_size) {
5179                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5180                         rv = 0;
5181                         goto err;
5182                 }
5183                 cache += l;
5184                 cache_size -= l;
5185                 total_len += l;
5186         }
5187
5188         d->cached = 1;
5189         d->size = total_len;
5190         if (total_len > size ) total_len = size;
5191         memcpy(buf, d->buf, total_len);
5192
5193         rv = total_len;
5194 err:
5195         free(cg);
5196         if (f)
5197                 fclose(f);
5198         free(line);
5199         free(io_serviced_str);
5200         free(io_merged_str);
5201         free(io_service_bytes_str);
5202         free(io_wait_time_str);
5203         free(io_service_time_str);
5204         return rv;
5205 }
5206
5207 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5208                 struct fuse_file_info *fi)
5209 {
5210         struct fuse_context *fc = fuse_get_context();
5211         struct file_info *d = (struct file_info *)fi->fh;
5212         char *cg = NULL;
5213         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5214         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5215         ssize_t total_len = 0, rv = 0;
5216         ssize_t l = 0;
5217         char *cache = d->buf;
5218
5219         if (offset) {
5220                 if (offset > d->size)
5221                         return -EINVAL;
5222                 if (!d->cached)
5223                         return 0;
5224                 int left = d->size - offset;
5225                 total_len = left > size ? size: left;
5226                 memcpy(buf, cache + offset, total_len);
5227                 return total_len;
5228         }
5229
5230         pid_t initpid = lookup_initpid_in_store(fc->pid);
5231         if (initpid <= 0)
5232                 initpid = fc->pid;
5233         cg = get_pid_cgroup(initpid, "memory");
5234         if (!cg)
5235                 return read_file("/proc/swaps", buf, size, d);
5236         prune_init_slice(cg);
5237
5238         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5239
5240         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5241                 goto err;
5242
5243         memusage = strtoul(memusage_str, NULL, 10);
5244
5245         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5246             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5247
5248                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5249                 memswusage = strtoul(memswusage_str, NULL, 10);
5250
5251                 swap_total = (memswlimit - memlimit) / 1024;
5252                 swap_free = (memswusage - memusage) / 1024;
5253         }
5254
5255         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5256
5257         /* When no mem + swap limit is specified or swapaccount=0*/
5258         if (!memswlimit) {
5259                 char *line = NULL;
5260                 size_t linelen = 0;
5261                 FILE *f = fopen("/proc/meminfo", "r");
5262
5263                 if (!f)
5264                         goto err;
5265
5266                 while (getline(&line, &linelen, f) != -1) {
5267                         if (startswith(line, "SwapTotal:")) {
5268                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
5269                         } else if (startswith(line, "SwapFree:")) {
5270                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
5271                         }
5272                 }
5273
5274                 free(line);
5275                 fclose(f);
5276         }
5277
5278         if (swap_total > 0) {
5279                 l = snprintf(d->buf + total_len, d->size - total_len,
5280                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5281                                 swap_total, swap_free);
5282                 total_len += l;
5283         }
5284
5285         if (total_len < 0 || l < 0) {
5286                 perror("Error writing to cache");
5287                 rv = 0;
5288                 goto err;
5289         }
5290
5291         d->cached = 1;
5292         d->size = (int)total_len;
5293
5294         if (total_len > size) total_len = size;
5295         memcpy(buf, d->buf, total_len);
5296         rv = total_len;
5297
5298 err:
5299         free(cg);
5300         free(memswlimit_str);
5301         free(memlimit_str);
5302         free(memusage_str);
5303         free(memswusage_str);
5304         return rv;
5305 }
5306 /*
5307  * Find the process pid from cgroup path.
5308  * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5309  * @pid_buf : put pid to pid_buf.
5310  * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5311  * @depth : the depth of cgroup in container.
5312  * @sum : return the number of pid.
5313  * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5314  */
5315 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5316 {
5317         DIR *dir;
5318         int fd;
5319         struct dirent *file;
5320         FILE *f = NULL;
5321         size_t linelen = 0;
5322         char *line = NULL;
5323         int pd;
5324         char *path_dir, *path;
5325         char **pid;
5326
5327         /* path = dpath + "/cgroup.procs" + /0 */
5328         do {
5329                 path = malloc(strlen(dpath) + 20);
5330         } while (!path);
5331
5332         strcpy(path, dpath);
5333         fd = openat(cfd, path, O_RDONLY);
5334         if (fd < 0)
5335                 goto out;
5336
5337         dir = fdopendir(fd);
5338         if (dir == NULL) {
5339                 close(fd);
5340                 goto out;
5341         }
5342
5343         while (((file = readdir(dir)) != NULL) && depth > 0) {
5344                 if (strncmp(file->d_name, ".", 1) == 0)
5345                         continue;
5346                 if (strncmp(file->d_name, "..", 1) == 0)
5347                         continue;
5348                 if (file->d_type == DT_DIR) {
5349                         /* path + '/' + d_name +/0 */
5350                         do {
5351                                 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5352                         } while (!path_dir);
5353                         strcpy(path_dir, path);
5354                         strcat(path_dir, "/");
5355                         strcat(path_dir, file->d_name);
5356                         pd = depth - 1;
5357                         sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5358                         free(path_dir);
5359                 }
5360         }
5361         closedir(dir);
5362
5363         strcat(path, "/cgroup.procs");
5364         fd = openat(cfd, path, O_RDONLY);
5365         if (fd < 0)
5366                 goto out;
5367
5368         f = fdopen(fd, "r");
5369         if (!f) {
5370                 close(fd);
5371                 goto out;
5372         }
5373
5374         while (getline(&line, &linelen, f) != -1) {
5375                 do {
5376                         pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5377                 } while (!pid);
5378                 *pid_buf = pid;
5379                 do {
5380                         *(*pid_buf + sum) = malloc(strlen(line) + 1);
5381                 } while (*(*pid_buf + sum) == NULL);
5382                 strcpy(*(*pid_buf + sum), line);
5383                 sum++;
5384         }
5385         fclose(f);
5386 out:
5387         if (line)
5388                 free(line);
5389         free(path);
5390         return sum;
5391 }
5392 /*
5393  * calc_load calculates the load according to the following formula:
5394  * load1 = load0 * exp + active * (1 - exp)
5395  *
5396  * @load1: the new loadavg.
5397  * @load0: the former loadavg.
5398  * @active: the total number of running pid at this moment.
5399  * @exp: the fixed-point defined in the beginning.
5400  */
5401 static unsigned long
5402 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5403 {
5404         unsigned long newload;
5405
5406         active = active > 0 ? active * FIXED_1 : 0;
5407         newload = load * exp + active * (FIXED_1 - exp);
5408         if (active >= load)
5409                 newload += FIXED_1 - 1;
5410
5411         return newload / FIXED_1;
5412 }
5413
5414 /*
5415  * Return 0 means that container p->cg is closed.
5416  * Return -1 means that error occurred in refresh.
5417  * Positive num equals the total number of pid.
5418  */
5419 static int refresh_load(struct load_node *p, char *path)
5420 {
5421         FILE *f = NULL;
5422         char **idbuf;
5423         char proc_path[256];
5424         int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5425         char *line = NULL;
5426         size_t linelen = 0;
5427         int sum, length;
5428         DIR *dp;
5429         struct dirent *file;
5430
5431         do {
5432                 idbuf = malloc(sizeof(char *));
5433         } while (!idbuf);
5434         sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5435         /*  normal exit  */
5436         if (sum == 0)
5437                 goto out;
5438
5439         for (i = 0; i < sum; i++) {
5440                 /*clean up '\n' */
5441                 length = strlen(idbuf[i])-1;
5442                 idbuf[i][length] = '\0';
5443                 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5444                 if (ret < 0 || ret > 255) {
5445                         lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5446                         i = sum;
5447                         sum = -1;
5448                         goto err_out;
5449                 }
5450
5451                 dp = opendir(proc_path);
5452                 if (!dp) {
5453                         lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5454                         continue;
5455                 }
5456                 while ((file = readdir(dp)) != NULL) {
5457                         if (strncmp(file->d_name, ".", 1) == 0)
5458                                 continue;
5459                         if (strncmp(file->d_name, "..", 1) == 0)
5460                                 continue;
5461                         total_pid++;
5462                         /* We make the biggest pid become last_pid.*/
5463                         ret = atof(file->d_name);
5464                         last_pid = (ret > last_pid) ? ret : last_pid;
5465
5466                         ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5467                         if (ret < 0 || ret > 255) {
5468                                 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5469                                 i = sum;
5470                                 sum = -1;
5471                                 closedir(dp);
5472                                 goto err_out;
5473                         }
5474                         f = fopen(proc_path, "r");
5475                         if (f != NULL) {
5476                                 while (getline(&line, &linelen, f) != -1) {
5477                                         /* Find State */
5478                                         if ((line[0] == 'S') && (line[1] == 't'))
5479                                                 break;
5480                                 }
5481                         if ((line[7] == 'R') || (line[7] == 'D'))
5482                                 run_pid++;
5483                         fclose(f);
5484                         }
5485                 }
5486                 closedir(dp);
5487         }
5488         /*Calculate the loadavg.*/
5489         p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5490         p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5491         p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5492         p->run_pid = run_pid;
5493         p->total_pid = total_pid;
5494         p->last_pid = last_pid;
5495
5496         free(line);
5497 err_out:
5498         for (; i > 0; i--)
5499                 free(idbuf[i-1]);
5500 out:
5501         free(idbuf);
5502         return sum;
5503 }
5504 /*
5505  * Traverse the hash table and update it.
5506  */
5507 void *load_begin(void *arg)
5508 {
5509
5510         char *path = NULL;
5511         int i, sum, length, ret;
5512         struct load_node *f;
5513         int first_node;
5514         clock_t time1, time2;
5515
5516         while (1) {
5517                 if (loadavg_stop == 1)
5518                         return NULL;
5519
5520                 time1 = clock();
5521                 for (i = 0; i < LOAD_SIZE; i++) {
5522                         pthread_mutex_lock(&load_hash[i].lock);
5523                         if (load_hash[i].next == NULL) {
5524                                 pthread_mutex_unlock(&load_hash[i].lock);
5525                                 continue;
5526                         }
5527                         f = load_hash[i].next;
5528                         first_node = 1;
5529                         while (f) {
5530                                 length = strlen(f->cg) + 2;
5531                                 do {
5532                                         /* strlen(f->cg) + '.' or '' + \0 */
5533                                         path = malloc(length);
5534                                 } while (!path);
5535
5536                                 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5537                                 if (ret < 0 || ret > length - 1) {
5538                                         /* snprintf failed, ignore the node.*/
5539                                         lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5540                                         goto out;
5541                                 }
5542                                 sum = refresh_load(f, path);
5543                                 if (sum == 0) {
5544                                         f = del_node(f, i);
5545                                 } else {
5546 out:                                    f = f->next;
5547                                 }
5548                                 free(path);
5549                                 /* load_hash[i].lock locks only on the first node.*/
5550                                 if (first_node == 1) {
5551                                         first_node = 0;
5552                                         pthread_mutex_unlock(&load_hash[i].lock);
5553                                 }
5554                         }
5555                 }
5556
5557                 if (loadavg_stop == 1)
5558                         return NULL;
5559
5560                 time2 = clock();
5561                 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5562         }
5563 }
5564
5565 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5566                 struct fuse_file_info *fi)
5567 {
5568         struct fuse_context *fc = fuse_get_context();
5569         struct file_info *d = (struct file_info *)fi->fh;
5570         pid_t initpid;
5571         char *cg;
5572         size_t total_len = 0;
5573         char *cache = d->buf;
5574         struct load_node *n;
5575         int hash;
5576         int cfd, rv = 0;
5577         unsigned long a, b, c;
5578
5579         if (offset) {
5580                 if (offset > d->size)
5581                         return -EINVAL;
5582                 if (!d->cached)
5583                         return 0;
5584                 int left = d->size - offset;
5585                 total_len = left > size ? size : left;
5586                 memcpy(buf, cache + offset, total_len);
5587                 return total_len;
5588         }
5589         if (!loadavg)
5590                 return read_file("/proc/loadavg", buf, size, d);
5591
5592         initpid = lookup_initpid_in_store(fc->pid);
5593         if (initpid <= 0)
5594                 initpid = fc->pid;
5595         cg = get_pid_cgroup(initpid, "cpu");
5596         if (!cg)
5597                 return read_file("/proc/loadavg", buf, size, d);
5598
5599         prune_init_slice(cg);
5600         hash = calc_hash(cg) % LOAD_SIZE;
5601         n = locate_node(cg, hash);
5602
5603         /* First time */
5604         if (n == NULL) {
5605                 if (!find_mounted_controller("cpu", &cfd)) {
5606                         /*
5607                          * In locate_node() above, pthread_rwlock_unlock() isn't used
5608                          * because delete is not allowed before read has ended.
5609                          */
5610                         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5611                         rv = 0;
5612                         goto err;
5613                 }
5614                 do {
5615                         n = malloc(sizeof(struct load_node));
5616                 } while (!n);
5617
5618                 do {
5619                         n->cg = malloc(strlen(cg)+1);
5620                 } while (!n->cg);
5621                 strcpy(n->cg, cg);
5622                 n->avenrun[0] = 0;
5623                 n->avenrun[1] = 0;
5624                 n->avenrun[2] = 0;
5625                 n->run_pid = 0;
5626                 n->total_pid = 1;
5627                 n->last_pid = initpid;
5628                 n->cfd = cfd;
5629                 insert_node(&n, hash);
5630         }
5631         a = n->avenrun[0] + (FIXED_1/200);
5632         b = n->avenrun[1] + (FIXED_1/200);
5633         c = n->avenrun[2] + (FIXED_1/200);
5634         total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5635                 LOAD_INT(a), LOAD_FRAC(a),
5636                 LOAD_INT(b), LOAD_FRAC(b),
5637                 LOAD_INT(c), LOAD_FRAC(c),
5638                 n->run_pid, n->total_pid, n->last_pid);
5639         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5640         if (total_len < 0 || total_len >=  d->buflen) {
5641                 lxcfs_error("%s\n", "Failed to write to cache");
5642                 rv = 0;
5643                 goto err;
5644         }
5645         d->size = (int)total_len;
5646         d->cached = 1;
5647
5648         if (total_len > size)
5649                 total_len = size;
5650         memcpy(buf, d->buf, total_len);
5651         rv = total_len;
5652
5653 err:
5654         free(cg);
5655         return rv;
5656 }
5657 /* Return a positive number on success, return 0 on failure.*/
5658 pthread_t load_daemon(int load_use)
5659 {
5660         int ret;
5661         pthread_t pid;
5662
5663         ret = init_load();
5664         if (ret == -1) {
5665                 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5666                 return 0;
5667         }
5668         ret = pthread_create(&pid, NULL, load_begin, NULL);
5669         if (ret != 0) {
5670                 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5671                 load_free();
5672                 return 0;
5673         }
5674         /* use loadavg, here loadavg = 1*/
5675         loadavg = load_use;
5676         return pid;
5677 }
5678
5679 /* Returns 0 on success. */
5680 int stop_load_daemon(pthread_t pid)
5681 {
5682         int s;
5683
5684         /* Signal the thread to gracefully stop */
5685         loadavg_stop = 1;
5686
5687         s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5688         if (s != 0) {
5689                 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5690                 return -1;
5691         }
5692
5693         load_free();
5694         loadavg_stop = 0;
5695
5696         return 0;
5697 }
5698
5699 static off_t get_procfile_size(const char *which)
5700 {
5701         FILE *f = fopen(which, "r");
5702         char *line = NULL;
5703         size_t len = 0;
5704         ssize_t sz, answer = 0;
5705         if (!f)
5706                 return 0;
5707
5708         while ((sz = getline(&line, &len, f)) != -1)
5709                 answer += sz;
5710         fclose (f);
5711         free(line);
5712
5713         return answer;
5714 }
5715
5716 int proc_getattr(const char *path, struct stat *sb)
5717 {
5718         struct timespec now;
5719
5720         memset(sb, 0, sizeof(struct stat));
5721         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5722                 return -EINVAL;
5723         sb->st_uid = sb->st_gid = 0;
5724         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5725         if (strcmp(path, "/proc") == 0) {
5726                 sb->st_mode = S_IFDIR | 00555;
5727                 sb->st_nlink = 2;
5728                 return 0;
5729         }
5730         if (strcmp(path, "/proc/meminfo") == 0 ||
5731                         strcmp(path, "/proc/cpuinfo") == 0 ||
5732                         strcmp(path, "/proc/uptime") == 0 ||
5733                         strcmp(path, "/proc/stat") == 0 ||
5734                         strcmp(path, "/proc/diskstats") == 0 ||
5735                         strcmp(path, "/proc/swaps") == 0 ||
5736                         strcmp(path, "/proc/loadavg") == 0) {
5737                 sb->st_size = 0;
5738                 sb->st_mode = S_IFREG | 00444;
5739                 sb->st_nlink = 1;
5740                 return 0;
5741         }
5742
5743         return -ENOENT;
5744 }
5745
5746 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5747                 struct fuse_file_info *fi)
5748 {
5749         if (filler(buf, ".", NULL, 0) != 0 ||
5750             filler(buf, "..", NULL, 0) != 0 ||
5751             filler(buf, "cpuinfo", NULL, 0) != 0 ||
5752             filler(buf, "meminfo", NULL, 0) != 0 ||
5753             filler(buf, "stat", NULL, 0) != 0 ||
5754             filler(buf, "uptime", NULL, 0) != 0 ||
5755             filler(buf, "diskstats", NULL, 0) != 0 ||
5756             filler(buf, "swaps", NULL, 0) != 0   ||
5757             filler(buf, "loadavg", NULL, 0) != 0)
5758                 return -EINVAL;
5759         return 0;
5760 }
5761
5762 int proc_open(const char *path, struct fuse_file_info *fi)
5763 {
5764         int type = -1;
5765         struct file_info *info;
5766
5767         if (strcmp(path, "/proc/meminfo") == 0)
5768                 type = LXC_TYPE_PROC_MEMINFO;
5769         else if (strcmp(path, "/proc/cpuinfo") == 0)
5770                 type = LXC_TYPE_PROC_CPUINFO;
5771         else if (strcmp(path, "/proc/uptime") == 0)
5772                 type = LXC_TYPE_PROC_UPTIME;
5773         else if (strcmp(path, "/proc/stat") == 0)
5774                 type = LXC_TYPE_PROC_STAT;
5775         else if (strcmp(path, "/proc/diskstats") == 0)
5776                 type = LXC_TYPE_PROC_DISKSTATS;
5777         else if (strcmp(path, "/proc/swaps") == 0)
5778                 type = LXC_TYPE_PROC_SWAPS;
5779         else if (strcmp(path, "/proc/loadavg") == 0)
5780                 type = LXC_TYPE_PROC_LOADAVG;
5781         if (type == -1)
5782                 return -ENOENT;
5783
5784         info = malloc(sizeof(*info));
5785         if (!info)
5786                 return -ENOMEM;
5787
5788         memset(info, 0, sizeof(*info));
5789         info->type = type;
5790
5791         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5792         do {
5793                 info->buf = malloc(info->buflen);
5794         } while (!info->buf);
5795         memset(info->buf, 0, info->buflen);
5796         /* set actual size to buffer size */
5797         info->size = info->buflen;
5798
5799         fi->fh = (unsigned long)info;
5800         return 0;
5801 }
5802
5803 int proc_access(const char *path, int mask)
5804 {
5805         if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5806                 return 0;
5807
5808         /* these are all read-only */
5809         if ((mask & ~R_OK) != 0)
5810                 return -EACCES;
5811         return 0;
5812 }
5813
5814 int proc_release(const char *path, struct fuse_file_info *fi)
5815 {
5816         do_release_file_info(fi);
5817         return 0;
5818 }
5819
5820 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5821                 struct fuse_file_info *fi)
5822 {
5823         struct file_info *f = (struct file_info *) fi->fh;
5824
5825         switch (f->type) {
5826         case LXC_TYPE_PROC_MEMINFO:
5827                 return proc_meminfo_read(buf, size, offset, fi);
5828         case LXC_TYPE_PROC_CPUINFO:
5829                 return proc_cpuinfo_read(buf, size, offset, fi);
5830         case LXC_TYPE_PROC_UPTIME:
5831                 return proc_uptime_read(buf, size, offset, fi);
5832         case LXC_TYPE_PROC_STAT:
5833                 return proc_stat_read(buf, size, offset, fi);
5834         case LXC_TYPE_PROC_DISKSTATS:
5835                 return proc_diskstats_read(buf, size, offset, fi);
5836         case LXC_TYPE_PROC_SWAPS:
5837                 return proc_swaps_read(buf, size, offset, fi);
5838         case LXC_TYPE_PROC_LOADAVG:
5839                 return proc_loadavg_read(buf, size, offset, fi);
5840         default:
5841                 return -EINVAL;
5842         }
5843 }
5844
5845 /*
5846  * Functions needed to setup cgroups in the __constructor__.
5847  */
5848
5849 static bool mkdir_p(const char *dir, mode_t mode)
5850 {
5851         const char *tmp = dir;
5852         const char *orig = dir;
5853         char *makeme;
5854
5855         do {
5856                 dir = tmp + strspn(tmp, "/");
5857                 tmp = dir + strcspn(dir, "/");
5858                 makeme = strndup(orig, dir - orig);
5859                 if (!makeme)
5860                         return false;
5861                 if (mkdir(makeme, mode) && errno != EEXIST) {
5862                         lxcfs_error("Failed to create directory '%s': %s.\n",
5863                                 makeme, strerror(errno));
5864                         free(makeme);
5865                         return false;
5866                 }
5867                 free(makeme);
5868         } while(tmp != dir);
5869
5870         return true;
5871 }
5872
5873 static bool umount_if_mounted(void)
5874 {
5875         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5876                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5877                 return false;
5878         }
5879         return true;
5880 }
5881
5882 /* __typeof__ should be safe to use with all compilers. */
5883 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5884 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5885 {
5886         return (fs->f_type == (fs_type_magic)magic_val);
5887 }
5888
5889 /*
5890  * looking at fs/proc_namespace.c, it appears we can
5891  * actually expect the rootfs entry to very specifically contain
5892  * " - rootfs rootfs "
5893  * IIUC, so long as we've chrooted so that rootfs is not our root,
5894  * the rootfs entry should always be skipped in mountinfo contents.
5895  */
5896 static bool is_on_ramfs(void)
5897 {
5898         FILE *f;
5899         char *p, *p2;
5900         char *line = NULL;
5901         size_t len = 0;
5902         int i;
5903
5904         f = fopen("/proc/self/mountinfo", "r");
5905         if (!f)
5906                 return false;
5907
5908         while (getline(&line, &len, f) != -1) {
5909                 for (p = line, i = 0; p && i < 4; i++)
5910                         p = strchr(p + 1, ' ');
5911                 if (!p)
5912                         continue;
5913                 p2 = strchr(p + 1, ' ');
5914                 if (!p2)
5915                         continue;
5916                 *p2 = '\0';
5917                 if (strcmp(p + 1, "/") == 0) {
5918                         // this is '/'.  is it the ramfs?
5919                         p = strchr(p2 + 1, '-');
5920                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5921                                 free(line);
5922                                 fclose(f);
5923                                 return true;
5924                         }
5925                 }
5926         }
5927         free(line);
5928         fclose(f);
5929         return false;
5930 }
5931
5932 static int pivot_enter()
5933 {
5934         int ret = -1, oldroot = -1, newroot = -1;
5935
5936         oldroot = open("/", O_DIRECTORY | O_RDONLY);
5937         if (oldroot < 0) {
5938                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5939                 return ret;
5940         }
5941
5942         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5943         if (newroot < 0) {
5944                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5945                 goto err;
5946         }
5947
5948         /* change into new root fs */
5949         if (fchdir(newroot) < 0) {
5950                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5951                 goto err;
5952         }
5953
5954         /* pivot_root into our new root fs */
5955         if (pivot_root(".", ".") < 0) {
5956                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5957                 goto err;
5958         }
5959
5960         /*
5961          * At this point the old-root is mounted on top of our new-root.
5962          * To unmounted it we must not be chdir'd into it, so escape back
5963          * to the old-root.
5964          */
5965         if (fchdir(oldroot) < 0) {
5966                 lxcfs_error("%s\n", "Failed to enter old root.");
5967                 goto err;
5968         }
5969
5970         if (umount2(".", MNT_DETACH) < 0) {
5971                 lxcfs_error("%s\n", "Failed to detach old root.");
5972                 goto err;
5973         }
5974
5975         if (fchdir(newroot) < 0) {
5976                 lxcfs_error("%s\n", "Failed to re-enter new root.");
5977                 goto err;
5978         }
5979
5980         ret = 0;
5981
5982 err:
5983         if (oldroot > 0)
5984                 close(oldroot);
5985         if (newroot > 0)
5986                 close(newroot);
5987
5988         return ret;
5989 }
5990
5991 static int chroot_enter()
5992 {
5993         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5994                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5995                 return -1;
5996         }
5997
5998         if (chroot(".") < 0) {
5999                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6000                 return -1;
6001         }
6002
6003         if (chdir("/") < 0) {
6004                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6005                 return -1;
6006         }
6007
6008         return 0;
6009 }
6010
6011 static int permute_and_enter(void)
6012 {
6013         struct statfs sb;
6014
6015         if (statfs("/", &sb) < 0) {
6016                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6017                 return -1;
6018         }
6019
6020         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6021          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6022          * /proc/1/mountinfo. */
6023         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6024                 return chroot_enter();
6025
6026         if (pivot_enter() < 0) {
6027                 lxcfs_error("%s\n", "Could not perform pivot root.");
6028                 return -1;
6029         }
6030
6031         return 0;
6032 }
6033
6034 /* Prepare our new clean root. */
6035 static int permute_prepare(void)
6036 {
6037         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6038                 lxcfs_error("%s\n", "Failed to create directory for new root.");
6039                 return -1;
6040         }
6041
6042         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6043                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6044                 return -1;
6045         }
6046
6047         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6048                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6049                 return -1;
6050         }
6051
6052         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6053                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6054                 return -1;
6055         }
6056
6057         return 0;
6058 }
6059
6060 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6061 static bool permute_root(void)
6062 {
6063         /* Prepare new root. */
6064         if (permute_prepare() < 0)
6065                 return false;
6066
6067         /* Pivot into new root. */
6068         if (permute_and_enter() < 0)
6069                 return false;
6070
6071         return true;
6072 }
6073
6074 static int preserve_mnt_ns(int pid)
6075 {
6076         int ret;
6077         size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6078         char path[len];
6079
6080         ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6081         if (ret < 0 || (size_t)ret >= len)
6082                 return -1;
6083
6084         return open(path, O_RDONLY | O_CLOEXEC);
6085 }
6086
6087 static bool cgfs_prepare_mounts(void)
6088 {
6089         if (!mkdir_p(BASEDIR, 0700)) {
6090                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6091                 return false;
6092         }
6093
6094         if (!umount_if_mounted()) {
6095                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6096                 return false;
6097         }
6098
6099         if (unshare(CLONE_NEWNS) < 0) {
6100                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6101                 return false;
6102         }
6103
6104         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6105         if (cgroup_mount_ns_fd < 0) {
6106                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6107                 return false;
6108         }
6109
6110         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6111                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6112                 return false;
6113         }
6114
6115         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6116                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6117                 return false;
6118         }
6119
6120         return true;
6121 }
6122
6123 static bool cgfs_mount_hierarchies(void)
6124 {
6125         char *target;
6126         size_t clen, len;
6127         int i, ret;
6128
6129         for (i = 0; i < num_hierarchies; i++) {
6130                 char *controller = hierarchies[i];
6131
6132                 clen = strlen(controller);
6133                 len = strlen(BASEDIR) + clen + 2;
6134                 target = malloc(len);
6135                 if (!target)
6136                         return false;
6137
6138                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6139                 if (ret < 0 || ret >= len) {
6140                         free(target);
6141                         return false;
6142                 }
6143                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6144                         free(target);
6145                         return false;
6146                 }
6147                 if (!strcmp(controller, "unified"))
6148                         ret = mount("none", target, "cgroup2", 0, NULL);
6149                 else
6150                         ret = mount(controller, target, "cgroup", 0, controller);
6151                 if (ret < 0) {
6152                         lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6153                         free(target);
6154                         return false;
6155                 }
6156
6157                 fd_hierarchies[i] = open(target, O_DIRECTORY);
6158                 if (fd_hierarchies[i] < 0) {
6159                         free(target);
6160                         return false;
6161                 }
6162                 free(target);
6163         }
6164         return true;
6165 }
6166
6167 static bool cgfs_setup_controllers(void)
6168 {
6169         if (!cgfs_prepare_mounts())
6170                 return false;
6171
6172         if (!cgfs_mount_hierarchies()) {
6173                 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6174                 return false;
6175         }
6176
6177         if (!permute_root())
6178                 return false;
6179
6180         return true;
6181 }
6182
6183 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6184 {
6185         FILE *f;
6186         char *cret, *line = NULL;
6187         char cwd[MAXPATHLEN];
6188         size_t len = 0;
6189         int i, init_ns = -1;
6190         bool found_unified = false;
6191
6192         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
6193                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
6194                 return;
6195         }
6196
6197         while (getline(&line, &len, f) != -1) {
6198                 char *idx, *p, *p2;
6199
6200                 p = strchr(line, ':');
6201                 if (!p)
6202                         goto out;
6203                 idx = line;
6204                 *(p++) = '\0';
6205
6206                 p2 = strrchr(p, ':');
6207                 if (!p2)
6208                         goto out;
6209                 *p2 = '\0';
6210
6211                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6212                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6213                  * because it parses out the empty string "" and later on passes
6214                  * it to mount(). Let's skip such entries.
6215                  */
6216                 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6217                         found_unified = true;
6218                         p = "unified";
6219                 }
6220
6221                 if (!store_hierarchy(line, p))
6222                         goto out;
6223         }
6224
6225         /* Preserve initial namespace. */
6226         init_ns = preserve_mnt_ns(getpid());
6227         if (init_ns < 0) {
6228                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6229                 goto out;
6230         }
6231
6232         fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
6233         if (!fd_hierarchies) {
6234                 lxcfs_error("%s\n", strerror(errno));
6235                 goto out;
6236         }
6237
6238         for (i = 0; i < num_hierarchies; i++)
6239                 fd_hierarchies[i] = -1;
6240
6241         cret = getcwd(cwd, MAXPATHLEN);
6242         if (!cret)
6243                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6244
6245         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6246          * to privately mount lxcfs cgroups. */
6247         if (!cgfs_setup_controllers()) {
6248                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6249                 goto out;
6250         }
6251
6252         if (setns(init_ns, 0) < 0) {
6253                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6254                 goto out;
6255         }
6256
6257         if (!cret || chdir(cwd) < 0)
6258                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6259
6260         if (!init_cpuview()) {
6261                 lxcfs_error("%s\n", "failed to init CPU view");
6262                 goto out;
6263         }
6264
6265         print_subsystems();
6266
6267 out:
6268         free(line);
6269         fclose(f);
6270         if (init_ns >= 0)
6271                 close(init_ns);
6272 }
6273
6274 static void __attribute__((destructor)) free_subsystems(void)
6275 {
6276         int i;
6277
6278         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6279
6280         for (i = 0; i < num_hierarchies; i++) {
6281                 if (hierarchies[i])
6282                         free(hierarchies[i]);
6283                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6284                         close(fd_hierarchies[i]);
6285         }
6286         free(hierarchies);
6287         free(fd_hierarchies);
6288         free_cpuview();
6289
6290         if (cgroup_mount_ns_fd >= 0)
6291                 close(cgroup_mount_ns_fd);
6292 }