bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdarg.h>
  21 #include <stdbool.h>
  22 #include <stdint.h>
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <time.h>
  27 #include <unistd.h>
  28 #include <wait.h>
  29 #include <linux/magic.h>
  30 #include <linux/sched.h>
  31 #include <sys/epoll.h>
  32 #include <sys/mman.h>
  33 #include <sys/mount.h>
  34 #include <sys/param.h>
  35 #include <sys/socket.h>
  36 #include <sys/syscall.h>
  37 #include <sys/sysinfo.h>
  38 #include <sys/vfs.h>
  39
  40 #include "bindings.h"
  41 #include "memory_utils.h"
  42 #include "config.h"
  43
  /*
   * pivot_root() is not wrapped by every C library.  When the build did not
   * detect one (HAVE_PIVOT_ROOT unset) a local version is provided: it issues
   * the raw system call, or fails with ENOSYS when the syscall number is not
   * known at compile time.  Otherwise the library's symbol is declared.
   */
  #ifdef HAVE_PIVOT_ROOT
  extern int pivot_root(const char *new_root, const char *put_old);
  #else
  static int pivot_root(const char *new_root, const char *put_old)
  {
  #ifndef __NR_pivot_root
          errno = ENOSYS;
          return -1;
  #else
          return syscall(__NR_pivot_root, new_root, put_old);
  #endif
  }
  #endif
  58
  /*
   * CPU accounting counters split into user, system and idle buckets.
   * NOTE(review): presumably one instance per CPU; the unit of the counters is
   * not fixed by this declaration -- confirm against the code that fills it.
   */
  struct cpuacct_usage {
          uint64_t user;   /* user counter */
          uint64_t system; /* system counter */
          uint64_t idle;   /* idle counter */
          bool online;     /* presumably: the CPU is online -- confirm at users */
  };
  65
  /* Parameters of the load-average hash table. */
  #define LOAD_SIZE 100 /* number of buckets in the hash table */
  #define FLUSH_TIME 5  /* the flush rate */
  #define DEPTH_DIR 3   /* the depth of per cgroup */

  /* Fixed-point helpers used to calculate the load average. */
  #define FSHIFT 11             /* nr of bits of precision */
  #define FIXED_1 (1 << FSHIFT) /* 1.0 as fixed-point */
  #define EXP_1 1884            /* 1/exp(5sec/1min) as fixed-point */
  #define EXP_5 2014            /* 1/exp(5sec/5min) */
  #define EXP_15 2037           /* 1/exp(5sec/15min) */
  /* Integer part of a fixed-point value. */
  #define LOAD_INT(x) ((x) >> FSHIFT)
  /* Fractional part of a fixed-point value, scaled to two decimal digits. */
  #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

  /*
   * Switch consulted by proc_loadavg_read():
   * 1 means use loadavg, 0 means not use.
   */
  static int loadavg = 0;
  /* NOTE(review): presumably a stop request for the loadavg worker -- confirm. */
  static volatile sig_atomic_t loadavg_stop = 0;
  /*
   * calc_hash - hash a NUL-terminated string with the ELF hash algorithm.
   * @name: string to hash.
   *
   * Returns the hash with the sign bit masked off, so the result is never
   * negative.
   */
  static int calc_hash(const char *name)
  {
          unsigned int h = 0;
          const char *p;

          for (p = name; *p != '\0'; p++) {
                  unsigned int top;

                  h = (h << 4) + *p;
                  /* Fold the top nibble back into the low bits, then clear it. */
                  top = h & 0xf0000000;
                  if (top)
                          h ^= top >> 24;
                  h &= ~top;
          }

          return h & 0x7fffffff;
  }
  98
  /* One entry of a load-average hash bucket, linked into a doubly linked list. */
  struct load_node {
          char *cg;                 /* cgroup name; heap memory released with the node */
          unsigned long avenrun[3]; /* Load averages */
          unsigned int run_pid;
          unsigned int total_pid;
          unsigned int last_pid;
          int cfd;                  /* The file descriptor of the mounted cgroup */
          struct load_node *next;   /* following node in the bucket, NULL at the tail */
          struct load_node **pre;   /* address of the pointer that points at this node */
  };
 109
 110 struct load_head {
 111         /*
 112          * The lock is about insert load_node and refresh load_node.To the first
 113          * load_node of each hash bucket, insert and refresh in this hash bucket is
 114          * mutually exclusive.
 115          */
 116         pthread_mutex_t lock;
 117         /*
 118          * The rdlock is about read loadavg and delete load_node.To each hash
 119          * bucket, read and delete is mutually exclusive. But at the same time, we
 120          * allow paratactic read operation. This rdlock is at list level.
 121          */
 122         pthread_rwlock_t rdlock;
 123         /*
 124          * The rilock is about read loadavg and insert load_node.To the first
 125          * load_node of each hash bucket, read and insert is mutually exclusive.
 126          * But at the same time, we allow paratactic read operation.
 127          */
 128         pthread_rwlock_t rilock;
 129         struct load_node *next;
 130 };
 131
 132 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 133 /*
 134  * init_load initialize the hash table.
 135  * Return 0 on success, return -1 on failure.
 136  */
 137 static int init_load(void)
 138 {
 139         int i;
 140         int ret;
 141
 142         for (i = 0; i < LOAD_SIZE; i++) {
 143                 load_hash[i].next = NULL;
 144                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 145                 if (ret != 0) {
 146                         lxcfs_error("%s\n", "Failed to initialize lock");
 147                         goto out3;
 148                 }
 149                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 150                 if (ret != 0) {
 151                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 152                         goto out2;
 153                 }
 154                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 155                 if (ret != 0) {
 156                         lxcfs_error("%s\n", "Failed to initialize rilock");
 157                         goto out1;
 158                 }
 159         }
 160         return 0;
 161 out1:
 162         pthread_rwlock_destroy(&load_hash[i].rdlock);
 163 out2:
 164         pthread_mutex_destroy(&load_hash[i].lock);
 165 out3:
 166         while (i > 0) {
 167                 i--;
 168                 pthread_mutex_destroy(&load_hash[i].lock);
 169                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 170                 pthread_rwlock_destroy(&load_hash[i].rilock);
 171         }
 172         return -1;
 173 }
 174
 175 static void insert_node(struct load_node **n, int locate)
 176 {
 177         struct load_node *f;
 178
 179         pthread_mutex_lock(&load_hash[locate].lock);
 180         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 181         f = load_hash[locate].next;
 182         load_hash[locate].next = *n;
 183
 184         (*n)->pre = &(load_hash[locate].next);
 185         if (f)
 186                 f->pre = &((*n)->next);
 187         (*n)->next = f;
 188         pthread_mutex_unlock(&load_hash[locate].lock);
 189         pthread_rwlock_unlock(&load_hash[locate].rilock);
 190 }
 191 /*
 192  * locate_node() finds special node. Not return NULL means success.
 193  * It should be noted that rdlock isn't unlocked at the end of code
 194  * because this function is used to read special node. Delete is not
 195  * allowed before read has ended.
 196  * unlock rdlock only in proc_loadavg_read().
 197  */
 198 static struct load_node *locate_node(char *cg, int locate)
 199 {
 200         struct load_node *f = NULL;
 201         int i = 0;
 202
 203         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 204         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 205         if (load_hash[locate].next == NULL) {
 206                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 207                 return f;
 208         }
 209         f = load_hash[locate].next;
 210         pthread_rwlock_unlock(&load_hash[locate].rilock);
 211         while (f && ((i = strcmp(f->cg, cg)) != 0))
 212                 f = f->next;
 213         return f;
 214 }
 215 /* Delete the load_node n and return the next node of it. */
 216 static struct load_node *del_node(struct load_node *n, int locate)
 217 {
 218         struct load_node *g;
 219
 220         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 221         if (n->next == NULL) {
 222                 *(n->pre) = NULL;
 223         } else {
 224                 *(n->pre) = n->next;
 225                 n->next->pre = n->pre;
 226         }
 227         g = n->next;
 228         free(n->cg);
 229         free(n);
 230         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 231         return g;
 232 }
 233
 234 static void load_free(void)
 235 {
 236         int i;
 237         struct load_node *f, *p;
 238
 239         for (i = 0; i < LOAD_SIZE; i++) {
 240                 pthread_mutex_lock(&load_hash[i].lock);
 241                 pthread_rwlock_wrlock(&load_hash[i].rilock);
 242                 pthread_rwlock_wrlock(&load_hash[i].rdlock);
 243                 if (load_hash[i].next == NULL) {
 244                         pthread_mutex_unlock(&load_hash[i].lock);
 245                         pthread_mutex_destroy(&load_hash[i].lock);
 246                         pthread_rwlock_unlock(&load_hash[i].rilock);
 247                         pthread_rwlock_destroy(&load_hash[i].rilock);
 248                         pthread_rwlock_unlock(&load_hash[i].rdlock);
 249                         pthread_rwlock_destroy(&load_hash[i].rdlock);
 250                         continue;
 251                 }
 252                 for (f = load_hash[i].next; f; ) {
 253                         free(f->cg);
 254                         p = f->next;
 255                         free(f);
 256                         f = p;
 257                 }
 258                 pthread_mutex_unlock(&load_hash[i].lock);
 259                 pthread_mutex_destroy(&load_hash[i].lock);
 260                 pthread_rwlock_unlock(&load_hash[i].rilock);
 261                 pthread_rwlock_destroy(&load_hash[i].rilock);
 262                 pthread_rwlock_unlock(&load_hash[i].rdlock);
 263                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 264         }
 265 }
 266
 /* Per-cgroup node holding the data for the CPU view. */
 struct cg_proc_stat {
         char *cg;                    /* cgroup name; freed together with the node */
         struct cpuacct_usage *usage; /* Real usage as read from the host's /proc/stat */
         struct cpuacct_usage *view;  /* Usage stats reported to the container */
         int cpu_count;               /* presumably the length of usage/view -- confirm */
         pthread_mutex_t lock;        /* For node manipulation */
         struct cg_proc_stat *next;   /* following node in the same list */
 };
 276
 /* Head of one list of cg_proc_stat nodes. */
 struct cg_proc_stat_head {
         struct cg_proc_stat *next; /* first node of the list, NULL when empty */
         time_t lastcheck;          /* set to the current time when the head is created */

         /*
          * Guards access to the list: reading can be parallel, pruning is
          * exclusive.
          */
         pthread_rwlock_t lock;
 };
 286
 /* Number of list heads in the CPU view history table. */
 #define CPUVIEW_HASH_SIZE 100
 /* CPU view history: one heap-allocated list head per slot, NULL until set up. */
 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
 289
 290 static bool cpuview_init_head(struct cg_proc_stat_head **head)
 291 {
 292         *head = malloc(sizeof(struct cg_proc_stat_head));
 293         if (!(*head)) {
 294                 lxcfs_error("%s\n", strerror(errno));
 295                 return false;
 296         }
 297
 298         (*head)->lastcheck = time(NULL);
 299         (*head)->next = NULL;
 300
 301         if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
 302                 lxcfs_error("%s\n", "Failed to initialize list lock");
 303                 free(*head);
 304                 return false;
 305         }
 306
 307         return true;
 308 }
 309
 310 static bool init_cpuview()
 311 {
 312         int i;
 313
 314         for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
 315                 proc_stat_history[i] = NULL;
 316
 317         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 318                 if (!cpuview_init_head(&proc_stat_history[i]))
 319                         goto err;
 320         }
 321
 322         return true;
 323
 324 err:
 325         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 326                 if (proc_stat_history[i]) {
 327                         free(proc_stat_history[i]);
 328                         proc_stat_history[i] = NULL;
 329                 }
 330         }
 331
 332         return false;
 333 }
 334
 335 static void free_proc_stat_node(struct cg_proc_stat *node)
 336 {
 337         pthread_mutex_destroy(&node->lock);
 338         free(node->cg);
 339         free(node->usage);
 340         free(node->view);
 341         free(node);
 342 }
 343
 344 static void cpuview_free_head(struct cg_proc_stat_head *head)
 345 {
 346         struct cg_proc_stat *node, *tmp;
 347
 348         if (head->next) {
 349                 node = head->next;
 350
 351                 for (;;) {
 352                         tmp = node;
 353                         node = node->next;
 354                         free_proc_stat_node(tmp);
 355
 356                         if (!node)
 357                                 break;
 358                 }
 359         }
 360
 361         pthread_rwlock_destroy(&head->lock);
 362         free(head);
 363 }
 364
 365 static void free_cpuview()
 366 {
 367         int i;
 368
 369         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 370                 if (proc_stat_history[i])
 371                         cpuview_free_head(proc_stat_history[i]);
 372         }
 373 }
 374
 /*
  * A table caching which pid is init for a pid namespace.
  * Looking up the init pid for $qpid works like this:
  * 1. Stat /proc/$qpid/ns/pid.
  * 2. Check whether the resulting ino_t is in our store.
  *   a. If it is not, fork a child in qpid's namespace that sends us
  *      ucred.pid = 1, and read the initpid.  Cache the initpid and the
  *      creation time of /proc/initpid in a new store entry.
  *   b. If it is, verify that /proc/initpid still matches what we have
  *      saved.  If it does not, clear the store entry and go back to a.
  *      If it does, return the cached initpid.
  */
 struct pidns_init_store {
         ino_t ino;                     /* inode number for /proc/$pid/ns/pid */
         pid_t initpid;                 /* the pid of init in that ns */
         long int ctime;                /* the time at which /proc/$initpid was created */
         struct pidns_init_store *next; /* following entry in the same hash bucket */
         long int lastcheck;            /* time of the last successful lookup of this entry */
 };
 396
 /*
  * Size of the init pid cache.  The original author's remark on this value:
  * look at how they are allocated in the kernel.
  */
 #define PIDNS_HASH_SIZE 4096
 /* Map a value (an inode number, for the cache) to its bucket index. */
 #define HASH(x) ((x) % PIDNS_HASH_SIZE)

 /* Buckets of the init pid cache; each one is a singly linked list. */
 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 /* Taken by store_lock() and released by store_unlock(). */
 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 /*
  * lock_mutex - lock @l or terminate the process.
  *
  * A failing pthread_mutex_lock() is logged together with its error code and
  * the process then exits with status 1.
  */
 static void lock_mutex(pthread_mutex_t *l)
 {
         int err = pthread_mutex_lock(l);

         if (err == 0)
                 return;

         lxcfs_error("returned:%d %s\n", err, strerror(err));
         exit(1);
 }
 412
 /*
  * Number of hierarchies mounted.
  * READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
  */
 static int num_hierarchies;

 /*
  * Hierarchies mounted {cpuset, blkio, ...}.
  * Initialized via __constructor__ collect_and_mount_subsystems() and
  * READ-ONLY once it has run.
  */
 static char **hierarchies;

 /*
  * Open file descriptors: @fd_hierarchies[i] refers to cgroup
  * @hierarchies[i].  The cgroups are mounted in a private mount namespace.
  * Initialized via __constructor__ collect_and_mount_subsystems() and
  * READ-ONLY once it has run.
  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
  * mounts and respective files in the private namespace even when located in
  * another namespace, by using the *at() family of functions
  * {openat(), fchownat(), ...}.
  */
 static int *fd_hierarchies;
 /* Printed as "mount namespace" by print_subsystems(); -1 until it is set. */
 static int cgroup_mount_ns_fd = -1;
 433
 /*
  * unlock_mutex - unlock @l or terminate the process.
  *
  * A failing pthread_mutex_unlock() is logged together with its error code
  * and the process then exits with status 1.
  */
 static void unlock_mutex(pthread_mutex_t *l)
 {
         int err = pthread_mutex_unlock(l);

         if (err == 0)
                 return;

         lxcfs_error("returned:%d %s\n", err, strerror(err));
         exit(1);
 }
 443
 444 static void store_lock(void)
 445 {
 446         lock_mutex(&pidns_store_mutex);
 447 }
 448
 449 static void store_unlock(void)
 450 {
 451         unlock_mutex(&pidns_store_mutex);
 452 }
 453
 454 /* Must be called under store_lock */
 455 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 456 {
 457         struct stat initsb;
 458         char fnam[100];
 459
 460         snprintf(fnam, 100, "/proc/%d", e->initpid);
 461         if (stat(fnam, &initsb) < 0)
 462                 return false;
 463
 464         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 465                     initsb.st_ctime, e->initpid);
 466
 467         if (e->ctime != initsb.st_ctime)
 468                 return false;
 469         return true;
 470 }
 471
 472 /* Must be called under store_lock */
 473 static void remove_initpid(struct pidns_init_store *e)
 474 {
 475         struct pidns_init_store *tmp;
 476         int h;
 477
 478         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 479
 480         h = HASH(e->ino);
 481         if (pidns_hash_table[h] == e) {
 482                 pidns_hash_table[h] = e->next;
 483                 free(e);
 484                 return;
 485         }
 486
 487         tmp = pidns_hash_table[h];
 488         while (tmp) {
 489                 if (tmp->next == e) {
 490                         tmp->next = e->next;
 491                         free(e);
 492                         return;
 493                 }
 494                 tmp = tmp->next;
 495         }
 496 }
 497
 498 #define PURGE_SECS 5
 499 /* Must be called under store_lock */
 500 static void prune_initpid_store(void)
 501 {
 502         static long int last_prune = 0;
 503         struct pidns_init_store *e, *prev, *delme;
 504         long int now, threshold;
 505         int i;
 506
 507         if (!last_prune) {
 508                 last_prune = time(NULL);
 509                 return;
 510         }
 511         now = time(NULL);
 512         if (now < last_prune + PURGE_SECS)
 513                 return;
 514
 515         lxcfs_debug("%s\n", "Pruning.");
 516
 517         last_prune = now;
 518         threshold = now - 2 * PURGE_SECS;
 519
 520         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 521                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 522                         if (e->lastcheck < threshold) {
 523
 524                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 525
 526                                 delme = e;
 527                                 if (prev)
 528                                         prev->next = e->next;
 529                                 else
 530                                         pidns_hash_table[i] = e->next;
 531                                 e = e->next;
 532                                 free(delme);
 533                         } else {
 534                                 prev = e;
 535                                 e = e->next;
 536                         }
 537                 }
 538         }
 539 }
 540
 541 /* Must be called under store_lock */
 542 static void save_initpid(struct stat *sb, pid_t pid)
 543 {
 544         struct pidns_init_store *e;
 545         char fpath[100];
 546         struct stat procsb;
 547         int h;
 548
 549         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 550
 551         snprintf(fpath, 100, "/proc/%d", pid);
 552         if (stat(fpath, &procsb) < 0)
 553                 return;
 554         do {
 555                 e = malloc(sizeof(*e));
 556         } while (!e);
 557         e->ino = sb->st_ino;
 558         e->initpid = pid;
 559         e->ctime = procsb.st_ctime;
 560         h = HASH(e->ino);
 561         e->next = pidns_hash_table[h];
 562         e->lastcheck = time(NULL);
 563         pidns_hash_table[h] = e;
 564 }
 565
 566 /*
 567  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 568  * entry for the inode number and creation time.  Verify that the init pid
 569  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 570  * otherwise.
 571  * Must be called under store_lock
 572  */
 573 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 574 {
 575         int h = HASH(sb->st_ino);
 576         struct pidns_init_store *e = pidns_hash_table[h];
 577
 578         while (e) {
 579                 if (e->ino == sb->st_ino) {
 580                         if (initpid_still_valid(e, sb)) {
 581                                 e->lastcheck = time(NULL);
 582                                 return e;
 583                         }
 584                         remove_initpid(e);
 585                         return NULL;
 586                 }
 587                 e = e->next;
 588         }
 589
 590         return NULL;
 591 }
 592
 /*
  * is_dir - tell whether @path, resolved relative to @fd, is a directory.
  * @path: path to examine.
  * @fd:   directory file descriptor @path is relative to (AT_FDCWD works too).
  *
  * Returns 1 if @path exists and is a directory, 0 otherwise (this includes
  * every fstatat() failure).
  */
 static int is_dir(const char *path, int fd)
 {
         struct stat statbuf;

         /*
          * The last argument of fstatat() is a flag mask, not a descriptor.
          * Passing @fd there either fails with EINVAL or silently turns on
          * whichever AT_* flag the descriptor number happens to equal.
          */
         if (fstatat(fd, path, &statbuf, 0) != 0)
                 return 0;

         return S_ISDIR(statbuf.st_mode) ? 1 : 0;
 }
 601
 /*
  * must_copy_string - duplicate @str, retrying until the allocation succeeds.
  *
  * Returns NULL only when @str is NULL; otherwise a heap copy that the caller
  * has to free().
  */
 static char *must_copy_string(const char *str)
 {
         char *copy;

         if (str == NULL)
                 return NULL;

         for (;;) {
                 copy = strdup(str);
                 if (copy != NULL)
                         return copy;
         }
 }
 613
 /* Strip every '\n' found at the end of @s by overwriting it with '\0'. */
 static inline void drop_trailing_newlines(char *s)
 {
         int end = strlen(s);

         while (end > 0 && s[end - 1] == '\n') {
                 s[end - 1] = '\0';
                 end--;
         }
 }
 621
 /* Buffers handled by dorealloc() grow in steps of this many bytes. */
 #define BATCH_SIZE 50
 /*
  * dorealloc - make sure *mem can hold @newlen bytes.
  * @mem:    address of the buffer pointer; *mem may be NULL.
  * @oldlen: length the buffer was sized for so far.
  * @newlen: length that is needed now.
  *
  * Sizes are rounded up to whole batches of BATCH_SIZE bytes.  The buffer is
  * reallocated only if it does not exist yet or if @newlen needs more batches
  * than @oldlen did; the reallocation is retried until it succeeds.
  */
 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 {
         int wanted = (newlen / BATCH_SIZE) + 1;
         int present = (oldlen / BATCH_SIZE) + 1;
         char *grown;

         if (*mem && wanted <= present)
                 return;

         for (;;) {
                 grown = realloc(*mem, wanted * BATCH_SIZE);
                 if (grown)
                         break;
         }

         *mem = grown;
 }
 /*
  * append_line - append @line to the growing buffer *contents.
  * @contents: address of the buffer pointer; may point to NULL initially.
  * @len:      current length of the buffer's text; updated to the new length.
  * @line:     text to append; must be NUL-terminated at line[linelen].
  * @linelen:  number of characters in @line, terminator excluded.
  *
  * The terminating NUL of @line is copied as well, so *contents stays a
  * valid C string.
  */
 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 {
         size_t oldlen = *len;
         size_t total = oldlen + linelen;

         dorealloc(contents, oldlen, total + 1);
         memcpy(*contents + oldlen, line, linelen + 1);
         *len = total;
 }
 643
 /*
  * slurp_file - read everything available from @fd into one heap string.
  * @from: not used by this function.
  * @fd:   open descriptor to read from.  The function takes it over: it is
  *        closed before returning on every path, including failure.
  *
  * Trailing newlines are removed from the result.
  *
  * Returns the contents (to be freed by the caller), or NULL if the stream
  * could not be set up or no line could be read.
  */
 static char *slurp_file(const char *from, int fd)
 {
         char *line = NULL;
         char *contents = NULL;
         size_t len = 0, fulllen = 0;
         ssize_t linelen;
         FILE *f;

         f = fdopen(fd, "r");
         if (!f) {
                 /*
                  * No stream owns the descriptor, so fclose() will never
                  * close it; do it here, keeping fdopen()'s errno intact.
                  */
                 int saved_errno = errno;

                 close(fd);
                 errno = saved_errno;
                 return NULL;
         }

         while ((linelen = getline(&line, &len, f)) != -1)
                 append_line(&contents, &fulllen, line, linelen);
         fclose(f);

         if (contents)
                 drop_trailing_newlines(contents);
         free(line);
         return contents;
 }
 665
 /*
  * preserve_ns - open a namespace file of process @pid.
  * @pid: process whose namespace is wanted.
  * @ns:  namespace name as found below /proc/<pid>/ns/.  NULL or "" opens the
  *       /proc/<pid>/ns directory itself, which lets callers check whether
  *       the kernel supports namespaces at all.
  *
  * Returns an O_RDONLY | O_CLOEXEC file descriptor, or -1 on failure.  When
  * the path does not fit the buffer errno is set to EFBIG.
  */
 static int preserve_ns(const int pid, const char *ns)
 {
 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
 #define __NS_PATH_LEN 50
         char path[__NS_PATH_LEN];
         const char *separator = "";
         const char *name = "";
         int written;

         /* Only a non-empty namespace name adds a "/<name>" component. */
         if (ns && *ns != '\0') {
                 separator = "/";
                 name = ns;
         }

         written = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
                            separator, name);
         if (written < 0 || (size_t)written >= __NS_PATH_LEN) {
                 errno = EFBIG;
                 return -1;
         }

         return open(path, O_RDONLY | O_CLOEXEC);
 }
 687
 688 /**
 689  * in_same_namespace - Check whether two processes are in the same namespace.
 690  * @pid1 - PID of the first process.
 691  * @pid2 - PID of the second process.
 692  * @ns   - Name of the namespace to check. Must correspond to one of the names
 693  *         for the namespaces as shown in /proc/<pid/ns/
 694  *
 695  * If the two processes are not in the same namespace returns an fd to the
 696  * namespace of the second process identified by @pid2. If the two processes are
 697  * in the same namespace returns -EINVAL, -1 if an error occurred.
 698  */
 699 static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
 700 {
 701         __do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
 702         int ret = -1;
 703         struct stat ns_st1, ns_st2;
 704
 705         ns_fd1 = preserve_ns(pid1, ns);
 706         if (ns_fd1 < 0) {
 707                 /* The kernel does not support this namespace. This is not an
 708                  * error.
 709                  */
 710                 if (errno == ENOENT)
 711                         return -EINVAL;
 712
 713                 return -1;
 714         }
 715
 716         ns_fd2 = preserve_ns(pid2, ns);
 717         if (ns_fd2 < 0)
 718                 return -1;
 719
 720         ret = fstat(ns_fd1, &ns_st1);
 721         if (ret < 0)
 722                 return -1;
 723
 724         ret = fstat(ns_fd2, &ns_st2);
 725         if (ret < 0)
 726                 return -1;
 727
 728         /* processes are in the same namespace */
 729         if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
 730                 return -EINVAL;
 731
 732         /* processes are in different namespaces */
 733         return move_fd(ns_fd2);
 734 }
 735
 /*
  * is_shared_pidns - tell whether @pid is pid 1 of our own pid namespace.
  *
  * Returns true only if @pid equals 1 and in_same_namespace() reports
  * -EINVAL for @pid and the calling process.
  */
 static bool is_shared_pidns(pid_t pid)
 {
         return pid == 1 &&
                in_same_namespace(pid, getpid(), "pid") == -EINVAL;
 }
 746
 /*
  * write_string - write @string to the already opened descriptor @fd.
  * @fnam:   file name, used in error messages only.
  * @string: NUL-terminated text to write.
  * @fd:     descriptor opened for writing.  The function takes it over: it is
  *          closed before returning on every path, including failure.
  *
  * Returns true if the whole string was written and the stream could be
  * closed without error, false otherwise.
  */
 static bool write_string(const char *fnam, const char *string, int fd)
 {
         FILE *f;
         size_t len, ret;

         f = fdopen(fd, "w");
         if (!f) {
                 /*
                  * No stream owns the descriptor, so fclose() will never
                  * close it; do it here, keeping fdopen()'s errno intact.
                  */
                 int saved_errno = errno;

                 close(fd);
                 errno = saved_errno;
                 return false;
         }

         len = strlen(string);
         ret = fwrite(string, 1, len, f);
         if (ret != len) {
                 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
                             strerror(errno), string, fnam);
                 fclose(f);
                 return false;
         }

         /* Buffered data is flushed here, so write errors may surface only now. */
         if (fclose(f) < 0) {
                 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
                 return false;
         }

         return true;
 }
 772
 /* Name, ownership and mode of a cgroup file. */
 struct cgfs_files {
         char *name;    /* file name */
         uint32_t uid;  /* owning user id */
         uint32_t gid;  /* owning group id */
         uint32_t mode; /* file mode bits */
 };
 778
 779 #define ALLOC_NUM 20
 780 static bool store_hierarchy(char *stridx, char *h)
 781 {
 782         if (num_hierarchies % ALLOC_NUM == 0) {
 783                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 784                 n *= ALLOC_NUM;
 785                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 786                 if (!tmp) {
 787                         lxcfs_error("%s\n", strerror(errno));
 788                         exit(1);
 789                 }
 790                 hierarchies = tmp;
 791         }
 792
 793         hierarchies[num_hierarchies++] = must_copy_string(h);
 794         return true;
 795 }
 796
 797 static void print_subsystems(void)
 798 {
 799         int i;
 800
 801         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 802         fprintf(stderr, "hierarchies:\n");
 803         for (i = 0; i < num_hierarchies; i++) {
 804                 if (hierarchies[i])
 805                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 806                                 fd_hierarchies[i], hierarchies[i]);
 807         }
 808 }
 809
 /*
  * in_comma_list - check whether @needle is an element of a comma-separated list.
  * @needle:   element to look for.
  * @haystack: list such as "cpu,cpuacct".
  *
  * Only whole elements match, never a mere prefix of one.
  *
  * Returns true if @needle equals one of the elements, false otherwise.
  */
 static bool in_comma_list(const char *needle, const char *haystack)
 {
         const char *cur = haystack;
         size_t nlen = strlen(needle);

         /* Check every element that is followed by a comma ... */
         while (*cur != '\0') {
                 const char *comma = strchr(cur, ',');

                 if (comma == NULL)
                         break;

                 if ((size_t)(comma - cur) == nlen &&
                     strncmp(needle, cur, nlen) == 0)
                         return true;

                 cur = comma + 1;
         }

         /* ... and finally whatever remains after the last comma. */
         return strcmp(needle, cur) == 0;
 }
 828
 829 /* do we need to do any massaging here?  I'm not sure... */
 830 /* Return the mounted controller and store the corresponding open file descriptor
 831  * referring to the controller mountpoint in the private lxcfs namespace in
 832  * @cfd.
 833  */
 834 static char *find_mounted_controller(const char *controller, int *cfd)
 835 {
 836         int i;
 837
 838         for (i = 0; i < num_hierarchies; i++) {
 839                 if (!hierarchies[i])
 840                         continue;
 841                 if (strcmp(hierarchies[i], controller) == 0) {
 842                         *cfd = fd_hierarchies[i];
 843                         return hierarchies[i];
 844                 }
 845                 if (in_comma_list(controller, hierarchies[i])) {
 846                         *cfd = fd_hierarchies[i];
 847                         return hierarchies[i];
 848                 }
 849         }
 850
 851         return NULL;
 852 }
 853
 /*
  * cgfs_set_value - write @value to @file of @cgroup below @controller.
  * @controller: controller whose mounted hierarchy is used.
  * @cgroup:     cgroup path; a leading '/' is turned into "./".
  * @file:       file name inside the cgroup.
  * @value:      text to write.
  *
  * Returns true if the value was written, false if the controller is not
  * mounted, the path could not be built, or opening or writing failed.
  */
 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
                 const char *value)
 {
         int cfd, fd, written;
         size_t size;
         char *relpath;

         if (!find_mounted_controller(controller, &cfd))
                 return false;

         /* Make sure we pass a relative path to *at() family of functions.
          * . + /cgroup + / + file + \0
          */
         size = strlen(cgroup) + strlen(file) + 3;
         relpath = alloca(size);
         written = snprintf(relpath, size, "%s%s/%s", *cgroup == '/' ? "." : "",
                            cgroup, file);
         if (written < 0 || (size_t)written >= size)
                 return false;

         fd = openat(cfd, relpath, O_WRONLY);
         if (fd < 0)
                 return false;

         /* write_string() is handed the descriptor from here on. */
         return write_string(relpath, value, fd);
 }
 880
 /*
  * chown_all_cgroup_files - chown every entry of a cgroup directory.
  * @dirname: directory path, relative to @fd.
  * @uid:     new owner.
  * @gid:     new group.
  * @fd:      directory file descriptor @dirname is relative to.
  *
  * We do this when we create a cgroup on behalf of a user.  Failures are
  * logged and do not stop the remaining entries from being processed.
  */
 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 {
         struct dirent *direntp;
         char path[MAXPATHLEN];
         DIR *d;
         int fd1, ret;

         if (strlen(dirname) >= MAXPATHLEN) {
                 lxcfs_error("Pathname too long: %s\n", dirname);
                 return;
         }

         fd1 = openat(fd, dirname, O_DIRECTORY);
         if (fd1 < 0)
                 return;

         d = fdopendir(fd1);
         if (!d) {
                 lxcfs_error("Failed to open %s\n", dirname);
                 /* No DIR stream took over fd1, so it has to be closed here. */
                 close(fd1);
                 return;
         }

         while ((direntp = readdir(d))) {
                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
                         continue;

                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
                 if (ret < 0 || ret >= MAXPATHLEN) {
                         lxcfs_error("Pathname too long under %s\n", dirname);
                         continue;
                 }

                 if (fchownat(fd, path, uid, gid, 0) < 0)
                         lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
         }

         /* closedir() closes fd1 as well. */
         closedir(d);
 }
 920
 /*
  * cgfs_create - create cgroup @cg below @controller on behalf of a user.
  * @controller: controller whose mounted hierarchy is used.
  * @cg:         cgroup path; a leading '/' is turned into "./".
  * @uid:        owner for the new cgroup.
  * @gid:        group for the new cgroup.
  *
  * The directory is created with mode 0755.  Unless both @uid and @gid are 0,
  * the directory and the files inside it are chowned to @uid:@gid.
  *
  * Returns 0 on success, -EINVAL if the controller is not mounted, or the
  * negated errno of a failing mkdirat() or fchownat().
  */
 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 {
         int cfd;
         size_t size;
         char *reldir;

         if (!find_mounted_controller(controller, &cfd))
                 return -EINVAL;

         /* Make sure we pass a relative path to *at() family of functions.
          * . + /cg + \0
          */
         size = strlen(cg) + 2;
         reldir = alloca(size);
         snprintf(reldir, size, "%s%s", *cg == '/' ? "." : "", cg);

         if (mkdirat(cfd, reldir, 0755) < 0)
                 return -errno;

         /* Ownership only has to change when it is not root:root. */
         if (uid != 0 || gid != 0) {
                 if (fchownat(cfd, reldir, uid, gid, 0) < 0)
                         return -errno;

                 chown_all_cgroup_files(reldir, uid, gid, cfd);
         }

         return 0;
 }
 951
 952 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 953 {
 954         struct dirent *direntp;
 955         DIR *dir;
 956         bool ret = false;
 957         char pathname[MAXPATHLEN];
 958         int dupfd;
 959
 960         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 961         if (dupfd < 0)
 962                 return false;
 963
 964         dir = fdopendir(dupfd);
 965         if (!dir) {
 966                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 967                 close(dupfd);
 968                 return false;
 969         }
 970
 971         while ((direntp = readdir(dir))) {
 972                 struct stat mystat;
 973                 int rc;
 974
 975                 if (!strcmp(direntp->d_name, ".") ||
 976                     !strcmp(direntp->d_name, ".."))
 977                         continue;
 978
 979                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 980                 if (rc < 0 || rc >= MAXPATHLEN) {
 981                         lxcfs_error("%s\n", "Pathname too long.");
 982                         continue;
 983                 }
 984
 985                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 986                 if (rc) {
 987                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 988                         continue;
 989                 }
 990                 if (S_ISDIR(mystat.st_mode))
 991                         if (!recursive_rmdir(pathname, fd, cfd))
 992                                 lxcfs_debug("Error removing %s.\n", pathname);
 993         }
 994
 995         ret = true;
 996         if (closedir(dir) < 0) {
 997                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 998                 ret = false;
 999         }
1000
1001         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
1002                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
1003                 ret = false;
1004         }
1005
1006         close(dupfd);
1007
1008         return ret;
1009 }
1010
1011 bool cgfs_remove(const char *controller, const char *cg)
1012 {
1013         int fd, cfd;
1014         size_t len;
1015         char *dirnam, *tmpc;
1016         bool bret;
1017
1018         tmpc = find_mounted_controller(controller, &cfd);
1019         if (!tmpc)
1020                 return false;
1021
1022         /* Make sure we pass a relative path to *at() family of functions.
1023          * . +  /cg + \0
1024          */
1025         len = strlen(cg) + 2;
1026         dirnam = alloca(len);
1027         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
1028
1029         fd = openat(cfd, dirnam, O_DIRECTORY);
1030         if (fd < 0)
1031                 return false;
1032
1033         bret = recursive_rmdir(dirnam, fd, cfd);
1034         close(fd);
1035         return bret;
1036 }
1037
/*
 * Apply @mode to @file below @controller using fchmodat().
 *
 * Returns true on success, false when the controller lookup or the
 * fchmodat() call fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
        int cfd;
        size_t bufsz;
        char *relpath;

        if (!find_mounted_controller(controller, &cfd))
                return false;

        /* The *at() calls need a relative path: "." + "/file" + '\0'. */
        bufsz = strlen(file) + 2;
        relpath = alloca(bufsz);
        snprintf(relpath, bufsz, "%s%s", *file == '/' ? "." : "", file);

        return fchmodat(cfd, relpath, mode, 0) < 0 ? false : true;
}
1058
/*
 * Chown "<dirname>/tasks" and then "<dirname>/cgroup.procs" to @uid:@gid,
 * both resolved relative to @fd.
 *
 * Returns 0 when both calls succeed, otherwise -errno of the first
 * fchownat() that failed (the second file is not attempted in that case).
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
        /* Sized for the longer of the two file names. */
        size_t bufsz = strlen(dirname) + strlen("/cgroup.procs") + 1;
        char *path = alloca(bufsz);
        int rc;

        snprintf(path, bufsz, "%s/tasks", dirname);
        rc = fchownat(fd, path, uid, gid, 0);
        if (rc == 0) {
                snprintf(path, bufsz, "%s/cgroup.procs", dirname);
                rc = fchownat(fd, path, uid, gid, 0);
        }

        return rc != 0 ? -errno : 0;
}
1074
/*
 * Chown @file below @controller to @uid:@gid. When @file is a directory
 * (per is_dir()), its tasks and cgroup.procs files are chowned as well.
 *
 * Returns 0 on success, -EINVAL when the controller lookup fails, or a
 * negative errno from the failing chown.
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
        int cfd;
        size_t bufsz;
        char *relpath;

        if (!find_mounted_controller(controller, &cfd))
                return -EINVAL;

        /* The *at() calls need a relative path: "." + "/file" + '\0'. */
        bufsz = strlen(file) + 2;
        relpath = alloca(bufsz);
        snprintf(relpath, bufsz, "%s%s", *file == '/' ? "." : "", file);

        if (fchownat(cfd, relpath, uid, gid, 0) < 0)
                return -errno;

        if (!is_dir(relpath, cfd))
                return 0;

        /* like cgmanager did, we want to chown the tasks file as well */
        return chown_tasks_files(relpath, uid, gid, cfd);
}
1100
/*
 * Open "<cgroup>/cgroup.procs" below @controller for writing.
 *
 * Returns a write-mode stream the caller must fclose(), or NULL when the
 * controller lookup, openat() or fdopen() fails.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
        int fd, cfd;
        size_t len;
        char *pathname, *tmpc;
        FILE *f;

        tmpc = find_mounted_controller(controller, &cfd);
        if (!tmpc)
                return NULL;

        /* Make sure we pass a relative path to *at() family of functions.
         * . + /cgroup + / "cgroup.procs" + \0
         */
        len = strlen(cgroup) + strlen("cgroup.procs") + 3;
        pathname = alloca(len);
        snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

        fd = openat(cfd, pathname, O_WRONLY);
        if (fd < 0)
                return NULL;

        f = fdopen(fd, "w");
        if (!f) {
                /* fdopen() does not take over fd on failure; don't leak it. */
                int saved_errno = errno;

                close(fd);
                errno = saved_errno;
        }

        return f;
}
1124
1125 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1126                                 void ***list, size_t typesize,
1127                                 void* (*iterator)(const char*, const char*, const char*))
1128 {
1129         int cfd, fd, ret;
1130         size_t len;
1131         char *cg, *tmpc;
1132         char pathname[MAXPATHLEN];
1133         size_t sz = 0, asz = 0;
1134         struct dirent *dirent;
1135         DIR *dir;
1136
1137         tmpc = find_mounted_controller(controller, &cfd);
1138         *list = NULL;
1139         if (!tmpc)
1140                 return false;
1141
1142         /* Make sure we pass a relative path to *at() family of functions. */
1143         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1144         cg = alloca(len);
1145         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1146         if (ret < 0 || (size_t)ret >= len) {
1147                 lxcfs_error("Pathname too long under %s\n", cgroup);
1148                 return false;
1149         }
1150
1151         fd = openat(cfd, cg, O_DIRECTORY);
1152         if (fd < 0)
1153                 return false;
1154
1155         dir = fdopendir(fd);
1156         if (!dir)
1157                 return false;
1158
1159         while ((dirent = readdir(dir))) {
1160                 struct stat mystat;
1161
1162                 if (!strcmp(dirent->d_name, ".") ||
1163                     !strcmp(dirent->d_name, ".."))
1164                         continue;
1165
1166                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1167                 if (ret < 0 || ret >= MAXPATHLEN) {
1168                         lxcfs_error("Pathname too long under %s\n", cg);
1169                         continue;
1170                 }
1171
1172                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1173                 if (ret) {
1174                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1175                         continue;
1176                 }
1177                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1178                     (directories && !S_ISDIR(mystat.st_mode)))
1179                         continue;
1180
1181                 if (sz+2 >= asz) {
1182                         void **tmp;
1183                         asz += BATCH_SIZE;
1184                         do {
1185                                 tmp = realloc(*list, asz * typesize);
1186                         } while  (!tmp);
1187                         *list = tmp;
1188                 }
1189                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1190                 (*list)[sz+1] = NULL;
1191                 sz++;
1192         }
1193         if (closedir(dir) < 0) {
1194                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1195                 return false;
1196         }
1197         return true;
1198 }
1199
/*
 * Iterator callback for listing child cgroups: returns a heap copy of
 * @dir_entry. @controller and @cgroup are not used. The allocation is
 * retried until it succeeds; the caller owns the returned string.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
        char *copy = NULL;

        while (!copy)
                copy = strdup(dir_entry);

        return copy;
}
1208
1209 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1210 {
1211         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1212 }
1213
1214 void free_key(struct cgfs_files *k)
1215 {
1216         if (!k)
1217                 return;
1218         free(k->name);
1219         free(k);
1220 }
1221
/*
 * Release a NULL-terminated array of keys: every element up to the first
 * NULL slot, then the array itself. A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
        struct cgfs_files **cur;

        if (!keys)
                return;

        for (cur = keys; *cur; cur++)
                free_key(*cur);

        free(keys);
}
1233
/*
 * Read "<cgroup>/<file>" below @controller into *@value via slurp_file().
 *
 * Returns true when slurp_file() produced a non-NULL buffer; false when the
 * controller lookup, path construction, openat() or slurp_file() fails.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
        int cfd, fd, n;
        size_t bufsz;
        char *relpath;

        if (!find_mounted_controller(controller, &cfd))
                return false;

        /* The *at() calls need a relative path:
         * "." + "/cgroup" + "/" + file + '\0'
         */
        bufsz = strlen(cgroup) + strlen(file) + 3;
        relpath = alloca(bufsz);
        n = snprintf(relpath, bufsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
        if (n < 0 || (size_t)n >= bufsz)
                return false;

        fd = openat(cfd, relpath, O_RDONLY);
        if (fd < 0)
                return false;

        /* NOTE(review): fd is handed to slurp_file(); presumably it closes it - confirm. */
        *value = slurp_file(relpath, fd);

        return *value ? true : false;
}
1260
/*
 * Report whether "<cgroup>/<file>" below @controller exists, as judged by
 * faccessat(..., F_OK). Returns false as well when the controller lookup or
 * the path construction fails.
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
        int cfd, n;
        size_t bufsz;
        char *relpath;

        if (!find_mounted_controller(controller, &cfd))
                return false;

        /* The *at() calls need a relative path:
         * "." + "/cgroup" + "/" + file + '\0'
         */
        bufsz = strlen(cgroup) + strlen(file) + 3;
        relpath = alloca(bufsz);
        n = snprintf(relpath, bufsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
        if (n < 0 || (size_t)n >= bufsz)
                return false;

        if (faccessat(cfd, relpath, F_OK, 0) != 0)
                return false;

        return true;
}
1282
1283 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1284 {
1285         int ret, cfd;
1286         size_t len;
1287         char *fnam, *tmpc;
1288         struct stat sb;
1289         struct cgfs_files *newkey;
1290
1291         tmpc = find_mounted_controller(controller, &cfd);
1292         if (!tmpc)
1293                 return false;
1294
1295         if (file && *file == '/')
1296                 file++;
1297
1298         if (file && strchr(file, '/'))
1299                 return NULL;
1300
1301         /* Make sure we pass a relative path to *at() family of functions.
1302          * . + /cgroup + / + file + \0
1303          */
1304         len = strlen(cgroup) + 3;
1305         if (file)
1306                 len += strlen(file) + 1;
1307         fnam = alloca(len);
1308         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1309                  file ? "/" : "", file ? file : "");
1310
1311         ret = fstatat(cfd, fnam, &sb, 0);
1312         if (ret < 0)
1313                 return NULL;
1314
1315         do {
1316                 newkey = malloc(sizeof(struct cgfs_files));
1317         } while (!newkey);
1318         if (file)
1319                 newkey->name = must_copy_string(file);
1320         else if (strrchr(cgroup, '/'))
1321                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1322         else
1323                 newkey->name = must_copy_string(cgroup);
1324         newkey->uid = sb.st_uid;
1325         newkey->gid = sb.st_gid;
1326         newkey->mode = sb.st_mode;
1327
1328         return newkey;
1329 }
1330
/*
 * Iterator callback for listing cgroup files: returns the key for
 * @dir_entry inside @cgroup, or NULL (after logging) when cgfs_get_key()
 * fails.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
        struct cgfs_files *key;

        key = cgfs_get_key(controller, cgroup, dir_entry);
        if (key)
                return key;

        lxcfs_error("Error getting files under %s:%s\n", controller,
                     cgroup);
        return NULL;
}
1340
1341 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1342 {
1343         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1344 }
1345
/*
 * Report whether "<cgroup>/<f>" below @controller exists and is a
 * directory. Returns false on any lookup, path or stat failure.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
        int cfd, n;
        size_t bufsz;
        char *relpath;
        struct stat sb;

        if (!find_mounted_controller(controller, &cfd))
                return false;

        /* The *at() calls need a relative path:
         * "." + "/cgroup" + "/" + f + '\0'
         */
        bufsz = strlen(cgroup) + strlen(f) + 3;
        relpath = alloca(bufsz);
        n = snprintf(relpath, bufsz, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
        if (n < 0 || (size_t)n >= bufsz)
                return false;

        if (fstatat(cfd, relpath, &sb, 0) < 0)
                return false;

        return S_ISDIR(sb.st_mode) ? true : false;
}
1373
1374 #define SEND_CREDS_OK 0
1375 #define SEND_CREDS_NOTSK 1
1376 #define SEND_CREDS_FAIL 2
1377 static bool recv_creds(int sock, struct ucred *cred, char *v);
1378 static int wait_for_pid(pid_t pid);
1379 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1380 static int send_creds_clone_wrapper(void *arg);
1381
1382 /*
1383  * clone a task which switches to @task's namespace and writes '1'.
1384  * over a unix sock so we can read the task's reaper's pid in our
1385  * namespace
1386  *
1387  * Note: glibc's fork() does not respect pidns, which can lead to failed
1388  * assertions inside glibc (and thus failed forks) if the child's pid in
1389  * the pidns and the parent pid outside are identical. Using clone prevents
1390  * this issue.
1391  */
1392 static void write_task_init_pid_exit(int sock, pid_t target)
1393 {
1394         char fnam[100];
1395         pid_t pid;
1396         int fd, ret;
1397         size_t stack_size = sysconf(_SC_PAGESIZE);
1398         void *stack = alloca(stack_size);
1399
1400         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1401         if (ret < 0 || ret >= sizeof(fnam))
1402                 _exit(1);
1403
1404         fd = open(fnam, O_RDONLY);
1405         if (fd < 0) {
1406                 perror("write_task_init_pid_exit open of ns/pid");
1407                 _exit(1);
1408         }
1409         if (setns(fd, 0)) {
1410                 perror("write_task_init_pid_exit setns 1");
1411                 close(fd);
1412                 _exit(1);
1413         }
1414         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1415         if (pid < 0)
1416                 _exit(1);
1417         if (pid != 0) {
1418                 if (!wait_for_pid(pid))
1419                         _exit(1);
1420                 _exit(0);
1421         }
1422 }
1423
1424 static int send_creds_clone_wrapper(void *arg) {
1425         struct ucred cred;
1426         char v;
1427         int sock = *(int *)arg;
1428
1429         /* we are the child */
1430         cred.uid = 0;
1431         cred.gid = 0;
1432         cred.pid = 1;
1433         v = '1';
1434         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1435                 return 1;
1436         return 0;
1437 }
1438
/*
 * Fork a helper that enters @task's pid namespace and sends credentials
 * back over a socketpair, then return the pid received from it.
 *
 * Returns the pid carried in the received credentials, or -1 when
 * socketpair(), fork() or recv_creds() fails.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
        int sock[2];
        pid_t child;
        pid_t initpid = -1;
        char v = '0';
        struct ucred cred;

        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                return -1;
        }

        child = fork();
        if (child == 0) {
                /* Helper process: keeps sock[0] only. */
                close(sock[1]);
                write_task_init_pid_exit(sock[0], task);
                _exit(0);
        }

        /* Only try to receive when the fork actually produced a helper. */
        if (child > 0 && recv_creds(sock[1], &cred, &v))
                initpid = cred.pid;

        close(sock[0]);
        close(sock[1]);
        if (child > 0)
                wait_for_pid(child);

        return initpid;
}
1472
1473 pid_t lookup_initpid_in_store(pid_t qpid)
1474 {
1475         pid_t answer = 0;
1476         struct stat sb;
1477         struct pidns_init_store *e;
1478         char fnam[100];
1479
1480         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1481         store_lock();
1482         if (stat(fnam, &sb) < 0)
1483                 goto out;
1484         e = lookup_verify_initpid(&sb);
1485         if (e) {
1486                 answer = e->initpid;
1487                 goto out;
1488         }
1489         answer = get_init_pid_for_task(qpid);
1490         if (answer > 0)
1491                 save_initpid(&sb, answer);
1492
1493 out:
1494         /* we prune at end in case we are returning
1495          * the value we were about to return */
1496         prune_initpid_store();
1497         store_unlock();
1498         return answer;
1499 }
1500
/*
 * Wait for child @pid to terminate, retrying when waitpid() is interrupted.
 *
 * Returns 0 only if the child exited normally with status 0; -1 for a
 * non-positive @pid, a waitpid() error other than EINTR, or any other
 * termination.
 */
static int wait_for_pid(pid_t pid)
{
        int status;
        pid_t waited;

        if (pid <= 0)
                return -1;

        for (;;) {
                waited = waitpid(pid, &status, 0);
                if (waited == pid)
                        break;
                if (waited == -1 && errno != EINTR)
                        return -1;
        }

        if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
                return 0;

        return -1;
}
1521
1522 /*
1523  * append the given formatted string to *src.
1524  * src: a pointer to a char* in which to append the formatted string.
1525  * sz: the number of characters printed so far, minus trailing \0.
1526  * asz: the allocated size so far
1527  * format: string format. See printf for details.
1528  * ...: varargs. See printf for details.
1529  */
1530 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1531 {
1532         char tmp[BUF_RESERVE_SIZE];
1533         va_list         args;
1534
1535         va_start (args, format);
1536         int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1537         va_end(args);
1538
1539         if (!*src || tmplen + *sz + 1 >= *asz) {
1540                 char *tmp;
1541                 do {
1542                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1543                 } while (!tmp);
1544                 *src = tmp;
1545                 *asz += BUF_RESERVE_SIZE;
1546         }
1547         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1548         *sz += tmplen;
1549 }
1550
/*
 * append pid, followed by a newline, to *src.
 * src: a pointer to a char* in which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far
 * pid: the pid to append
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
        const int value = (int)pid;

        must_strcat(src, sz, asz, "%d\n", value);
}
1562
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not parse as
 * three unsigned numbers are skipped.
 *
 * Returns the mapped id, or -1 (as unsigned int) when no range contains
 * @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
        unsigned int ns_base,   /* first id of the range in the idfile's namespace */
                     host_base, /* first id of the range in the caller's namespace */
                     range;     /* number of ids in the range */
        char line[400];

        fseek(idfile, 0L, SEEK_SET);
        while (fgets(line, 400, idfile)) {
                if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
                        continue;

                if (host_base + range < host_base || ns_base + range < ns_base) {
                        /*
                         * ids wrapped around - unexpected as this is a
                         * procfile, so just bail.
                         */
                        lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
                                ns_base, host_base, range, line);
                        return -1;
                }

                /* Not inside [host_base, host_base + range): try the next line. */
                if (in_id < host_base || in_id >= host_base + range)
                        continue;

                /*
                 * host_base <= in_id < host_base + range and neither sum
                 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
                 */
                return (in_id - host_base) + ns_base;
        }

        /* no answer found */
        return -1;
}
1606
1607 /*
1608  * for is_privileged_over,
1609  * specify whether we require the calling uid to be root in his
1610  * namespace
1611  */
1612 #define NS_ROOT_REQD true
1613 #define NS_ROOT_OPT false
1614
1615 #define PROCLEN 100
1616
1617 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1618 {
1619         char fpath[PROCLEN];
1620         int ret;
1621         bool answer = false;
1622         uid_t nsuid;
1623
1624         if (victim == -1 || uid == -1)
1625                 return false;
1626
1627         /*
1628          * If the request is one not requiring root in the namespace,
1629          * then having the same uid suffices.  (i.e. uid 1000 has write
1630          * access to files owned by uid 1000
1631          */
1632         if (!req_ns_root && uid == victim)
1633                 return true;
1634
1635         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1636         if (ret < 0 || ret >= PROCLEN)
1637                 return false;
1638         FILE *f = fopen(fpath, "r");
1639         if (!f)
1640                 return false;
1641
1642         /* if caller's not root in his namespace, reject */
1643         nsuid = convert_id_to_ns(f, uid);
1644         if (nsuid)
1645                 goto out;
1646
1647         /*
1648          * If victim is not mapped into caller's ns, reject.
1649          * XXX I'm not sure this check is needed given that fuse
1650          * will be sending requests where the vfs has converted
1651          */
1652         nsuid = convert_id_to_ns(f, victim);
1653         if (nsuid == -1)
1654                 goto out;
1655
1656         answer = true;
1657
1658 out:
1659         fclose(f);
1660         return answer;
1661 }
1662
/*
 * Check whether the "other" permission bits of @fmode grant the access mode
 * requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR).
 *
 * Callers test owner or group bits by shifting the mode right first.
 * Returns false for an unrecognised access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
        mode_t accmode = req_mode & O_ACCMODE;
        mode_t needed;

        if (accmode == O_RDONLY)
                needed = S_IROTH;
        else if (accmode == O_WRONLY)
                needed = S_IWOTH;
        else if (accmode == O_RDWR)
                needed = S_IROTH | S_IWOTH;
        else
                return false;

        return (fmode & needed) == needed;
}
1682
1683
/*
 * taskcg is  /a/b/c/d/e
 * querycg is /a/b/c
 * we return 'd'
 *
 * i.e. the first path component of @taskcg that follows @querycg. @querycg
 * may also be "/" or "./", in which case the first component of @taskcg is
 * returned. @taskcg must be longer than @querycg.
 *
 * Returns a newly allocated string the caller must free, or NULL on bad
 * input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
        size_t skip;
        char *next, *slash;

        if (strlen(taskcg) <= strlen(querycg)) {
                lxcfs_error("%s\n", "I was fed bad input.");
                return NULL;
        }

        /* Step over the query prefix plus the '/' that follows it. */
        if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
                skip = 1;
        else
                skip = strlen(querycg) + 1;

        next = strdup(taskcg + skip);
        if (!next)
                return NULL;

        /* Keep only the first remaining component. */
        slash = strchr(next, '/');
        if (slash)
                *slash = '\0';

        return next;
}
1709
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
        size_t n = strlen(x);

        if (n == 0)
                return;

        if (x[n - 1] == '\n')
                x[n - 1] = '\0';
}
1716
1717 char *get_pid_cgroup(pid_t pid, const char *contrl)
1718 {
1719         int cfd;
1720         char fnam[PROCLEN];
1721         FILE *f;
1722         char *answer = NULL;
1723         char *line = NULL;
1724         size_t len = 0;
1725         int ret;
1726         const char *h = find_mounted_controller(contrl, &cfd);
1727         if (!h)
1728                 return NULL;
1729
1730         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1731         if (ret < 0 || ret >= PROCLEN)
1732                 return NULL;
1733         if (!(f = fopen(fnam, "r")))
1734                 return NULL;
1735
1736         while (getline(&line, &len, f) != -1) {
1737                 char *c1, *c2;
1738                 if (!line[0])
1739                         continue;
1740                 c1 = strchr(line, ':');
1741                 if (!c1)
1742                         goto out;
1743                 c1++;
1744                 c2 = strchr(c1, ':');
1745                 if (!c2)
1746                         goto out;
1747                 *c2 = '\0';
1748                 if (strcmp(c1, h) != 0)
1749                         continue;
1750                 c2++;
1751                 stripnewline(c2);
1752                 do {
1753                         answer = strdup(c2);
1754                 } while (!answer);
1755                 break;
1756         }
1757
1758 out:
1759         fclose(f);
1760         free(line);
1761         return answer;
1762 }
1763
1764 /*
1765  * check whether a fuse context may access a cgroup dir or file
1766  *
1767  * If file is not null, it is a cgroup file to check under cg.
1768  * If file is null, then we are checking perms on cg itself.
1769  *
1770  * For files we can check the mode of the list_keys result.
1771  * For cgroups, we must make assumptions based on the files under the
1772  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1773  * yet.
1774  */
1775 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1776 {
1777         struct cgfs_files *k = NULL;
1778         bool ret = false;
1779
1780         k = cgfs_get_key(contrl, cg, file);
1781         if (!k)
1782                 return false;
1783
1784         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1785                 if (perms_include(k->mode >> 6, mode)) {
1786                         ret = true;
1787                         goto out;
1788                 }
1789         }
1790         if (fc->gid == k->gid) {
1791                 if (perms_include(k->mode >> 3, mode)) {
1792                         ret = true;
1793                         goto out;
1794                 }
1795         }
1796         ret = perms_include(k->mode, mode);
1797
1798 out:
1799         free_key(k);
1800         return ret;
1801 }
1802
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place. When @cg is exactly
 * "/init.scope" it becomes "/". Anything else is left untouched.
 */
void prune_init_slice(char *cg)
{
        size_t cg_len = strlen(cg);
        size_t scope_len = strlen(INITSCOPE);
        char *tail;

        if (cg_len < scope_len)
                return;

        tail = cg + (cg_len - scope_len);
        if (strcmp(tail, INITSCOPE) != 0)
                return;

        /* Keep the leading '/' when the suffix is the whole string. */
        if (tail == cg)
                tail++;
        *tail = '\0';
}
1820
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 *
 * Returns false as well when pid's cgroup cannot be determined; *nextcg is
 * not written in that case.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
        bool answer;
        char *taskcg, *cmp;

        taskcg = get_pid_cgroup(pid, contrl);
        if (!taskcg)
                return false;
        prune_init_slice(taskcg);

        /*
         * callers pass in '/' or './' (openat()) for root cgroup, otherwise
         * they pass in a cgroup without leading '/'. In the latter case the
         * first character of the task's cgroup is skipped before comparing.
         *
         * The original line here was:
         *      linecmp = *cg == '/' ? c2 : c2+1;
         * TODO: I'm not sure why you'd want to increment when *cg != '/'?
         *       Serge, do you know?
         */
        cmp = (*cg == '/' || !strncmp(cg, "./", 2)) ? taskcg : taskcg + 1;

        /* The task's cgroup must be a prefix of the queried one. */
        answer = strncmp(cmp, cg, strlen(cmp)) == 0;
        if (!answer && nextcg)
                *nextcg = get_next_cgroup_dir(cmp, cg);

        free(taskcg);
        return answer;
}
1863
/*
 * Decide whether the task @pid may see that cgroup directory @cg exists
 * under controller @contrl.
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is always visible.  Otherwise @cg is visible when
 * the task lives in the root cgroup, when @cg equals the task's cgroup, or
 * when one of the two is a path-component prefix of the other.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
        char *full, *own;
        size_t want_len, own_len;
        bool visible;

        if (!strcmp(cg, "/") || !strcmp(cg, "./"))
                return true;

        full = get_pid_cgroup(pid, contrl);
        if (!full)
                return false;
        prune_init_slice(full);

        /* drop the leading '/' so both paths are in the same form */
        own = full + 1;
        want_len = strlen(cg);
        own_len = strlen(own);

        if (own_len == 0) {
                /*
                 * The task is in the root cgroup and can see everything.
                 * The prefix tests below look for a separating '/', but
                 * the only one was the leading slash skipped above.
                 */
                visible = true;
        } else if (strcmp(cg, own) == 0) {
                visible = true;
        } else if (want_len < own_len) {
                /* looking up a parent dir */
                visible = strncmp(own, cg, want_len) == 0 && own[want_len] == '/';
        } else if (want_len > own_len) {
                /* looking up a child dir */
                visible = strncmp(own, cg, own_len) == 0 && cg[own_len] == '/';
        } else {
                /* same length but different name */
                visible = false;
        }

        free(full);
        return visible;
}
1914
1915 /*
1916  * given /cgroup/freezer/a/b, return "freezer".
1917  * the returned char* should NOT be freed.
1918  */
1919 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1920 {
1921         const char *p1;
1922         char *contr, *slash;
1923
1924         if (strlen(path) < 9) {
1925                 errno = EACCES;
1926                 return NULL;
1927         }
1928         if (*(path + 7) != '/') {
1929                 errno = EINVAL;
1930                 return NULL;
1931         }
1932         p1 = path + 8;
1933         contr = strdupa(p1);
1934         if (!contr) {
1935                 errno = ENOMEM;
1936                 return NULL;
1937         }
1938         slash = strstr(contr, "/");
1939         if (slash)
1940                 *slash = '\0';
1941
1942         int i;
1943         for (i = 0; i < num_hierarchies; i++) {
1944                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1945                         return hierarchies[i];
1946         }
1947         errno = ENOENT;
1948         return NULL;
1949 }
1950
/*
 * Locate the cgroup part of /cgroup/controller/the/cgroup/path.
 * The returned pointer aims into @path, just past the '/' that follows the
 * controller name; it may include files (keynames) etc.
 *
 * Returns NULL with errno EACCES when @path is shorter than 9 characters
 * and NULL with errno EINVAL when nothing follows the controller name.
 * On success errno is reset to 0.
 */
static const char *find_cgroup_in_path(const char *path)
{
        const char *sep;

        if (strlen(path) < 9) {
                errno = EACCES;
                return NULL;
        }

        /* first '/' at or after offset 8 ends the controller name */
        sep = strchr(path + 8, '/');
        if (sep) {
                errno = 0;
                return sep + 1;
        }

        errno = EINVAL;
        return NULL;
}
1971
/*
 * Split the last path element off the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg, cut at its last '/', and
 * must be freed by the caller.  @last is not allocated: it points into
 * @cg at that last '/' (so it still starts with the slash), or is NULL
 * when @cg holds no '/' at all, in which case @dir is a full copy of @cg.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
        char *copy, *sep;

        /* keep retrying until the duplicate could be allocated */
        while (!(copy = strdup(cg)))
                ;
        *dir = copy;

        sep = strrchr(cg, '/');
        *last = sep;
        if (!sep)
                return;

        /* the copy has its last '/' at the same offset as @cg */
        copy[sep - cg] = '\0';
}
1991
1992 /*
1993  * FUSE ops for /cgroup
1994  */
1995
1996 int cg_getattr(const char *path, struct stat *sb)
1997 {
1998         struct timespec now;
1999         struct fuse_context *fc = fuse_get_context();
2000         char * cgdir = NULL;
2001         char *last = NULL, *path1, *path2;
2002         struct cgfs_files *k = NULL;
2003         const char *cgroup;
2004         const char *controller = NULL;
2005         int ret = -ENOENT;
2006
2007
2008         if (!fc)
2009                 return -EIO;
2010
2011         memset(sb, 0, sizeof(struct stat));
2012
2013         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
2014                 return -EINVAL;
2015
2016         sb->st_uid = sb->st_gid = 0;
2017         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
2018         sb->st_size = 0;
2019
2020         if (strcmp(path, "/cgroup") == 0) {
2021                 sb->st_mode = S_IFDIR | 00755;
2022                 sb->st_nlink = 2;
2023                 return 0;
2024         }
2025
2026         controller = pick_controller_from_path(fc, path);
2027         if (!controller)
2028                 return -errno;
2029         cgroup = find_cgroup_in_path(path);
2030         if (!cgroup) {
2031                 /* this is just /cgroup/controller, return it as a dir */
2032                 sb->st_mode = S_IFDIR | 00755;
2033                 sb->st_nlink = 2;
2034                 return 0;
2035         }
2036
2037         get_cgdir_and_path(cgroup, &cgdir, &last);
2038
2039         if (!last) {
2040                 path1 = "/";
2041                 path2 = cgdir;
2042         } else {
2043                 path1 = cgdir;
2044                 path2 = last;
2045         }
2046
2047         pid_t initpid = lookup_initpid_in_store(fc->pid);
2048         if (initpid <= 1 || is_shared_pidns(initpid))
2049                 initpid = fc->pid;
2050         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
2051          * Then check that caller's cgroup is under path if last is a child
2052          * cgroup, or cgdir if last is a file */
2053
2054         if (is_child_cgroup(controller, path1, path2)) {
2055                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
2056                         ret = -ENOENT;
2057                         goto out;
2058                 }
2059                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2060                         /* this is just /cgroup/controller, return it as a dir */
2061                         sb->st_mode = S_IFDIR | 00555;
2062                         sb->st_nlink = 2;
2063                         ret = 0;
2064                         goto out;
2065                 }
2066                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
2067                         ret = -EACCES;
2068                         goto out;
2069                 }
2070
2071                 // get uid, gid, from '/tasks' file and make up a mode
2072                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2073                 sb->st_mode = S_IFDIR | 00755;
2074                 k = cgfs_get_key(controller, cgroup, NULL);
2075                 if (!k) {
2076                         sb->st_uid = sb->st_gid = 0;
2077                 } else {
2078                         sb->st_uid = k->uid;
2079                         sb->st_gid = k->gid;
2080                 }
2081                 free_key(k);
2082                 sb->st_nlink = 2;
2083                 ret = 0;
2084                 goto out;
2085         }
2086
2087         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
2088                 sb->st_mode = S_IFREG | k->mode;
2089                 sb->st_nlink = 1;
2090                 sb->st_uid = k->uid;
2091                 sb->st_gid = k->gid;
2092                 sb->st_size = 0;
2093                 free_key(k);
2094                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2095                         ret = -ENOENT;
2096                         goto out;
2097                 }
2098                 ret = 0;
2099         }
2100
2101 out:
2102         free(cgdir);
2103         return ret;
2104 }
2105
2106 int cg_opendir(const char *path, struct fuse_file_info *fi)
2107 {
2108         struct fuse_context *fc = fuse_get_context();
2109         const char *cgroup;
2110         struct file_info *dir_info;
2111         char *controller = NULL;
2112
2113         if (!fc)
2114                 return -EIO;
2115
2116         if (strcmp(path, "/cgroup") == 0) {
2117                 cgroup = NULL;
2118                 controller = NULL;
2119         } else {
2120                 // return list of keys for the controller, and list of child cgroups
2121                 controller = pick_controller_from_path(fc, path);
2122                 if (!controller)
2123                         return -errno;
2124
2125                 cgroup = find_cgroup_in_path(path);
2126                 if (!cgroup) {
2127                         /* this is just /cgroup/controller, return its contents */
2128                         cgroup = "/";
2129                 }
2130         }
2131
2132         pid_t initpid = lookup_initpid_in_store(fc->pid);
2133         if (initpid <= 1 || is_shared_pidns(initpid))
2134                 initpid = fc->pid;
2135         if (cgroup) {
2136                 if (!caller_may_see_dir(initpid, controller, cgroup))
2137                         return -ENOENT;
2138                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2139                         return -EACCES;
2140         }
2141
2142         /* we'll free this at cg_releasedir */
2143         dir_info = malloc(sizeof(*dir_info));
2144         if (!dir_info)
2145                 return -ENOMEM;
2146         dir_info->controller = must_copy_string(controller);
2147         dir_info->cgroup = must_copy_string(cgroup);
2148         dir_info->type = LXC_TYPE_CGDIR;
2149         dir_info->buf = NULL;
2150         dir_info->file = NULL;
2151         dir_info->buflen = 0;
2152
2153         fi->fh = (unsigned long)dir_info;
2154         return 0;
2155 }
2156
2157 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2158                 struct fuse_file_info *fi)
2159 {
2160         struct file_info *d = (struct file_info *)fi->fh;
2161         struct cgfs_files **list = NULL;
2162         int i, ret;
2163         char *nextcg = NULL;
2164         struct fuse_context *fc = fuse_get_context();
2165         char **clist = NULL;
2166
2167         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
2168                 return -EIO;
2169
2170         if (d->type != LXC_TYPE_CGDIR) {
2171                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
2172                 return -EIO;
2173         }
2174         if (!d->cgroup && !d->controller) {
2175                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
2176                 int i;
2177
2178                 for (i = 0;  i < num_hierarchies; i++) {
2179                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
2180                                 return -EIO;
2181                         }
2182                 }
2183                 return 0;
2184         }
2185
2186         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
2187                 // not a valid cgroup
2188                 ret = -EINVAL;
2189                 goto out;
2190         }
2191
2192         pid_t initpid = lookup_initpid_in_store(fc->pid);
2193         if (initpid <= 1 || is_shared_pidns(initpid))
2194                 initpid = fc->pid;
2195         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2196                 if (nextcg) {
2197                         ret = filler(buf, nextcg,  NULL, 0);
2198                         free(nextcg);
2199                         if (ret != 0) {
2200                                 ret = -EIO;
2201                                 goto out;
2202                         }
2203                 }
2204                 ret = 0;
2205                 goto out;
2206         }
2207
2208         for (i = 0; list && list[i]; i++) {
2209                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2210                         ret = -EIO;
2211                         goto out;
2212                 }
2213         }
2214
2215         // now get the list of child cgroups
2216
2217         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2218                 ret = 0;
2219                 goto out;
2220         }
2221         if (clist) {
2222                 for (i = 0; clist[i]; i++) {
2223                         if (filler(buf, clist[i], NULL, 0) != 0) {
2224                                 ret = -EIO;
2225                                 goto out;
2226                         }
2227                 }
2228         }
2229         ret = 0;
2230
2231 out:
2232         free_keys(list);
2233         if (clist) {
2234                 for (i = 0; clist[i]; i++)
2235                         free(clist[i]);
2236                 free(clist);
2237         }
2238         return ret;
2239 }
2240
2241 void do_release_file_info(struct fuse_file_info *fi)
2242 {
2243         struct file_info *f = (struct file_info *)fi->fh;
2244
2245         if (!f)
2246                 return;
2247
2248         fi->fh = 0;
2249
2250         free(f->controller);
2251         f->controller = NULL;
2252         free(f->cgroup);
2253         f->cgroup = NULL;
2254         free(f->file);
2255         f->file = NULL;
2256         free(f->buf);
2257         f->buf = NULL;
2258         free(f);
2259         f = NULL;
2260 }
2261
/*
 * FUSE releasedir handler: frees the handle stored in fi->fh.
 * @path is not used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);
        return 0;
}
2267
2268 int cg_open(const char *path, struct fuse_file_info *fi)
2269 {
2270         const char *cgroup;
2271         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2272         struct cgfs_files *k = NULL;
2273         struct file_info *file_info;
2274         struct fuse_context *fc = fuse_get_context();
2275         int ret;
2276
2277         if (!fc)
2278                 return -EIO;
2279
2280         controller = pick_controller_from_path(fc, path);
2281         if (!controller)
2282                 return -errno;
2283         cgroup = find_cgroup_in_path(path);
2284         if (!cgroup)
2285                 return -errno;
2286
2287         get_cgdir_and_path(cgroup, &cgdir, &last);
2288         if (!last) {
2289                 path1 = "/";
2290                 path2 = cgdir;
2291         } else {
2292                 path1 = cgdir;
2293                 path2 = last;
2294         }
2295
2296         k = cgfs_get_key(controller, path1, path2);
2297         if (!k) {
2298                 ret = -EINVAL;
2299                 goto out;
2300         }
2301         free_key(k);
2302
2303         pid_t initpid = lookup_initpid_in_store(fc->pid);
2304         if (initpid <= 1 || is_shared_pidns(initpid))
2305                 initpid = fc->pid;
2306         if (!caller_may_see_dir(initpid, controller, path1)) {
2307                 ret = -ENOENT;
2308                 goto out;
2309         }
2310         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2311                 ret = -EACCES;
2312                 goto out;
2313         }
2314
2315         /* we'll free this at cg_release */
2316         file_info = malloc(sizeof(*file_info));
2317         if (!file_info) {
2318                 ret = -ENOMEM;
2319                 goto out;
2320         }
2321         file_info->controller = must_copy_string(controller);
2322         file_info->cgroup = must_copy_string(path1);
2323         file_info->file = must_copy_string(path2);
2324         file_info->type = LXC_TYPE_CGFILE;
2325         file_info->buf = NULL;
2326         file_info->buflen = 0;
2327
2328         fi->fh = (unsigned long)file_info;
2329         ret = 0;
2330
2331 out:
2332         free(cgdir);
2333         return ret;
2334 }
2335
2336 int cg_access(const char *path, int mode)
2337 {
2338         int ret;
2339         const char *cgroup;
2340         char *path1, *path2, *controller;
2341         char *last = NULL, *cgdir = NULL;
2342         struct cgfs_files *k = NULL;
2343         struct fuse_context *fc = fuse_get_context();
2344
2345         if (strcmp(path, "/cgroup") == 0)
2346                 return 0;
2347
2348         if (!fc)
2349                 return -EIO;
2350
2351         controller = pick_controller_from_path(fc, path);
2352         if (!controller)
2353                 return -errno;
2354         cgroup = find_cgroup_in_path(path);
2355         if (!cgroup) {
2356                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2357                 if ((mode & W_OK) == 0)
2358                         return 0;
2359                 return -EACCES;
2360         }
2361
2362         get_cgdir_and_path(cgroup, &cgdir, &last);
2363         if (!last) {
2364                 path1 = "/";
2365                 path2 = cgdir;
2366         } else {
2367                 path1 = cgdir;
2368                 path2 = last;
2369         }
2370
2371         k = cgfs_get_key(controller, path1, path2);
2372         if (!k) {
2373                 if ((mode & W_OK) == 0)
2374                         ret = 0;
2375                 else
2376                         ret = -EACCES;
2377                 goto out;
2378         }
2379         free_key(k);
2380
2381         pid_t initpid = lookup_initpid_in_store(fc->pid);
2382         if (initpid <= 1 || is_shared_pidns(initpid))
2383                 initpid = fc->pid;
2384         if (!caller_may_see_dir(initpid, controller, path1)) {
2385                 ret = -ENOENT;
2386                 goto out;
2387         }
2388         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2389                 ret = -EACCES;
2390                 goto out;
2391         }
2392
2393         ret = 0;
2394
2395 out:
2396         free(cgdir);
2397         return ret;
2398 }
2399
/*
 * FUSE release handler: frees the handle stored in fi->fh.
 * @path is not used.  Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);
        return 0;
}
2405
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events or roughly
 * @timeout seconds have passed.  An epoll_wait() interrupted by a signal
 * is restarted with the remaining time.
 *
 * Returns true when an event was reported.  Returns false on timeout or
 * error; errno is 0 when the deadline had already passed before waiting
 * and otherwise holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
        struct epoll_event ev;
        int epfd, ret, saved_errno;
        /* keep timestamps as time_t so they are not truncated to int */
        time_t starttime, now, deltatime;

        if ((starttime = time(NULL)) < 0)
                return false;

        if ((epfd = epoll_create(1)) < 0) {
                lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
                return false;
        }

        ev.events = POLLIN_SET;
        ev.data.fd = sock;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
                lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
                close(epfd);
                return false;
        }

again:
        if ((now = time(NULL)) < 0) {
                close(epfd);
                return false;
        }

        /* whole seconds left until the deadline */
        deltatime = (starttime + timeout) - now;
        if (deltatime < 0) { // timeout
                errno = 0;
                close(epfd);
                return false;
        }
        ret = epoll_wait(epfd, &ev, 1, (int)(1000 * deltatime + 1));
        if (ret < 0 && errno == EINTR)
                goto again;

        /* close() may change errno; keep the one from epoll_wait() */
        saved_errno = errno;
        close(epfd);

        if (ret <= 0) {
                errno = saved_errno;
                return false;
        }
        return true;
}
2453
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting through
 * wait_for_sock() with a timeout of 2 for data to arrive.
 * Returns the result of recv(), or -1 when the wait fails.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
        if (wait_for_sock(sockfd, 2))
                return recv(sockfd, buf, len, MSG_DONTWAIT);

        return -1;
}
2460
2461 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2462 {
2463         struct msghdr msg = { 0 };
2464         struct iovec iov;
2465         struct cmsghdr *cmsg;
2466         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2467         char buf[1];
2468         buf[0] = 'p';
2469
2470         if (pingfirst) {
2471                 if (msgrecv(sock, buf, 1) != 1) {
2472                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2473                         return SEND_CREDS_FAIL;
2474                 }
2475         }
2476
2477         msg.msg_control = cmsgbuf;
2478         msg.msg_controllen = sizeof(cmsgbuf);
2479
2480         cmsg = CMSG_FIRSTHDR(&msg);
2481         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2482         cmsg->cmsg_level = SOL_SOCKET;
2483         cmsg->cmsg_type = SCM_CREDENTIALS;
2484         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2485
2486         msg.msg_name = NULL;
2487         msg.msg_namelen = 0;
2488
2489         buf[0] = v;
2490         iov.iov_base = buf;
2491         iov.iov_len = sizeof(buf);
2492         msg.msg_iov = &iov;
2493         msg.msg_iovlen = 1;
2494
2495         if (sendmsg(sock, &msg, 0) < 0) {
2496                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2497                 if (errno == 3)
2498                         return SEND_CREDS_NOTSK;
2499                 return SEND_CREDS_FAIL;
2500         }
2501
2502         return SEND_CREDS_OK;
2503 }
2504
/*
 * Receive a one-byte message with SCM_CREDENTIALS ancillary data from
 * @sock.
 *
 * SO_PASSCRED is enabled on @sock, a single '1' byte is written to the
 * peer, and then one message is awaited through wait_for_sock() with a
 * timeout of 2.
 *
 * @cred is preset to pid/uid/gid -1 and only overwritten when the message
 * carries a matching credentials header.  @v is preset to '1' and on
 * success receives the message byte.
 *
 * Returns true when a message was received, false otherwise.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
        char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
        char buf[1] = { '1' };
        int enable = 1;
        int ret;
        struct cmsghdr *cmsg;
        struct iovec iov = {
                .iov_base = buf,
                .iov_len = sizeof(buf),
        };
        struct msghdr msg = {
                .msg_name = NULL,
                .msg_namelen = 0,
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = cmsgbuf,
                .msg_controllen = sizeof(cmsgbuf),
        };

        /* defaults reported when nothing usable arrives */
        *v = '1';
        cred->pid = -1;
        cred->uid = -1;
        cred->gid = -1;

        if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
                lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
                return false;
        }

        /* a sender using pingfirst waits for this byte before sending */
        if (write(sock, buf, 1) != 1) {
                lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
                return false;
        }

        if (!wait_for_sock(sock, 2)) {
                lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
                return false;
        }

        ret = recvmsg(sock, &msg, MSG_DONTWAIT);
        if (ret < 0) {
                lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
                return false;
        }

        /* accept the credentials only from a well-formed header */
        cmsg = CMSG_FIRSTHDR(&msg);
        if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
                        cmsg->cmsg_level == SOL_SOCKET &&
                        cmsg->cmsg_type == SCM_CREDENTIALS)
                memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

        *v = buf[0];
        return true;
}
2562
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
        /* pipe pair: the wrapper closes [0] and writes its ACK byte to [1] */
        int *cpipe;
        /* forwarded as first argument to @wrapped */
        int sock;
        /* forwarded as second argument to @wrapped */
        pid_t tpid;
        int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2569
2570 /*
2571  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2572  * with clone(). This simply writes '1' as ACK back to the parent
2573  * before calling the actual wrapped function.
2574  */
2575 static int pid_ns_clone_wrapper(void *arg) {
2576         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2577         char b = '1';
2578
2579         close(args->cpipe[0]);
2580         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2581                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2582         close(args->cpipe[1]);
2583         return args->wrapped(args->sock, args->tpid);
2584 }
2585
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * The loop ends when recv_creds() fails or the received message byte is
 * '1'.  Returns 1 when writing a pid back fails, 0 otherwise.
 * @tpid is not used here.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
        struct ucred cred;
        char cmd = '0';

        for (;;) {
                if (!recv_creds(sock, &cred, &cmd))
                        break;

                /* '1' asks us to stop */
                if (cmd == '1')
                        break;

                if (write(sock, &cred.pid, sizeof(cred.pid)) != sizeof(cred.pid))
                        return 1;
        }

        return 0;
}
2604
2605
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you clone will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * This function never returns: it calls _exit(0) once the cloned child
 * was waited for successfully and _exit(1) on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
        int newnsfd = -1, ret, cpipe[2];
        char fnam[100];
        pid_t cpid;
        char v;

        /* enter the pid namespace of tpid through /proc/<tpid>/ns/pid */
        ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
        if (ret < 0 || ret >= sizeof(fnam))
                _exit(1);
        newnsfd = open(fnam, O_RDONLY);
        if (newnsfd < 0)
                _exit(1);
        if (setns(newnsfd, 0) < 0)
                _exit(1);
        close(newnsfd);

        /* pipe on which the cloned child acknowledges that it is running */
        if (pipe(cpipe) < 0)
                _exit(1);

        struct pid_ns_clone_args args = {
                .cpipe = cpipe,
                .sock = sock,
                .tpid = tpid,
                .wrapped = &pid_to_ns
        };
        /* the child gets one page of stack; clone() is given its upper end */
        size_t stack_size = sysconf(_SC_PAGESIZE);
        void *stack = alloca(stack_size);

        cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
        if (cpid < 0)
                _exit(1);

        // give the child 1 second to be done forking and
        // write its ack
        if (!wait_for_sock(cpipe[0], 1))
                _exit(1);
        ret = read(cpipe[0], &v, 1);
        if (ret != sizeof(char) || v != '1')
                _exit(1);

        /* the child now runs pid_to_ns(); report how it ended */
        if (!wait_for_pid(cpid))
                _exit(1);
        _exit(0);
}
2662
/*
 * Read a pid-list file of a cgroup and translate every pid into the pid
 * namespace of @tpid.
 *
 * The file content is read here with cgfs_get_value().  A forked child
 * runs pid_to_ns_wrapper(), which enters the target pidns; each pid is
 * sent to it as SCM_CREDENTIALS over a socketpair and the translated
 * value read back is appended to *d with must_strcat_pid().
 *
 * Returns true when all pids were processed and the child was told to
 * exit, false otherwise.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
        int sock[2] = {-1, -1};
        char *tmpdata = NULL;
        int ret;
        pid_t qpid, cpid = -1;
        bool answer = false;
        char v = '0';
        struct ucred cred;
        size_t sz = 0, asz = 0;

        if (!cgfs_get_value(contrl, cg, file, &tmpdata))
                return false;

        /*
         * Now we read the pids from returned data one by one, pass
         * them into a child in the target namespace, read back the
         * translated pids, and put them into our to-return data
         */

        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                free(tmpdata);
                return false;
        }

        cpid = fork();
        if (cpid == -1)
                goto out;

        if (!cpid) // child - exits when done
                pid_to_ns_wrapper(sock[1], tpid);

        char *ptr = tmpdata;
        cred.uid = 0;
        cred.gid = 0;
        /* one decimal pid per line */
        while (sscanf(ptr, "%d\n", &qpid) == 1) {
                cred.pid = qpid;
                ret = send_creds(sock[0], &cred, v, true);

                /*
                 * SEND_CREDS_NOTSK: skip this pid and carry on.
                 * NOTE(review): send_creds() has already consumed the
                 * child's ping byte at this point, and recv_creds() only
                 * writes one ping per call - confirm the next iteration
                 * cannot stall waiting for a ping.
                 */
                if (ret == SEND_CREDS_NOTSK)
                        goto next;
                if (ret == SEND_CREDS_FAIL)
                        goto out;

                // read converted results
                if (!wait_for_sock(sock[0], 2)) {
                        lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
                        goto out;
                }
                if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
                        lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
                        goto out;
                }
                must_strcat_pid(d, &sz, &asz, qpid);
next:
                /* advance to the start of the next line, if any */
                ptr = strchr(ptr, '\n');
                if (!ptr)
                        break;
                ptr++;
        }

        /* a message byte of '1' makes pid_to_ns() in the child stop */
        cred.pid = getpid();
        v = '1';
        if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
                // failed to ask child to exit
                lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
                goto out;
        }

        answer = true;

out:
        free(tmpdata);
        if (cpid != -1)
                wait_for_pid(cpid);
        if (sock[0] != -1) {
                close(sock[0]);
                close(sock[1]);
        }
        return answer;
}
2750
2751 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2752                 struct fuse_file_info *fi)
2753 {
2754         struct fuse_context *fc = fuse_get_context();
2755         struct file_info *f = (struct file_info *)fi->fh;
2756         struct cgfs_files *k = NULL;
2757         char *data = NULL;
2758         int ret, s;
2759         bool r;
2760
2761         if (f->type != LXC_TYPE_CGFILE) {
2762                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2763                 return -EIO;
2764         }
2765
2766         if (offset)
2767                 return 0;
2768
2769         if (!fc)
2770                 return -EIO;
2771
2772         if (!f->controller)
2773                 return -EINVAL;
2774
2775         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2776                 return -EINVAL;
2777         }
2778         free_key(k);
2779
2780
2781         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2782                 ret = -EACCES;
2783                 goto out;
2784         }
2785
2786         if (strcmp(f->file, "tasks") == 0 ||
2787                         strcmp(f->file, "/tasks") == 0 ||
2788                         strcmp(f->file, "/cgroup.procs") == 0 ||
2789                         strcmp(f->file, "cgroup.procs") == 0)
2790                 // special case - we have to translate the pids
2791                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2792         else
2793                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2794
2795         if (!r) {
2796                 ret = -EINVAL;
2797                 goto out;
2798         }
2799
2800         if (!data) {
2801                 ret = 0;
2802                 goto out;
2803         }
2804         s = strlen(data);
2805         if (s > size)
2806                 s = size;
2807         memcpy(buf, data, s);
2808         if (s > 0 && s < size && data[s-1] != '\n')
2809                 buf[s++] = '\n';
2810
2811         ret = s;
2812
2813 out:
2814         free(data);
2815         return ret;
2816 }
2817
2818 static int pid_from_ns(int sock, pid_t tpid)
2819 {
2820         pid_t vpid;
2821         struct ucred cred;
2822         char v;
2823         int ret;
2824
2825         cred.uid = 0;
2826         cred.gid = 0;
2827         while (1) {
2828                 if (!wait_for_sock(sock, 2)) {
2829                         lxcfs_error("%s\n", "Timeout reading from parent.");
2830                         return 1;
2831                 }
2832                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2833                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2834                         return 1;
2835                 }
2836                 if (vpid == -1) // done
2837                         break;
2838                 v = '0';
2839                 cred.pid = vpid;
2840                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2841                         v = '1';
2842                         cred.pid = getpid();
2843                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2844                                 return 1;
2845                 }
2846         }
2847         return 0;
2848 }
2849
2850 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2851 {
2852         int newnsfd = -1, ret, cpipe[2];
2853         char fnam[100];
2854         pid_t cpid;
2855         char v;
2856
2857         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2858         if (ret < 0 || ret >= sizeof(fnam))
2859                 _exit(1);
2860         newnsfd = open(fnam, O_RDONLY);
2861         if (newnsfd < 0)
2862                 _exit(1);
2863         if (setns(newnsfd, 0) < 0)
2864                 _exit(1);
2865         close(newnsfd);
2866
2867         if (pipe(cpipe) < 0)
2868                 _exit(1);
2869
2870         struct pid_ns_clone_args args = {
2871                 .cpipe = cpipe,
2872                 .sock = sock,
2873                 .tpid = tpid,
2874                 .wrapped = &pid_from_ns
2875         };
2876         size_t stack_size = sysconf(_SC_PAGESIZE);
2877         void *stack = alloca(stack_size);
2878
2879         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2880         if (cpid < 0)
2881                 _exit(1);
2882
2883         // give the child 1 second to be done forking and
2884         // write its ack
2885         if (!wait_for_sock(cpipe[0], 1))
2886                 _exit(1);
2887         ret = read(cpipe[0], &v, 1);
2888         if (ret != sizeof(char) || v != '1')
2889                 _exit(1);
2890
2891         if (!wait_for_pid(cpid))
2892                 _exit(1);
2893         _exit(0);
2894 }
2895
/*
 * Translate host @uid into the user namespace of @pid.
 *
 * Opens /proc/@pid/uid_map and stores the result of convert_id_to_ns()
 * in *@answer.
 *
 * Returns true when a mapping was found. Returns false when the map file
 * cannot be opened (then *@answer is left untouched) or when the lookup
 * yields -1 (then *@answer holds (uid_t)-1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
        char path[64];
        FILE *f;
        int ret;

        ret = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
        if (ret < 0 || (size_t)ret >= sizeof(path))
                return false;

        f = fopen(path, "r");
        if (!f)
                return false;

        *answer = convert_id_to_ns(f, uid);
        fclose(f);

        /* uid_t is unsigned, so spell out the "no mapping" sentinel. */
        return *answer != (uid_t)-1;
}
2917
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * The first number on the "Uid:" and "Gid:" lines is stored in *@uid and
 * *@gid. Both are preset to -1 and keep that value when the file cannot be
 * opened or the corresponding line is never seen. Parsing stops at the
 * first malformed "Uid:"/"Gid:" line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
        char line[400];
        uid_t u;
        gid_t g;
        FILE *f;

        *uid = -1;
        *gid = -1;

        /* line doubles as the path buffer before it is used for reading. */
        snprintf(line, sizeof(line), "/proc/%d/status", pid);
        f = fopen(line, "r");
        if (!f) {
                lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
                return;
        }

        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "Uid:", 4) == 0) {
                        if (sscanf(line + 4, "%u", &u) != 1) {
                                lxcfs_error("bad uid line for pid %d\n", pid);
                                break;
                        }
                        *uid = u;
                } else if (strncmp(line, "Gid:", 4) == 0) {
                        if (sscanf(line + 4, "%u", &g) != 1) {
                                lxcfs_error("bad gid line for pid %d\n", pid);
                                break;
                        }
                        *gid = g;
                }
        }

        fclose(f);
}
2956
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . they are owned by the same uid
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
        uid_t v_uid, mapped;
        gid_t v_gid;

        if (r == v || r_uid == 0)
                return true;

        get_pid_creds(v, &v_uid, &v_gid);
        if (v_uid == r_uid)
                return true;

        /* @r_uid has to map to 0 inside @r's user namespace ... */
        if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
                return false;

        /* ... and the victim's uid has to have a mapping there too. */
        return hostuid_to_ns(v_uid, r, &mapped);
}
2982
2983 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2984                 const char *file, const char *buf)
2985 {
2986         int sock[2] = {-1, -1};
2987         pid_t qpid, cpid = -1;
2988         FILE *pids_file = NULL;
2989         bool answer = false, fail = false;
2990
2991         pids_file = open_pids_file(contrl, cg);
2992         if (!pids_file)
2993                 return false;
2994
2995         /*
2996          * write the pids to a socket, have helper in writer's pidns
2997          * call movepid for us
2998          */
2999         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
3000                 perror("socketpair");
3001                 goto out;
3002         }
3003
3004         cpid = fork();
3005         if (cpid == -1)
3006                 goto out;
3007
3008         if (!cpid) { // child
3009                 fclose(pids_file);
3010                 pid_from_ns_wrapper(sock[1], tpid);
3011         }
3012
3013         const char *ptr = buf;
3014         while (sscanf(ptr, "%d", &qpid) == 1) {
3015                 struct ucred cred;
3016                 char v;
3017
3018                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
3019                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
3020                         goto out;
3021                 }
3022
3023                 if (recv_creds(sock[0], &cred, &v)) {
3024                         if (v == '0') {
3025                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
3026                                         fail = true;
3027                                         break;
3028                                 }
3029                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
3030                                         fail = true;
3031                         }
3032                 }
3033
3034                 ptr = strchr(ptr, '\n');
3035                 if (!ptr)
3036                         break;
3037                 ptr++;
3038         }
3039
3040         /* All good, write the value */
3041         qpid = -1;
3042         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
3043                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
3044
3045         if (!fail)
3046                 answer = true;
3047
3048 out:
3049         if (cpid != -1)
3050                 wait_for_pid(cpid);
3051         if (sock[0] != -1) {
3052                 close(sock[0]);
3053                 close(sock[1]);
3054         }
3055         if (pids_file) {
3056                 if (fclose(pids_file) != 0)
3057                         answer = false;
3058         }
3059         return answer;
3060 }
3061
3062 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
3063              struct fuse_file_info *fi)
3064 {
3065         struct fuse_context *fc = fuse_get_context();
3066         char *localbuf = NULL;
3067         struct cgfs_files *k = NULL;
3068         struct file_info *f = (struct file_info *)fi->fh;
3069         bool r;
3070
3071         if (f->type != LXC_TYPE_CGFILE) {
3072                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3073                 return -EIO;
3074         }
3075
3076         if (offset)
3077                 return 0;
3078
3079         if (!fc)
3080                 return -EIO;
3081
3082         localbuf = alloca(size+1);
3083         localbuf[size] = '\0';
3084         memcpy(localbuf, buf, size);
3085
3086         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3087                 size = -EINVAL;
3088                 goto out;
3089         }
3090
3091         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3092                 size = -EACCES;
3093                 goto out;
3094         }
3095
3096         if (strcmp(f->file, "tasks") == 0 ||
3097                         strcmp(f->file, "/tasks") == 0 ||
3098                         strcmp(f->file, "/cgroup.procs") == 0 ||
3099                         strcmp(f->file, "cgroup.procs") == 0)
3100                 // special case - we have to translate the pids
3101                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3102         else
3103                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3104
3105         if (!r)
3106                 size = -EINVAL;
3107
3108 out:
3109         free_key(k);
3110         return size;
3111 }
3112
3113 int cg_chown(const char *path, uid_t uid, gid_t gid)
3114 {
3115         struct fuse_context *fc = fuse_get_context();
3116         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3117         struct cgfs_files *k = NULL;
3118         const char *cgroup;
3119         int ret;
3120
3121         if (!fc)
3122                 return -EIO;
3123
3124         if (strcmp(path, "/cgroup") == 0)
3125                 return -EPERM;
3126
3127         controller = pick_controller_from_path(fc, path);
3128         if (!controller)
3129                 return errno == ENOENT ? -EPERM : -errno;
3130
3131         cgroup = find_cgroup_in_path(path);
3132         if (!cgroup)
3133                 /* this is just /cgroup/controller */
3134                 return -EPERM;
3135
3136         get_cgdir_and_path(cgroup, &cgdir, &last);
3137
3138         if (!last) {
3139                 path1 = "/";
3140                 path2 = cgdir;
3141         } else {
3142                 path1 = cgdir;
3143                 path2 = last;
3144         }
3145
3146         if (is_child_cgroup(controller, path1, path2)) {
3147                 // get uid, gid, from '/tasks' file and make up a mode
3148                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3149                 k = cgfs_get_key(controller, cgroup, "tasks");
3150
3151         } else
3152                 k = cgfs_get_key(controller, path1, path2);
3153
3154         if (!k) {
3155                 ret = -EINVAL;
3156                 goto out;
3157         }
3158
3159         /*
3160          * This being a fuse request, the uid and gid must be valid
3161          * in the caller's namespace.  So we can just check to make
3162          * sure that the caller is root in his uid, and privileged
3163          * over the file's current owner.
3164          */
3165         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
3166                 ret = -EACCES;
3167                 goto out;
3168         }
3169
3170         ret = cgfs_chown_file(controller, cgroup, uid, gid);
3171
3172 out:
3173         free_key(k);
3174         free(cgdir);
3175
3176         return ret;
3177 }
3178
3179 int cg_chmod(const char *path, mode_t mode)
3180 {
3181         struct fuse_context *fc = fuse_get_context();
3182         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
3183         struct cgfs_files *k = NULL;
3184         const char *cgroup;
3185         int ret;
3186
3187         if (!fc)
3188                 return -EIO;
3189
3190         if (strcmp(path, "/cgroup") == 0)
3191                 return -EPERM;
3192
3193         controller = pick_controller_from_path(fc, path);
3194         if (!controller)
3195                 return errno == ENOENT ? -EPERM : -errno;
3196
3197         cgroup = find_cgroup_in_path(path);
3198         if (!cgroup)
3199                 /* this is just /cgroup/controller */
3200                 return -EPERM;
3201
3202         get_cgdir_and_path(cgroup, &cgdir, &last);
3203
3204         if (!last) {
3205                 path1 = "/";
3206                 path2 = cgdir;
3207         } else {
3208                 path1 = cgdir;
3209                 path2 = last;
3210         }
3211
3212         if (is_child_cgroup(controller, path1, path2)) {
3213                 // get uid, gid, from '/tasks' file and make up a mode
3214                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3215                 k = cgfs_get_key(controller, cgroup, "tasks");
3216
3217         } else
3218                 k = cgfs_get_key(controller, path1, path2);
3219
3220         if (!k) {
3221                 ret = -EINVAL;
3222                 goto out;
3223         }
3224
3225         /*
3226          * This being a fuse request, the uid and gid must be valid
3227          * in the caller's namespace.  So we can just check to make
3228          * sure that the caller is root in his uid, and privileged
3229          * over the file's current owner.
3230          */
3231         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3232                 ret = -EPERM;
3233                 goto out;
3234         }
3235
3236         if (!cgfs_chmod_file(controller, cgroup, mode)) {
3237                 ret = -EINVAL;
3238                 goto out;
3239         }
3240
3241         ret = 0;
3242 out:
3243         free_key(k);
3244         free(cgdir);
3245         return ret;
3246 }
3247
3248 int cg_mkdir(const char *path, mode_t mode)
3249 {
3250         struct fuse_context *fc = fuse_get_context();
3251         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3252         const char *cgroup;
3253         int ret;
3254
3255         if (!fc)
3256                 return -EIO;
3257
3258         controller = pick_controller_from_path(fc, path);
3259         if (!controller)
3260                 return errno == ENOENT ? -EPERM : -errno;
3261
3262         cgroup = find_cgroup_in_path(path);
3263         if (!cgroup)
3264                 return -errno;
3265
3266         get_cgdir_and_path(cgroup, &cgdir, &last);
3267         if (!last)
3268                 path1 = "/";
3269         else
3270                 path1 = cgdir;
3271
3272         pid_t initpid = lookup_initpid_in_store(fc->pid);
3273         if (initpid <= 1 || is_shared_pidns(initpid))
3274                 initpid = fc->pid;
3275         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3276                 if (!next)
3277                         ret = -EINVAL;
3278                 else if (last && strcmp(next, last) == 0)
3279                         ret = -EEXIST;
3280                 else
3281                         ret = -EPERM;
3282                 goto out;
3283         }
3284
3285         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3286                 ret = -EACCES;
3287                 goto out;
3288         }
3289         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3290                 ret = -EACCES;
3291                 goto out;
3292         }
3293
3294         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3295
3296 out:
3297         free(cgdir);
3298         free(next);
3299         return ret;
3300 }
3301
3302 int cg_rmdir(const char *path)
3303 {
3304         struct fuse_context *fc = fuse_get_context();
3305         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3306         const char *cgroup;
3307         int ret;
3308
3309         if (!fc)
3310                 return -EIO;
3311
3312         controller = pick_controller_from_path(fc, path);
3313         if (!controller) /* Someone's trying to delete "/cgroup". */
3314                 return -EPERM;
3315
3316         cgroup = find_cgroup_in_path(path);
3317         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3318                 return -EPERM;
3319
3320         get_cgdir_and_path(cgroup, &cgdir, &last);
3321         if (!last) {
3322                 /* Someone's trying to delete a cgroup on the same level as the
3323                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3324                  * rmdir "/cgroup/blkio/init.slice".
3325                  */
3326                 ret = -EPERM;
3327                 goto out;
3328         }
3329
3330         pid_t initpid = lookup_initpid_in_store(fc->pid);
3331         if (initpid <= 1 || is_shared_pidns(initpid))
3332                 initpid = fc->pid;
3333         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3334                 if (!last || (next && (strcmp(next, last) == 0)))
3335                         ret = -EBUSY;
3336                 else
3337                         ret = -ENOENT;
3338                 goto out;
3339         }
3340
3341         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3342                 ret = -EACCES;
3343                 goto out;
3344         }
3345         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3346                 ret = -EACCES;
3347                 goto out;
3348         }
3349
3350         if (!cgfs_remove(controller, cgroup)) {
3351                 ret = -EINVAL;
3352                 goto out;
3353         }
3354
3355         ret = 0;
3356
3357 out:
3358         free(cgdir);
3359         free(next);
3360         return ret;
3361 }
3362
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
        size_t preflen = strlen(pref);

        return strncmp(line, pref, preflen) == 0;
}
3369
/*
 * Extract selected "total_*" counters from the text in @memstat.
 *
 * @memstat is scanned line by line. When a line starts with one of the
 * keys below and a number follows the key, that number divided by 1024 is
 * stored through the matching output pointer. Lines that match no key, or
 * that match a key but carry no parsable number, leave the outputs
 * untouched, so a malformed line cannot rescale a value stored earlier.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable, unsigned long *shmem)
{
        const struct {
                const char *key;
                unsigned long *out;
        } fields[] = {
                { "total_cache",         cached        },
                { "total_active_anon",   active_anon   },
                { "total_inactive_anon", inactive_anon },
                { "total_active_file",   active_file   },
                { "total_inactive_file", inactive_file },
                { "total_unevictable",   unevictable   },
                { "total_shmem",         shmem         },
        };
        const size_t nfields = sizeof(fields) / sizeof(fields[0]);
        char *eol;

        while (*memstat) {
                for (size_t i = 0; i < nfields; i++) {
                        size_t keylen = strlen(fields[i].key);
                        unsigned long value;

                        if (strncmp(memstat, fields[i].key, keylen) != 0)
                                continue;

                        /* Store only a value that was actually parsed. */
                        if (sscanf(memstat + keylen, "%lu", &value) == 1)
                                *fields[i].out = value / 1024;
                        break;
                }

                eol = strchr(memstat, '\n');
                if (!eol)
                        return;
                memstat = eol + 1;
        }
}
3406
/*
 * Find the line in @str that starts with "<major>:<minor> <iotype>" and
 * store the unsigned number that follows it in *@v.
 *
 * *@v is set to 0 first and stays 0 when no line matches. Only the first
 * matching line is considered.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32];
        size_t len;
        char *eol;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        len = strlen(key);
        *v = 0;

        for (; *str; str = eol + 1) {
                if (startswith(str, key)) {
                        sscanf(str + len, "%lu", v);
                        return;
                }

                eol = strchr(str, '\n');
                if (!eol)
                        return;
        }
}
3429
3430 int read_file(const char *path, char *buf, size_t size, struct file_info *d)
3431 {
3432         size_t linelen = 0, total_len = 0, rv = 0;
3433         char *line = NULL;
3434         char *cache = d->buf;
3435         size_t cache_size = d->buflen;
3436         FILE *f = fopen(path, "r");
3437         if (!f)
3438                 return 0;
3439
3440         while (getline(&line, &linelen, f) != -1) {
3441                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3442                 if (l < 0) {
3443                         perror("Error writing to cache");
3444                         rv = 0;
3445                         goto err;
3446                 }
3447                 if (l >= cache_size) {
3448                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3449                         rv = 0;
3450                         goto err;
3451                 }
3452                 cache += l;
3453                 cache_size -= l;
3454                 total_len += l;
3455         }
3456
3457         d->size = total_len;
3458         if (total_len > size)
3459                 total_len = size;
3460
3461         /* read from off 0 */
3462         memcpy(buf, d->buf, total_len);
3463         rv = total_len;
3464   err:
3465         fclose(f);
3466         free(line);
3467         return rv;
3468 }
3469
3470 /*
3471  * FUSE ops for /proc
3472  */
3473
/*
 * Read @file of the "memory" controller for @cgroup and return its content
 * parsed as a base-10 unsigned long. Returns (unsigned long)-1 when
 * cgfs_get_value() fails.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
        char *value = NULL;
        unsigned long limit;

        if (cgfs_get_value("memory", cgroup, file, &value))
                limit = strtoul(value, NULL, 10);
        else
                limit = (unsigned long)-1;

        free(value);
        return limit;
}
3486
/*
 * Return the smallest value of @file found in @cgroup and all of its
 * ancestors, as reported by get_memlimit().
 *
 * The walk starts at @cgroup and applies dirname() until "/" is reached.
 * Ancestors for which get_memlimit() returns (unsigned long)-1 are ignored.
 * If the lookup fails for @cgroup itself and no ancestor supplies a
 * smaller value, (unsigned long)-1 is returned.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
        char *copy = strdupa(cgroup);
        unsigned long memlimit, retlimit;

        retlimit = get_memlimit(copy, file);

        /*
         * dirname() of a path without a leading '/' bottoms out at "."
         * rather than "/", so stop there as well instead of looping forever.
         */
        while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
                copy = dirname(copy);
                memlimit = get_memlimit(copy, file);
                if (memlimit != (unsigned long)-1 && memlimit < retlimit)
                        retlimit = memlimit;
        }

        return retlimit;
}
3503
3504 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3505                 struct fuse_file_info *fi)
3506 {
3507         struct fuse_context *fc = fuse_get_context();
3508         struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
3509         struct file_info *d = (struct file_info *)fi->fh;
3510         char *cg;
3511         char *memusage_str = NULL, *memstat_str = NULL,
3512                 *memswlimit_str = NULL, *memswusage_str = NULL;
3513         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3514                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3515                 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3516                 hostswtotal = 0;
3517         char *line = NULL;
3518         size_t linelen = 0, total_len = 0, rv = 0;
3519         char *cache = d->buf;
3520         size_t cache_size = d->buflen;
3521         FILE *f = NULL;
3522
3523         if (offset){
3524                 if (offset > d->size)
3525                         return -EINVAL;
3526                 if (!d->cached)
3527                         return 0;
3528                 int left = d->size - offset;
3529                 total_len = left > size ? size: left;
3530                 memcpy(buf, cache + offset, total_len);
3531                 return total_len;
3532         }
3533
3534         pid_t initpid = lookup_initpid_in_store(fc->pid);
3535         if (initpid <= 1 || is_shared_pidns(initpid))
3536                 initpid = fc->pid;
3537         cg = get_pid_cgroup(initpid, "memory");
3538         if (!cg)
3539                 return read_file("/proc/meminfo", buf, size, d);
3540         prune_init_slice(cg);
3541
3542         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3543         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3544                 goto err;
3545         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3546                 goto err;
3547
3548         // Following values are allowed to fail, because swapaccount might be turned
3549         // off for current kernel
3550         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3551                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3552         {
3553                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3554                 memswusage = strtoul(memswusage_str, NULL, 10);
3555
3556                 memswlimit = memswlimit / 1024;
3557                 memswusage = memswusage / 1024;
3558         }
3559
3560         memusage = strtoul(memusage_str, NULL, 10);
3561         memlimit /= 1024;
3562         memusage /= 1024;
3563
3564         parse_memstat(memstat_str, &cached, &active_anon,
3565                         &inactive_anon, &active_file, &inactive_file,
3566                         &unevictable, &shmem);
3567
3568         f = fopen("/proc/meminfo", "r");
3569         if (!f)
3570                 goto err;
3571
3572         while (getline(&line, &linelen, f) != -1) {
3573                 ssize_t l;
3574                 char *printme, lbuf[100];
3575
3576                 memset(lbuf, 0, 100);
3577                 if (startswith(line, "MemTotal:")) {
3578                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3579                         if (hosttotal < memlimit)
3580                                 memlimit = hosttotal;
3581                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3582                         printme = lbuf;
3583                 } else if (startswith(line, "MemFree:")) {
3584                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3585                         printme = lbuf;
3586                 } else if (startswith(line, "MemAvailable:")) {
3587                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3588                         printme = lbuf;
3589                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts && opts->swap_off == false) {
3590                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3591                         if (hostswtotal < memswlimit)
3592                                 memswlimit = hostswtotal;
3593                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3594                         printme = lbuf;
3595                 } else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
3596                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", 0UL);
3597                         printme = lbuf;
3598                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts && opts->swap_off == false) {
3599                         unsigned long swaptotal = memswlimit,
3600                                         swapusage = memswusage - memusage,
3601                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3602                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3603                         printme = lbuf;
3604                 } else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
3605                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", 0UL);
3606                         printme = lbuf;
3607                 } else if (startswith(line, "Slab:")) {
3608                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3609                         printme = lbuf;
3610                 } else if (startswith(line, "Buffers:")) {
3611                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3612                         printme = lbuf;
3613                 } else if (startswith(line, "Cached:")) {
3614                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3615                         printme = lbuf;
3616                 } else if (startswith(line, "SwapCached:")) {
3617                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3618                         printme = lbuf;
3619                 } else if (startswith(line, "Active:")) {
3620                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3621                                         active_anon + active_file);
3622                         printme = lbuf;
3623                 } else if (startswith(line, "Inactive:")) {
3624                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3625                                         inactive_anon + inactive_file);
3626                         printme = lbuf;
3627                 } else if (startswith(line, "Active(anon)")) {
3628                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3629                         printme = lbuf;
3630                 } else if (startswith(line, "Inactive(anon)")) {
3631                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3632                         printme = lbuf;
3633                 } else if (startswith(line, "Active(file)")) {
3634                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3635                         printme = lbuf;
3636                 } else if (startswith(line, "Inactive(file)")) {
3637                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3638                         printme = lbuf;
3639                 } else if (startswith(line, "Unevictable")) {
3640                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3641                         printme = lbuf;
3642                 } else if (startswith(line, "SReclaimable")) {
3643                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3644                         printme = lbuf;
3645                 } else if (startswith(line, "SUnreclaim")) {
3646                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3647                         printme = lbuf;
3648                 } else if (startswith(line, "Shmem:")) {
3649                         snprintf(lbuf, 100, "Shmem:          %8lu kB\n", shmem);
3650                         printme = lbuf;
3651                 } else if (startswith(line, "ShmemHugePages")) {
3652                         snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3653                         printme = lbuf;
3654                 } else if (startswith(line, "ShmemPmdMapped")) {
3655                         snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3656                         printme = lbuf;
3657                 } else
3658                         printme = line;
3659
3660                 l = snprintf(cache, cache_size, "%s", printme);
3661                 if (l < 0) {
3662                         perror("Error writing to cache");
3663                         rv = 0;
3664                         goto err;
3665
3666                 }
3667                 if (l >= cache_size) {
3668                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3669                         rv = 0;
3670                         goto err;
3671                 }
3672
3673                 cache += l;
3674                 cache_size -= l;
3675                 total_len += l;
3676         }
3677
3678         d->cached = 1;
3679         d->size = total_len;
3680         if (total_len > size ) total_len = size;
3681         memcpy(buf, d->buf, total_len);
3682
3683         rv = total_len;
3684 err:
3685         if (f)
3686                 fclose(f);
3687         free(line);
3688         free(cg);
3689         free(memusage_str);
3690         free(memswlimit_str);
3691         free(memswusage_str);
3692         free(memstat_str);
3693         return rv;
3694 }
3695
/*
 * Fetch the content of cpuset.cpus for the cgroup @cg.
 *
 * On success the value produced by cgfs_get_value() is handed to the caller,
 * who must free() it. NULL is returned when the value could not be read.
 */
char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3708
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Parse a "processor : N" line and report whether CPU N is contained in
 * @cpuset. A line that does not match that pattern yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu_id;

	return sscanf(line, "processor       : %d", &cpu_id) == 1 &&
	       cpu_in_cpuset(cpu_id, cpuset);
}
3719
/*
 * Read a cgroup CPU bandwidth parameter of @cg.
 *
 * @param selects the file: the name "cpu.cfs_<param>_us" is built from it, so
 * "quota" reads `cpu.cfs_quota_us` and "period" reads `cpu.cfs_period_us`.
 *
 * On success the parsed number is stored through @value and true is returned.
 * false is returned when the file name does not fit the local buffer, the
 * file cannot be read, or its content does not start with an integer.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; // cpu.cfs__us + quota/period + \0
	char *str = NULL;
	int len;

	/* Bounded formatting: an unexpected @param must not overflow file[]. */
	len = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
	if (len < 0 || (size_t)len >= sizeof(file))
		return false;

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* SCNd64 matches int64_t on every ABI; "%ld" only does where long is
	 * 64 bits wide.
	 */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str);
	return rv;
}
3745
/*
 * Return the maximum number of visible CPUs for the cgroup @cg.
 *
 * - If either CFS parameter cannot be read, 0 is returned.
 * - If no quota is in effect (quota or period <= 0), the number of CPUs in
 *   the cgroup's cpuset is returned when it is positive, otherwise 0.
 * - Otherwise the result is quota / period rounded up, capped by the value of
 *   get_nprocs() and by the cpuset size (when that is positive).
 */
int max_cpu_count(const char *cg)
{
	int rv, nprocs;
	int64_t cfs_quota, cfs_period, quota_cpus;
	int nr_cpus_in_cpuset = 0;
	char *cpuset;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	cpuset = get_cpuset(cg);
	if (cpuset) {
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
		/* get_cpuset() hands over ownership of the string; only the
		 * count is needed from here on.
		 */
		free(cpuset);
	}

	if (cfs_quota <= 0 || cfs_period <= 0) {
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	quota_cpus = cfs_quota / cfs_period;

	/* In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		quota_cpus += 1;

	nprocs = get_nprocs();

	/* Clamp while still in 64 bits so that narrowing to int cannot wrap. */
	if (quota_cpus > nprocs)
		quota_cpus = nprocs;

	rv = (int)quota_cpus;

	/* use min value in cpu quota and cpuset */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}
3793
/*
 * Compute the fractional number of CPUs granted to @cg by its CFS quota,
 * i.e. quota / period as a double, capped at the value of get_nprocs().
 *
 * Zero is returned when either parameter cannot be read or when no quota is
 * in effect (quota or period <= 0).
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota_us, period_us;
	double cpus;
	int online;

	if (!read_cpu_cfs_param(cg, "quota", &quota_us) ||
	    !read_cpu_cfs_param(cg, "period", &period_us))
		return 0;

	if (quota_us <= 0 || period_us <= 0)
		return 0;

	cpus = (double)quota_us / (double)period_us;
	online = get_nprocs();

	return cpus > online ? online : cpus;
}
3822
/*
 * Decide whether CPU views should be used: they are enabled only when both
 * the "cpu" and the "cpuacct" controllers are found by
 * find_mounted_controller(). @cg is not consulted.
 */
bool use_cpuview(const char *cg)
{
	static const char *const required[] = { "cpu", "cpuacct" };
	size_t idx;
	int cfd;

	for (idx = 0; idx < sizeof(required) / sizeof(required[0]); idx++) {
		if (!find_mounted_controller(required[idx], &cfd))
			return false;
	}

	return true;
}
3841
/*
 * Tell whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. whether a CPU number can be scanned from it.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor       : %d", &ignored) == 1;
}
3853
3854 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3855                 struct fuse_file_info *fi)
3856 {
3857         struct fuse_context *fc = fuse_get_context();
3858         struct file_info *d = (struct file_info *)fi->fh;
3859         char *cg;
3860         char *cpuset = NULL;
3861         char *line = NULL;
3862         size_t linelen = 0, total_len = 0, rv = 0;
3863         bool am_printing = false, firstline = true, is_s390x = false;
3864         int curcpu = -1, cpu, max_cpus = 0;
3865         bool use_view;
3866         char *cache = d->buf;
3867         size_t cache_size = d->buflen;
3868         FILE *f = NULL;
3869
3870         if (offset){
3871                 if (offset > d->size)
3872                         return -EINVAL;
3873                 if (!d->cached)
3874                         return 0;
3875                 int left = d->size - offset;
3876                 total_len = left > size ? size: left;
3877                 memcpy(buf, cache + offset, total_len);
3878                 return total_len;
3879         }
3880
3881         pid_t initpid = lookup_initpid_in_store(fc->pid);
3882         if (initpid <= 1 || is_shared_pidns(initpid))
3883                 initpid = fc->pid;
3884         cg = get_pid_cgroup(initpid, "cpuset");
3885         if (!cg)
3886                 return read_file("proc/cpuinfo", buf, size, d);
3887         prune_init_slice(cg);
3888
3889         cpuset = get_cpuset(cg);
3890         if (!cpuset)
3891                 goto err;
3892
3893         use_view = use_cpuview(cg);
3894
3895         if (use_view)
3896                 max_cpus = max_cpu_count(cg);
3897
3898         f = fopen("/proc/cpuinfo", "r");
3899         if (!f)
3900                 goto err;
3901
3902         while (getline(&line, &linelen, f) != -1) {
3903                 ssize_t l;
3904                 if (firstline) {
3905                         firstline = false;
3906                         if (strstr(line, "IBM/S390") != NULL) {
3907                                 is_s390x = true;
3908                                 am_printing = true;
3909                                 continue;
3910                         }
3911                 }
3912                 if (strncmp(line, "# processors:", 12) == 0)
3913                         continue;
3914                 if (is_processor_line(line)) {
3915                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3916                                 break;
3917                         am_printing = cpuline_in_cpuset(line, cpuset);
3918                         if (am_printing) {
3919                                 curcpu ++;
3920                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3921                                 if (l < 0) {
3922                                         perror("Error writing to cache");
3923                                         rv = 0;
3924                                         goto err;
3925                                 }
3926                                 if (l >= cache_size) {
3927                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3928                                         rv = 0;
3929                                         goto err;
3930                                 }
3931                                 cache += l;
3932                                 cache_size -= l;
3933                                 total_len += l;
3934                         }
3935                         continue;
3936                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3937                         char *p;
3938                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3939                                 break;
3940                         if (!cpu_in_cpuset(cpu, cpuset))
3941                                 continue;
3942                         curcpu ++;
3943                         p = strchr(line, ':');
3944                         if (!p || !*p)
3945                                 goto err;
3946                         p++;
3947                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3948                         if (l < 0) {
3949                                 perror("Error writing to cache");
3950                                 rv = 0;
3951                                 goto err;
3952                         }
3953                         if (l >= cache_size) {
3954                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3955                                 rv = 0;
3956                                 goto err;
3957                         }
3958                         cache += l;
3959                         cache_size -= l;
3960                         total_len += l;
3961                         continue;
3962
3963                 }
3964                 if (am_printing) {
3965                         l = snprintf(cache, cache_size, "%s", line);
3966                         if (l < 0) {
3967                                 perror("Error writing to cache");
3968                                 rv = 0;
3969                                 goto err;
3970                         }
3971                         if (l >= cache_size) {
3972                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3973                                 rv = 0;
3974                                 goto err;
3975                         }
3976                         cache += l;
3977                         cache_size -= l;
3978                         total_len += l;
3979                 }
3980         }
3981
3982         if (is_s390x) {
3983                 char *origcache = d->buf;
3984                 ssize_t l;
3985                 do {
3986                         d->buf = malloc(d->buflen);
3987                 } while (!d->buf);
3988                 cache = d->buf;
3989                 cache_size = d->buflen;
3990                 total_len = 0;
3991                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3992                 if (l < 0 || l >= cache_size) {
3993                         free(origcache);
3994                         goto err;
3995                 }
3996                 cache_size -= l;
3997                 cache += l;
3998                 total_len += l;
3999                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
4000                 if (l < 0 || l >= cache_size) {
4001                         free(origcache);
4002                         goto err;
4003                 }
4004                 cache_size -= l;
4005                 cache += l;
4006                 total_len += l;
4007                 l = snprintf(cache, cache_size, "%s", origcache);
4008                 free(origcache);
4009                 if (l < 0 || l >= cache_size)
4010                         goto err;
4011                 total_len += l;
4012         }
4013
4014         d->cached = 1;
4015         d->size = total_len;
4016         if (total_len > size ) total_len = size;
4017
4018         /* read from off 0 */
4019         memcpy(buf, d->buf, total_len);
4020         rv = total_len;
4021 err:
4022         if (f)
4023                 fclose(f);
4024         free(line);
4025         free(cpuset);
4026         free(cg);
4027         return rv;
4028 }
4029
4030 static uint64_t get_reaper_start_time(pid_t pid)
4031 {
4032         int ret;
4033         FILE *f;
4034         uint64_t starttime;
4035         /* strlen("/proc/") = 6
4036          * +
4037          * LXCFS_NUMSTRLEN64
4038          * +
4039          * strlen("/stat") = 5
4040          * +
4041          * \0 = 1
4042          * */
4043 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
4044         char path[__PROC_PID_STAT_LEN];
4045         pid_t qpid;
4046
4047         qpid = lookup_initpid_in_store(pid);
4048         if (qpid <= 0) {
4049                 /* Caller can check for EINVAL on 0. */
4050                 errno = EINVAL;
4051                 return 0;
4052         }
4053
4054         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
4055         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
4056                 /* Caller can check for EINVAL on 0. */
4057                 errno = EINVAL;
4058                 return 0;
4059         }
4060
4061         f = fopen(path, "r");
4062         if (!f) {
4063                 /* Caller can check for EINVAL on 0. */
4064                 errno = EINVAL;
4065                 return 0;
4066         }
4067
4068         /* Note that the *scanf() argument supression requires that length
4069          * modifiers such as "l" are omitted. Otherwise some compilers will yell
4070          * at us. It's like telling someone you're not married and then asking
4071          * if you can bring your wife to the party.
4072          */
4073         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
4074                         "%*s "      /* (2)  comm        %s   */
4075                         "%*c "      /* (3)  state       %c   */
4076                         "%*d "      /* (4)  ppid        %d   */
4077                         "%*d "      /* (5)  pgrp        %d   */
4078                         "%*d "      /* (6)  session     %d   */
4079                         "%*d "      /* (7)  tty_nr      %d   */
4080                         "%*d "      /* (8)  tpgid       %d   */
4081                         "%*u "      /* (9)  flags       %u   */
4082                         "%*u "      /* (10) minflt      %lu  */
4083                         "%*u "      /* (11) cminflt     %lu  */
4084                         "%*u "      /* (12) majflt      %lu  */
4085                         "%*u "      /* (13) cmajflt     %lu  */
4086                         "%*u "      /* (14) utime       %lu  */
4087                         "%*u "      /* (15) stime       %lu  */
4088                         "%*d "      /* (16) cutime      %ld  */
4089                         "%*d "      /* (17) cstime      %ld  */
4090                         "%*d "      /* (18) priority    %ld  */
4091                         "%*d "      /* (19) nice        %ld  */
4092                         "%*d "      /* (20) num_threads %ld  */
4093                         "%*d "      /* (21) itrealvalue %ld  */
4094                         "%" PRIu64, /* (22) starttime   %llu */
4095                      &starttime);
4096         if (ret != 1) {
4097                 fclose(f);
4098                 /* Caller can check for EINVAL on 0. */
4099                 errno = EINVAL;
4100                 return 0;
4101         }
4102
4103         fclose(f);
4104
4105         errno = 0;
4106         return starttime;
4107 }
4108
/*
 * Convert the value returned by get_reaper_start_time() for @pid from clock
 * ticks to seconds, using sysconf(_SC_CLK_TCK) as the tick rate.
 *
 * Returns 0 if the start time cannot be retrieved or if the tick rate is not
 * a positive number.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/* Reject every non-positive result, not only -1 with errno == EINVAL:
	 * a negative value converted to an unsigned divisor, or a zero one,
	 * would yield a meaningless quotient.
	 */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return (double)clockticks / ticks_per_sec;
}
4133
4134 static double get_reaper_age(pid_t pid)
4135 {
4136         uint64_t uptime_ms;
4137         double procstart, procage;
4138
4139         /* We need to substract the time the process has started since system
4140          * boot minus the time when the system has started to get the actual
4141          * reaper age.
4142          */
4143         procstart = get_reaper_start_time_in_sec(pid);
4144         procage = procstart;
4145         if (procstart > 0) {
4146                 int ret;
4147                 struct timespec spec;
4148
4149                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4150                 if (ret < 0)
4151                         return 0;
4152
4153                 /* We could make this more precise here by using the tv_nsec
4154                  * field in the timespec struct and convert it to milliseconds
4155                  * and then create a double for the seconds and milliseconds but
4156                  * that seems more work than it is worth.
4157                  */
4158                 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
4159                 procage = (uptime_ms - (procstart * 1000)) / 1000;
4160         }
4161
4162         return procage;
4163 }
4164
4165 /*
4166  * Returns 0 on success.
4167  * It is the caller's responsibility to free `return_usage`, unless this
4168  * function returns an error.
4169  */
4170 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
4171 {
4172         int cpucount = get_nprocs_conf();
4173         struct cpuacct_usage *cpu_usage;
4174         int rv = 0, i, j, ret;
4175         int cg_cpu;
4176         uint64_t cg_user, cg_system;
4177         int64_t ticks_per_sec;
4178         char *usage_str = NULL;
4179
4180         ticks_per_sec = sysconf(_SC_CLK_TCK);
4181
4182         if (ticks_per_sec < 0 && errno == EINVAL) {
4183                 lxcfs_v(
4184                         "%s\n",
4185                         "read_cpuacct_usage_all failed to determine number of clock ticks "
4186                         "in a second");
4187                 return -1;
4188         }
4189
4190         cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
4191         if (!cpu_usage)
4192                 return -ENOMEM;
4193
4194         memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
4195         if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
4196                 // read cpuacct.usage_percpu instead
4197                 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
4198                 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) {
4199                         rv = -1;
4200                         goto err;
4201                 }
4202                 lxcfs_v("usage_str: %s\n", usage_str);
4203
4204                 // convert cpuacct.usage_percpu into cpuacct.usage_all
4205                 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
4206
4207                 char *data = NULL;
4208                 size_t sz = 0, asz = 0;
4209
4210                 must_strcat(&data, &sz, &asz, "cpu user system\n");
4211
4212                 int i = 0, read_pos = 0, read_cnt=0;
4213                 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
4214                         lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4215                         must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
4216                         i++;
4217                         read_pos += read_cnt;
4218                 }
4219
4220                 free(usage_str);
4221                 usage_str = data;
4222
4223                 lxcfs_v("usage_str: %s\n", usage_str);
4224         }
4225
4226         int read_pos = 0, read_cnt=0;
4227         if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4228                 lxcfs_error("read_cpuacct_usage_all reading first line from "
4229                                 "%s/cpuacct.usage_all failed.\n", cg);
4230                 rv = -1;
4231                 goto err;
4232         }
4233
4234         read_pos += read_cnt;
4235
4236         for (i = 0, j = 0; i < cpucount; i++) {
4237                 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4238                                 &cg_system, &read_cnt);
4239
4240                 if (ret == EOF)
4241                         break;
4242
4243                 if (ret != 3) {
4244                         lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4245                                         "failed.\n", cg);
4246                         rv = -1;
4247                         goto err;
4248                 }
4249
4250                 read_pos += read_cnt;
4251
4252                 /* Convert the time from nanoseconds to USER_HZ */
4253                 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4254                 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4255                 j++;
4256         }
4257
4258         rv = 0;
4259         *return_usage = cpu_usage;
4260         *size = cpucount;
4261
4262 err:
4263         if (usage_str)
4264                 free(usage_str);
4265
4266         if (rv != 0) {
4267                 free(cpu_usage);
4268                 *return_usage = NULL;
4269         }
4270
4271         return rv;
4272 }
4273
4274 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4275 {
4276         int i;
4277         unsigned long sum = 0;
4278
4279         for (i = 0; i < cpu_count; i++) {
4280                 if (!newer[i].online)
4281                         continue;
4282
4283                 /* When cpuset is changed on the fly, the CPUs might get reordered.
4284                  * We could either reset all counters, or check that the substractions
4285                  * below will return expected results.
4286                  */
4287                 if (newer[i].user > older[i].user)
4288                         diff[i].user = newer[i].user - older[i].user;
4289                 else
4290                         diff[i].user = 0;
4291
4292                 if (newer[i].system > older[i].system)
4293                         diff[i].system = newer[i].system - older[i].system;
4294                 else
4295                         diff[i].system = 0;
4296
4297                 if (newer[i].idle > older[i].idle)
4298                         diff[i].idle = newer[i].idle - older[i].idle;
4299                 else
4300                         diff[i].idle = 0;
4301
4302                 sum += diff[i].user;
4303                 sum += diff[i].system;
4304                 sum += diff[i].idle;
4305         }
4306
4307         return sum;
4308 }
4309
4310 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4311 {
4312         unsigned long free_space, to_add;
4313
4314         free_space = threshold - usage->user - usage->system;
4315
4316         if (free_space > usage->idle)
4317                 free_space = usage->idle;
4318
4319         to_add = free_space > *surplus ? *surplus : free_space;
4320
4321         *counter += to_add;
4322         usage->idle -= to_add;
4323         *surplus -= to_add;
4324 }
4325
4326 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4327 {
4328         struct cg_proc_stat *first = NULL, *prev, *tmp;
4329
4330         for (prev = NULL; node; ) {
4331                 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4332                         tmp = node;
4333                         lxcfs_debug("Removing stat node for %s\n", node->cg);
4334
4335                         if (prev)
4336                                 prev->next = node->next;
4337                         else
4338                                 first = node->next;
4339
4340                         node = node->next;
4341                         free_proc_stat_node(tmp);
4342                 } else {
4343                         if (!first)
4344                                 first = node;
4345                         prev = node;
4346                         node = node->next;
4347                 }
4348         }
4349
4350         return first;
4351 }
4352
4353 #define PROC_STAT_PRUNE_INTERVAL 10
4354 static void prune_proc_stat_history(void)
4355 {
4356         int i;
4357         time_t now = time(NULL);
4358
4359         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4360                 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4361
4362                 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4363                         pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4364                         return;
4365                 }
4366
4367                 if (proc_stat_history[i]->next) {
4368                         proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4369                         proc_stat_history[i]->lastcheck = now;
4370                 }
4371
4372                 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4373         }
4374 }
4375
4376 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4377 {
4378         struct cg_proc_stat *node;
4379
4380         pthread_rwlock_rdlock(&head->lock);
4381
4382         if (!head->next) {
4383                 pthread_rwlock_unlock(&head->lock);
4384                 return NULL;
4385         }
4386
4387         node = head->next;
4388
4389         do {
4390                 if (strcmp(cg, node->cg) == 0)
4391                         goto out;
4392         } while ((node = node->next));
4393
4394         node = NULL;
4395
4396 out:
4397         pthread_rwlock_unlock(&head->lock);
4398         prune_proc_stat_history();
4399         return node;
4400 }
4401
4402 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4403 {
4404         struct cg_proc_stat *node;
4405         int i;
4406
4407         node = malloc(sizeof(struct cg_proc_stat));
4408         if (!node)
4409                 goto err;
4410
4411         node->cg = NULL;
4412         node->usage = NULL;
4413         node->view = NULL;
4414
4415         node->cg = malloc(strlen(cg) + 1);
4416         if (!node->cg)
4417                 goto err;
4418
4419         strcpy(node->cg, cg);
4420
4421         node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4422         if (!node->usage)
4423                 goto err;
4424
4425         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4426
4427         node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4428         if (!node->view)
4429                 goto err;
4430
4431         node->cpu_count = cpu_count;
4432         node->next = NULL;
4433
4434         if (pthread_mutex_init(&node->lock, NULL) != 0) {
4435                 lxcfs_error("%s\n", "Failed to initialize node lock");
4436                 goto err;
4437         }
4438
4439         for (i = 0; i < cpu_count; i++) {
4440                 node->view[i].user = 0;
4441                 node->view[i].system = 0;
4442                 node->view[i].idle = 0;
4443         }
4444
4445         return node;
4446
4447 err:
4448         if (node && node->cg)
4449                 free(node->cg);
4450         if (node && node->usage)
4451                 free(node->usage);
4452         if (node && node->view)
4453                 free(node->view);
4454         if (node)
4455                 free(node);
4456
4457         return NULL;
4458 }
4459
4460 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4461 {
4462         int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4463         struct cg_proc_stat_head *head = proc_stat_history[hash];
4464         struct cg_proc_stat *node, *rv = new_node;
4465
4466         pthread_rwlock_wrlock(&head->lock);
4467
4468         if (!head->next) {
4469                 head->next = new_node;
4470                 goto out;
4471         }
4472
4473         node = head->next;
4474
4475         for (;;) {
4476                 if (strcmp(node->cg, new_node->cg) == 0) {
4477                         /* The node is already present, return it */
4478                         free_proc_stat_node(new_node);
4479                         rv = node;
4480                         goto out;
4481                 }
4482
4483                 if (node->next) {
4484                         node = node->next;
4485                         continue;
4486                 }
4487
4488                 node->next = new_node;
4489                 goto out;
4490         }
4491
4492 out:
4493         pthread_rwlock_unlock(&head->lock);
4494         return rv;
4495 }
4496
4497 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4498 {
4499         struct cpuacct_usage *new_usage, *new_view;
4500         int i;
4501
4502         /* Allocate new memory */
4503         new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4504         if (!new_usage)
4505                 return false;
4506
4507         new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4508         if (!new_view) {
4509                 free(new_usage);
4510                 return false;
4511         }
4512
4513         /* Copy existing data & initialize new elements */
4514         for (i = 0; i < cpu_count; i++) {
4515                 if (i < node->cpu_count) {
4516                         new_usage[i].user = node->usage[i].user;
4517                         new_usage[i].system = node->usage[i].system;
4518                         new_usage[i].idle = node->usage[i].idle;
4519
4520                         new_view[i].user = node->view[i].user;
4521                         new_view[i].system = node->view[i].system;
4522                         new_view[i].idle = node->view[i].idle;
4523                 } else {
4524                         new_usage[i].user = 0;
4525                         new_usage[i].system = 0;
4526                         new_usage[i].idle = 0;
4527
4528                         new_view[i].user = 0;
4529                         new_view[i].system = 0;
4530                         new_view[i].idle = 0;
4531                 }
4532         }
4533
4534         free(node->usage);
4535         free(node->view);
4536
4537         node->usage = new_usage;
4538         node->view = new_view;
4539         node->cpu_count = cpu_count;
4540
4541         return true;
4542 }
4543
4544 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4545 {
4546         int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4547         struct cg_proc_stat_head *head = proc_stat_history[hash];
4548         struct cg_proc_stat *node;
4549
4550         node = find_proc_stat_node(head, cg);
4551
4552         if (!node) {
4553                 node = new_proc_stat_node(usage, cpu_count, cg);
4554                 if (!node)
4555                         return NULL;
4556
4557                 node = add_proc_stat_node(node);
4558                 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4559         }
4560
4561         pthread_mutex_lock(&node->lock);
4562
4563         /* If additional CPUs on the host have been enabled, CPU usage counter
4564          * arrays have to be expanded */
4565         if (node->cpu_count < cpu_count) {
4566                 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4567                                 node->cpu_count, cpu_count, cg);
4568
4569                 if (!expand_proc_stat_node(node, cpu_count)) {
4570                         pthread_mutex_unlock(&node->lock);
4571                         lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4572                                         node->cpu_count, cpu_count, cg);
4573                         return NULL;
4574                 }
4575         }
4576
4577         return node;
4578 }
4579
4580 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4581 {
4582         int i;
4583
4584         lxcfs_debug("Resetting stat node for %s\n", node->cg);
4585         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4586
4587         for (i = 0; i < cpu_count; i++) {
4588                 node->view[i].user = 0;
4589                 node->view[i].system = 0;
4590                 node->view[i].idle = 0;
4591         }
4592
4593         node->cpu_count = cpu_count;
4594 }
4595
4596 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4597 {
4598         char *line = NULL;
4599         size_t linelen = 0, total_len = 0, rv = 0, l;
4600         int curcpu = -1; /* cpu numbering starts at 0 */
4601         int physcpu, i;
4602         int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4603         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4604         unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4605         unsigned long user_surplus = 0, system_surplus = 0;
4606         unsigned long total_sum, threshold;
4607         struct cg_proc_stat *stat_node;
4608         struct cpuacct_usage *diff = NULL;
4609         int nprocs = get_nprocs_conf();
4610
4611         if (cg_cpu_usage_size < nprocs)
4612                 nprocs = cg_cpu_usage_size;
4613
4614         /* Read all CPU stats and stop when we've encountered other lines */
4615         while (getline(&line, &linelen, f) != -1) {
4616                 int ret;
4617                 char cpu_char[10]; /* That's a lot of cores */
4618                 uint64_t all_used, cg_used;
4619
4620                 if (strlen(line) == 0)
4621                         continue;
4622                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4623                         /* not a ^cpuN line containing a number N */
4624                         break;
4625                 }
4626
4627                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4628                         continue;
4629
4630                 if (physcpu >= cg_cpu_usage_size)
4631                         continue;
4632
4633                 curcpu ++;
4634                 cpu_cnt ++;
4635
4636                 if (!cpu_in_cpuset(physcpu, cpuset)) {
4637                         for (i = curcpu; i <= physcpu; i++) {
4638                                 cg_cpu_usage[i].online = false;
4639                         }
4640                         continue;
4641                 }
4642
4643                 if (curcpu < physcpu) {
4644                         /* Some CPUs may be disabled */
4645                         for (i = curcpu; i < physcpu; i++)
4646                                 cg_cpu_usage[i].online = false;
4647
4648                         curcpu = physcpu;
4649                 }
4650
4651                 cg_cpu_usage[curcpu].online = true;
4652
4653                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4654                            &user,
4655                            &nice,
4656                            &system,
4657                            &idle,
4658                            &iowait,
4659                            &irq,
4660                            &softirq,
4661                            &steal,
4662                            &guest,
4663                            &guest_nice);
4664
4665                 if (ret != 10)
4666                         continue;
4667
4668                 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4669                 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4670
4671                 if (all_used >= cg_used) {
4672                         cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4673
4674                 } else {
4675                         lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4676                                         "%lu in cpuacct.usage_all; unable to determine idle time\n",
4677                                         curcpu, cg, all_used, cg_used);
4678                         cg_cpu_usage[curcpu].idle = idle;
4679                 }
4680         }
4681
4682         /* Cannot use more CPUs than is available due to cpuset */
4683         if (max_cpus > cpu_cnt)
4684                 max_cpus = cpu_cnt;
4685
4686         stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4687
4688         if (!stat_node) {
4689                 lxcfs_error("unable to find/create stat node for %s\n", cg);
4690                 rv = 0;
4691                 goto err;
4692         }
4693
4694         diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4695         if (!diff) {
4696                 rv = 0;
4697                 goto err;
4698         }
4699
4700         /*
4701          * If the new values are LOWER than values stored in memory, it means
4702          * the cgroup has been reset/recreated and we should reset too.
4703          */
4704         for (curcpu = 0; curcpu < nprocs; curcpu++) {
4705                 if (!cg_cpu_usage[curcpu].online)
4706                         continue;
4707
4708                 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4709                         reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4710
4711                 break;
4712         }
4713
4714         total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4715
4716         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4717                 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4718
4719                 if (!stat_node->usage[curcpu].online)
4720                         continue;
4721
4722                 i++;
4723
4724                 stat_node->usage[curcpu].user += diff[curcpu].user;
4725                 stat_node->usage[curcpu].system += diff[curcpu].system;
4726                 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4727
4728                 if (max_cpus > 0 && i >= max_cpus) {
4729                         user_surplus += diff[curcpu].user;
4730                         system_surplus += diff[curcpu].system;
4731                 }
4732         }
4733
4734         /* Calculate usage counters of visible CPUs */
4735         if (max_cpus > 0) {
4736                 /* threshold = maximum usage per cpu, including idle */
4737                 threshold = total_sum / cpu_cnt * max_cpus;
4738
4739                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4740                         if (!stat_node->usage[curcpu].online)
4741                                 continue;
4742
4743                         i++;
4744
4745                         if (i == max_cpus)
4746                                 break;
4747
4748                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4749                                 continue;
4750
4751                         /* Add user */
4752                         add_cpu_usage(
4753                                         &user_surplus,
4754                                         &diff[curcpu],
4755                                         &diff[curcpu].user,
4756                                         threshold);
4757
4758                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4759                                 continue;
4760
4761                         /* If there is still room, add system */
4762                         add_cpu_usage(
4763                                         &system_surplus,
4764                                         &diff[curcpu],
4765                                         &diff[curcpu].system,
4766                                         threshold);
4767                 }
4768
4769                 if (user_surplus > 0)
4770                         lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4771                 if (system_surplus > 0)
4772                         lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4773
4774                 unsigned long diff_user = 0;
4775                 unsigned long diff_system = 0;
4776                 unsigned long diff_idle = 0;
4777                 unsigned long max_diff_idle = 0;
4778                 unsigned long max_diff_idle_index = 0;
4779                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4780                         if (!stat_node->usage[curcpu].online)
4781                                 continue;
4782
4783                         i++;
4784
4785                         if (i == max_cpus)
4786                                 break;
4787
4788                         stat_node->view[curcpu].user += diff[curcpu].user;
4789                         stat_node->view[curcpu].system += diff[curcpu].system;
4790                         stat_node->view[curcpu].idle += diff[curcpu].idle;
4791
4792                         user_sum += stat_node->view[curcpu].user;
4793                         system_sum += stat_node->view[curcpu].system;
4794                         idle_sum += stat_node->view[curcpu].idle;
4795
4796                         diff_user += diff[curcpu].user;
4797                         diff_system += diff[curcpu].system;
4798                         diff_idle += diff[curcpu].idle;
4799                         if (diff[curcpu].idle > max_diff_idle) {
4800                                 max_diff_idle = diff[curcpu].idle;
4801                                 max_diff_idle_index = curcpu;
4802                         }
4803
4804                         lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4805                 }
4806                 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4807
4808                 // revise cpu usage view to support partial cpu case
4809                 double exact_cpus = exact_cpu_count(cg);
4810                 if (exact_cpus < (double)max_cpus){
4811                         lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4812                         unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4813                         lxcfs_v("delta: %lu\n", delta);
4814                         lxcfs_v("idle_sum before: %lu\n", idle_sum);
4815                         idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4816                         lxcfs_v("idle_sum after: %lu\n", idle_sum);
4817
4818                         curcpu = max_diff_idle_index;
4819                         lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4820                         stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4821                         lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4822                 }
4823         } else {
4824                 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4825                         if (!stat_node->usage[curcpu].online)
4826                                 continue;
4827
4828                         stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4829                         stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4830                         stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4831
4832                         user_sum += stat_node->view[curcpu].user;
4833                         system_sum += stat_node->view[curcpu].system;
4834                         idle_sum += stat_node->view[curcpu].idle;
4835                 }
4836         }
4837
4838         /* Render the file */
4839         /* cpu-all */
4840         l = snprintf(buf, buf_size, "cpu  %lu 0 %lu %lu 0 0 0 0 0 0\n",
4841                         user_sum,
4842                         system_sum,
4843                         idle_sum);
4844         lxcfs_v("cpu-all: %s\n", buf);
4845
4846         if (l < 0) {
4847                 perror("Error writing to cache");
4848                 rv = 0;
4849                 goto err;
4850         }
4851         if (l >= buf_size) {
4852                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4853                 rv = 0;
4854                 goto err;
4855         }
4856
4857         buf += l;
4858         buf_size -= l;
4859         total_len += l;
4860
4861         /* Render visible CPUs */
4862         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4863                 if (!stat_node->usage[curcpu].online)
4864                         continue;
4865
4866                 i++;
4867
4868                 if (max_cpus > 0 && i == max_cpus)
4869                         break;
4870
4871                 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4872                                 i,
4873                                 stat_node->view[curcpu].user,
4874                                 stat_node->view[curcpu].system,
4875                                 stat_node->view[curcpu].idle);
4876                 lxcfs_v("cpu: %s\n", buf);
4877
4878                 if (l < 0) {
4879                         perror("Error writing to cache");
4880                         rv = 0;
4881                         goto err;
4882
4883                 }
4884                 if (l >= buf_size) {
4885                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4886                         rv = 0;
4887                         goto err;
4888                 }
4889
4890                 buf += l;
4891                 buf_size -= l;
4892                 total_len += l;
4893         }
4894
4895         /* Pass the rest of /proc/stat, start with the last line read */
4896         l = snprintf(buf, buf_size, "%s", line);
4897
4898         if (l < 0) {
4899                 perror("Error writing to cache");
4900                 rv = 0;
4901                 goto err;
4902
4903         }
4904         if (l >= buf_size) {
4905                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4906                 rv = 0;
4907                 goto err;
4908         }
4909
4910         buf += l;
4911         buf_size -= l;
4912         total_len += l;
4913
4914         /* Pass the rest of the host's /proc/stat */
4915         while (getline(&line, &linelen, f) != -1) {
4916                 l = snprintf(buf, buf_size, "%s", line);
4917                 if (l < 0) {
4918                         perror("Error writing to cache");
4919                         rv = 0;
4920                         goto err;
4921                 }
4922                 if (l >= buf_size) {
4923                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4924                         rv = 0;
4925                         goto err;
4926                 }
4927                 buf += l;
4928                 buf_size -= l;
4929                 total_len += l;
4930         }
4931
4932         rv = total_len;
4933
4934 err:
4935         if (stat_node)
4936                 pthread_mutex_unlock(&stat_node->lock);
4937         if (line)
4938                 free(line);
4939         if (diff)
4940                 free(diff);
4941         return rv;
4942 }
4943
4944 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4945 static int proc_stat_read(char *buf, size_t size, off_t offset,
4946                 struct fuse_file_info *fi)
4947 {
4948         struct fuse_context *fc = fuse_get_context();
4949         struct file_info *d = (struct file_info *)fi->fh;
4950         char *cg;
4951         char *cpuset = NULL;
4952         char *line = NULL;
4953         size_t linelen = 0, total_len = 0, rv = 0;
4954         int curcpu = -1; /* cpu numbering starts at 0 */
4955         int physcpu = 0;
4956         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4957         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4958                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4959         char cpuall[CPUALL_MAX_SIZE];
4960         /* reserve for cpu all */
4961         char *cache = d->buf + CPUALL_MAX_SIZE;
4962         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4963         FILE *f = NULL;
4964         struct cpuacct_usage *cg_cpu_usage = NULL;
4965         int cg_cpu_usage_size = 0;
4966
4967         if (offset){
4968                 if (offset > d->size)
4969                         return -EINVAL;
4970                 if (!d->cached)
4971                         return 0;
4972                 int left = d->size - offset;
4973                 total_len = left > size ? size: left;
4974                 memcpy(buf, d->buf + offset, total_len);
4975                 return total_len;
4976         }
4977
4978         pid_t initpid = lookup_initpid_in_store(fc->pid);
4979         lxcfs_v("initpid: %d\n", initpid);
4980         if (initpid <= 0)
4981                 initpid = fc->pid;
4982
4983         /*
4984          * when container run with host pid namespace initpid == 1, cgroup will "/"
4985          * we should return host os's /proc contents.
4986          * in some case cpuacct_usage.all in "/" will larger then /proc/stat
4987          */
4988         if (initpid == 1) {
4989             return read_file("/proc/stat", buf, size, d);
4990         }
4991
4992         cg = get_pid_cgroup(initpid, "cpuset");
4993         lxcfs_v("cg: %s\n", cg);
4994         if (!cg)
4995                 return read_file("/proc/stat", buf, size, d);
4996         prune_init_slice(cg);
4997
4998         cpuset = get_cpuset(cg);
4999         if (!cpuset)
5000                 goto err;
5001
5002         /*
5003          * Read cpuacct.usage_all for all CPUs.
5004          * If the cpuacct cgroup is present, it is used to calculate the container's
5005          * CPU usage. If not, values from the host's /proc/stat are used.
5006          */
5007         if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
5008                 lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
5009                                 "falling back to the host's /proc/stat");
5010         }
5011
5012         f = fopen("/proc/stat", "r");
5013         if (!f)
5014                 goto err;
5015
5016         //skip first line
5017         if (getline(&line, &linelen, f) < 0) {
5018                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
5019                 goto err;
5020         }
5021
5022         if (use_cpuview(cg) && cg_cpu_usage) {
5023                 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
5024                                 f, d->buf, d->buflen);
5025                 goto out;
5026         }
5027
5028         while (getline(&line, &linelen, f) != -1) {
5029                 ssize_t l;
5030                 char cpu_char[10]; /* That's a lot of cores */
5031                 char *c;
5032                 uint64_t all_used, cg_used, new_idle;
5033                 int ret;
5034
5035                 if (strlen(line) == 0)
5036                         continue;
5037                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
5038                         /* not a ^cpuN line containing a number N, just print it */
5039                         l = snprintf(cache, cache_size, "%s", line);
5040                         if (l < 0) {
5041                                 perror("Error writing to cache");
5042                                 rv = 0;
5043                                 goto err;
5044                         }
5045                         if (l >= cache_size) {
5046                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5047                                 rv = 0;
5048                                 goto err;
5049                         }
5050                         cache += l;
5051                         cache_size -= l;
5052                         total_len += l;
5053                         continue;
5054                 }
5055
5056                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
5057                         continue;
5058                 if (!cpu_in_cpuset(physcpu, cpuset))
5059                         continue;
5060                 curcpu ++;
5061
5062                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
5063                            &user,
5064                            &nice,
5065                            &system,
5066                            &idle,
5067                            &iowait,
5068                            &irq,
5069                            &softirq,
5070                            &steal,
5071                            &guest,
5072                            &guest_nice);
5073
5074                 if (ret != 10 || !cg_cpu_usage) {
5075                         c = strchr(line, ' ');
5076                         if (!c)
5077                                 continue;
5078                         l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
5079                         if (l < 0) {
5080                                 perror("Error writing to cache");
5081                                 rv = 0;
5082                                 goto err;
5083
5084                         }
5085                         if (l >= cache_size) {
5086                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5087                                 rv = 0;
5088                                 goto err;
5089                         }
5090
5091                         cache += l;
5092                         cache_size -= l;
5093                         total_len += l;
5094
5095                         if (ret != 10)
5096                                 continue;
5097                 }
5098
5099                 if (cg_cpu_usage) {
5100                         if (physcpu >= cg_cpu_usage_size)
5101                                 break;
5102
5103                         all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
5104                         cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
5105
5106                         if (all_used >= cg_used) {
5107                                 new_idle = idle + (all_used - cg_used);
5108
5109                         } else {
5110                                 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
5111                                                 "%lu in cpuacct.usage_all; unable to determine idle time\n",
5112                                                 curcpu, cg, all_used, cg_used);
5113                                 new_idle = idle;
5114                         }
5115
5116                         l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
5117                                         curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
5118                                         new_idle);
5119
5120                         if (l < 0) {
5121                                 perror("Error writing to cache");
5122                                 rv = 0;
5123                                 goto err;
5124
5125                         }
5126                         if (l >= cache_size) {
5127                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5128                                 rv = 0;
5129                                 goto err;
5130                         }
5131
5132                         cache += l;
5133                         cache_size -= l;
5134                         total_len += l;
5135
5136                         user_sum += cg_cpu_usage[physcpu].user;
5137                         system_sum += cg_cpu_usage[physcpu].system;
5138                         idle_sum += new_idle;
5139
5140                 } else {
5141                         user_sum += user;
5142                         nice_sum += nice;
5143                         system_sum += system;
5144                         idle_sum += idle;
5145                         iowait_sum += iowait;
5146                         irq_sum += irq;
5147                         softirq_sum += softirq;
5148                         steal_sum += steal;
5149                         guest_sum += guest;
5150                         guest_nice_sum += guest_nice;
5151                 }
5152         }
5153
5154         cache = d->buf;
5155
5156         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5157                         user_sum,
5158                         nice_sum,
5159                         system_sum,
5160                         idle_sum,
5161                         iowait_sum,
5162                         irq_sum,
5163                         softirq_sum,
5164                         steal_sum,
5165                         guest_sum,
5166                         guest_nice_sum);
5167         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
5168                 memcpy(cache, cpuall, cpuall_len);
5169                 cache += cpuall_len;
5170         } else {
5171                 /* shouldn't happen */
5172                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
5173                 cpuall_len = 0;
5174         }
5175
5176         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
5177         total_len += cpuall_len;
5178
5179 out:
5180         d->cached = 1;
5181         d->size = total_len;
5182         if (total_len > size)
5183                 total_len = size;
5184
5185         memcpy(buf, d->buf, total_len);
5186         rv = total_len;
5187
5188 err:
5189         if (f)
5190                 fclose(f);
5191         if (cg_cpu_usage)
5192                 free(cg_cpu_usage);
5193         free(line);
5194         free(cpuset);
5195         free(cg);
5196         return rv;
5197 }
5198
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 *
 * Returns the value of cpuacct.usage divided by 10^9 (the kernel reports it in
 * nanoseconds, so the result is in seconds), or 0 if the init pid, the cpuacct
 * cgroup or the usage value cannot be obtained.
 */
static double get_reaper_busy(pid_t task)
{
        pid_t initpid = lookup_initpid_in_store(task);
        char *cgroup = NULL, *usage_str = NULL;
        uint64_t usage = 0;
        double res = 0;

        if (initpid <= 0)
                return 0;

        cgroup = get_pid_cgroup(initpid, "cpuacct");
        if (!cgroup)
                goto out;
        prune_init_slice(cgroup);
        if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
                goto out;
        /* Parse as 64 bit: with strtoul() the value would saturate at
         * ULONG_MAX (about 4.3 seconds worth of nanoseconds) where long is
         * 32 bits wide */
        usage = strtoull(usage_str, NULL, 10);
        res = (double)usage / 1000000000;

out:
        free(cgroup);
        free(usage_str);
        return res;
}
5230
#if RELOADTEST
/*
 * Leave a marker file behind. Only built when RELOADTEST is set; failure to
 * create the file is silently ignored.
 */
void iwashere(void)
{
        int fd = creat("/tmp/lxcfs-iwashere", 0644);

        if (fd < 0)
                return;

        close(fd);
}
#endif
5241
5242 /*
5243  * We read /proc/uptime and reuse its second field.
5244  * For the first field, we use the mtime for the reaper for
5245  * the calling pid as returned by getreaperage
5246  */
5247 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5248                 struct fuse_file_info *fi)
5249 {
5250         struct fuse_context *fc = fuse_get_context();
5251         struct file_info *d = (struct file_info *)fi->fh;
5252         double busytime = get_reaper_busy(fc->pid);
5253         char *cache = d->buf;
5254         ssize_t total_len = 0;
5255         double idletime, reaperage;
5256
5257 #if RELOADTEST
5258         iwashere();
5259 #endif
5260
5261         if (offset){
5262                 if (!d->cached)
5263                         return 0;
5264                 if (offset > d->size)
5265                         return -EINVAL;
5266                 int left = d->size - offset;
5267                 total_len = left > size ? size: left;
5268                 memcpy(buf, cache + offset, total_len);
5269                 return total_len;
5270         }
5271
5272         reaperage = get_reaper_age(fc->pid);
5273         /* To understand why this is done, please read the comment to the
5274          * get_reaper_busy() function.
5275          */
5276         idletime = reaperage;
5277         if (reaperage >= busytime)
5278                 idletime = reaperage - busytime;
5279
5280         total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
5281         if (total_len < 0 || total_len >=  d->buflen){
5282                 lxcfs_error("%s\n", "failed to write to cache");
5283                 return 0;
5284         }
5285
5286         d->size = (int)total_len;
5287         d->cached = 1;
5288
5289         if (total_len > size) total_len = size;
5290
5291         memcpy(buf, d->buf, total_len);
5292         return total_len;
5293 }
5294
5295 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5296                 struct fuse_file_info *fi)
5297 {
5298         char dev_name[72];
5299         struct fuse_context *fc = fuse_get_context();
5300         struct file_info *d = (struct file_info *)fi->fh;
5301         char *cg;
5302         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
5303                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
5304         unsigned long read = 0, write = 0;
5305         unsigned long read_merged = 0, write_merged = 0;
5306         unsigned long read_sectors = 0, write_sectors = 0;
5307         unsigned long read_ticks = 0, write_ticks = 0;
5308         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5309         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5310         char *cache = d->buf;
5311         size_t cache_size = d->buflen;
5312         char *line = NULL;
5313         size_t linelen = 0, total_len = 0, rv = 0;
5314         unsigned int major = 0, minor = 0;
5315         int i = 0;
5316         FILE *f = NULL;
5317
5318         if (offset){
5319                 if (offset > d->size)
5320                         return -EINVAL;
5321                 if (!d->cached)
5322                         return 0;
5323                 int left = d->size - offset;
5324                 total_len = left > size ? size: left;
5325                 memcpy(buf, cache + offset, total_len);
5326                 return total_len;
5327         }
5328
5329         pid_t initpid = lookup_initpid_in_store(fc->pid);
5330         if (initpid <= 1 || is_shared_pidns(initpid))
5331                 initpid = fc->pid;
5332         cg = get_pid_cgroup(initpid, "blkio");
5333         if (!cg)
5334                 return read_file("/proc/diskstats", buf, size, d);
5335         prune_init_slice(cg);
5336
5337         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
5338                 goto err;
5339         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
5340                 goto err;
5341         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
5342                 goto err;
5343         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
5344                 goto err;
5345         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
5346                 goto err;
5347
5348
5349         f = fopen("/proc/diskstats", "r");
5350         if (!f)
5351                 goto err;
5352
5353         while (getline(&line, &linelen, f) != -1) {
5354                 ssize_t l;
5355                 char lbuf[256];
5356
5357                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5358                 if (i != 3)
5359                         continue;
5360
5361                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5362                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5363                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5364                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5365                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5366                 read_sectors = read_sectors/512;
5367                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5368                 write_sectors = write_sectors/512;
5369
5370                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5371                 rd_svctm = rd_svctm/1000000;
5372                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5373                 rd_wait = rd_wait/1000000;
5374                 read_ticks = rd_svctm + rd_wait;
5375
5376                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5377                 wr_svctm =  wr_svctm/1000000;
5378                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5379                 wr_wait =  wr_wait/1000000;
5380                 write_ticks = wr_svctm + wr_wait;
5381
5382                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5383                 tot_ticks =  tot_ticks/1000000;
5384
5385                 memset(lbuf, 0, 256);
5386                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5387                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5388                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5389                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5390                 else
5391                         continue;
5392
5393                 l = snprintf(cache, cache_size, "%s", lbuf);
5394                 if (l < 0) {
5395                         perror("Error writing to fuse buf");
5396                         rv = 0;
5397                         goto err;
5398                 }
5399                 if (l >= cache_size) {
5400                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5401                         rv = 0;
5402                         goto err;
5403                 }
5404                 cache += l;
5405                 cache_size -= l;
5406                 total_len += l;
5407         }
5408
5409         d->cached = 1;
5410         d->size = total_len;
5411         if (total_len > size ) total_len = size;
5412         memcpy(buf, d->buf, total_len);
5413
5414         rv = total_len;
5415 err:
5416         free(cg);
5417         if (f)
5418                 fclose(f);
5419         free(line);
5420         free(io_serviced_str);
5421         free(io_merged_str);
5422         free(io_service_bytes_str);
5423         free(io_wait_time_str);
5424         free(io_service_time_str);
5425         return rv;
5426 }
5427
5428 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5429                 struct fuse_file_info *fi)
5430 {
5431         struct fuse_context *fc = fuse_get_context();
5432         struct file_info *d = (struct file_info *)fi->fh;
5433         char *cg = NULL;
5434         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5435         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5436         ssize_t total_len = 0, rv = 0;
5437         ssize_t l = 0;
5438         char *cache = d->buf;
5439
5440         if (offset) {
5441                 if (offset > d->size)
5442                         return -EINVAL;
5443                 if (!d->cached)
5444                         return 0;
5445                 int left = d->size - offset;
5446                 total_len = left > size ? size: left;
5447                 memcpy(buf, cache + offset, total_len);
5448                 return total_len;
5449         }
5450
5451         pid_t initpid = lookup_initpid_in_store(fc->pid);
5452         if (initpid <= 1 || is_shared_pidns(initpid))
5453                 initpid = fc->pid;
5454         cg = get_pid_cgroup(initpid, "memory");
5455         if (!cg)
5456                 return read_file("/proc/swaps", buf, size, d);
5457         prune_init_slice(cg);
5458
5459         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5460
5461         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5462                 goto err;
5463
5464         memusage = strtoul(memusage_str, NULL, 10);
5465
5466         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5467             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5468
5469                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5470                 memswusage = strtoul(memswusage_str, NULL, 10);
5471
5472                 swap_total = (memswlimit - memlimit) / 1024;
5473                 swap_free = (memswusage - memusage) / 1024;
5474         }
5475
5476         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5477
5478         /* When no mem + swap limit is specified or swapaccount=0*/
5479         if (!memswlimit) {
5480                 char *line = NULL;
5481                 size_t linelen = 0;
5482                 FILE *f = fopen("/proc/meminfo", "r");
5483
5484                 if (!f)
5485                         goto err;
5486
5487                 while (getline(&line, &linelen, f) != -1) {
5488                         if (startswith(line, "SwapTotal:")) {
5489                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
5490                         } else if (startswith(line, "SwapFree:")) {
5491                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
5492                         }
5493                 }
5494
5495                 free(line);
5496                 fclose(f);
5497         }
5498
5499         if (swap_total > 0) {
5500                 l = snprintf(d->buf + total_len, d->size - total_len,
5501                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5502                                 swap_total, swap_free);
5503                 total_len += l;
5504         }
5505
5506         if (total_len < 0 || l < 0) {
5507                 perror("Error writing to cache");
5508                 rv = 0;
5509                 goto err;
5510         }
5511
5512         d->cached = 1;
5513         d->size = (int)total_len;
5514
5515         if (total_len > size) total_len = size;
5516         memcpy(buf, d->buf, total_len);
5517         rv = total_len;
5518
5519 err:
5520         free(cg);
5521         free(memswlimit_str);
5522         free(memlimit_str);
5523         free(memusage_str);
5524         free(memswusage_str);
5525         return rv;
5526 }
5527 /*
5528  * Find the process pid from cgroup path.
5529  * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5530  * @pid_buf : put pid to pid_buf.
5531  * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5532  * @depth : the depth of cgroup in container.
5533  * @sum : return the number of pid.
5534  * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5535  */
5536 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5537 {
5538         DIR *dir;
5539         int fd;
5540         struct dirent *file;
5541         FILE *f = NULL;
5542         size_t linelen = 0;
5543         char *line = NULL;
5544         int pd;
5545         char *path_dir, *path;
5546         char **pid;
5547
5548         /* path = dpath + "/cgroup.procs" + /0 */
5549         do {
5550                 path = malloc(strlen(dpath) + 20);
5551         } while (!path);
5552
5553         strcpy(path, dpath);
5554         fd = openat(cfd, path, O_RDONLY);
5555         if (fd < 0)
5556                 goto out;
5557
5558         dir = fdopendir(fd);
5559         if (dir == NULL) {
5560                 close(fd);
5561                 goto out;
5562         }
5563
5564         while (((file = readdir(dir)) != NULL) && depth > 0) {
5565                 if (strncmp(file->d_name, ".", 1) == 0)
5566                         continue;
5567                 if (strncmp(file->d_name, "..", 1) == 0)
5568                         continue;
5569                 if (file->d_type == DT_DIR) {
5570                         /* path + '/' + d_name +/0 */
5571                         do {
5572                                 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5573                         } while (!path_dir);
5574                         strcpy(path_dir, path);
5575                         strcat(path_dir, "/");
5576                         strcat(path_dir, file->d_name);
5577                         pd = depth - 1;
5578                         sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5579                         free(path_dir);
5580                 }
5581         }
5582         closedir(dir);
5583
5584         strcat(path, "/cgroup.procs");
5585         fd = openat(cfd, path, O_RDONLY);
5586         if (fd < 0)
5587                 goto out;
5588
5589         f = fdopen(fd, "r");
5590         if (!f) {
5591                 close(fd);
5592                 goto out;
5593         }
5594
5595         while (getline(&line, &linelen, f) != -1) {
5596                 do {
5597                         pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5598                 } while (!pid);
5599                 *pid_buf = pid;
5600                 do {
5601                         *(*pid_buf + sum) = malloc(strlen(line) + 1);
5602                 } while (*(*pid_buf + sum) == NULL);
5603                 strcpy(*(*pid_buf + sum), line);
5604                 sum++;
5605         }
5606         fclose(f);
5607 out:
5608         if (line)
5609                 free(line);
5610         free(path);
5611         return sum;
5612 }
/*
 * One step of the fixed-point moving average used for the load average:
 *
 *   new = (load * exp + active * FIXED_1 * (FIXED_1 - exp)) / FIXED_1
 *
 * with the sum rounded up before the division whenever the scaled @active
 * is not below @load.
 *
 * @load: the previous load average, as a fixed-point value.
 * @exp: the fixed-point decay factor.
 * @active: the number of running tasks at this moment (plain integer).
 *
 * Returns the new load average as a fixed-point value.
 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	/* Bring the task count onto the same fixed-point scale as @load */
	const unsigned long scaled_active = active * FIXED_1;
	unsigned long weighted;

	weighted = load * exp + scaled_active * (FIXED_1 - exp);
	if (scaled_active >= load)
		weighted += FIXED_1 - 1;

	return weighted / FIXED_1;
}
5634
5635 /*
5636  * Return 0 means that container p->cg is closed.
5637  * Return -1 means that error occurred in refresh.
5638  * Positive num equals the total number of pid.
5639  */
5640 static int refresh_load(struct load_node *p, char *path)
5641 {
5642         FILE *f = NULL;
5643         char **idbuf;
5644         char proc_path[256];
5645         int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5646         char *line = NULL;
5647         size_t linelen = 0;
5648         int sum, length;
5649         DIR *dp;
5650         struct dirent *file;
5651
5652         do {
5653                 idbuf = malloc(sizeof(char *));
5654         } while (!idbuf);
5655         sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5656         /*  normal exit  */
5657         if (sum == 0)
5658                 goto out;
5659
5660         for (i = 0; i < sum; i++) {
5661                 /*clean up '\n' */
5662                 length = strlen(idbuf[i])-1;
5663                 idbuf[i][length] = '\0';
5664                 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5665                 if (ret < 0 || ret > 255) {
5666                         lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5667                         i = sum;
5668                         sum = -1;
5669                         goto err_out;
5670                 }
5671
5672                 dp = opendir(proc_path);
5673                 if (!dp) {
5674                         lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5675                         continue;
5676                 }
5677                 while ((file = readdir(dp)) != NULL) {
5678                         if (strncmp(file->d_name, ".", 1) == 0)
5679                                 continue;
5680                         if (strncmp(file->d_name, "..", 1) == 0)
5681                                 continue;
5682                         total_pid++;
5683                         /* We make the biggest pid become last_pid.*/
5684                         ret = atof(file->d_name);
5685                         last_pid = (ret > last_pid) ? ret : last_pid;
5686
5687                         ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5688                         if (ret < 0 || ret > 255) {
5689                                 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5690                                 i = sum;
5691                                 sum = -1;
5692                                 closedir(dp);
5693                                 goto err_out;
5694                         }
5695                         f = fopen(proc_path, "r");
5696                         if (f != NULL) {
5697                                 while (getline(&line, &linelen, f) != -1) {
5698                                         /* Find State */
5699                                         if ((line[0] == 'S') && (line[1] == 't'))
5700                                                 break;
5701                                 }
5702                         if ((line[7] == 'R') || (line[7] == 'D'))
5703                                 run_pid++;
5704                         fclose(f);
5705                         }
5706                 }
5707                 closedir(dp);
5708         }
5709         /*Calculate the loadavg.*/
5710         p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5711         p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5712         p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5713         p->run_pid = run_pid;
5714         p->total_pid = total_pid;
5715         p->last_pid = last_pid;
5716
5717         free(line);
5718 err_out:
5719         for (; i > 0; i--)
5720                 free(idbuf[i-1]);
5721 out:
5722         free(idbuf);
5723         return sum;
5724 }
5725 /*
5726  * Traverse the hash table and update it.
5727  */
5728 void *load_begin(void *arg)
5729 {
5730
5731         char *path = NULL;
5732         int i, sum, length, ret;
5733         struct load_node *f;
5734         int first_node;
5735         clock_t time1, time2;
5736
5737         while (1) {
5738                 if (loadavg_stop == 1)
5739                         return NULL;
5740
5741                 time1 = clock();
5742                 for (i = 0; i < LOAD_SIZE; i++) {
5743                         pthread_mutex_lock(&load_hash[i].lock);
5744                         if (load_hash[i].next == NULL) {
5745                                 pthread_mutex_unlock(&load_hash[i].lock);
5746                                 continue;
5747                         }
5748                         f = load_hash[i].next;
5749                         first_node = 1;
5750                         while (f) {
5751                                 length = strlen(f->cg) + 2;
5752                                 do {
5753                                         /* strlen(f->cg) + '.' or '' + \0 */
5754                                         path = malloc(length);
5755                                 } while (!path);
5756
5757                                 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
5758                                 if (ret < 0 || ret > length - 1) {
5759                                         /* snprintf failed, ignore the node.*/
5760                                         lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5761                                         goto out;
5762                                 }
5763                                 sum = refresh_load(f, path);
5764                                 if (sum == 0) {
5765                                         f = del_node(f, i);
5766                                 } else {
5767 out:                                    f = f->next;
5768                                 }
5769                                 free(path);
5770                                 /* load_hash[i].lock locks only on the first node.*/
5771                                 if (first_node == 1) {
5772                                         first_node = 0;
5773                                         pthread_mutex_unlock(&load_hash[i].lock);
5774                                 }
5775                         }
5776                 }
5777
5778                 if (loadavg_stop == 1)
5779                         return NULL;
5780
5781                 time2 = clock();
5782                 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5783         }
5784 }
5785
5786 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5787                 struct fuse_file_info *fi)
5788 {
5789         struct fuse_context *fc = fuse_get_context();
5790         struct file_info *d = (struct file_info *)fi->fh;
5791         pid_t initpid;
5792         char *cg;
5793         size_t total_len = 0;
5794         char *cache = d->buf;
5795         struct load_node *n;
5796         int hash;
5797         int cfd, rv = 0;
5798         unsigned long a, b, c;
5799
5800         if (offset) {
5801                 if (offset > d->size)
5802                         return -EINVAL;
5803                 if (!d->cached)
5804                         return 0;
5805                 int left = d->size - offset;
5806                 total_len = left > size ? size : left;
5807                 memcpy(buf, cache + offset, total_len);
5808                 return total_len;
5809         }
5810         if (!loadavg)
5811                 return read_file("/proc/loadavg", buf, size, d);
5812
5813         initpid = lookup_initpid_in_store(fc->pid);
5814         if (initpid <= 1 || is_shared_pidns(initpid))
5815                 initpid = fc->pid;
5816         cg = get_pid_cgroup(initpid, "cpu");
5817         if (!cg)
5818                 return read_file("/proc/loadavg", buf, size, d);
5819
5820         prune_init_slice(cg);
5821         hash = calc_hash(cg) % LOAD_SIZE;
5822         n = locate_node(cg, hash);
5823
5824         /* First time */
5825         if (n == NULL) {
5826                 if (!find_mounted_controller("cpu", &cfd)) {
5827                         /*
5828                          * In locate_node() above, pthread_rwlock_unlock() isn't used
5829                          * because delete is not allowed before read has ended.
5830                          */
5831                         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5832                         rv = 0;
5833                         goto err;
5834                 }
5835                 do {
5836                         n = malloc(sizeof(struct load_node));
5837                 } while (!n);
5838
5839                 do {
5840                         n->cg = malloc(strlen(cg)+1);
5841                 } while (!n->cg);
5842                 strcpy(n->cg, cg);
5843                 n->avenrun[0] = 0;
5844                 n->avenrun[1] = 0;
5845                 n->avenrun[2] = 0;
5846                 n->run_pid = 0;
5847                 n->total_pid = 1;
5848                 n->last_pid = initpid;
5849                 n->cfd = cfd;
5850                 insert_node(&n, hash);
5851         }
5852         a = n->avenrun[0] + (FIXED_1/200);
5853         b = n->avenrun[1] + (FIXED_1/200);
5854         c = n->avenrun[2] + (FIXED_1/200);
5855         total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5856                 LOAD_INT(a), LOAD_FRAC(a),
5857                 LOAD_INT(b), LOAD_FRAC(b),
5858                 LOAD_INT(c), LOAD_FRAC(c),
5859                 n->run_pid, n->total_pid, n->last_pid);
5860         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5861         if (total_len < 0 || total_len >=  d->buflen) {
5862                 lxcfs_error("%s\n", "Failed to write to cache");
5863                 rv = 0;
5864                 goto err;
5865         }
5866         d->size = (int)total_len;
5867         d->cached = 1;
5868
5869         if (total_len > size)
5870                 total_len = size;
5871         memcpy(buf, d->buf, total_len);
5872         rv = total_len;
5873
5874 err:
5875         free(cg);
5876         return rv;
5877 }
5878 /* Return a positive number on success, return 0 on failure.*/
5879 pthread_t load_daemon(int load_use)
5880 {
5881         int ret;
5882         pthread_t pid;
5883
5884         ret = init_load();
5885         if (ret == -1) {
5886                 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5887                 return 0;
5888         }
5889         ret = pthread_create(&pid, NULL, load_begin, NULL);
5890         if (ret != 0) {
5891                 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5892                 load_free();
5893                 return 0;
5894         }
5895         /* use loadavg, here loadavg = 1*/
5896         loadavg = load_use;
5897         return pid;
5898 }
5899
5900 /* Returns 0 on success. */
5901 int stop_load_daemon(pthread_t pid)
5902 {
5903         int s;
5904
5905         /* Signal the thread to gracefully stop */
5906         loadavg_stop = 1;
5907
5908         s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5909         if (s != 0) {
5910                 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5911                 return -1;
5912         }
5913
5914         load_free();
5915         loadavg_stop = 0;
5916
5917         return 0;
5918 }
5919
/*
 * Return the number of bytes obtained by reading @which line by line until
 * getline() fails, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *stream;
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;

	stream = fopen(which, "r");
	if (!stream)
		return 0;

	for (;;) {
		nread = getline(&linebuf, &cap, stream);
		if (nread == -1)
			break;
		total += nread;
	}

	free(linebuf);
	fclose(stream);

	return total;
}
5936
5937 int proc_getattr(const char *path, struct stat *sb)
5938 {
5939         struct timespec now;
5940
5941         memset(sb, 0, sizeof(struct stat));
5942         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5943                 return -EINVAL;
5944         sb->st_uid = sb->st_gid = 0;
5945         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5946         if (strcmp(path, "/proc") == 0) {
5947                 sb->st_mode = S_IFDIR | 00555;
5948                 sb->st_nlink = 2;
5949                 return 0;
5950         }
5951         if (strcmp(path, "/proc/meminfo") == 0 ||
5952                         strcmp(path, "/proc/cpuinfo") == 0 ||
5953                         strcmp(path, "/proc/uptime") == 0 ||
5954                         strcmp(path, "/proc/stat") == 0 ||
5955                         strcmp(path, "/proc/diskstats") == 0 ||
5956                         strcmp(path, "/proc/swaps") == 0 ||
5957                         strcmp(path, "/proc/loadavg") == 0) {
5958                 sb->st_size = 0;
5959                 sb->st_mode = S_IFREG | 00444;
5960                 sb->st_nlink = 1;
5961                 return 0;
5962         }
5963
5964         return -ENOENT;
5965 }
5966
5967 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5968                 struct fuse_file_info *fi)
5969 {
5970         if (filler(buf, ".", NULL, 0) != 0 ||
5971             filler(buf, "..", NULL, 0) != 0 ||
5972             filler(buf, "cpuinfo", NULL, 0) != 0 ||
5973             filler(buf, "meminfo", NULL, 0) != 0 ||
5974             filler(buf, "stat", NULL, 0) != 0 ||
5975             filler(buf, "uptime", NULL, 0) != 0 ||
5976             filler(buf, "diskstats", NULL, 0) != 0 ||
5977             filler(buf, "swaps", NULL, 0) != 0   ||
5978             filler(buf, "loadavg", NULL, 0) != 0)
5979                 return -EINVAL;
5980         return 0;
5981 }
5982
5983 int proc_open(const char *path, struct fuse_file_info *fi)
5984 {
5985         int type = -1;
5986         struct file_info *info;
5987
5988         if (strcmp(path, "/proc/meminfo") == 0)
5989                 type = LXC_TYPE_PROC_MEMINFO;
5990         else if (strcmp(path, "/proc/cpuinfo") == 0)
5991                 type = LXC_TYPE_PROC_CPUINFO;
5992         else if (strcmp(path, "/proc/uptime") == 0)
5993                 type = LXC_TYPE_PROC_UPTIME;
5994         else if (strcmp(path, "/proc/stat") == 0)
5995                 type = LXC_TYPE_PROC_STAT;
5996         else if (strcmp(path, "/proc/diskstats") == 0)
5997                 type = LXC_TYPE_PROC_DISKSTATS;
5998         else if (strcmp(path, "/proc/swaps") == 0)
5999                 type = LXC_TYPE_PROC_SWAPS;
6000         else if (strcmp(path, "/proc/loadavg") == 0)
6001                 type = LXC_TYPE_PROC_LOADAVG;
6002         if (type == -1)
6003                 return -ENOENT;
6004
6005         info = malloc(sizeof(*info));
6006         if (!info)
6007                 return -ENOMEM;
6008
6009         memset(info, 0, sizeof(*info));
6010         info->type = type;
6011
6012         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
6013         do {
6014                 info->buf = malloc(info->buflen);
6015         } while (!info->buf);
6016         memset(info->buf, 0, info->buflen);
6017         /* set actual size to buffer size */
6018         info->size = info->buflen;
6019
6020         fi->fh = (unsigned long)info;
6021         return 0;
6022 }
6023
/*
 * Access check for the emulated /proc tree.
 *
 * "/proc" itself is granted for any @mask when access(path, R_OK) succeeds.
 * Otherwise the request is granted only if @mask contains no bit other than
 * R_OK.
 *
 * Returns 0 when access is granted, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	const int is_proc_root = (strcmp(path, "/proc") == 0);

	if (is_proc_root && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
6034
/*
 * Release handler for the emulated /proc files: hands @fi to
 * do_release_file_info() and always returns 0.  @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
6040
6041 int proc_read(const char *path, char *buf, size_t size, off_t offset,
6042                 struct fuse_file_info *fi)
6043 {
6044         struct file_info *f = (struct file_info *) fi->fh;
6045
6046         switch (f->type) {
6047         case LXC_TYPE_PROC_MEMINFO:
6048                 return proc_meminfo_read(buf, size, offset, fi);
6049         case LXC_TYPE_PROC_CPUINFO:
6050                 return proc_cpuinfo_read(buf, size, offset, fi);
6051         case LXC_TYPE_PROC_UPTIME:
6052                 return proc_uptime_read(buf, size, offset, fi);
6053         case LXC_TYPE_PROC_STAT:
6054                 return proc_stat_read(buf, size, offset, fi);
6055         case LXC_TYPE_PROC_DISKSTATS:
6056                 return proc_diskstats_read(buf, size, offset, fi);
6057         case LXC_TYPE_PROC_SWAPS:
6058                 return proc_swaps_read(buf, size, offset, fi);
6059         case LXC_TYPE_PROC_LOADAVG:
6060                 return proc_loadavg_read(buf, size, offset, fi);
6061         default:
6062                 return -EINVAL;
6063         }
6064 }
6065
6066 /*
6067  * Functions needed to setup cgroups in the __constructor__.
6068  */
6069
/*
 * Create @dir and every missing parent directory, each with @mode.
 *
 * The path is walked one component at a time and mkdir() is called on each
 * successively longer prefix; a prefix that already exists (EEXIST) is not
 * an error.  For a relative path the first prefix is the empty string, which
 * is skipped rather than passed to mkdir().
 *
 * Returns true on success (including for an empty @dir, where nothing is
 * created), false if memory runs out or a mkdir() fails for any reason other
 * than EEXIST.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* dir: start of the next component, tmp: just past its end */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		/* mkdir("") always fails with ENOENT, so skip the empty prefix */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
6093
6094 static bool umount_if_mounted(void)
6095 {
6096         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
6097                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
6098                 return false;
6099         }
6100         return true;
6101 }
6102
/*
 * Integer type of the f_type member of struct statfs, derived with
 * __typeof__ so it always matches the platform's definition.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the filesystem described by @fs has magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
        const fs_type_magic actual = fs->f_type;

        return actual == magic_val;
}
6109
/*
 * Detect whether "/" is the kernel's initial rootfs.
 *
 * Looking at fs/proc_namespace.c, the rootfs entry in mountinfo can be
 * expected to contain exactly " - rootfs rootfs ". As long as we have
 * chrooted so that rootfs is not our root, that entry should always be
 * absent from the mountinfo contents.
 *
 * Each line of /proc/self/mountinfo is scanned: the fifth blank-separated
 * field (the mount point) is compared with "/", and for a match the text
 * starting at the first '-' after it must begin with "- rootfs rootfs ".
 *
 * Returns true if such a line exists, false otherwise (including when the
 * file cannot be opened).
 */
static bool is_on_ramfs(void)
{
        bool found = false;
        char *buf = NULL;
        size_t cap = 0;
        FILE *mounts;

        mounts = fopen("/proc/self/mountinfo", "r");
        if (!mounts)
                return false;

        while (!found && getline(&buf, &cap, mounts) != -1) {
                char *mnt = buf;
                char *mnt_end, *sep;
                int skipped = 0;

                /* Move to the blank in front of the mount point field. */
                while (mnt && skipped < 4) {
                        mnt = strchr(mnt + 1, ' ');
                        skipped++;
                }
                if (!mnt)
                        continue;

                /* Terminate the mount point so it can be compared. */
                mnt_end = strchr(mnt + 1, ' ');
                if (!mnt_end)
                        continue;
                *mnt_end = '\0';

                if (strcmp(mnt + 1, "/") != 0)
                        continue;

                /* This is '/'. Is it the ramfs? */
                sep = strchr(mnt_end + 1, '-');
                if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
                        found = true;
        }

        free(buf);
        fclose(mounts);
        return found;
}
6152
6153 static int pivot_enter()
6154 {
6155         int ret = -1, oldroot = -1, newroot = -1;
6156
6157         oldroot = open("/", O_DIRECTORY | O_RDONLY);
6158         if (oldroot < 0) {
6159                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
6160                 return ret;
6161         }
6162
6163         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
6164         if (newroot < 0) {
6165                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
6166                 goto err;
6167         }
6168
6169         /* change into new root fs */
6170         if (fchdir(newroot) < 0) {
6171                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
6172                 goto err;
6173         }
6174
6175         /* pivot_root into our new root fs */
6176         if (pivot_root(".", ".") < 0) {
6177                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
6178                 goto err;
6179         }
6180
6181         /*
6182          * At this point the old-root is mounted on top of our new-root.
6183          * To unmounted it we must not be chdir'd into it, so escape back
6184          * to the old-root.
6185          */
6186         if (fchdir(oldroot) < 0) {
6187                 lxcfs_error("%s\n", "Failed to enter old root.");
6188                 goto err;
6189         }
6190
6191         if (umount2(".", MNT_DETACH) < 0) {
6192                 lxcfs_error("%s\n", "Failed to detach old root.");
6193                 goto err;
6194         }
6195
6196         if (fchdir(newroot) < 0) {
6197                 lxcfs_error("%s\n", "Failed to re-enter new root.");
6198                 goto err;
6199         }
6200
6201         ret = 0;
6202
6203 err:
6204         if (oldroot > 0)
6205                 close(oldroot);
6206         if (newroot > 0)
6207                 close(newroot);
6208
6209         return ret;
6210 }
6211
6212 static int chroot_enter()
6213 {
6214         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
6215                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
6216                 return -1;
6217         }
6218
6219         if (chroot(".") < 0) {
6220                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
6221                 return -1;
6222         }
6223
6224         if (chdir("/") < 0) {
6225                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
6226                 return -1;
6227         }
6228
6229         return 0;
6230 }
6231
/*
 * Enter the prepared root, choosing the mechanism by the type of "/".
 *
 * chroot_enter() is used when "/" is on a ramfs, pivot_enter() otherwise.
 * has_fs_type() alone is not reliable: when the ramfs is a tmpfs it will
 * likely report TMPFS_MAGIC, so a negative answer is double-checked with
 * is_on_ramfs(), which inspects /proc/self/mountinfo.
 *
 * Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
        struct statfs root_stat;
        bool on_ramfs;

        if (statfs("/", &root_stat) < 0) {
                lxcfs_error("%s\n", "Could not stat / mountpoint.");
                return -1;
        }

        on_ramfs = has_fs_type(&root_stat, RAMFS_MAGIC) || is_on_ramfs();
        if (on_ramfs)
                return chroot_enter();

        if (pivot_enter() >= 0)
                return 0;

        lxcfs_error("%s\n", "Could not perform pivot root.");
        return -1;
}
6254
6255 /* Prepare our new clean root. */
6256 static int permute_prepare(void)
6257 {
6258         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6259                 lxcfs_error("%s\n", "Failed to create directory for new root.");
6260                 return -1;
6261         }
6262
6263         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6264                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6265                 return -1;
6266         }
6267
6268         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6269                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6270                 return -1;
6271         }
6272
6273         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6274                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6275                 return -1;
6276         }
6277
6278         return 0;
6279 }
6280
/*
 * Replace the root directory: first prepare the new root, then enter it
 * (chroot() on ramfs, pivot_root() in all other cases).
 *
 * Returns true only if both steps succeeded; the second step is not
 * attempted when the first one fails.
 */
static bool permute_root(void)
{
        return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
6294
6295 static int preserve_mnt_ns(int pid)
6296 {
6297         int ret;
6298         size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6299         char path[len];
6300
6301         ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6302         if (ret < 0 || (size_t)ret >= len)
6303                 return -1;
6304
6305         return open(path, O_RDONLY | O_CLOEXEC);
6306 }
6307
6308 static bool cgfs_prepare_mounts(void)
6309 {
6310         if (!mkdir_p(BASEDIR, 0700)) {
6311                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6312                 return false;
6313         }
6314
6315         if (!umount_if_mounted()) {
6316                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6317                 return false;
6318         }
6319
6320         if (unshare(CLONE_NEWNS) < 0) {
6321                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6322                 return false;
6323         }
6324
6325         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
6326         if (cgroup_mount_ns_fd < 0) {
6327                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6328                 return false;
6329         }
6330
6331         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6332                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6333                 return false;
6334         }
6335
6336         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6337                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6338                 return false;
6339         }
6340
6341         return true;
6342 }
6343
6344 static bool cgfs_mount_hierarchies(void)
6345 {
6346         char *target;
6347         size_t clen, len;
6348         int i, ret;
6349
6350         for (i = 0; i < num_hierarchies; i++) {
6351                 char *controller = hierarchies[i];
6352
6353                 clen = strlen(controller);
6354                 len = strlen(BASEDIR) + clen + 2;
6355                 target = malloc(len);
6356                 if (!target)
6357                         return false;
6358
6359                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6360                 if (ret < 0 || ret >= len) {
6361                         free(target);
6362                         return false;
6363                 }
6364                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6365                         free(target);
6366                         return false;
6367                 }
6368                 if (!strcmp(controller, "unified"))
6369                         ret = mount("none", target, "cgroup2", 0, NULL);
6370                 else
6371                         ret = mount(controller, target, "cgroup", 0, controller);
6372                 if (ret < 0) {
6373                         lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6374                         free(target);
6375                         return false;
6376                 }
6377
6378                 fd_hierarchies[i] = open(target, O_DIRECTORY);
6379                 if (fd_hierarchies[i] < 0) {
6380                         free(target);
6381                         return false;
6382                 }
6383                 free(target);
6384         }
6385         return true;
6386 }
6387
/*
 * Set up the private cgroup mounts: prepare the mount environment, mount
 * the hierarchies and finally switch the root directory.
 *
 * Returns true only if every step succeeded; later steps are skipped after
 * a failure.
 */
static bool cgfs_setup_controllers(void)
{
        if (!cgfs_prepare_mounts())
                return false;

        if (!cgfs_mount_hierarchies()) {
                lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
                return false;
        }

        return permute_root();
}
6403
6404 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
6405 {
6406         FILE *f;
6407         char *cret, *line = NULL;
6408         char cwd[MAXPATHLEN];
6409         size_t len = 0;
6410         int i, init_ns = -1;
6411         bool found_unified = false;
6412
6413         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
6414                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
6415                 return;
6416         }
6417
6418         while (getline(&line, &len, f) != -1) {
6419                 char *idx, *p, *p2;
6420
6421                 p = strchr(line, ':');
6422                 if (!p)
6423                         goto out;
6424                 idx = line;
6425                 *(p++) = '\0';
6426
6427                 p2 = strrchr(p, ':');
6428                 if (!p2)
6429                         goto out;
6430                 *p2 = '\0';
6431
6432                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
6433                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
6434                  * because it parses out the empty string "" and later on passes
6435                  * it to mount(). Let's skip such entries.
6436                  */
6437                 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
6438                         found_unified = true;
6439                         p = "unified";
6440                 }
6441
6442                 if (!store_hierarchy(line, p))
6443                         goto out;
6444         }
6445
6446         /* Preserve initial namespace. */
6447         init_ns = preserve_mnt_ns(getpid());
6448         if (init_ns < 0) {
6449                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
6450                 goto out;
6451         }
6452
6453         fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
6454         if (!fd_hierarchies) {
6455                 lxcfs_error("%s\n", strerror(errno));
6456                 goto out;
6457         }
6458
6459         for (i = 0; i < num_hierarchies; i++)
6460                 fd_hierarchies[i] = -1;
6461
6462         cret = getcwd(cwd, MAXPATHLEN);
6463         if (!cret)
6464                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
6465
6466         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6467          * to privately mount lxcfs cgroups. */
6468         if (!cgfs_setup_controllers()) {
6469                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
6470                 goto out;
6471         }
6472
6473         if (setns(init_ns, 0) < 0) {
6474                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
6475                 goto out;
6476         }
6477
6478         if (!cret || chdir(cwd) < 0)
6479                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
6480
6481         if (!init_cpuview()) {
6482                 lxcfs_error("%s\n", "failed to init CPU view");
6483                 goto out;
6484         }
6485
6486         print_subsystems();
6487
6488 out:
6489         free(line);
6490         fclose(f);
6491         if (init_ns >= 0)
6492                 close(init_ns);
6493 }
6494
6495 static void __attribute__((destructor)) free_subsystems(void)
6496 {
6497         int i;
6498
6499         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6500
6501         for (i = 0; i < num_hierarchies; i++) {
6502                 if (hierarchies[i])
6503                         free(hierarchies[i]);
6504                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6505                         close(fd_hierarchies[i]);
6506         }
6507         free(hierarchies);
6508         free(fd_hierarchies);
6509         free_cpuview();
6510
6511         if (cgroup_mount_ns_fd >= 0)
6512                 close(cgroup_mount_ns_fd);
6513 }