1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
42 /* The maximum 64 bit integer, 2^64 - 1, has 20 digits; with the terminating NUL that is a string of 21 bytes. */
43 #define LXCFS_NUMSTRLEN64 21
44
45 /* Define pivot_root() if missing from the C library */
46 #ifndef HAVE_PIVOT_ROOT
47 static int pivot_root(const char * new_root, const char * put_old)
48 {
49 #ifdef __NR_pivot_root
50 return syscall(__NR_pivot_root, new_root, put_old);
51 #else
52 errno = ENOSYS;
53 return -1;
54 #endif
55 }
56 #else
57 extern int pivot_root(const char * new_root, const char * put_old);
58 #endif
59
60 enum {
61 LXC_TYPE_CGDIR,
62 LXC_TYPE_CGFILE,
63 LXC_TYPE_PROC_MEMINFO,
64 LXC_TYPE_PROC_CPUINFO,
65 LXC_TYPE_PROC_UPTIME,
66 LXC_TYPE_PROC_STAT,
67 LXC_TYPE_PROC_DISKSTATS,
68 LXC_TYPE_PROC_SWAPS,
69 LXC_TYPE_PROC_LOADAVG,
70 };
71
72 struct file_info {
73 char *controller;
74 char *cgroup;
75 char *file;
76 int type;
77 char *buf; // unused as of yet
78 int buflen;
79 int size; //actual data size
80 int cached;
81 };
82
83 struct cpuacct_usage {
84 uint64_t user;
85 uint64_t system;
86 };
87
88 /* Constants for the loadavg hash table. */
89 #define LOAD_SIZE 100 /* the number of hash buckets */
90 #define FLUSH_TIME 5 /* the refresh interval in seconds */
91 #define DEPTH_DIR 3 /* the directory depth searched per cgroup */
92 /* Constants for the loadavg fixed-point calculation. */
93 #define FSHIFT 11 /* nr of bits of precision */
94 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
95 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
96 #define EXP_5 2014 /* 1/exp(5sec/5min) */
97 #define EXP_15 2037 /* 1/exp(5sec/15min) */
98 #define LOAD_INT(x) ((x) >> FSHIFT)
99 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
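/*
 * Worked example of the fixed-point encoding above (illustrative): with
 * FSHIFT = 11, the value 2560 decodes as LOAD_INT(2560) == 1 and
 * LOAD_FRAC(2560) == 25, i.e. it is printed as a load average of "1.25".
 */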
100 /*
101 * This parameter is used by proc_loadavg_read().
102 * 1 means loadavg virtualization is enabled, 0 means it is disabled.
103 */
104 static int loadavg = 0;
105 static volatile sig_atomic_t loadavg_stop = 0;
106 static int calc_hash(char *name)
107 {
108 unsigned int hash = 0;
109 unsigned int x = 0;
110 /* ELFHash algorithm. */
111 while (*name) {
112 hash = (hash << 4) + *name++;
113 x = hash & 0xf0000000;
114 if (x != 0)
115 hash ^= (x >> 24);
116 hash &= ~x;
117 }
118 return ((hash & 0x7fffffff) % LOAD_SIZE);
119 }
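/*
 * Illustrative use of calc_hash(): a cgroup path such as "/lxc/c1" is hashed
 * to a bucket index in [0, LOAD_SIZE), which selects the slot in load_hash[]
 * that holds that cgroup's load_node.
 */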
120
121 struct load_node {
122 char *cg; /* the cgroup path this node tracks */
123 unsigned long avenrun[3]; /* Load averages */
124 unsigned int run_pid;
125 unsigned int total_pid;
126 unsigned int last_pid;
127 int cfd; /* The file descriptor of the mounted cgroup */
128 struct load_node *next;
129 struct load_node **pre;
130 };
131
132 struct load_head {
133 /*
134 * The lock serializes inserting and refreshing load_nodes. For the first
135 * load_node of each hash bucket, insert and refresh on that bucket are
136 * mutually exclusive.
137 */
138 pthread_mutex_t lock;
139 /*
140 * The rdlock serializes reading loadavg values and deleting load_nodes.
141 * For each hash bucket, read and delete are mutually exclusive, but
142 * concurrent reads are allowed. This rdlock applies to the whole list.
143 */
144 pthread_rwlock_t rdlock;
145 /*
146 * The rilock serializes reading loadavg values and inserting load_nodes.
147 * For the first load_node of each hash bucket, read and insert are
148 * mutually exclusive, but concurrent reads are allowed.
149 */
150 pthread_rwlock_t rilock;
151 struct load_node *next;
152 };
153
154 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
155 /*
156 * init_load() initializes the hash table.
157 * Returns 0 on success, -1 on failure.
158 */
159 static int init_load(void)
160 {
161 int i;
162 int ret;
163
164 for (i = 0; i < LOAD_SIZE; i++) {
165 load_hash[i].next = NULL;
166 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize lock");
169 goto out3;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rdlock");
174 goto out2;
175 }
176 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
177 if (ret != 0) {
178 lxcfs_error("%s\n", "Failed to initialize rilock");
179 goto out1;
180 }
181 }
182 return 0;
183 out1:
184 pthread_rwlock_destroy(&load_hash[i].rdlock);
185 out2:
186 pthread_mutex_destroy(&load_hash[i].lock);
187 out3:
188 while (i > 0) {
189 i--;
190 pthread_mutex_destroy(&load_hash[i].lock);
191 pthread_rwlock_destroy(&load_hash[i].rdlock);
192 pthread_rwlock_destroy(&load_hash[i].rilock);
193 }
194 return -1;
195 }
196
197 static void insert_node(struct load_node **n, int locate)
198 {
199 struct load_node *f;
200
201 pthread_mutex_lock(&load_hash[locate].lock);
202 pthread_rwlock_wrlock(&load_hash[locate].rilock);
203 f = load_hash[locate].next;
204 load_hash[locate].next = *n;
205
206 (*n)->pre = &(load_hash[locate].next);
207 if (f)
208 f->pre = &((*n)->next);
209 (*n)->next = f;
210 pthread_mutex_unlock(&load_hash[locate].lock);
211 pthread_rwlock_unlock(&load_hash[locate].rilock);
212 }
213 /*
214 * locate_node() finds the node for the given cgroup; a non-NULL return
215 * means success. Note that rdlock is deliberately left held on return,
216 * because the function is used to read that node and deletion must not
217 * happen before the read has finished.
218 * rdlock is unlocked only in proc_loadavg_read().
219 */
220 static struct load_node *locate_node(char *cg, int locate)
221 {
222 struct load_node *f = NULL;
223 int i = 0;
224
225 pthread_rwlock_rdlock(&load_hash[locate].rilock);
226 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
227 if (load_hash[locate].next == NULL) {
228 pthread_rwlock_unlock(&load_hash[locate].rilock);
229 return f;
230 }
231 f = load_hash[locate].next;
232 pthread_rwlock_unlock(&load_hash[locate].rilock);
233 while (f && ((i = strcmp(f->cg, cg)) != 0))
234 f = f->next;
235 return f;
236 }
237 /* Delete load_node n and return the node that followed it. */
238 static struct load_node *del_node(struct load_node *n, int locate)
239 {
240 struct load_node *g;
241
242 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
243 if (n->next == NULL) {
244 *(n->pre) = NULL;
245 } else {
246 *(n->pre) = n->next;
247 n->next->pre = n->pre;
248 }
249 g = n->next;
250 free(n->cg);
251 free(n);
252 pthread_rwlock_unlock(&load_hash[locate].rdlock);
253 return g;
254 }
255
256 static void load_free(void)
257 {
258 int i;
259 struct load_node *f, *p;
260
261 for (i = 0; i < LOAD_SIZE; i++) {
262 pthread_mutex_lock(&load_hash[i].lock);
263 pthread_rwlock_wrlock(&load_hash[i].rilock);
264 pthread_rwlock_wrlock(&load_hash[i].rdlock);
265 if (load_hash[i].next == NULL) {
266 pthread_mutex_unlock(&load_hash[i].lock);
267 pthread_mutex_destroy(&load_hash[i].lock);
268 pthread_rwlock_unlock(&load_hash[i].rilock);
269 pthread_rwlock_destroy(&load_hash[i].rilock);
270 pthread_rwlock_unlock(&load_hash[i].rdlock);
271 pthread_rwlock_destroy(&load_hash[i].rdlock);
272 continue;
273 }
274 for (f = load_hash[i].next; f; ) {
275 free(f->cg);
276 p = f->next;
277 free(f);
278 f = p;
279 }
280 pthread_mutex_unlock(&load_hash[i].lock);
281 pthread_mutex_destroy(&load_hash[i].lock);
282 pthread_rwlock_unlock(&load_hash[i].rilock);
283 pthread_rwlock_destroy(&load_hash[i].rilock);
284 pthread_rwlock_unlock(&load_hash[i].rdlock);
285 pthread_rwlock_destroy(&load_hash[i].rdlock);
286 }
287 }
288 /* Reserve buffer size to account for file size changes. */
289 #define BUF_RESERVE_SIZE 512
290
291 /*
292 * A table caching which pid is init for a pid namespace.
293 * When looking up which pid is init for $qpid, we first
294 * 1. Stat /proc/$qpid/ns/pid.
295 * 2. Check whether the ino_t is in our store.
296 * a. if not, fork a child in qpid's ns to send us
297 * ucred.pid = 1, and read the initpid. Cache
298 * initpid and creation time for /proc/initpid
299 * in a new store entry.
300 * b. if so, verify that /proc/initpid still matches
301 * what we have saved. If not, clear the store
302 * entry and go back to a. If so, return the
303 * cached initpid.
304 */
305 struct pidns_init_store {
306 ino_t ino; // inode number for /proc/$pid/ns/pid
307 pid_t initpid; // the pid of init in that ns
308 long int ctime; // the time at which /proc/$initpid was created
309 struct pidns_init_store *next;
310 long int lastcheck;
311 };
312
313 /* lol - look at how they are allocated in the kernel */
314 #define PIDNS_HASH_SIZE 4096
315 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
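/*
 * Illustrative example (assuming a current Linux kernel, where the initial
 * pid namespace inode is 4026531836): HASH(4026531836) == 4092, so that
 * namespace's entry lives in bucket 4092 of pidns_hash_table[].
 */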
316
317 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
318 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
319 static void lock_mutex(pthread_mutex_t *l)
320 {
321 int ret;
322
323 if ((ret = pthread_mutex_lock(l)) != 0) {
324 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
325 exit(1);
326 }
327 }
328
329 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
330 * Number of hierarchies mounted. */
331 static int num_hierarchies;
332
333 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
334 * Hierarchies mounted {cpuset, blkio, ...}:
335 * Initialized via __constructor__ collect_and_mount_subsystems(). */
336 static char **hierarchies;
337
338 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
339 * Open file descriptors:
340 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
341 * private mount namespace.
342 * Initialized via __constructor__ collect_and_mount_subsystems().
343 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
344 * mounts and respective files in the private namespace even when located in
345 * another namespace using the *at() family of functions
346 * {openat(), fchownat(), ...}. */
347 static int *fd_hierarchies;
348 static int cgroup_mount_ns_fd = -1;
349
350 static void unlock_mutex(pthread_mutex_t *l)
351 {
352 int ret;
353
354 if ((ret = pthread_mutex_unlock(l)) != 0) {
355 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
356 exit(1);
357 }
358 }
359
360 static void store_lock(void)
361 {
362 lock_mutex(&pidns_store_mutex);
363 }
364
365 static void store_unlock(void)
366 {
367 unlock_mutex(&pidns_store_mutex);
368 }
369
370 /* Must be called under store_lock */
371 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
372 {
373 struct stat initsb;
374 char fnam[100];
375
376 snprintf(fnam, 100, "/proc/%d", e->initpid);
377 if (stat(fnam, &initsb) < 0)
378 return false;
379
380 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
381 initsb.st_ctime, e->initpid);
382
383 if (e->ctime != initsb.st_ctime)
384 return false;
385 return true;
386 }
387
388 /* Must be called under store_lock */
389 static void remove_initpid(struct pidns_init_store *e)
390 {
391 struct pidns_init_store *tmp;
392 int h;
393
394 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
395
396 h = HASH(e->ino);
397 if (pidns_hash_table[h] == e) {
398 pidns_hash_table[h] = e->next;
399 free(e);
400 return;
401 }
402
403 tmp = pidns_hash_table[h];
404 while (tmp) {
405 if (tmp->next == e) {
406 tmp->next = e->next;
407 free(e);
408 return;
409 }
410 tmp = tmp->next;
411 }
412 }
413
414 #define PURGE_SECS 5
415 /* Must be called under store_lock */
416 static void prune_initpid_store(void)
417 {
418 static long int last_prune = 0;
419 struct pidns_init_store *e, *prev, *delme;
420 long int now, threshold;
421 int i;
422
423 if (!last_prune) {
424 last_prune = time(NULL);
425 return;
426 }
427 now = time(NULL);
428 if (now < last_prune + PURGE_SECS)
429 return;
430
431 lxcfs_debug("%s\n", "Pruning.");
432
433 last_prune = now;
434 threshold = now - 2 * PURGE_SECS;
435
436 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
437 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
438 if (e->lastcheck < threshold) {
439
440 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
441
442 delme = e;
443 if (prev)
444 prev->next = e->next;
445 else
446 pidns_hash_table[i] = e->next;
447 e = e->next;
448 free(delme);
449 } else {
450 prev = e;
451 e = e->next;
452 }
453 }
454 }
455 }
456
457 /* Must be called under store_lock */
458 static void save_initpid(struct stat *sb, pid_t pid)
459 {
460 struct pidns_init_store *e;
461 char fpath[100];
462 struct stat procsb;
463 int h;
464
465 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
466
467 snprintf(fpath, 100, "/proc/%d", pid);
468 if (stat(fpath, &procsb) < 0)
469 return;
470 do {
471 e = malloc(sizeof(*e));
472 } while (!e);
473 e->ino = sb->st_ino;
474 e->initpid = pid;
475 e->ctime = procsb.st_ctime;
476 h = HASH(e->ino);
477 e->next = pidns_hash_table[h];
478 e->lastcheck = time(NULL);
479 pidns_hash_table[h] = e;
480 }
481
482 /*
483 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
484 * entry for the inode number and creation time. Verify that the init pid
485 * is still valid. If not, remove it. Return the entry if valid, NULL
486 * otherwise.
487 * Must be called under store_lock
488 */
489 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
490 {
491 int h = HASH(sb->st_ino);
492 struct pidns_init_store *e = pidns_hash_table[h];
493
494 while (e) {
495 if (e->ino == sb->st_ino) {
496 if (initpid_still_valid(e, sb)) {
497 e->lastcheck = time(NULL);
498 return e;
499 }
500 remove_initpid(e);
501 return NULL;
502 }
503 e = e->next;
504 }
505
506 return NULL;
507 }
508
509 static int is_dir(const char *path, int fd)
510 {
511 struct stat statbuf;
512 int ret = fstatat(fd, path, &statbuf, 0);
513 if (ret == 0 && S_ISDIR(statbuf.st_mode))
514 return 1;
515 return 0;
516 }
517
518 static char *must_copy_string(const char *str)
519 {
520 char *dup = NULL;
521 if (!str)
522 return NULL;
523 do {
524 dup = strdup(str);
525 } while (!dup);
526
527 return dup;
528 }
529
530 static inline void drop_trailing_newlines(char *s)
531 {
532 int l;
533
534 for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
535 s[l-1] = '\0';
536 }
537
538 #define BATCH_SIZE 50
539 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
540 {
541 int newbatches = (newlen / BATCH_SIZE) + 1;
542 int oldbatches = (oldlen / BATCH_SIZE) + 1;
543
544 if (!*mem || newbatches > oldbatches) {
545 char *tmp;
546 do {
547 tmp = realloc(*mem, newbatches * BATCH_SIZE);
548 } while (!tmp);
549 *mem = tmp;
550 }
551 }
552 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
553 {
554 size_t newlen = *len + linelen;
555 dorealloc(contents, *len, newlen + 1);
556 memcpy(*contents + *len, line, linelen+1);
557 *len = newlen;
558 }
559
560 static char *slurp_file(const char *from, int fd)
561 {
562 char *line = NULL;
563 char *contents = NULL;
564 FILE *f = fdopen(fd, "r");
565 size_t len = 0, fulllen = 0;
566 ssize_t linelen;
567
568 if (!f)
569 return NULL;
570
571 while ((linelen = getline(&line, &len, f)) != -1) {
572 append_line(&contents, &fulllen, line, linelen);
573 }
574 fclose(f);
575
576 if (contents)
577 drop_trailing_newlines(contents);
578 free(line);
579 return contents;
580 }
581
582 static bool write_string(const char *fnam, const char *string, int fd)
583 {
584 FILE *f;
585 size_t len, ret;
586
587 if (!(f = fdopen(fd, "w")))
588 return false;
589 len = strlen(string);
590 ret = fwrite(string, 1, len, f);
591 if (ret != len) {
592 lxcfs_error("Error writing to file: %s\n", strerror(errno));
593 fclose(f);
594 return false;
595 }
596 if (fclose(f) < 0) {
597 lxcfs_error("Error writing to file: %s\n", strerror(errno));
598 return false;
599 }
600 return true;
601 }
602
603 struct cgfs_files {
604 char *name;
605 uint32_t uid, gid;
606 uint32_t mode;
607 };
608
609 #define ALLOC_NUM 20
610 static bool store_hierarchy(char *stridx, char *h)
611 {
612 if (num_hierarchies % ALLOC_NUM == 0) {
613 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
614 n *= ALLOC_NUM;
615 char **tmp = realloc(hierarchies, n * sizeof(char *));
616 if (!tmp) {
617 lxcfs_error("%s\n", strerror(errno));
618 exit(1);
619 }
620 hierarchies = tmp;
621 }
622
623 hierarchies[num_hierarchies++] = must_copy_string(h);
624 return true;
625 }
626
627 static void print_subsystems(void)
628 {
629 int i;
630
631 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
632 fprintf(stderr, "hierarchies:\n");
633 for (i = 0; i < num_hierarchies; i++) {
634 if (hierarchies[i])
635 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
636 fd_hierarchies[i], hierarchies[i]);
637 }
638 }
639
640 static bool in_comma_list(const char *needle, const char *haystack)
641 {
642 const char *s = haystack, *e;
643 size_t nlen = strlen(needle);
644
645 while (*s && (e = strchr(s, ','))) {
646 if (nlen != e - s) {
647 s = e + 1;
648 continue;
649 }
650 if (strncmp(needle, s, nlen) == 0)
651 return true;
652 s = e + 1;
653 }
654 if (strcmp(needle, s) == 0)
655 return true;
656 return false;
657 }
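/*
 * Illustrative examples:
 *   in_comma_list("cpu", "cpu,cpuacct") -> true  (first entry matches)
 *   in_comma_list("cpu", "cpuset,cpu")  -> true  (last entry matches)
 *   in_comma_list("cpu", "cpuset")      -> false (no exact entry)
 */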
658
659 /* do we need to do any massaging here? I'm not sure... */
660 /* Return the mounted controller and store the corresponding open file descriptor
661 * referring to the controller mountpoint in the private lxcfs namespace in
662 * @cfd.
663 */
664 static char *find_mounted_controller(const char *controller, int *cfd)
665 {
666 int i;
667
668 for (i = 0; i < num_hierarchies; i++) {
669 if (!hierarchies[i])
670 continue;
671 if (strcmp(hierarchies[i], controller) == 0) {
672 *cfd = fd_hierarchies[i];
673 return hierarchies[i];
674 }
675 if (in_comma_list(controller, hierarchies[i])) {
676 *cfd = fd_hierarchies[i];
677 return hierarchies[i];
678 }
679 }
680
681 return NULL;
682 }
683
684 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
685 const char *value)
686 {
687 int ret, fd, cfd;
688 size_t len;
689 char *fnam, *tmpc;
690
691 tmpc = find_mounted_controller(controller, &cfd);
692 if (!tmpc)
693 return false;
694
695 /* Make sure we pass a relative path to *at() family of functions.
696 * . + /cgroup + / + file + \0
697 */
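/* For instance (illustrative), cgroup "/lxc/c1" and file "freezer.state"
 * yield fnam "./lxc/c1/freezer.state", which is resolved relative to cfd,
 * the fd of that controller's mount in lxcfs's private mount namespace.
 */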
698 len = strlen(cgroup) + strlen(file) + 3;
699 fnam = alloca(len);
700 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
701 if (ret < 0 || (size_t)ret >= len)
702 return false;
703
704 fd = openat(cfd, fnam, O_WRONLY);
705 if (fd < 0)
706 return false;
707
708 return write_string(fnam, value, fd);
709 }
710
711 // Chown all the files in the cgroup directory. We do this when we create
712 // a cgroup on behalf of a user.
713 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
714 {
715 struct dirent *direntp;
716 char path[MAXPATHLEN];
717 size_t len;
718 DIR *d;
719 int fd1, ret;
720
721 len = strlen(dirname);
722 if (len >= MAXPATHLEN) {
723 lxcfs_error("Pathname too long: %s\n", dirname);
724 return;
725 }
726
727 fd1 = openat(fd, dirname, O_DIRECTORY);
728 if (fd1 < 0)
729 return;
730
731 d = fdopendir(fd1);
732 if (!d) {
733 lxcfs_error("Failed to open %s\n", dirname);
734 return;
735 }
736
737 while ((direntp = readdir(d))) {
738 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
739 continue;
740 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
741 if (ret < 0 || ret >= MAXPATHLEN) {
742 lxcfs_error("Pathname too long under %s\n", dirname);
743 continue;
744 }
745 if (fchownat(fd, path, uid, gid, 0) < 0)
746 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
747 }
748 closedir(d);
749 }
750
751 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
752 {
753 int cfd;
754 size_t len;
755 char *dirnam, *tmpc;
756
757 tmpc = find_mounted_controller(controller, &cfd);
758 if (!tmpc)
759 return -EINVAL;
760
761 /* Make sure we pass a relative path to *at() family of functions.
762 * . + /cg + \0
763 */
764 len = strlen(cg) + 2;
765 dirnam = alloca(len);
766 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
767
768 if (mkdirat(cfd, dirnam, 0755) < 0)
769 return -errno;
770
771 if (uid == 0 && gid == 0)
772 return 0;
773
774 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
775 return -errno;
776
777 chown_all_cgroup_files(dirnam, uid, gid, cfd);
778
779 return 0;
780 }
781
782 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
783 {
784 struct dirent *direntp;
785 DIR *dir;
786 bool ret = false;
787 char pathname[MAXPATHLEN];
788 int dupfd;
789
790 dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
791 if (dupfd < 0)
792 return false;
793
794 dir = fdopendir(dupfd);
795 if (!dir) {
796 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
797 close(dupfd);
798 return false;
799 }
800
801 while ((direntp = readdir(dir))) {
802 struct stat mystat;
803 int rc;
804
805 if (!strcmp(direntp->d_name, ".") ||
806 !strcmp(direntp->d_name, ".."))
807 continue;
808
809 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
810 if (rc < 0 || rc >= MAXPATHLEN) {
811 lxcfs_error("%s\n", "Pathname too long.");
812 continue;
813 }
814
815 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
816 if (rc) {
817 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
818 continue;
819 }
820 if (S_ISDIR(mystat.st_mode))
821 if (!recursive_rmdir(pathname, fd, cfd))
822 lxcfs_debug("Error removing %s.\n", pathname);
823 }
824
825 ret = true;
826 if (closedir(dir) < 0) {
827 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
828 ret = false;
829 }
830
831 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
832 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
833 ret = false;
834 }
835
836 close(dupfd);
837
838 return ret;
839 }
840
841 bool cgfs_remove(const char *controller, const char *cg)
842 {
843 int fd, cfd;
844 size_t len;
845 char *dirnam, *tmpc;
846 bool bret;
847
848 tmpc = find_mounted_controller(controller, &cfd);
849 if (!tmpc)
850 return false;
851
852 /* Make sure we pass a relative path to *at() family of functions.
853 * . + /cg + \0
854 */
855 len = strlen(cg) + 2;
856 dirnam = alloca(len);
857 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
858
859 fd = openat(cfd, dirnam, O_DIRECTORY);
860 if (fd < 0)
861 return false;
862
863 bret = recursive_rmdir(dirnam, fd, cfd);
864 close(fd);
865 return bret;
866 }
867
868 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
869 {
870 int cfd;
871 size_t len;
872 char *pathname, *tmpc;
873
874 tmpc = find_mounted_controller(controller, &cfd);
875 if (!tmpc)
876 return false;
877
878 /* Make sure we pass a relative path to *at() family of functions.
879 * . + /file + \0
880 */
881 len = strlen(file) + 2;
882 pathname = alloca(len);
883 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
884 if (fchmodat(cfd, pathname, mode, 0) < 0)
885 return false;
886 return true;
887 }
888
889 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
890 {
891 size_t len;
892 char *fname;
893
894 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
895 fname = alloca(len);
896 snprintf(fname, len, "%s/tasks", dirname);
897 if (fchownat(fd, fname, uid, gid, 0) != 0)
898 return -errno;
899 snprintf(fname, len, "%s/cgroup.procs", dirname);
900 if (fchownat(fd, fname, uid, gid, 0) != 0)
901 return -errno;
902 return 0;
903 }
904
905 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
906 {
907 int cfd;
908 size_t len;
909 char *pathname, *tmpc;
910
911 tmpc = find_mounted_controller(controller, &cfd);
912 if (!tmpc)
913 return -EINVAL;
914
915 /* Make sure we pass a relative path to *at() family of functions.
916 * . + /file + \0
917 */
918 len = strlen(file) + 2;
919 pathname = alloca(len);
920 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
921 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
922 return -errno;
923
924 if (is_dir(pathname, cfd))
925 // like cgmanager did, we want to chown the tasks file as well
926 return chown_tasks_files(pathname, uid, gid, cfd);
927
928 return 0;
929 }
930
931 FILE *open_pids_file(const char *controller, const char *cgroup)
932 {
933 int fd, cfd;
934 size_t len;
935 char *pathname, *tmpc;
936
937 tmpc = find_mounted_controller(controller, &cfd);
938 if (!tmpc)
939 return NULL;
940
941 /* Make sure we pass a relative path to *at() family of functions.
942 * . + /cgroup + / "cgroup.procs" + \0
943 */
944 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
945 pathname = alloca(len);
946 snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
947
948 fd = openat(cfd, pathname, O_WRONLY);
949 if (fd < 0)
950 return NULL;
951
952 return fdopen(fd, "w");
953 }
954
955 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
956 void ***list, size_t typesize,
957 void* (*iterator)(const char*, const char*, const char*))
958 {
959 int cfd, fd, ret;
960 size_t len;
961 char *cg, *tmpc;
962 char pathname[MAXPATHLEN];
963 size_t sz = 0, asz = 0;
964 struct dirent *dirent;
965 DIR *dir;
966
967 tmpc = find_mounted_controller(controller, &cfd);
968 *list = NULL;
969 if (!tmpc)
970 return false;
971
972 /* Make sure we pass a relative path to *at() family of functions. */
973 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
974 cg = alloca(len);
975 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
976 if (ret < 0 || (size_t)ret >= len) {
977 lxcfs_error("Pathname too long under %s\n", cgroup);
978 return false;
979 }
980
981 fd = openat(cfd, cg, O_DIRECTORY);
982 if (fd < 0)
983 return false;
984
985 dir = fdopendir(fd);
986 if (!dir)
987 return false;
988
989 while ((dirent = readdir(dir))) {
990 struct stat mystat;
991
992 if (!strcmp(dirent->d_name, ".") ||
993 !strcmp(dirent->d_name, ".."))
994 continue;
995
996 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
997 if (ret < 0 || ret >= MAXPATHLEN) {
998 lxcfs_error("Pathname too long under %s\n", cg);
999 continue;
1000 }
1001
1002 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1003 if (ret) {
1004 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1005 continue;
1006 }
1007 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1008 (directories && !S_ISDIR(mystat.st_mode)))
1009 continue;
1010
1011 if (sz+2 >= asz) {
1012 void **tmp;
1013 asz += BATCH_SIZE;
1014 do {
1015 tmp = realloc(*list, asz * typesize);
1016 } while (!tmp);
1017 *list = tmp;
1018 }
1019 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1020 (*list)[sz+1] = NULL;
1021 sz++;
1022 }
1023 if (closedir(dir) < 0) {
1024 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1025 return false;
1026 }
1027 return true;
1028 }
1029
1030 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1031 {
1032 char *dup;
1033 do {
1034 dup = strdup(dir_entry);
1035 } while (!dup);
1036 return dup;
1037 }
1038
1039 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1040 {
1041 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1042 }
1043
1044 void free_key(struct cgfs_files *k)
1045 {
1046 if (!k)
1047 return;
1048 free(k->name);
1049 free(k);
1050 }
1051
1052 void free_keys(struct cgfs_files **keys)
1053 {
1054 int i;
1055
1056 if (!keys)
1057 return;
1058 for (i = 0; keys[i]; i++) {
1059 free_key(keys[i]);
1060 }
1061 free(keys);
1062 }
1063
1064 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
1065 {
1066 int ret, fd, cfd;
1067 size_t len;
1068 char *fnam, *tmpc;
1069
1070 tmpc = find_mounted_controller(controller, &cfd);
1071 if (!tmpc)
1072 return false;
1073
1074 /* Make sure we pass a relative path to *at() family of functions.
1075 * . + /cgroup + / + file + \0
1076 */
1077 len = strlen(cgroup) + strlen(file) + 3;
1078 fnam = alloca(len);
1079 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1080 if (ret < 0 || (size_t)ret >= len)
1081 return false;
1082
1083 fd = openat(cfd, fnam, O_RDONLY);
1084 if (fd < 0)
1085 return false;
1086
1087 *value = slurp_file(fnam, fd);
1088 return *value != NULL;
1089 }
1090
1091 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1092 {
1093 int ret, cfd;
1094 size_t len;
1095 char *fnam, *tmpc;
1096 struct stat sb;
1097 struct cgfs_files *newkey;
1098
1099 tmpc = find_mounted_controller(controller, &cfd);
1100 if (!tmpc)
1101 return NULL;
1102
1103 if (file && *file == '/')
1104 file++;
1105
1106 if (file && strchr(file, '/'))
1107 return NULL;
1108
1109 /* Make sure we pass a relative path to *at() family of functions.
1110 * . + /cgroup + / + file + \0
1111 */
1112 len = strlen(cgroup) + 3;
1113 if (file)
1114 len += strlen(file) + 1;
1115 fnam = alloca(len);
1116 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1117 file ? "/" : "", file ? file : "");
1118
1119 ret = fstatat(cfd, fnam, &sb, 0);
1120 if (ret < 0)
1121 return NULL;
1122
1123 do {
1124 newkey = malloc(sizeof(struct cgfs_files));
1125 } while (!newkey);
1126 if (file)
1127 newkey->name = must_copy_string(file);
1128 else if (strrchr(cgroup, '/'))
1129 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1130 else
1131 newkey->name = must_copy_string(cgroup);
1132 newkey->uid = sb.st_uid;
1133 newkey->gid = sb.st_gid;
1134 newkey->mode = sb.st_mode;
1135
1136 return newkey;
1137 }
1138
1139 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1140 {
1141 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1142 if (!entry) {
1143 lxcfs_error("Error getting files under %s:%s\n", controller,
1144 cgroup);
1145 }
1146 return entry;
1147 }
1148
1149 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1150 {
1151 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1152 }
1153
1154 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1155 {
1156 int cfd;
1157 size_t len;
1158 char *fnam, *tmpc;
1159 int ret;
1160 struct stat sb;
1161
1162 tmpc = find_mounted_controller(controller, &cfd);
1163 if (!tmpc)
1164 return false;
1165
1166 /* Make sure we pass a relative path to *at() family of functions.
1167 * . + /cgroup + / + f + \0
1168 */
1169 len = strlen(cgroup) + strlen(f) + 3;
1170 fnam = alloca(len);
1171 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
1172 if (ret < 0 || (size_t)ret >= len)
1173 return false;
1174
1175 ret = fstatat(cfd, fnam, &sb, 0);
1176 if (ret < 0 || !S_ISDIR(sb.st_mode))
1177 return false;
1178
1179 return true;
1180 }
1181
1182 #define SEND_CREDS_OK 0
1183 #define SEND_CREDS_NOTSK 1
1184 #define SEND_CREDS_FAIL 2
1185 static bool recv_creds(int sock, struct ucred *cred, char *v);
1186 static int wait_for_pid(pid_t pid);
1187 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1188 static int send_creds_clone_wrapper(void *arg);
1189
1190 /*
1191 * clone a task which switches to @task's namespace and writes '1'
1192 * over a unix sock so we can read the task's reaper's pid in our
1193 * namespace.
1194 *
1195 * Note: glibc's fork() does not respect pidns, which can lead to failed
1196 * assertions inside glibc (and thus failed forks) if the child's pid in
1197 * the pidns and the parent pid outside are identical. Using clone prevents
1198 * this issue.
1199 */
1200 static void write_task_init_pid_exit(int sock, pid_t target)
1201 {
1202 char fnam[100];
1203 pid_t pid;
1204 int fd, ret;
1205 size_t stack_size = sysconf(_SC_PAGESIZE);
1206 void *stack = alloca(stack_size);
1207
1208 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1209 if (ret < 0 || ret >= sizeof(fnam))
1210 _exit(1);
1211
1212 fd = open(fnam, O_RDONLY);
1213 if (fd < 0) {
1214 perror("write_task_init_pid_exit open of ns/pid");
1215 _exit(1);
1216 }
1217 if (setns(fd, 0)) {
1218 perror("write_task_init_pid_exit setns 1");
1219 close(fd);
1220 _exit(1);
1221 }
1222 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1223 if (pid < 0)
1224 _exit(1);
1225 if (pid != 0) {
1226 if (!wait_for_pid(pid))
1227 _exit(1);
1228 _exit(0);
1229 }
1230 }
1231
1232 static int send_creds_clone_wrapper(void *arg) {
1233 struct ucred cred;
1234 char v;
1235 int sock = *(int *)arg;
1236
1237 /* we are the child */
1238 cred.uid = 0;
1239 cred.gid = 0;
1240 cred.pid = 1;
1241 v = '1';
1242 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1243 return 1;
1244 return 0;
1245 }
1246
1247 static pid_t get_init_pid_for_task(pid_t task)
1248 {
1249 int sock[2];
1250 pid_t pid;
1251 pid_t ret = -1;
1252 char v = '0';
1253 struct ucred cred;
1254
1255 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1256 perror("socketpair");
1257 return -1;
1258 }
1259
1260 pid = fork();
1261 if (pid < 0)
1262 goto out;
1263 if (!pid) {
1264 close(sock[1]);
1265 write_task_init_pid_exit(sock[0], task);
1266 _exit(0);
1267 }
1268
1269 if (!recv_creds(sock[1], &cred, &v))
1270 goto out;
1271 ret = cred.pid;
1272
1273 out:
1274 close(sock[0]);
1275 close(sock[1]);
1276 if (pid > 0)
1277 wait_for_pid(pid);
1278 return ret;
1279 }
1280
1281 static pid_t lookup_initpid_in_store(pid_t qpid)
1282 {
1283 pid_t answer = 0;
1284 struct stat sb;
1285 struct pidns_init_store *e;
1286 char fnam[100];
1287
1288 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1289 store_lock();
1290 if (stat(fnam, &sb) < 0)
1291 goto out;
1292 e = lookup_verify_initpid(&sb);
1293 if (e) {
1294 answer = e->initpid;
1295 goto out;
1296 }
1297 answer = get_init_pid_for_task(qpid);
1298 if (answer > 0)
1299 save_initpid(&sb, answer);
1300
1301 out:
1302 /* we prune at the end so that we do not drop the entry
1303 * whose value we are about to return */
1304 prune_initpid_store();
1305 store_unlock();
1306 return answer;
1307 }
1308
1309 static int wait_for_pid(pid_t pid)
1310 {
1311 int status, ret;
1312
1313 if (pid <= 0)
1314 return -1;
1315
1316 again:
1317 ret = waitpid(pid, &status, 0);
1318 if (ret == -1) {
1319 if (errno == EINTR)
1320 goto again;
1321 return -1;
1322 }
1323 if (ret != pid)
1324 goto again;
1325 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1326 return -1;
1327 return 0;
1328 }
1329
1330
1331 /*
1332 * append pid to *src.
1333 * src: a pointer to a char* in which to append the pid.
1334 * sz: the number of characters printed so far, minus trailing \0.
1335 * asz: the allocated size so far
1336 * pid: the pid to append
1337 */
1338 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1339 {
1340 char tmp[30];
1341
1342 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1343
1344 if (!*src || tmplen + *sz + 1 >= *asz) {
1345 char *tmp;
1346 do {
1347 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1348 } while (!tmp);
1349 *src = tmp;
1350 *asz += BUF_RESERVE_SIZE;
1351 }
1352 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1353 *sz += tmplen;
1354 }
1355
1356 /*
1357 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
1358 * valid in the caller's namespace, return the id mapped into
1359 * pid's namespace.
1360 * Returns the mapped id, or -1 on error.
1361 */
1362 unsigned int
1363 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1364 {
1365 unsigned int nsuid, // base id for a range in the idfile's namespace
1366 hostuid, // base id for a range in the caller's namespace
1367 count; // number of ids in this range
1368 char line[400];
1369 int ret;
1370
1371 fseek(idfile, 0L, SEEK_SET);
1372 while (fgets(line, 400, idfile)) {
1373 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1374 if (ret != 3)
1375 continue;
1376 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1377 /*
1378 * uids wrapped around - unexpected as this is a procfile,
1379 * so just bail.
1380 */
1381 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1382 nsuid, hostuid, count, line);
1383 return -1;
1384 }
1385 if (hostuid <= in_id && hostuid+count > in_id) {
1386 /*
1387 * now since hostuid <= in_id < hostuid+count, and
1388 * hostuid+count and nsuid+count do not wrap around,
1389 * we know that nsuid+(in_id-hostuid), which is less
1390 * than nsuid+count, does not wrap around either
1391 */
1392 return (in_id - hostuid) + nsuid;
1393 }
1394 }
1395
1396 // no answer found
1397 return -1;
1398 }
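/*
 * Worked example (illustrative): with a single uid_map line "0 100000 65536",
 * convert_id_to_ns(f, 100001) returns (100001 - 100000) + 0 == 1, i.e. host
 * uid 100001 maps to uid 1 inside that namespace; an unmapped host uid such
 * as 5 yields -1.
 */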
1399
1400 /*
1401 * for is_privileged_over,
1402 * specify whether we require the calling uid to be root in his
1403 * namespace
1404 */
1405 #define NS_ROOT_REQD true
1406 #define NS_ROOT_OPT false
1407
1408 #define PROCLEN 100
1409
1410 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1411 {
1412 char fpath[PROCLEN];
1413 int ret;
1414 bool answer = false;
1415 uid_t nsuid;
1416
1417 if (victim == -1 || uid == -1)
1418 return false;
1419
1420 /*
1421 * If the request is one not requiring root in the namespace,
1422 * then having the same uid suffices. (i.e. uid 1000 has write
1423 * access to files owned by uid 1000).
1424 */
1425 if (!req_ns_root && uid == victim)
1426 return true;
1427
1428 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1429 if (ret < 0 || ret >= PROCLEN)
1430 return false;
1431 FILE *f = fopen(fpath, "r");
1432 if (!f)
1433 return false;
1434
1435 /* if caller's not root in his namespace, reject */
1436 nsuid = convert_id_to_ns(f, uid);
1437 if (nsuid)
1438 goto out;
1439
1440 /*
1441 * If victim is not mapped into caller's ns, reject.
1442 * XXX I'm not sure this check is needed given that fuse
1443 * will be sending requests where the vfs has converted
1444 */
1445 nsuid = convert_id_to_ns(f, victim);
1446 if (nsuid == -1)
1447 goto out;
1448
1449 answer = true;
1450
1451 out:
1452 fclose(f);
1453 return answer;
1454 }
1455
1456 static bool perms_include(int fmode, mode_t req_mode)
1457 {
1458 mode_t r;
1459
1460 switch (req_mode & O_ACCMODE) {
1461 case O_RDONLY:
1462 r = S_IROTH;
1463 break;
1464 case O_WRONLY:
1465 r = S_IWOTH;
1466 break;
1467 case O_RDWR:
1468 r = S_IROTH | S_IWOTH;
1469 break;
1470 default:
1471 return false;
1472 }
1473 return ((fmode & r) == r);
1474 }
1475
1476
1477 /*
1478 * taskcg is /a/b/c/d/e
1479 * querycg is /a/b/c
1480 * we return 'd'
1481 */
1482 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1483 {
1484 char *start, *end;
1485
1486 if (strlen(taskcg) <= strlen(querycg)) {
1487 lxcfs_error("%s\n", "I was fed bad input.");
1488 return NULL;
1489 }
1490
1491 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1492 start = strdup(taskcg + 1);
1493 else
1494 start = strdup(taskcg + strlen(querycg) + 1);
1495 if (!start)
1496 return NULL;
1497 end = strchr(start, '/');
1498 if (end)
1499 *end = '\0';
1500 return start;
1501 }
1502
1503 static void stripnewline(char *x)
1504 {
1505 size_t l = strlen(x);
1506 if (l && x[l-1] == '\n')
1507 x[l-1] = '\0';
1508 }
1509
1510 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1511 {
1512 int cfd;
1513 char fnam[PROCLEN];
1514 FILE *f;
1515 char *answer = NULL;
1516 char *line = NULL;
1517 size_t len = 0;
1518 int ret;
1519 const char *h = find_mounted_controller(contrl, &cfd);
1520 if (!h)
1521 return NULL;
1522
1523 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1524 if (ret < 0 || ret >= PROCLEN)
1525 return NULL;
1526 if (!(f = fopen(fnam, "r")))
1527 return NULL;
1528
1529 while (getline(&line, &len, f) != -1) {
1530 char *c1, *c2;
1531 if (!line[0])
1532 continue;
1533 c1 = strchr(line, ':');
1534 if (!c1)
1535 goto out;
1536 c1++;
1537 c2 = strchr(c1, ':');
1538 if (!c2)
1539 goto out;
1540 *c2 = '\0';
1541 if (strcmp(c1, h) != 0)
1542 continue;
1543 c2++;
1544 stripnewline(c2);
1545 do {
1546 answer = strdup(c2);
1547 } while (!answer);
1548 break;
1549 }
1550
1551 out:
1552 fclose(f);
1553 free(line);
1554 return answer;
1555 }
1556
1557 /*
1558 * check whether a fuse context may access a cgroup dir or file
1559 *
1560 * If file is not null, it is a cgroup file to check under cg.
1561 * If file is null, then we are checking perms on cg itself.
1562 *
1563 * For files we can check the mode of the list_keys result.
1564 * For cgroups, we must make assumptions based on the files under the
1565 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1566 * yet.
1567 */
1568 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1569 {
1570 struct cgfs_files *k = NULL;
1571 bool ret = false;
1572
1573 k = cgfs_get_key(contrl, cg, file);
1574 if (!k)
1575 return false;
1576
1577 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1578 if (perms_include(k->mode >> 6, mode)) {
1579 ret = true;
1580 goto out;
1581 }
1582 }
1583 if (fc->gid == k->gid) {
1584 if (perms_include(k->mode >> 3, mode)) {
1585 ret = true;
1586 goto out;
1587 }
1588 }
1589 ret = perms_include(k->mode, mode);
1590
1591 out:
1592 free_key(k);
1593 return ret;
1594 }
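/*
 * Illustrative example: for a cgroup file with mode 0644 owned by uid 1000,
 * a caller privileged over uid 1000 passes the owner check (0644 >> 6 == 06,
 * which satisfies both O_RDONLY and O_WRONLY), while an unrelated caller only
 * passes the "other" check (04) and may open the file read-only.
 */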
1595
1596 #define INITSCOPE "/init.scope"
1597 static void prune_init_slice(char *cg)
1598 {
1599 char *point;
1600 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1601
1602 if (cg_len < initscope_len)
1603 return;
1604
1605 point = cg + cg_len - initscope_len;
1606 if (strcmp(point, INITSCOPE) == 0) {
1607 if (point == cg)
1608 *(point+1) = '\0';
1609 else
1610 *point = '\0';
1611 }
1612 }
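/*
 * Illustrative examples: prune_init_slice() turns "/system.slice/init.scope"
 * into "/system.slice" and "/init.scope" into "/"; strings without a trailing
 * "/init.scope" are left untouched.
 */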
1613
1614 /*
1615 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1616 * If pid is in /a, he may act on /a/b, but not on /b.
1617 * if the answer is false and nextcg is not NULL, then *nextcg will point
1618 * to a string containing the next cgroup directory under cg, which must be
1619 * freed by the caller.
1620 */
1621 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1622 {
1623 bool answer = false;
1624 char *c2 = get_pid_cgroup(pid, contrl);
1625 char *linecmp;
1626
1627 if (!c2)
1628 return false;
1629 prune_init_slice(c2);
1630
1631 /*
1632 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1633 * they pass in a cgroup without leading '/'
1634 *
1635 * The original line here was:
1636 * linecmp = *cg == '/' ? c2 : c2+1;
1637 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1638 * Serge, do you know?
1639 */
1640 if (*cg == '/' || !strncmp(cg, "./", 2))
1641 linecmp = c2;
1642 else
1643 linecmp = c2 + 1;
1644 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1645 if (nextcg) {
1646 *nextcg = get_next_cgroup_dir(linecmp, cg);
1647 }
1648 goto out;
1649 }
1650 answer = true;
1651
1652 out:
1653 free(c2);
1654 return answer;
1655 }
1656
1657 /*
1658 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1659 */
1660 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1661 {
1662 bool answer = false;
1663 char *c2, *task_cg;
1664 size_t target_len, task_len;
1665
1666 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1667 return true;
1668
1669 c2 = get_pid_cgroup(pid, contrl);
1670 if (!c2)
1671 return false;
1672 prune_init_slice(c2);
1673
1674 task_cg = c2 + 1;
1675 target_len = strlen(cg);
1676 task_len = strlen(task_cg);
1677 if (task_len == 0) {
1678 /* Task is in the root cg, it can see everything. This case is
1679 * not handled by the strcmps below, since they test for the
1680 * last /, but that is the first / that we've chopped off
1681 * above.
1682 */
1683 answer = true;
1684 goto out;
1685 }
1686 if (strcmp(cg, task_cg) == 0) {
1687 answer = true;
1688 goto out;
1689 }
1690 if (target_len < task_len) {
1691 /* looking up a parent dir */
1692 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1693 answer = true;
1694 goto out;
1695 }
1696 if (target_len > task_len) {
1697 /* looking up a child dir */
1698 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1699 answer = true;
1700 goto out;
1701 }
1702
1703 out:
1704 free(c2);
1705 return answer;
1706 }
1707
1708 /*
1709 * given /cgroup/freezer/a/b, return "freezer".
1710 * the returned char* should NOT be freed.
1711 */
1712 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1713 {
1714 const char *p1;
1715 char *contr, *slash;
1716
1717 if (strlen(path) < 9) {
1718 errno = EACCES;
1719 return NULL;
1720 }
1721 if (*(path + 7) != '/') {
1722 errno = EINVAL;
1723 return NULL;
1724 }
1725 p1 = path + 8;
1726 contr = strdupa(p1);
1727 if (!contr) {
1728 errno = ENOMEM;
1729 return NULL;
1730 }
1731 slash = strstr(contr, "/");
1732 if (slash)
1733 *slash = '\0';
1734
1735 int i;
1736 for (i = 0; i < num_hierarchies; i++) {
1737 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1738 return hierarchies[i];
1739 }
1740 errno = ENOENT;
1741 return NULL;
1742 }
1743
1744 /*
1745 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1746 * Note that the returned value may include files (keynames) etc
1747 */
1748 static const char *find_cgroup_in_path(const char *path)
1749 {
1750 const char *p1;
1751
1752 if (strlen(path) < 9) {
1753 errno = EACCES;
1754 return NULL;
1755 }
1756 p1 = strstr(path + 8, "/");
1757 if (!p1) {
1758 errno = EINVAL;
1759 return NULL;
1760 }
1761 errno = 0;
1762 return p1 + 1;
1763 }
1764
1765 /*
1766 * split the last path element from the path in @cg.
1767 * @dir is newly allocated and should be freed, @last not
1768 */
1769 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1770 {
1771 char *p;
1772
1773 do {
1774 *dir = strdup(cg);
1775 } while (!*dir);
1776 *last = strrchr(cg, '/');
1777 if (!*last) {
1778 *last = NULL;
1779 return;
1780 }
1781 p = strrchr(*dir, '/');
1782 *p = '\0';
1783 }
1784
1785 /*
1786 * FUSE ops for /cgroup
1787 */
1788
1789 int cg_getattr(const char *path, struct stat *sb)
1790 {
1791 struct timespec now;
1792 struct fuse_context *fc = fuse_get_context();
1793 char * cgdir = NULL;
1794 char *last = NULL, *path1, *path2;
1795 struct cgfs_files *k = NULL;
1796 const char *cgroup;
1797 const char *controller = NULL;
1798 int ret = -ENOENT;
1799
1800
1801 if (!fc)
1802 return -EIO;
1803
1804 memset(sb, 0, sizeof(struct stat));
1805
1806 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1807 return -EINVAL;
1808
1809 sb->st_uid = sb->st_gid = 0;
1810 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1811 sb->st_size = 0;
1812
1813 if (strcmp(path, "/cgroup") == 0) {
1814 sb->st_mode = S_IFDIR | 00755;
1815 sb->st_nlink = 2;
1816 return 0;
1817 }
1818
1819 controller = pick_controller_from_path(fc, path);
1820 if (!controller)
1821 return -errno;
1822 cgroup = find_cgroup_in_path(path);
1823 if (!cgroup) {
1824 /* this is just /cgroup/controller, return it as a dir */
1825 sb->st_mode = S_IFDIR | 00755;
1826 sb->st_nlink = 2;
1827 return 0;
1828 }
1829
1830 get_cgdir_and_path(cgroup, &cgdir, &last);
1831
1832 if (!last) {
1833 path1 = "/";
1834 path2 = cgdir;
1835 } else {
1836 path1 = cgdir;
1837 path2 = last;
1838 }
1839
1840 pid_t initpid = lookup_initpid_in_store(fc->pid);
1841 if (initpid <= 0)
1842 initpid = fc->pid;
1843 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1844 * Then check that caller's cgroup is under path if last is a child
1845 * cgroup, or cgdir if last is a file */
1846
1847 if (is_child_cgroup(controller, path1, path2)) {
1848 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1849 ret = -ENOENT;
1850 goto out;
1851 }
1852 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1853 /* this is just /cgroup/controller, return it as a dir */
1854 sb->st_mode = S_IFDIR | 00555;
1855 sb->st_nlink = 2;
1856 ret = 0;
1857 goto out;
1858 }
1859 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1860 ret = -EACCES;
1861 goto out;
1862 }
1863
1864 // get uid, gid, from '/tasks' file and make up a mode
1865 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1866 sb->st_mode = S_IFDIR | 00755;
1867 k = cgfs_get_key(controller, cgroup, NULL);
1868 if (!k) {
1869 sb->st_uid = sb->st_gid = 0;
1870 } else {
1871 sb->st_uid = k->uid;
1872 sb->st_gid = k->gid;
1873 }
1874 free_key(k);
1875 sb->st_nlink = 2;
1876 ret = 0;
1877 goto out;
1878 }
1879
1880 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1881 sb->st_mode = S_IFREG | k->mode;
1882 sb->st_nlink = 1;
1883 sb->st_uid = k->uid;
1884 sb->st_gid = k->gid;
1885 sb->st_size = 0;
1886 free_key(k);
1887 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1888 ret = -ENOENT;
1889 goto out;
1890 }
1891 ret = 0;
1892 }
1893
1894 out:
1895 free(cgdir);
1896 return ret;
1897 }
1898
1899 int cg_opendir(const char *path, struct fuse_file_info *fi)
1900 {
1901 struct fuse_context *fc = fuse_get_context();
1902 const char *cgroup;
1903 struct file_info *dir_info;
1904 char *controller = NULL;
1905
1906 if (!fc)
1907 return -EIO;
1908
1909 if (strcmp(path, "/cgroup") == 0) {
1910 cgroup = NULL;
1911 controller = NULL;
1912 } else {
1913 // return list of keys for the controller, and list of child cgroups
1914 controller = pick_controller_from_path(fc, path);
1915 if (!controller)
1916 return -errno;
1917
1918 cgroup = find_cgroup_in_path(path);
1919 if (!cgroup) {
1920 /* this is just /cgroup/controller, return its contents */
1921 cgroup = "/";
1922 }
1923 }
1924
1925 pid_t initpid = lookup_initpid_in_store(fc->pid);
1926 if (initpid <= 0)
1927 initpid = fc->pid;
1928 if (cgroup) {
1929 if (!caller_may_see_dir(initpid, controller, cgroup))
1930 return -ENOENT;
1931 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1932 return -EACCES;
1933 }
1934
1935 /* we'll free this at cg_releasedir */
1936 dir_info = malloc(sizeof(*dir_info));
1937 if (!dir_info)
1938 return -ENOMEM;
1939 dir_info->controller = must_copy_string(controller);
1940 dir_info->cgroup = must_copy_string(cgroup);
1941 dir_info->type = LXC_TYPE_CGDIR;
1942 dir_info->buf = NULL;
1943 dir_info->file = NULL;
1944 dir_info->buflen = 0;
1945
1946 fi->fh = (unsigned long)dir_info;
1947 return 0;
1948 }
1949
1950 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1951 struct fuse_file_info *fi)
1952 {
1953 struct file_info *d = (struct file_info *)fi->fh;
1954 struct cgfs_files **list = NULL;
1955 int i, ret;
1956 char *nextcg = NULL;
1957 struct fuse_context *fc = fuse_get_context();
1958 char **clist = NULL;
1959
1960 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1961 return -EIO;
1962
1963 if (d->type != LXC_TYPE_CGDIR) {
1964 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1965 return -EIO;
1966 }
1967 if (!d->cgroup && !d->controller) {
1968 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1969 int i;
1970
1971 for (i = 0; i < num_hierarchies; i++) {
1972 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1973 return -EIO;
1974 }
1975 }
1976 return 0;
1977 }
1978
1979 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1980 // not a valid cgroup
1981 ret = -EINVAL;
1982 goto out;
1983 }
1984
1985 pid_t initpid = lookup_initpid_in_store(fc->pid);
1986 if (initpid <= 0)
1987 initpid = fc->pid;
1988 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1989 if (nextcg) {
1990 ret = filler(buf, nextcg, NULL, 0);
1991 free(nextcg);
1992 if (ret != 0) {
1993 ret = -EIO;
1994 goto out;
1995 }
1996 }
1997 ret = 0;
1998 goto out;
1999 }
2000
2001 for (i = 0; list[i]; i++) {
2002 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2003 ret = -EIO;
2004 goto out;
2005 }
2006 }
2007
2008 // now get the list of child cgroups
2009
2010 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2011 ret = 0;
2012 goto out;
2013 }
2014 if (clist) {
2015 for (i = 0; clist[i]; i++) {
2016 if (filler(buf, clist[i], NULL, 0) != 0) {
2017 ret = -EIO;
2018 goto out;
2019 }
2020 }
2021 }
2022 ret = 0;
2023
2024 out:
2025 free_keys(list);
2026 if (clist) {
2027 for (i = 0; clist[i]; i++)
2028 free(clist[i]);
2029 free(clist);
2030 }
2031 return ret;
2032 }
2033
2034 static void do_release_file_info(struct fuse_file_info *fi)
2035 {
2036 struct file_info *f = (struct file_info *)fi->fh;
2037
2038 if (!f)
2039 return;
2040
2041 fi->fh = 0;
2042
2043 free(f->controller);
2044 f->controller = NULL;
2045 free(f->cgroup);
2046 f->cgroup = NULL;
2047 free(f->file);
2048 f->file = NULL;
2049 free(f->buf);
2050 f->buf = NULL;
2051 free(f);
2052 }
2053
2054 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2055 {
2056 do_release_file_info(fi);
2057 return 0;
2058 }
2059
2060 int cg_open(const char *path, struct fuse_file_info *fi)
2061 {
2062 const char *cgroup;
2063 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2064 struct cgfs_files *k = NULL;
2065 struct file_info *file_info;
2066 struct fuse_context *fc = fuse_get_context();
2067 int ret;
2068
2069 if (!fc)
2070 return -EIO;
2071
2072 controller = pick_controller_from_path(fc, path);
2073 if (!controller)
2074 return -errno;
2075 cgroup = find_cgroup_in_path(path);
2076 if (!cgroup)
2077 return -errno;
2078
2079 get_cgdir_and_path(cgroup, &cgdir, &last);
2080 if (!last) {
2081 path1 = "/";
2082 path2 = cgdir;
2083 } else {
2084 path1 = cgdir;
2085 path2 = last;
2086 }
2087
2088 k = cgfs_get_key(controller, path1, path2);
2089 if (!k) {
2090 ret = -EINVAL;
2091 goto out;
2092 }
2093 free_key(k);
2094
2095 pid_t initpid = lookup_initpid_in_store(fc->pid);
2096 if (initpid <= 0)
2097 initpid = fc->pid;
2098 if (!caller_may_see_dir(initpid, controller, path1)) {
2099 ret = -ENOENT;
2100 goto out;
2101 }
2102 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2103 ret = -EACCES;
2104 goto out;
2105 }
2106
2107 /* we'll free this at cg_release */
2108 file_info = malloc(sizeof(*file_info));
2109 if (!file_info) {
2110 ret = -ENOMEM;
2111 goto out;
2112 }
2113 file_info->controller = must_copy_string(controller);
2114 file_info->cgroup = must_copy_string(path1);
2115 file_info->file = must_copy_string(path2);
2116 file_info->type = LXC_TYPE_CGFILE;
2117 file_info->buf = NULL;
2118 file_info->buflen = 0;
2119
2120 fi->fh = (unsigned long)file_info;
2121 ret = 0;
2122
2123 out:
2124 free(cgdir);
2125 return ret;
2126 }
2127
2128 int cg_access(const char *path, int mode)
2129 {
2130 int ret;
2131 const char *cgroup;
2132 char *path1, *path2, *controller;
2133 char *last = NULL, *cgdir = NULL;
2134 struct cgfs_files *k = NULL;
2135 struct fuse_context *fc = fuse_get_context();
2136
2137 if (strcmp(path, "/cgroup") == 0)
2138 return 0;
2139
2140 if (!fc)
2141 return -EIO;
2142
2143 controller = pick_controller_from_path(fc, path);
2144 if (!controller)
2145 return -errno;
2146 cgroup = find_cgroup_in_path(path);
2147 if (!cgroup) {
2148 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2149 if ((mode & W_OK) == 0)
2150 return 0;
2151 return -EACCES;
2152 }
2153
2154 get_cgdir_and_path(cgroup, &cgdir, &last);
2155 if (!last) {
2156 path1 = "/";
2157 path2 = cgdir;
2158 } else {
2159 path1 = cgdir;
2160 path2 = last;
2161 }
2162
2163 k = cgfs_get_key(controller, path1, path2);
2164 if (!k) {
2165 if ((mode & W_OK) == 0)
2166 ret = 0;
2167 else
2168 ret = -EACCES;
2169 goto out;
2170 }
2171 free_key(k);
2172
2173 pid_t initpid = lookup_initpid_in_store(fc->pid);
2174 if (initpid <= 0)
2175 initpid = fc->pid;
2176 if (!caller_may_see_dir(initpid, controller, path1)) {
2177 ret = -ENOENT;
2178 goto out;
2179 }
2180 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2181 ret = -EACCES;
2182 goto out;
2183 }
2184
2185 ret = 0;
2186
2187 out:
2188 free(cgdir);
2189 return ret;
2190 }
2191
2192 int cg_release(const char *path, struct fuse_file_info *fi)
2193 {
2194 do_release_file_info(fi);
2195 return 0;
2196 }
2197
2198 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2199
2200 static bool wait_for_sock(int sock, int timeout)
2201 {
2202 struct epoll_event ev;
2203 int epfd, ret, now, starttime, deltatime, saved_errno;
2204
2205 if ((starttime = time(NULL)) < 0)
2206 return false;
2207
2208 if ((epfd = epoll_create(1)) < 0) {
2209 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2210 return false;
2211 }
2212
2213 ev.events = POLLIN_SET;
2214 ev.data.fd = sock;
2215 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2216 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2217 close(epfd);
2218 return false;
2219 }
2220
2221 again:
2222 if ((now = time(NULL)) < 0) {
2223 close(epfd);
2224 return false;
2225 }
2226
2227 deltatime = (starttime + timeout) - now;
2228 if (deltatime < 0) { // timeout
2229 errno = 0;
2230 close(epfd);
2231 return false;
2232 }
2233 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2234 if (ret < 0 && errno == EINTR)
2235 goto again;
2236 saved_errno = errno;
2237 close(epfd);
2238
2239 if (ret <= 0) {
2240 errno = saved_errno;
2241 return false;
2242 }
2243 return true;
2244 }
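
/*
 * Worked example of the deadline handling above (values are made up):
 * with timeout = 2 and starttime = 100, a wakeup by EINTR at now = 101
 * recomputes deltatime = (100 + 2) - 101 = 1, so epoll_wait() is re-armed
 * for 1000*1 + 1 = 1001 ms; once now passes 102, deltatime goes negative
 * and the function reports a timeout.
 */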
2245
2246 static int msgrecv(int sockfd, void *buf, size_t len)
2247 {
2248 if (!wait_for_sock(sockfd, 2))
2249 return -1;
2250 return recv(sockfd, buf, len, MSG_DONTWAIT);
2251 }
2252
2253 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2254 {
2255 struct msghdr msg = { 0 };
2256 struct iovec iov;
2257 struct cmsghdr *cmsg;
2258 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2259 char buf[1];
2260 buf[0] = 'p';
2261
2262 if (pingfirst) {
2263 if (msgrecv(sock, buf, 1) != 1) {
2264 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2265 return SEND_CREDS_FAIL;
2266 }
2267 }
2268
2269 msg.msg_control = cmsgbuf;
2270 msg.msg_controllen = sizeof(cmsgbuf);
2271
2272 cmsg = CMSG_FIRSTHDR(&msg);
2273 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2274 cmsg->cmsg_level = SOL_SOCKET;
2275 cmsg->cmsg_type = SCM_CREDENTIALS;
2276 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2277
2278 msg.msg_name = NULL;
2279 msg.msg_namelen = 0;
2280
2281 buf[0] = v;
2282 iov.iov_base = buf;
2283 iov.iov_len = sizeof(buf);
2284 msg.msg_iov = &iov;
2285 msg.msg_iovlen = 1;
2286
2287 if (sendmsg(sock, &msg, 0) < 0) {
2288 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2289 if (errno == 3) /* ESRCH: the target task is gone */
2290 return SEND_CREDS_NOTSK;
2291 return SEND_CREDS_FAIL;
2292 }
2293
2294 return SEND_CREDS_OK;
2295 }
2296
2297 static bool recv_creds(int sock, struct ucred *cred, char *v)
2298 {
2299 struct msghdr msg = { 0 };
2300 struct iovec iov;
2301 struct cmsghdr *cmsg;
2302 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2303 char buf[1];
2304 int ret;
2305 int optval = 1;
2306
2307 *v = '1';
2308
2309 cred->pid = -1;
2310 cred->uid = -1;
2311 cred->gid = -1;
2312
2313 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2314 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2315 return false;
2316 }
2317 buf[0] = '1';
2318 if (write(sock, buf, 1) != 1) {
2319 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2320 return false;
2321 }
2322
2323 msg.msg_name = NULL;
2324 msg.msg_namelen = 0;
2325 msg.msg_control = cmsgbuf;
2326 msg.msg_controllen = sizeof(cmsgbuf);
2327
2328 iov.iov_base = buf;
2329 iov.iov_len = sizeof(buf);
2330 msg.msg_iov = &iov;
2331 msg.msg_iovlen = 1;
2332
2333 if (!wait_for_sock(sock, 2)) {
2334 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2335 return false;
2336 }
2337 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2338 if (ret < 0) {
2339 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2340 return false;
2341 }
2342
2343 cmsg = CMSG_FIRSTHDR(&msg);
2344
2345 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2346 cmsg->cmsg_level == SOL_SOCKET &&
2347 cmsg->cmsg_type == SCM_CREDENTIALS) {
2348 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2349 }
2350 *v = buf[0];
2351
2352 return true;
2353 }
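
#if 0
/*
 * Illustrative sketch of how the two helpers above pair up across a
 * socketpair; it mirrors the do_read_pids()/pid_to_ns() usage further
 * below, is not part of the build, and assumes the caller is privileged
 * enough to pass foreign credentials (as lxcfs itself is). The function
 * name and the pid are hypothetical.
 */
static void creds_roundtrip_example(void)
{
	int sv[2];
	struct ucred cred = { .pid = 1234, .uid = 0, .gid = 0 };

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
		return;

	if (fork() == 0) {
		/* Receiver side: recv_creds() first writes a ping byte so the
		 * sender knows it may transmit, then reads the pid translated
		 * by the kernel into this process's pid namespace.
		 */
		struct ucred seen;
		char tag;

		if (recv_creds(sv[1], &seen, &tag))
			printf("pid as seen here: %d (tag '%c')\n", seen.pid, tag);
		_exit(0);
	}

	/* Sender side: wait for the ping, then send pid 1234 with tag '0'. */
	if (send_creds(sv[0], &cred, '0', true) != SEND_CREDS_OK)
		lxcfs_error("%s\n", "credential round trip failed in example");
	wait(NULL);
}
#endif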
2354
2355 struct pid_ns_clone_args {
2356 int *cpipe;
2357 int sock;
2358 pid_t tpid;
2359 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2360 };
2361
2362 /*
2363 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2364 * with clone(). This simply writes '1' as ACK back to the parent
2365 * before calling the actual wrapped function.
2366 */
2367 static int pid_ns_clone_wrapper(void *arg) {
2368 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2369 char b = '1';
2370
2371 close(args->cpipe[0]);
2372 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2373 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2374 close(args->cpipe[1]);
2375 return args->wrapped(args->sock, args->tpid);
2376 }
2377
2378 /*
2379 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2380 * int value back over the socket. This shifts the pid from the
2381 * sender's pidns into tpid's pidns.
2382 */
2383 static int pid_to_ns(int sock, pid_t tpid)
2384 {
2385 char v = '0';
2386 struct ucred cred;
2387
2388 while (recv_creds(sock, &cred, &v)) {
2389 if (v == '1')
2390 return 0;
2391 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2392 return 1;
2393 }
2394 return 0;
2395 }
2396
2397
2398 /*
2399 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2400 * in your old pidns. Only children which you clone will be in the target
2401 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2402 * actually convert pids.
2403 *
2404 * Note: glibc's fork() does not respect pidns, which can lead to failed
2405 * assertions inside glibc (and thus failed forks) if the child's pid in
2406 * the pidns and the parent pid outside are identical. Using clone prevents
2407 * this issue.
2408 */
2409 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2410 {
2411 int newnsfd = -1, ret, cpipe[2];
2412 char fnam[100];
2413 pid_t cpid;
2414 char v;
2415
2416 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2417 if (ret < 0 || ret >= sizeof(fnam))
2418 _exit(1);
2419 newnsfd = open(fnam, O_RDONLY);
2420 if (newnsfd < 0)
2421 _exit(1);
2422 if (setns(newnsfd, 0) < 0)
2423 _exit(1);
2424 close(newnsfd);
2425
2426 if (pipe(cpipe) < 0)
2427 _exit(1);
2428
2429 struct pid_ns_clone_args args = {
2430 .cpipe = cpipe,
2431 .sock = sock,
2432 .tpid = tpid,
2433 .wrapped = &pid_to_ns
2434 };
2435 size_t stack_size = sysconf(_SC_PAGESIZE);
2436 void *stack = alloca(stack_size);
2437
2438 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2439 if (cpid < 0)
2440 _exit(1);
2441
2442 // give the child 1 second to be done forking and
2443 // write its ack
2444 if (!wait_for_sock(cpipe[0], 1))
2445 _exit(1);
2446 ret = read(cpipe[0], &v, 1);
2447 if (ret != sizeof(char) || v != '1')
2448 _exit(1);
2449
2450 if (!wait_for_pid(cpid))
2451 _exit(1);
2452 _exit(0);
2453 }
2454
2455 /*
2456 * To hand back tasks/cgroup.procs contents with pids valid in the reader's
2457 * pid namespace, we read the file on the host, fork a child which setns()s
2458 * into the target pidns, and translate each pid over a socketpair.
2459 */
2460 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2461 {
2462 int sock[2] = {-1, -1};
2463 char *tmpdata = NULL;
2464 int ret;
2465 pid_t qpid, cpid = -1;
2466 bool answer = false;
2467 char v = '0';
2468 struct ucred cred;
2469 size_t sz = 0, asz = 0;
2470
2471 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2472 return false;
2473
2474 /*
2475 * Now we read the pids from returned data one by one, pass
2476 * them into a child in the target namespace, read back the
2477 * translated pids, and put them into our to-return data
2478 */
2479
2480 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2481 perror("socketpair");
2482 free(tmpdata);
2483 return false;
2484 }
2485
2486 cpid = fork();
2487 if (cpid == -1)
2488 goto out;
2489
2490 if (!cpid) // child - exits when done
2491 pid_to_ns_wrapper(sock[1], tpid);
2492
2493 char *ptr = tmpdata;
2494 cred.uid = 0;
2495 cred.gid = 0;
2496 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2497 cred.pid = qpid;
2498 ret = send_creds(sock[0], &cred, v, true);
2499
2500 if (ret == SEND_CREDS_NOTSK)
2501 goto next;
2502 if (ret == SEND_CREDS_FAIL)
2503 goto out;
2504
2505 // read converted results
2506 if (!wait_for_sock(sock[0], 2)) {
2507 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2508 goto out;
2509 }
2510 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2511 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2512 goto out;
2513 }
2514 must_strcat_pid(d, &sz, &asz, qpid);
2515 next:
2516 ptr = strchr(ptr, '\n');
2517 if (!ptr)
2518 break;
2519 ptr++;
2520 }
2521
2522 cred.pid = getpid();
2523 v = '1';
2524 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2525 // failed to ask child to exit
2526 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2527 goto out;
2528 }
2529
2530 answer = true;
2531
2532 out:
2533 free(tmpdata);
2534 if (cpid != -1)
2535 wait_for_pid(cpid);
2536 if (sock[0] != -1) {
2537 close(sock[0]);
2538 close(sock[1]);
2539 }
2540 return answer;
2541 }
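
/*
 * Example of the effect (pids are hypothetical): the host-side "tasks"
 * file might contain
 *
 *   12345
 *   12346
 *
 * while the data handed back through @d reads
 *
 *   42
 *   43
 *
 * i.e. the same tasks, renumbered into the reader's pid namespace.
 */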
2542
2543 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2544 struct fuse_file_info *fi)
2545 {
2546 struct fuse_context *fc = fuse_get_context();
2547 struct file_info *f = (struct file_info *)fi->fh;
2548 struct cgfs_files *k = NULL;
2549 char *data = NULL;
2550 int ret, s;
2551 bool r;
2552
2553 if (f->type != LXC_TYPE_CGFILE) {
2554 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2555 return -EIO;
2556 }
2557
2558 if (offset)
2559 return 0;
2560
2561 if (!fc)
2562 return -EIO;
2563
2564 if (!f->controller)
2565 return -EINVAL;
2566
2567 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2568 return -EINVAL;
2569 }
2570 free_key(k);
2571
2572
2573 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2574 ret = -EACCES;
2575 goto out;
2576 }
2577
2578 if (strcmp(f->file, "tasks") == 0 ||
2579 strcmp(f->file, "/tasks") == 0 ||
2580 strcmp(f->file, "/cgroup.procs") == 0 ||
2581 strcmp(f->file, "cgroup.procs") == 0)
2582 // special case - we have to translate the pids
2583 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2584 else
2585 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2586
2587 if (!r) {
2588 ret = -EINVAL;
2589 goto out;
2590 }
2591
2592 if (!data) {
2593 ret = 0;
2594 goto out;
2595 }
2596 s = strlen(data);
2597 if (s > size)
2598 s = size;
2599 memcpy(buf, data, s);
2600 if (s > 0 && s < size && data[s-1] != '\n')
2601 buf[s++] = '\n';
2602
2603 ret = s;
2604
2605 out:
2606 free(data);
2607 return ret;
2608 }
2609
2610 static int pid_from_ns(int sock, pid_t tpid)
2611 {
2612 pid_t vpid;
2613 struct ucred cred;
2614 char v;
2615 int ret;
2616
2617 cred.uid = 0;
2618 cred.gid = 0;
2619 while (1) {
2620 if (!wait_for_sock(sock, 2)) {
2621 lxcfs_error("%s\n", "Timeout reading from parent.");
2622 return 1;
2623 }
2624 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2625 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2626 return 1;
2627 }
2628 if (vpid == -1) // done
2629 break;
2630 v = '0';
2631 cred.pid = vpid;
2632 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2633 v = '1';
2634 cred.pid = getpid();
2635 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2636 return 1;
2637 }
2638 }
2639 return 0;
2640 }
2641
2642 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2643 {
2644 int newnsfd = -1, ret, cpipe[2];
2645 char fnam[100];
2646 pid_t cpid;
2647 char v;
2648
2649 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2650 if (ret < 0 || ret >= sizeof(fnam))
2651 _exit(1);
2652 newnsfd = open(fnam, O_RDONLY);
2653 if (newnsfd < 0)
2654 _exit(1);
2655 if (setns(newnsfd, 0) < 0)
2656 _exit(1);
2657 close(newnsfd);
2658
2659 if (pipe(cpipe) < 0)
2660 _exit(1);
2661
2662 struct pid_ns_clone_args args = {
2663 .cpipe = cpipe,
2664 .sock = sock,
2665 .tpid = tpid,
2666 .wrapped = &pid_from_ns
2667 };
2668 size_t stack_size = sysconf(_SC_PAGESIZE);
2669 void *stack = alloca(stack_size);
2670
2671 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2672 if (cpid < 0)
2673 _exit(1);
2674
2675 // give the child 1 second to be done forking and
2676 // write its ack
2677 if (!wait_for_sock(cpipe[0], 1))
2678 _exit(1);
2679 ret = read(cpipe[0], &v, 1);
2680 if (ret != sizeof(char) || v != '1')
2681 _exit(1);
2682
2683 if (!wait_for_pid(cpid))
2684 _exit(1);
2685 _exit(0);
2686 }
2687
2688 /*
2689 * Given host @uid, store in @answer the uid to which it maps in
2690 * @pid's user namespace; returns false if there is no mapping.
2691 */
2692 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2693 {
2694 FILE *f;
2695 char line[400];
2696
2697 sprintf(line, "/proc/%d/uid_map", pid);
2698 if ((f = fopen(line, "r")) == NULL) {
2699 return false;
2700 }
2701
2702 *answer = convert_id_to_ns(f, uid);
2703 fclose(f);
2704
2705 if (*answer == -1)
2706 return false;
2707 return true;
2708 }
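
/*
 * For reference, /proc/<pid>/uid_map lines have the form
 * "<ns-start> <host-start> <range>". With the (hypothetical) typical
 * unprivileged-container mapping
 *
 *   0 100000 65536
 *
 * hostuid_to_ns() would report ns uid 1 for host uid 100001, and no
 * mapping at all for host uid 0.
 */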
2709
2710 /*
2711 * get_pid_creds: get the real uid and gid of @pid from
2712 * /proc/<pid>/status
2713 * (XXX should we use euid here?)
2714 */
2715 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2716 {
2717 char line[400];
2718 uid_t u;
2719 gid_t g;
2720 FILE *f;
2721
2722 *uid = -1;
2723 *gid = -1;
2724 sprintf(line, "/proc/%d/status", pid);
2725 if ((f = fopen(line, "r")) == NULL) {
2726 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2727 return;
2728 }
2729 while (fgets(line, 400, f)) {
2730 if (strncmp(line, "Uid:", 4) == 0) {
2731 if (sscanf(line+4, "%u", &u) != 1) {
2732 lxcfs_error("bad uid line for pid %u\n", pid);
2733 fclose(f);
2734 return;
2735 }
2736 *uid = u;
2737 } else if (strncmp(line, "Gid:", 4) == 0) {
2738 if (sscanf(line+4, "%u", &g) != 1) {
2739 lxcfs_error("bad gid line for pid %u\n", pid);
2740 fclose(f);
2741 return;
2742 }
2743 *gid = g;
2744 }
2745 }
2746 fclose(f);
2747 }
2748
2749 /*
2750 * May the requestor @r move victim @v to a new cgroup?
2751 * This is allowed if
2752 * . they are the same task
2753 * . they are owned by the same uid
2754 * . @r is root on the host, or
2755 * . @v's uid is mapped into @r's where @r is root.
2756 */
2757 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2758 {
2759 uid_t v_uid, tmpuid;
2760 gid_t v_gid;
2761
2762 if (r == v)
2763 return true;
2764 if (r_uid == 0)
2765 return true;
2766 get_pid_creds(v, &v_uid, &v_gid);
2767 if (r_uid == v_uid)
2768 return true;
2769 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2770 && hostuid_to_ns(v_uid, r, &tmpuid))
2771 return true;
2772 return false;
2773 }
2774
2775 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2776 const char *file, const char *buf)
2777 {
2778 int sock[2] = {-1, -1};
2779 pid_t qpid, cpid = -1;
2780 FILE *pids_file = NULL;
2781 bool answer = false, fail = false;
2782
2783 pids_file = open_pids_file(contrl, cg);
2784 if (!pids_file)
2785 return false;
2786
2787 /*
2788 * write the pids to a socket, have helper in writer's pidns
2789 * call movepid for us
2790 */
2791 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2792 perror("socketpair");
2793 goto out;
2794 }
2795
2796 cpid = fork();
2797 if (cpid == -1)
2798 goto out;
2799
2800 if (!cpid) { // child
2801 fclose(pids_file);
2802 pid_from_ns_wrapper(sock[1], tpid);
2803 }
2804
2805 const char *ptr = buf;
2806 while (sscanf(ptr, "%d", &qpid) == 1) {
2807 struct ucred cred;
2808 char v;
2809
2810 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2811 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2812 goto out;
2813 }
2814
2815 if (recv_creds(sock[0], &cred, &v)) {
2816 if (v == '0') {
2817 if (!may_move_pid(tpid, tuid, cred.pid)) {
2818 fail = true;
2819 break;
2820 }
2821 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2822 fail = true;
2823 }
2824 }
2825
2826 ptr = strchr(ptr, '\n');
2827 if (!ptr)
2828 break;
2829 ptr++;
2830 }
2831
2832 /* Done with all pids; ask the child helper to exit */
2833 qpid = -1;
2834 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2835 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2836
2837 if (!fail)
2838 answer = true;
2839
2840 out:
2841 if (cpid != -1)
2842 wait_for_pid(cpid);
2843 if (sock[0] != -1) {
2844 close(sock[0]);
2845 close(sock[1]);
2846 }
2847 if (pids_file) {
2848 if (fclose(pids_file) != 0)
2849 answer = false;
2850 }
2851 return answer;
2852 }
2853
2854 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2855 struct fuse_file_info *fi)
2856 {
2857 struct fuse_context *fc = fuse_get_context();
2858 char *localbuf = NULL;
2859 struct cgfs_files *k = NULL;
2860 struct file_info *f = (struct file_info *)fi->fh;
2861 bool r;
2862
2863 if (f->type != LXC_TYPE_CGFILE) {
2864 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2865 return -EIO;
2866 }
2867
2868 if (offset)
2869 return 0;
2870
2871 if (!fc)
2872 return -EIO;
2873
2874 localbuf = alloca(size+1);
2875 localbuf[size] = '\0';
2876 memcpy(localbuf, buf, size);
2877
2878 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2879 size = -EINVAL;
2880 goto out;
2881 }
2882
2883 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2884 size = -EACCES;
2885 goto out;
2886 }
2887
2888 if (strcmp(f->file, "tasks") == 0 ||
2889 strcmp(f->file, "/tasks") == 0 ||
2890 strcmp(f->file, "/cgroup.procs") == 0 ||
2891 strcmp(f->file, "cgroup.procs") == 0)
2892 // special case - we have to translate the pids
2893 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2894 else
2895 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2896
2897 if (!r)
2898 size = -EINVAL;
2899
2900 out:
2901 free_key(k);
2902 return size;
2903 }
2904
2905 int cg_chown(const char *path, uid_t uid, gid_t gid)
2906 {
2907 struct fuse_context *fc = fuse_get_context();
2908 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2909 struct cgfs_files *k = NULL;
2910 const char *cgroup;
2911 int ret;
2912
2913 if (!fc)
2914 return -EIO;
2915
2916 if (strcmp(path, "/cgroup") == 0)
2917 return -EPERM;
2918
2919 controller = pick_controller_from_path(fc, path);
2920 if (!controller)
2921 return errno == ENOENT ? -EPERM : -errno;
2922
2923 cgroup = find_cgroup_in_path(path);
2924 if (!cgroup)
2925 /* this is just /cgroup/controller */
2926 return -EPERM;
2927
2928 get_cgdir_and_path(cgroup, &cgdir, &last);
2929
2930 if (!last) {
2931 path1 = "/";
2932 path2 = cgdir;
2933 } else {
2934 path1 = cgdir;
2935 path2 = last;
2936 }
2937
2938 if (is_child_cgroup(controller, path1, path2)) {
2939 // get uid, gid from the 'tasks' file and make up a mode
2940 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2941 k = cgfs_get_key(controller, cgroup, "tasks");
2942
2943 } else
2944 k = cgfs_get_key(controller, path1, path2);
2945
2946 if (!k) {
2947 ret = -EINVAL;
2948 goto out;
2949 }
2950
2951 /*
2952 * This being a fuse request, the uid and gid must be valid
2953 * in the caller's namespace. So we can just check to make
2954 * sure that the caller is root in his uid, and privileged
2955 * over the file's current owner.
2956 */
2957 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2958 ret = -EACCES;
2959 goto out;
2960 }
2961
2962 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2963
2964 out:
2965 free_key(k);
2966 free(cgdir);
2967
2968 return ret;
2969 }
2970
2971 int cg_chmod(const char *path, mode_t mode)
2972 {
2973 struct fuse_context *fc = fuse_get_context();
2974 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2975 struct cgfs_files *k = NULL;
2976 const char *cgroup;
2977 int ret;
2978
2979 if (!fc)
2980 return -EIO;
2981
2982 if (strcmp(path, "/cgroup") == 0)
2983 return -EPERM;
2984
2985 controller = pick_controller_from_path(fc, path);
2986 if (!controller)
2987 return errno == ENOENT ? -EPERM : -errno;
2988
2989 cgroup = find_cgroup_in_path(path);
2990 if (!cgroup)
2991 /* this is just /cgroup/controller */
2992 return -EPERM;
2993
2994 get_cgdir_and_path(cgroup, &cgdir, &last);
2995
2996 if (!last) {
2997 path1 = "/";
2998 path2 = cgdir;
2999 } else {
3000 path1 = cgdir;
3001 path2 = last;
3002 }
3003
3004 if (is_child_cgroup(controller, path1, path2)) {
3005 // get uid, gid from the 'tasks' file and make up a mode
3006 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3007 k = cgfs_get_key(controller, cgroup, "tasks");
3008
3009 } else
3010 k = cgfs_get_key(controller, path1, path2);
3011
3012 if (!k) {
3013 ret = -EINVAL;
3014 goto out;
3015 }
3016
3017 /*
3018 * This being a fuse request, the uid and gid must be valid
3019 * in the caller's namespace. So we can just check to make
3020 * sure that the caller is root in his uid, and privileged
3021 * over the file's current owner.
3022 */
3023 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3024 ret = -EPERM;
3025 goto out;
3026 }
3027
3028 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3029 ret = -EINVAL;
3030 goto out;
3031 }
3032
3033 ret = 0;
3034 out:
3035 free_key(k);
3036 free(cgdir);
3037 return ret;
3038 }
3039
3040 int cg_mkdir(const char *path, mode_t mode)
3041 {
3042 struct fuse_context *fc = fuse_get_context();
3043 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3044 const char *cgroup;
3045 int ret;
3046
3047 if (!fc)
3048 return -EIO;
3049
3050 controller = pick_controller_from_path(fc, path);
3051 if (!controller)
3052 return errno == ENOENT ? -EPERM : -errno;
3053
3054 cgroup = find_cgroup_in_path(path);
3055 if (!cgroup)
3056 return -errno;
3057
3058 get_cgdir_and_path(cgroup, &cgdir, &last);
3059 if (!last)
3060 path1 = "/";
3061 else
3062 path1 = cgdir;
3063
3064 pid_t initpid = lookup_initpid_in_store(fc->pid);
3065 if (initpid <= 0)
3066 initpid = fc->pid;
3067 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3068 if (!next)
3069 ret = -EINVAL;
3070 else if (last && strcmp(next, last) == 0)
3071 ret = -EEXIST;
3072 else
3073 ret = -EPERM;
3074 goto out;
3075 }
3076
3077 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3078 ret = -EACCES;
3079 goto out;
3080 }
3081 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3082 ret = -EACCES;
3083 goto out;
3084 }
3085
3086 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3087
3088 out:
3089 free(cgdir);
3090 free(next);
3091 return ret;
3092 }
3093
3094 int cg_rmdir(const char *path)
3095 {
3096 struct fuse_context *fc = fuse_get_context();
3097 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3098 const char *cgroup;
3099 int ret;
3100
3101 if (!fc)
3102 return -EIO;
3103
3104 controller = pick_controller_from_path(fc, path);
3105 if (!controller) /* Someone's trying to delete "/cgroup". */
3106 return -EPERM;
3107
3108 cgroup = find_cgroup_in_path(path);
3109 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3110 return -EPERM;
3111
3112 get_cgdir_and_path(cgroup, &cgdir, &last);
3113 if (!last) {
3114 /* Someone's trying to delete a cgroup on the same level as the
3115 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3116 * rmdir "/cgroup/blkio/init.slice".
3117 */
3118 ret = -EPERM;
3119 goto out;
3120 }
3121
3122 pid_t initpid = lookup_initpid_in_store(fc->pid);
3123 if (initpid <= 0)
3124 initpid = fc->pid;
3125 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3126 if (!last || (next && (strcmp(next, last) == 0)))
3127 ret = -EBUSY;
3128 else
3129 ret = -ENOENT;
3130 goto out;
3131 }
3132
3133 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3134 ret = -EACCES;
3135 goto out;
3136 }
3137 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3138 ret = -EACCES;
3139 goto out;
3140 }
3141
3142 if (!cgfs_remove(controller, cgroup)) {
3143 ret = -EINVAL;
3144 goto out;
3145 }
3146
3147 ret = 0;
3148
3149 out:
3150 free(cgdir);
3151 free(next);
3152 return ret;
3153 }
3154
3155 static bool startswith(const char *line, const char *pref)
3156 {
3157 if (strncmp(line, pref, strlen(pref)) == 0)
3158 return true;
3159 return false;
3160 }
3161
3162 static void parse_memstat(char *memstat, unsigned long *cached,
3163 unsigned long *active_anon, unsigned long *inactive_anon,
3164 unsigned long *active_file, unsigned long *inactive_file,
3165 unsigned long *unevictable, unsigned long *shmem)
3166 {
3167 char *eol;
3168
3169 while (*memstat) {
3170 if (startswith(memstat, "total_cache")) {
3171 sscanf(memstat + 11, "%lu", cached);
3172 *cached /= 1024;
3173 } else if (startswith(memstat, "total_active_anon")) {
3174 sscanf(memstat + 17, "%lu", active_anon);
3175 *active_anon /= 1024;
3176 } else if (startswith(memstat, "total_inactive_anon")) {
3177 sscanf(memstat + 19, "%lu", inactive_anon);
3178 *inactive_anon /= 1024;
3179 } else if (startswith(memstat, "total_active_file")) {
3180 sscanf(memstat + 17, "%lu", active_file);
3181 *active_file /= 1024;
3182 } else if (startswith(memstat, "total_inactive_file")) {
3183 sscanf(memstat + 19, "%lu", inactive_file);
3184 *inactive_file /= 1024;
3185 } else if (startswith(memstat, "total_unevictable")) {
3186 sscanf(memstat + 17, "%lu", unevictable);
3187 *unevictable /= 1024;
3188 } else if (startswith(memstat, "total_shmem")) {
3189 sscanf(memstat + 11, "%lu", shmem);
3190 *shmem /= 1024;
3191 }
3192 eol = strchr(memstat, '\n');
3193 if (!eol)
3194 return;
3195 memstat = eol+1;
3196 }
3197 }
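
/*
 * parse_memstat() expects the cgroup v1 memory.stat format, whose
 * "total_*" keys are hierarchical byte counters, e.g. (illustrative
 * values):
 *
 *   total_cache 1048576
 *   total_shmem 0
 *   total_active_anon 4096
 *   total_inactive_file 8192
 *
 * Each matched value is divided by 1024 so callers can report kB.
 */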
3198
3199 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3200 {
3201 char *eol;
3202 char key[32];
3203
3204 memset(key, 0, 32);
3205 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3206
3207 size_t len = strlen(key);
3208 *v = 0;
3209
3210 while (*str) {
3211 if (startswith(str, key)) {
3212 sscanf(str + len, "%lu", v);
3213 return;
3214 }
3215 eol = strchr(str, '\n');
3216 if (!eol)
3217 return;
3218 str = eol+1;
3219 }
3220 }
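
/*
 * The blkio.*_recursive files consist of "major:minor Op value" lines plus
 * a trailing "Total" line, e.g. (illustrative values):
 *
 *   8:0 Read 4096
 *   8:0 Write 12288
 *   Total 16384
 *
 * so get_blkio_io_value(str, 8, 0, "Write", &v) would set v to 12288.
 */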
3221
3222 static int read_file(const char *path, char *buf, size_t size,
3223 struct file_info *d)
3224 {
3225 size_t linelen = 0, total_len = 0, rv = 0;
3226 char *line = NULL;
3227 char *cache = d->buf;
3228 size_t cache_size = d->buflen;
3229 FILE *f = fopen(path, "r");
3230 if (!f)
3231 return 0;
3232
3233 while (getline(&line, &linelen, f) != -1) {
3234 ssize_t l = snprintf(cache, cache_size, "%s", line);
3235 if (l < 0) {
3236 perror("Error writing to cache");
3237 rv = 0;
3238 goto err;
3239 }
3240 if (l >= cache_size) {
3241 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3242 rv = 0;
3243 goto err;
3244 }
3245 cache += l;
3246 cache_size -= l;
3247 total_len += l;
3248 }
3249
3250 d->size = total_len;
3251 if (total_len > size)
3252 total_len = size;
3253
3254 /* read from off 0 */
3255 memcpy(buf, d->buf, total_len);
3256 rv = total_len;
3257 err:
3258 fclose(f);
3259 free(line);
3260 return rv;
3261 }
3262
3263 /*
3264 * FUSE ops for /proc
3265 */
3266
3267 static unsigned long get_memlimit(const char *cgroup, const char *file)
3268 {
3269 char *memlimit_str = NULL;
3270 unsigned long memlimit = -1;
3271
3272 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3273 memlimit = strtoul(memlimit_str, NULL, 10);
3274
3275 free(memlimit_str);
3276
3277 return memlimit;
3278 }
3279
3280 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3281 {
3282 char *copy = strdupa(cgroup);
3283 unsigned long memlimit = 0, retlimit;
3284
3285 retlimit = get_memlimit(copy, file);
3286
3287 while (strcmp(copy, "/") != 0) {
3288 copy = dirname(copy);
3289 memlimit = get_memlimit(copy, file);
3290 if (memlimit != -1 && memlimit < retlimit)
3291 retlimit = memlimit;
3292 };
3293
3294 return retlimit;
3295 }
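
/*
 * Example (hypothetical limits): for cgroup "/lxc/c1" with
 * memory.limit_in_bytes set to 1073741824 and an effectively unlimited
 * parent "/lxc", get_min_memlimit() walks "/lxc/c1" -> "/lxc" -> "/" and
 * returns 1073741824, i.e. the tightest limit on the path to the root.
 */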
3296
3297 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3298 struct fuse_file_info *fi)
3299 {
3300 struct fuse_context *fc = fuse_get_context();
3301 struct file_info *d = (struct file_info *)fi->fh;
3302 char *cg;
3303 char *memusage_str = NULL, *memstat_str = NULL,
3304 *memswlimit_str = NULL, *memswusage_str = NULL;
3305 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3306 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3307 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3308 hostswtotal = 0;
3309 char *line = NULL;
3310 size_t linelen = 0, total_len = 0, rv = 0;
3311 char *cache = d->buf;
3312 size_t cache_size = d->buflen;
3313 FILE *f = NULL;
3314
3315 if (offset){
3316 if (offset > d->size)
3317 return -EINVAL;
3318 if (!d->cached)
3319 return 0;
3320 int left = d->size - offset;
3321 total_len = left > size ? size: left;
3322 memcpy(buf, cache + offset, total_len);
3323 return total_len;
3324 }
3325
3326 pid_t initpid = lookup_initpid_in_store(fc->pid);
3327 if (initpid <= 0)
3328 initpid = fc->pid;
3329 cg = get_pid_cgroup(initpid, "memory");
3330 if (!cg)
3331 return read_file("/proc/meminfo", buf, size, d);
3332 prune_init_slice(cg);
3333
3334 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3335 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3336 goto err;
3337 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3338 goto err;
3339
3340 // The following values are allowed to fail, because swapaccount might be
3341 // turned off for the current kernel
3342 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3343 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3344 {
3345 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3346 memswusage = strtoul(memswusage_str, NULL, 10);
3347
3348 memswlimit = memswlimit / 1024;
3349 memswusage = memswusage / 1024;
3350 }
3351
3352 memusage = strtoul(memusage_str, NULL, 10);
3353 memlimit /= 1024;
3354 memusage /= 1024;
3355
3356 parse_memstat(memstat_str, &cached, &active_anon,
3357 &inactive_anon, &active_file, &inactive_file,
3358 &unevictable, &shmem);
3359
3360 f = fopen("/proc/meminfo", "r");
3361 if (!f)
3362 goto err;
3363
3364 while (getline(&line, &linelen, f) != -1) {
3365 ssize_t l;
3366 char *printme, lbuf[100];
3367
3368 memset(lbuf, 0, 100);
3369 if (startswith(line, "MemTotal:")) {
3370 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3371 if (hosttotal < memlimit)
3372 memlimit = hosttotal;
3373 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3374 printme = lbuf;
3375 } else if (startswith(line, "MemFree:")) {
3376 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3377 printme = lbuf;
3378 } else if (startswith(line, "MemAvailable:")) {
3379 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3380 printme = lbuf;
3381 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3382 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3383 if (hostswtotal < memswlimit)
3384 memswlimit = hostswtotal;
3385 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3386 printme = lbuf;
3387 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3388 unsigned long swaptotal = memswlimit,
3389 swapusage = memswusage - memusage,
3390 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3391 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3392 printme = lbuf;
3393 } else if (startswith(line, "Slab:")) {
3394 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3395 printme = lbuf;
3396 } else if (startswith(line, "Buffers:")) {
3397 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3398 printme = lbuf;
3399 } else if (startswith(line, "Cached:")) {
3400 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3401 printme = lbuf;
3402 } else if (startswith(line, "SwapCached:")) {
3403 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3404 printme = lbuf;
3405 } else if (startswith(line, "Active:")) {
3406 snprintf(lbuf, 100, "Active: %8lu kB\n",
3407 active_anon + active_file);
3408 printme = lbuf;
3409 } else if (startswith(line, "Inactive:")) {
3410 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3411 inactive_anon + inactive_file);
3412 printme = lbuf;
3413 } else if (startswith(line, "Active(anon)")) {
3414 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3415 printme = lbuf;
3416 } else if (startswith(line, "Inactive(anon)")) {
3417 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3418 printme = lbuf;
3419 } else if (startswith(line, "Active(file)")) {
3420 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3421 printme = lbuf;
3422 } else if (startswith(line, "Inactive(file)")) {
3423 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3424 printme = lbuf;
3425 } else if (startswith(line, "Unevictable")) {
3426 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3427 printme = lbuf;
3428 } else if (startswith(line, "SReclaimable")) {
3429 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3430 printme = lbuf;
3431 } else if (startswith(line, "SUnreclaim")) {
3432 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3433 printme = lbuf;
3434 } else if (startswith(line, "Shmem:")) {
3435 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3436 printme = lbuf;
3437 } else if (startswith(line, "ShmemHugePages")) {
3438 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3439 printme = lbuf;
3440 } else if (startswith(line, "ShmemPmdMapped")) {
3441 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3442 printme = lbuf;
3443 } else
3444 printme = line;
3445
3446 l = snprintf(cache, cache_size, "%s", printme);
3447 if (l < 0) {
3448 perror("Error writing to cache");
3449 rv = 0;
3450 goto err;
3451
3452 }
3453 if (l >= cache_size) {
3454 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3455 rv = 0;
3456 goto err;
3457 }
3458
3459 cache += l;
3460 cache_size -= l;
3461 total_len += l;
3462 }
3463
3464 d->cached = 1;
3465 d->size = total_len;
3466 if (total_len > size ) total_len = size;
3467 memcpy(buf, d->buf, total_len);
3468
3469 rv = total_len;
3470 err:
3471 if (f)
3472 fclose(f);
3473 free(line);
3474 free(cg);
3475 free(memusage_str);
3476 free(memswlimit_str);
3477 free(memswusage_str);
3478 free(memstat_str);
3479 return rv;
3480 }
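
/*
 * Example of the rewriting above (hypothetical numbers): on a host showing
 * "MemTotal: 16384000 kB", a container whose memory cgroup is limited to
 * 1 GiB and currently uses 256 MiB would instead read
 *
 *   MemTotal:        1048576 kB
 *   MemFree:          786432 kB
 *
 * while Slab, Buffers, SwapCached, ShmemHugePages and ShmemPmdMapped are
 * pinned to 0, since the memory cgroup provides no per-container values
 * for them.
 */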
3481
3482 /*
3483 * Read the cpuset.cpus for cg
3484 * Return the answer in a newly allocated string which must be freed
3485 */
3486 static char *get_cpuset(const char *cg)
3487 {
3488 char *answer;
3489
3490 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3491 return NULL;
3492 return answer;
3493 }
3494
3495 bool cpu_in_cpuset(int cpu, const char *cpuset);
3496
3497 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3498 {
3499 int cpu;
3500
3501 if (sscanf(line, "processor : %d", &cpu) != 1)
3502 return false;
3503 return cpu_in_cpuset(cpu, cpuset);
3504 }
3505
3506 /*
3507 * check whether this is a '^processor' line in /proc/cpuinfo
3508 */
3509 static bool is_processor_line(const char *line)
3510 {
3511 int cpu;
3512
3513 if (sscanf(line, "processor : %d", &cpu) == 1)
3514 return true;
3515 return false;
3516 }
3517
3518 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3519 struct fuse_file_info *fi)
3520 {
3521 struct fuse_context *fc = fuse_get_context();
3522 struct file_info *d = (struct file_info *)fi->fh;
3523 char *cg;
3524 char *cpuset = NULL;
3525 char *line = NULL;
3526 size_t linelen = 0, total_len = 0, rv = 0;
3527 bool am_printing = false, firstline = true, is_s390x = false;
3528 int curcpu = -1, cpu;
3529 char *cache = d->buf;
3530 size_t cache_size = d->buflen;
3531 FILE *f = NULL;
3532
3533 if (offset){
3534 if (offset > d->size)
3535 return -EINVAL;
3536 if (!d->cached)
3537 return 0;
3538 int left = d->size - offset;
3539 total_len = left > size ? size: left;
3540 memcpy(buf, cache + offset, total_len);
3541 return total_len;
3542 }
3543
3544 pid_t initpid = lookup_initpid_in_store(fc->pid);
3545 if (initpid <= 0)
3546 initpid = fc->pid;
3547 cg = get_pid_cgroup(initpid, "cpuset");
3548 if (!cg)
3549 return read_file("/proc/cpuinfo", buf, size, d);
3550 prune_init_slice(cg);
3551
3552 cpuset = get_cpuset(cg);
3553 if (!cpuset)
3554 goto err;
3555
3556 f = fopen("/proc/cpuinfo", "r");
3557 if (!f)
3558 goto err;
3559
3560 while (getline(&line, &linelen, f) != -1) {
3561 ssize_t l;
3562 if (firstline) {
3563 firstline = false;
3564 if (strstr(line, "IBM/S390") != NULL) {
3565 is_s390x = true;
3566 am_printing = true;
3567 continue;
3568 }
3569 }
3570 if (strncmp(line, "# processors:", 12) == 0)
3571 continue;
3572 if (is_processor_line(line)) {
3573 am_printing = cpuline_in_cpuset(line, cpuset);
3574 if (am_printing) {
3575 curcpu ++;
3576 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3577 if (l < 0) {
3578 perror("Error writing to cache");
3579 rv = 0;
3580 goto err;
3581 }
3582 if (l >= cache_size) {
3583 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3584 rv = 0;
3585 goto err;
3586 }
3587 cache += l;
3588 cache_size -= l;
3589 total_len += l;
3590 }
3591 continue;
3592 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3593 char *p;
3594 if (!cpu_in_cpuset(cpu, cpuset))
3595 continue;
3596 curcpu ++;
3597 p = strchr(line, ':');
3598 if (!p || !*p)
3599 goto err;
3600 p++;
3601 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3602 if (l < 0) {
3603 perror("Error writing to cache");
3604 rv = 0;
3605 goto err;
3606 }
3607 if (l >= cache_size) {
3608 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3609 rv = 0;
3610 goto err;
3611 }
3612 cache += l;
3613 cache_size -= l;
3614 total_len += l;
3615 continue;
3616
3617 }
3618 if (am_printing) {
3619 l = snprintf(cache, cache_size, "%s", line);
3620 if (l < 0) {
3621 perror("Error writing to cache");
3622 rv = 0;
3623 goto err;
3624 }
3625 if (l >= cache_size) {
3626 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3627 rv = 0;
3628 goto err;
3629 }
3630 cache += l;
3631 cache_size -= l;
3632 total_len += l;
3633 }
3634 }
3635
3636 if (is_s390x) {
3637 char *origcache = d->buf;
3638 ssize_t l;
3639 do {
3640 d->buf = malloc(d->buflen);
3641 } while (!d->buf);
3642 cache = d->buf;
3643 cache_size = d->buflen;
3644 total_len = 0;
3645 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3646 if (l < 0 || l >= cache_size) {
3647 free(origcache);
3648 goto err;
3649 }
3650 cache_size -= l;
3651 cache += l;
3652 total_len += l;
3653 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3654 if (l < 0 || l >= cache_size) {
3655 free(origcache);
3656 goto err;
3657 }
3658 cache_size -= l;
3659 cache += l;
3660 total_len += l;
3661 l = snprintf(cache, cache_size, "%s", origcache);
3662 free(origcache);
3663 if (l < 0 || l >= cache_size)
3664 goto err;
3665 total_len += l;
3666 }
3667
3668 d->cached = 1;
3669 d->size = total_len;
3670 if (total_len > size ) total_len = size;
3671
3672 /* read from off 0 */
3673 memcpy(buf, d->buf, total_len);
3674 rv = total_len;
3675 err:
3676 if (f)
3677 fclose(f);
3678 free(line);
3679 free(cpuset);
3680 free(cg);
3681 return rv;
3682 }
3683
3684 static uint64_t get_reaper_start_time(pid_t pid)
3685 {
3686 int ret;
3687 FILE *f;
3688 uint64_t starttime;
3689 /* strlen("/proc/") = 6
3690 * +
3691 * LXCFS_NUMSTRLEN64
3692 * +
3693 * strlen("/stat") = 5
3694 * +
3695 * \0 = 1
3696 * */
3697 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3698 char path[__PROC_PID_STAT_LEN];
3699 pid_t qpid;
3700
3701 qpid = lookup_initpid_in_store(pid);
3702 if (qpid <= 0) {
3703 /* Caller can check for EINVAL on 0. */
3704 errno = EINVAL;
3705 return 0;
3706 }
3707
3708 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3709 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3710 /* Caller can check for EINVAL on 0. */
3711 errno = EINVAL;
3712 return 0;
3713 }
3714
3715 f = fopen(path, "r");
3716 if (!f) {
3717 /* Caller can check for EINVAL on 0. */
3718 errno = EINVAL;
3719 return 0;
3720 }
3721
3722 /* Note that the *scanf() argument suppression requires that length
3723 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3724 * at us. It's like telling someone you're not married and then asking
3725 * if you can bring your wife to the party.
3726 */
3727 ret = fscanf(f, "%*d " /* (1) pid %d */
3728 "%*s " /* (2) comm %s */
3729 "%*c " /* (3) state %c */
3730 "%*d " /* (4) ppid %d */
3731 "%*d " /* (5) pgrp %d */
3732 "%*d " /* (6) session %d */
3733 "%*d " /* (7) tty_nr %d */
3734 "%*d " /* (8) tpgid %d */
3735 "%*u " /* (9) flags %u */
3736 "%*u " /* (10) minflt %lu */
3737 "%*u " /* (11) cminflt %lu */
3738 "%*u " /* (12) majflt %lu */
3739 "%*u " /* (13) cmajflt %lu */
3740 "%*u " /* (14) utime %lu */
3741 "%*u " /* (15) stime %lu */
3742 "%*d " /* (16) cutime %ld */
3743 "%*d " /* (17) cstime %ld */
3744 "%*d " /* (18) priority %ld */
3745 "%*d " /* (19) nice %ld */
3746 "%*d " /* (20) num_threads %ld */
3747 "%*d " /* (21) itrealvalue %ld */
3748 "%" PRIu64, /* (22) starttime %llu */
3749 &starttime);
3750 if (ret != 1) {
3751 fclose(f);
3752 /* Caller can check for EINVAL on 0. */
3753 errno = EINVAL;
3754 return 0;
3755 }
3756
3757 fclose(f);
3758
3759 errno = 0;
3760 return starttime;
3761 }
3762
3763 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3764 {
3765 uint64_t clockticks;
3766 int64_t ticks_per_sec;
3767
3768 clockticks = get_reaper_start_time(pid);
3769 if (clockticks == 0 && errno == EINVAL) {
3770 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3771 return 0;
3772 }
3773
3774 ticks_per_sec = sysconf(_SC_CLK_TCK);
3775 if (ticks_per_sec < 0 && errno == EINVAL) {
3776 lxcfs_debug(
3777 "%s\n",
3778 "failed to determine number of clock ticks in a second");
3779 return 0;
3780 }
3781
3782 return clockticks / ticks_per_sec;
3783 }
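
/*
 * Worked example (hypothetical values): with _SC_CLK_TCK = 100 and a
 * starttime field of 250000 clock ticks in /proc/<pid>/stat, the reaper
 * started 250000 / 100 = 2500 seconds after boot.
 */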
3784
3785 static uint64_t get_reaper_age(pid_t pid)
3786 {
3787 uint64_t procstart, uptime, procage;
3788
3789 /* To get the actual reaper age we subtract the time at which the reaper
3790 * started (measured from system boot) from the current time since
3791 * boot.
3792 */
3793 procstart = get_reaper_start_time_in_sec(pid);
3794 procage = procstart;
3795 if (procstart > 0) {
3796 int ret;
3797 struct timespec spec;
3798
3799 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3800 if (ret < 0)
3801 return 0;
3802 /* We could make this more precise here by using the tv_nsec
3803 * field in the timespec struct and convert it to milliseconds
3804 * and then create a double for the seconds and milliseconds but
3805 * that seems more work than it is worth.
3806 */
3807 uptime = spec.tv_sec;
3808 procage = uptime - procstart;
3809 }
3810
3811 return procage;
3812 }
3813
3814 /*
3815 * Returns 0 on success.
3816 * It is the caller's responsibility to free `return_usage`, unless this
3817 * function returns an error.
3818 */
3819 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
3820 {
3821 int cpucount = get_nprocs();
3822 struct cpuacct_usage *cpu_usage;
3823 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
3824 int cg_cpu;
3825 uint64_t cg_user, cg_system;
3826 int64_t ticks_per_sec;
3827 char *usage_str = NULL;
3828
3829 ticks_per_sec = sysconf(_SC_CLK_TCK);
3830
3831 if (ticks_per_sec < 0 && errno == EINVAL) {
3832 lxcfs_debug(
3833 "%s\n",
3834 "read_cpuacct_usage_all failed to determine number of clock ticks "
3835 "in a second");
3836 return -1;
3837 }
3838
3839 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3840 if (!cpu_usage)
3841 return -ENOMEM;
3842
3843 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3844 rv = -1;
3845 goto err;
3846 }
3847
3848 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3849 lxcfs_error("read_cpuacct_usage_all reading first line from "
3850 "%s/cpuacct.usage_all failed.\n", cg);
3851 rv = -1;
3852 goto err;
3853 }
3854
3855 read_pos += read_cnt;
3856
3857 for (i = 0, j = 0; i < cpucount; i++) {
3858 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3859 &cg_system, &read_cnt);
3860
3861 if (ret == EOF)
3862 break;
3863
3864 if (ret != 3) {
3865 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3866 "failed.\n", cg);
3867 rv = -1;
3868 goto err;
3869 }
3870
3871 read_pos += read_cnt;
3872
3873 if (!cpu_in_cpuset(i, cpuset))
3874 continue;
3875
3876 /* Convert the time from nanoseconds to USER_HZ */
3877 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3878 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3879 j++;
3880 }
3881
3882 rv = 0;
3883 *return_usage = cpu_usage;
3884
3885 err:
3886 if (usage_str)
3887 free(usage_str);
3888
3889 if (rv != 0) {
3890 free(cpu_usage);
3891 *return_usage = NULL;
3892 }
3893
3894 return rv;
3895 }
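
/*
 * cpuacct.usage_all is expected to look like (illustrative values, times
 * in nanoseconds):
 *
 *   cpu user system
 *   0 2500000000 1000000000
 *   1 0 0
 *
 * With _SC_CLK_TCK = 100, cpu 0 above becomes user = 250 and system = 100
 * ticks in the returned cpuacct_usage array.
 */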
3896
3897 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3898 static int proc_stat_read(char *buf, size_t size, off_t offset,
3899 struct fuse_file_info *fi)
3900 {
3901 struct fuse_context *fc = fuse_get_context();
3902 struct file_info *d = (struct file_info *)fi->fh;
3903 char *cg;
3904 char *cpuset = NULL;
3905 char *line = NULL;
3906 size_t linelen = 0, total_len = 0, rv = 0;
3907 int curcpu = -1; /* cpu numbering starts at 0 */
3908 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3909 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3910 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3911 char cpuall[CPUALL_MAX_SIZE];
3912 /* reserve for cpu all */
3913 char *cache = d->buf + CPUALL_MAX_SIZE;
3914 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3915 FILE *f = NULL;
3916 struct cpuacct_usage *cg_cpu_usage = NULL;
3917
3918 if (offset){
3919 if (offset > d->size)
3920 return -EINVAL;
3921 if (!d->cached)
3922 return 0;
3923 int left = d->size - offset;
3924 total_len = left > size ? size: left;
3925 memcpy(buf, d->buf + offset, total_len);
3926 return total_len;
3927 }
3928
3929 pid_t initpid = lookup_initpid_in_store(fc->pid);
3930 if (initpid <= 0)
3931 initpid = fc->pid;
3932 cg = get_pid_cgroup(initpid, "cpuset");
3933 if (!cg)
3934 return read_file("/proc/stat", buf, size, d);
3935 prune_init_slice(cg);
3936
3937 cpuset = get_cpuset(cg);
3938 if (!cpuset)
3939 goto err;
3940
3941 /*
3942 * Read cpuacct.usage_all for all CPUs.
3943 * If the cpuacct cgroup is present, it is used to calculate the container's
3944 * CPU usage. If not, values from the host's /proc/stat are used.
3945 */
3946 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
3947 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
3948 "falling back to the host's /proc/stat");
3949 }
3950
3951 f = fopen("/proc/stat", "r");
3952 if (!f)
3953 goto err;
3954
3955 //skip first line
3956 if (getline(&line, &linelen, f) < 0) {
3957 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3958 goto err;
3959 }
3960
3961 while (getline(&line, &linelen, f) != -1) {
3962 ssize_t l;
3963 int cpu;
3964 char cpu_char[10]; /* That's a lot of cores */
3965 char *c;
3966 uint64_t all_used, cg_used, new_idle;
3967 int ret;
3968
3969 if (strlen(line) == 0)
3970 continue;
3971 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3972 /* not a ^cpuN line containing a number N, just print it */
3973 l = snprintf(cache, cache_size, "%s", line);
3974 if (l < 0) {
3975 perror("Error writing to cache");
3976 rv = 0;
3977 goto err;
3978 }
3979 if (l >= cache_size) {
3980 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3981 rv = 0;
3982 goto err;
3983 }
3984 cache += l;
3985 cache_size -= l;
3986 total_len += l;
3987 continue;
3988 }
3989
3990 if (sscanf(cpu_char, "%d", &cpu) != 1)
3991 continue;
3992 if (!cpu_in_cpuset(cpu, cpuset))
3993 continue;
3994 curcpu ++;
3995
3996 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3997 &user,
3998 &nice,
3999 &system,
4000 &idle,
4001 &iowait,
4002 &irq,
4003 &softirq,
4004 &steal,
4005 &guest,
4006 &guest_nice);
4007
4008 if (ret != 10 || !cg_cpu_usage) {
4009 c = strchr(line, ' ');
4010 if (!c)
4011 continue;
4012 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4013 if (l < 0) {
4014 perror("Error writing to cache");
4015 rv = 0;
4016 goto err;
4017
4018 }
4019 if (l >= cache_size) {
4020 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4021 rv = 0;
4022 goto err;
4023 }
4024
4025 cache += l;
4026 cache_size -= l;
4027 total_len += l;
4028
4029 if (ret != 10)
4030 continue;
4031 }
4032
4033 if (cg_cpu_usage) {
4034 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4035 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4036
4037 if (all_used >= cg_used) {
4038 new_idle = idle + (all_used - cg_used);
4039
4040 } else {
4041 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4042 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4043 curcpu, cg, all_used, cg_used);
4044 new_idle = idle;
4045 }
4046
4047 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4048 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4049 new_idle);
4050
4051 if (l < 0) {
4052 perror("Error writing to cache");
4053 rv = 0;
4054 goto err;
4055
4056 }
4057 if (l >= cache_size) {
4058 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4059 rv = 0;
4060 goto err;
4061 }
4062
4063 cache += l;
4064 cache_size -= l;
4065 total_len += l;
4066
4067 user_sum += cg_cpu_usage[curcpu].user;
4068 system_sum += cg_cpu_usage[curcpu].system;
4069 idle_sum += new_idle;
4070
4071 } else {
4072 user_sum += user;
4073 nice_sum += nice;
4074 system_sum += system;
4075 idle_sum += idle;
4076 iowait_sum += iowait;
4077 irq_sum += irq;
4078 softirq_sum += softirq;
4079 steal_sum += steal;
4080 guest_sum += guest;
4081 guest_nice_sum += guest_nice;
4082 }
4083 }
4084
4085 cache = d->buf;
4086
4087 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4088 user_sum,
4089 nice_sum,
4090 system_sum,
4091 idle_sum,
4092 iowait_sum,
4093 irq_sum,
4094 softirq_sum,
4095 steal_sum,
4096 guest_sum,
4097 guest_nice_sum);
4098 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4099 memcpy(cache, cpuall, cpuall_len);
4100 cache += cpuall_len;
4101 } else {
4102 /* shouldn't happen */
4103 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.\n", cpuall_len);
4104 cpuall_len = 0;
4105 }
4106
4107 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4108 total_len += cpuall_len;
4109 d->cached = 1;
4110 d->size = total_len;
4111 if (total_len > size)
4112 total_len = size;
4113
4114 memcpy(buf, d->buf, total_len);
4115 rv = total_len;
4116
4117 err:
4118 if (f)
4119 fclose(f);
4120 if (cg_cpu_usage)
4121 free(cg_cpu_usage);
4122 free(line);
4123 free(cpuset);
4124 free(cg);
4125 return rv;
4126 }
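
/*
 * Example of the per-cpu rewriting above (hypothetical numbers): if the
 * host line is "cpu7 500 0 300 9000 0 0 0 0 0 0" and cpuacct reports
 * 200 user + 100 system ticks for that cpu, the line emitted for the
 * container (assuming this is its first visible cpu) is
 *
 *   cpu0 200 0 100 9500 0 0 0 0 0 0
 *
 * i.e. cpus are renumbered from 0 and the host time not consumed by the
 * cgroup (800 - 300 = 500 ticks) is folded into idle.
 */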
4127
4128 /* This function retrieves the busy time of a group of tasks by looking at
4129 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4130 * been given its own cpuacct cgroup. If not, this function will take the busy
4131 * time of all other tasks that do not actually belong to the container into
4132 * account as well. If someone has a clever solution for this please send a
4133 * patch!
4134 */
4135 static unsigned long get_reaper_busy(pid_t task)
4136 {
4137 pid_t initpid = lookup_initpid_in_store(task);
4138 char *cgroup = NULL, *usage_str = NULL;
4139 unsigned long usage = 0;
4140
4141 if (initpid <= 0)
4142 return 0;
4143
4144 cgroup = get_pid_cgroup(initpid, "cpuacct");
4145 if (!cgroup)
4146 goto out;
4147 prune_init_slice(cgroup);
4148 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4149 goto out;
4150 usage = strtoul(usage_str, NULL, 10);
4151 usage /= 1000000000;
4152
4153 out:
4154 free(cgroup);
4155 free(usage_str);
4156 return usage;
4157 }
4158
4159 #if RELOADTEST
4160 void iwashere(void)
4161 {
4162 int fd;
4163
4164 fd = creat("/tmp/lxcfs-iwashere", 0644);
4165 if (fd >= 0)
4166 close(fd);
4167 }
4168 #endif
4169
4170 /*
4171 * For the first field we report the age of the calling pid's reaper as
4172 * returned by get_reaper_age(); for the second field we report an idle
4173 * estimate derived from the reaper's cpuacct busy time.
4174 */
4175 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4176 struct fuse_file_info *fi)
4177 {
4178 struct fuse_context *fc = fuse_get_context();
4179 struct file_info *d = (struct file_info *)fi->fh;
4180 unsigned long int busytime = get_reaper_busy(fc->pid);
4181 char *cache = d->buf;
4182 ssize_t total_len = 0;
4183 uint64_t idletime, reaperage;
4184
4185 #if RELOADTEST
4186 iwashere();
4187 #endif
4188
4189 if (offset){
4190 if (!d->cached)
4191 return 0;
4192 if (offset > d->size)
4193 return -EINVAL;
4194 int left = d->size - offset;
4195 total_len = left > size ? size: left;
4196 memcpy(buf, cache + offset, total_len);
4197 return total_len;
4198 }
4199
4200 reaperage = get_reaper_age(fc->pid);
4201 /* To understand why this is done, please read the comment to the
4202 * get_reaper_busy() function.
4203 */
4204 idletime = reaperage;
4205 if (reaperage >= busytime)
4206 idletime = reaperage - busytime;
4207
4208 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4209 if (total_len < 0 || total_len >= d->buflen){
4210 lxcfs_error("%s\n", "failed to write to cache");
4211 return 0;
4212 }
4213
4214 d->size = (int)total_len;
4215 d->cached = 1;
4216
4217 if (total_len > size) total_len = size;
4218
4219 memcpy(buf, d->buf, total_len);
4220 return total_len;
4221 }
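
/*
 * Example output (hypothetical values): for a reaper that has been alive
 * for 3600 seconds and whose cgroup has accumulated 1200 seconds of
 * cpuacct busy time, the container reads
 *
 *   3600.00 2400.00
 *
 * from /proc/uptime.
 */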
4222
4223 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4224 struct fuse_file_info *fi)
4225 {
4226 char dev_name[72];
4227 struct fuse_context *fc = fuse_get_context();
4228 struct file_info *d = (struct file_info *)fi->fh;
4229 char *cg;
4230 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4231 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4232 unsigned long read = 0, write = 0;
4233 unsigned long read_merged = 0, write_merged = 0;
4234 unsigned long read_sectors = 0, write_sectors = 0;
4235 unsigned long read_ticks = 0, write_ticks = 0;
4236 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4237 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4238 char *cache = d->buf;
4239 size_t cache_size = d->buflen;
4240 char *line = NULL;
4241 size_t linelen = 0, total_len = 0, rv = 0;
4242 unsigned int major = 0, minor = 0;
4243 int i = 0;
4244 FILE *f = NULL;
4245
4246 if (offset){
4247 if (offset > d->size)
4248 return -EINVAL;
4249 if (!d->cached)
4250 return 0;
4251 int left = d->size - offset;
4252 total_len = left > size ? size: left;
4253 memcpy(buf, cache + offset, total_len);
4254 return total_len;
4255 }
4256
4257 pid_t initpid = lookup_initpid_in_store(fc->pid);
4258 if (initpid <= 0)
4259 initpid = fc->pid;
4260 cg = get_pid_cgroup(initpid, "blkio");
4261 if (!cg)
4262 return read_file("/proc/diskstats", buf, size, d);
4263 prune_init_slice(cg);
4264
4265 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4266 goto err;
4267 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4268 goto err;
4269 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4270 goto err;
4271 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4272 goto err;
4273 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4274 goto err;
4275
4276
4277 f = fopen("/proc/diskstats", "r");
4278 if (!f)
4279 goto err;
4280
4281 while (getline(&line, &linelen, f) != -1) {
4282 ssize_t l;
4283 char lbuf[256];
4284
4285 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4286 if (i != 3)
4287 continue;
4288
4289 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4290 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4291 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4292 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4293 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4294 read_sectors = read_sectors/512;
4295 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4296 write_sectors = write_sectors/512;
4297
4298 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4299 rd_svctm = rd_svctm/1000000;
4300 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4301 rd_wait = rd_wait/1000000;
4302 read_ticks = rd_svctm + rd_wait;
4303
4304 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4305 wr_svctm = wr_svctm/1000000;
4306 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4307 wr_wait = wr_wait/1000000;
4308 write_ticks = wr_svctm + wr_wait;
4309
4310 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4311 tot_ticks = tot_ticks/1000000;
4312
4313 memset(lbuf, 0, 256);
4314 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4315 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4316 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4317 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4318 else
4319 continue;
4320
4321 l = snprintf(cache, cache_size, "%s", lbuf);
4322 if (l < 0) {
4323 perror("Error writing to fuse buf");
4324 rv = 0;
4325 goto err;
4326 }
4327 if (l >= cache_size) {
4328 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4329 rv = 0;
4330 goto err;
4331 }
4332 cache += l;
4333 cache_size -= l;
4334 total_len += l;
4335 }
4336
4337 d->cached = 1;
4338 d->size = total_len;
4339 	if (total_len > size) total_len = size;
4340 memcpy(buf, d->buf, total_len);
4341
4342 rv = total_len;
4343 err:
4344 free(cg);
4345 if (f)
4346 fclose(f);
4347 free(line);
4348 free(io_serviced_str);
4349 free(io_merged_str);
4350 free(io_service_bytes_str);
4351 free(io_wait_time_str);
4352 free(io_service_time_str);
4353 return rv;
4354 }
4355
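/*
 * Build a container-scoped /proc/swaps. Swap size and usage are derived from
 * the memory cgroup as (memsw limit - mem limit) and (memsw usage - mem usage),
 * in KiB; when no memsw limit is set (or swapaccount=0) the host's SwapTotal
 * and SwapFree are used instead. A single "none ... virtual" entry is emitted
 * when any swap is available.
 */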
4356 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4357 struct fuse_file_info *fi)
4358 {
4359 struct fuse_context *fc = fuse_get_context();
4360 struct file_info *d = (struct file_info *)fi->fh;
4361 char *cg = NULL;
4362 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4363 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4364 ssize_t total_len = 0, rv = 0;
4365 ssize_t l = 0;
4366 char *cache = d->buf;
4367
4368 if (offset) {
4369 if (offset > d->size)
4370 return -EINVAL;
4371 if (!d->cached)
4372 return 0;
4373 int left = d->size - offset;
4374 total_len = left > size ? size: left;
4375 memcpy(buf, cache + offset, total_len);
4376 return total_len;
4377 }
4378
4379 pid_t initpid = lookup_initpid_in_store(fc->pid);
4380 if (initpid <= 0)
4381 initpid = fc->pid;
4382 cg = get_pid_cgroup(initpid, "memory");
4383 if (!cg)
4384 return read_file("/proc/swaps", buf, size, d);
4385 prune_init_slice(cg);
4386
4387 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4388
4389 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4390 goto err;
4391
4392 memusage = strtoul(memusage_str, NULL, 10);
4393
4394 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4395 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4396
4397 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4398 memswusage = strtoul(memswusage_str, NULL, 10);
4399
4400 swap_total = (memswlimit - memlimit) / 1024;
4401 swap_free = (memswusage - memusage) / 1024;
4402 }
4403
4404 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4405
4406 	/* When no mem + swap limit is specified or swapaccount=0, fall back to the host's /proc/meminfo values. */
4407 if (!memswlimit) {
4408 char *line = NULL;
4409 size_t linelen = 0;
4410 FILE *f = fopen("/proc/meminfo", "r");
4411
4412 if (!f)
4413 goto err;
4414
4415 while (getline(&line, &linelen, f) != -1) {
4416 if (startswith(line, "SwapTotal:")) {
4417 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4418 } else if (startswith(line, "SwapFree:")) {
4419 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4420 }
4421 }
4422
4423 free(line);
4424 fclose(f);
4425 }
4426
4427 if (swap_total > 0) {
4428 l = snprintf(d->buf + total_len, d->size - total_len,
4429 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4430 swap_total, swap_free);
4431 total_len += l;
4432 }
4433
4434 if (total_len < 0 || l < 0) {
4435 perror("Error writing to cache");
4436 rv = 0;
4437 goto err;
4438 }
4439
4440 d->cached = 1;
4441 d->size = (int)total_len;
4442
4443 if (total_len > size) total_len = size;
4444 memcpy(buf, d->buf, total_len);
4445 rv = total_len;
4446
4447 err:
4448 free(cg);
4449 free(memswlimit_str);
4450 free(memlimit_str);
4451 free(memusage_str);
4452 free(memswusage_str);
4453 return rv;
4454 }
4455 /*
4456  * Collect the PIDs of all processes in a cgroup subtree.
4457  * e.g. read /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs (and the same file in child cgroups) to find the process PIDs.
4458  * @pid_buf : array of PID strings; grown and filled by this function.
4459  * @dpath   : the cgroup path. e.g. /docker/containerid or /docker/containerid/child-cgroup ...
4460  * @depth   : how many directory levels below @dpath to descend.
4461  * @sum     : the number of PIDs collected so far; the updated total is returned.
4462  * @cfd     : the file descriptor of the mounted cgroup. e.g. /sys/fs/cgroup/cpu
4463  */
4464 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4465 {
4466 DIR *dir;
4467 int fd;
4468 struct dirent *file;
4469 FILE *f = NULL;
4470 size_t linelen = 0;
4471 char *line = NULL;
4472 int pd;
4473 char *path_dir, *path;
4474 char **pid;
4475
4476 	/* path = dpath + "/cgroup.procs" + '\0' */
4477 do {
4478 path = malloc(strlen(dpath) + 20);
4479 } while (!path);
4480
4481 strcpy(path, dpath);
4482 fd = openat(cfd, path, O_RDONLY);
4483 if (fd < 0)
4484 goto out;
4485
4486 dir = fdopendir(fd);
4487 if (dir == NULL) {
4488 close(fd);
4489 goto out;
4490 }
4491
4492 while (((file = readdir(dir)) != NULL) && depth > 0) {
4493 if (strncmp(file->d_name, ".", 1) == 0)
4494 continue;
4495 if (strncmp(file->d_name, "..", 1) == 0)
4496 continue;
4497 if (file->d_type == DT_DIR) {
4498 			/* path + '/' + d_name + '\0' */
4499 do {
4500 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4501 } while (!path_dir);
4502 strcpy(path_dir, path);
4503 strcat(path_dir, "/");
4504 strcat(path_dir, file->d_name);
4505 pd = depth - 1;
4506 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4507 free(path_dir);
4508 }
4509 }
4510 closedir(dir);
4511
4512 strcat(path, "/cgroup.procs");
4513 fd = openat(cfd, path, O_RDONLY);
4514 if (fd < 0)
4515 goto out;
4516
4517 f = fdopen(fd, "r");
4518 if (!f) {
4519 close(fd);
4520 goto out;
4521 }
4522
4523 while (getline(&line, &linelen, f) != -1) {
4524 do {
4525 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4526 } while (!pid);
4527 *pid_buf = pid;
4528 do {
4529 *(*pid_buf + sum) = malloc(strlen(line) + 1);
4530 } while (*(*pid_buf + sum) == NULL);
4531 strcpy(*(*pid_buf + sum), line);
4532 sum++;
4533 }
4534 fclose(f);
4535 out:
4536 if (line)
4537 free(line);
4538 free(path);
4539 return sum;
4540 }
4541 /*
4542  * calc_load calculates the load according to the following formula:
4543  * load1 = load0 * exp + active * (1 - exp)
4544  *
4545  * @load1: the new loadavg.
4546  * @load0: the previous loadavg.
4547  * @active: the number of running (or uninterruptible) tasks at this moment.
4548  * @exp: the fixed-point decay constant defined at the top of this file; both exp and (1 - exp) are scaled by FIXED_1.
4549  */
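/*
 * Worked example with the constants defined above (FIXED_1 = 2048, EXP_1 = 1884):
 * starting from load0 = 0 with one running task, a single 5-second tick gives
 *   active  = 1 * FIXED_1                      = 2048
 *   newload = 0 * EXP_1 + 2048 * (2048 - 1884) = 335872, plus FIXED_1 - 1 since active >= load,
 *   load1   = 337919 / FIXED_1                 = 164, i.e. roughly 0.08 once printed,
 * matching the kernel's 1 - exp(-5s/1min) ~= 0.08 step for the 1-minute average.
 */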
4550 static unsigned long
4551 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4552 {
4553 unsigned long newload;
4554
4555 active = active > 0 ? active * FIXED_1 : 0;
4556 newload = load * exp + active * (FIXED_1 - exp);
4557 if (active >= load)
4558 newload += FIXED_1 - 1;
4559
4560 return newload / FIXED_1;
4561 }
4562
4563 /*
4564  * Returns 0 when the container p->cg no longer has any processes (the node can be removed),
4565  * -1 when an error occurred during the refresh,
4566  * and a positive number equal to the total number of PIDs otherwise.
4567  */
4568 static int refresh_load(struct load_node *p, char *path)
4569 {
4570 FILE *f = NULL;
4571 char **idbuf;
4572 char proc_path[256];
4573 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4574 char *line = NULL;
4575 size_t linelen = 0;
4576 int sum, length;
4577 DIR *dp;
4578 struct dirent *file;
4579
4580 do {
4581 idbuf = malloc(sizeof(char *));
4582 } while (!idbuf);
4583 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4584 /* normal exit */
4585 if (sum == 0)
4586 goto out;
4587
4588 for (i = 0; i < sum; i++) {
4589 		/* strip the trailing '\n' */
4590 length = strlen(idbuf[i])-1;
4591 idbuf[i][length] = '\0';
4592 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4593 if (ret < 0 || ret > 255) {
4594 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4595 i = sum;
4596 sum = -1;
4597 goto err_out;
4598 }
4599
4600 dp = opendir(proc_path);
4601 if (!dp) {
4602 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4603 continue;
4604 }
4605 while ((file = readdir(dp)) != NULL) {
4606 if (strncmp(file->d_name, ".", 1) == 0)
4607 continue;
4608 if (strncmp(file->d_name, "..", 1) == 0)
4609 continue;
4610 total_pid++;
4611 			/* Remember the largest PID seen as last_pid. */
4612 			ret = atoi(file->d_name);
4613 last_pid = (ret > last_pid) ? ret : last_pid;
4614
4615 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4616 if (ret < 0 || ret > 255) {
4617 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4618 i = sum;
4619 sum = -1;
4620 closedir(dp);
4621 goto err_out;
4622 }
4623 f = fopen(proc_path, "r");
4624 if (f != NULL) {
4625 while (getline(&line, &linelen, f) != -1) {
4626 /* Find State */
4627 if ((line[0] == 'S') && (line[1] == 't'))
4628 break;
4629 }
4630 if ((line[7] == 'R') || (line[7] == 'D'))
4631 run_pid++;
4632 fclose(f);
4633 }
4634 }
4635 closedir(dp);
4636 }
4637 	/* Calculate the loadavg. */
4638 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4639 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4640 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4641 p->run_pid = run_pid;
4642 p->total_pid = total_pid;
4643 p->last_pid = last_pid;
4644
4645 err_out:
4646 	free(line);
4647 	for (; i > 0; i--)
4648 		free(idbuf[i-1]);
4649 out:
4650 free(idbuf);
4651 return sum;
4652 }
4653 /*
4654 * Traverse the hash table and update it.
4655 */
4656 void *load_begin(void *arg)
4657 {
4658
4659 char *path = NULL;
4660 int i, sum, length, ret;
4661 struct load_node *f;
4662 int first_node;
4663 clock_t time1, time2;
4664
4665 while (1) {
4666 if (loadavg_stop == 1)
4667 return NULL;
4668
4669 time1 = clock();
4670 for (i = 0; i < LOAD_SIZE; i++) {
4671 pthread_mutex_lock(&load_hash[i].lock);
4672 if (load_hash[i].next == NULL) {
4673 pthread_mutex_unlock(&load_hash[i].lock);
4674 continue;
4675 }
4676 f = load_hash[i].next;
4677 first_node = 1;
4678 while (f) {
4679 length = strlen(f->cg) + 2;
4680 do {
4681 /* strlen(f->cg) + '.' or '' + \0 */
4682 path = malloc(length);
4683 } while (!path);
4684
4685 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4686 if (ret < 0 || ret > length - 1) {
4687 /* snprintf failed, ignore the node.*/
4688 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4689 goto out;
4690 }
4691 sum = refresh_load(f, path);
4692 if (sum == 0) {
4693 f = del_node(f, i);
4694 } else {
4695 out: f = f->next;
4696 }
4697 free(path);
4698 				/* load_hash[i].lock only protects the first node; release it once we have moved past it. */
4699 if (first_node == 1) {
4700 first_node = 0;
4701 pthread_mutex_unlock(&load_hash[i].lock);
4702 }
4703 }
4704 }
4705
4706 if (loadavg_stop == 1)
4707 return NULL;
4708
4709 time2 = clock();
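		/* Sleep out the rest of the FLUSH_TIME window, minus the CPU time just spent refreshing the table. */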
4710 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4711 }
4712 }
4713
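/*
 * Serve /proc/loadavg. When the loadavg feature is disabled, or the reader's
 * cpu cgroup cannot be determined, the host file is passed through unchanged.
 * Otherwise the cgroup's load_node is looked up (and created on first use) and
 * its three averages are printed together with running/total task counts and
 * the last PID seen.
 */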
4714 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4715 struct fuse_file_info *fi)
4716 {
4717 struct fuse_context *fc = fuse_get_context();
4718 struct file_info *d = (struct file_info *)fi->fh;
4719 pid_t initpid;
4720 char *cg;
4721 size_t total_len = 0;
4722 char *cache = d->buf;
4723 struct load_node *n;
4724 int hash;
4725 int cfd, rv = 0;
4726 unsigned long a, b, c;
4727
4728 if (offset) {
4729 if (offset > d->size)
4730 return -EINVAL;
4731 if (!d->cached)
4732 return 0;
4733 int left = d->size - offset;
4734 total_len = left > size ? size : left;
4735 memcpy(buf, cache + offset, total_len);
4736 return total_len;
4737 }
4738 if (!loadavg)
4739 return read_file("/proc/loadavg", buf, size, d);
4740
4741 initpid = lookup_initpid_in_store(fc->pid);
4742 if (initpid <= 0)
4743 initpid = fc->pid;
4744 cg = get_pid_cgroup(initpid, "cpu");
4745 if (!cg)
4746 return read_file("/proc/loadavg", buf, size, d);
4747
4748 prune_init_slice(cg);
4749 hash = calc_hash(cg);
4750 n = locate_node(cg, hash);
4751
4752 	/* First time this cgroup is seen: allocate and insert a new node. */
4753 	if (n == NULL) {
4754 		if (!find_mounted_controller("cpu", &cfd)) {
4755 			/*
4756 			 * locate_node() above returns with load_hash[hash].rdlock still held, because
4757 			 * the node must not be deleted before the read has ended; drop the lock before bailing out.
4758 			 */
4759 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4760 rv = 0;
4761 goto err;
4762 }
4763 do {
4764 n = malloc(sizeof(struct load_node));
4765 } while (!n);
4766
4767 do {
4768 n->cg = malloc(strlen(cg)+1);
4769 } while (!n->cg);
4770 strcpy(n->cg, cg);
4771 n->avenrun[0] = 0;
4772 n->avenrun[1] = 0;
4773 n->avenrun[2] = 0;
4774 n->run_pid = 0;
4775 n->total_pid = 1;
4776 n->last_pid = initpid;
4777 n->cfd = cfd;
4778 insert_node(&n, hash);
4779 }
4780 a = n->avenrun[0] + (FIXED_1/200);
4781 b = n->avenrun[1] + (FIXED_1/200);
4782 c = n->avenrun[2] + (FIXED_1/200);
4783 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4784 LOAD_INT(a), LOAD_FRAC(a),
4785 LOAD_INT(b), LOAD_FRAC(b),
4786 LOAD_INT(c), LOAD_FRAC(c),
4787 n->run_pid, n->total_pid, n->last_pid);
4788 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4789 if (total_len < 0 || total_len >= d->buflen) {
4790 lxcfs_error("%s\n", "Failed to write to cache");
4791 rv = 0;
4792 goto err;
4793 }
4794 d->size = (int)total_len;
4795 d->cached = 1;
4796
4797 if (total_len > size)
4798 total_len = size;
4799 memcpy(buf, d->buf, total_len);
4800 rv = total_len;
4801
4802 err:
4803 free(cg);
4804 return rv;
4805 }
4806 /* Return the load daemon's thread ID on success, 0 on failure. */
4807 pthread_t load_daemon(int load_use)
4808 {
4809 int ret;
4810 pthread_t pid;
4811
4812 ret = init_load();
4813 if (ret == -1) {
4814 		lxcfs_error("%s\n", "Failed to initialize hash table in load_daemon!");
4815 return 0;
4816 }
4817 ret = pthread_create(&pid, NULL, load_begin, NULL);
4818 if (ret != 0) {
4819 		lxcfs_error("%s\n", "Failed to create thread in load_daemon!");
4820 load_free();
4821 return 0;
4822 }
4823 	/* Enable the virtualized loadavg; the caller passes load_use = 1. */
4824 loadavg = load_use;
4825 return pid;
4826 }
4827
4828 /* Returns 0 on success. */
4829 int stop_load_daemon(pthread_t pid)
4830 {
4831 int s;
4832
4833 /* Signal the thread to gracefully stop */
4834 loadavg_stop = 1;
4835
4836 	s = pthread_join(pid, NULL); /* Wait for the worker thread to exit. */
4837 if (s != 0) {
4838 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4839 return -1;
4840 }
4841
4842 load_free();
4843 loadavg_stop = 0;
4844
4845 return 0;
4846 }
4847
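/* Return the size of a host proc file by summing the length of its lines; used by proc_open() to size the scratch buffer. */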
4848 static off_t get_procfile_size(const char *which)
4849 {
4850 FILE *f = fopen(which, "r");
4851 char *line = NULL;
4852 size_t len = 0;
4853 ssize_t sz, answer = 0;
4854 if (!f)
4855 return 0;
4856
4857 while ((sz = getline(&line, &len, f)) != -1)
4858 answer += sz;
4859 fclose (f);
4860 free(line);
4861
4862 return answer;
4863 }
4864
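/* FUSE getattr for the emulated /proc entries: /proc itself is a 0555 directory, each virtualized file a zero-sized 0444 regular file owned by root. */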
4865 int proc_getattr(const char *path, struct stat *sb)
4866 {
4867 struct timespec now;
4868
4869 memset(sb, 0, sizeof(struct stat));
4870 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
4871 return -EINVAL;
4872 sb->st_uid = sb->st_gid = 0;
4873 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
4874 if (strcmp(path, "/proc") == 0) {
4875 sb->st_mode = S_IFDIR | 00555;
4876 sb->st_nlink = 2;
4877 return 0;
4878 }
4879 if (strcmp(path, "/proc/meminfo") == 0 ||
4880 strcmp(path, "/proc/cpuinfo") == 0 ||
4881 strcmp(path, "/proc/uptime") == 0 ||
4882 strcmp(path, "/proc/stat") == 0 ||
4883 strcmp(path, "/proc/diskstats") == 0 ||
4884 strcmp(path, "/proc/swaps") == 0 ||
4885 strcmp(path, "/proc/loadavg") == 0) {
4886 sb->st_size = 0;
4887 sb->st_mode = S_IFREG | 00444;
4888 sb->st_nlink = 1;
4889 return 0;
4890 }
4891
4892 return -ENOENT;
4893 }
4894
4895 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4896 struct fuse_file_info *fi)
4897 {
4898 if (filler(buf, ".", NULL, 0) != 0 ||
4899 filler(buf, "..", NULL, 0) != 0 ||
4900 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4901 filler(buf, "meminfo", NULL, 0) != 0 ||
4902 filler(buf, "stat", NULL, 0) != 0 ||
4903 filler(buf, "uptime", NULL, 0) != 0 ||
4904 filler(buf, "diskstats", NULL, 0) != 0 ||
4905 filler(buf, "swaps", NULL, 0) != 0 ||
4906 filler(buf, "loadavg", NULL, 0) != 0)
4907 return -EINVAL;
4908 return 0;
4909 }
4910
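/* FUSE open: record which emulated file was opened and allocate a per-open file_info whose buffer is sized from the host file plus BUF_RESERVE_SIZE; the pointer is stashed in fi->fh. */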
4911 int proc_open(const char *path, struct fuse_file_info *fi)
4912 {
4913 int type = -1;
4914 struct file_info *info;
4915
4916 if (strcmp(path, "/proc/meminfo") == 0)
4917 type = LXC_TYPE_PROC_MEMINFO;
4918 else if (strcmp(path, "/proc/cpuinfo") == 0)
4919 type = LXC_TYPE_PROC_CPUINFO;
4920 else if (strcmp(path, "/proc/uptime") == 0)
4921 type = LXC_TYPE_PROC_UPTIME;
4922 else if (strcmp(path, "/proc/stat") == 0)
4923 type = LXC_TYPE_PROC_STAT;
4924 else if (strcmp(path, "/proc/diskstats") == 0)
4925 type = LXC_TYPE_PROC_DISKSTATS;
4926 else if (strcmp(path, "/proc/swaps") == 0)
4927 type = LXC_TYPE_PROC_SWAPS;
4928 else if (strcmp(path, "/proc/loadavg") == 0)
4929 type = LXC_TYPE_PROC_LOADAVG;
4930 if (type == -1)
4931 return -ENOENT;
4932
4933 info = malloc(sizeof(*info));
4934 if (!info)
4935 return -ENOMEM;
4936
4937 memset(info, 0, sizeof(*info));
4938 info->type = type;
4939
4940 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4941 do {
4942 info->buf = malloc(info->buflen);
4943 } while (!info->buf);
4944 memset(info->buf, 0, info->buflen);
4945 /* set actual size to buffer size */
4946 info->size = info->buflen;
4947
4948 fi->fh = (unsigned long)info;
4949 return 0;
4950 }
4951
4952 int proc_access(const char *path, int mask)
4953 {
4954 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
4955 return 0;
4956
4957 /* these are all read-only */
4958 if ((mask & ~R_OK) != 0)
4959 return -EACCES;
4960 return 0;
4961 }
4962
4963 int proc_release(const char *path, struct fuse_file_info *fi)
4964 {
4965 do_release_file_info(fi);
4966 return 0;
4967 }
4968
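/* Dispatch a read on an emulated /proc file to its handler, based on the type recorded at open time. */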
4969 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4970 struct fuse_file_info *fi)
4971 {
4972 struct file_info *f = (struct file_info *) fi->fh;
4973
4974 switch (f->type) {
4975 case LXC_TYPE_PROC_MEMINFO:
4976 return proc_meminfo_read(buf, size, offset, fi);
4977 case LXC_TYPE_PROC_CPUINFO:
4978 return proc_cpuinfo_read(buf, size, offset, fi);
4979 case LXC_TYPE_PROC_UPTIME:
4980 return proc_uptime_read(buf, size, offset, fi);
4981 case LXC_TYPE_PROC_STAT:
4982 return proc_stat_read(buf, size, offset, fi);
4983 case LXC_TYPE_PROC_DISKSTATS:
4984 return proc_diskstats_read(buf, size, offset, fi);
4985 case LXC_TYPE_PROC_SWAPS:
4986 return proc_swaps_read(buf, size, offset, fi);
4987 case LXC_TYPE_PROC_LOADAVG:
4988 return proc_loadavg_read(buf, size, offset, fi);
4989 default:
4990 return -EINVAL;
4991 }
4992 }
4993
4994 /*
4995 * Functions needed to setup cgroups in the __constructor__.
4996 */
4997
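/* Create @dir and any missing parent directories with the given mode, ignoring already-existing components. */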
4998 static bool mkdir_p(const char *dir, mode_t mode)
4999 {
5000 const char *tmp = dir;
5001 const char *orig = dir;
5002 char *makeme;
5003
5004 do {
5005 dir = tmp + strspn(tmp, "/");
5006 tmp = dir + strcspn(dir, "/");
5007 makeme = strndup(orig, dir - orig);
5008 if (!makeme)
5009 return false;
5010 if (mkdir(makeme, mode) && errno != EEXIST) {
5011 lxcfs_error("Failed to create directory '%s': %s.\n",
5012 makeme, strerror(errno));
5013 free(makeme);
5014 return false;
5015 }
5016 free(makeme);
5017 } while(tmp != dir);
5018
5019 return true;
5020 }
5021
5022 static bool umount_if_mounted(void)
5023 {
5024 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5025 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5026 return false;
5027 }
5028 return true;
5029 }
5030
5031 /* __typeof__ should be safe to use with all compilers. */
5032 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5033 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5034 {
5035 return (fs->f_type == (fs_type_magic)magic_val);
5036 }
5037
5038 /*
5039 * looking at fs/proc_namespace.c, it appears we can
5040 * actually expect the rootfs entry to very specifically contain
5041 * " - rootfs rootfs "
5042 * IIUC, so long as we've chrooted so that rootfs is not our root,
5043 * the rootfs entry should always be skipped in mountinfo contents.
5044 */
5045 static bool is_on_ramfs(void)
5046 {
5047 FILE *f;
5048 char *p, *p2;
5049 char *line = NULL;
5050 size_t len = 0;
5051 int i;
5052
5053 f = fopen("/proc/self/mountinfo", "r");
5054 if (!f)
5055 return false;
5056
5057 while (getline(&line, &len, f) != -1) {
5058 for (p = line, i = 0; p && i < 4; i++)
5059 p = strchr(p + 1, ' ');
5060 if (!p)
5061 continue;
5062 p2 = strchr(p + 1, ' ');
5063 if (!p2)
5064 continue;
5065 *p2 = '\0';
5066 if (strcmp(p + 1, "/") == 0) {
5067 // this is '/'. is it the ramfs?
5068 p = strchr(p2 + 1, '-');
5069 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5070 free(line);
5071 fclose(f);
5072 return true;
5073 }
5074 }
5075 }
5076 free(line);
5077 fclose(f);
5078 return false;
5079 }
5080
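/*
 * pivot_root() into ROOTDIR: chdir into the new root, pivot onto it, then step
 * back into the old root just long enough to lazily detach it.
 */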
5081 static int pivot_enter()
5082 {
5083 int ret = -1, oldroot = -1, newroot = -1;
5084
5085 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5086 if (oldroot < 0) {
5087 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5088 return ret;
5089 }
5090
5091 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5092 if (newroot < 0) {
5093 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5094 goto err;
5095 }
5096
5097 /* change into new root fs */
5098 if (fchdir(newroot) < 0) {
5099 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5100 goto err;
5101 }
5102
5103 /* pivot_root into our new root fs */
5104 if (pivot_root(".", ".") < 0) {
5105 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5106 goto err;
5107 }
5108
5109 	/*
5110 	 * At this point the old-root is mounted on top of our new-root.
5111 	 * To unmount it we must not be chdir'd into it, so escape back
5112 	 * to the old-root.
5113 	 */
5114 if (fchdir(oldroot) < 0) {
5115 lxcfs_error("%s\n", "Failed to enter old root.");
5116 goto err;
5117 }
5118
5119 if (umount2(".", MNT_DETACH) < 0) {
5120 lxcfs_error("%s\n", "Failed to detach old root.");
5121 goto err;
5122 }
5123
5124 if (fchdir(newroot) < 0) {
5125 lxcfs_error("%s\n", "Failed to re-enter new root.");
5126 goto err;
5127 }
5128
5129 ret = 0;
5130
5131 err:
5132 if (oldroot > 0)
5133 close(oldroot);
5134 if (newroot > 0)
5135 close(newroot);
5136
5137 return ret;
5138 }
5139
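/* Fallback for ramfs roots where pivot_root() cannot work: recursively bind-mount ROOTDIR over / and chroot into it. */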
5140 static int chroot_enter()
5141 {
5142 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5143 		lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5144 return -1;
5145 }
5146
5147 if (chroot(".") < 0) {
5148 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5149 return -1;
5150 }
5151
5152 if (chdir("/") < 0) {
5153 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5154 return -1;
5155 }
5156
5157 return 0;
5158 }
5159
5160 static int permute_and_enter(void)
5161 {
5162 struct statfs sb;
5163
5164 if (statfs("/", &sb) < 0) {
5165 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5166 return -1;
5167 }
5168
5169 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5170 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5171 * /proc/1/mountinfo. */
5172 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5173 return chroot_enter();
5174
5175 if (pivot_enter() < 0) {
5176 lxcfs_error("%s\n", "Could not perform pivot root.");
5177 return -1;
5178 }
5179
5180 return 0;
5181 }
5182
5183 /* Prepare our new clean root. */
5184 static int permute_prepare(void)
5185 {
5186 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5187 lxcfs_error("%s\n", "Failed to create directory for new root.");
5188 return -1;
5189 }
5190
5191 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5192 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5193 return -1;
5194 }
5195
5196 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5197 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5198 return -1;
5199 }
5200
5201 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5202 		lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5203 return -1;
5204 }
5205
5206 return 0;
5207 }
5208
5209 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
5210 static bool permute_root(void)
5211 {
5212 /* Prepare new root. */
5213 if (permute_prepare() < 0)
5214 return false;
5215
5216 /* Pivot into new root. */
5217 if (permute_and_enter() < 0)
5218 return false;
5219
5220 return true;
5221 }
5222
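/* Return an O_CLOEXEC file descriptor referring to the mount namespace of @pid, so that it can be re-entered later with setns(). */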
5223 static int preserve_mnt_ns(int pid)
5224 {
5225 int ret;
5226 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
5227 char path[len];
5228
5229 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
5230 if (ret < 0 || (size_t)ret >= len)
5231 return -1;
5232
5233 return open(path, O_RDONLY | O_CLOEXEC);
5234 }
5235
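/*
 * Prepare the private cgroup mountpoint: create BASEDIR, unmount any stale
 * instance, unshare a new mount namespace (keeping a handle to it in
 * cgroup_mount_ns_fd), make / private and mount a small tmpfs over BASEDIR.
 */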
5236 static bool cgfs_prepare_mounts(void)
5237 {
5238 if (!mkdir_p(BASEDIR, 0700)) {
5239 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5240 return false;
5241 }
5242
5243 if (!umount_if_mounted()) {
5244 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5245 return false;
5246 }
5247
5248 if (unshare(CLONE_NEWNS) < 0) {
5249 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5250 return false;
5251 }
5252
5253 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5254 if (cgroup_mount_ns_fd < 0) {
5255 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5256 return false;
5257 }
5258
5259 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5260 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5261 return false;
5262 }
5263
5264 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5265 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5266 return false;
5267 }
5268
5269 return true;
5270 }
5271
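/*
 * Mount each discovered hierarchy at BASEDIR/<controller> ("unified" is mounted
 * as cgroup2) and keep an O_DIRECTORY fd to it in fd_hierarchies[].
 */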
5272 static bool cgfs_mount_hierarchies(void)
5273 {
5274 char *target;
5275 size_t clen, len;
5276 int i, ret;
5277
5278 for (i = 0; i < num_hierarchies; i++) {
5279 char *controller = hierarchies[i];
5280
5281 clen = strlen(controller);
5282 len = strlen(BASEDIR) + clen + 2;
5283 target = malloc(len);
5284 if (!target)
5285 return false;
5286
5287 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5288 if (ret < 0 || ret >= len) {
5289 free(target);
5290 return false;
5291 }
5292 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5293 free(target);
5294 return false;
5295 }
5296 if (!strcmp(controller, "unified"))
5297 ret = mount("none", target, "cgroup2", 0, NULL);
5298 else
5299 ret = mount(controller, target, "cgroup", 0, controller);
5300 if (ret < 0) {
5301 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5302 free(target);
5303 return false;
5304 }
5305
5306 fd_hierarchies[i] = open(target, O_DIRECTORY);
5307 if (fd_hierarchies[i] < 0) {
5308 free(target);
5309 return false;
5310 }
5311 free(target);
5312 }
5313 return true;
5314 }
5315
5316 static bool cgfs_setup_controllers(void)
5317 {
5318 if (!cgfs_prepare_mounts())
5319 return false;
5320
5321 if (!cgfs_mount_hierarchies()) {
5322 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
5323 return false;
5324 }
5325
5326 if (!permute_root())
5327 return false;
5328
5329 return true;
5330 }
5331
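/*
 * Library constructor: parse /proc/self/cgroup to discover the mounted
 * hierarchies, mount private copies of them for lxcfs inside a fresh mount
 * namespace, then switch back to the initial namespace and restore the
 * original working directory.
 */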
5332 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5333 {
5334 FILE *f;
5335 char *cret, *line = NULL;
5336 char cwd[MAXPATHLEN];
5337 size_t len = 0;
5338 int i, init_ns = -1;
5339 bool found_unified = false;
5340
5341 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5342 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5343 return;
5344 }
5345
5346 while (getline(&line, &len, f) != -1) {
5347 char *idx, *p, *p2;
5348
5349 p = strchr(line, ':');
5350 if (!p)
5351 goto out;
5352 idx = line;
5353 *(p++) = '\0';
5354
5355 p2 = strrchr(p, ':');
5356 if (!p2)
5357 goto out;
5358 *p2 = '\0';
5359
5360 		/* With cgroupv2 /proc/self/cgroup can contain entries of the
5361 		 * form: 0::/ This would cause lxcfs to fail the cgroup mounts
5362 		 * because it parses out the empty string "" and later passes it
5363 		 * to mount(). Relabel such entries as the "unified" hierarchy so they are mounted as cgroup2 instead.
5364 		 */
5365 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5366 found_unified = true;
5367 p = "unified";
5368 }
5369
5370 if (!store_hierarchy(line, p))
5371 goto out;
5372 }
5373
5374 /* Preserve initial namespace. */
5375 init_ns = preserve_mnt_ns(getpid());
5376 if (init_ns < 0) {
5377 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5378 goto out;
5379 }
5380
5381 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5382 if (!fd_hierarchies) {
5383 lxcfs_error("%s\n", strerror(errno));
5384 goto out;
5385 }
5386
5387 for (i = 0; i < num_hierarchies; i++)
5388 fd_hierarchies[i] = -1;
5389
5390 cret = getcwd(cwd, MAXPATHLEN);
5391 if (!cret)
5392 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5393
5394 	/* This function unshares (CLONE_NEWNS) from our initial mount namespace
5395 	 * so that it can privately mount the lxcfs cgroup hierarchies. */
5396 if (!cgfs_setup_controllers()) {
5397 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5398 goto out;
5399 }
5400
5401 if (setns(init_ns, 0) < 0) {
5402 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5403 goto out;
5404 }
5405
5406 if (!cret || chdir(cwd) < 0)
5407 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5408
5409 print_subsystems();
5410
5411 out:
5412 free(line);
5413 fclose(f);
5414 if (init_ns >= 0)
5415 close(init_ns);
5416 }
5417
5418 static void __attribute__((destructor)) free_subsystems(void)
5419 {
5420 int i;
5421
5422 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5423
5424 for (i = 0; i < num_hierarchies; i++) {
5425 if (hierarchies[i])
5426 free(hierarchies[i]);
5427 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5428 close(fd_hierarchies[i]);
5429 }
5430 free(hierarchies);
5431 free(fd_hierarchies);
5432
5433 if (cgroup_mount_ns_fd >= 0)
5434 close(cgroup_mount_ns_fd);
5435 }