1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
42 /* The maximum 64-bit unsigned integer, 2^64 - 1, needs 20 decimal digits plus a terminating NUL: 21 characters. */
43 #define LXCFS_NUMSTRLEN64 21
44
45 /* Define pivot_root() if missing from the C library */
46 #ifndef HAVE_PIVOT_ROOT
47 static int pivot_root(const char * new_root, const char * put_old)
48 {
49 #ifdef __NR_pivot_root
50 return syscall(__NR_pivot_root, new_root, put_old);
51 #else
52 errno = ENOSYS;
53 return -1;
54 #endif
55 }
56 #else
57 extern int pivot_root(const char * new_root, const char * put_old);
58 #endif
59
60 enum {
61 LXC_TYPE_CGDIR,
62 LXC_TYPE_CGFILE,
63 LXC_TYPE_PROC_MEMINFO,
64 LXC_TYPE_PROC_CPUINFO,
65 LXC_TYPE_PROC_UPTIME,
66 LXC_TYPE_PROC_STAT,
67 LXC_TYPE_PROC_DISKSTATS,
68 LXC_TYPE_PROC_SWAPS,
69 LXC_TYPE_PROC_LOADAVG,
70 };
71
72 struct file_info {
73 char *controller;
74 char *cgroup;
75 char *file;
76 int type;
77 char *buf; // unused as of yet
78 int buflen;
79 int size; //actual data size
80 int cached;
81 };
82
83 struct cpuacct_usage {
84 uint64_t user;
85 uint64_t system;
86 };
87
88 /* Hash table used for caching per-cgroup load averages. */
89 #define LOAD_SIZE 100 /* number of hash buckets */
90 #define FLUSH_TIME 5 /* refresh interval, in seconds */
91 #define DEPTH_DIR 3 /* directory depth scanned per cgroup */
92 /* Fixed-point constants used to calculate the load average. */
93 #define FSHIFT 11 /* nr of bits of precision */
94 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
95 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
96 #define EXP_5 2014 /* 1/exp(5sec/5min) */
97 #define EXP_15 2037 /* 1/exp(5sec/15min) */
98 #define LOAD_INT(x) ((x) >> FSHIFT)
99 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
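/* For illustration: load averages are kept in this fixed-point format,
 * so a load of 1.25 is stored as 1.25 * FIXED_1 = 2560 and is printed
 * the way /proc/loadavg does it:
 *
 *   unsigned long v = 2560;
 *   printf("%lu.%02lu", LOAD_INT(v), LOAD_FRAC(v));  // prints "1.25"
 *
 * EXP_1/EXP_5/EXP_15 are the usual kernel decay factors for the 1, 5
 * and 15 minute exponentially-weighted averages.
 */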
100 /*
101 * This parameter is used by proc_loadavg_read():
102 * 1 means the virtualized loadavg is used, 0 means it is not.
103 */
104 static int loadavg = 0;
105 static volatile sig_atomic_t loadavg_stop = 0;
106 static int calc_hash(char *name)
107 {
108 unsigned int hash = 0;
109 unsigned int x = 0;
110 /* ELFHash algorithm. */
111 while (*name) {
112 hash = (hash << 4) + *name++;
113 x = hash & 0xf0000000;
114 if (x != 0)
115 hash ^= (x >> 24);
116 hash &= ~x;
117 }
118 return ((hash & 0x7fffffff) % LOAD_SIZE);
119 }
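/* Sketch of the intended usage (locate_node() and insert_node() are
 * defined further below): a cgroup path is hashed to a bucket index and
 * then looked up or inserted in that bucket, roughly:
 *
 *   int idx = calc_hash(cg);                  // 0 <= idx < LOAD_SIZE
 *   struct load_node *n = locate_node(cg, idx);
 *   if (!n)
 *       insert_node(&new_node, idx);          // new_node->cg == cg
 */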
120
121 struct load_node {
122 char *cg; /* cgroup path */
123 unsigned long avenrun[3]; /* Load averages */
124 unsigned int run_pid;
125 unsigned int total_pid;
126 unsigned int last_pid;
127 int cfd; /* The file descriptor of the mounted cgroup */
128 struct load_node *next;
129 struct load_node **pre;
130 };
131
132 struct load_head {
133 /*
134 * The lock protects insertion and refresh of load_node entries. For
135 * the first load_node of each hash bucket, insert and refresh are
136 * mutually exclusive within that bucket.
137 */
138 pthread_mutex_t lock;
139 /*
140 * The rdlock serializes reading the loadavg against deleting a
141 * load_node. Within each hash bucket, read and delete are mutually
142 * exclusive, but concurrent readers are allowed. This rwlock works at
143 * the list level.
144 */
145 pthread_rwlock_t rdlock;
146 /*
147 * The rilock serializes reading the loadavg against inserting a
148 * load_node. For the first load_node of each hash bucket, read and
149 * insert are mutually exclusive, but concurrent readers are allowed.
150 */
151 pthread_rwlock_t rilock;
151 struct load_node *next;
152 };
153
154 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
155 /*
156 * init_load() initializes the hash table.
157 * Return 0 on success, return -1 on failure.
158 */
159 static int init_load(void)
160 {
161 int i;
162 int ret;
163
164 for (i = 0; i < LOAD_SIZE; i++) {
165 load_hash[i].next = NULL;
166 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize lock");
169 goto out3;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rdlock");
174 goto out2;
175 }
176 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
177 if (ret != 0) {
178 lxcfs_error("%s\n", "Failed to initialize rilock");
179 goto out1;
180 }
181 }
182 return 0;
183 out1:
184 pthread_rwlock_destroy(&load_hash[i].rdlock);
185 out2:
186 pthread_mutex_destroy(&load_hash[i].lock);
187 out3:
188 while (i > 0) {
189 i--;
190 pthread_mutex_destroy(&load_hash[i].lock);
191 pthread_rwlock_destroy(&load_hash[i].rdlock);
192 pthread_rwlock_destroy(&load_hash[i].rilock);
193 }
194 return -1;
195 }
196
197 static void insert_node(struct load_node **n, int locate)
198 {
199 struct load_node *f;
200
201 pthread_mutex_lock(&load_hash[locate].lock);
202 pthread_rwlock_wrlock(&load_hash[locate].rilock);
203 f = load_hash[locate].next;
204 load_hash[locate].next = *n;
205
206 (*n)->pre = &(load_hash[locate].next);
207 if (f)
208 f->pre = &((*n)->next);
209 (*n)->next = f;
210 pthread_mutex_unlock(&load_hash[locate].lock);
211 pthread_rwlock_unlock(&load_hash[locate].rilock);
212 }
213 /*
214 * locate_node() looks up the node for a given cgroup; a non-NULL
215 * return means success. Note that rdlock is intentionally not released
216 * here: the caller is still reading the node, and deletion must not
217 * happen before that read has finished. The rdlock is released only in
218 * proc_loadavg_read().
219 */
220 static struct load_node *locate_node(char *cg, int locate)
221 {
222 struct load_node *f = NULL;
223 int i = 0;
224
225 pthread_rwlock_rdlock(&load_hash[locate].rilock);
226 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
227 if (load_hash[locate].next == NULL) {
228 pthread_rwlock_unlock(&load_hash[locate].rilock);
229 return f;
230 }
231 f = load_hash[locate].next;
232 pthread_rwlock_unlock(&load_hash[locate].rilock);
233 while (f && ((i = strcmp(f->cg, cg)) != 0))
234 f = f->next;
235 return f;
236 }
237 /* Delete load_node n and return the node that followed it. */
238 static struct load_node *del_node(struct load_node *n, int locate)
239 {
240 struct load_node *g;
241
242 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
243 if (n->next == NULL) {
244 *(n->pre) = NULL;
245 } else {
246 *(n->pre) = n->next;
247 n->next->pre = n->pre;
248 }
249 g = n->next;
250 free(n->cg);
251 free(n);
252 pthread_rwlock_unlock(&load_hash[locate].rdlock);
253 return g;
254 }
255
256 static void load_free(void)
257 {
258 int i;
259 struct load_node *f, *p;
260
261 for (i = 0; i < LOAD_SIZE; i++) {
262 pthread_mutex_lock(&load_hash[i].lock);
263 pthread_rwlock_wrlock(&load_hash[i].rilock);
264 pthread_rwlock_wrlock(&load_hash[i].rdlock);
265 if (load_hash[i].next == NULL) {
266 pthread_mutex_unlock(&load_hash[i].lock);
267 pthread_mutex_destroy(&load_hash[i].lock);
268 pthread_rwlock_unlock(&load_hash[i].rilock);
269 pthread_rwlock_destroy(&load_hash[i].rilock);
270 pthread_rwlock_unlock(&load_hash[i].rdlock);
271 pthread_rwlock_destroy(&load_hash[i].rdlock);
272 continue;
273 }
274 for (f = load_hash[i].next; f; ) {
275 free(f->cg);
276 p = f->next;
277 free(f);
278 f = p;
279 }
280 pthread_mutex_unlock(&load_hash[i].lock);
281 pthread_mutex_destroy(&load_hash[i].lock);
282 pthread_rwlock_unlock(&load_hash[i].rilock);
283 pthread_rwlock_destroy(&load_hash[i].rilock);
284 pthread_rwlock_unlock(&load_hash[i].rdlock);
285 pthread_rwlock_destroy(&load_hash[i].rdlock);
286 }
287 }
288 /* Reserve buffer size to account for file size changes. */
289 #define BUF_RESERVE_SIZE 512
290
291 /*
292 * A table caching which pid is init for a pid namespace.
293 * When looking up which pid is init for $qpid, we first
294 * 1. Stat /proc/$qpid/ns/pid.
295 * 2. Check whether the ino_t is in our store.
296 * a. if not, fork a child in qpid's ns to send us
297 * ucred.pid = 1, and read the initpid. Cache
298 * initpid and creation time for /proc/initpid
299 * in a new store entry.
300 * b. if so, verify that /proc/initpid still matches
301 * what we have saved. If not, clear the store
302 * entry and go back to a. If so, return the
303 * cached initpid.
304 */
305 struct pidns_init_store {
306 ino_t ino; // inode number for /proc/$pid/ns/pid
307 pid_t initpid; // the pid of init in that ns
308 long int ctime; // the time at which /proc/$initpid was created
309 struct pidns_init_store *next;
310 long int lastcheck;
311 };
312
313 /* lol - look at how they are allocated in the kernel */
314 #define PIDNS_HASH_SIZE 4096
315 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
316
317 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
318 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
319 static void lock_mutex(pthread_mutex_t *l)
320 {
321 int ret;
322
323 if ((ret = pthread_mutex_lock(l)) != 0) {
324 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
325 exit(1);
326 }
327 }
328
329 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
330 * Number of hierarchies mounted. */
331 static int num_hierarchies;
332
333 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
334 * Hierarchies mounted {cpuset, blkio, ...}:
335 * Initialized via __constructor__ collect_and_mount_subsystems(). */
336 static char **hierarchies;
337
338 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
339 * Open file descriptors:
340 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
341 * private mount namespace.
342 * Initialized via __constructor__ collect_and_mount_subsystems().
343 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
344 * mounts and respective files in the private namespace even when located in
345 * another namespace using the *at() family of functions
346 * {openat(), fchownat(), ...}. */
347 static int *fd_hierarchies;
348 static int cgroup_mount_ns_fd = -1;
349
350 static void unlock_mutex(pthread_mutex_t *l)
351 {
352 int ret;
353
354 if ((ret = pthread_mutex_unlock(l)) != 0) {
355 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
356 exit(1);
357 }
358 }
359
360 static void store_lock(void)
361 {
362 lock_mutex(&pidns_store_mutex);
363 }
364
365 static void store_unlock(void)
366 {
367 unlock_mutex(&pidns_store_mutex);
368 }
369
370 /* Must be called under store_lock */
371 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
372 {
373 struct stat initsb;
374 char fnam[100];
375
376 snprintf(fnam, 100, "/proc/%d", e->initpid);
377 if (stat(fnam, &initsb) < 0)
378 return false;
379
380 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
381 initsb.st_ctime, e->initpid);
382
383 if (e->ctime != initsb.st_ctime)
384 return false;
385 return true;
386 }
387
388 /* Must be called under store_lock */
389 static void remove_initpid(struct pidns_init_store *e)
390 {
391 struct pidns_init_store *tmp;
392 int h;
393
394 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
395
396 h = HASH(e->ino);
397 if (pidns_hash_table[h] == e) {
398 pidns_hash_table[h] = e->next;
399 free(e);
400 return;
401 }
402
403 tmp = pidns_hash_table[h];
404 while (tmp) {
405 if (tmp->next == e) {
406 tmp->next = e->next;
407 free(e);
408 return;
409 }
410 tmp = tmp->next;
411 }
412 }
413
414 #define PURGE_SECS 5
415 /* Must be called under store_lock */
416 static void prune_initpid_store(void)
417 {
418 static long int last_prune = 0;
419 struct pidns_init_store *e, *prev, *delme;
420 long int now, threshold;
421 int i;
422
423 if (!last_prune) {
424 last_prune = time(NULL);
425 return;
426 }
427 now = time(NULL);
428 if (now < last_prune + PURGE_SECS)
429 return;
430
431 lxcfs_debug("%s\n", "Pruning.");
432
433 last_prune = now;
434 threshold = now - 2 * PURGE_SECS;
435
436 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
437 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
438 if (e->lastcheck < threshold) {
439
440 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
441
442 delme = e;
443 if (prev)
444 prev->next = e->next;
445 else
446 pidns_hash_table[i] = e->next;
447 e = e->next;
448 free(delme);
449 } else {
450 prev = e;
451 e = e->next;
452 }
453 }
454 }
455 }
456
457 /* Must be called under store_lock */
458 static void save_initpid(struct stat *sb, pid_t pid)
459 {
460 struct pidns_init_store *e;
461 char fpath[100];
462 struct stat procsb;
463 int h;
464
465 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
466
467 snprintf(fpath, 100, "/proc/%d", pid);
468 if (stat(fpath, &procsb) < 0)
469 return;
470 do {
471 e = malloc(sizeof(*e));
472 } while (!e);
473 e->ino = sb->st_ino;
474 e->initpid = pid;
475 e->ctime = procsb.st_ctime;
476 h = HASH(e->ino);
477 e->next = pidns_hash_table[h];
478 e->lastcheck = time(NULL);
479 pidns_hash_table[h] = e;
480 }
481
482 /*
483 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
484 * entry for the inode number and creation time. Verify that the init pid
485 * is still valid. If not, remove it. Return the entry if valid, NULL
486 * otherwise.
487 * Must be called under store_lock
488 */
489 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
490 {
491 int h = HASH(sb->st_ino);
492 struct pidns_init_store *e = pidns_hash_table[h];
493
494 while (e) {
495 if (e->ino == sb->st_ino) {
496 if (initpid_still_valid(e, sb)) {
497 e->lastcheck = time(NULL);
498 return e;
499 }
500 remove_initpid(e);
501 return NULL;
502 }
503 e = e->next;
504 }
505
506 return NULL;
507 }
508
509 static int is_dir(const char *path, int fd)
510 {
511 struct stat statbuf;
512 int ret = fstatat(fd, path, &statbuf, 0);
513 if (ret == 0 && S_ISDIR(statbuf.st_mode))
514 return 1;
515 return 0;
516 }
517
518 static char *must_copy_string(const char *str)
519 {
520 char *dup = NULL;
521 if (!str)
522 return NULL;
523 do {
524 dup = strdup(str);
525 } while (!dup);
526
527 return dup;
528 }
529
530 static inline void drop_trailing_newlines(char *s)
531 {
532 int l;
533
534 for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
535 s[l-1] = '\0';
536 }
537
538 #define BATCH_SIZE 50
539 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
540 {
541 int newbatches = (newlen / BATCH_SIZE) + 1;
542 int oldbatches = (oldlen / BATCH_SIZE) + 1;
543
544 if (!*mem || newbatches > oldbatches) {
545 char *tmp;
546 do {
547 tmp = realloc(*mem, newbatches * BATCH_SIZE);
548 } while (!tmp);
549 *mem = tmp;
550 }
551 }
552 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
553 {
554 size_t newlen = *len + linelen;
555 dorealloc(contents, *len, newlen + 1);
556 memcpy(*contents + *len, line, linelen+1);
557 *len = newlen;
558 }
559
560 static char *slurp_file(const char *from, int fd)
561 {
562 char *line = NULL;
563 char *contents = NULL;
564 FILE *f = fdopen(fd, "r");
565 size_t len = 0, fulllen = 0;
566 ssize_t linelen;
567
568 if (!f)
569 return NULL;
570
571 while ((linelen = getline(&line, &len, f)) != -1) {
572 append_line(&contents, &fulllen, line, linelen);
573 }
574 fclose(f);
575
576 if (contents)
577 drop_trailing_newlines(contents);
578 free(line);
579 return contents;
580 }
581
582 static bool write_string(const char *fnam, const char *string, int fd)
583 {
584 FILE *f;
585 size_t len, ret;
586
587 f = fdopen(fd, "w");
588 if (!f)
589 return false;
590
591 len = strlen(string);
592 ret = fwrite(string, 1, len, f);
593 if (ret != len) {
594 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
595 strerror(errno), string, fnam);
596 fclose(f);
597 return false;
598 }
599
600 if (fclose(f) < 0) {
601 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
602 return false;
603 }
604
605 return true;
606 }
607
608 struct cgfs_files {
609 char *name;
610 uint32_t uid, gid;
611 uint32_t mode;
612 };
613
614 #define ALLOC_NUM 20
615 static bool store_hierarchy(char *stridx, char *h)
616 {
617 if (num_hierarchies % ALLOC_NUM == 0) {
618 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
619 n *= ALLOC_NUM;
620 char **tmp = realloc(hierarchies, n * sizeof(char *));
621 if (!tmp) {
622 lxcfs_error("%s\n", strerror(errno));
623 exit(1);
624 }
625 hierarchies = tmp;
626 }
627
628 hierarchies[num_hierarchies++] = must_copy_string(h);
629 return true;
630 }
631
632 static void print_subsystems(void)
633 {
634 int i;
635
636 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
637 fprintf(stderr, "hierarchies:\n");
638 for (i = 0; i < num_hierarchies; i++) {
639 if (hierarchies[i])
640 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
641 fd_hierarchies[i], hierarchies[i]);
642 }
643 }
644
645 static bool in_comma_list(const char *needle, const char *haystack)
646 {
647 const char *s = haystack, *e;
648 size_t nlen = strlen(needle);
649
650 while (*s && (e = strchr(s, ','))) {
651 if (nlen != e - s) {
652 s = e + 1;
653 continue;
654 }
655 if (strncmp(needle, s, nlen) == 0)
656 return true;
657 s = e + 1;
658 }
659 if (strcmp(needle, s) == 0)
660 return true;
661 return false;
662 }
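/* Examples:
 *   in_comma_list("cpu", "cpu,cpuacct")      -> true  (first element)
 *   in_comma_list("cpuacct", "cpu,cpuacct")  -> true  (last element)
 *   in_comma_list("cpuset", "cpu,cpuacct")   -> false (no element is an
 *                                                      exact match)
 */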
663
664 /* do we need to do any massaging here? I'm not sure... */
665 /* Return the mounted controller and store the corresponding open file descriptor
666 * referring to the controller mountpoint in the private lxcfs namespace in
667 * @cfd.
668 */
669 static char *find_mounted_controller(const char *controller, int *cfd)
670 {
671 int i;
672
673 for (i = 0; i < num_hierarchies; i++) {
674 if (!hierarchies[i])
675 continue;
676 if (strcmp(hierarchies[i], controller) == 0) {
677 *cfd = fd_hierarchies[i];
678 return hierarchies[i];
679 }
680 if (in_comma_list(controller, hierarchies[i])) {
681 *cfd = fd_hierarchies[i];
682 return hierarchies[i];
683 }
684 }
685
686 return NULL;
687 }
688
689 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
690 const char *value)
691 {
692 int ret, fd, cfd;
693 size_t len;
694 char *fnam, *tmpc;
695
696 tmpc = find_mounted_controller(controller, &cfd);
697 if (!tmpc)
698 return false;
699
700 /* Make sure we pass a relative path to *at() family of functions.
701 * . + /cgroup + / + file + \0
702 */
703 len = strlen(cgroup) + strlen(file) + 3;
704 fnam = alloca(len);
705 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
706 if (ret < 0 || (size_t)ret >= len)
707 return false;
708
709 fd = openat(cfd, fnam, O_WRONLY);
710 if (fd < 0)
711 return false;
712
713 return write_string(fnam, value, fd);
714 }
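/* Example (the cgroup path "/lxc/c1" here is purely hypothetical):
 *
 *   if (!cgfs_set_value("memory", "/lxc/c1", "memory.limit_in_bytes",
 *                       "536870912"))
 *       lxcfs_error("%s\n", "Failed to set the memory limit");
 *
 * The controller must be one of the hierarchies known to
 * find_mounted_controller(); the cgroup path is resolved relative to
 * that hierarchy's mountpoint in the private mount namespace.
 */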
715
716 // Chown all the files in the cgroup directory. We do this when we create
717 // a cgroup on behalf of a user.
718 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
719 {
720 struct dirent *direntp;
721 char path[MAXPATHLEN];
722 size_t len;
723 DIR *d;
724 int fd1, ret;
725
726 len = strlen(dirname);
727 if (len >= MAXPATHLEN) {
728 lxcfs_error("Pathname too long: %s\n", dirname);
729 return;
730 }
731
732 fd1 = openat(fd, dirname, O_DIRECTORY);
733 if (fd1 < 0)
734 return;
735
736 d = fdopendir(fd1);
737 if (!d) {
738 lxcfs_error("Failed to open %s\n", dirname);
739 return;
740 }
741
742 while ((direntp = readdir(d))) {
743 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
744 continue;
745 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
746 if (ret < 0 || ret >= MAXPATHLEN) {
747 lxcfs_error("Pathname too long under %s\n", dirname);
748 continue;
749 }
750 if (fchownat(fd, path, uid, gid, 0) < 0)
751 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
752 }
753 closedir(d);
754 }
755
756 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
757 {
758 int cfd;
759 size_t len;
760 char *dirnam, *tmpc;
761
762 tmpc = find_mounted_controller(controller, &cfd);
763 if (!tmpc)
764 return -EINVAL;
765
766 /* Make sure we pass a relative path to *at() family of functions.
767 * . + /cg + \0
768 */
769 len = strlen(cg) + 2;
770 dirnam = alloca(len);
771 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
772
773 if (mkdirat(cfd, dirnam, 0755) < 0)
774 return -errno;
775
776 if (uid == 0 && gid == 0)
777 return 0;
778
779 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
780 return -errno;
781
782 chown_all_cgroup_files(dirnam, uid, gid, cfd);
783
784 return 0;
785 }
786
787 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
788 {
789 struct dirent *direntp;
790 DIR *dir;
791 bool ret = false;
792 char pathname[MAXPATHLEN];
793 int dupfd;
794
795 dupfd = dup(fd); // fdopendir() takes ownership of the fd it is given, so work on a duplicate.
796 if (dupfd < 0)
797 return false;
798
799 dir = fdopendir(dupfd);
800 if (!dir) {
801 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
802 close(dupfd);
803 return false;
804 }
805
806 while ((direntp = readdir(dir))) {
807 struct stat mystat;
808 int rc;
809
810 if (!strcmp(direntp->d_name, ".") ||
811 !strcmp(direntp->d_name, ".."))
812 continue;
813
814 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
815 if (rc < 0 || rc >= MAXPATHLEN) {
816 lxcfs_error("%s\n", "Pathname too long.");
817 continue;
818 }
819
820 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
821 if (rc) {
822 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
823 continue;
824 }
825 if (S_ISDIR(mystat.st_mode))
826 if (!recursive_rmdir(pathname, fd, cfd))
827 lxcfs_debug("Error removing %s.\n", pathname);
828 }
829
830 ret = true;
831 if (closedir(dir) < 0) {
832 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
833 ret = false;
834 }
835
836 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
837 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
838 ret = false;
839 }
840
841 close(dupfd);
842
843 return ret;
844 }
845
846 bool cgfs_remove(const char *controller, const char *cg)
847 {
848 int fd, cfd;
849 size_t len;
850 char *dirnam, *tmpc;
851 bool bret;
852
853 tmpc = find_mounted_controller(controller, &cfd);
854 if (!tmpc)
855 return false;
856
857 /* Make sure we pass a relative path to *at() family of functions.
858 * . + /cg + \0
859 */
860 len = strlen(cg) + 2;
861 dirnam = alloca(len);
862 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
863
864 fd = openat(cfd, dirnam, O_DIRECTORY);
865 if (fd < 0)
866 return false;
867
868 bret = recursive_rmdir(dirnam, fd, cfd);
869 close(fd);
870 return bret;
871 }
872
873 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
874 {
875 int cfd;
876 size_t len;
877 char *pathname, *tmpc;
878
879 tmpc = find_mounted_controller(controller, &cfd);
880 if (!tmpc)
881 return false;
882
883 /* Make sure we pass a relative path to *at() family of functions.
884 * . + /file + \0
885 */
886 len = strlen(file) + 2;
887 pathname = alloca(len);
888 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
889 if (fchmodat(cfd, pathname, mode, 0) < 0)
890 return false;
891 return true;
892 }
893
894 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
895 {
896 size_t len;
897 char *fname;
898
899 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
900 fname = alloca(len);
901 snprintf(fname, len, "%s/tasks", dirname);
902 if (fchownat(fd, fname, uid, gid, 0) != 0)
903 return -errno;
904 snprintf(fname, len, "%s/cgroup.procs", dirname);
905 if (fchownat(fd, fname, uid, gid, 0) != 0)
906 return -errno;
907 return 0;
908 }
909
910 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
911 {
912 int cfd;
913 size_t len;
914 char *pathname, *tmpc;
915
916 tmpc = find_mounted_controller(controller, &cfd);
917 if (!tmpc)
918 return -EINVAL;
919
920 /* Make sure we pass a relative path to *at() family of functions.
921 * . + /file + \0
922 */
923 len = strlen(file) + 2;
924 pathname = alloca(len);
925 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
926 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
927 return -errno;
928
929 if (is_dir(pathname, cfd))
930 // like cgmanager did, we want to chown the tasks file as well
931 return chown_tasks_files(pathname, uid, gid, cfd);
932
933 return 0;
934 }
935
936 FILE *open_pids_file(const char *controller, const char *cgroup)
937 {
938 int fd, cfd;
939 size_t len;
940 char *pathname, *tmpc;
941
942 tmpc = find_mounted_controller(controller, &cfd);
943 if (!tmpc)
944 return NULL;
945
946 /* Make sure we pass a relative path to *at() family of functions.
947 * . + /cgroup + / "cgroup.procs" + \0
948 */
949 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
950 pathname = alloca(len);
951 snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
952
953 fd = openat(cfd, pathname, O_WRONLY);
954 if (fd < 0)
955 return NULL;
956
957 return fdopen(fd, "w");
958 }
959
960 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
961 void ***list, size_t typesize,
962 void* (*iterator)(const char*, const char*, const char*))
963 {
964 int cfd, fd, ret;
965 size_t len;
966 char *cg, *tmpc;
967 char pathname[MAXPATHLEN];
968 size_t sz = 0, asz = 0;
969 struct dirent *dirent;
970 DIR *dir;
971
972 tmpc = find_mounted_controller(controller, &cfd);
973 *list = NULL;
974 if (!tmpc)
975 return false;
976
977 /* Make sure we pass a relative path to *at() family of functions. */
978 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
979 cg = alloca(len);
980 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
981 if (ret < 0 || (size_t)ret >= len) {
982 lxcfs_error("Pathname too long under %s\n", cgroup);
983 return false;
984 }
985
986 fd = openat(cfd, cg, O_DIRECTORY);
987 if (fd < 0)
988 return false;
989
990 dir = fdopendir(fd);
991 if (!dir)
992 return false;
993
994 while ((dirent = readdir(dir))) {
995 struct stat mystat;
996
997 if (!strcmp(dirent->d_name, ".") ||
998 !strcmp(dirent->d_name, ".."))
999 continue;
1000
1001 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1002 if (ret < 0 || ret >= MAXPATHLEN) {
1003 lxcfs_error("Pathname too long under %s\n", cg);
1004 continue;
1005 }
1006
1007 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1008 if (ret) {
1009 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1010 continue;
1011 }
1012 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1013 (directories && !S_ISDIR(mystat.st_mode)))
1014 continue;
1015
1016 if (sz+2 >= asz) {
1017 void **tmp;
1018 asz += BATCH_SIZE;
1019 do {
1020 tmp = realloc(*list, asz * typesize);
1021 } while (!tmp);
1022 *list = tmp;
1023 }
1024 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1025 (*list)[sz+1] = NULL;
1026 sz++;
1027 }
1028 if (closedir(dir) < 0) {
1029 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1030 return false;
1031 }
1032 return true;
1033 }
1034
1035 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1036 {
1037 char *dup;
1038 do {
1039 dup = strdup(dir_entry);
1040 } while (!dup);
1041 return dup;
1042 }
1043
1044 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1045 {
1046 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1047 }
1048
1049 void free_key(struct cgfs_files *k)
1050 {
1051 if (!k)
1052 return;
1053 free(k->name);
1054 free(k);
1055 }
1056
1057 void free_keys(struct cgfs_files **keys)
1058 {
1059 int i;
1060
1061 if (!keys)
1062 return;
1063 for (i = 0; keys[i]; i++) {
1064 free_key(keys[i]);
1065 }
1066 free(keys);
1067 }
1068
1069 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
1070 {
1071 int ret, fd, cfd;
1072 size_t len;
1073 char *fnam, *tmpc;
1074
1075 tmpc = find_mounted_controller(controller, &cfd);
1076 if (!tmpc)
1077 return false;
1078
1079 /* Make sure we pass a relative path to *at() family of functions.
1080 * . + /cgroup + / + file + \0
1081 */
1082 len = strlen(cgroup) + strlen(file) + 3;
1083 fnam = alloca(len);
1084 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1085 if (ret < 0 || (size_t)ret >= len)
1086 return false;
1087
1088 fd = openat(cfd, fnam, O_RDONLY);
1089 if (fd < 0)
1090 return false;
1091
1092 *value = slurp_file(fnam, fd);
1093 return *value != NULL;
1094 }
1095
1096 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1097 {
1098 int ret, cfd;
1099 size_t len;
1100 char *fnam, *tmpc;
1101 struct stat sb;
1102 struct cgfs_files *newkey;
1103
1104 tmpc = find_mounted_controller(controller, &cfd);
1105 if (!tmpc)
1106 return false;
1107
1108 if (file && *file == '/')
1109 file++;
1110
1111 if (file && strchr(file, '/'))
1112 return NULL;
1113
1114 /* Make sure we pass a relative path to *at() family of functions.
1115 * . + /cgroup + / + file + \0
1116 */
1117 len = strlen(cgroup) + 3;
1118 if (file)
1119 len += strlen(file) + 1;
1120 fnam = alloca(len);
1121 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1122 file ? "/" : "", file ? file : "");
1123
1124 ret = fstatat(cfd, fnam, &sb, 0);
1125 if (ret < 0)
1126 return NULL;
1127
1128 do {
1129 newkey = malloc(sizeof(struct cgfs_files));
1130 } while (!newkey);
1131 if (file)
1132 newkey->name = must_copy_string(file);
1133 else if (strrchr(cgroup, '/'))
1134 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1135 else
1136 newkey->name = must_copy_string(cgroup);
1137 newkey->uid = sb.st_uid;
1138 newkey->gid = sb.st_gid;
1139 newkey->mode = sb.st_mode;
1140
1141 return newkey;
1142 }
1143
1144 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1145 {
1146 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1147 if (!entry) {
1148 lxcfs_error("Error getting files under %s:%s\n", controller,
1149 cgroup);
1150 }
1151 return entry;
1152 }
1153
1154 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1155 {
1156 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1157 }
1158
1159 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1160 {
1161 int cfd;
1162 size_t len;
1163 char *fnam, *tmpc;
1164 int ret;
1165 struct stat sb;
1166
1167 tmpc = find_mounted_controller(controller, &cfd);
1168 if (!tmpc)
1169 return false;
1170
1171 /* Make sure we pass a relative path to *at() family of functions.
1172 * . + /cgroup + / + f + \0
1173 */
1174 len = strlen(cgroup) + strlen(f) + 3;
1175 fnam = alloca(len);
1176 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
1177 if (ret < 0 || (size_t)ret >= len)
1178 return false;
1179
1180 ret = fstatat(cfd, fnam, &sb, 0);
1181 if (ret < 0 || !S_ISDIR(sb.st_mode))
1182 return false;
1183
1184 return true;
1185 }
1186
1187 #define SEND_CREDS_OK 0
1188 #define SEND_CREDS_NOTSK 1
1189 #define SEND_CREDS_FAIL 2
1190 static bool recv_creds(int sock, struct ucred *cred, char *v);
1191 static int wait_for_pid(pid_t pid);
1192 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1193 static int send_creds_clone_wrapper(void *arg);
1194
1195 /*
1196 * Clone a task which switches to @task's namespace and writes '1'
1197 * over a unix socket so we can read the task's reaper's pid in our
1198 * own namespace.
1199 *
1200 * Note: glibc's fork() does not respect pidns, which can lead to failed
1201 * assertions inside glibc (and thus failed forks) if the child's pid in
1202 * the pidns and the parent pid outside are identical. Using clone prevents
1203 * this issue.
1204 */
1205 static void write_task_init_pid_exit(int sock, pid_t target)
1206 {
1207 char fnam[100];
1208 pid_t pid;
1209 int fd, ret;
1210 size_t stack_size = sysconf(_SC_PAGESIZE);
1211 void *stack = alloca(stack_size);
1212
1213 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1214 if (ret < 0 || ret >= sizeof(fnam))
1215 _exit(1);
1216
1217 fd = open(fnam, O_RDONLY);
1218 if (fd < 0) {
1219 perror("write_task_init_pid_exit open of ns/pid");
1220 _exit(1);
1221 }
1222 if (setns(fd, 0)) {
1223 perror("write_task_init_pid_exit setns 1");
1224 close(fd);
1225 _exit(1);
1226 }
1227 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1228 if (pid < 0)
1229 _exit(1);
1230 if (pid != 0) {
1231 if (!wait_for_pid(pid))
1232 _exit(1);
1233 _exit(0);
1234 }
1235 }
1236
1237 static int send_creds_clone_wrapper(void *arg) {
1238 struct ucred cred;
1239 char v;
1240 int sock = *(int *)arg;
1241
1242 /* we are the child */
1243 cred.uid = 0;
1244 cred.gid = 0;
1245 cred.pid = 1;
1246 v = '1';
1247 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1248 return 1;
1249 return 0;
1250 }
1251
1252 static pid_t get_init_pid_for_task(pid_t task)
1253 {
1254 int sock[2];
1255 pid_t pid;
1256 pid_t ret = -1;
1257 char v = '0';
1258 struct ucred cred;
1259
1260 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1261 perror("socketpair");
1262 return -1;
1263 }
1264
1265 pid = fork();
1266 if (pid < 0)
1267 goto out;
1268 if (!pid) {
1269 close(sock[1]);
1270 write_task_init_pid_exit(sock[0], task);
1271 _exit(0);
1272 }
1273
1274 if (!recv_creds(sock[1], &cred, &v))
1275 goto out;
1276 ret = cred.pid;
1277
1278 out:
1279 close(sock[0]);
1280 close(sock[1]);
1281 if (pid > 0)
1282 wait_for_pid(pid);
1283 return ret;
1284 }
1285
1286 static pid_t lookup_initpid_in_store(pid_t qpid)
1287 {
1288 pid_t answer = 0;
1289 struct stat sb;
1290 struct pidns_init_store *e;
1291 char fnam[100];
1292
1293 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1294 store_lock();
1295 if (stat(fnam, &sb) < 0)
1296 goto out;
1297 e = lookup_verify_initpid(&sb);
1298 if (e) {
1299 answer = e->initpid;
1300 goto out;
1301 }
1302 answer = get_init_pid_for_task(qpid);
1303 if (answer > 0)
1304 save_initpid(&sb, answer);
1305
1306 out:
1307 /* Prune only at the end so that pruning cannot invalidate the
1308 * entry whose value we are about to return. */
1309 prune_initpid_store();
1310 store_unlock();
1311 return answer;
1312 }
1313
1314 static int wait_for_pid(pid_t pid)
1315 {
1316 int status, ret;
1317
1318 if (pid <= 0)
1319 return -1;
1320
1321 again:
1322 ret = waitpid(pid, &status, 0);
1323 if (ret == -1) {
1324 if (errno == EINTR)
1325 goto again;
1326 return -1;
1327 }
1328 if (ret != pid)
1329 goto again;
1330 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1331 return -1;
1332 return 0;
1333 }
1334
1335
1336 /*
1337 * append pid to *src.
1338 * src: a pointer to a char* in which to append the pid.
1339 * sz: the number of characters printed so far, minus trailing \0.
1340 * asz: the allocated size so far
1341 * pid: the pid to append
1342 */
1343 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1344 {
1345 char tmp[30];
1346
1347 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1348
1349 if (!*src || tmplen + *sz + 1 >= *asz) {
1350 char *tmp;
1351 do {
1352 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1353 } while (!tmp);
1354 *src = tmp;
1355 *asz += BUF_RESERVE_SIZE;
1356 }
1357 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1358 *sz += tmplen;
1359 }
1360
1361 /*
1362 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
1363 * valid in the caller's namespace, return the id mapped into
1364 * pid's namespace.
1365 * Returns the mapped id, or -1 on error.
1366 */
1367 unsigned int
1368 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1369 {
1370 unsigned int nsuid, // base id for a range in the idfile's namespace
1371 hostuid, // base id for a range in the caller's namespace
1372 count; // number of ids in this range
1373 char line[400];
1374 int ret;
1375
1376 fseek(idfile, 0L, SEEK_SET);
1377 while (fgets(line, 400, idfile)) {
1378 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1379 if (ret != 3)
1380 continue;
1381 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1382 /*
1383 * uids wrapped around - unexpected as this is a procfile,
1384 * so just bail.
1385 */
1386 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1387 nsuid, hostuid, count, line);
1388 return -1;
1389 }
1390 if (hostuid <= in_id && hostuid+count > in_id) {
1391 /*
1392 * now since hostuid <= in_id < hostuid+count, and
1393 * hostuid+count and nsuid+count do not wrap around,
1394 * we know that nsuid+(in_id-hostuid), which is less than
1395 * nsuid+count, does not wrap around either.
1396 */
1397 return (in_id - hostuid) + nsuid;
1398 }
1399 }
1400
1401 // no answer found
1402 return -1;
1403 }
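/* Worked example: with a uid_map containing the single line
 *
 *   0     100000      65536
 *
 * (namespace ids 0..65535 map to host ids 100000..165535), passing
 * in_id = 100001 returns (100001 - 100000) + 0 = 1, while an unmapped
 * host id such as 1000 falls through the loop and yields -1.
 */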
1404
1405 /*
1406 * For is_privileged_over():
1407 * specify whether the calling uid is required to be root in its
1408 * own namespace.
1409 */
1410 #define NS_ROOT_REQD true
1411 #define NS_ROOT_OPT false
1412
1413 #define PROCLEN 100
1414
1415 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1416 {
1417 char fpath[PROCLEN];
1418 int ret;
1419 bool answer = false;
1420 uid_t nsuid;
1421
1422 if (victim == -1 || uid == -1)
1423 return false;
1424
1425 /*
1426 * If the request is one not requiring root in the namespace,
1427 * then having the same uid suffices (i.e. uid 1000 has write
1428 * access to files owned by uid 1000).
1429 */
1430 if (!req_ns_root && uid == victim)
1431 return true;
1432
1433 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1434 if (ret < 0 || ret >= PROCLEN)
1435 return false;
1436 FILE *f = fopen(fpath, "r");
1437 if (!f)
1438 return false;
1439
1440 /* if caller's not root in his namespace, reject */
1441 nsuid = convert_id_to_ns(f, uid);
1442 if (nsuid)
1443 goto out;
1444
1445 /*
1446 * If victim is not mapped into caller's ns, reject.
1447 * XXX I'm not sure this check is needed given that fuse
1448 * will be sending requests where the vfs has converted
1449 */
1450 nsuid = convert_id_to_ns(f, victim);
1451 if (nsuid == -1)
1452 goto out;
1453
1454 answer = true;
1455
1456 out:
1457 fclose(f);
1458 return answer;
1459 }
1460
1461 static bool perms_include(int fmode, mode_t req_mode)
1462 {
1463 mode_t r;
1464
1465 switch (req_mode & O_ACCMODE) {
1466 case O_RDONLY:
1467 r = S_IROTH;
1468 break;
1469 case O_WRONLY:
1470 r = S_IWOTH;
1471 break;
1472 case O_RDWR:
1473 r = S_IROTH | S_IWOTH;
1474 break;
1475 default:
1476 return false;
1477 }
1478 return ((fmode & r) == r);
1479 }
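/* Example: for a key with mode 0640 and an O_RDONLY request,
 *   perms_include(0640 >> 6, O_RDONLY) -> true  (owner bits contain r)
 *   perms_include(0640 >> 3, O_RDONLY) -> true  (group bits contain r)
 *   perms_include(0640, O_RDONLY)      -> false (no read bit for other)
 * fc_may_access() below applies these three checks in that order.
 */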
1480
1481
1482 /*
1483 * taskcg is /a/b/c/d/e
1484 * querycg is /a/b/c
1485 * we return 'd'
1486 */
1487 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1488 {
1489 char *start, *end;
1490
1491 if (strlen(taskcg) <= strlen(querycg)) {
1492 lxcfs_error("%s\n", "I was fed bad input.");
1493 return NULL;
1494 }
1495
1496 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1497 start = strdup(taskcg + 1);
1498 else
1499 start = strdup(taskcg + strlen(querycg) + 1);
1500 if (!start)
1501 return NULL;
1502 end = strchr(start, '/');
1503 if (end)
1504 *end = '\0';
1505 return start;
1506 }
1507
1508 static void stripnewline(char *x)
1509 {
1510 size_t l = strlen(x);
1511 if (l && x[l-1] == '\n')
1512 x[l-1] = '\0';
1513 }
1514
1515 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1516 {
1517 int cfd;
1518 char fnam[PROCLEN];
1519 FILE *f;
1520 char *answer = NULL;
1521 char *line = NULL;
1522 size_t len = 0;
1523 int ret;
1524 const char *h = find_mounted_controller(contrl, &cfd);
1525 if (!h)
1526 return NULL;
1527
1528 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1529 if (ret < 0 || ret >= PROCLEN)
1530 return NULL;
1531 if (!(f = fopen(fnam, "r")))
1532 return NULL;
1533
1534 while (getline(&line, &len, f) != -1) {
1535 char *c1, *c2;
1536 if (!line[0])
1537 continue;
1538 c1 = strchr(line, ':');
1539 if (!c1)
1540 goto out;
1541 c1++;
1542 c2 = strchr(c1, ':');
1543 if (!c2)
1544 goto out;
1545 *c2 = '\0';
1546 if (strcmp(c1, h) != 0)
1547 continue;
1548 c2++;
1549 stripnewline(c2);
1550 do {
1551 answer = strdup(c2);
1552 } while (!answer);
1553 break;
1554 }
1555
1556 out:
1557 fclose(f);
1558 free(line);
1559 return answer;
1560 }
1561
1562 /*
1563 * check whether a fuse context may access a cgroup dir or file
1564 *
1565 * If file is not null, it is a cgroup file to check under cg.
1566 * If file is null, then we are checking perms on cg itself.
1567 *
1568 * For files we can check the mode of the list_keys result.
1569 * For cgroups, we must make assumptions based on the files under the
1570 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1571 * yet.
1572 */
1573 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1574 {
1575 struct cgfs_files *k = NULL;
1576 bool ret = false;
1577
1578 k = cgfs_get_key(contrl, cg, file);
1579 if (!k)
1580 return false;
1581
1582 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1583 if (perms_include(k->mode >> 6, mode)) {
1584 ret = true;
1585 goto out;
1586 }
1587 }
1588 if (fc->gid == k->gid) {
1589 if (perms_include(k->mode >> 3, mode)) {
1590 ret = true;
1591 goto out;
1592 }
1593 }
1594 ret = perms_include(k->mode, mode);
1595
1596 out:
1597 free_key(k);
1598 return ret;
1599 }
1600
1601 #define INITSCOPE "/init.scope"
1602 static void prune_init_slice(char *cg)
1603 {
1604 char *point;
1605 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1606
1607 if (cg_len < initscope_len)
1608 return;
1609
1610 point = cg + cg_len - initscope_len;
1611 if (strcmp(point, INITSCOPE) == 0) {
1612 if (point == cg)
1613 *(point+1) = '\0';
1614 else
1615 *point = '\0';
1616 }
1617 }
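/* Example: "/user.slice/init.scope" is trimmed to "/user.slice", a
 * cgroup that is exactly "/init.scope" becomes "/" (the root cgroup),
 * and anything not ending in "/init.scope" is left untouched.
 */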
1618
1619 /*
1620 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1621 * If pid is in /a, he may act on /a/b, but not on /b.
1622 * if the answer is false and nextcg is not NULL, then *nextcg will point
1623 * to a string containing the next cgroup directory under cg, which must be
1624 * freed by the caller.
1625 */
1626 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1627 {
1628 bool answer = false;
1629 char *c2 = get_pid_cgroup(pid, contrl);
1630 char *linecmp;
1631
1632 if (!c2)
1633 return false;
1634 prune_init_slice(c2);
1635
1636 /*
1637 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1638 * they pass in a cgroup without leading '/'
1639 *
1640 * The original line here was:
1641 * linecmp = *cg == '/' ? c2 : c2+1;
1642 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1643 * Serge, do you know?
1644 */
1645 if (*cg == '/' || !strncmp(cg, "./", 2))
1646 linecmp = c2;
1647 else
1648 linecmp = c2 + 1;
1649 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1650 if (nextcg) {
1651 *nextcg = get_next_cgroup_dir(linecmp, cg);
1652 }
1653 goto out;
1654 }
1655 answer = true;
1656
1657 out:
1658 free(c2);
1659 return answer;
1660 }
1661
1662 /*
1663 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1664 */
1665 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1666 {
1667 bool answer = false;
1668 char *c2, *task_cg;
1669 size_t target_len, task_len;
1670
1671 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1672 return true;
1673
1674 c2 = get_pid_cgroup(pid, contrl);
1675 if (!c2)
1676 return false;
1677 prune_init_slice(c2);
1678
1679 task_cg = c2 + 1;
1680 target_len = strlen(cg);
1681 task_len = strlen(task_cg);
1682 if (task_len == 0) {
1683 /* Task is in the root cg, it can see everything. This case is
1684 * not handled by the strcmps below, since they test for the
1685 * last /, but that is the first / that we've chopped off
1686 * above.
1687 */
1688 answer = true;
1689 goto out;
1690 }
1691 if (strcmp(cg, task_cg) == 0) {
1692 answer = true;
1693 goto out;
1694 }
1695 if (target_len < task_len) {
1696 /* looking up a parent dir */
1697 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1698 answer = true;
1699 goto out;
1700 }
1701 if (target_len > task_len) {
1702 /* looking up a child dir */
1703 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1704 answer = true;
1705 goto out;
1706 }
1707
1708 out:
1709 free(c2);
1710 return answer;
1711 }
1712
1713 /*
1714 * given /cgroup/freezer/a/b, return "freezer".
1715 * the returned char* should NOT be freed.
1716 */
1717 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1718 {
1719 const char *p1;
1720 char *contr, *slash;
1721
1722 if (strlen(path) < 9) {
1723 errno = EACCES;
1724 return NULL;
1725 }
1726 if (*(path + 7) != '/') {
1727 errno = EINVAL;
1728 return NULL;
1729 }
1730 p1 = path + 8;
1731 contr = strdupa(p1);
1732 if (!contr) {
1733 errno = ENOMEM;
1734 return NULL;
1735 }
1736 slash = strstr(contr, "/");
1737 if (slash)
1738 *slash = '\0';
1739
1740 int i;
1741 for (i = 0; i < num_hierarchies; i++) {
1742 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1743 return hierarchies[i];
1744 }
1745 errno = ENOENT;
1746 return NULL;
1747 }
1748
1749 /*
1750 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1751 * Note that the returned value may include files (keynames) etc
1752 */
1753 static const char *find_cgroup_in_path(const char *path)
1754 {
1755 const char *p1;
1756
1757 if (strlen(path) < 9) {
1758 errno = EACCES;
1759 return NULL;
1760 }
1761 p1 = strstr(path + 8, "/");
1762 if (!p1) {
1763 errno = EINVAL;
1764 return NULL;
1765 }
1766 errno = 0;
1767 return p1 + 1;
1768 }
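/* Example: for "/cgroup/freezer/a/b/tasks" this returns "a/b/tasks";
 * the trailing keyname stays in the result, and callers such as
 * cg_getattr() split it off with get_cgdir_and_path() when needed.
 */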
1769
1770 /*
1771 * split the last path element from the path in @cg.
1772 * @dir is newly allocated and should be freed, @last not
1773 */
1774 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1775 {
1776 char *p;
1777
1778 do {
1779 *dir = strdup(cg);
1780 } while (!*dir);
1781 *last = strrchr(cg, '/');
1782 if (!*last) {
1783 *last = NULL;
1784 return;
1785 }
1786 p = strrchr(*dir, '/');
1787 *p = '\0';
1788 }
1789
1790 /*
1791 * FUSE ops for /cgroup
1792 */
1793
1794 int cg_getattr(const char *path, struct stat *sb)
1795 {
1796 struct timespec now;
1797 struct fuse_context *fc = fuse_get_context();
1798 char * cgdir = NULL;
1799 char *last = NULL, *path1, *path2;
1800 struct cgfs_files *k = NULL;
1801 const char *cgroup;
1802 const char *controller = NULL;
1803 int ret = -ENOENT;
1804
1805
1806 if (!fc)
1807 return -EIO;
1808
1809 memset(sb, 0, sizeof(struct stat));
1810
1811 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1812 return -EINVAL;
1813
1814 sb->st_uid = sb->st_gid = 0;
1815 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1816 sb->st_size = 0;
1817
1818 if (strcmp(path, "/cgroup") == 0) {
1819 sb->st_mode = S_IFDIR | 00755;
1820 sb->st_nlink = 2;
1821 return 0;
1822 }
1823
1824 controller = pick_controller_from_path(fc, path);
1825 if (!controller)
1826 return -errno;
1827 cgroup = find_cgroup_in_path(path);
1828 if (!cgroup) {
1829 /* this is just /cgroup/controller, return it as a dir */
1830 sb->st_mode = S_IFDIR | 00755;
1831 sb->st_nlink = 2;
1832 return 0;
1833 }
1834
1835 get_cgdir_and_path(cgroup, &cgdir, &last);
1836
1837 if (!last) {
1838 path1 = "/";
1839 path2 = cgdir;
1840 } else {
1841 path1 = cgdir;
1842 path2 = last;
1843 }
1844
1845 pid_t initpid = lookup_initpid_in_store(fc->pid);
1846 if (initpid <= 0)
1847 initpid = fc->pid;
1848 /* Check that path2 is either a child cgroup of path1, or one of its
1849 * keys. Then check that the caller's cgroup is under path if last is a
1850 * child cgroup, or under cgdir if last is a file. */
1851
1852 if (is_child_cgroup(controller, path1, path2)) {
1853 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1854 ret = -ENOENT;
1855 goto out;
1856 }
1857 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1858 /* this is just /cgroup/controller, return it as a dir */
1859 sb->st_mode = S_IFDIR | 00555;
1860 sb->st_nlink = 2;
1861 ret = 0;
1862 goto out;
1863 }
1864 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1865 ret = -EACCES;
1866 goto out;
1867 }
1868
1869 // get uid, gid, from '/tasks' file and make up a mode
1870 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1871 sb->st_mode = S_IFDIR | 00755;
1872 k = cgfs_get_key(controller, cgroup, NULL);
1873 if (!k) {
1874 sb->st_uid = sb->st_gid = 0;
1875 } else {
1876 sb->st_uid = k->uid;
1877 sb->st_gid = k->gid;
1878 }
1879 free_key(k);
1880 sb->st_nlink = 2;
1881 ret = 0;
1882 goto out;
1883 }
1884
1885 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1886 sb->st_mode = S_IFREG | k->mode;
1887 sb->st_nlink = 1;
1888 sb->st_uid = k->uid;
1889 sb->st_gid = k->gid;
1890 sb->st_size = 0;
1891 free_key(k);
1892 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1893 ret = -ENOENT;
1894 goto out;
1895 }
1896 ret = 0;
1897 }
1898
1899 out:
1900 free(cgdir);
1901 return ret;
1902 }
1903
1904 int cg_opendir(const char *path, struct fuse_file_info *fi)
1905 {
1906 struct fuse_context *fc = fuse_get_context();
1907 const char *cgroup;
1908 struct file_info *dir_info;
1909 char *controller = NULL;
1910
1911 if (!fc)
1912 return -EIO;
1913
1914 if (strcmp(path, "/cgroup") == 0) {
1915 cgroup = NULL;
1916 controller = NULL;
1917 } else {
1918 // return list of keys for the controller, and list of child cgroups
1919 controller = pick_controller_from_path(fc, path);
1920 if (!controller)
1921 return -errno;
1922
1923 cgroup = find_cgroup_in_path(path);
1924 if (!cgroup) {
1925 /* this is just /cgroup/controller, return its contents */
1926 cgroup = "/";
1927 }
1928 }
1929
1930 pid_t initpid = lookup_initpid_in_store(fc->pid);
1931 if (initpid <= 0)
1932 initpid = fc->pid;
1933 if (cgroup) {
1934 if (!caller_may_see_dir(initpid, controller, cgroup))
1935 return -ENOENT;
1936 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1937 return -EACCES;
1938 }
1939
1940 /* we'll free this at cg_releasedir */
1941 dir_info = malloc(sizeof(*dir_info));
1942 if (!dir_info)
1943 return -ENOMEM;
1944 dir_info->controller = must_copy_string(controller);
1945 dir_info->cgroup = must_copy_string(cgroup);
1946 dir_info->type = LXC_TYPE_CGDIR;
1947 dir_info->buf = NULL;
1948 dir_info->file = NULL;
1949 dir_info->buflen = 0;
1950
1951 fi->fh = (unsigned long)dir_info;
1952 return 0;
1953 }
1954
1955 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1956 struct fuse_file_info *fi)
1957 {
1958 struct file_info *d = (struct file_info *)fi->fh;
1959 struct cgfs_files **list = NULL;
1960 int i, ret;
1961 char *nextcg = NULL;
1962 struct fuse_context *fc = fuse_get_context();
1963 char **clist = NULL;
1964
1965 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1966 return -EIO;
1967
1968 if (d->type != LXC_TYPE_CGDIR) {
1969 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1970 return -EIO;
1971 }
1972 if (!d->cgroup && !d->controller) {
1973 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1974 int i;
1975
1976 for (i = 0; i < num_hierarchies; i++) {
1977 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1978 return -EIO;
1979 }
1980 }
1981 return 0;
1982 }
1983
1984 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1985 // not a valid cgroup
1986 ret = -EINVAL;
1987 goto out;
1988 }
1989
1990 pid_t initpid = lookup_initpid_in_store(fc->pid);
1991 if (initpid <= 0)
1992 initpid = fc->pid;
1993 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1994 if (nextcg) {
1995 ret = filler(buf, nextcg, NULL, 0);
1996 free(nextcg);
1997 if (ret != 0) {
1998 ret = -EIO;
1999 goto out;
2000 }
2001 }
2002 ret = 0;
2003 goto out;
2004 }
2005
2006 for (i = 0; list[i]; i++) {
2007 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2008 ret = -EIO;
2009 goto out;
2010 }
2011 }
2012
2013 // now get the list of child cgroups
2014
2015 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2016 ret = 0;
2017 goto out;
2018 }
2019 if (clist) {
2020 for (i = 0; clist[i]; i++) {
2021 if (filler(buf, clist[i], NULL, 0) != 0) {
2022 ret = -EIO;
2023 goto out;
2024 }
2025 }
2026 }
2027 ret = 0;
2028
2029 out:
2030 free_keys(list);
2031 if (clist) {
2032 for (i = 0; clist[i]; i++)
2033 free(clist[i]);
2034 free(clist);
2035 }
2036 return ret;
2037 }
2038
2039 static void do_release_file_info(struct fuse_file_info *fi)
2040 {
2041 struct file_info *f = (struct file_info *)fi->fh;
2042
2043 if (!f)
2044 return;
2045
2046 fi->fh = 0;
2047
2048 free(f->controller);
2049 f->controller = NULL;
2050 free(f->cgroup);
2051 f->cgroup = NULL;
2052 free(f->file);
2053 f->file = NULL;
2054 free(f->buf);
2055 f->buf = NULL;
2056 free(f);
2057 }
2058
2059 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2060 {
2061 do_release_file_info(fi);
2062 return 0;
2063 }
2064
2065 int cg_open(const char *path, struct fuse_file_info *fi)
2066 {
2067 const char *cgroup;
2068 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2069 struct cgfs_files *k = NULL;
2070 struct file_info *file_info;
2071 struct fuse_context *fc = fuse_get_context();
2072 int ret;
2073
2074 if (!fc)
2075 return -EIO;
2076
2077 controller = pick_controller_from_path(fc, path);
2078 if (!controller)
2079 return -errno;
2080 cgroup = find_cgroup_in_path(path);
2081 if (!cgroup)
2082 return -errno;
2083
2084 get_cgdir_and_path(cgroup, &cgdir, &last);
2085 if (!last) {
2086 path1 = "/";
2087 path2 = cgdir;
2088 } else {
2089 path1 = cgdir;
2090 path2 = last;
2091 }
2092
2093 k = cgfs_get_key(controller, path1, path2);
2094 if (!k) {
2095 ret = -EINVAL;
2096 goto out;
2097 }
2098 free_key(k);
2099
2100 pid_t initpid = lookup_initpid_in_store(fc->pid);
2101 if (initpid <= 0)
2102 initpid = fc->pid;
2103 if (!caller_may_see_dir(initpid, controller, path1)) {
2104 ret = -ENOENT;
2105 goto out;
2106 }
2107 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2108 ret = -EACCES;
2109 goto out;
2110 }
2111
2112 /* we'll free this at cg_release */
2113 file_info = malloc(sizeof(*file_info));
2114 if (!file_info) {
2115 ret = -ENOMEM;
2116 goto out;
2117 }
2118 file_info->controller = must_copy_string(controller);
2119 file_info->cgroup = must_copy_string(path1);
2120 file_info->file = must_copy_string(path2);
2121 file_info->type = LXC_TYPE_CGFILE;
2122 file_info->buf = NULL;
2123 file_info->buflen = 0;
2124
2125 fi->fh = (unsigned long)file_info;
2126 ret = 0;
2127
2128 out:
2129 free(cgdir);
2130 return ret;
2131 }
2132
2133 int cg_access(const char *path, int mode)
2134 {
2135 int ret;
2136 const char *cgroup;
2137 char *path1, *path2, *controller;
2138 char *last = NULL, *cgdir = NULL;
2139 struct cgfs_files *k = NULL;
2140 struct fuse_context *fc = fuse_get_context();
2141
2142 if (strcmp(path, "/cgroup") == 0)
2143 return 0;
2144
2145 if (!fc)
2146 return -EIO;
2147
2148 controller = pick_controller_from_path(fc, path);
2149 if (!controller)
2150 return -errno;
2151 cgroup = find_cgroup_in_path(path);
2152 if (!cgroup) {
2153 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2154 if ((mode & W_OK) == 0)
2155 return 0;
2156 return -EACCES;
2157 }
2158
2159 get_cgdir_and_path(cgroup, &cgdir, &last);
2160 if (!last) {
2161 path1 = "/";
2162 path2 = cgdir;
2163 } else {
2164 path1 = cgdir;
2165 path2 = last;
2166 }
2167
2168 k = cgfs_get_key(controller, path1, path2);
2169 if (!k) {
2170 if ((mode & W_OK) == 0)
2171 ret = 0;
2172 else
2173 ret = -EACCES;
2174 goto out;
2175 }
2176 free_key(k);
2177
2178 pid_t initpid = lookup_initpid_in_store(fc->pid);
2179 if (initpid <= 0)
2180 initpid = fc->pid;
2181 if (!caller_may_see_dir(initpid, controller, path1)) {
2182 ret = -ENOENT;
2183 goto out;
2184 }
2185 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2186 ret = -EACCES;
2187 goto out;
2188 }
2189
2190 ret = 0;
2191
2192 out:
2193 free(cgdir);
2194 return ret;
2195 }
2196
2197 int cg_release(const char *path, struct fuse_file_info *fi)
2198 {
2199 do_release_file_info(fi);
2200 return 0;
2201 }
2202
2203 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2204
2205 static bool wait_for_sock(int sock, int timeout)
2206 {
2207 struct epoll_event ev;
2208 int epfd, ret, now, starttime, deltatime, saved_errno;
2209
2210 if ((starttime = time(NULL)) < 0)
2211 return false;
2212
2213 if ((epfd = epoll_create(1)) < 0) {
2214 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2215 return false;
2216 }
2217
2218 ev.events = POLLIN_SET;
2219 ev.data.fd = sock;
2220 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2221 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2222 close(epfd);
2223 return false;
2224 }
2225
2226 again:
2227 if ((now = time(NULL)) < 0) {
2228 close(epfd);
2229 return false;
2230 }
2231
2232 deltatime = (starttime + timeout) - now;
2233 if (deltatime < 0) { // timeout
2234 errno = 0;
2235 close(epfd);
2236 return false;
2237 }
2238 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2239 if (ret < 0 && errno == EINTR)
2240 goto again;
2241 saved_errno = errno;
2242 close(epfd);
2243
2244 if (ret <= 0) {
2245 errno = saved_errno;
2246 return false;
2247 }
2248 return true;
2249 }
2250
2251 static int msgrecv(int sockfd, void *buf, size_t len)
2252 {
2253 if (!wait_for_sock(sockfd, 2))
2254 return -1;
2255 return recv(sockfd, buf, len, MSG_DONTWAIT);
2256 }
2257
2258 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2259 {
2260 struct msghdr msg = { 0 };
2261 struct iovec iov;
2262 struct cmsghdr *cmsg;
2263 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2264 char buf[1];
2265 buf[0] = 'p';
2266
2267 if (pingfirst) {
2268 if (msgrecv(sock, buf, 1) != 1) {
2269 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2270 return SEND_CREDS_FAIL;
2271 }
2272 }
2273
2274 msg.msg_control = cmsgbuf;
2275 msg.msg_controllen = sizeof(cmsgbuf);
2276
2277 cmsg = CMSG_FIRSTHDR(&msg);
2278 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2279 cmsg->cmsg_level = SOL_SOCKET;
2280 cmsg->cmsg_type = SCM_CREDENTIALS;
2281 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2282
2283 msg.msg_name = NULL;
2284 msg.msg_namelen = 0;
2285
2286 buf[0] = v;
2287 iov.iov_base = buf;
2288 iov.iov_len = sizeof(buf);
2289 msg.msg_iov = &iov;
2290 msg.msg_iovlen = 1;
2291
2292 if (sendmsg(sock, &msg, 0) < 0) {
2293 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2294 if (errno == 3)
2295 return SEND_CREDS_NOTSK;
2296 return SEND_CREDS_FAIL;
2297 }
2298
2299 return SEND_CREDS_OK;
2300 }
2301
2302 static bool recv_creds(int sock, struct ucred *cred, char *v)
2303 {
2304 struct msghdr msg = { 0 };
2305 struct iovec iov;
2306 struct cmsghdr *cmsg;
2307 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2308 char buf[1];
2309 int ret;
2310 int optval = 1;
2311
2312 *v = '1';
2313
2314 cred->pid = -1;
2315 cred->uid = -1;
2316 cred->gid = -1;
2317
2318 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2319 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2320 return false;
2321 }
2322 buf[0] = '1';
2323 if (write(sock, buf, 1) != 1) {
2324 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2325 return false;
2326 }
2327
2328 msg.msg_name = NULL;
2329 msg.msg_namelen = 0;
2330 msg.msg_control = cmsgbuf;
2331 msg.msg_controllen = sizeof(cmsgbuf);
2332
2333 iov.iov_base = buf;
2334 iov.iov_len = sizeof(buf);
2335 msg.msg_iov = &iov;
2336 msg.msg_iovlen = 1;
2337
2338 if (!wait_for_sock(sock, 2)) {
2339 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2340 return false;
2341 }
2342 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2343 if (ret < 0) {
2344 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2345 return false;
2346 }
2347
2348 cmsg = CMSG_FIRSTHDR(&msg);
2349
2350 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2351 cmsg->cmsg_level == SOL_SOCKET &&
2352 cmsg->cmsg_type == SCM_CREDENTIALS) {
2353 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2354 }
2355 *v = buf[0];
2356
2357 return true;
2358 }
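
/*
 * A sketch of one exchange between recv_creds() and send_creds() as used
 * by the pid-translation helpers below (both ends sit on a SOCK_DGRAM
 * socketpair):
 *
 *   receiver: recv_creds()  - enables SO_PASSCRED and writes a one-byte
 *                             ping so the sender knows it may transmit
 *   sender:   send_creds(sock, &cred, v, true)
 *                           - waits for the ping, then sends one payload
 *                             byte plus an SCM_CREDENTIALS message; the
 *                             kernel rewrites cred.pid into the pid
 *                             namespace of the receiving process
 *   receiver: recv_creds()  - returns the translated ucred and the byte
 *
 * The callers use a payload byte of '1' as an "all done" marker.
 */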
2359
2360 struct pid_ns_clone_args {
2361 int *cpipe;
2362 int sock;
2363 pid_t tpid;
2364 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2365 };
2366
2367 /*
2368 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2369 * with clone(). This simply writes '1' as ACK back to the parent
2370 * before calling the actual wrapped function.
2371 */
2372 static int pid_ns_clone_wrapper(void *arg) {
2373 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2374 char b = '1';
2375
2376 close(args->cpipe[0]);
2377 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2378 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2379 close(args->cpipe[1]);
2380 return args->wrapped(args->sock, args->tpid);
2381 }
2382
2383 /*
2384 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2385 * int value back over the socket. This shifts the pid from the
2386 * sender's pidns into tpid's pidns.
2387 */
2388 static int pid_to_ns(int sock, pid_t tpid)
2389 {
2390 char v = '0';
2391 struct ucred cred;
2392
2393 while (recv_creds(sock, &cred, &v)) {
2394 if (v == '1')
2395 return 0;
2396 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2397 return 1;
2398 }
2399 return 0;
2400 }
2401
2402
2403 /*
2404 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2405 * in your old pidns. Only children which you clone will be in the target
2406 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2407 * actually convert pids.
2408 *
2409 * Note: glibc's fork() does not respect pidns, which can lead to failed
2410 * assertions inside glibc (and thus failed forks) if the child's pid in
2411 * the pidns and the parent pid outside are identical. Using clone prevents
2412 * this issue.
2413 */
2414 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2415 {
2416 int newnsfd = -1, ret, cpipe[2];
2417 char fnam[100];
2418 pid_t cpid;
2419 char v;
2420
2421 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2422 if (ret < 0 || ret >= sizeof(fnam))
2423 _exit(1);
2424 newnsfd = open(fnam, O_RDONLY);
2425 if (newnsfd < 0)
2426 _exit(1);
2427 if (setns(newnsfd, 0) < 0)
2428 _exit(1);
2429 close(newnsfd);
2430
2431 if (pipe(cpipe) < 0)
2432 _exit(1);
2433
2434 struct pid_ns_clone_args args = {
2435 .cpipe = cpipe,
2436 .sock = sock,
2437 .tpid = tpid,
2438 .wrapped = &pid_to_ns
2439 };
2440 size_t stack_size = sysconf(_SC_PAGESIZE);
2441 void *stack = alloca(stack_size);
2442
2443 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2444 if (cpid < 0)
2445 _exit(1);
2446
2447 // give the child 1 second to be done forking and
2448 // write its ack
2449 if (!wait_for_sock(cpipe[0], 1))
2450 _exit(1);
2451 ret = read(cpipe[0], &v, 1);
2452 if (ret != sizeof(char) || v != '1')
2453 _exit(1);
2454
2455 if (!wait_for_pid(cpid))
2456 _exit(1);
2457 _exit(0);
2458 }
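
/*
 * Example of the effect described above (hypothetical pids): if lxcfs runs
 * as host pid 400 and setns()s into the pid namespace of a container whose
 * init is host pid 1234, getpid() in lxcfs still returns 400. Only the
 * child created by the clone() above gets a pid inside the container, so
 * it is that child which sees SCM_CREDENTIALS pids translated into the
 * container's pid namespace.
 */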
2459
2460 /*
2461 * To read cgroup files as seen from a particular pid, we fork a child which
2462 * setns()s into that pid's pidns and clones a helper there; pids are then
2463 * exchanged over a socketpair using SCM_CREDENTIALS so the kernel translates them.
2464 */
2465 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2466 {
2467 int sock[2] = {-1, -1};
2468 char *tmpdata = NULL;
2469 int ret;
2470 pid_t qpid, cpid = -1;
2471 bool answer = false;
2472 char v = '0';
2473 struct ucred cred;
2474 size_t sz = 0, asz = 0;
2475
2476 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2477 return false;
2478
2479 /*
2480 * Now we read the pids from returned data one by one, pass
2481 * them into a child in the target namespace, read back the
2482 * translated pids, and put them into our to-return data
2483 */
2484
2485 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2486 perror("socketpair");
2487 free(tmpdata);
2488 return false;
2489 }
2490
2491 cpid = fork();
2492 if (cpid == -1)
2493 goto out;
2494
2495 if (!cpid) // child - exits when done
2496 pid_to_ns_wrapper(sock[1], tpid);
2497
2498 char *ptr = tmpdata;
2499 cred.uid = 0;
2500 cred.gid = 0;
2501 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2502 cred.pid = qpid;
2503 ret = send_creds(sock[0], &cred, v, true);
2504
2505 if (ret == SEND_CREDS_NOTSK)
2506 goto next;
2507 if (ret == SEND_CREDS_FAIL)
2508 goto out;
2509
2510 // read converted results
2511 if (!wait_for_sock(sock[0], 2)) {
2512 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2513 goto out;
2514 }
2515 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2516 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2517 goto out;
2518 }
2519 must_strcat_pid(d, &sz, &asz, qpid);
2520 next:
2521 ptr = strchr(ptr, '\n');
2522 if (!ptr)
2523 break;
2524 ptr++;
2525 }
2526
2527 cred.pid = getpid();
2528 v = '1';
2529 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2530 // failed to ask child to exit
2531 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2532 goto out;
2533 }
2534
2535 answer = true;
2536
2537 out:
2538 free(tmpdata);
2539 if (cpid != -1)
2540 wait_for_pid(cpid);
2541 if (sock[0] != -1) {
2542 close(sock[0]);
2543 close(sock[1]);
2544 }
2545 return answer;
2546 }
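
/*
 * Worked example for the translation above (hypothetical pids): the host
 * view of a container's "tasks" file might read "1234\n1235\n". Each pid
 * is wrapped in a ucred and sent through the socketpair; the helper living
 * in the container's pid namespace sees the kernel-translated values and
 * echoes them back, so the buffer returned to the reader becomes e.g.
 * "56\n57\n".
 */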
2547
2548 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2549 struct fuse_file_info *fi)
2550 {
2551 struct fuse_context *fc = fuse_get_context();
2552 struct file_info *f = (struct file_info *)fi->fh;
2553 struct cgfs_files *k = NULL;
2554 char *data = NULL;
2555 int ret, s;
2556 bool r;
2557
2558 if (f->type != LXC_TYPE_CGFILE) {
2559 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2560 return -EIO;
2561 }
2562
2563 if (offset)
2564 return 0;
2565
2566 if (!fc)
2567 return -EIO;
2568
2569 if (!f->controller)
2570 return -EINVAL;
2571
2572 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2573 return -EINVAL;
2574 }
2575 free_key(k);
2576
2577
2578 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2579 ret = -EACCES;
2580 goto out;
2581 }
2582
2583 if (strcmp(f->file, "tasks") == 0 ||
2584 strcmp(f->file, "/tasks") == 0 ||
2585 strcmp(f->file, "/cgroup.procs") == 0 ||
2586 strcmp(f->file, "cgroup.procs") == 0)
2587 // special case - we have to translate the pids
2588 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2589 else
2590 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2591
2592 if (!r) {
2593 ret = -EINVAL;
2594 goto out;
2595 }
2596
2597 if (!data) {
2598 ret = 0;
2599 goto out;
2600 }
2601 s = strlen(data);
2602 if (s > size)
2603 s = size;
2604 memcpy(buf, data, s);
2605 if (s > 0 && s < size && data[s-1] != '\n')
2606 buf[s++] = '\n';
2607
2608 ret = s;
2609
2610 out:
2611 free(data);
2612 return ret;
2613 }
2614
2615 static int pid_from_ns(int sock, pid_t tpid)
2616 {
2617 pid_t vpid;
2618 struct ucred cred;
2619 char v;
2620 int ret;
2621
2622 cred.uid = 0;
2623 cred.gid = 0;
2624 while (1) {
2625 if (!wait_for_sock(sock, 2)) {
2626 lxcfs_error("%s\n", "Timeout reading from parent.");
2627 return 1;
2628 }
2629 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2630 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2631 return 1;
2632 }
2633 if (vpid == -1) // done
2634 break;
2635 v = '0';
2636 cred.pid = vpid;
2637 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2638 v = '1';
2639 cred.pid = getpid();
2640 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2641 return 1;
2642 }
2643 }
2644 return 0;
2645 }
2646
2647 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2648 {
2649 int newnsfd = -1, ret, cpipe[2];
2650 char fnam[100];
2651 pid_t cpid;
2652 char v;
2653
2654 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2655 if (ret < 0 || ret >= sizeof(fnam))
2656 _exit(1);
2657 newnsfd = open(fnam, O_RDONLY);
2658 if (newnsfd < 0)
2659 _exit(1);
2660 if (setns(newnsfd, 0) < 0)
2661 _exit(1);
2662 close(newnsfd);
2663
2664 if (pipe(cpipe) < 0)
2665 _exit(1);
2666
2667 struct pid_ns_clone_args args = {
2668 .cpipe = cpipe,
2669 .sock = sock,
2670 .tpid = tpid,
2671 .wrapped = &pid_from_ns
2672 };
2673 size_t stack_size = sysconf(_SC_PAGESIZE);
2674 void *stack = alloca(stack_size);
2675
2676 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2677 if (cpid < 0)
2678 _exit(1);
2679
2680 // give the child 1 second to be done forking and
2681 // write its ack
2682 if (!wait_for_sock(cpipe[0], 1))
2683 _exit(1);
2684 ret = read(cpipe[0], &v, 1);
2685 if (ret != sizeof(char) || v != '1')
2686 _exit(1);
2687
2688 if (!wait_for_pid(cpid))
2689 _exit(1);
2690 _exit(0);
2691 }
2692
2693 /*
2694 * Given host @uid, store the uid to which it maps in @pid's user
2695 * namespace in *answer. Returns false if there is no mapping.
2696 */
2697 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2698 {
2699 FILE *f;
2700 char line[400];
2701
2702 sprintf(line, "/proc/%d/uid_map", pid);
2703 if ((f = fopen(line, "r")) == NULL) {
2704 return false;
2705 }
2706
2707 *answer = convert_id_to_ns(f, uid);
2708 fclose(f);
2709
2710 if (*answer == -1)
2711 return false;
2712 return true;
2713 }
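
/*
 * Example, assuming the usual uid_map semantics (hypothetical mapping): if
 * /proc/<pid>/uid_map contains
 *
 *     0 100000 65536
 *
 * then host uid 100000 maps to uid 0 in @pid's user namespace and host uid
 * 100031 maps to 31, while host uids outside [100000, 165535] have no
 * mapping and make hostuid_to_ns() return false.
 */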
2714
2715 /*
2716 * get_pid_creds: get the real uid and gid of @pid from
2717 * /proc/$$/status
2718 * (XXX should we use euid here?)
2719 */
2720 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2721 {
2722 char line[400];
2723 uid_t u;
2724 gid_t g;
2725 FILE *f;
2726
2727 *uid = -1;
2728 *gid = -1;
2729 sprintf(line, "/proc/%d/status", pid);
2730 if ((f = fopen(line, "r")) == NULL) {
2731 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2732 return;
2733 }
2734 while (fgets(line, 400, f)) {
2735 if (strncmp(line, "Uid:", 4) == 0) {
2736 if (sscanf(line+4, "%u", &u) != 1) {
2737 lxcfs_error("bad uid line for pid %u\n", pid);
2738 fclose(f);
2739 return;
2740 }
2741 *uid = u;
2742 } else if (strncmp(line, "Gid:", 4) == 0) {
2743 if (sscanf(line+4, "%u", &g) != 1) {
2744 lxcfs_error("bad gid line for pid %u\n", pid);
2745 fclose(f);
2746 return;
2747 }
2748 *gid = g;
2749 }
2750 }
2751 fclose(f);
2752 }
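
/*
 * The parsed /proc/<pid>/status lines look like this (example values):
 *
 *     Uid:    1000    1000    1000    1000
 *     Gid:    1000    1000    1000    1000
 *
 * Only the first column (the real uid/gid) is read above; the effective,
 * saved and filesystem ids are ignored.
 */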
2753
2754 /*
2755 * May the requestor @r move victim @v to a new cgroup?
2756 * This is allowed if
2757 * . they are the same task
2758 * . they are owned by the same uid
2759 * . @r is root on the host, or
2760 * . @r is root in its own user namespace and @v's uid is mapped into it.
2761 */
2762 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2763 {
2764 uid_t v_uid, tmpuid;
2765 gid_t v_gid;
2766
2767 if (r == v)
2768 return true;
2769 if (r_uid == 0)
2770 return true;
2771 get_pid_creds(v, &v_uid, &v_gid);
2772 if (r_uid == v_uid)
2773 return true;
2774 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2775 && hostuid_to_ns(v_uid, r, &tmpuid))
2776 return true;
2777 return false;
2778 }
2779
2780 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2781 const char *file, const char *buf)
2782 {
2783 int sock[2] = {-1, -1};
2784 pid_t qpid, cpid = -1;
2785 FILE *pids_file = NULL;
2786 bool answer = false, fail = false;
2787
2788 pids_file = open_pids_file(contrl, cg);
2789 if (!pids_file)
2790 return false;
2791
2792 /*
2793 * write the pids to a socket, have helper in writer's pidns
2794 * call movepid for us
2795 */
2796 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2797 perror("socketpair");
2798 goto out;
2799 }
2800
2801 cpid = fork();
2802 if (cpid == -1)
2803 goto out;
2804
2805 if (!cpid) { // child
2806 fclose(pids_file);
2807 pid_from_ns_wrapper(sock[1], tpid);
2808 }
2809
2810 const char *ptr = buf;
2811 while (sscanf(ptr, "%d", &qpid) == 1) {
2812 struct ucred cred;
2813 char v;
2814
2815 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2816 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2817 goto out;
2818 }
2819
2820 if (recv_creds(sock[0], &cred, &v)) {
2821 if (v == '0') {
2822 if (!may_move_pid(tpid, tuid, cred.pid)) {
2823 fail = true;
2824 break;
2825 }
2826 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2827 fail = true;
2828 }
2829 }
2830
2831 ptr = strchr(ptr, '\n');
2832 if (!ptr)
2833 break;
2834 ptr++;
2835 }
2836
2837 /* All good, write the value */
2838 qpid = -1;
2839 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2840 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2841
2842 if (!fail)
2843 answer = true;
2844
2845 out:
2846 if (cpid != -1)
2847 wait_for_pid(cpid);
2848 if (sock[0] != -1) {
2849 close(sock[0]);
2850 close(sock[1]);
2851 }
2852 if (pids_file) {
2853 if (fclose(pids_file) != 0)
2854 answer = false;
2855 }
2856 return answer;
2857 }
2858
2859 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2860 struct fuse_file_info *fi)
2861 {
2862 struct fuse_context *fc = fuse_get_context();
2863 char *localbuf = NULL;
2864 struct cgfs_files *k = NULL;
2865 struct file_info *f = (struct file_info *)fi->fh;
2866 bool r;
2867
2868 if (f->type != LXC_TYPE_CGFILE) {
2869 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2870 return -EIO;
2871 }
2872
2873 if (offset)
2874 return 0;
2875
2876 if (!fc)
2877 return -EIO;
2878
2879 localbuf = alloca(size+1);
2880 localbuf[size] = '\0';
2881 memcpy(localbuf, buf, size);
2882
2883 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2884 size = -EINVAL;
2885 goto out;
2886 }
2887
2888 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2889 size = -EACCES;
2890 goto out;
2891 }
2892
2893 if (strcmp(f->file, "tasks") == 0 ||
2894 strcmp(f->file, "/tasks") == 0 ||
2895 strcmp(f->file, "/cgroup.procs") == 0 ||
2896 strcmp(f->file, "cgroup.procs") == 0)
2897 // special case - we have to translate the pids
2898 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2899 else
2900 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2901
2902 if (!r)
2903 size = -EINVAL;
2904
2905 out:
2906 free_key(k);
2907 return size;
2908 }
2909
2910 int cg_chown(const char *path, uid_t uid, gid_t gid)
2911 {
2912 struct fuse_context *fc = fuse_get_context();
2913 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2914 struct cgfs_files *k = NULL;
2915 const char *cgroup;
2916 int ret;
2917
2918 if (!fc)
2919 return -EIO;
2920
2921 if (strcmp(path, "/cgroup") == 0)
2922 return -EPERM;
2923
2924 controller = pick_controller_from_path(fc, path);
2925 if (!controller)
2926 return errno == ENOENT ? -EPERM : -errno;
2927
2928 cgroup = find_cgroup_in_path(path);
2929 if (!cgroup)
2930 /* this is just /cgroup/controller */
2931 return -EPERM;
2932
2933 get_cgdir_and_path(cgroup, &cgdir, &last);
2934
2935 if (!last) {
2936 path1 = "/";
2937 path2 = cgdir;
2938 } else {
2939 path1 = cgdir;
2940 path2 = last;
2941 }
2942
2943 if (is_child_cgroup(controller, path1, path2)) {
2944 // get uid and gid from the 'tasks' file and make up a mode
2945 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2946 k = cgfs_get_key(controller, cgroup, "tasks");
2947
2948 } else
2949 k = cgfs_get_key(controller, path1, path2);
2950
2951 if (!k) {
2952 ret = -EINVAL;
2953 goto out;
2954 }
2955
2956 /*
2957 * This being a fuse request, the uid and gid must be valid
2958 * in the caller's namespace. So we can just check to make
2959 * sure that the caller is root in his uid, and privileged
2960 * over the file's current owner.
2961 */
2962 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2963 ret = -EACCES;
2964 goto out;
2965 }
2966
2967 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2968
2969 out:
2970 free_key(k);
2971 free(cgdir);
2972
2973 return ret;
2974 }
2975
2976 int cg_chmod(const char *path, mode_t mode)
2977 {
2978 struct fuse_context *fc = fuse_get_context();
2979 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2980 struct cgfs_files *k = NULL;
2981 const char *cgroup;
2982 int ret;
2983
2984 if (!fc)
2985 return -EIO;
2986
2987 if (strcmp(path, "/cgroup") == 0)
2988 return -EPERM;
2989
2990 controller = pick_controller_from_path(fc, path);
2991 if (!controller)
2992 return errno == ENOENT ? -EPERM : -errno;
2993
2994 cgroup = find_cgroup_in_path(path);
2995 if (!cgroup)
2996 /* this is just /cgroup/controller */
2997 return -EPERM;
2998
2999 get_cgdir_and_path(cgroup, &cgdir, &last);
3000
3001 if (!last) {
3002 path1 = "/";
3003 path2 = cgdir;
3004 } else {
3005 path1 = cgdir;
3006 path2 = last;
3007 }
3008
3009 if (is_child_cgroup(controller, path1, path2)) {
3010 // get uid and gid from the 'tasks' file and make up a mode
3011 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3012 k = cgfs_get_key(controller, cgroup, "tasks");
3013
3014 } else
3015 k = cgfs_get_key(controller, path1, path2);
3016
3017 if (!k) {
3018 ret = -EINVAL;
3019 goto out;
3020 }
3021
3022 /*
3023 * This being a fuse request, the uid and gid must be valid
3024 * in the caller's namespace. So we can just check to make
3025 * sure that the caller is root in his uid, and privileged
3026 * over the file's current owner.
3027 */
3028 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3029 ret = -EPERM;
3030 goto out;
3031 }
3032
3033 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3034 ret = -EINVAL;
3035 goto out;
3036 }
3037
3038 ret = 0;
3039 out:
3040 free_key(k);
3041 free(cgdir);
3042 return ret;
3043 }
3044
3045 int cg_mkdir(const char *path, mode_t mode)
3046 {
3047 struct fuse_context *fc = fuse_get_context();
3048 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3049 const char *cgroup;
3050 int ret;
3051
3052 if (!fc)
3053 return -EIO;
3054
3055 controller = pick_controller_from_path(fc, path);
3056 if (!controller)
3057 return errno == ENOENT ? -EPERM : -errno;
3058
3059 cgroup = find_cgroup_in_path(path);
3060 if (!cgroup)
3061 return -errno;
3062
3063 get_cgdir_and_path(cgroup, &cgdir, &last);
3064 if (!last)
3065 path1 = "/";
3066 else
3067 path1 = cgdir;
3068
3069 pid_t initpid = lookup_initpid_in_store(fc->pid);
3070 if (initpid <= 0)
3071 initpid = fc->pid;
3072 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3073 if (!next)
3074 ret = -EINVAL;
3075 else if (last && strcmp(next, last) == 0)
3076 ret = -EEXIST;
3077 else
3078 ret = -EPERM;
3079 goto out;
3080 }
3081
3082 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3083 ret = -EACCES;
3084 goto out;
3085 }
3086 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3087 ret = -EACCES;
3088 goto out;
3089 }
3090
3091 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3092
3093 out:
3094 free(cgdir);
3095 free(next);
3096 return ret;
3097 }
3098
3099 int cg_rmdir(const char *path)
3100 {
3101 struct fuse_context *fc = fuse_get_context();
3102 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3103 const char *cgroup;
3104 int ret;
3105
3106 if (!fc)
3107 return -EIO;
3108
3109 controller = pick_controller_from_path(fc, path);
3110 if (!controller) /* Someone's trying to delete "/cgroup". */
3111 return -EPERM;
3112
3113 cgroup = find_cgroup_in_path(path);
3114 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3115 return -EPERM;
3116
3117 get_cgdir_and_path(cgroup, &cgdir, &last);
3118 if (!last) {
3119 /* Someone's trying to delete a cgroup on the same level as the
3120 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3121 * rmdir "/cgroup/blkio/init.slice".
3122 */
3123 ret = -EPERM;
3124 goto out;
3125 }
3126
3127 pid_t initpid = lookup_initpid_in_store(fc->pid);
3128 if (initpid <= 0)
3129 initpid = fc->pid;
3130 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3131 if (!last || (next && (strcmp(next, last) == 0)))
3132 ret = -EBUSY;
3133 else
3134 ret = -ENOENT;
3135 goto out;
3136 }
3137
3138 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3139 ret = -EACCES;
3140 goto out;
3141 }
3142 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3143 ret = -EACCES;
3144 goto out;
3145 }
3146
3147 if (!cgfs_remove(controller, cgroup)) {
3148 ret = -EINVAL;
3149 goto out;
3150 }
3151
3152 ret = 0;
3153
3154 out:
3155 free(cgdir);
3156 free(next);
3157 return ret;
3158 }
3159
3160 static bool startswith(const char *line, const char *pref)
3161 {
3162 if (strncmp(line, pref, strlen(pref)) == 0)
3163 return true;
3164 return false;
3165 }
3166
3167 static void parse_memstat(char *memstat, unsigned long *cached,
3168 unsigned long *active_anon, unsigned long *inactive_anon,
3169 unsigned long *active_file, unsigned long *inactive_file,
3170 unsigned long *unevictable)
3171 {
3172 char *eol;
3173
3174 while (*memstat) {
3175 if (startswith(memstat, "total_cache")) {
3176 sscanf(memstat + 11, "%lu", cached);
3177 *cached /= 1024;
3178 } else if (startswith(memstat, "total_active_anon")) {
3179 sscanf(memstat + 17, "%lu", active_anon);
3180 *active_anon /= 1024;
3181 } else if (startswith(memstat, "total_inactive_anon")) {
3182 sscanf(memstat + 19, "%lu", inactive_anon);
3183 *inactive_anon /= 1024;
3184 } else if (startswith(memstat, "total_active_file")) {
3185 sscanf(memstat + 17, "%lu", active_file);
3186 *active_file /= 1024;
3187 } else if (startswith(memstat, "total_inactive_file")) {
3188 sscanf(memstat + 19, "%lu", inactive_file);
3189 *inactive_file /= 1024;
3190 } else if (startswith(memstat, "total_unevictable")) {
3191 sscanf(memstat + 17, "%lu", unevictable);
3192 *unevictable /= 1024;
3193 }
3194 eol = strchr(memstat, '\n');
3195 if (!eol)
3196 return;
3197 memstat = eol+1;
3198 }
3199 }
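
/*
 * Minimal sketch of feeding parse_memstat() a memory.stat excerpt. The
 * numbers are made up; the kernel reports the values in bytes and they are
 * converted to kB above. Kept out of the build with #if 0.
 */
#if 0
static void parse_memstat_example(void)
{
	char sample[] = "total_cache 2048000\n"
			"total_active_anon 1024000\n"
			"total_inactive_anon 0\n"
			"total_active_file 512000\n"
			"total_inactive_file 256000\n"
			"total_unevictable 0\n";
	unsigned long cached = 0, active_anon = 0, inactive_anon = 0,
		      active_file = 0, inactive_file = 0, unevictable = 0;

	parse_memstat(sample, &cached, &active_anon, &inactive_anon,
		      &active_file, &inactive_file, &unevictable);
	/* cached is now 2048000 / 1024 = 2000 (kB). */
}
#endif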
3200
3201 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3202 {
3203 char *eol;
3204 char key[32];
3205
3206 memset(key, 0, 32);
3207 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3208
3209 size_t len = strlen(key);
3210 *v = 0;
3211
3212 while (*str) {
3213 if (startswith(str, key)) {
3214 sscanf(str + len, "%lu", v);
3215 return;
3216 }
3217 eol = strchr(str, '\n');
3218 if (!eol)
3219 return;
3220 str = eol+1;
3221 }
3222 }
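
/*
 * Sketch of the line format handled by get_blkio_io_value(). The
 * blkio.*_recursive files contain "major:minor Op value" entries such as
 * (made-up numbers):
 *
 *     8:0 Read 4096
 *     8:0 Write 81920
 *     8:0 Total 86016
 *
 * Looking up major=8, minor=0, iotype="Read" in that text stores 4096 in
 * the output value. Kept out of the build with #if 0.
 */
#if 0
static void get_blkio_io_value_example(void)
{
	char sample[] = "8:0 Read 4096\n8:0 Write 81920\n8:0 Total 86016\n";
	unsigned long v = 0;

	get_blkio_io_value(sample, 8, 0, "Read", &v);
	/* v is now 4096. */
}
#endif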
3223
3224 static int read_file(const char *path, char *buf, size_t size,
3225 struct file_info *d)
3226 {
3227 size_t linelen = 0, total_len = 0, rv = 0;
3228 char *line = NULL;
3229 char *cache = d->buf;
3230 size_t cache_size = d->buflen;
3231 FILE *f = fopen(path, "r");
3232 if (!f)
3233 return 0;
3234
3235 while (getline(&line, &linelen, f) != -1) {
3236 ssize_t l = snprintf(cache, cache_size, "%s", line);
3237 if (l < 0) {
3238 perror("Error writing to cache");
3239 rv = 0;
3240 goto err;
3241 }
3242 if (l >= cache_size) {
3243 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3244 rv = 0;
3245 goto err;
3246 }
3247 cache += l;
3248 cache_size -= l;
3249 total_len += l;
3250 }
3251
3252 d->size = total_len;
3253 if (total_len > size)
3254 total_len = size;
3255
3256 /* read from off 0 */
3257 memcpy(buf, d->buf, total_len);
3258 rv = total_len;
3259 err:
3260 fclose(f);
3261 free(line);
3262 return rv;
3263 }
3264
3265 /*
3266 * FUSE ops for /proc
3267 */
3268
3269 static unsigned long get_memlimit(const char *cgroup, const char *file)
3270 {
3271 char *memlimit_str = NULL;
3272 unsigned long memlimit = -1;
3273
3274 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3275 memlimit = strtoul(memlimit_str, NULL, 10);
3276
3277 free(memlimit_str);
3278
3279 return memlimit;
3280 }
3281
3282 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3283 {
3284 char *copy = strdupa(cgroup);
3285 unsigned long memlimit = 0, retlimit;
3286
3287 retlimit = get_memlimit(copy, file);
3288
3289 while (strcmp(copy, "/") != 0) {
3290 copy = dirname(copy);
3291 memlimit = get_memlimit(copy, file);
3292 if (memlimit != -1 && memlimit < retlimit)
3293 retlimit = memlimit;
3294 };
3295
3296 return retlimit;
3297 }
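
/*
 * Worked example (hypothetical limits): for cg "/lxc/c1" with
 * memory.limit_in_bytes of 1073741824 on "/lxc/c1", 2147483648 on "/lxc"
 * and an effectively unlimited value on "/", the loop above walks
 * c1 -> lxc -> / via dirname() and returns 1073741824, i.e. the tightest
 * limit anywhere along the path.
 */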
3298
3299 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3300 struct fuse_file_info *fi)
3301 {
3302 struct fuse_context *fc = fuse_get_context();
3303 struct file_info *d = (struct file_info *)fi->fh;
3304 char *cg;
3305 char *memusage_str = NULL, *memstat_str = NULL,
3306 *memswlimit_str = NULL, *memswusage_str = NULL;
3307 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3308 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3309 active_file = 0, inactive_file = 0, unevictable = 0,
3310 hostswtotal = 0;
3311 char *line = NULL;
3312 size_t linelen = 0, total_len = 0, rv = 0;
3313 char *cache = d->buf;
3314 size_t cache_size = d->buflen;
3315 FILE *f = NULL;
3316
3317 if (offset){
3318 if (offset > d->size)
3319 return -EINVAL;
3320 if (!d->cached)
3321 return 0;
3322 int left = d->size - offset;
3323 total_len = left > size ? size: left;
3324 memcpy(buf, cache + offset, total_len);
3325 return total_len;
3326 }
3327
3328 pid_t initpid = lookup_initpid_in_store(fc->pid);
3329 if (initpid <= 0)
3330 initpid = fc->pid;
3331 cg = get_pid_cgroup(initpid, "memory");
3332 if (!cg)
3333 return read_file("/proc/meminfo", buf, size, d);
3334 prune_init_slice(cg);
3335
3336 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3337 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3338 goto err;
3339 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3340 goto err;
3341
3342 // The following values are allowed to fail, because swapaccount might be
3343 // turned off for the current kernel
3344 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3345 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3346 {
3347 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3348 memswusage = strtoul(memswusage_str, NULL, 10);
3349
3350 memswlimit = memswlimit / 1024;
3351 memswusage = memswusage / 1024;
3352 }
3353
3354 memusage = strtoul(memusage_str, NULL, 10);
3355 memlimit /= 1024;
3356 memusage /= 1024;
3357
3358 parse_memstat(memstat_str, &cached, &active_anon,
3359 &inactive_anon, &active_file, &inactive_file,
3360 &unevictable);
3361
3362 f = fopen("/proc/meminfo", "r");
3363 if (!f)
3364 goto err;
3365
3366 while (getline(&line, &linelen, f) != -1) {
3367 ssize_t l;
3368 char *printme, lbuf[100];
3369
3370 memset(lbuf, 0, 100);
3371 if (startswith(line, "MemTotal:")) {
3372 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3373 if (hosttotal < memlimit)
3374 memlimit = hosttotal;
3375 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3376 printme = lbuf;
3377 } else if (startswith(line, "MemFree:")) {
3378 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3379 printme = lbuf;
3380 } else if (startswith(line, "MemAvailable:")) {
3381 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3382 printme = lbuf;
3383 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3384 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3385 if (hostswtotal < memswlimit)
3386 memswlimit = hostswtotal;
3387 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3388 printme = lbuf;
3389 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3390 unsigned long swaptotal = memswlimit,
3391 swapusage = memswusage - memusage,
3392 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3393 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3394 printme = lbuf;
3395 } else if (startswith(line, "Slab:")) {
3396 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3397 printme = lbuf;
3398 } else if (startswith(line, "Buffers:")) {
3399 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3400 printme = lbuf;
3401 } else if (startswith(line, "Cached:")) {
3402 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3403 printme = lbuf;
3404 } else if (startswith(line, "SwapCached:")) {
3405 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3406 printme = lbuf;
3407 } else if (startswith(line, "Active:")) {
3408 snprintf(lbuf, 100, "Active: %8lu kB\n",
3409 active_anon + active_file);
3410 printme = lbuf;
3411 } else if (startswith(line, "Inactive:")) {
3412 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3413 inactive_anon + inactive_file);
3414 printme = lbuf;
3415 } else if (startswith(line, "Active(anon)")) {
3416 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3417 printme = lbuf;
3418 } else if (startswith(line, "Inactive(anon)")) {
3419 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3420 printme = lbuf;
3421 } else if (startswith(line, "Active(file)")) {
3422 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3423 printme = lbuf;
3424 } else if (startswith(line, "Inactive(file)")) {
3425 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3426 printme = lbuf;
3427 } else if (startswith(line, "Unevictable")) {
3428 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3429 printme = lbuf;
3430 } else if (startswith(line, "SReclaimable")) {
3431 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3432 printme = lbuf;
3433 } else if (startswith(line, "SUnreclaim")) {
3434 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3435 printme = lbuf;
3436 } else
3437 printme = line;
3438
3439 l = snprintf(cache, cache_size, "%s", printme);
3440 if (l < 0) {
3441 perror("Error writing to cache");
3442 rv = 0;
3443 goto err;
3444
3445 }
3446 if (l >= cache_size) {
3447 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3448 rv = 0;
3449 goto err;
3450 }
3451
3452 cache += l;
3453 cache_size -= l;
3454 total_len += l;
3455 }
3456
3457 d->cached = 1;
3458 d->size = total_len;
3459 if (total_len > size ) total_len = size;
3460 memcpy(buf, d->buf, total_len);
3461
3462 rv = total_len;
3463 err:
3464 if (f)
3465 fclose(f);
3466 free(line);
3467 free(cg);
3468 free(memusage_str);
3469 free(memswlimit_str);
3470 free(memswusage_str);
3471 free(memstat_str);
3472 return rv;
3473 }
3474
3475 /*
3476 * Read the cpuset.cpus for cg
3477 * Return the answer in a newly allocated string which must be freed
3478 */
3479 static char *get_cpuset(const char *cg)
3480 {
3481 char *answer;
3482
3483 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3484 return NULL;
3485 return answer;
3486 }
3487
3488 bool cpu_in_cpuset(int cpu, const char *cpuset);
3489
3490 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3491 {
3492 int cpu;
3493
3494 if (sscanf(line, "processor : %d", &cpu) != 1)
3495 return false;
3496 return cpu_in_cpuset(cpu, cpuset);
3497 }
3498
3499 /*
3500 * check whether this is a '^processor' line in /proc/cpuinfo
3501 */
3502 static bool is_processor_line(const char *line)
3503 {
3504 int cpu;
3505
3506 if (sscanf(line, "processor : %d", &cpu) == 1)
3507 return true;
3508 return false;
3509 }
3510
3511 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3512 struct fuse_file_info *fi)
3513 {
3514 struct fuse_context *fc = fuse_get_context();
3515 struct file_info *d = (struct file_info *)fi->fh;
3516 char *cg;
3517 char *cpuset = NULL;
3518 char *line = NULL;
3519 size_t linelen = 0, total_len = 0, rv = 0;
3520 bool am_printing = false, firstline = true, is_s390x = false;
3521 int curcpu = -1, cpu;
3522 char *cache = d->buf;
3523 size_t cache_size = d->buflen;
3524 FILE *f = NULL;
3525
3526 if (offset){
3527 if (offset > d->size)
3528 return -EINVAL;
3529 if (!d->cached)
3530 return 0;
3531 int left = d->size - offset;
3532 total_len = left > size ? size: left;
3533 memcpy(buf, cache + offset, total_len);
3534 return total_len;
3535 }
3536
3537 pid_t initpid = lookup_initpid_in_store(fc->pid);
3538 if (initpid <= 0)
3539 initpid = fc->pid;
3540 cg = get_pid_cgroup(initpid, "cpuset");
3541 if (!cg)
3542 return read_file("proc/cpuinfo", buf, size, d);
3543 prune_init_slice(cg);
3544
3545 cpuset = get_cpuset(cg);
3546 if (!cpuset)
3547 goto err;
3548
3549 f = fopen("/proc/cpuinfo", "r");
3550 if (!f)
3551 goto err;
3552
3553 while (getline(&line, &linelen, f) != -1) {
3554 ssize_t l;
3555 if (firstline) {
3556 firstline = false;
3557 if (strstr(line, "IBM/S390") != NULL) {
3558 is_s390x = true;
3559 am_printing = true;
3560 continue;
3561 }
3562 }
3563 if (strncmp(line, "# processors:", 12) == 0)
3564 continue;
3565 if (is_processor_line(line)) {
3566 am_printing = cpuline_in_cpuset(line, cpuset);
3567 if (am_printing) {
3568 curcpu ++;
3569 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3570 if (l < 0) {
3571 perror("Error writing to cache");
3572 rv = 0;
3573 goto err;
3574 }
3575 if (l >= cache_size) {
3576 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3577 rv = 0;
3578 goto err;
3579 }
3580 cache += l;
3581 cache_size -= l;
3582 total_len += l;
3583 }
3584 continue;
3585 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3586 char *p;
3587 if (!cpu_in_cpuset(cpu, cpuset))
3588 continue;
3589 curcpu ++;
3590 p = strchr(line, ':');
3591 if (!p || !*p)
3592 goto err;
3593 p++;
3594 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3595 if (l < 0) {
3596 perror("Error writing to cache");
3597 rv = 0;
3598 goto err;
3599 }
3600 if (l >= cache_size) {
3601 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3602 rv = 0;
3603 goto err;
3604 }
3605 cache += l;
3606 cache_size -= l;
3607 total_len += l;
3608 continue;
3609
3610 }
3611 if (am_printing) {
3612 l = snprintf(cache, cache_size, "%s", line);
3613 if (l < 0) {
3614 perror("Error writing to cache");
3615 rv = 0;
3616 goto err;
3617 }
3618 if (l >= cache_size) {
3619 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3620 rv = 0;
3621 goto err;
3622 }
3623 cache += l;
3624 cache_size -= l;
3625 total_len += l;
3626 }
3627 }
3628
3629 if (is_s390x) {
3630 char *origcache = d->buf;
3631 ssize_t l;
3632 do {
3633 d->buf = malloc(d->buflen);
3634 } while (!d->buf);
3635 cache = d->buf;
3636 cache_size = d->buflen;
3637 total_len = 0;
3638 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3639 if (l < 0 || l >= cache_size) {
3640 free(origcache);
3641 goto err;
3642 }
3643 cache_size -= l;
3644 cache += l;
3645 total_len += l;
3646 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3647 if (l < 0 || l >= cache_size) {
3648 free(origcache);
3649 goto err;
3650 }
3651 cache_size -= l;
3652 cache += l;
3653 total_len += l;
3654 l = snprintf(cache, cache_size, "%s", origcache);
3655 free(origcache);
3656 if (l < 0 || l >= cache_size)
3657 goto err;
3658 total_len += l;
3659 }
3660
3661 d->cached = 1;
3662 d->size = total_len;
3663 if (total_len > size ) total_len = size;
3664
3665 /* read from off 0 */
3666 memcpy(buf, d->buf, total_len);
3667 rv = total_len;
3668 err:
3669 if (f)
3670 fclose(f);
3671 free(line);
3672 free(cpuset);
3673 free(cg);
3674 return rv;
3675 }
3676
3677 static uint64_t get_reaper_start_time(pid_t pid)
3678 {
3679 int ret;
3680 FILE *f;
3681 uint64_t starttime;
3682 /* strlen("/proc/") = 6
3683 * +
3684 * LXCFS_NUMSTRLEN64
3685 * +
3686 * strlen("/stat") = 5
3687 * +
3688 * \0 = 1
3689 * */
3690 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3691 char path[__PROC_PID_STAT_LEN];
3692 pid_t qpid;
3693
3694 qpid = lookup_initpid_in_store(pid);
3695 if (qpid <= 0) {
3696 /* Caller can check for EINVAL on 0. */
3697 errno = EINVAL;
3698 return 0;
3699 }
3700
3701 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3702 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3703 /* Caller can check for EINVAL on 0. */
3704 errno = EINVAL;
3705 return 0;
3706 }
3707
3708 f = fopen(path, "r");
3709 if (!f) {
3710 /* Caller can check for EINVAL on 0. */
3711 errno = EINVAL;
3712 return 0;
3713 }
3714
3715 /* Note that the *scanf() argument suppression requires that length
3716 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3717 * at us. It's like telling someone you're not married and then asking
3718 * if you can bring your wife to the party.
3719 */
3720 ret = fscanf(f, "%*d " /* (1) pid %d */
3721 "%*s " /* (2) comm %s */
3722 "%*c " /* (3) state %c */
3723 "%*d " /* (4) ppid %d */
3724 "%*d " /* (5) pgrp %d */
3725 "%*d " /* (6) session %d */
3726 "%*d " /* (7) tty_nr %d */
3727 "%*d " /* (8) tpgid %d */
3728 "%*u " /* (9) flags %u */
3729 "%*u " /* (10) minflt %lu */
3730 "%*u " /* (11) cminflt %lu */
3731 "%*u " /* (12) majflt %lu */
3732 "%*u " /* (13) cmajflt %lu */
3733 "%*u " /* (14) utime %lu */
3734 "%*u " /* (15) stime %lu */
3735 "%*d " /* (16) cutime %ld */
3736 "%*d " /* (17) cstime %ld */
3737 "%*d " /* (18) priority %ld */
3738 "%*d " /* (19) nice %ld */
3739 "%*d " /* (20) num_threads %ld */
3740 "%*d " /* (21) itrealvalue %ld */
3741 "%" PRIu64, /* (22) starttime %llu */
3742 &starttime);
3743 if (ret != 1) {
3744 fclose(f);
3745 /* Caller can check for EINVAL on 0. */
3746 errno = EINVAL;
3747 return 0;
3748 }
3749
3750 fclose(f);
3751
3752 errno = 0;
3753 return starttime;
3754 }
3755
3756 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3757 {
3758 uint64_t clockticks;
3759 int64_t ticks_per_sec;
3760
3761 clockticks = get_reaper_start_time(pid);
3762 if (clockticks == 0 && errno == EINVAL) {
3763 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3764 return 0;
3765 }
3766
3767 ticks_per_sec = sysconf(_SC_CLK_TCK);
3768 if (ticks_per_sec < 0 && errno == EINVAL) {
3769 lxcfs_debug(
3770 "%s\n",
3771 "failed to determine number of clock ticks in a second");
3772 return 0;
3773 }
3774
3775 return (clockticks /= ticks_per_sec);
3776 }
3777
3778 static uint64_t get_reaper_age(pid_t pid)
3779 {
3780 uint64_t procstart, uptime, procage;
3781
3782 /* We need to subtract the time at which the reaper started (measured
3783 * relative to system boot) from the current system uptime to get the
3784 * actual reaper age.
3785 */
3786 procstart = get_reaper_start_time_in_sec(pid);
3787 procage = procstart;
3788 if (procstart > 0) {
3789 int ret;
3790 struct timespec spec;
3791
3792 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3793 if (ret < 0)
3794 return 0;
3795 /* We could make this more precise by using the tv_nsec field of the
3796 * timespec struct, converting it to milliseconds, and building a double
3797 * from the seconds and milliseconds, but that seems like more work than
3798 * it is worth.
3799 */
3800 uptime = spec.tv_sec;
3801 procage = uptime - procstart;
3802 }
3803
3804 return procage;
3805 }
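
/*
 * Worked example (hypothetical numbers): with sysconf(_SC_CLK_TCK) == 100
 * and a reaper whose starttime field in /proc/<pid>/stat is 4200 ticks,
 * the reaper started 4200 / 100 = 42 seconds after boot. If CLOCK_BOOTTIME
 * currently reads 1042 seconds, get_reaper_age() returns 1042 - 42 = 1000.
 */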
3806
3807 /*
3808 * Returns 0 on success.
3809 * It is the caller's responsibility to free `return_usage`, unless this
3810 * function returns an error.
3811 */
3812 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
3813 {
3814 int cpucount = get_nprocs();
3815 struct cpuacct_usage *cpu_usage;
3816 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
3817 int cg_cpu;
3818 uint64_t cg_user, cg_system;
3819 int64_t ticks_per_sec;
3820 char *usage_str = NULL;
3821
3822 ticks_per_sec = sysconf(_SC_CLK_TCK);
3823
3824 if (ticks_per_sec < 0 && errno == EINVAL) {
3825 lxcfs_debug(
3826 "%s\n",
3827 "read_cpuacct_usage_all failed to determine number of clock ticks "
3828 "in a second");
3829 return -1;
3830 }
3831
3832 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3833 if (!cpu_usage)
3834 return -ENOMEM;
3835
3836 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3837 rv = -1;
3838 goto err;
3839 }
3840
3841 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3842 lxcfs_error("read_cpuacct_usage_all reading first line from "
3843 "%s/cpuacct.usage_all failed.\n", cg);
3844 rv = -1;
3845 goto err;
3846 }
3847
3848 read_pos += read_cnt;
3849
3850 for (i = 0, j = 0; i < cpucount; i++) {
3851 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3852 &cg_system, &read_cnt);
3853
3854 if (ret == EOF)
3855 break;
3856
3857 if (ret != 3) {
3858 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3859 "failed.\n", cg);
3860 rv = -1;
3861 goto err;
3862 }
3863
3864 read_pos += read_cnt;
3865
3866 if (!cpu_in_cpuset(i, cpuset))
3867 continue;
3868
3869 /* Convert the time from nanoseconds to USER_HZ */
3870 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3871 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3872 j++;
3873 }
3874
3875 rv = 0;
3876 *return_usage = cpu_usage;
3877
3878 err:
3879 if (usage_str)
3880 free(usage_str);
3881
3882 if (rv != 0) {
3883 free(cpu_usage);
3884 *return_usage = NULL;
3885 }
3886
3887 return rv;
3888 }
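
/*
 * cpuacct.usage_all consists of a header line followed by per-cpu user and
 * system times in nanoseconds, e.g. (made-up values):
 *
 *     cpu user system
 *     0 6000000000 2000000000
 *     1 3000000000 1000000000
 *
 * With sysconf(_SC_CLK_TCK) == 100, cpu 0 above ends up in the returned
 * array as user = 600 and system = 200 USER_HZ ticks.
 */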
3889
3890 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3891 static int proc_stat_read(char *buf, size_t size, off_t offset,
3892 struct fuse_file_info *fi)
3893 {
3894 struct fuse_context *fc = fuse_get_context();
3895 struct file_info *d = (struct file_info *)fi->fh;
3896 char *cg;
3897 char *cpuset = NULL;
3898 char *line = NULL;
3899 size_t linelen = 0, total_len = 0, rv = 0;
3900 int curcpu = -1; /* cpu numbering starts at 0 */
3901 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3902 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3903 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3904 char cpuall[CPUALL_MAX_SIZE];
3905 /* reserve for cpu all */
3906 char *cache = d->buf + CPUALL_MAX_SIZE;
3907 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3908 FILE *f = NULL;
3909 struct cpuacct_usage *cg_cpu_usage = NULL;
3910
3911 if (offset){
3912 if (offset > d->size)
3913 return -EINVAL;
3914 if (!d->cached)
3915 return 0;
3916 int left = d->size - offset;
3917 total_len = left > size ? size: left;
3918 memcpy(buf, d->buf + offset, total_len);
3919 return total_len;
3920 }
3921
3922 pid_t initpid = lookup_initpid_in_store(fc->pid);
3923 if (initpid <= 0)
3924 initpid = fc->pid;
3925 cg = get_pid_cgroup(initpid, "cpuset");
3926 if (!cg)
3927 return read_file("/proc/stat", buf, size, d);
3928 prune_init_slice(cg);
3929
3930 cpuset = get_cpuset(cg);
3931 if (!cpuset)
3932 goto err;
3933
3934 /*
3935 * Read cpuacct.usage_all for all CPUs.
3936 * If the cpuacct cgroup is present, it is used to calculate the container's
3937 * CPU usage. If not, values from the host's /proc/stat are used.
3938 */
3939 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
3940 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
3941 "falling back to the host's /proc/stat");
3942 }
3943
3944 f = fopen("/proc/stat", "r");
3945 if (!f)
3946 goto err;
3947
3948 //skip first line
3949 if (getline(&line, &linelen, f) < 0) {
3950 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3951 goto err;
3952 }
3953
3954 while (getline(&line, &linelen, f) != -1) {
3955 ssize_t l;
3956 int cpu;
3957 char cpu_char[10]; /* That's a lot of cores */
3958 char *c;
3959 uint64_t all_used, cg_used, new_idle;
3960 int ret;
3961
3962 if (strlen(line) == 0)
3963 continue;
3964 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3965 /* not a ^cpuN line containing a number N, just print it */
3966 l = snprintf(cache, cache_size, "%s", line);
3967 if (l < 0) {
3968 perror("Error writing to cache");
3969 rv = 0;
3970 goto err;
3971 }
3972 if (l >= cache_size) {
3973 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3974 rv = 0;
3975 goto err;
3976 }
3977 cache += l;
3978 cache_size -= l;
3979 total_len += l;
3980 continue;
3981 }
3982
3983 if (sscanf(cpu_char, "%d", &cpu) != 1)
3984 continue;
3985 if (!cpu_in_cpuset(cpu, cpuset))
3986 continue;
3987 curcpu ++;
3988
3989 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3990 &user,
3991 &nice,
3992 &system,
3993 &idle,
3994 &iowait,
3995 &irq,
3996 &softirq,
3997 &steal,
3998 &guest,
3999 &guest_nice);
4000
4001 if (ret != 10 || !cg_cpu_usage) {
4002 c = strchr(line, ' ');
4003 if (!c)
4004 continue;
4005 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4006 if (l < 0) {
4007 perror("Error writing to cache");
4008 rv = 0;
4009 goto err;
4010
4011 }
4012 if (l >= cache_size) {
4013 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4014 rv = 0;
4015 goto err;
4016 }
4017
4018 cache += l;
4019 cache_size -= l;
4020 total_len += l;
4021
4022 if (ret != 10)
4023 continue;
4024 }
4025
4026 if (cg_cpu_usage) {
4027 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4028 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4029
4030 if (all_used >= cg_used) {
4031 new_idle = idle + (all_used - cg_used);
4032
4033 } else {
4034 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4035 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4036 curcpu, cg, all_used, cg_used);
4037 new_idle = idle;
4038 }
4039
4040 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4041 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4042 new_idle);
4043
4044 if (l < 0) {
4045 perror("Error writing to cache");
4046 rv = 0;
4047 goto err;
4048
4049 }
4050 if (l >= cache_size) {
4051 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4052 rv = 0;
4053 goto err;
4054 }
4055
4056 cache += l;
4057 cache_size -= l;
4058 total_len += l;
4059
4060 user_sum += cg_cpu_usage[curcpu].user;
4061 system_sum += cg_cpu_usage[curcpu].system;
4062 idle_sum += new_idle;
4063
4064 } else {
4065 user_sum += user;
4066 nice_sum += nice;
4067 system_sum += system;
4068 idle_sum += idle;
4069 iowait_sum += iowait;
4070 irq_sum += irq;
4071 softirq_sum += softirq;
4072 steal_sum += steal;
4073 guest_sum += guest;
4074 guest_nice_sum += guest_nice;
4075 }
4076 }
4077
4078 cache = d->buf;
4079
4080 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4081 user_sum,
4082 nice_sum,
4083 system_sum,
4084 idle_sum,
4085 iowait_sum,
4086 irq_sum,
4087 softirq_sum,
4088 steal_sum,
4089 guest_sum,
4090 guest_nice_sum);
4091 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4092 memcpy(cache, cpuall, cpuall_len);
4093 cache += cpuall_len;
4094 } else {
4095 /* shouldn't happen */
4096 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4097 cpuall_len = 0;
4098 }
4099
4100 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4101 total_len += cpuall_len;
4102 d->cached = 1;
4103 d->size = total_len;
4104 if (total_len > size)
4105 total_len = size;
4106
4107 memcpy(buf, d->buf, total_len);
4108 rv = total_len;
4109
4110 err:
4111 if (f)
4112 fclose(f);
4113 if (cg_cpu_usage)
4114 free(cg_cpu_usage);
4115 free(line);
4116 free(cpuset);
4117 free(cg);
4118 return rv;
4119 }
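
/*
 * Worked example for the per-cpu idle adjustment in proc_stat_read()
 * (hypothetical ticks): if the host's /proc/stat line for a cpu sums to
 * all_used = 1000 with idle = 9000, and cpuacct.usage_all says the
 * container used cg_used = 250 on that cpu, the exported line becomes
 *
 *     cpuN <cg user> 0 <cg system> 9750 0 0 0 0 0 0
 *
 * because new_idle = idle + (all_used - cg_used) = 9000 + 750.
 */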
4120
4121 /* This function retrieves the busy time of a group of tasks by looking at
4122 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4123 * been given its own cpuacct cgroup. If not, this function will take the busy
4124 * time of all other tasks that do not actually belong to the container into
4125 * account as well. If someone has a clever solution for this please send a
4126 * patch!
4127 */
4128 static unsigned long get_reaper_busy(pid_t task)
4129 {
4130 pid_t initpid = lookup_initpid_in_store(task);
4131 char *cgroup = NULL, *usage_str = NULL;
4132 unsigned long usage = 0;
4133
4134 if (initpid <= 0)
4135 return 0;
4136
4137 cgroup = get_pid_cgroup(initpid, "cpuacct");
4138 if (!cgroup)
4139 goto out;
4140 prune_init_slice(cgroup);
4141 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4142 goto out;
4143 usage = strtoul(usage_str, NULL, 10);
4144 usage /= 1000000000;
4145
4146 out:
4147 free(cgroup);
4148 free(usage_str);
4149 return usage;
4150 }
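
/*
 * Example (hypothetical value): a cpuacct.usage reading of 12345678901 ns
 * becomes a busy time of 12 seconds after the integer division by 10^9
 * above.
 */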
4151
4152 #if RELOADTEST
4153 void iwashere(void)
4154 {
4155 int fd;
4156
4157 fd = creat("/tmp/lxcfs-iwashere", 0644);
4158 if (fd >= 0)
4159 close(fd);
4160 }
4161 #endif
4162
4163 /*
4164 * The first field is the age of the reaper for the calling pid, as
4165 * returned by get_reaper_age(). The second field is that age minus the
4166 * busy time reported by get_reaper_busy(), i.e. the reaper's idle time.
4167 */
4168 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4169 struct fuse_file_info *fi)
4170 {
4171 struct fuse_context *fc = fuse_get_context();
4172 struct file_info *d = (struct file_info *)fi->fh;
4173 unsigned long int busytime = get_reaper_busy(fc->pid);
4174 char *cache = d->buf;
4175 ssize_t total_len = 0;
4176 uint64_t idletime, reaperage;
4177
4178 #if RELOADTEST
4179 iwashere();
4180 #endif
4181
4182 if (offset){
4183 if (!d->cached)
4184 return 0;
4185 if (offset > d->size)
4186 return -EINVAL;
4187 int left = d->size - offset;
4188 total_len = left > size ? size: left;
4189 memcpy(buf, cache + offset, total_len);
4190 return total_len;
4191 }
4192
4193 reaperage = get_reaper_age(fc->pid);
4194 /* To understand why this is done, please read the comment to the
4195 * get_reaper_busy() function.
4196 */
4197 idletime = reaperage;
4198 if (reaperage >= busytime)
4199 idletime = reaperage - busytime;
4200
4201 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4202 if (total_len < 0 || total_len >= d->buflen){
4203 lxcfs_error("%s\n", "failed to write to cache");
4204 return 0;
4205 }
4206
4207 d->size = (int)total_len;
4208 d->cached = 1;
4209
4210 if (total_len > size) total_len = size;
4211
4212 memcpy(buf, d->buf, total_len);
4213 return total_len;
4214 }
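
/*
 * Example output (hypothetical numbers): for a reaper that is 1000 seconds
 * old with 40 seconds of accumulated cpuacct busy time, the emulated
 * /proc/uptime reads:
 *
 *     1000.00 960.00
 */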
4215
4216 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4217 struct fuse_file_info *fi)
4218 {
4219 char dev_name[72];
4220 struct fuse_context *fc = fuse_get_context();
4221 struct file_info *d = (struct file_info *)fi->fh;
4222 char *cg;
4223 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4224 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4225 unsigned long read = 0, write = 0;
4226 unsigned long read_merged = 0, write_merged = 0;
4227 unsigned long read_sectors = 0, write_sectors = 0;
4228 unsigned long read_ticks = 0, write_ticks = 0;
4229 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4230 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4231 char *cache = d->buf;
4232 size_t cache_size = d->buflen;
4233 char *line = NULL;
4234 size_t linelen = 0, total_len = 0, rv = 0;
4235 unsigned int major = 0, minor = 0;
4236 int i = 0;
4237 FILE *f = NULL;
4238
4239 if (offset){
4240 if (offset > d->size)
4241 return -EINVAL;
4242 if (!d->cached)
4243 return 0;
4244 int left = d->size - offset;
4245 total_len = left > size ? size: left;
4246 memcpy(buf, cache + offset, total_len);
4247 return total_len;
4248 }
4249
4250 pid_t initpid = lookup_initpid_in_store(fc->pid);
4251 if (initpid <= 0)
4252 initpid = fc->pid;
4253 cg = get_pid_cgroup(initpid, "blkio");
4254 if (!cg)
4255 return read_file("/proc/diskstats", buf, size, d);
4256 prune_init_slice(cg);
4257
4258 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4259 goto err;
4260 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4261 goto err;
4262 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4263 goto err;
4264 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4265 goto err;
4266 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4267 goto err;
4268
4269
4270 f = fopen("/proc/diskstats", "r");
4271 if (!f)
4272 goto err;
4273
4274 while (getline(&line, &linelen, f) != -1) {
4275 ssize_t l;
4276 char lbuf[256];
4277
4278 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4279 if (i != 3)
4280 continue;
4281
4282 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4283 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4284 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4285 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4286 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4287 read_sectors = read_sectors/512;
4288 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4289 write_sectors = write_sectors/512;
4290
4291 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4292 rd_svctm = rd_svctm/1000000;
4293 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4294 rd_wait = rd_wait/1000000;
4295 read_ticks = rd_svctm + rd_wait;
4296
4297 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4298 wr_svctm = wr_svctm/1000000;
4299 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4300 wr_wait = wr_wait/1000000;
4301 write_ticks = wr_svctm + wr_wait;
4302
4303 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4304 tot_ticks = tot_ticks/1000000;
4305
4306 memset(lbuf, 0, 256);
4307 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4308 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4309 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4310 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4311 else
4312 continue;
4313
4314 l = snprintf(cache, cache_size, "%s", lbuf);
4315 if (l < 0) {
4316 perror("Error writing to fuse buf");
4317 rv = 0;
4318 goto err;
4319 }
4320 if (l >= cache_size) {
4321 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4322 rv = 0;
4323 goto err;
4324 }
4325 cache += l;
4326 cache_size -= l;
4327 total_len += l;
4328 }
4329
4330 d->cached = 1;
4331 d->size = total_len;
4332 if (total_len > size) total_len = size;
4333 memcpy(buf, d->buf, total_len);
4334
4335 rv = total_len;
4336 err:
4337 free(cg);
4338 if (f)
4339 fclose(f);
4340 free(line);
4341 free(io_serviced_str);
4342 free(io_merged_str);
4343 free(io_service_bytes_str);
4344 free(io_wait_time_str);
4345 free(io_service_time_str);
4346 return rv;
4347 }
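/*
 * Illustrative example of the line synthesized above, for a hypothetical
 * device 8:0 (sda) whose blkio counters reported 100 reads and 200 writes:
 *
 *   8 0 sda 100 0 2048 10 200 0 4096 20 0 30 0
 *
 * The eleven value columns follow the kernel's /proc/diskstats layout
 * (reads, reads merged, sectors read, ms reading, writes, writes merged,
 * sectors written, ms writing, I/Os in flight, ms doing I/O, weighted ms).
 * The "in flight" and "weighted ms" fields are always 0 here because the
 * blkio controller does not expose them (ios_pgr and rq_ticks stay 0).
 */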
4348
4349 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4350 struct fuse_file_info *fi)
4351 {
4352 struct fuse_context *fc = fuse_get_context();
4353 struct file_info *d = (struct file_info *)fi->fh;
4354 char *cg = NULL;
4355 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4356 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4357 ssize_t total_len = 0, rv = 0;
4358 ssize_t l = 0;
4359 char *cache = d->buf;
4360
4361 if (offset) {
4362 if (offset > d->size)
4363 return -EINVAL;
4364 if (!d->cached)
4365 return 0;
4366 int left = d->size - offset;
4367 total_len = left > size ? size : left;
4368 memcpy(buf, cache + offset, total_len);
4369 return total_len;
4370 }
4371
4372 pid_t initpid = lookup_initpid_in_store(fc->pid);
4373 if (initpid <= 0)
4374 initpid = fc->pid;
4375 cg = get_pid_cgroup(initpid, "memory");
4376 if (!cg)
4377 return read_file("/proc/swaps", buf, size, d);
4378 prune_init_slice(cg);
4379
4380 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4381
4382 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4383 goto err;
4384
4385 memusage = strtoul(memusage_str, NULL, 10);
4386
4387 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4388 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4389
4390 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4391 memswusage = strtoul(memswusage_str, NULL, 10);
4392
4393 swap_total = (memswlimit - memlimit) / 1024;
4394 swap_free = (memswusage - memusage) / 1024;
4395 }
4396
4397 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4398
4399 /* When no mem + swap limit is specified, or swapaccount=0 is set */
4400 if (!memswlimit) {
4401 char *line = NULL;
4402 size_t linelen = 0;
4403 FILE *f = fopen("/proc/meminfo", "r");
4404
4405 if (!f)
4406 goto err;
4407
4408 while (getline(&line, &linelen, f) != -1) {
4409 if (startswith(line, "SwapTotal:")) {
4410 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4411 } else if (startswith(line, "SwapFree:")) {
4412 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4413 }
4414 }
4415
4416 free(line);
4417 fclose(f);
4418 }
4419
4420 if (swap_total > 0) {
4421 l = snprintf(d->buf + total_len, d->size - total_len,
4422 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4423 swap_total, swap_free);
4424 total_len += l;
4425 }
4426
4427 if (total_len < 0 || l < 0) {
4428 perror("Error writing to cache");
4429 rv = 0;
4430 goto err;
4431 }
4432
4433 d->cached = 1;
4434 d->size = (int)total_len;
4435
4436 if (total_len > size) total_len = size;
4437 memcpy(buf, d->buf, total_len);
4438 rv = total_len;
4439
4440 err:
4441 free(cg);
4442 free(memswlimit_str);
4443 free(memlimit_str);
4444 free(memusage_str);
4445 free(memswusage_str);
4446 return rv;
4447 }
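/*
 * Worked example for the arithmetic above (hypothetical limits): with
 * memory.memsw.limit_in_bytes = 2 GiB and memory.limit_in_bytes = 1 GiB,
 * swap_total = (2147483648 - 1073741824) / 1024 = 1048576 kB. With a
 * memsw usage of 1.5 GiB and a memory usage of 1.25 GiB, the second value
 * is (1610612736 - 1342177280) / 1024 = 262144 kB, printed in the "Used"
 * column of the synthesized /proc/swaps line.
 */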
4448 /*
4449 * Collect the pids below a cgroup path by reading each cgroup.procs file,
4450 * e.g. /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
4451 * @pid_buf : array receiving one pid string per entry.
4452 * @dpath : the cgroup path, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
4453 * @depth : how many directory levels below @dpath to descend.
4454 * @sum : the number of pids collected so far; the new total is returned.
4455 * @cfd : the file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu.
4456 */
4457 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4458 {
4459 DIR *dir;
4460 int fd;
4461 struct dirent *file;
4462 FILE *f = NULL;
4463 size_t linelen = 0;
4464 char *line = NULL;
4465 int pd;
4466 char *path_dir, *path;
4467 char **pid;
4468
4469 /* path = dpath + "/cgroup.procs" + '\0' */
4470 do {
4471 path = malloc(strlen(dpath) + 20);
4472 } while (!path);
4473
4474 strcpy(path, dpath);
4475 fd = openat(cfd, path, O_RDONLY);
4476 if (fd < 0)
4477 goto out;
4478
4479 dir = fdopendir(fd);
4480 if (dir == NULL) {
4481 close(fd);
4482 goto out;
4483 }
4484
4485 while (((file = readdir(dir)) != NULL) && depth > 0) {
4486 if (strncmp(file->d_name, ".", 1) == 0)
4487 continue;
4488 if (strncmp(file->d_name, "..", 1) == 0)
4489 continue;
4490 if (file->d_type == DT_DIR) {
4491 /* path + '/' + d_name + '\0' */
4492 do {
4493 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4494 } while (!path_dir);
4495 strcpy(path_dir, path);
4496 strcat(path_dir, "/");
4497 strcat(path_dir, file->d_name);
4498 pd = depth - 1;
4499 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4500 free(path_dir);
4501 }
4502 }
4503 closedir(dir);
4504
4505 strcat(path, "/cgroup.procs");
4506 fd = openat(cfd, path, O_RDONLY);
4507 if (fd < 0)
4508 goto out;
4509
4510 f = fdopen(fd, "r");
4511 if (!f) {
4512 close(fd);
4513 goto out;
4514 }
4515
4516 while (getline(&line, &linelen, f) != -1) {
4517 do {
4518 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4519 } while (!pid);
4520 *pid_buf = pid;
4521 do {
4522 *(*pid_buf + sum) = malloc(strlen(line) + 1);
4523 } while (*(*pid_buf + sum) == NULL);
4524 strcpy(*(*pid_buf + sum), line);
4525 sum++;
4526 }
4527 fclose(f);
4528 out:
4529 free(path);
4530 return sum;
4531 }
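/*
 * Typical call, as made by refresh_load() below (the container id here is
 * purely illustrative):
 *
 *   sum = calc_pid(&pid_buf, "./docker/abc", DEPTH_DIR, 0, cfd);
 *
 * This descends at most DEPTH_DIR directory levels and returns the number
 * of pid strings now stored in pid_buf.
 */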
4532 /*
4533 * calc_load calculates the load average according to the following formula:
4534 * load1 = load0 * exp + active * (1 - exp)
4535 *
4536 * @load1: the new load average.
4537 * @load0: the previous load average.
4538 * @active: the number of currently running pids.
4539 * @exp: the fixed-point decay constant defined at the top of this file.
4540 */
4541 static unsigned long
4542 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4543 {
4544 unsigned long newload;
4545
4546 active = active > 0 ? active * FIXED_1 : 0;
4547 newload = load * exp + active * (FIXED_1 - exp);
4548 if (active >= load)
4549 newload += FIXED_1 - 1;
4550
4551 return newload / FIXED_1;
4552 }
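/*
 * Worked example for one 5 second tick of the 1 minute average: with
 * FIXED_1 = 2048, EXP_1 = 1884, load = 0 and active = 2 running pids,
 *
 *   newload = 0 * 1884 + (2 * 2048) * (2048 - 1884) + (2048 - 1) = 673791
 *   newload / FIXED_1 = 328
 *
 * which proc_loadavg_read() below formats as roughly "0.16".
 */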
4553
4554 /*
4555 * Returns 0 if the container cgroup p->cg has gone away,
4556 * -1 if an error occurred during the refresh,
4557 * and a positive number equal to the total number of pids otherwise.
4558 */
4559 static int refresh_load(struct load_node *p, char *path)
4560 {
4561 FILE *f = NULL;
4562 char **idbuf;
4563 char proc_path[256];
4564 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4565 char *line = NULL;
4566 size_t linelen = 0;
4567 int sum, length;
4568 DIR *dp;
4569 struct dirent *file;
4570
4571 do {
4572 idbuf = malloc(sizeof(char *));
4573 } while (!idbuf);
4574 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4575 /* normal exit */
4576 if (sum == 0)
4577 goto out;
4578
4579 for (i = 0; i < sum; i++) {
4580 /* Strip the trailing '\n'. */
4581 length = strlen(idbuf[i])-1;
4582 idbuf[i][length] = '\0';
4583 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4584 if (ret < 0 || ret > 255) {
4585 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4586 i = sum;
4587 sum = -1;
4588 goto err_out;
4589 }
4590
4591 dp = opendir(proc_path);
4592 if (!dp) {
4593 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4594 continue;
4595 }
4596 while ((file = readdir(dp)) != NULL) {
4597 if (strncmp(file->d_name, ".", 1) == 0)
4598 continue;
4599 if (strncmp(file->d_name, "..", 1) == 0)
4600 continue;
4601 total_pid++;
4602 /* Track the largest pid seen as last_pid. */
4603 ret = atof(file->d_name);
4604 last_pid = (ret > last_pid) ? ret : last_pid;
4605
4606 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4607 if (ret < 0 || ret > 255) {
4608 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4609 i = sum;
4610 sum = -1;
4611 closedir(dp);
4612 goto err_out;
4613 }
4614 f = fopen(proc_path, "r");
4615 if (f != NULL) {
4616 while (getline(&line, &linelen, f) != -1) {
4617 /* Find State */
4618 if ((line[0] == 'S') && (line[1] == 't'))
4619 break;
4620 }
4621 if ((line[7] == 'R') || (line[7] == 'D'))
4622 run_pid++;
4623 fclose(f);
4624 }
4625 }
4626 closedir(dp);
4627 }
4628 /* Calculate the load averages. */
4629 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4630 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4631 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4632 p->run_pid = run_pid;
4633 p->total_pid = total_pid;
4634 p->last_pid = last_pid;
4635
4636 free(line);
4637 err_out:
4638 for (; i > 0; i--)
4639 free(idbuf[i-1]);
4640 out:
4641 free(idbuf);
4642 return sum;
4643 }
4644 /*
4645 * Worker thread: traverse the hash table and refresh every node's load.
4646 */
4647 void *load_begin(void *arg)
4648 {
4649
4650 char *path = NULL;
4651 int i, sum, length, ret;
4652 struct load_node *f;
4653 int first_node;
4654 clock_t time1, time2;
4655
4656 while (1) {
4657 if (loadavg_stop == 1)
4658 return NULL;
4659
4660 time1 = clock();
4661 for (i = 0; i < LOAD_SIZE; i++) {
4662 pthread_mutex_lock(&load_hash[i].lock);
4663 if (load_hash[i].next == NULL) {
4664 pthread_mutex_unlock(&load_hash[i].lock);
4665 continue;
4666 }
4667 f = load_hash[i].next;
4668 first_node = 1;
4669 while (f) {
4670 length = strlen(f->cg) + 2;
4671 do {
4672 /* strlen(f->cg) + '.' or "" + '\0' */
4673 path = malloc(length);
4674 } while (!path);
4675
4676 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4677 if (ret < 0 || ret > length - 1) {
4678 /* snprintf failed, ignore the node.*/
4679 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4680 goto out;
4681 }
4682 sum = refresh_load(f, path);
4683 if (sum == 0) {
4684 f = del_node(f, i);
4685 } else {
4686 out: f = f->next;
4687 }
4688 free(path);
4689 /* load_hash[i].lock is held only while handling the first node. */
4690 if (first_node == 1) {
4691 first_node = 0;
4692 pthread_mutex_unlock(&load_hash[i].lock);
4693 }
4694 }
4695 }
4696
4697 if (loadavg_stop == 1)
4698 return NULL;
4699
4700 time2 = clock();
4701 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4702 }
4703 }
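/*
 * A full refresh pass over all LOAD_SIZE buckets therefore runs roughly
 * every FLUSH_TIME (5) seconds: the usleep() above subtracts the time the
 * pass itself consumed, as measured with clock(), from the flush interval.
 */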
4704
4705 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4706 struct fuse_file_info *fi)
4707 {
4708 struct fuse_context *fc = fuse_get_context();
4709 struct file_info *d = (struct file_info *)fi->fh;
4710 pid_t initpid;
4711 char *cg;
4712 size_t total_len = 0;
4713 char *cache = d->buf;
4714 struct load_node *n;
4715 int hash;
4716 int cfd, rv = 0;
4717 unsigned long a, b, c;
4718
4719 if (offset) {
4720 if (offset > d->size)
4721 return -EINVAL;
4722 if (!d->cached)
4723 return 0;
4724 int left = d->size - offset;
4725 total_len = left > size ? size : left;
4726 memcpy(buf, cache + offset, total_len);
4727 return total_len;
4728 }
4729 if (!loadavg)
4730 return read_file("/proc/loadavg", buf, size, d);
4731
4732 initpid = lookup_initpid_in_store(fc->pid);
4733 if (initpid <= 0)
4734 initpid = fc->pid;
4735 cg = get_pid_cgroup(initpid, "cpu");
4736 if (!cg)
4737 return read_file("/proc/loadavg", buf, size, d);
4738
4739 prune_init_slice(cg);
4740 hash = calc_hash(cg);
4741 n = locate_node(cg, hash);
4742
4743 /* First time this cgroup is seen: allocate a node and insert it. */
4744 if (n == NULL) {
4745 if (!find_mounted_controller("cpu", &cfd)) {
4746 /*
4747 * locate_node() above leaves load_hash[hash].rdlock held, because a
4748 * node must not be deleted while a reader is still using it; drop it here.
4749 */
4750 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4751 rv = 0;
4752 goto err;
4753 }
4754 do {
4755 n = malloc(sizeof(struct load_node));
4756 } while (!n);
4757
4758 do {
4759 n->cg = malloc(strlen(cg)+1);
4760 } while (!n->cg);
4761 strcpy(n->cg, cg);
4762 n->avenrun[0] = 0;
4763 n->avenrun[1] = 0;
4764 n->avenrun[2] = 0;
4765 n->run_pid = 0;
4766 n->total_pid = 1;
4767 n->last_pid = initpid;
4768 n->cfd = cfd;
4769 insert_node(&n, hash);
4770 }
4771 a = n->avenrun[0] + (FIXED_1/200);
4772 b = n->avenrun[1] + (FIXED_1/200);
4773 c = n->avenrun[2] + (FIXED_1/200);
4774 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4775 LOAD_INT(a), LOAD_FRAC(a),
4776 LOAD_INT(b), LOAD_FRAC(b),
4777 LOAD_INT(c), LOAD_FRAC(c),
4778 n->run_pid, n->total_pid, n->last_pid);
4779 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4780 if (total_len < 0 || total_len >= d->buflen) {
4781 lxcfs_error("%s\n", "Failed to write to cache");
4782 rv = 0;
4783 goto err;
4784 }
4785 d->size = (int)total_len;
4786 d->cached = 1;
4787
4788 if (total_len > size)
4789 total_len = size;
4790 memcpy(buf, d->buf, total_len);
4791 rv = total_len;
4792
4793 err:
4794 free(cg);
4795 return rv;
4796 }
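/*
 * Example of the synthesized /proc/loadavg line (hypothetical values):
 *
 *   0.16 0.03 0.01 2/40 1234
 *
 * i.e. the three decayed averages from avenrun[], then run_pid/total_pid
 * and finally last_pid, all taken from this container's load_node.
 */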
4797 /* Returns the thread id of the load daemon on success, 0 on failure. */
4798 pthread_t load_daemon(int load_use)
4799 {
4800 int ret;
4801 pthread_t pid;
4802
4803 ret = init_load();
4804 if (ret == -1) {
4805 lxcfs_error("%s\n", "Failed to initialize the load hash table in load_daemon!");
4806 return 0;
4807 }
4808 ret = pthread_create(&pid, NULL, load_begin, NULL);
4809 if (ret != 0) {
4810 lxcfs_error("%s\n", "Failed to create the load daemon thread in load_daemon!");
4811 load_free();
4812 return 0;
4813 }
4814 /* Enable loadavg virtualization; load_use is expected to be 1 here. */
4815 loadavg = load_use;
4816 return pid;
4817 }
4818
4819 /* Returns 0 on success. */
4820 int stop_load_daemon(pthread_t pid)
4821 {
4822 int s;
4823
4824 /* Signal the thread to gracefully stop */
4825 loadavg_stop = 1;
4826
4827 s = pthread_join(pid, NULL); /* Wait for the load daemon thread to exit. */
4828 if (s != 0) {
4829 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4830 return -1;
4831 }
4832
4833 load_free();
4834 loadavg_stop = 0;
4835
4836 return 0;
4837 }
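/*
 * Minimal caller sketch, assuming loadavg virtualization was requested
 * (for instance via a command line option handled by the frontend):
 *
 *   pthread_t tid = load_daemon(1);
 *   if (tid) {
 *           ... serve requests ...
 *           stop_load_daemon(tid);
 *   }
 *
 * Treating 0 as failure is only meaningful because load_daemon() itself
 * returns 0 when it could not start the thread.
 */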
4838
4839 static off_t get_procfile_size(const char *which)
4840 {
4841 FILE *f = fopen(which, "r");
4842 char *line = NULL;
4843 size_t len = 0;
4844 ssize_t sz, answer = 0;
4845 if (!f)
4846 return 0;
4847
4848 while ((sz = getline(&line, &len, f)) != -1)
4849 answer += sz;
4850 fclose (f);
4851 free(line);
4852
4853 return answer;
4854 }
4855
4856 int proc_getattr(const char *path, struct stat *sb)
4857 {
4858 struct timespec now;
4859
4860 memset(sb, 0, sizeof(struct stat));
4861 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
4862 return -EINVAL;
4863 sb->st_uid = sb->st_gid = 0;
4864 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
4865 if (strcmp(path, "/proc") == 0) {
4866 sb->st_mode = S_IFDIR | 00555;
4867 sb->st_nlink = 2;
4868 return 0;
4869 }
4870 if (strcmp(path, "/proc/meminfo") == 0 ||
4871 strcmp(path, "/proc/cpuinfo") == 0 ||
4872 strcmp(path, "/proc/uptime") == 0 ||
4873 strcmp(path, "/proc/stat") == 0 ||
4874 strcmp(path, "/proc/diskstats") == 0 ||
4875 strcmp(path, "/proc/swaps") == 0 ||
4876 strcmp(path, "/proc/loadavg") == 0) {
4877 sb->st_size = 0;
4878 sb->st_mode = S_IFREG | 00444;
4879 sb->st_nlink = 1;
4880 return 0;
4881 }
4882
4883 return -ENOENT;
4884 }
4885
4886 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4887 struct fuse_file_info *fi)
4888 {
4889 if (filler(buf, ".", NULL, 0) != 0 ||
4890 filler(buf, "..", NULL, 0) != 0 ||
4891 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4892 filler(buf, "meminfo", NULL, 0) != 0 ||
4893 filler(buf, "stat", NULL, 0) != 0 ||
4894 filler(buf, "uptime", NULL, 0) != 0 ||
4895 filler(buf, "diskstats", NULL, 0) != 0 ||
4896 filler(buf, "swaps", NULL, 0) != 0 ||
4897 filler(buf, "loadavg", NULL, 0) != 0)
4898 return -EINVAL;
4899 return 0;
4900 }
4901
4902 int proc_open(const char *path, struct fuse_file_info *fi)
4903 {
4904 int type = -1;
4905 struct file_info *info;
4906
4907 if (strcmp(path, "/proc/meminfo") == 0)
4908 type = LXC_TYPE_PROC_MEMINFO;
4909 else if (strcmp(path, "/proc/cpuinfo") == 0)
4910 type = LXC_TYPE_PROC_CPUINFO;
4911 else if (strcmp(path, "/proc/uptime") == 0)
4912 type = LXC_TYPE_PROC_UPTIME;
4913 else if (strcmp(path, "/proc/stat") == 0)
4914 type = LXC_TYPE_PROC_STAT;
4915 else if (strcmp(path, "/proc/diskstats") == 0)
4916 type = LXC_TYPE_PROC_DISKSTATS;
4917 else if (strcmp(path, "/proc/swaps") == 0)
4918 type = LXC_TYPE_PROC_SWAPS;
4919 else if (strcmp(path, "/proc/loadavg") == 0)
4920 type = LXC_TYPE_PROC_LOADAVG;
4921 if (type == -1)
4922 return -ENOENT;
4923
4924 info = malloc(sizeof(*info));
4925 if (!info)
4926 return -ENOMEM;
4927
4928 memset(info, 0, sizeof(*info));
4929 info->type = type;
4930
4931 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4932 do {
4933 info->buf = malloc(info->buflen);
4934 } while (!info->buf);
4935 memset(info->buf, 0, info->buflen);
4936 /* set actual size to buffer size */
4937 info->size = info->buflen;
4938
4939 fi->fh = (unsigned long)info;
4940 return 0;
4941 }
4942
4943 int proc_access(const char *path, int mask)
4944 {
4945 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
4946 return 0;
4947
4948 /* these are all read-only */
4949 if ((mask & ~R_OK) != 0)
4950 return -EACCES;
4951 return 0;
4952 }
4953
4954 int proc_release(const char *path, struct fuse_file_info *fi)
4955 {
4956 do_release_file_info(fi);
4957 return 0;
4958 }
4959
4960 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4961 struct fuse_file_info *fi)
4962 {
4963 struct file_info *f = (struct file_info *) fi->fh;
4964
4965 switch (f->type) {
4966 case LXC_TYPE_PROC_MEMINFO:
4967 return proc_meminfo_read(buf, size, offset, fi);
4968 case LXC_TYPE_PROC_CPUINFO:
4969 return proc_cpuinfo_read(buf, size, offset, fi);
4970 case LXC_TYPE_PROC_UPTIME:
4971 return proc_uptime_read(buf, size, offset, fi);
4972 case LXC_TYPE_PROC_STAT:
4973 return proc_stat_read(buf, size, offset, fi);
4974 case LXC_TYPE_PROC_DISKSTATS:
4975 return proc_diskstats_read(buf, size, offset, fi);
4976 case LXC_TYPE_PROC_SWAPS:
4977 return proc_swaps_read(buf, size, offset, fi);
4978 case LXC_TYPE_PROC_LOADAVG:
4979 return proc_loadavg_read(buf, size, offset, fi);
4980 default:
4981 return -EINVAL;
4982 }
4983 }
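/*
 * These proc_* handlers are meant to be dispatched from the top-level FUSE
 * operations for paths below /proc. A minimal sketch of such a wiring
 * (hypothetical; the real dispatch lives in the lxcfs frontend):
 *
 *   static const struct fuse_operations ops = {
 *           .getattr = proc_getattr,
 *           .readdir = proc_readdir,
 *           .open    = proc_open,
 *           .access  = proc_access,
 *           .release = proc_release,
 *           .read    = proc_read,
 *   };
 */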
4984
4985 /*
4986 * Functions needed to set up cgroups in the __constructor__.
4987 */
4988
4989 static bool mkdir_p(const char *dir, mode_t mode)
4990 {
4991 const char *tmp = dir;
4992 const char *orig = dir;
4993 char *makeme;
4994
4995 do {
4996 dir = tmp + strspn(tmp, "/");
4997 tmp = dir + strcspn(dir, "/");
4998 makeme = strndup(orig, dir - orig);
4999 if (!makeme)
5000 return false;
5001 if (mkdir(makeme, mode) && errno != EEXIST) {
5002 lxcfs_error("Failed to create directory '%s': %s.\n",
5003 makeme, strerror(errno));
5004 free(makeme);
5005 return false;
5006 }
5007 free(makeme);
5008 } while(tmp != dir);
5009
5010 return true;
5011 }
5012
5013 static bool umount_if_mounted(void)
5014 {
5015 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5016 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5017 return false;
5018 }
5019 return true;
5020 }
5021
5022 /* __typeof__ should be safe to use with all compilers. */
5023 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5024 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5025 {
5026 return (fs->f_type == (fs_type_magic)magic_val);
5027 }
5028
5029 /*
5030 * Looking at fs/proc_namespace.c, it appears we can expect the rootfs
5031 * entry in /proc/self/mountinfo to very specifically contain
5032 * " - rootfs rootfs "
5033 * and, so long as we have chrooted so that rootfs is not our root,
5034 * the rootfs entry should always be skipped in the mountinfo contents.
5035 */
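/*
 * An illustrative /proc/self/mountinfo line that would match below:
 *
 *   15 0 0:1 / / rw - rootfs rootfs rw
 *
 * The fifth field is the mount point that is compared against "/", and the
 * part after " - " holds the filesystem type and source.
 */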
5036 static bool is_on_ramfs(void)
5037 {
5038 FILE *f;
5039 char *p, *p2;
5040 char *line = NULL;
5041 size_t len = 0;
5042 int i;
5043
5044 f = fopen("/proc/self/mountinfo", "r");
5045 if (!f)
5046 return false;
5047
5048 while (getline(&line, &len, f) != -1) {
5049 for (p = line, i = 0; p && i < 4; i++)
5050 p = strchr(p + 1, ' ');
5051 if (!p)
5052 continue;
5053 p2 = strchr(p + 1, ' ');
5054 if (!p2)
5055 continue;
5056 *p2 = '\0';
5057 if (strcmp(p + 1, "/") == 0) {
5058 // this is '/'. is it the ramfs?
5059 p = strchr(p2 + 1, '-');
5060 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5061 free(line);
5062 fclose(f);
5063 return true;
5064 }
5065 }
5066 }
5067 free(line);
5068 fclose(f);
5069 return false;
5070 }
5071
5072 static int pivot_enter()
5073 {
5074 int ret = -1, oldroot = -1, newroot = -1;
5075
5076 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5077 if (oldroot < 0) {
5078 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5079 return ret;
5080 }
5081
5082 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5083 if (newroot < 0) {
5084 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5085 goto err;
5086 }
5087
5088 /* change into new root fs */
5089 if (fchdir(newroot) < 0) {
5090 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5091 goto err;
5092 }
5093
5094 /* pivot_root into our new root fs */
5095 if (pivot_root(".", ".") < 0) {
5096 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5097 goto err;
5098 }
5099
5100 /*
5101 * At this point the old root is mounted on top of our new root. To
5102 * unmount it we must not be chdir()'d into it, so escape back
5103 * to the old root.
5104 */
5105 if (fchdir(oldroot) < 0) {
5106 lxcfs_error("%s\n", "Failed to enter old root.");
5107 goto err;
5108 }
5109
5110 if (umount2(".", MNT_DETACH) < 0) {
5111 lxcfs_error("%s\n", "Failed to detach old root.");
5112 goto err;
5113 }
5114
5115 if (fchdir(newroot) < 0) {
5116 lxcfs_error("%s\n", "Failed to re-enter new root.");
5117 goto err;
5118 }
5119
5120 ret = 0;
5121
5122 err:
5123 if (oldroot > 0)
5124 close(oldroot);
5125 if (newroot > 0)
5126 close(newroot);
5127
5128 return ret;
5129 }
5130
5131 static int chroot_enter()
5132 {
5133 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5134 lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5135 return -1;
5136 }
5137
5138 if (chroot(".") < 0) {
5139 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5140 return -1;
5141 }
5142
5143 if (chdir("/") < 0) {
5144 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5145 return -1;
5146 }
5147
5148 return 0;
5149 }
5150
5151 static int permute_and_enter(void)
5152 {
5153 struct statfs sb;
5154
5155 if (statfs("/", &sb) < 0) {
5156 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5157 return -1;
5158 }
5159
5160 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5161 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5162 * /proc/self/mountinfo via is_on_ramfs(). */
5163 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5164 return chroot_enter();
5165
5166 if (pivot_enter() < 0) {
5167 lxcfs_error("%s\n", "Could not perform pivot root.");
5168 return -1;
5169 }
5170
5171 return 0;
5172 }
5173
5174 /* Prepare our new clean root. */
5175 static int permute_prepare(void)
5176 {
5177 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5178 lxcfs_error("%s\n", "Failed to create directory for new root.");
5179 return -1;
5180 }
5181
5182 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5183 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5184 return -1;
5185 }
5186
5187 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5188 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5189 return -1;
5190 }
5191
5192 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5193 lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5194 return -1;
5195 }
5196
5197 return 0;
5198 }
5199
5200 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
5201 static bool permute_root(void)
5202 {
5203 /* Prepare new root. */
5204 if (permute_prepare() < 0)
5205 return false;
5206
5207 /* Pivot into new root. */
5208 if (permute_and_enter() < 0)
5209 return false;
5210
5211 return true;
5212 }
5213
5214 static int preserve_mnt_ns(int pid)
5215 {
5216 int ret;
5217 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
5218 char path[len];
5219
5220 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
5221 if (ret < 0 || (size_t)ret >= len)
5222 return -1;
5223
5224 return open(path, O_RDONLY | O_CLOEXEC);
5225 }
5226
5227 static bool cgfs_prepare_mounts(void)
5228 {
5229 if (!mkdir_p(BASEDIR, 0700)) {
5230 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5231 return false;
5232 }
5233
5234 if (!umount_if_mounted()) {
5235 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5236 return false;
5237 }
5238
5239 if (unshare(CLONE_NEWNS) < 0) {
5240 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5241 return false;
5242 }
5243
5244 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5245 if (cgroup_mount_ns_fd < 0) {
5246 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5247 return false;
5248 }
5249
5250 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5251 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5252 return false;
5253 }
5254
5255 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5256 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5257 return false;
5258 }
5259
5260 return true;
5261 }
5262
5263 static bool cgfs_mount_hierarchies(void)
5264 {
5265 char *target;
5266 size_t clen, len;
5267 int i, ret;
5268
5269 for (i = 0; i < num_hierarchies; i++) {
5270 char *controller = hierarchies[i];
5271
5272 clen = strlen(controller);
5273 len = strlen(BASEDIR) + clen + 2;
5274 target = malloc(len);
5275 if (!target)
5276 return false;
5277
5278 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5279 if (ret < 0 || ret >= len) {
5280 free(target);
5281 return false;
5282 }
5283 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5284 free(target);
5285 return false;
5286 }
5287 if (!strcmp(controller, "unified"))
5288 ret = mount("none", target, "cgroup2", 0, NULL);
5289 else
5290 ret = mount(controller, target, "cgroup", 0, controller);
5291 if (ret < 0) {
5292 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5293 free(target);
5294 return false;
5295 }
5296
5297 fd_hierarchies[i] = open(target, O_DIRECTORY);
5298 if (fd_hierarchies[i] < 0) {
5299 free(target);
5300 return false;
5301 }
5302 free(target);
5303 }
5304 return true;
5305 }
5306
5307 static bool cgfs_setup_controllers(void)
5308 {
5309 if (!cgfs_prepare_mounts())
5310 return false;
5311
5312 if (!cgfs_mount_hierarchies()) {
5313 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
5314 return false;
5315 }
5316
5317 if (!permute_root())
5318 return false;
5319
5320 return true;
5321 }
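/*
 * On success, the private mount namespace created above holds a tmpfs at
 * BASEDIR with one cgroup mount per hierarchy (BASEDIR/<controller>), and
 * fd_hierarchies[], filled in by cgfs_mount_hierarchies(), keeps an
 * O_DIRECTORY fd to each of them for later use with openat().
 */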
5322
5323 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5324 {
5325 FILE *f;
5326 char *cret, *line = NULL;
5327 char cwd[MAXPATHLEN];
5328 size_t len = 0;
5329 int i, init_ns = -1;
5330 bool found_unified = false;
5331
5332 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5333 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5334 return;
5335 }
5336
5337 while (getline(&line, &len, f) != -1) {
5338 char *idx, *p, *p2;
5339
5340 p = strchr(line, ':');
5341 if (!p)
5342 goto out;
5343 idx = line;
5344 *(p++) = '\0';
5345
5346 p2 = strrchr(p, ':');
5347 if (!p2)
5348 goto out;
5349 *p2 = '\0';
5350
5351 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5352 * form: 0::/ The empty controller name would later be passed to
5353 * mount() and make the cgroup mounts fail, so record such entries
5354 * under the name "unified" instead of skipping them.
5355 */
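/* Illustrative /proc/self/cgroup content on a hybrid host:
 *   4:memory:/user.slice
 *   0::/
 * The first line yields the "memory" hierarchy, the second is recorded
 * under the name "unified".
 */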
5356 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5357 found_unified = true;
5358 p = "unified";
5359 }
5360
5361 if (!store_hierarchy(line, p))
5362 goto out;
5363 }
5364
5365 /* Preserve initial namespace. */
5366 init_ns = preserve_mnt_ns(getpid());
5367 if (init_ns < 0) {
5368 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5369 goto out;
5370 }
5371
5372 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5373 if (!fd_hierarchies) {
5374 lxcfs_error("%s\n", strerror(errno));
5375 goto out;
5376 }
5377
5378 for (i = 0; i < num_hierarchies; i++)
5379 fd_hierarchies[i] = -1;
5380
5381 cret = getcwd(cwd, MAXPATHLEN);
5382 if (!cret)
5383 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5384
5385 /* This function unshares (CLONE_NEWNS) from our initial mount namespace
5386 * so that the lxcfs cgroups can be mounted privately. */
5387 if (!cgfs_setup_controllers()) {
5388 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5389 goto out;
5390 }
5391
5392 if (setns(init_ns, 0) < 0) {
5393 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5394 goto out;
5395 }
5396
5397 if (!cret || chdir(cwd) < 0)
5398 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5399
5400 print_subsystems();
5401
5402 out:
5403 free(line);
5404 fclose(f);
5405 if (init_ns >= 0)
5406 close(init_ns);
5407 }
5408
5409 static void __attribute__((destructor)) free_subsystems(void)
5410 {
5411 int i;
5412
5413 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5414
5415 for (i = 0; i < num_hierarchies; i++) {
5416 if (hierarchies[i])
5417 free(hierarchies[i]);
5418 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5419 close(fd_hierarchies[i]);
5420 }
5421 free(hierarchies);
5422 free(fd_hierarchies);
5423
5424 if (cgroup_mount_ns_fd >= 0)
5425 close(cgroup_mount_ns_fd);
5426 }