1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdbool.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <time.h>
26 #include <unistd.h>
27 #include <wait.h>
28 #include <linux/magic.h>
29 #include <linux/sched.h>
30 #include <sys/epoll.h>
31 #include <sys/mman.h>
32 #include <sys/mount.h>
33 #include <sys/param.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/vfs.h>
38
39 #include "bindings.h"
40 #include "config.h" // for VERSION
41
42 /* The largest unsigned 64-bit integer, 2^64 - 1 = 18446744073709551615, has 20 digits; add one for the terminating NUL: 21 bytes. */
43 #define LXCFS_NUMSTRLEN64 21
44
45 /* Define pivot_root() if missing from the C library */
46 #ifndef HAVE_PIVOT_ROOT
47 static int pivot_root(const char * new_root, const char * put_old)
48 {
49 #ifdef __NR_pivot_root
50 return syscall(__NR_pivot_root, new_root, put_old);
51 #else
52 errno = ENOSYS;
53 return -1;
54 #endif
55 }
56 #else
57 extern int pivot_root(const char * new_root, const char * put_old);
58 #endif
59
60 enum {
61 LXC_TYPE_CGDIR,
62 LXC_TYPE_CGFILE,
63 LXC_TYPE_PROC_MEMINFO,
64 LXC_TYPE_PROC_CPUINFO,
65 LXC_TYPE_PROC_UPTIME,
66 LXC_TYPE_PROC_STAT,
67 LXC_TYPE_PROC_DISKSTATS,
68 LXC_TYPE_PROC_SWAPS,
69 LXC_TYPE_PROC_LOADAVG,
70 };
71
72 struct file_info {
73 char *controller;
74 char *cgroup;
75 char *file;
76 int type;
77 char *buf; // unused as of yet
78 int buflen;
79 int size; //actual data size
80 int cached;
81 };
82
83 struct cpuacct_usage {
84 uint64_t user;
85 uint64_t system;
86 };
87
88 /* Constants for the loadavg hash table. */
89 #define LOAD_SIZE 100 /* number of buckets in the hash table */
90 #define FLUSH_TIME 5 /* refresh interval in seconds */
91 #define DEPTH_DIR 3 /* directory depth scanned per cgroup */
92 /* Constants for calculating loadavg. */
93 #define FSHIFT 11 /* nr of bits of precision */
94 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
95 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
96 #define EXP_5 2014 /* 1/exp(5sec/5min) */
97 #define EXP_15 2037 /* 1/exp(5sec/15min) */
98 #define LOAD_INT(x) ((x) >> FSHIFT)
99 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
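/*
 * For illustration: with FSHIFT = 11, FIXED_1 = 2048, a load of 1.25 is
 * stored as 2560, so LOAD_INT(2560) = 1 and LOAD_FRAC(2560) =
 * (512 * 100) >> 11 = 25, which prints as "1.25". The EXP_* constants drive
 * the kernel-style exponential decay, roughly
 *   load = (load * EXP_n + active * (FIXED_1 - EXP_n)) >> FSHIFT,
 * applied on each FLUSH_TIME tick.
 */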
100 /*
101 * This parameter is used by proc_loadavg_read().
102 * 1 means the virtualized loadavg is enabled, 0 means it is disabled.
103 */
104 static int loadavg = 0;
105 static volatile sig_atomic_t loadavg_stop = 0;
106 static int calc_hash(char *name)
107 {
108 unsigned int hash = 0;
109 unsigned int x = 0;
110 /* ELFHash algorithm. */
111 while (*name) {
112 hash = (hash << 4) + *name++;
113 x = hash & 0xf0000000;
114 if (x != 0)
115 hash ^= (x >> 24);
116 hash &= ~x;
117 }
118 return ((hash & 0x7fffffff) % LOAD_SIZE);
119 }
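/*
 * calc_hash() maps a cgroup path (e.g. "/lxc/c1") to a bucket index in
 * [0, LOAD_SIZE); load_hash[calc_hash(cg)] is the bucket whose list is
 * searched by locate_node() below.
 */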
120
121 struct load_node {
122 char *cg; /* the cgroup path this node tracks */
123 unsigned long avenrun[3]; /* Load averages */
124 unsigned int run_pid;
125 unsigned int total_pid;
126 unsigned int last_pid;
127 int cfd; /* The file descriptor of the mounted cgroup */
128 struct load_node *next;
129 struct load_node **pre;
130 };
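/*
 * Note on the list links above: @pre does not point at the previous node but
 * at the pointer that points to this node (either the bucket's head pointer
 * or the previous node's @next). This lets del_node() unlink a node with
 * *(n->pre) = n->next without special-casing the head of the list.
 */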
131
132 struct load_head {
133 /*
134 * The lock serializes inserting and refreshing load_node entries. For the
135 * first load_node of each hash bucket, insert and refresh are mutually
136 * exclusive.
137 */
138 pthread_mutex_t lock;
139 /*
140 * The rdlock serializes reading loadavg values against deleting load_node
141 * entries. Within each hash bucket, read and delete are mutually exclusive,
142 * but concurrent reads are allowed. This rdlock is at list level.
143 */
144 pthread_rwlock_t rdlock;
145 /*
146 * The rilock serializes reading loadavg values against inserting load_node
147 * entries. For the first load_node of each hash bucket, read and insert are
148 * mutually exclusive, but concurrent reads are allowed.
149 */
150 pthread_rwlock_t rilock;
151 struct load_node *next;
152 };
153
154 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
155 /*
156 * init_load() initializes the hash table.
157 * Returns 0 on success, -1 on failure.
158 */
159 static int init_load(void)
160 {
161 int i;
162 int ret;
163
164 for (i = 0; i < LOAD_SIZE; i++) {
165 load_hash[i].next = NULL;
166 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
167 if (ret != 0) {
168 lxcfs_error("%s\n", "Failed to initialize lock");
169 goto out3;
170 }
171 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
172 if (ret != 0) {
173 lxcfs_error("%s\n", "Failed to initialize rdlock");
174 goto out2;
175 }
176 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
177 if (ret != 0) {
178 lxcfs_error("%s\n", "Failed to initialize rilock");
179 goto out1;
180 }
181 }
182 return 0;
183 out1:
184 pthread_rwlock_destroy(&load_hash[i].rdlock);
185 out2:
186 pthread_mutex_destroy(&load_hash[i].lock);
187 out3:
188 while (i > 0) {
189 i--;
190 pthread_mutex_destroy(&load_hash[i].lock);
191 pthread_rwlock_destroy(&load_hash[i].rdlock);
192 pthread_rwlock_destroy(&load_hash[i].rilock);
193 }
194 return -1;
195 }
196
197 static void insert_node(struct load_node **n, int locate)
198 {
199 struct load_node *f;
200
201 pthread_mutex_lock(&load_hash[locate].lock);
202 pthread_rwlock_wrlock(&load_hash[locate].rilock);
203 f = load_hash[locate].next;
204 load_hash[locate].next = *n;
205
206 (*n)->pre = &(load_hash[locate].next);
207 if (f)
208 f->pre = &((*n)->next);
209 (*n)->next = f;
210 pthread_mutex_unlock(&load_hash[locate].lock);
211 pthread_rwlock_unlock(&load_hash[locate].rilock);
212 }
213 /*
214 * locate_node() looks up a specific node; a non-NULL return value means success.
215 * Note that rdlock is deliberately not released before returning, because
216 * the caller still has to read the node and deletion must not happen until
217 * that read has finished.
218 * rdlock is released only in proc_loadavg_read().
219 */
220 static struct load_node *locate_node(char *cg, int locate)
221 {
222 struct load_node *f = NULL;
223 int i = 0;
224
225 pthread_rwlock_rdlock(&load_hash[locate].rilock);
226 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
227 if (load_hash[locate].next == NULL) {
228 pthread_rwlock_unlock(&load_hash[locate].rilock);
229 return f;
230 }
231 f = load_hash[locate].next;
232 pthread_rwlock_unlock(&load_hash[locate].rilock);
233 while (f && ((i = strcmp(f->cg, cg)) != 0))
234 f = f->next;
235 return f;
236 }
237 /* Delete the load_node n and return the next node of it. */
238 static struct load_node *del_node(struct load_node *n, int locate)
239 {
240 struct load_node *g;
241
242 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
243 if (n->next == NULL) {
244 *(n->pre) = NULL;
245 } else {
246 *(n->pre) = n->next;
247 n->next->pre = n->pre;
248 }
249 g = n->next;
250 free(n->cg);
251 free(n);
252 pthread_rwlock_unlock(&load_hash[locate].rdlock);
253 return g;
254 }
255
256 static void load_free(void)
257 {
258 int i;
259 struct load_node *f, *p;
260
261 for (i = 0; i < LOAD_SIZE; i++) {
262 pthread_mutex_lock(&load_hash[i].lock);
263 pthread_rwlock_wrlock(&load_hash[i].rilock);
264 pthread_rwlock_wrlock(&load_hash[i].rdlock);
265 if (load_hash[i].next == NULL) {
266 pthread_mutex_unlock(&load_hash[i].lock);
267 pthread_mutex_destroy(&load_hash[i].lock);
268 pthread_rwlock_unlock(&load_hash[i].rilock);
269 pthread_rwlock_destroy(&load_hash[i].rilock);
270 pthread_rwlock_unlock(&load_hash[i].rdlock);
271 pthread_rwlock_destroy(&load_hash[i].rdlock);
272 continue;
273 }
274 for (f = load_hash[i].next; f; ) {
275 free(f->cg);
276 p = f->next;
277 free(f);
278 f = p;
279 }
280 pthread_mutex_unlock(&load_hash[i].lock);
281 pthread_mutex_destroy(&load_hash[i].lock);
282 pthread_rwlock_unlock(&load_hash[i].rilock);
283 pthread_rwlock_destroy(&load_hash[i].rilock);
284 pthread_rwlock_unlock(&load_hash[i].rdlock);
285 pthread_rwlock_destroy(&load_hash[i].rdlock);
286 }
287 }
288 /* Reserve buffer size to account for file size changes. */
289 #define BUF_RESERVE_SIZE 512
290
291 /*
292 * A table caching which pid is init for a pid namespace.
293 * When looking up which pid is init for $qpid, we first
294 * 1. Stat /proc/$qpid/ns/pid.
295 * 2. Check whether the ino_t is in our store.
296 * a. if not, fork a child in qpid's ns to send us
297 * ucred.pid = 1, and read the initpid. Cache
298 * initpid and creation time for /proc/initpid
299 * in a new store entry.
300 * b. if so, verify that /proc/initpid still matches
301 * what we have saved. If not, clear the store
302 * entry and go back to a. If so, return the
303 * cached initpid.
304 */
305 struct pidns_init_store {
306 ino_t ino; // inode number for /proc/$pid/ns/pid
307 pid_t initpid; // the pid of init in that ns
308 long int ctime; // the time at which /proc/$initpid was created
309 struct pidns_init_store *next;
310 long int lastcheck;
311 };
312
313 /* lol - look at how they are allocated in the kernel */
314 #define PIDNS_HASH_SIZE 4096
315 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
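/*
 * HASH() simply buckets the pid-namespace inode number. For example, an ns
 * inode of 4026531836 (a typical value for the initial pid namespace) lands
 * in bucket 4026531836 % 4096 = 4092 of pidns_hash_table below.
 */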
316
317 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
318 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
319 static void lock_mutex(pthread_mutex_t *l)
320 {
321 int ret;
322
323 if ((ret = pthread_mutex_lock(l)) != 0) {
324 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
325 exit(1);
326 }
327 }
328
329 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
330 * Number of hierarchies mounted. */
331 static int num_hierarchies;
332
333 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
334 * Hierarchies mounted {cpuset, blkio, ...}:
335 * Initialized via __constructor__ collect_and_mount_subsystems(). */
336 static char **hierarchies;
337
338 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
339 * Open file descriptors:
340 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
341 * private mount namespace.
342 * Initialized via __constructor__ collect_and_mount_subsystems().
343 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
344 * mounts and respective files in the private namespace even when located in
345 * another namespace using the *at() family of functions
346 * {openat(), fchownat(), ...}. */
347 static int *fd_hierarchies;
348 static int cgroup_mount_ns_fd = -1;
349
350 static void unlock_mutex(pthread_mutex_t *l)
351 {
352 int ret;
353
354 if ((ret = pthread_mutex_unlock(l)) != 0) {
355 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
356 exit(1);
357 }
358 }
359
360 static void store_lock(void)
361 {
362 lock_mutex(&pidns_store_mutex);
363 }
364
365 static void store_unlock(void)
366 {
367 unlock_mutex(&pidns_store_mutex);
368 }
369
370 /* Must be called under store_lock */
371 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
372 {
373 struct stat initsb;
374 char fnam[100];
375
376 snprintf(fnam, 100, "/proc/%d", e->initpid);
377 if (stat(fnam, &initsb) < 0)
378 return false;
379
380 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
381 initsb.st_ctime, e->initpid);
382
383 if (e->ctime != initsb.st_ctime)
384 return false;
385 return true;
386 }
387
388 /* Must be called under store_lock */
389 static void remove_initpid(struct pidns_init_store *e)
390 {
391 struct pidns_init_store *tmp;
392 int h;
393
394 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
395
396 h = HASH(e->ino);
397 if (pidns_hash_table[h] == e) {
398 pidns_hash_table[h] = e->next;
399 free(e);
400 return;
401 }
402
403 tmp = pidns_hash_table[h];
404 while (tmp) {
405 if (tmp->next == e) {
406 tmp->next = e->next;
407 free(e);
408 return;
409 }
410 tmp = tmp->next;
411 }
412 }
413
414 #define PURGE_SECS 5
415 /* Must be called under store_lock */
416 static void prune_initpid_store(void)
417 {
418 static long int last_prune = 0;
419 struct pidns_init_store *e, *prev, *delme;
420 long int now, threshold;
421 int i;
422
423 if (!last_prune) {
424 last_prune = time(NULL);
425 return;
426 }
427 now = time(NULL);
428 if (now < last_prune + PURGE_SECS)
429 return;
430
431 lxcfs_debug("%s\n", "Pruning.");
432
433 last_prune = now;
434 threshold = now - 2 * PURGE_SECS;
435
436 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
437 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
438 if (e->lastcheck < threshold) {
439
440 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
441
442 delme = e;
443 if (prev)
444 prev->next = e->next;
445 else
446 pidns_hash_table[i] = e->next;
447 e = e->next;
448 free(delme);
449 } else {
450 prev = e;
451 e = e->next;
452 }
453 }
454 }
455 }
456
457 /* Must be called under store_lock */
458 static void save_initpid(struct stat *sb, pid_t pid)
459 {
460 struct pidns_init_store *e;
461 char fpath[100];
462 struct stat procsb;
463 int h;
464
465 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
466
467 snprintf(fpath, 100, "/proc/%d", pid);
468 if (stat(fpath, &procsb) < 0)
469 return;
470 do {
471 e = malloc(sizeof(*e));
472 } while (!e);
473 e->ino = sb->st_ino;
474 e->initpid = pid;
475 e->ctime = procsb.st_ctime;
476 h = HASH(e->ino);
477 e->next = pidns_hash_table[h];
478 e->lastcheck = time(NULL);
479 pidns_hash_table[h] = e;
480 }
481
482 /*
483 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
484 * entry for the inode number and creation time. Verify that the init pid
485 * is still valid. If not, remove it. Return the entry if valid, NULL
486 * otherwise.
487 * Must be called under store_lock
488 */
489 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
490 {
491 int h = HASH(sb->st_ino);
492 struct pidns_init_store *e = pidns_hash_table[h];
493
494 while (e) {
495 if (e->ino == sb->st_ino) {
496 if (initpid_still_valid(e, sb)) {
497 e->lastcheck = time(NULL);
498 return e;
499 }
500 remove_initpid(e);
501 return NULL;
502 }
503 e = e->next;
504 }
505
506 return NULL;
507 }
508
509 static int is_dir(const char *path, int fd)
510 {
511 struct stat statbuf;
512 int ret = fstatat(fd, path, &statbuf, 0);
513 if (ret == 0 && S_ISDIR(statbuf.st_mode))
514 return 1;
515 return 0;
516 }
517
518 static char *must_copy_string(const char *str)
519 {
520 char *dup = NULL;
521 if (!str)
522 return NULL;
523 do {
524 dup = strdup(str);
525 } while (!dup);
526
527 return dup;
528 }
529
530 static inline void drop_trailing_newlines(char *s)
531 {
532 int l;
533
534 for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
535 s[l-1] = '\0';
536 }
537
538 #define BATCH_SIZE 50
539 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
540 {
541 int newbatches = (newlen / BATCH_SIZE) + 1;
542 int oldbatches = (oldlen / BATCH_SIZE) + 1;
543
544 if (!*mem || newbatches > oldbatches) {
545 char *tmp;
546 do {
547 tmp = realloc(*mem, newbatches * BATCH_SIZE);
548 } while (!tmp);
549 *mem = tmp;
550 }
551 }
552 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
553 {
554 size_t newlen = *len + linelen;
555 dorealloc(contents, *len, newlen + 1);
556 memcpy(*contents + *len, line, linelen+1);
557 *len = newlen;
558 }
559
560 static char *slurp_file(const char *from, int fd)
561 {
562 char *line = NULL;
563 char *contents = NULL;
564 FILE *f = fdopen(fd, "r");
565 size_t len = 0, fulllen = 0;
566 ssize_t linelen;
567
568 if (!f)
569 return NULL;
570
571 while ((linelen = getline(&line, &len, f)) != -1) {
572 append_line(&contents, &fulllen, line, linelen);
573 }
574 fclose(f);
575
576 if (contents)
577 drop_trailing_newlines(contents);
578 free(line);
579 return contents;
580 }
581
582 static bool write_string(const char *fnam, const char *string, int fd)
583 {
584 FILE *f;
585 size_t len, ret;
586
587 f = fdopen(fd, "w");
588 if (!f)
589 return false;
590
591 len = strlen(string);
592 ret = fwrite(string, 1, len, f);
593 if (ret != len) {
594 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
595 strerror(errno), string, fnam);
596 fclose(f);
597 return false;
598 }
599
600 if (fclose(f) < 0) {
601 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
602 return false;
603 }
604
605 return true;
606 }
607
608 struct cgfs_files {
609 char *name;
610 uint32_t uid, gid;
611 uint32_t mode;
612 };
613
614 #define ALLOC_NUM 20
615 static bool store_hierarchy(char *stridx, char *h)
616 {
617 if (num_hierarchies % ALLOC_NUM == 0) {
618 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
619 n *= ALLOC_NUM;
620 char **tmp = realloc(hierarchies, n * sizeof(char *));
621 if (!tmp) {
622 lxcfs_error("%s\n", strerror(errno));
623 exit(1);
624 }
625 hierarchies = tmp;
626 }
627
628 hierarchies[num_hierarchies++] = must_copy_string(h);
629 return true;
630 }
631
632 static void print_subsystems(void)
633 {
634 int i;
635
636 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
637 fprintf(stderr, "hierarchies:\n");
638 for (i = 0; i < num_hierarchies; i++) {
639 if (hierarchies[i])
640 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
641 fd_hierarchies[i], hierarchies[i]);
642 }
643 }
644
645 static bool in_comma_list(const char *needle, const char *haystack)
646 {
647 const char *s = haystack, *e;
648 size_t nlen = strlen(needle);
649
650 while (*s && (e = strchr(s, ','))) {
651 if (nlen != e - s) {
652 s = e + 1;
653 continue;
654 }
655 if (strncmp(needle, s, nlen) == 0)
656 return true;
657 s = e + 1;
658 }
659 if (strcmp(needle, s) == 0)
660 return true;
661 return false;
662 }
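/*
 * For example, in_comma_list("cpu", "cpu,cpuacct") and
 * in_comma_list("cpuacct", "cpu,cpuacct") are true, while
 * in_comma_list("cpu", "cpuset") is false. find_mounted_controller() below
 * relies on this to match a single controller name against a co-mounted
 * hierarchy.
 */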
663
664 /* do we need to do any massaging here? I'm not sure... */
665 /* Return the mounted controller and store the corresponding open file descriptor
666 * referring to the controller mountpoint in the private lxcfs namespace in
667 * @cfd.
668 */
669 static char *find_mounted_controller(const char *controller, int *cfd)
670 {
671 int i;
672
673 for (i = 0; i < num_hierarchies; i++) {
674 if (!hierarchies[i])
675 continue;
676 if (strcmp(hierarchies[i], controller) == 0) {
677 *cfd = fd_hierarchies[i];
678 return hierarchies[i];
679 }
680 if (in_comma_list(controller, hierarchies[i])) {
681 *cfd = fd_hierarchies[i];
682 return hierarchies[i];
683 }
684 }
685
686 return NULL;
687 }
688
689 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
690 const char *value)
691 {
692 int ret, fd, cfd;
693 size_t len;
694 char *fnam, *tmpc;
695
696 tmpc = find_mounted_controller(controller, &cfd);
697 if (!tmpc)
698 return false;
699
700 /* Make sure we pass a relative path to *at() family of functions.
701 * . + /cgroup + / + file + \0
702 */
703 len = strlen(cgroup) + strlen(file) + 3;
704 fnam = alloca(len);
705 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
706 if (ret < 0 || (size_t)ret >= len)
707 return false;
708
709 fd = openat(cfd, fnam, O_WRONLY);
710 if (fd < 0)
711 return false;
712
713 return write_string(fnam, value, fd);
714 }
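/*
 * Example with illustrative values: cgfs_set_value("freezer", "/lxc/c1",
 * "freezer.state", "FROZEN") builds the relative path
 * "./lxc/c1/freezer.state" and opens it with openat() against the freezer
 * hierarchy's fd from fd_hierarchies, so the write lands in lxcfs's private
 * cgroup mount even though the caller lives in another mount namespace.
 */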
715
716 // Chown all the files in the cgroup directory. We do this when we create
717 // a cgroup on behalf of a user.
718 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
719 {
720 struct dirent *direntp;
721 char path[MAXPATHLEN];
722 size_t len;
723 DIR *d;
724 int fd1, ret;
725
726 len = strlen(dirname);
727 if (len >= MAXPATHLEN) {
728 lxcfs_error("Pathname too long: %s\n", dirname);
729 return;
730 }
731
732 fd1 = openat(fd, dirname, O_DIRECTORY);
733 if (fd1 < 0)
734 return;
735
736 d = fdopendir(fd1);
737 if (!d) {
738 lxcfs_error("Failed to open %s\n", dirname);
739 return;
740 }
741
742 while ((direntp = readdir(d))) {
743 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
744 continue;
745 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
746 if (ret < 0 || ret >= MAXPATHLEN) {
747 lxcfs_error("Pathname too long under %s\n", dirname);
748 continue;
749 }
750 if (fchownat(fd, path, uid, gid, 0) < 0)
751 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
752 }
753 closedir(d);
754 }
755
756 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
757 {
758 int cfd;
759 size_t len;
760 char *dirnam, *tmpc;
761
762 tmpc = find_mounted_controller(controller, &cfd);
763 if (!tmpc)
764 return -EINVAL;
765
766 /* Make sure we pass a relative path to *at() family of functions.
767 * . + /cg + \0
768 */
769 len = strlen(cg) + 2;
770 dirnam = alloca(len);
771 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
772
773 if (mkdirat(cfd, dirnam, 0755) < 0)
774 return -errno;
775
776 if (uid == 0 && gid == 0)
777 return 0;
778
779 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
780 return -errno;
781
782 chown_all_cgroup_files(dirnam, uid, gid, cfd);
783
784 return 0;
785 }
786
787 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
788 {
789 struct dirent *direntp;
790 DIR *dir;
791 bool ret = false;
792 char pathname[MAXPATHLEN];
793 int dupfd;
794
795 dupfd = dup(fd); // fdopendir() takes ownership of the fd it is given, so work on a duplicate.
796 if (dupfd < 0)
797 return false;
798
799 dir = fdopendir(dupfd);
800 if (!dir) {
801 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
802 close(dupfd);
803 return false;
804 }
805
806 while ((direntp = readdir(dir))) {
807 struct stat mystat;
808 int rc;
809
810 if (!strcmp(direntp->d_name, ".") ||
811 !strcmp(direntp->d_name, ".."))
812 continue;
813
814 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
815 if (rc < 0 || rc >= MAXPATHLEN) {
816 lxcfs_error("%s\n", "Pathname too long.");
817 continue;
818 }
819
820 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
821 if (rc) {
822 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
823 continue;
824 }
825 if (S_ISDIR(mystat.st_mode))
826 if (!recursive_rmdir(pathname, fd, cfd))
827 lxcfs_debug("Error removing %s.\n", pathname);
828 }
829
830 ret = true;
831 if (closedir(dir) < 0) {
832 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
833 ret = false;
834 }
835
836 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
837 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
838 ret = false;
839 }
840
841 close(dupfd);
842
843 return ret;
844 }
845
846 bool cgfs_remove(const char *controller, const char *cg)
847 {
848 int fd, cfd;
849 size_t len;
850 char *dirnam, *tmpc;
851 bool bret;
852
853 tmpc = find_mounted_controller(controller, &cfd);
854 if (!tmpc)
855 return false;
856
857 /* Make sure we pass a relative path to *at() family of functions.
858 * . + /cg + \0
859 */
860 len = strlen(cg) + 2;
861 dirnam = alloca(len);
862 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
863
864 fd = openat(cfd, dirnam, O_DIRECTORY);
865 if (fd < 0)
866 return false;
867
868 bret = recursive_rmdir(dirnam, fd, cfd);
869 close(fd);
870 return bret;
871 }
872
873 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
874 {
875 int cfd;
876 size_t len;
877 char *pathname, *tmpc;
878
879 tmpc = find_mounted_controller(controller, &cfd);
880 if (!tmpc)
881 return false;
882
883 /* Make sure we pass a relative path to *at() family of functions.
884 * . + /file + \0
885 */
886 len = strlen(file) + 2;
887 pathname = alloca(len);
888 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
889 if (fchmodat(cfd, pathname, mode, 0) < 0)
890 return false;
891 return true;
892 }
893
894 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
895 {
896 size_t len;
897 char *fname;
898
899 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
900 fname = alloca(len);
901 snprintf(fname, len, "%s/tasks", dirname);
902 if (fchownat(fd, fname, uid, gid, 0) != 0)
903 return -errno;
904 snprintf(fname, len, "%s/cgroup.procs", dirname);
905 if (fchownat(fd, fname, uid, gid, 0) != 0)
906 return -errno;
907 return 0;
908 }
909
910 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
911 {
912 int cfd;
913 size_t len;
914 char *pathname, *tmpc;
915
916 tmpc = find_mounted_controller(controller, &cfd);
917 if (!tmpc)
918 return -EINVAL;
919
920 /* Make sure we pass a relative path to *at() family of functions.
921 * . + /file + \0
922 */
923 len = strlen(file) + 2;
924 pathname = alloca(len);
925 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
926 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
927 return -errno;
928
929 if (is_dir(pathname, cfd))
930 // like cgmanager did, we want to chown the tasks file as well
931 return chown_tasks_files(pathname, uid, gid, cfd);
932
933 return 0;
934 }
935
936 FILE *open_pids_file(const char *controller, const char *cgroup)
937 {
938 int fd, cfd;
939 size_t len;
940 char *pathname, *tmpc;
941
942 tmpc = find_mounted_controller(controller, &cfd);
943 if (!tmpc)
944 return NULL;
945
946 /* Make sure we pass a relative path to *at() family of functions.
947 * . + /cgroup + / "cgroup.procs" + \0
948 */
949 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
950 pathname = alloca(len);
951 snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
952
953 fd = openat(cfd, pathname, O_WRONLY);
954 if (fd < 0)
955 return NULL;
956
957 return fdopen(fd, "w");
958 }
959
960 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
961 void ***list, size_t typesize,
962 void* (*iterator)(const char*, const char*, const char*))
963 {
964 int cfd, fd, ret;
965 size_t len;
966 char *cg, *tmpc;
967 char pathname[MAXPATHLEN];
968 size_t sz = 0, asz = 0;
969 struct dirent *dirent;
970 DIR *dir;
971
972 tmpc = find_mounted_controller(controller, &cfd);
973 *list = NULL;
974 if (!tmpc)
975 return false;
976
977 /* Make sure we pass a relative path to *at() family of functions. */
978 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
979 cg = alloca(len);
980 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
981 if (ret < 0 || (size_t)ret >= len) {
982 lxcfs_error("Pathname too long under %s\n", cgroup);
983 return false;
984 }
985
986 fd = openat(cfd, cg, O_DIRECTORY);
987 if (fd < 0)
988 return false;
989
990 dir = fdopendir(fd);
991 if (!dir) {
992 close(fd); /* fdopendir() failed, so it did not consume fd */
return false;
}
993
994 while ((dirent = readdir(dir))) {
995 struct stat mystat;
996
997 if (!strcmp(dirent->d_name, ".") ||
998 !strcmp(dirent->d_name, ".."))
999 continue;
1000
1001 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1002 if (ret < 0 || ret >= MAXPATHLEN) {
1003 lxcfs_error("Pathname too long under %s\n", cg);
1004 continue;
1005 }
1006
1007 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1008 if (ret) {
1009 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1010 continue;
1011 }
1012 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1013 (directories && !S_ISDIR(mystat.st_mode)))
1014 continue;
1015
1016 if (sz+2 >= asz) {
1017 void **tmp;
1018 asz += BATCH_SIZE;
1019 do {
1020 tmp = realloc(*list, asz * typesize);
1021 } while (!tmp);
1022 *list = tmp;
1023 }
1024 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1025 (*list)[sz+1] = NULL;
1026 sz++;
1027 }
1028 if (closedir(dir) < 0) {
1029 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1030 return false;
1031 }
1032 return true;
1033 }
1034
1035 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1036 {
1037 char *dup;
1038 do {
1039 dup = strdup(dir_entry);
1040 } while (!dup);
1041 return dup;
1042 }
1043
1044 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1045 {
1046 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1047 }
1048
1049 void free_key(struct cgfs_files *k)
1050 {
1051 if (!k)
1052 return;
1053 free(k->name);
1054 free(k);
1055 }
1056
1057 void free_keys(struct cgfs_files **keys)
1058 {
1059 int i;
1060
1061 if (!keys)
1062 return;
1063 for (i = 0; keys[i]; i++) {
1064 free_key(keys[i]);
1065 }
1066 free(keys);
1067 }
1068
1069 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
1070 {
1071 int ret, fd, cfd;
1072 size_t len;
1073 char *fnam, *tmpc;
1074
1075 tmpc = find_mounted_controller(controller, &cfd);
1076 if (!tmpc)
1077 return false;
1078
1079 /* Make sure we pass a relative path to *at() family of functions.
1080 * . + /cgroup + / + file + \0
1081 */
1082 len = strlen(cgroup) + strlen(file) + 3;
1083 fnam = alloca(len);
1084 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1085 if (ret < 0 || (size_t)ret >= len)
1086 return false;
1087
1088 fd = openat(cfd, fnam, O_RDONLY);
1089 if (fd < 0)
1090 return false;
1091
1092 *value = slurp_file(fnam, fd);
1093 return *value != NULL;
1094 }
1095
1096 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1097 {
1098 int ret, cfd;
1099 size_t len;
1100 char *fnam, *tmpc;
1101 struct stat sb;
1102 struct cgfs_files *newkey;
1103
1104 tmpc = find_mounted_controller(controller, &cfd);
1105 if (!tmpc)
1106 return NULL;
1107
1108 if (file && *file == '/')
1109 file++;
1110
1111 if (file && strchr(file, '/'))
1112 return NULL;
1113
1114 /* Make sure we pass a relative path to *at() family of functions.
1115 * . + /cgroup + / + file + \0
1116 */
1117 len = strlen(cgroup) + 3;
1118 if (file)
1119 len += strlen(file) + 1;
1120 fnam = alloca(len);
1121 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1122 file ? "/" : "", file ? file : "");
1123
1124 ret = fstatat(cfd, fnam, &sb, 0);
1125 if (ret < 0)
1126 return NULL;
1127
1128 do {
1129 newkey = malloc(sizeof(struct cgfs_files));
1130 } while (!newkey);
1131 if (file)
1132 newkey->name = must_copy_string(file);
1133 else if (strrchr(cgroup, '/'))
1134 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1135 else
1136 newkey->name = must_copy_string(cgroup);
1137 newkey->uid = sb.st_uid;
1138 newkey->gid = sb.st_gid;
1139 newkey->mode = sb.st_mode;
1140
1141 return newkey;
1142 }
1143
1144 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1145 {
1146 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1147 if (!entry) {
1148 lxcfs_error("Error getting files under %s:%s\n", controller,
1149 cgroup);
1150 }
1151 return entry;
1152 }
1153
1154 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1155 {
1156 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1157 }
1158
1159 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1160 {
1161 int cfd;
1162 size_t len;
1163 char *fnam, *tmpc;
1164 int ret;
1165 struct stat sb;
1166
1167 tmpc = find_mounted_controller(controller, &cfd);
1168 if (!tmpc)
1169 return false;
1170
1171 /* Make sure we pass a relative path to *at() family of functions.
1172 * . + /cgroup + / + f + \0
1173 */
1174 len = strlen(cgroup) + strlen(f) + 3;
1175 fnam = alloca(len);
1176 ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
1177 if (ret < 0 || (size_t)ret >= len)
1178 return false;
1179
1180 ret = fstatat(cfd, fnam, &sb, 0);
1181 if (ret < 0 || !S_ISDIR(sb.st_mode))
1182 return false;
1183
1184 return true;
1185 }
1186
1187 #define SEND_CREDS_OK 0
1188 #define SEND_CREDS_NOTSK 1
1189 #define SEND_CREDS_FAIL 2
1190 static bool recv_creds(int sock, struct ucred *cred, char *v);
1191 static int wait_for_pid(pid_t pid);
1192 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1193 static int send_creds_clone_wrapper(void *arg);
1194
1195 /*
1196 * clone a task which switches to @task's namespace and writes '1'
1197 * over a unix sock so we can read the task's reaper's pid in our
1198 * namespace.
1199 *
1200 * Note: glibc's fork() does not respect pidns, which can lead to failed
1201 * assertions inside glibc (and thus failed forks) if the child's pid in
1202 * the pidns and the parent pid outside are identical. Using clone prevents
1203 * this issue.
1204 */
1205 static void write_task_init_pid_exit(int sock, pid_t target)
1206 {
1207 char fnam[100];
1208 pid_t pid;
1209 int fd, ret;
1210 size_t stack_size = sysconf(_SC_PAGESIZE);
1211 void *stack = alloca(stack_size);
1212
1213 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1214 if (ret < 0 || ret >= sizeof(fnam))
1215 _exit(1);
1216
1217 fd = open(fnam, O_RDONLY);
1218 if (fd < 0) {
1219 perror("write_task_init_pid_exit open of ns/pid");
1220 _exit(1);
1221 }
1222 if (setns(fd, 0)) {
1223 perror("write_task_init_pid_exit setns 1");
1224 close(fd);
1225 _exit(1);
1226 }
1227 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1228 if (pid < 0)
1229 _exit(1);
1230 if (pid != 0) {
1231 if (!wait_for_pid(pid))
1232 _exit(1);
1233 _exit(0);
1234 }
1235 }
1236
1237 static int send_creds_clone_wrapper(void *arg) {
1238 struct ucred cred;
1239 char v;
1240 int sock = *(int *)arg;
1241
1242 /* we are the child */
1243 cred.uid = 0;
1244 cred.gid = 0;
1245 cred.pid = 1;
1246 v = '1';
1247 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1248 return 1;
1249 return 0;
1250 }
1251
1252 static pid_t get_init_pid_for_task(pid_t task)
1253 {
1254 int sock[2];
1255 pid_t pid;
1256 pid_t ret = -1;
1257 char v = '0';
1258 struct ucred cred;
1259
1260 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1261 perror("socketpair");
1262 return -1;
1263 }
1264
1265 pid = fork();
1266 if (pid < 0)
1267 goto out;
1268 if (!pid) {
1269 close(sock[1]);
1270 write_task_init_pid_exit(sock[0], task);
1271 _exit(0);
1272 }
1273
1274 if (!recv_creds(sock[1], &cred, &v))
1275 goto out;
1276 ret = cred.pid;
1277
1278 out:
1279 close(sock[0]);
1280 close(sock[1]);
1281 if (pid > 0)
1282 wait_for_pid(pid);
1283 return ret;
1284 }
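/*
 * The trick above: the child setns()es into the target's pid namespace and
 * sends a ucred with pid = 1 over the socketpair. Because the message
 * carries SCM_CREDENTIALS, the kernel translates that pid into the
 * receiver's pid namespace, so the cred.pid read by recv_creds() is the
 * target's init (reaper) as we see it.
 */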
1285
1286 static pid_t lookup_initpid_in_store(pid_t qpid)
1287 {
1288 pid_t answer = 0;
1289 struct stat sb;
1290 struct pidns_init_store *e;
1291 char fnam[100];
1292
1293 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1294 store_lock();
1295 if (stat(fnam, &sb) < 0)
1296 goto out;
1297 e = lookup_verify_initpid(&sb);
1298 if (e) {
1299 answer = e->initpid;
1300 goto out;
1301 }
1302 answer = get_init_pid_for_task(qpid);
1303 if (answer > 0)
1304 save_initpid(&sb, answer);
1305
1306 out:
1307 /* we prune at end in case we are returning
1308 * the value we were about to return */
1309 prune_initpid_store();
1310 store_unlock();
1311 return answer;
1312 }
1313
1314 static int wait_for_pid(pid_t pid)
1315 {
1316 int status, ret;
1317
1318 if (pid <= 0)
1319 return -1;
1320
1321 again:
1322 ret = waitpid(pid, &status, 0);
1323 if (ret == -1) {
1324 if (errno == EINTR)
1325 goto again;
1326 return -1;
1327 }
1328 if (ret != pid)
1329 goto again;
1330 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1331 return -1;
1332 return 0;
1333 }
1334
1335
1336 /*
1337 * append pid to *src.
1338 * src: a pointer to a char* in which to append the pid.
1339 * sz: the number of characters printed so far, minus trailing \0.
1340 * asz: the allocated size so far
1341 * pid: the pid to append
1342 */
1343 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1344 {
1345 char tmp[30];
1346
1347 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1348
1349 if (!*src || tmplen + *sz + 1 >= *asz) {
1350 char *tmp;
1351 do {
1352 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1353 } while (!tmp);
1354 *src = tmp;
1355 *asz += BUF_RESERVE_SIZE;
1356 }
1357 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1358 *sz += tmplen;
1359 }
1360
1361 /*
1362 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
1363 * valid in the caller's namespace, return the id mapped into
1364 * pid's namespace.
1365 * Returns the mapped id, or -1 on error.
1366 */
1367 unsigned int
1368 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1369 {
1370 unsigned int nsuid, // base id for a range in the idfile's namespace
1371 hostuid, // base id for a range in the caller's namespace
1372 count; // number of ids in this range
1373 char line[400];
1374 int ret;
1375
1376 fseek(idfile, 0L, SEEK_SET);
1377 while (fgets(line, 400, idfile)) {
1378 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1379 if (ret != 3)
1380 continue;
1381 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1382 /*
1383 * uids wrapped around - unexpected as this is a procfile,
1384 * so just bail.
1385 */
1386 lxcfs_error("uid wraparound at entry %u %u %u in %s\n",
1387 nsuid, hostuid, count, line);
1388 return -1;
1389 }
1390 if (hostuid <= in_id && hostuid+count > in_id) {
1391 /*
1392 * now since hostuid <= in_id < hostuid+count, and
1393 * hostuid+count and nsuid+count do not wrap around,
1394 * we know that nsuid+(in_id-hostuid), which must be
1395 * less than nsuid+count, does not wrap around either.
1396 */
1397 return (in_id - hostuid) + nsuid;
1398 }
1399 }
1400
1401 // no answer found
1402 return -1;
1403 }
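/*
 * Worked example with an illustrative map: given a uid_map line
 * "0 100000 65536", convert_id_to_ns(f, 100005) returns
 * (100005 - 100000) + 0 = 5, and convert_id_to_ns(f, 100000) returns 0,
 * which is what is_privileged_over() below checks for to decide whether the
 * caller is root in its own user namespace.
 */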
1404
1405 /*
1406 * for is_privileged_over,
1407 * specify whether we require the calling uid to be root in his
1408 * namespace
1409 */
1410 #define NS_ROOT_REQD true
1411 #define NS_ROOT_OPT false
1412
1413 #define PROCLEN 100
1414
1415 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1416 {
1417 char fpath[PROCLEN];
1418 int ret;
1419 bool answer = false;
1420 uid_t nsuid;
1421
1422 if (victim == -1 || uid == -1)
1423 return false;
1424
1425 /*
1426 * If the request does not require root in the namespace, then having
1427 * the same uid suffices (i.e. uid 1000 has write access to files
1428 * owned by uid 1000).
1429 */
1430 if (!req_ns_root && uid == victim)
1431 return true;
1432
1433 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1434 if (ret < 0 || ret >= PROCLEN)
1435 return false;
1436 FILE *f = fopen(fpath, "r");
1437 if (!f)
1438 return false;
1439
1440 /* if caller's not root in his namespace, reject */
1441 nsuid = convert_id_to_ns(f, uid);
1442 if (nsuid)
1443 goto out;
1444
1445 /*
1446 * If victim is not mapped into caller's ns, reject.
1447 * XXX I'm not sure this check is needed given that fuse
1448 * will be sending requests where the vfs has converted
1449 */
1450 nsuid = convert_id_to_ns(f, victim);
1451 if (nsuid == -1)
1452 goto out;
1453
1454 answer = true;
1455
1456 out:
1457 fclose(f);
1458 return answer;
1459 }
1460
1461 static bool perms_include(int fmode, mode_t req_mode)
1462 {
1463 mode_t r;
1464
1465 switch (req_mode & O_ACCMODE) {
1466 case O_RDONLY:
1467 r = S_IROTH;
1468 break;
1469 case O_WRONLY:
1470 r = S_IWOTH;
1471 break;
1472 case O_RDWR:
1473 r = S_IROTH | S_IWOTH;
1474 break;
1475 default:
1476 return false;
1477 }
1478 return ((fmode & r) == r);
1479 }
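/*
 * fc_may_access() below passes the relevant permission triad of a key's mode
 * shifted into the "other" position. For a key with mode 0644 owned by the
 * caller, the owner check is perms_include(0644 >> 6, O_RDONLY), i.e.
 * (06 & S_IROTH) == S_IROTH, which succeeds; an O_WRONLY request against the
 * group or other bits (04) fails because S_IWOTH is not set.
 */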
1480
1481
1482 /*
1483 * taskcg is a/b/c
1484 * querycg is /a/b/c/d/e
1485 * we return 'd'
1486 */
1487 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1488 {
1489 char *start, *end;
1490
1491 if (strlen(taskcg) <= strlen(querycg)) {
1492 lxcfs_error("%s\n", "I was fed bad input.");
1493 return NULL;
1494 }
1495
1496 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1497 start = strdup(taskcg + 1);
1498 else
1499 start = strdup(taskcg + strlen(querycg) + 1);
1500 if (!start)
1501 return NULL;
1502 end = strchr(start, '/');
1503 if (end)
1504 *end = '\0';
1505 return start;
1506 }
1507
1508 static void stripnewline(char *x)
1509 {
1510 size_t l = strlen(x);
1511 if (l && x[l-1] == '\n')
1512 x[l-1] = '\0';
1513 }
1514
1515 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1516 {
1517 int cfd;
1518 char fnam[PROCLEN];
1519 FILE *f;
1520 char *answer = NULL;
1521 char *line = NULL;
1522 size_t len = 0;
1523 int ret;
1524 const char *h = find_mounted_controller(contrl, &cfd);
1525 if (!h)
1526 return NULL;
1527
1528 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1529 if (ret < 0 || ret >= PROCLEN)
1530 return NULL;
1531 if (!(f = fopen(fnam, "r")))
1532 return NULL;
1533
1534 while (getline(&line, &len, f) != -1) {
1535 char *c1, *c2;
1536 if (!line[0])
1537 continue;
1538 c1 = strchr(line, ':');
1539 if (!c1)
1540 goto out;
1541 c1++;
1542 c2 = strchr(c1, ':');
1543 if (!c2)
1544 goto out;
1545 *c2 = '\0';
1546 if (strcmp(c1, h) != 0)
1547 continue;
1548 c2++;
1549 stripnewline(c2);
1550 do {
1551 answer = strdup(c2);
1552 } while (!answer);
1553 break;
1554 }
1555
1556 out:
1557 fclose(f);
1558 free(line);
1559 return answer;
1560 }
1561
1562 /*
1563 * check whether a fuse context may access a cgroup dir or file
1564 *
1565 * If file is not null, it is a cgroup file to check under cg.
1566 * If file is null, then we are checking perms on cg itself.
1567 *
1568 * For files we can check the mode of the list_keys result.
1569 * For cgroups, we must make assumptions based on the files under the
1570 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1571 * yet.
1572 */
1573 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1574 {
1575 struct cgfs_files *k = NULL;
1576 bool ret = false;
1577
1578 k = cgfs_get_key(contrl, cg, file);
1579 if (!k)
1580 return false;
1581
1582 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1583 if (perms_include(k->mode >> 6, mode)) {
1584 ret = true;
1585 goto out;
1586 }
1587 }
1588 if (fc->gid == k->gid) {
1589 if (perms_include(k->mode >> 3, mode)) {
1590 ret = true;
1591 goto out;
1592 }
1593 }
1594 ret = perms_include(k->mode, mode);
1595
1596 out:
1597 free_key(k);
1598 return ret;
1599 }
1600
1601 #define INITSCOPE "/init.scope"
1602 static void prune_init_slice(char *cg)
1603 {
1604 char *point;
1605 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1606
1607 if (cg_len < initscope_len)
1608 return;
1609
1610 point = cg + cg_len - initscope_len;
1611 if (strcmp(point, INITSCOPE) == 0) {
1612 if (point == cg)
1613 *(point+1) = '\0';
1614 else
1615 *point = '\0';
1616 }
1617 }
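/*
 * Examples: prune_init_slice() turns "/user.slice/init.scope" into
 * "/user.slice" and "/init.scope" into "/", so a task sitting in systemd's
 * init.scope is treated as being in the enclosing cgroup.
 */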
1618
1619 /*
1620 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1621 * If pid is in /a, he may act on /a/b, but not on /b.
1622 * if the answer is false and nextcg is not NULL, then *nextcg will point
1623 * to a string containing the next cgroup directory under cg, which must be
1624 * freed by the caller.
1625 */
1626 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1627 {
1628 bool answer = false;
1629 char *c2 = get_pid_cgroup(pid, contrl);
1630 char *linecmp;
1631
1632 if (!c2)
1633 return false;
1634 prune_init_slice(c2);
1635
1636 /*
1637 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1638 * they pass in a cgroup without leading '/'
1639 *
1640 * The original line here was:
1641 * linecmp = *cg == '/' ? c2 : c2+1;
1642 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1643 * Serge, do you know?
1644 */
1645 if (*cg == '/' || !strncmp(cg, "./", 2))
1646 linecmp = c2;
1647 else
1648 linecmp = c2 + 1;
1649 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1650 if (nextcg) {
1651 *nextcg = get_next_cgroup_dir(linecmp, cg);
1652 }
1653 goto out;
1654 }
1655 answer = true;
1656
1657 out:
1658 free(c2);
1659 return answer;
1660 }
1661
1662 /*
1663 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1664 */
1665 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1666 {
1667 bool answer = false;
1668 char *c2, *task_cg;
1669 size_t target_len, task_len;
1670
1671 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1672 return true;
1673
1674 c2 = get_pid_cgroup(pid, contrl);
1675 if (!c2)
1676 return false;
1677 prune_init_slice(c2);
1678
1679 task_cg = c2 + 1;
1680 target_len = strlen(cg);
1681 task_len = strlen(task_cg);
1682 if (task_len == 0) {
1683 /* Task is in the root cg, it can see everything. This case is
1684 * not handled by the strcmps below, since they test for the
1685 * last /, but that is the first / that we've chopped off
1686 * above.
1687 */
1688 answer = true;
1689 goto out;
1690 }
1691 if (strcmp(cg, task_cg) == 0) {
1692 answer = true;
1693 goto out;
1694 }
1695 if (target_len < task_len) {
1696 /* looking up a parent dir */
1697 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1698 answer = true;
1699 goto out;
1700 }
1701 if (target_len > task_len) {
1702 /* looking up a child dir */
1703 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1704 answer = true;
1705 goto out;
1706 }
1707
1708 out:
1709 free(c2);
1710 return answer;
1711 }
1712
1713 /*
1714 * given /cgroup/freezer/a/b, return "freezer".
1715 * the returned char* should NOT be freed.
1716 */
1717 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1718 {
1719 const char *p1;
1720 char *contr, *slash;
1721
1722 if (strlen(path) < 9) {
1723 errno = EACCES;
1724 return NULL;
1725 }
1726 if (*(path + 7) != '/') {
1727 errno = EINVAL;
1728 return NULL;
1729 }
1730 p1 = path + 8;
1731 contr = strdupa(p1);
1732 if (!contr) {
1733 errno = ENOMEM;
1734 return NULL;
1735 }
1736 slash = strstr(contr, "/");
1737 if (slash)
1738 *slash = '\0';
1739
1740 int i;
1741 for (i = 0; i < num_hierarchies; i++) {
1742 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1743 return hierarchies[i];
1744 }
1745 errno = ENOENT;
1746 return NULL;
1747 }
1748
1749 /*
1750 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1751 * Note that the returned value may include files (keynames) etc
1752 */
1753 static const char *find_cgroup_in_path(const char *path)
1754 {
1755 const char *p1;
1756
1757 if (strlen(path) < 9) {
1758 errno = EACCES;
1759 return NULL;
1760 }
1761 p1 = strstr(path + 8, "/");
1762 if (!p1) {
1763 errno = EINVAL;
1764 return NULL;
1765 }
1766 errno = 0;
1767 return p1 + 1;
1768 }
1769
1770 /*
1771 * split the last path element from the path in @cg.
1772 * @dir is newly allocated and should be freed, @last not
1773 */
1774 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1775 {
1776 char *p;
1777
1778 do {
1779 *dir = strdup(cg);
1780 } while (!*dir);
1781 *last = strrchr(cg, '/');
1782 if (!*last) {
1783 *last = NULL;
1784 return;
1785 }
1786 p = strrchr(*dir, '/');
1787 *p = '\0';
1788 }
1789
1790 /*
1791 * FUSE ops for /cgroup
1792 */
1793
1794 int cg_getattr(const char *path, struct stat *sb)
1795 {
1796 struct timespec now;
1797 struct fuse_context *fc = fuse_get_context();
1798 char * cgdir = NULL;
1799 char *last = NULL, *path1, *path2;
1800 struct cgfs_files *k = NULL;
1801 const char *cgroup;
1802 const char *controller = NULL;
1803 int ret = -ENOENT;
1804
1805
1806 if (!fc)
1807 return -EIO;
1808
1809 memset(sb, 0, sizeof(struct stat));
1810
1811 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1812 return -EINVAL;
1813
1814 sb->st_uid = sb->st_gid = 0;
1815 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1816 sb->st_size = 0;
1817
1818 if (strcmp(path, "/cgroup") == 0) {
1819 sb->st_mode = S_IFDIR | 00755;
1820 sb->st_nlink = 2;
1821 return 0;
1822 }
1823
1824 controller = pick_controller_from_path(fc, path);
1825 if (!controller)
1826 return -errno;
1827 cgroup = find_cgroup_in_path(path);
1828 if (!cgroup) {
1829 /* this is just /cgroup/controller, return it as a dir */
1830 sb->st_mode = S_IFDIR | 00755;
1831 sb->st_nlink = 2;
1832 return 0;
1833 }
1834
1835 get_cgdir_and_path(cgroup, &cgdir, &last);
1836
1837 if (!last) {
1838 path1 = "/";
1839 path2 = cgdir;
1840 } else {
1841 path1 = cgdir;
1842 path2 = last;
1843 }
1844
1845 pid_t initpid = lookup_initpid_in_store(fc->pid);
1846 if (initpid <= 0)
1847 initpid = fc->pid;
1848 /* check that the last path element is either a child cgroup of cgdir, or listed in its keys.
1849 * Then check that caller's cgroup is under path if last is a child
1850 * cgroup, or cgdir if last is a file */
1851
1852 if (is_child_cgroup(controller, path1, path2)) {
1853 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1854 ret = -ENOENT;
1855 goto out;
1856 }
1857 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1858 /* this is just /cgroup/controller, return it as a dir */
1859 sb->st_mode = S_IFDIR | 00555;
1860 sb->st_nlink = 2;
1861 ret = 0;
1862 goto out;
1863 }
1864 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1865 ret = -EACCES;
1866 goto out;
1867 }
1868
1869 // get uid, gid, from '/tasks' file and make up a mode
1870 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1871 sb->st_mode = S_IFDIR | 00755;
1872 k = cgfs_get_key(controller, cgroup, NULL);
1873 if (!k) {
1874 sb->st_uid = sb->st_gid = 0;
1875 } else {
1876 sb->st_uid = k->uid;
1877 sb->st_gid = k->gid;
1878 }
1879 free_key(k);
1880 sb->st_nlink = 2;
1881 ret = 0;
1882 goto out;
1883 }
1884
1885 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1886 sb->st_mode = S_IFREG | k->mode;
1887 sb->st_nlink = 1;
1888 sb->st_uid = k->uid;
1889 sb->st_gid = k->gid;
1890 sb->st_size = 0;
1891 free_key(k);
1892 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1893 ret = -ENOENT;
1894 goto out;
1895 }
1896 ret = 0;
1897 }
1898
1899 out:
1900 free(cgdir);
1901 return ret;
1902 }
1903
1904 int cg_opendir(const char *path, struct fuse_file_info *fi)
1905 {
1906 struct fuse_context *fc = fuse_get_context();
1907 const char *cgroup;
1908 struct file_info *dir_info;
1909 char *controller = NULL;
1910
1911 if (!fc)
1912 return -EIO;
1913
1914 if (strcmp(path, "/cgroup") == 0) {
1915 cgroup = NULL;
1916 controller = NULL;
1917 } else {
1918 // return list of keys for the controller, and list of child cgroups
1919 controller = pick_controller_from_path(fc, path);
1920 if (!controller)
1921 return -errno;
1922
1923 cgroup = find_cgroup_in_path(path);
1924 if (!cgroup) {
1925 /* this is just /cgroup/controller, return its contents */
1926 cgroup = "/";
1927 }
1928 }
1929
1930 pid_t initpid = lookup_initpid_in_store(fc->pid);
1931 if (initpid <= 0)
1932 initpid = fc->pid;
1933 if (cgroup) {
1934 if (!caller_may_see_dir(initpid, controller, cgroup))
1935 return -ENOENT;
1936 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1937 return -EACCES;
1938 }
1939
1940 /* we'll free this at cg_releasedir */
1941 dir_info = malloc(sizeof(*dir_info));
1942 if (!dir_info)
1943 return -ENOMEM;
1944 dir_info->controller = must_copy_string(controller);
1945 dir_info->cgroup = must_copy_string(cgroup);
1946 dir_info->type = LXC_TYPE_CGDIR;
1947 dir_info->buf = NULL;
1948 dir_info->file = NULL;
1949 dir_info->buflen = 0;
1950
1951 fi->fh = (unsigned long)dir_info;
1952 return 0;
1953 }
1954
1955 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1956 struct fuse_file_info *fi)
1957 {
1958 struct file_info *d = (struct file_info *)fi->fh;
1959 struct cgfs_files **list = NULL;
1960 int i, ret;
1961 char *nextcg = NULL;
1962 struct fuse_context *fc = fuse_get_context();
1963 char **clist = NULL;
1964
1965 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1966 return -EIO;
1967
1968 if (d->type != LXC_TYPE_CGDIR) {
1969 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1970 return -EIO;
1971 }
1972 if (!d->cgroup && !d->controller) {
1973 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1974 int i;
1975
1976 for (i = 0; i < num_hierarchies; i++) {
1977 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1978 return -EIO;
1979 }
1980 }
1981 return 0;
1982 }
1983
1984 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1985 // not a valid cgroup
1986 ret = -EINVAL;
1987 goto out;
1988 }
1989
1990 pid_t initpid = lookup_initpid_in_store(fc->pid);
1991 if (initpid <= 0)
1992 initpid = fc->pid;
1993 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1994 if (nextcg) {
1995 ret = filler(buf, nextcg, NULL, 0);
1996 free(nextcg);
1997 if (ret != 0) {
1998 ret = -EIO;
1999 goto out;
2000 }
2001 }
2002 ret = 0;
2003 goto out;
2004 }
2005
2006 for (i = 0; list[i]; i++) {
2007 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2008 ret = -EIO;
2009 goto out;
2010 }
2011 }
2012
2013 // now get the list of child cgroups
2014
2015 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2016 ret = 0;
2017 goto out;
2018 }
2019 if (clist) {
2020 for (i = 0; clist[i]; i++) {
2021 if (filler(buf, clist[i], NULL, 0) != 0) {
2022 ret = -EIO;
2023 goto out;
2024 }
2025 }
2026 }
2027 ret = 0;
2028
2029 out:
2030 free_keys(list);
2031 if (clist) {
2032 for (i = 0; clist[i]; i++)
2033 free(clist[i]);
2034 free(clist);
2035 }
2036 return ret;
2037 }
2038
2039 static void do_release_file_info(struct fuse_file_info *fi)
2040 {
2041 struct file_info *f = (struct file_info *)fi->fh;
2042
2043 if (!f)
2044 return;
2045
2046 fi->fh = 0;
2047
2048 free(f->controller);
2049 f->controller = NULL;
2050 free(f->cgroup);
2051 f->cgroup = NULL;
2052 free(f->file);
2053 f->file = NULL;
2054 free(f->buf);
2055 f->buf = NULL;
2056 free(f);
2057 f = NULL;
2058 }
2059
2060 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2061 {
2062 do_release_file_info(fi);
2063 return 0;
2064 }
2065
2066 int cg_open(const char *path, struct fuse_file_info *fi)
2067 {
2068 const char *cgroup;
2069 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2070 struct cgfs_files *k = NULL;
2071 struct file_info *file_info;
2072 struct fuse_context *fc = fuse_get_context();
2073 int ret;
2074
2075 if (!fc)
2076 return -EIO;
2077
2078 controller = pick_controller_from_path(fc, path);
2079 if (!controller)
2080 return -errno;
2081 cgroup = find_cgroup_in_path(path);
2082 if (!cgroup)
2083 return -errno;
2084
2085 get_cgdir_and_path(cgroup, &cgdir, &last);
2086 if (!last) {
2087 path1 = "/";
2088 path2 = cgdir;
2089 } else {
2090 path1 = cgdir;
2091 path2 = last;
2092 }
2093
2094 k = cgfs_get_key(controller, path1, path2);
2095 if (!k) {
2096 ret = -EINVAL;
2097 goto out;
2098 }
2099 free_key(k);
2100
2101 pid_t initpid = lookup_initpid_in_store(fc->pid);
2102 if (initpid <= 0)
2103 initpid = fc->pid;
2104 if (!caller_may_see_dir(initpid, controller, path1)) {
2105 ret = -ENOENT;
2106 goto out;
2107 }
2108 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2109 ret = -EACCES;
2110 goto out;
2111 }
2112
2113 /* we'll free this at cg_release */
2114 file_info = malloc(sizeof(*file_info));
2115 if (!file_info) {
2116 ret = -ENOMEM;
2117 goto out;
2118 }
2119 file_info->controller = must_copy_string(controller);
2120 file_info->cgroup = must_copy_string(path1);
2121 file_info->file = must_copy_string(path2);
2122 file_info->type = LXC_TYPE_CGFILE;
2123 file_info->buf = NULL;
2124 file_info->buflen = 0;
2125
2126 fi->fh = (unsigned long)file_info;
2127 ret = 0;
2128
2129 out:
2130 free(cgdir);
2131 return ret;
2132 }
2133
2134 int cg_access(const char *path, int mode)
2135 {
2136 int ret;
2137 const char *cgroup;
2138 char *path1, *path2, *controller;
2139 char *last = NULL, *cgdir = NULL;
2140 struct cgfs_files *k = NULL;
2141 struct fuse_context *fc = fuse_get_context();
2142
2143 if (strcmp(path, "/cgroup") == 0)
2144 return 0;
2145
2146 if (!fc)
2147 return -EIO;
2148
2149 controller = pick_controller_from_path(fc, path);
2150 if (!controller)
2151 return -errno;
2152 cgroup = find_cgroup_in_path(path);
2153 if (!cgroup) {
2154 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2155 if ((mode & W_OK) == 0)
2156 return 0;
2157 return -EACCES;
2158 }
2159
2160 get_cgdir_and_path(cgroup, &cgdir, &last);
2161 if (!last) {
2162 path1 = "/";
2163 path2 = cgdir;
2164 } else {
2165 path1 = cgdir;
2166 path2 = last;
2167 }
2168
2169 k = cgfs_get_key(controller, path1, path2);
2170 if (!k) {
2171 if ((mode & W_OK) == 0)
2172 ret = 0;
2173 else
2174 ret = -EACCES;
2175 goto out;
2176 }
2177 free_key(k);
2178
2179 pid_t initpid = lookup_initpid_in_store(fc->pid);
2180 if (initpid <= 0)
2181 initpid = fc->pid;
2182 if (!caller_may_see_dir(initpid, controller, path1)) {
2183 ret = -ENOENT;
2184 goto out;
2185 }
2186 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2187 ret = -EACCES;
2188 goto out;
2189 }
2190
2191 ret = 0;
2192
2193 out:
2194 free(cgdir);
2195 return ret;
2196 }
2197
2198 int cg_release(const char *path, struct fuse_file_info *fi)
2199 {
2200 do_release_file_info(fi);
2201 return 0;
2202 }
2203
2204 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2205
2206 static bool wait_for_sock(int sock, int timeout)
2207 {
2208 struct epoll_event ev;
2209 int epfd, ret, now, starttime, deltatime, saved_errno;
2210
2211 if ((starttime = time(NULL)) < 0)
2212 return false;
2213
2214 if ((epfd = epoll_create(1)) < 0) {
2215 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2216 return false;
2217 }
2218
2219 ev.events = POLLIN_SET;
2220 ev.data.fd = sock;
2221 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2222 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2223 close(epfd);
2224 return false;
2225 }
2226
2227 again:
2228 if ((now = time(NULL)) < 0) {
2229 close(epfd);
2230 return false;
2231 }
2232
2233 deltatime = (starttime + timeout) - now;
2234 if (deltatime < 0) { // timeout
2235 errno = 0;
2236 close(epfd);
2237 return false;
2238 }
2239 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2240 if (ret < 0 && errno == EINTR)
2241 goto again;
2242 saved_errno = errno;
2243 close(epfd);
2244
2245 if (ret <= 0) {
2246 errno = saved_errno;
2247 return false;
2248 }
2249 return true;
2250 }
2251
2252 static int msgrecv(int sockfd, void *buf, size_t len)
2253 {
2254 if (!wait_for_sock(sockfd, 2))
2255 return -1;
2256 return recv(sockfd, buf, len, MSG_DONTWAIT);
2257 }
2258
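/*
 * send_creds: send a single byte @v over @sock with the credentials in
 * @cred attached as an SCM_CREDENTIALS control message.  If @pingfirst is
 * true, first wait for a one-byte ping from the peer so both sides stay in
 * sync.
 */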
2259 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2260 {
2261 struct msghdr msg = { 0 };
2262 struct iovec iov;
2263 struct cmsghdr *cmsg;
2264 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2265 char buf[1];
2266 buf[0] = 'p';
2267
2268 if (pingfirst) {
2269 if (msgrecv(sock, buf, 1) != 1) {
2270 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2271 return SEND_CREDS_FAIL;
2272 }
2273 }
2274
2275 msg.msg_control = cmsgbuf;
2276 msg.msg_controllen = sizeof(cmsgbuf);
2277
2278 cmsg = CMSG_FIRSTHDR(&msg);
2279 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2280 cmsg->cmsg_level = SOL_SOCKET;
2281 cmsg->cmsg_type = SCM_CREDENTIALS;
2282 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2283
2284 msg.msg_name = NULL;
2285 msg.msg_namelen = 0;
2286
2287 buf[0] = v;
2288 iov.iov_base = buf;
2289 iov.iov_len = sizeof(buf);
2290 msg.msg_iov = &iov;
2291 msg.msg_iovlen = 1;
2292
2293 if (sendmsg(sock, &msg, 0) < 0) {
2294 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2295 		if (errno == ESRCH) /* no such process (errno 3) */
2296 return SEND_CREDS_NOTSK;
2297 return SEND_CREDS_FAIL;
2298 }
2299
2300 return SEND_CREDS_OK;
2301 }
2302
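/*
 * recv_creds: receive one byte plus the sender's credentials over @sock.
 * SO_PASSCRED is enabled and a sync byte is written first so the peer knows
 * we are ready.  The kernel translates the pid carried in the
 * SCM_CREDENTIALS message into the receiver's pid namespace, which is what
 * makes the pid translation in do_read_pids()/do_write_pids() work.
 */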
2303 static bool recv_creds(int sock, struct ucred *cred, char *v)
2304 {
2305 struct msghdr msg = { 0 };
2306 struct iovec iov;
2307 struct cmsghdr *cmsg;
2308 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2309 char buf[1];
2310 int ret;
2311 int optval = 1;
2312
2313 *v = '1';
2314
2315 cred->pid = -1;
2316 cred->uid = -1;
2317 cred->gid = -1;
2318
2319 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2320 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2321 return false;
2322 }
2323 buf[0] = '1';
2324 if (write(sock, buf, 1) != 1) {
2325 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2326 return false;
2327 }
2328
2329 msg.msg_name = NULL;
2330 msg.msg_namelen = 0;
2331 msg.msg_control = cmsgbuf;
2332 msg.msg_controllen = sizeof(cmsgbuf);
2333
2334 iov.iov_base = buf;
2335 iov.iov_len = sizeof(buf);
2336 msg.msg_iov = &iov;
2337 msg.msg_iovlen = 1;
2338
2339 if (!wait_for_sock(sock, 2)) {
2340 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2341 return false;
2342 }
2343 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2344 if (ret < 0) {
2345 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2346 return false;
2347 }
2348
2349 cmsg = CMSG_FIRSTHDR(&msg);
2350
2351 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2352 cmsg->cmsg_level == SOL_SOCKET &&
2353 cmsg->cmsg_type == SCM_CREDENTIALS) {
2354 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2355 }
2356 *v = buf[0];
2357
2358 return true;
2359 }
2360
2361 struct pid_ns_clone_args {
2362 int *cpipe;
2363 int sock;
2364 pid_t tpid;
2365 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2366 };
2367
2368 /*
2369 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2370 * with clone(). This simply writes '1' as ACK back to the parent
2371 * before calling the actual wrapped function.
2372 */
2373 static int pid_ns_clone_wrapper(void *arg) {
2374 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2375 char b = '1';
2376
2377 close(args->cpipe[0]);
2378 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2379 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2380 close(args->cpipe[1]);
2381 return args->wrapped(args->sock, args->tpid);
2382 }
2383
2384 /*
2385 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2386 * int value back over the socket. This shifts the pid from the
2387 * sender's pidns into tpid's pidns.
2388 */
2389 static int pid_to_ns(int sock, pid_t tpid)
2390 {
2391 char v = '0';
2392 struct ucred cred;
2393
2394 while (recv_creds(sock, &cred, &v)) {
2395 if (v == '1')
2396 return 0;
2397 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2398 return 1;
2399 }
2400 return 0;
2401 }
2402
2403
2404 /*
2405 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2406 * in your old pidns. Only children which you clone will be in the target
2407 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2408 * actually convert pids.
2409 *
2410 * Note: glibc's fork() does not respect pidns, which can lead to failed
2411 * assertions inside glibc (and thus failed forks) if the child's pid in
2412 * the pidns and the parent pid outside are identical. Using clone prevents
2413 * this issue.
2414 */
2415 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2416 {
2417 int newnsfd = -1, ret, cpipe[2];
2418 char fnam[100];
2419 pid_t cpid;
2420 char v;
2421
2422 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2423 if (ret < 0 || ret >= sizeof(fnam))
2424 _exit(1);
2425 newnsfd = open(fnam, O_RDONLY);
2426 if (newnsfd < 0)
2427 _exit(1);
2428 if (setns(newnsfd, 0) < 0)
2429 _exit(1);
2430 close(newnsfd);
2431
2432 if (pipe(cpipe) < 0)
2433 _exit(1);
2434
2435 struct pid_ns_clone_args args = {
2436 .cpipe = cpipe,
2437 .sock = sock,
2438 .tpid = tpid,
2439 .wrapped = &pid_to_ns
2440 };
2441 size_t stack_size = sysconf(_SC_PAGESIZE);
2442 void *stack = alloca(stack_size);
2443
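	/* Pass the top of the allocation to clone(): on the architectures
	 * lxcfs runs on, the child's stack grows downward. */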
2444 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2445 if (cpid < 0)
2446 _exit(1);
2447
2448 // give the child 1 second to be done forking and
2449 // write its ack
2450 if (!wait_for_sock(cpipe[0], 1))
2451 _exit(1);
2452 ret = read(cpipe[0], &v, 1);
2453 if (ret != sizeof(char) || v != '1')
2454 _exit(1);
2455
2456 if (!wait_for_pid(cpid))
2457 _exit(1);
2458 _exit(0);
2459 }
2460
2461 /*
2462 * To read cgroup files with a particular pid, we will setns into the child
2463 * pidns, open a pipe, fork a child - which will be the first to really be in
2464 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2465 */
2466 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2467 {
2468 int sock[2] = {-1, -1};
2469 char *tmpdata = NULL;
2470 int ret;
2471 pid_t qpid, cpid = -1;
2472 bool answer = false;
2473 char v = '0';
2474 struct ucred cred;
2475 size_t sz = 0, asz = 0;
2476
2477 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2478 return false;
2479
2480 /*
2481 * Now we read the pids from returned data one by one, pass
2482 * them into a child in the target namespace, read back the
2483 * translated pids, and put them into our to-return data
2484 */
2485
2486 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2487 perror("socketpair");
2488 free(tmpdata);
2489 return false;
2490 }
2491
2492 cpid = fork();
2493 if (cpid == -1)
2494 goto out;
2495
2496 if (!cpid) // child - exits when done
2497 pid_to_ns_wrapper(sock[1], tpid);
2498
2499 char *ptr = tmpdata;
2500 cred.uid = 0;
2501 cred.gid = 0;
2502 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2503 cred.pid = qpid;
2504 ret = send_creds(sock[0], &cred, v, true);
2505
2506 if (ret == SEND_CREDS_NOTSK)
2507 goto next;
2508 if (ret == SEND_CREDS_FAIL)
2509 goto out;
2510
2511 // read converted results
2512 if (!wait_for_sock(sock[0], 2)) {
2513 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2514 goto out;
2515 }
2516 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2517 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2518 goto out;
2519 }
2520 must_strcat_pid(d, &sz, &asz, qpid);
2521 next:
2522 ptr = strchr(ptr, '\n');
2523 if (!ptr)
2524 break;
2525 ptr++;
2526 }
2527
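	/* All pids sent; tell the child to exit by sending v == '1' with our
	 * own pid (pid_to_ns() returns as soon as it sees '1'). */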
2528 cred.pid = getpid();
2529 v = '1';
2530 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2531 // failed to ask child to exit
2532 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2533 goto out;
2534 }
2535
2536 answer = true;
2537
2538 out:
2539 free(tmpdata);
2540 if (cpid != -1)
2541 wait_for_pid(cpid);
2542 if (sock[0] != -1) {
2543 close(sock[0]);
2544 close(sock[1]);
2545 }
2546 return answer;
2547 }
2548
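/*
 * cg_read: return the contents of the opened cgroup file.  For tasks and
 * cgroup.procs the pids are translated into the reader's pid namespace via
 * do_read_pids(); everything else is passed through from cgfs_get_value().
 */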
2549 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2550 struct fuse_file_info *fi)
2551 {
2552 struct fuse_context *fc = fuse_get_context();
2553 struct file_info *f = (struct file_info *)fi->fh;
2554 struct cgfs_files *k = NULL;
2555 char *data = NULL;
2556 int ret, s;
2557 bool r;
2558
2559 if (f->type != LXC_TYPE_CGFILE) {
2560 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2561 return -EIO;
2562 }
2563
2564 if (offset)
2565 return 0;
2566
2567 if (!fc)
2568 return -EIO;
2569
2570 if (!f->controller)
2571 return -EINVAL;
2572
2573 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2574 return -EINVAL;
2575 }
2576 free_key(k);
2577
2578
2579 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2580 ret = -EACCES;
2581 goto out;
2582 }
2583
2584 if (strcmp(f->file, "tasks") == 0 ||
2585 strcmp(f->file, "/tasks") == 0 ||
2586 strcmp(f->file, "/cgroup.procs") == 0 ||
2587 strcmp(f->file, "cgroup.procs") == 0)
2588 // special case - we have to translate the pids
2589 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2590 else
2591 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2592
2593 if (!r) {
2594 ret = -EINVAL;
2595 goto out;
2596 }
2597
2598 if (!data) {
2599 ret = 0;
2600 goto out;
2601 }
2602 s = strlen(data);
2603 if (s > size)
2604 s = size;
2605 memcpy(buf, data, s);
2606 if (s > 0 && s < size && data[s-1] != '\n')
2607 buf[s++] = '\n';
2608
2609 ret = s;
2610
2611 out:
2612 free(data);
2613 return ret;
2614 }
2615
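/*
 * pid_from_ns: the counterpart to pid_to_ns.  Read pids written by the
 * parent over @sock and send each back as SCM_CREDENTIALS; the kernel
 * translates them from this task's pid namespace into the parent's.
 * A pid of -1 from the parent terminates the loop.
 */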
2616 static int pid_from_ns(int sock, pid_t tpid)
2617 {
2618 pid_t vpid;
2619 struct ucred cred;
2620 char v;
2621 int ret;
2622
2623 cred.uid = 0;
2624 cred.gid = 0;
2625 while (1) {
2626 if (!wait_for_sock(sock, 2)) {
2627 lxcfs_error("%s\n", "Timeout reading from parent.");
2628 return 1;
2629 }
2630 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2631 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2632 return 1;
2633 }
2634 if (vpid == -1) // done
2635 break;
2636 v = '0';
2637 cred.pid = vpid;
2638 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2639 v = '1';
2640 cred.pid = getpid();
2641 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2642 return 1;
2643 }
2644 }
2645 return 0;
2646 }
2647
2648 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2649 {
2650 int newnsfd = -1, ret, cpipe[2];
2651 char fnam[100];
2652 pid_t cpid;
2653 char v;
2654
2655 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2656 if (ret < 0 || ret >= sizeof(fnam))
2657 _exit(1);
2658 newnsfd = open(fnam, O_RDONLY);
2659 if (newnsfd < 0)
2660 _exit(1);
2661 if (setns(newnsfd, 0) < 0)
2662 _exit(1);
2663 close(newnsfd);
2664
2665 if (pipe(cpipe) < 0)
2666 _exit(1);
2667
2668 struct pid_ns_clone_args args = {
2669 .cpipe = cpipe,
2670 .sock = sock,
2671 .tpid = tpid,
2672 .wrapped = &pid_from_ns
2673 };
2674 size_t stack_size = sysconf(_SC_PAGESIZE);
2675 void *stack = alloca(stack_size);
2676
2677 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2678 if (cpid < 0)
2679 _exit(1);
2680
2681 // give the child 1 second to be done forking and
2682 // write its ack
2683 if (!wait_for_sock(cpipe[0], 1))
2684 _exit(1);
2685 ret = read(cpipe[0], &v, 1);
2686 if (ret != sizeof(char) || v != '1')
2687 _exit(1);
2688
2689 if (!wait_for_pid(cpid))
2690 _exit(1);
2691 _exit(0);
2692 }
2693
2694 /*
2695 * Given host @uid, set *answer to the uid to which it maps in
2696 * @pid's user namespace; return false if there is no mapping.
2697 */
2698 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2699 {
2700 FILE *f;
2701 char line[400];
2702
2703 sprintf(line, "/proc/%d/uid_map", pid);
2704 if ((f = fopen(line, "r")) == NULL) {
2705 return false;
2706 }
2707
2708 *answer = convert_id_to_ns(f, uid);
2709 fclose(f);
2710
2711 if (*answer == -1)
2712 return false;
2713 return true;
2714 }
2715
2716 /*
2717 * get_pid_creds: get the real uid and gid of @pid from
2718 * /proc/<pid>/status
2719 * (XXX should we use euid here?)
2720 */
2721 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2722 {
2723 char line[400];
2724 uid_t u;
2725 gid_t g;
2726 FILE *f;
2727
2728 *uid = -1;
2729 *gid = -1;
2730 sprintf(line, "/proc/%d/status", pid);
2731 if ((f = fopen(line, "r")) == NULL) {
2732 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2733 return;
2734 }
2735 while (fgets(line, 400, f)) {
2736 if (strncmp(line, "Uid:", 4) == 0) {
2737 if (sscanf(line+4, "%u", &u) != 1) {
2738 lxcfs_error("bad uid line for pid %u\n", pid);
2739 fclose(f);
2740 return;
2741 }
2742 *uid = u;
2743 } else if (strncmp(line, "Gid:", 4) == 0) {
2744 if (sscanf(line+4, "%u", &g) != 1) {
2745 lxcfs_error("bad gid line for pid %u\n", pid);
2746 fclose(f);
2747 return;
2748 }
2749 *gid = g;
2750 }
2751 }
2752 fclose(f);
2753 }
2754
2755 /*
2756 * May the requestor @r move victim @v to a new cgroup?
2757 * This is allowed if
2758 * . they are the same task
2759 * . they are owned by the same uid
2760 * . @r is root on the host, or
2761 * . @v's uid is mapped into @r's user namespace, where @r is root.
2762 */
2763 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2764 {
2765 uid_t v_uid, tmpuid;
2766 gid_t v_gid;
2767
2768 if (r == v)
2769 return true;
2770 if (r_uid == 0)
2771 return true;
2772 get_pid_creds(v, &v_uid, &v_gid);
2773 if (r_uid == v_uid)
2774 return true;
2775 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2776 && hostuid_to_ns(v_uid, r, &tmpuid))
2777 return true;
2778 return false;
2779 }
2780
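/*
 * do_write_pids: for each pid in @buf (as written by the container), send
 * it to a helper running in the writer's pid namespace, receive the pid
 * back via SCM_CREDENTIALS as seen in lxcfs's own pid namespace, check
 * may_move_pid(), and append it to the cgroup's tasks/cgroup.procs file.
 * A pid of -1 tells the helper to exit.
 */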
2781 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2782 const char *file, const char *buf)
2783 {
2784 int sock[2] = {-1, -1};
2785 pid_t qpid, cpid = -1;
2786 FILE *pids_file = NULL;
2787 bool answer = false, fail = false;
2788
2789 pids_file = open_pids_file(contrl, cg);
2790 if (!pids_file)
2791 return false;
2792
2793 /*
2794 * write the pids to a socket, have helper in writer's pidns
2795 * call movepid for us
2796 */
2797 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2798 perror("socketpair");
2799 goto out;
2800 }
2801
2802 cpid = fork();
2803 if (cpid == -1)
2804 goto out;
2805
2806 if (!cpid) { // child
2807 fclose(pids_file);
2808 pid_from_ns_wrapper(sock[1], tpid);
2809 }
2810
2811 const char *ptr = buf;
2812 while (sscanf(ptr, "%d", &qpid) == 1) {
2813 struct ucred cred;
2814 char v;
2815
2816 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2817 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2818 goto out;
2819 }
2820
2821 if (recv_creds(sock[0], &cred, &v)) {
2822 if (v == '0') {
2823 if (!may_move_pid(tpid, tuid, cred.pid)) {
2824 fail = true;
2825 break;
2826 }
2827 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2828 fail = true;
2829 }
2830 }
2831
2832 ptr = strchr(ptr, '\n');
2833 if (!ptr)
2834 break;
2835 ptr++;
2836 }
2837
2838 /* All good, write the value */
2839 qpid = -1;
2840 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2841 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2842
2843 if (!fail)
2844 answer = true;
2845
2846 out:
2847 if (cpid != -1)
2848 wait_for_pid(cpid);
2849 if (sock[0] != -1) {
2850 close(sock[0]);
2851 close(sock[1]);
2852 }
2853 if (pids_file) {
2854 if (fclose(pids_file) != 0)
2855 answer = false;
2856 }
2857 return answer;
2858 }
2859
2860 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2861 struct fuse_file_info *fi)
2862 {
2863 struct fuse_context *fc = fuse_get_context();
2864 char *localbuf = NULL;
2865 struct cgfs_files *k = NULL;
2866 struct file_info *f = (struct file_info *)fi->fh;
2867 bool r;
2868
2869 if (f->type != LXC_TYPE_CGFILE) {
2870 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2871 return -EIO;
2872 }
2873
2874 if (offset)
2875 return 0;
2876
2877 if (!fc)
2878 return -EIO;
2879
2880 localbuf = alloca(size+1);
2881 localbuf[size] = '\0';
2882 memcpy(localbuf, buf, size);
2883
2884 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2885 size = -EINVAL;
2886 goto out;
2887 }
2888
2889 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2890 size = -EACCES;
2891 goto out;
2892 }
2893
2894 if (strcmp(f->file, "tasks") == 0 ||
2895 strcmp(f->file, "/tasks") == 0 ||
2896 strcmp(f->file, "/cgroup.procs") == 0 ||
2897 strcmp(f->file, "cgroup.procs") == 0)
2898 // special case - we have to translate the pids
2899 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2900 else
2901 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2902
2903 if (!r)
2904 size = -EINVAL;
2905
2906 out:
2907 free_key(k);
2908 return size;
2909 }
2910
2911 int cg_chown(const char *path, uid_t uid, gid_t gid)
2912 {
2913 struct fuse_context *fc = fuse_get_context();
2914 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2915 struct cgfs_files *k = NULL;
2916 const char *cgroup;
2917 int ret;
2918
2919 if (!fc)
2920 return -EIO;
2921
2922 if (strcmp(path, "/cgroup") == 0)
2923 return -EPERM;
2924
2925 controller = pick_controller_from_path(fc, path);
2926 if (!controller)
2927 return errno == ENOENT ? -EPERM : -errno;
2928
2929 cgroup = find_cgroup_in_path(path);
2930 if (!cgroup)
2931 /* this is just /cgroup/controller */
2932 return -EPERM;
2933
2934 get_cgdir_and_path(cgroup, &cgdir, &last);
2935
2936 if (!last) {
2937 path1 = "/";
2938 path2 = cgdir;
2939 } else {
2940 path1 = cgdir;
2941 path2 = last;
2942 }
2943
2944 if (is_child_cgroup(controller, path1, path2)) {
2945 // get uid, gid, from '/tasks' file and make up a mode
2946 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2947 k = cgfs_get_key(controller, cgroup, "tasks");
2948
2949 } else
2950 k = cgfs_get_key(controller, path1, path2);
2951
2952 if (!k) {
2953 ret = -EINVAL;
2954 goto out;
2955 }
2956
2957 /*
2958 * This being a fuse request, the uid and gid must be valid
2959 * in the caller's namespace. So we can just check to make
2960 * sure that the caller is root in their own namespace, and privileged
2961 * over the file's current owner.
2962 */
2963 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2964 ret = -EACCES;
2965 goto out;
2966 }
2967
2968 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2969
2970 out:
2971 free_key(k);
2972 free(cgdir);
2973
2974 return ret;
2975 }
2976
2977 int cg_chmod(const char *path, mode_t mode)
2978 {
2979 struct fuse_context *fc = fuse_get_context();
2980 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2981 struct cgfs_files *k = NULL;
2982 const char *cgroup;
2983 int ret;
2984
2985 if (!fc)
2986 return -EIO;
2987
2988 if (strcmp(path, "/cgroup") == 0)
2989 return -EPERM;
2990
2991 controller = pick_controller_from_path(fc, path);
2992 if (!controller)
2993 return errno == ENOENT ? -EPERM : -errno;
2994
2995 cgroup = find_cgroup_in_path(path);
2996 if (!cgroup)
2997 /* this is just /cgroup/controller */
2998 return -EPERM;
2999
3000 get_cgdir_and_path(cgroup, &cgdir, &last);
3001
3002 if (!last) {
3003 path1 = "/";
3004 path2 = cgdir;
3005 } else {
3006 path1 = cgdir;
3007 path2 = last;
3008 }
3009
3010 if (is_child_cgroup(controller, path1, path2)) {
3011 // get uid, gid, from '/tasks' file and make up a mode
3012 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3013 k = cgfs_get_key(controller, cgroup, "tasks");
3014
3015 } else
3016 k = cgfs_get_key(controller, path1, path2);
3017
3018 if (!k) {
3019 ret = -EINVAL;
3020 goto out;
3021 }
3022
3023 /*
3024 * This being a fuse request, the uid and gid must be valid
3025 * in the caller's namespace. So we can just check to make
3026 * sure that the caller is root in their own namespace, and privileged
3027 * over the file's current owner.
3028 */
3029 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3030 ret = -EPERM;
3031 goto out;
3032 }
3033
3034 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3035 ret = -EINVAL;
3036 goto out;
3037 }
3038
3039 ret = 0;
3040 out:
3041 free_key(k);
3042 free(cgdir);
3043 return ret;
3044 }
3045
3046 int cg_mkdir(const char *path, mode_t mode)
3047 {
3048 struct fuse_context *fc = fuse_get_context();
3049 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3050 const char *cgroup;
3051 int ret;
3052
3053 if (!fc)
3054 return -EIO;
3055
3056 controller = pick_controller_from_path(fc, path);
3057 if (!controller)
3058 return errno == ENOENT ? -EPERM : -errno;
3059
3060 cgroup = find_cgroup_in_path(path);
3061 if (!cgroup)
3062 return -errno;
3063
3064 get_cgdir_and_path(cgroup, &cgdir, &last);
3065 if (!last)
3066 path1 = "/";
3067 else
3068 path1 = cgdir;
3069
3070 pid_t initpid = lookup_initpid_in_store(fc->pid);
3071 if (initpid <= 0)
3072 initpid = fc->pid;
3073 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3074 if (!next)
3075 ret = -EINVAL;
3076 else if (last && strcmp(next, last) == 0)
3077 ret = -EEXIST;
3078 else
3079 ret = -EPERM;
3080 goto out;
3081 }
3082
3083 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3084 ret = -EACCES;
3085 goto out;
3086 }
3087 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3088 ret = -EACCES;
3089 goto out;
3090 }
3091
3092 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3093
3094 out:
3095 free(cgdir);
3096 free(next);
3097 return ret;
3098 }
3099
3100 int cg_rmdir(const char *path)
3101 {
3102 struct fuse_context *fc = fuse_get_context();
3103 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3104 const char *cgroup;
3105 int ret;
3106
3107 if (!fc)
3108 return -EIO;
3109
3110 controller = pick_controller_from_path(fc, path);
3111 if (!controller) /* Someone's trying to delete "/cgroup". */
3112 return -EPERM;
3113
3114 cgroup = find_cgroup_in_path(path);
3115 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3116 return -EPERM;
3117
3118 get_cgdir_and_path(cgroup, &cgdir, &last);
3119 if (!last) {
3120 /* Someone's trying to delete a cgroup on the same level as the
3121 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3122 * rmdir "/cgroup/blkio/init.slice".
3123 */
3124 ret = -EPERM;
3125 goto out;
3126 }
3127
3128 pid_t initpid = lookup_initpid_in_store(fc->pid);
3129 if (initpid <= 0)
3130 initpid = fc->pid;
3131 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3132 if (!last || (next && (strcmp(next, last) == 0)))
3133 ret = -EBUSY;
3134 else
3135 ret = -ENOENT;
3136 goto out;
3137 }
3138
3139 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3140 ret = -EACCES;
3141 goto out;
3142 }
3143 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3144 ret = -EACCES;
3145 goto out;
3146 }
3147
3148 if (!cgfs_remove(controller, cgroup)) {
3149 ret = -EINVAL;
3150 goto out;
3151 }
3152
3153 ret = 0;
3154
3155 out:
3156 free(cgdir);
3157 free(next);
3158 return ret;
3159 }
3160
3161 static bool startswith(const char *line, const char *pref)
3162 {
3163 if (strncmp(line, pref, strlen(pref)) == 0)
3164 return true;
3165 return false;
3166 }
3167
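/*
 * parse_memstat: pull the hierarchical total_* counters out of a
 * memory.stat blob and convert them from bytes to kB.
 */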
3168 static void parse_memstat(char *memstat, unsigned long *cached,
3169 unsigned long *active_anon, unsigned long *inactive_anon,
3170 unsigned long *active_file, unsigned long *inactive_file,
3171 unsigned long *unevictable, unsigned long *shmem)
3172 {
3173 char *eol;
3174
3175 while (*memstat) {
3176 if (startswith(memstat, "total_cache")) {
3177 sscanf(memstat + 11, "%lu", cached);
3178 *cached /= 1024;
3179 } else if (startswith(memstat, "total_active_anon")) {
3180 sscanf(memstat + 17, "%lu", active_anon);
3181 *active_anon /= 1024;
3182 } else if (startswith(memstat, "total_inactive_anon")) {
3183 sscanf(memstat + 19, "%lu", inactive_anon);
3184 *inactive_anon /= 1024;
3185 } else if (startswith(memstat, "total_active_file")) {
3186 sscanf(memstat + 17, "%lu", active_file);
3187 *active_file /= 1024;
3188 } else if (startswith(memstat, "total_inactive_file")) {
3189 sscanf(memstat + 19, "%lu", inactive_file);
3190 *inactive_file /= 1024;
3191 } else if (startswith(memstat, "total_unevictable")) {
3192 sscanf(memstat + 17, "%lu", unevictable);
3193 *unevictable /= 1024;
3194 } else if (startswith(memstat, "total_shmem")) {
3195 sscanf(memstat + 11, "%lu", shmem);
3196 *shmem /= 1024;
3197 }
3198 eol = strchr(memstat, '\n');
3199 if (!eol)
3200 return;
3201 memstat = eol+1;
3202 }
3203 }
3204
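/*
 * get_blkio_io_value: find the line starting with "major:minor iotype" in a
 * blkio.*_recursive blob @str and store its value in *v (0 if not found).
 */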
3205 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3206 {
3207 char *eol;
3208 char key[32];
3209
3210 memset(key, 0, 32);
3211 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3212
3213 size_t len = strlen(key);
3214 *v = 0;
3215
3216 while (*str) {
3217 if (startswith(str, key)) {
3218 sscanf(str + len, "%lu", v);
3219 return;
3220 }
3221 eol = strchr(str, '\n');
3222 if (!eol)
3223 return;
3224 str = eol+1;
3225 }
3226 }
3227
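/*
 * read_file: copy @path line by line into the per-open cache d->buf and
 * return up to @size bytes in @buf.  Used as the host passthrough when no
 * cgroup can be found for the caller on the relevant controller.
 */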
3228 static int read_file(const char *path, char *buf, size_t size,
3229 struct file_info *d)
3230 {
3231 size_t linelen = 0, total_len = 0, rv = 0;
3232 char *line = NULL;
3233 char *cache = d->buf;
3234 size_t cache_size = d->buflen;
3235 FILE *f = fopen(path, "r");
3236 if (!f)
3237 return 0;
3238
3239 while (getline(&line, &linelen, f) != -1) {
3240 ssize_t l = snprintf(cache, cache_size, "%s", line);
3241 if (l < 0) {
3242 perror("Error writing to cache");
3243 rv = 0;
3244 goto err;
3245 }
3246 if (l >= cache_size) {
3247 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3248 rv = 0;
3249 goto err;
3250 }
3251 cache += l;
3252 cache_size -= l;
3253 total_len += l;
3254 }
3255
3256 d->size = total_len;
3257 if (total_len > size)
3258 total_len = size;
3259
3260 /* read from off 0 */
3261 memcpy(buf, d->buf, total_len);
3262 rv = total_len;
3263 err:
3264 fclose(f);
3265 free(line);
3266 return rv;
3267 }
3268
3269 /*
3270 * FUSE ops for /proc
3271 */
3272
3273 static unsigned long get_memlimit(const char *cgroup, const char *file)
3274 {
3275 char *memlimit_str = NULL;
3276 unsigned long memlimit = -1;
3277
3278 if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3279 memlimit = strtoul(memlimit_str, NULL, 10);
3280
3281 free(memlimit_str);
3282
3283 return memlimit;
3284 }
3285
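/*
 * get_min_memlimit: walk from @cgroup up to the root and return the
 * smallest limit found in @file, since an ancestor's limit also caps the
 * container.
 */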
3286 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3287 {
3288 char *copy = strdupa(cgroup);
3289 unsigned long memlimit = 0, retlimit;
3290
3291 retlimit = get_memlimit(copy, file);
3292
3293 while (strcmp(copy, "/") != 0) {
3294 copy = dirname(copy);
3295 memlimit = get_memlimit(copy, file);
3296 if (memlimit != -1 && memlimit < retlimit)
3297 retlimit = memlimit;
3298 };
3299
3300 return retlimit;
3301 }
3302
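/*
 * proc_meminfo_read: render /proc/meminfo for the container.  Host lines
 * are copied through, but MemTotal, MemFree, MemAvailable, the swap lines
 * and the page-cache related fields are replaced with values derived from
 * the memory cgroup.
 */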
3303 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3304 struct fuse_file_info *fi)
3305 {
3306 struct fuse_context *fc = fuse_get_context();
3307 struct file_info *d = (struct file_info *)fi->fh;
3308 char *cg;
3309 char *memusage_str = NULL, *memstat_str = NULL,
3310 *memswlimit_str = NULL, *memswusage_str = NULL;
3311 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3312 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3313 active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
3314 hostswtotal = 0;
3315 char *line = NULL;
3316 size_t linelen = 0, total_len = 0, rv = 0;
3317 char *cache = d->buf;
3318 size_t cache_size = d->buflen;
3319 FILE *f = NULL;
3320
3321 if (offset){
3322 if (offset > d->size)
3323 return -EINVAL;
3324 if (!d->cached)
3325 return 0;
3326 int left = d->size - offset;
3327 total_len = left > size ? size: left;
3328 memcpy(buf, cache + offset, total_len);
3329 return total_len;
3330 }
3331
3332 pid_t initpid = lookup_initpid_in_store(fc->pid);
3333 if (initpid <= 0)
3334 initpid = fc->pid;
3335 cg = get_pid_cgroup(initpid, "memory");
3336 if (!cg)
3337 return read_file("/proc/meminfo", buf, size, d);
3338 prune_init_slice(cg);
3339
3340 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3341 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3342 goto err;
3343 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3344 goto err;
3345
3346 	// The following values are allowed to fail, because swapaccount might be
3347 	// turned off for the current kernel
3348 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3349 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3350 {
3351 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3352 memswusage = strtoul(memswusage_str, NULL, 10);
3353
3354 memswlimit = memswlimit / 1024;
3355 memswusage = memswusage / 1024;
3356 }
3357
3358 memusage = strtoul(memusage_str, NULL, 10);
3359 memlimit /= 1024;
3360 memusage /= 1024;
3361
3362 parse_memstat(memstat_str, &cached, &active_anon,
3363 &inactive_anon, &active_file, &inactive_file,
3364 &unevictable, &shmem);
3365
3366 f = fopen("/proc/meminfo", "r");
3367 if (!f)
3368 goto err;
3369
3370 while (getline(&line, &linelen, f) != -1) {
3371 ssize_t l;
3372 char *printme, lbuf[100];
3373
3374 memset(lbuf, 0, 100);
3375 if (startswith(line, "MemTotal:")) {
3376 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3377 if (hosttotal < memlimit)
3378 memlimit = hosttotal;
3379 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3380 printme = lbuf;
3381 } else if (startswith(line, "MemFree:")) {
3382 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3383 printme = lbuf;
3384 } else if (startswith(line, "MemAvailable:")) {
3385 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3386 printme = lbuf;
3387 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3388 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3389 if (hostswtotal < memswlimit)
3390 memswlimit = hostswtotal;
3391 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3392 printme = lbuf;
3393 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3394 unsigned long swaptotal = memswlimit,
3395 swapusage = memswusage - memusage,
3396 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3397 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3398 printme = lbuf;
3399 } else if (startswith(line, "Slab:")) {
3400 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3401 printme = lbuf;
3402 } else if (startswith(line, "Buffers:")) {
3403 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3404 printme = lbuf;
3405 } else if (startswith(line, "Cached:")) {
3406 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3407 printme = lbuf;
3408 } else if (startswith(line, "SwapCached:")) {
3409 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3410 printme = lbuf;
3411 } else if (startswith(line, "Active:")) {
3412 snprintf(lbuf, 100, "Active: %8lu kB\n",
3413 active_anon + active_file);
3414 printme = lbuf;
3415 } else if (startswith(line, "Inactive:")) {
3416 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3417 inactive_anon + inactive_file);
3418 printme = lbuf;
3419 } else if (startswith(line, "Active(anon)")) {
3420 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3421 printme = lbuf;
3422 } else if (startswith(line, "Inactive(anon)")) {
3423 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3424 printme = lbuf;
3425 } else if (startswith(line, "Active(file)")) {
3426 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3427 printme = lbuf;
3428 } else if (startswith(line, "Inactive(file)")) {
3429 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3430 printme = lbuf;
3431 } else if (startswith(line, "Unevictable")) {
3432 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3433 printme = lbuf;
3434 } else if (startswith(line, "SReclaimable")) {
3435 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3436 printme = lbuf;
3437 } else if (startswith(line, "SUnreclaim")) {
3438 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3439 printme = lbuf;
3440 } else if (startswith(line, "Shmem:")) {
3441 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3442 printme = lbuf;
3443 } else if (startswith(line, "ShmemHugePages")) {
3444 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3445 printme = lbuf;
3446 } else if (startswith(line, "ShmemPmdMapped")) {
3447 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3448 printme = lbuf;
3449 } else
3450 printme = line;
3451
3452 l = snprintf(cache, cache_size, "%s", printme);
3453 if (l < 0) {
3454 perror("Error writing to cache");
3455 rv = 0;
3456 goto err;
3457
3458 }
3459 if (l >= cache_size) {
3460 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3461 rv = 0;
3462 goto err;
3463 }
3464
3465 cache += l;
3466 cache_size -= l;
3467 total_len += l;
3468 }
3469
3470 d->cached = 1;
3471 d->size = total_len;
3472 if (total_len > size ) total_len = size;
3473 memcpy(buf, d->buf, total_len);
3474
3475 rv = total_len;
3476 err:
3477 if (f)
3478 fclose(f);
3479 free(line);
3480 free(cg);
3481 free(memusage_str);
3482 free(memswlimit_str);
3483 free(memswusage_str);
3484 free(memstat_str);
3485 return rv;
3486 }
3487
3488 /*
3489 * Read the cpuset.cpus for cg
3490 * Return the answer in a newly allocated string which must be freed
3491 */
3492 static char *get_cpuset(const char *cg)
3493 {
3494 char *answer;
3495
3496 if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3497 return NULL;
3498 return answer;
3499 }
3500
3501 bool cpu_in_cpuset(int cpu, const char *cpuset);
3502
3503 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3504 {
3505 int cpu;
3506
3507 if (sscanf(line, "processor : %d", &cpu) != 1)
3508 return false;
3509 return cpu_in_cpuset(cpu, cpuset);
3510 }
3511
3512 /*
3513 * check whether this is a '^processor' line in /proc/cpuinfo
3514 */
3515 static bool is_processor_line(const char *line)
3516 {
3517 int cpu;
3518
3519 if (sscanf(line, "processor : %d", &cpu) == 1)
3520 return true;
3521 return false;
3522 }
3523
3524 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3525 struct fuse_file_info *fi)
3526 {
3527 struct fuse_context *fc = fuse_get_context();
3528 struct file_info *d = (struct file_info *)fi->fh;
3529 char *cg;
3530 char *cpuset = NULL;
3531 char *line = NULL;
3532 size_t linelen = 0, total_len = 0, rv = 0;
3533 bool am_printing = false, firstline = true, is_s390x = false;
3534 int curcpu = -1, cpu;
3535 char *cache = d->buf;
3536 size_t cache_size = d->buflen;
3537 FILE *f = NULL;
3538
3539 if (offset){
3540 if (offset > d->size)
3541 return -EINVAL;
3542 if (!d->cached)
3543 return 0;
3544 int left = d->size - offset;
3545 total_len = left > size ? size: left;
3546 memcpy(buf, cache + offset, total_len);
3547 return total_len;
3548 }
3549
3550 pid_t initpid = lookup_initpid_in_store(fc->pid);
3551 if (initpid <= 0)
3552 initpid = fc->pid;
3553 cg = get_pid_cgroup(initpid, "cpuset");
3554 if (!cg)
3555 return read_file("proc/cpuinfo", buf, size, d);
3556 prune_init_slice(cg);
3557
3558 cpuset = get_cpuset(cg);
3559 if (!cpuset)
3560 goto err;
3561
3562 f = fopen("/proc/cpuinfo", "r");
3563 if (!f)
3564 goto err;
3565
3566 while (getline(&line, &linelen, f) != -1) {
3567 ssize_t l;
3568 if (firstline) {
3569 firstline = false;
3570 if (strstr(line, "IBM/S390") != NULL) {
3571 is_s390x = true;
3572 am_printing = true;
3573 continue;
3574 }
3575 }
3576 if (strncmp(line, "# processors:", 12) == 0)
3577 continue;
3578 if (is_processor_line(line)) {
3579 am_printing = cpuline_in_cpuset(line, cpuset);
3580 if (am_printing) {
3581 curcpu ++;
3582 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3583 if (l < 0) {
3584 perror("Error writing to cache");
3585 rv = 0;
3586 goto err;
3587 }
3588 if (l >= cache_size) {
3589 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3590 rv = 0;
3591 goto err;
3592 }
3593 cache += l;
3594 cache_size -= l;
3595 total_len += l;
3596 }
3597 continue;
3598 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3599 char *p;
3600 if (!cpu_in_cpuset(cpu, cpuset))
3601 continue;
3602 curcpu ++;
3603 p = strchr(line, ':');
3604 if (!p || !*p)
3605 goto err;
3606 p++;
3607 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3608 if (l < 0) {
3609 perror("Error writing to cache");
3610 rv = 0;
3611 goto err;
3612 }
3613 if (l >= cache_size) {
3614 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3615 rv = 0;
3616 goto err;
3617 }
3618 cache += l;
3619 cache_size -= l;
3620 total_len += l;
3621 continue;
3622
3623 }
3624 if (am_printing) {
3625 l = snprintf(cache, cache_size, "%s", line);
3626 if (l < 0) {
3627 perror("Error writing to cache");
3628 rv = 0;
3629 goto err;
3630 }
3631 if (l >= cache_size) {
3632 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3633 rv = 0;
3634 goto err;
3635 }
3636 cache += l;
3637 cache_size -= l;
3638 total_len += l;
3639 }
3640 }
3641
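	/* s390x: prepend the synthetic "vendor_id" and "# processors" header
	 * lines that the per-cpu output collected above does not include. */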
3642 if (is_s390x) {
3643 char *origcache = d->buf;
3644 ssize_t l;
3645 do {
3646 d->buf = malloc(d->buflen);
3647 } while (!d->buf);
3648 cache = d->buf;
3649 cache_size = d->buflen;
3650 total_len = 0;
3651 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3652 if (l < 0 || l >= cache_size) {
3653 free(origcache);
3654 goto err;
3655 }
3656 cache_size -= l;
3657 cache += l;
3658 total_len += l;
3659 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3660 if (l < 0 || l >= cache_size) {
3661 free(origcache);
3662 goto err;
3663 }
3664 cache_size -= l;
3665 cache += l;
3666 total_len += l;
3667 l = snprintf(cache, cache_size, "%s", origcache);
3668 free(origcache);
3669 if (l < 0 || l >= cache_size)
3670 goto err;
3671 total_len += l;
3672 }
3673
3674 d->cached = 1;
3675 d->size = total_len;
3676 if (total_len > size ) total_len = size;
3677
3678 /* read from off 0 */
3679 memcpy(buf, d->buf, total_len);
3680 rv = total_len;
3681 err:
3682 if (f)
3683 fclose(f);
3684 free(line);
3685 free(cpuset);
3686 free(cg);
3687 return rv;
3688 }
3689
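/*
 * get_reaper_start_time: return the start time of the calling pid's reaper
 * in clock ticks since boot, taken from field 22 of /proc/<pid>/stat.
 * On error 0 is returned and errno is set to EINVAL.
 */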
3690 static uint64_t get_reaper_start_time(pid_t pid)
3691 {
3692 int ret;
3693 FILE *f;
3694 uint64_t starttime;
3695 /* strlen("/proc/") = 6
3696 * +
3697 * LXCFS_NUMSTRLEN64
3698 * +
3699 * strlen("/stat") = 5
3700 * +
3701 * \0 = 1
3702 * */
3703 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3704 char path[__PROC_PID_STAT_LEN];
3705 pid_t qpid;
3706
3707 qpid = lookup_initpid_in_store(pid);
3708 if (qpid <= 0) {
3709 /* Caller can check for EINVAL on 0. */
3710 errno = EINVAL;
3711 return 0;
3712 }
3713
3714 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3715 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3716 /* Caller can check for EINVAL on 0. */
3717 errno = EINVAL;
3718 return 0;
3719 }
3720
3721 f = fopen(path, "r");
3722 if (!f) {
3723 /* Caller can check for EINVAL on 0. */
3724 errno = EINVAL;
3725 return 0;
3726 }
3727
3728 	/* Note that the *scanf() argument suppression requires that length
3729 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3730 * at us. It's like telling someone you're not married and then asking
3731 * if you can bring your wife to the party.
3732 */
3733 ret = fscanf(f, "%*d " /* (1) pid %d */
3734 "%*s " /* (2) comm %s */
3735 "%*c " /* (3) state %c */
3736 "%*d " /* (4) ppid %d */
3737 "%*d " /* (5) pgrp %d */
3738 "%*d " /* (6) session %d */
3739 "%*d " /* (7) tty_nr %d */
3740 "%*d " /* (8) tpgid %d */
3741 "%*u " /* (9) flags %u */
3742 "%*u " /* (10) minflt %lu */
3743 "%*u " /* (11) cminflt %lu */
3744 "%*u " /* (12) majflt %lu */
3745 "%*u " /* (13) cmajflt %lu */
3746 "%*u " /* (14) utime %lu */
3747 "%*u " /* (15) stime %lu */
3748 "%*d " /* (16) cutime %ld */
3749 "%*d " /* (17) cstime %ld */
3750 "%*d " /* (18) priority %ld */
3751 "%*d " /* (19) nice %ld */
3752 "%*d " /* (20) num_threads %ld */
3753 "%*d " /* (21) itrealvalue %ld */
3754 "%" PRIu64, /* (22) starttime %llu */
3755 &starttime);
3756 if (ret != 1) {
3757 fclose(f);
3758 /* Caller can check for EINVAL on 0. */
3759 errno = EINVAL;
3760 return 0;
3761 }
3762
3763 fclose(f);
3764
3765 errno = 0;
3766 return starttime;
3767 }
3768
3769 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3770 {
3771 uint64_t clockticks;
3772 int64_t ticks_per_sec;
3773
3774 clockticks = get_reaper_start_time(pid);
3775 if (clockticks == 0 && errno == EINVAL) {
3776 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3777 return 0;
3778 }
3779
3780 ticks_per_sec = sysconf(_SC_CLK_TCK);
3781 if (ticks_per_sec < 0 && errno == EINVAL) {
3782 lxcfs_debug(
3783 "%s\n",
3784 "failed to determine number of clock ticks in a second");
3785 return 0;
3786 }
3787
3788 return (clockticks /= ticks_per_sec);
3789 }
3790
3791 static uint64_t get_reaper_age(pid_t pid)
3792 {
3793 uint64_t procstart, uptime, procage;
3794
3795 	/* To get the reaper's age, subtract the time at which the reaper
3796 	 * started (measured in seconds since boot) from the current
3797 	 * system uptime.
3798 	 */
3799 procstart = get_reaper_start_time_in_sec(pid);
3800 procage = procstart;
3801 if (procstart > 0) {
3802 int ret;
3803 struct timespec spec;
3804
3805 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3806 if (ret < 0)
3807 return 0;
3808 /* We could make this more precise here by using the tv_nsec
3809 * field in the timespec struct and convert it to milliseconds
3810 * and then create a double for the seconds and milliseconds but
3811 * that seems more work than it is worth.
3812 */
3813 uptime = spec.tv_sec;
3814 procage = uptime - procstart;
3815 }
3816
3817 return procage;
3818 }
3819
3820 /*
3821 * Returns 0 on success.
3822 * It is the caller's responsibility to free `return_usage`, unless this
3823 * function returns an error.
3824 */
3825 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage)
3826 {
3827 int cpucount = get_nprocs();
3828 struct cpuacct_usage *cpu_usage;
3829 int rv = 0, i, j, ret, read_pos = 0, read_cnt;
3830 int cg_cpu;
3831 uint64_t cg_user, cg_system;
3832 int64_t ticks_per_sec;
3833 char *usage_str = NULL;
3834
3835 ticks_per_sec = sysconf(_SC_CLK_TCK);
3836
3837 if (ticks_per_sec < 0 && errno == EINVAL) {
3838 lxcfs_debug(
3839 "%s\n",
3840 "read_cpuacct_usage_all failed to determine number of clock ticks "
3841 "in a second");
3842 return -1;
3843 }
3844
3845 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3846 if (!cpu_usage)
3847 return -ENOMEM;
3848
3849 if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3850 rv = -1;
3851 goto err;
3852 }
3853
3854 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3855 lxcfs_error("read_cpuacct_usage_all reading first line from "
3856 "%s/cpuacct.usage_all failed.\n", cg);
3857 rv = -1;
3858 goto err;
3859 }
3860
3861 read_pos += read_cnt;
3862
3863 for (i = 0, j = 0; i < cpucount; i++) {
3864 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3865 &cg_system, &read_cnt);
3866
3867 if (ret == EOF)
3868 break;
3869
3870 if (ret != 3) {
3871 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3872 "failed.\n", cg);
3873 rv = -1;
3874 goto err;
3875 }
3876
3877 read_pos += read_cnt;
3878
3879 if (!cpu_in_cpuset(i, cpuset))
3880 continue;
3881
3882 /* Convert the time from nanoseconds to USER_HZ */
3883 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3884 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3885 j++;
3886 }
3887
3888 rv = 0;
3889 *return_usage = cpu_usage;
3890
3891 err:
3892 if (usage_str)
3893 free(usage_str);
3894
3895 if (rv != 0) {
3896 free(cpu_usage);
3897 *return_usage = NULL;
3898 }
3899
3900 return rv;
3901 }
3902
3903 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
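/*
 * proc_stat_read: render /proc/stat for the container.  Per-cpu lines are
 * written after a CPUALL_MAX_SIZE hole reserved at the start of the cache;
 * once the per-cpu sums are known, the aggregate "cpu " line is written
 * into the hole and the per-cpu block is memmove()d up behind it.
 */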
3904 static int proc_stat_read(char *buf, size_t size, off_t offset,
3905 struct fuse_file_info *fi)
3906 {
3907 struct fuse_context *fc = fuse_get_context();
3908 struct file_info *d = (struct file_info *)fi->fh;
3909 char *cg;
3910 char *cpuset = NULL;
3911 char *line = NULL;
3912 size_t linelen = 0, total_len = 0, rv = 0;
3913 int curcpu = -1; /* cpu numbering starts at 0 */
3914 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3915 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3916 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3917 char cpuall[CPUALL_MAX_SIZE];
3918 /* reserve for cpu all */
3919 char *cache = d->buf + CPUALL_MAX_SIZE;
3920 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3921 FILE *f = NULL;
3922 struct cpuacct_usage *cg_cpu_usage = NULL;
3923
3924 if (offset){
3925 if (offset > d->size)
3926 return -EINVAL;
3927 if (!d->cached)
3928 return 0;
3929 int left = d->size - offset;
3930 total_len = left > size ? size: left;
3931 memcpy(buf, d->buf + offset, total_len);
3932 return total_len;
3933 }
3934
3935 pid_t initpid = lookup_initpid_in_store(fc->pid);
3936 if (initpid <= 0)
3937 initpid = fc->pid;
3938 cg = get_pid_cgroup(initpid, "cpuset");
3939 if (!cg)
3940 return read_file("/proc/stat", buf, size, d);
3941 prune_init_slice(cg);
3942
3943 cpuset = get_cpuset(cg);
3944 if (!cpuset)
3945 goto err;
3946
3947 /*
3948 * Read cpuacct.usage_all for all CPUs.
3949 * If the cpuacct cgroup is present, it is used to calculate the container's
3950 * CPU usage. If not, values from the host's /proc/stat are used.
3951 */
3952 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage) != 0) {
3953 lxcfs_debug("%s\n", "proc_stat_read failed to read from cpuacct, "
3954 "falling back to the host's /proc/stat");
3955 }
3956
3957 f = fopen("/proc/stat", "r");
3958 if (!f)
3959 goto err;
3960
3961 //skip first line
3962 if (getline(&line, &linelen, f) < 0) {
3963 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3964 goto err;
3965 }
3966
3967 while (getline(&line, &linelen, f) != -1) {
3968 ssize_t l;
3969 int cpu;
3970 char cpu_char[10]; /* That's a lot of cores */
3971 char *c;
3972 uint64_t all_used, cg_used, new_idle;
3973 int ret;
3974
3975 if (strlen(line) == 0)
3976 continue;
3977 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3978 /* not a ^cpuN line containing a number N, just print it */
3979 l = snprintf(cache, cache_size, "%s", line);
3980 if (l < 0) {
3981 perror("Error writing to cache");
3982 rv = 0;
3983 goto err;
3984 }
3985 if (l >= cache_size) {
3986 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3987 rv = 0;
3988 goto err;
3989 }
3990 cache += l;
3991 cache_size -= l;
3992 total_len += l;
3993 continue;
3994 }
3995
3996 if (sscanf(cpu_char, "%d", &cpu) != 1)
3997 continue;
3998 if (!cpu_in_cpuset(cpu, cpuset))
3999 continue;
4000 curcpu ++;
4001
4002 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4003 &user,
4004 &nice,
4005 &system,
4006 &idle,
4007 &iowait,
4008 &irq,
4009 &softirq,
4010 &steal,
4011 &guest,
4012 &guest_nice);
4013
4014 if (ret != 10 || !cg_cpu_usage) {
4015 c = strchr(line, ' ');
4016 if (!c)
4017 continue;
4018 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4019 if (l < 0) {
4020 perror("Error writing to cache");
4021 rv = 0;
4022 goto err;
4023
4024 }
4025 if (l >= cache_size) {
4026 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4027 rv = 0;
4028 goto err;
4029 }
4030
4031 cache += l;
4032 cache_size -= l;
4033 total_len += l;
4034
4035 if (ret != 10)
4036 continue;
4037 }
4038
4039 if (cg_cpu_usage) {
4040 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4041 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4042
4043 if (all_used >= cg_used) {
4044 new_idle = idle + (all_used - cg_used);
4045
4046 } else {
4047 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4048 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4049 curcpu, cg, all_used, cg_used);
4050 new_idle = idle;
4051 }
4052
4053 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4054 curcpu, cg_cpu_usage[curcpu].user, cg_cpu_usage[curcpu].system,
4055 new_idle);
4056
4057 if (l < 0) {
4058 perror("Error writing to cache");
4059 rv = 0;
4060 goto err;
4061
4062 }
4063 if (l >= cache_size) {
4064 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4065 rv = 0;
4066 goto err;
4067 }
4068
4069 cache += l;
4070 cache_size -= l;
4071 total_len += l;
4072
4073 user_sum += cg_cpu_usage[curcpu].user;
4074 system_sum += cg_cpu_usage[curcpu].system;
4075 idle_sum += new_idle;
4076
4077 } else {
4078 user_sum += user;
4079 nice_sum += nice;
4080 system_sum += system;
4081 idle_sum += idle;
4082 iowait_sum += iowait;
4083 irq_sum += irq;
4084 softirq_sum += softirq;
4085 steal_sum += steal;
4086 guest_sum += guest;
4087 guest_nice_sum += guest_nice;
4088 }
4089 }
4090
4091 cache = d->buf;
4092
4093 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4094 user_sum,
4095 nice_sum,
4096 system_sum,
4097 idle_sum,
4098 iowait_sum,
4099 irq_sum,
4100 softirq_sum,
4101 steal_sum,
4102 guest_sum,
4103 guest_nice_sum);
4104 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4105 memcpy(cache, cpuall, cpuall_len);
4106 cache += cpuall_len;
4107 } else {
4108 /* shouldn't happen */
4109 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4110 cpuall_len = 0;
4111 }
4112
4113 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4114 total_len += cpuall_len;
4115 d->cached = 1;
4116 d->size = total_len;
4117 if (total_len > size)
4118 total_len = size;
4119
4120 memcpy(buf, d->buf, total_len);
4121 rv = total_len;
4122
4123 err:
4124 if (f)
4125 fclose(f);
4126 if (cg_cpu_usage)
4127 free(cg_cpu_usage);
4128 free(line);
4129 free(cpuset);
4130 free(cg);
4131 return rv;
4132 }
4133
4134 /* This function retrieves the busy time of a group of tasks by looking at
4135 * cpuacct.usage. Unfortunately, this only makes sense when the container has
4136 * been given its own cpuacct cgroup. If not, this function will take the busy
4137 * time of all other tasks that do not actually belong to the container into
4138 * account as well. If someone has a clever solution for this please send a
4139 * patch!
4140 */
4141 static unsigned long get_reaper_busy(pid_t task)
4142 {
4143 pid_t initpid = lookup_initpid_in_store(task);
4144 char *cgroup = NULL, *usage_str = NULL;
4145 unsigned long usage = 0;
4146
4147 if (initpid <= 0)
4148 return 0;
4149
4150 cgroup = get_pid_cgroup(initpid, "cpuacct");
4151 if (!cgroup)
4152 goto out;
4153 prune_init_slice(cgroup);
4154 if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
4155 goto out;
4156 usage = strtoul(usage_str, NULL, 10);
4157 usage /= 1000000000;
4158
4159 out:
4160 free(cgroup);
4161 free(usage_str);
4162 return usage;
4163 }
4164
4165 #if RELOADTEST
4166 void iwashere(void)
4167 {
4168 int fd;
4169
4170 fd = creat("/tmp/lxcfs-iwashere", 0644);
4171 if (fd >= 0)
4172 close(fd);
4173 }
4174 #endif
4175
4176 /*
4177 * Synthesize /proc/uptime for the container: the first field is the age
4178 * of the calling pid's reaper (see get_reaper_age()), the second is that
4179 * age minus the reaper's busy time (see get_reaper_busy()).
4180 */
4181 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4182 struct fuse_file_info *fi)
4183 {
4184 struct fuse_context *fc = fuse_get_context();
4185 struct file_info *d = (struct file_info *)fi->fh;
4186 unsigned long int busytime = get_reaper_busy(fc->pid);
4187 char *cache = d->buf;
4188 ssize_t total_len = 0;
4189 uint64_t idletime, reaperage;
4190
4191 #if RELOADTEST
4192 iwashere();
4193 #endif
4194
4195 if (offset){
4196 if (!d->cached)
4197 return 0;
4198 if (offset > d->size)
4199 return -EINVAL;
4200 int left = d->size - offset;
4201 total_len = left > size ? size: left;
4202 memcpy(buf, cache + offset, total_len);
4203 return total_len;
4204 }
4205
4206 reaperage = get_reaper_age(fc->pid);
4207 /* To understand why this is done, please read the comment to the
4208 * get_reaper_busy() function.
4209 */
4210 idletime = reaperage;
4211 if (reaperage >= busytime)
4212 idletime = reaperage - busytime;
4213
4214 total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4215 if (total_len < 0 || total_len >= d->buflen){
4216 lxcfs_error("%s\n", "failed to write to cache");
4217 return 0;
4218 }
4219
4220 d->size = (int)total_len;
4221 d->cached = 1;
4222
4223 if (total_len > size) total_len = size;
4224
4225 memcpy(buf, d->buf, total_len);
4226 return total_len;
4227 }
4228
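/*
 * proc_diskstats_read: render /proc/diskstats from the blkio cgroup's
 * recursive counters.  Fields that cannot be derived from blkio (in-flight
 * ios and the weighted-ticks column) are reported as 0.
 */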
4229 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4230 struct fuse_file_info *fi)
4231 {
4232 char dev_name[72];
4233 struct fuse_context *fc = fuse_get_context();
4234 struct file_info *d = (struct file_info *)fi->fh;
4235 char *cg;
4236 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4237 *io_wait_time_str = NULL, *io_service_time_str = NULL;
4238 unsigned long read = 0, write = 0;
4239 unsigned long read_merged = 0, write_merged = 0;
4240 unsigned long read_sectors = 0, write_sectors = 0;
4241 unsigned long read_ticks = 0, write_ticks = 0;
4242 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4243 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4244 char *cache = d->buf;
4245 size_t cache_size = d->buflen;
4246 char *line = NULL;
4247 size_t linelen = 0, total_len = 0, rv = 0;
4248 unsigned int major = 0, minor = 0;
4249 int i = 0;
4250 FILE *f = NULL;
4251
4252 if (offset){
4253 if (offset > d->size)
4254 return -EINVAL;
4255 if (!d->cached)
4256 return 0;
4257 int left = d->size - offset;
4258 total_len = left > size ? size: left;
4259 memcpy(buf, cache + offset, total_len);
4260 return total_len;
4261 }
4262
4263 pid_t initpid = lookup_initpid_in_store(fc->pid);
4264 if (initpid <= 0)
4265 initpid = fc->pid;
4266 cg = get_pid_cgroup(initpid, "blkio");
4267 if (!cg)
4268 return read_file("/proc/diskstats", buf, size, d);
4269 prune_init_slice(cg);
4270
4271 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4272 goto err;
4273 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4274 goto err;
4275 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4276 goto err;
4277 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4278 goto err;
4279 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4280 goto err;
4281
4282
4283 f = fopen("/proc/diskstats", "r");
4284 if (!f)
4285 goto err;
4286
4287 while (getline(&line, &linelen, f) != -1) {
4288 ssize_t l;
4289 char lbuf[256];
4290
4291 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4292 if (i != 3)
4293 continue;
4294
4295 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4296 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4297 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4298 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4299 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4300 read_sectors = read_sectors/512;
4301 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4302 write_sectors = write_sectors/512;
4303
4304 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4305 rd_svctm = rd_svctm/1000000;
4306 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4307 rd_wait = rd_wait/1000000;
4308 read_ticks = rd_svctm + rd_wait;
4309
4310 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4311 wr_svctm = wr_svctm/1000000;
4312 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4313 wr_wait = wr_wait/1000000;
4314 write_ticks = wr_svctm + wr_wait;
4315
4316 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4317 tot_ticks = tot_ticks/1000000;
4318
4319 memset(lbuf, 0, 256);
4320 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4321 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4322 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4323 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4324 else
4325 continue;
4326
4327 l = snprintf(cache, cache_size, "%s", lbuf);
4328 if (l < 0) {
4329 perror("Error writing to fuse buf");
4330 rv = 0;
4331 goto err;
4332 }
4333 if (l >= cache_size) {
4334 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4335 rv = 0;
4336 goto err;
4337 }
4338 cache += l;
4339 cache_size -= l;
4340 total_len += l;
4341 }
4342
4343 d->cached = 1;
4344 d->size = total_len;
4345 if (total_len > size) total_len = size;
4346 memcpy(buf, d->buf, total_len);
4347
4348 rv = total_len;
4349 err:
4350 free(cg);
4351 if (f)
4352 fclose(f);
4353 free(line);
4354 free(io_serviced_str);
4355 free(io_merged_str);
4356 free(io_service_bytes_str);
4357 free(io_wait_time_str);
4358 free(io_service_time_str);
4359 return rv;
4360 }
4361
4362 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4363 struct fuse_file_info *fi)
4364 {
4365 struct fuse_context *fc = fuse_get_context();
4366 struct file_info *d = (struct file_info *)fi->fh;
4367 char *cg = NULL;
4368 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4369 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4370 ssize_t total_len = 0, rv = 0;
4371 ssize_t l = 0;
4372 char *cache = d->buf;
4373
4374 if (offset) {
4375 if (offset > d->size)
4376 return -EINVAL;
4377 if (!d->cached)
4378 return 0;
4379 int left = d->size - offset;
4380 total_len = left > size ? size: left;
4381 memcpy(buf, cache + offset, total_len);
4382 return total_len;
4383 }
4384
4385 pid_t initpid = lookup_initpid_in_store(fc->pid);
4386 if (initpid <= 0)
4387 initpid = fc->pid;
4388 cg = get_pid_cgroup(initpid, "memory");
4389 if (!cg)
4390 return read_file("/proc/swaps", buf, size, d);
4391 prune_init_slice(cg);
4392
4393 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4394
4395 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4396 goto err;
4397
4398 memusage = strtoul(memusage_str, NULL, 10);
4399
4400 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4401 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4402
4403 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4404 memswusage = strtoul(memswusage_str, NULL, 10);
4405
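/*
 * memory.memsw.* accounts for memory + swap, so subtracting the plain
 * memory figures leaves the swap portion; divide by 1024 to report kB.
 */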
4406 swap_total = (memswlimit - memlimit) / 1024;
4407 swap_free = (memswusage - memusage) / 1024;
4408 }
4409
4410 total_len = snprintf(d->buf, d->buflen, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4411
4412 /* No memory + swap limit is set (or swapaccount=0), so fall back to the host's SwapTotal/SwapFree from /proc/meminfo. */
4413 if (!memswlimit) {
4414 char *line = NULL;
4415 size_t linelen = 0;
4416 FILE *f = fopen("/proc/meminfo", "r");
4417
4418 if (!f)
4419 goto err;
4420
4421 while (getline(&line, &linelen, f) != -1) {
4422 if (startswith(line, "SwapTotal:")) {
4423 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
4424 } else if (startswith(line, "SwapFree:")) {
4425 sscanf(line, "SwapFree: %8lu kB", &swap_free);
4426 }
4427 }
4428
4429 free(line);
4430 fclose(f);
4431 }
4432
4433 if (swap_total > 0) {
4434 l = snprintf(d->buf + total_len, d->buflen - total_len,
4435 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4436 swap_total, swap_free);
4437 total_len += l;
4438 }
4439
4440 if (total_len < 0 || l < 0) {
4441 perror("Error writing to cache");
4442 rv = 0;
4443 goto err;
4444 }
4445
4446 d->cached = 1;
4447 d->size = (int)total_len;
4448
4449 if (total_len > size) total_len = size;
4450 memcpy(buf, d->buf, total_len);
4451 rv = total_len;
4452
4453 err:
4454 free(cg);
4455 free(memswlimit_str);
4456 free(memlimit_str);
4457 free(memusage_str);
4458 free(memswusage_str);
4459 return rv;
4460 }
4461 /*
4462 * Find the process pids belonging to a cgroup path,
4463 * e.g. by reading /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
4464 * @pid_buf : array that receives the pid strings.
4465 * @dpath : the cgroup path, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
4466 * @depth : how many directory levels below the container cgroup to descend.
4467 * @sum : number of pids collected so far; the new total is returned.
4468 * @cfd : the file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu.
4469 */
4470 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4471 {
4472 DIR *dir;
4473 int fd;
4474 struct dirent *file;
4475 FILE *f = NULL;
4476 size_t linelen = 0;
4477 char *line = NULL;
4478 int pd;
4479 char *path_dir, *path;
4480 char **pid;
4481
4482 /* path = dpath + "/cgroup.procs" + '\0' */
4483 do {
4484 path = malloc(strlen(dpath) + 20);
4485 } while (!path);
4486
4487 strcpy(path, dpath);
4488 fd = openat(cfd, path, O_RDONLY);
4489 if (fd < 0)
4490 goto out;
4491
4492 dir = fdopendir(fd);
4493 if (dir == NULL) {
4494 close(fd);
4495 goto out;
4496 }
4497
4498 while (((file = readdir(dir)) != NULL) && depth > 0) {
4499 if (strncmp(file->d_name, ".", 1) == 0)
4500 continue;
4501 if (strncmp(file->d_name, "..", 1) == 0)
4502 continue;
4503 if (file->d_type == DT_DIR) {
4504 /* path_dir = path + '/' + d_name + '\0' */
4505 do {
4506 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4507 } while (!path_dir);
4508 strcpy(path_dir, path);
4509 strcat(path_dir, "/");
4510 strcat(path_dir, file->d_name);
4511 pd = depth - 1;
4512 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4513 free(path_dir);
4514 }
4515 }
4516 closedir(dir);
4517
4518 strcat(path, "/cgroup.procs");
4519 fd = openat(cfd, path, O_RDONLY);
4520 if (fd < 0)
4521 goto out;
4522
4523 f = fdopen(fd, "r");
4524 if (!f) {
4525 close(fd);
4526 goto out;
4527 }
4528
4529 while (getline(&line, &linelen, f) != -1) {
4530 do {
4531 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4532 } while (!pid);
4533 *pid_buf = pid;
4534 do {
4535 *(*pid_buf + sum) = malloc(strlen(line) + 1);
4536 } while (*(*pid_buf + sum) == NULL);
4537 strcpy(*(*pid_buf + sum), line);
4538 sum++;
4539 }
4540 fclose(f);
4541 out:
4542 if (line)
4543 free(line);
4544 free(path);
4545 return sum;
4546 }
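/*
 * Usage sketch (this mirrors what refresh_load() below does): the caller
 * seeds a one-element array and owns every pid string calc_pid() appends.
 *
 *	char **pids = malloc(sizeof(char *));
 *	int n = calc_pid(&pids, "./docker/containerid", DEPTH_DIR, 0, cfd);
 *	for (int j = 0; j < n; j++)
 *		free(pids[j]);
 *	free(pids);
 *
 * "./docker/containerid" and cfd stand in for a cgroup path and a
 * mounted-cgroup dirfd.
 */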
4547 /*
4548 * calc_load computes the new load average according to the formula:
4549 * load1 = load0 * exp + active * (1 - exp)
4550 *
4551 * @load: the previous load average (load0); load1 is the return value.
4552 * @active: the number of currently runnable tasks.
4553 * @exp: the fixed-point decay constant defined at the top of this file
4554 * (EXP_1, EXP_5 or EXP_15).
4555 */
4556 static unsigned long
4557 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4558 {
4559 unsigned long newload;
4560
4561 active = active > 0 ? active * FIXED_1 : 0;
4562 newload = load * exp + active * (FIXED_1 - exp);
4563 if (active >= load)
4564 newload += FIXED_1 - 1;
4565
4566 return newload / FIXED_1;
4567 }
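/*
 * Worked example (first FLUSH_TIME tick, previous load 0, one running task):
 *   active  = 1 * FIXED_1                        = 2048
 *   newload = 0 * EXP_1 + 2048 * (FIXED_1 - EXP_1)
 *           = 2048 * 164                         = 335872
 *   newload += FIXED_1 - 1  (since active >= load) -> 337919
 *   return  337919 / FIXED_1                     = 164
 * which proc_loadavg_read() below renders as "0.08" once its FIXED_1/200
 * rounding term is added.
 */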
4568
4569 /*
4570 * Returns 0 when the container behind p->cg has gone away,
4571 * -1 when an error occurred during the refresh, and otherwise
4572 * the total number of pids found in the cgroup.
4573 */
4574 static int refresh_load(struct load_node *p, char *path)
4575 {
4576 FILE *f = NULL;
4577 char **idbuf;
4578 char proc_path[256];
4579 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4580 char *line = NULL;
4581 size_t linelen = 0;
4582 int sum, length;
4583 DIR *dp;
4584 struct dirent *file;
4585
4586 do {
4587 idbuf = malloc(sizeof(char *));
4588 } while (!idbuf);
4589 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4590 /* normal exit */
4591 if (sum == 0)
4592 goto out;
4593
4594 for (i = 0; i < sum; i++) {
4595 /* strip the trailing '\n' */
4596 length = strlen(idbuf[i])-1;
4597 idbuf[i][length] = '\0';
4598 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4599 if (ret < 0 || ret > 255) {
4600 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4601 i = sum;
4602 sum = -1;
4603 goto err_out;
4604 }
4605
4606 dp = opendir(proc_path);
4607 if (!dp) {
4608 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4609 continue;
4610 }
4611 while ((file = readdir(dp)) != NULL) {
4612 if (strncmp(file->d_name, ".", 1) == 0)
4613 continue;
4614 if (strncmp(file->d_name, "..", 1) == 0)
4615 continue;
4616 total_pid++;
4617 /* Track the largest pid seen as last_pid. */
4618 ret = atoi(file->d_name);
4619 last_pid = (ret > last_pid) ? ret : last_pid;
4620
4621 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4622 if (ret < 0 || ret > 255) {
4623 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4624 i = sum;
4625 sum = -1;
4626 closedir(dp);
4627 goto err_out;
4628 }
4629 f = fopen(proc_path, "r");
4630 if (f != NULL) {
4631 while (getline(&line, &linelen, f) != -1) {
4632 /* Find State */
4633 if ((line[0] == 'S') && (line[1] == 't'))
4634 break;
4635 }
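/* The line reads "State:\tR (running)", so line[7] holds the one-letter
 * state; count running (R) and uninterruptible (D) tasks as active. */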
4636 if ((line[7] == 'R') || (line[7] == 'D'))
4637 run_pid++;
4638 fclose(f);
4639 }
4640 }
4641 closedir(dp);
4642 }
4643 /* Calculate the loadavg. */
4644 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4645 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4646 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4647 p->run_pid = run_pid;
4648 p->total_pid = total_pid;
4649 p->last_pid = last_pid;
4650
4651 err_out:
4652 free(line);
4653 for (; i > 0; i--)
4654 free(idbuf[i-1]);
4655 out:
4656 free(idbuf);
4657 return sum;
4658 }
4659 /*
4660 * Traverse the hash table and update it.
4661 */
4662 void *load_begin(void *arg)
4663 {
4664
4665 char *path = NULL;
4666 int i, sum, length, ret;
4667 struct load_node *f;
4668 int first_node;
4669 clock_t time1, time2;
4670
4671 while (1) {
4672 if (loadavg_stop == 1)
4673 return NULL;
4674
4675 time1 = clock();
4676 for (i = 0; i < LOAD_SIZE; i++) {
4677 pthread_mutex_lock(&load_hash[i].lock);
4678 if (load_hash[i].next == NULL) {
4679 pthread_mutex_unlock(&load_hash[i].lock);
4680 continue;
4681 }
4682 f = load_hash[i].next;
4683 first_node = 1;
4684 while (f) {
4685 length = strlen(f->cg) + 2;
4686 do {
4687 /* strlen(f->cg) + '.' or '' + \0 */
4688 path = malloc(length);
4689 } while (!path);
4690
4691 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4692 if (ret < 0 || ret > length - 1) {
4693 /* snprintf failed, ignore the node.*/
4694 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4695 goto out;
4696 }
4697 sum = refresh_load(f, path);
4698 if (sum == 0) {
4699 f = del_node(f, i);
4700 } else {
4701 out: f = f->next;
4702 }
4703 free(path);
4704 /* The bucket lock only needs to be held while the first node is handled. */
4705 if (first_node == 1) {
4706 first_node = 0;
4707 pthread_mutex_unlock(&load_hash[i].lock);
4708 }
4709 }
4710 }
4711
4712 if (loadavg_stop == 1)
4713 return NULL;
4714
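/* Sleep for whatever is left of the FLUSH_TIME window, subtracting the
 * CPU time (clock()) just spent refreshing the hash table. */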
4715 time2 = clock();
4716 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4717 }
4718 }
4719
4720 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4721 struct fuse_file_info *fi)
4722 {
4723 struct fuse_context *fc = fuse_get_context();
4724 struct file_info *d = (struct file_info *)fi->fh;
4725 pid_t initpid;
4726 char *cg;
4727 size_t total_len = 0;
4728 char *cache = d->buf;
4729 struct load_node *n;
4730 int hash;
4731 int cfd, rv = 0;
4732 unsigned long a, b, c;
4733
4734 if (offset) {
4735 if (offset > d->size)
4736 return -EINVAL;
4737 if (!d->cached)
4738 return 0;
4739 int left = d->size - offset;
4740 total_len = left > size ? size : left;
4741 memcpy(buf, cache + offset, total_len);
4742 return total_len;
4743 }
4744 if (!loadavg)
4745 return read_file("/proc/loadavg", buf, size, d);
4746
4747 initpid = lookup_initpid_in_store(fc->pid);
4748 if (initpid <= 0)
4749 initpid = fc->pid;
4750 cg = get_pid_cgroup(initpid, "cpu");
4751 if (!cg)
4752 return read_file("/proc/loadavg", buf, size, d);
4753
4754 prune_init_slice(cg);
4755 hash = calc_hash(cg);
4756 n = locate_node(cg, hash);
4757
4758 /* First time */
4759 if (n == NULL) {
4760 if (!find_mounted_controller("cpu", &cfd)) {
4761 /*
4762 * locate_node() above returns with the bucket still read-locked so the
4763 * node cannot be deleted while in use; release the lock before bailing out.
4764 */
4765 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4766 rv = 0;
4767 goto err;
4768 }
4769 do {
4770 n = malloc(sizeof(struct load_node));
4771 } while (!n);
4772
4773 do {
4774 n->cg = malloc(strlen(cg)+1);
4775 } while (!n->cg);
4776 strcpy(n->cg, cg);
4777 n->avenrun[0] = 0;
4778 n->avenrun[1] = 0;
4779 n->avenrun[2] = 0;
4780 n->run_pid = 0;
4781 n->total_pid = 1;
4782 n->last_pid = initpid;
4783 n->cfd = cfd;
4784 insert_node(&n, hash);
4785 }
4786 a = n->avenrun[0] + (FIXED_1/200);
4787 b = n->avenrun[1] + (FIXED_1/200);
4788 c = n->avenrun[2] + (FIXED_1/200);
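/* As in the kernel's fs/proc/loadavg.c, FIXED_1/200 (0.005 in fixed point)
 * rounds the two printed decimals; the output looks like
 * "0.08 0.03 0.01 1/2 42". */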
4789 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4790 LOAD_INT(a), LOAD_FRAC(a),
4791 LOAD_INT(b), LOAD_FRAC(b),
4792 LOAD_INT(c), LOAD_FRAC(c),
4793 n->run_pid, n->total_pid, n->last_pid);
4794 pthread_rwlock_unlock(&load_hash[hash].rdlock);
4795 if (total_len < 0 || total_len >= d->buflen) {
4796 lxcfs_error("%s\n", "Failed to write to cache");
4797 rv = 0;
4798 goto err;
4799 }
4800 d->size = (int)total_len;
4801 d->cached = 1;
4802
4803 if (total_len > size)
4804 total_len = size;
4805 memcpy(buf, d->buf, total_len);
4806 rv = total_len;
4807
4808 err:
4809 free(cg);
4810 return rv;
4811 }
4812 /* Return a positive number on success, return 0 on failure.*/
4813 pthread_t load_daemon(int load_use)
4814 {
4815 int ret;
4816 pthread_t pid;
4817
4818 ret = init_load();
4819 if (ret == -1) {
4820 lxcfs_error("%s\n", "Failed to initialize hash table in load_daemon!");
4821 return 0;
4822 }
4823 ret = pthread_create(&pid, NULL, load_begin, NULL);
4824 if (ret != 0) {
4825 lxcfs_error("%s\n", "Failed to create thread in load_daemon!");
4826 load_free();
4827 return 0;
4828 }
4829 /* Enable the virtualized loadavg; load_use is expected to be 1. */
4830 loadavg = load_use;
4831 return pid;
4832 }
4833
4834 /* Returns 0 on success. */
4835 int stop_load_daemon(pthread_t pid)
4836 {
4837 int s;
4838
4839 /* Signal the thread to gracefully stop */
4840 loadavg_stop = 1;
4841
4842 s = pthread_join(pid, NULL); /* Wait for the worker thread to exit. */
4843 if (s != 0) {
4844 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4845 return -1;
4846 }
4847
4848 load_free();
4849 loadavg_stop = 0;
4850
4851 return 0;
4852 }
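/*
 * Usage sketch (hypothetical caller; the real wiring lives outside this
 * file): start the refresh thread when loadavg virtualization is requested
 * on the command line, and stop it again on shutdown.
 *
 *	pthread_t loadavg_tid = load_daemon(1);
 *	if (loadavg_tid == 0)
 *		... fall back to passing through the host /proc/loadavg ...
 *	...
 *	stop_load_daemon(loadavg_tid);
 */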
4853
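/* Sum the line lengths of the host's version of @which; proc_open() uses
 * this to size the per-file cache buffer. */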
4854 static off_t get_procfile_size(const char *which)
4855 {
4856 FILE *f = fopen(which, "r");
4857 char *line = NULL;
4858 size_t len = 0;
4859 ssize_t sz, answer = 0;
4860 if (!f)
4861 return 0;
4862
4863 while ((sz = getline(&line, &len, f)) != -1)
4864 answer += sz;
4865 fclose (f);
4866 free(line);
4867
4868 return answer;
4869 }
4870
4871 int proc_getattr(const char *path, struct stat *sb)
4872 {
4873 struct timespec now;
4874
4875 memset(sb, 0, sizeof(struct stat));
4876 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
4877 return -EINVAL;
4878 sb->st_uid = sb->st_gid = 0;
4879 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
4880 if (strcmp(path, "/proc") == 0) {
4881 sb->st_mode = S_IFDIR | 00555;
4882 sb->st_nlink = 2;
4883 return 0;
4884 }
4885 if (strcmp(path, "/proc/meminfo") == 0 ||
4886 strcmp(path, "/proc/cpuinfo") == 0 ||
4887 strcmp(path, "/proc/uptime") == 0 ||
4888 strcmp(path, "/proc/stat") == 0 ||
4889 strcmp(path, "/proc/diskstats") == 0 ||
4890 strcmp(path, "/proc/swaps") == 0 ||
4891 strcmp(path, "/proc/loadavg") == 0) {
4892 sb->st_size = 0;
4893 sb->st_mode = S_IFREG | 00444;
4894 sb->st_nlink = 1;
4895 return 0;
4896 }
4897
4898 return -ENOENT;
4899 }
4900
4901 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4902 struct fuse_file_info *fi)
4903 {
4904 if (filler(buf, ".", NULL, 0) != 0 ||
4905 filler(buf, "..", NULL, 0) != 0 ||
4906 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4907 filler(buf, "meminfo", NULL, 0) != 0 ||
4908 filler(buf, "stat", NULL, 0) != 0 ||
4909 filler(buf, "uptime", NULL, 0) != 0 ||
4910 filler(buf, "diskstats", NULL, 0) != 0 ||
4911 filler(buf, "swaps", NULL, 0) != 0 ||
4912 filler(buf, "loadavg", NULL, 0) != 0)
4913 return -EINVAL;
4914 return 0;
4915 }
4916
4917 int proc_open(const char *path, struct fuse_file_info *fi)
4918 {
4919 int type = -1;
4920 struct file_info *info;
4921
4922 if (strcmp(path, "/proc/meminfo") == 0)
4923 type = LXC_TYPE_PROC_MEMINFO;
4924 else if (strcmp(path, "/proc/cpuinfo") == 0)
4925 type = LXC_TYPE_PROC_CPUINFO;
4926 else if (strcmp(path, "/proc/uptime") == 0)
4927 type = LXC_TYPE_PROC_UPTIME;
4928 else if (strcmp(path, "/proc/stat") == 0)
4929 type = LXC_TYPE_PROC_STAT;
4930 else if (strcmp(path, "/proc/diskstats") == 0)
4931 type = LXC_TYPE_PROC_DISKSTATS;
4932 else if (strcmp(path, "/proc/swaps") == 0)
4933 type = LXC_TYPE_PROC_SWAPS;
4934 else if (strcmp(path, "/proc/loadavg") == 0)
4935 type = LXC_TYPE_PROC_LOADAVG;
4936 if (type == -1)
4937 return -ENOENT;
4938
4939 info = malloc(sizeof(*info));
4940 if (!info)
4941 return -ENOMEM;
4942
4943 memset(info, 0, sizeof(*info));
4944 info->type = type;
4945
4946 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4947 do {
4948 info->buf = malloc(info->buflen);
4949 } while (!info->buf);
4950 memset(info->buf, 0, info->buflen);
4951 /* set actual size to buffer size */
4952 info->size = info->buflen;
4953
4954 fi->fh = (unsigned long)info;
4955 return 0;
4956 }
4957
4958 int proc_access(const char *path, int mask)
4959 {
4960 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
4961 return 0;
4962
4963 /* these are all read-only */
4964 if ((mask & ~R_OK) != 0)
4965 return -EACCES;
4966 return 0;
4967 }
4968
4969 int proc_release(const char *path, struct fuse_file_info *fi)
4970 {
4971 do_release_file_info(fi);
4972 return 0;
4973 }
4974
4975 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4976 struct fuse_file_info *fi)
4977 {
4978 struct file_info *f = (struct file_info *) fi->fh;
4979
4980 switch (f->type) {
4981 case LXC_TYPE_PROC_MEMINFO:
4982 return proc_meminfo_read(buf, size, offset, fi);
4983 case LXC_TYPE_PROC_CPUINFO:
4984 return proc_cpuinfo_read(buf, size, offset, fi);
4985 case LXC_TYPE_PROC_UPTIME:
4986 return proc_uptime_read(buf, size, offset, fi);
4987 case LXC_TYPE_PROC_STAT:
4988 return proc_stat_read(buf, size, offset, fi);
4989 case LXC_TYPE_PROC_DISKSTATS:
4990 return proc_diskstats_read(buf, size, offset, fi);
4991 case LXC_TYPE_PROC_SWAPS:
4992 return proc_swaps_read(buf, size, offset, fi);
4993 case LXC_TYPE_PROC_LOADAVG:
4994 return proc_loadavg_read(buf, size, offset, fi);
4995 default:
4996 return -EINVAL;
4997 }
4998 }
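/*
 * Illustrative sketch only (the actual registration happens in lxcfs.c,
 * which also dispatches to the cgroup handlers): with FUSE 2.6 these
 * callbacks slot into a struct fuse_operations roughly as
 *
 *	const struct fuse_operations pfs_ops = {
 *		.getattr = proc_getattr,
 *		.readdir = proc_readdir,
 *		.open    = proc_open,
 *		.read    = proc_read,
 *		.access  = proc_access,
 *		.release = proc_release,
 *	};
 */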
4999
5000 /*
5001 * Functions needed to setup cgroups in the __constructor__.
5002 */
5003
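/*
 * Create @dir and any missing parents with @mode, walking the path one
 * component at a time; an already existing directory is not an error.
 */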
5004 static bool mkdir_p(const char *dir, mode_t mode)
5005 {
5006 const char *tmp = dir;
5007 const char *orig = dir;
5008 char *makeme;
5009
5010 do {
5011 dir = tmp + strspn(tmp, "/");
5012 tmp = dir + strcspn(dir, "/");
5013 makeme = strndup(orig, dir - orig);
5014 if (!makeme)
5015 return false;
5016 if (mkdir(makeme, mode) && errno != EEXIST) {
5017 lxcfs_error("Failed to create directory '%s': %s.\n",
5018 makeme, strerror(errno));
5019 free(makeme);
5020 return false;
5021 }
5022 free(makeme);
5023 } while(tmp != dir);
5024
5025 return true;
5026 }
5027
5028 static bool umount_if_mounted(void)
5029 {
5030 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5031 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5032 return false;
5033 }
5034 return true;
5035 }
5036
5037 /* __typeof__ should be safe to use with all compilers. */
5038 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5039 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5040 {
5041 return (fs->f_type == (fs_type_magic)magic_val);
5042 }
5043
5044 /*
5045 * looking at fs/proc_namespace.c, it appears we can
5046 * actually expect the rootfs entry to very specifically contain
5047 * " - rootfs rootfs "
5048 * IIUC, so long as we've chrooted so that rootfs is not our root,
5049 * the rootfs entry should always be skipped in mountinfo contents.
5050 */
5051 static bool is_on_ramfs(void)
5052 {
5053 FILE *f;
5054 char *p, *p2;
5055 char *line = NULL;
5056 size_t len = 0;
5057 int i;
5058
5059 f = fopen("/proc/self/mountinfo", "r");
5060 if (!f)
5061 return false;
5062
5063 while (getline(&line, &len, f) != -1) {
5064 for (p = line, i = 0; p && i < 4; i++)
5065 p = strchr(p + 1, ' ');
5066 if (!p)
5067 continue;
5068 p2 = strchr(p + 1, ' ');
5069 if (!p2)
5070 continue;
5071 *p2 = '\0';
5072 if (strcmp(p + 1, "/") == 0) {
5073 // this is '/'. is it the ramfs?
5074 p = strchr(p2 + 1, '-');
5075 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5076 free(line);
5077 fclose(f);
5078 return true;
5079 }
5080 }
5081 }
5082 free(line);
5083 fclose(f);
5084 return false;
5085 }
5086
5087 static int pivot_enter()
5088 {
5089 int ret = -1, oldroot = -1, newroot = -1;
5090
5091 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5092 if (oldroot < 0) {
5093 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5094 return ret;
5095 }
5096
5097 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5098 if (newroot < 0) {
5099 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5100 goto err;
5101 }
5102
5103 /* change into new root fs */
5104 if (fchdir(newroot) < 0) {
5105 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5106 goto err;
5107 }
5108
5109 /* pivot_root into our new root fs */
5110 if (pivot_root(".", ".") < 0) {
5111 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5112 goto err;
5113 }
5114
5115 /*
5116 * At this point the old-root is mounted on top of our new-root.
5117 * To unmount it we must not be chdir'd into it, so escape back
5118 * to the old-root.
5119 */
5120 if (fchdir(oldroot) < 0) {
5121 lxcfs_error("%s\n", "Failed to enter old root.");
5122 goto err;
5123 }
5124
5125 if (umount2(".", MNT_DETACH) < 0) {
5126 lxcfs_error("%s\n", "Failed to detach old root.");
5127 goto err;
5128 }
5129
5130 if (fchdir(newroot) < 0) {
5131 lxcfs_error("%s\n", "Failed to re-enter new root.");
5132 goto err;
5133 }
5134
5135 ret = 0;
5136
5137 err:
5138 if (oldroot >= 0)
5139 close(oldroot);
5140 if (newroot >= 0)
5141 close(newroot);
5142
5143 return ret;
5144 }
5145
5146 static int chroot_enter()
5147 {
5148 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5149 lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5150 return -1;
5151 }
5152
5153 if (chroot(".") < 0) {
5154 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5155 return -1;
5156 }
5157
5158 if (chdir("/") < 0) {
5159 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5160 return -1;
5161 }
5162
5163 return 0;
5164 }
5165
5166 static int permute_and_enter(void)
5167 {
5168 struct statfs sb;
5169
5170 if (statfs("/", &sb) < 0) {
5171 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5172 return -1;
5173 }
5174
5175 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5176 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5177 * /proc/1/mountinfo. */
5178 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5179 return chroot_enter();
5180
5181 if (pivot_enter() < 0) {
5182 lxcfs_error("%s\n", "Could not perform pivot root.");
5183 return -1;
5184 }
5185
5186 return 0;
5187 }
5188
5189 /* Prepare our new clean root. */
5190 static int permute_prepare(void)
5191 {
5192 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5193 lxcfs_error("%s\n", "Failed to create directory for new root.");
5194 return -1;
5195 }
5196
5197 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5198 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5199 return -1;
5200 }
5201
5202 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5203 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5204 return -1;
5205 }
5206
5207 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5208 lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5209 return -1;
5210 }
5211
5212 return 0;
5213 }
5214
5215 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
5216 static bool permute_root(void)
5217 {
5218 /* Prepare new root. */
5219 if (permute_prepare() < 0)
5220 return false;
5221
5222 /* Pivot into new root. */
5223 if (permute_and_enter() < 0)
5224 return false;
5225
5226 return true;
5227 }
5228
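/*
 * Open /proc/<pid>/ns/mnt so the caller can later setns() back into that
 * mount namespace; returns an O_CLOEXEC fd, or -1 on error.
 */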
5229 static int preserve_mnt_ns(int pid)
5230 {
5231 int ret;
5232 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
5233 char path[len];
5234
5235 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
5236 if (ret < 0 || (size_t)ret >= len)
5237 return -1;
5238
5239 return open(path, O_RDONLY | O_CLOEXEC);
5240 }
5241
5242 static bool cgfs_prepare_mounts(void)
5243 {
5244 if (!mkdir_p(BASEDIR, 0700)) {
5245 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5246 return false;
5247 }
5248
5249 if (!umount_if_mounted()) {
5250 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5251 return false;
5252 }
5253
5254 if (unshare(CLONE_NEWNS) < 0) {
5255 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5256 return false;
5257 }
5258
5259 cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5260 if (cgroup_mount_ns_fd < 0) {
5261 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5262 return false;
5263 }
5264
5265 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5266 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5267 return false;
5268 }
5269
5270 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5271 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5272 return false;
5273 }
5274
5275 return true;
5276 }
5277
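/*
 * Mount every detected hierarchy under BASEDIR: the v1 controllers as
 * "cgroup" with their controller name, the special "unified" entry as
 * cgroup2, and stash an O_DIRECTORY fd for each in fd_hierarchies.
 */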
5278 static bool cgfs_mount_hierarchies(void)
5279 {
5280 char *target;
5281 size_t clen, len;
5282 int i, ret;
5283
5284 for (i = 0; i < num_hierarchies; i++) {
5285 char *controller = hierarchies[i];
5286
5287 clen = strlen(controller);
5288 len = strlen(BASEDIR) + clen + 2;
5289 target = malloc(len);
5290 if (!target)
5291 return false;
5292
5293 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5294 if (ret < 0 || ret >= len) {
5295 free(target);
5296 return false;
5297 }
5298 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5299 free(target);
5300 return false;
5301 }
5302 if (!strcmp(controller, "unified"))
5303 ret = mount("none", target, "cgroup2", 0, NULL);
5304 else
5305 ret = mount(controller, target, "cgroup", 0, controller);
5306 if (ret < 0) {
5307 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5308 free(target);
5309 return false;
5310 }
5311
5312 fd_hierarchies[i] = open(target, O_DIRECTORY);
5313 if (fd_hierarchies[i] < 0) {
5314 free(target);
5315 return false;
5316 }
5317 free(target);
5318 }
5319 return true;
5320 }
5321
5322 static bool cgfs_setup_controllers(void)
5323 {
5324 if (!cgfs_prepare_mounts())
5325 return false;
5326
5327 if (!cgfs_mount_hierarchies()) {
5328 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
5329 return false;
5330 }
5331
5332 if (!permute_root())
5333 return false;
5334
5335 return true;
5336 }
5337
5338 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5339 {
5340 FILE *f;
5341 char *cret, *line = NULL;
5342 char cwd[MAXPATHLEN];
5343 size_t len = 0;
5344 int i, init_ns = -1;
5345 bool found_unified = false;
5346
5347 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5348 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5349 return;
5350 }
5351
5352 while (getline(&line, &len, f) != -1) {
5353 char *idx, *p, *p2;
5354
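/* Each line has the form "4:memory:/user.slice" (or "3:cpu,cpuacct:/...");
 * split out the controller field between the first and last colon. */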
5355 p = strchr(line, ':');
5356 if (!p)
5357 goto out;
5358 idx = line;
5359 *(p++) = '\0';
5360
5361 p2 = strrchr(p, ':');
5362 if (!p2)
5363 goto out;
5364 *p2 = '\0';
5365
5366 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5367 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5368 * because it parses out the empty string "" and later on passes
5369 * it to mount(). Let's skip such entries.
5370 */
5371 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5372 found_unified = true;
5373 p = "unified";
5374 }
5375
5376 if (!store_hierarchy(line, p))
5377 goto out;
5378 }
5379
5380 /* Preserve initial namespace. */
5381 init_ns = preserve_mnt_ns(getpid());
5382 if (init_ns < 0) {
5383 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5384 goto out;
5385 }
5386
5387 fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5388 if (!fd_hierarchies) {
5389 lxcfs_error("%s\n", strerror(errno));
5390 goto out;
5391 }
5392
5393 for (i = 0; i < num_hierarchies; i++)
5394 fd_hierarchies[i] = -1;
5395
5396 cret = getcwd(cwd, MAXPATHLEN);
5397 if (!cret)
5398 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5399
5400 /* This calls unshare(CLONE_NEWNS), leaving our initial mount namespace,
5401 * so the lxcfs cgroup hierarchies can be mounted privately. */
5402 if (!cgfs_setup_controllers()) {
5403 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5404 goto out;
5405 }
5406
5407 if (setns(init_ns, 0) < 0) {
5408 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5409 goto out;
5410 }
5411
5412 if (!cret || chdir(cwd) < 0)
5413 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5414
5415 print_subsystems();
5416
5417 out:
5418 free(line);
5419 fclose(f);
5420 if (init_ns >= 0)
5421 close(init_ns);
5422 }
5423
5424 static void __attribute__((destructor)) free_subsystems(void)
5425 {
5426 int i;
5427
5428 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5429
5430 for (i = 0; i < num_hierarchies; i++) {
5431 if (hierarchies[i])
5432 free(hierarchies[i]);
5433 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5434 close(fd_hierarchies[i]);
5435 }
5436 free(hierarchies);
5437 free(fd_hierarchies);
5438
5439 if (cgroup_mount_ns_fd >= 0)
5440 close(cgroup_mount_ns_fd);
5441 }