bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdbool.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <time.h>
  26 #include <unistd.h>
  27 #include <wait.h>
  28 #include <linux/magic.h>
  29 #include <linux/sched.h>
  30 #include <sys/epoll.h>
  31 #include <sys/mman.h>
  32 #include <sys/mount.h>
  33 #include <sys/param.h>
  34 #include <sys/socket.h>
  35 #include <sys/syscall.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/vfs.h>
  38
  39 #include "bindings.h"
  40 #include "config.h" // for VERSION
  41
  42 /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
  43 #define LXCFS_NUMSTRLEN64 21
  44
  45 /* Define pivot_root() if missing from the C library */
  46 #ifndef HAVE_PIVOT_ROOT
  47 static int pivot_root(const char * new_root, const char * put_old)
  48 {
  49 #ifdef __NR_pivot_root
  50 return syscall(__NR_pivot_root, new_root, put_old);
  51 #else
  52 errno = ENOSYS;
  53 return -1;
  54 #endif
  55 }
  56 #else
  57 extern int pivot_root(const char * new_root, const char * put_old);
  58 #endif
  59
  60 enum {
  61         LXC_TYPE_CGDIR,
  62         LXC_TYPE_CGFILE,
  63         LXC_TYPE_PROC_MEMINFO,
  64         LXC_TYPE_PROC_CPUINFO,
  65         LXC_TYPE_PROC_UPTIME,
  66         LXC_TYPE_PROC_STAT,
  67         LXC_TYPE_PROC_DISKSTATS,
  68         LXC_TYPE_PROC_SWAPS,
  69         LXC_TYPE_PROC_LOADAVG,
  70 };
  71
  72 struct file_info {
  73         char *controller;
  74         char *cgroup;
  75         char *file;
  76         int type;
  77         char *buf;  // unused as of yet
  78         int buflen;
  79         int size; //actual data size
  80         int cached;
  81 };
  82
  83 /* The function of hash table.*/
  84 #define LOAD_SIZE 100 /*the size of hash_table */
  85 #define FLUSH_TIME 5  /*the flush rate */
  86 #define DEPTH_DIR 3   /*the depth of per cgroup */
  87 /* The function of calculate loadavg .*/
  88 #define FSHIFT          11              /* nr of bits of precision */
  89 #define FIXED_1         (1<<FSHIFT)     /* 1.0 as fixed-point */
  90 #define EXP_1           1884            /* 1/exp(5sec/1min) as fixed-point */
  91 #define EXP_5           2014            /* 1/exp(5sec/5min) */
  92 #define EXP_15          2037            /* 1/exp(5sec/15min) */
  93 #define LOAD_INT(x) ((x) >> FSHIFT)
  94 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
  95 /*
  96  * This parameter is used for proc_loadavg_read().
  97  * 1 means use loadavg, 0 means not use.
  98  */
  99 static int loadavg = 0;
 100 static volatile sig_atomic_t loadavg_stop = 0;
 101 static int calc_hash(char *name)
 102 {
 103         unsigned int hash = 0;
 104         unsigned int x = 0;
 105         /* ELFHash algorithm. */
 106         while (*name) {
 107                 hash = (hash << 4) + *name++;
 108                 x = hash & 0xf0000000;
 109                 if (x != 0)
 110                         hash ^= (x >> 24);
 111                 hash &= ~x;
 112         }
 113         return ((hash & 0x7fffffff) % LOAD_SIZE);
 114 }
 115
 116 struct load_node {
 117         char *cg;  /*cg */
 118         unsigned long avenrun[3];               /* Load averages */
 119         unsigned int run_pid;
 120         unsigned int total_pid;
 121         unsigned int last_pid;
 122         int cfd; /* The file descriptor of the mounted cgroup */
 123         struct  load_node *next;
 124         struct  load_node **pre;
 125 };
 126
 127 struct load_head {
 128         /*
 129          * The lock is about insert load_node and refresh load_node.To the first
 130          * load_node of each hash bucket, insert and refresh in this hash bucket is
 131          * mutually exclusive.
 132          */
 133         pthread_mutex_t lock;
 134         /*
 135          * The rdlock is about read loadavg and delete load_node.To each hash
 136          * bucket, read and delete is mutually exclusive. But at the same time, we
 137          * allow paratactic read operation. This rdlock is at list level.
 138          */
 139         pthread_rwlock_t rdlock;
 140         /*
 141          * The rilock is about read loadavg and insert load_node.To the first
 142          * load_node of each hash bucket, read and insert is mutually exclusive.
 143          * But at the same time, we allow paratactic read operation.
 144          */
 145         pthread_rwlock_t rilock;
 146         struct load_node *next;
 147 };
 148
 149 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 150 /*
 151  * init_load initialize the hash table.
 152  * Return 0 on success, return -1 on failure.
 153  */
 154 static int init_load(void)
 155 {
 156         int i;
 157         int ret;
 158
 159         for (i = 0; i < LOAD_SIZE; i++) {
 160                 load_hash[i].next = NULL;
 161                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 162                 if (ret != 0) {
 163                         lxcfs_error("%s\n", "Failed to initialize lock");
 164                         goto out3;
 165                 }
 166                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 167                 if (ret != 0) {
 168                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 169                         goto out2;
 170                 }
 171                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 172                 if (ret != 0) {
 173                         lxcfs_error("%s\n", "Failed to initialize rilock");
 174                         goto out1;
 175                 }
 176         }
 177         return 0;
 178 out1:
 179         pthread_rwlock_destroy(&load_hash[i].rdlock);
 180 out2:
 181         pthread_mutex_destroy(&load_hash[i].lock);
 182 out3:
 183         while (i > 0) {
 184                 i--;
 185                 pthread_mutex_destroy(&load_hash[i].lock);
 186                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 187                 pthread_rwlock_destroy(&load_hash[i].rilock);
 188         }
 189         return -1;
 190 }
 191
 192 static void insert_node(struct load_node **n, int locate)
 193 {
 194         struct load_node *f;
 195
 196         pthread_mutex_lock(&load_hash[locate].lock);
 197         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 198         f = load_hash[locate].next;
 199         load_hash[locate].next = *n;
 200
 201         (*n)->pre = &(load_hash[locate].next);
 202         if (f)
 203                 f->pre = &((*n)->next);
 204         (*n)->next = f;
 205         pthread_mutex_unlock(&load_hash[locate].lock);
 206         pthread_rwlock_unlock(&load_hash[locate].rilock);
 207 }
 208 /*
 209  * locate_node() finds special node. Not return NULL means success.
 210  * It should be noted that rdlock isn't unlocked at the end of code
 211  * because this function is used to read special node. Delete is not
 212  * allowed before read has ended.
 213  * unlock rdlock only in proc_loadavg_read().
 214  */
 215 static struct load_node *locate_node(char *cg, int locate)
 216 {
 217         struct load_node *f = NULL;
 218         int i = 0;
 219
 220         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 221         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 222         if (load_hash[locate].next == NULL) {
 223                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 224                 return f;
 225         }
 226         f = load_hash[locate].next;
 227         pthread_rwlock_unlock(&load_hash[locate].rilock);
 228         while (f && ((i = strcmp(f->cg, cg)) != 0))
 229                 f = f->next;
 230         return f;
 231 }
 232 /* Delete the load_node n and return the next node of it. */
 233 static struct load_node *del_node(struct load_node *n, int locate)
 234 {
 235         struct load_node *g;
 236
 237         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 238         if (n->next == NULL) {
 239                 *(n->pre) = NULL;
 240         } else {
 241                 *(n->pre) = n->next;
 242                 n->next->pre = n->pre;
 243         }
 244         g = n->next;
 245         free(n->cg);
 246         free(n);
 247         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 248         return g;
 249 }
 250
 251 static void load_free(void)
 252 {
 253         int i;
 254         struct load_node *f, *p;
 255
 256         for (i = 0; i < LOAD_SIZE; i++) {
 257                 pthread_mutex_lock(&load_hash[i].lock);
 258                 pthread_rwlock_wrlock(&load_hash[i].rilock);
 259                 pthread_rwlock_wrlock(&load_hash[i].rdlock);
 260                 if (load_hash[i].next == NULL) {
 261                         pthread_mutex_unlock(&load_hash[i].lock);
 262                         pthread_mutex_destroy(&load_hash[i].lock);
 263                         pthread_rwlock_unlock(&load_hash[i].rilock);
 264                         pthread_rwlock_destroy(&load_hash[i].rilock);
 265                         pthread_rwlock_unlock(&load_hash[i].rdlock);
 266                         pthread_rwlock_destroy(&load_hash[i].rdlock);
 267                         continue;
 268                 }
 269                 for (f = load_hash[i].next; f; ) {
 270                         free(f->cg);
 271                         p = f->next;
 272                         free(f);
 273                         f = p;
 274                 }
 275                 pthread_mutex_unlock(&load_hash[i].lock);
 276                 pthread_mutex_destroy(&load_hash[i].lock);
 277                 pthread_rwlock_unlock(&load_hash[i].rilock);
 278                 pthread_rwlock_destroy(&load_hash[i].rilock);
 279                 pthread_rwlock_unlock(&load_hash[i].rdlock);
 280                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 281         }
 282 }
 283 /* Reserve buffer size to account for file size changes. */
 284 #define BUF_RESERVE_SIZE 512
 285
 286 /*
 287  * A table caching which pid is init for a pid namespace.
 288  * When looking up which pid is init for $qpid, we first
 289  * 1. Stat /proc/$qpid/ns/pid.
 290  * 2. Check whether the ino_t is in our store.
 291  *   a. if not, fork a child in qpid's ns to send us
 292  *       ucred.pid = 1, and read the initpid.  Cache
 293  *       initpid and creation time for /proc/initpid
 294  *       in a new store entry.
 295  *   b. if so, verify that /proc/initpid still matches
 296  *       what we have saved.  If not, clear the store
 297  *       entry and go back to a.  If so, return the
 298  *       cached initpid.
 299  */
 300 struct pidns_init_store {
 301         ino_t ino;          // inode number for /proc/$pid/ns/pid
 302         pid_t initpid;      // the pid of nit in that ns
 303         long int ctime;     // the time at which /proc/$initpid was created
 304         struct pidns_init_store *next;
 305         long int lastcheck;
 306 };
 307
 308 /* lol - look at how they are allocated in the kernel */
 309 #define PIDNS_HASH_SIZE 4096
 310 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 311
 312 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 313 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 314 static void lock_mutex(pthread_mutex_t *l)
 315 {
 316         int ret;
 317
 318         if ((ret = pthread_mutex_lock(l)) != 0) {
 319                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 320                 exit(1);
 321         }
 322 }
 323
 324 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 325  * Number of hierarchies mounted. */
 326 static int num_hierarchies;
 327
 328 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 329  * Hierachies mounted {cpuset, blkio, ...}:
 330  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 331 static char **hierarchies;
 332
 333 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 334  * Open file descriptors:
 335  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 336  * private mount namespace.
 337  * Initialized via __constructor__ collect_and_mount_subsystems().
 338  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 339  * mounts and respective files in the private namespace even when located in
 340  * another namespace using the *at() family of functions
 341  * {openat(), fchownat(), ...}. */
 342 static int *fd_hierarchies;
 343 static int cgroup_mount_ns_fd = -1;
 344
 345 static void unlock_mutex(pthread_mutex_t *l)
 346 {
 347         int ret;
 348
 349         if ((ret = pthread_mutex_unlock(l)) != 0) {
 350                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 351                 exit(1);
 352         }
 353 }
 354
 355 static void store_lock(void)
 356 {
 357         lock_mutex(&pidns_store_mutex);
 358 }
 359
 360 static void store_unlock(void)
 361 {
 362         unlock_mutex(&pidns_store_mutex);
 363 }
 364
 365 /* Must be called under store_lock */
 366 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 367 {
 368         struct stat initsb;
 369         char fnam[100];
 370
 371         snprintf(fnam, 100, "/proc/%d", e->initpid);
 372         if (stat(fnam, &initsb) < 0)
 373                 return false;
 374
 375         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 376                     initsb.st_ctime, e->initpid);
 377
 378         if (e->ctime != initsb.st_ctime)
 379                 return false;
 380         return true;
 381 }
 382
 383 /* Must be called under store_lock */
 384 static void remove_initpid(struct pidns_init_store *e)
 385 {
 386         struct pidns_init_store *tmp;
 387         int h;
 388
 389         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 390
 391         h = HASH(e->ino);
 392         if (pidns_hash_table[h] == e) {
 393                 pidns_hash_table[h] = e->next;
 394                 free(e);
 395                 return;
 396         }
 397
 398         tmp = pidns_hash_table[h];
 399         while (tmp) {
 400                 if (tmp->next == e) {
 401                         tmp->next = e->next;
 402                         free(e);
 403                         return;
 404                 }
 405                 tmp = tmp->next;
 406         }
 407 }
 408
 409 #define PURGE_SECS 5
 410 /* Must be called under store_lock */
 411 static void prune_initpid_store(void)
 412 {
 413         static long int last_prune = 0;
 414         struct pidns_init_store *e, *prev, *delme;
 415         long int now, threshold;
 416         int i;
 417
 418         if (!last_prune) {
 419                 last_prune = time(NULL);
 420                 return;
 421         }
 422         now = time(NULL);
 423         if (now < last_prune + PURGE_SECS)
 424                 return;
 425
 426         lxcfs_debug("%s\n", "Pruning.");
 427
 428         last_prune = now;
 429         threshold = now - 2 * PURGE_SECS;
 430
 431         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 432                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 433                         if (e->lastcheck < threshold) {
 434
 435                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 436
 437                                 delme = e;
 438                                 if (prev)
 439                                         prev->next = e->next;
 440                                 else
 441                                         pidns_hash_table[i] = e->next;
 442                                 e = e->next;
 443                                 free(delme);
 444                         } else {
 445                                 prev = e;
 446                                 e = e->next;
 447                         }
 448                 }
 449         }
 450 }
 451
 452 /* Must be called under store_lock */
 453 static void save_initpid(struct stat *sb, pid_t pid)
 454 {
 455         struct pidns_init_store *e;
 456         char fpath[100];
 457         struct stat procsb;
 458         int h;
 459
 460         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 461
 462         snprintf(fpath, 100, "/proc/%d", pid);
 463         if (stat(fpath, &procsb) < 0)
 464                 return;
 465         do {
 466                 e = malloc(sizeof(*e));
 467         } while (!e);
 468         e->ino = sb->st_ino;
 469         e->initpid = pid;
 470         e->ctime = procsb.st_ctime;
 471         h = HASH(e->ino);
 472         e->next = pidns_hash_table[h];
 473         e->lastcheck = time(NULL);
 474         pidns_hash_table[h] = e;
 475 }
 476
 477 /*
 478  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 479  * entry for the inode number and creation time.  Verify that the init pid
 480  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 481  * otherwise.
 482  * Must be called under store_lock
 483  */
 484 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 485 {
 486         int h = HASH(sb->st_ino);
 487         struct pidns_init_store *e = pidns_hash_table[h];
 488
 489         while (e) {
 490                 if (e->ino == sb->st_ino) {
 491                         if (initpid_still_valid(e, sb)) {
 492                                 e->lastcheck = time(NULL);
 493                                 return e;
 494                         }
 495                         remove_initpid(e);
 496                         return NULL;
 497                 }
 498                 e = e->next;
 499         }
 500
 501         return NULL;
 502 }
 503
 504 static int is_dir(const char *path, int fd)
 505 {
 506         struct stat statbuf;
 507         int ret = fstatat(fd, path, &statbuf, fd);
 508         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 509                 return 1;
 510         return 0;
 511 }
 512
 513 static char *must_copy_string(const char *str)
 514 {
 515         char *dup = NULL;
 516         if (!str)
 517                 return NULL;
 518         do {
 519                 dup = strdup(str);
 520         } while (!dup);
 521
 522         return dup;
 523 }
 524
 525 static inline void drop_trailing_newlines(char *s)
 526 {
 527         int l;
 528
 529         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 530                 s[l-1] = '\0';
 531 }
 532
 533 #define BATCH_SIZE 50
 534 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 535 {
 536         int newbatches = (newlen / BATCH_SIZE) + 1;
 537         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 538
 539         if (!*mem || newbatches > oldbatches) {
 540                 char *tmp;
 541                 do {
 542                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 543                 } while (!tmp);
 544                 *mem = tmp;
 545         }
 546 }
 547 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 548 {
 549         size_t newlen = *len + linelen;
 550         dorealloc(contents, *len, newlen + 1);
 551         memcpy(*contents + *len, line, linelen+1);
 552         *len = newlen;
 553 }
 554
 555 static char *slurp_file(const char *from, int fd)
 556 {
 557         char *line = NULL;
 558         char *contents = NULL;
 559         FILE *f = fdopen(fd, "r");
 560         size_t len = 0, fulllen = 0;
 561         ssize_t linelen;
 562
 563         if (!f)
 564                 return NULL;
 565
 566         while ((linelen = getline(&line, &len, f)) != -1) {
 567                 append_line(&contents, &fulllen, line, linelen);
 568         }
 569         fclose(f);
 570
 571         if (contents)
 572                 drop_trailing_newlines(contents);
 573         free(line);
 574         return contents;
 575 }
 576
 577 static bool write_string(const char *fnam, const char *string, int fd)
 578 {
 579         FILE *f;
 580         size_t len, ret;
 581
 582         if (!(f = fdopen(fd, "w")))
 583                 return false;
 584         len = strlen(string);
 585         ret = fwrite(string, 1, len, f);
 586         if (ret != len) {
 587                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 588                 fclose(f);
 589                 return false;
 590         }
 591         if (fclose(f) < 0) {
 592                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 593                 return false;
 594         }
 595         return true;
 596 }
 597
 598 struct cgfs_files {
 599         char *name;
 600         uint32_t uid, gid;
 601         uint32_t mode;
 602 };
 603
 604 #define ALLOC_NUM 20
 605 static bool store_hierarchy(char *stridx, char *h)
 606 {
 607         if (num_hierarchies % ALLOC_NUM == 0) {
 608                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 609                 n *= ALLOC_NUM;
 610                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 611                 if (!tmp) {
 612                         lxcfs_error("%s\n", strerror(errno));
 613                         exit(1);
 614                 }
 615                 hierarchies = tmp;
 616         }
 617
 618         hierarchies[num_hierarchies++] = must_copy_string(h);
 619         return true;
 620 }
 621
 622 static void print_subsystems(void)
 623 {
 624         int i;
 625
 626         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 627         fprintf(stderr, "hierarchies:\n");
 628         for (i = 0; i < num_hierarchies; i++) {
 629                 if (hierarchies[i])
 630                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 631                                 fd_hierarchies[i], hierarchies[i]);
 632         }
 633 }
 634
 635 static bool in_comma_list(const char *needle, const char *haystack)
 636 {
 637         const char *s = haystack, *e;
 638         size_t nlen = strlen(needle);
 639
 640         while (*s && (e = strchr(s, ','))) {
 641                 if (nlen != e - s) {
 642                         s = e + 1;
 643                         continue;
 644                 }
 645                 if (strncmp(needle, s, nlen) == 0)
 646                         return true;
 647                 s = e + 1;
 648         }
 649         if (strcmp(needle, s) == 0)
 650                 return true;
 651         return false;
 652 }
 653
 654 /* do we need to do any massaging here?  I'm not sure... */
 655 /* Return the mounted controller and store the corresponding open file descriptor
 656  * referring to the controller mountpoint in the private lxcfs namespace in
 657  * @cfd.
 658  */
 659 static char *find_mounted_controller(const char *controller, int *cfd)
 660 {
 661         int i;
 662
 663         for (i = 0; i < num_hierarchies; i++) {
 664                 if (!hierarchies[i])
 665                         continue;
 666                 if (strcmp(hierarchies[i], controller) == 0) {
 667                         *cfd = fd_hierarchies[i];
 668                         return hierarchies[i];
 669                 }
 670                 if (in_comma_list(controller, hierarchies[i])) {
 671                         *cfd = fd_hierarchies[i];
 672                         return hierarchies[i];
 673                 }
 674         }
 675
 676         return NULL;
 677 }
 678
 679 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 680                 const char *value)
 681 {
 682         int ret, fd, cfd;
 683         size_t len;
 684         char *fnam, *tmpc;
 685
 686         tmpc = find_mounted_controller(controller, &cfd);
 687         if (!tmpc)
 688                 return false;
 689
 690         /* Make sure we pass a relative path to *at() family of functions.
 691          * . + /cgroup + / + file + \0
 692          */
 693         len = strlen(cgroup) + strlen(file) + 3;
 694         fnam = alloca(len);
 695         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 696         if (ret < 0 || (size_t)ret >= len)
 697                 return false;
 698
 699         fd = openat(cfd, fnam, O_WRONLY);
 700         if (fd < 0)
 701                 return false;
 702
 703         return write_string(fnam, value, fd);
 704 }
 705
 706 // Chown all the files in the cgroup directory.  We do this when we create
 707 // a cgroup on behalf of a user.
 708 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 709 {
 710         struct dirent *direntp;
 711         char path[MAXPATHLEN];
 712         size_t len;
 713         DIR *d;
 714         int fd1, ret;
 715
 716         len = strlen(dirname);
 717         if (len >= MAXPATHLEN) {
 718                 lxcfs_error("Pathname too long: %s\n", dirname);
 719                 return;
 720         }
 721
 722         fd1 = openat(fd, dirname, O_DIRECTORY);
 723         if (fd1 < 0)
 724                 return;
 725
 726         d = fdopendir(fd1);
 727         if (!d) {
 728                 lxcfs_error("Failed to open %s\n", dirname);
 729                 return;
 730         }
 731
 732         while ((direntp = readdir(d))) {
 733                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 734                         continue;
 735                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 736                 if (ret < 0 || ret >= MAXPATHLEN) {
 737                         lxcfs_error("Pathname too long under %s\n", dirname);
 738                         continue;
 739                 }
 740                 if (fchownat(fd, path, uid, gid, 0) < 0)
 741                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 742         }
 743         closedir(d);
 744 }
 745
 746 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 747 {
 748         int cfd;
 749         size_t len;
 750         char *dirnam, *tmpc;
 751
 752         tmpc = find_mounted_controller(controller, &cfd);
 753         if (!tmpc)
 754                 return -EINVAL;
 755
 756         /* Make sure we pass a relative path to *at() family of functions.
 757          * . + /cg + \0
 758          */
 759         len = strlen(cg) + 2;
 760         dirnam = alloca(len);
 761         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 762
 763         if (mkdirat(cfd, dirnam, 0755) < 0)
 764                 return -errno;
 765
 766         if (uid == 0 && gid == 0)
 767                 return 0;
 768
 769         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 770                 return -errno;
 771
 772         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 773
 774         return 0;
 775 }
 776
 777 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 778 {
 779         struct dirent *direntp;
 780         DIR *dir;
 781         bool ret = false;
 782         char pathname[MAXPATHLEN];
 783         int dupfd;
 784
 785         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 786         if (dupfd < 0)
 787                 return false;
 788
 789         dir = fdopendir(dupfd);
 790         if (!dir) {
 791                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 792                 close(dupfd);
 793                 return false;
 794         }
 795
 796         while ((direntp = readdir(dir))) {
 797                 struct stat mystat;
 798                 int rc;
 799
 800                 if (!strcmp(direntp->d_name, ".") ||
 801                     !strcmp(direntp->d_name, ".."))
 802                         continue;
 803
 804                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 805                 if (rc < 0 || rc >= MAXPATHLEN) {
 806                         lxcfs_error("%s\n", "Pathname too long.");
 807                         continue;
 808                 }
 809
 810                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 811                 if (rc) {
 812                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 813                         continue;
 814                 }
 815                 if (S_ISDIR(mystat.st_mode))
 816                         if (!recursive_rmdir(pathname, fd, cfd))
 817                                 lxcfs_debug("Error removing %s.\n", pathname);
 818         }
 819
 820         ret = true;
 821         if (closedir(dir) < 0) {
 822                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 823                 ret = false;
 824         }
 825
 826         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 827                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 828                 ret = false;
 829         }
 830
 831         close(dupfd);
 832
 833         return ret;
 834 }
 835
 836 bool cgfs_remove(const char *controller, const char *cg)
 837 {
 838         int fd, cfd;
 839         size_t len;
 840         char *dirnam, *tmpc;
 841         bool bret;
 842
 843         tmpc = find_mounted_controller(controller, &cfd);
 844         if (!tmpc)
 845                 return false;
 846
 847         /* Make sure we pass a relative path to *at() family of functions.
 848          * . +  /cg + \0
 849          */
 850         len = strlen(cg) + 2;
 851         dirnam = alloca(len);
 852         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 853
 854         fd = openat(cfd, dirnam, O_DIRECTORY);
 855         if (fd < 0)
 856                 return false;
 857
 858         bret = recursive_rmdir(dirnam, fd, cfd);
 859         close(fd);
 860         return bret;
 861 }
 862
 863 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 864 {
 865         int cfd;
 866         size_t len;
 867         char *pathname, *tmpc;
 868
 869         tmpc = find_mounted_controller(controller, &cfd);
 870         if (!tmpc)
 871                 return false;
 872
 873         /* Make sure we pass a relative path to *at() family of functions.
 874          * . + /file + \0
 875          */
 876         len = strlen(file) + 2;
 877         pathname = alloca(len);
 878         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 879         if (fchmodat(cfd, pathname, mode, 0) < 0)
 880                 return false;
 881         return true;
 882 }
 883
 884 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 885 {
 886         size_t len;
 887         char *fname;
 888
 889         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 890         fname = alloca(len);
 891         snprintf(fname, len, "%s/tasks", dirname);
 892         if (fchownat(fd, fname, uid, gid, 0) != 0)
 893                 return -errno;
 894         snprintf(fname, len, "%s/cgroup.procs", dirname);
 895         if (fchownat(fd, fname, uid, gid, 0) != 0)
 896                 return -errno;
 897         return 0;
 898 }
 899
 900 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 901 {
 902         int cfd;
 903         size_t len;
 904         char *pathname, *tmpc;
 905
 906         tmpc = find_mounted_controller(controller, &cfd);
 907         if (!tmpc)
 908                 return -EINVAL;
 909
 910         /* Make sure we pass a relative path to *at() family of functions.
 911          * . + /file + \0
 912          */
 913         len = strlen(file) + 2;
 914         pathname = alloca(len);
 915         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 916         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 917                 return -errno;
 918
 919         if (is_dir(pathname, cfd))
 920                 // like cgmanager did, we want to chown the tasks file as well
 921                 return chown_tasks_files(pathname, uid, gid, cfd);
 922
 923         return 0;
 924 }
 925
 926 FILE *open_pids_file(const char *controller, const char *cgroup)
 927 {
 928         int fd, cfd;
 929         size_t len;
 930         char *pathname, *tmpc;
 931
 932         tmpc = find_mounted_controller(controller, &cfd);
 933         if (!tmpc)
 934                 return NULL;
 935
 936         /* Make sure we pass a relative path to *at() family of functions.
 937          * . + /cgroup + / "cgroup.procs" + \0
 938          */
 939         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 940         pathname = alloca(len);
 941         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 942
 943         fd = openat(cfd, pathname, O_WRONLY);
 944         if (fd < 0)
 945                 return NULL;
 946
 947         return fdopen(fd, "w");
 948 }
 949
 950 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 951                                 void ***list, size_t typesize,
 952                                 void* (*iterator)(const char*, const char*, const char*))
 953 {
 954         int cfd, fd, ret;
 955         size_t len;
 956         char *cg, *tmpc;
 957         char pathname[MAXPATHLEN];
 958         size_t sz = 0, asz = 0;
 959         struct dirent *dirent;
 960         DIR *dir;
 961
 962         tmpc = find_mounted_controller(controller, &cfd);
 963         *list = NULL;
 964         if (!tmpc)
 965                 return false;
 966
 967         /* Make sure we pass a relative path to *at() family of functions. */
 968         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 969         cg = alloca(len);
 970         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 971         if (ret < 0 || (size_t)ret >= len) {
 972                 lxcfs_error("Pathname too long under %s\n", cgroup);
 973                 return false;
 974         }
 975
 976         fd = openat(cfd, cg, O_DIRECTORY);
 977         if (fd < 0)
 978                 return false;
 979
 980         dir = fdopendir(fd);
 981         if (!dir)
 982                 return false;
 983
 984         while ((dirent = readdir(dir))) {
 985                 struct stat mystat;
 986
 987                 if (!strcmp(dirent->d_name, ".") ||
 988                     !strcmp(dirent->d_name, ".."))
 989                         continue;
 990
 991                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 992                 if (ret < 0 || ret >= MAXPATHLEN) {
 993                         lxcfs_error("Pathname too long under %s\n", cg);
 994                         continue;
 995                 }
 996
 997                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 998                 if (ret) {
 999                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1000                         continue;
1001                 }
1002                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1003                     (directories && !S_ISDIR(mystat.st_mode)))
1004                         continue;
1005
1006                 if (sz+2 >= asz) {
1007                         void **tmp;
1008                         asz += BATCH_SIZE;
1009                         do {
1010                                 tmp = realloc(*list, asz * typesize);
1011                         } while  (!tmp);
1012                         *list = tmp;
1013                 }
1014                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1015                 (*list)[sz+1] = NULL;
1016                 sz++;
1017         }
1018         if (closedir(dir) < 0) {
1019                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1020                 return false;
1021         }
1022         return true;
1023 }
1024
1025 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1026 {
1027         char *dup;
1028         do {
1029                 dup = strdup(dir_entry);
1030         } while (!dup);
1031         return dup;
1032 }
1033
1034 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1035 {
1036         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1037 }
1038
1039 void free_key(struct cgfs_files *k)
1040 {
1041         if (!k)
1042                 return;
1043         free(k->name);
1044         free(k);
1045 }
1046
1047 void free_keys(struct cgfs_files **keys)
1048 {
1049         int i;
1050
1051         if (!keys)
1052                 return;
1053         for (i = 0; keys[i]; i++) {
1054                 free_key(keys[i]);
1055         }
1056         free(keys);
1057 }
1058
1059 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
1060 {
1061         int ret, fd, cfd;
1062         size_t len;
1063         char *fnam, *tmpc;
1064
1065         tmpc = find_mounted_controller(controller, &cfd);
1066         if (!tmpc)
1067                 return false;
1068
1069         /* Make sure we pass a relative path to *at() family of functions.
1070          * . + /cgroup + / + file + \0
1071          */
1072         len = strlen(cgroup) + strlen(file) + 3;
1073         fnam = alloca(len);
1074         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
1075         if (ret < 0 || (size_t)ret >= len)
1076                 return false;
1077
1078         fd = openat(cfd, fnam, O_RDONLY);
1079         if (fd < 0)
1080                 return false;
1081
1082         *value = slurp_file(fnam, fd);
1083         return *value != NULL;
1084 }
1085
1086 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1087 {
1088         int ret, cfd;
1089         size_t len;
1090         char *fnam, *tmpc;
1091         struct stat sb;
1092         struct cgfs_files *newkey;
1093
1094         tmpc = find_mounted_controller(controller, &cfd);
1095         if (!tmpc)
1096                 return false;
1097
1098         if (file && *file == '/')
1099                 file++;
1100
1101         if (file && strchr(file, '/'))
1102                 return NULL;
1103
1104         /* Make sure we pass a relative path to *at() family of functions.
1105          * . + /cgroup + / + file + \0
1106          */
1107         len = strlen(cgroup) + 3;
1108         if (file)
1109                 len += strlen(file) + 1;
1110         fnam = alloca(len);
1111         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1112                  file ? "/" : "", file ? file : "");
1113
1114         ret = fstatat(cfd, fnam, &sb, 0);
1115         if (ret < 0)
1116                 return NULL;
1117
1118         do {
1119                 newkey = malloc(sizeof(struct cgfs_files));
1120         } while (!newkey);
1121         if (file)
1122                 newkey->name = must_copy_string(file);
1123         else if (strrchr(cgroup, '/'))
1124                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1125         else
1126                 newkey->name = must_copy_string(cgroup);
1127         newkey->uid = sb.st_uid;
1128         newkey->gid = sb.st_gid;
1129         newkey->mode = sb.st_mode;
1130
1131         return newkey;
1132 }
1133
1134 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1135 {
1136         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1137         if (!entry) {
1138                 lxcfs_error("Error getting files under %s:%s\n", controller,
1139                              cgroup);
1140         }
1141         return entry;
1142 }
1143
1144 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1145 {
1146         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1147 }
1148
1149 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1150 {
1151         int cfd;
1152         size_t len;
1153         char *fnam, *tmpc;
1154         int ret;
1155         struct stat sb;
1156
1157         tmpc = find_mounted_controller(controller, &cfd);
1158         if (!tmpc)
1159                 return false;
1160
1161         /* Make sure we pass a relative path to *at() family of functions.
1162          * . + /cgroup + / + f + \0
1163          */
1164         len = strlen(cgroup) + strlen(f) + 3;
1165         fnam = alloca(len);
1166         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
1167         if (ret < 0 || (size_t)ret >= len)
1168                 return false;
1169
1170         ret = fstatat(cfd, fnam, &sb, 0);
1171         if (ret < 0 || !S_ISDIR(sb.st_mode))
1172                 return false;
1173
1174         return true;
1175 }
1176
1177 #define SEND_CREDS_OK 0
1178 #define SEND_CREDS_NOTSK 1
1179 #define SEND_CREDS_FAIL 2
1180 static bool recv_creds(int sock, struct ucred *cred, char *v);
1181 static int wait_for_pid(pid_t pid);
1182 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1183 static int send_creds_clone_wrapper(void *arg);
1184
1185 /*
1186  * clone a task which switches to @task's namespace and writes '1'.
1187  * over a unix sock so we can read the task's reaper's pid in our
1188  * namespace
1189  *
1190  * Note: glibc's fork() does not respect pidns, which can lead to failed
1191  * assertions inside glibc (and thus failed forks) if the child's pid in
1192  * the pidns and the parent pid outside are identical. Using clone prevents
1193  * this issue.
1194  */
1195 static void write_task_init_pid_exit(int sock, pid_t target)
1196 {
1197         char fnam[100];
1198         pid_t pid;
1199         int fd, ret;
1200         size_t stack_size = sysconf(_SC_PAGESIZE);
1201         void *stack = alloca(stack_size);
1202
1203         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1204         if (ret < 0 || ret >= sizeof(fnam))
1205                 _exit(1);
1206
1207         fd = open(fnam, O_RDONLY);
1208         if (fd < 0) {
1209                 perror("write_task_init_pid_exit open of ns/pid");
1210                 _exit(1);
1211         }
1212         if (setns(fd, 0)) {
1213                 perror("write_task_init_pid_exit setns 1");
1214                 close(fd);
1215                 _exit(1);
1216         }
1217         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1218         if (pid < 0)
1219                 _exit(1);
1220         if (pid != 0) {
1221                 if (!wait_for_pid(pid))
1222                         _exit(1);
1223                 _exit(0);
1224         }
1225 }
1226
1227 static int send_creds_clone_wrapper(void *arg) {
1228         struct ucred cred;
1229         char v;
1230         int sock = *(int *)arg;
1231
1232         /* we are the child */
1233         cred.uid = 0;
1234         cred.gid = 0;
1235         cred.pid = 1;
1236         v = '1';
1237         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1238                 return 1;
1239         return 0;
1240 }
1241
1242 static pid_t get_init_pid_for_task(pid_t task)
1243 {
1244         int sock[2];
1245         pid_t pid;
1246         pid_t ret = -1;
1247         char v = '0';
1248         struct ucred cred;
1249
1250         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1251                 perror("socketpair");
1252                 return -1;
1253         }
1254
1255         pid = fork();
1256         if (pid < 0)
1257                 goto out;
1258         if (!pid) {
1259                 close(sock[1]);
1260                 write_task_init_pid_exit(sock[0], task);
1261                 _exit(0);
1262         }
1263
1264         if (!recv_creds(sock[1], &cred, &v))
1265                 goto out;
1266         ret = cred.pid;
1267
1268 out:
1269         close(sock[0]);
1270         close(sock[1]);
1271         if (pid > 0)
1272                 wait_for_pid(pid);
1273         return ret;
1274 }
1275
1276 static pid_t lookup_initpid_in_store(pid_t qpid)
1277 {
1278         pid_t answer = 0;
1279         struct stat sb;
1280         struct pidns_init_store *e;
1281         char fnam[100];
1282
1283         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1284         store_lock();
1285         if (stat(fnam, &sb) < 0)
1286                 goto out;
1287         e = lookup_verify_initpid(&sb);
1288         if (e) {
1289                 answer = e->initpid;
1290                 goto out;
1291         }
1292         answer = get_init_pid_for_task(qpid);
1293         if (answer > 0)
1294                 save_initpid(&sb, answer);
1295
1296 out:
1297         /* we prune at end in case we are returning
1298          * the value we were about to return */
1299         prune_initpid_store();
1300         store_unlock();
1301         return answer;
1302 }
1303
1304 static int wait_for_pid(pid_t pid)
1305 {
1306         int status, ret;
1307
1308         if (pid <= 0)
1309                 return -1;
1310
1311 again:
1312         ret = waitpid(pid, &status, 0);
1313         if (ret == -1) {
1314                 if (errno == EINTR)
1315                         goto again;
1316                 return -1;
1317         }
1318         if (ret != pid)
1319                 goto again;
1320         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1321                 return -1;
1322         return 0;
1323 }
1324
1325
1326 /*
1327  * append pid to *src.
1328  * src: a pointer to a char* in which ot append the pid.
1329  * sz: the number of characters printed so far, minus trailing \0.
1330  * asz: the allocated size so far
1331  * pid: the pid to append
1332  */
1333 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1334 {
1335         char tmp[30];
1336
1337         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1338
1339         if (!*src || tmplen + *sz + 1 >= *asz) {
1340                 char *tmp;
1341                 do {
1342                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1343                 } while (!tmp);
1344                 *src = tmp;
1345                 *asz += BUF_RESERVE_SIZE;
1346         }
1347         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1348         *sz += tmplen;
1349 }
1350
1351 /*
1352  * Given a open file * to /proc/pid/{u,g}id_map, and an id
1353  * valid in the caller's namespace, return the id mapped into
1354  * pid's namespace.
1355  * Returns the mapped id, or -1 on error.
1356  */
1357 unsigned int
1358 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1359 {
1360         unsigned int nsuid,   // base id for a range in the idfile's namespace
1361                      hostuid, // base id for a range in the caller's namespace
1362                      count;   // number of ids in this range
1363         char line[400];
1364         int ret;
1365
1366         fseek(idfile, 0L, SEEK_SET);
1367         while (fgets(line, 400, idfile)) {
1368                 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1369                 if (ret != 3)
1370                         continue;
1371                 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1372                         /*
1373                          * uids wrapped around - unexpected as this is a procfile,
1374                          * so just bail.
1375                          */
1376                         lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1377                                 nsuid, hostuid, count, line);
1378                         return -1;
1379                 }
1380                 if (hostuid <= in_id && hostuid+count > in_id) {
1381                         /*
1382                          * now since hostuid <= in_id < hostuid+count, and
1383                          * hostuid+count and nsuid+count do not wrap around,
1384                          * we know that nsuid+(in_id-hostuid) which must be
1385                          * less that nsuid+(count) must not wrap around
1386                          */
1387                         return (in_id - hostuid) + nsuid;
1388                 }
1389         }
1390
1391         // no answer found
1392         return -1;
1393 }
1394
1395 /*
1396  * for is_privileged_over,
1397  * specify whether we require the calling uid to be root in his
1398  * namespace
1399  */
1400 #define NS_ROOT_REQD true
1401 #define NS_ROOT_OPT false
1402
1403 #define PROCLEN 100
1404
1405 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1406 {
1407         char fpath[PROCLEN];
1408         int ret;
1409         bool answer = false;
1410         uid_t nsuid;
1411
1412         if (victim == -1 || uid == -1)
1413                 return false;
1414
1415         /*
1416          * If the request is one not requiring root in the namespace,
1417          * then having the same uid suffices.  (i.e. uid 1000 has write
1418          * access to files owned by uid 1000
1419          */
1420         if (!req_ns_root && uid == victim)
1421                 return true;
1422
1423         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1424         if (ret < 0 || ret >= PROCLEN)
1425                 return false;
1426         FILE *f = fopen(fpath, "r");
1427         if (!f)
1428                 return false;
1429
1430         /* if caller's not root in his namespace, reject */
1431         nsuid = convert_id_to_ns(f, uid);
1432         if (nsuid)
1433                 goto out;
1434
1435         /*
1436          * If victim is not mapped into caller's ns, reject.
1437          * XXX I'm not sure this check is needed given that fuse
1438          * will be sending requests where the vfs has converted
1439          */
1440         nsuid = convert_id_to_ns(f, victim);
1441         if (nsuid == -1)
1442                 goto out;
1443
1444         answer = true;
1445
1446 out:
1447         fclose(f);
1448         return answer;
1449 }
1450
1451 static bool perms_include(int fmode, mode_t req_mode)
1452 {
1453         mode_t r;
1454
1455         switch (req_mode & O_ACCMODE) {
1456         case O_RDONLY:
1457                 r = S_IROTH;
1458                 break;
1459         case O_WRONLY:
1460                 r = S_IWOTH;
1461                 break;
1462         case O_RDWR:
1463                 r = S_IROTH | S_IWOTH;
1464                 break;
1465         default:
1466                 return false;
1467         }
1468         return ((fmode & r) == r);
1469 }
1470
1471
1472 /*
1473  * taskcg is  a/b/c
1474  * querycg is /a/b/c/d/e
1475  * we return 'd'
1476  */
1477 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1478 {
1479         char *start, *end;
1480
1481         if (strlen(taskcg) <= strlen(querycg)) {
1482                 lxcfs_error("%s\n", "I was fed bad input.");
1483                 return NULL;
1484         }
1485
1486         if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1487                 start =  strdup(taskcg + 1);
1488         else
1489                 start = strdup(taskcg + strlen(querycg) + 1);
1490         if (!start)
1491                 return NULL;
1492         end = strchr(start, '/');
1493         if (end)
1494                 *end = '\0';
1495         return start;
1496 }
1497
1498 static void stripnewline(char *x)
1499 {
1500         size_t l = strlen(x);
1501         if (l && x[l-1] == '\n')
1502                 x[l-1] = '\0';
1503 }
1504
1505 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1506 {
1507         int cfd;
1508         char fnam[PROCLEN];
1509         FILE *f;
1510         char *answer = NULL;
1511         char *line = NULL;
1512         size_t len = 0;
1513         int ret;
1514         const char *h = find_mounted_controller(contrl, &cfd);
1515         if (!h)
1516                 return NULL;
1517
1518         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1519         if (ret < 0 || ret >= PROCLEN)
1520                 return NULL;
1521         if (!(f = fopen(fnam, "r")))
1522                 return NULL;
1523
1524         while (getline(&line, &len, f) != -1) {
1525                 char *c1, *c2;
1526                 if (!line[0])
1527                         continue;
1528                 c1 = strchr(line, ':');
1529                 if (!c1)
1530                         goto out;
1531                 c1++;
1532                 c2 = strchr(c1, ':');
1533                 if (!c2)
1534                         goto out;
1535                 *c2 = '\0';
1536                 if (strcmp(c1, h) != 0)
1537                         continue;
1538                 c2++;
1539                 stripnewline(c2);
1540                 do {
1541                         answer = strdup(c2);
1542                 } while (!answer);
1543                 break;
1544         }
1545
1546 out:
1547         fclose(f);
1548         free(line);
1549         return answer;
1550 }
1551
1552 /*
1553  * check whether a fuse context may access a cgroup dir or file
1554  *
1555  * If file is not null, it is a cgroup file to check under cg.
1556  * If file is null, then we are checking perms on cg itself.
1557  *
1558  * For files we can check the mode of the list_keys result.
1559  * For cgroups, we must make assumptions based on the files under the
1560  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1561  * yet.
1562  */
1563 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1564 {
1565         struct cgfs_files *k = NULL;
1566         bool ret = false;
1567
1568         k = cgfs_get_key(contrl, cg, file);
1569         if (!k)
1570                 return false;
1571
1572         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1573                 if (perms_include(k->mode >> 6, mode)) {
1574                         ret = true;
1575                         goto out;
1576                 }
1577         }
1578         if (fc->gid == k->gid) {
1579                 if (perms_include(k->mode >> 3, mode)) {
1580                         ret = true;
1581                         goto out;
1582                 }
1583         }
1584         ret = perms_include(k->mode, mode);
1585
1586 out:
1587         free_key(k);
1588         return ret;
1589 }
1590
1591 #define INITSCOPE "/init.scope"
1592 static void prune_init_slice(char *cg)
1593 {
1594         char *point;
1595         size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1596
1597         if (cg_len < initscope_len)
1598                 return;
1599
1600         point = cg + cg_len - initscope_len;
1601         if (strcmp(point, INITSCOPE) == 0) {
1602                 if (point == cg)
1603                         *(point+1) = '\0';
1604                 else
1605                         *point = '\0';
1606         }
1607 }
1608
1609 /*
1610  * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1611  * If pid is in /a, he may act on /a/b, but not on /b.
1612  * if the answer is false and nextcg is not NULL, then *nextcg will point
1613  * to a string containing the next cgroup directory under cg, which must be
1614  * freed by the caller.
1615  */
1616 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1617 {
1618         bool answer = false;
1619         char *c2 = get_pid_cgroup(pid, contrl);
1620         char *linecmp;
1621
1622         if (!c2)
1623                 return false;
1624         prune_init_slice(c2);
1625
1626         /*
1627          * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1628          * they pass in a cgroup without leading '/'
1629          *
1630          * The original line here was:
1631          *      linecmp = *cg == '/' ? c2 : c2+1;
1632          * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1633          *       Serge, do you know?
1634          */
1635         if (*cg == '/' || !strncmp(cg, "./", 2))
1636                 linecmp = c2;
1637         else
1638                 linecmp = c2 + 1;
1639         if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1640                 if (nextcg) {
1641                         *nextcg = get_next_cgroup_dir(linecmp, cg);
1642                 }
1643                 goto out;
1644         }
1645         answer = true;
1646
1647 out:
1648         free(c2);
1649         return answer;
1650 }
1651
1652 /*
1653  * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1654  */
1655 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1656 {
1657         bool answer = false;
1658         char *c2, *task_cg;
1659         size_t target_len, task_len;
1660
1661         if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1662                 return true;
1663
1664         c2 = get_pid_cgroup(pid, contrl);
1665         if (!c2)
1666                 return false;
1667         prune_init_slice(c2);
1668
1669         task_cg = c2 + 1;
1670         target_len = strlen(cg);
1671         task_len = strlen(task_cg);
1672         if (task_len == 0) {
1673                 /* Task is in the root cg, it can see everything. This case is
1674                  * not handled by the strmcps below, since they test for the
1675                  * last /, but that is the first / that we've chopped off
1676                  * above.
1677                  */
1678                 answer = true;
1679                 goto out;
1680         }
1681         if (strcmp(cg, task_cg) == 0) {
1682                 answer = true;
1683                 goto out;
1684         }
1685         if (target_len < task_len) {
1686                 /* looking up a parent dir */
1687                 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1688                         answer = true;
1689                 goto out;
1690         }
1691         if (target_len > task_len) {
1692                 /* looking up a child dir */
1693                 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1694                         answer = true;
1695                 goto out;
1696         }
1697
1698 out:
1699         free(c2);
1700         return answer;
1701 }
1702
1703 /*
1704  * given /cgroup/freezer/a/b, return "freezer".
1705  * the returned char* should NOT be freed.
1706  */
1707 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1708 {
1709         const char *p1;
1710         char *contr, *slash;
1711
1712         if (strlen(path) < 9) {
1713                 errno = EACCES;
1714                 return NULL;
1715         }
1716         if (*(path + 7) != '/') {
1717                 errno = EINVAL;
1718                 return NULL;
1719         }
1720         p1 = path + 8;
1721         contr = strdupa(p1);
1722         if (!contr) {
1723                 errno = ENOMEM;
1724                 return NULL;
1725         }
1726         slash = strstr(contr, "/");
1727         if (slash)
1728                 *slash = '\0';
1729
1730         int i;
1731         for (i = 0; i < num_hierarchies; i++) {
1732                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1733                         return hierarchies[i];
1734         }
1735         errno = ENOENT;
1736         return NULL;
1737 }
1738
1739 /*
1740  * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1741  * Note that the returned value may include files (keynames) etc
1742  */
1743 static const char *find_cgroup_in_path(const char *path)
1744 {
1745         const char *p1;
1746
1747         if (strlen(path) < 9) {
1748                 errno = EACCES;
1749                 return NULL;
1750         }
1751         p1 = strstr(path + 8, "/");
1752         if (!p1) {
1753                 errno = EINVAL;
1754                 return NULL;
1755         }
1756         errno = 0;
1757         return p1 + 1;
1758 }
1759
1760 /*
1761  * split the last path element from the path in @cg.
1762  * @dir is newly allocated and should be freed, @last not
1763 */
1764 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1765 {
1766         char *p;
1767
1768         do {
1769                 *dir = strdup(cg);
1770         } while (!*dir);
1771         *last = strrchr(cg, '/');
1772         if (!*last) {
1773                 *last = NULL;
1774                 return;
1775         }
1776         p = strrchr(*dir, '/');
1777         *p = '\0';
1778 }
1779
1780 /*
1781  * FUSE ops for /cgroup
1782  */
1783
1784 int cg_getattr(const char *path, struct stat *sb)
1785 {
1786         struct timespec now;
1787         struct fuse_context *fc = fuse_get_context();
1788         char * cgdir = NULL;
1789         char *last = NULL, *path1, *path2;
1790         struct cgfs_files *k = NULL;
1791         const char *cgroup;
1792         const char *controller = NULL;
1793         int ret = -ENOENT;
1794
1795
1796         if (!fc)
1797                 return -EIO;
1798
1799         memset(sb, 0, sizeof(struct stat));
1800
1801         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1802                 return -EINVAL;
1803
1804         sb->st_uid = sb->st_gid = 0;
1805         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1806         sb->st_size = 0;
1807
1808         if (strcmp(path, "/cgroup") == 0) {
1809                 sb->st_mode = S_IFDIR | 00755;
1810                 sb->st_nlink = 2;
1811                 return 0;
1812         }
1813
1814         controller = pick_controller_from_path(fc, path);
1815         if (!controller)
1816                 return -errno;
1817         cgroup = find_cgroup_in_path(path);
1818         if (!cgroup) {
1819                 /* this is just /cgroup/controller, return it as a dir */
1820                 sb->st_mode = S_IFDIR | 00755;
1821                 sb->st_nlink = 2;
1822                 return 0;
1823         }
1824
1825         get_cgdir_and_path(cgroup, &cgdir, &last);
1826
1827         if (!last) {
1828                 path1 = "/";
1829                 path2 = cgdir;
1830         } else {
1831                 path1 = cgdir;
1832                 path2 = last;
1833         }
1834
1835         pid_t initpid = lookup_initpid_in_store(fc->pid);
1836         if (initpid <= 0)
1837                 initpid = fc->pid;
1838         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1839          * Then check that caller's cgroup is under path if last is a child
1840          * cgroup, or cgdir if last is a file */
1841
1842         if (is_child_cgroup(controller, path1, path2)) {
1843                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1844                         ret = -ENOENT;
1845                         goto out;
1846                 }
1847                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1848                         /* this is just /cgroup/controller, return it as a dir */
1849                         sb->st_mode = S_IFDIR | 00555;
1850                         sb->st_nlink = 2;
1851                         ret = 0;
1852                         goto out;
1853                 }
1854                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1855                         ret = -EACCES;
1856                         goto out;
1857                 }
1858
1859                 // get uid, gid, from '/tasks' file and make up a mode
1860                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1861                 sb->st_mode = S_IFDIR | 00755;
1862                 k = cgfs_get_key(controller, cgroup, NULL);
1863                 if (!k) {
1864                         sb->st_uid = sb->st_gid = 0;
1865                 } else {
1866                         sb->st_uid = k->uid;
1867                         sb->st_gid = k->gid;
1868                 }
1869                 free_key(k);
1870                 sb->st_nlink = 2;
1871                 ret = 0;
1872                 goto out;
1873         }
1874
1875         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1876                 sb->st_mode = S_IFREG | k->mode;
1877                 sb->st_nlink = 1;
1878                 sb->st_uid = k->uid;
1879                 sb->st_gid = k->gid;
1880                 sb->st_size = 0;
1881                 free_key(k);
1882                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1883                         ret = -ENOENT;
1884                         goto out;
1885                 }
1886                 ret = 0;
1887         }
1888
1889 out:
1890         free(cgdir);
1891         return ret;
1892 }
1893
1894 int cg_opendir(const char *path, struct fuse_file_info *fi)
1895 {
1896         struct fuse_context *fc = fuse_get_context();
1897         const char *cgroup;
1898         struct file_info *dir_info;
1899         char *controller = NULL;
1900
1901         if (!fc)
1902                 return -EIO;
1903
1904         if (strcmp(path, "/cgroup") == 0) {
1905                 cgroup = NULL;
1906                 controller = NULL;
1907         } else {
1908                 // return list of keys for the controller, and list of child cgroups
1909                 controller = pick_controller_from_path(fc, path);
1910                 if (!controller)
1911                         return -errno;
1912
1913                 cgroup = find_cgroup_in_path(path);
1914                 if (!cgroup) {
1915                         /* this is just /cgroup/controller, return its contents */
1916                         cgroup = "/";
1917                 }
1918         }
1919
1920         pid_t initpid = lookup_initpid_in_store(fc->pid);
1921         if (initpid <= 0)
1922                 initpid = fc->pid;
1923         if (cgroup) {
1924                 if (!caller_may_see_dir(initpid, controller, cgroup))
1925                         return -ENOENT;
1926                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1927                         return -EACCES;
1928         }
1929
1930         /* we'll free this at cg_releasedir */
1931         dir_info = malloc(sizeof(*dir_info));
1932         if (!dir_info)
1933                 return -ENOMEM;
1934         dir_info->controller = must_copy_string(controller);
1935         dir_info->cgroup = must_copy_string(cgroup);
1936         dir_info->type = LXC_TYPE_CGDIR;
1937         dir_info->buf = NULL;
1938         dir_info->file = NULL;
1939         dir_info->buflen = 0;
1940
1941         fi->fh = (unsigned long)dir_info;
1942         return 0;
1943 }
1944
1945 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1946                 struct fuse_file_info *fi)
1947 {
1948         struct file_info *d = (struct file_info *)fi->fh;
1949         struct cgfs_files **list = NULL;
1950         int i, ret;
1951         char *nextcg = NULL;
1952         struct fuse_context *fc = fuse_get_context();
1953         char **clist = NULL;
1954
1955         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1956                 return -EIO;
1957
1958         if (d->type != LXC_TYPE_CGDIR) {
1959                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1960                 return -EIO;
1961         }
1962         if (!d->cgroup && !d->controller) {
1963                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1964                 int i;
1965
1966                 for (i = 0;  i < num_hierarchies; i++) {
1967                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1968                                 return -EIO;
1969                         }
1970                 }
1971                 return 0;
1972         }
1973
1974         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1975                 // not a valid cgroup
1976                 ret = -EINVAL;
1977                 goto out;
1978         }
1979
1980         pid_t initpid = lookup_initpid_in_store(fc->pid);
1981         if (initpid <= 0)
1982                 initpid = fc->pid;
1983         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1984                 if (nextcg) {
1985                         ret = filler(buf, nextcg,  NULL, 0);
1986                         free(nextcg);
1987                         if (ret != 0) {
1988                                 ret = -EIO;
1989                                 goto out;
1990                         }
1991                 }
1992                 ret = 0;
1993                 goto out;
1994         }
1995
1996         for (i = 0; list[i]; i++) {
1997                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1998                         ret = -EIO;
1999                         goto out;
2000                 }
2001         }
2002
2003         // now get the list of child cgroups
2004
2005         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2006                 ret = 0;
2007                 goto out;
2008         }
2009         if (clist) {
2010                 for (i = 0; clist[i]; i++) {
2011                         if (filler(buf, clist[i], NULL, 0) != 0) {
2012                                 ret = -EIO;
2013                                 goto out;
2014                         }
2015                 }
2016         }
2017         ret = 0;
2018
2019 out:
2020         free_keys(list);
2021         if (clist) {
2022                 for (i = 0; clist[i]; i++)
2023                         free(clist[i]);
2024                 free(clist);
2025         }
2026         return ret;
2027 }
2028
2029 static void do_release_file_info(struct fuse_file_info *fi)
2030 {
2031         struct file_info *f = (struct file_info *)fi->fh;
2032
2033         if (!f)
2034                 return;
2035
2036         fi->fh = 0;
2037
2038         free(f->controller);
2039         f->controller = NULL;
2040         free(f->cgroup);
2041         f->cgroup = NULL;
2042         free(f->file);
2043         f->file = NULL;
2044         free(f->buf);
2045         f->buf = NULL;
2046         free(f);
2047 }
2048
2049 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2050 {
2051         do_release_file_info(fi);
2052         return 0;
2053 }
2054
2055 int cg_open(const char *path, struct fuse_file_info *fi)
2056 {
2057         const char *cgroup;
2058         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2059         struct cgfs_files *k = NULL;
2060         struct file_info *file_info;
2061         struct fuse_context *fc = fuse_get_context();
2062         int ret;
2063
2064         if (!fc)
2065                 return -EIO;
2066
2067         controller = pick_controller_from_path(fc, path);
2068         if (!controller)
2069                 return -errno;
2070         cgroup = find_cgroup_in_path(path);
2071         if (!cgroup)
2072                 return -errno;
2073
2074         get_cgdir_and_path(cgroup, &cgdir, &last);
2075         if (!last) {
2076                 path1 = "/";
2077                 path2 = cgdir;
2078         } else {
2079                 path1 = cgdir;
2080                 path2 = last;
2081         }
2082
2083         k = cgfs_get_key(controller, path1, path2);
2084         if (!k) {
2085                 ret = -EINVAL;
2086                 goto out;
2087         }
2088         free_key(k);
2089
2090         pid_t initpid = lookup_initpid_in_store(fc->pid);
2091         if (initpid <= 0)
2092                 initpid = fc->pid;
2093         if (!caller_may_see_dir(initpid, controller, path1)) {
2094                 ret = -ENOENT;
2095                 goto out;
2096         }
2097         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2098                 ret = -EACCES;
2099                 goto out;
2100         }
2101
2102         /* we'll free this at cg_release */
2103         file_info = malloc(sizeof(*file_info));
2104         if (!file_info) {
2105                 ret = -ENOMEM;
2106                 goto out;
2107         }
2108         file_info->controller = must_copy_string(controller);
2109         file_info->cgroup = must_copy_string(path1);
2110         file_info->file = must_copy_string(path2);
2111         file_info->type = LXC_TYPE_CGFILE;
2112         file_info->buf = NULL;
2113         file_info->buflen = 0;
2114
2115         fi->fh = (unsigned long)file_info;
2116         ret = 0;
2117
2118 out:
2119         free(cgdir);
2120         return ret;
2121 }
2122
2123 int cg_access(const char *path, int mode)
2124 {
2125         int ret;
2126         const char *cgroup;
2127         char *path1, *path2, *controller;
2128         char *last = NULL, *cgdir = NULL;
2129         struct cgfs_files *k = NULL;
2130         struct fuse_context *fc = fuse_get_context();
2131
2132         if (strcmp(path, "/cgroup") == 0)
2133                 return 0;
2134
2135         if (!fc)
2136                 return -EIO;
2137
2138         controller = pick_controller_from_path(fc, path);
2139         if (!controller)
2140                 return -errno;
2141         cgroup = find_cgroup_in_path(path);
2142         if (!cgroup) {
2143                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2144                 if ((mode & W_OK) == 0)
2145                         return 0;
2146                 return -EACCES;
2147         }
2148
2149         get_cgdir_and_path(cgroup, &cgdir, &last);
2150         if (!last) {
2151                 path1 = "/";
2152                 path2 = cgdir;
2153         } else {
2154                 path1 = cgdir;
2155                 path2 = last;
2156         }
2157
2158         k = cgfs_get_key(controller, path1, path2);
2159         if (!k) {
2160                 if ((mode & W_OK) == 0)
2161                         ret = 0;
2162                 else
2163                         ret = -EACCES;
2164                 goto out;
2165         }
2166         free_key(k);
2167
2168         pid_t initpid = lookup_initpid_in_store(fc->pid);
2169         if (initpid <= 0)
2170                 initpid = fc->pid;
2171         if (!caller_may_see_dir(initpid, controller, path1)) {
2172                 ret = -ENOENT;
2173                 goto out;
2174         }
2175         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2176                 ret = -EACCES;
2177                 goto out;
2178         }
2179
2180         ret = 0;
2181
2182 out:
2183         free(cgdir);
2184         return ret;
2185 }
2186
2187 int cg_release(const char *path, struct fuse_file_info *fi)
2188 {
2189         do_release_file_info(fi);
2190         return 0;
2191 }
2192
2193 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2194
2195 static bool wait_for_sock(int sock, int timeout)
2196 {
2197         struct epoll_event ev;
2198         int epfd, ret, now, starttime, deltatime, saved_errno;
2199
2200         if ((starttime = time(NULL)) < 0)
2201                 return false;
2202
2203         if ((epfd = epoll_create(1)) < 0) {
2204                 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2205                 return false;
2206         }
2207
2208         ev.events = POLLIN_SET;
2209         ev.data.fd = sock;
2210         if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2211                 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2212                 close(epfd);
2213                 return false;
2214         }
2215
2216 again:
2217         if ((now = time(NULL)) < 0) {
2218                 close(epfd);
2219                 return false;
2220         }
2221
2222         deltatime = (starttime + timeout) - now;
2223         if (deltatime < 0) { // timeout
2224                 errno = 0;
2225                 close(epfd);
2226                 return false;
2227         }
2228         ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2229         if (ret < 0 && errno == EINTR)
2230                 goto again;
2231         saved_errno = errno;
2232         close(epfd);
2233
2234         if (ret <= 0) {
2235                 errno = saved_errno;
2236                 return false;
2237         }
2238         return true;
2239 }
2240
2241 static int msgrecv(int sockfd, void *buf, size_t len)
2242 {
2243         if (!wait_for_sock(sockfd, 2))
2244                 return -1;
2245         return recv(sockfd, buf, len, MSG_DONTWAIT);
2246 }
2247
2248 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2249 {
2250         struct msghdr msg = { 0 };
2251         struct iovec iov;
2252         struct cmsghdr *cmsg;
2253         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2254         char buf[1];
2255         buf[0] = 'p';
2256
2257         if (pingfirst) {
2258                 if (msgrecv(sock, buf, 1) != 1) {
2259                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2260                         return SEND_CREDS_FAIL;
2261                 }
2262         }
2263
2264         msg.msg_control = cmsgbuf;
2265         msg.msg_controllen = sizeof(cmsgbuf);
2266
2267         cmsg = CMSG_FIRSTHDR(&msg);
2268         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2269         cmsg->cmsg_level = SOL_SOCKET;
2270         cmsg->cmsg_type = SCM_CREDENTIALS;
2271         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2272
2273         msg.msg_name = NULL;
2274         msg.msg_namelen = 0;
2275
2276         buf[0] = v;
2277         iov.iov_base = buf;
2278         iov.iov_len = sizeof(buf);
2279         msg.msg_iov = &iov;
2280         msg.msg_iovlen = 1;
2281
2282         if (sendmsg(sock, &msg, 0) < 0) {
2283                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2284                 if (errno == 3)
2285                         return SEND_CREDS_NOTSK;
2286                 return SEND_CREDS_FAIL;
2287         }
2288
2289         return SEND_CREDS_OK;
2290 }
2291
2292 static bool recv_creds(int sock, struct ucred *cred, char *v)
2293 {
2294         struct msghdr msg = { 0 };
2295         struct iovec iov;
2296         struct cmsghdr *cmsg;
2297         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2298         char buf[1];
2299         int ret;
2300         int optval = 1;
2301
2302         *v = '1';
2303
2304         cred->pid = -1;
2305         cred->uid = -1;
2306         cred->gid = -1;
2307
2308         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2309                 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2310                 return false;
2311         }
2312         buf[0] = '1';
2313         if (write(sock, buf, 1) != 1) {
2314                 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2315                 return false;
2316         }
2317
2318         msg.msg_name = NULL;
2319         msg.msg_namelen = 0;
2320         msg.msg_control = cmsgbuf;
2321         msg.msg_controllen = sizeof(cmsgbuf);
2322
2323         iov.iov_base = buf;
2324         iov.iov_len = sizeof(buf);
2325         msg.msg_iov = &iov;
2326         msg.msg_iovlen = 1;
2327
2328         if (!wait_for_sock(sock, 2)) {
2329                 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2330                 return false;
2331         }
2332         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2333         if (ret < 0) {
2334                 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2335                 return false;
2336         }
2337
2338         cmsg = CMSG_FIRSTHDR(&msg);
2339
2340         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2341                         cmsg->cmsg_level == SOL_SOCKET &&
2342                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2343                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2344         }
2345         *v = buf[0];
2346
2347         return true;
2348 }
2349
2350 struct pid_ns_clone_args {
2351         int *cpipe;
2352         int sock;
2353         pid_t tpid;
2354         int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2355 };
2356
2357 /*
2358  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2359  * with clone(). This simply writes '1' as ACK back to the parent
2360  * before calling the actual wrapped function.
2361  */
2362 static int pid_ns_clone_wrapper(void *arg) {
2363         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2364         char b = '1';
2365
2366         close(args->cpipe[0]);
2367         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2368                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2369         close(args->cpipe[1]);
2370         return args->wrapped(args->sock, args->tpid);
2371 }
2372
2373 /*
2374  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2375  * int value back over the socket.  This shifts the pid from the
2376  * sender's pidns into tpid's pidns.
2377  */
2378 static int pid_to_ns(int sock, pid_t tpid)
2379 {
2380         char v = '0';
2381         struct ucred cred;
2382
2383         while (recv_creds(sock, &cred, &v)) {
2384                 if (v == '1')
2385                         return 0;
2386                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2387                         return 1;
2388         }
2389         return 0;
2390 }
2391
2392
2393 /*
2394  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2395  * in your old pidns.  Only children which you clone will be in the target
2396  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2397  * actually convert pids.
2398  *
2399  * Note: glibc's fork() does not respect pidns, which can lead to failed
2400  * assertions inside glibc (and thus failed forks) if the child's pid in
2401  * the pidns and the parent pid outside are identical. Using clone prevents
2402  * this issue.
2403  */
2404 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2405 {
2406         int newnsfd = -1, ret, cpipe[2];
2407         char fnam[100];
2408         pid_t cpid;
2409         char v;
2410
2411         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2412         if (ret < 0 || ret >= sizeof(fnam))
2413                 _exit(1);
2414         newnsfd = open(fnam, O_RDONLY);
2415         if (newnsfd < 0)
2416                 _exit(1);
2417         if (setns(newnsfd, 0) < 0)
2418                 _exit(1);
2419         close(newnsfd);
2420
2421         if (pipe(cpipe) < 0)
2422                 _exit(1);
2423
2424         struct pid_ns_clone_args args = {
2425                 .cpipe = cpipe,
2426                 .sock = sock,
2427                 .tpid = tpid,
2428                 .wrapped = &pid_to_ns
2429         };
2430         size_t stack_size = sysconf(_SC_PAGESIZE);
2431         void *stack = alloca(stack_size);
2432
2433         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2434         if (cpid < 0)
2435                 _exit(1);
2436
2437         // give the child 1 second to be done forking and
2438         // write its ack
2439         if (!wait_for_sock(cpipe[0], 1))
2440                 _exit(1);
2441         ret = read(cpipe[0], &v, 1);
2442         if (ret != sizeof(char) || v != '1')
2443                 _exit(1);
2444
2445         if (!wait_for_pid(cpid))
2446                 _exit(1);
2447         _exit(0);
2448 }
2449
2450 /*
2451  * To read cgroup files with a particular pid, we will setns into the child
2452  * pidns, open a pipe, fork a child - which will be the first to really be in
2453  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2454  */
2455 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2456 {
2457         int sock[2] = {-1, -1};
2458         char *tmpdata = NULL;
2459         int ret;
2460         pid_t qpid, cpid = -1;
2461         bool answer = false;
2462         char v = '0';
2463         struct ucred cred;
2464         size_t sz = 0, asz = 0;
2465
2466         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2467                 return false;
2468
2469         /*
2470          * Now we read the pids from returned data one by one, pass
2471          * them into a child in the target namespace, read back the
2472          * translated pids, and put them into our to-return data
2473          */
2474
2475         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2476                 perror("socketpair");
2477                 free(tmpdata);
2478                 return false;
2479         }
2480
2481         cpid = fork();
2482         if (cpid == -1)
2483                 goto out;
2484
2485         if (!cpid) // child - exits when done
2486                 pid_to_ns_wrapper(sock[1], tpid);
2487
2488         char *ptr = tmpdata;
2489         cred.uid = 0;
2490         cred.gid = 0;
2491         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2492                 cred.pid = qpid;
2493                 ret = send_creds(sock[0], &cred, v, true);
2494
2495                 if (ret == SEND_CREDS_NOTSK)
2496                         goto next;
2497                 if (ret == SEND_CREDS_FAIL)
2498                         goto out;
2499
2500                 // read converted results
2501                 if (!wait_for_sock(sock[0], 2)) {
2502                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2503                         goto out;
2504                 }
2505                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2506                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2507                         goto out;
2508                 }
2509                 must_strcat_pid(d, &sz, &asz, qpid);
2510 next:
2511                 ptr = strchr(ptr, '\n');
2512                 if (!ptr)
2513                         break;
2514                 ptr++;
2515         }
2516
2517         cred.pid = getpid();
2518         v = '1';
2519         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2520                 // failed to ask child to exit
2521                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2522                 goto out;
2523         }
2524
2525         answer = true;
2526
2527 out:
2528         free(tmpdata);
2529         if (cpid != -1)
2530                 wait_for_pid(cpid);
2531         if (sock[0] != -1) {
2532                 close(sock[0]);
2533                 close(sock[1]);
2534         }
2535         return answer;
2536 }
2537
2538 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2539                 struct fuse_file_info *fi)
2540 {
2541         struct fuse_context *fc = fuse_get_context();
2542         struct file_info *f = (struct file_info *)fi->fh;
2543         struct cgfs_files *k = NULL;
2544         char *data = NULL;
2545         int ret, s;
2546         bool r;
2547
2548         if (f->type != LXC_TYPE_CGFILE) {
2549                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2550                 return -EIO;
2551         }
2552
2553         if (offset)
2554                 return 0;
2555
2556         if (!fc)
2557                 return -EIO;
2558
2559         if (!f->controller)
2560                 return -EINVAL;
2561
2562         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2563                 return -EINVAL;
2564         }
2565         free_key(k);
2566
2567
2568         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2569                 ret = -EACCES;
2570                 goto out;
2571         }
2572
2573         if (strcmp(f->file, "tasks") == 0 ||
2574                         strcmp(f->file, "/tasks") == 0 ||
2575                         strcmp(f->file, "/cgroup.procs") == 0 ||
2576                         strcmp(f->file, "cgroup.procs") == 0)
2577                 // special case - we have to translate the pids
2578                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2579         else
2580                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2581
2582         if (!r) {
2583                 ret = -EINVAL;
2584                 goto out;
2585         }
2586
2587         if (!data) {
2588                 ret = 0;
2589                 goto out;
2590         }
2591         s = strlen(data);
2592         if (s > size)
2593                 s = size;
2594         memcpy(buf, data, s);
2595         if (s > 0 && s < size && data[s-1] != '\n')
2596                 buf[s++] = '\n';
2597
2598         ret = s;
2599
2600 out:
2601         free(data);
2602         return ret;
2603 }
2604
2605 static int pid_from_ns(int sock, pid_t tpid)
2606 {
2607         pid_t vpid;
2608         struct ucred cred;
2609         char v;
2610         int ret;
2611
2612         cred.uid = 0;
2613         cred.gid = 0;
2614         while (1) {
2615                 if (!wait_for_sock(sock, 2)) {
2616                         lxcfs_error("%s\n", "Timeout reading from parent.");
2617                         return 1;
2618                 }
2619                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2620                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2621                         return 1;
2622                 }
2623                 if (vpid == -1) // done
2624                         break;
2625                 v = '0';
2626                 cred.pid = vpid;
2627                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2628                         v = '1';
2629                         cred.pid = getpid();
2630                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2631                                 return 1;
2632                 }
2633         }
2634         return 0;
2635 }
2636
2637 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2638 {
2639         int newnsfd = -1, ret, cpipe[2];
2640         char fnam[100];
2641         pid_t cpid;
2642         char v;
2643
2644         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2645         if (ret < 0 || ret >= sizeof(fnam))
2646                 _exit(1);
2647         newnsfd = open(fnam, O_RDONLY);
2648         if (newnsfd < 0)
2649                 _exit(1);
2650         if (setns(newnsfd, 0) < 0)
2651                 _exit(1);
2652         close(newnsfd);
2653
2654         if (pipe(cpipe) < 0)
2655                 _exit(1);
2656
2657         struct pid_ns_clone_args args = {
2658                 .cpipe = cpipe,
2659                 .sock = sock,
2660                 .tpid = tpid,
2661                 .wrapped = &pid_from_ns
2662         };
2663         size_t stack_size = sysconf(_SC_PAGESIZE);
2664         void *stack = alloca(stack_size);
2665
2666         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2667         if (cpid < 0)
2668                 _exit(1);
2669
2670         // give the child 1 second to be done forking and
2671         // write its ack
2672         if (!wait_for_sock(cpipe[0], 1))
2673                 _exit(1);
2674         ret = read(cpipe[0], &v, 1);
2675         if (ret != sizeof(char) || v != '1')
2676                 _exit(1);
2677
2678         if (!wait_for_pid(cpid))
2679                 _exit(1);
2680         _exit(0);
2681 }
2682
2683 /*
2684  * Given host @uid, return the uid to which it maps in
2685  * @pid's user namespace, or -1 if none.
2686  */
2687 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2688 {
2689         FILE *f;
2690         char line[400];
2691
2692         sprintf(line, "/proc/%d/uid_map", pid);
2693         if ((f = fopen(line, "r")) == NULL) {
2694                 return false;
2695         }
2696
2697         *answer = convert_id_to_ns(f, uid);
2698         fclose(f);
2699
2700         if (*answer == -1)
2701                 return false;
2702         return true;
2703 }
2704
2705 /*
2706  * get_pid_creds: get the real uid and gid of @pid from
2707  * /proc/$$/status
2708  * (XXX should we use euid here?)
2709  */
2710 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2711 {
2712         char line[400];
2713         uid_t u;
2714         gid_t g;
2715         FILE *f;
2716
2717         *uid = -1;
2718         *gid = -1;
2719         sprintf(line, "/proc/%d/status", pid);
2720         if ((f = fopen(line, "r")) == NULL) {
2721                 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2722                 return;
2723         }
2724         while (fgets(line, 400, f)) {
2725                 if (strncmp(line, "Uid:", 4) == 0) {
2726                         if (sscanf(line+4, "%u", &u) != 1) {
2727                                 lxcfs_error("bad uid line for pid %u\n", pid);
2728                                 fclose(f);
2729                                 return;
2730                         }
2731                         *uid = u;
2732                 } else if (strncmp(line, "Gid:", 4) == 0) {
2733                         if (sscanf(line+4, "%u", &g) != 1) {
2734                                 lxcfs_error("bad gid line for pid %u\n", pid);
2735                                 fclose(f);
2736                                 return;
2737                         }
2738                         *gid = g;
2739                 }
2740         }
2741         fclose(f);
2742 }
2743
2744 /*
2745  * May the requestor @r move victim @v to a new cgroup?
2746  * This is allowed if
2747  *   . they are the same task
2748  *   . they are ownedy by the same uid
2749  *   . @r is root on the host, or
2750  *   . @v's uid is mapped into @r's where @r is root.
2751  */
2752 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2753 {
2754         uid_t v_uid, tmpuid;
2755         gid_t v_gid;
2756
2757         if (r == v)
2758                 return true;
2759         if (r_uid == 0)
2760                 return true;
2761         get_pid_creds(v, &v_uid, &v_gid);
2762         if (r_uid == v_uid)
2763                 return true;
2764         if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2765                         && hostuid_to_ns(v_uid, r, &tmpuid))
2766                 return true;
2767         return false;
2768 }
2769
2770 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2771                 const char *file, const char *buf)
2772 {
2773         int sock[2] = {-1, -1};
2774         pid_t qpid, cpid = -1;
2775         FILE *pids_file = NULL;
2776         bool answer = false, fail = false;
2777
2778         pids_file = open_pids_file(contrl, cg);
2779         if (!pids_file)
2780                 return false;
2781
2782         /*
2783          * write the pids to a socket, have helper in writer's pidns
2784          * call movepid for us
2785          */
2786         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2787                 perror("socketpair");
2788                 goto out;
2789         }
2790
2791         cpid = fork();
2792         if (cpid == -1)
2793                 goto out;
2794
2795         if (!cpid) { // child
2796                 fclose(pids_file);
2797                 pid_from_ns_wrapper(sock[1], tpid);
2798         }
2799
2800         const char *ptr = buf;
2801         while (sscanf(ptr, "%d", &qpid) == 1) {
2802                 struct ucred cred;
2803                 char v;
2804
2805                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2806                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2807                         goto out;
2808                 }
2809
2810                 if (recv_creds(sock[0], &cred, &v)) {
2811                         if (v == '0') {
2812                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2813                                         fail = true;
2814                                         break;
2815                                 }
2816                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2817                                         fail = true;
2818                         }
2819                 }
2820
2821                 ptr = strchr(ptr, '\n');
2822                 if (!ptr)
2823                         break;
2824                 ptr++;
2825         }
2826
2827         /* All good, write the value */
2828         qpid = -1;
2829         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2830                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2831
2832         if (!fail)
2833                 answer = true;
2834
2835 out:
2836         if (cpid != -1)
2837                 wait_for_pid(cpid);
2838         if (sock[0] != -1) {
2839                 close(sock[0]);
2840                 close(sock[1]);
2841         }
2842         if (pids_file) {
2843                 if (fclose(pids_file) != 0)
2844                         answer = false;
2845         }
2846         return answer;
2847 }
2848
2849 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2850              struct fuse_file_info *fi)
2851 {
2852         struct fuse_context *fc = fuse_get_context();
2853         char *localbuf = NULL;
2854         struct cgfs_files *k = NULL;
2855         struct file_info *f = (struct file_info *)fi->fh;
2856         bool r;
2857
2858         if (f->type != LXC_TYPE_CGFILE) {
2859                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2860                 return -EIO;
2861         }
2862
2863         if (offset)
2864                 return 0;
2865
2866         if (!fc)
2867                 return -EIO;
2868
2869         localbuf = alloca(size+1);
2870         localbuf[size] = '\0';
2871         memcpy(localbuf, buf, size);
2872
2873         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2874                 size = -EINVAL;
2875                 goto out;
2876         }
2877
2878         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2879                 size = -EACCES;
2880                 goto out;
2881         }
2882
2883         if (strcmp(f->file, "tasks") == 0 ||
2884                         strcmp(f->file, "/tasks") == 0 ||
2885                         strcmp(f->file, "/cgroup.procs") == 0 ||
2886                         strcmp(f->file, "cgroup.procs") == 0)
2887                 // special case - we have to translate the pids
2888                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2889         else
2890                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2891
2892         if (!r)
2893                 size = -EINVAL;
2894
2895 out:
2896         free_key(k);
2897         return size;
2898 }
2899
2900 int cg_chown(const char *path, uid_t uid, gid_t gid)
2901 {
2902         struct fuse_context *fc = fuse_get_context();
2903         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2904         struct cgfs_files *k = NULL;
2905         const char *cgroup;
2906         int ret;
2907
2908         if (!fc)
2909                 return -EIO;
2910
2911         if (strcmp(path, "/cgroup") == 0)
2912                 return -EPERM;
2913
2914         controller = pick_controller_from_path(fc, path);
2915         if (!controller)
2916                 return errno == ENOENT ? -EPERM : -errno;
2917
2918         cgroup = find_cgroup_in_path(path);
2919         if (!cgroup)
2920                 /* this is just /cgroup/controller */
2921                 return -EPERM;
2922
2923         get_cgdir_and_path(cgroup, &cgdir, &last);
2924
2925         if (!last) {
2926                 path1 = "/";
2927                 path2 = cgdir;
2928         } else {
2929                 path1 = cgdir;
2930                 path2 = last;
2931         }
2932
2933         if (is_child_cgroup(controller, path1, path2)) {
2934                 // get uid, gid, from '/tasks' file and make up a mode
2935                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2936                 k = cgfs_get_key(controller, cgroup, "tasks");
2937
2938         } else
2939                 k = cgfs_get_key(controller, path1, path2);
2940
2941         if (!k) {
2942                 ret = -EINVAL;
2943                 goto out;
2944         }
2945
2946         /*
2947          * This being a fuse request, the uid and gid must be valid
2948          * in the caller's namespace.  So we can just check to make
2949          * sure that the caller is root in his uid, and privileged
2950          * over the file's current owner.
2951          */
2952         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2953                 ret = -EACCES;
2954                 goto out;
2955         }
2956
2957         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2958
2959 out:
2960         free_key(k);
2961         free(cgdir);
2962
2963         return ret;
2964 }
2965
2966 int cg_chmod(const char *path, mode_t mode)
2967 {
2968         struct fuse_context *fc = fuse_get_context();
2969         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2970         struct cgfs_files *k = NULL;
2971         const char *cgroup;
2972         int ret;
2973
2974         if (!fc)
2975                 return -EIO;
2976
2977         if (strcmp(path, "/cgroup") == 0)
2978                 return -EPERM;
2979
2980         controller = pick_controller_from_path(fc, path);
2981         if (!controller)
2982                 return errno == ENOENT ? -EPERM : -errno;
2983
2984         cgroup = find_cgroup_in_path(path);
2985         if (!cgroup)
2986                 /* this is just /cgroup/controller */
2987                 return -EPERM;
2988
2989         get_cgdir_and_path(cgroup, &cgdir, &last);
2990
2991         if (!last) {
2992                 path1 = "/";
2993                 path2 = cgdir;
2994         } else {
2995                 path1 = cgdir;
2996                 path2 = last;
2997         }
2998
2999         if (is_child_cgroup(controller, path1, path2)) {
3000                 // get uid, gid, from '/tasks' file and make up a mode
3001                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3002                 k = cgfs_get_key(controller, cgroup, "tasks");
3003
3004         } else
3005                 k = cgfs_get_key(controller, path1, path2);
3006
3007         if (!k) {
3008                 ret = -EINVAL;
3009                 goto out;
3010         }
3011
3012         /*
3013          * This being a fuse request, the uid and gid must be valid
3014          * in the caller's namespace.  So we can just check to make
3015          * sure that the caller is root in his uid, and privileged
3016          * over the file's current owner.
3017          */
3018         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3019                 ret = -EPERM;
3020                 goto out;
3021         }
3022
3023         if (!cgfs_chmod_file(controller, cgroup, mode)) {
3024                 ret = -EINVAL;
3025                 goto out;
3026         }
3027
3028         ret = 0;
3029 out:
3030         free_key(k);
3031         free(cgdir);
3032         return ret;
3033 }
3034
3035 int cg_mkdir(const char *path, mode_t mode)
3036 {
3037         struct fuse_context *fc = fuse_get_context();
3038         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3039         const char *cgroup;
3040         int ret;
3041
3042         if (!fc)
3043                 return -EIO;
3044
3045         controller = pick_controller_from_path(fc, path);
3046         if (!controller)
3047                 return errno == ENOENT ? -EPERM : -errno;
3048
3049         cgroup = find_cgroup_in_path(path);
3050         if (!cgroup)
3051                 return -errno;
3052
3053         get_cgdir_and_path(cgroup, &cgdir, &last);
3054         if (!last)
3055                 path1 = "/";
3056         else
3057                 path1 = cgdir;
3058
3059         pid_t initpid = lookup_initpid_in_store(fc->pid);
3060         if (initpid <= 0)
3061                 initpid = fc->pid;
3062         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3063                 if (!next)
3064                         ret = -EINVAL;
3065                 else if (last && strcmp(next, last) == 0)
3066                         ret = -EEXIST;
3067                 else
3068                         ret = -EPERM;
3069                 goto out;
3070         }
3071
3072         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3073                 ret = -EACCES;
3074                 goto out;
3075         }
3076         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3077                 ret = -EACCES;
3078                 goto out;
3079         }
3080
3081         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3082
3083 out:
3084         free(cgdir);
3085         free(next);
3086         return ret;
3087 }
3088
3089 int cg_rmdir(const char *path)
3090 {
3091         struct fuse_context *fc = fuse_get_context();
3092         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3093         const char *cgroup;
3094         int ret;
3095
3096         if (!fc)
3097                 return -EIO;
3098
3099         controller = pick_controller_from_path(fc, path);
3100         if (!controller) /* Someone's trying to delete "/cgroup". */
3101                 return -EPERM;
3102
3103         cgroup = find_cgroup_in_path(path);
3104         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3105                 return -EPERM;
3106
3107         get_cgdir_and_path(cgroup, &cgdir, &last);
3108         if (!last) {
3109                 /* Someone's trying to delete a cgroup on the same level as the
3110                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3111                  * rmdir "/cgroup/blkio/init.slice".
3112                  */
3113                 ret = -EPERM;
3114                 goto out;
3115         }
3116
3117         pid_t initpid = lookup_initpid_in_store(fc->pid);
3118         if (initpid <= 0)
3119                 initpid = fc->pid;
3120         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3121                 if (!last || (next && (strcmp(next, last) == 0)))
3122                         ret = -EBUSY;
3123                 else
3124                         ret = -ENOENT;
3125                 goto out;
3126         }
3127
3128         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3129                 ret = -EACCES;
3130                 goto out;
3131         }
3132         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3133                 ret = -EACCES;
3134                 goto out;
3135         }
3136
3137         if (!cgfs_remove(controller, cgroup)) {
3138                 ret = -EINVAL;
3139                 goto out;
3140         }
3141
3142         ret = 0;
3143
3144 out:
3145         free(cgdir);
3146         free(next);
3147         return ret;
3148 }
3149
3150 static bool startswith(const char *line, const char *pref)
3151 {
3152         if (strncmp(line, pref, strlen(pref)) == 0)
3153                 return true;
3154         return false;
3155 }
3156
3157 static void parse_memstat(char *memstat, unsigned long *cached,
3158                 unsigned long *active_anon, unsigned long *inactive_anon,
3159                 unsigned long *active_file, unsigned long *inactive_file,
3160                 unsigned long *unevictable)
3161 {
3162         char *eol;
3163
3164         while (*memstat) {
3165                 if (startswith(memstat, "total_cache")) {
3166                         sscanf(memstat + 11, "%lu", cached);
3167                         *cached /= 1024;
3168                 } else if (startswith(memstat, "total_active_anon")) {
3169                         sscanf(memstat + 17, "%lu", active_anon);
3170                         *active_anon /= 1024;
3171                 } else if (startswith(memstat, "total_inactive_anon")) {
3172                         sscanf(memstat + 19, "%lu", inactive_anon);
3173                         *inactive_anon /= 1024;
3174                 } else if (startswith(memstat, "total_active_file")) {
3175                         sscanf(memstat + 17, "%lu", active_file);
3176                         *active_file /= 1024;
3177                 } else if (startswith(memstat, "total_inactive_file")) {
3178                         sscanf(memstat + 19, "%lu", inactive_file);
3179                         *inactive_file /= 1024;
3180                 } else if (startswith(memstat, "total_unevictable")) {
3181                         sscanf(memstat + 17, "%lu", unevictable);
3182                         *unevictable /= 1024;
3183                 }
3184                 eol = strchr(memstat, '\n');
3185                 if (!eol)
3186                         return;
3187                 memstat = eol+1;
3188         }
3189 }
3190
3191 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3192 {
3193         char *eol;
3194         char key[32];
3195
3196         memset(key, 0, 32);
3197         snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3198
3199         size_t len = strlen(key);
3200         *v = 0;
3201
3202         while (*str) {
3203                 if (startswith(str, key)) {
3204                         sscanf(str + len, "%lu", v);
3205                         return;
3206                 }
3207                 eol = strchr(str, '\n');
3208                 if (!eol)
3209                         return;
3210                 str = eol+1;
3211         }
3212 }
3213
3214 static int read_file(const char *path, char *buf, size_t size,
3215                      struct file_info *d)
3216 {
3217         size_t linelen = 0, total_len = 0, rv = 0;
3218         char *line = NULL;
3219         char *cache = d->buf;
3220         size_t cache_size = d->buflen;
3221         FILE *f = fopen(path, "r");
3222         if (!f)
3223                 return 0;
3224
3225         while (getline(&line, &linelen, f) != -1) {
3226                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3227                 if (l < 0) {
3228                         perror("Error writing to cache");
3229                         rv = 0;
3230                         goto err;
3231                 }
3232                 if (l >= cache_size) {
3233                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3234                         rv = 0;
3235                         goto err;
3236                 }
3237                 cache += l;
3238                 cache_size -= l;
3239                 total_len += l;
3240         }
3241
3242         d->size = total_len;
3243         if (total_len > size)
3244                 total_len = size;
3245
3246         /* read from off 0 */
3247         memcpy(buf, d->buf, total_len);
3248         rv = total_len;
3249   err:
3250         fclose(f);
3251         free(line);
3252         return rv;
3253 }
3254
3255 /*
3256  * FUSE ops for /proc
3257  */
3258
3259 static unsigned long get_memlimit(const char *cgroup, const char *file)
3260 {
3261         char *memlimit_str = NULL;
3262         unsigned long memlimit = -1;
3263
3264         if (cgfs_get_value("memory", cgroup, file, &memlimit_str))
3265                 memlimit = strtoul(memlimit_str, NULL, 10);
3266
3267         free(memlimit_str);
3268
3269         return memlimit;
3270 }
3271
3272 static unsigned long get_min_memlimit(const char *cgroup, const char *file)
3273 {
3274         char *copy = strdupa(cgroup);
3275         unsigned long memlimit = 0, retlimit;
3276
3277         retlimit = get_memlimit(copy, file);
3278
3279         while (strcmp(copy, "/") != 0) {
3280                 copy = dirname(copy);
3281                 memlimit = get_memlimit(copy, file);
3282                 if (memlimit != -1 && memlimit < retlimit)
3283                         retlimit = memlimit;
3284         };
3285
3286         return retlimit;
3287 }
3288
3289 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3290                 struct fuse_file_info *fi)
3291 {
3292         struct fuse_context *fc = fuse_get_context();
3293         struct file_info *d = (struct file_info *)fi->fh;
3294         char *cg;
3295         char *memusage_str = NULL, *memstat_str = NULL,
3296                 *memswlimit_str = NULL, *memswusage_str = NULL;
3297         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3298                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3299                 active_file = 0, inactive_file = 0, unevictable = 0,
3300                 hostswtotal = 0;
3301         char *line = NULL;
3302         size_t linelen = 0, total_len = 0, rv = 0;
3303         char *cache = d->buf;
3304         size_t cache_size = d->buflen;
3305         FILE *f = NULL;
3306
3307         if (offset){
3308                 if (offset > d->size)
3309                         return -EINVAL;
3310                 if (!d->cached)
3311                         return 0;
3312                 int left = d->size - offset;
3313                 total_len = left > size ? size: left;
3314                 memcpy(buf, cache + offset, total_len);
3315                 return total_len;
3316         }
3317
3318         pid_t initpid = lookup_initpid_in_store(fc->pid);
3319         if (initpid <= 0)
3320                 initpid = fc->pid;
3321         cg = get_pid_cgroup(initpid, "memory");
3322         if (!cg)
3323                 return read_file("/proc/meminfo", buf, size, d);
3324         prune_init_slice(cg);
3325
3326         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3327         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3328                 goto err;
3329         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3330                 goto err;
3331
3332         // Following values are allowed to fail, because swapaccount might be turned
3333         // off for current kernel
3334         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3335                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3336         {
3337                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3338                 memswusage = strtoul(memswusage_str, NULL, 10);
3339
3340                 memswlimit = memswlimit / 1024;
3341                 memswusage = memswusage / 1024;
3342         }
3343
3344         memusage = strtoul(memusage_str, NULL, 10);
3345         memlimit /= 1024;
3346         memusage /= 1024;
3347
3348         parse_memstat(memstat_str, &cached, &active_anon,
3349                         &inactive_anon, &active_file, &inactive_file,
3350                         &unevictable);
3351
3352         f = fopen("/proc/meminfo", "r");
3353         if (!f)
3354                 goto err;
3355
3356         while (getline(&line, &linelen, f) != -1) {
3357                 ssize_t l;
3358                 char *printme, lbuf[100];
3359
3360                 memset(lbuf, 0, 100);
3361                 if (startswith(line, "MemTotal:")) {
3362                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3363                         if (hosttotal < memlimit)
3364                                 memlimit = hosttotal;
3365                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3366                         printme = lbuf;
3367                 } else if (startswith(line, "MemFree:")) {
3368                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3369                         printme = lbuf;
3370                 } else if (startswith(line, "MemAvailable:")) {
3371                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3372                         printme = lbuf;
3373                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3374                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3375                         if (hostswtotal < memswlimit)
3376                                 memswlimit = hostswtotal;
3377                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3378                         printme = lbuf;
3379                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3380                         unsigned long swaptotal = memswlimit,
3381                                         swapusage = memswusage - memusage,
3382                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3383                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3384                         printme = lbuf;
3385                 } else if (startswith(line, "Slab:")) {
3386                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3387                         printme = lbuf;
3388                 } else if (startswith(line, "Buffers:")) {
3389                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3390                         printme = lbuf;
3391                 } else if (startswith(line, "Cached:")) {
3392                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3393                         printme = lbuf;
3394                 } else if (startswith(line, "SwapCached:")) {
3395                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3396                         printme = lbuf;
3397                 } else if (startswith(line, "Active:")) {
3398                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3399                                         active_anon + active_file);
3400                         printme = lbuf;
3401                 } else if (startswith(line, "Inactive:")) {
3402                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3403                                         inactive_anon + inactive_file);
3404                         printme = lbuf;
3405                 } else if (startswith(line, "Active(anon)")) {
3406                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3407                         printme = lbuf;
3408                 } else if (startswith(line, "Inactive(anon)")) {
3409                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3410                         printme = lbuf;
3411                 } else if (startswith(line, "Active(file)")) {
3412                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3413                         printme = lbuf;
3414                 } else if (startswith(line, "Inactive(file)")) {
3415                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3416                         printme = lbuf;
3417                 } else if (startswith(line, "Unevictable")) {
3418                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3419                         printme = lbuf;
3420                 } else if (startswith(line, "SReclaimable")) {
3421                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3422                         printme = lbuf;
3423                 } else if (startswith(line, "SUnreclaim")) {
3424                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3425                         printme = lbuf;
3426                 } else
3427                         printme = line;
3428
3429                 l = snprintf(cache, cache_size, "%s", printme);
3430                 if (l < 0) {
3431                         perror("Error writing to cache");
3432                         rv = 0;
3433                         goto err;
3434
3435                 }
3436                 if (l >= cache_size) {
3437                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3438                         rv = 0;
3439                         goto err;
3440                 }
3441
3442                 cache += l;
3443                 cache_size -= l;
3444                 total_len += l;
3445         }
3446
3447         d->cached = 1;
3448         d->size = total_len;
3449         if (total_len > size ) total_len = size;
3450         memcpy(buf, d->buf, total_len);
3451
3452         rv = total_len;
3453 err:
3454         if (f)
3455                 fclose(f);
3456         free(line);
3457         free(cg);
3458         free(memusage_str);
3459         free(memswlimit_str);
3460         free(memswusage_str);
3461         free(memstat_str);
3462         return rv;
3463 }
3464
3465 /*
3466  * Read the cpuset.cpus for cg
3467  * Return the answer in a newly allocated string which must be freed
3468  */
3469 static char *get_cpuset(const char *cg)
3470 {
3471         char *answer;
3472
3473         if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &answer))
3474                 return NULL;
3475         return answer;
3476 }
3477
3478 bool cpu_in_cpuset(int cpu, const char *cpuset);
3479
3480 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3481 {
3482         int cpu;
3483
3484         if (sscanf(line, "processor       : %d", &cpu) != 1)
3485                 return false;
3486         return cpu_in_cpuset(cpu, cpuset);
3487 }
3488
3489 /*
3490  * check whether this is a '^processor" line in /proc/cpuinfo
3491  */
3492 static bool is_processor_line(const char *line)
3493 {
3494         int cpu;
3495
3496         if (sscanf(line, "processor       : %d", &cpu) == 1)
3497                 return true;
3498         return false;
3499 }
3500
3501 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3502                 struct fuse_file_info *fi)
3503 {
3504         struct fuse_context *fc = fuse_get_context();
3505         struct file_info *d = (struct file_info *)fi->fh;
3506         char *cg;
3507         char *cpuset = NULL;
3508         char *line = NULL;
3509         size_t linelen = 0, total_len = 0, rv = 0;
3510         bool am_printing = false, firstline = true, is_s390x = false;
3511         int curcpu = -1, cpu;
3512         char *cache = d->buf;
3513         size_t cache_size = d->buflen;
3514         FILE *f = NULL;
3515
3516         if (offset){
3517                 if (offset > d->size)
3518                         return -EINVAL;
3519                 if (!d->cached)
3520                         return 0;
3521                 int left = d->size - offset;
3522                 total_len = left > size ? size: left;
3523                 memcpy(buf, cache + offset, total_len);
3524                 return total_len;
3525         }
3526
3527         pid_t initpid = lookup_initpid_in_store(fc->pid);
3528         if (initpid <= 0)
3529                 initpid = fc->pid;
3530         cg = get_pid_cgroup(initpid, "cpuset");
3531         if (!cg)
3532                 return read_file("proc/cpuinfo", buf, size, d);
3533         prune_init_slice(cg);
3534
3535         cpuset = get_cpuset(cg);
3536         if (!cpuset)
3537                 goto err;
3538
3539         f = fopen("/proc/cpuinfo", "r");
3540         if (!f)
3541                 goto err;
3542
3543         while (getline(&line, &linelen, f) != -1) {
3544                 ssize_t l;
3545                 if (firstline) {
3546                         firstline = false;
3547                         if (strstr(line, "IBM/S390") != NULL) {
3548                                 is_s390x = true;
3549                                 am_printing = true;
3550                                 continue;
3551                         }
3552                 }
3553                 if (strncmp(line, "# processors:", 12) == 0)
3554                         continue;
3555                 if (is_processor_line(line)) {
3556                         am_printing = cpuline_in_cpuset(line, cpuset);
3557                         if (am_printing) {
3558                                 curcpu ++;
3559                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3560                                 if (l < 0) {
3561                                         perror("Error writing to cache");
3562                                         rv = 0;
3563                                         goto err;
3564                                 }
3565                                 if (l >= cache_size) {
3566                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3567                                         rv = 0;
3568                                         goto err;
3569                                 }
3570                                 cache += l;
3571                                 cache_size -= l;
3572                                 total_len += l;
3573                         }
3574                         continue;
3575                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3576                         char *p;
3577                         if (!cpu_in_cpuset(cpu, cpuset))
3578                                 continue;
3579                         curcpu ++;
3580                         p = strchr(line, ':');
3581                         if (!p || !*p)
3582                                 goto err;
3583                         p++;
3584                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3585                         if (l < 0) {
3586                                 perror("Error writing to cache");
3587                                 rv = 0;
3588                                 goto err;
3589                         }
3590                         if (l >= cache_size) {
3591                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3592                                 rv = 0;
3593                                 goto err;
3594                         }
3595                         cache += l;
3596                         cache_size -= l;
3597                         total_len += l;
3598                         continue;
3599
3600                 }
3601                 if (am_printing) {
3602                         l = snprintf(cache, cache_size, "%s", line);
3603                         if (l < 0) {
3604                                 perror("Error writing to cache");
3605                                 rv = 0;
3606                                 goto err;
3607                         }
3608                         if (l >= cache_size) {
3609                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3610                                 rv = 0;
3611                                 goto err;
3612                         }
3613                         cache += l;
3614                         cache_size -= l;
3615                         total_len += l;
3616                 }
3617         }
3618
3619         if (is_s390x) {
3620                 char *origcache = d->buf;
3621                 ssize_t l;
3622                 do {
3623                         d->buf = malloc(d->buflen);
3624                 } while (!d->buf);
3625                 cache = d->buf;
3626                 cache_size = d->buflen;
3627                 total_len = 0;
3628                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3629                 if (l < 0 || l >= cache_size) {
3630                         free(origcache);
3631                         goto err;
3632                 }
3633                 cache_size -= l;
3634                 cache += l;
3635                 total_len += l;
3636                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3637                 if (l < 0 || l >= cache_size) {
3638                         free(origcache);
3639                         goto err;
3640                 }
3641                 cache_size -= l;
3642                 cache += l;
3643                 total_len += l;
3644                 l = snprintf(cache, cache_size, "%s", origcache);
3645                 free(origcache);
3646                 if (l < 0 || l >= cache_size)
3647                         goto err;
3648                 total_len += l;
3649         }
3650
3651         d->cached = 1;
3652         d->size = total_len;
3653         if (total_len > size ) total_len = size;
3654
3655         /* read from off 0 */
3656         memcpy(buf, d->buf, total_len);
3657         rv = total_len;
3658 err:
3659         if (f)
3660                 fclose(f);
3661         free(line);
3662         free(cpuset);
3663         free(cg);
3664         return rv;
3665 }
3666
3667 static uint64_t get_reaper_start_time(pid_t pid)
3668 {
3669         int ret;
3670         FILE *f;
3671         uint64_t starttime;
3672         /* strlen("/proc/") = 6
3673          * +
3674          * LXCFS_NUMSTRLEN64
3675          * +
3676          * strlen("/stat") = 5
3677          * +
3678          * \0 = 1
3679          * */
3680 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3681         char path[__PROC_PID_STAT_LEN];
3682         pid_t qpid;
3683
3684         qpid = lookup_initpid_in_store(pid);
3685         if (qpid <= 0) {
3686                 /* Caller can check for EINVAL on 0. */
3687                 errno = EINVAL;
3688                 return 0;
3689         }
3690
3691         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3692         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3693                 /* Caller can check for EINVAL on 0. */
3694                 errno = EINVAL;
3695                 return 0;
3696         }
3697
3698         f = fopen(path, "r");
3699         if (!f) {
3700                 /* Caller can check for EINVAL on 0. */
3701                 errno = EINVAL;
3702                 return 0;
3703         }
3704
3705         /* Note that the *scanf() argument supression requires that length
3706          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3707          * at us. It's like telling someone you're not married and then asking
3708          * if you can bring your wife to the party.
3709          */
3710         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3711                         "%*s "      /* (2)  comm        %s   */
3712                         "%*c "      /* (3)  state       %c   */
3713                         "%*d "      /* (4)  ppid        %d   */
3714                         "%*d "      /* (5)  pgrp        %d   */
3715                         "%*d "      /* (6)  session     %d   */
3716                         "%*d "      /* (7)  tty_nr      %d   */
3717                         "%*d "      /* (8)  tpgid       %d   */
3718                         "%*u "      /* (9)  flags       %u   */
3719                         "%*u "      /* (10) minflt      %lu  */
3720                         "%*u "      /* (11) cminflt     %lu  */
3721                         "%*u "      /* (12) majflt      %lu  */
3722                         "%*u "      /* (13) cmajflt     %lu  */
3723                         "%*u "      /* (14) utime       %lu  */
3724                         "%*u "      /* (15) stime       %lu  */
3725                         "%*d "      /* (16) cutime      %ld  */
3726                         "%*d "      /* (17) cstime      %ld  */
3727                         "%*d "      /* (18) priority    %ld  */
3728                         "%*d "      /* (19) nice        %ld  */
3729                         "%*d "      /* (20) num_threads %ld  */
3730                         "%*d "      /* (21) itrealvalue %ld  */
3731                         "%" PRIu64, /* (22) starttime   %llu */
3732                      &starttime);
3733         if (ret != 1) {
3734                 fclose(f);
3735                 /* Caller can check for EINVAL on 0. */
3736                 errno = EINVAL;
3737                 return 0;
3738         }
3739
3740         fclose(f);
3741
3742         errno = 0;
3743         return starttime;
3744 }
3745
3746 static uint64_t get_reaper_start_time_in_sec(pid_t pid)
3747 {
3748         uint64_t clockticks;
3749         int64_t ticks_per_sec;
3750
3751         clockticks = get_reaper_start_time(pid);
3752         if (clockticks == 0 && errno == EINVAL) {
3753                 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3754                 return 0;
3755         }
3756
3757         ticks_per_sec = sysconf(_SC_CLK_TCK);
3758         if (ticks_per_sec < 0 && errno == EINVAL) {
3759                 lxcfs_debug(
3760                     "%s\n",
3761                     "failed to determine number of clock ticks in a second");
3762                 return 0;
3763         }
3764
3765         return (clockticks /= ticks_per_sec);
3766 }
3767
3768 static uint64_t get_reaper_age(pid_t pid)
3769 {
3770         uint64_t procstart, uptime, procage;
3771
3772         /* We need to substract the time the process has started since system
3773          * boot minus the time when the system has started to get the actual
3774          * reaper age.
3775          */
3776         procstart = get_reaper_start_time_in_sec(pid);
3777         procage = procstart;
3778         if (procstart > 0) {
3779                 int ret;
3780                 struct timespec spec;
3781
3782                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3783                 if (ret < 0)
3784                         return 0;
3785                 /* We could make this more precise here by using the tv_nsec
3786                  * field in the timespec struct and convert it to milliseconds
3787                  * and then create a double for the seconds and milliseconds but
3788                  * that seems more work than it is worth.
3789                  */
3790                 uptime = spec.tv_sec;
3791                 procage = uptime - procstart;
3792         }
3793
3794         return procage;
3795 }
3796
3797 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3798 static int proc_stat_read(char *buf, size_t size, off_t offset,
3799                 struct fuse_file_info *fi)
3800 {
3801         struct fuse_context *fc = fuse_get_context();
3802         struct file_info *d = (struct file_info *)fi->fh;
3803         char *cg;
3804         char *cpuset = NULL;
3805         char *line = NULL;
3806         size_t linelen = 0, total_len = 0, rv = 0;
3807         int curcpu = -1; /* cpu numbering starts at 0 */
3808         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3809         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3810                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3811         char cpuall[CPUALL_MAX_SIZE];
3812         /* reserve for cpu all */
3813         char *cache = d->buf + CPUALL_MAX_SIZE;
3814         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3815         FILE *f = NULL;
3816
3817         if (offset){
3818                 if (offset > d->size)
3819                         return -EINVAL;
3820                 if (!d->cached)
3821                         return 0;
3822                 int left = d->size - offset;
3823                 total_len = left > size ? size: left;
3824                 memcpy(buf, d->buf + offset, total_len);
3825                 return total_len;
3826         }
3827
3828         pid_t initpid = lookup_initpid_in_store(fc->pid);
3829         if (initpid <= 0)
3830                 initpid = fc->pid;
3831         cg = get_pid_cgroup(initpid, "cpuset");
3832         if (!cg)
3833                 return read_file("/proc/stat", buf, size, d);
3834         prune_init_slice(cg);
3835
3836         cpuset = get_cpuset(cg);
3837         if (!cpuset)
3838                 goto err;
3839
3840         f = fopen("/proc/stat", "r");
3841         if (!f)
3842                 goto err;
3843
3844         //skip first line
3845         if (getline(&line, &linelen, f) < 0) {
3846                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3847                 goto err;
3848         }
3849
3850         while (getline(&line, &linelen, f) != -1) {
3851                 ssize_t l;
3852                 int cpu;
3853                 char cpu_char[10]; /* That's a lot of cores */
3854                 char *c;
3855
3856                 if (strlen(line) == 0)
3857                         continue;
3858                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3859                         /* not a ^cpuN line containing a number N, just print it */
3860                         l = snprintf(cache, cache_size, "%s", line);
3861                         if (l < 0) {
3862                                 perror("Error writing to cache");
3863                                 rv = 0;
3864                                 goto err;
3865                         }
3866                         if (l >= cache_size) {
3867                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3868                                 rv = 0;
3869                                 goto err;
3870                         }
3871                         cache += l;
3872                         cache_size -= l;
3873                         total_len += l;
3874                         continue;
3875                 }
3876
3877                 if (sscanf(cpu_char, "%d", &cpu) != 1)
3878                         continue;
3879                 if (!cpu_in_cpuset(cpu, cpuset))
3880                         continue;
3881                 curcpu ++;
3882
3883                 c = strchr(line, ' ');
3884                 if (!c)
3885                         continue;
3886                 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3887                 if (l < 0) {
3888                         perror("Error writing to cache");
3889                         rv = 0;
3890                         goto err;
3891
3892                 }
3893                 if (l >= cache_size) {
3894                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3895                         rv = 0;
3896                         goto err;
3897                 }
3898
3899                 cache += l;
3900                 cache_size -= l;
3901                 total_len += l;
3902
3903                 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3904                            &user,
3905                            &nice,
3906                            &system,
3907                            &idle,
3908                            &iowait,
3909                            &irq,
3910                            &softirq,
3911                            &steal,
3912                            &guest,
3913                            &guest_nice) != 10)
3914                         continue;
3915                 user_sum += user;
3916                 nice_sum += nice;
3917                 system_sum += system;
3918                 idle_sum += idle;
3919                 iowait_sum += iowait;
3920                 irq_sum += irq;
3921                 softirq_sum += softirq;
3922                 steal_sum += steal;
3923                 guest_sum += guest;
3924                 guest_nice_sum += guest_nice;
3925         }
3926
3927         cache = d->buf;
3928
3929         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3930                         user_sum,
3931                         nice_sum,
3932                         system_sum,
3933                         idle_sum,
3934                         iowait_sum,
3935                         irq_sum,
3936                         softirq_sum,
3937                         steal_sum,
3938                         guest_sum,
3939                         guest_nice_sum);
3940         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3941                 memcpy(cache, cpuall, cpuall_len);
3942                 cache += cpuall_len;
3943         } else {
3944                 /* shouldn't happen */
3945                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3946                 cpuall_len = 0;
3947         }
3948
3949         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3950         total_len += cpuall_len;
3951         d->cached = 1;
3952         d->size = total_len;
3953         if (total_len > size)
3954                 total_len = size;
3955
3956         memcpy(buf, d->buf, total_len);
3957         rv = total_len;
3958
3959 err:
3960         if (f)
3961                 fclose(f);
3962         free(line);
3963         free(cpuset);
3964         free(cg);
3965         return rv;
3966 }
3967
3968 /* This function retrieves the busy time of a group of tasks by looking at
3969  * cpuacct.usage. Unfortunately, this only makes sense when the container has
3970  * been given it's own cpuacct cgroup. If not, this function will take the busy
3971  * time of all other taks that do not actually belong to the container into
3972  * account as well. If someone has a clever solution for this please send a
3973  * patch!
3974  */
3975 static unsigned long get_reaper_busy(pid_t task)
3976 {
3977         pid_t initpid = lookup_initpid_in_store(task);
3978         char *cgroup = NULL, *usage_str = NULL;
3979         unsigned long usage = 0;
3980
3981         if (initpid <= 0)
3982                 return 0;
3983
3984         cgroup = get_pid_cgroup(initpid, "cpuacct");
3985         if (!cgroup)
3986                 goto out;
3987         prune_init_slice(cgroup);
3988         if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
3989                 goto out;
3990         usage = strtoul(usage_str, NULL, 10);
3991         usage /= 1000000000;
3992
3993 out:
3994         free(cgroup);
3995         free(usage_str);
3996         return usage;
3997 }
3998
3999 #if RELOADTEST
4000 void iwashere(void)
4001 {
4002         int fd;
4003
4004         fd = creat("/tmp/lxcfs-iwashere", 0644);
4005         if (fd >= 0)
4006                 close(fd);
4007 }
4008 #endif
4009
4010 /*
4011  * We read /proc/uptime and reuse its second field.
4012  * For the first field, we use the mtime for the reaper for
4013  * the calling pid as returned by getreaperage
4014  */
4015 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4016                 struct fuse_file_info *fi)
4017 {
4018         struct fuse_context *fc = fuse_get_context();
4019         struct file_info *d = (struct file_info *)fi->fh;
4020         unsigned long int busytime = get_reaper_busy(fc->pid);
4021         char *cache = d->buf;
4022         ssize_t total_len = 0;
4023         uint64_t idletime, reaperage;
4024
4025 #if RELOADTEST
4026         iwashere();
4027 #endif
4028
4029         if (offset){
4030                 if (!d->cached)
4031                         return 0;
4032                 if (offset > d->size)
4033                         return -EINVAL;
4034                 int left = d->size - offset;
4035                 total_len = left > size ? size: left;
4036                 memcpy(buf, cache + offset, total_len);
4037                 return total_len;
4038         }
4039
4040         reaperage = get_reaper_age(fc->pid);
4041         /* To understand why this is done, please read the comment to the
4042          * get_reaper_busy() function.
4043          */
4044         idletime = reaperage;
4045         if (reaperage >= busytime)
4046                 idletime = reaperage - busytime;
4047
4048         total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4049         if (total_len < 0 || total_len >=  d->buflen){
4050                 lxcfs_error("%s\n", "failed to write to cache");
4051                 return 0;
4052         }
4053
4054         d->size = (int)total_len;
4055         d->cached = 1;
4056
4057         if (total_len > size) total_len = size;
4058
4059         memcpy(buf, d->buf, total_len);
4060         return total_len;
4061 }
4062
4063 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4064                 struct fuse_file_info *fi)
4065 {
4066         char dev_name[72];
4067         struct fuse_context *fc = fuse_get_context();
4068         struct file_info *d = (struct file_info *)fi->fh;
4069         char *cg;
4070         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4071                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
4072         unsigned long read = 0, write = 0;
4073         unsigned long read_merged = 0, write_merged = 0;
4074         unsigned long read_sectors = 0, write_sectors = 0;
4075         unsigned long read_ticks = 0, write_ticks = 0;
4076         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4077         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4078         char *cache = d->buf;
4079         size_t cache_size = d->buflen;
4080         char *line = NULL;
4081         size_t linelen = 0, total_len = 0, rv = 0;
4082         unsigned int major = 0, minor = 0;
4083         int i = 0;
4084         FILE *f = NULL;
4085
4086         if (offset){
4087                 if (offset > d->size)
4088                         return -EINVAL;
4089                 if (!d->cached)
4090                         return 0;
4091                 int left = d->size - offset;
4092                 total_len = left > size ? size: left;
4093                 memcpy(buf, cache + offset, total_len);
4094                 return total_len;
4095         }
4096
4097         pid_t initpid = lookup_initpid_in_store(fc->pid);
4098         if (initpid <= 0)
4099                 initpid = fc->pid;
4100         cg = get_pid_cgroup(initpid, "blkio");
4101         if (!cg)
4102                 return read_file("/proc/diskstats", buf, size, d);
4103         prune_init_slice(cg);
4104
4105         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4106                 goto err;
4107         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4108                 goto err;
4109         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4110                 goto err;
4111         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4112                 goto err;
4113         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4114                 goto err;
4115
4116
4117         f = fopen("/proc/diskstats", "r");
4118         if (!f)
4119                 goto err;
4120
4121         while (getline(&line, &linelen, f) != -1) {
4122                 ssize_t l;
4123                 char lbuf[256];
4124
4125                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4126                 if (i != 3)
4127                         continue;
4128
4129                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4130                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4131                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4132                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4133                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4134                 read_sectors = read_sectors/512;
4135                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4136                 write_sectors = write_sectors/512;
4137
4138                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4139                 rd_svctm = rd_svctm/1000000;
4140                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4141                 rd_wait = rd_wait/1000000;
4142                 read_ticks = rd_svctm + rd_wait;
4143
4144                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4145                 wr_svctm =  wr_svctm/1000000;
4146                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4147                 wr_wait =  wr_wait/1000000;
4148                 write_ticks = wr_svctm + wr_wait;
4149
4150                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4151                 tot_ticks =  tot_ticks/1000000;
4152
4153                 memset(lbuf, 0, 256);
4154                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4155                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4156                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4157                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4158                 else
4159                         continue;
4160
4161                 l = snprintf(cache, cache_size, "%s", lbuf);
4162                 if (l < 0) {
4163                         perror("Error writing to fuse buf");
4164                         rv = 0;
4165                         goto err;
4166                 }
4167                 if (l >= cache_size) {
4168                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4169                         rv = 0;
4170                         goto err;
4171                 }
4172                 cache += l;
4173                 cache_size -= l;
4174                 total_len += l;
4175         }
4176
4177         d->cached = 1;
4178         d->size = total_len;
4179         if (total_len > size ) total_len = size;
4180         memcpy(buf, d->buf, total_len);
4181
4182         rv = total_len;
4183 err:
4184         free(cg);
4185         if (f)
4186                 fclose(f);
4187         free(line);
4188         free(io_serviced_str);
4189         free(io_merged_str);
4190         free(io_service_bytes_str);
4191         free(io_wait_time_str);
4192         free(io_service_time_str);
4193         return rv;
4194 }
4195
4196 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4197                 struct fuse_file_info *fi)
4198 {
4199         struct fuse_context *fc = fuse_get_context();
4200         struct file_info *d = (struct file_info *)fi->fh;
4201         char *cg = NULL;
4202         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4203         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4204         ssize_t total_len = 0, rv = 0;
4205         ssize_t l = 0;
4206         char *cache = d->buf;
4207
4208         if (offset) {
4209                 if (offset > d->size)
4210                         return -EINVAL;
4211                 if (!d->cached)
4212                         return 0;
4213                 int left = d->size - offset;
4214                 total_len = left > size ? size: left;
4215                 memcpy(buf, cache + offset, total_len);
4216                 return total_len;
4217         }
4218
4219         pid_t initpid = lookup_initpid_in_store(fc->pid);
4220         if (initpid <= 0)
4221                 initpid = fc->pid;
4222         cg = get_pid_cgroup(initpid, "memory");
4223         if (!cg)
4224                 return read_file("/proc/swaps", buf, size, d);
4225         prune_init_slice(cg);
4226
4227         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4228
4229         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4230                 goto err;
4231
4232         memusage = strtoul(memusage_str, NULL, 10);
4233
4234         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4235             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4236
4237                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4238                 memswusage = strtoul(memswusage_str, NULL, 10);
4239
4240                 swap_total = (memswlimit - memlimit) / 1024;
4241                 swap_free = (memswusage - memusage) / 1024;
4242         }
4243
4244         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4245
4246         /* When no mem + swap limit is specified or swapaccount=0*/
4247         if (!memswlimit) {
4248                 char *line = NULL;
4249                 size_t linelen = 0;
4250                 FILE *f = fopen("/proc/meminfo", "r");
4251
4252                 if (!f)
4253                         goto err;
4254
4255                 while (getline(&line, &linelen, f) != -1) {
4256                         if (startswith(line, "SwapTotal:")) {
4257                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
4258                         } else if (startswith(line, "SwapFree:")) {
4259                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
4260                         }
4261                 }
4262
4263                 free(line);
4264                 fclose(f);
4265         }
4266
4267         if (swap_total > 0) {
4268                 l = snprintf(d->buf + total_len, d->size - total_len,
4269                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4270                                 swap_total, swap_free);
4271                 total_len += l;
4272         }
4273
4274         if (total_len < 0 || l < 0) {
4275                 perror("Error writing to cache");
4276                 rv = 0;
4277                 goto err;
4278         }
4279
4280         d->cached = 1;
4281         d->size = (int)total_len;
4282
4283         if (total_len > size) total_len = size;
4284         memcpy(buf, d->buf, total_len);
4285         rv = total_len;
4286
4287 err:
4288         free(cg);
4289         free(memswlimit_str);
4290         free(memlimit_str);
4291         free(memusage_str);
4292         free(memswusage_str);
4293         return rv;
4294 }
4295 /*
4296  * Find the process pid from cgroup path.
4297  * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
4298  * @pid_buf : put pid to pid_buf.
4299  * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
4300  * @depth : the depth of cgroup in container.
4301  * @sum : return the number of pid.
4302  * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
4303  */
4304 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4305 {
4306         DIR *dir;
4307         int fd;
4308         struct dirent *file;
4309         FILE *f = NULL;
4310         size_t linelen = 0;
4311         char *line = NULL;
4312         int pd;
4313         char *path_dir, *path;
4314         char **pid;
4315
4316         /* path = dpath + "/cgroup.procs" + /0 */
4317         do {
4318                 path = malloc(strlen(dpath) + 20);
4319         } while (!path);
4320
4321         strcpy(path, dpath);
4322         fd = openat(cfd, path, O_RDONLY);
4323         if (fd < 0)
4324                 goto out;
4325
4326         dir = fdopendir(fd);
4327         if (dir == NULL) {
4328                 close(fd);
4329                 goto out;
4330         }
4331
4332         while (((file = readdir(dir)) != NULL) && depth > 0) {
4333                 if (strncmp(file->d_name, ".", 1) == 0)
4334                         continue;
4335                 if (strncmp(file->d_name, "..", 1) == 0)
4336                         continue;
4337                 if (file->d_type == DT_DIR) {
4338                         /* path + '/' + d_name +/0 */
4339                         do {
4340                                 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4341                         } while (!path_dir);
4342                         strcpy(path_dir, path);
4343                         strcat(path_dir, "/");
4344                         strcat(path_dir, file->d_name);
4345                         pd = depth - 1;
4346                         sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4347                         free(path_dir);
4348                 }
4349         }
4350         closedir(dir);
4351
4352         strcat(path, "/cgroup.procs");
4353         fd = openat(cfd, path, O_RDONLY);
4354         if (fd < 0)
4355                 goto out;
4356
4357         f = fdopen(fd, "r");
4358         if (!f) {
4359                 close(fd);
4360                 goto out;
4361         }
4362
4363         while (getline(&line, &linelen, f) != -1) {
4364                 do {
4365                         pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4366                 } while (!pid);
4367                 *pid_buf = pid;
4368                 do {
4369                         *(*pid_buf + sum) = malloc(strlen(line) + 1);
4370                 } while (*(*pid_buf + sum) == NULL);
4371                 strcpy(*(*pid_buf + sum), line);
4372                 sum++;
4373         }
4374         fclose(f);
4375 out:
4376         free(path);
4377         return sum;
4378 }
4379 /*
4380  * calc_load calculates the load according to the following formula:
4381  * load1 = load0 * exp + active * (1 - exp)
4382  *
4383  * @load1: the new loadavg.
4384  * @load0: the former loadavg.
4385  * @active: the total number of running pid at this moment.
4386  * @exp: the fixed-point defined in the beginning.
4387  */
4388 static unsigned long
4389 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4390 {
4391         unsigned long newload;
4392
4393         active = active > 0 ? active * FIXED_1 : 0;
4394         newload = load * exp + active * (FIXED_1 - exp);
4395         if (active >= load)
4396                 newload += FIXED_1 - 1;
4397
4398         return newload / FIXED_1;
4399 }
4400
4401 /*
4402  * Return 0 means that container p->cg is closed.
4403  * Return -1 means that error occurred in refresh.
4404  * Positive num equals the total number of pid.
4405  */
4406 static int refresh_load(struct load_node *p, char *path)
4407 {
4408         FILE *f = NULL;
4409         char **idbuf;
4410         char proc_path[256];
4411         int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4412         char *line = NULL;
4413         size_t linelen = 0;
4414         int sum, length;
4415         DIR *dp;
4416         struct dirent *file;
4417
4418         do {
4419                 idbuf = malloc(sizeof(char *));
4420         } while (!idbuf);
4421         sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4422         /*  normal exit  */
4423         if (sum == 0)
4424                 goto out;
4425
4426         for (i = 0; i < sum; i++) {
4427                 /*clean up '\n' */
4428                 length = strlen(idbuf[i])-1;
4429                 idbuf[i][length] = '\0';
4430                 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4431                 if (ret < 0 || ret > 255) {
4432                         lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4433                         i = sum;
4434                         sum = -1;
4435                         goto err_out;
4436                 }
4437
4438                 dp = opendir(proc_path);
4439                 if (!dp) {
4440                         lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4441                         continue;
4442                 }
4443                 while ((file = readdir(dp)) != NULL) {
4444                         if (strncmp(file->d_name, ".", 1) == 0)
4445                                 continue;
4446                         if (strncmp(file->d_name, "..", 1) == 0)
4447                                 continue;
4448                         total_pid++;
4449                         /* We make the biggest pid become last_pid.*/
4450                         ret = atof(file->d_name);
4451                         last_pid = (ret > last_pid) ? ret : last_pid;
4452
4453                         ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4454                         if (ret < 0 || ret > 255) {
4455                                 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4456                                 i = sum;
4457                                 sum = -1;
4458                                 closedir(dp);
4459                                 goto err_out;
4460                         }
4461                         f = fopen(proc_path, "r");
4462                         if (f != NULL) {
4463                                 while (getline(&line, &linelen, f) != -1) {
4464                                         /* Find State */
4465                                         if ((line[0] == 'S') && (line[1] == 't'))
4466                                                 break;
4467                                 }
4468                         if ((line[7] == 'R') || (line[7] == 'D'))
4469                                 run_pid++;
4470                         fclose(f);
4471                         }
4472                 }
4473                 closedir(dp);
4474         }
4475         /*Calculate the loadavg.*/
4476         p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4477         p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4478         p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4479         p->run_pid = run_pid;
4480         p->total_pid = total_pid;
4481         p->last_pid = last_pid;
4482
4483         free(line);
4484 err_out:
4485         for (; i > 0; i--)
4486                 free(idbuf[i-1]);
4487 out:
4488         free(idbuf);
4489         return sum;
4490 }
4491 /*
4492  * Traverse the hash table and update it.
4493  */
4494 void *load_begin(void *arg)
4495 {
4496
4497         char *path = NULL;
4498         int i, sum, length, ret;
4499         struct load_node *f;
4500         int first_node;
4501         clock_t time1, time2;
4502
4503         while (1) {
4504                 if (loadavg_stop == 1)
4505                         return NULL;
4506
4507                 time1 = clock();
4508                 for (i = 0; i < LOAD_SIZE; i++) {
4509                         pthread_mutex_lock(&load_hash[i].lock);
4510                         if (load_hash[i].next == NULL) {
4511                                 pthread_mutex_unlock(&load_hash[i].lock);
4512                                 continue;
4513                         }
4514                         f = load_hash[i].next;
4515                         first_node = 1;
4516                         while (f) {
4517                                 length = strlen(f->cg) + 2;
4518                                 do {
4519                                         /* strlen(f->cg) + '.' or '' + \0 */
4520                                         path = malloc(length);
4521                                 } while (!path);
4522
4523                                 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4524                                 if (ret < 0 || ret > length - 1) {
4525                                         /* snprintf failed, ignore the node.*/
4526                                         lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4527                                         goto out;
4528                                 }
4529                                 sum = refresh_load(f, path);
4530                                 if (sum == 0) {
4531                                         f = del_node(f, i);
4532                                 } else {
4533 out:                                    f = f->next;
4534                                 }
4535                                 free(path);
4536                                 /* load_hash[i].lock locks only on the first node.*/
4537                                 if (first_node == 1) {
4538                                         first_node = 0;
4539                                         pthread_mutex_unlock(&load_hash[i].lock);
4540                                 }
4541                         }
4542                 }
4543
4544                 if (loadavg_stop == 1)
4545                         return NULL;
4546
4547                 time2 = clock();
4548                 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4549         }
4550 }
4551
4552 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4553                 struct fuse_file_info *fi)
4554 {
4555         struct fuse_context *fc = fuse_get_context();
4556         struct file_info *d = (struct file_info *)fi->fh;
4557         pid_t initpid;
4558         char *cg;
4559         size_t total_len = 0;
4560         char *cache = d->buf;
4561         struct load_node *n;
4562         int hash;
4563         int cfd;
4564         unsigned long a, b, c;
4565
4566         if (offset) {
4567                 if (offset > d->size)
4568                         return -EINVAL;
4569                 if (!d->cached)
4570                         return 0;
4571                 int left = d->size - offset;
4572                 total_len = left > size ? size : left;
4573                 memcpy(buf, cache + offset, total_len);
4574                 return total_len;
4575         }
4576         if (!loadavg)
4577                 return read_file("/proc/loadavg", buf, size, d);
4578
4579         initpid = lookup_initpid_in_store(fc->pid);
4580         if (initpid <= 0)
4581                 initpid = fc->pid;
4582         cg = get_pid_cgroup(initpid, "cpu");
4583         if (!cg)
4584                 return read_file("/proc/loadavg", buf, size, d);
4585
4586         prune_init_slice(cg);
4587         hash = calc_hash(cg);
4588         n = locate_node(cg, hash);
4589
4590         /* First time */
4591         if (n == NULL) {
4592                 if (!find_mounted_controller("cpu", &cfd)) {
4593                         /*
4594                          * In locate_node() above, pthread_rwlock_unlock() isn't used
4595                          * because delete is not allowed before read has ended.
4596                          */
4597                         pthread_rwlock_unlock(&load_hash[hash].rdlock);
4598                         return 0;
4599                 }
4600                 do {
4601                         n = malloc(sizeof(struct load_node));
4602                 } while (!n);
4603
4604                 do {
4605                         n->cg = malloc(strlen(cg)+1);
4606                 } while (!n->cg);
4607                 strcpy(n->cg, cg);
4608                 n->avenrun[0] = 0;
4609                 n->avenrun[1] = 0;
4610                 n->avenrun[2] = 0;
4611                 n->run_pid = 0;
4612                 n->total_pid = 1;
4613                 n->last_pid = initpid;
4614                 n->cfd = cfd;
4615                 insert_node(&n, hash);
4616         }
4617         a = n->avenrun[0] + (FIXED_1/200);
4618         b = n->avenrun[1] + (FIXED_1/200);
4619         c = n->avenrun[2] + (FIXED_1/200);
4620         total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4621                 LOAD_INT(a), LOAD_FRAC(a),
4622                 LOAD_INT(b), LOAD_FRAC(b),
4623                 LOAD_INT(c), LOAD_FRAC(c),
4624                 n->run_pid, n->total_pid, n->last_pid);
4625         pthread_rwlock_unlock(&load_hash[hash].rdlock);
4626         if (total_len < 0 || total_len >=  d->buflen) {
4627                 lxcfs_error("%s\n", "Failed to write to cache");
4628                 return 0;
4629         }
4630         d->size = (int)total_len;
4631         d->cached = 1;
4632
4633         if (total_len > size)
4634                 total_len = size;
4635         memcpy(buf, d->buf, total_len);
4636         return total_len;
4637 }
4638 /* Return a positive number on success, return 0 on failure.*/
4639 pthread_t load_daemon(int load_use)
4640 {
4641         int ret;
4642         pthread_t pid;
4643
4644         ret = init_load();
4645         if (ret == -1) {
4646                 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4647                 return 0;
4648         }
4649         ret = pthread_create(&pid, NULL, load_begin, NULL);
4650         if (ret != 0) {
4651                 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4652                 load_free();
4653                 return 0;
4654         }
4655         /* use loadavg, here loadavg = 1*/
4656         loadavg = load_use;
4657         return pid;
4658 }
4659
4660 /* Returns 0 on success. */
4661 int stop_load_daemon(pthread_t pid)
4662 {
4663         int s;
4664
4665         /* Signal the thread to gracefully stop */
4666         loadavg_stop = 1;
4667
4668         s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
4669         if (s != 0) {
4670                 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
4671                 return -1;
4672         }
4673
4674         load_free();
4675         loadavg_stop = 0;
4676
4677         return 0;
4678 }
4679
4680 static off_t get_procfile_size(const char *which)
4681 {
4682         FILE *f = fopen(which, "r");
4683         char *line = NULL;
4684         size_t len = 0;
4685         ssize_t sz, answer = 0;
4686         if (!f)
4687                 return 0;
4688
4689         while ((sz = getline(&line, &len, f)) != -1)
4690                 answer += sz;
4691         fclose (f);
4692         free(line);
4693
4694         return answer;
4695 }
4696
4697 int proc_getattr(const char *path, struct stat *sb)
4698 {
4699         struct timespec now;
4700
4701         memset(sb, 0, sizeof(struct stat));
4702         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
4703                 return -EINVAL;
4704         sb->st_uid = sb->st_gid = 0;
4705         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
4706         if (strcmp(path, "/proc") == 0) {
4707                 sb->st_mode = S_IFDIR | 00555;
4708                 sb->st_nlink = 2;
4709                 return 0;
4710         }
4711         if (strcmp(path, "/proc/meminfo") == 0 ||
4712                         strcmp(path, "/proc/cpuinfo") == 0 ||
4713                         strcmp(path, "/proc/uptime") == 0 ||
4714                         strcmp(path, "/proc/stat") == 0 ||
4715                         strcmp(path, "/proc/diskstats") == 0 ||
4716                         strcmp(path, "/proc/swaps") == 0 ||
4717                         strcmp(path, "/proc/loadavg") == 0) {
4718                 sb->st_size = 0;
4719                 sb->st_mode = S_IFREG | 00444;
4720                 sb->st_nlink = 1;
4721                 return 0;
4722         }
4723
4724         return -ENOENT;
4725 }
4726
4727 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4728                 struct fuse_file_info *fi)
4729 {
4730         if (filler(buf, ".", NULL, 0) != 0 ||
4731             filler(buf, "..", NULL, 0) != 0 ||
4732             filler(buf, "cpuinfo", NULL, 0) != 0 ||
4733             filler(buf, "meminfo", NULL, 0) != 0 ||
4734             filler(buf, "stat", NULL, 0) != 0 ||
4735             filler(buf, "uptime", NULL, 0) != 0 ||
4736             filler(buf, "diskstats", NULL, 0) != 0 ||
4737             filler(buf, "swaps", NULL, 0) != 0   ||
4738             filler(buf, "loadavg", NULL, 0) != 0)
4739                 return -EINVAL;
4740         return 0;
4741 }
4742
4743 int proc_open(const char *path, struct fuse_file_info *fi)
4744 {
4745         int type = -1;
4746         struct file_info *info;
4747
4748         if (strcmp(path, "/proc/meminfo") == 0)
4749                 type = LXC_TYPE_PROC_MEMINFO;
4750         else if (strcmp(path, "/proc/cpuinfo") == 0)
4751                 type = LXC_TYPE_PROC_CPUINFO;
4752         else if (strcmp(path, "/proc/uptime") == 0)
4753                 type = LXC_TYPE_PROC_UPTIME;
4754         else if (strcmp(path, "/proc/stat") == 0)
4755                 type = LXC_TYPE_PROC_STAT;
4756         else if (strcmp(path, "/proc/diskstats") == 0)
4757                 type = LXC_TYPE_PROC_DISKSTATS;
4758         else if (strcmp(path, "/proc/swaps") == 0)
4759                 type = LXC_TYPE_PROC_SWAPS;
4760         else if (strcmp(path, "/proc/loadavg") == 0)
4761                 type = LXC_TYPE_PROC_LOADAVG;
4762         if (type == -1)
4763                 return -ENOENT;
4764
4765         info = malloc(sizeof(*info));
4766         if (!info)
4767                 return -ENOMEM;
4768
4769         memset(info, 0, sizeof(*info));
4770         info->type = type;
4771
4772         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4773         do {
4774                 info->buf = malloc(info->buflen);
4775         } while (!info->buf);
4776         memset(info->buf, 0, info->buflen);
4777         /* set actual size to buffer size */
4778         info->size = info->buflen;
4779
4780         fi->fh = (unsigned long)info;
4781         return 0;
4782 }
4783
4784 int proc_access(const char *path, int mask)
4785 {
4786         if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
4787                 return 0;
4788
4789         /* these are all read-only */
4790         if ((mask & ~R_OK) != 0)
4791                 return -EACCES;
4792         return 0;
4793 }
4794
4795 int proc_release(const char *path, struct fuse_file_info *fi)
4796 {
4797         do_release_file_info(fi);
4798         return 0;
4799 }
4800
4801 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4802                 struct fuse_file_info *fi)
4803 {
4804         struct file_info *f = (struct file_info *) fi->fh;
4805
4806         switch (f->type) {
4807         case LXC_TYPE_PROC_MEMINFO:
4808                 return proc_meminfo_read(buf, size, offset, fi);
4809         case LXC_TYPE_PROC_CPUINFO:
4810                 return proc_cpuinfo_read(buf, size, offset, fi);
4811         case LXC_TYPE_PROC_UPTIME:
4812                 return proc_uptime_read(buf, size, offset, fi);
4813         case LXC_TYPE_PROC_STAT:
4814                 return proc_stat_read(buf, size, offset, fi);
4815         case LXC_TYPE_PROC_DISKSTATS:
4816                 return proc_diskstats_read(buf, size, offset, fi);
4817         case LXC_TYPE_PROC_SWAPS:
4818                 return proc_swaps_read(buf, size, offset, fi);
4819         case LXC_TYPE_PROC_LOADAVG:
4820                 return proc_loadavg_read(buf, size, offset, fi);
4821         default:
4822                 return -EINVAL;
4823         }
4824 }
4825
4826 /*
4827  * Functions needed to setup cgroups in the __constructor__.
4828  */
4829
4830 static bool mkdir_p(const char *dir, mode_t mode)
4831 {
4832         const char *tmp = dir;
4833         const char *orig = dir;
4834         char *makeme;
4835
4836         do {
4837                 dir = tmp + strspn(tmp, "/");
4838                 tmp = dir + strcspn(dir, "/");
4839                 makeme = strndup(orig, dir - orig);
4840                 if (!makeme)
4841                         return false;
4842                 if (mkdir(makeme, mode) && errno != EEXIST) {
4843                         lxcfs_error("Failed to create directory '%s': %s.\n",
4844                                 makeme, strerror(errno));
4845                         free(makeme);
4846                         return false;
4847                 }
4848                 free(makeme);
4849         } while(tmp != dir);
4850
4851         return true;
4852 }
4853
4854 static bool umount_if_mounted(void)
4855 {
4856         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4857                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4858                 return false;
4859         }
4860         return true;
4861 }
4862
4863 /* __typeof__ should be safe to use with all compilers. */
4864 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
4865 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
4866 {
4867         return (fs->f_type == (fs_type_magic)magic_val);
4868 }
4869
4870 /*
4871  * looking at fs/proc_namespace.c, it appears we can
4872  * actually expect the rootfs entry to very specifically contain
4873  * " - rootfs rootfs "
4874  * IIUC, so long as we've chrooted so that rootfs is not our root,
4875  * the rootfs entry should always be skipped in mountinfo contents.
4876  */
4877 static bool is_on_ramfs(void)
4878 {
4879         FILE *f;
4880         char *p, *p2;
4881         char *line = NULL;
4882         size_t len = 0;
4883         int i;
4884
4885         f = fopen("/proc/self/mountinfo", "r");
4886         if (!f)
4887                 return false;
4888
4889         while (getline(&line, &len, f) != -1) {
4890                 for (p = line, i = 0; p && i < 4; i++)
4891                         p = strchr(p + 1, ' ');
4892                 if (!p)
4893                         continue;
4894                 p2 = strchr(p + 1, ' ');
4895                 if (!p2)
4896                         continue;
4897                 *p2 = '\0';
4898                 if (strcmp(p + 1, "/") == 0) {
4899                         // this is '/'.  is it the ramfs?
4900                         p = strchr(p2 + 1, '-');
4901                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
4902                                 free(line);
4903                                 fclose(f);
4904                                 return true;
4905                         }
4906                 }
4907         }
4908         free(line);
4909         fclose(f);
4910         return false;
4911 }
4912
4913 static int pivot_enter()
4914 {
4915         int ret = -1, oldroot = -1, newroot = -1;
4916
4917         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4918         if (oldroot < 0) {
4919                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4920                 return ret;
4921         }
4922
4923         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4924         if (newroot < 0) {
4925                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4926                 goto err;
4927         }
4928
4929         /* change into new root fs */
4930         if (fchdir(newroot) < 0) {
4931                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4932                 goto err;
4933         }
4934
4935         /* pivot_root into our new root fs */
4936         if (pivot_root(".", ".") < 0) {
4937                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4938                 goto err;
4939         }
4940
4941         /*
4942          * At this point the old-root is mounted on top of our new-root.
4943          * To unmounted it we must not be chdir'd into it, so escape back
4944          * to the old-root.
4945          */
4946         if (fchdir(oldroot) < 0) {
4947                 lxcfs_error("%s\n", "Failed to enter old root.");
4948                 goto err;
4949         }
4950
4951         if (umount2(".", MNT_DETACH) < 0) {
4952                 lxcfs_error("%s\n", "Failed to detach old root.");
4953                 goto err;
4954         }
4955
4956         if (fchdir(newroot) < 0) {
4957                 lxcfs_error("%s\n", "Failed to re-enter new root.");
4958                 goto err;
4959         }
4960
4961         ret = 0;
4962
4963 err:
4964         if (oldroot > 0)
4965                 close(oldroot);
4966         if (newroot > 0)
4967                 close(newroot);
4968
4969         return ret;
4970 }
4971
4972 static int chroot_enter()
4973 {
4974         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4975                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4976                 return -1;
4977         }
4978
4979         if (chroot(".") < 0) {
4980                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4981                 return -1;
4982         }
4983
4984         if (chdir("/") < 0) {
4985                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4986                 return -1;
4987         }
4988
4989         return 0;
4990 }
4991
4992 static int permute_and_enter(void)
4993 {
4994         struct statfs sb;
4995
4996         if (statfs("/", &sb) < 0) {
4997                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
4998                 return -1;
4999         }
5000
5001         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5002          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5003          * /proc/1/mountinfo. */
5004         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5005                 return chroot_enter();
5006
5007         if (pivot_enter() < 0) {
5008                 lxcfs_error("%s\n", "Could not perform pivot root.");
5009                 return -1;
5010         }
5011
5012         return 0;
5013 }
5014
5015 /* Prepare our new clean root. */
5016 static int permute_prepare(void)
5017 {
5018         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5019                 lxcfs_error("%s\n", "Failed to create directory for new root.");
5020                 return -1;
5021         }
5022
5023         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5024                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5025                 return -1;
5026         }
5027
5028         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5029                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5030                 return -1;
5031         }
5032
5033         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5034                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5035                 return -1;
5036         }
5037
5038         return 0;
5039 }
5040
5041 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
5042 static bool permute_root(void)
5043 {
5044         /* Prepare new root. */
5045         if (permute_prepare() < 0)
5046                 return false;
5047
5048         /* Pivot into new root. */
5049         if (permute_and_enter() < 0)
5050                 return false;
5051
5052         return true;
5053 }
5054
5055 static int preserve_mnt_ns(int pid)
5056 {
5057         int ret;
5058         size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
5059         char path[len];
5060
5061         ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
5062         if (ret < 0 || (size_t)ret >= len)
5063                 return -1;
5064
5065         return open(path, O_RDONLY | O_CLOEXEC);
5066 }
5067
5068 static bool cgfs_prepare_mounts(void)
5069 {
5070         if (!mkdir_p(BASEDIR, 0700)) {
5071                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5072                 return false;
5073         }
5074
5075         if (!umount_if_mounted()) {
5076                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5077                 return false;
5078         }
5079
5080         if (unshare(CLONE_NEWNS) < 0) {
5081                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5082                 return false;
5083         }
5084
5085         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5086         if (cgroup_mount_ns_fd < 0) {
5087                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5088                 return false;
5089         }
5090
5091         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5092                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5093                 return false;
5094         }
5095
5096         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5097                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5098                 return false;
5099         }
5100
5101         return true;
5102 }
5103
5104 static bool cgfs_mount_hierarchies(void)
5105 {
5106         char *target;
5107         size_t clen, len;
5108         int i, ret;
5109
5110         for (i = 0; i < num_hierarchies; i++) {
5111                 char *controller = hierarchies[i];
5112
5113                 clen = strlen(controller);
5114                 len = strlen(BASEDIR) + clen + 2;
5115                 target = malloc(len);
5116                 if (!target)
5117                         return false;
5118
5119                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5120                 if (ret < 0 || ret >= len) {
5121                         free(target);
5122                         return false;
5123                 }
5124                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5125                         free(target);
5126                         return false;
5127                 }
5128                 if (!strcmp(controller, "unified"))
5129                         ret = mount("none", target, "cgroup2", 0, NULL);
5130                 else
5131                         ret = mount(controller, target, "cgroup", 0, controller);
5132                 if (ret < 0) {
5133                         lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5134                         free(target);
5135                         return false;
5136                 }
5137
5138                 fd_hierarchies[i] = open(target, O_DIRECTORY);
5139                 if (fd_hierarchies[i] < 0) {
5140                         free(target);
5141                         return false;
5142                 }
5143                 free(target);
5144         }
5145         return true;
5146 }
5147
5148 static bool cgfs_setup_controllers(void)
5149 {
5150         if (!cgfs_prepare_mounts())
5151                 return false;
5152
5153         if (!cgfs_mount_hierarchies()) {
5154                 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
5155                 return false;
5156         }
5157
5158         if (!permute_root())
5159                 return false;
5160
5161         return true;
5162 }
5163
5164 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5165 {
5166         FILE *f;
5167         char *cret, *line = NULL;
5168         char cwd[MAXPATHLEN];
5169         size_t len = 0;
5170         int i, init_ns = -1;
5171         bool found_unified = false;
5172
5173         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5174                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5175                 return;
5176         }
5177
5178         while (getline(&line, &len, f) != -1) {
5179                 char *idx, *p, *p2;
5180
5181                 p = strchr(line, ':');
5182                 if (!p)
5183                         goto out;
5184                 idx = line;
5185                 *(p++) = '\0';
5186
5187                 p2 = strrchr(p, ':');
5188                 if (!p2)
5189                         goto out;
5190                 *p2 = '\0';
5191
5192                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5193                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5194                  * because it parses out the empty string "" and later on passes
5195                  * it to mount(). Let's skip such entries.
5196                  */
5197                 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5198                         found_unified = true;
5199                         p = "unified";
5200                 }
5201
5202                 if (!store_hierarchy(line, p))
5203                         goto out;
5204         }
5205
5206         /* Preserve initial namespace. */
5207         init_ns = preserve_mnt_ns(getpid());
5208         if (init_ns < 0) {
5209                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5210                 goto out;
5211         }
5212
5213         fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5214         if (!fd_hierarchies) {
5215                 lxcfs_error("%s\n", strerror(errno));
5216                 goto out;
5217         }
5218
5219         for (i = 0; i < num_hierarchies; i++)
5220                 fd_hierarchies[i] = -1;
5221
5222         cret = getcwd(cwd, MAXPATHLEN);
5223         if (!cret)
5224                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5225
5226         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5227          * to privately mount lxcfs cgroups. */
5228         if (!cgfs_setup_controllers()) {
5229                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5230                 goto out;
5231         }
5232
5233         if (setns(init_ns, 0) < 0) {
5234                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5235                 goto out;
5236         }
5237
5238         if (!cret || chdir(cwd) < 0)
5239                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5240
5241         print_subsystems();
5242
5243 out:
5244         free(line);
5245         fclose(f);
5246         if (init_ns >= 0)
5247                 close(init_ns);
5248 }
5249
5250 static void __attribute__((destructor)) free_subsystems(void)
5251 {
5252         int i;
5253
5254         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5255
5256         for (i = 0; i < num_hierarchies; i++) {
5257                 if (hierarchies[i])
5258                         free(hierarchies[i]);
5259                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5260                         close(fd_hierarchies[i]);
5261         }
5262         free(hierarchies);
5263         free(fd_hierarchies);
5264
5265         if (cgroup_mount_ns_fd >= 0)
5266                 close(cgroup_mount_ns_fd);
5267 }