bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdbool.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <time.h>
  26 #include <unistd.h>
  27 #include <wait.h>
  28 #include <linux/magic.h>
  29 #include <linux/sched.h>
  30 #include <sys/epoll.h>
  31 #include <sys/mman.h>
  32 #include <sys/mount.h>
  33 #include <sys/param.h>
  34 #include <sys/socket.h>
  35 #include <sys/syscall.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/vfs.h>
  38
  39 #include "bindings.h"
  40 #include "config.h" // for VERSION
  41
  42 /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
  43 #define LXCFS_NUMSTRLEN64 21
  44
  45 /* Define pivot_root() if missing from the C library */
  46 #ifndef HAVE_PIVOT_ROOT
  47 static int pivot_root(const char * new_root, const char * put_old)
  48 {
  49 #ifdef __NR_pivot_root
  50 return syscall(__NR_pivot_root, new_root, put_old);
  51 #else
  52 errno = ENOSYS;
  53 return -1;
  54 #endif
  55 }
  56 #else
  57 extern int pivot_root(const char * new_root, const char * put_old);
  58 #endif
  59
  60 enum {
  61         LXC_TYPE_CGDIR,
  62         LXC_TYPE_CGFILE,
  63         LXC_TYPE_PROC_MEMINFO,
  64         LXC_TYPE_PROC_CPUINFO,
  65         LXC_TYPE_PROC_UPTIME,
  66         LXC_TYPE_PROC_STAT,
  67         LXC_TYPE_PROC_DISKSTATS,
  68         LXC_TYPE_PROC_SWAPS,
  69         LXC_TYPE_PROC_LOADAVG,
  70 };
  71
  72 struct file_info {
  73         char *controller;
  74         char *cgroup;
  75         char *file;
  76         int type;
  77         char *buf;  // unused as of yet
  78         int buflen;
  79         int size; //actual data size
  80         int cached;
  81 };
  82
  83 /* The function of hash table.*/
  84 #define LOAD_SIZE 100 /*the size of hash_table */
  85 #define FLUSH_TIME 5  /*the flush rate */
  86 #define DEPTH_DIR 3   /*the depth of per cgroup */
  87 /* The function of calculate loadavg .*/
  88 #define FSHIFT          11              /* nr of bits of precision */
  89 #define FIXED_1         (1<<FSHIFT)     /* 1.0 as fixed-point */
  90 #define EXP_1           1884            /* 1/exp(5sec/1min) as fixed-point */
  91 #define EXP_5           2014            /* 1/exp(5sec/5min) */
  92 #define EXP_15          2037            /* 1/exp(5sec/15min) */
  93 #define LOAD_INT(x) ((x) >> FSHIFT)
  94 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
  95 /*
  96  * This parameter is used for proc_loadavg_read().
  97  * 1 means use loadavg, 0 means not use.
  98  */
  99 static int loadavg = 0;
 100 static int calc_hash(char *name)
 101 {
 102         unsigned int hash = 0;
 103         unsigned int x = 0;
 104         /* ELFHash algorithm. */
 105         while (*name) {
 106                 hash = (hash << 4) + *name++;
 107                 x = hash & 0xf0000000;
 108                 if (x != 0)
 109                         hash ^= (x >> 24);
 110                 hash &= ~x;
 111         }
 112         return ((hash & 0x7fffffff) % LOAD_SIZE);
 113 }
 114
 115 struct load_node {
 116         char *cg;  /*cg */
 117         unsigned long avenrun[3];               /* Load averages */
 118         unsigned int run_pid;
 119         unsigned int total_pid;
 120         unsigned int last_pid;
 121         int cfd; /* The file descriptor of the mounted cgroup */
 122         struct  load_node *next;
 123         struct  load_node **pre;
 124 };
 125
 126 struct load_head {
 127         /*
 128          * The lock is about insert load_node and refresh load_node.To the first
 129          * load_node of each hash bucket, insert and refresh in this hash bucket is
 130          * mutually exclusive.
 131          */
 132         pthread_mutex_t lock;
 133         /*
 134          * The rdlock is about read loadavg and delete load_node.To each hash
 135          * bucket, read and delete is mutually exclusive. But at the same time, we
 136          * allow paratactic read operation. This rdlock is at list level.
 137          */
 138         pthread_rwlock_t rdlock;
 139         /*
 140          * The rilock is about read loadavg and insert load_node.To the first
 141          * load_node of each hash bucket, read and insert is mutually exclusive.
 142          * But at the same time, we allow paratactic read operation.
 143          */
 144         pthread_rwlock_t rilock;
 145         struct load_node *next;
 146 };
 147
 148 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 149 /*
 150  * init_load initialize the hash table.
 151  * Return 0 on success, return -1 on failure.
 152  */
 153 static int init_load(void)
 154 {
 155         int i;
 156         int ret;
 157
 158         for (i = 0; i < LOAD_SIZE; i++) {
 159                 load_hash[i].next = NULL;
 160                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 161                 if (ret != 0) {
 162                         lxcfs_error("%s\n", "Failed to initialize lock");
 163                         goto out3;
 164                 }
 165                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 166                 if (ret != 0) {
 167                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 168                         goto out2;
 169                 }
 170                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 171                 if (ret != 0) {
 172                         lxcfs_error("%s\n", "Failed to initialize rilock");
 173                         goto out1;
 174                 }
 175         }
 176         return 0;
 177 out1:
 178         pthread_rwlock_destroy(&load_hash[i].rdlock);
 179 out2:
 180         pthread_mutex_destroy(&load_hash[i].lock);
 181 out3:
 182         while (i > 0) {
 183                 i--;
 184                 pthread_mutex_destroy(&load_hash[i].lock);
 185                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 186                 pthread_rwlock_destroy(&load_hash[i].rilock);
 187         }
 188         return -1;
 189 }
 190
 191 static void insert_node(struct load_node **n, int locate)
 192 {
 193         struct load_node *f;
 194
 195         pthread_mutex_lock(&load_hash[locate].lock);
 196         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 197         f = load_hash[locate].next;
 198         load_hash[locate].next = *n;
 199
 200         (*n)->pre = &(load_hash[locate].next);
 201         if (f)
 202                 f->pre = &((*n)->next);
 203         (*n)->next = f;
 204         pthread_mutex_unlock(&load_hash[locate].lock);
 205         pthread_rwlock_unlock(&load_hash[locate].rilock);
 206 }
 207 /*
 208  * locate_node() finds special node. Not return NULL means success.
 209  * It should be noted that rdlock isn't unlocked at the end of code
 210  * because this function is used to read special node. Delete is not
 211  * allowed before read has ended.
 212  * unlock rdlock only in proc_loadavg_read().
 213  */
 214 static struct load_node *locate_node(char *cg, int locate)
 215 {
 216         struct load_node *f = NULL;
 217         int i = 0;
 218
 219         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 220         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 221         if (load_hash[locate].next == NULL) {
 222                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 223                 return f;
 224         }
 225         f = load_hash[locate].next;
 226         pthread_rwlock_unlock(&load_hash[locate].rilock);
 227         while (f && ((i = strcmp(f->cg, cg)) != 0))
 228                 f = f->next;
 229         return f;
 230 }
 231 /* Delete the load_node n and return the next node of it. */
 232 static struct load_node *del_node(struct load_node *n, int locate)
 233 {
 234         struct load_node *g;
 235
 236         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 237         if (n->next == NULL) {
 238                 *(n->pre) = NULL;
 239         } else {
 240                 *(n->pre) = n->next;
 241                 n->next->pre = n->pre;
 242         }
 243         g = n->next;
 244         free(n->cg);
 245         free(n);
 246         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 247         return g;
 248 }
 249
 250 /* Reserve buffer size to account for file size changes. */
 251 #define BUF_RESERVE_SIZE 512
 252
 253 /*
 254  * A table caching which pid is init for a pid namespace.
 255  * When looking up which pid is init for $qpid, we first
 256  * 1. Stat /proc/$qpid/ns/pid.
 257  * 2. Check whether the ino_t is in our store.
 258  *   a. if not, fork a child in qpid's ns to send us
 259  *       ucred.pid = 1, and read the initpid.  Cache
 260  *       initpid and creation time for /proc/initpid
 261  *       in a new store entry.
 262  *   b. if so, verify that /proc/initpid still matches
 263  *       what we have saved.  If not, clear the store
 264  *       entry and go back to a.  If so, return the
 265  *       cached initpid.
 266  */
 267 struct pidns_init_store {
 268         ino_t ino;          // inode number for /proc/$pid/ns/pid
 269         pid_t initpid;      // the pid of nit in that ns
 270         long int ctime;     // the time at which /proc/$initpid was created
 271         struct pidns_init_store *next;
 272         long int lastcheck;
 273 };
 274
 275 /* lol - look at how they are allocated in the kernel */
 276 #define PIDNS_HASH_SIZE 4096
 277 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 278
 279 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 280 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 281 static void lock_mutex(pthread_mutex_t *l)
 282 {
 283         int ret;
 284
 285         if ((ret = pthread_mutex_lock(l)) != 0) {
 286                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 287                 exit(1);
 288         }
 289 }
 290
 291 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 292  * Number of hierarchies mounted. */
 293 static int num_hierarchies;
 294
 295 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 296  * Hierachies mounted {cpuset, blkio, ...}:
 297  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 298 static char **hierarchies;
 299
 300 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 301  * Open file descriptors:
 302  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 303  * private mount namespace.
 304  * Initialized via __constructor__ collect_and_mount_subsystems().
 305  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 306  * mounts and respective files in the private namespace even when located in
 307  * another namespace using the *at() family of functions
 308  * {openat(), fchownat(), ...}. */
 309 static int *fd_hierarchies;
 310 static int cgroup_mount_ns_fd = -1;
 311
 312 static void unlock_mutex(pthread_mutex_t *l)
 313 {
 314         int ret;
 315
 316         if ((ret = pthread_mutex_unlock(l)) != 0) {
 317                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 318                 exit(1);
 319         }
 320 }
 321
 322 static void store_lock(void)
 323 {
 324         lock_mutex(&pidns_store_mutex);
 325 }
 326
 327 static void store_unlock(void)
 328 {
 329         unlock_mutex(&pidns_store_mutex);
 330 }
 331
 332 /* Must be called under store_lock */
 333 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 334 {
 335         struct stat initsb;
 336         char fnam[100];
 337
 338         snprintf(fnam, 100, "/proc/%d", e->initpid);
 339         if (stat(fnam, &initsb) < 0)
 340                 return false;
 341
 342         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 343                     initsb.st_ctime, e->initpid);
 344
 345         if (e->ctime != initsb.st_ctime)
 346                 return false;
 347         return true;
 348 }
 349
 350 /* Must be called under store_lock */
 351 static void remove_initpid(struct pidns_init_store *e)
 352 {
 353         struct pidns_init_store *tmp;
 354         int h;
 355
 356         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 357
 358         h = HASH(e->ino);
 359         if (pidns_hash_table[h] == e) {
 360                 pidns_hash_table[h] = e->next;
 361                 free(e);
 362                 return;
 363         }
 364
 365         tmp = pidns_hash_table[h];
 366         while (tmp) {
 367                 if (tmp->next == e) {
 368                         tmp->next = e->next;
 369                         free(e);
 370                         return;
 371                 }
 372                 tmp = tmp->next;
 373         }
 374 }
 375
 376 #define PURGE_SECS 5
 377 /* Must be called under store_lock */
 378 static void prune_initpid_store(void)
 379 {
 380         static long int last_prune = 0;
 381         struct pidns_init_store *e, *prev, *delme;
 382         long int now, threshold;
 383         int i;
 384
 385         if (!last_prune) {
 386                 last_prune = time(NULL);
 387                 return;
 388         }
 389         now = time(NULL);
 390         if (now < last_prune + PURGE_SECS)
 391                 return;
 392
 393         lxcfs_debug("%s\n", "Pruning.");
 394
 395         last_prune = now;
 396         threshold = now - 2 * PURGE_SECS;
 397
 398         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 399                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 400                         if (e->lastcheck < threshold) {
 401
 402                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 403
 404                                 delme = e;
 405                                 if (prev)
 406                                         prev->next = e->next;
 407                                 else
 408                                         pidns_hash_table[i] = e->next;
 409                                 e = e->next;
 410                                 free(delme);
 411                         } else {
 412                                 prev = e;
 413                                 e = e->next;
 414                         }
 415                 }
 416         }
 417 }
 418
 419 /* Must be called under store_lock */
 420 static void save_initpid(struct stat *sb, pid_t pid)
 421 {
 422         struct pidns_init_store *e;
 423         char fpath[100];
 424         struct stat procsb;
 425         int h;
 426
 427         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 428
 429         snprintf(fpath, 100, "/proc/%d", pid);
 430         if (stat(fpath, &procsb) < 0)
 431                 return;
 432         do {
 433                 e = malloc(sizeof(*e));
 434         } while (!e);
 435         e->ino = sb->st_ino;
 436         e->initpid = pid;
 437         e->ctime = procsb.st_ctime;
 438         h = HASH(e->ino);
 439         e->next = pidns_hash_table[h];
 440         e->lastcheck = time(NULL);
 441         pidns_hash_table[h] = e;
 442 }
 443
 444 /*
 445  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 446  * entry for the inode number and creation time.  Verify that the init pid
 447  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 448  * otherwise.
 449  * Must be called under store_lock
 450  */
 451 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 452 {
 453         int h = HASH(sb->st_ino);
 454         struct pidns_init_store *e = pidns_hash_table[h];
 455
 456         while (e) {
 457                 if (e->ino == sb->st_ino) {
 458                         if (initpid_still_valid(e, sb)) {
 459                                 e->lastcheck = time(NULL);
 460                                 return e;
 461                         }
 462                         remove_initpid(e);
 463                         return NULL;
 464                 }
 465                 e = e->next;
 466         }
 467
 468         return NULL;
 469 }
 470
 471 static int is_dir(const char *path, int fd)
 472 {
 473         struct stat statbuf;
 474         int ret = fstatat(fd, path, &statbuf, fd);
 475         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 476                 return 1;
 477         return 0;
 478 }
 479
 480 static char *must_copy_string(const char *str)
 481 {
 482         char *dup = NULL;
 483         if (!str)
 484                 return NULL;
 485         do {
 486                 dup = strdup(str);
 487         } while (!dup);
 488
 489         return dup;
 490 }
 491
 492 static inline void drop_trailing_newlines(char *s)
 493 {
 494         int l;
 495
 496         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 497                 s[l-1] = '\0';
 498 }
 499
 500 #define BATCH_SIZE 50
 501 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 502 {
 503         int newbatches = (newlen / BATCH_SIZE) + 1;
 504         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 505
 506         if (!*mem || newbatches > oldbatches) {
 507                 char *tmp;
 508                 do {
 509                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 510                 } while (!tmp);
 511                 *mem = tmp;
 512         }
 513 }
 514 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 515 {
 516         size_t newlen = *len + linelen;
 517         dorealloc(contents, *len, newlen + 1);
 518         memcpy(*contents + *len, line, linelen+1);
 519         *len = newlen;
 520 }
 521
 522 static char *slurp_file(const char *from, int fd)
 523 {
 524         char *line = NULL;
 525         char *contents = NULL;
 526         FILE *f = fdopen(fd, "r");
 527         size_t len = 0, fulllen = 0;
 528         ssize_t linelen;
 529
 530         if (!f)
 531                 return NULL;
 532
 533         while ((linelen = getline(&line, &len, f)) != -1) {
 534                 append_line(&contents, &fulllen, line, linelen);
 535         }
 536         fclose(f);
 537
 538         if (contents)
 539                 drop_trailing_newlines(contents);
 540         free(line);
 541         return contents;
 542 }
 543
 544 static bool write_string(const char *fnam, const char *string, int fd)
 545 {
 546         FILE *f;
 547         size_t len, ret;
 548
 549         if (!(f = fdopen(fd, "w")))
 550                 return false;
 551         len = strlen(string);
 552         ret = fwrite(string, 1, len, f);
 553         if (ret != len) {
 554                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 555                 fclose(f);
 556                 return false;
 557         }
 558         if (fclose(f) < 0) {
 559                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 560                 return false;
 561         }
 562         return true;
 563 }
 564
 565 struct cgfs_files {
 566         char *name;
 567         uint32_t uid, gid;
 568         uint32_t mode;
 569 };
 570
 571 #define ALLOC_NUM 20
 572 static bool store_hierarchy(char *stridx, char *h)
 573 {
 574         if (num_hierarchies % ALLOC_NUM == 0) {
 575                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 576                 n *= ALLOC_NUM;
 577                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 578                 if (!tmp) {
 579                         lxcfs_error("%s\n", strerror(errno));
 580                         exit(1);
 581                 }
 582                 hierarchies = tmp;
 583         }
 584
 585         hierarchies[num_hierarchies++] = must_copy_string(h);
 586         return true;
 587 }
 588
 589 static void print_subsystems(void)
 590 {
 591         int i;
 592
 593         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 594         fprintf(stderr, "hierarchies:\n");
 595         for (i = 0; i < num_hierarchies; i++) {
 596                 if (hierarchies[i])
 597                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 598                                 fd_hierarchies[i], hierarchies[i]);
 599         }
 600 }
 601
 602 static bool in_comma_list(const char *needle, const char *haystack)
 603 {
 604         const char *s = haystack, *e;
 605         size_t nlen = strlen(needle);
 606
 607         while (*s && (e = strchr(s, ','))) {
 608                 if (nlen != e - s) {
 609                         s = e + 1;
 610                         continue;
 611                 }
 612                 if (strncmp(needle, s, nlen) == 0)
 613                         return true;
 614                 s = e + 1;
 615         }
 616         if (strcmp(needle, s) == 0)
 617                 return true;
 618         return false;
 619 }
 620
 621 /* do we need to do any massaging here?  I'm not sure... */
 622 /* Return the mounted controller and store the corresponding open file descriptor
 623  * referring to the controller mountpoint in the private lxcfs namespace in
 624  * @cfd.
 625  */
 626 static char *find_mounted_controller(const char *controller, int *cfd)
 627 {
 628         int i;
 629
 630         for (i = 0; i < num_hierarchies; i++) {
 631                 if (!hierarchies[i])
 632                         continue;
 633                 if (strcmp(hierarchies[i], controller) == 0) {
 634                         *cfd = fd_hierarchies[i];
 635                         return hierarchies[i];
 636                 }
 637                 if (in_comma_list(controller, hierarchies[i])) {
 638                         *cfd = fd_hierarchies[i];
 639                         return hierarchies[i];
 640                 }
 641         }
 642
 643         return NULL;
 644 }
 645
 646 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 647                 const char *value)
 648 {
 649         int ret, fd, cfd;
 650         size_t len;
 651         char *fnam, *tmpc;
 652
 653         tmpc = find_mounted_controller(controller, &cfd);
 654         if (!tmpc)
 655                 return false;
 656
 657         /* Make sure we pass a relative path to *at() family of functions.
 658          * . + /cgroup + / + file + \0
 659          */
 660         len = strlen(cgroup) + strlen(file) + 3;
 661         fnam = alloca(len);
 662         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 663         if (ret < 0 || (size_t)ret >= len)
 664                 return false;
 665
 666         fd = openat(cfd, fnam, O_WRONLY);
 667         if (fd < 0)
 668                 return false;
 669
 670         return write_string(fnam, value, fd);
 671 }
 672
 673 // Chown all the files in the cgroup directory.  We do this when we create
 674 // a cgroup on behalf of a user.
 675 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 676 {
 677         struct dirent *direntp;
 678         char path[MAXPATHLEN];
 679         size_t len;
 680         DIR *d;
 681         int fd1, ret;
 682
 683         len = strlen(dirname);
 684         if (len >= MAXPATHLEN) {
 685                 lxcfs_error("Pathname too long: %s\n", dirname);
 686                 return;
 687         }
 688
 689         fd1 = openat(fd, dirname, O_DIRECTORY);
 690         if (fd1 < 0)
 691                 return;
 692
 693         d = fdopendir(fd1);
 694         if (!d) {
 695                 lxcfs_error("Failed to open %s\n", dirname);
 696                 return;
 697         }
 698
 699         while ((direntp = readdir(d))) {
 700                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 701                         continue;
 702                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 703                 if (ret < 0 || ret >= MAXPATHLEN) {
 704                         lxcfs_error("Pathname too long under %s\n", dirname);
 705                         continue;
 706                 }
 707                 if (fchownat(fd, path, uid, gid, 0) < 0)
 708                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 709         }
 710         closedir(d);
 711 }
 712
 713 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 714 {
 715         int cfd;
 716         size_t len;
 717         char *dirnam, *tmpc;
 718
 719         tmpc = find_mounted_controller(controller, &cfd);
 720         if (!tmpc)
 721                 return -EINVAL;
 722
 723         /* Make sure we pass a relative path to *at() family of functions.
 724          * . + /cg + \0
 725          */
 726         len = strlen(cg) + 2;
 727         dirnam = alloca(len);
 728         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 729
 730         if (mkdirat(cfd, dirnam, 0755) < 0)
 731                 return -errno;
 732
 733         if (uid == 0 && gid == 0)
 734                 return 0;
 735
 736         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 737                 return -errno;
 738
 739         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 740
 741         return 0;
 742 }
 743
 744 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 745 {
 746         struct dirent *direntp;
 747         DIR *dir;
 748         bool ret = false;
 749         char pathname[MAXPATHLEN];
 750         int dupfd;
 751
 752         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 753         if (dupfd < 0)
 754                 return false;
 755
 756         dir = fdopendir(dupfd);
 757         if (!dir) {
 758                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 759                 close(dupfd);
 760                 return false;
 761         }
 762
 763         while ((direntp = readdir(dir))) {
 764                 struct stat mystat;
 765                 int rc;
 766
 767                 if (!strcmp(direntp->d_name, ".") ||
 768                     !strcmp(direntp->d_name, ".."))
 769                         continue;
 770
 771                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 772                 if (rc < 0 || rc >= MAXPATHLEN) {
 773                         lxcfs_error("%s\n", "Pathname too long.");
 774                         continue;
 775                 }
 776
 777                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 778                 if (rc) {
 779                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 780                         continue;
 781                 }
 782                 if (S_ISDIR(mystat.st_mode))
 783                         if (!recursive_rmdir(pathname, fd, cfd))
 784                                 lxcfs_debug("Error removing %s.\n", pathname);
 785         }
 786
 787         ret = true;
 788         if (closedir(dir) < 0) {
 789                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 790                 ret = false;
 791         }
 792
 793         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 794                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 795                 ret = false;
 796         }
 797
 798         close(dupfd);
 799
 800         return ret;
 801 }
 802
 803 bool cgfs_remove(const char *controller, const char *cg)
 804 {
 805         int fd, cfd;
 806         size_t len;
 807         char *dirnam, *tmpc;
 808         bool bret;
 809
 810         tmpc = find_mounted_controller(controller, &cfd);
 811         if (!tmpc)
 812                 return false;
 813
 814         /* Make sure we pass a relative path to *at() family of functions.
 815          * . +  /cg + \0
 816          */
 817         len = strlen(cg) + 2;
 818         dirnam = alloca(len);
 819         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 820
 821         fd = openat(cfd, dirnam, O_DIRECTORY);
 822         if (fd < 0)
 823                 return false;
 824
 825         bret = recursive_rmdir(dirnam, fd, cfd);
 826         close(fd);
 827         return bret;
 828 }
 829
 830 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 831 {
 832         int cfd;
 833         size_t len;
 834         char *pathname, *tmpc;
 835
 836         tmpc = find_mounted_controller(controller, &cfd);
 837         if (!tmpc)
 838                 return false;
 839
 840         /* Make sure we pass a relative path to *at() family of functions.
 841          * . + /file + \0
 842          */
 843         len = strlen(file) + 2;
 844         pathname = alloca(len);
 845         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 846         if (fchmodat(cfd, pathname, mode, 0) < 0)
 847                 return false;
 848         return true;
 849 }
 850
 851 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 852 {
 853         size_t len;
 854         char *fname;
 855
 856         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 857         fname = alloca(len);
 858         snprintf(fname, len, "%s/tasks", dirname);
 859         if (fchownat(fd, fname, uid, gid, 0) != 0)
 860                 return -errno;
 861         snprintf(fname, len, "%s/cgroup.procs", dirname);
 862         if (fchownat(fd, fname, uid, gid, 0) != 0)
 863                 return -errno;
 864         return 0;
 865 }
 866
 867 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 868 {
 869         int cfd;
 870         size_t len;
 871         char *pathname, *tmpc;
 872
 873         tmpc = find_mounted_controller(controller, &cfd);
 874         if (!tmpc)
 875                 return -EINVAL;
 876
 877         /* Make sure we pass a relative path to *at() family of functions.
 878          * . + /file + \0
 879          */
 880         len = strlen(file) + 2;
 881         pathname = alloca(len);
 882         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 883         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 884                 return -errno;
 885
 886         if (is_dir(pathname, cfd))
 887                 // like cgmanager did, we want to chown the tasks file as well
 888                 return chown_tasks_files(pathname, uid, gid, cfd);
 889
 890         return 0;
 891 }
 892
 893 FILE *open_pids_file(const char *controller, const char *cgroup)
 894 {
 895         int fd, cfd;
 896         size_t len;
 897         char *pathname, *tmpc;
 898
 899         tmpc = find_mounted_controller(controller, &cfd);
 900         if (!tmpc)
 901                 return NULL;
 902
 903         /* Make sure we pass a relative path to *at() family of functions.
 904          * . + /cgroup + / "cgroup.procs" + \0
 905          */
 906         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 907         pathname = alloca(len);
 908         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 909
 910         fd = openat(cfd, pathname, O_WRONLY);
 911         if (fd < 0)
 912                 return NULL;
 913
 914         return fdopen(fd, "w");
 915 }
 916
 917 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 918                                 void ***list, size_t typesize,
 919                                 void* (*iterator)(const char*, const char*, const char*))
 920 {
 921         int cfd, fd, ret;
 922         size_t len;
 923         char *cg, *tmpc;
 924         char pathname[MAXPATHLEN];
 925         size_t sz = 0, asz = 0;
 926         struct dirent *dirent;
 927         DIR *dir;
 928
 929         tmpc = find_mounted_controller(controller, &cfd);
 930         *list = NULL;
 931         if (!tmpc)
 932                 return false;
 933
 934         /* Make sure we pass a relative path to *at() family of functions. */
 935         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 936         cg = alloca(len);
 937         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 938         if (ret < 0 || (size_t)ret >= len) {
 939                 lxcfs_error("Pathname too long under %s\n", cgroup);
 940                 return false;
 941         }
 942
 943         fd = openat(cfd, cg, O_DIRECTORY);
 944         if (fd < 0)
 945                 return false;
 946
 947         dir = fdopendir(fd);
 948         if (!dir)
 949                 return false;
 950
 951         while ((dirent = readdir(dir))) {
 952                 struct stat mystat;
 953
 954                 if (!strcmp(dirent->d_name, ".") ||
 955                     !strcmp(dirent->d_name, ".."))
 956                         continue;
 957
 958                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 959                 if (ret < 0 || ret >= MAXPATHLEN) {
 960                         lxcfs_error("Pathname too long under %s\n", cg);
 961                         continue;
 962                 }
 963
 964                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 965                 if (ret) {
 966                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
 967                         continue;
 968                 }
 969                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
 970                     (directories && !S_ISDIR(mystat.st_mode)))
 971                         continue;
 972
 973                 if (sz+2 >= asz) {
 974                         void **tmp;
 975                         asz += BATCH_SIZE;
 976                         do {
 977                                 tmp = realloc(*list, asz * typesize);
 978                         } while  (!tmp);
 979                         *list = tmp;
 980                 }
 981                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
 982                 (*list)[sz+1] = NULL;
 983                 sz++;
 984         }
 985         if (closedir(dir) < 0) {
 986                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
 987                 return false;
 988         }
 989         return true;
 990 }
 991
 992 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 993 {
 994         char *dup;
 995         do {
 996                 dup = strdup(dir_entry);
 997         } while (!dup);
 998         return dup;
 999 }
1000
1001 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1002 {
1003         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1004 }
1005
1006 void free_key(struct cgfs_files *k)
1007 {
1008         if (!k)
1009                 return;
1010         free(k->name);
1011         free(k);
1012 }
1013
/*
 * Release a NULL-terminated array of keys: every element up to the first
 * NULL is passed to free_key(), then the array itself is freed.
 * Passing NULL is allowed and does nothing.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;

	for (cur = keys; *cur; cur++)
		free_key(*cur);
	free(keys);
}
1025
/*
 * Read the contents of @file inside @cgroup into *@value.
 *
 * The path is resolved relative to the directory fd that
 * find_mounted_controller() returns for @controller; the opened fd is
 * handed to slurp_file(), whose result is stored in *@value.
 *
 * Returns true if slurp_file() produced a non-NULL buffer, false if the
 * controller lookup, path construction, openat() or slurp_file() fails.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int mount_fd, file_fd, n;
	size_t buflen;
	char *relpath;
	const char *dot;

	if (!find_mounted_controller(controller, &mount_fd))
		return false;

	/* Relative path for the *at() calls: . + /cgroup + / + file + \0 */
	dot = (*cgroup == '/') ? "." : "";
	buflen = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(buflen);
	n = snprintf(relpath, buflen, "%s%s/%s", dot, cgroup, file);
	if (n < 0 || (size_t)n >= buflen)
		return false;

	file_fd = openat(mount_fd, relpath, O_RDONLY);
	if (file_fd < 0)
		return false;

	*value = slurp_file(relpath, file_fd);
	return *value != NULL;
}
1052
1053 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1054 {
1055         int ret, cfd;
1056         size_t len;
1057         char *fnam, *tmpc;
1058         struct stat sb;
1059         struct cgfs_files *newkey;
1060
1061         tmpc = find_mounted_controller(controller, &cfd);
1062         if (!tmpc)
1063                 return false;
1064
1065         if (file && *file == '/')
1066                 file++;
1067
1068         if (file && strchr(file, '/'))
1069                 return NULL;
1070
1071         /* Make sure we pass a relative path to *at() family of functions.
1072          * . + /cgroup + / + file + \0
1073          */
1074         len = strlen(cgroup) + 3;
1075         if (file)
1076                 len += strlen(file) + 1;
1077         fnam = alloca(len);
1078         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1079                  file ? "/" : "", file ? file : "");
1080
1081         ret = fstatat(cfd, fnam, &sb, 0);
1082         if (ret < 0)
1083                 return NULL;
1084
1085         do {
1086                 newkey = malloc(sizeof(struct cgfs_files));
1087         } while (!newkey);
1088         if (file)
1089                 newkey->name = must_copy_string(file);
1090         else if (strrchr(cgroup, '/'))
1091                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1092         else
1093                 newkey->name = must_copy_string(cgroup);
1094         newkey->uid = sb.st_uid;
1095         newkey->gid = sb.st_gid;
1096         newkey->mode = sb.st_mode;
1097
1098         return newkey;
1099 }
1100
/*
 * Iterator callback producing one list element for a file: the key that
 * cgfs_get_key() returns for @dir_entry in @cgroup. A failed lookup is
 * logged and NULL is returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		     cgroup);
	return NULL;
}
1110
1111 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1112 {
1113         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1114 }
1115
/*
 * Tell whether @f is a directory inside @cgroup.
 *
 * The path "<cgroup>/<f>" is stat'ed relative to the directory fd that
 * find_mounted_controller() returns for @controller. Returns false if the
 * lookup, the path construction or the stat fails, or if the target is
 * not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int mount_fd, n;
	size_t buflen;
	char *relpath;
	const char *dot;
	struct stat st;

	if (!find_mounted_controller(controller, &mount_fd))
		return false;

	/* Relative path for the *at() calls: . + /cgroup + / + f + \0 */
	dot = (*cgroup == '/') ? "." : "";
	buflen = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(buflen);
	n = snprintf(relpath, buflen, "%s%s/%s", dot, cgroup, f);
	if (n < 0 || (size_t)n >= buflen)
		return false;

	return fstatat(mount_fd, relpath, &st, 0) >= 0 && S_ISDIR(st.st_mode);
}
1143
1144 #define SEND_CREDS_OK 0
1145 #define SEND_CREDS_NOTSK 1
1146 #define SEND_CREDS_FAIL 2
1147 static bool recv_creds(int sock, struct ucred *cred, char *v);
1148 static int wait_for_pid(pid_t pid);
1149 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1150 static int send_creds_clone_wrapper(void *arg);
1151
1152 /*
1153  * clone a task which switches to @task's namespace and writes '1'.
1154  * over a unix sock so we can read the task's reaper's pid in our
1155  * namespace
1156  *
1157  * Note: glibc's fork() does not respect pidns, which can lead to failed
1158  * assertions inside glibc (and thus failed forks) if the child's pid in
1159  * the pidns and the parent pid outside are identical. Using clone prevents
1160  * this issue.
1161  */
1162 static void write_task_init_pid_exit(int sock, pid_t target)
1163 {
1164         char fnam[100];
1165         pid_t pid;
1166         int fd, ret;
1167         size_t stack_size = sysconf(_SC_PAGESIZE);
1168         void *stack = alloca(stack_size);
1169
1170         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1171         if (ret < 0 || ret >= sizeof(fnam))
1172                 _exit(1);
1173
1174         fd = open(fnam, O_RDONLY);
1175         if (fd < 0) {
1176                 perror("write_task_init_pid_exit open of ns/pid");
1177                 _exit(1);
1178         }
1179         if (setns(fd, 0)) {
1180                 perror("write_task_init_pid_exit setns 1");
1181                 close(fd);
1182                 _exit(1);
1183         }
1184         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1185         if (pid < 0)
1186                 _exit(1);
1187         if (pid != 0) {
1188                 if (!wait_for_pid(pid))
1189                         _exit(1);
1190                 _exit(0);
1191         }
1192 }
1193
1194 static int send_creds_clone_wrapper(void *arg) {
1195         struct ucred cred;
1196         char v;
1197         int sock = *(int *)arg;
1198
1199         /* we are the child */
1200         cred.uid = 0;
1201         cred.gid = 0;
1202         cred.pid = 1;
1203         v = '1';
1204         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1205                 return 1;
1206         return 0;
1207 }
1208
1209 static pid_t get_init_pid_for_task(pid_t task)
1210 {
1211         int sock[2];
1212         pid_t pid;
1213         pid_t ret = -1;
1214         char v = '0';
1215         struct ucred cred;
1216
1217         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1218                 perror("socketpair");
1219                 return -1;
1220         }
1221
1222         pid = fork();
1223         if (pid < 0)
1224                 goto out;
1225         if (!pid) {
1226                 close(sock[1]);
1227                 write_task_init_pid_exit(sock[0], task);
1228                 _exit(0);
1229         }
1230
1231         if (!recv_creds(sock[1], &cred, &v))
1232                 goto out;
1233         ret = cred.pid;
1234
1235 out:
1236         close(sock[0]);
1237         close(sock[1]);
1238         if (pid > 0)
1239                 wait_for_pid(pid);
1240         return ret;
1241 }
1242
1243 static pid_t lookup_initpid_in_store(pid_t qpid)
1244 {
1245         pid_t answer = 0;
1246         struct stat sb;
1247         struct pidns_init_store *e;
1248         char fnam[100];
1249
1250         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1251         store_lock();
1252         if (stat(fnam, &sb) < 0)
1253                 goto out;
1254         e = lookup_verify_initpid(&sb);
1255         if (e) {
1256                 answer = e->initpid;
1257                 goto out;
1258         }
1259         answer = get_init_pid_for_task(qpid);
1260         if (answer > 0)
1261                 save_initpid(&sb, answer);
1262
1263 out:
1264         /* we prune at end in case we are returning
1265          * the value we were about to return */
1266         prune_initpid_store();
1267         store_unlock();
1268         return answer;
1269 }
1270
/*
 * Reap child @pid, retrying when waitpid() is interrupted.
 *
 * Returns 0 if the child exited normally with status 0. Returns -1 if
 * @pid is not positive, waitpid() fails for a reason other than EINTR, or
 * the child did not exit cleanly.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1291
1292
1293 /*
1294  * append pid to *src.
1295  * src: a pointer to a char* in which ot append the pid.
1296  * sz: the number of characters printed so far, minus trailing \0.
1297  * asz: the allocated size so far
1298  * pid: the pid to append
1299  */
1300 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1301 {
1302         char tmp[30];
1303
1304         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1305
1306         if (!*src || tmplen + *sz + 1 >= *asz) {
1307                 char *tmp;
1308                 do {
1309                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1310                 } while (!tmp);
1311                 *src = tmp;
1312                 *asz += BUF_RESERVE_SIZE;
1313         }
1314         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1315         *sz += tmplen;
1316 }
1317
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id as mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not parse
 * as three unsigned numbers are ignored. Returns the mapped id, or -1
 * (as unsigned) if no range contains @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		unsigned int ns_base,   // first id of the range in the idfile's namespace
			     host_base, // first id of the range in the caller's namespace
			     range;     // number of ids in this range

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * ids wrapped around - unexpected as this is a
			 * procfile, so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * end wraps, so ns_base + (in_id - host_base) cannot wrap
		 * either.
		 */
		return ns_base + (in_id - host_base);
	}

	// no answer found
	return -1;
}
1361
/*
 * Values for the req_ns_root argument of is_privileged_over(): they
 * specify whether the calling uid is required to be root in its own
 * namespace.
 */
1367 #define NS_ROOT_REQD true
1368 #define NS_ROOT_OPT false
1369
1370 #define PROCLEN 100
1371
1372 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1373 {
1374         char fpath[PROCLEN];
1375         int ret;
1376         bool answer = false;
1377         uid_t nsuid;
1378
1379         if (victim == -1 || uid == -1)
1380                 return false;
1381
1382         /*
1383          * If the request is one not requiring root in the namespace,
1384          * then having the same uid suffices.  (i.e. uid 1000 has write
1385          * access to files owned by uid 1000
1386          */
1387         if (!req_ns_root && uid == victim)
1388                 return true;
1389
1390         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1391         if (ret < 0 || ret >= PROCLEN)
1392                 return false;
1393         FILE *f = fopen(fpath, "r");
1394         if (!f)
1395                 return false;
1396
1397         /* if caller's not root in his namespace, reject */
1398         nsuid = convert_id_to_ns(f, uid);
1399         if (nsuid)
1400                 goto out;
1401
1402         /*
1403          * If victim is not mapped into caller's ns, reject.
1404          * XXX I'm not sure this check is needed given that fuse
1405          * will be sending requests where the vfs has converted
1406          */
1407         nsuid = convert_id_to_ns(f, victim);
1408         if (nsuid == -1)
1409                 goto out;
1410
1411         answer = true;
1412
1413 out:
1414         fclose(f);
1415         return answer;
1416 }
1417
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode (O_RDONLY, O_WRONLY or
 * O_RDWR). Callers shift owner/group bits down into the "other" position
 * before calling. Any other access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	mode_t access = req_mode & O_ACCMODE;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1437
1438
/*
 * Return the path component of @taskcg that follows the prefix @querycg.
 *
 * For a @querycg of "/" or "./" one character of @taskcg is skipped,
 * otherwise strlen(@querycg) + 1 characters; the result is the remainder
 * up to (not including) the next '/'. E.g. taskcg "a/b/c/d" with querycg
 * "a/b" yields "c".
 *
 * @taskcg must be longer than @querycg, otherwise an error is logged and
 * NULL is returned. NULL is also returned if the allocation fails. The
 * result must be freed by the caller.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1464
/* Remove a single trailing '\n' from @x, in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
1471
1472 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1473 {
1474         int cfd;
1475         char fnam[PROCLEN];
1476         FILE *f;
1477         char *answer = NULL;
1478         char *line = NULL;
1479         size_t len = 0;
1480         int ret;
1481         const char *h = find_mounted_controller(contrl, &cfd);
1482         if (!h)
1483                 return NULL;
1484
1485         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1486         if (ret < 0 || ret >= PROCLEN)
1487                 return NULL;
1488         if (!(f = fopen(fnam, "r")))
1489                 return NULL;
1490
1491         while (getline(&line, &len, f) != -1) {
1492                 char *c1, *c2;
1493                 if (!line[0])
1494                         continue;
1495                 c1 = strchr(line, ':');
1496                 if (!c1)
1497                         goto out;
1498                 c1++;
1499                 c2 = strchr(c1, ':');
1500                 if (!c2)
1501                         goto out;
1502                 *c2 = '\0';
1503                 if (strcmp(c1, h) != 0)
1504                         continue;
1505                 c2++;
1506                 stripnewline(c2);
1507                 do {
1508                         answer = strdup(c2);
1509                 } while (!answer);
1510                 break;
1511         }
1512
1513 out:
1514         fclose(f);
1515         free(line);
1516         return answer;
1517 }
1518
1519 /*
1520  * check whether a fuse context may access a cgroup dir or file
1521  *
1522  * If file is not null, it is a cgroup file to check under cg.
1523  * If file is null, then we are checking perms on cg itself.
1524  *
1525  * For files we can check the mode of the list_keys result.
1526  * For cgroups, we must make assumptions based on the files under the
1527  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1528  * yet.
1529  */
1530 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1531 {
1532         struct cgfs_files *k = NULL;
1533         bool ret = false;
1534
1535         k = cgfs_get_key(contrl, cg, file);
1536         if (!k)
1537                 return false;
1538
1539         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1540                 if (perms_include(k->mode >> 6, mode)) {
1541                         ret = true;
1542                         goto out;
1543                 }
1544         }
1545         if (fc->gid == k->gid) {
1546                 if (perms_include(k->mode >> 3, mode)) {
1547                         ret = true;
1548                         goto out;
1549                 }
1550         }
1551         ret = perms_include(k->mode, mode);
1552
1553 out:
1554         free_key(k);
1555         return ret;
1556 }
1557
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg, in place. If @cg consists of
 * nothing but that suffix, the leading '/' is kept so the result is "/".
 * Strings that do not end in the suffix are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + (cg_len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	/* keep the root slash when the suffix is the whole string */
	if (suffix == cg)
		suffix++;
	*suffix = '\0';
}
1575
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * The task's cgroup (with a trailing "/init.scope" pruned) is compared as
 * a prefix against @cg. If the answer is false and @nextcg is not NULL,
 * *@nextcg receives the result of get_next_cgroup_dir(), i.e. the next
 * cgroup directory under @cg, which must be freed by the caller.
 *
 * Returns false as well if the task's cgroup cannot be determined.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool is_ancestor;
	char *task_cg = get_pid_cgroup(pid, contrl);
	const char *cmp;

	if (!task_cg)
		return false;
	prune_init_slice(task_cg);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise a cgroup without leading '/'. In the latter case the
	 * first character of the task's cgroup is skipped for the comparison.
	 * TODO: the reason for this asymmetry was questioned by the original
	 *       author and is still unconfirmed.
	 */
	cmp = task_cg;
	if (*cg != '/' && strncmp(cg, "./", 2) != 0)
		cmp++;

	is_ancestor = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!is_ancestor && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(task_cg);
	return is_ancestor;
}
1618
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is always visible. Otherwise @cg is visible if
 * the task lives in the root cgroup, if @cg equals the task's cgroup, or
 * if one of the two is a path prefix of the other ending on a '/'
 * boundary. Returns false if the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible = false;
	char *raw, *task_cg;
	size_t cg_len, task_len;

	if (!strcmp(cg, "/") || !strcmp(cg, "./"))
		return true;

	raw = get_pid_cgroup(pid, contrl);
	if (!raw)
		return false;
	prune_init_slice(raw);

	/* compare without the first character of the task's cgroup */
	task_cg = raw + 1;
	cg_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. The prefix
		 * tests below cannot cover this since they look for a '/'
		 * boundary, and the only '/' was just skipped.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (cg_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, cg_len) == 0 &&
			  task_cg[cg_len] == '/';
	} else if (cg_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	}

	free(raw);
	return visible;
}
1669
1670 /*
1671  * given /cgroup/freezer/a/b, return "freezer".
1672  * the returned char* should NOT be freed.
1673  */
1674 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1675 {
1676         const char *p1;
1677         char *contr, *slash;
1678
1679         if (strlen(path) < 9) {
1680                 errno = EACCES;
1681                 return NULL;
1682         }
1683         if (*(path + 7) != '/') {
1684                 errno = EINVAL;
1685                 return NULL;
1686         }
1687         p1 = path + 8;
1688         contr = strdupa(p1);
1689         if (!contr) {
1690                 errno = ENOMEM;
1691                 return NULL;
1692         }
1693         slash = strstr(contr, "/");
1694         if (slash)
1695                 *slash = '\0';
1696
1697         int i;
1698         for (i = 0; i < num_hierarchies; i++) {
1699                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1700                         return hierarchies[i];
1701         }
1702         errno = ENOENT;
1703         return NULL;
1704 }
1705
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, with errno cleared. Returns NULL with errno EACCES if @path is
 * shorter than 9 characters, or with errno EINVAL if there is no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strchr(path + 8, '/');
	if (slash) {
		errno = 0;
		return slash + 1;
	}

	errno = EINVAL;
	return NULL;
}
1726
/*
 * split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg, cut off at its last '/'
 * (or the full copy if there is none); it should be freed by the caller.
 * @last receives a pointer to the last '/' inside @cg itself, or NULL if
 * @cg contains no '/'; it should not be freed.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy = NULL;
	char *slash;

	while (!copy)
		copy = strdup(cg);
	*dir = copy;

	slash = strrchr(cg, '/');
	*last = slash;
	if (slash)
		copy[slash - cg] = '\0';
}
1746
1747 /*
1748  * FUSE ops for /cgroup
1749  */
1750
1751 int cg_getattr(const char *path, struct stat *sb)
1752 {
1753         struct timespec now;
1754         struct fuse_context *fc = fuse_get_context();
1755         char * cgdir = NULL;
1756         char *last = NULL, *path1, *path2;
1757         struct cgfs_files *k = NULL;
1758         const char *cgroup;
1759         const char *controller = NULL;
1760         int ret = -ENOENT;
1761
1762
1763         if (!fc)
1764                 return -EIO;
1765
1766         memset(sb, 0, sizeof(struct stat));
1767
1768         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1769                 return -EINVAL;
1770
1771         sb->st_uid = sb->st_gid = 0;
1772         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1773         sb->st_size = 0;
1774
1775         if (strcmp(path, "/cgroup") == 0) {
1776                 sb->st_mode = S_IFDIR | 00755;
1777                 sb->st_nlink = 2;
1778                 return 0;
1779         }
1780
1781         controller = pick_controller_from_path(fc, path);
1782         if (!controller)
1783                 return -errno;
1784         cgroup = find_cgroup_in_path(path);
1785         if (!cgroup) {
1786                 /* this is just /cgroup/controller, return it as a dir */
1787                 sb->st_mode = S_IFDIR | 00755;
1788                 sb->st_nlink = 2;
1789                 return 0;
1790         }
1791
1792         get_cgdir_and_path(cgroup, &cgdir, &last);
1793
1794         if (!last) {
1795                 path1 = "/";
1796                 path2 = cgdir;
1797         } else {
1798                 path1 = cgdir;
1799                 path2 = last;
1800         }
1801
1802         pid_t initpid = lookup_initpid_in_store(fc->pid);
1803         if (initpid <= 0)
1804                 initpid = fc->pid;
1805         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1806          * Then check that caller's cgroup is under path if last is a child
1807          * cgroup, or cgdir if last is a file */
1808
1809         if (is_child_cgroup(controller, path1, path2)) {
1810                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1811                         ret = -ENOENT;
1812                         goto out;
1813                 }
1814                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1815                         /* this is just /cgroup/controller, return it as a dir */
1816                         sb->st_mode = S_IFDIR | 00555;
1817                         sb->st_nlink = 2;
1818                         ret = 0;
1819                         goto out;
1820                 }
1821                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1822                         ret = -EACCES;
1823                         goto out;
1824                 }
1825
1826                 // get uid, gid, from '/tasks' file and make up a mode
1827                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1828                 sb->st_mode = S_IFDIR | 00755;
1829                 k = cgfs_get_key(controller, cgroup, NULL);
1830                 if (!k) {
1831                         sb->st_uid = sb->st_gid = 0;
1832                 } else {
1833                         sb->st_uid = k->uid;
1834                         sb->st_gid = k->gid;
1835                 }
1836                 free_key(k);
1837                 sb->st_nlink = 2;
1838                 ret = 0;
1839                 goto out;
1840         }
1841
1842         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1843                 sb->st_mode = S_IFREG | k->mode;
1844                 sb->st_nlink = 1;
1845                 sb->st_uid = k->uid;
1846                 sb->st_gid = k->gid;
1847                 sb->st_size = 0;
1848                 free_key(k);
1849                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1850                         ret = -ENOENT;
1851                         goto out;
1852                 }
1853                 ret = 0;
1854         }
1855
1856 out:
1857         free(cgdir);
1858         return ret;
1859 }
1860
1861 int cg_opendir(const char *path, struct fuse_file_info *fi)
1862 {
1863         struct fuse_context *fc = fuse_get_context();
1864         const char *cgroup;
1865         struct file_info *dir_info;
1866         char *controller = NULL;
1867
1868         if (!fc)
1869                 return -EIO;
1870
1871         if (strcmp(path, "/cgroup") == 0) {
1872                 cgroup = NULL;
1873                 controller = NULL;
1874         } else {
1875                 // return list of keys for the controller, and list of child cgroups
1876                 controller = pick_controller_from_path(fc, path);
1877                 if (!controller)
1878                         return -errno;
1879
1880                 cgroup = find_cgroup_in_path(path);
1881                 if (!cgroup) {
1882                         /* this is just /cgroup/controller, return its contents */
1883                         cgroup = "/";
1884                 }
1885         }
1886
1887         pid_t initpid = lookup_initpid_in_store(fc->pid);
1888         if (initpid <= 0)
1889                 initpid = fc->pid;
1890         if (cgroup) {
1891                 if (!caller_may_see_dir(initpid, controller, cgroup))
1892                         return -ENOENT;
1893                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1894                         return -EACCES;
1895         }
1896
1897         /* we'll free this at cg_releasedir */
1898         dir_info = malloc(sizeof(*dir_info));
1899         if (!dir_info)
1900                 return -ENOMEM;
1901         dir_info->controller = must_copy_string(controller);
1902         dir_info->cgroup = must_copy_string(cgroup);
1903         dir_info->type = LXC_TYPE_CGDIR;
1904         dir_info->buf = NULL;
1905         dir_info->file = NULL;
1906         dir_info->buflen = 0;
1907
1908         fi->fh = (unsigned long)dir_info;
1909         return 0;
1910 }
1911
1912 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1913                 struct fuse_file_info *fi)
1914 {
1915         struct file_info *d = (struct file_info *)fi->fh;
1916         struct cgfs_files **list = NULL;
1917         int i, ret;
1918         char *nextcg = NULL;
1919         struct fuse_context *fc = fuse_get_context();
1920         char **clist = NULL;
1921
1922         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1923                 return -EIO;
1924
1925         if (d->type != LXC_TYPE_CGDIR) {
1926                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1927                 return -EIO;
1928         }
1929         if (!d->cgroup && !d->controller) {
1930                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1931                 int i;
1932
1933                 for (i = 0;  i < num_hierarchies; i++) {
1934                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1935                                 return -EIO;
1936                         }
1937                 }
1938                 return 0;
1939         }
1940
1941         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1942                 // not a valid cgroup
1943                 ret = -EINVAL;
1944                 goto out;
1945         }
1946
1947         pid_t initpid = lookup_initpid_in_store(fc->pid);
1948         if (initpid <= 0)
1949                 initpid = fc->pid;
1950         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1951                 if (nextcg) {
1952                         ret = filler(buf, nextcg,  NULL, 0);
1953                         free(nextcg);
1954                         if (ret != 0) {
1955                                 ret = -EIO;
1956                                 goto out;
1957                         }
1958                 }
1959                 ret = 0;
1960                 goto out;
1961         }
1962
1963         for (i = 0; list[i]; i++) {
1964                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1965                         ret = -EIO;
1966                         goto out;
1967                 }
1968         }
1969
1970         // now get the list of child cgroups
1971
1972         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1973                 ret = 0;
1974                 goto out;
1975         }
1976         if (clist) {
1977                 for (i = 0; clist[i]; i++) {
1978                         if (filler(buf, clist[i], NULL, 0) != 0) {
1979                                 ret = -EIO;
1980                                 goto out;
1981                         }
1982                 }
1983         }
1984         ret = 0;
1985
1986 out:
1987         free_keys(list);
1988         if (clist) {
1989                 for (i = 0; clist[i]; i++)
1990                         free(clist[i]);
1991                 free(clist);
1992         }
1993         return ret;
1994 }
1995
1996 static void do_release_file_info(struct fuse_file_info *fi)
1997 {
1998         struct file_info *f = (struct file_info *)fi->fh;
1999
2000         if (!f)
2001                 return;
2002
2003         fi->fh = 0;
2004
2005         free(f->controller);
2006         f->controller = NULL;
2007         free(f->cgroup);
2008         f->cgroup = NULL;
2009         free(f->file);
2010         f->file = NULL;
2011         free(f->buf);
2012         f->buf = NULL;
2013         free(f);
2014 }
2015
/*
 * cg_releasedir - FUSE releasedir handler: frees the handle that
 * cg_opendir stored in fi->fh.  Always returns 0; @path is unused.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
2021
2022 int cg_open(const char *path, struct fuse_file_info *fi)
2023 {
2024         const char *cgroup;
2025         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2026         struct cgfs_files *k = NULL;
2027         struct file_info *file_info;
2028         struct fuse_context *fc = fuse_get_context();
2029         int ret;
2030
2031         if (!fc)
2032                 return -EIO;
2033
2034         controller = pick_controller_from_path(fc, path);
2035         if (!controller)
2036                 return -errno;
2037         cgroup = find_cgroup_in_path(path);
2038         if (!cgroup)
2039                 return -errno;
2040
2041         get_cgdir_and_path(cgroup, &cgdir, &last);
2042         if (!last) {
2043                 path1 = "/";
2044                 path2 = cgdir;
2045         } else {
2046                 path1 = cgdir;
2047                 path2 = last;
2048         }
2049
2050         k = cgfs_get_key(controller, path1, path2);
2051         if (!k) {
2052                 ret = -EINVAL;
2053                 goto out;
2054         }
2055         free_key(k);
2056
2057         pid_t initpid = lookup_initpid_in_store(fc->pid);
2058         if (initpid <= 0)
2059                 initpid = fc->pid;
2060         if (!caller_may_see_dir(initpid, controller, path1)) {
2061                 ret = -ENOENT;
2062                 goto out;
2063         }
2064         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2065                 ret = -EACCES;
2066                 goto out;
2067         }
2068
2069         /* we'll free this at cg_release */
2070         file_info = malloc(sizeof(*file_info));
2071         if (!file_info) {
2072                 ret = -ENOMEM;
2073                 goto out;
2074         }
2075         file_info->controller = must_copy_string(controller);
2076         file_info->cgroup = must_copy_string(path1);
2077         file_info->file = must_copy_string(path2);
2078         file_info->type = LXC_TYPE_CGFILE;
2079         file_info->buf = NULL;
2080         file_info->buflen = 0;
2081
2082         fi->fh = (unsigned long)file_info;
2083         ret = 0;
2084
2085 out:
2086         free(cgdir);
2087         return ret;
2088 }
2089
2090 int cg_access(const char *path, int mode)
2091 {
2092         int ret;
2093         const char *cgroup;
2094         char *path1, *path2, *controller;
2095         char *last = NULL, *cgdir = NULL;
2096         struct cgfs_files *k = NULL;
2097         struct fuse_context *fc = fuse_get_context();
2098
2099         if (strcmp(path, "/cgroup") == 0)
2100                 return 0;
2101
2102         if (!fc)
2103                 return -EIO;
2104
2105         controller = pick_controller_from_path(fc, path);
2106         if (!controller)
2107                 return -errno;
2108         cgroup = find_cgroup_in_path(path);
2109         if (!cgroup) {
2110                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2111                 if ((mode & W_OK) == 0)
2112                         return 0;
2113                 return -EACCES;
2114         }
2115
2116         get_cgdir_and_path(cgroup, &cgdir, &last);
2117         if (!last) {
2118                 path1 = "/";
2119                 path2 = cgdir;
2120         } else {
2121                 path1 = cgdir;
2122                 path2 = last;
2123         }
2124
2125         k = cgfs_get_key(controller, path1, path2);
2126         if (!k) {
2127                 if ((mode & W_OK) == 0)
2128                         ret = 0;
2129                 else
2130                         ret = -EACCES;
2131                 goto out;
2132         }
2133         free_key(k);
2134
2135         pid_t initpid = lookup_initpid_in_store(fc->pid);
2136         if (initpid <= 0)
2137                 initpid = fc->pid;
2138         if (!caller_may_see_dir(initpid, controller, path1)) {
2139                 ret = -ENOENT;
2140                 goto out;
2141         }
2142         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2143                 ret = -EACCES;
2144                 goto out;
2145         }
2146
2147         ret = 0;
2148
2149 out:
2150         free(cgdir);
2151         return ret;
2152 }
2153
/*
 * cg_release - FUSE release handler: frees the handle that cg_open stored
 * in fi->fh.  Always returns 0; @path is unused.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
2159
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock - wait until @sock reports one of the POLLIN_SET events or
 * roughly @timeout seconds have passed.
 *
 * A private epoll instance is created for the wait and closed again before
 * returning.  Waits interrupted by a signal (EINTR) are restarted with the
 * remaining time.
 *
 * Returns true when epoll reported an event.  Returns false on timeout
 * (errno is set to 0) or on failure (errno is left as set by the failing
 * call).
 */
static bool wait_for_sock(int sock, int timeout)
{
        struct epoll_event ev;
        int epfd, ret, saved_errno;
        time_t now, starttime, deltatime;

        if ((starttime = time(NULL)) == (time_t)-1)
                return false;

        if ((epfd = epoll_create(1)) < 0) {
                lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
                return false;
        }

        ev.events = POLLIN_SET;
        ev.data.fd = sock;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
                lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
                close(epfd);
                return false;
        }

again:
        if ((now = time(NULL)) == (time_t)-1) {
                close(epfd);
                return false;
        }

        deltatime = (starttime + timeout) - now;
        if (deltatime < 0) { // timeout
                close(epfd);
                errno = 0;
                return false;
        }
        ret = epoll_wait(epfd, &ev, 1, (int)(1000 * deltatime + 1));
        if (ret < 0 && errno == EINTR)
                goto again;
        /* ret == 0 is a plain timeout: report it like the one above */
        saved_errno = (ret < 0) ? errno : 0;
        close(epfd);

        if (ret <= 0) {
                errno = saved_errno;
                return false;
        }
        return true;
}
2207
/*
 * msgrecv - receive up to @len bytes from @sockfd, waiting at most two
 * seconds for data to show up.  Returns -1 when the wait fails, otherwise
 * whatever the non-blocking recv() returns.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
        bool ready = wait_for_sock(sockfd, 2);

        if (ready)
                return recv(sockfd, buf, len, MSG_DONTWAIT);

        return -1;
}
2214
2215 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2216 {
2217         struct msghdr msg = { 0 };
2218         struct iovec iov;
2219         struct cmsghdr *cmsg;
2220         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2221         char buf[1];
2222         buf[0] = 'p';
2223
2224         if (pingfirst) {
2225                 if (msgrecv(sock, buf, 1) != 1) {
2226                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2227                         return SEND_CREDS_FAIL;
2228                 }
2229         }
2230
2231         msg.msg_control = cmsgbuf;
2232         msg.msg_controllen = sizeof(cmsgbuf);
2233
2234         cmsg = CMSG_FIRSTHDR(&msg);
2235         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2236         cmsg->cmsg_level = SOL_SOCKET;
2237         cmsg->cmsg_type = SCM_CREDENTIALS;
2238         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2239
2240         msg.msg_name = NULL;
2241         msg.msg_namelen = 0;
2242
2243         buf[0] = v;
2244         iov.iov_base = buf;
2245         iov.iov_len = sizeof(buf);
2246         msg.msg_iov = &iov;
2247         msg.msg_iovlen = 1;
2248
2249         if (sendmsg(sock, &msg, 0) < 0) {
2250                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2251                 if (errno == 3)
2252                         return SEND_CREDS_NOTSK;
2253                 return SEND_CREDS_FAIL;
2254         }
2255
2256         return SEND_CREDS_OK;
2257 }
2258
/*
 * recv_creds - receive one SCM_CREDENTIALS message plus its payload byte
 * from @sock.
 *
 * SO_PASSCRED is enabled on @sock and a '1' byte is written to the peer
 * before waiting (up to two seconds) for the reply.  @cred starts out with
 * every field set to -1 and is overwritten only if the first control
 * message is a correctly sized SCM_CREDENTIALS; *@v starts out as '1' and
 * receives the payload byte once recvmsg() has succeeded.
 *
 * Returns false if any of the socket operations fails or times out, true
 * otherwise.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
        char ctrl[CMSG_SPACE(sizeof(*cred))];
        char byte[1];
        struct iovec io = { .iov_base = byte, .iov_len = sizeof(byte) };
        struct msghdr hdr = { 0 };
        struct cmsghdr *cm;
        int enable = 1;

        *v = '1';
        cred->pid = -1;
        cred->uid = -1;
        cred->gid = -1;

        if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
                lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
                return false;
        }

        /* the peer waits for this byte before it sends anything */
        byte[0] = '1';
        if (write(sock, byte, 1) != 1) {
                lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
                return false;
        }

        hdr.msg_name = NULL;
        hdr.msg_namelen = 0;
        hdr.msg_iov = &io;
        hdr.msg_iovlen = 1;
        hdr.msg_control = ctrl;
        hdr.msg_controllen = sizeof(ctrl);

        if (!wait_for_sock(sock, 2)) {
                lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
                return false;
        }
        if (recvmsg(sock, &hdr, MSG_DONTWAIT) < 0) {
                lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
                return false;
        }

        cm = CMSG_FIRSTHDR(&hdr);
        if (cm != NULL &&
            cm->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
            cm->cmsg_level == SOL_SOCKET &&
            cm->cmsg_type == SCM_CREDENTIALS)
                memcpy(cred, CMSG_DATA(cm), sizeof(*cred));

        *v = byte[0];
        return true;
}
2316
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
        int *cpipe;                     /* pipe the child writes its ACK to */
        int sock;                       /* first argument for ->wrapped */
        pid_t tpid;                     /* second argument for ->wrapped */
        int (*wrapped)(int, pid_t);     /* pid_from_ns or pid_to_ns */
};
2323
2324 /*
2325  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2326  * with clone(). This simply writes '1' as ACK back to the parent
2327  * before calling the actual wrapped function.
2328  */
2329 static int pid_ns_clone_wrapper(void *arg) {
2330         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2331         char b = '1';
2332
2333         close(args->cpipe[0]);
2334         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2335                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2336         close(args->cpipe[1]);
2337         return args->wrapped(args->sock, args->tpid);
2338 }
2339
/*
 * pid_to_ns - translation loop run inside the target pid namespace.
 *
 * Each round receives a ucred over @sock and writes the received pid back
 * over the same socket, which shifts the pid from the sender's pidns into
 * ours.  The loop ends with 0 when receiving fails or the payload byte is
 * '1', and with 1 when writing the pid back fails.  @tpid is unused.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
        struct ucred cred;
        char v = '0';

        for (;;) {
                if (!recv_creds(sock, &cred, &v) || v == '1')
                        return 0;

                if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
                        return 1;
        }
}
2358
2359
2360 /*
2361  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2362  * in your old pidns.  Only children which you clone will be in the target
2363  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2364  * actually convert pids.
2365  *
2366  * Note: glibc's fork() does not respect pidns, which can lead to failed
2367  * assertions inside glibc (and thus failed forks) if the child's pid in
2368  * the pidns and the parent pid outside are identical. Using clone prevents
2369  * this issue.
2370  */
2371 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2372 {
2373         int newnsfd = -1, ret, cpipe[2];
2374         char fnam[100];
2375         pid_t cpid;
2376         char v;
2377
2378         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2379         if (ret < 0 || ret >= sizeof(fnam))
2380                 _exit(1);
2381         newnsfd = open(fnam, O_RDONLY);
2382         if (newnsfd < 0)
2383                 _exit(1);
2384         if (setns(newnsfd, 0) < 0)
2385                 _exit(1);
2386         close(newnsfd);
2387
2388         if (pipe(cpipe) < 0)
2389                 _exit(1);
2390
2391         struct pid_ns_clone_args args = {
2392                 .cpipe = cpipe,
2393                 .sock = sock,
2394                 .tpid = tpid,
2395                 .wrapped = &pid_to_ns
2396         };
2397         size_t stack_size = sysconf(_SC_PAGESIZE);
2398         void *stack = alloca(stack_size);
2399
2400         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2401         if (cpid < 0)
2402                 _exit(1);
2403
2404         // give the child 1 second to be done forking and
2405         // write its ack
2406         if (!wait_for_sock(cpipe[0], 1))
2407                 _exit(1);
2408         ret = read(cpipe[0], &v, 1);
2409         if (ret != sizeof(char) || v != '1')
2410                 _exit(1);
2411
2412         if (!wait_for_pid(cpid))
2413                 _exit(1);
2414         _exit(0);
2415 }
2416
2417 /*
2418  * To read cgroup files with a particular pid, we will setns into the child
2419  * pidns, open a pipe, fork a child - which will be the first to really be in
2420  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2421  */
2422 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2423 {
2424         int sock[2] = {-1, -1};
2425         char *tmpdata = NULL;
2426         int ret;
2427         pid_t qpid, cpid = -1;
2428         bool answer = false;
2429         char v = '0';
2430         struct ucred cred;
2431         size_t sz = 0, asz = 0;
2432
2433         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2434                 return false;
2435
2436         /*
2437          * Now we read the pids from returned data one by one, pass
2438          * them into a child in the target namespace, read back the
2439          * translated pids, and put them into our to-return data
2440          */
2441
2442         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2443                 perror("socketpair");
2444                 free(tmpdata);
2445                 return false;
2446         }
2447
2448         cpid = fork();
2449         if (cpid == -1)
2450                 goto out;
2451
2452         if (!cpid) // child - exits when done
2453                 pid_to_ns_wrapper(sock[1], tpid);
2454
2455         char *ptr = tmpdata;
2456         cred.uid = 0;
2457         cred.gid = 0;
2458         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2459                 cred.pid = qpid;
2460                 ret = send_creds(sock[0], &cred, v, true);
2461
2462                 if (ret == SEND_CREDS_NOTSK)
2463                         goto next;
2464                 if (ret == SEND_CREDS_FAIL)
2465                         goto out;
2466
2467                 // read converted results
2468                 if (!wait_for_sock(sock[0], 2)) {
2469                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2470                         goto out;
2471                 }
2472                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2473                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2474                         goto out;
2475                 }
2476                 must_strcat_pid(d, &sz, &asz, qpid);
2477 next:
2478                 ptr = strchr(ptr, '\n');
2479                 if (!ptr)
2480                         break;
2481                 ptr++;
2482         }
2483
2484         cred.pid = getpid();
2485         v = '1';
2486         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2487                 // failed to ask child to exit
2488                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2489                 goto out;
2490         }
2491
2492         answer = true;
2493
2494 out:
2495         free(tmpdata);
2496         if (cpid != -1)
2497                 wait_for_pid(cpid);
2498         if (sock[0] != -1) {
2499                 close(sock[0]);
2500                 close(sock[1]);
2501         }
2502         return answer;
2503 }
2504
2505 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2506                 struct fuse_file_info *fi)
2507 {
2508         struct fuse_context *fc = fuse_get_context();
2509         struct file_info *f = (struct file_info *)fi->fh;
2510         struct cgfs_files *k = NULL;
2511         char *data = NULL;
2512         int ret, s;
2513         bool r;
2514
2515         if (f->type != LXC_TYPE_CGFILE) {
2516                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2517                 return -EIO;
2518         }
2519
2520         if (offset)
2521                 return 0;
2522
2523         if (!fc)
2524                 return -EIO;
2525
2526         if (!f->controller)
2527                 return -EINVAL;
2528
2529         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2530                 return -EINVAL;
2531         }
2532         free_key(k);
2533
2534
2535         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2536                 ret = -EACCES;
2537                 goto out;
2538         }
2539
2540         if (strcmp(f->file, "tasks") == 0 ||
2541                         strcmp(f->file, "/tasks") == 0 ||
2542                         strcmp(f->file, "/cgroup.procs") == 0 ||
2543                         strcmp(f->file, "cgroup.procs") == 0)
2544                 // special case - we have to translate the pids
2545                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2546         else
2547                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2548
2549         if (!r) {
2550                 ret = -EINVAL;
2551                 goto out;
2552         }
2553
2554         if (!data) {
2555                 ret = 0;
2556                 goto out;
2557         }
2558         s = strlen(data);
2559         if (s > size)
2560                 s = size;
2561         memcpy(buf, data, s);
2562         if (s > 0 && s < size && data[s-1] != '\n')
2563                 buf[s++] = '\n';
2564
2565         ret = s;
2566
2567 out:
2568         free(data);
2569         return ret;
2570 }
2571
2572 static int pid_from_ns(int sock, pid_t tpid)
2573 {
2574         pid_t vpid;
2575         struct ucred cred;
2576         char v;
2577         int ret;
2578
2579         cred.uid = 0;
2580         cred.gid = 0;
2581         while (1) {
2582                 if (!wait_for_sock(sock, 2)) {
2583                         lxcfs_error("%s\n", "Timeout reading from parent.");
2584                         return 1;
2585                 }
2586                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2587                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2588                         return 1;
2589                 }
2590                 if (vpid == -1) // done
2591                         break;
2592                 v = '0';
2593                 cred.pid = vpid;
2594                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2595                         v = '1';
2596                         cred.pid = getpid();
2597                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2598                                 return 1;
2599                 }
2600         }
2601         return 0;
2602 }
2603
2604 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2605 {
2606         int newnsfd = -1, ret, cpipe[2];
2607         char fnam[100];
2608         pid_t cpid;
2609         char v;
2610
2611         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2612         if (ret < 0 || ret >= sizeof(fnam))
2613                 _exit(1);
2614         newnsfd = open(fnam, O_RDONLY);
2615         if (newnsfd < 0)
2616                 _exit(1);
2617         if (setns(newnsfd, 0) < 0)
2618                 _exit(1);
2619         close(newnsfd);
2620
2621         if (pipe(cpipe) < 0)
2622                 _exit(1);
2623
2624         struct pid_ns_clone_args args = {
2625                 .cpipe = cpipe,
2626                 .sock = sock,
2627                 .tpid = tpid,
2628                 .wrapped = &pid_from_ns
2629         };
2630         size_t stack_size = sysconf(_SC_PAGESIZE);
2631         void *stack = alloca(stack_size);
2632
2633         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2634         if (cpid < 0)
2635                 _exit(1);
2636
2637         // give the child 1 second to be done forking and
2638         // write its ack
2639         if (!wait_for_sock(cpipe[0], 1))
2640                 _exit(1);
2641         ret = read(cpipe[0], &v, 1);
2642         if (ret != sizeof(char) || v != '1')
2643                 _exit(1);
2644
2645         if (!wait_for_pid(cpid))
2646                 _exit(1);
2647         _exit(0);
2648 }
2649
/*
 * hostuid_to_ns - translate host @uid into the user namespace of @pid.
 *
 * Opens /proc/@pid/uid_map and stores the result of convert_id_to_ns() in
 * *@answer.
 *
 * Returns true when a mapping was found.  Returns false when the map file
 * cannot be opened (*@answer is left untouched) or when the conversion
 * yields (uid_t)-1, i.e. no mapping (*@answer is then (uid_t)-1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
        FILE *f;
        char line[400];
        int len;

        len = snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
        if (len < 0 || (size_t)len >= sizeof(line))
                return false;

        if ((f = fopen(line, "r")) == NULL) {
                return false;
        }

        *answer = convert_id_to_ns(f, uid);
        fclose(f);

        return *answer != (uid_t)-1;
}
2671
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/@pid/status
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are preset to -1 and only overwritten by the first number
 * on the "Uid:" and "Gid:" lines.  If the status file cannot be opened, or
 * one of those lines cannot be parsed, an error is logged and whatever has
 * been filled in so far is left in place.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
        char line[400];
        uid_t u;
        gid_t g;
        FILE *f;

        *uid = -1;
        *gid = -1;
        snprintf(line, sizeof(line), "/proc/%d/status", pid);
        if ((f = fopen(line, "r")) == NULL) {
                lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
                return;
        }
        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "Uid:", 4) == 0) {
                        if (sscanf(line+4, "%u", &u) != 1) {
                                lxcfs_error("bad uid line for pid %d\n", pid);
                                fclose(f);
                                return;
                        }
                        *uid = u;
                } else if (strncmp(line, "Gid:", 4) == 0) {
                        if (sscanf(line+4, "%u", &g) != 1) {
                                lxcfs_error("bad gid line for pid %d\n", pid);
                                fclose(f);
                                return;
                        }
                        *gid = g;
                }
        }
        fclose(f);
}
2710
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r's uid maps to root in @r's user namespace and @v's uid is
 *     mapped into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
        uid_t v_uid, mapped;
        gid_t v_gid;

        if (r == v || r_uid == 0)
                return true;

        get_pid_creds(v, &v_uid, &v_gid);
        if (v_uid == r_uid)
                return true;

        /* the requestor has to be root inside its own namespace ... */
        if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
                return false;

        /* ... and the victim's uid has to be visible from there */
        return hostuid_to_ns(v_uid, r, &mapped);
}
2736
/*
 * do_write_pids - move the pids listed in @buf into cgroup @cg.
 *
 * The pids in @buf are expressed in the pid namespace of the writer @tpid.
 * A forked helper enters that namespace (pid_from_ns_wrapper); each pid is
 * written to it over a socketpair and comes back as a ucred, whose pid is
 * what gets written to the file returned by open_pids_file().  Before a pid
 * is written, may_move_pid() must allow @tpid/@tuid to move it; the first
 * refusal stops processing.  A pid for which no '0' reply arrives is
 * skipped.
 *
 * Each pid is flushed to the pids file on its own: the stream is buffered
 * and the pids are printed without a separator, so without the flush
 * several pids would reach the file as one concatenated number.
 *
 * Returns true when no write or permission check failed and the pids file
 * closed cleanly, false otherwise.  @file is unused.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
                const char *file, const char *buf)
{
        int sock[2] = {-1, -1};
        pid_t qpid, cpid = -1;
        FILE *pids_file = NULL;
        bool answer = false, fail = false;

        pids_file = open_pids_file(contrl, cg);
        if (!pids_file)
                return false;

        /*
         * write the pids to a socket, have helper in writer's pidns
         * call movepid for us
         */
        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                goto out;
        }

        cpid = fork();
        if (cpid == -1)
                goto out;

        if (!cpid) { // child
                fclose(pids_file);
                pid_from_ns_wrapper(sock[1], tpid);
        }

        const char *ptr = buf;
        while (sscanf(ptr, "%d", &qpid) == 1) {
                struct ucred cred;
                char v;

                if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
                        lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
                        goto out;
                }

                if (recv_creds(sock[0], &cred, &v)) {
                        if (v == '0') {
                                if (!may_move_pid(tpid, tuid, cred.pid)) {
                                        fail = true;
                                        break;
                                }
                                /* one pid per write(2): flush before the next one */
                                if (fprintf(pids_file, "%d", (int) cred.pid) < 0 ||
                                    fflush(pids_file) != 0)
                                        fail = true;
                        }
                }

                ptr = strchr(ptr, '\n');
                if (!ptr)
                        break;
                ptr++;
        }

        /* Done with the list: -1 asks the helper to exit */
        qpid = -1;
        if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
                lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

        if (!fail)
                answer = true;

out:
        if (cpid != -1)
                wait_for_pid(cpid);
        if (sock[0] != -1) {
                close(sock[0]);
                close(sock[1]);
        }
        if (pids_file) {
                if (fclose(pids_file) != 0)
                        answer = false;
        }
        return answer;
}
2815
2816 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2817              struct fuse_file_info *fi)
2818 {
2819         struct fuse_context *fc = fuse_get_context();
2820         char *localbuf = NULL;
2821         struct cgfs_files *k = NULL;
2822         struct file_info *f = (struct file_info *)fi->fh;
2823         bool r;
2824
2825         if (f->type != LXC_TYPE_CGFILE) {
2826                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2827                 return -EIO;
2828         }
2829
2830         if (offset)
2831                 return 0;
2832
2833         if (!fc)
2834                 return -EIO;
2835
2836         localbuf = alloca(size+1);
2837         localbuf[size] = '\0';
2838         memcpy(localbuf, buf, size);
2839
2840         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2841                 size = -EINVAL;
2842                 goto out;
2843         }
2844
2845         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2846                 size = -EACCES;
2847                 goto out;
2848         }
2849
2850         if (strcmp(f->file, "tasks") == 0 ||
2851                         strcmp(f->file, "/tasks") == 0 ||
2852                         strcmp(f->file, "/cgroup.procs") == 0 ||
2853                         strcmp(f->file, "cgroup.procs") == 0)
2854                 // special case - we have to translate the pids
2855                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2856         else
2857                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2858
2859         if (!r)
2860                 size = -EINVAL;
2861
2862 out:
2863         free_key(k);
2864         return size;
2865 }
2866
2867 int cg_chown(const char *path, uid_t uid, gid_t gid)
2868 {
2869         struct fuse_context *fc = fuse_get_context();
2870         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2871         struct cgfs_files *k = NULL;
2872         const char *cgroup;
2873         int ret;
2874
2875         if (!fc)
2876                 return -EIO;
2877
2878         if (strcmp(path, "/cgroup") == 0)
2879                 return -EPERM;
2880
2881         controller = pick_controller_from_path(fc, path);
2882         if (!controller)
2883                 return errno == ENOENT ? -EPERM : -errno;
2884
2885         cgroup = find_cgroup_in_path(path);
2886         if (!cgroup)
2887                 /* this is just /cgroup/controller */
2888                 return -EPERM;
2889
2890         get_cgdir_and_path(cgroup, &cgdir, &last);
2891
2892         if (!last) {
2893                 path1 = "/";
2894                 path2 = cgdir;
2895         } else {
2896                 path1 = cgdir;
2897                 path2 = last;
2898         }
2899
2900         if (is_child_cgroup(controller, path1, path2)) {
2901                 // get uid, gid, from '/tasks' file and make up a mode
2902                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2903                 k = cgfs_get_key(controller, cgroup, "tasks");
2904
2905         } else
2906                 k = cgfs_get_key(controller, path1, path2);
2907
2908         if (!k) {
2909                 ret = -EINVAL;
2910                 goto out;
2911         }
2912
2913         /*
2914          * This being a fuse request, the uid and gid must be valid
2915          * in the caller's namespace.  So we can just check to make
2916          * sure that the caller is root in his uid, and privileged
2917          * over the file's current owner.
2918          */
2919         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2920                 ret = -EACCES;
2921                 goto out;
2922         }
2923
2924         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2925
2926 out:
2927         free_key(k);
2928         free(cgdir);
2929
2930         return ret;
2931 }
2932
2933 int cg_chmod(const char *path, mode_t mode)
2934 {
2935         struct fuse_context *fc = fuse_get_context();
2936         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2937         struct cgfs_files *k = NULL;
2938         const char *cgroup;
2939         int ret;
2940
2941         if (!fc)
2942                 return -EIO;
2943
2944         if (strcmp(path, "/cgroup") == 0)
2945                 return -EPERM;
2946
2947         controller = pick_controller_from_path(fc, path);
2948         if (!controller)
2949                 return errno == ENOENT ? -EPERM : -errno;
2950
2951         cgroup = find_cgroup_in_path(path);
2952         if (!cgroup)
2953                 /* this is just /cgroup/controller */
2954                 return -EPERM;
2955
2956         get_cgdir_and_path(cgroup, &cgdir, &last);
2957
2958         if (!last) {
2959                 path1 = "/";
2960                 path2 = cgdir;
2961         } else {
2962                 path1 = cgdir;
2963                 path2 = last;
2964         }
2965
2966         if (is_child_cgroup(controller, path1, path2)) {
2967                 // get uid, gid, from '/tasks' file and make up a mode
2968                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2969                 k = cgfs_get_key(controller, cgroup, "tasks");
2970
2971         } else
2972                 k = cgfs_get_key(controller, path1, path2);
2973
2974         if (!k) {
2975                 ret = -EINVAL;
2976                 goto out;
2977         }
2978
2979         /*
2980          * This being a fuse request, the uid and gid must be valid
2981          * in the caller's namespace.  So we can just check to make
2982          * sure that the caller is root in his uid, and privileged
2983          * over the file's current owner.
2984          */
2985         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2986                 ret = -EPERM;
2987                 goto out;
2988         }
2989
2990         if (!cgfs_chmod_file(controller, cgroup, mode)) {
2991                 ret = -EINVAL;
2992                 goto out;
2993         }
2994
2995         ret = 0;
2996 out:
2997         free_key(k);
2998         free(cgdir);
2999         return ret;
3000 }
3001
3002 int cg_mkdir(const char *path, mode_t mode)
3003 {
3004         struct fuse_context *fc = fuse_get_context();
3005         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3006         const char *cgroup;
3007         int ret;
3008
3009         if (!fc)
3010                 return -EIO;
3011
3012         controller = pick_controller_from_path(fc, path);
3013         if (!controller)
3014                 return errno == ENOENT ? -EPERM : -errno;
3015
3016         cgroup = find_cgroup_in_path(path);
3017         if (!cgroup)
3018                 return -errno;
3019
3020         get_cgdir_and_path(cgroup, &cgdir, &last);
3021         if (!last)
3022                 path1 = "/";
3023         else
3024                 path1 = cgdir;
3025
3026         pid_t initpid = lookup_initpid_in_store(fc->pid);
3027         if (initpid <= 0)
3028                 initpid = fc->pid;
3029         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3030                 if (!next)
3031                         ret = -EINVAL;
3032                 else if (last && strcmp(next, last) == 0)
3033                         ret = -EEXIST;
3034                 else
3035                         ret = -EPERM;
3036                 goto out;
3037         }
3038
3039         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3040                 ret = -EACCES;
3041                 goto out;
3042         }
3043         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3044                 ret = -EACCES;
3045                 goto out;
3046         }
3047
3048         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3049
3050 out:
3051         free(cgdir);
3052         free(next);
3053         return ret;
3054 }
3055
3056 int cg_rmdir(const char *path)
3057 {
3058         struct fuse_context *fc = fuse_get_context();
3059         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3060         const char *cgroup;
3061         int ret;
3062
3063         if (!fc)
3064                 return -EIO;
3065
3066         controller = pick_controller_from_path(fc, path);
3067         if (!controller) /* Someone's trying to delete "/cgroup". */
3068                 return -EPERM;
3069
3070         cgroup = find_cgroup_in_path(path);
3071         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3072                 return -EPERM;
3073
3074         get_cgdir_and_path(cgroup, &cgdir, &last);
3075         if (!last) {
3076                 /* Someone's trying to delete a cgroup on the same level as the
3077                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3078                  * rmdir "/cgroup/blkio/init.slice".
3079                  */
3080                 ret = -EPERM;
3081                 goto out;
3082         }
3083
3084         pid_t initpid = lookup_initpid_in_store(fc->pid);
3085         if (initpid <= 0)
3086                 initpid = fc->pid;
3087         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3088                 if (!last || (next && (strcmp(next, last) == 0)))
3089                         ret = -EBUSY;
3090                 else
3091                         ret = -ENOENT;
3092                 goto out;
3093         }
3094
3095         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3096                 ret = -EACCES;
3097                 goto out;
3098         }
3099         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3100                 ret = -EACCES;
3101                 goto out;
3102         }
3103
3104         if (!cgfs_remove(controller, cgroup)) {
3105                 ret = -EINVAL;
3106                 goto out;
3107         }
3108
3109         ret = 0;
3110
3111 out:
3112         free(cgdir);
3113         free(next);
3114         return ret;
3115 }
3116
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
        return strncmp(line, pref, strlen(pref)) == 0;
}
3123
/*
 * Pick the hierarchical ("total_*") counters out of @memstat, a
 * newline-separated list of "<key> <value>" lines.
 *
 * For every line that starts with one of the known keys the number following
 * the key is scanned into the matching output and then divided by 1024.
 * Outputs whose key never appears are left untouched, so callers should
 * initialise them.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable)
{
        const struct {
                const char *key;
                unsigned long *dest;
        } fields[] = {
                { "total_cache",         cached        },
                { "total_active_anon",   active_anon   },
                { "total_inactive_anon", inactive_anon },
                { "total_active_file",   active_file   },
                { "total_inactive_file", inactive_file },
                { "total_unevictable",   unevictable   },
        };
        const size_t nfields = sizeof(fields) / sizeof(fields[0]);
        char *cur = memstat;

        while (*cur) {
                for (size_t i = 0; i < nfields; i++) {
                        if (!startswith(cur, fields[i].key))
                                continue;
                        /* The value starts right after the key. */
                        sscanf(cur + strlen(fields[i].key), "%lu", fields[i].dest);
                        *fields[i].dest /= 1024;
                        break;
                }

                /* Advance to the next line, if there is one. */
                cur = strchr(cur, '\n');
                if (!cur)
                        return;
                cur++;
        }
}
3157
/*
 * Look up one per-device counter in a blkio statistics dump.
 *
 * @str is scanned line by line for a line starting with
 * "<major>:<minor> <iotype>"; the number following that prefix is stored in
 * @v. If no line matches, @v is set to 0.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32] = { 0 };
        size_t keylen;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        keylen = strlen(key);

        *v = 0;

        for (char *cur = str; *cur; cur++) {
                if (startswith(cur, key)) {
                        sscanf(cur + keylen, "%lu", v);
                        return;
                }

                /* Jump to the newline; the loop increment steps past it. */
                cur = strchr(cur, '\n');
                if (!cur)
                        return;
        }
}
3180
3181 static int read_file(const char *path, char *buf, size_t size,
3182                      struct file_info *d)
3183 {
3184         size_t linelen = 0, total_len = 0, rv = 0;
3185         char *line = NULL;
3186         char *cache = d->buf;
3187         size_t cache_size = d->buflen;
3188         FILE *f = fopen(path, "r");
3189         if (!f)
3190                 return 0;
3191
3192         while (getline(&line, &linelen, f) != -1) {
3193                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3194                 if (l < 0) {
3195                         perror("Error writing to cache");
3196                         rv = 0;
3197                         goto err;
3198                 }
3199                 if (l >= cache_size) {
3200                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3201                         rv = 0;
3202                         goto err;
3203                 }
3204                 cache += l;
3205                 cache_size -= l;
3206                 total_len += l;
3207         }
3208
3209         d->size = total_len;
3210         if (total_len > size)
3211                 total_len = size;
3212
3213         /* read from off 0 */
3214         memcpy(buf, d->buf, total_len);
3215         rv = total_len;
3216   err:
3217         fclose(f);
3218         free(line);
3219         return rv;
3220 }
3221
3222 /*
3223  * FUSE ops for /proc
3224  */
3225
/*
 * Read @file of @cgroup from the "memory" controller and parse it as a
 * base-10 unsigned long. Returns (unsigned long)-1 when the value cannot be
 * read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
        char *raw = NULL;
        unsigned long limit;

        limit = cgfs_get_value("memory", cgroup, file, &raw)
                ? strtoul(raw, NULL, 10)
                : (unsigned long)-1;

        free(raw);

        return limit;
}
3238
/*
 * Return the smallest value of @file found in @cgroup or any of its
 * ancestors in the "memory" controller.
 *
 * Starts with the value of @cgroup itself (which is (unsigned long)-1 when
 * it cannot be read) and walks upwards with dirname(), keeping the minimum
 * of all readable values.
 *
 * The walk ends at "/". It also ends at ".", which is where dirname()
 * converges for a path without a leading slash (or an empty path); without
 * that second stop condition such input would loop forever.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
        /* dirname() may modify its argument, so work on a stack copy. */
        char *copy = strdupa(cgroup);
        unsigned long memlimit = 0, retlimit;

        retlimit = get_memlimit(copy, file);

        while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
                copy = dirname(copy);
                memlimit = get_memlimit(copy, file);
                /* (unsigned long)-1 marks an unreadable value; skip it. */
                if (memlimit != (unsigned long)-1 && memlimit < retlimit)
                        retlimit = memlimit;
        }

        return retlimit;
}
3255
3256 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3257                 struct fuse_file_info *fi)
3258 {
3259         struct fuse_context *fc = fuse_get_context();
3260         struct file_info *d = (struct file_info *)fi->fh;
3261         char *cg;
3262         char *memusage_str = NULL, *memstat_str = NULL,
3263                 *memswlimit_str = NULL, *memswusage_str = NULL;
3264         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3265                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3266                 active_file = 0, inactive_file = 0, unevictable = 0,
3267                 hostswtotal = 0;
3268         char *line = NULL;
3269         size_t linelen = 0, total_len = 0, rv = 0;
3270         char *cache = d->buf;
3271         size_t cache_size = d->buflen;
3272         FILE *f = NULL;
3273
3274         if (offset){
3275                 if (offset > d->size)
3276                         return -EINVAL;
3277                 if (!d->cached)
3278                         return 0;
3279                 int left = d->size - offset;
3280                 total_len = left > size ? size: left;
3281                 memcpy(buf, cache + offset, total_len);
3282                 return total_len;
3283         }
3284
3285         pid_t initpid = lookup_initpid_in_store(fc->pid);
3286         if (initpid <= 0)
3287                 initpid = fc->pid;
3288         cg = get_pid_cgroup(initpid, "memory");
3289         if (!cg)
3290                 return read_file("/proc/meminfo", buf, size, d);
3291         prune_init_slice(cg);
3292
3293         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3294         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3295                 goto err;
3296         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3297                 goto err;
3298
3299         // Following values are allowed to fail, because swapaccount might be turned
3300         // off for current kernel
3301         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3302                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3303         {
3304                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3305                 memswusage = strtoul(memswusage_str, NULL, 10);
3306
3307                 memswlimit = memswlimit / 1024;
3308                 memswusage = memswusage / 1024;
3309         }
3310
3311         memusage = strtoul(memusage_str, NULL, 10);
3312         memlimit /= 1024;
3313         memusage /= 1024;
3314
3315         parse_memstat(memstat_str, &cached, &active_anon,
3316                         &inactive_anon, &active_file, &inactive_file,
3317                         &unevictable);
3318
3319         f = fopen("/proc/meminfo", "r");
3320         if (!f)
3321                 goto err;
3322
3323         while (getline(&line, &linelen, f) != -1) {
3324                 ssize_t l;
3325                 char *printme, lbuf[100];
3326
3327                 memset(lbuf, 0, 100);
3328                 if (startswith(line, "MemTotal:")) {
3329                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3330                         if (hosttotal < memlimit)
3331                                 memlimit = hosttotal;
3332                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3333                         printme = lbuf;
3334                 } else if (startswith(line, "MemFree:")) {
3335                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3336                         printme = lbuf;
3337                 } else if (startswith(line, "MemAvailable:")) {
3338                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3339                         printme = lbuf;
3340                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3341                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3342                         if (hostswtotal < memswlimit)
3343                                 memswlimit = hostswtotal;
3344                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3345                         printme = lbuf;
3346                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3347                         unsigned long swaptotal = memswlimit,
3348                                         swapusage = memswusage - memusage,
3349                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3350                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3351                         printme = lbuf;
3352                 } else if (startswith(line, "Slab:")) {
3353                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3354                         printme = lbuf;
3355                 } else if (startswith(line, "Buffers:")) {
3356                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3357                         printme = lbuf;
3358                 } else if (startswith(line, "Cached:")) {
3359                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3360                         printme = lbuf;
3361                 } else if (startswith(line, "SwapCached:")) {
3362                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3363                         printme = lbuf;
3364                 } else if (startswith(line, "Active:")) {
3365                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3366                                         active_anon + active_file);
3367                         printme = lbuf;
3368                 } else if (startswith(line, "Inactive:")) {
3369                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3370                                         inactive_anon + inactive_file);
3371                         printme = lbuf;
3372                 } else if (startswith(line, "Active(anon)")) {
3373                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3374                         printme = lbuf;
3375                 } else if (startswith(line, "Inactive(anon)")) {
3376                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3377                         printme = lbuf;
3378                 } else if (startswith(line, "Active(file)")) {
3379                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3380                         printme = lbuf;
3381                 } else if (startswith(line, "Inactive(file)")) {
3382                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3383                         printme = lbuf;
3384                 } else if (startswith(line, "Unevictable")) {
3385                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3386                         printme = lbuf;
3387                 } else if (startswith(line, "SReclaimable")) {
3388                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3389                         printme = lbuf;
3390                 } else if (startswith(line, "SUnreclaim")) {
3391                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3392                         printme = lbuf;
3393                 } else
3394                         printme = line;
3395
3396                 l = snprintf(cache, cache_size, "%s", printme);
3397                 if (l < 0) {
3398                         perror("Error writing to cache");
3399                         rv = 0;
3400                         goto err;
3401
3402                 }
3403                 if (l >= cache_size) {
3404                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3405                         rv = 0;
3406                         goto err;
3407                 }
3408
3409                 cache += l;
3410                 cache_size -= l;
3411                 total_len += l;
3412         }
3413
3414         d->cached = 1;
3415         d->size = total_len;
3416         if (total_len > size ) total_len = size;
3417         memcpy(buf, d->buf, total_len);
3418
3419         rv = total_len;
3420 err:
3421         if (f)
3422                 fclose(f);
3423         free(line);
3424         free(cg);
3425         free(memusage_str);
3426         free(memswlimit_str);
3427         free(memswusage_str);
3428         free(memstat_str);
3429         return rv;
3430 }
3431
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg from the cpuset controller.
 * On success the string filled in by cgfs_get_value() is returned and must
 * be freed by the caller; on failure NULL is returned.
 */
static char *get_cpuset(const char *cg)
{
        char *cpus = NULL;

        return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3444
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line and cpu N is reported by
 * cpu_in_cpuset() as a member of @cpuset. Lines of any other form yield
 * false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int cpu;

        return sscanf(line, "processor       : %d", &cpu) == 1 &&
               cpu_in_cpuset(cpu, cpuset);
}
3455
/*
 * Check whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. the word "processor", a colon and a decimal number, with arbitrary
 * whitespace in between.
 */
static bool is_processor_line(const char *line)
{
        int ignored;

        return sscanf(line, "processor       : %d", &ignored) == 1;
}
3467
3468 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3469                 struct fuse_file_info *fi)
3470 {
3471         struct fuse_context *fc = fuse_get_context();
3472         struct file_info *d = (struct file_info *)fi->fh;
3473         char *cg;
3474         char *cpuset = NULL;
3475         char *line = NULL;
3476         size_t linelen = 0, total_len = 0, rv = 0;
3477         bool am_printing = false, firstline = true, is_s390x = false;
3478         int curcpu = -1, cpu;
3479         char *cache = d->buf;
3480         size_t cache_size = d->buflen;
3481         FILE *f = NULL;
3482
3483         if (offset){
3484                 if (offset > d->size)
3485                         return -EINVAL;
3486                 if (!d->cached)
3487                         return 0;
3488                 int left = d->size - offset;
3489                 total_len = left > size ? size: left;
3490                 memcpy(buf, cache + offset, total_len);
3491                 return total_len;
3492         }
3493
3494         pid_t initpid = lookup_initpid_in_store(fc->pid);
3495         if (initpid <= 0)
3496                 initpid = fc->pid;
3497         cg = get_pid_cgroup(initpid, "cpuset");
3498         if (!cg)
3499                 return read_file("proc/cpuinfo", buf, size, d);
3500         prune_init_slice(cg);
3501
3502         cpuset = get_cpuset(cg);
3503         if (!cpuset)
3504                 goto err;
3505
3506         f = fopen("/proc/cpuinfo", "r");
3507         if (!f)
3508                 goto err;
3509
3510         while (getline(&line, &linelen, f) != -1) {
3511                 ssize_t l;
3512                 if (firstline) {
3513                         firstline = false;
3514                         if (strstr(line, "IBM/S390") != NULL) {
3515                                 is_s390x = true;
3516                                 am_printing = true;
3517                                 continue;
3518                         }
3519                 }
3520                 if (strncmp(line, "# processors:", 12) == 0)
3521                         continue;
3522                 if (is_processor_line(line)) {
3523                         am_printing = cpuline_in_cpuset(line, cpuset);
3524                         if (am_printing) {
3525                                 curcpu ++;
3526                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3527                                 if (l < 0) {
3528                                         perror("Error writing to cache");
3529                                         rv = 0;
3530                                         goto err;
3531                                 }
3532                                 if (l >= cache_size) {
3533                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3534                                         rv = 0;
3535                                         goto err;
3536                                 }
3537                                 cache += l;
3538                                 cache_size -= l;
3539                                 total_len += l;
3540                         }
3541                         continue;
3542                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3543                         char *p;
3544                         if (!cpu_in_cpuset(cpu, cpuset))
3545                                 continue;
3546                         curcpu ++;
3547                         p = strchr(line, ':');
3548                         if (!p || !*p)
3549                                 goto err;
3550                         p++;
3551                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3552                         if (l < 0) {
3553                                 perror("Error writing to cache");
3554                                 rv = 0;
3555                                 goto err;
3556                         }
3557                         if (l >= cache_size) {
3558                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3559                                 rv = 0;
3560                                 goto err;
3561                         }
3562                         cache += l;
3563                         cache_size -= l;
3564                         total_len += l;
3565                         continue;
3566
3567                 }
3568                 if (am_printing) {
3569                         l = snprintf(cache, cache_size, "%s", line);
3570                         if (l < 0) {
3571                                 perror("Error writing to cache");
3572                                 rv = 0;
3573                                 goto err;
3574                         }
3575                         if (l >= cache_size) {
3576                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3577                                 rv = 0;
3578                                 goto err;
3579                         }
3580                         cache += l;
3581                         cache_size -= l;
3582                         total_len += l;
3583                 }
3584         }
3585
3586         if (is_s390x) {
3587                 char *origcache = d->buf;
3588                 ssize_t l;
3589                 do {
3590                         d->buf = malloc(d->buflen);
3591                 } while (!d->buf);
3592                 cache = d->buf;
3593                 cache_size = d->buflen;
3594                 total_len = 0;
3595                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3596                 if (l < 0 || l >= cache_size) {
3597                         free(origcache);
3598                         goto err;
3599                 }
3600                 cache_size -= l;
3601                 cache += l;
3602                 total_len += l;
3603                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3604                 if (l < 0 || l >= cache_size) {
3605                         free(origcache);
3606                         goto err;
3607                 }
3608                 cache_size -= l;
3609                 cache += l;
3610                 total_len += l;
3611                 l = snprintf(cache, cache_size, "%s", origcache);
3612                 free(origcache);
3613                 if (l < 0 || l >= cache_size)
3614                         goto err;
3615                 total_len += l;
3616         }
3617
3618         d->cached = 1;
3619         d->size = total_len;
3620         if (total_len > size ) total_len = size;
3621
3622         /* read from off 0 */
3623         memcpy(buf, d->buf, total_len);
3624         rv = total_len;
3625 err:
3626         if (f)
3627                 fclose(f);
3628         free(line);
3629         free(cpuset);
3630         free(cg);
3631         return rv;
3632 }
3633
3634 static uint64_t get_reaper_start_time(pid_t pid)
3635 {
3636         int ret;
3637         FILE *f;
3638         uint64_t starttime;
3639         /* strlen("/proc/") = 6
3640          * +
3641          * LXCFS_NUMSTRLEN64
3642          * +
3643          * strlen("/stat") = 5
3644          * +
3645          * \0 = 1
3646          * */
3647 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3648         char path[__PROC_PID_STAT_LEN];
3649         pid_t qpid;
3650
3651         qpid = lookup_initpid_in_store(pid);
3652         if (qpid <= 0) {
3653                 /* Caller can check for EINVAL on 0. */
3654                 errno = EINVAL;
3655                 return 0;
3656         }
3657
3658         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3659         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3660                 /* Caller can check for EINVAL on 0. */
3661                 errno = EINVAL;
3662                 return 0;
3663         }
3664
3665         f = fopen(path, "r");
3666         if (!f) {
3667                 /* Caller can check for EINVAL on 0. */
3668                 errno = EINVAL;
3669                 return 0;
3670         }
3671
3672         /* Note that the *scanf() argument supression requires that length
3673          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3674          * at us. It's like telling someone you're not married and then asking
3675          * if you can bring your wife to the party.
3676          */
3677         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3678                         "%*s "      /* (2)  comm        %s   */
3679                         "%*c "      /* (3)  state       %c   */
3680                         "%*d "      /* (4)  ppid        %d   */
3681                         "%*d "      /* (5)  pgrp        %d   */
3682                         "%*d "      /* (6)  session     %d   */
3683                         "%*d "      /* (7)  tty_nr      %d   */
3684                         "%*d "      /* (8)  tpgid       %d   */
3685                         "%*u "      /* (9)  flags       %u   */
3686                         "%*u "      /* (10) minflt      %lu  */
3687                         "%*u "      /* (11) cminflt     %lu  */
3688                         "%*u "      /* (12) majflt      %lu  */
3689                         "%*u "      /* (13) cmajflt     %lu  */
3690                         "%*u "      /* (14) utime       %lu  */
3691                         "%*u "      /* (15) stime       %lu  */
3692                         "%*d "      /* (16) cutime      %ld  */
3693                         "%*d "      /* (17) cstime      %ld  */
3694                         "%*d "      /* (18) priority    %ld  */
3695                         "%*d "      /* (19) nice        %ld  */
3696                         "%*d "      /* (20) num_threads %ld  */
3697                         "%*d "      /* (21) itrealvalue %ld  */
3698                         "%" PRIu64, /* (22) starttime   %llu */
3699                      &starttime);
3700         if (ret != 1) {
3701                 fclose(f);
3702                 /* Caller can check for EINVAL on 0. */
3703                 errno = EINVAL;
3704                 return 0;
3705         }
3706
3707         fclose(f);
3708
3709         errno = 0;
3710         return starttime;
3711 }
3712
/*
 * Convert the value returned by get_reaper_start_time() for @pid from clock
 * ticks to seconds by dividing it by sysconf(_SC_CLK_TCK).
 *
 * Returns 0 if the start time could not be read or if the tick rate could not
 * be determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	ticks_per_sec = sysconf(_SC_CLK_TCK);
	/* Reject every unusable value, not only "-1 with errno == EINVAL":
	 * dividing by zero is undefined and a negative divisor would be
	 * converted to a huge unsigned number.
	 */
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3734
3735 static uint64_t get_reaper_age(pid_t pid)
3736 {
3737         uint64_t procstart, uptime, procage;
3738
3739         /* We need to substract the time the process has started since system
3740          * boot minus the time when the system has started to get the actual
3741          * reaper age.
3742          */
3743         procstart = get_reaper_start_time_in_sec(pid);
3744         procage = procstart;
3745         if (procstart > 0) {
3746                 int ret;
3747                 struct timespec spec;
3748
3749                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3750                 if (ret < 0)
3751                         return 0;
3752                 /* We could make this more precise here by using the tv_nsec
3753                  * field in the timespec struct and convert it to milliseconds
3754                  * and then create a double for the seconds and milliseconds but
3755                  * that seems more work than it is worth.
3756                  */
3757                 uptime = spec.tv_sec;
3758                 procage = uptime - procstart;
3759         }
3760
3761         return procage;
3762 }
3763
3764 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3765 static int proc_stat_read(char *buf, size_t size, off_t offset,
3766                 struct fuse_file_info *fi)
3767 {
3768         struct fuse_context *fc = fuse_get_context();
3769         struct file_info *d = (struct file_info *)fi->fh;
3770         char *cg;
3771         char *cpuset = NULL;
3772         char *line = NULL;
3773         size_t linelen = 0, total_len = 0, rv = 0;
3774         int curcpu = -1; /* cpu numbering starts at 0 */
3775         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3776         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3777                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3778         char cpuall[CPUALL_MAX_SIZE];
3779         /* reserve for cpu all */
3780         char *cache = d->buf + CPUALL_MAX_SIZE;
3781         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3782         FILE *f = NULL;
3783
3784         if (offset){
3785                 if (offset > d->size)
3786                         return -EINVAL;
3787                 if (!d->cached)
3788                         return 0;
3789                 int left = d->size - offset;
3790                 total_len = left > size ? size: left;
3791                 memcpy(buf, d->buf + offset, total_len);
3792                 return total_len;
3793         }
3794
3795         pid_t initpid = lookup_initpid_in_store(fc->pid);
3796         if (initpid <= 0)
3797                 initpid = fc->pid;
3798         cg = get_pid_cgroup(initpid, "cpuset");
3799         if (!cg)
3800                 return read_file("/proc/stat", buf, size, d);
3801         prune_init_slice(cg);
3802
3803         cpuset = get_cpuset(cg);
3804         if (!cpuset)
3805                 goto err;
3806
3807         f = fopen("/proc/stat", "r");
3808         if (!f)
3809                 goto err;
3810
3811         //skip first line
3812         if (getline(&line, &linelen, f) < 0) {
3813                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3814                 goto err;
3815         }
3816
3817         while (getline(&line, &linelen, f) != -1) {
3818                 ssize_t l;
3819                 int cpu;
3820                 char cpu_char[10]; /* That's a lot of cores */
3821                 char *c;
3822
3823                 if (strlen(line) == 0)
3824                         continue;
3825                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3826                         /* not a ^cpuN line containing a number N, just print it */
3827                         l = snprintf(cache, cache_size, "%s", line);
3828                         if (l < 0) {
3829                                 perror("Error writing to cache");
3830                                 rv = 0;
3831                                 goto err;
3832                         }
3833                         if (l >= cache_size) {
3834                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3835                                 rv = 0;
3836                                 goto err;
3837                         }
3838                         cache += l;
3839                         cache_size -= l;
3840                         total_len += l;
3841                         continue;
3842                 }
3843
3844                 if (sscanf(cpu_char, "%d", &cpu) != 1)
3845                         continue;
3846                 if (!cpu_in_cpuset(cpu, cpuset))
3847                         continue;
3848                 curcpu ++;
3849
3850                 c = strchr(line, ' ');
3851                 if (!c)
3852                         continue;
3853                 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3854                 if (l < 0) {
3855                         perror("Error writing to cache");
3856                         rv = 0;
3857                         goto err;
3858
3859                 }
3860                 if (l >= cache_size) {
3861                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3862                         rv = 0;
3863                         goto err;
3864                 }
3865
3866                 cache += l;
3867                 cache_size -= l;
3868                 total_len += l;
3869
3870                 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3871                            &user,
3872                            &nice,
3873                            &system,
3874                            &idle,
3875                            &iowait,
3876                            &irq,
3877                            &softirq,
3878                            &steal,
3879                            &guest,
3880                            &guest_nice) != 10)
3881                         continue;
3882                 user_sum += user;
3883                 nice_sum += nice;
3884                 system_sum += system;
3885                 idle_sum += idle;
3886                 iowait_sum += iowait;
3887                 irq_sum += irq;
3888                 softirq_sum += softirq;
3889                 steal_sum += steal;
3890                 guest_sum += guest;
3891                 guest_nice_sum += guest_nice;
3892         }
3893
3894         cache = d->buf;
3895
3896         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3897                         user_sum,
3898                         nice_sum,
3899                         system_sum,
3900                         idle_sum,
3901                         iowait_sum,
3902                         irq_sum,
3903                         softirq_sum,
3904                         steal_sum,
3905                         guest_sum,
3906                         guest_nice_sum);
3907         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3908                 memcpy(cache, cpuall, cpuall_len);
3909                 cache += cpuall_len;
3910         } else {
3911                 /* shouldn't happen */
3912                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3913                 cpuall_len = 0;
3914         }
3915
3916         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3917         total_len += cpuall_len;
3918         d->cached = 1;
3919         d->size = total_len;
3920         if (total_len > size)
3921                 total_len = size;
3922
3923         memcpy(buf, d->buf, total_len);
3924         rv = total_len;
3925
3926 err:
3927         if (f)
3928                 fclose(f);
3929         free(line);
3930         free(cpuset);
3931         free(cg);
3932         return rv;
3933 }
3934
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 *
 * The value read from cpuacct.usage is divided by 10^9 before it is returned
 * (nanoseconds to seconds, per the cpuacct controller documentation).
 * Returns 0 if the init pid, the cgroup or the value cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long long usage_ns;
	unsigned long usage = 0;

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (!cgroup)
		goto out;
	prune_init_slice(cgroup);
	if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		goto out;
	/* Parse with 64-bit width and divide before narrowing: where
	 * unsigned long is 32 bits, strtoul() would saturate after only a
	 * few seconds worth of nanoseconds.
	 */
	usage_ns = strtoull(usage_str, NULL, 10);
	usage = (unsigned long)(usage_ns / 1000000000ULL);

out:
	free(cgroup);
	free(usage_str);
	return usage;
}
3965
#if RELOADTEST
/* Leave a marker file behind so that a reload test can tell this code ran. */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;

	close(marker);
}
#endif
3976
3977 /*
3978  * We read /proc/uptime and reuse its second field.
3979  * For the first field, we use the mtime for the reaper for
3980  * the calling pid as returned by getreaperage
3981  */
3982 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3983                 struct fuse_file_info *fi)
3984 {
3985         struct fuse_context *fc = fuse_get_context();
3986         struct file_info *d = (struct file_info *)fi->fh;
3987         unsigned long int busytime = get_reaper_busy(fc->pid);
3988         char *cache = d->buf;
3989         ssize_t total_len = 0;
3990         uint64_t idletime, reaperage;
3991
3992 #if RELOADTEST
3993         iwashere();
3994 #endif
3995
3996         if (offset){
3997                 if (!d->cached)
3998                         return 0;
3999                 if (offset > d->size)
4000                         return -EINVAL;
4001                 int left = d->size - offset;
4002                 total_len = left > size ? size: left;
4003                 memcpy(buf, cache + offset, total_len);
4004                 return total_len;
4005         }
4006
4007         reaperage = get_reaper_age(fc->pid);
4008         /* To understand why this is done, please read the comment to the
4009          * get_reaper_busy() function.
4010          */
4011         idletime = reaperage;
4012         if (reaperage >= busytime)
4013                 idletime = reaperage - busytime;
4014
4015         total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4016         if (total_len < 0 || total_len >=  d->buflen){
4017                 lxcfs_error("%s\n", "failed to write to cache");
4018                 return 0;
4019         }
4020
4021         d->size = (int)total_len;
4022         d->cached = 1;
4023
4024         if (total_len > size) total_len = size;
4025
4026         memcpy(buf, d->buf, total_len);
4027         return total_len;
4028 }
4029
4030 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4031                 struct fuse_file_info *fi)
4032 {
4033         char dev_name[72];
4034         struct fuse_context *fc = fuse_get_context();
4035         struct file_info *d = (struct file_info *)fi->fh;
4036         char *cg;
4037         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4038                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
4039         unsigned long read = 0, write = 0;
4040         unsigned long read_merged = 0, write_merged = 0;
4041         unsigned long read_sectors = 0, write_sectors = 0;
4042         unsigned long read_ticks = 0, write_ticks = 0;
4043         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4044         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4045         char *cache = d->buf;
4046         size_t cache_size = d->buflen;
4047         char *line = NULL;
4048         size_t linelen = 0, total_len = 0, rv = 0;
4049         unsigned int major = 0, minor = 0;
4050         int i = 0;
4051         FILE *f = NULL;
4052
4053         if (offset){
4054                 if (offset > d->size)
4055                         return -EINVAL;
4056                 if (!d->cached)
4057                         return 0;
4058                 int left = d->size - offset;
4059                 total_len = left > size ? size: left;
4060                 memcpy(buf, cache + offset, total_len);
4061                 return total_len;
4062         }
4063
4064         pid_t initpid = lookup_initpid_in_store(fc->pid);
4065         if (initpid <= 0)
4066                 initpid = fc->pid;
4067         cg = get_pid_cgroup(initpid, "blkio");
4068         if (!cg)
4069                 return read_file("/proc/diskstats", buf, size, d);
4070         prune_init_slice(cg);
4071
4072         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4073                 goto err;
4074         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4075                 goto err;
4076         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4077                 goto err;
4078         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4079                 goto err;
4080         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4081                 goto err;
4082
4083
4084         f = fopen("/proc/diskstats", "r");
4085         if (!f)
4086                 goto err;
4087
4088         while (getline(&line, &linelen, f) != -1) {
4089                 ssize_t l;
4090                 char lbuf[256];
4091
4092                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4093                 if (i != 3)
4094                         continue;
4095
4096                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4097                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4098                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4099                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4100                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4101                 read_sectors = read_sectors/512;
4102                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4103                 write_sectors = write_sectors/512;
4104
4105                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4106                 rd_svctm = rd_svctm/1000000;
4107                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4108                 rd_wait = rd_wait/1000000;
4109                 read_ticks = rd_svctm + rd_wait;
4110
4111                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4112                 wr_svctm =  wr_svctm/1000000;
4113                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4114                 wr_wait =  wr_wait/1000000;
4115                 write_ticks = wr_svctm + wr_wait;
4116
4117                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4118                 tot_ticks =  tot_ticks/1000000;
4119
4120                 memset(lbuf, 0, 256);
4121                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4122                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4123                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4124                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4125                 else
4126                         continue;
4127
4128                 l = snprintf(cache, cache_size, "%s", lbuf);
4129                 if (l < 0) {
4130                         perror("Error writing to fuse buf");
4131                         rv = 0;
4132                         goto err;
4133                 }
4134                 if (l >= cache_size) {
4135                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4136                         rv = 0;
4137                         goto err;
4138                 }
4139                 cache += l;
4140                 cache_size -= l;
4141                 total_len += l;
4142         }
4143
4144         d->cached = 1;
4145         d->size = total_len;
4146         if (total_len > size ) total_len = size;
4147         memcpy(buf, d->buf, total_len);
4148
4149         rv = total_len;
4150 err:
4151         free(cg);
4152         if (f)
4153                 fclose(f);
4154         free(line);
4155         free(io_serviced_str);
4156         free(io_merged_str);
4157         free(io_service_bytes_str);
4158         free(io_wait_time_str);
4159         free(io_service_time_str);
4160         return rv;
4161 }
4162
4163 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4164                 struct fuse_file_info *fi)
4165 {
4166         struct fuse_context *fc = fuse_get_context();
4167         struct file_info *d = (struct file_info *)fi->fh;
4168         char *cg = NULL;
4169         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4170         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4171         ssize_t total_len = 0, rv = 0;
4172         ssize_t l = 0;
4173         char *cache = d->buf;
4174
4175         if (offset) {
4176                 if (offset > d->size)
4177                         return -EINVAL;
4178                 if (!d->cached)
4179                         return 0;
4180                 int left = d->size - offset;
4181                 total_len = left > size ? size: left;
4182                 memcpy(buf, cache + offset, total_len);
4183                 return total_len;
4184         }
4185
4186         pid_t initpid = lookup_initpid_in_store(fc->pid);
4187         if (initpid <= 0)
4188                 initpid = fc->pid;
4189         cg = get_pid_cgroup(initpid, "memory");
4190         if (!cg)
4191                 return read_file("/proc/swaps", buf, size, d);
4192         prune_init_slice(cg);
4193
4194         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4195
4196         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4197                 goto err;
4198
4199         memusage = strtoul(memusage_str, NULL, 10);
4200
4201         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4202             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4203
4204                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4205                 memswusage = strtoul(memswusage_str, NULL, 10);
4206
4207                 swap_total = (memswlimit - memlimit) / 1024;
4208                 swap_free = (memswusage - memusage) / 1024;
4209         }
4210
4211         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4212
4213         /* When no mem + swap limit is specified or swapaccount=0*/
4214         if (!memswlimit) {
4215                 char *line = NULL;
4216                 size_t linelen = 0;
4217                 FILE *f = fopen("/proc/meminfo", "r");
4218
4219                 if (!f)
4220                         goto err;
4221
4222                 while (getline(&line, &linelen, f) != -1) {
4223                         if (startswith(line, "SwapTotal:")) {
4224                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
4225                         } else if (startswith(line, "SwapFree:")) {
4226                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
4227                         }
4228                 }
4229
4230                 free(line);
4231                 fclose(f);
4232         }
4233
4234         if (swap_total > 0) {
4235                 l = snprintf(d->buf + total_len, d->size - total_len,
4236                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4237                                 swap_total, swap_free);
4238                 total_len += l;
4239         }
4240
4241         if (total_len < 0 || l < 0) {
4242                 perror("Error writing to cache");
4243                 rv = 0;
4244                 goto err;
4245         }
4246
4247         d->cached = 1;
4248         d->size = (int)total_len;
4249
4250         if (total_len > size) total_len = size;
4251         memcpy(buf, d->buf, total_len);
4252         rv = total_len;
4253
4254 err:
4255         free(cg);
4256         free(memswlimit_str);
4257         free(memlimit_str);
4258         free(memusage_str);
4259         free(memswusage_str);
4260         return rv;
4261 }
4262 /*
4263  * Find the process pid from cgroup path.
4264  * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
4265  * @pid_buf : put pid to pid_buf.
4266  * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
4267  * @depth : the depth of cgroup in container.
4268  * @sum : return the number of pid.
4269  * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
4270  */
4271 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4272 {
4273         DIR *dir;
4274         int fd;
4275         struct dirent *file;
4276         FILE *f = NULL;
4277         size_t linelen = 0;
4278         char *line = NULL;
4279         int pd;
4280         char *path_dir, *path;
4281         char **pid;
4282
4283         /* path = dpath + "/cgroup.procs" + /0 */
4284         do {
4285                 path = malloc(strlen(dpath) + 20);
4286         } while (!path);
4287
4288         strcpy(path, dpath);
4289         fd = openat(cfd, path, O_RDONLY);
4290         if (fd < 0)
4291                 goto out;
4292
4293         dir = fdopendir(fd);
4294         if (dir == NULL) {
4295                 close(fd);
4296                 goto out;
4297         }
4298
4299         while (((file = readdir(dir)) != NULL) && depth > 0) {
4300                 if (strncmp(file->d_name, ".", 1) == 0)
4301                         continue;
4302                 if (strncmp(file->d_name, "..", 1) == 0)
4303                         continue;
4304                 if (file->d_type == DT_DIR) {
4305                         /* path + '/' + d_name +/0 */
4306                         do {
4307                                 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4308                         } while (!path_dir);
4309                         strcpy(path_dir, path);
4310                         strcat(path_dir, "/");
4311                         strcat(path_dir, file->d_name);
4312                         pd = depth - 1;
4313                         sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4314                         free(path_dir);
4315                 }
4316         }
4317         closedir(dir);
4318
4319         strcat(path, "/cgroup.procs");
4320         fd = openat(cfd, path, O_RDONLY);
4321         if (fd < 0)
4322                 goto out;
4323
4324         f = fdopen(fd, "r");
4325         if (!f) {
4326                 close(fd);
4327                 goto out;
4328         }
4329
4330         while (getline(&line, &linelen, f) != -1) {
4331                 do {
4332                         pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4333                 } while (!pid);
4334                 *pid_buf = pid;
4335                 do {
4336                         *(*pid_buf + sum) = malloc(strlen(line) + 1);
4337                 } while (*(*pid_buf + sum) == NULL);
4338                 strcpy(*(*pid_buf + sum), line);
4339                 sum++;
4340         }
4341         fclose(f);
4342 out:
4343         free(path);
4344         return sum;
4345 }
4346 /*
4347  * calc_load calculates the load according to the following formula:
4348  * load1 = load0 * exp + active * (1 - exp)
4349  *
4350  * @load1: the new loadavg.
4351  * @load0: the former loadavg.
4352  * @active: the total number of running pid at this moment.
4353  * @exp: the fixed-point defined in the beginning.
4354  */
4355 static unsigned long
4356 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4357 {
4358         unsigned long newload;
4359
4360         active = active > 0 ? active * FIXED_1 : 0;
4361         newload = load * exp + active * (FIXED_1 - exp);
4362         if (active >= load)
4363                 newload += FIXED_1 - 1;
4364
4365         return newload / FIXED_1;
4366 }
4367
4368 /*
4369  * Return 0 means that container p->cg is closed.
4370  * Return -1 means that error occurred in refresh.
4371  * Positive num equals the total number of pid.
4372  */
4373 static int refresh_load(struct load_node *p, char *path)
4374 {
4375         FILE *f = NULL;
4376         char **idbuf;
4377         char proc_path[256];
4378         int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4379         char *line = NULL;
4380         size_t linelen = 0;
4381         int sum, length;
4382         DIR *dp;
4383         struct dirent *file;
4384
4385         do {
4386                 idbuf = malloc(sizeof(char *));
4387         } while (!idbuf);
4388         sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4389         /*  normal exit  */
4390         if (sum == 0)
4391                 goto out;
4392
4393         for (i = 0; i < sum; i++) {
4394                 /*clean up '\n' */
4395                 length = strlen(idbuf[i])-1;
4396                 idbuf[i][length] = '\0';
4397                 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4398                 if (ret < 0 || ret > 255) {
4399                         lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4400                         i = sum;
4401                         sum = -1;
4402                         goto err_out;
4403                 }
4404
4405                 dp = opendir(proc_path);
4406                 if (!dp) {
4407                         lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4408                         continue;
4409                 }
4410                 while ((file = readdir(dp)) != NULL) {
4411                         if (strncmp(file->d_name, ".", 1) == 0)
4412                                 continue;
4413                         if (strncmp(file->d_name, "..", 1) == 0)
4414                                 continue;
4415                         total_pid++;
4416                         /* We make the biggest pid become last_pid.*/
4417                         ret = atof(file->d_name);
4418                         last_pid = (ret > last_pid) ? ret : last_pid;
4419
4420                         ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4421                         if (ret < 0 || ret > 255) {
4422                                 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4423                                 i = sum;
4424                                 sum = -1;
4425                                 closedir(dp);
4426                                 goto err_out;
4427                         }
4428                         f = fopen(proc_path, "r");
4429                         if (f != NULL) {
4430                                 while (getline(&line, &linelen, f) != -1) {
4431                                         /* Find State */
4432                                         if ((line[0] == 'S') && (line[1] == 't'))
4433                                                 break;
4434                                 }
4435                         if ((line[7] == 'R') || (line[7] == 'D'))
4436                                 run_pid++;
4437                         fclose(f);
4438                         }
4439                 }
4440                 closedir(dp);
4441         }
4442         /*Calculate the loadavg.*/
4443         p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4444         p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4445         p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4446         p->run_pid = run_pid;
4447         p->total_pid = total_pid;
4448         p->last_pid = last_pid;
4449
4450         free(line);
4451 err_out:
4452         for (; i > 0; i--)
4453                 free(idbuf[i-1]);
4454 out:
4455         free(idbuf);
4456         return sum;
4457 }
4458 /*
4459  * Traverse the hash table and update it.
4460  */
4461 void *load_begin(void *arg)
4462 {
4463
4464         char *path = NULL;
4465         int i, sum, length, ret;
4466         struct load_node *f;
4467         int first_node;
4468         clock_t time1, time2;
4469
4470         while (1) {
4471                 time1 = clock();
4472                 for (i = 0; i < LOAD_SIZE; i++) {
4473                         pthread_mutex_lock(&load_hash[i].lock);
4474                         if (load_hash[i].next == NULL) {
4475                                 pthread_mutex_unlock(&load_hash[i].lock);
4476                                 continue;
4477                         }
4478                         f = load_hash[i].next;
4479                         first_node = 1;
4480                         while (f) {
4481                                 length = strlen(f->cg) + 2;
4482                                 do {
4483                                         /* strlen(f->cg) + '.' or '' + \0 */
4484                                         path = malloc(length);
4485                                 } while (!path);
4486
4487                                 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4488                                 if (ret < 0 || ret > length - 1) {
4489                                         /* snprintf failed, ignore the node.*/
4490                                         lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4491                                         goto out;
4492                                 }
4493                                 sum = refresh_load(f, path);
4494                                 if (sum == 0) {
4495                                         f = del_node(f, i);
4496                                 } else {
4497 out:                                    f = f->next;
4498                                 }
4499                                 free(path);
4500                                 /* load_hash[i].lock locks only on the first node.*/
4501                                 if (first_node == 1) {
4502                                         first_node = 0;
4503                                         pthread_mutex_unlock(&load_hash[i].lock);
4504                                 }
4505                         }
4506                 }
4507                 time2 = clock();
4508                 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4509         }
4510 }
4511
4512 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4513                 struct fuse_file_info *fi)
4514 {
4515         struct fuse_context *fc = fuse_get_context();
4516         struct file_info *d = (struct file_info *)fi->fh;
4517         pid_t initpid;
4518         char *cg;
4519         size_t total_len = 0;
4520         char *cache = d->buf;
4521         struct load_node *n;
4522         int hash;
4523         int cfd;
4524         unsigned long a, b, c;
4525
4526         if (offset) {
4527                 if (offset > d->size)
4528                         return -EINVAL;
4529                 if (!d->cached)
4530                         return 0;
4531                 int left = d->size - offset;
4532                 total_len = left > size ? size : left;
4533                 memcpy(buf, cache + offset, total_len);
4534                 return total_len;
4535         }
4536         if (!loadavg)
4537                 return read_file("/proc/loadavg", buf, size, d);
4538
4539         initpid = lookup_initpid_in_store(fc->pid);
4540         if (initpid <= 0)
4541                 initpid = fc->pid;
4542         cg = get_pid_cgroup(initpid, "cpu");
4543         if (!cg)
4544                 return read_file("/proc/loadavg", buf, size, d);
4545
4546         prune_init_slice(cg);
4547         hash = calc_hash(cg);
4548         n = locate_node(cg, hash);
4549
4550         /* First time */
4551         if (n == NULL) {
4552                 if (!find_mounted_controller("cpu", &cfd)) {
4553                         /*
4554                          * In locate_node() above, pthread_rwlock_unlock() isn't used
4555                          * because delete is not allowed before read has ended.
4556                          */
4557                         pthread_rwlock_unlock(&load_hash[hash].rdlock);
4558                         return 0;
4559                 }
4560                 do {
4561                         n = malloc(sizeof(struct load_node));
4562                 } while (!n);
4563
4564                 do {
4565                         n->cg = malloc(strlen(cg)+1);
4566                 } while (!n->cg);
4567                 strcpy(n->cg, cg);
4568                 n->avenrun[0] = 0;
4569                 n->avenrun[1] = 0;
4570                 n->avenrun[2] = 0;
4571                 n->run_pid = 0;
4572                 n->total_pid = 1;
4573                 n->last_pid = initpid;
4574                 n->cfd = cfd;
4575                 insert_node(&n, hash);
4576         }
4577         a = n->avenrun[0] + (FIXED_1/200);
4578         b = n->avenrun[1] + (FIXED_1/200);
4579         c = n->avenrun[2] + (FIXED_1/200);
4580         total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4581                 LOAD_INT(a), LOAD_FRAC(a),
4582                 LOAD_INT(b), LOAD_FRAC(b),
4583                 LOAD_INT(c), LOAD_FRAC(c),
4584                 n->run_pid, n->total_pid, n->last_pid);
4585         pthread_rwlock_unlock(&load_hash[hash].rdlock);
4586         if (total_len < 0 || total_len >=  d->buflen) {
4587                 lxcfs_error("%s\n", "Failed to write to cache");
4588                 return 0;
4589         }
4590         d->size = (int)total_len;
4591         d->cached = 1;
4592
4593         if (total_len > size)
4594                 total_len = size;
4595         memcpy(buf, d->buf, total_len);
4596         return total_len;
4597 }
4598 /* Return a positive number on success, return 0 on failure.*/
4599 pthread_t load_daemon(int load_use)
4600 {
4601         int ret;
4602         pthread_t pid;
4603
4604         ret = init_load();
4605         if (ret == -1) {
4606                 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4607                 return 0;
4608         }
4609         ret = pthread_create(&pid, NULL, load_begin, NULL);
4610         if (ret != 0) {
4611                 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4612                 load_free();
4613                 return 0;
4614         }
4615         /* use loadavg, here loadavg = 1*/
4616         loadavg = load_use;
4617         return pid;
4618 }
4619
/*
 * Count the bytes of a file by reading it line by line.
 *
 * @which: path of the file to measure.
 *
 * Returns the summed length of all lines, or 0 when the file cannot be
 * opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *buf = NULL;
	size_t cap = 0;
	ssize_t nread;
	ssize_t total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&buf, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	free(buf);
	fclose(fp);

	return total;
}
4636
/*
 * Fill in stat information for the emulated /proc tree.
 *
 * @path: path to describe ("/proc" or one of the emulated files below it).
 * @sb:   stat buffer to fill; it is zeroed first.
 *
 * "/proc" is reported as a read/execute-only directory, the emulated files as
 * read-only regular files of size 0. All timestamps are set to the current
 * time and ownership to uid/gid 0.
 *
 * Returns 0 on success, -EINVAL when the current time cannot be obtained and
 * -ENOENT for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t k;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (k = 0; k < sizeof(emulated_files) / sizeof(emulated_files[0]); k++) {
		if (strcmp(path, emulated_files[k]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4666
4667 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4668                 struct fuse_file_info *fi)
4669 {
4670         if (filler(buf, ".", NULL, 0) != 0 ||
4671             filler(buf, "..", NULL, 0) != 0 ||
4672             filler(buf, "cpuinfo", NULL, 0) != 0 ||
4673             filler(buf, "meminfo", NULL, 0) != 0 ||
4674             filler(buf, "stat", NULL, 0) != 0 ||
4675             filler(buf, "uptime", NULL, 0) != 0 ||
4676             filler(buf, "diskstats", NULL, 0) != 0 ||
4677             filler(buf, "swaps", NULL, 0) != 0   ||
4678             filler(buf, "loadavg", NULL, 0) != 0)
4679                 return -EINVAL;
4680         return 0;
4681 }
4682
4683 int proc_open(const char *path, struct fuse_file_info *fi)
4684 {
4685         int type = -1;
4686         struct file_info *info;
4687
4688         if (strcmp(path, "/proc/meminfo") == 0)
4689                 type = LXC_TYPE_PROC_MEMINFO;
4690         else if (strcmp(path, "/proc/cpuinfo") == 0)
4691                 type = LXC_TYPE_PROC_CPUINFO;
4692         else if (strcmp(path, "/proc/uptime") == 0)
4693                 type = LXC_TYPE_PROC_UPTIME;
4694         else if (strcmp(path, "/proc/stat") == 0)
4695                 type = LXC_TYPE_PROC_STAT;
4696         else if (strcmp(path, "/proc/diskstats") == 0)
4697                 type = LXC_TYPE_PROC_DISKSTATS;
4698         else if (strcmp(path, "/proc/swaps") == 0)
4699                 type = LXC_TYPE_PROC_SWAPS;
4700         else if (strcmp(path, "/proc/loadavg") == 0)
4701                 type = LXC_TYPE_PROC_LOADAVG;
4702         if (type == -1)
4703                 return -ENOENT;
4704
4705         info = malloc(sizeof(*info));
4706         if (!info)
4707                 return -ENOMEM;
4708
4709         memset(info, 0, sizeof(*info));
4710         info->type = type;
4711
4712         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4713         do {
4714                 info->buf = malloc(info->buflen);
4715         } while (!info->buf);
4716         memset(info->buf, 0, info->buflen);
4717         /* set actual size to buffer size */
4718         info->size = info->buflen;
4719
4720         fi->fh = (unsigned long)info;
4721         return 0;
4722 }
4723
/*
 * Access check for the emulated /proc tree.
 *
 * @path: path being checked.
 * @mask: requested access mode bits.
 *
 * "/proc" itself is allowed outright when access(2) reports it readable.
 * Everything else is read-only: any bit in @mask other than R_OK is refused.
 *
 * Returns 0 when access is granted, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	int is_proc_root = (strcmp(path, "/proc") == 0);

	if (is_proc_root && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
4734
/*
 * Release handler for the emulated /proc files.
 *
 * Hands @fi to do_release_file_info() and always reports success; @path is
 * not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4740
4741 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4742                 struct fuse_file_info *fi)
4743 {
4744         struct file_info *f = (struct file_info *) fi->fh;
4745
4746         switch (f->type) {
4747         case LXC_TYPE_PROC_MEMINFO:
4748                 return proc_meminfo_read(buf, size, offset, fi);
4749         case LXC_TYPE_PROC_CPUINFO:
4750                 return proc_cpuinfo_read(buf, size, offset, fi);
4751         case LXC_TYPE_PROC_UPTIME:
4752                 return proc_uptime_read(buf, size, offset, fi);
4753         case LXC_TYPE_PROC_STAT:
4754                 return proc_stat_read(buf, size, offset, fi);
4755         case LXC_TYPE_PROC_DISKSTATS:
4756                 return proc_diskstats_read(buf, size, offset, fi);
4757         case LXC_TYPE_PROC_SWAPS:
4758                 return proc_swaps_read(buf, size, offset, fi);
4759         case LXC_TYPE_PROC_LOADAVG:
4760                 return proc_loadavg_read(buf, size, offset, fi);
4761         default:
4762                 return -EINVAL;
4763         }
4764 }
4765
4766 /*
4767  * Functions needed to setup cgroups in the __constructor__.
4768  */
4769
/*
 * Create a directory and all of its missing parents (like `mkdir -p`).
 *
 * @dir:  path to create; may be absolute or relative.
 * @mode: mode passed to every mkdir(2) call.
 *
 * The path is walked component by component and each successively longer
 * prefix is created. Prefixes that already exist (EEXIST) are not an error.
 * For a relative path the first prefix is the empty string; it is skipped,
 * because mkdir("") would fail and make the whole call fail.
 *
 * Returns true on success, false when memory runs out or a mkdir(2) call
 * fails for a reason other than EEXIST.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* Skip separators, then step over the next component. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		/* An empty prefix names nothing to create. */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
4793
4794 static bool umount_if_mounted(void)
4795 {
4796         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4797                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4798                 return false;
4799         }
4800         return true;
4801 }
4802
/*
 * fs_type_magic is whatever type struct statfs uses for f_type; __typeof__
 * should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs has the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
4809
/*
 * Detect whether "/" is the kernel's initial rootfs.
 *
 * Looking at fs/proc_namespace.c, it appears the rootfs entry can be expected
 * to very specifically contain " - rootfs rootfs ". So long as we've chrooted
 * so that rootfs is not our root, the rootfs entry should always be skipped
 * in mountinfo contents.
 *
 * Every line of /proc/self/mountinfo is scanned: the field following the
 * fourth space is taken as the mount point and, when it is "/", the text from
 * the first '-' after it must start with "- rootfs rootfs ".
 *
 * Returns true when such a line is found, false otherwise (including when the
 * file cannot be opened).
 */
static bool is_on_ramfs(void)
{
	bool on_rootfs = false;
	char *buf = NULL;
	size_t cap = 0;
	FILE *mounts;

	mounts = fopen("/proc/self/mountinfo", "r");
	if (!mounts)
		return false;

	while (!on_rootfs && getline(&buf, &cap, mounts) != -1) {
		char *mnt, *mnt_end, *sep;
		int skipped;

		/* Walk to the space in front of the mount point field. */
		mnt = buf;
		for (skipped = 0; mnt && skipped < 4; skipped++)
			mnt = strchr(mnt + 1, ' ');
		if (!mnt)
			continue;

		/* Terminate the mount point so it can be compared. */
		mnt_end = strchr(mnt + 1, ' ');
		if (!mnt_end)
			continue;
		*mnt_end = '\0';

		if (strcmp(mnt + 1, "/") != 0)
			continue;

		/* this is '/'.  is it the ramfs? */
		sep = strchr(mnt_end + 1, '-');
		if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
			on_rootfs = true;
	}

	free(buf);
	fclose(mounts);

	return on_rootfs;
}
4852
4853 static int pivot_enter()
4854 {
4855         int ret = -1, oldroot = -1, newroot = -1;
4856
4857         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4858         if (oldroot < 0) {
4859                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4860                 return ret;
4861         }
4862
4863         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4864         if (newroot < 0) {
4865                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4866                 goto err;
4867         }
4868
4869         /* change into new root fs */
4870         if (fchdir(newroot) < 0) {
4871                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4872                 goto err;
4873         }
4874
4875         /* pivot_root into our new root fs */
4876         if (pivot_root(".", ".") < 0) {
4877                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4878                 goto err;
4879         }
4880
4881         /*
4882          * At this point the old-root is mounted on top of our new-root.
4883          * To unmounted it we must not be chdir'd into it, so escape back
4884          * to the old-root.
4885          */
4886         if (fchdir(oldroot) < 0) {
4887                 lxcfs_error("%s\n", "Failed to enter old root.");
4888                 goto err;
4889         }
4890
4891         if (umount2(".", MNT_DETACH) < 0) {
4892                 lxcfs_error("%s\n", "Failed to detach old root.");
4893                 goto err;
4894         }
4895
4896         if (fchdir(newroot) < 0) {
4897                 lxcfs_error("%s\n", "Failed to re-enter new root.");
4898                 goto err;
4899         }
4900
4901         ret = 0;
4902
4903 err:
4904         if (oldroot > 0)
4905                 close(oldroot);
4906         if (newroot > 0)
4907                 close(newroot);
4908
4909         return ret;
4910 }
4911
4912 static int chroot_enter()
4913 {
4914         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4915                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4916                 return -1;
4917         }
4918
4919         if (chroot(".") < 0) {
4920                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4921                 return -1;
4922         }
4923
4924         if (chdir("/") < 0) {
4925                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4926                 return -1;
4927         }
4928
4929         return 0;
4930 }
4931
/*
 * Enter the prepared root, picking the mechanism that fits the current "/".
 *
 * When "/" is a ramfs, chroot_enter() is used; in all other cases
 * pivot_enter() is used.
 *
 * Returns 0 on success and -1 on failure; for the ramfs case the result of
 * chroot_enter() is returned as is.
 */
static int permute_and_enter(void)
{
	struct statfs root_fs;
	bool root_is_ramfs;

	if (statfs("/", &root_fs) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC. Hence, when it reports no we still let
	 * is_on_ramfs() inspect the mountinfo file. */
	root_is_ramfs = has_fs_type(&root_fs, RAMFS_MAGIC) || is_on_ramfs();
	if (root_is_ramfs)
		return chroot_enter();

	if (pivot_enter() >= 0)
		return 0;

	lxcfs_error("%s\n", "Could not perform pivot root.");
	return -1;
}
4954
4955 /* Prepare our new clean root. */
4956 static int permute_prepare(void)
4957 {
4958         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4959                 lxcfs_error("%s\n", "Failed to create directory for new root.");
4960                 return -1;
4961         }
4962
4963         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4964                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4965                 return -1;
4966         }
4967
4968         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4969                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
4970                 return -1;
4971         }
4972
4973         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4974                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
4975                 return -1;
4976         }
4977
4978         return 0;
4979 }
4980
/*
 * Prepare the new root and enter it: chroot() on ramfs, pivot_root() in all
 * other cases. Entering is only attempted when the preparation succeeded.
 *
 * Returns true when both steps succeeded, false otherwise.
 */
static bool permute_root(void)
{
	return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
4994
/*
 * Open a handle on the mount namespace of a process.
 *
 * @pid: process whose /proc/<pid>/ns/mnt entry is opened.
 *
 * Returns a read-only, close-on-exec file descriptor, or -1 when the path
 * cannot be formatted or opened.
 */
static int preserve_mnt_ns(int pid)
{
	/* Room for both literals plus 21 characters for the pid. */
	char ns_path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
	int written;

	written = snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/mnt", pid);
	if (written < 0 || (size_t)written >= sizeof(ns_path))
		return -1;

	return open(ns_path, O_RDONLY | O_CLOEXEC);
}
5007
5008 static bool cgfs_prepare_mounts(void)
5009 {
5010         if (!mkdir_p(BASEDIR, 0700)) {
5011                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5012                 return false;
5013         }
5014
5015         if (!umount_if_mounted()) {
5016                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5017                 return false;
5018         }
5019
5020         if (unshare(CLONE_NEWNS) < 0) {
5021                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5022                 return false;
5023         }
5024
5025         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5026         if (cgroup_mount_ns_fd < 0) {
5027                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5028                 return false;
5029         }
5030
5031         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5032                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5033                 return false;
5034         }
5035
5036         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5037                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5038                 return false;
5039         }
5040
5041         return true;
5042 }
5043
5044 static bool cgfs_mount_hierarchies(void)
5045 {
5046         char *target;
5047         size_t clen, len;
5048         int i, ret;
5049
5050         for (i = 0; i < num_hierarchies; i++) {
5051                 char *controller = hierarchies[i];
5052
5053                 clen = strlen(controller);
5054                 len = strlen(BASEDIR) + clen + 2;
5055                 target = malloc(len);
5056                 if (!target)
5057                         return false;
5058
5059                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5060                 if (ret < 0 || ret >= len) {
5061                         free(target);
5062                         return false;
5063                 }
5064                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5065                         free(target);
5066                         return false;
5067                 }
5068                 if (!strcmp(controller, "unified"))
5069                         ret = mount("none", target, "cgroup2", 0, NULL);
5070                 else
5071                         ret = mount(controller, target, "cgroup", 0, controller);
5072                 if (ret < 0) {
5073                         lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5074                         free(target);
5075                         return false;
5076                 }
5077
5078                 fd_hierarchies[i] = open(target, O_DIRECTORY);
5079                 if (fd_hierarchies[i] < 0) {
5080                         free(target);
5081                         return false;
5082                 }
5083                 free(target);
5084         }
5085         return true;
5086 }
5087
/*
 * Build the private cgroup mounts used by lxcfs: prepare the mount
 * namespace, mount all hierarchies and finally switch into the new root.
 *
 * Returns true when all three stages succeeded, false at the first failure.
 */
static bool cgfs_setup_controllers(void)
{
	bool prepared = cgfs_prepare_mounts();

	if (!prepared)
		return false;

	if (cgfs_mount_hierarchies() == false) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
5103
5104 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5105 {
5106         FILE *f;
5107         char *cret, *line = NULL;
5108         char cwd[MAXPATHLEN];
5109         size_t len = 0;
5110         int i, init_ns = -1;
5111         bool found_unified = false;
5112
5113         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5114                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5115                 return;
5116         }
5117
5118         while (getline(&line, &len, f) != -1) {
5119                 char *idx, *p, *p2;
5120
5121                 p = strchr(line, ':');
5122                 if (!p)
5123                         goto out;
5124                 idx = line;
5125                 *(p++) = '\0';
5126
5127                 p2 = strrchr(p, ':');
5128                 if (!p2)
5129                         goto out;
5130                 *p2 = '\0';
5131
5132                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5133                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5134                  * because it parses out the empty string "" and later on passes
5135                  * it to mount(). Let's skip such entries.
5136                  */
5137                 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5138                         found_unified = true;
5139                         p = "unified";
5140                 }
5141
5142                 if (!store_hierarchy(line, p))
5143                         goto out;
5144         }
5145
5146         /* Preserve initial namespace. */
5147         init_ns = preserve_mnt_ns(getpid());
5148         if (init_ns < 0) {
5149                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5150                 goto out;
5151         }
5152
5153         fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5154         if (!fd_hierarchies) {
5155                 lxcfs_error("%s\n", strerror(errno));
5156                 goto out;
5157         }
5158
5159         for (i = 0; i < num_hierarchies; i++)
5160                 fd_hierarchies[i] = -1;
5161
5162         cret = getcwd(cwd, MAXPATHLEN);
5163         if (!cret)
5164                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5165
5166         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5167          * to privately mount lxcfs cgroups. */
5168         if (!cgfs_setup_controllers()) {
5169                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5170                 goto out;
5171         }
5172
5173         if (setns(init_ns, 0) < 0) {
5174                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5175                 goto out;
5176         }
5177
5178         if (!cret || chdir(cwd) < 0)
5179                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5180
5181         print_subsystems();
5182
5183 out:
5184         free(line);
5185         fclose(f);
5186         if (init_ns >= 0)
5187                 close(init_ns);
5188 }
5189
5190 static void __attribute__((destructor)) free_subsystems(void)
5191 {
5192         int i;
5193
5194         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5195
5196         for (i = 0; i < num_hierarchies; i++) {
5197                 if (hierarchies[i])
5198                         free(hierarchies[i]);
5199                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5200                         close(fd_hierarchies[i]);
5201         }
5202         free(hierarchies);
5203         free(fd_hierarchies);
5204
5205         if (cgroup_mount_ns_fd >= 0)
5206                 close(cgroup_mount_ns_fd);
5207 }