bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdbool.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <time.h>
  26 #include <unistd.h>
  27 #include <wait.h>
  28 #include <linux/magic.h>
  29 #include <linux/sched.h>
  30 #include <sys/epoll.h>
  31 #include <sys/mman.h>
  32 #include <sys/mount.h>
  33 #include <sys/param.h>
  34 #include <sys/socket.h>
  35 #include <sys/syscall.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/vfs.h>
  38
  39 #include "bindings.h"
  40 #include "config.h" // for VERSION
  41
/* A 64 bit integer needs at most 20 decimal digits (2^64 - 1 = 18446744073709551615); 21 leaves room for the terminating NUL byte. */
  43 #define LXCFS_NUMSTRLEN64 21
  44
  45 /* Define pivot_root() if missing from the C library */
  46 #ifndef HAVE_PIVOT_ROOT
  47 static int pivot_root(const char * new_root, const char * put_old)
  48 {
  49 #ifdef __NR_pivot_root
  50 return syscall(__NR_pivot_root, new_root, put_old);
  51 #else
  52 errno = ENOSYS;
  53 return -1;
  54 #endif
  55 }
  56 #else
  57 extern int pivot_root(const char * new_root, const char * put_old);
  58 #endif
  59
  60 enum {
  61         LXC_TYPE_CGDIR,
  62         LXC_TYPE_CGFILE,
  63         LXC_TYPE_PROC_MEMINFO,
  64         LXC_TYPE_PROC_CPUINFO,
  65         LXC_TYPE_PROC_UPTIME,
  66         LXC_TYPE_PROC_STAT,
  67         LXC_TYPE_PROC_DISKSTATS,
  68         LXC_TYPE_PROC_SWAPS,
  69         LXC_TYPE_PROC_LOADAVG,
  70 };
  71
/*
 * Bookkeeping for one file or directory served by lxcfs.
 * NOTE(review): the users of this struct are outside this chunk; the field
 * notes marked "presumably" are inferred from the names only -- confirm.
 */
struct file_info {
        char *controller; // presumably the cgroup controller name
        char *cgroup;     // presumably the cgroup path
        char *file;       // presumably the file name inside the cgroup
        int type;         // presumably one of the LXC_TYPE_* constants
        char *buf;  // unused as of yet
        int buflen;       // presumably the allocated size of buf
        int size; //actual data size
        int cached;       // NOTE(review): looks like a flag; meaning not visible here
};
  82
/* Parameters of the loadavg hash table. */
#define LOAD_SIZE 100 /* number of buckets in load_hash[] */
#define FLUSH_TIME 5  /* the flush rate (presumably seconds -- TODO confirm) */
#define DEPTH_DIR 3   /* the depth of per cgroup */
/* Fixed-point constants and helpers used to calculate the load average. */
#define FSHIFT          11              /* nr of bits of precision */
#define FIXED_1         (1<<FSHIFT)     /* 1.0 as fixed-point */
#define EXP_1           1884            /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5           2014            /* 1/exp(5sec/5min) */
#define EXP_15          2037            /* 1/exp(5sec/15min) */
/* Integer part of a fixed-point value. */
#define LOAD_INT(x) ((x) >> FSHIFT)
/* Fractional part of a fixed-point value, scaled to two decimal digits. */
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This flag is used by proc_loadavg_read().
 * 1 means the loadavg feature is in use, 0 means it is not.
 */
static int loadavg = 0;
 100 static int calc_hash(char *name)
 101 {
 102         unsigned int hash = 0;
 103         unsigned int x = 0;
 104         /* ELFHash algorithm. */
 105         while (*name) {
 106                 hash = (hash << 4) + *name++;
 107                 x = hash & 0xf0000000;
 108                 if (x != 0)
 109                         hash ^= (x >> 24);
 110                 hash &= ~x;
 111         }
 112         return ((hash & 0x7fffffff) % LOAD_SIZE);
 113 }
 114
/*
 * One entry of a load_hash[] bucket list. Entries are doubly linked: @next
 * points at the following entry and @pre points at the pointer that refers
 * to this entry (the bucket head's next field or the previous entry's next
 * field), as set up by insert_node().
 */
struct load_node {
        char *cg;  /* cgroup name; compared with strcmp() in locate_node(), freed with the node */
        unsigned long avenrun[3];               /* Load averages */
        unsigned int run_pid;   /* NOTE(review): presumably number of running tasks -- confirm */
        unsigned int total_pid; /* NOTE(review): presumably total number of tasks -- confirm */
        unsigned int last_pid;  /* NOTE(review): presumably most recent pid seen -- confirm */
        int cfd; /* The file descriptor of the mounted cgroup */
        struct  load_node *next; /* following entry in the bucket, or NULL */
        struct  load_node **pre; /* address of the pointer that points at this entry */
};
 125
/* Head of one hash bucket: the locks protecting the bucket and its list. */
struct load_head {
        /*
         * Serializes inserting a load_node and refreshing load_nodes. With
         * respect to the first load_node of a hash bucket, insert and refresh
         * are mutually exclusive.
         */
        pthread_mutex_t lock;
        /*
         * Guards reading loadavg against deleting a load_node. Within a hash
         * bucket, read and delete are mutually exclusive, while several
         * readers may run in parallel. This lock covers the whole list.
         */
        pthread_rwlock_t rdlock;
        /*
         * Guards reading loadavg against inserting a load_node. With respect
         * to the first load_node of a hash bucket, read and insert are
         * mutually exclusive, while several readers may run in parallel.
         */
        pthread_rwlock_t rilock;
        struct load_node *next; /* first entry of the bucket, or NULL when empty */
};

static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 149 /*
 150  * init_load initialize the hash table.
 151  * Return 0 on success, return -1 on failure.
 152  */
 153 static int init_load(void)
 154 {
 155         int i;
 156         int ret;
 157
 158         for (i = 0; i < LOAD_SIZE; i++) {
 159                 load_hash[i].next = NULL;
 160                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 161                 if (ret != 0) {
 162                         lxcfs_error("%s\n", "Failed to initialize lock");
 163                         goto out3;
 164                 }
 165                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 166                 if (ret != 0) {
 167                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 168                         goto out2;
 169                 }
 170                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 171                 if (ret != 0) {
 172                         lxcfs_error("%s\n", "Failed to initialize rilock");
 173                         goto out1;
 174                 }
 175         }
 176         return 0;
 177 out1:
 178         pthread_rwlock_destroy(&load_hash[i].rdlock);
 179 out2:
 180         pthread_mutex_destroy(&load_hash[i].lock);
 181 out3:
 182         while (i > 0) {
 183                 i--;
 184                 pthread_mutex_destroy(&load_hash[i].lock);
 185                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 186                 pthread_rwlock_destroy(&load_hash[i].rilock);
 187         }
 188         return -1;
 189 }
 190
 191 static void insert_node(struct load_node **n, int locate)
 192 {
 193         struct load_node *f;
 194
 195         pthread_mutex_lock(&load_hash[locate].lock);
 196         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 197         f = load_hash[locate].next;
 198         load_hash[locate].next = *n;
 199
 200         (*n)->pre = &(load_hash[locate].next);
 201         if (f)
 202                 f->pre = &((*n)->next);
 203         (*n)->next = f;
 204         pthread_mutex_unlock(&load_hash[locate].lock);
 205         pthread_rwlock_unlock(&load_hash[locate].rilock);
 206 }
 207 /*
 208  * locate_node() finds special node. Not return NULL means success.
 209  * It should be noted that rdlock isn't unlocked at the end of code
 210  * because this function is used to read special node. Delete is not
 211  * allowed before read has ended.
 212  * unlock rdlock only in proc_loadavg_read().
 213  */
 214 static struct load_node *locate_node(char *cg, int locate)
 215 {
 216         struct load_node *f = NULL;
 217         int i = 0;
 218
 219         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 220         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 221         if (load_hash[locate].next == NULL) {
 222                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 223                 return f;
 224         }
 225         f = load_hash[locate].next;
 226         pthread_rwlock_unlock(&load_hash[locate].rilock);
 227         while (f && ((i = strcmp(f->cg, cg)) != 0))
 228                 f = f->next;
 229         return f;
 230 }
 231 /* Delete the load_node n and return the next node of it. */
 232 static struct load_node *del_node(struct load_node *n, int locate)
 233 {
 234         struct load_node *g;
 235
 236         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 237         if (n->next == NULL) {
 238                 *(n->pre) = NULL;
 239         } else {
 240                 *(n->pre) = n->next;
 241                 n->next->pre = n->pre;
 242         }
 243         g = n->next;
 244         free(n->cg);
 245         free(n);
 246         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 247         return g;
 248 }
 249
 250 void load_free(void)
 251 {
 252         int i;
 253         struct load_node *f, *p;
 254
 255         for (i = 0; i < LOAD_SIZE; i++) {
 256                 pthread_mutex_lock(&load_hash[i].lock);
 257                 pthread_rwlock_wrlock(&load_hash[i].rilock);
 258                 pthread_rwlock_wrlock(&load_hash[i].rdlock);
 259                 if (load_hash[i].next == NULL) {
 260                         pthread_mutex_unlock(&load_hash[i].lock);
 261                         pthread_mutex_destroy(&load_hash[i].lock);
 262                         pthread_rwlock_unlock(&load_hash[i].rilock);
 263                         pthread_rwlock_destroy(&load_hash[i].rilock);
 264                         pthread_rwlock_unlock(&load_hash[i].rdlock);
 265                         pthread_rwlock_destroy(&load_hash[i].rdlock);
 266                         continue;
 267                 }
 268                 for (f = load_hash[i].next; f; ) {
 269                         free(f->cg);
 270                         p = f->next;
 271                         free(f);
 272                         f = p;
 273                 }
 274                 pthread_mutex_unlock(&load_hash[i].lock);
 275                 pthread_mutex_destroy(&load_hash[i].lock);
 276                 pthread_rwlock_unlock(&load_hash[i].rilock);
 277                 pthread_rwlock_destroy(&load_hash[i].rilock);
 278                 pthread_rwlock_unlock(&load_hash[i].rdlock);
 279                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 280         }
 281 }
 282 /* Reserve buffer size to account for file size changes. */
 283 #define BUF_RESERVE_SIZE 512
 284
 285 /*
 286  * A table caching which pid is init for a pid namespace.
 287  * When looking up which pid is init for $qpid, we first
 288  * 1. Stat /proc/$qpid/ns/pid.
 289  * 2. Check whether the ino_t is in our store.
 290  *   a. if not, fork a child in qpid's ns to send us
 291  *       ucred.pid = 1, and read the initpid.  Cache
 292  *       initpid and creation time for /proc/initpid
 293  *       in a new store entry.
 294  *   b. if so, verify that /proc/initpid still matches
 295  *       what we have saved.  If not, clear the store
 296  *       entry and go back to a.  If so, return the
 297  *       cached initpid.
 298  */
/* One cached "which pid is init" answer; entries are chained per hash bucket. */
struct pidns_init_store {
        ino_t ino;          // inode number for /proc/$pid/ns/pid
        pid_t initpid;      // the pid of init in that ns
        long int ctime;     // the time at which /proc/$initpid was created
        struct pidns_init_store *next; // next entry in the same hash bucket
        long int lastcheck; // time(NULL) when the entry was last saved or verified
};
 306
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
/* Bucket index for an inode number. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets of the init-pid cache; each one is a singly linked list of entries. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Taken via store_lock()/store_unlock() around accesses to the cache. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 313 static void lock_mutex(pthread_mutex_t *l)
 314 {
 315         int ret;
 316
 317         if ((ret = pthread_mutex_lock(l)) != 0) {
 318                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 319                 exit(1);
 320         }
 321 }
 322
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* Printed by print_subsystems() as the "mount namespace"; -1 until set
 * (NOTE(review): set outside this chunk -- confirm where). */
static int cgroup_mount_ns_fd = -1;
 343
 344 static void unlock_mutex(pthread_mutex_t *l)
 345 {
 346         int ret;
 347
 348         if ((ret = pthread_mutex_unlock(l)) != 0) {
 349                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 350                 exit(1);
 351         }
 352 }
 353
/* Acquire the mutex guarding the init-pid cache; exits the process on failure (see lock_mutex()). */
static void store_lock(void)
{
        lock_mutex(&pidns_store_mutex);
}
 358
/* Release the mutex guarding the init-pid cache; exits the process on failure (see unlock_mutex()). */
static void store_unlock(void)
{
        unlock_mutex(&pidns_store_mutex);
}
 363
 364 /* Must be called under store_lock */
 365 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 366 {
 367         struct stat initsb;
 368         char fnam[100];
 369
 370         snprintf(fnam, 100, "/proc/%d", e->initpid);
 371         if (stat(fnam, &initsb) < 0)
 372                 return false;
 373
 374         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 375                     initsb.st_ctime, e->initpid);
 376
 377         if (e->ctime != initsb.st_ctime)
 378                 return false;
 379         return true;
 380 }
 381
 382 /* Must be called under store_lock */
 383 static void remove_initpid(struct pidns_init_store *e)
 384 {
 385         struct pidns_init_store *tmp;
 386         int h;
 387
 388         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 389
 390         h = HASH(e->ino);
 391         if (pidns_hash_table[h] == e) {
 392                 pidns_hash_table[h] = e->next;
 393                 free(e);
 394                 return;
 395         }
 396
 397         tmp = pidns_hash_table[h];
 398         while (tmp) {
 399                 if (tmp->next == e) {
 400                         tmp->next = e->next;
 401                         free(e);
 402                         return;
 403                 }
 404                 tmp = tmp->next;
 405         }
 406 }
 407
 408 #define PURGE_SECS 5
 409 /* Must be called under store_lock */
 410 static void prune_initpid_store(void)
 411 {
 412         static long int last_prune = 0;
 413         struct pidns_init_store *e, *prev, *delme;
 414         long int now, threshold;
 415         int i;
 416
 417         if (!last_prune) {
 418                 last_prune = time(NULL);
 419                 return;
 420         }
 421         now = time(NULL);
 422         if (now < last_prune + PURGE_SECS)
 423                 return;
 424
 425         lxcfs_debug("%s\n", "Pruning.");
 426
 427         last_prune = now;
 428         threshold = now - 2 * PURGE_SECS;
 429
 430         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 431                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 432                         if (e->lastcheck < threshold) {
 433
 434                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 435
 436                                 delme = e;
 437                                 if (prev)
 438                                         prev->next = e->next;
 439                                 else
 440                                         pidns_hash_table[i] = e->next;
 441                                 e = e->next;
 442                                 free(delme);
 443                         } else {
 444                                 prev = e;
 445                                 e = e->next;
 446                         }
 447                 }
 448         }
 449 }
 450
 451 /* Must be called under store_lock */
 452 static void save_initpid(struct stat *sb, pid_t pid)
 453 {
 454         struct pidns_init_store *e;
 455         char fpath[100];
 456         struct stat procsb;
 457         int h;
 458
 459         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 460
 461         snprintf(fpath, 100, "/proc/%d", pid);
 462         if (stat(fpath, &procsb) < 0)
 463                 return;
 464         do {
 465                 e = malloc(sizeof(*e));
 466         } while (!e);
 467         e->ino = sb->st_ino;
 468         e->initpid = pid;
 469         e->ctime = procsb.st_ctime;
 470         h = HASH(e->ino);
 471         e->next = pidns_hash_table[h];
 472         e->lastcheck = time(NULL);
 473         pidns_hash_table[h] = e;
 474 }
 475
 476 /*
 477  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 478  * entry for the inode number and creation time.  Verify that the init pid
 479  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 480  * otherwise.
 481  * Must be called under store_lock
 482  */
 483 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 484 {
 485         int h = HASH(sb->st_ino);
 486         struct pidns_init_store *e = pidns_hash_table[h];
 487
 488         while (e) {
 489                 if (e->ino == sb->st_ino) {
 490                         if (initpid_still_valid(e, sb)) {
 491                                 e->lastcheck = time(NULL);
 492                                 return e;
 493                         }
 494                         remove_initpid(e);
 495                         return NULL;
 496                 }
 497                 e = e->next;
 498         }
 499
 500         return NULL;
 501 }
 502
 503 static int is_dir(const char *path, int fd)
 504 {
 505         struct stat statbuf;
 506         int ret = fstatat(fd, path, &statbuf, fd);
 507         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 508                 return 1;
 509         return 0;
 510 }
 511
 512 static char *must_copy_string(const char *str)
 513 {
 514         char *dup = NULL;
 515         if (!str)
 516                 return NULL;
 517         do {
 518                 dup = strdup(str);
 519         } while (!dup);
 520
 521         return dup;
 522 }
 523
 524 static inline void drop_trailing_newlines(char *s)
 525 {
 526         int l;
 527
 528         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 529                 s[l-1] = '\0';
 530 }
 531
 532 #define BATCH_SIZE 50
 533 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 534 {
 535         int newbatches = (newlen / BATCH_SIZE) + 1;
 536         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 537
 538         if (!*mem || newbatches > oldbatches) {
 539                 char *tmp;
 540                 do {
 541                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 542                 } while (!tmp);
 543                 *mem = tmp;
 544         }
 545 }
 546 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 547 {
 548         size_t newlen = *len + linelen;
 549         dorealloc(contents, *len, newlen + 1);
 550         memcpy(*contents + *len, line, linelen+1);
 551         *len = newlen;
 552 }
 553
 554 static char *slurp_file(const char *from, int fd)
 555 {
 556         char *line = NULL;
 557         char *contents = NULL;
 558         FILE *f = fdopen(fd, "r");
 559         size_t len = 0, fulllen = 0;
 560         ssize_t linelen;
 561
 562         if (!f)
 563                 return NULL;
 564
 565         while ((linelen = getline(&line, &len, f)) != -1) {
 566                 append_line(&contents, &fulllen, line, linelen);
 567         }
 568         fclose(f);
 569
 570         if (contents)
 571                 drop_trailing_newlines(contents);
 572         free(line);
 573         return contents;
 574 }
 575
 576 static bool write_string(const char *fnam, const char *string, int fd)
 577 {
 578         FILE *f;
 579         size_t len, ret;
 580
 581         if (!(f = fdopen(fd, "w")))
 582                 return false;
 583         len = strlen(string);
 584         ret = fwrite(string, 1, len, f);
 585         if (ret != len) {
 586                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 587                 fclose(f);
 588                 return false;
 589         }
 590         if (fclose(f) < 0) {
 591                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 592                 return false;
 593         }
 594         return true;
 595 }
 596
/*
 * Name, ownership and mode of one cgroup file.
 * NOTE(review): the code filling this in is outside this chunk -- confirm
 * the exact meaning of each field there.
 */
struct cgfs_files {
        char *name;
        uint32_t uid, gid;
        uint32_t mode;
};
 602
 603 #define ALLOC_NUM 20
 604 static bool store_hierarchy(char *stridx, char *h)
 605 {
 606         if (num_hierarchies % ALLOC_NUM == 0) {
 607                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 608                 n *= ALLOC_NUM;
 609                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 610                 if (!tmp) {
 611                         lxcfs_error("%s\n", strerror(errno));
 612                         exit(1);
 613                 }
 614                 hierarchies = tmp;
 615         }
 616
 617         hierarchies[num_hierarchies++] = must_copy_string(h);
 618         return true;
 619 }
 620
 621 static void print_subsystems(void)
 622 {
 623         int i;
 624
 625         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 626         fprintf(stderr, "hierarchies:\n");
 627         for (i = 0; i < num_hierarchies; i++) {
 628                 if (hierarchies[i])
 629                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 630                                 fd_hierarchies[i], hierarchies[i]);
 631         }
 632 }
 633
 634 static bool in_comma_list(const char *needle, const char *haystack)
 635 {
 636         const char *s = haystack, *e;
 637         size_t nlen = strlen(needle);
 638
 639         while (*s && (e = strchr(s, ','))) {
 640                 if (nlen != e - s) {
 641                         s = e + 1;
 642                         continue;
 643                 }
 644                 if (strncmp(needle, s, nlen) == 0)
 645                         return true;
 646                 s = e + 1;
 647         }
 648         if (strcmp(needle, s) == 0)
 649                 return true;
 650         return false;
 651 }
 652
 653 /* do we need to do any massaging here?  I'm not sure... */
 654 /* Return the mounted controller and store the corresponding open file descriptor
 655  * referring to the controller mountpoint in the private lxcfs namespace in
 656  * @cfd.
 657  */
 658 static char *find_mounted_controller(const char *controller, int *cfd)
 659 {
 660         int i;
 661
 662         for (i = 0; i < num_hierarchies; i++) {
 663                 if (!hierarchies[i])
 664                         continue;
 665                 if (strcmp(hierarchies[i], controller) == 0) {
 666                         *cfd = fd_hierarchies[i];
 667                         return hierarchies[i];
 668                 }
 669                 if (in_comma_list(controller, hierarchies[i])) {
 670                         *cfd = fd_hierarchies[i];
 671                         return hierarchies[i];
 672                 }
 673         }
 674
 675         return NULL;
 676 }
 677
 678 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 679                 const char *value)
 680 {
 681         int ret, fd, cfd;
 682         size_t len;
 683         char *fnam, *tmpc;
 684
 685         tmpc = find_mounted_controller(controller, &cfd);
 686         if (!tmpc)
 687                 return false;
 688
 689         /* Make sure we pass a relative path to *at() family of functions.
 690          * . + /cgroup + / + file + \0
 691          */
 692         len = strlen(cgroup) + strlen(file) + 3;
 693         fnam = alloca(len);
 694         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 695         if (ret < 0 || (size_t)ret >= len)
 696                 return false;
 697
 698         fd = openat(cfd, fnam, O_WRONLY);
 699         if (fd < 0)
 700                 return false;
 701
 702         return write_string(fnam, value, fd);
 703 }
 704
 705 // Chown all the files in the cgroup directory.  We do this when we create
 706 // a cgroup on behalf of a user.
 707 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 708 {
 709         struct dirent *direntp;
 710         char path[MAXPATHLEN];
 711         size_t len;
 712         DIR *d;
 713         int fd1, ret;
 714
 715         len = strlen(dirname);
 716         if (len >= MAXPATHLEN) {
 717                 lxcfs_error("Pathname too long: %s\n", dirname);
 718                 return;
 719         }
 720
 721         fd1 = openat(fd, dirname, O_DIRECTORY);
 722         if (fd1 < 0)
 723                 return;
 724
 725         d = fdopendir(fd1);
 726         if (!d) {
 727                 lxcfs_error("Failed to open %s\n", dirname);
 728                 return;
 729         }
 730
 731         while ((direntp = readdir(d))) {
 732                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 733                         continue;
 734                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 735                 if (ret < 0 || ret >= MAXPATHLEN) {
 736                         lxcfs_error("Pathname too long under %s\n", dirname);
 737                         continue;
 738                 }
 739                 if (fchownat(fd, path, uid, gid, 0) < 0)
 740                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 741         }
 742         closedir(d);
 743 }
 744
 745 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 746 {
 747         int cfd;
 748         size_t len;
 749         char *dirnam, *tmpc;
 750
 751         tmpc = find_mounted_controller(controller, &cfd);
 752         if (!tmpc)
 753                 return -EINVAL;
 754
 755         /* Make sure we pass a relative path to *at() family of functions.
 756          * . + /cg + \0
 757          */
 758         len = strlen(cg) + 2;
 759         dirnam = alloca(len);
 760         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 761
 762         if (mkdirat(cfd, dirnam, 0755) < 0)
 763                 return -errno;
 764
 765         if (uid == 0 && gid == 0)
 766                 return 0;
 767
 768         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 769                 return -errno;
 770
 771         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 772
 773         return 0;
 774 }
 775
 776 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 777 {
 778         struct dirent *direntp;
 779         DIR *dir;
 780         bool ret = false;
 781         char pathname[MAXPATHLEN];
 782         int dupfd;
 783
 784         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 785         if (dupfd < 0)
 786                 return false;
 787
 788         dir = fdopendir(dupfd);
 789         if (!dir) {
 790                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 791                 close(dupfd);
 792                 return false;
 793         }
 794
 795         while ((direntp = readdir(dir))) {
 796                 struct stat mystat;
 797                 int rc;
 798
 799                 if (!strcmp(direntp->d_name, ".") ||
 800                     !strcmp(direntp->d_name, ".."))
 801                         continue;
 802
 803                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 804                 if (rc < 0 || rc >= MAXPATHLEN) {
 805                         lxcfs_error("%s\n", "Pathname too long.");
 806                         continue;
 807                 }
 808
 809                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 810                 if (rc) {
 811                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 812                         continue;
 813                 }
 814                 if (S_ISDIR(mystat.st_mode))
 815                         if (!recursive_rmdir(pathname, fd, cfd))
 816                                 lxcfs_debug("Error removing %s.\n", pathname);
 817         }
 818
 819         ret = true;
 820         if (closedir(dir) < 0) {
 821                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 822                 ret = false;
 823         }
 824
 825         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 826                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 827                 ret = false;
 828         }
 829
 830         close(dupfd);
 831
 832         return ret;
 833 }
 834
 835 bool cgfs_remove(const char *controller, const char *cg)
 836 {
 837         int fd, cfd;
 838         size_t len;
 839         char *dirnam, *tmpc;
 840         bool bret;
 841
 842         tmpc = find_mounted_controller(controller, &cfd);
 843         if (!tmpc)
 844                 return false;
 845
 846         /* Make sure we pass a relative path to *at() family of functions.
 847          * . +  /cg + \0
 848          */
 849         len = strlen(cg) + 2;
 850         dirnam = alloca(len);
 851         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 852
 853         fd = openat(cfd, dirnam, O_DIRECTORY);
 854         if (fd < 0)
 855                 return false;
 856
 857         bret = recursive_rmdir(dirnam, fd, cfd);
 858         close(fd);
 859         return bret;
 860 }
 861
 862 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 863 {
 864         int cfd;
 865         size_t len;
 866         char *pathname, *tmpc;
 867
 868         tmpc = find_mounted_controller(controller, &cfd);
 869         if (!tmpc)
 870                 return false;
 871
 872         /* Make sure we pass a relative path to *at() family of functions.
 873          * . + /file + \0
 874          */
 875         len = strlen(file) + 2;
 876         pathname = alloca(len);
 877         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 878         if (fchmodat(cfd, pathname, mode, 0) < 0)
 879                 return false;
 880         return true;
 881 }
 882
 883 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 884 {
 885         size_t len;
 886         char *fname;
 887
 888         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 889         fname = alloca(len);
 890         snprintf(fname, len, "%s/tasks", dirname);
 891         if (fchownat(fd, fname, uid, gid, 0) != 0)
 892                 return -errno;
 893         snprintf(fname, len, "%s/cgroup.procs", dirname);
 894         if (fchownat(fd, fname, uid, gid, 0) != 0)
 895                 return -errno;
 896         return 0;
 897 }
 898
 899 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 900 {
 901         int cfd;
 902         size_t len;
 903         char *pathname, *tmpc;
 904
 905         tmpc = find_mounted_controller(controller, &cfd);
 906         if (!tmpc)
 907                 return -EINVAL;
 908
 909         /* Make sure we pass a relative path to *at() family of functions.
 910          * . + /file + \0
 911          */
 912         len = strlen(file) + 2;
 913         pathname = alloca(len);
 914         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 915         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 916                 return -errno;
 917
 918         if (is_dir(pathname, cfd))
 919                 // like cgmanager did, we want to chown the tasks file as well
 920                 return chown_tasks_files(pathname, uid, gid, cfd);
 921
 922         return 0;
 923 }
 924
 925 FILE *open_pids_file(const char *controller, const char *cgroup)
 926 {
 927         int fd, cfd;
 928         size_t len;
 929         char *pathname, *tmpc;
 930
 931         tmpc = find_mounted_controller(controller, &cfd);
 932         if (!tmpc)
 933                 return NULL;
 934
 935         /* Make sure we pass a relative path to *at() family of functions.
 936          * . + /cgroup + / "cgroup.procs" + \0
 937          */
 938         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 939         pathname = alloca(len);
 940         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 941
 942         fd = openat(cfd, pathname, O_WRONLY);
 943         if (fd < 0)
 944                 return NULL;
 945
 946         return fdopen(fd, "w");
 947 }
 948
 949 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 950                                 void ***list, size_t typesize,
 951                                 void* (*iterator)(const char*, const char*, const char*))
 952 {
 953         int cfd, fd, ret;
 954         size_t len;
 955         char *cg, *tmpc;
 956         char pathname[MAXPATHLEN];
 957         size_t sz = 0, asz = 0;
 958         struct dirent *dirent;
 959         DIR *dir;
 960
 961         tmpc = find_mounted_controller(controller, &cfd);
 962         *list = NULL;
 963         if (!tmpc)
 964                 return false;
 965
 966         /* Make sure we pass a relative path to *at() family of functions. */
 967         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 968         cg = alloca(len);
 969         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 970         if (ret < 0 || (size_t)ret >= len) {
 971                 lxcfs_error("Pathname too long under %s\n", cgroup);
 972                 return false;
 973         }
 974
 975         fd = openat(cfd, cg, O_DIRECTORY);
 976         if (fd < 0)
 977                 return false;
 978
 979         dir = fdopendir(fd);
 980         if (!dir)
 981                 return false;
 982
 983         while ((dirent = readdir(dir))) {
 984                 struct stat mystat;
 985
 986                 if (!strcmp(dirent->d_name, ".") ||
 987                     !strcmp(dirent->d_name, ".."))
 988                         continue;
 989
 990                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 991                 if (ret < 0 || ret >= MAXPATHLEN) {
 992                         lxcfs_error("Pathname too long under %s\n", cg);
 993                         continue;
 994                 }
 995
 996                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 997                 if (ret) {
 998                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
 999                         continue;
1000                 }
1001                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1002                     (directories && !S_ISDIR(mystat.st_mode)))
1003                         continue;
1004
1005                 if (sz+2 >= asz) {
1006                         void **tmp;
1007                         asz += BATCH_SIZE;
1008                         do {
1009                                 tmp = realloc(*list, asz * typesize);
1010                         } while  (!tmp);
1011                         *list = tmp;
1012                 }
1013                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1014                 (*list)[sz+1] = NULL;
1015                 sz++;
1016         }
1017         if (closedir(dir) < 0) {
1018                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1019                 return false;
1020         }
1021         return true;
1022 }
1023
/*
 * Iterator callback for cgfs_list_children(): the list element for a child
 * cgroup is simply a heap copy of its directory name.
 *
 * @controller and @cgroup are not used. Allocation is retried until it
 * succeeds, so the result is never NULL; the caller owns the copy.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
        char *copy;

        for (;;) {
                copy = strdup(dir_entry);
                if (copy)
                        return copy;
        }
}
1032
1033 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1034 {
1035         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1036 }
1037
1038 void free_key(struct cgfs_files *k)
1039 {
1040         if (!k)
1041                 return;
1042         free(k->name);
1043         free(k);
1044 }
1045
/*
 * Release a NULL-terminated array of keys: every element up to the first
 * NULL slot, then the array itself. A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
        struct cgfs_files **cursor;

        if (!keys)
                return;

        for (cursor = keys; *cursor; cursor++)
                free_key(*cursor);

        free(keys);
}
1057
/*
 * Read the contents of @file inside @cgroup into *@value.
 *
 * The path is resolved relative to the directory descriptor that
 * find_mounted_controller() reports for @controller. *@value is set to the
 * result of slurp_file().
 *
 * Returns true when slurp_file() produced a non-NULL buffer; false when the
 * controller is not found, the path does not fit, the file cannot be opened,
 * or slurp_file() returned NULL.
 *
 * NOTE(review): the opened descriptor is handed to slurp_file(), which is
 * presumably responsible for closing it - confirm against its definition.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
        int mount_fd, file_fd, written;
        size_t bufsize;
        char *relpath;

        if (!find_mounted_controller(controller, &mount_fd))
                return false;

        /* Optional leading '.', cgroup, '/', file and the '\0'. */
        bufsize = strlen(cgroup) + strlen(file) + 3;
        relpath = alloca(bufsize);

        /* "/cg" becomes "./cg" so openat() stays relative to mount_fd. */
        written = snprintf(relpath, bufsize, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
        if (written < 0 || (size_t)written >= bufsize)
                return false;

        file_fd = openat(mount_fd, relpath, O_RDONLY);
        if (file_fd < 0)
                return false;

        *value = slurp_file(relpath, file_fd);
        return *value != NULL;
}
1084
1085 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1086 {
1087         int ret, cfd;
1088         size_t len;
1089         char *fnam, *tmpc;
1090         struct stat sb;
1091         struct cgfs_files *newkey;
1092
1093         tmpc = find_mounted_controller(controller, &cfd);
1094         if (!tmpc)
1095                 return false;
1096
1097         if (file && *file == '/')
1098                 file++;
1099
1100         if (file && strchr(file, '/'))
1101                 return NULL;
1102
1103         /* Make sure we pass a relative path to *at() family of functions.
1104          * . + /cgroup + / + file + \0
1105          */
1106         len = strlen(cgroup) + 3;
1107         if (file)
1108                 len += strlen(file) + 1;
1109         fnam = alloca(len);
1110         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1111                  file ? "/" : "", file ? file : "");
1112
1113         ret = fstatat(cfd, fnam, &sb, 0);
1114         if (ret < 0)
1115                 return NULL;
1116
1117         do {
1118                 newkey = malloc(sizeof(struct cgfs_files));
1119         } while (!newkey);
1120         if (file)
1121                 newkey->name = must_copy_string(file);
1122         else if (strrchr(cgroup, '/'))
1123                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1124         else
1125                 newkey->name = must_copy_string(cgroup);
1126         newkey->uid = sb.st_uid;
1127         newkey->gid = sb.st_gid;
1128         newkey->mode = sb.st_mode;
1129
1130         return newkey;
1131 }
1132
/*
 * Iterator callback for cgfs_list_keys(): the list element for a file is the
 * key returned by cgfs_get_key() for @dir_entry under @cgroup.
 *
 * Logs an error and returns NULL when cgfs_get_key() fails.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
        struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

        if (key)
                return key;

        lxcfs_error("Error getting files under %s:%s\n", controller,
                     cgroup);
        return NULL;
}
1142
1143 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1144 {
1145         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1146 }
1147
/*
 * Tell whether @f is a directory directly inside @cgroup.
 *
 * The path is resolved relative to the directory descriptor that
 * find_mounted_controller() reports for @controller.
 *
 * Returns false when the controller is not found, the path does not fit,
 * fstatat() fails, or the entry is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
        int mount_fd, written;
        size_t bufsize;
        char *relpath;
        struct stat st;

        if (!find_mounted_controller(controller, &mount_fd))
                return false;

        /* Optional leading '.', cgroup, '/', f and the '\0'. */
        bufsize = strlen(cgroup) + strlen(f) + 3;
        relpath = alloca(bufsize);

        /* "/cg" becomes "./cg" so fstatat() stays relative to mount_fd. */
        written = snprintf(relpath, bufsize, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
        if (written < 0 || (size_t)written >= bufsize)
                return false;

        if (fstatat(mount_fd, relpath, &st, 0) < 0)
                return false;

        return S_ISDIR(st.st_mode);
}
1175
/*
 * Result codes for send_creds(); callers below compare its return value
 * against SEND_CREDS_OK.
 * NOTE(review): the precise meaning of NOTSK and FAIL is set by send_creds(),
 * which is defined outside this section - confirm there.
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations for helpers called by the functions that follow. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1183
1184 /*
1185  * clone a task which switches to @task's namespace and writes '1'.
1186  * over a unix sock so we can read the task's reaper's pid in our
1187  * namespace
1188  *
1189  * Note: glibc's fork() does not respect pidns, which can lead to failed
1190  * assertions inside glibc (and thus failed forks) if the child's pid in
1191  * the pidns and the parent pid outside are identical. Using clone prevents
1192  * this issue.
1193  */
1194 static void write_task_init_pid_exit(int sock, pid_t target)
1195 {
1196         char fnam[100];
1197         pid_t pid;
1198         int fd, ret;
1199         size_t stack_size = sysconf(_SC_PAGESIZE);
1200         void *stack = alloca(stack_size);
1201
1202         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1203         if (ret < 0 || ret >= sizeof(fnam))
1204                 _exit(1);
1205
1206         fd = open(fnam, O_RDONLY);
1207         if (fd < 0) {
1208                 perror("write_task_init_pid_exit open of ns/pid");
1209                 _exit(1);
1210         }
1211         if (setns(fd, 0)) {
1212                 perror("write_task_init_pid_exit setns 1");
1213                 close(fd);
1214                 _exit(1);
1215         }
1216         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1217         if (pid < 0)
1218                 _exit(1);
1219         if (pid != 0) {
1220                 if (!wait_for_pid(pid))
1221                         _exit(1);
1222                 _exit(0);
1223         }
1224 }
1225
1226 static int send_creds_clone_wrapper(void *arg) {
1227         struct ucred cred;
1228         char v;
1229         int sock = *(int *)arg;
1230
1231         /* we are the child */
1232         cred.uid = 0;
1233         cred.gid = 0;
1234         cred.pid = 1;
1235         v = '1';
1236         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1237                 return 1;
1238         return 0;
1239 }
1240
1241 static pid_t get_init_pid_for_task(pid_t task)
1242 {
1243         int sock[2];
1244         pid_t pid;
1245         pid_t ret = -1;
1246         char v = '0';
1247         struct ucred cred;
1248
1249         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1250                 perror("socketpair");
1251                 return -1;
1252         }
1253
1254         pid = fork();
1255         if (pid < 0)
1256                 goto out;
1257         if (!pid) {
1258                 close(sock[1]);
1259                 write_task_init_pid_exit(sock[0], task);
1260                 _exit(0);
1261         }
1262
1263         if (!recv_creds(sock[1], &cred, &v))
1264                 goto out;
1265         ret = cred.pid;
1266
1267 out:
1268         close(sock[0]);
1269         close(sock[1]);
1270         if (pid > 0)
1271                 wait_for_pid(pid);
1272         return ret;
1273 }
1274
1275 static pid_t lookup_initpid_in_store(pid_t qpid)
1276 {
1277         pid_t answer = 0;
1278         struct stat sb;
1279         struct pidns_init_store *e;
1280         char fnam[100];
1281
1282         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1283         store_lock();
1284         if (stat(fnam, &sb) < 0)
1285                 goto out;
1286         e = lookup_verify_initpid(&sb);
1287         if (e) {
1288                 answer = e->initpid;
1289                 goto out;
1290         }
1291         answer = get_init_pid_for_task(qpid);
1292         if (answer > 0)
1293                 save_initpid(&sb, answer);
1294
1295 out:
1296         /* we prune at end in case we are returning
1297          * the value we were about to return */
1298         prune_initpid_store();
1299         store_unlock();
1300         return answer;
1301 }
1302
/*
 * Reap child @pid and report how it ended.
 *
 * waitpid() is retried when interrupted by a signal (EINTR).
 *
 * Returns 0 if the child exited normally with status 0; -1 if @pid is not
 * positive, waitpid() fails for another reason, or the child ended any
 * other way.
 */
static int wait_for_pid(pid_t pid)
{
        int status;
        pid_t reaped;

        if (pid <= 0)
                return -1;

        for (;;) {
                reaped = waitpid(pid, &status, 0);
                if (reaped == pid)
                        break;
                if (reaped == -1 && errno != EINTR)
                        return -1;
        }

        if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
                return 0;
        return -1;
}
1323
1324
1325 /*
1326  * append pid to *src.
1327  * src: a pointer to a char* in which ot append the pid.
1328  * sz: the number of characters printed so far, minus trailing \0.
1329  * asz: the allocated size so far
1330  * pid: the pid to append
1331  */
1332 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1333 {
1334         char tmp[30];
1335
1336         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1337
1338         if (!*src || tmplen + *sz + 1 >= *asz) {
1339                 char *tmp;
1340                 do {
1341                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1342                 } while (!tmp);
1343                 *src = tmp;
1344                 *asz += BUF_RESERVE_SIZE;
1345         }
1346         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1347         *sz += tmplen;
1348 }
1349
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id as mapped into pid's namespace.
 *
 * The file is rewound and read line by line; lines that do not hold three
 * unsigned numbers (namespace base, host base, range length) are skipped.
 *
 * Returns the mapped id, or -1 (as unsigned) when no range contains @in_id
 * or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
        char line[400];
        unsigned int ns_base,   // first id of the range in the idfile's namespace
                     host_base, // first id of the range in the caller's namespace
                     range;     // number of ids in the range

        fseek(idfile, 0L, SEEK_SET);
        while (fgets(line, 400, idfile)) {
                if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
                        continue;

                if (host_base + range < host_base || ns_base + range < ns_base) {
                        /*
                         * ids wrapped around - unexpected as this is a
                         * procfile, so just bail.
                         */
                        lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
                                ns_base, host_base, range, line);
                        return -1;
                }

                if (in_id < host_base || in_id >= host_base + range)
                        continue;

                /*
                 * host_base <= in_id < host_base + range and neither sum
                 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
                 */
                return (in_id - host_base) + ns_base;
        }

        // no answer found
        return -1;
}
1393
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Buffer size used for /proc/<pid>/... path names. */
#define PROCLEN 100

/*
 * Decide whether @uid, as seen through the uid map of process @pid, has
 * privilege over @victim.
 *
 * - Either id being -1 is refused.
 * - When @req_ns_root is false, equal ids suffice (i.e. uid 1000 has write
 *   access to files owned by uid 1000).
 * - Otherwise @uid must map to 0 in /proc/<pid>/uid_map and @victim must be
 *   mapped there at all.
 *
 * Returns false as well when the map file cannot be named or opened.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
        char map_path[PROCLEN];
        FILE *uid_map;
        bool privileged;
        int written;

        if (victim == -1 || uid == -1)
                return false;

        if (!req_ns_root && uid == victim)
                return true;

        written = snprintf(map_path, PROCLEN, "/proc/%d/uid_map", pid);
        if (written < 0 || written >= PROCLEN)
                return false;

        uid_map = fopen(map_path, "r");
        if (!uid_map)
                return false;

        /*
         * The caller must be root in his namespace, and the victim must be
         * mapped into it. The second lookup only runs when the first passes.
         * XXX I'm not sure the victim check is needed given that fuse
         * will be sending requests where the vfs has converted
         */
        privileged = convert_id_to_ns(uid_map, uid) == 0 &&
                     convert_id_to_ns(uid_map, victim) != -1;

        fclose(uid_map);
        return privileged;
}
1449
/*
 * Check whether the "other" permission bits of @fmode grant the access mode
 * requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR after masking with
 * O_ACCMODE).
 *
 * Callers shift the owner or group bits down into the "other" position
 * before calling. Returns false for any other access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
        const mode_t access = req_mode & O_ACCMODE;
        mode_t needed;

        if (access == O_RDONLY)
                needed = S_IROTH;
        else if (access == O_WRONLY)
                needed = S_IWOTH;
        else if (access == O_RDWR)
                needed = S_IROTH | S_IWOTH;
        else
                return false;

        return (fmode & needed) == needed;
}
1469
1470
/*
 * Return the first path component of @taskcg that follows @querycg.
 *
 * e.g. taskcg "/a/b/c/d/e" with querycg "/a/b/c" yields "d".
 * When @querycg is "/" or "./" only the first character of @taskcg is
 * skipped.
 *
 * @taskcg must be longer than @querycg; otherwise an error is logged and
 * NULL is returned. Returns a newly allocated string the caller must free,
 * or NULL on allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
        size_t skip;
        char *component, *slash;

        if (strlen(taskcg) <= strlen(querycg)) {
                lxcfs_error("%s\n", "I was fed bad input.");
                return NULL;
        }

        if (!strcmp(querycg, "/") || !strcmp(querycg, "./"))
                skip = 1;
        else
                skip = strlen(querycg) + 1;

        component = strdup(taskcg + skip);
        if (!component)
                return NULL;

        /* Keep only what precedes the next separator. */
        slash = strchr(component, '/');
        if (slash)
                *slash = '\0';
        return component;
}
1496
/*
 * Remove one trailing '\n' from @x in place, if there is one.
 * An empty string is left untouched.
 */
static void stripnewline(char *x)
{
        char *last;

        if (*x == '\0')
                return;

        last = x + strlen(x) - 1;
        if (*last == '\n')
                *last = '\0';
}
1503
1504 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1505 {
1506         int cfd;
1507         char fnam[PROCLEN];
1508         FILE *f;
1509         char *answer = NULL;
1510         char *line = NULL;
1511         size_t len = 0;
1512         int ret;
1513         const char *h = find_mounted_controller(contrl, &cfd);
1514         if (!h)
1515                 return NULL;
1516
1517         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1518         if (ret < 0 || ret >= PROCLEN)
1519                 return NULL;
1520         if (!(f = fopen(fnam, "r")))
1521                 return NULL;
1522
1523         while (getline(&line, &len, f) != -1) {
1524                 char *c1, *c2;
1525                 if (!line[0])
1526                         continue;
1527                 c1 = strchr(line, ':');
1528                 if (!c1)
1529                         goto out;
1530                 c1++;
1531                 c2 = strchr(c1, ':');
1532                 if (!c2)
1533                         goto out;
1534                 *c2 = '\0';
1535                 if (strcmp(c1, h) != 0)
1536                         continue;
1537                 c2++;
1538                 stripnewline(c2);
1539                 do {
1540                         answer = strdup(c2);
1541                 } while (!answer);
1542                 break;
1543         }
1544
1545 out:
1546         fclose(f);
1547         free(line);
1548         return answer;
1549 }
1550
1551 /*
1552  * check whether a fuse context may access a cgroup dir or file
1553  *
1554  * If file is not null, it is a cgroup file to check under cg.
1555  * If file is null, then we are checking perms on cg itself.
1556  *
1557  * For files we can check the mode of the list_keys result.
1558  * For cgroups, we must make assumptions based on the files under the
1559  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1560  * yet.
1561  */
1562 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1563 {
1564         struct cgfs_files *k = NULL;
1565         bool ret = false;
1566
1567         k = cgfs_get_key(contrl, cg, file);
1568         if (!k)
1569                 return false;
1570
1571         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1572                 if (perms_include(k->mode >> 6, mode)) {
1573                         ret = true;
1574                         goto out;
1575                 }
1576         }
1577         if (fc->gid == k->gid) {
1578                 if (perms_include(k->mode >> 3, mode)) {
1579                         ret = true;
1580                         goto out;
1581                 }
1582         }
1583         ret = perms_include(k->mode, mode);
1584
1585 out:
1586         free_key(k);
1587         return ret;
1588 }
1589
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg, in place.
 *
 * When the whole string is "/init.scope" the result is "/" rather than the
 * empty string. Paths that do not end in "/init.scope" are left untouched.
 */
static void prune_init_slice(char *cg)
{
        const size_t suffix_len = strlen(INITSCOPE);
        const size_t cg_len = strlen(cg);
        char *tail;

        if (cg_len < suffix_len)
                return;

        tail = cg + (cg_len - suffix_len);
        if (strcmp(tail, INITSCOPE) != 0)
                return;

        /* Keep the leading '/' when nothing else would remain. */
        if (tail == cg)
                tail++;
        *tail = '\0';
}
1607
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b - and not on /ab
 * either: the caller's cgroup must match @cg on whole path components.
 *
 * Returns true when the cgroup of @pid (for controller @contrl, with a
 * trailing "/init.scope" pruned) is @cg or an ancestor of it.
 *
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller. *nextcg may be NULL if that directory cannot be
 * determined, and is left untouched when the cgroup of @pid cannot be read.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
        bool answer = false;
        char *c2 = get_pid_cgroup(pid, contrl);
        char *linecmp;
        size_t len;

        if (!c2)
                return false;
        prune_init_slice(c2);

        /*
         * callers pass in '/' or './' (openat()) for root cgroup, otherwise
         * they pass in a cgroup without leading '/'
         *
         * The original line here was:
         *      linecmp = *cg == '/' ? c2 : c2+1;
         * TODO: I'm not sure why you'd want to increment when *cg != '/'?
         *       Serge, do you know?
         */
        if (*cg == '/' || !strncmp(cg, "./", 2))
                linecmp = c2;
        else
                linecmp = c2 + 1;

        len = strlen(linecmp);
        if (strncmp(linecmp, cg, len) == 0) {
                /*
                 * A bare prefix match is not enough: "a" must not be taken
                 * for an ancestor of "ab". The match is on a component
                 * boundary when the caller's cgroup is empty or ends in '/',
                 * or when @cg ends or continues with '/' right after it.
                 * cg[len] is readable because the first len bytes matched.
                 */
                if (len == 0 || linecmp[len - 1] == '/' ||
                    cg[len] == '\0' || cg[len] == '/')
                        answer = true;
        }

        if (!answer && nextcg)
                *nextcg = get_next_cgroup_dir(linecmp, cg);

        free(c2);
        return answer;
}
1650
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * Returns true when @cg is the root ("/" or "./"), when the task sits in
 * the root cgroup, or when @cg is the task's own cgroup, one of its
 * ancestors or one of its descendants (compared on whole path components).
 * Returns false when the cgroup of @pid cannot be read.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
        char *raw, *own;
        size_t want_len, own_len;
        bool visible;

        if (!strcmp(cg, "/") || !strcmp(cg, "./"))
                return true;

        raw = get_pid_cgroup(pid, contrl);
        if (!raw)
                return false;
        prune_init_slice(raw);

        /* Compare without the leading '/' of the task's cgroup. */
        own = raw + 1;
        want_len = strlen(cg);
        own_len = strlen(own);

        if (own_len == 0) {
                /* Task is in the root cg, it can see everything. The
                 * comparisons below look for a '/' boundary and would miss
                 * this case, since the only '/' was chopped off above.
                 */
                visible = true;
        } else if (strcmp(cg, own) == 0) {
                visible = true;
        } else if (want_len < own_len) {
                /* looking up a parent dir */
                visible = strncmp(own, cg, want_len) == 0 && own[want_len] == '/';
        } else if (want_len > own_len) {
                /* looking up a child dir */
                visible = strncmp(own, cg, own_len) == 0 && cg[own_len] == '/';
        } else {
                /* same length but different name */
                visible = false;
        }

        free(raw);
        return visible;
}
1701
1702 /*
1703  * given /cgroup/freezer/a/b, return "freezer".
1704  * the returned char* should NOT be freed.
1705  */
1706 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1707 {
1708         const char *p1;
1709         char *contr, *slash;
1710
1711         if (strlen(path) < 9) {
1712                 errno = EACCES;
1713                 return NULL;
1714         }
1715         if (*(path + 7) != '/') {
1716                 errno = EINVAL;
1717                 return NULL;
1718         }
1719         p1 = path + 8;
1720         contr = strdupa(p1);
1721         if (!contr) {
1722                 errno = ENOMEM;
1723                 return NULL;
1724         }
1725         slash = strstr(contr, "/");
1726         if (slash)
1727                 *slash = '\0';
1728
1729         int i;
1730         for (i = 0; i < num_hierarchies; i++) {
1731                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1732                         return hierarchies[i];
1733         }
1734         errno = ENOENT;
1735         return NULL;
1736 }
1737
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * Returns a pointer into @path just past the '/' that ends the controller
 * name, with errno cleared. Returns NULL with errno set to EACCES when
 * @path is shorter than 9 characters, or to EINVAL when nothing follows the
 * controller name.
 */
static const char *find_cgroup_in_path(const char *path)
{
        const char *sep;

        if (strlen(path) < 9) {
                errno = EACCES;
                return NULL;
        }

        /* Skip "/cgroup/" and look for the end of the controller name. */
        sep = strchr(path + 8, '/');
        if (sep == NULL) {
                errno = EINVAL;
                return NULL;
        }

        errno = 0;
        return sep + 1;
}
1758
/*
 * split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg cut at its last '/'; it
 * should be freed by the caller. @last receives a pointer into @cg at that
 * last '/' (so it includes the slash) and must not be freed.
 *
 * When @cg contains no '/', @dir is a full copy of @cg and @last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
        char *copy;
        char *slash;

        /* Allocation is retried until it succeeds. */
        for (;;) {
                copy = strdup(cg);
                if (copy)
                        break;
        }
        *dir = copy;

        slash = strrchr(cg, '/');
        *last = slash;
        if (slash)
                copy[slash - cg] = '\0';
}
1778
1779 /*
1780  * FUSE ops for /cgroup
1781  */
1782
1783 int cg_getattr(const char *path, struct stat *sb)
1784 {
1785         struct timespec now;
1786         struct fuse_context *fc = fuse_get_context();
1787         char * cgdir = NULL;
1788         char *last = NULL, *path1, *path2;
1789         struct cgfs_files *k = NULL;
1790         const char *cgroup;
1791         const char *controller = NULL;
1792         int ret = -ENOENT;
1793
1794
1795         if (!fc)
1796                 return -EIO;
1797
1798         memset(sb, 0, sizeof(struct stat));
1799
1800         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1801                 return -EINVAL;
1802
1803         sb->st_uid = sb->st_gid = 0;
1804         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1805         sb->st_size = 0;
1806
1807         if (strcmp(path, "/cgroup") == 0) {
1808                 sb->st_mode = S_IFDIR | 00755;
1809                 sb->st_nlink = 2;
1810                 return 0;
1811         }
1812
1813         controller = pick_controller_from_path(fc, path);
1814         if (!controller)
1815                 return -errno;
1816         cgroup = find_cgroup_in_path(path);
1817         if (!cgroup) {
1818                 /* this is just /cgroup/controller, return it as a dir */
1819                 sb->st_mode = S_IFDIR | 00755;
1820                 sb->st_nlink = 2;
1821                 return 0;
1822         }
1823
1824         get_cgdir_and_path(cgroup, &cgdir, &last);
1825
1826         if (!last) {
1827                 path1 = "/";
1828                 path2 = cgdir;
1829         } else {
1830                 path1 = cgdir;
1831                 path2 = last;
1832         }
1833
1834         pid_t initpid = lookup_initpid_in_store(fc->pid);
1835         if (initpid <= 0)
1836                 initpid = fc->pid;
1837         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1838          * Then check that caller's cgroup is under path if last is a child
1839          * cgroup, or cgdir if last is a file */
1840
1841         if (is_child_cgroup(controller, path1, path2)) {
1842                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1843                         ret = -ENOENT;
1844                         goto out;
1845                 }
1846                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1847                         /* this is just /cgroup/controller, return it as a dir */
1848                         sb->st_mode = S_IFDIR | 00555;
1849                         sb->st_nlink = 2;
1850                         ret = 0;
1851                         goto out;
1852                 }
1853                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1854                         ret = -EACCES;
1855                         goto out;
1856                 }
1857
1858                 // get uid, gid, from '/tasks' file and make up a mode
1859                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1860                 sb->st_mode = S_IFDIR | 00755;
1861                 k = cgfs_get_key(controller, cgroup, NULL);
1862                 if (!k) {
1863                         sb->st_uid = sb->st_gid = 0;
1864                 } else {
1865                         sb->st_uid = k->uid;
1866                         sb->st_gid = k->gid;
1867                 }
1868                 free_key(k);
1869                 sb->st_nlink = 2;
1870                 ret = 0;
1871                 goto out;
1872         }
1873
1874         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1875                 sb->st_mode = S_IFREG | k->mode;
1876                 sb->st_nlink = 1;
1877                 sb->st_uid = k->uid;
1878                 sb->st_gid = k->gid;
1879                 sb->st_size = 0;
1880                 free_key(k);
1881                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1882                         ret = -ENOENT;
1883                         goto out;
1884                 }
1885                 ret = 0;
1886         }
1887
1888 out:
1889         free(cgdir);
1890         return ret;
1891 }
1892
1893 int cg_opendir(const char *path, struct fuse_file_info *fi)
1894 {
1895         struct fuse_context *fc = fuse_get_context();
1896         const char *cgroup;
1897         struct file_info *dir_info;
1898         char *controller = NULL;
1899
1900         if (!fc)
1901                 return -EIO;
1902
1903         if (strcmp(path, "/cgroup") == 0) {
1904                 cgroup = NULL;
1905                 controller = NULL;
1906         } else {
1907                 // return list of keys for the controller, and list of child cgroups
1908                 controller = pick_controller_from_path(fc, path);
1909                 if (!controller)
1910                         return -errno;
1911
1912                 cgroup = find_cgroup_in_path(path);
1913                 if (!cgroup) {
1914                         /* this is just /cgroup/controller, return its contents */
1915                         cgroup = "/";
1916                 }
1917         }
1918
1919         pid_t initpid = lookup_initpid_in_store(fc->pid);
1920         if (initpid <= 0)
1921                 initpid = fc->pid;
1922         if (cgroup) {
1923                 if (!caller_may_see_dir(initpid, controller, cgroup))
1924                         return -ENOENT;
1925                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1926                         return -EACCES;
1927         }
1928
1929         /* we'll free this at cg_releasedir */
1930         dir_info = malloc(sizeof(*dir_info));
1931         if (!dir_info)
1932                 return -ENOMEM;
1933         dir_info->controller = must_copy_string(controller);
1934         dir_info->cgroup = must_copy_string(cgroup);
1935         dir_info->type = LXC_TYPE_CGDIR;
1936         dir_info->buf = NULL;
1937         dir_info->file = NULL;
1938         dir_info->buflen = 0;
1939
1940         fi->fh = (unsigned long)dir_info;
1941         return 0;
1942 }
1943
1944 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1945                 struct fuse_file_info *fi)
1946 {
1947         struct file_info *d = (struct file_info *)fi->fh;
1948         struct cgfs_files **list = NULL;
1949         int i, ret;
1950         char *nextcg = NULL;
1951         struct fuse_context *fc = fuse_get_context();
1952         char **clist = NULL;
1953
1954         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1955                 return -EIO;
1956
1957         if (d->type != LXC_TYPE_CGDIR) {
1958                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1959                 return -EIO;
1960         }
1961         if (!d->cgroup && !d->controller) {
1962                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1963                 int i;
1964
1965                 for (i = 0;  i < num_hierarchies; i++) {
1966                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1967                                 return -EIO;
1968                         }
1969                 }
1970                 return 0;
1971         }
1972
1973         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1974                 // not a valid cgroup
1975                 ret = -EINVAL;
1976                 goto out;
1977         }
1978
1979         pid_t initpid = lookup_initpid_in_store(fc->pid);
1980         if (initpid <= 0)
1981                 initpid = fc->pid;
1982         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1983                 if (nextcg) {
1984                         ret = filler(buf, nextcg,  NULL, 0);
1985                         free(nextcg);
1986                         if (ret != 0) {
1987                                 ret = -EIO;
1988                                 goto out;
1989                         }
1990                 }
1991                 ret = 0;
1992                 goto out;
1993         }
1994
1995         for (i = 0; list[i]; i++) {
1996                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1997                         ret = -EIO;
1998                         goto out;
1999                 }
2000         }
2001
2002         // now get the list of child cgroups
2003
2004         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2005                 ret = 0;
2006                 goto out;
2007         }
2008         if (clist) {
2009                 for (i = 0; clist[i]; i++) {
2010                         if (filler(buf, clist[i], NULL, 0) != 0) {
2011                                 ret = -EIO;
2012                                 goto out;
2013                         }
2014                 }
2015         }
2016         ret = 0;
2017
2018 out:
2019         free_keys(list);
2020         if (clist) {
2021                 for (i = 0; clist[i]; i++)
2022                         free(clist[i]);
2023                 free(clist);
2024         }
2025         return ret;
2026 }
2027
2028 static void do_release_file_info(struct fuse_file_info *fi)
2029 {
2030         struct file_info *f = (struct file_info *)fi->fh;
2031
2032         if (!f)
2033                 return;
2034
2035         fi->fh = 0;
2036
2037         free(f->controller);
2038         f->controller = NULL;
2039         free(f->cgroup);
2040         f->cgroup = NULL;
2041         free(f->file);
2042         f->file = NULL;
2043         free(f->buf);
2044         f->buf = NULL;
2045         free(f);
2046 }
2047
/*
 * cg_releasedir - FUSE releasedir handler: frees the directory handle
 * kept in fi->fh. @path is not used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
2053
2054 int cg_open(const char *path, struct fuse_file_info *fi)
2055 {
2056         const char *cgroup;
2057         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2058         struct cgfs_files *k = NULL;
2059         struct file_info *file_info;
2060         struct fuse_context *fc = fuse_get_context();
2061         int ret;
2062
2063         if (!fc)
2064                 return -EIO;
2065
2066         controller = pick_controller_from_path(fc, path);
2067         if (!controller)
2068                 return -errno;
2069         cgroup = find_cgroup_in_path(path);
2070         if (!cgroup)
2071                 return -errno;
2072
2073         get_cgdir_and_path(cgroup, &cgdir, &last);
2074         if (!last) {
2075                 path1 = "/";
2076                 path2 = cgdir;
2077         } else {
2078                 path1 = cgdir;
2079                 path2 = last;
2080         }
2081
2082         k = cgfs_get_key(controller, path1, path2);
2083         if (!k) {
2084                 ret = -EINVAL;
2085                 goto out;
2086         }
2087         free_key(k);
2088
2089         pid_t initpid = lookup_initpid_in_store(fc->pid);
2090         if (initpid <= 0)
2091                 initpid = fc->pid;
2092         if (!caller_may_see_dir(initpid, controller, path1)) {
2093                 ret = -ENOENT;
2094                 goto out;
2095         }
2096         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2097                 ret = -EACCES;
2098                 goto out;
2099         }
2100
2101         /* we'll free this at cg_release */
2102         file_info = malloc(sizeof(*file_info));
2103         if (!file_info) {
2104                 ret = -ENOMEM;
2105                 goto out;
2106         }
2107         file_info->controller = must_copy_string(controller);
2108         file_info->cgroup = must_copy_string(path1);
2109         file_info->file = must_copy_string(path2);
2110         file_info->type = LXC_TYPE_CGFILE;
2111         file_info->buf = NULL;
2112         file_info->buflen = 0;
2113
2114         fi->fh = (unsigned long)file_info;
2115         ret = 0;
2116
2117 out:
2118         free(cgdir);
2119         return ret;
2120 }
2121
2122 int cg_access(const char *path, int mode)
2123 {
2124         int ret;
2125         const char *cgroup;
2126         char *path1, *path2, *controller;
2127         char *last = NULL, *cgdir = NULL;
2128         struct cgfs_files *k = NULL;
2129         struct fuse_context *fc = fuse_get_context();
2130
2131         if (strcmp(path, "/cgroup") == 0)
2132                 return 0;
2133
2134         if (!fc)
2135                 return -EIO;
2136
2137         controller = pick_controller_from_path(fc, path);
2138         if (!controller)
2139                 return -errno;
2140         cgroup = find_cgroup_in_path(path);
2141         if (!cgroup) {
2142                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2143                 if ((mode & W_OK) == 0)
2144                         return 0;
2145                 return -EACCES;
2146         }
2147
2148         get_cgdir_and_path(cgroup, &cgdir, &last);
2149         if (!last) {
2150                 path1 = "/";
2151                 path2 = cgdir;
2152         } else {
2153                 path1 = cgdir;
2154                 path2 = last;
2155         }
2156
2157         k = cgfs_get_key(controller, path1, path2);
2158         if (!k) {
2159                 if ((mode & W_OK) == 0)
2160                         ret = 0;
2161                 else
2162                         ret = -EACCES;
2163                 goto out;
2164         }
2165         free_key(k);
2166
2167         pid_t initpid = lookup_initpid_in_store(fc->pid);
2168         if (initpid <= 0)
2169                 initpid = fc->pid;
2170         if (!caller_may_see_dir(initpid, controller, path1)) {
2171                 ret = -ENOENT;
2172                 goto out;
2173         }
2174         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2175                 ret = -EACCES;
2176                 goto out;
2177         }
2178
2179         ret = 0;
2180
2181 out:
2182         free(cgdir);
2183         return ret;
2184 }
2185
/*
 * cg_release - FUSE release handler: frees the file handle kept in
 * fi->fh. @path is not used. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
2191
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock - wait until @sock is readable or has been hung up.
 * @sock:    file descriptor to watch.
 * @timeout: maximum time to wait, in seconds.
 *
 * Returns true if epoll reported an event on @sock before the deadline.
 * Returns false on timeout (errno is set to 0) or on failure (errno holds
 * the cause of the failing call).
 */
static bool wait_for_sock(int sock, int timeout)
{
        struct epoll_event ev;
        time_t starttime, now, deltatime;
        int epfd, ret, saved_errno;

        /* keep timestamps in time_t: an int would truncate the value */
        starttime = time(NULL);
        if (starttime == (time_t)-1)
                return false;

        epfd = epoll_create(1);
        if (epfd < 0) {
                saved_errno = errno;
                lxcfs_error("Failed to create epoll socket: %s.\n", strerror(saved_errno));
                errno = saved_errno;
                return false;
        }

        ev.events = POLLIN_SET;
        ev.data.fd = sock;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
                saved_errno = errno;
                lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(saved_errno));
                close(epfd);
                errno = saved_errno;
                return false;
        }

again:
        now = time(NULL);
        if (now == (time_t)-1) {
                close(epfd);
                return false;
        }

        /* recomputed on every retry so EINTR does not extend the deadline */
        deltatime = (starttime + timeout) - now;
        if (deltatime < 0) { // timeout
                errno = 0;
                close(epfd);
                return false;
        }

        ret = epoll_wait(epfd, &ev, 1, (int)(1000 * deltatime + 1));
        if (ret < 0 && errno == EINTR)
                goto again;

        /* a plain timeout (ret == 0) must not report a stale errno */
        saved_errno = ret < 0 ? errno : 0;
        close(epfd);

        if (ret <= 0) {
                errno = saved_errno;
                return false;
        }
        return true;
}
2239
/*
 * msgrecv - receive up to @len bytes from @sockfd into @buf, waiting at
 * most two seconds for data to arrive.
 *
 * Returns the result of recv(), or -1 if wait_for_sock() reported that
 * the socket did not become ready.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
        bool ready = wait_for_sock(sockfd, 2);

        if (ready)
                return recv(sockfd, buf, len, MSG_DONTWAIT);

        return -1;
}
2246
2247 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2248 {
2249         struct msghdr msg = { 0 };
2250         struct iovec iov;
2251         struct cmsghdr *cmsg;
2252         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2253         char buf[1];
2254         buf[0] = 'p';
2255
2256         if (pingfirst) {
2257                 if (msgrecv(sock, buf, 1) != 1) {
2258                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2259                         return SEND_CREDS_FAIL;
2260                 }
2261         }
2262
2263         msg.msg_control = cmsgbuf;
2264         msg.msg_controllen = sizeof(cmsgbuf);
2265
2266         cmsg = CMSG_FIRSTHDR(&msg);
2267         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2268         cmsg->cmsg_level = SOL_SOCKET;
2269         cmsg->cmsg_type = SCM_CREDENTIALS;
2270         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2271
2272         msg.msg_name = NULL;
2273         msg.msg_namelen = 0;
2274
2275         buf[0] = v;
2276         iov.iov_base = buf;
2277         iov.iov_len = sizeof(buf);
2278         msg.msg_iov = &iov;
2279         msg.msg_iovlen = 1;
2280
2281         if (sendmsg(sock, &msg, 0) < 0) {
2282                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2283                 if (errno == 3)
2284                         return SEND_CREDS_NOTSK;
2285                 return SEND_CREDS_FAIL;
2286         }
2287
2288         return SEND_CREDS_OK;
2289 }
2290
/*
 * recv_creds - receive an SCM_CREDENTIALS message from @sock.
 *
 * Enables SO_PASSCRED on @sock, writes a single '1' byte to the peer,
 * then waits up to two seconds for a message. @cred is preset to
 * pid/uid/gid of -1 and is only overwritten when the received control
 * message is a well-formed SCM_CREDENTIALS header. *@v is preset to '1'
 * and, on success, set to the first payload byte of the receive buffer.
 *
 * Returns true when a message was received, false on any failure.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
        char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
        char buf[1];
        struct iovec iov = {
                .iov_base = buf,
                .iov_len = sizeof(buf),
        };
        struct msghdr msg = {
                .msg_name = NULL,
                .msg_namelen = 0,
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = cmsgbuf,
                .msg_controllen = sizeof(cmsgbuf),
        };
        struct cmsghdr *cmsg;
        int optval = 1;
        int ret;

        /* defaults reported when nothing valid is received */
        *v = '1';
        cred->pid = -1;
        cred->uid = -1;
        cred->gid = -1;

        if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
                lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
                return false;
        }

        /* tell the peer we are ready for its credentials */
        buf[0] = '1';
        if (write(sock, buf, 1) != 1) {
                lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
                return false;
        }

        if (!wait_for_sock(sock, 2)) {
                lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
                return false;
        }

        ret = recvmsg(sock, &msg, MSG_DONTWAIT);
        if (ret < 0) {
                lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
                return false;
        }

        cmsg = CMSG_FIRSTHDR(&msg);
        if (cmsg != NULL &&
                        cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
                        cmsg->cmsg_level == SOL_SOCKET &&
                        cmsg->cmsg_type == SCM_CREDENTIALS)
                memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

        *v = buf[0];
        return true;
}
2348
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
        /* pipe pair: [0] is closed by the clone child, [1] receives its ack */
        int *cpipe;
        /* socket passed through to the wrapped function */
        int sock;
        /* pid passed through to the wrapped function */
        pid_t tpid;
        /* function run in the clone child: pid_from_ns or pid_to_ns */
        int (*wrapped)(int, pid_t);
};
2355
2356 /*
2357  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2358  * with clone(). This simply writes '1' as ACK back to the parent
2359  * before calling the actual wrapped function.
2360  */
2361 static int pid_ns_clone_wrapper(void *arg) {
2362         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2363         char b = '1';
2364
2365         close(args->cpipe[0]);
2366         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2367                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2368         close(args->cpipe[1]);
2369         return args->wrapped(args->sock, args->tpid);
2370 }
2371
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * The loop ends when recv_creds() fails or when the payload byte is '1'
 * (both return 0). A short write of the translated pid returns 1.
 * @tpid is not used here.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
        struct ucred cred;
        char v = '0';

        for (;;) {
                if (!recv_creds(sock, &cred, &v))
                        break;

                /* '1' is the sender's request to stop */
                if (v == '1')
                        break;

                if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
                        return 1;
        }

        return 0;
}
2390
2391
2392 /*
2393  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2394  * in your old pidns.  Only children which you clone will be in the target
2395  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2396  * actually convert pids.
2397  *
2398  * Note: glibc's fork() does not respect pidns, which can lead to failed
2399  * assertions inside glibc (and thus failed forks) if the child's pid in
2400  * the pidns and the parent pid outside are identical. Using clone prevents
2401  * this issue.
2402  */
2403 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2404 {
2405         int newnsfd = -1, ret, cpipe[2];
2406         char fnam[100];
2407         pid_t cpid;
2408         char v;
2409
2410         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2411         if (ret < 0 || ret >= sizeof(fnam))
2412                 _exit(1);
2413         newnsfd = open(fnam, O_RDONLY);
2414         if (newnsfd < 0)
2415                 _exit(1);
2416         if (setns(newnsfd, 0) < 0)
2417                 _exit(1);
2418         close(newnsfd);
2419
2420         if (pipe(cpipe) < 0)
2421                 _exit(1);
2422
2423         struct pid_ns_clone_args args = {
2424                 .cpipe = cpipe,
2425                 .sock = sock,
2426                 .tpid = tpid,
2427                 .wrapped = &pid_to_ns
2428         };
2429         size_t stack_size = sysconf(_SC_PAGESIZE);
2430         void *stack = alloca(stack_size);
2431
2432         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2433         if (cpid < 0)
2434                 _exit(1);
2435
2436         // give the child 1 second to be done forking and
2437         // write its ack
2438         if (!wait_for_sock(cpipe[0], 1))
2439                 _exit(1);
2440         ret = read(cpipe[0], &v, 1);
2441         if (ret != sizeof(char) || v != '1')
2442                 _exit(1);
2443
2444         if (!wait_for_pid(cpid))
2445                 _exit(1);
2446         _exit(0);
2447 }
2448
/*
 * do_read_pids - read a pid-list file of a cgroup and translate every pid
 * for the reader.
 *
 * The raw file content is fetched with cgfs_get_value(). A forked helper
 * (pid_to_ns_wrapper) is given @tpid and the other end of a socketpair;
 * each pid is sent to it as a ucred and the pid it writes back is appended
 * to *@d with must_strcat_pid().
 *
 * @tpid:   pid handed to the helper to select the target pid namespace
 * @contrl: controller name
 * @cg:     cgroup path
 * @file:   name of the file to read
 * @d:      output string, grown by must_strcat_pid()
 *
 * Returns true when all pids were processed and the helper was asked to
 * exit; false otherwise. *@d may have been partially filled on failure.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
        int sock[2] = {-1, -1};
        char *tmpdata = NULL;
        int ret;
        pid_t qpid, cpid = -1;
        bool answer = false;
        char v = '0';
        struct ucred cred;
        size_t sz = 0, asz = 0;

        if (!cgfs_get_value(contrl, cg, file, &tmpdata))
                return false;

        /*
         * Now we read the pids from returned data one by one, pass
         * them into a child in the target namespace, read back the
         * translated pids, and put them into our to-return data
         */

        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                free(tmpdata);
                return false;
        }

        cpid = fork();
        if (cpid == -1)
                goto out;

        if (!cpid) // child - exits when done
                pid_to_ns_wrapper(sock[1], tpid);

        /* parent: walk the newline-separated pid list */
        char *ptr = tmpdata;
        cred.uid = 0;
        cred.gid = 0;
        while (sscanf(ptr, "%d\n", &qpid) == 1) {
                cred.pid = qpid;
                /* v is still '0' here: a request to translate cred.pid */
                ret = send_creds(sock[0], &cred, v, true);

                /* SEND_CREDS_NOTSK: skip this pid and carry on */
                if (ret == SEND_CREDS_NOTSK)
                        goto next;
                if (ret == SEND_CREDS_FAIL)
                        goto out;

                // read converted results
                if (!wait_for_sock(sock[0], 2)) {
                        lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
                        goto out;
                }
                if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
                        lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
                        goto out;
                }
                must_strcat_pid(d, &sz, &asz, qpid);
next:
                /* advance to the start of the next line, if any */
                ptr = strchr(ptr, '\n');
                if (!ptr)
                        break;
                ptr++;
        }

        /* v == '1' tells the helper (see pid_to_ns) to stop */
        cred.pid = getpid();
        v = '1';
        if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
                // failed to ask child to exit
                lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
                goto out;
        }

        answer = true;

out:
        free(tmpdata);
        /* reap the helper before closing the sockets it may still use */
        if (cpid != -1)
                wait_for_pid(cpid);
        if (sock[0] != -1) {
                close(sock[0]);
                close(sock[1]);
        }
        return answer;
}
2536
2537 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2538                 struct fuse_file_info *fi)
2539 {
2540         struct fuse_context *fc = fuse_get_context();
2541         struct file_info *f = (struct file_info *)fi->fh;
2542         struct cgfs_files *k = NULL;
2543         char *data = NULL;
2544         int ret, s;
2545         bool r;
2546
2547         if (f->type != LXC_TYPE_CGFILE) {
2548                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2549                 return -EIO;
2550         }
2551
2552         if (offset)
2553                 return 0;
2554
2555         if (!fc)
2556                 return -EIO;
2557
2558         if (!f->controller)
2559                 return -EINVAL;
2560
2561         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2562                 return -EINVAL;
2563         }
2564         free_key(k);
2565
2566
2567         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2568                 ret = -EACCES;
2569                 goto out;
2570         }
2571
2572         if (strcmp(f->file, "tasks") == 0 ||
2573                         strcmp(f->file, "/tasks") == 0 ||
2574                         strcmp(f->file, "/cgroup.procs") == 0 ||
2575                         strcmp(f->file, "cgroup.procs") == 0)
2576                 // special case - we have to translate the pids
2577                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2578         else
2579                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2580
2581         if (!r) {
2582                 ret = -EINVAL;
2583                 goto out;
2584         }
2585
2586         if (!data) {
2587                 ret = 0;
2588                 goto out;
2589         }
2590         s = strlen(data);
2591         if (s > size)
2592                 s = size;
2593         memcpy(buf, data, s);
2594         if (s > 0 && s < size && data[s-1] != '\n')
2595                 buf[s++] = '\n';
2596
2597         ret = s;
2598
2599 out:
2600         free(data);
2601         return ret;
2602 }
2603
2604 static int pid_from_ns(int sock, pid_t tpid)
2605 {
2606         pid_t vpid;
2607         struct ucred cred;
2608         char v;
2609         int ret;
2610
2611         cred.uid = 0;
2612         cred.gid = 0;
2613         while (1) {
2614                 if (!wait_for_sock(sock, 2)) {
2615                         lxcfs_error("%s\n", "Timeout reading from parent.");
2616                         return 1;
2617                 }
2618                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2619                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2620                         return 1;
2621                 }
2622                 if (vpid == -1) // done
2623                         break;
2624                 v = '0';
2625                 cred.pid = vpid;
2626                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2627                         v = '1';
2628                         cred.pid = getpid();
2629                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2630                                 return 1;
2631                 }
2632         }
2633         return 0;
2634 }
2635
2636 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2637 {
2638         int newnsfd = -1, ret, cpipe[2];
2639         char fnam[100];
2640         pid_t cpid;
2641         char v;
2642
2643         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2644         if (ret < 0 || ret >= sizeof(fnam))
2645                 _exit(1);
2646         newnsfd = open(fnam, O_RDONLY);
2647         if (newnsfd < 0)
2648                 _exit(1);
2649         if (setns(newnsfd, 0) < 0)
2650                 _exit(1);
2651         close(newnsfd);
2652
2653         if (pipe(cpipe) < 0)
2654                 _exit(1);
2655
2656         struct pid_ns_clone_args args = {
2657                 .cpipe = cpipe,
2658                 .sock = sock,
2659                 .tpid = tpid,
2660                 .wrapped = &pid_from_ns
2661         };
2662         size_t stack_size = sysconf(_SC_PAGESIZE);
2663         void *stack = alloca(stack_size);
2664
2665         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2666         if (cpid < 0)
2667                 _exit(1);
2668
2669         // give the child 1 second to be done forking and
2670         // write its ack
2671         if (!wait_for_sock(cpipe[0], 1))
2672                 _exit(1);
2673         ret = read(cpipe[0], &v, 1);
2674         if (ret != sizeof(char) || v != '1')
2675                 _exit(1);
2676
2677         if (!wait_for_pid(cpid))
2678                 _exit(1);
2679         _exit(0);
2680 }
2681
/*
 * hostuid_to_ns - translate host @uid through /proc/@pid/uid_map.
 *
 * The translated id produced by convert_id_to_ns() is stored in *@answer.
 * Returns true when the map could be opened and the stored value is not
 * -1; false otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
        char map_path[400];
        FILE *map;

        sprintf(map_path, "/proc/%d/uid_map", pid);

        map = fopen(map_path, "r");
        if (map == NULL)
                return false;

        *answer = convert_id_to_ns(map, uid);
        fclose(map);

        return *answer != -1;
}
2703
/*
 * get_pid_creds: read the uid and gid of @pid from /proc/@pid/status.
 *
 * The first number on the "Uid:" line is stored in *@uid and the first
 * number on the "Gid:" line in *@gid. Both outputs are preset to -1 and
 * keep that value if the file cannot be opened or the matching line is
 * missing. Parsing stops at the first malformed Uid:/Gid: line.
 * (XXX should we use euid here?)
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
        char line[400];
        uid_t parsed_uid;
        gid_t parsed_gid;
        FILE *status;

        *uid = -1;
        *gid = -1;

        sprintf(line, "/proc/%d/status", pid);
        status = fopen(line, "r");
        if (status == NULL) {
                lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
                return;
        }

        while (fgets(line, sizeof(line), status)) {
                if (strncmp(line, "Uid:", 4) == 0) {
                        if (sscanf(line+4, "%u", &parsed_uid) != 1) {
                                lxcfs_error("bad uid line for pid %u\n", pid);
                                goto done;
                        }
                        *uid = parsed_uid;
                        continue;
                }

                if (strncmp(line, "Gid:", 4) != 0)
                        continue;

                if (sscanf(line+4, "%u", &parsed_gid) != 1) {
                        lxcfs_error("bad gid line for pid %u\n", pid);
                        goto done;
                }
                *gid = parsed_gid;
        }

done:
        fclose(status);
}
2742
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . they are owned by the same uid
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
        uid_t v_uid, mapped;
        gid_t v_gid;

        /* same task, or requestor is host root */
        if (r == v || r_uid == 0)
                return true;

        get_pid_creds(v, &v_uid, &v_gid);
        if (r_uid == v_uid)
                return true;

        /* requestor must map to uid 0 in its own user namespace ... */
        if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
                return false;

        /* ... and the victim's uid must be mapped there as well */
        return hostuid_to_ns(v_uid, r, &mapped);
}
2768
/*
 * do_write_pids - move the pids listed in @buf into cgroup @cg of @contrl.
 *
 * Each pid parsed from @buf is written to a forked helper
 * (pid_from_ns_wrapper, given @tpid) over a socketpair; the helper answers
 * with a ucred whose pid is then checked with may_move_pid() and written
 * to the file returned by open_pids_file().
 *
 * @tpid: pid of the writer, handed to the helper and to may_move_pid()
 * @tuid: uid of the writer, used for the permission check
 * @file: not used by this function
 *
 * Returns true only if no pid was refused, every write reached the pids
 * file, and closing that file succeeded.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
                const char *file, const char *buf)
{
        int sock[2] = {-1, -1};
        pid_t qpid, cpid = -1;
        FILE *pids_file = NULL;
        bool answer = false, fail = false;

        pids_file = open_pids_file(contrl, cg);
        if (!pids_file)
                return false;

        /*
         * write the pids to a socket, have helper in writer's pidns
         * call movepid for us
         */
        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                goto out;
        }

        cpid = fork();
        if (cpid == -1)
                goto out;

        if (!cpid) { // child
                fclose(pids_file);
                pid_from_ns_wrapper(sock[1], tpid);
        }

        const char *ptr = buf;
        while (sscanf(ptr, "%d", &qpid) == 1) {
                struct ucred cred;
                char v;

                if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
                        lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
                        goto out;
                }

                /*
                 * NOTE(review): a failed recv_creds() or a reply other than
                 * '0' skips the pid without marking the write as failed.
                 */
                if (recv_creds(sock[0], &cred, &v)) {
                        if (v == '0') {
                                if (!may_move_pid(tpid, tuid, cred.pid)) {
                                        fail = true;
                                        break;
                                }
                                /*
                                 * Flush after every pid: the kernel takes one
                                 * pid per write(), so pids must not pile up in
                                 * the stdio buffer and go out concatenated.
                                 */
                                if (fprintf(pids_file, "%d", (int) cred.pid) < 0 ||
                                                fflush(pids_file) != 0)
                                        fail = true;
                        }
                }

                ptr = strchr(ptr, '\n');
                if (!ptr)
                        break;
                ptr++;
        }

        /* -1 tells the helper (see pid_from_ns) that we are done */
        qpid = -1;
        if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
                lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

        if (!fail)
                answer = true;

out:
        if (cpid != -1)
                wait_for_pid(cpid);
        if (sock[0] != -1) {
                close(sock[0]);
                close(sock[1]);
        }
        if (pids_file) {
                if (fclose(pids_file) != 0)
                        answer = false;
        }
        return answer;
}
2847
2848 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2849              struct fuse_file_info *fi)
2850 {
2851         struct fuse_context *fc = fuse_get_context();
2852         char *localbuf = NULL;
2853         struct cgfs_files *k = NULL;
2854         struct file_info *f = (struct file_info *)fi->fh;
2855         bool r;
2856
2857         if (f->type != LXC_TYPE_CGFILE) {
2858                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2859                 return -EIO;
2860         }
2861
2862         if (offset)
2863                 return 0;
2864
2865         if (!fc)
2866                 return -EIO;
2867
2868         localbuf = alloca(size+1);
2869         localbuf[size] = '\0';
2870         memcpy(localbuf, buf, size);
2871
2872         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2873                 size = -EINVAL;
2874                 goto out;
2875         }
2876
2877         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2878                 size = -EACCES;
2879                 goto out;
2880         }
2881
2882         if (strcmp(f->file, "tasks") == 0 ||
2883                         strcmp(f->file, "/tasks") == 0 ||
2884                         strcmp(f->file, "/cgroup.procs") == 0 ||
2885                         strcmp(f->file, "cgroup.procs") == 0)
2886                 // special case - we have to translate the pids
2887                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2888         else
2889                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2890
2891         if (!r)
2892                 size = -EINVAL;
2893
2894 out:
2895         free_key(k);
2896         return size;
2897 }
2898
2899 int cg_chown(const char *path, uid_t uid, gid_t gid)
2900 {
2901         struct fuse_context *fc = fuse_get_context();
2902         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2903         struct cgfs_files *k = NULL;
2904         const char *cgroup;
2905         int ret;
2906
2907         if (!fc)
2908                 return -EIO;
2909
2910         if (strcmp(path, "/cgroup") == 0)
2911                 return -EPERM;
2912
2913         controller = pick_controller_from_path(fc, path);
2914         if (!controller)
2915                 return errno == ENOENT ? -EPERM : -errno;
2916
2917         cgroup = find_cgroup_in_path(path);
2918         if (!cgroup)
2919                 /* this is just /cgroup/controller */
2920                 return -EPERM;
2921
2922         get_cgdir_and_path(cgroup, &cgdir, &last);
2923
2924         if (!last) {
2925                 path1 = "/";
2926                 path2 = cgdir;
2927         } else {
2928                 path1 = cgdir;
2929                 path2 = last;
2930         }
2931
2932         if (is_child_cgroup(controller, path1, path2)) {
2933                 // get uid, gid, from '/tasks' file and make up a mode
2934                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2935                 k = cgfs_get_key(controller, cgroup, "tasks");
2936
2937         } else
2938                 k = cgfs_get_key(controller, path1, path2);
2939
2940         if (!k) {
2941                 ret = -EINVAL;
2942                 goto out;
2943         }
2944
2945         /*
2946          * This being a fuse request, the uid and gid must be valid
2947          * in the caller's namespace.  So we can just check to make
2948          * sure that the caller is root in his uid, and privileged
2949          * over the file's current owner.
2950          */
2951         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2952                 ret = -EACCES;
2953                 goto out;
2954         }
2955
2956         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2957
2958 out:
2959         free_key(k);
2960         free(cgdir);
2961
2962         return ret;
2963 }
2964
2965 int cg_chmod(const char *path, mode_t mode)
2966 {
2967         struct fuse_context *fc = fuse_get_context();
2968         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2969         struct cgfs_files *k = NULL;
2970         const char *cgroup;
2971         int ret;
2972
2973         if (!fc)
2974                 return -EIO;
2975
2976         if (strcmp(path, "/cgroup") == 0)
2977                 return -EPERM;
2978
2979         controller = pick_controller_from_path(fc, path);
2980         if (!controller)
2981                 return errno == ENOENT ? -EPERM : -errno;
2982
2983         cgroup = find_cgroup_in_path(path);
2984         if (!cgroup)
2985                 /* this is just /cgroup/controller */
2986                 return -EPERM;
2987
2988         get_cgdir_and_path(cgroup, &cgdir, &last);
2989
2990         if (!last) {
2991                 path1 = "/";
2992                 path2 = cgdir;
2993         } else {
2994                 path1 = cgdir;
2995                 path2 = last;
2996         }
2997
2998         if (is_child_cgroup(controller, path1, path2)) {
2999                 // get uid, gid, from '/tasks' file and make up a mode
3000                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3001                 k = cgfs_get_key(controller, cgroup, "tasks");
3002
3003         } else
3004                 k = cgfs_get_key(controller, path1, path2);
3005
3006         if (!k) {
3007                 ret = -EINVAL;
3008                 goto out;
3009         }
3010
3011         /*
3012          * This being a fuse request, the uid and gid must be valid
3013          * in the caller's namespace.  So we can just check to make
3014          * sure that the caller is root in his uid, and privileged
3015          * over the file's current owner.
3016          */
3017         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3018                 ret = -EPERM;
3019                 goto out;
3020         }
3021
3022         if (!cgfs_chmod_file(controller, cgroup, mode)) {
3023                 ret = -EINVAL;
3024                 goto out;
3025         }
3026
3027         ret = 0;
3028 out:
3029         free_key(k);
3030         free(cgdir);
3031         return ret;
3032 }
3033
3034 int cg_mkdir(const char *path, mode_t mode)
3035 {
3036         struct fuse_context *fc = fuse_get_context();
3037         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3038         const char *cgroup;
3039         int ret;
3040
3041         if (!fc)
3042                 return -EIO;
3043
3044         controller = pick_controller_from_path(fc, path);
3045         if (!controller)
3046                 return errno == ENOENT ? -EPERM : -errno;
3047
3048         cgroup = find_cgroup_in_path(path);
3049         if (!cgroup)
3050                 return -errno;
3051
3052         get_cgdir_and_path(cgroup, &cgdir, &last);
3053         if (!last)
3054                 path1 = "/";
3055         else
3056                 path1 = cgdir;
3057
3058         pid_t initpid = lookup_initpid_in_store(fc->pid);
3059         if (initpid <= 0)
3060                 initpid = fc->pid;
3061         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3062                 if (!next)
3063                         ret = -EINVAL;
3064                 else if (last && strcmp(next, last) == 0)
3065                         ret = -EEXIST;
3066                 else
3067                         ret = -EPERM;
3068                 goto out;
3069         }
3070
3071         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3072                 ret = -EACCES;
3073                 goto out;
3074         }
3075         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3076                 ret = -EACCES;
3077                 goto out;
3078         }
3079
3080         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3081
3082 out:
3083         free(cgdir);
3084         free(next);
3085         return ret;
3086 }
3087
3088 int cg_rmdir(const char *path)
3089 {
3090         struct fuse_context *fc = fuse_get_context();
3091         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3092         const char *cgroup;
3093         int ret;
3094
3095         if (!fc)
3096                 return -EIO;
3097
3098         controller = pick_controller_from_path(fc, path);
3099         if (!controller) /* Someone's trying to delete "/cgroup". */
3100                 return -EPERM;
3101
3102         cgroup = find_cgroup_in_path(path);
3103         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3104                 return -EPERM;
3105
3106         get_cgdir_and_path(cgroup, &cgdir, &last);
3107         if (!last) {
3108                 /* Someone's trying to delete a cgroup on the same level as the
3109                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3110                  * rmdir "/cgroup/blkio/init.slice".
3111                  */
3112                 ret = -EPERM;
3113                 goto out;
3114         }
3115
3116         pid_t initpid = lookup_initpid_in_store(fc->pid);
3117         if (initpid <= 0)
3118                 initpid = fc->pid;
3119         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3120                 if (!last || (next && (strcmp(next, last) == 0)))
3121                         ret = -EBUSY;
3122                 else
3123                         ret = -ENOENT;
3124                 goto out;
3125         }
3126
3127         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3128                 ret = -EACCES;
3129                 goto out;
3130         }
3131         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3132                 ret = -EACCES;
3133                 goto out;
3134         }
3135
3136         if (!cgfs_remove(controller, cgroup)) {
3137                 ret = -EINVAL;
3138                 goto out;
3139         }
3140
3141         ret = 0;
3142
3143 out:
3144         free(cgdir);
3145         free(next);
3146         return ret;
3147 }
3148
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
        return strncmp(line, pref, strlen(pref)) == 0;
}
3155
/*
 * Walk the newline-separated text in @memstat and pick out the
 * "total_*" counters listed below.  Every value found is stored through
 * the matching output pointer after being divided by 1024.  Outputs for
 * keys that do not occur are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable)
{
        const struct {
                const char *key;
                unsigned long *dest;
        } fields[] = {
                { "total_cache",         cached },
                { "total_active_anon",   active_anon },
                { "total_inactive_anon", inactive_anon },
                { "total_active_file",   active_file },
                { "total_inactive_file", inactive_file },
                { "total_unevictable",   unevictable },
        };
        const size_t nfields = sizeof(fields) / sizeof(fields[0]);
        char *cur = memstat;
        char *nl;
        size_t i;

        while (*cur) {
                /* At most one key can match a given line. */
                for (i = 0; i < nfields; i++) {
                        if (!startswith(cur, fields[i].key))
                                continue;
                        sscanf(cur + strlen(fields[i].key), "%lu", fields[i].dest);
                        *fields[i].dest /= 1024;
                        break;
                }

                nl = strchr(cur, '\n');
                if (!nl)
                        return;
                cur = nl + 1;
        }
}
3189
/*
 * Search the newline-separated text @str for the first line starting
 * with "<major>:<minor> <iotype>" and store the number that follows it
 * in @v.  @v is set to 0 when no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32] = { 0 };
        char *cur, *nl;
        size_t keylen;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        keylen = strlen(key);

        *v = 0;

        for (cur = str; *cur; cur = nl + 1) {
                if (startswith(cur, key)) {
                        sscanf(cur + keylen, "%lu", v);
                        return;
                }
                nl = strchr(cur, '\n');
                if (!nl)
                        return;
        }
}
3212
3213 static int read_file(const char *path, char *buf, size_t size,
3214                      struct file_info *d)
3215 {
3216         size_t linelen = 0, total_len = 0, rv = 0;
3217         char *line = NULL;
3218         char *cache = d->buf;
3219         size_t cache_size = d->buflen;
3220         FILE *f = fopen(path, "r");
3221         if (!f)
3222                 return 0;
3223
3224         while (getline(&line, &linelen, f) != -1) {
3225                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3226                 if (l < 0) {
3227                         perror("Error writing to cache");
3228                         rv = 0;
3229                         goto err;
3230                 }
3231                 if (l >= cache_size) {
3232                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3233                         rv = 0;
3234                         goto err;
3235                 }
3236                 cache += l;
3237                 cache_size -= l;
3238                 total_len += l;
3239         }
3240
3241         d->size = total_len;
3242         if (total_len > size)
3243                 total_len = size;
3244
3245         /* read from off 0 */
3246         memcpy(buf, d->buf, total_len);
3247         rv = total_len;
3248   err:
3249         fclose(f);
3250         free(line);
3251         return rv;
3252 }
3253
3254 /*
3255  * FUSE ops for /proc
3256  */
3257
/*
 * Read @file of @cgroup in the "memory" controller and return its
 * content parsed as a base-10 unsigned long.  If the value cannot be
 * read, (unsigned long)-1 is returned.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
        char *value = NULL;
        unsigned long limit;

        if (cgfs_get_value("memory", cgroup, file, &value))
                limit = strtoul(value, NULL, 10);
        else
                limit = -1;

        free(value);

        return limit;
}
3270
/*
 * Return the smallest value of @file found in @cgroup or any of its
 * ancestors, walking up the path with dirname() until the root is
 * reached.  Ancestors whose value cannot be read (get_memlimit()
 * returning (unsigned long)-1) are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
        char *copy = strdupa(cgroup);
        unsigned long memlimit = 0, retlimit;

        retlimit = get_memlimit(copy, file);

        /*
         * dirname() converges on "/" for absolute paths but on "." for
         * relative or empty ones; stop on either so a path without a
         * leading slash cannot spin forever.
         */
        while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
                copy = dirname(copy);
                memlimit = get_memlimit(copy, file);
                if (memlimit != (unsigned long)-1 && memlimit < retlimit)
                        retlimit = memlimit;
        }

        return retlimit;
}
3287
3288 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3289                 struct fuse_file_info *fi)
3290 {
3291         struct fuse_context *fc = fuse_get_context();
3292         struct file_info *d = (struct file_info *)fi->fh;
3293         char *cg;
3294         char *memusage_str = NULL, *memstat_str = NULL,
3295                 *memswlimit_str = NULL, *memswusage_str = NULL;
3296         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3297                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3298                 active_file = 0, inactive_file = 0, unevictable = 0,
3299                 hostswtotal = 0;
3300         char *line = NULL;
3301         size_t linelen = 0, total_len = 0, rv = 0;
3302         char *cache = d->buf;
3303         size_t cache_size = d->buflen;
3304         FILE *f = NULL;
3305
3306         if (offset){
3307                 if (offset > d->size)
3308                         return -EINVAL;
3309                 if (!d->cached)
3310                         return 0;
3311                 int left = d->size - offset;
3312                 total_len = left > size ? size: left;
3313                 memcpy(buf, cache + offset, total_len);
3314                 return total_len;
3315         }
3316
3317         pid_t initpid = lookup_initpid_in_store(fc->pid);
3318         if (initpid <= 0)
3319                 initpid = fc->pid;
3320         cg = get_pid_cgroup(initpid, "memory");
3321         if (!cg)
3322                 return read_file("/proc/meminfo", buf, size, d);
3323         prune_init_slice(cg);
3324
3325         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3326         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3327                 goto err;
3328         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3329                 goto err;
3330
3331         // Following values are allowed to fail, because swapaccount might be turned
3332         // off for current kernel
3333         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3334                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3335         {
3336                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3337                 memswusage = strtoul(memswusage_str, NULL, 10);
3338
3339                 memswlimit = memswlimit / 1024;
3340                 memswusage = memswusage / 1024;
3341         }
3342
3343         memusage = strtoul(memusage_str, NULL, 10);
3344         memlimit /= 1024;
3345         memusage /= 1024;
3346
3347         parse_memstat(memstat_str, &cached, &active_anon,
3348                         &inactive_anon, &active_file, &inactive_file,
3349                         &unevictable);
3350
3351         f = fopen("/proc/meminfo", "r");
3352         if (!f)
3353                 goto err;
3354
3355         while (getline(&line, &linelen, f) != -1) {
3356                 ssize_t l;
3357                 char *printme, lbuf[100];
3358
3359                 memset(lbuf, 0, 100);
3360                 if (startswith(line, "MemTotal:")) {
3361                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3362                         if (hosttotal < memlimit)
3363                                 memlimit = hosttotal;
3364                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3365                         printme = lbuf;
3366                 } else if (startswith(line, "MemFree:")) {
3367                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3368                         printme = lbuf;
3369                 } else if (startswith(line, "MemAvailable:")) {
3370                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3371                         printme = lbuf;
3372                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3373                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3374                         if (hostswtotal < memswlimit)
3375                                 memswlimit = hostswtotal;
3376                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3377                         printme = lbuf;
3378                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3379                         unsigned long swaptotal = memswlimit,
3380                                         swapusage = memswusage - memusage,
3381                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3382                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3383                         printme = lbuf;
3384                 } else if (startswith(line, "Slab:")) {
3385                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3386                         printme = lbuf;
3387                 } else if (startswith(line, "Buffers:")) {
3388                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3389                         printme = lbuf;
3390                 } else if (startswith(line, "Cached:")) {
3391                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3392                         printme = lbuf;
3393                 } else if (startswith(line, "SwapCached:")) {
3394                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3395                         printme = lbuf;
3396                 } else if (startswith(line, "Active:")) {
3397                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3398                                         active_anon + active_file);
3399                         printme = lbuf;
3400                 } else if (startswith(line, "Inactive:")) {
3401                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3402                                         inactive_anon + inactive_file);
3403                         printme = lbuf;
3404                 } else if (startswith(line, "Active(anon)")) {
3405                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3406                         printme = lbuf;
3407                 } else if (startswith(line, "Inactive(anon)")) {
3408                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3409                         printme = lbuf;
3410                 } else if (startswith(line, "Active(file)")) {
3411                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3412                         printme = lbuf;
3413                 } else if (startswith(line, "Inactive(file)")) {
3414                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3415                         printme = lbuf;
3416                 } else if (startswith(line, "Unevictable")) {
3417                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3418                         printme = lbuf;
3419                 } else if (startswith(line, "SReclaimable")) {
3420                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3421                         printme = lbuf;
3422                 } else if (startswith(line, "SUnreclaim")) {
3423                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3424                         printme = lbuf;
3425                 } else
3426                         printme = line;
3427
3428                 l = snprintf(cache, cache_size, "%s", printme);
3429                 if (l < 0) {
3430                         perror("Error writing to cache");
3431                         rv = 0;
3432                         goto err;
3433
3434                 }
3435                 if (l >= cache_size) {
3436                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3437                         rv = 0;
3438                         goto err;
3439                 }
3440
3441                 cache += l;
3442                 cache_size -= l;
3443                 total_len += l;
3444         }
3445
3446         d->cached = 1;
3447         d->size = total_len;
3448         if (total_len > size ) total_len = size;
3449         memcpy(buf, d->buf, total_len);
3450
3451         rv = total_len;
3452 err:
3453         if (f)
3454                 fclose(f);
3455         free(line);
3456         free(cg);
3457         free(memusage_str);
3458         free(memswlimit_str);
3459         free(memswusage_str);
3460         free(memstat_str);
3461         return rv;
3462 }
3463
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg from the cpuset controller.
 * On success the value is returned as a newly allocated string which the
 * caller must free; NULL is returned if it cannot be read.
 */
static char *get_cpuset(const char *cg)
{
        char *cpus;

        return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3476
3477 bool cpu_in_cpuset(int cpu, const char *cpuset);
3478
/*
 * Return true if @line is a "processor : N" line of /proc/cpuinfo and
 * cpu N is part of @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int id;

        return sscanf(line, "processor       : %d", &id) == 1 &&
               cpu_in_cpuset(id, cpuset);
}
3487
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
        int ignored;

        return sscanf(line, "processor       : %d", &ignored) == 1;
}
3499
3500 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3501                 struct fuse_file_info *fi)
3502 {
3503         struct fuse_context *fc = fuse_get_context();
3504         struct file_info *d = (struct file_info *)fi->fh;
3505         char *cg;
3506         char *cpuset = NULL;
3507         char *line = NULL;
3508         size_t linelen = 0, total_len = 0, rv = 0;
3509         bool am_printing = false, firstline = true, is_s390x = false;
3510         int curcpu = -1, cpu;
3511         char *cache = d->buf;
3512         size_t cache_size = d->buflen;
3513         FILE *f = NULL;
3514
3515         if (offset){
3516                 if (offset > d->size)
3517                         return -EINVAL;
3518                 if (!d->cached)
3519                         return 0;
3520                 int left = d->size - offset;
3521                 total_len = left > size ? size: left;
3522                 memcpy(buf, cache + offset, total_len);
3523                 return total_len;
3524         }
3525
3526         pid_t initpid = lookup_initpid_in_store(fc->pid);
3527         if (initpid <= 0)
3528                 initpid = fc->pid;
3529         cg = get_pid_cgroup(initpid, "cpuset");
3530         if (!cg)
3531                 return read_file("proc/cpuinfo", buf, size, d);
3532         prune_init_slice(cg);
3533
3534         cpuset = get_cpuset(cg);
3535         if (!cpuset)
3536                 goto err;
3537
3538         f = fopen("/proc/cpuinfo", "r");
3539         if (!f)
3540                 goto err;
3541
3542         while (getline(&line, &linelen, f) != -1) {
3543                 ssize_t l;
3544                 if (firstline) {
3545                         firstline = false;
3546                         if (strstr(line, "IBM/S390") != NULL) {
3547                                 is_s390x = true;
3548                                 am_printing = true;
3549                                 continue;
3550                         }
3551                 }
3552                 if (strncmp(line, "# processors:", 12) == 0)
3553                         continue;
3554                 if (is_processor_line(line)) {
3555                         am_printing = cpuline_in_cpuset(line, cpuset);
3556                         if (am_printing) {
3557                                 curcpu ++;
3558                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3559                                 if (l < 0) {
3560                                         perror("Error writing to cache");
3561                                         rv = 0;
3562                                         goto err;
3563                                 }
3564                                 if (l >= cache_size) {
3565                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3566                                         rv = 0;
3567                                         goto err;
3568                                 }
3569                                 cache += l;
3570                                 cache_size -= l;
3571                                 total_len += l;
3572                         }
3573                         continue;
3574                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3575                         char *p;
3576                         if (!cpu_in_cpuset(cpu, cpuset))
3577                                 continue;
3578                         curcpu ++;
3579                         p = strchr(line, ':');
3580                         if (!p || !*p)
3581                                 goto err;
3582                         p++;
3583                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3584                         if (l < 0) {
3585                                 perror("Error writing to cache");
3586                                 rv = 0;
3587                                 goto err;
3588                         }
3589                         if (l >= cache_size) {
3590                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3591                                 rv = 0;
3592                                 goto err;
3593                         }
3594                         cache += l;
3595                         cache_size -= l;
3596                         total_len += l;
3597                         continue;
3598
3599                 }
3600                 if (am_printing) {
3601                         l = snprintf(cache, cache_size, "%s", line);
3602                         if (l < 0) {
3603                                 perror("Error writing to cache");
3604                                 rv = 0;
3605                                 goto err;
3606                         }
3607                         if (l >= cache_size) {
3608                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3609                                 rv = 0;
3610                                 goto err;
3611                         }
3612                         cache += l;
3613                         cache_size -= l;
3614                         total_len += l;
3615                 }
3616         }
3617
3618         if (is_s390x) {
3619                 char *origcache = d->buf;
3620                 ssize_t l;
3621                 do {
3622                         d->buf = malloc(d->buflen);
3623                 } while (!d->buf);
3624                 cache = d->buf;
3625                 cache_size = d->buflen;
3626                 total_len = 0;
3627                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3628                 if (l < 0 || l >= cache_size) {
3629                         free(origcache);
3630                         goto err;
3631                 }
3632                 cache_size -= l;
3633                 cache += l;
3634                 total_len += l;
3635                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3636                 if (l < 0 || l >= cache_size) {
3637                         free(origcache);
3638                         goto err;
3639                 }
3640                 cache_size -= l;
3641                 cache += l;
3642                 total_len += l;
3643                 l = snprintf(cache, cache_size, "%s", origcache);
3644                 free(origcache);
3645                 if (l < 0 || l >= cache_size)
3646                         goto err;
3647                 total_len += l;
3648         }
3649
3650         d->cached = 1;
3651         d->size = total_len;
3652         if (total_len > size ) total_len = size;
3653
3654         /* read from off 0 */
3655         memcpy(buf, d->buf, total_len);
3656         rv = total_len;
3657 err:
3658         if (f)
3659                 fclose(f);
3660         free(line);
3661         free(cpuset);
3662         free(cg);
3663         return rv;
3664 }
3665
3666 static uint64_t get_reaper_start_time(pid_t pid)
3667 {
3668         int ret;
3669         FILE *f;
3670         uint64_t starttime;
3671         /* strlen("/proc/") = 6
3672          * +
3673          * LXCFS_NUMSTRLEN64
3674          * +
3675          * strlen("/stat") = 5
3676          * +
3677          * \0 = 1
3678          * */
3679 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3680         char path[__PROC_PID_STAT_LEN];
3681         pid_t qpid;
3682
3683         qpid = lookup_initpid_in_store(pid);
3684         if (qpid <= 0) {
3685                 /* Caller can check for EINVAL on 0. */
3686                 errno = EINVAL;
3687                 return 0;
3688         }
3689
3690         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3691         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3692                 /* Caller can check for EINVAL on 0. */
3693                 errno = EINVAL;
3694                 return 0;
3695         }
3696
3697         f = fopen(path, "r");
3698         if (!f) {
3699                 /* Caller can check for EINVAL on 0. */
3700                 errno = EINVAL;
3701                 return 0;
3702         }
3703
3704         /* Note that the *scanf() argument supression requires that length
3705          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3706          * at us. It's like telling someone you're not married and then asking
3707          * if you can bring your wife to the party.
3708          */
3709         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3710                         "%*s "      /* (2)  comm        %s   */
3711                         "%*c "      /* (3)  state       %c   */
3712                         "%*d "      /* (4)  ppid        %d   */
3713                         "%*d "      /* (5)  pgrp        %d   */
3714                         "%*d "      /* (6)  session     %d   */
3715                         "%*d "      /* (7)  tty_nr      %d   */
3716                         "%*d "      /* (8)  tpgid       %d   */
3717                         "%*u "      /* (9)  flags       %u   */
3718                         "%*u "      /* (10) minflt      %lu  */
3719                         "%*u "      /* (11) cminflt     %lu  */
3720                         "%*u "      /* (12) majflt      %lu  */
3721                         "%*u "      /* (13) cmajflt     %lu  */
3722                         "%*u "      /* (14) utime       %lu  */
3723                         "%*u "      /* (15) stime       %lu  */
3724                         "%*d "      /* (16) cutime      %ld  */
3725                         "%*d "      /* (17) cstime      %ld  */
3726                         "%*d "      /* (18) priority    %ld  */
3727                         "%*d "      /* (19) nice        %ld  */
3728                         "%*d "      /* (20) num_threads %ld  */
3729                         "%*d "      /* (21) itrealvalue %ld  */
3730                         "%" PRIu64, /* (22) starttime   %llu */
3731                      &starttime);
3732         if (ret != 1) {
3733                 fclose(f);
3734                 /* Caller can check for EINVAL on 0. */
3735                 errno = EINVAL;
3736                 return 0;
3737         }
3738
3739         fclose(f);
3740
3741         errno = 0;
3742         return starttime;
3743 }
3744
/*
 * get_reaper_start_time_in_sec - reaper start time converted to seconds
 *
 * Takes the tick count reported by get_reaper_start_time() for @pid and
 * divides it by sysconf(_SC_CLK_TCK).
 *
 * Returns the start time in seconds, or 0 when either the start time or the
 * tick rate cannot be determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/* Reject every non-positive result, not only -1/EINVAL: a zero divisor
	 * would trap and a negative one would be converted to a huge unsigned
	 * value by the division below.
	 */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3766
3767 static uint64_t get_reaper_age(pid_t pid)
3768 {
3769         uint64_t procstart, uptime, procage;
3770
3771         /* We need to substract the time the process has started since system
3772          * boot minus the time when the system has started to get the actual
3773          * reaper age.
3774          */
3775         procstart = get_reaper_start_time_in_sec(pid);
3776         procage = procstart;
3777         if (procstart > 0) {
3778                 int ret;
3779                 struct timespec spec;
3780
3781                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3782                 if (ret < 0)
3783                         return 0;
3784                 /* We could make this more precise here by using the tv_nsec
3785                  * field in the timespec struct and convert it to milliseconds
3786                  * and then create a double for the seconds and milliseconds but
3787                  * that seems more work than it is worth.
3788                  */
3789                 uptime = spec.tv_sec;
3790                 procage = uptime - procstart;
3791         }
3792
3793         return procage;
3794 }
3795
3796 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3797 static int proc_stat_read(char *buf, size_t size, off_t offset,
3798                 struct fuse_file_info *fi)
3799 {
3800         struct fuse_context *fc = fuse_get_context();
3801         struct file_info *d = (struct file_info *)fi->fh;
3802         char *cg;
3803         char *cpuset = NULL;
3804         char *line = NULL;
3805         size_t linelen = 0, total_len = 0, rv = 0;
3806         int curcpu = -1; /* cpu numbering starts at 0 */
3807         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3808         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3809                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3810         char cpuall[CPUALL_MAX_SIZE];
3811         /* reserve for cpu all */
3812         char *cache = d->buf + CPUALL_MAX_SIZE;
3813         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3814         FILE *f = NULL;
3815
3816         if (offset){
3817                 if (offset > d->size)
3818                         return -EINVAL;
3819                 if (!d->cached)
3820                         return 0;
3821                 int left = d->size - offset;
3822                 total_len = left > size ? size: left;
3823                 memcpy(buf, d->buf + offset, total_len);
3824                 return total_len;
3825         }
3826
3827         pid_t initpid = lookup_initpid_in_store(fc->pid);
3828         if (initpid <= 0)
3829                 initpid = fc->pid;
3830         cg = get_pid_cgroup(initpid, "cpuset");
3831         if (!cg)
3832                 return read_file("/proc/stat", buf, size, d);
3833         prune_init_slice(cg);
3834
3835         cpuset = get_cpuset(cg);
3836         if (!cpuset)
3837                 goto err;
3838
3839         f = fopen("/proc/stat", "r");
3840         if (!f)
3841                 goto err;
3842
3843         //skip first line
3844         if (getline(&line, &linelen, f) < 0) {
3845                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3846                 goto err;
3847         }
3848
3849         while (getline(&line, &linelen, f) != -1) {
3850                 ssize_t l;
3851                 int cpu;
3852                 char cpu_char[10]; /* That's a lot of cores */
3853                 char *c;
3854
3855                 if (strlen(line) == 0)
3856                         continue;
3857                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3858                         /* not a ^cpuN line containing a number N, just print it */
3859                         l = snprintf(cache, cache_size, "%s", line);
3860                         if (l < 0) {
3861                                 perror("Error writing to cache");
3862                                 rv = 0;
3863                                 goto err;
3864                         }
3865                         if (l >= cache_size) {
3866                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3867                                 rv = 0;
3868                                 goto err;
3869                         }
3870                         cache += l;
3871                         cache_size -= l;
3872                         total_len += l;
3873                         continue;
3874                 }
3875
3876                 if (sscanf(cpu_char, "%d", &cpu) != 1)
3877                         continue;
3878                 if (!cpu_in_cpuset(cpu, cpuset))
3879                         continue;
3880                 curcpu ++;
3881
3882                 c = strchr(line, ' ');
3883                 if (!c)
3884                         continue;
3885                 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3886                 if (l < 0) {
3887                         perror("Error writing to cache");
3888                         rv = 0;
3889                         goto err;
3890
3891                 }
3892                 if (l >= cache_size) {
3893                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3894                         rv = 0;
3895                         goto err;
3896                 }
3897
3898                 cache += l;
3899                 cache_size -= l;
3900                 total_len += l;
3901
3902                 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3903                            &user,
3904                            &nice,
3905                            &system,
3906                            &idle,
3907                            &iowait,
3908                            &irq,
3909                            &softirq,
3910                            &steal,
3911                            &guest,
3912                            &guest_nice) != 10)
3913                         continue;
3914                 user_sum += user;
3915                 nice_sum += nice;
3916                 system_sum += system;
3917                 idle_sum += idle;
3918                 iowait_sum += iowait;
3919                 irq_sum += irq;
3920                 softirq_sum += softirq;
3921                 steal_sum += steal;
3922                 guest_sum += guest;
3923                 guest_nice_sum += guest_nice;
3924         }
3925
3926         cache = d->buf;
3927
3928         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3929                         user_sum,
3930                         nice_sum,
3931                         system_sum,
3932                         idle_sum,
3933                         iowait_sum,
3934                         irq_sum,
3935                         softirq_sum,
3936                         steal_sum,
3937                         guest_sum,
3938                         guest_nice_sum);
3939         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3940                 memcpy(cache, cpuall, cpuall_len);
3941                 cache += cpuall_len;
3942         } else {
3943                 /* shouldn't happen */
3944                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3945                 cpuall_len = 0;
3946         }
3947
3948         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3949         total_len += cpuall_len;
3950         d->cached = 1;
3951         d->size = total_len;
3952         if (total_len > size)
3953                 total_len = size;
3954
3955         memcpy(buf, d->buf, total_len);
3956         rv = total_len;
3957
3958 err:
3959         if (f)
3960                 fclose(f);
3961         free(line);
3962         free(cpuset);
3963         free(cg);
3964         return rv;
3965 }
3966
/* Retrieve the busy time of a group of tasks from cpuacct.usage.
 *
 * Unfortunately, this only makes sense when the container has been given its
 * own cpuacct cgroup. If not, the busy time of all other tasks that do not
 * actually belong to the container is taken into account as well. If someone
 * has a clever solution for this please send a patch!
 *
 * The value read from cpuacct.usage is divided by 1000000000 before it is
 * returned. 0 is returned when the init pid, the cgroup or the usage value
 * cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	unsigned long busy = 0;
	char *cg_path, *raw_usage = NULL;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cg_path = get_pid_cgroup(reaper, "cpuacct");
	if (cg_path) {
		prune_init_slice(cg_path);
		if (cgfs_get_value("cpuacct", cg_path, "cpuacct.usage", &raw_usage))
			busy = strtoul(raw_usage, NULL, 10) / 1000000000;
	}

	free(cg_path);
	free(raw_usage);
	return busy;
}
3997
#if RELOADTEST
/* Leave an (empty) marker file behind so that a reload test can tell this
 * code path has been executed. Failure to create the file is ignored.
 */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;

	close(marker);
}
#endif
4008
4009 /*
4010  * We read /proc/uptime and reuse its second field.
4011  * For the first field, we use the mtime for the reaper for
4012  * the calling pid as returned by getreaperage
4013  */
4014 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4015                 struct fuse_file_info *fi)
4016 {
4017         struct fuse_context *fc = fuse_get_context();
4018         struct file_info *d = (struct file_info *)fi->fh;
4019         unsigned long int busytime = get_reaper_busy(fc->pid);
4020         char *cache = d->buf;
4021         ssize_t total_len = 0;
4022         uint64_t idletime, reaperage;
4023
4024 #if RELOADTEST
4025         iwashere();
4026 #endif
4027
4028         if (offset){
4029                 if (!d->cached)
4030                         return 0;
4031                 if (offset > d->size)
4032                         return -EINVAL;
4033                 int left = d->size - offset;
4034                 total_len = left > size ? size: left;
4035                 memcpy(buf, cache + offset, total_len);
4036                 return total_len;
4037         }
4038
4039         reaperage = get_reaper_age(fc->pid);
4040         /* To understand why this is done, please read the comment to the
4041          * get_reaper_busy() function.
4042          */
4043         idletime = reaperage;
4044         if (reaperage >= busytime)
4045                 idletime = reaperage - busytime;
4046
4047         total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4048         if (total_len < 0 || total_len >=  d->buflen){
4049                 lxcfs_error("%s\n", "failed to write to cache");
4050                 return 0;
4051         }
4052
4053         d->size = (int)total_len;
4054         d->cached = 1;
4055
4056         if (total_len > size) total_len = size;
4057
4058         memcpy(buf, d->buf, total_len);
4059         return total_len;
4060 }
4061
4062 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4063                 struct fuse_file_info *fi)
4064 {
4065         char dev_name[72];
4066         struct fuse_context *fc = fuse_get_context();
4067         struct file_info *d = (struct file_info *)fi->fh;
4068         char *cg;
4069         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4070                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
4071         unsigned long read = 0, write = 0;
4072         unsigned long read_merged = 0, write_merged = 0;
4073         unsigned long read_sectors = 0, write_sectors = 0;
4074         unsigned long read_ticks = 0, write_ticks = 0;
4075         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4076         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4077         char *cache = d->buf;
4078         size_t cache_size = d->buflen;
4079         char *line = NULL;
4080         size_t linelen = 0, total_len = 0, rv = 0;
4081         unsigned int major = 0, minor = 0;
4082         int i = 0;
4083         FILE *f = NULL;
4084
4085         if (offset){
4086                 if (offset > d->size)
4087                         return -EINVAL;
4088                 if (!d->cached)
4089                         return 0;
4090                 int left = d->size - offset;
4091                 total_len = left > size ? size: left;
4092                 memcpy(buf, cache + offset, total_len);
4093                 return total_len;
4094         }
4095
4096         pid_t initpid = lookup_initpid_in_store(fc->pid);
4097         if (initpid <= 0)
4098                 initpid = fc->pid;
4099         cg = get_pid_cgroup(initpid, "blkio");
4100         if (!cg)
4101                 return read_file("/proc/diskstats", buf, size, d);
4102         prune_init_slice(cg);
4103
4104         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4105                 goto err;
4106         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4107                 goto err;
4108         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4109                 goto err;
4110         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4111                 goto err;
4112         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4113                 goto err;
4114
4115
4116         f = fopen("/proc/diskstats", "r");
4117         if (!f)
4118                 goto err;
4119
4120         while (getline(&line, &linelen, f) != -1) {
4121                 ssize_t l;
4122                 char lbuf[256];
4123
4124                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4125                 if (i != 3)
4126                         continue;
4127
4128                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4129                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4130                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4131                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4132                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4133                 read_sectors = read_sectors/512;
4134                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4135                 write_sectors = write_sectors/512;
4136
4137                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4138                 rd_svctm = rd_svctm/1000000;
4139                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4140                 rd_wait = rd_wait/1000000;
4141                 read_ticks = rd_svctm + rd_wait;
4142
4143                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4144                 wr_svctm =  wr_svctm/1000000;
4145                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4146                 wr_wait =  wr_wait/1000000;
4147                 write_ticks = wr_svctm + wr_wait;
4148
4149                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4150                 tot_ticks =  tot_ticks/1000000;
4151
4152                 memset(lbuf, 0, 256);
4153                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4154                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4155                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4156                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4157                 else
4158                         continue;
4159
4160                 l = snprintf(cache, cache_size, "%s", lbuf);
4161                 if (l < 0) {
4162                         perror("Error writing to fuse buf");
4163                         rv = 0;
4164                         goto err;
4165                 }
4166                 if (l >= cache_size) {
4167                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4168                         rv = 0;
4169                         goto err;
4170                 }
4171                 cache += l;
4172                 cache_size -= l;
4173                 total_len += l;
4174         }
4175
4176         d->cached = 1;
4177         d->size = total_len;
4178         if (total_len > size ) total_len = size;
4179         memcpy(buf, d->buf, total_len);
4180
4181         rv = total_len;
4182 err:
4183         free(cg);
4184         if (f)
4185                 fclose(f);
4186         free(line);
4187         free(io_serviced_str);
4188         free(io_merged_str);
4189         free(io_service_bytes_str);
4190         free(io_wait_time_str);
4191         free(io_service_time_str);
4192         return rv;
4193 }
4194
4195 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4196                 struct fuse_file_info *fi)
4197 {
4198         struct fuse_context *fc = fuse_get_context();
4199         struct file_info *d = (struct file_info *)fi->fh;
4200         char *cg = NULL;
4201         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4202         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4203         ssize_t total_len = 0, rv = 0;
4204         ssize_t l = 0;
4205         char *cache = d->buf;
4206
4207         if (offset) {
4208                 if (offset > d->size)
4209                         return -EINVAL;
4210                 if (!d->cached)
4211                         return 0;
4212                 int left = d->size - offset;
4213                 total_len = left > size ? size: left;
4214                 memcpy(buf, cache + offset, total_len);
4215                 return total_len;
4216         }
4217
4218         pid_t initpid = lookup_initpid_in_store(fc->pid);
4219         if (initpid <= 0)
4220                 initpid = fc->pid;
4221         cg = get_pid_cgroup(initpid, "memory");
4222         if (!cg)
4223                 return read_file("/proc/swaps", buf, size, d);
4224         prune_init_slice(cg);
4225
4226         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4227
4228         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4229                 goto err;
4230
4231         memusage = strtoul(memusage_str, NULL, 10);
4232
4233         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4234             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4235
4236                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4237                 memswusage = strtoul(memswusage_str, NULL, 10);
4238
4239                 swap_total = (memswlimit - memlimit) / 1024;
4240                 swap_free = (memswusage - memusage) / 1024;
4241         }
4242
4243         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4244
4245         /* When no mem + swap limit is specified or swapaccount=0*/
4246         if (!memswlimit) {
4247                 char *line = NULL;
4248                 size_t linelen = 0;
4249                 FILE *f = fopen("/proc/meminfo", "r");
4250
4251                 if (!f)
4252                         goto err;
4253
4254                 while (getline(&line, &linelen, f) != -1) {
4255                         if (startswith(line, "SwapTotal:")) {
4256                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
4257                         } else if (startswith(line, "SwapFree:")) {
4258                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
4259                         }
4260                 }
4261
4262                 free(line);
4263                 fclose(f);
4264         }
4265
4266         if (swap_total > 0) {
4267                 l = snprintf(d->buf + total_len, d->size - total_len,
4268                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4269                                 swap_total, swap_free);
4270                 total_len += l;
4271         }
4272
4273         if (total_len < 0 || l < 0) {
4274                 perror("Error writing to cache");
4275                 rv = 0;
4276                 goto err;
4277         }
4278
4279         d->cached = 1;
4280         d->size = (int)total_len;
4281
4282         if (total_len > size) total_len = size;
4283         memcpy(buf, d->buf, total_len);
4284         rv = total_len;
4285
4286 err:
4287         free(cg);
4288         free(memswlimit_str);
4289         free(memlimit_str);
4290         free(memusage_str);
4291         free(memswusage_str);
4292         return rv;
4293 }
4294 /*
4295  * Find the process pid from cgroup path.
4296  * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
4297  * @pid_buf : put pid to pid_buf.
4298  * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
4299  * @depth : the depth of cgroup in container.
4300  * @sum : return the number of pid.
4301  * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
4302  */
4303 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
4304 {
4305         DIR *dir;
4306         int fd;
4307         struct dirent *file;
4308         FILE *f = NULL;
4309         size_t linelen = 0;
4310         char *line = NULL;
4311         int pd;
4312         char *path_dir, *path;
4313         char **pid;
4314
4315         /* path = dpath + "/cgroup.procs" + /0 */
4316         do {
4317                 path = malloc(strlen(dpath) + 20);
4318         } while (!path);
4319
4320         strcpy(path, dpath);
4321         fd = openat(cfd, path, O_RDONLY);
4322         if (fd < 0)
4323                 goto out;
4324
4325         dir = fdopendir(fd);
4326         if (dir == NULL) {
4327                 close(fd);
4328                 goto out;
4329         }
4330
4331         while (((file = readdir(dir)) != NULL) && depth > 0) {
4332                 if (strncmp(file->d_name, ".", 1) == 0)
4333                         continue;
4334                 if (strncmp(file->d_name, "..", 1) == 0)
4335                         continue;
4336                 if (file->d_type == DT_DIR) {
4337                         /* path + '/' + d_name +/0 */
4338                         do {
4339                                 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
4340                         } while (!path_dir);
4341                         strcpy(path_dir, path);
4342                         strcat(path_dir, "/");
4343                         strcat(path_dir, file->d_name);
4344                         pd = depth - 1;
4345                         sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
4346                         free(path_dir);
4347                 }
4348         }
4349         closedir(dir);
4350
4351         strcat(path, "/cgroup.procs");
4352         fd = openat(cfd, path, O_RDONLY);
4353         if (fd < 0)
4354                 goto out;
4355
4356         f = fdopen(fd, "r");
4357         if (!f) {
4358                 close(fd);
4359                 goto out;
4360         }
4361
4362         while (getline(&line, &linelen, f) != -1) {
4363                 do {
4364                         pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
4365                 } while (!pid);
4366                 *pid_buf = pid;
4367                 do {
4368                         *(*pid_buf + sum) = malloc(strlen(line) + 1);
4369                 } while (*(*pid_buf + sum) == NULL);
4370                 strcpy(*(*pid_buf + sum), line);
4371                 sum++;
4372         }
4373         fclose(f);
4374 out:
4375         free(path);
4376         return sum;
4377 }
4378 /*
4379  * calc_load calculates the load according to the following formula:
4380  * load1 = load0 * exp + active * (1 - exp)
4381  *
4382  * @load1: the new loadavg.
4383  * @load0: the former loadavg.
4384  * @active: the total number of running pid at this moment.
4385  * @exp: the fixed-point defined in the beginning.
4386  */
4387 static unsigned long
4388 calc_load(unsigned long load, unsigned long exp, unsigned long active)
4389 {
4390         unsigned long newload;
4391
4392         active = active > 0 ? active * FIXED_1 : 0;
4393         newload = load * exp + active * (FIXED_1 - exp);
4394         if (active >= load)
4395                 newload += FIXED_1 - 1;
4396
4397         return newload / FIXED_1;
4398 }
4399
4400 /*
4401  * Return 0 means that container p->cg is closed.
4402  * Return -1 means that error occurred in refresh.
4403  * Positive num equals the total number of pid.
4404  */
4405 static int refresh_load(struct load_node *p, char *path)
4406 {
4407         FILE *f = NULL;
4408         char **idbuf;
4409         char proc_path[256];
4410         int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
4411         char *line = NULL;
4412         size_t linelen = 0;
4413         int sum, length;
4414         DIR *dp;
4415         struct dirent *file;
4416
4417         do {
4418                 idbuf = malloc(sizeof(char *));
4419         } while (!idbuf);
4420         sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
4421         /*  normal exit  */
4422         if (sum == 0)
4423                 goto out;
4424
4425         for (i = 0; i < sum; i++) {
4426                 /*clean up '\n' */
4427                 length = strlen(idbuf[i])-1;
4428                 idbuf[i][length] = '\0';
4429                 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
4430                 if (ret < 0 || ret > 255) {
4431                         lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4432                         i = sum;
4433                         sum = -1;
4434                         goto err_out;
4435                 }
4436
4437                 dp = opendir(proc_path);
4438                 if (!dp) {
4439                         lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
4440                         continue;
4441                 }
4442                 while ((file = readdir(dp)) != NULL) {
4443                         if (strncmp(file->d_name, ".", 1) == 0)
4444                                 continue;
4445                         if (strncmp(file->d_name, "..", 1) == 0)
4446                                 continue;
4447                         total_pid++;
4448                         /* We make the biggest pid become last_pid.*/
4449                         ret = atof(file->d_name);
4450                         last_pid = (ret > last_pid) ? ret : last_pid;
4451
4452                         ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
4453                         if (ret < 0 || ret > 255) {
4454                                 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
4455                                 i = sum;
4456                                 sum = -1;
4457                                 closedir(dp);
4458                                 goto err_out;
4459                         }
4460                         f = fopen(proc_path, "r");
4461                         if (f != NULL) {
4462                                 while (getline(&line, &linelen, f) != -1) {
4463                                         /* Find State */
4464                                         if ((line[0] == 'S') && (line[1] == 't'))
4465                                                 break;
4466                                 }
4467                         if ((line[7] == 'R') || (line[7] == 'D'))
4468                                 run_pid++;
4469                         fclose(f);
4470                         }
4471                 }
4472                 closedir(dp);
4473         }
4474         /*Calculate the loadavg.*/
4475         p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
4476         p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
4477         p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
4478         p->run_pid = run_pid;
4479         p->total_pid = total_pid;
4480         p->last_pid = last_pid;
4481
4482         free(line);
4483 err_out:
4484         for (; i > 0; i--)
4485                 free(idbuf[i-1]);
4486 out:
4487         free(idbuf);
4488         return sum;
4489 }
4490 /*
4491  * Traverse the hash table and update it.
4492  */
4493 void *load_begin(void *arg)
4494 {
4495
4496         char *path = NULL;
4497         int i, sum, length, ret;
4498         struct load_node *f;
4499         int first_node;
4500         clock_t time1, time2;
4501
4502         while (1) {
4503                 time1 = clock();
4504                 for (i = 0; i < LOAD_SIZE; i++) {
4505                         pthread_mutex_lock(&load_hash[i].lock);
4506                         if (load_hash[i].next == NULL) {
4507                                 pthread_mutex_unlock(&load_hash[i].lock);
4508                                 continue;
4509                         }
4510                         f = load_hash[i].next;
4511                         first_node = 1;
4512                         while (f) {
4513                                 length = strlen(f->cg) + 2;
4514                                 do {
4515                                         /* strlen(f->cg) + '.' or '' + \0 */
4516                                         path = malloc(length);
4517                                 } while (!path);
4518
4519                                 ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
4520                                 if (ret < 0 || ret > length - 1) {
4521                                         /* snprintf failed, ignore the node.*/
4522                                         lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
4523                                         goto out;
4524                                 }
4525                                 sum = refresh_load(f, path);
4526                                 if (sum == 0) {
4527                                         f = del_node(f, i);
4528                                 } else {
4529 out:                                    f = f->next;
4530                                 }
4531                                 free(path);
4532                                 /* load_hash[i].lock locks only on the first node.*/
4533                                 if (first_node == 1) {
4534                                         first_node = 0;
4535                                         pthread_mutex_unlock(&load_hash[i].lock);
4536                                 }
4537                         }
4538                 }
4539                 time2 = clock();
4540                 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
4541         }
4542 }
4543
4544 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
4545                 struct fuse_file_info *fi)
4546 {
4547         struct fuse_context *fc = fuse_get_context();
4548         struct file_info *d = (struct file_info *)fi->fh;
4549         pid_t initpid;
4550         char *cg;
4551         size_t total_len = 0;
4552         char *cache = d->buf;
4553         struct load_node *n;
4554         int hash;
4555         int cfd;
4556         unsigned long a, b, c;
4557
4558         if (offset) {
4559                 if (offset > d->size)
4560                         return -EINVAL;
4561                 if (!d->cached)
4562                         return 0;
4563                 int left = d->size - offset;
4564                 total_len = left > size ? size : left;
4565                 memcpy(buf, cache + offset, total_len);
4566                 return total_len;
4567         }
4568         if (!loadavg)
4569                 return read_file("/proc/loadavg", buf, size, d);
4570
4571         initpid = lookup_initpid_in_store(fc->pid);
4572         if (initpid <= 0)
4573                 initpid = fc->pid;
4574         cg = get_pid_cgroup(initpid, "cpu");
4575         if (!cg)
4576                 return read_file("/proc/loadavg", buf, size, d);
4577
4578         prune_init_slice(cg);
4579         hash = calc_hash(cg);
4580         n = locate_node(cg, hash);
4581
4582         /* First time */
4583         if (n == NULL) {
4584                 if (!find_mounted_controller("cpu", &cfd)) {
4585                         /*
4586                          * In locate_node() above, pthread_rwlock_unlock() isn't used
4587                          * because delete is not allowed before read has ended.
4588                          */
4589                         pthread_rwlock_unlock(&load_hash[hash].rdlock);
4590                         return 0;
4591                 }
4592                 do {
4593                         n = malloc(sizeof(struct load_node));
4594                 } while (!n);
4595
4596                 do {
4597                         n->cg = malloc(strlen(cg)+1);
4598                 } while (!n->cg);
4599                 strcpy(n->cg, cg);
4600                 n->avenrun[0] = 0;
4601                 n->avenrun[1] = 0;
4602                 n->avenrun[2] = 0;
4603                 n->run_pid = 0;
4604                 n->total_pid = 1;
4605                 n->last_pid = initpid;
4606                 n->cfd = cfd;
4607                 insert_node(&n, hash);
4608         }
4609         a = n->avenrun[0] + (FIXED_1/200);
4610         b = n->avenrun[1] + (FIXED_1/200);
4611         c = n->avenrun[2] + (FIXED_1/200);
4612         total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
4613                 LOAD_INT(a), LOAD_FRAC(a),
4614                 LOAD_INT(b), LOAD_FRAC(b),
4615                 LOAD_INT(c), LOAD_FRAC(c),
4616                 n->run_pid, n->total_pid, n->last_pid);
4617         pthread_rwlock_unlock(&load_hash[hash].rdlock);
4618         if (total_len < 0 || total_len >=  d->buflen) {
4619                 lxcfs_error("%s\n", "Failed to write to cache");
4620                 return 0;
4621         }
4622         d->size = (int)total_len;
4623         d->cached = 1;
4624
4625         if (total_len > size)
4626                 total_len = size;
4627         memcpy(buf, d->buf, total_len);
4628         return total_len;
4629 }
4630 /* Return a positive number on success, return 0 on failure.*/
4631 pthread_t load_daemon(int load_use)
4632 {
4633         int ret;
4634         pthread_t pid;
4635
4636         ret = init_load();
4637         if (ret == -1) {
4638                 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
4639                 return 0;
4640         }
4641         ret = pthread_create(&pid, NULL, load_begin, NULL);
4642         if (ret != 0) {
4643                 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
4644                 load_free();
4645                 return 0;
4646         }
4647         /* use loadavg, here loadavg = 1*/
4648         loadavg = load_use;
4649         return pid;
4650 }
4651
/*
 * Return the number of bytes obtained by reading the file @which line by
 * line until getline() fails, or 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
        ssize_t total = 0, nread;
        size_t cap = 0;
        char *buf = NULL;
        FILE *fp;

        fp = fopen(which, "r");
        if (fp == NULL)
                return 0;

        for (;;) {
                nread = getline(&buf, &cap, fp);
                if (nread == -1)
                        break;
                total += nread;
        }

        free(buf);
        fclose(fp);

        return total;
}
4668
/*
 * Fill @sb for a path below /proc.
 *
 * "/proc" is reported as a directory with mode 0555 and two links; each of
 * the emulated files is reported as a regular file with mode 0444, size 0
 * and one link. Owner and group are 0 and all three timestamps are the
 * current time.
 *
 * Returns 0 on success, -EINVAL if the current time cannot be read and
 * -ENOENT for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
        static const char *const proc_files[] = {
                "/proc/meminfo",
                "/proc/cpuinfo",
                "/proc/uptime",
                "/proc/stat",
                "/proc/diskstats",
                "/proc/swaps",
                "/proc/loadavg",
        };
        struct timespec now;
        size_t i;

        memset(sb, 0, sizeof(struct stat));
        if (clock_gettime(CLOCK_REALTIME, &now) < 0)
                return -EINVAL;

        sb->st_uid = 0;
        sb->st_gid = 0;
        sb->st_atim = now;
        sb->st_mtim = now;
        sb->st_ctim = now;

        if (strcmp(path, "/proc") == 0) {
                sb->st_mode = S_IFDIR | 00555;
                sb->st_nlink = 2;
                return 0;
        }

        for (i = 0; i < sizeof(proc_files) / sizeof(proc_files[0]); i++) {
                if (strcmp(path, proc_files[i]) != 0)
                        continue;
                sb->st_size = 0;
                sb->st_mode = S_IFREG | 00444;
                sb->st_nlink = 1;
                return 0;
        }

        return -ENOENT;
}
4698
4699 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4700                 struct fuse_file_info *fi)
4701 {
4702         if (filler(buf, ".", NULL, 0) != 0 ||
4703             filler(buf, "..", NULL, 0) != 0 ||
4704             filler(buf, "cpuinfo", NULL, 0) != 0 ||
4705             filler(buf, "meminfo", NULL, 0) != 0 ||
4706             filler(buf, "stat", NULL, 0) != 0 ||
4707             filler(buf, "uptime", NULL, 0) != 0 ||
4708             filler(buf, "diskstats", NULL, 0) != 0 ||
4709             filler(buf, "swaps", NULL, 0) != 0   ||
4710             filler(buf, "loadavg", NULL, 0) != 0)
4711                 return -EINVAL;
4712         return 0;
4713 }
4714
4715 int proc_open(const char *path, struct fuse_file_info *fi)
4716 {
4717         int type = -1;
4718         struct file_info *info;
4719
4720         if (strcmp(path, "/proc/meminfo") == 0)
4721                 type = LXC_TYPE_PROC_MEMINFO;
4722         else if (strcmp(path, "/proc/cpuinfo") == 0)
4723                 type = LXC_TYPE_PROC_CPUINFO;
4724         else if (strcmp(path, "/proc/uptime") == 0)
4725                 type = LXC_TYPE_PROC_UPTIME;
4726         else if (strcmp(path, "/proc/stat") == 0)
4727                 type = LXC_TYPE_PROC_STAT;
4728         else if (strcmp(path, "/proc/diskstats") == 0)
4729                 type = LXC_TYPE_PROC_DISKSTATS;
4730         else if (strcmp(path, "/proc/swaps") == 0)
4731                 type = LXC_TYPE_PROC_SWAPS;
4732         else if (strcmp(path, "/proc/loadavg") == 0)
4733                 type = LXC_TYPE_PROC_LOADAVG;
4734         if (type == -1)
4735                 return -ENOENT;
4736
4737         info = malloc(sizeof(*info));
4738         if (!info)
4739                 return -ENOMEM;
4740
4741         memset(info, 0, sizeof(*info));
4742         info->type = type;
4743
4744         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4745         do {
4746                 info->buf = malloc(info->buflen);
4747         } while (!info->buf);
4748         memset(info->buf, 0, info->buflen);
4749         /* set actual size to buffer size */
4750         info->size = info->buflen;
4751
4752         fi->fh = (unsigned long)info;
4753         return 0;
4754 }
4755
/*
 * Access check for the emulated /proc tree.
 *
 * "/proc" itself is granted whenever access(path, R_OK) succeeds. For every
 * other case only a mask with no bits besides R_OK is granted.
 *
 * Returns 0 when access is granted, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
        bool is_proc_dir = (strcmp(path, "/proc") == 0);

        if (is_proc_dir && access(path, R_OK) == 0)
                return 0;

        /* these are all read-only */
        return ((mask & ~R_OK) != 0) ? -EACCES : 0;
}
4766
/*
 * Release handler for the emulated /proc files: hands @fi to
 * do_release_file_info() and always reports success.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
        /* The path plays no role; every file is released the same way. */
        (void)path;

        do_release_file_info(fi);

        return 0;
}
4772
4773 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4774                 struct fuse_file_info *fi)
4775 {
4776         struct file_info *f = (struct file_info *) fi->fh;
4777
4778         switch (f->type) {
4779         case LXC_TYPE_PROC_MEMINFO:
4780                 return proc_meminfo_read(buf, size, offset, fi);
4781         case LXC_TYPE_PROC_CPUINFO:
4782                 return proc_cpuinfo_read(buf, size, offset, fi);
4783         case LXC_TYPE_PROC_UPTIME:
4784                 return proc_uptime_read(buf, size, offset, fi);
4785         case LXC_TYPE_PROC_STAT:
4786                 return proc_stat_read(buf, size, offset, fi);
4787         case LXC_TYPE_PROC_DISKSTATS:
4788                 return proc_diskstats_read(buf, size, offset, fi);
4789         case LXC_TYPE_PROC_SWAPS:
4790                 return proc_swaps_read(buf, size, offset, fi);
4791         case LXC_TYPE_PROC_LOADAVG:
4792                 return proc_loadavg_read(buf, size, offset, fi);
4793         default:
4794                 return -EINVAL;
4795         }
4796 }
4797
4798 /*
4799  * Functions needed to setup cgroups in the __constructor__.
4800  */
4801
/*
 * Create directory @dir together with any missing parent directories, each
 * with permissions @mode. Components that already exist are not an error.
 *
 * Returns true on success, false if memory for a path prefix cannot be
 * allocated or a mkdir() call fails with something other than EEXIST.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
        const char *tmp = dir;
        const char *orig = dir;
        char *makeme;

        do {
                /* dir: start of the next component, tmp: the '/' (or end) after it. */
                dir = tmp + strspn(tmp, "/");
                tmp = dir + strcspn(dir, "/");
                /* Everything in front of the current component. */
                makeme = strndup(orig, dir - orig);
                if (!makeme)
                        return false;
                /*
                 * For a path without a leading '/' the first prefix is empty;
                 * mkdir("") would fail with ENOENT, so skip it.
                 */
                if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
                        lxcfs_error("Failed to create directory '%s': %s.\n",
                                makeme, strerror(errno));
                        free(makeme);
                        return false;
                }
                free(makeme);
        } while(tmp != dir);

        return true;
}
4825
4826 static bool umount_if_mounted(void)
4827 {
4828         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4829                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4830                 return false;
4831         }
4832         return true;
4833 }
4834
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs carries the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
        fs_type_magic wanted = (fs_type_magic)magic_val;

        if (fs->f_type != wanted)
                return false;

        return true;
}
4841
/*
 * Looking at fs/proc_namespace.c, it appears we can actually expect the
 * rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Scans /proc/self/mountinfo for an entry whose mount point is "/" and whose
 * text after the first '-' starts with "- rootfs rootfs ". Returns true when
 * such an entry exists, false otherwise (including when the file cannot be
 * opened).
 */
static bool is_on_ramfs(void)
{
        bool on_ramfs = false;
        char *line = NULL;
        size_t cap = 0;
        FILE *fp;

        fp = fopen("/proc/self/mountinfo", "r");
        if (fp == NULL)
                return false;

        while (!on_ramfs && getline(&line, &cap, fp) != -1) {
                char *mnt, *mnt_end, *sep;
                int skipped = 0;

                /* Advance to the fourth space; the mount point follows it. */
                mnt = line;
                while (mnt && skipped < 4) {
                        mnt = strchr(mnt + 1, ' ');
                        skipped++;
                }
                if (!mnt)
                        continue;

                mnt_end = strchr(mnt + 1, ' ');
                if (!mnt_end)
                        continue;
                *mnt_end = '\0';

                if (strcmp(mnt + 1, "/") != 0)
                        continue;

                // this is '/'.  is it the ramfs?
                sep = strchr(mnt_end + 1, '-');
                if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
                        on_ramfs = true;
        }

        free(line);
        fclose(fp);

        return on_ramfs;
}
4884
4885 static int pivot_enter()
4886 {
4887         int ret = -1, oldroot = -1, newroot = -1;
4888
4889         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4890         if (oldroot < 0) {
4891                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4892                 return ret;
4893         }
4894
4895         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4896         if (newroot < 0) {
4897                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4898                 goto err;
4899         }
4900
4901         /* change into new root fs */
4902         if (fchdir(newroot) < 0) {
4903                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4904                 goto err;
4905         }
4906
4907         /* pivot_root into our new root fs */
4908         if (pivot_root(".", ".") < 0) {
4909                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4910                 goto err;
4911         }
4912
4913         /*
4914          * At this point the old-root is mounted on top of our new-root.
4915          * To unmounted it we must not be chdir'd into it, so escape back
4916          * to the old-root.
4917          */
4918         if (fchdir(oldroot) < 0) {
4919                 lxcfs_error("%s\n", "Failed to enter old root.");
4920                 goto err;
4921         }
4922
4923         if (umount2(".", MNT_DETACH) < 0) {
4924                 lxcfs_error("%s\n", "Failed to detach old root.");
4925                 goto err;
4926         }
4927
4928         if (fchdir(newroot) < 0) {
4929                 lxcfs_error("%s\n", "Failed to re-enter new root.");
4930                 goto err;
4931         }
4932
4933         ret = 0;
4934
4935 err:
4936         if (oldroot > 0)
4937                 close(oldroot);
4938         if (newroot > 0)
4939                 close(newroot);
4940
4941         return ret;
4942 }
4943
4944 static int chroot_enter()
4945 {
4946         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4947                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4948                 return -1;
4949         }
4950
4951         if (chroot(".") < 0) {
4952                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4953                 return -1;
4954         }
4955
4956         if (chdir("/") < 0) {
4957                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4958                 return -1;
4959         }
4960
4961         return 0;
4962 }
4963
4964 static int permute_and_enter(void)
4965 {
4966         struct statfs sb;
4967
4968         if (statfs("/", &sb) < 0) {
4969                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
4970                 return -1;
4971         }
4972
4973         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
4974          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
4975          * /proc/1/mountinfo. */
4976         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
4977                 return chroot_enter();
4978
4979         if (pivot_enter() < 0) {
4980                 lxcfs_error("%s\n", "Could not perform pivot root.");
4981                 return -1;
4982         }
4983
4984         return 0;
4985 }
4986
4987 /* Prepare our new clean root. */
4988 static int permute_prepare(void)
4989 {
4990         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4991                 lxcfs_error("%s\n", "Failed to create directory for new root.");
4992                 return -1;
4993         }
4994
4995         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4996                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4997                 return -1;
4998         }
4999
5000         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5001                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5002                 return -1;
5003         }
5004
5005         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5006                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5007                 return -1;
5008         }
5009
5010         return 0;
5011 }
5012
/*
 * Calls chroot() on ramfs, pivot_root() in all other cases.
 *
 * The new root is prepared first and entered only if that succeeded.
 * Returns true when both steps succeed.
 */
static bool permute_root(void)
{
        bool prepared, entered;

        /* Prepare new root. */
        prepared = (permute_prepare() >= 0);
        if (!prepared)
                return false;

        /* Pivot into new root. */
        entered = (permute_and_enter() >= 0);

        return entered;
}
5026
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC.
 *
 * Returns the new file descriptor, or -1 when the path cannot be formatted
 * or opened.
 */
static int preserve_mnt_ns(int pid)
{
        /* Room for both literals plus 21 characters for the pid digits. */
        char path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
        int n;

        n = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
        if (n < 0)
                return -1;
        if ((size_t)n >= sizeof(path))
                return -1;

        return open(path, O_RDONLY | O_CLOEXEC);
}
5039
5040 static bool cgfs_prepare_mounts(void)
5041 {
5042         if (!mkdir_p(BASEDIR, 0700)) {
5043                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
5044                 return false;
5045         }
5046
5047         if (!umount_if_mounted()) {
5048                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
5049                 return false;
5050         }
5051
5052         if (unshare(CLONE_NEWNS) < 0) {
5053                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
5054                 return false;
5055         }
5056
5057         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
5058         if (cgroup_mount_ns_fd < 0) {
5059                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
5060                 return false;
5061         }
5062
5063         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
5064                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
5065                 return false;
5066         }
5067
5068         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
5069                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
5070                 return false;
5071         }
5072
5073         return true;
5074 }
5075
5076 static bool cgfs_mount_hierarchies(void)
5077 {
5078         char *target;
5079         size_t clen, len;
5080         int i, ret;
5081
5082         for (i = 0; i < num_hierarchies; i++) {
5083                 char *controller = hierarchies[i];
5084
5085                 clen = strlen(controller);
5086                 len = strlen(BASEDIR) + clen + 2;
5087                 target = malloc(len);
5088                 if (!target)
5089                         return false;
5090
5091                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
5092                 if (ret < 0 || ret >= len) {
5093                         free(target);
5094                         return false;
5095                 }
5096                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
5097                         free(target);
5098                         return false;
5099                 }
5100                 if (!strcmp(controller, "unified"))
5101                         ret = mount("none", target, "cgroup2", 0, NULL);
5102                 else
5103                         ret = mount(controller, target, "cgroup", 0, controller);
5104                 if (ret < 0) {
5105                         lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
5106                         free(target);
5107                         return false;
5108                 }
5109
5110                 fd_hierarchies[i] = open(target, O_DIRECTORY);
5111                 if (fd_hierarchies[i] < 0) {
5112                         free(target);
5113                         return false;
5114                 }
5115                 free(target);
5116         }
5117         return true;
5118 }
5119
/*
 * Set up the private cgroup mounts: prepare the mount points, mount the
 * hierarchies and finally switch into the new root.
 *
 * Returns true when all three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
        bool mounted;

        if (!cgfs_prepare_mounts())
                return false;

        mounted = cgfs_mount_hierarchies();
        if (!mounted) {
                lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
                return false;
        }

        return permute_root();
}
5135
5136 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
5137 {
5138         FILE *f;
5139         char *cret, *line = NULL;
5140         char cwd[MAXPATHLEN];
5141         size_t len = 0;
5142         int i, init_ns = -1;
5143         bool found_unified = false;
5144
5145         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
5146                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
5147                 return;
5148         }
5149
5150         while (getline(&line, &len, f) != -1) {
5151                 char *idx, *p, *p2;
5152
5153                 p = strchr(line, ':');
5154                 if (!p)
5155                         goto out;
5156                 idx = line;
5157                 *(p++) = '\0';
5158
5159                 p2 = strrchr(p, ':');
5160                 if (!p2)
5161                         goto out;
5162                 *p2 = '\0';
5163
5164                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
5165                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
5166                  * because it parses out the empty string "" and later on passes
5167                  * it to mount(). Let's skip such entries.
5168                  */
5169                 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
5170                         found_unified = true;
5171                         p = "unified";
5172                 }
5173
5174                 if (!store_hierarchy(line, p))
5175                         goto out;
5176         }
5177
5178         /* Preserve initial namespace. */
5179         init_ns = preserve_mnt_ns(getpid());
5180         if (init_ns < 0) {
5181                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
5182                 goto out;
5183         }
5184
5185         fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
5186         if (!fd_hierarchies) {
5187                 lxcfs_error("%s\n", strerror(errno));
5188                 goto out;
5189         }
5190
5191         for (i = 0; i < num_hierarchies; i++)
5192                 fd_hierarchies[i] = -1;
5193
5194         cret = getcwd(cwd, MAXPATHLEN);
5195         if (!cret)
5196                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
5197
5198         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5199          * to privately mount lxcfs cgroups. */
5200         if (!cgfs_setup_controllers()) {
5201                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
5202                 goto out;
5203         }
5204
5205         if (setns(init_ns, 0) < 0) {
5206                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
5207                 goto out;
5208         }
5209
5210         if (!cret || chdir(cwd) < 0)
5211                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
5212
5213         print_subsystems();
5214
5215 out:
5216         free(line);
5217         fclose(f);
5218         if (init_ns >= 0)
5219                 close(init_ns);
5220 }
5221
5222 static void __attribute__((destructor)) free_subsystems(void)
5223 {
5224         int i;
5225
5226         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
5227
5228         for (i = 0; i < num_hierarchies; i++) {
5229                 if (hierarchies[i])
5230                         free(hierarchies[i]);
5231                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
5232                         close(fd_hierarchies[i]);
5233         }
5234         free(hierarchies);
5235         free(fd_hierarchies);
5236
5237         if (cgroup_mount_ns_fd >= 0)
5238                 close(cgroup_mount_ns_fd);
5239 }