]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
utils: split helpers from bindings.c into utils.{c,h}
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "config.h"
42 #include "cgroups/cgroup.h"
43 #include "cgroups/cgroup_utils.h"
44 #include "memory_utils.h"
45 #include "utils.h"
46
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/*
 * pivot_root - change the root filesystem of the calling process.
 * Thin wrapper over the raw syscall; returns -1 with errno = ENOSYS
 * when the kernel headers do not define __NR_pivot_root.
 */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
61
/*
 * One per-CPU usage sample for the CPU view code below.
 * NOTE(review): values appear to come from cpuacct / /proc/stat readers
 * elsewhere in this file — confirm units there.
 */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
	bool online; /* whether this CPU slot is currently considered online */
};

/* Parameters of the per-cgroup loadavg hash table. */
#define LOAD_SIZE 100 /* number of buckets in the hash table */
#define FLUSH_TIME 5 /* refresh period — presumably seconds; see the refresh thread */
#define DEPTH_DIR 3 /* directory depth scanned per cgroup */
/* Fixed-point constants for the loadavg calculation (mirror the kernel's). */
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* NOTE(review): presumably signals the loadavg worker to exit — confirm at its loop. */
static volatile sig_atomic_t loadavg_stop = 0;
/*
 * calc_hash - ELF hash of a NUL-terminated string, masked to a
 * non-negative int so it can be used as a bucket index.
 */
static int calc_hash(const char *name)
{
	unsigned int hash = 0, top;

	while (*name) {
		hash = (hash << 4) + *name++;
		top = hash & 0xf0000000;
		if (top)
			hash ^= top >> 24;
		hash &= ~top;
	}

	return (int)(hash & 0x7fffffff);
}
101
/* One tracked cgroup in the loadavg hash table. */
struct load_node {
	char *cg; /* cgroup path this node tracks */
	unsigned long avenrun[3]; /* Load averages (fixed-point, see FSHIFT above) */
	unsigned int run_pid;   /* presumably count of running tasks — confirm in refresh code */
	unsigned int total_pid; /* presumably total task count — confirm in refresh code */
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next;
	/* Address of the previous node's `next` field (or of the bucket
	 * head), enabling O(1) unlink in del_node(). */
	struct load_node **pre;
};

/* One bucket of the loadavg hash table. */
struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node.To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node.To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node.To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};

static struct load_head load_hash[LOAD_SIZE]; /* hash table */
136 /*
137 * init_load initialize the hash table.
138 * Return 0 on success, return -1 on failure.
139 */
140 static int init_load(void)
141 {
142 int i;
143 int ret;
144
145 for (i = 0; i < LOAD_SIZE; i++) {
146 load_hash[i].next = NULL;
147 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
148 if (ret != 0) {
149 lxcfs_error("%s\n", "Failed to initialize lock");
150 goto out3;
151 }
152 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
153 if (ret != 0) {
154 lxcfs_error("%s\n", "Failed to initialize rdlock");
155 goto out2;
156 }
157 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
158 if (ret != 0) {
159 lxcfs_error("%s\n", "Failed to initialize rilock");
160 goto out1;
161 }
162 }
163 return 0;
164 out1:
165 pthread_rwlock_destroy(&load_hash[i].rdlock);
166 out2:
167 pthread_mutex_destroy(&load_hash[i].lock);
168 out3:
169 while (i > 0) {
170 i--;
171 pthread_mutex_destroy(&load_hash[i].lock);
172 pthread_rwlock_destroy(&load_hash[i].rdlock);
173 pthread_rwlock_destroy(&load_hash[i].rilock);
174 }
175 return -1;
176 }
177
/*
 * insert_node - push *n onto the front of hash bucket @locate.
 * Holds the bucket's mutex (vs. other inserts/refresh) and write-locks
 * rilock (vs. readers entering the list head) for the duration.
 */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	/* Wire up the back-pointers so del_node() can unlink in O(1). */
	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node - find the node for cgroup @cg in bucket @locate.
 * A non-NULL return means success.
 *
 * NOTE: rdlock is intentionally NOT released on any return path
 * (including the NULL return): the caller reads the node's fields and
 * must be protected from concurrent deletion. The caller — i.e.
 * proc_loadavg_read() — is responsible for unlocking rdlock.
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	/* rilock only guards the list head; drop it once we are past it. */
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
218
219 /* Delete the load_node n and return the next node of it. */
220 static struct load_node *del_node(struct load_node *n, int locate)
221 {
222 struct load_node *g;
223
224 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
225 if (n->next == NULL) {
226 *(n->pre) = NULL;
227 } else {
228 *(n->pre) = n->next;
229 n->next->pre = n->pre;
230 }
231 g = n->next;
232 free_disarm(n->cg);
233 free_disarm(n);
234 pthread_rwlock_unlock(&load_hash[locate].rdlock);
235 return g;
236 }
237
/*
 * load_free - tear down the whole loadavg hash table.
 * For every bucket: take all three locks, free the node chain, then
 * unlock and destroy each lock. The exact unlock/destroy interleaving
 * of the original is preserved deliberately.
 */
static void load_free(void)
{
	struct load_node *f, *p;

	for (int i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			/* Empty bucket: just release and destroy the locks. */
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}

		/* Free every node in this bucket's chain. */
		for (f = load_hash[i].next; f;) {
			free_disarm(f->cg);
			p = f->next;
			free_disarm(f);
			f = p;
		}

		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
271
/* Data for CPU view */
struct cg_proc_stat {
	char *cg; /* cgroup this entry caches stats for */
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count; /* number of entries in usage/view */
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next;
};

/* One bucket of the cpuview history hash table. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck; /* last time this bucket was pruned/checked */

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
294
295 static bool cpuview_init_head(struct cg_proc_stat_head **head)
296 {
297 *head = malloc(sizeof(struct cg_proc_stat_head));
298 if (!(*head)) {
299 lxcfs_error("%s\n", strerror(errno));
300 return false;
301 }
302
303 (*head)->lastcheck = time(NULL);
304 (*head)->next = NULL;
305
306 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
307 lxcfs_error("%s\n", "Failed to initialize list lock");
308 free_disarm(*head);
309 return false;
310 }
311
312 return true;
313 }
314
315 static bool init_cpuview()
316 {
317 int i;
318
319 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
320 proc_stat_history[i] = NULL;
321
322 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
323 if (!cpuview_init_head(&proc_stat_history[i]))
324 goto err;
325 }
326
327 return true;
328
329 err:
330 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
331 if (proc_stat_history[i])
332 free_disarm(proc_stat_history[i]);
333 }
334
335 return false;
336 }
337
338 static void free_proc_stat_node(struct cg_proc_stat *node)
339 {
340 pthread_mutex_destroy(&node->lock);
341 free_disarm(node->cg);
342 free_disarm(node->usage);
343 free_disarm(node->view);
344 free_disarm(node);
345 }
346
347 static void cpuview_free_head(struct cg_proc_stat_head *head)
348 {
349 struct cg_proc_stat *node, *tmp;
350
351 if (head->next) {
352 node = head->next;
353
354 for (;;) {
355 tmp = node;
356 node = node->next;
357 free_proc_stat_node(tmp);
358
359 if (!node)
360 break;
361 }
362 }
363
364 pthread_rwlock_destroy(&head->lock);
365 free_disarm(head);
366 }
367
368 static void free_cpuview()
369 {
370 int i;
371
372 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
373 if (proc_stat_history[i])
374 cpuview_free_head(proc_stat_history[i]);
375 }
376 }
377
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *      ucred.pid = 1, and read the initpid. Cache
 *      initpid and creation time for /proc/initpid
 *      in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *      what we have saved. If not, clear the store
 *      entry and go back to a. If so, return the
 *      cached initpid.
 */
struct pidns_init_store {
	ino_t ino; // inode number for /proc/$pid/ns/pid
	pid_t initpid; // the pid of init in that ns
	long int ctime; // the time at which /proc/$initpid was created
	struct pidns_init_store *next;
	long int lastcheck; // last time this entry was used/verified (for pruning)
};

/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Buckets keyed by HASH(ino); all access goes through store_lock/store_unlock. */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* lock_mutex - pthread_mutex_lock that aborts the process on failure. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_lock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
415
/* Global cgroup driver handle; initialized during startup (outside this
 * view) and consulted by the cgfs_* helpers below. */
struct cgroup_ops *cgroup_ops;
417
/* unlock_mutex - pthread_mutex_unlock that aborts the process on failure. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret = pthread_mutex_unlock(l);

	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
427
/* Acquire the global pidns-init-store mutex. */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

/* Release the global pidns-init-store mutex. */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
437
438 /* Must be called under store_lock */
439 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
440 {
441 struct stat initsb;
442 char fnam[100];
443
444 snprintf(fnam, 100, "/proc/%d", e->initpid);
445 if (stat(fnam, &initsb) < 0)
446 return false;
447
448 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
449 initsb.st_ctime, e->initpid);
450
451 if (e->ctime != initsb.st_ctime)
452 return false;
453 return true;
454 }
455
456 /* Must be called under store_lock */
457 static void remove_initpid(struct pidns_init_store *e)
458 {
459 struct pidns_init_store *tmp;
460 int h;
461
462 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
463
464 h = HASH(e->ino);
465 if (pidns_hash_table[h] == e) {
466 pidns_hash_table[h] = e->next;
467 free_disarm(e);
468 return;
469 }
470
471 tmp = pidns_hash_table[h];
472 while (tmp) {
473 if (tmp->next == e) {
474 tmp->next = e->next;
475 free_disarm(e);
476 return;
477 }
478 tmp = tmp->next;
479 }
480 }
481
#define PURGE_SECS 5
/*
 * prune_initpid_store - age out cached init-pid entries.
 * Runs at most once per PURGE_SECS; drops entries whose lastcheck is
 * older than 2*PURGE_SECS. The first invocation only records the time
 * and prunes nothing. Must be called under store_lock.
 */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				/* Unlink e, keeping prev in place for the next iteration. */
				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free_disarm(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
524
/*
 * save_initpid - cache @pid as the init pid for the pid namespace whose
 * /proc/<qpid>/ns/pid stat buffer is @sb. Records /proc/<pid>'s ctime
 * so later lookups can detect pid reuse. Silently does nothing if
 * /proc/<pid> cannot be stat'ed. Must be called under store_lock.
 */
static void save_initpid(struct stat *sb, pid_t pid)
{
	struct pidns_init_store *e;
	char fpath[100];
	struct stat procsb;
	int h;

	lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);

	snprintf(fpath, 100, "/proc/%d", pid);
	if (stat(fpath, &procsb) < 0)
		return;
	/* Retry-until-success allocation: this file treats OOM as non-fatal
	 * only by spinning (the "must succeed" idiom used throughout). */
	do {
		e = malloc(sizeof(*e));
	} while (!e);
	e->ino = sb->st_ino;
	e->initpid = pid;
	e->ctime = procsb.st_ctime;
	h = HASH(e->ino);
	e->next = pidns_hash_table[h];
	e->lastcheck = time(NULL);
	pidns_hash_table[h] = e;
}
549
550 /*
551 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
552 * entry for the inode number and creation time. Verify that the init pid
553 * is still valid. If not, remove it. Return the entry if valid, NULL
554 * otherwise.
555 * Must be called under store_lock
556 */
557 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
558 {
559 int h = HASH(sb->st_ino);
560 struct pidns_init_store *e = pidns_hash_table[h];
561
562 while (e) {
563 if (e->ino == sb->st_ino) {
564 if (initpid_still_valid(e, sb)) {
565 e->lastcheck = time(NULL);
566 return e;
567 }
568 remove_initpid(e);
569 return NULL;
570 }
571 e = e->next;
572 }
573
574 return NULL;
575 }
576
577 static int is_dir(const char *path, int fd)
578 {
579 struct stat statbuf;
580 int ret = fstatat(fd, path, &statbuf, fd);
581 if (ret == 0 && S_ISDIR(statbuf.st_mode))
582 return 1;
583 return 0;
584 }
585
/*
 * write_string - write @string to the open descriptor @fd, which is
 * consumed (closed) in all cases. @fnam is only used for error
 * messages. Returns true if the full string was written and the stream
 * closed cleanly.
 *
 * Fix: fd used to leak when fdopen() failed; it is now closed.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	f = fdopen(fd, "w");
	if (!f) {
		/* fdopen() failed, so we still own fd and must close it. */
		close(fd);
		return false;
	}

	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a short flush (e.g. ENOSPC) surfaces here. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
611
/* Ownership/mode metadata for one cgroup file, as built by cgfs_get_key(). */
struct cgfs_files {
	char *name;        /* file name (heap-allocated, freed by free_key()) */
	uint32_t uid, gid; /* owner */
	uint32_t mode;     /* st_mode bits */
};
617
/*
 * print_subsystems - debug helper: dump the detected cgroup hierarchies
 * to stderr, one line per hierarchy with its fd and comma-joined
 * controller list.
 */
static void print_subsystems(void)
{
	int i = 0;

	fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
	fprintf(stderr, "hierarchies:\n");
	for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
		__do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
		fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
	}
}
629
/*
 * cgfs_set_value - write @value into <controller>/<cgroup>/<file>.
 * Returns true on success. The descriptor opened here is consumed by
 * write_string() via fdopen()/fclose().
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(fnam, value, fd);
}
656
657 // Chown all the files in the cgroup directory. We do this when we create
658 // a cgroup on behalf of a user.
659 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
660 {
661 struct dirent *direntp;
662 char path[MAXPATHLEN];
663 size_t len;
664 DIR *d;
665 int fd1, ret;
666
667 len = strlen(dirname);
668 if (len >= MAXPATHLEN) {
669 lxcfs_error("Pathname too long: %s\n", dirname);
670 return;
671 }
672
673 fd1 = openat(fd, dirname, O_DIRECTORY);
674 if (fd1 < 0)
675 return;
676
677 d = fdopendir(fd1);
678 if (!d) {
679 lxcfs_error("Failed to open %s\n", dirname);
680 return;
681 }
682
683 while ((direntp = readdir(d))) {
684 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
685 continue;
686 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
687 if (ret < 0 || ret >= MAXPATHLEN) {
688 lxcfs_error("Pathname too long under %s\n", dirname);
689 continue;
690 }
691 if (fchownat(fd, path, uid, gid, 0) < 0)
692 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
693 }
694 closedir(d);
695 }
696
697 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
698 {
699 int cfd;
700 size_t len;
701 char *dirnam;
702
703 cfd = get_cgroup_fd(controller);
704 if (cfd < 0)
705 return -EINVAL;
706
707 /* Make sure we pass a relative path to *at() family of functions.
708 * . + /cg + \0
709 */
710 len = strlen(cg) + 2;
711 dirnam = alloca(len);
712 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
713
714 if (mkdirat(cfd, dirnam, 0755) < 0)
715 return -errno;
716
717 if (uid == 0 && gid == 0)
718 return 0;
719
720 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
721 return -errno;
722
723 chown_all_cgroup_files(dirnam, uid, gid, cfd);
724
725 return 0;
726 }
727
/*
 * recursive_rmdir - remove @dirname (a path relative to cgroup fd @cfd)
 * and its subdirectories. @fd is an open fd for the directory being
 * walked; it is dup()ed before fdopendir() so the caller keeps a usable
 * descriptor.
 *
 * NOTE(review): the recursive call passes the same @fd rather than
 * opening the child directory, and the dup()ed fd shares its offset —
 * this looks suspicious but matches long-standing behavior; deletion
 * itself always goes through @cfd-relative paths via unlinkat().
 * Confirm against upstream before changing.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* Only directories are recursed into; cgroupfs has no regular
		 * files that can be unlinked individually. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
786
787 bool cgfs_remove(const char *controller, const char *cg)
788 {
789 int fd, cfd;
790 size_t len;
791 char *dirnam;
792 bool bret;
793
794 cfd = get_cgroup_fd(controller);
795 if (cfd < 0)
796 return false;
797
798 /* Make sure we pass a relative path to *at() family of functions.
799 * . + /cg + \0
800 */
801 len = strlen(cg) + 2;
802 dirnam = alloca(len);
803 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
804
805 fd = openat(cfd, dirnam, O_DIRECTORY);
806 if (fd < 0)
807 return false;
808
809 bret = recursive_rmdir(dirnam, fd, cfd);
810 close(fd);
811 return bret;
812 }
813
/*
 * cgfs_chmod_file - chmod @file (relative to @controller's mount) to
 * @mode. Returns true on success.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t len;
	char *path;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Relative path for the *at() calls: "." + file + NUL. */
	len = strlen(file) + 2;
	path = alloca(len);
	snprintf(path, len, "%s%s", dot_or_empty(file), file);

	return fchmodat(cfd, path, mode, 0) == 0;
}
834
835 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
836 {
837 size_t len;
838 char *fname;
839
840 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
841 fname = alloca(len);
842 snprintf(fname, len, "%s/tasks", dirname);
843 if (fchownat(fd, fname, uid, gid, 0) != 0)
844 return -errno;
845 snprintf(fname, len, "%s/cgroup.procs", dirname);
846 if (fchownat(fd, fname, uid, gid, 0) != 0)
847 return -errno;
848 return 0;
849 }
850
851 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
852 {
853 int cfd;
854 size_t len;
855 char *pathname;
856
857 cfd = get_cgroup_fd(controller);
858 if (cfd < 0)
859 return false;
860
861 /* Make sure we pass a relative path to *at() family of functions.
862 * . + /file + \0
863 */
864 len = strlen(file) + 2;
865 pathname = alloca(len);
866 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
867 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
868 return -errno;
869
870 if (is_dir(pathname, cfd))
871 // like cgmanager did, we want to chown the tasks file as well
872 return chown_tasks_files(pathname, uid, gid, cfd);
873
874 return 0;
875 }
876
/*
 * open_pids_file - open <controller>/<cgroup>/cgroup.procs for writing.
 * Returns a FILE * stream on success, NULL on failure.
 *
 * Fixes: the unknown-controller path used to `return false` from a
 * pointer-returning function, and the descriptor leaked when fdopen()
 * failed.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname;
	FILE *f;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		close(fd); /* fdopen() failed; fd would otherwise leak */

	return f;
}
900
901 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
902 void ***list, size_t typesize,
903 void* (*iterator)(const char*, const char*, const char*))
904 {
905 int cfd, fd, ret;
906 size_t len;
907 char *cg;
908 char pathname[MAXPATHLEN];
909 size_t sz = 0, asz = 0;
910 struct dirent *dirent;
911 DIR *dir;
912
913 cfd = get_cgroup_fd(controller);
914 *list = NULL;
915 if (cfd < 0)
916 return false;
917
918 /* Make sure we pass a relative path to *at() family of functions. */
919 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
920 cg = alloca(len);
921 ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
922 if (ret < 0 || (size_t)ret >= len) {
923 lxcfs_error("Pathname too long under %s\n", cgroup);
924 return false;
925 }
926
927 fd = openat(cfd, cg, O_DIRECTORY);
928 if (fd < 0)
929 return false;
930
931 dir = fdopendir(fd);
932 if (!dir)
933 return false;
934
935 while ((dirent = readdir(dir))) {
936 struct stat mystat;
937
938 if (!strcmp(dirent->d_name, ".") ||
939 !strcmp(dirent->d_name, ".."))
940 continue;
941
942 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
943 if (ret < 0 || ret >= MAXPATHLEN) {
944 lxcfs_error("Pathname too long under %s\n", cg);
945 continue;
946 }
947
948 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
949 if (ret) {
950 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
951 continue;
952 }
953 if ((!directories && !S_ISREG(mystat.st_mode)) ||
954 (directories && !S_ISDIR(mystat.st_mode)))
955 continue;
956
957 if (sz+2 >= asz) {
958 void **tmp;
959 asz += BATCH_SIZE;
960 do {
961 tmp = realloc(*list, asz * typesize);
962 } while (!tmp);
963 *list = tmp;
964 }
965 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
966 (*list)[sz+1] = NULL;
967 sz++;
968 }
969 if (closedir(dir) < 0) {
970 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
971 return false;
972 }
973 return true;
974 }
975
976 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
977 {
978 char *dup;
979 do {
980 dup = strdup(dir_entry);
981 } while (!dup);
982 return dup;
983 }
984
/*
 * cgfs_list_children - fill *list with a NULL-terminated array of the
 * names of @cgroup's child cgroups (directories). The caller owns the
 * array and each entry.
 */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
989
990 void free_key(struct cgfs_files *k)
991 {
992 if (!k)
993 return;
994 free_disarm(k->name);
995 free_disarm(k);
996 }
997
/* free_keys - release a NULL-terminated array of cgfs_files entries (NULL-safe). */
void free_keys(struct cgfs_files **keys)
{
	if (!keys)
		return;

	for (struct cgfs_files **it = keys; *it; it++)
		free_key(*it);

	free_disarm(keys);
}
1009
/*
 * cgfs_param_exist - check whether <controller>/<cgroup>/<file> exists.
 * Returns true when the path is accessible (F_OK).
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int ret, cfd;
	size_t len;
	char *path;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Relative path for the *at() calls: "." + /cgroup + / + file + NUL. */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return faccessat(cfd, path, F_OK, 0) == 0;
}
1031
1032 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1033 {
1034 int ret, cfd;
1035 size_t len;
1036 char *fnam;
1037 struct stat sb;
1038 struct cgfs_files *newkey;
1039
1040 cfd = get_cgroup_fd(controller);
1041 if (cfd < 0)
1042 return false;
1043
1044 if (file && *file == '/')
1045 file++;
1046
1047 if (file && strchr(file, '/'))
1048 return NULL;
1049
1050 /* Make sure we pass a relative path to *at() family of functions.
1051 * . + /cgroup + / + file + \0
1052 */
1053 len = strlen(cgroup) + 3;
1054 if (file)
1055 len += strlen(file) + 1;
1056 fnam = alloca(len);
1057 snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
1058 file ? "/" : "", file ? file : "");
1059
1060 ret = fstatat(cfd, fnam, &sb, 0);
1061 if (ret < 0)
1062 return NULL;
1063
1064 do {
1065 newkey = malloc(sizeof(struct cgfs_files));
1066 } while (!newkey);
1067 if (file)
1068 newkey->name = must_copy_string(file);
1069 else if (strrchr(cgroup, '/'))
1070 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1071 else
1072 newkey->name = must_copy_string(cgroup);
1073 newkey->uid = sb.st_uid;
1074 newkey->gid = sb.st_gid;
1075 newkey->mode = sb.st_mode;
1076
1077 return newkey;
1078 }
1079
/*
 * make_key_list_entry - iterator for cgfs_list_keys(): build a
 * cgfs_files entry for one directory entry. Failures are logged and
 * NULL is returned (and stored in the result array).
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);

	if (!entry)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return entry;
}
1089
/*
 * cgfs_list_keys - fill *keys with a NULL-terminated array of
 * cgfs_files entries for every regular file in @cgroup. The caller
 * frees the result with free_keys().
 */
bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
{
	return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
}
1094
/*
 * is_child_cgroup - check whether @f is a child cgroup (a directory)
 * of <controller>/<cgroup>. Returns true only for directories.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *path;
	struct stat sb;

	cfd = get_cgroup_fd(controller);
	if (cfd < 0)
		return false;

	/* Relative path for the *at() calls: "." + /cgroup + / + f + NUL. */
	len = strlen(cgroup) + strlen(f) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1122
/* Result codes for send_creds()/recv_creds(). */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations for the pid-namespace credential handshake below. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
1130
/*
 * clone a task which switches to @task's namespace and writes '1'.
 * over a unix sock so we can read the task's reaper's pid in our
 * namespace
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	/* Stack for the clone()d child; one page on the caller's stack. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	/* Enter the target task's pid namespace before cloning. */
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	/* clone() takes the top of the stack (grows downward on supported arches). */
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}
1172
1173 static int send_creds_clone_wrapper(void *arg) {
1174 struct ucred cred;
1175 char v;
1176 int sock = *(int *)arg;
1177
1178 /* we are the child */
1179 cred.uid = 0;
1180 cred.gid = 0;
1181 cred.pid = 1;
1182 v = '1';
1183 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1184 return 1;
1185 return 0;
1186 }
1187
/*
 * get_init_pid_for_task - discover the init pid (in our namespace) of
 * the pid namespace that @task lives in.
 *
 * Forks a helper which enters @task's pidns and sends SCM_CREDENTIALS
 * as pid 1; the kernel translates that pid into our namespace, so the
 * cred.pid we receive here is the namespace's init as we see it.
 * Returns the translated pid or -1 on failure.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		/* Child: keep sock[0], do the namespace dance, then exit. */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
1221
/*
 * lookup_initpid_in_store - return the init pid for @qpid's pid
 * namespace, using the cache where possible.
 *
 * On a cache miss the answer is computed via get_init_pid_for_task()
 * and stored. Returns 0 when the namespace cannot be stat'ed or no
 * init pid could be determined.
 */
pid_t lookup_initpid_in_store(pid_t qpid)
{
	pid_t answer = 0;
	struct stat sb;
	struct pidns_init_store *e;
	char fnam[100];

	snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
	store_lock();
	if (stat(fnam, &sb) < 0)
		goto out;
	e = lookup_verify_initpid(&sb);
	if (e) {
		answer = e->initpid;
		goto out;
	}
	answer = get_init_pid_for_task(qpid);
	if (answer > 0)
		save_initpid(&sb, answer);

out:
	/* we prune at end in case we are returning
	 * the value we were about to return */
	prune_initpid_store();
	store_unlock();
	return answer;
}
1249
/*
 * Reap child @pid, retrying on EINTR.
 * Returns 0 if the child terminated normally with exit status 0,
 * -1 otherwise (including for non-positive pids).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);
		if (w == pid)
			break;
		if (w < 0 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
1270
1271
/*
 * append pid to *src.
 * src: a pointer to a char* in which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far
 * pid: the pid to append
 * Delegates to must_strcat() with a "%d\n" format, so each pid ends up
 * on its own line.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	must_strcat(src, sz, asz, "%d\n", (int)pid);
}
1283
/*
 * Given an open file * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.  Map lines have the form
 * "<ns base> <host base> <count>".
 * Returns the mapped id, or -1 (i.e. UINT_MAX) on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];
	int ret;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		if (ret != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("id wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid) which must be
			 * less than nsuid+(count) must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
1327
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * May the fuse caller (@pid running as @uid) act on an object owned by
 * @victim?  When @req_ns_root is set the caller must additionally map
 * to root (uid 0) inside its own user namespace; otherwise sharing the
 * uid is sufficient.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	bool allowed = false;
	char path[PROCLEN];
	uid_t mapped;
	int len;
	FILE *f;

	if (victim == -1 || uid == -1)
		return false;

	/* Same-uid access suffices unless ns-root is required. */
	if (uid == victim && !req_ns_root)
		return true;

	len = snprintf(path, PROCLEN, "/proc/%d/uid_map", pid);
	if (len < 0 || len >= PROCLEN)
		return false;

	f = fopen(path, "r");
	if (!f)
		return false;

	/* The caller must be root inside its own namespace. */
	mapped = convert_id_to_ns(f, uid);
	if (mapped != 0)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	mapped = convert_id_to_ns(f, victim);
	if (mapped != -1)
		allowed = true;

out:
	fclose(f);
	return allowed;
}
1383
/*
 * Does the rwx permission triplet @fmode (shifted to the "other" bit
 * positions by the caller) grant the access requested by the open
 * flags in @req_mode?
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	/* Required bits per O_ACCMODE value (O_RDONLY/O_WRONLY/O_RDWR). */
	static const mode_t required[] = {
		[O_RDONLY] = S_IROTH,
		[O_WRONLY] = S_IWOTH,
		[O_RDWR]   = S_IROTH | S_IWOTH,
	};
	mode_t acc = req_mode & O_ACCMODE;

	if (acc > O_RDWR)
		return false;

	return (fmode & required[acc]) == required[acc];
}
1403
1404
/*
 * Return the first path component of @taskcg lying below @querycg,
 * e.g. taskcg "/a/b/c/d/e" with querycg "/a/b/c" yields "d".
 * NOTE(review): the original header comment had the two argument roles
 * swapped; the length check below requires @taskcg to be the longer
 * path, matching how caller_is_in_ancestor() invokes us.
 * The result is heap-allocated and must be freed by the caller; NULL
 * is returned on bad input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *component, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		component = strdup(taskcg + 1);
	else
		component = strdup(taskcg + strlen(querycg) + 1);
	if (!component)
		return NULL;

	/* Keep only the first path component. */
	slash = strchr(component, '/');
	if (slash)
		*slash = '\0';

	return component;
}
1430
1431 char *get_pid_cgroup(pid_t pid, const char *contrl)
1432 {
1433 int cfd;
1434
1435 cfd = get_cgroup_fd(contrl);
1436 if (cfd < 0)
1437 return false;
1438
1439 if (pure_unified_layout(cgroup_ops))
1440 return cg_unified_get_current_cgroup(pid);
1441
1442 return cg_legacy_get_current_cgroup(pid, contrl);
1443 }
1444
1445 /*
1446 * check whether a fuse context may access a cgroup dir or file
1447 *
1448 * If file is not null, it is a cgroup file to check under cg.
1449 * If file is null, then we are checking perms on cg itself.
1450 *
1451 * For files we can check the mode of the list_keys result.
1452 * For cgroups, we must make assumptions based on the files under the
1453 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1454 * yet.
1455 */
1456 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1457 {
1458 struct cgfs_files *k = NULL;
1459 bool ret = false;
1460
1461 k = cgfs_get_key(contrl, cg, file);
1462 if (!k)
1463 return false;
1464
1465 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1466 if (perms_include(k->mode >> 6, mode)) {
1467 ret = true;
1468 goto out;
1469 }
1470 }
1471 if (fc->gid == k->gid) {
1472 if (perms_include(k->mode >> 3, mode)) {
1473 ret = true;
1474 goto out;
1475 }
1476 }
1477 ret = perms_include(k->mode, mode);
1478
1479 out:
1480 free_key(k);
1481 return ret;
1482 }
1483
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg in place
 * (systemd places init in its own scope).  A path that is exactly
 * "/init.scope" collapses to "/".
 */
void prune_init_slice(char *cg)
{
	size_t len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (len < suffix_len)
		return;

	suffix = cg + (len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0'; /* keep the leading '/' */
	else
		suffix[0] = '\0';
}
1501
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool in_ancestor = false;
	char *task_cgroup, *cmp;

	task_cgroup = get_pid_cgroup(pid, contrl);
	if (!task_cgroup)
		return false;
	prune_init_slice(task_cgroup);

	/*
	 * Callers pass '/' or './' (openat()) for the root cgroup, and a
	 * path without a leading '/' otherwise; skip our leading '/' in
	 * the latter case so the prefixes line up.
	 */
	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = task_cgroup;
	else
		cmp = task_cgroup + 1;

	if (strncmp(cmp, cg, strlen(cmp)) == 0)
		in_ancestor = true;
	else if (nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(task_cgroup);
	return in_ancestor;
}
1544
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible = false;
	char *own_cgroup, *task_cg;
	size_t query_len, task_len;

	/* Everyone may see the root. */
	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	own_cgroup = get_pid_cgroup(pid, contrl);
	if (!own_cgroup)
		return false;
	prune_init_slice(own_cgroup);

	task_cg = own_cgroup + 1; /* drop the leading '/' */
	query_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg; it can see everything.  The
		 * prefix checks below cannot handle this, since the '/'
		 * they test for is the one we chopped off above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		/* Exactly our own cgroup. */
		visible = true;
	} else if (query_len < task_len) {
		/* Looking up a parent dir of our cgroup. */
		if (strncmp(task_cg, cg, query_len) == 0 && task_cg[query_len] == '/')
			visible = true;
	} else {
		/* Looking up a child dir of our cgroup. */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			visible = true;
	}

	free(own_cgroup);
	return visible;
}
1595
1596 /*
1597 * given /cgroup/freezer/a/b, return "freezer".
1598 * the returned char* should NOT be freed.
1599 */
1600 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1601 {
1602 const char *p1;
1603 char *contr, *slash;
1604
1605 if (strlen(path) < 9) {
1606 errno = EACCES;
1607 return NULL;
1608 }
1609 if (*(path + 7) != '/') {
1610 errno = EINVAL;
1611 return NULL;
1612 }
1613 p1 = path + 8;
1614 contr = strdupa(p1);
1615 if (!contr) {
1616 errno = ENOMEM;
1617 return NULL;
1618 }
1619 slash = strstr(contr, "/");
1620 if (slash)
1621 *slash = '\0';
1622
1623 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1624 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1625 return (*h)->__controllers;
1626 }
1627 errno = ENOENT;
1628 return NULL;
1629 }
1630
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	/* Need at least "/cgroup/" plus a controller name. */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	/* Skip "/cgroup/" and locate the slash after the controller. */
	slash = strstr(path + 8, "/");
	if (!slash) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1651
1652 /*
1653 * split the last path element from the path in @cg.
1654 * @dir is newly allocated and should be freed, @last not
1655 */
1656 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1657 {
1658 char *p;
1659
1660 do {
1661 *dir = strdup(cg);
1662 } while (!*dir);
1663 *last = strrchr(cg, '/');
1664 if (!*last) {
1665 *last = NULL;
1666 return;
1667 }
1668 p = strrchr(*dir, '/');
1669 *p = '\0';
1670 }
1671
1672 /*
1673 * FUSE ops for /cgroup
1674 */
1675
/*
 * FUSE getattr for /cgroup paths.
 * Controllers and cgroup directories are synthesized as directories;
 * cgroup files take uid/gid/mode from the backend's key data.  All
 * timestamps are faked to "now".  Returns 0 or a negative errno.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* Common defaults: root-owned, zero-size, timestamps "now". */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1/path2: parent dir + final component (dir or keyname). */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Resolve access against the caller's pidns init; fall back to the
	 * caller itself for host/shared pid namespaces. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	/* Not a child cgroup: maybe a keyed file in the parent dir. */
	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
1785
1786 int cg_opendir(const char *path, struct fuse_file_info *fi)
1787 {
1788 struct fuse_context *fc = fuse_get_context();
1789 const char *cgroup;
1790 struct file_info *dir_info;
1791 char *controller = NULL;
1792
1793 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1794 return -EIO;
1795
1796 if (strcmp(path, "/cgroup") == 0) {
1797 cgroup = NULL;
1798 controller = NULL;
1799 } else {
1800 // return list of keys for the controller, and list of child cgroups
1801 controller = pick_controller_from_path(fc, path);
1802 if (!controller)
1803 return -errno;
1804
1805 cgroup = find_cgroup_in_path(path);
1806 if (!cgroup) {
1807 /* this is just /cgroup/controller, return its contents */
1808 cgroup = "/";
1809 }
1810 }
1811
1812 pid_t initpid = lookup_initpid_in_store(fc->pid);
1813 if (initpid <= 1 || is_shared_pidns(initpid))
1814 initpid = fc->pid;
1815 if (cgroup) {
1816 if (!caller_may_see_dir(initpid, controller, cgroup))
1817 return -ENOENT;
1818 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1819 return -EACCES;
1820 }
1821
1822 /* we'll free this at cg_releasedir */
1823 dir_info = malloc(sizeof(*dir_info));
1824 if (!dir_info)
1825 return -ENOMEM;
1826 dir_info->controller = must_copy_string(controller);
1827 dir_info->cgroup = must_copy_string(cgroup);
1828 dir_info->type = LXC_TYPE_CGDIR;
1829 dir_info->buf = NULL;
1830 dir_info->file = NULL;
1831 dir_info->buflen = 0;
1832
1833 fi->fh = (unsigned long)dir_info;
1834 return 0;
1835 }
1836
/*
 * FUSE readdir for /cgroup paths.  For the top level we list the legacy
 * controllers; for a cgroup dir we list its keys and child cgroups.  A
 * caller outside the cgroup's subtree only sees the next path component
 * leading toward its own cgroup.  Returns 0 or a negative errno.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	/* fi->fh must have been set up by cg_opendir(). */
	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		/*
		 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
		 * This only works with the legacy hierarchy.
		 */
		for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
			if (is_unified_hierarchy(*h))
				continue;

			if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
				return -EIO;
		}

		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* Caller is outside this subtree: show only the next dir
		 * component on the way to its own cgroup, if any. */
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* Emit the cgroup's key files. */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	/* clist and list are owned by us once the backend returns them. */
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
1927
1928 void do_release_file_info(struct fuse_file_info *fi)
1929 {
1930 struct file_info *f = (struct file_info *)fi->fh;
1931
1932 if (!f)
1933 return;
1934
1935 fi->fh = 0;
1936
1937 free_disarm(f->controller);
1938 free_disarm(f->cgroup);
1939 free_disarm(f->file);
1940 free_disarm(f->buf);
1941 free_disarm(f);
1942 }
1943
/* FUSE releasedir: free the per-directory state allocated in cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
1949
1950 int cg_open(const char *path, struct fuse_file_info *fi)
1951 {
1952 const char *cgroup;
1953 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1954 struct cgfs_files *k = NULL;
1955 struct file_info *file_info;
1956 struct fuse_context *fc = fuse_get_context();
1957 int ret;
1958
1959 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1960 return -EIO;
1961
1962 controller = pick_controller_from_path(fc, path);
1963 if (!controller)
1964 return -errno;
1965 cgroup = find_cgroup_in_path(path);
1966 if (!cgroup)
1967 return -errno;
1968
1969 get_cgdir_and_path(cgroup, &cgdir, &last);
1970 if (!last) {
1971 path1 = "/";
1972 path2 = cgdir;
1973 } else {
1974 path1 = cgdir;
1975 path2 = last;
1976 }
1977
1978 k = cgfs_get_key(controller, path1, path2);
1979 if (!k) {
1980 ret = -EINVAL;
1981 goto out;
1982 }
1983 free_key(k);
1984
1985 pid_t initpid = lookup_initpid_in_store(fc->pid);
1986 if (initpid <= 1 || is_shared_pidns(initpid))
1987 initpid = fc->pid;
1988 if (!caller_may_see_dir(initpid, controller, path1)) {
1989 ret = -ENOENT;
1990 goto out;
1991 }
1992 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1993 ret = -EACCES;
1994 goto out;
1995 }
1996
1997 /* we'll free this at cg_release */
1998 file_info = malloc(sizeof(*file_info));
1999 if (!file_info) {
2000 ret = -ENOMEM;
2001 goto out;
2002 }
2003 file_info->controller = must_copy_string(controller);
2004 file_info->cgroup = must_copy_string(path1);
2005 file_info->file = must_copy_string(path2);
2006 file_info->type = LXC_TYPE_CGFILE;
2007 file_info->buf = NULL;
2008 file_info->buflen = 0;
2009
2010 fi->fh = (unsigned long)file_info;
2011 ret = 0;
2012
2013 out:
2014 free(cgdir);
2015 return ret;
2016 }
2017
/*
 * FUSE access for /cgroup paths.  Mirrors the visibility and permission
 * checks done in cg_open(): read/execute on synthesized directories is
 * always allowed, writes and real keys go through fc_may_access().
 * Returns 0 or a negative errno.
 */
int cg_access(const char *path, int mode)
{
	int ret;
	const char *cgroup;
	char *path1, *path2, *controller;
	char *last = NULL, *cgdir = NULL;
	struct cgfs_files *k = NULL;
	struct fuse_context *fc = fuse_get_context();

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return 0;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
		if ((mode & W_OK) == 0)
			return 0;
		return -EACCES;
	}

	/* path1/path2: parent dir + final component (dir or keyname). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		/* Not a key file: treat like a directory (rx ok, w not). */
		if ((mode & W_OK) == 0)
			ret = 0;
		else
			ret = -EACCES;
		goto out;
	}
	free_key(k);

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, mode)) {
		ret = -EACCES;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	return ret;
}
2081
/* FUSE release: free the per-file state allocated in cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
2087
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait up to @timeout seconds for @sock to become readable (or hang
 * up).  Returns true when the socket is ready, false on timeout or
 * error.  On timeout errno is cleared; otherwise epoll_wait()'s errno
 * is preserved across the close() below.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	/* Recompute the remaining time budget after each EINTR. */
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may clobber errno; keep epoll_wait()'s value. */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2135
/*
 * recv() from @sockfd with a two-second readability timeout.
 * Returns the recv() result, or -1 when the socket never became ready.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (wait_for_sock(sockfd, 2))
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
2142
2143 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2144 {
2145 struct msghdr msg = { 0 };
2146 struct iovec iov;
2147 struct cmsghdr *cmsg;
2148 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2149 char buf[1];
2150 buf[0] = 'p';
2151
2152 if (pingfirst) {
2153 if (msgrecv(sock, buf, 1) != 1) {
2154 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2155 return SEND_CREDS_FAIL;
2156 }
2157 }
2158
2159 msg.msg_control = cmsgbuf;
2160 msg.msg_controllen = sizeof(cmsgbuf);
2161
2162 cmsg = CMSG_FIRSTHDR(&msg);
2163 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2164 cmsg->cmsg_level = SOL_SOCKET;
2165 cmsg->cmsg_type = SCM_CREDENTIALS;
2166 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2167
2168 msg.msg_name = NULL;
2169 msg.msg_namelen = 0;
2170
2171 buf[0] = v;
2172 iov.iov_base = buf;
2173 iov.iov_len = sizeof(buf);
2174 msg.msg_iov = &iov;
2175 msg.msg_iovlen = 1;
2176
2177 if (sendmsg(sock, &msg, 0) < 0) {
2178 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2179 if (errno == 3)
2180 return SEND_CREDS_NOTSK;
2181 return SEND_CREDS_FAIL;
2182 }
2183
2184 return SEND_CREDS_OK;
2185 }
2186
/*
 * Counterpart of send_creds(): enable SO_PASSCRED on @sock, ping the
 * peer with one byte so it knows we are ready, then receive its
 * SCM_CREDENTIALS message.  The one-byte payload is stored in *v and
 * the (kernel-translated) credentials in *cred.
 * Returns false on setup or receive failure.  If no well-formed
 * credentials control message arrives, *cred keeps the -1 defaults
 * set below while true is still returned.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	/* SO_PASSCRED makes the kernel attach translated credentials. */
	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	/* Ping the sender: it blocks in msgrecv() until this arrives. */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* Only accept a well-formed credentials control message. */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2244
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;   // pipe used to ACK the parent once the child is running
	int sock;     // socketpair end carrying the pid translation protocol
	pid_t tpid;   // pid whose namespace the parent joined before cloning
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2251
2252 /*
2253 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2254 * with clone(). This simply writes '1' as ACK back to the parent
2255 * before calling the actual wrapped function.
2256 */
2257 static int pid_ns_clone_wrapper(void *arg) {
2258 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2259 char b = '1';
2260
2261 close(args->cpipe[0]);
2262 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2263 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2264 close(args->cpipe[1]);
2265 return args->wrapped(args->sock, args->tpid);
2266 }
2267
2268 /*
2269 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2270 * int value back over the socket. This shifts the pid from the
2271 * sender's pidns into tpid's pidns.
2272 */
2273 static int pid_to_ns(int sock, pid_t tpid)
2274 {
2275 char v = '0';
2276 struct ucred cred;
2277
2278 while (recv_creds(sock, &cred, &v)) {
2279 if (v == '1')
2280 return 0;
2281 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2282 return 1;
2283 }
2284 return 0;
2285 }
2286
2287
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you clone will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	/* One page of stack for the clone child; stack grows downward. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	/*
	 * NOTE(review): wait_for_pid() returns 0 on success, so this exits
	 * with status 1 on success and 0 on failure.  do_read_pids() ignores
	 * the exit status, but confirm intent before relying on it.
	 */
	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2344
/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
 *
 * On success *d holds the newline-separated, translated pid list
 * (allocated via must_strcat_pid()); returns false on any failure.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	/* Fetch the untranslated pid list from the cgroup backend. */
	if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Send the pid as a credential so the kernel translates it. */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next; /* task is gone; skip this pid */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Tell the child (payload '1') that we are done. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2432
/*
 * FUSE read for a cgroup file opened with cg_open().  The value is
 * produced in one shot: any non-zero @offset returns 0 (EOF) rather
 * than continuing a partial read.  tasks/cgroup.procs are special-cased
 * so the pids are translated into the reader's pid namespace.
 * Returns the number of bytes copied or a negative errno.
 */
int cg_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = (struct file_info *)fi->fh;
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret, s;
	bool r;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	if (offset)
		return 0;

	if (!f->controller)
		return -EINVAL;

	/* Re-check that the key still exists. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);


	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
			strcmp(f->file, "/tasks") == 0 ||
			strcmp(f->file, "/cgroup.procs") == 0 ||
			strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	/* Copy out at most @size bytes, appending a newline if it fits. */
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	if (s > 0 && s < size && data[s-1] != '\n')
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}
2499
2500 static int pid_from_ns(int sock, pid_t tpid)
2501 {
2502 pid_t vpid;
2503 struct ucred cred;
2504 char v;
2505 int ret;
2506
2507 cred.uid = 0;
2508 cred.gid = 0;
2509 while (1) {
2510 if (!wait_for_sock(sock, 2)) {
2511 lxcfs_error("%s\n", "Timeout reading from parent.");
2512 return 1;
2513 }
2514 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2515 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2516 return 1;
2517 }
2518 if (vpid == -1) // done
2519 break;
2520 v = '0';
2521 cred.pid = vpid;
2522 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2523 v = '1';
2524 cred.pid = getpid();
2525 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2526 return 1;
2527 }
2528 }
2529 return 0;
2530 }
2531
/*
 * Forked-helper entry point: attach to @tpid's pid namespace, then
 * clone() a child that runs pid_from_ns() there so pids received over
 * @sock are interpreted in the target namespace.
 *
 * Never returns: every path terminates via _exit(), success only when
 * the clone child acked over the pipe and exited cleanly.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	/* Join the target's pid namespace; affects children we create. */
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe used by the clone child to ack that it is set up. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	/* One page carved off this frame serves as the clone child's stack;
	 * stack + stack_size is passed because the stack grows downward. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2577
/*
 * Given host @uid, return the uid to which it maps in
 * @pid's user namespace, or -1 if none.
 *
 * Returns true with the mapped id stored in *@answer on success,
 * false when the uid_map cannot be opened or no mapping exists.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char line[400];
	int ret;

	/* Bounded formatting instead of sprintf(); bail on truncation. */
	ret = snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return false;

	f = fopen(line, "r");
	if (!f)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	/* (uid_t)-1 is the "no mapping" sentinel from convert_id_to_ns(). */
	if (*answer == -1)
		return false;
	return true;
}
2599
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * On any failure *uid and *gid are left at -1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;
	/* Bounded formatting instead of sprintf(); bail on truncation. */
	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	/* "Uid:"/"Gid:" lines carry four ids; the first is the real one. */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2638
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 * . they are the same task,
 * . they are owned by the same uid,
 * . @r is root on the host, or
 * . @v's uid is mapped into @r's namespace where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (r_uid == victim_uid)
		return true;

	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(victim_uid, r, &mapped);
}
2664
/*
 * Write a caller-supplied pid list into the cgroup's pids file,
 * translating each pid from the writer's pid namespace into ours.
 *
 * A child is forked into @tpid's pid namespace; for every pid parsed
 * from @buf we send it over a socketpair and receive it back via SCM
 * credentials, so the kernel performs the namespace translation. Each
 * translated pid is authorized with may_move_pid() before being
 * written to the pids file.
 *
 * Returns true only if every pid was translated, authorized, written
 * and flushed successfully.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid); /* never returns; _exit()s */
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			/* '0' means the child translated the pid successfully. */
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				/* NOTE(review): pids are printed with no separator;
				 * presumably only one pid per write ever reaches this
				 * path - verify against the cg_write callers. */
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		/* Advance to the pid on the next line, if any. */
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose() flushes the buffered pids into the kernel file;
		 * a failed flush means the move was rejected. */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2743
2744 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2745 struct fuse_file_info *fi)
2746 {
2747 struct fuse_context *fc = fuse_get_context();
2748 char *localbuf = NULL;
2749 struct cgfs_files *k = NULL;
2750 struct file_info *f = (struct file_info *)fi->fh;
2751 bool r;
2752
2753 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2754 return -EIO;
2755
2756 if (f->type != LXC_TYPE_CGFILE) {
2757 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2758 return -EIO;
2759 }
2760
2761 if (offset)
2762 return 0;
2763
2764 localbuf = alloca(size+1);
2765 localbuf[size] = '\0';
2766 memcpy(localbuf, buf, size);
2767
2768 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2769 size = -EINVAL;
2770 goto out;
2771 }
2772
2773 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2774 size = -EACCES;
2775 goto out;
2776 }
2777
2778 if (strcmp(f->file, "tasks") == 0 ||
2779 strcmp(f->file, "/tasks") == 0 ||
2780 strcmp(f->file, "/cgroup.procs") == 0 ||
2781 strcmp(f->file, "cgroup.procs") == 0)
2782 // special case - we have to translate the pids
2783 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2784 else
2785 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2786
2787 if (!r)
2788 size = -EINVAL;
2789
2790 out:
2791 free_key(k);
2792 return size;
2793 }
2794
/*
 * FUSE chown handler for cgroup files and directories.
 *
 * The caller must be privileged (root in a namespace mapping the
 * file's owner) for the change to be allowed. Returns 0 or the
 * result of cgfs_chown_file() on success, negative errno otherwise.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	/* The top-level /cgroup directory itself can never be chowned. */
	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	/* Split into parent dir (path1) and final component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
2860
2861 int cg_chmod(const char *path, mode_t mode)
2862 {
2863 struct fuse_context *fc = fuse_get_context();
2864 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2865 struct cgfs_files *k = NULL;
2866 const char *cgroup;
2867 int ret;
2868
2869 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2870 return -EIO;
2871
2872 if (strcmp(path, "/cgroup") == 0)
2873 return -EPERM;
2874
2875 controller = pick_controller_from_path(fc, path);
2876 if (!controller)
2877 return errno == ENOENT ? -EPERM : -errno;
2878
2879 cgroup = find_cgroup_in_path(path);
2880 if (!cgroup)
2881 /* this is just /cgroup/controller */
2882 return -EPERM;
2883
2884 get_cgdir_and_path(cgroup, &cgdir, &last);
2885
2886 if (!last) {
2887 path1 = "/";
2888 path2 = cgdir;
2889 } else {
2890 path1 = cgdir;
2891 path2 = last;
2892 }
2893
2894 if (is_child_cgroup(controller, path1, path2)) {
2895 // get uid, gid, from '/tasks' file and make up a mode
2896 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2897 k = cgfs_get_key(controller, cgroup, "tasks");
2898
2899 } else
2900 k = cgfs_get_key(controller, path1, path2);
2901
2902 if (!k) {
2903 ret = -EINVAL;
2904 goto out;
2905 }
2906
2907 /*
2908 * This being a fuse request, the uid and gid must be valid
2909 * in the caller's namespace. So we can just check to make
2910 * sure that the caller is root in his uid, and privileged
2911 * over the file's current owner.
2912 */
2913 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2914 ret = -EPERM;
2915 goto out;
2916 }
2917
2918 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2919 ret = -EINVAL;
2920 goto out;
2921 }
2922
2923 ret = 0;
2924 out:
2925 free_key(k);
2926 free(cgdir);
2927 return ret;
2928 }
2929
/*
 * FUSE mkdir handler: create a child cgroup.
 *
 * The caller may only create cgroups below its own cgroup and must
 * have read/write access to the parent. Returns the cgfs_create()
 * result on success, negative errno otherwise.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1 is the parent directory the new cgroup lives in. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Resolve the caller to its container's init when possible. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
2983
/*
 * FUSE rmdir handler: remove a child cgroup.
 *
 * Only cgroups below the caller's own cgroup may be removed, and the
 * caller needs write access to the parent directory. Returns 0 on
 * success, negative errno otherwise.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Resolve the caller to its container's init when possible. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/* Deleting the caller's own cgroup is EBUSY, anything else
		 * outside its ancestry simply does not exist for it. */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3044
/* Return true if @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3051
/* Note that "memory.stat" in cgroup2 is hierarchical by default. */
/*
 * Parse a memory.stat blob into kB-scaled output values.
 *
 * @version selects the key scheme: the unified hierarchy uses bare
 * keys ("cache"), the legacy hierarchy the hierarchical "total_*"
 * keys. Values in @memstat are in bytes and are reported in kB.
 * Keys that do not appear leave their output untouched.
 *
 * Fix: values were previously scanned at hard-coded offsets sized for
 * the legacy "total_*" key names, which pointed into the middle of the
 * number for the shorter unified keys; scan right after the matched
 * key instead.
 */
static void parse_memstat(int version,
			  char *memstat,
			  unsigned long *cached,
			  unsigned long *active_anon,
			  unsigned long *inactive_anon,
			  unsigned long *active_file,
			  unsigned long *inactive_file,
			  unsigned long *unevictable,
			  unsigned long *shmem)
{
	bool unified = is_unified_controller(version);
	struct {
		const char *key;
		unsigned long *val;
	} fields[] = {
		{ unified ? "cache"         : "total_cache",         cached        },
		{ unified ? "active_anon"   : "total_active_anon",   active_anon   },
		{ unified ? "inactive_anon" : "total_inactive_anon", inactive_anon },
		{ unified ? "active_file"   : "total_active_file",   active_file   },
		{ unified ? "inactive_file" : "total_inactive_file", inactive_file },
		{ unified ? "unevictable"   : "total_unevictable",   unevictable   },
		{ unified ? "shmem"         : "total_shmem",         shmem         },
	};
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			size_t len = strlen(fields[i].key);

			if (strncmp(memstat, fields[i].key, len) != 0)
				continue;

			/* Value follows the key; stored in bytes, reported in kB. */
			sscanf(memstat + len, "%lu", fields[i].val);
			*fields[i].val /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3108
/*
 * Scan a blkio stat blob @str for the line "major:minor iotype" and
 * store the trailing number in *@v; *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *nl;

	memset(key, 0, 32);
	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*str) {
		if (strncmp(str, key, keylen) == 0) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		nl = strchr(str, '\n');
		if (!nl)
			return;
		str = nl + 1;
	}
}
3131
/*
 * Read @path into @d's cache buffer and serve the first @size bytes
 * from offset 0. The full length is recorded in d->size and any
 * remainder in d->cached so follow-up reads at offset > 0 can be
 * served from the cache. Returns the number of bytes copied into
 * @buf, or 0 on any error.
 */
int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t linelen = 0, total_len = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;

	f = fopen(path, "r");
	if (!f)
		return 0;

	/* Copy the file line by line into the cache buffer. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			return 0;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);

	/* Remember how much remains for reads at offset > 0. */
	if (d->size > total_len)
		d->cached = d->size - total_len;
	return total_len;
}
3170
3171 /*
3172 * FUSE ops for /proc
3173 */
3174
3175 static unsigned long get_memlimit(const char *cgroup, bool swap)
3176 {
3177 int ret;
3178 __do_free char *memlimit_str = NULL;
3179 unsigned long memlimit = -1;
3180
3181 if (swap)
3182 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3183 else
3184 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3185 if (ret > 0)
3186 memlimit = strtoul(memlimit_str, NULL, 10);
3187
3188 return memlimit;
3189 }
3190
3191 static unsigned long get_min_memlimit(const char *cgroup, bool swap)
3192 {
3193 __do_free char *copy = NULL;
3194 unsigned long memlimit = 0;
3195 unsigned long retlimit;
3196
3197 copy = strdup(cgroup);
3198 retlimit = get_memlimit(copy, swap);
3199
3200 while (strcmp(copy, "/") != 0) {
3201 char *it = copy;
3202
3203 it = dirname(it);
3204 memlimit = get_memlimit(it, swap);
3205 if (memlimit != -1 && memlimit < retlimit)
3206 retlimit = memlimit;
3207 };
3208
3209 return retlimit;
3210 }
3211
/*
 * Produce a container-scoped /proc/meminfo.
 *
 * The host's /proc/meminfo is read line by line; lines that depend on
 * the container's memory cgroup (MemTotal, MemFree, Swap*, Cached,
 * Active/Inactive, Shmem, ...) are replaced with values derived from
 * the cgroup's limits and usage, host-only lines (Slab, Buffers, ...)
 * are zeroed, and everything else is passed through. The rendered
 * text is cached in d->buf for reads at offset > 0.
 *
 * Falls back to the raw host file when no memory cgroup is found;
 * returns 0 on any other error.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cgroup = NULL, *line = NULL,
		       *memusage_str = NULL, *memstat_str = NULL,
		       *memswlimit_str = NULL, *memswusage_str = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
		      memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
		      inactive_anon = 0, active_file = 0, inactive_file = 0,
		      unevictable = 0, shmem = 0, hostswtotal = 0;
	size_t linelen = 0, total_len = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	int ret;

	/* Reads at offset > 0 are served from the cache built earlier. */
	if (offset) {
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	/* Resolve the caller to its container's init when possible. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cgroup = get_pid_cgroup(initpid, "memory");
	if (!cgroup)
		return read_file_fuse("/proc/meminfo", buf, size, d);

	prune_init_slice(cgroup);

	memlimit = get_min_memlimit(cgroup, false);

	ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
	if (ret < 0)
		return 0;

	ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
	if (ret < 0)
		return 0;
	parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
		      &active_file, &inactive_file, &unevictable, &shmem);

	/*
	 * Following values are allowed to fail, because swapaccount might be
	 * turned off for current kernel.
	 */
	ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
	if (ret >= 0)
		ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
	if (ret >= 0) {
		memswlimit = get_min_memlimit(cgroup, true);
		memswusage = strtoul(memswusage_str, NULL, 10);
		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	/* All values below are reported in kB. */
	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	f = fopen("/proc/meminfo", "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			/* Never report more memory than the host has. */
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
			   opts && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
			   memswusage > 0 && opts && opts->swap_off == false) {
			/* memsw counts mem+swap, so swap usage is the excess
			 * of memsw usage over plain memory usage. */
			unsigned long swaptotal = memswlimit,
				      swapusage = memusage > memswusage
						      ? 0
						      : memswusage - memusage,
				      swapfree = swapusage < swaptotal
						     ? swaptotal - swapusage
						     : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
				 active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
				 inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			return 0;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	return total_len;
}
3408
3409 /*
3410 * Read the cpuset.cpus for cg
3411 * Return the answer in a newly allocated string which must be freed
3412 */
3413 char *get_cpuset(const char *cg)
3414 {
3415 char *value = NULL;
3416 int ret;
3417
3418 ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3419 if (ret < 0)
3420 return NULL;
3421
3422 return value;
3423 }
3424
3425 bool cpu_in_cpuset(int cpu, const char *cpuset);
3426
/*
 * Return true if @line is a "processor : N" line whose cpu number N
 * is contained in @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3435
3436 /*
3437 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3438 * depending on `param`. Parameter value is returned throuh `value`.
3439 */
3440 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3441 {
3442 __do_free char *str = NULL;
3443 char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
3444
3445 snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
3446
3447 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
3448 return false;
3449
3450 if (sscanf(str, "%ld", value) != 1)
3451 return false;
3452
3453 return true;
3454 }
3455
3456 /*
3457 * Return the maximum number of visible CPUs based on CPU quotas.
3458 * If there is no quota set, zero is returned.
3459 */
3460 int max_cpu_count(const char *cg)
3461 {
3462 int rv, nprocs;
3463 int64_t cfs_quota, cfs_period;
3464 int nr_cpus_in_cpuset = 0;
3465 char *cpuset = NULL;
3466
3467 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3468 return 0;
3469
3470 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3471 return 0;
3472
3473 cpuset = get_cpuset(cg);
3474 if (cpuset)
3475 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3476
3477 if (cfs_quota <= 0 || cfs_period <= 0){
3478 if (nr_cpus_in_cpuset > 0)
3479 return nr_cpus_in_cpuset;
3480
3481 return 0;
3482 }
3483
3484 rv = cfs_quota / cfs_period;
3485
3486 /* In case quota/period does not yield a whole number, add one CPU for
3487 * the remainder.
3488 */
3489 if ((cfs_quota % cfs_period) > 0)
3490 rv += 1;
3491
3492 nprocs = get_nprocs();
3493
3494 if (rv > nprocs)
3495 rv = nprocs;
3496
3497 /* use min value in cpu quota and cpuset */
3498 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3499 rv = nr_cpus_in_cpuset;
3500
3501 return rv;
3502 }
3503
/*
 * Return the exact (possibly fractional) number of visible CPUs based
 * on CPU quotas. If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	int nprocs;
	double count;

	if (!read_cpu_cfs_param(cg, "quota", &quota) ||
	    !read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = (double)quota / (double)period;

	/* Never report more CPUs than the host actually has. */
	nprocs = get_nprocs();
	if (count > nprocs)
		count = nprocs;

	return count;
}
3532
/*
 * check whether this is a '^processor' line in /proc/cpuinfo
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3544
3545 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3546 struct fuse_file_info *fi)
3547 {
3548 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
3549 __do_fclose FILE *f = NULL;
3550 struct fuse_context *fc = fuse_get_context();
3551 struct file_info *d = (struct file_info *)fi->fh;
3552 size_t linelen = 0, total_len = 0;
3553 bool am_printing = false, firstline = true, is_s390x = false;
3554 int curcpu = -1, cpu, max_cpus = 0;
3555 bool use_view;
3556 char *cache = d->buf;
3557 size_t cache_size = d->buflen;
3558
3559 if (offset){
3560 int left;
3561
3562 if (offset > d->size)
3563 return -EINVAL;
3564
3565 if (!d->cached)
3566 return 0;
3567
3568 left = d->size - offset;
3569 total_len = left > size ? size: left;
3570 memcpy(buf, cache + offset, total_len);
3571
3572 return total_len;
3573 }
3574
3575 pid_t initpid = lookup_initpid_in_store(fc->pid);
3576 if (initpid <= 1 || is_shared_pidns(initpid))
3577 initpid = fc->pid;
3578 cg = get_pid_cgroup(initpid, "cpuset");
3579 if (!cg)
3580 return read_file_fuse("proc/cpuinfo", buf, size, d);
3581 prune_init_slice(cg);
3582
3583 cpuset = get_cpuset(cg);
3584 if (!cpuset)
3585 return 0;
3586
3587 use_view = cgroup_ops->can_use_cpuview(cgroup_ops);
3588 if (use_view)
3589 max_cpus = max_cpu_count(cg);
3590
3591 f = fopen("/proc/cpuinfo", "r");
3592 if (!f)
3593 return 0;
3594
3595 while (getline(&line, &linelen, f) != -1) {
3596 ssize_t l;
3597 if (firstline) {
3598 firstline = false;
3599 if (strstr(line, "IBM/S390") != NULL) {
3600 is_s390x = true;
3601 am_printing = true;
3602 continue;
3603 }
3604 }
3605 if (strncmp(line, "# processors:", 12) == 0)
3606 continue;
3607 if (is_processor_line(line)) {
3608 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3609 break;
3610 am_printing = cpuline_in_cpuset(line, cpuset);
3611 if (am_printing) {
3612 curcpu ++;
3613 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3614 if (l < 0) {
3615 perror("Error writing to cache");
3616 return 0;
3617 }
3618 if (l >= cache_size) {
3619 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3620 return 0;
3621 }
3622 cache += l;
3623 cache_size -= l;
3624 total_len += l;
3625 }
3626 continue;
3627 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3628 char *p;
3629 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3630 break;
3631 if (!cpu_in_cpuset(cpu, cpuset))
3632 continue;
3633 curcpu ++;
3634 p = strchr(line, ':');
3635 if (!p || !*p)
3636 return 0;
3637 p++;
3638 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3639 if (l < 0) {
3640 perror("Error writing to cache");
3641 return 0;
3642 }
3643 if (l >= cache_size) {
3644 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3645 return 0;
3646 }
3647 cache += l;
3648 cache_size -= l;
3649 total_len += l;
3650 continue;
3651
3652 }
3653 if (am_printing) {
3654 l = snprintf(cache, cache_size, "%s", line);
3655 if (l < 0) {
3656 perror("Error writing to cache");
3657 return 0;
3658 }
3659 if (l >= cache_size) {
3660 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3661 return 0;
3662 }
3663 cache += l;
3664 cache_size -= l;
3665 total_len += l;
3666 }
3667 }
3668
3669 if (is_s390x) {
3670 __do_free char *origcache = d->buf;
3671 ssize_t l;
3672
3673 d->buf = malloc(d->buflen);
3674 if (!d->buf) {
3675 d->buf = move_ptr(origcache);
3676 return 0;
3677 }
3678
3679 cache = d->buf;
3680 cache_size = d->buflen;
3681 total_len = 0;
3682 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3683 if (l < 0 || l >= cache_size)
3684 return 0;
3685
3686 cache_size -= l;
3687 cache += l;
3688 total_len += l;
3689 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3690 if (l < 0 || l >= cache_size)
3691 return 0;
3692
3693 cache_size -= l;
3694 cache += l;
3695 total_len += l;
3696 l = snprintf(cache, cache_size, "%s", origcache);
3697 if (l < 0 || l >= cache_size)
3698 return 0;
3699 total_len += l;
3700 }
3701
3702 d->cached = 1;
3703 d->size = total_len;
3704 if (total_len > size ) total_len = size;
3705
3706 /* read from off 0 */
3707 memcpy(buf, d->buf, total_len);
3708 return total_len;
3709 }
3710
3711 static uint64_t get_reaper_start_time(pid_t pid)
3712 {
3713 int ret;
3714 FILE *f;
3715 uint64_t starttime;
3716 /* strlen("/proc/") = 6
3717 * +
3718 * LXCFS_NUMSTRLEN64
3719 * +
3720 * strlen("/stat") = 5
3721 * +
3722 * \0 = 1
3723 * */
3724 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3725 char path[__PROC_PID_STAT_LEN];
3726 pid_t qpid;
3727
3728 qpid = lookup_initpid_in_store(pid);
3729 if (qpid <= 0) {
3730 /* Caller can check for EINVAL on 0. */
3731 errno = EINVAL;
3732 return 0;
3733 }
3734
3735 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3736 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3737 /* Caller can check for EINVAL on 0. */
3738 errno = EINVAL;
3739 return 0;
3740 }
3741
3742 f = fopen(path, "r");
3743 if (!f) {
3744 /* Caller can check for EINVAL on 0. */
3745 errno = EINVAL;
3746 return 0;
3747 }
3748
3749 /* Note that the *scanf() argument supression requires that length
3750 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3751 * at us. It's like telling someone you're not married and then asking
3752 * if you can bring your wife to the party.
3753 */
3754 ret = fscanf(f, "%*d " /* (1) pid %d */
3755 "%*s " /* (2) comm %s */
3756 "%*c " /* (3) state %c */
3757 "%*d " /* (4) ppid %d */
3758 "%*d " /* (5) pgrp %d */
3759 "%*d " /* (6) session %d */
3760 "%*d " /* (7) tty_nr %d */
3761 "%*d " /* (8) tpgid %d */
3762 "%*u " /* (9) flags %u */
3763 "%*u " /* (10) minflt %lu */
3764 "%*u " /* (11) cminflt %lu */
3765 "%*u " /* (12) majflt %lu */
3766 "%*u " /* (13) cmajflt %lu */
3767 "%*u " /* (14) utime %lu */
3768 "%*u " /* (15) stime %lu */
3769 "%*d " /* (16) cutime %ld */
3770 "%*d " /* (17) cstime %ld */
3771 "%*d " /* (18) priority %ld */
3772 "%*d " /* (19) nice %ld */
3773 "%*d " /* (20) num_threads %ld */
3774 "%*d " /* (21) itrealvalue %ld */
3775 "%" PRIu64, /* (22) starttime %llu */
3776 &starttime);
3777 if (ret != 1) {
3778 fclose(f);
3779 /* Caller can check for EINVAL on 0. */
3780 errno = EINVAL;
3781 return 0;
3782 }
3783
3784 fclose(f);
3785
3786 errno = 0;
3787 return starttime;
3788 }
3789
/* Return the reaper's start time converted from clock ticks to seconds
 * since boot. Returns 0 on failure.
 */
static double get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t ticks;
	int64_t tck;

	/* get_reaper_start_time() signals failure as 0 with errno == EINVAL. */
	ticks = get_reaper_start_time(pid);
	if (ticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	tck = sysconf(_SC_CLK_TCK);
	if (tck < 0 && errno == EINVAL) {
		lxcfs_debug(
			"%s\n",
			"failed to determine number of clock ticks in a second");
		return 0;
	}

	return (double)ticks / (uint64_t)tck;
}
3814
3815 static double get_reaper_age(pid_t pid)
3816 {
3817 uint64_t uptime_ms;
3818 double procstart, procage;
3819
3820 /* We need to substract the time the process has started since system
3821 * boot minus the time when the system has started to get the actual
3822 * reaper age.
3823 */
3824 procstart = get_reaper_start_time_in_sec(pid);
3825 procage = procstart;
3826 if (procstart > 0) {
3827 int ret;
3828 struct timespec spec;
3829
3830 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3831 if (ret < 0)
3832 return 0;
3833
3834 /* We could make this more precise here by using the tv_nsec
3835 * field in the timespec struct and convert it to milliseconds
3836 * and then create a double for the seconds and milliseconds but
3837 * that seems more work than it is worth.
3838 */
3839 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
3840 procage = (uptime_ms - (procstart * 1000)) / 1000;
3841 }
3842
3843 return procage;
3844 }
3845
3846 /*
3847 * Returns 0 on success.
3848 * It is the caller's responsibility to free `return_usage`, unless this
3849 * function returns an error.
3850 */
3851 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
3852 {
3853 __do_free char *usage_str = NULL;
3854 __do_free struct cpuacct_usage *cpu_usage = NULL;
3855 int cpucount = get_nprocs_conf();
3856 int read_pos = 0, read_cnt=0;
3857 int i, j, ret;
3858 int cg_cpu;
3859 uint64_t cg_user, cg_system;
3860 int64_t ticks_per_sec;
3861
3862 ticks_per_sec = sysconf(_SC_CLK_TCK);
3863
3864 if (ticks_per_sec < 0 && errno == EINVAL) {
3865 lxcfs_v(
3866 "%s\n",
3867 "read_cpuacct_usage_all failed to determine number of clock ticks "
3868 "in a second");
3869 return -1;
3870 }
3871
3872 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3873 if (!cpu_usage)
3874 return -ENOMEM;
3875
3876 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
3877 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3878 char *data = NULL;
3879 int i = 0, read_pos = 0, read_cnt=0;
3880 size_t sz = 0, asz = 0;
3881
3882 /* read cpuacct.usage_percpu instead. */
3883 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
3884 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
3885 return -1;
3886 lxcfs_v("usage_str: %s\n", usage_str);
3887
3888 /* convert cpuacct.usage_percpu into cpuacct.usage_all. */
3889 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
3890
3891 must_strcat(&data, &sz, &asz, "cpu user system\n");
3892
3893 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
3894 lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
3895 must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
3896 i++;
3897 read_pos += read_cnt;
3898 }
3899
3900 usage_str = data;
3901
3902 lxcfs_v("usage_str: %s\n", usage_str);
3903 }
3904
3905 if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
3906 lxcfs_error("read_cpuacct_usage_all reading first line from "
3907 "%s/cpuacct.usage_all failed.\n", cg);
3908 return -1;
3909 }
3910
3911 read_pos += read_cnt;
3912
3913 for (i = 0, j = 0; i < cpucount; i++) {
3914 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
3915 &cg_system, &read_cnt);
3916
3917 if (ret == EOF)
3918 break;
3919
3920 if (ret != 3) {
3921 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
3922 "failed.\n", cg);
3923 return -1;
3924 }
3925
3926 read_pos += read_cnt;
3927
3928 /* Convert the time from nanoseconds to USER_HZ */
3929 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
3930 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
3931 j++;
3932 }
3933
3934 *return_usage = move_ptr(cpu_usage);
3935 *size = cpucount;
3936 return 0;
3937 }
3938
3939 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
3940 {
3941 int i;
3942 unsigned long sum = 0;
3943
3944 for (i = 0; i < cpu_count; i++) {
3945 if (!newer[i].online)
3946 continue;
3947
3948 /* When cpuset is changed on the fly, the CPUs might get reordered.
3949 * We could either reset all counters, or check that the substractions
3950 * below will return expected results.
3951 */
3952 if (newer[i].user > older[i].user)
3953 diff[i].user = newer[i].user - older[i].user;
3954 else
3955 diff[i].user = 0;
3956
3957 if (newer[i].system > older[i].system)
3958 diff[i].system = newer[i].system - older[i].system;
3959 else
3960 diff[i].system = 0;
3961
3962 if (newer[i].idle > older[i].idle)
3963 diff[i].idle = newer[i].idle - older[i].idle;
3964 else
3965 diff[i].idle = 0;
3966
3967 sum += diff[i].user;
3968 sum += diff[i].system;
3969 sum += diff[i].idle;
3970 }
3971
3972 return sum;
3973 }
3974
3975 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
3976 {
3977 unsigned long free_space, to_add;
3978
3979 free_space = threshold - usage->user - usage->system;
3980
3981 if (free_space > usage->idle)
3982 free_space = usage->idle;
3983
3984 to_add = free_space > *surplus ? *surplus : free_space;
3985
3986 *counter += to_add;
3987 usage->idle -= to_add;
3988 *surplus -= to_add;
3989 }
3990
/* Walk one hash-bucket list of per-cgroup stat nodes and free every node
 * whose cgroup has disappeared (detected by the absence of its cpu.shares
 * file). Returns the new list head, or NULL if every node was removed.
 * The caller must hold the bucket's write lock.
 */
static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
{
	struct cg_proc_stat *first = NULL, *prev, *tmp;

	for (prev = NULL; node; ) {
		if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
			/* Cgroup is gone: unlink this node and free it. */
			tmp = node;
			lxcfs_debug("Removing stat node for %s\n", node->cg);

			if (prev)
				prev->next = node->next;
			else
				first = node->next;

			node = node->next;
			free_proc_stat_node(tmp);
		} else {
			/* Keep this node; the first kept node is the new head. */
			if (!first)
				first = node;
			prev = node;
			node = node->next;
		}
	}

	return first;
}
4017
#define PROC_STAT_PRUNE_INTERVAL 10
/* Prune stale per-cgroup stat nodes from the hash buckets, at most once
 * every PROC_STAT_PRUNE_INTERVAL seconds per bucket.
 */
static void prune_proc_stat_history(void)
{
	int i;
	time_t now = time(NULL);

	for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
		pthread_rwlock_wrlock(&proc_stat_history[i]->lock);

		/* NOTE(review): this returns — not continues — on the first
		 * recently-checked bucket, skipping all later buckets too.
		 * Presumably a cheap global rate limit; confirm intended. */
		if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
			pthread_rwlock_unlock(&proc_stat_history[i]->lock);
			return;
		}

		if (proc_stat_history[i]->next) {
			proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
			proc_stat_history[i]->lastcheck = now;
		}

		pthread_rwlock_unlock(&proc_stat_history[i]->lock);
	}
}
4040
4041 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4042 {
4043 struct cg_proc_stat *node;
4044
4045 pthread_rwlock_rdlock(&head->lock);
4046
4047 if (!head->next) {
4048 pthread_rwlock_unlock(&head->lock);
4049 return NULL;
4050 }
4051
4052 node = head->next;
4053
4054 do {
4055 if (strcmp(cg, node->cg) == 0)
4056 goto out;
4057 } while ((node = node->next));
4058
4059 node = NULL;
4060
4061 out:
4062 pthread_rwlock_unlock(&head->lock);
4063 prune_proc_stat_history();
4064 return node;
4065 }
4066
4067 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4068 {
4069 struct cg_proc_stat *node;
4070 int i;
4071
4072 node = malloc(sizeof(struct cg_proc_stat));
4073 if (!node)
4074 goto err;
4075
4076 node->cg = NULL;
4077 node->usage = NULL;
4078 node->view = NULL;
4079
4080 node->cg = malloc(strlen(cg) + 1);
4081 if (!node->cg)
4082 goto err;
4083
4084 strcpy(node->cg, cg);
4085
4086 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4087 if (!node->usage)
4088 goto err;
4089
4090 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4091
4092 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4093 if (!node->view)
4094 goto err;
4095
4096 node->cpu_count = cpu_count;
4097 node->next = NULL;
4098
4099 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4100 lxcfs_error("%s\n", "Failed to initialize node lock");
4101 goto err;
4102 }
4103
4104 for (i = 0; i < cpu_count; i++) {
4105 node->view[i].user = 0;
4106 node->view[i].system = 0;
4107 node->view[i].idle = 0;
4108 }
4109
4110 return node;
4111
4112 err:
4113 if (node && node->cg)
4114 free(node->cg);
4115 if (node && node->usage)
4116 free(node->usage);
4117 if (node && node->view)
4118 free(node->view);
4119 if (node)
4120 free(node);
4121
4122 return NULL;
4123 }
4124
4125 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4126 {
4127 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4128 struct cg_proc_stat_head *head = proc_stat_history[hash];
4129 struct cg_proc_stat *node, *rv = new_node;
4130
4131 pthread_rwlock_wrlock(&head->lock);
4132
4133 if (!head->next) {
4134 head->next = new_node;
4135 goto out;
4136 }
4137
4138 node = head->next;
4139
4140 for (;;) {
4141 if (strcmp(node->cg, new_node->cg) == 0) {
4142 /* The node is already present, return it */
4143 free_proc_stat_node(new_node);
4144 rv = node;
4145 goto out;
4146 }
4147
4148 if (node->next) {
4149 node = node->next;
4150 continue;
4151 }
4152
4153 node->next = new_node;
4154 goto out;
4155 }
4156
4157 out:
4158 pthread_rwlock_unlock(&head->lock);
4159 return rv;
4160 }
4161
4162 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4163 {
4164 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
4165
4166 /* Allocate new memory */
4167 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4168 if (!new_usage)
4169 return false;
4170
4171 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4172 if (!new_view)
4173 return false;
4174
4175 /* Copy existing data & initialize new elements */
4176 for (int i = 0; i < cpu_count; i++) {
4177 if (i < node->cpu_count) {
4178 new_usage[i].user = node->usage[i].user;
4179 new_usage[i].system = node->usage[i].system;
4180 new_usage[i].idle = node->usage[i].idle;
4181
4182 new_view[i].user = node->view[i].user;
4183 new_view[i].system = node->view[i].system;
4184 new_view[i].idle = node->view[i].idle;
4185 } else {
4186 new_usage[i].user = 0;
4187 new_usage[i].system = 0;
4188 new_usage[i].idle = 0;
4189
4190 new_view[i].user = 0;
4191 new_view[i].system = 0;
4192 new_view[i].idle = 0;
4193 }
4194 }
4195
4196 free(node->usage);
4197 node->usage = move_ptr(new_usage);
4198
4199 free(node->view);
4200 node->view = move_ptr(new_view);
4201 node->cpu_count = cpu_count;
4202
4203 return true;
4204 }
4205
/* Look up the stat node for @cg, creating and registering it from the
 * current @usage snapshot if it does not exist yet. If the host gained
 * cpus since the node was created, the node's arrays are grown.
 *
 * IMPORTANT: on success the node is returned with node->lock held — the
 * caller is responsible for unlocking it. Returns NULL on failure.
 */
static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
{
	int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
	struct cg_proc_stat_head *head = proc_stat_history[hash];
	struct cg_proc_stat *node;

	node = find_proc_stat_node(head, cg);

	if (!node) {
		node = new_proc_stat_node(usage, cpu_count, cg);
		if (!node)
			return NULL;

		/* add_proc_stat_node() may hand back an existing node if
		 * another thread registered one for this cgroup meanwhile. */
		node = add_proc_stat_node(node);
		lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
	}

	pthread_mutex_lock(&node->lock);

	/* If additional CPUs on the host have been enabled, CPU usage counter
	 * arrays have to be expanded */
	if (node->cpu_count < cpu_count) {
		lxcfs_debug("Expanding stat node %d->%d for %s\n",
				node->cpu_count, cpu_count, cg);

		if (!expand_proc_stat_node(node, cpu_count)) {
			pthread_mutex_unlock(&node->lock);
			lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
					node->cpu_count, cpu_count, cg);
			return NULL;
		}
	}

	return node;
}
4241
4242 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4243 {
4244 int i;
4245
4246 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4247 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4248
4249 for (i = 0; i < cpu_count; i++) {
4250 node->view[i].user = 0;
4251 node->view[i].system = 0;
4252 node->view[i].idle = 0;
4253 }
4254
4255 node->cpu_count = cpu_count;
4256 }
4257
4258 static int cpuview_proc_stat(const char *cg, const char *cpuset,
4259 struct cpuacct_usage *cg_cpu_usage,
4260 int cg_cpu_usage_size, FILE *f, char *buf,
4261 size_t buf_size)
4262 {
4263 __do_free char *line = NULL;
4264 __do_free struct cpuacct_usage *diff = NULL;
4265 size_t linelen = 0, total_len = 0, l;
4266 int curcpu = -1; /* cpu numbering starts at 0 */
4267 int physcpu, i;
4268 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4269 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4270 irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4271 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4272 unsigned long user_surplus = 0, system_surplus = 0;
4273 unsigned long total_sum, threshold;
4274 struct cg_proc_stat *stat_node;
4275 int nprocs = get_nprocs_conf();
4276
4277 if (cg_cpu_usage_size < nprocs)
4278 nprocs = cg_cpu_usage_size;
4279
4280 /* Read all CPU stats and stop when we've encountered other lines */
4281 while (getline(&line, &linelen, f) != -1) {
4282 int ret;
4283 char cpu_char[10]; /* That's a lot of cores */
4284 uint64_t all_used, cg_used;
4285
4286 if (strlen(line) == 0)
4287 continue;
4288
4289 /* not a ^cpuN line containing a number N */
4290 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
4291 break;
4292
4293 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4294 continue;
4295
4296 if (physcpu >= cg_cpu_usage_size)
4297 continue;
4298
4299 curcpu ++;
4300 cpu_cnt ++;
4301
4302 if (!cpu_in_cpuset(physcpu, cpuset)) {
4303 for (i = curcpu; i <= physcpu; i++)
4304 cg_cpu_usage[i].online = false;
4305 continue;
4306 }
4307
4308 if (curcpu < physcpu) {
4309 /* Some CPUs may be disabled */
4310 for (i = curcpu; i < physcpu; i++)
4311 cg_cpu_usage[i].online = false;
4312
4313 curcpu = physcpu;
4314 }
4315
4316 cg_cpu_usage[curcpu].online = true;
4317
4318 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4319 &user,
4320 &nice,
4321 &system,
4322 &idle,
4323 &iowait,
4324 &irq,
4325 &softirq,
4326 &steal,
4327 &guest,
4328 &guest_nice);
4329
4330 if (ret != 10)
4331 continue;
4332
4333 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4334 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4335
4336 if (all_used >= cg_used) {
4337 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4338
4339 } else {
4340 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4341 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4342 curcpu, cg, all_used, cg_used);
4343 cg_cpu_usage[curcpu].idle = idle;
4344 }
4345 }
4346
4347 /* Cannot use more CPUs than is available due to cpuset */
4348 if (max_cpus > cpu_cnt)
4349 max_cpus = cpu_cnt;
4350
4351 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4352
4353 if (!stat_node) {
4354 lxcfs_error("unable to find/create stat node for %s\n", cg);
4355 return 0;
4356 }
4357
4358 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4359 if (!diff) {
4360 return 0;
4361 }
4362
4363 /*
4364 * If the new values are LOWER than values stored in memory, it means
4365 * the cgroup has been reset/recreated and we should reset too.
4366 */
4367 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4368 if (!cg_cpu_usage[curcpu].online)
4369 continue;
4370
4371 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4372 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4373
4374 break;
4375 }
4376
4377 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4378
4379 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4380 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4381
4382 if (!stat_node->usage[curcpu].online)
4383 continue;
4384
4385 i++;
4386
4387 stat_node->usage[curcpu].user += diff[curcpu].user;
4388 stat_node->usage[curcpu].system += diff[curcpu].system;
4389 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4390
4391 if (max_cpus > 0 && i >= max_cpus) {
4392 user_surplus += diff[curcpu].user;
4393 system_surplus += diff[curcpu].system;
4394 }
4395 }
4396
4397 /* Calculate usage counters of visible CPUs */
4398 if (max_cpus > 0) {
4399 unsigned long diff_user = 0;
4400 unsigned long diff_system = 0;
4401 unsigned long diff_idle = 0;
4402 unsigned long max_diff_idle = 0;
4403 unsigned long max_diff_idle_index = 0;
4404 double exact_cpus;
4405
4406 /* threshold = maximum usage per cpu, including idle */
4407 threshold = total_sum / cpu_cnt * max_cpus;
4408
4409 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4410 if (!stat_node->usage[curcpu].online)
4411 continue;
4412
4413 i++;
4414
4415 if (i == max_cpus)
4416 break;
4417
4418 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4419 continue;
4420
4421 /* Add user */
4422 add_cpu_usage(&user_surplus, &diff[curcpu],
4423 &diff[curcpu].user, threshold);
4424
4425 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4426 continue;
4427
4428 /* If there is still room, add system */
4429 add_cpu_usage(&system_surplus, &diff[curcpu],
4430 &diff[curcpu].system, threshold);
4431 }
4432
4433 if (user_surplus > 0)
4434 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4435 if (system_surplus > 0)
4436 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4437
4438 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4439 if (!stat_node->usage[curcpu].online)
4440 continue;
4441
4442 i++;
4443
4444 if (i == max_cpus)
4445 break;
4446
4447 stat_node->view[curcpu].user += diff[curcpu].user;
4448 stat_node->view[curcpu].system += diff[curcpu].system;
4449 stat_node->view[curcpu].idle += diff[curcpu].idle;
4450
4451 user_sum += stat_node->view[curcpu].user;
4452 system_sum += stat_node->view[curcpu].system;
4453 idle_sum += stat_node->view[curcpu].idle;
4454
4455 diff_user += diff[curcpu].user;
4456 diff_system += diff[curcpu].system;
4457 diff_idle += diff[curcpu].idle;
4458 if (diff[curcpu].idle > max_diff_idle) {
4459 max_diff_idle = diff[curcpu].idle;
4460 max_diff_idle_index = curcpu;
4461 }
4462
4463 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4464 }
4465 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4466
4467 /* revise cpu usage view to support partial cpu case. */
4468 exact_cpus = exact_cpu_count(cg);
4469 if (exact_cpus < (double)max_cpus){
4470 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4471
4472 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4473 lxcfs_v("delta: %lu\n", delta);
4474 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4475 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4476 lxcfs_v("idle_sum after: %lu\n", idle_sum);
4477
4478 curcpu = max_diff_idle_index;
4479 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4480 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4481 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4482 }
4483 } else {
4484 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4485 if (!stat_node->usage[curcpu].online)
4486 continue;
4487
4488 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4489 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4490 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4491
4492 user_sum += stat_node->view[curcpu].user;
4493 system_sum += stat_node->view[curcpu].system;
4494 idle_sum += stat_node->view[curcpu].idle;
4495 }
4496 }
4497
4498 /* Render the file */
4499 /* cpu-all */
4500 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4501 user_sum,
4502 system_sum,
4503 idle_sum);
4504 lxcfs_v("cpu-all: %s\n", buf);
4505
4506 if (l < 0) {
4507 perror("Error writing to cache");
4508 return 0;
4509 }
4510 if (l >= buf_size) {
4511 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4512 return 0;
4513 }
4514
4515 buf += l;
4516 buf_size -= l;
4517 total_len += l;
4518
4519 /* Render visible CPUs */
4520 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4521 if (!stat_node->usage[curcpu].online)
4522 continue;
4523
4524 i++;
4525
4526 if (max_cpus > 0 && i == max_cpus)
4527 break;
4528
4529 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4530 i,
4531 stat_node->view[curcpu].user,
4532 stat_node->view[curcpu].system,
4533 stat_node->view[curcpu].idle);
4534 lxcfs_v("cpu: %s\n", buf);
4535
4536 if (l < 0) {
4537 perror("Error writing to cache");
4538 return 0;
4539
4540 }
4541 if (l >= buf_size) {
4542 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4543 return 0;
4544 }
4545
4546 buf += l;
4547 buf_size -= l;
4548 total_len += l;
4549 }
4550
4551 /* Pass the rest of /proc/stat, start with the last line read */
4552 l = snprintf(buf, buf_size, "%s", line);
4553
4554 if (l < 0) {
4555 perror("Error writing to cache");
4556 return 0;
4557
4558 }
4559 if (l >= buf_size) {
4560 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4561 return 0;
4562 }
4563
4564 buf += l;
4565 buf_size -= l;
4566 total_len += l;
4567
4568 /* Pass the rest of the host's /proc/stat */
4569 while (getline(&line, &linelen, f) != -1) {
4570 l = snprintf(buf, buf_size, "%s", line);
4571 if (l < 0) {
4572 perror("Error writing to cache");
4573 return 0;
4574 }
4575 if (l >= buf_size) {
4576 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4577 return 0;
4578 }
4579 buf += l;
4580 buf_size -= l;
4581 total_len += l;
4582 }
4583
4584 if (stat_node)
4585 pthread_mutex_unlock(&stat_node->lock);
4586 return total_len;
4587 }
4588
#define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
/* FUSE read handler for the container view of /proc/stat.
 *
 * Per-cpu lines are restricted to the caller's cpuset and renumbered from
 * 0; when cpuacct data is readable, user/system come from the cgroup and
 * idle is derived from the host line. Falls through to the raw host file
 * when the caller is in the host pid namespace (initpid == 1) or has no
 * cpuset cgroup. The rendered file is cached in d->buf so that non-zero
 * offsets can be served from the cache.
 *
 * Buffer layout trick: the first CPUALL_MAX_SIZE bytes of d->buf are
 * reserved for the aggregate "cpu" line (whose sums are only known after
 * all per-cpu lines were written); per-cpu lines are written after the
 * reserve and memmove'd down at the end.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
	__do_free struct cpuacct_usage *cg_cpu_usage = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	size_t linelen = 0, total_len = 0;
	int curcpu = -1; /* cpu numbering starts at 0 */
	int physcpu = 0;
	unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
		irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
	unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
		iowait_sum = 0, irq_sum = 0, softirq_sum = 0,
		steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
	char cpuall[CPUALL_MAX_SIZE];
	/* reserve for cpu all */
	char *cache = d->buf + CPUALL_MAX_SIZE;
	size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
	int cg_cpu_usage_size = 0;

	/* Follow-up reads are served from the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, d->buf + offset, total_len);
		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	lxcfs_v("initpid: %d\n", initpid);
	if (initpid <= 0)
		initpid = fc->pid;

	/*
	 * when container run with host pid namespace initpid == 1, cgroup will "/"
	 * we should return host os's /proc contents.
	 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
	 */
	if (initpid == 1) {
		return read_file_fuse("/proc/stat", buf, size, d);
	}

	cg = get_pid_cgroup(initpid, "cpuset");
	lxcfs_v("cg: %s\n", cg);
	if (!cg)
		return read_file_fuse("/proc/stat", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		return 0;

	/*
	 * Read cpuacct.usage_all for all CPUs.
	 * If the cpuacct cgroup is present, it is used to calculate the container's
	 * CPU usage. If not, values from the host's /proc/stat are used.
	 */
	if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
		lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
				"falling back to the host's /proc/stat");
	}

	f = fopen("/proc/stat", "r");
	if (!f)
		return 0;

	//skip first line
	if (getline(&line, &linelen, f) < 0) {
		lxcfs_error("%s\n", "proc_stat_read read first line failed.");
		return 0;
	}

	/* Prefer the cpu-limit-aware cpuview rendering when available. */
	if (cgroup_ops->can_use_cpuview(cgroup_ops) && cg_cpu_usage) {
		total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
				f, d->buf, d->buflen);
		goto out;
	}

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char cpu_char[10]; /* That's a lot of cores */
		char *c;
		uint64_t all_used, cg_used, new_idle;
		int ret;

		if (strlen(line) == 0)
			continue;
		if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
			/* not a ^cpuN line containing a number N, just print it */
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;
		}

		if (sscanf(cpu_char, "%d", &physcpu) != 1)
			continue;
		/* Hide cpus outside the container's cpuset. */
		if (!cpu_in_cpuset(physcpu, cpuset))
			continue;
		curcpu ++;

		ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
			   &user,
			   &nice,
			   &system,
			   &idle,
			   &iowait,
			   &irq,
			   &softirq,
			   &steal,
			   &guest,
			   &guest_nice);

		if (ret != 10 || !cg_cpu_usage) {
			/* Parse failure or no cpuacct data: copy the host
			 * line through, renumbered to the container index. */
			c = strchr(line, ' ');
			if (!c)
				continue;
			l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
			if (l < 0) {
				perror("Error writing to cache");
				return 0;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			if (ret != 10)
				continue;
		}

		if (cg_cpu_usage) {
			if (physcpu >= cg_cpu_usage_size)
				break;

			all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
			cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;

			if (all_used >= cg_used) {
				/* Host time not consumed by the cgroup shows
				 * up as idle time in the container. */
				new_idle = idle + (all_used - cg_used);

			} else {
				lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
						"%lu in cpuacct.usage_all; unable to determine idle time\n",
						curcpu, cg, all_used, cg_used);
				new_idle = idle;
			}

			l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
					curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
					new_idle);

			if (l < 0) {
				perror("Error writing to cache");
				return 0;

			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				return 0;
			}

			cache += l;
			cache_size -= l;
			total_len += l;

			user_sum += cg_cpu_usage[physcpu].user;
			system_sum += cg_cpu_usage[physcpu].system;
			idle_sum += new_idle;

		} else {
			user_sum += user;
			nice_sum += nice;
			system_sum += system;
			idle_sum += idle;
			iowait_sum += iowait;
			irq_sum += irq;
			softirq_sum += softirq;
			steal_sum += steal;
			guest_sum += guest;
			guest_nice_sum += guest_nice;
		}
	}

	cache = d->buf;

	/* Build the aggregate "cpu" line now that the sums are known. */
	int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
			user_sum,
			nice_sum,
			system_sum,
			idle_sum,
			iowait_sum,
			irq_sum,
			softirq_sum,
			steal_sum,
			guest_sum,
			guest_nice_sum);
	if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
		memcpy(cache, cpuall, cpuall_len);
		cache += cpuall_len;
	} else {
		/* shouldn't happen */
		lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
		cpuall_len = 0;
	}

	/* Slide the per-cpu lines down so they directly follow the
	 * aggregate line (regions may overlap, hence memmove). */
	memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
	total_len += cpuall_len;

out:
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;

	memcpy(buf, d->buf, total_len);
	return total_len;
}
4827
4828 /* This function retrieves the busy time of a group of tasks by looking at
4829 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
4832 * account as well. If someone has a clever solution for this please send a
4833 * patch!
4834 */
4835 static double get_reaper_busy(pid_t task)
4836 {
4837 __do_free char *cgroup = NULL, *usage_str = NULL;
4838 unsigned long usage = 0;
4839 pid_t initpid;
4840
4841 initpid = lookup_initpid_in_store(task);
4842 if (initpid <= 0)
4843 return 0;
4844
4845 cgroup = get_pid_cgroup(initpid, "cpuacct");
4846 if (!cgroup)
4847 return 0;
4848 prune_init_slice(cgroup);
4849 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage",
4850 &usage_str))
4851 return 0;
4852
4853 usage = strtoul(usage_str, NULL, 10);
4854 return ((double)usage / 1000000000);
4855 }
4856
4857 #if RELOADTEST
/* Reload-test hook: drop a marker file so the test suite can detect that
 * the reloaded binary actually ran. */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd >= 0)
		close(fd);
}
4866 #endif
4867
4868 /*
4869 * We read /proc/uptime and reuse its second field.
4870 * For the first field, we use the mtime for the reaper for
4871 * the calling pid as returned by getreaperage
4872 */
4873 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4874 struct fuse_file_info *fi)
4875 {
4876 struct fuse_context *fc = fuse_get_context();
4877 struct file_info *d = (struct file_info *)fi->fh;
4878 double busytime = get_reaper_busy(fc->pid);
4879 char *cache = d->buf;
4880 ssize_t total_len = 0;
4881 double idletime, reaperage;
4882
4883 #if RELOADTEST
4884 iwashere();
4885 #endif
4886
4887 if (offset){
4888 if (!d->cached)
4889 return 0;
4890 if (offset > d->size)
4891 return -EINVAL;
4892 int left = d->size - offset;
4893 total_len = left > size ? size: left;
4894 memcpy(buf, cache + offset, total_len);
4895 return total_len;
4896 }
4897
4898 reaperage = get_reaper_age(fc->pid);
4899 /* To understand why this is done, please read the comment to the
4900 * get_reaper_busy() function.
4901 */
4902 idletime = reaperage;
4903 if (reaperage >= busytime)
4904 idletime = reaperage - busytime;
4905
4906 total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
4907 if (total_len < 0 || total_len >= d->buflen){
4908 lxcfs_error("%s\n", "failed to write to cache");
4909 return 0;
4910 }
4911
4912 d->size = (int)total_len;
4913 d->cached = 1;
4914
4915 if (total_len > size) total_len = size;
4916
4917 memcpy(buf, d->buf, total_len);
4918 return total_len;
4919 }
4920
/*
 * Read handler for the virtualized /proc/diskstats.
 *
 * Per-device I/O counters are taken from the caller's blkio cgroup
 * (io_serviced, io_merged, io_service_bytes, io_wait_time, io_service_time)
 * and printed in /proc/diskstats format for every device that shows any
 * activity. Falls back to the host's /proc/diskstats when the blkio cgroup
 * cannot be determined or a counter is reported as unsupported.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *io_serviced_str = NULL,
		       *io_merged_str = NULL, *io_service_bytes_str = NULL,
		       *io_wait_time_str = NULL, *io_service_time_str = NULL,
		       *line = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	size_t linelen = 0, total_len = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	int ret;
	char dev_name[72];

	/* Continued read: serve the remainder from the cached buffer. */
	if (offset){
		int left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file_fuse("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/* Fetch the raw cgroup counter blobs. Only -EOPNOTSUPP falls back to
	 * the host file; other errors leave the corresponding string NULL. */
	ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	f = fopen("/proc/diskstats", "r");
	if (!f)
		return 0;

	/* Walk the host's device list but substitute per-cgroup counters. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* io_service_bytes is divided by 512 to yield sector counts. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* Times are divided by 1000000 — presumably ns -> ms as
		 * /proc/diskstats expects; confirm against blkio docs. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		memset(lbuf, 0, 256);
		/* Only emit devices that saw any I/O in this cgroup;
		 * ios_pgr and rq_ticks are always reported as 0. */
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			return 0;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			return 0;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	return total_len;
}
5064
5065 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5066 struct fuse_file_info *fi)
5067 {
5068 __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5069 *memswusage_str = NULL;
5070 struct fuse_context *fc = fuse_get_context();
5071 struct file_info *d = (struct file_info *)fi->fh;
5072 unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5073 memswusage = 0, swap_total = 0, swap_free = 0;
5074 ssize_t total_len = 0;
5075 ssize_t l = 0;
5076 char *cache = d->buf;
5077 int ret;
5078
5079 if (offset) {
5080 int left;
5081
5082 if (offset > d->size)
5083 return -EINVAL;
5084
5085 if (!d->cached)
5086 return 0;
5087
5088 left = d->size - offset;
5089 total_len = left > size ? size: left;
5090 memcpy(buf, cache + offset, total_len);
5091
5092 return total_len;
5093 }
5094
5095 pid_t initpid = lookup_initpid_in_store(fc->pid);
5096 if (initpid <= 1 || is_shared_pidns(initpid))
5097 initpid = fc->pid;
5098 cg = get_pid_cgroup(initpid, "memory");
5099 if (!cg)
5100 return read_file_fuse("/proc/swaps", buf, size, d);
5101 prune_init_slice(cg);
5102
5103 memlimit = get_min_memlimit(cg, false);
5104
5105 ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5106 if (ret < 0)
5107 return 0;
5108
5109 memusage = strtoul(memusage_str, NULL, 10);
5110
5111 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5112 if (ret >= 0)
5113 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5114 if (ret >= 0) {
5115 memswlimit = get_min_memlimit(cg, true);
5116 memswusage = strtoul(memswusage_str, NULL, 10);
5117 swap_total = (memswlimit - memlimit) / 1024;
5118 swap_free = (memswusage - memusage) / 1024;
5119 }
5120
5121 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5122
5123 /* When no mem + swap limit is specified or swapaccount=0*/
5124 if (!memswlimit) {
5125 __do_free char *line = NULL;
5126 __do_fclose FILE *f = NULL;
5127 size_t linelen = 0;
5128
5129 f = fopen("/proc/meminfo", "r");
5130 if (!f)
5131 return 0;
5132
5133 while (getline(&line, &linelen, f) != -1) {
5134 if (startswith(line, "SwapTotal:"))
5135 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5136 else if (startswith(line, "SwapFree:"))
5137 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5138 }
5139 }
5140
5141 if (swap_total > 0) {
5142 l = snprintf(d->buf + total_len, d->size - total_len,
5143 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5144 swap_total, swap_free);
5145 total_len += l;
5146 }
5147
5148 if (total_len < 0 || l < 0) {
5149 perror("Error writing to cache");
5150 return 0;
5151 }
5152
5153 d->cached = 1;
5154 d->size = (int)total_len;
5155
5156 if (total_len > size) total_len = size;
5157 memcpy(buf, d->buf, total_len);
5158 return total_len;
5159 }
5160
5161 /*
5162 * Find the process pid from cgroup path.
5163 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5164 * @pid_buf : put pid to pid_buf.
5165 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5166 * @depth : the depth of cgroup in container.
5167 * @sum : return the number of pid.
5168 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5169 */
5170 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5171 {
5172 __do_free char *path = NULL;
5173 __do_close_prot_errno int fd = -EBADF;
5174 __do_fclose FILE *f = NULL;
5175 __do_closedir DIR *dir = NULL;
5176 struct dirent *file;
5177 size_t linelen = 0;
5178 char *line = NULL;
5179 int pd;
5180 char **pid;
5181
5182 /* path = dpath + "/cgroup.procs" + /0 */
5183 path = malloc(strlen(dpath) + 20);
5184 if (!path)
5185 return sum;
5186
5187 strcpy(path, dpath);
5188 fd = openat(cfd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
5189 if (fd < 0)
5190 return sum;
5191
5192 dir = fdopendir(move_fd(fd));
5193 if (!dir)
5194 return sum;
5195
5196 while (((file = readdir(dir)) != NULL) && depth > 0) {
5197 if (strcmp(file->d_name, ".") == 0)
5198 continue;
5199
5200 if (strcmp(file->d_name, "..") == 0)
5201 continue;
5202
5203 if (file->d_type == DT_DIR) {
5204 __do_free char *path_dir = NULL;
5205
5206 /* path + '/' + d_name +/0 */
5207 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5208 if (!path_dir)
5209 return sum;
5210
5211 strcpy(path_dir, path);
5212 strcat(path_dir, "/");
5213 strcat(path_dir, file->d_name);
5214 pd = depth - 1;
5215 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5216 }
5217 }
5218
5219 strcat(path, "/cgroup.procs");
5220 fd = openat(cfd, path, O_RDONLY);
5221 if (fd < 0)
5222 return sum;
5223
5224 f = fdopen(move_fd(fd), "r");
5225 if (!f)
5226 return sum;
5227
5228 while (getline(&line, &linelen, f) != -1) {
5229 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5230 if (!pid)
5231 return sum;
5232 *pid_buf = pid;
5233
5234 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5235 if (!*(*pid_buf + sum))
5236 return sum;
5237
5238 strcpy(*(*pid_buf + sum), line);
5239 sum++;
5240 }
5241
5242 return sum;
5243 }
5244
5245 /*
5246 * calc_load calculates the load according to the following formula:
5247 * load1 = load0 * exp + active * (1 - exp)
5248 *
5249 * @load1: the new loadavg.
5250 * @load0: the former loadavg.
5251 * @active: the total number of running pid at this moment.
5252 * @exp: the fixed-point defined in the beginning.
5253 */
5254 static unsigned long
5255 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5256 {
5257 unsigned long newload;
5258
5259 active = active > 0 ? active * FIXED_1 : 0;
5260 newload = load * exp + active * (FIXED_1 - exp);
5261 if (active >= load)
5262 newload += FIXED_1 - 1;
5263
5264 return newload / FIXED_1;
5265 }
5266
5267 /*
5268 * Return 0 means that container p->cg is closed.
5269 * Return -1 means that error occurred in refresh.
5270 * Positive num equals the total number of pid.
5271 */
/*
 * Refresh one loadavg tracking node: count all tasks of all pids in the
 * cgroup, count those in state 'R' (running) or 'D' (uninterruptible),
 * record the largest pid seen, and fold the running count into the node's
 * exponentially-decayed avenrun[] via calc_load().
 *
 * Returns 0 when the cgroup is empty (caller removes the node), -1 on
 * error, otherwise the number of pids found.
 */
static int refresh_load(struct load_node *p, char *path)
{
	__do_free char *line = NULL;
	char **idbuf;
	char proc_path[256];
	int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
	size_t linelen = 0;
	int sum, length;
	struct dirent *file;

	idbuf = malloc(sizeof(char *));
	if (!idbuf)
		return -1;

	sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
	/* normal exit */
	if (sum == 0)
		goto out;

	for (i = 0; i < sum; i++) {
		__do_closedir DIR *dp = NULL;

		/*clean up '\n' */
		length = strlen(idbuf[i])-1;
		idbuf[i][length] = '\0';
		ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
		if (ret < 0 || ret > 255) {
			lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
			/* Set i = sum so err_out frees every collected pid. */
			i = sum;
			sum = -1;
			goto err_out;
		}

		dp = opendir(proc_path);
		if (!dp) {
			lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
			continue;
		}
		while ((file = readdir(dp)) != NULL) {
			__do_fclose FILE *f = NULL;

			/* NOTE: length-1 strncmp against "." already matches
			 * any dotfile including "..", so the second test below
			 * never fires. */
			if (strncmp(file->d_name, ".", 1) == 0)
				continue;
			if (strncmp(file->d_name, "..", 1) == 0)
				continue;
			total_pid++;
			/* We make the biggest pid become last_pid.*/
			ret = atof(file->d_name);
			last_pid = (ret > last_pid) ? ret : last_pid;

			ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
			if (ret < 0 || ret > 255) {
				lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
				i = sum;
				sum = -1;
				goto err_out;
			}

			f = fopen(proc_path, "r");
			if (f != NULL) {
				while (getline(&line, &linelen, f) != -1) {
					/* Find State */
					if ((line[0] == 'S') && (line[1] == 't'))
						break;
				}

				/* Assumes the line reads "State:\t<letter> ..."
				 * so the state letter sits at index 7 — TODO
				 * confirm; if no State line was found, this
				 * reads the last line read instead. */
				if ((line[7] == 'R') || (line[7] == 'D'))
					run_pid++;
			}
		}
	}
	/*Calculate the loadavg.*/
	p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
	p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
	p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
	p->run_pid = run_pid;
	p->total_pid = total_pid;
	p->last_pid = last_pid;

err_out:
	/* Free idbuf[0..i-1]; on the success path i == sum. */
	for (; i > 0; i--)
		free(idbuf[i-1]);
out:
	free(idbuf);
	return sum;
}
5358
5359 /*
5360 * Traverse the hash table and update it.
5361 */
/*
 * Loadavg refresher thread: every FLUSH_TIME seconds walk all LOAD_SIZE
 * hash buckets, refresh each tracked cgroup's load via refresh_load(), and
 * delete nodes whose cgroup no longer contains any pid. Exits when
 * loadavg_stop is set to 1 (see stop_load_daemon()).
 */
void *load_begin(void *arg)
{

	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				__do_free char *path = NULL;

				length = strlen(f->cg) + 2;
				/* strlen(f->cg) + '.' or '' + \0 */
				path = malloc(length);
				if (!path)
					goto out;

				ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					goto out;
				}

				sum = refresh_load(f, path);
				if (sum == 0)
					f = del_node(f, i);
				else
					/* The "goto out" error paths above land on this
					 * label too: skip the failed node and advance. */
					out: f = f->next;
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		time2 = clock();
		/* Sleep out the remainder of the flush interval.
		 * NOTE(review): clock() is CPU time, not wall time, and if the
		 * subtraction goes negative the int converts to a huge
		 * useconds_t for usleep() — confirm this is intended. */
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5419
5420 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5421 struct fuse_file_info *fi)
5422 {
5423 struct fuse_context *fc = fuse_get_context();
5424 struct file_info *d = (struct file_info *)fi->fh;
5425 pid_t initpid;
5426 char *cg;
5427 size_t total_len = 0;
5428 char *cache = d->buf;
5429 struct load_node *n;
5430 int hash;
5431 int cfd, rv = 0;
5432 unsigned long a, b, c;
5433
5434 if (offset) {
5435 if (offset > d->size)
5436 return -EINVAL;
5437 if (!d->cached)
5438 return 0;
5439 int left = d->size - offset;
5440 total_len = left > size ? size : left;
5441 memcpy(buf, cache + offset, total_len);
5442 return total_len;
5443 }
5444 if (!loadavg)
5445 return read_file_fuse("/proc/loadavg", buf, size, d);
5446
5447 initpid = lookup_initpid_in_store(fc->pid);
5448 if (initpid <= 1 || is_shared_pidns(initpid))
5449 initpid = fc->pid;
5450 cg = get_pid_cgroup(initpid, "cpu");
5451 if (!cg)
5452 return read_file_fuse("/proc/loadavg", buf, size, d);
5453
5454 prune_init_slice(cg);
5455 hash = calc_hash(cg) % LOAD_SIZE;
5456 n = locate_node(cg, hash);
5457
5458 /* First time */
5459 if (n == NULL) {
5460 cfd = get_cgroup_fd("cpu");
5461 if (cfd >= 0) {
5462 /*
5463 * In locate_node() above, pthread_rwlock_unlock() isn't used
5464 * because delete is not allowed before read has ended.
5465 */
5466 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5467 rv = 0;
5468 goto err;
5469 }
5470 do {
5471 n = malloc(sizeof(struct load_node));
5472 } while (!n);
5473
5474 do {
5475 n->cg = malloc(strlen(cg)+1);
5476 } while (!n->cg);
5477 strcpy(n->cg, cg);
5478 n->avenrun[0] = 0;
5479 n->avenrun[1] = 0;
5480 n->avenrun[2] = 0;
5481 n->run_pid = 0;
5482 n->total_pid = 1;
5483 n->last_pid = initpid;
5484 n->cfd = cfd;
5485 insert_node(&n, hash);
5486 }
5487 a = n->avenrun[0] + (FIXED_1/200);
5488 b = n->avenrun[1] + (FIXED_1/200);
5489 c = n->avenrun[2] + (FIXED_1/200);
5490 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5491 LOAD_INT(a), LOAD_FRAC(a),
5492 LOAD_INT(b), LOAD_FRAC(b),
5493 LOAD_INT(c), LOAD_FRAC(c),
5494 n->run_pid, n->total_pid, n->last_pid);
5495 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5496 if (total_len < 0 || total_len >= d->buflen) {
5497 lxcfs_error("%s\n", "Failed to write to cache");
5498 rv = 0;
5499 goto err;
5500 }
5501 d->size = (int)total_len;
5502 d->cached = 1;
5503
5504 if (total_len > size)
5505 total_len = size;
5506 memcpy(buf, d->buf, total_len);
5507 rv = total_len;
5508
5509 err:
5510 free(cg);
5511 return rv;
5512 }
5513 /* Return a positive number on success, return 0 on failure.*/
5514 pthread_t load_daemon(int load_use)
5515 {
5516 int ret;
5517 pthread_t pid;
5518
5519 ret = init_load();
5520 if (ret == -1) {
5521 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5522 return 0;
5523 }
5524 ret = pthread_create(&pid, NULL, load_begin, NULL);
5525 if (ret != 0) {
5526 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5527 load_free();
5528 return 0;
5529 }
5530 /* use loadavg, here loadavg = 1*/
5531 loadavg = load_use;
5532 return pid;
5533 }
5534
5535 /* Returns 0 on success. */
5536 int stop_load_daemon(pthread_t pid)
5537 {
5538 int s;
5539
5540 /* Signal the thread to gracefully stop */
5541 loadavg_stop = 1;
5542
5543 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5544 if (s != 0) {
5545 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5546 return -1;
5547 }
5548
5549 load_free();
5550 loadavg_stop = 0;
5551
5552 return 0;
5553 }
5554
/* Total number of bytes readable from @which, found by reading it through
 * (procfs files report st_size 0, so stat() is useless here).
 * Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char chunk[4096];
	off_t total = 0;
	size_t n;
	FILE *f;

	f = fopen(which, "r");
	if (!f)
		return 0;

	while ((n = fread(chunk, 1, sizeof(chunk), f)) > 0)
		total += n;

	fclose(f);

	return total;
}
5571
5572 int proc_getattr(const char *path, struct stat *sb)
5573 {
5574 struct timespec now;
5575
5576 memset(sb, 0, sizeof(struct stat));
5577 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5578 return -EINVAL;
5579 sb->st_uid = sb->st_gid = 0;
5580 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5581 if (strcmp(path, "/proc") == 0) {
5582 sb->st_mode = S_IFDIR | 00555;
5583 sb->st_nlink = 2;
5584 return 0;
5585 }
5586 if (strcmp(path, "/proc/meminfo") == 0 ||
5587 strcmp(path, "/proc/cpuinfo") == 0 ||
5588 strcmp(path, "/proc/uptime") == 0 ||
5589 strcmp(path, "/proc/stat") == 0 ||
5590 strcmp(path, "/proc/diskstats") == 0 ||
5591 strcmp(path, "/proc/swaps") == 0 ||
5592 strcmp(path, "/proc/loadavg") == 0) {
5593 sb->st_size = 0;
5594 sb->st_mode = S_IFREG | 00444;
5595 sb->st_nlink = 1;
5596 return 0;
5597 }
5598
5599 return -ENOENT;
5600 }
5601
5602 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5603 struct fuse_file_info *fi)
5604 {
5605 if (filler(buf, ".", NULL, 0) != 0 ||
5606 filler(buf, "..", NULL, 0) != 0 ||
5607 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5608 filler(buf, "meminfo", NULL, 0) != 0 ||
5609 filler(buf, "stat", NULL, 0) != 0 ||
5610 filler(buf, "uptime", NULL, 0) != 0 ||
5611 filler(buf, "diskstats", NULL, 0) != 0 ||
5612 filler(buf, "swaps", NULL, 0) != 0 ||
5613 filler(buf, "loadavg", NULL, 0) != 0)
5614 return -EINVAL;
5615 return 0;
5616 }
5617
5618 int proc_open(const char *path, struct fuse_file_info *fi)
5619 {
5620 int type = -1;
5621 struct file_info *info;
5622
5623 if (strcmp(path, "/proc/meminfo") == 0)
5624 type = LXC_TYPE_PROC_MEMINFO;
5625 else if (strcmp(path, "/proc/cpuinfo") == 0)
5626 type = LXC_TYPE_PROC_CPUINFO;
5627 else if (strcmp(path, "/proc/uptime") == 0)
5628 type = LXC_TYPE_PROC_UPTIME;
5629 else if (strcmp(path, "/proc/stat") == 0)
5630 type = LXC_TYPE_PROC_STAT;
5631 else if (strcmp(path, "/proc/diskstats") == 0)
5632 type = LXC_TYPE_PROC_DISKSTATS;
5633 else if (strcmp(path, "/proc/swaps") == 0)
5634 type = LXC_TYPE_PROC_SWAPS;
5635 else if (strcmp(path, "/proc/loadavg") == 0)
5636 type = LXC_TYPE_PROC_LOADAVG;
5637 if (type == -1)
5638 return -ENOENT;
5639
5640 info = malloc(sizeof(*info));
5641 if (!info)
5642 return -ENOMEM;
5643
5644 memset(info, 0, sizeof(*info));
5645 info->type = type;
5646
5647 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5648 do {
5649 info->buf = malloc(info->buflen);
5650 } while (!info->buf);
5651 memset(info->buf, 0, info->buflen);
5652 /* set actual size to buffer size */
5653 info->size = info->buflen;
5654
5655 fi->fh = (unsigned long)info;
5656 return 0;
5657 }
5658
/* FUSE access for /proc: the directory defers to the host's permission
 * check; every file below it is read-only.
 */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) != 0 ? -EACCES : 0;
}
5669
/* FUSE release for /proc files: free the per-open file_info. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
5675
5676 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5677 struct fuse_file_info *fi)
5678 {
5679 struct file_info *f = (struct file_info *) fi->fh;
5680
5681 switch (f->type) {
5682 case LXC_TYPE_PROC_MEMINFO:
5683 return proc_meminfo_read(buf, size, offset, fi);
5684 case LXC_TYPE_PROC_CPUINFO:
5685 return proc_cpuinfo_read(buf, size, offset, fi);
5686 case LXC_TYPE_PROC_UPTIME:
5687 return proc_uptime_read(buf, size, offset, fi);
5688 case LXC_TYPE_PROC_STAT:
5689 return proc_stat_read(buf, size, offset, fi);
5690 case LXC_TYPE_PROC_DISKSTATS:
5691 return proc_diskstats_read(buf, size, offset, fi);
5692 case LXC_TYPE_PROC_SWAPS:
5693 return proc_swaps_read(buf, size, offset, fi);
5694 case LXC_TYPE_PROC_LOADAVG:
5695 return proc_loadavg_read(buf, size, offset, fi);
5696 default:
5697 return -EINVAL;
5698 }
5699 }
5700
5701 /*
5702 * Functions needed to setup cgroups in the __constructor__.
5703 */
5704
5705 static bool umount_if_mounted(void)
5706 {
5707 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5708 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5709 return false;
5710 }
5711 return true;
5712 }
5713
5714 /* __typeof__ should be safe to use with all compilers. */
5715 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5716 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5717 {
5718 return (fs->f_type == (fs_type_magic)magic_val);
5719 }
5720
5721 /*
5722 * looking at fs/proc_namespace.c, it appears we can
5723 * actually expect the rootfs entry to very specifically contain
5724 * " - rootfs rootfs "
5725 * IIUC, so long as we've chrooted so that rootfs is not our root,
5726 * the rootfs entry should always be skipped in mountinfo contents.
5727 */
static bool is_on_ramfs(void)
{
	bool ret = false;
	char *field, *end;
	char *buf = NULL;
	size_t buflen = 0;
	int skip;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (getline(&buf, &buflen, f) != -1) {
		/* Skip to the 5th space-separated field: the mount point. */
		field = buf;
		for (skip = 0; field && skip < 4; skip++)
			field = strchr(field + 1, ' ');
		if (!field)
			continue;

		end = strchr(field + 1, ' ');
		if (!end)
			continue;
		*end = '\0';

		if (strcmp(field + 1, "/") != 0)
			continue;

		/* This is '/'. Is its filesystem the rootfs ramfs? */
		field = strchr(end + 1, '-');
		if (field && strncmp(field, "- rootfs rootfs ", 16) == 0) {
			ret = true;
			break;
		}
	}

	free(buf);
	fclose(f);
	return ret;
}
5763
5764 static int pivot_enter()
5765 {
5766 int ret = -1, oldroot = -1, newroot = -1;
5767
5768 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5769 if (oldroot < 0) {
5770 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5771 return ret;
5772 }
5773
5774 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5775 if (newroot < 0) {
5776 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5777 goto err;
5778 }
5779
5780 /* change into new root fs */
5781 if (fchdir(newroot) < 0) {
5782 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5783 goto err;
5784 }
5785
5786 /* pivot_root into our new root fs */
5787 if (pivot_root(".", ".") < 0) {
5788 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5789 goto err;
5790 }
5791
5792 /*
5793 * At this point the old-root is mounted on top of our new-root.
5794 * To unmounted it we must not be chdir'd into it, so escape back
5795 * to the old-root.
5796 */
5797 if (fchdir(oldroot) < 0) {
5798 lxcfs_error("%s\n", "Failed to enter old root.");
5799 goto err;
5800 }
5801
5802 if (umount2(".", MNT_DETACH) < 0) {
5803 lxcfs_error("%s\n", "Failed to detach old root.");
5804 goto err;
5805 }
5806
5807 if (fchdir(newroot) < 0) {
5808 lxcfs_error("%s\n", "Failed to re-enter new root.");
5809 goto err;
5810 }
5811
5812 ret = 0;
5813
5814 err:
5815 if (oldroot > 0)
5816 close(oldroot);
5817 if (newroot > 0)
5818 close(newroot);
5819
5820 return ret;
5821 }
5822
5823 static int chroot_enter()
5824 {
5825 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5826 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5827 return -1;
5828 }
5829
5830 if (chroot(".") < 0) {
5831 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5832 return -1;
5833 }
5834
5835 if (chdir("/") < 0) {
5836 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5837 return -1;
5838 }
5839
5840 return 0;
5841 }
5842
/* Enter the prepared new root: chroot() when / is a ramfs, pivot_root()
 * otherwise. Returns 0 on success, -1 on failure.
 */
static int permute_and_enter(void)
{
	struct statfs sb;
	bool on_ramfs;

	if (statfs("/", &sb) < 0) {
		lxcfs_error("%s\n", "Could not stat / mountpoint.");
		return -1;
	}

	/* has_fs_type() is not reliable: when the ramfs is a tmpfs it will
	 * likely report TMPFS_MAGIC, so when the magic says no we still
	 * consult /proc/1/mountinfo via is_on_ramfs(). */
	on_ramfs = has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs();
	if (on_ramfs)
		return chroot_enter();

	if (pivot_enter() < 0) {
		lxcfs_error("%s\n", "Could not perform pivot root.");
		return -1;
	}

	return 0;
}
5865
5866 /* Prepare our new clean root. */
5867 static int permute_prepare(void)
5868 {
5869 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5870 lxcfs_error("%s\n", "Failed to create directory for new root.");
5871 return -1;
5872 }
5873
5874 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5875 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5876 return -1;
5877 }
5878
5879 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5880 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5881 return -1;
5882 }
5883
5884 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5885 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5886 return -1;
5887 }
5888
5889 return 0;
5890 }
5891
5892 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
/* Calls chroot() on ramfs, pivot_root() in all other cases. */
static bool permute_root(void)
{
	/* Prepare new root. */
	if (permute_prepare() < 0)
		return false;

	/* Pivot into new root. */
	return permute_and_enter() == 0;
}
5905
/*
 * Prepare the private cgroup mount area: create BASEDIR, drop any stale
 * mount on it, unshare a new mount namespace (preserving an fd to it in
 * cgroup_ops->mntns_fd), make / private, and mount a tmpfs at BASEDIR.
 * NOTE: the step order matters — do not reorder these calls.
 */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	/* Everything below happens in a fresh mount namespace so the host's
	 * mount table stays untouched. */
	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt");
	if (cgroup_ops->mntns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep our mounts from propagating back to the host. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
5941
5942 static bool cgfs_mount_hierarchies(void)
5943 {
5944 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
5945 return false;
5946
5947 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
5948 return false;
5949
5950 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
5951 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
5952 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
5953 if ((*h)->fd < 0)
5954 return false;
5955 }
5956
5957 return true;
5958 }
5959
/* Orchestrate the private cgroup setup: prepare the mount namespace,
 * mount the hierarchies, then pivot into the new root.
 */
static bool cgfs_setup_controllers(void)
{
	bool mounted;

	if (!cgfs_prepare_mounts())
		return false;

	mounted = cgfs_mount_hierarchies();
	if (!mounted) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
5975
5976 static void __attribute__((constructor)) lxcfs_init(void)
5977 {
5978 __do_close_prot_errno int init_ns = -EBADF;
5979 char *cret;
5980 char cwd[MAXPATHLEN];
5981
5982 cgroup_ops = cgroup_init();
5983 if (!cgroup_ops)
5984 log_exit("Failed to initialize cgroup support");
5985
5986 /* Preserve initial namespace. */
5987 init_ns = preserve_ns(getpid(), "mnt");
5988 if (init_ns < 0)
5989 log_exit("Failed to preserve initial mount namespace");
5990
5991 cret = getcwd(cwd, MAXPATHLEN);
5992 log_exit("%s - Could not retrieve current working directory", strerror(errno));
5993
5994 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
5995 * to privately mount lxcfs cgroups. */
5996 if (!cgfs_setup_controllers())
5997 log_exit("Failed to setup private cgroup mounts for lxcfs");
5998
5999 if (setns(init_ns, 0) < 0)
6000 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
6001
6002 if (!cret || chdir(cwd) < 0)
6003 log_exit("%s - Could not change back to original working directory", strerror(errno));
6004
6005 if (!init_cpuview())
6006 log_exit("Failed to init CPU view");
6007
6008 print_subsystems();
6009 }
6010
/* Library destructor: tear down the CPU view bookkeeping and release the
 * cgroup layer state acquired in lxcfs_init().
 */
static void __attribute__((destructor)) lxcfs_exit(void)
{
	lxcfs_debug("%s\n", "Running destructor for liblxcfs");
	free_cpuview();
	cgroup_exit(cgroup_ops);
}