bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdarg.h>
  21 #include <stdbool.h>
  22 #include <stdint.h>
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <time.h>
  27 #include <unistd.h>
  28 #include <wait.h>
  29 #include <linux/magic.h>
  30 #include <linux/sched.h>
  31 #include <sys/epoll.h>
  32 #include <sys/mman.h>
  33 #include <sys/mount.h>
  34 #include <sys/param.h>
  35 #include <sys/socket.h>
  36 #include <sys/syscall.h>
  37 #include <sys/sysinfo.h>
  38 #include <sys/vfs.h>
  39
  40 #include "bindings.h"
  41 #include "cgroups/cgroup.h"
  42 #include "cgroups/cgroup_utils.h"
  43 #include "memory_utils.h"
  44 #include "config.h"
  45
  46 /* Define pivot_root() if missing from the C library */
  47 #ifndef HAVE_PIVOT_ROOT
  48 static int pivot_root(const char * new_root, const char * put_old)
  49 {
  50 #ifdef __NR_pivot_root
  51 return syscall(__NR_pivot_root, new_root, put_old);
  52 #else
  53 errno = ENOSYS;
  54 return -1;
  55 #endif
  56 }
  57 #else
  58 extern int pivot_root(const char * new_root, const char * put_old);
  59 #endif
  60
  61 struct cpuacct_usage {
  62         uint64_t user;
  63         uint64_t system;
  64         uint64_t idle;
  65         bool online;
  66 };
  67
  68 /* The function of hash table.*/
  69 #define LOAD_SIZE 100 /*the size of hash_table */
  70 #define FLUSH_TIME 5  /*the flush rate */
  71 #define DEPTH_DIR 3   /*the depth of per cgroup */
  72 /* The function of calculate loadavg .*/
  73 #define FSHIFT          11              /* nr of bits of precision */
  74 #define FIXED_1         (1<<FSHIFT)     /* 1.0 as fixed-point */
  75 #define EXP_1           1884            /* 1/exp(5sec/1min) as fixed-point */
  76 #define EXP_5           2014            /* 1/exp(5sec/5min) */
  77 #define EXP_15          2037            /* 1/exp(5sec/15min) */
  78 #define LOAD_INT(x) ((x) >> FSHIFT)
  79 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
  80 /*
  81  * This parameter is used for proc_loadavg_read().
  82  * 1 means use loadavg, 0 means not use.
  83  */
  84 static int loadavg = 0;
  85 static volatile sig_atomic_t loadavg_stop = 0;
  86 static int calc_hash(const char *name)
  87 {
  88         unsigned int hash = 0;
  89         unsigned int x = 0;
  90         /* ELFHash algorithm. */
  91         while (*name) {
  92                 hash = (hash << 4) + *name++;
  93                 x = hash & 0xf0000000;
  94                 if (x != 0)
  95                         hash ^= (x >> 24);
  96                 hash &= ~x;
  97         }
  98         return (hash & 0x7fffffff);
  99 }
 100
 101 struct load_node {
 102         char *cg;  /*cg */
 103         unsigned long avenrun[3];               /* Load averages */
 104         unsigned int run_pid;
 105         unsigned int total_pid;
 106         unsigned int last_pid;
 107         int cfd; /* The file descriptor of the mounted cgroup */
 108         struct  load_node *next;
 109         struct  load_node **pre;
 110 };
 111
 112 struct load_head {
 113         /*
 114          * The lock is about insert load_node and refresh load_node.To the first
 115          * load_node of each hash bucket, insert and refresh in this hash bucket is
 116          * mutually exclusive.
 117          */
 118         pthread_mutex_t lock;
 119         /*
 120          * The rdlock is about read loadavg and delete load_node.To each hash
 121          * bucket, read and delete is mutually exclusive. But at the same time, we
 122          * allow paratactic read operation. This rdlock is at list level.
 123          */
 124         pthread_rwlock_t rdlock;
 125         /*
 126          * The rilock is about read loadavg and insert load_node.To the first
 127          * load_node of each hash bucket, read and insert is mutually exclusive.
 128          * But at the same time, we allow paratactic read operation.
 129          */
 130         pthread_rwlock_t rilock;
 131         struct load_node *next;
 132 };
 133
 134 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 135 /*
 136  * init_load initialize the hash table.
 137  * Return 0 on success, return -1 on failure.
 138  */
 139 static int init_load(void)
 140 {
 141         int i;
 142         int ret;
 143
 144         for (i = 0; i < LOAD_SIZE; i++) {
 145                 load_hash[i].next = NULL;
 146                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 147                 if (ret != 0) {
 148                         lxcfs_error("%s\n", "Failed to initialize lock");
 149                         goto out3;
 150                 }
 151                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 152                 if (ret != 0) {
 153                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 154                         goto out2;
 155                 }
 156                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 157                 if (ret != 0) {
 158                         lxcfs_error("%s\n", "Failed to initialize rilock");
 159                         goto out1;
 160                 }
 161         }
 162         return 0;
 163 out1:
 164         pthread_rwlock_destroy(&load_hash[i].rdlock);
 165 out2:
 166         pthread_mutex_destroy(&load_hash[i].lock);
 167 out3:
 168         while (i > 0) {
 169                 i--;
 170                 pthread_mutex_destroy(&load_hash[i].lock);
 171                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 172                 pthread_rwlock_destroy(&load_hash[i].rilock);
 173         }
 174         return -1;
 175 }
 176
 177 static void insert_node(struct load_node **n, int locate)
 178 {
 179         struct load_node *f;
 180
 181         pthread_mutex_lock(&load_hash[locate].lock);
 182         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 183         f = load_hash[locate].next;
 184         load_hash[locate].next = *n;
 185
 186         (*n)->pre = &(load_hash[locate].next);
 187         if (f)
 188                 f->pre = &((*n)->next);
 189         (*n)->next = f;
 190         pthread_mutex_unlock(&load_hash[locate].lock);
 191         pthread_rwlock_unlock(&load_hash[locate].rilock);
 192 }
 193 /*
 194  * locate_node() finds special node. Not return NULL means success.
 195  * It should be noted that rdlock isn't unlocked at the end of code
 196  * because this function is used to read special node. Delete is not
 197  * allowed before read has ended.
 198  * unlock rdlock only in proc_loadavg_read().
 199  */
 200 static struct load_node *locate_node(char *cg, int locate)
 201 {
 202         struct load_node *f = NULL;
 203         int i = 0;
 204
 205         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 206         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 207         if (load_hash[locate].next == NULL) {
 208                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 209                 return f;
 210         }
 211         f = load_hash[locate].next;
 212         pthread_rwlock_unlock(&load_hash[locate].rilock);
 213         while (f && ((i = strcmp(f->cg, cg)) != 0))
 214                 f = f->next;
 215         return f;
 216 }
 217
 218 /* Delete the load_node n and return the next node of it. */
 219 static struct load_node *del_node(struct load_node *n, int locate)
 220 {
 221         struct load_node *g;
 222
 223         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 224         if (n->next == NULL) {
 225                 *(n->pre) = NULL;
 226         } else {
 227                 *(n->pre) = n->next;
 228                 n->next->pre = n->pre;
 229         }
 230         g = n->next;
 231         free_disarm(n->cg);
 232         free_disarm(n);
 233         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 234         return g;
 235 }
 236
 237 static void load_free(void)
 238 {
 239         struct load_node *f, *p;
 240
 241         for (int i = 0; i < LOAD_SIZE; i++) {
 242                 pthread_mutex_lock(&load_hash[i].lock);
 243                 pthread_rwlock_wrlock(&load_hash[i].rilock);
 244                 pthread_rwlock_wrlock(&load_hash[i].rdlock);
 245                 if (load_hash[i].next == NULL) {
 246                         pthread_mutex_unlock(&load_hash[i].lock);
 247                         pthread_mutex_destroy(&load_hash[i].lock);
 248                         pthread_rwlock_unlock(&load_hash[i].rilock);
 249                         pthread_rwlock_destroy(&load_hash[i].rilock);
 250                         pthread_rwlock_unlock(&load_hash[i].rdlock);
 251                         pthread_rwlock_destroy(&load_hash[i].rdlock);
 252                         continue;
 253                 }
 254
 255                 for (f = load_hash[i].next; f;) {
 256                         free_disarm(f->cg);
 257                         p = f->next;
 258                         free_disarm(f);
 259                         f = p;
 260                 }
 261
 262                 pthread_mutex_unlock(&load_hash[i].lock);
 263                 pthread_mutex_destroy(&load_hash[i].lock);
 264                 pthread_rwlock_unlock(&load_hash[i].rilock);
 265                 pthread_rwlock_destroy(&load_hash[i].rilock);
 266                 pthread_rwlock_unlock(&load_hash[i].rdlock);
 267                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 268         }
 269 }
 270
 271 /* Data for CPU view */
 272 struct cg_proc_stat {
 273         char *cg;
 274         struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
 275         struct cpuacct_usage *view; // Usage stats reported to the container
 276         int cpu_count;
 277         pthread_mutex_t lock; // For node manipulation
 278         struct cg_proc_stat *next;
 279 };
 280
 281 struct cg_proc_stat_head {
 282         struct cg_proc_stat *next;
 283         time_t lastcheck;
 284
 285         /*
 286          * For access to the list. Reading can be parallel, pruning is exclusive.
 287          */
 288         pthread_rwlock_t lock;
 289 };
 290
 291 #define CPUVIEW_HASH_SIZE 100
 292 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
 293
 294 static bool cpuview_init_head(struct cg_proc_stat_head **head)
 295 {
 296         *head = malloc(sizeof(struct cg_proc_stat_head));
 297         if (!(*head)) {
 298                 lxcfs_error("%s\n", strerror(errno));
 299                 return false;
 300         }
 301
 302         (*head)->lastcheck = time(NULL);
 303         (*head)->next = NULL;
 304
 305         if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
 306                 lxcfs_error("%s\n", "Failed to initialize list lock");
 307                 free_disarm(*head);
 308                 return false;
 309         }
 310
 311         return true;
 312 }
 313
 314 static bool init_cpuview()
 315 {
 316         int i;
 317
 318         for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
 319                 proc_stat_history[i] = NULL;
 320
 321         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 322                 if (!cpuview_init_head(&proc_stat_history[i]))
 323                         goto err;
 324         }
 325
 326         return true;
 327
 328 err:
 329         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 330                 if (proc_stat_history[i])
 331                         free_disarm(proc_stat_history[i]);
 332         }
 333
 334         return false;
 335 }
 336
 337 static void free_proc_stat_node(struct cg_proc_stat *node)
 338 {
 339         pthread_mutex_destroy(&node->lock);
 340         free_disarm(node->cg);
 341         free_disarm(node->usage);
 342         free_disarm(node->view);
 343         free_disarm(node);
 344 }
 345
 346 static void cpuview_free_head(struct cg_proc_stat_head *head)
 347 {
 348         struct cg_proc_stat *node, *tmp;
 349
 350         if (head->next) {
 351                 node = head->next;
 352
 353                 for (;;) {
 354                         tmp = node;
 355                         node = node->next;
 356                         free_proc_stat_node(tmp);
 357
 358                         if (!node)
 359                                 break;
 360                 }
 361         }
 362
 363         pthread_rwlock_destroy(&head->lock);
 364         free_disarm(head);
 365 }
 366
 367 static void free_cpuview()
 368 {
 369         int i;
 370
 371         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
 372                 if (proc_stat_history[i])
 373                         cpuview_free_head(proc_stat_history[i]);
 374         }
 375 }
 376
 377 /*
 378  * A table caching which pid is init for a pid namespace.
 379  * When looking up which pid is init for $qpid, we first
 380  * 1. Stat /proc/$qpid/ns/pid.
 381  * 2. Check whether the ino_t is in our store.
 382  *   a. if not, fork a child in qpid's ns to send us
 383  *       ucred.pid = 1, and read the initpid.  Cache
 384  *       initpid and creation time for /proc/initpid
 385  *       in a new store entry.
 386  *   b. if so, verify that /proc/initpid still matches
 387  *       what we have saved.  If not, clear the store
 388  *       entry and go back to a.  If so, return the
 389  *       cached initpid.
 390  */
 391 struct pidns_init_store {
 392         ino_t ino;          // inode number for /proc/$pid/ns/pid
 393         pid_t initpid;      // the pid of nit in that ns
 394         long int ctime;     // the time at which /proc/$initpid was created
 395         struct pidns_init_store *next;
 396         long int lastcheck;
 397 };
 398
 399 /* lol - look at how they are allocated in the kernel */
 400 #define PIDNS_HASH_SIZE 4096
 401 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 402
 403 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 404 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 405 static void lock_mutex(pthread_mutex_t *l)
 406 {
 407         int ret;
 408
 409         if ((ret = pthread_mutex_lock(l)) != 0) {
 410                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 411                 exit(1);
 412         }
 413 }
 414
 415 struct cgroup_ops *cgroup_ops;
 416
 417 static void unlock_mutex(pthread_mutex_t *l)
 418 {
 419         int ret;
 420
 421         if ((ret = pthread_mutex_unlock(l)) != 0) {
 422                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 423                 exit(1);
 424         }
 425 }
 426
 427 static void store_lock(void)
 428 {
 429         lock_mutex(&pidns_store_mutex);
 430 }
 431
 432 static void store_unlock(void)
 433 {
 434         unlock_mutex(&pidns_store_mutex);
 435 }
 436
 437 /* Must be called under store_lock */
 438 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 439 {
 440         struct stat initsb;
 441         char fnam[100];
 442
 443         snprintf(fnam, 100, "/proc/%d", e->initpid);
 444         if (stat(fnam, &initsb) < 0)
 445                 return false;
 446
 447         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 448                     initsb.st_ctime, e->initpid);
 449
 450         if (e->ctime != initsb.st_ctime)
 451                 return false;
 452         return true;
 453 }
 454
 455 /* Must be called under store_lock */
 456 static void remove_initpid(struct pidns_init_store *e)
 457 {
 458         struct pidns_init_store *tmp;
 459         int h;
 460
 461         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 462
 463         h = HASH(e->ino);
 464         if (pidns_hash_table[h] == e) {
 465                 pidns_hash_table[h] = e->next;
 466                 free_disarm(e);
 467                 return;
 468         }
 469
 470         tmp = pidns_hash_table[h];
 471         while (tmp) {
 472                 if (tmp->next == e) {
 473                         tmp->next = e->next;
 474                         free_disarm(e);
 475                         return;
 476                 }
 477                 tmp = tmp->next;
 478         }
 479 }
 480
 481 #define PURGE_SECS 5
 482 /* Must be called under store_lock */
 483 static void prune_initpid_store(void)
 484 {
 485         static long int last_prune = 0;
 486         struct pidns_init_store *e, *prev, *delme;
 487         long int now, threshold;
 488         int i;
 489
 490         if (!last_prune) {
 491                 last_prune = time(NULL);
 492                 return;
 493         }
 494         now = time(NULL);
 495         if (now < last_prune + PURGE_SECS)
 496                 return;
 497
 498         lxcfs_debug("%s\n", "Pruning.");
 499
 500         last_prune = now;
 501         threshold = now - 2 * PURGE_SECS;
 502
 503         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 504                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 505                         if (e->lastcheck < threshold) {
 506
 507                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 508
 509                                 delme = e;
 510                                 if (prev)
 511                                         prev->next = e->next;
 512                                 else
 513                                         pidns_hash_table[i] = e->next;
 514                                 e = e->next;
 515                                 free_disarm(delme);
 516                         } else {
 517                                 prev = e;
 518                                 e = e->next;
 519                         }
 520                 }
 521         }
 522 }
 523
 524 /* Must be called under store_lock */
 525 static void save_initpid(struct stat *sb, pid_t pid)
 526 {
 527         struct pidns_init_store *e;
 528         char fpath[100];
 529         struct stat procsb;
 530         int h;
 531
 532         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 533
 534         snprintf(fpath, 100, "/proc/%d", pid);
 535         if (stat(fpath, &procsb) < 0)
 536                 return;
 537         do {
 538                 e = malloc(sizeof(*e));
 539         } while (!e);
 540         e->ino = sb->st_ino;
 541         e->initpid = pid;
 542         e->ctime = procsb.st_ctime;
 543         h = HASH(e->ino);
 544         e->next = pidns_hash_table[h];
 545         e->lastcheck = time(NULL);
 546         pidns_hash_table[h] = e;
 547 }
 548
 549 /*
 550  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 551  * entry for the inode number and creation time.  Verify that the init pid
 552  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 553  * otherwise.
 554  * Must be called under store_lock
 555  */
 556 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 557 {
 558         int h = HASH(sb->st_ino);
 559         struct pidns_init_store *e = pidns_hash_table[h];
 560
 561         while (e) {
 562                 if (e->ino == sb->st_ino) {
 563                         if (initpid_still_valid(e, sb)) {
 564                                 e->lastcheck = time(NULL);
 565                                 return e;
 566                         }
 567                         remove_initpid(e);
 568                         return NULL;
 569                 }
 570                 e = e->next;
 571         }
 572
 573         return NULL;
 574 }
 575
 576 static int is_dir(const char *path, int fd)
 577 {
 578         struct stat statbuf;
 579         int ret = fstatat(fd, path, &statbuf, fd);
 580         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 581                 return 1;
 582         return 0;
 583 }
 584
 585 static int preserve_ns(const int pid, const char *ns)
 586 {
 587         int ret;
 588 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
 589 #define __NS_PATH_LEN 50
 590         char path[__NS_PATH_LEN];
 591
 592         /* This way we can use this function to also check whether namespaces
 593          * are supported by the kernel by passing in the NULL or the empty
 594          * string.
 595          */
 596         ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
 597                        !ns || strcmp(ns, "") == 0 ? "" : "/",
 598                        !ns || strcmp(ns, "") == 0 ? "" : ns);
 599         if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
 600                 errno = EFBIG;
 601                 return -1;
 602         }
 603
 604         return open(path, O_RDONLY | O_CLOEXEC);
 605 }
 606
 607 /**
 608  * in_same_namespace - Check whether two processes are in the same namespace.
 609  * @pid1 - PID of the first process.
 610  * @pid2 - PID of the second process.
 611  * @ns   - Name of the namespace to check. Must correspond to one of the names
 612  *         for the namespaces as shown in /proc/<pid/ns/
 613  *
 614  * If the two processes are not in the same namespace returns an fd to the
 615  * namespace of the second process identified by @pid2. If the two processes are
 616  * in the same namespace returns -EINVAL, -1 if an error occurred.
 617  */
 618 static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
 619 {
 620         __do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
 621         int ret = -1;
 622         struct stat ns_st1, ns_st2;
 623
 624         ns_fd1 = preserve_ns(pid1, ns);
 625         if (ns_fd1 < 0) {
 626                 /* The kernel does not support this namespace. This is not an
 627                  * error.
 628                  */
 629                 if (errno == ENOENT)
 630                         return -EINVAL;
 631
 632                 return -1;
 633         }
 634
 635         ns_fd2 = preserve_ns(pid2, ns);
 636         if (ns_fd2 < 0)
 637                 return -1;
 638
 639         ret = fstat(ns_fd1, &ns_st1);
 640         if (ret < 0)
 641                 return -1;
 642
 643         ret = fstat(ns_fd2, &ns_st2);
 644         if (ret < 0)
 645                 return -1;
 646
 647         /* processes are in the same namespace */
 648         if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
 649                 return -EINVAL;
 650
 651         /* processes are in different namespaces */
 652         return move_fd(ns_fd2);
 653 }
 654
 655 static bool is_shared_pidns(pid_t pid)
 656 {
 657         if (pid != 1)
 658                 return false;
 659
 660         if (in_same_namespace(pid, getpid(), "pid") == -EINVAL)
 661                 return true;
 662
 663         return false;
 664 }
 665
 666 static bool write_string(const char *fnam, const char *string, int fd)
 667 {
 668         FILE *f;
 669         size_t len, ret;
 670
 671         f = fdopen(fd, "w");
 672         if (!f)
 673                 return false;
 674
 675         len = strlen(string);
 676         ret = fwrite(string, 1, len, f);
 677         if (ret != len) {
 678                 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
 679                             strerror(errno), string, fnam);
 680                 fclose(f);
 681                 return false;
 682         }
 683
 684         if (fclose(f) < 0) {
 685                 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
 686                 return false;
 687         }
 688
 689         return true;
 690 }
 691
 692 struct cgfs_files {
 693         char *name;
 694         uint32_t uid, gid;
 695         uint32_t mode;
 696 };
 697
 698 static void print_subsystems(void)
 699 {
 700         int i = 0;
 701
 702         fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
 703         fprintf(stderr, "hierarchies:\n");
 704         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
 705                 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
 706                 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
 707         }
 708 }
 709
 710 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 711                 const char *value)
 712 {
 713         int ret, fd, cfd;
 714         size_t len;
 715         char *fnam;
 716
 717         cfd = get_cgroup_fd(controller);
 718         if (cfd < 0)
 719                 return false;
 720
 721         /* Make sure we pass a relative path to *at() family of functions.
 722          * . + /cgroup + / + file + \0
 723          */
 724         len = strlen(cgroup) + strlen(file) + 3;
 725         fnam = alloca(len);
 726         ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
 727         if (ret < 0 || (size_t)ret >= len)
 728                 return false;
 729
 730         fd = openat(cfd, fnam, O_WRONLY);
 731         if (fd < 0)
 732                 return false;
 733
 734         return write_string(fnam, value, fd);
 735 }
 736
 737 // Chown all the files in the cgroup directory.  We do this when we create
 738 // a cgroup on behalf of a user.
 739 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 740 {
 741         struct dirent *direntp;
 742         char path[MAXPATHLEN];
 743         size_t len;
 744         DIR *d;
 745         int fd1, ret;
 746
 747         len = strlen(dirname);
 748         if (len >= MAXPATHLEN) {
 749                 lxcfs_error("Pathname too long: %s\n", dirname);
 750                 return;
 751         }
 752
 753         fd1 = openat(fd, dirname, O_DIRECTORY);
 754         if (fd1 < 0)
 755                 return;
 756
 757         d = fdopendir(fd1);
 758         if (!d) {
 759                 lxcfs_error("Failed to open %s\n", dirname);
 760                 return;
 761         }
 762
 763         while ((direntp = readdir(d))) {
 764                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 765                         continue;
 766                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 767                 if (ret < 0 || ret >= MAXPATHLEN) {
 768                         lxcfs_error("Pathname too long under %s\n", dirname);
 769                         continue;
 770                 }
 771                 if (fchownat(fd, path, uid, gid, 0) < 0)
 772                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 773         }
 774         closedir(d);
 775 }
 776
 777 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 778 {
 779         int cfd;
 780         size_t len;
 781         char *dirnam;
 782
 783         cfd = get_cgroup_fd(controller);
 784         if (cfd < 0)
 785                 return -EINVAL;
 786
 787         /* Make sure we pass a relative path to *at() family of functions.
 788          * . + /cg + \0
 789          */
 790         len = strlen(cg) + 2;
 791         dirnam = alloca(len);
 792         snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
 793
 794         if (mkdirat(cfd, dirnam, 0755) < 0)
 795                 return -errno;
 796
 797         if (uid == 0 && gid == 0)
 798                 return 0;
 799
 800         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 801                 return -errno;
 802
 803         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 804
 805         return 0;
 806 }
 807
 808 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 809 {
 810         struct dirent *direntp;
 811         DIR *dir;
 812         bool ret = false;
 813         char pathname[MAXPATHLEN];
 814         int dupfd;
 815
 816         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 817         if (dupfd < 0)
 818                 return false;
 819
 820         dir = fdopendir(dupfd);
 821         if (!dir) {
 822                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 823                 close(dupfd);
 824                 return false;
 825         }
 826
 827         while ((direntp = readdir(dir))) {
 828                 struct stat mystat;
 829                 int rc;
 830
 831                 if (!strcmp(direntp->d_name, ".") ||
 832                     !strcmp(direntp->d_name, ".."))
 833                         continue;
 834
 835                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 836                 if (rc < 0 || rc >= MAXPATHLEN) {
 837                         lxcfs_error("%s\n", "Pathname too long.");
 838                         continue;
 839                 }
 840
 841                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 842                 if (rc) {
 843                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 844                         continue;
 845                 }
 846                 if (S_ISDIR(mystat.st_mode))
 847                         if (!recursive_rmdir(pathname, fd, cfd))
 848                                 lxcfs_debug("Error removing %s.\n", pathname);
 849         }
 850
 851         ret = true;
 852         if (closedir(dir) < 0) {
 853                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 854                 ret = false;
 855         }
 856
 857         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 858                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 859                 ret = false;
 860         }
 861
 862         close(dupfd);
 863
 864         return ret;
 865 }
 866
 867 bool cgfs_remove(const char *controller, const char *cg)
 868 {
 869         int fd, cfd;
 870         size_t len;
 871         char *dirnam;
 872         bool bret;
 873
 874         cfd = get_cgroup_fd(controller);
 875         if (cfd < 0)
 876                 return false;
 877
 878         /* Make sure we pass a relative path to *at() family of functions.
 879          * . +  /cg + \0
 880          */
 881         len = strlen(cg) + 2;
 882         dirnam = alloca(len);
 883         snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
 884
 885         fd = openat(cfd, dirnam, O_DIRECTORY);
 886         if (fd < 0)
 887                 return false;
 888
 889         bret = recursive_rmdir(dirnam, fd, cfd);
 890         close(fd);
 891         return bret;
 892 }
 893
 894 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 895 {
 896         int cfd;
 897         size_t len;
 898         char *pathname;
 899
 900         cfd = get_cgroup_fd(controller);
 901         if (cfd < 0)
 902                 return false;
 903
 904         /* Make sure we pass a relative path to *at() family of functions.
 905          * . + /file + \0
 906          */
 907         len = strlen(file) + 2;
 908         pathname = alloca(len);
 909         snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
 910         if (fchmodat(cfd, pathname, mode, 0) < 0)
 911                 return false;
 912         return true;
 913 }
 914
 915 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 916 {
 917         size_t len;
 918         char *fname;
 919
 920         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 921         fname = alloca(len);
 922         snprintf(fname, len, "%s/tasks", dirname);
 923         if (fchownat(fd, fname, uid, gid, 0) != 0)
 924                 return -errno;
 925         snprintf(fname, len, "%s/cgroup.procs", dirname);
 926         if (fchownat(fd, fname, uid, gid, 0) != 0)
 927                 return -errno;
 928         return 0;
 929 }
 930
 931 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 932 {
 933         int cfd;
 934         size_t len;
 935         char *pathname;
 936
 937         cfd = get_cgroup_fd(controller);
 938         if (cfd < 0)
 939                 return false;
 940
 941         /* Make sure we pass a relative path to *at() family of functions.
 942          * . + /file + \0
 943          */
 944         len = strlen(file) + 2;
 945         pathname = alloca(len);
 946         snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
 947         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 948                 return -errno;
 949
 950         if (is_dir(pathname, cfd))
 951                 // like cgmanager did, we want to chown the tasks file as well
 952                 return chown_tasks_files(pathname, uid, gid, cfd);
 953
 954         return 0;
 955 }
 956
 957 FILE *open_pids_file(const char *controller, const char *cgroup)
 958 {
 959         int fd, cfd;
 960         size_t len;
 961         char *pathname;
 962
 963         cfd = get_cgroup_fd(controller);
 964         if (cfd < 0)
 965                 return false;
 966
 967         /* Make sure we pass a relative path to *at() family of functions.
 968          * . + /cgroup + / "cgroup.procs" + \0
 969          */
 970         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 971         pathname = alloca(len);
 972         snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
 973
 974         fd = openat(cfd, pathname, O_WRONLY);
 975         if (fd < 0)
 976                 return NULL;
 977
 978         return fdopen(fd, "w");
 979 }
 980
 981 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 982                                 void ***list, size_t typesize,
 983                                 void* (*iterator)(const char*, const char*, const char*))
 984 {
 985         int cfd, fd, ret;
 986         size_t len;
 987         char *cg;
 988         char pathname[MAXPATHLEN];
 989         size_t sz = 0, asz = 0;
 990         struct dirent *dirent;
 991         DIR *dir;
 992
 993         cfd = get_cgroup_fd(controller);
 994         *list = NULL;
 995         if (cfd < 0)
 996                 return false;
 997
 998         /* Make sure we pass a relative path to *at() family of functions. */
 999         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1000         cg = alloca(len);
1001         ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
1002         if (ret < 0 || (size_t)ret >= len) {
1003                 lxcfs_error("Pathname too long under %s\n", cgroup);
1004                 return false;
1005         }
1006
1007         fd = openat(cfd, cg, O_DIRECTORY);
1008         if (fd < 0)
1009                 return false;
1010
1011         dir = fdopendir(fd);
1012         if (!dir)
1013                 return false;
1014
1015         while ((dirent = readdir(dir))) {
1016                 struct stat mystat;
1017
1018                 if (!strcmp(dirent->d_name, ".") ||
1019                     !strcmp(dirent->d_name, ".."))
1020                         continue;
1021
1022                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1023                 if (ret < 0 || ret >= MAXPATHLEN) {
1024                         lxcfs_error("Pathname too long under %s\n", cg);
1025                         continue;
1026                 }
1027
1028                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1029                 if (ret) {
1030                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1031                         continue;
1032                 }
1033                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1034                     (directories && !S_ISDIR(mystat.st_mode)))
1035                         continue;
1036
1037                 if (sz+2 >= asz) {
1038                         void **tmp;
1039                         asz += BATCH_SIZE;
1040                         do {
1041                                 tmp = realloc(*list, asz * typesize);
1042                         } while  (!tmp);
1043                         *list = tmp;
1044                 }
1045                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1046                 (*list)[sz+1] = NULL;
1047                 sz++;
1048         }
1049         if (closedir(dir) < 0) {
1050                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1051                 return false;
1052         }
1053         return true;
1054 }
1055
1056 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1057 {
1058         char *dup;
1059         do {
1060                 dup = strdup(dir_entry);
1061         } while (!dup);
1062         return dup;
1063 }
1064
1065 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1066 {
1067         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1068 }
1069
1070 void free_key(struct cgfs_files *k)
1071 {
1072         if (!k)
1073                 return;
1074         free_disarm(k->name);
1075         free_disarm(k);
1076 }
1077
1078 void free_keys(struct cgfs_files **keys)
1079 {
1080         int i;
1081
1082         if (!keys)
1083                 return;
1084         for (i = 0; keys[i]; i++) {
1085                 free_key(keys[i]);
1086         }
1087         free_disarm(keys);
1088 }
1089
1090 bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1091 {
1092         int ret, cfd;
1093         size_t len;
1094         char *fnam;
1095
1096         cfd = get_cgroup_fd(controller);
1097         if (cfd < 0)
1098                 return false;
1099
1100         /* Make sure we pass a relative path to *at() family of functions.
1101          * . + /cgroup + / + file + \0
1102          */
1103         len = strlen(cgroup) + strlen(file) + 3;
1104         fnam = alloca(len);
1105         ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
1106         if (ret < 0 || (size_t)ret >= len)
1107                 return false;
1108
1109         return (faccessat(cfd, fnam, F_OK, 0) == 0);
1110 }
1111
1112 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1113 {
1114         int ret, cfd;
1115         size_t len;
1116         char *fnam;
1117         struct stat sb;
1118         struct cgfs_files *newkey;
1119
1120         cfd = get_cgroup_fd(controller);
1121         if (cfd < 0)
1122                 return false;
1123
1124         if (file && *file == '/')
1125                 file++;
1126
1127         if (file && strchr(file, '/'))
1128                 return NULL;
1129
1130         /* Make sure we pass a relative path to *at() family of functions.
1131          * . + /cgroup + / + file + \0
1132          */
1133         len = strlen(cgroup) + 3;
1134         if (file)
1135                 len += strlen(file) + 1;
1136         fnam = alloca(len);
1137         snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
1138                  file ? "/" : "", file ? file : "");
1139
1140         ret = fstatat(cfd, fnam, &sb, 0);
1141         if (ret < 0)
1142                 return NULL;
1143
1144         do {
1145                 newkey = malloc(sizeof(struct cgfs_files));
1146         } while (!newkey);
1147         if (file)
1148                 newkey->name = must_copy_string(file);
1149         else if (strrchr(cgroup, '/'))
1150                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1151         else
1152                 newkey->name = must_copy_string(cgroup);
1153         newkey->uid = sb.st_uid;
1154         newkey->gid = sb.st_gid;
1155         newkey->mode = sb.st_mode;
1156
1157         return newkey;
1158 }
1159
1160 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1161 {
1162         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1163         if (!entry) {
1164                 lxcfs_error("Error getting files under %s:%s\n", controller,
1165                              cgroup);
1166         }
1167         return entry;
1168 }
1169
1170 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1171 {
1172         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1173 }
1174
1175 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1176 {
1177         int cfd;
1178         size_t len;
1179         char *fnam;
1180         int ret;
1181         struct stat sb;
1182
1183         cfd = get_cgroup_fd(controller);
1184         if (cfd < 0)
1185                 return false;
1186
1187         /* Make sure we pass a relative path to *at() family of functions.
1188          * . + /cgroup + / + f + \0
1189          */
1190         len = strlen(cgroup) + strlen(f) + 3;
1191         fnam = alloca(len);
1192         ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
1193         if (ret < 0 || (size_t)ret >= len)
1194                 return false;
1195
1196         ret = fstatat(cfd, fnam, &sb, 0);
1197         if (ret < 0 || !S_ISDIR(sb.st_mode))
1198                 return false;
1199
1200         return true;
1201 }
1202
1203 #define SEND_CREDS_OK 0
1204 #define SEND_CREDS_NOTSK 1
1205 #define SEND_CREDS_FAIL 2
1206 static bool recv_creds(int sock, struct ucred *cred, char *v);
1207 static int wait_for_pid(pid_t pid);
1208 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1209 static int send_creds_clone_wrapper(void *arg);
1210
1211 /*
1212  * clone a task which switches to @task's namespace and writes '1'.
1213  * over a unix sock so we can read the task's reaper's pid in our
1214  * namespace
1215  *
1216  * Note: glibc's fork() does not respect pidns, which can lead to failed
1217  * assertions inside glibc (and thus failed forks) if the child's pid in
1218  * the pidns and the parent pid outside are identical. Using clone prevents
1219  * this issue.
1220  */
1221 static void write_task_init_pid_exit(int sock, pid_t target)
1222 {
1223         char fnam[100];
1224         pid_t pid;
1225         int fd, ret;
1226         size_t stack_size = sysconf(_SC_PAGESIZE);
1227         void *stack = alloca(stack_size);
1228
1229         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1230         if (ret < 0 || ret >= sizeof(fnam))
1231                 _exit(1);
1232
1233         fd = open(fnam, O_RDONLY);
1234         if (fd < 0) {
1235                 perror("write_task_init_pid_exit open of ns/pid");
1236                 _exit(1);
1237         }
1238         if (setns(fd, 0)) {
1239                 perror("write_task_init_pid_exit setns 1");
1240                 close(fd);
1241                 _exit(1);
1242         }
1243         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1244         if (pid < 0)
1245                 _exit(1);
1246         if (pid != 0) {
1247                 if (!wait_for_pid(pid))
1248                         _exit(1);
1249                 _exit(0);
1250         }
1251 }
1252
1253 static int send_creds_clone_wrapper(void *arg) {
1254         struct ucred cred;
1255         char v;
1256         int sock = *(int *)arg;
1257
1258         /* we are the child */
1259         cred.uid = 0;
1260         cred.gid = 0;
1261         cred.pid = 1;
1262         v = '1';
1263         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1264                 return 1;
1265         return 0;
1266 }
1267
1268 static pid_t get_init_pid_for_task(pid_t task)
1269 {
1270         int sock[2];
1271         pid_t pid;
1272         pid_t ret = -1;
1273         char v = '0';
1274         struct ucred cred;
1275
1276         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1277                 perror("socketpair");
1278                 return -1;
1279         }
1280
1281         pid = fork();
1282         if (pid < 0)
1283                 goto out;
1284         if (!pid) {
1285                 close(sock[1]);
1286                 write_task_init_pid_exit(sock[0], task);
1287                 _exit(0);
1288         }
1289
1290         if (!recv_creds(sock[1], &cred, &v))
1291                 goto out;
1292         ret = cred.pid;
1293
1294 out:
1295         close(sock[0]);
1296         close(sock[1]);
1297         if (pid > 0)
1298                 wait_for_pid(pid);
1299         return ret;
1300 }
1301
1302 pid_t lookup_initpid_in_store(pid_t qpid)
1303 {
1304         pid_t answer = 0;
1305         struct stat sb;
1306         struct pidns_init_store *e;
1307         char fnam[100];
1308
1309         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1310         store_lock();
1311         if (stat(fnam, &sb) < 0)
1312                 goto out;
1313         e = lookup_verify_initpid(&sb);
1314         if (e) {
1315                 answer = e->initpid;
1316                 goto out;
1317         }
1318         answer = get_init_pid_for_task(qpid);
1319         if (answer > 0)
1320                 save_initpid(&sb, answer);
1321
1322 out:
1323         /* we prune at end in case we are returning
1324          * the value we were about to return */
1325         prune_initpid_store();
1326         store_unlock();
1327         return answer;
1328 }
1329
1330 static int wait_for_pid(pid_t pid)
1331 {
1332         int status, ret;
1333
1334         if (pid <= 0)
1335                 return -1;
1336
1337 again:
1338         ret = waitpid(pid, &status, 0);
1339         if (ret == -1) {
1340                 if (errno == EINTR)
1341                         goto again;
1342                 return -1;
1343         }
1344         if (ret != pid)
1345                 goto again;
1346         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1347                 return -1;
1348         return 0;
1349 }
1350
1351 /*
1352  * append the given formatted string to *src.
1353  * src: a pointer to a char* in which to append the formatted string.
1354  * sz: the number of characters printed so far, minus trailing \0.
1355  * asz: the allocated size so far
1356  * format: string format. See printf for details.
1357  * ...: varargs. See printf for details.
1358  */
1359 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1360 {
1361         char tmp[BUF_RESERVE_SIZE];
1362         va_list         args;
1363
1364         va_start (args, format);
1365         int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1366         va_end(args);
1367
1368         if (!*src || tmplen + *sz + 1 >= *asz) {
1369                 char *tmp;
1370                 do {
1371                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1372                 } while (!tmp);
1373                 *src = tmp;
1374                 *asz += BUF_RESERVE_SIZE;
1375         }
1376         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1377         *sz += tmplen;
1378 }
1379
1380 /*
1381  * append pid to *src.
1382  * src: a pointer to a char* in which ot append the pid.
1383  * sz: the number of characters printed so far, minus trailing \0.
1384  * asz: the allocated size so far
1385  * pid: the pid to append
1386  */
1387 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1388 {
1389         must_strcat(src, sz, asz, "%d\n", (int)pid);
1390 }
1391
1392 /*
1393  * Given a open file * to /proc/pid/{u,g}id_map, and an id
1394  * valid in the caller's namespace, return the id mapped into
1395  * pid's namespace.
1396  * Returns the mapped id, or -1 on error.
1397  */
1398 unsigned int
1399 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1400 {
1401         unsigned int nsuid,   // base id for a range in the idfile's namespace
1402                      hostuid, // base id for a range in the caller's namespace
1403                      count;   // number of ids in this range
1404         char line[400];
1405         int ret;
1406
1407         fseek(idfile, 0L, SEEK_SET);
1408         while (fgets(line, 400, idfile)) {
1409                 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1410                 if (ret != 3)
1411                         continue;
1412                 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1413                         /*
1414                          * uids wrapped around - unexpected as this is a procfile,
1415                          * so just bail.
1416                          */
1417                         lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1418                                 nsuid, hostuid, count, line);
1419                         return -1;
1420                 }
1421                 if (hostuid <= in_id && hostuid+count > in_id) {
1422                         /*
1423                          * now since hostuid <= in_id < hostuid+count, and
1424                          * hostuid+count and nsuid+count do not wrap around,
1425                          * we know that nsuid+(in_id-hostuid) which must be
1426                          * less that nsuid+(count) must not wrap around
1427                          */
1428                         return (in_id - hostuid) + nsuid;
1429                 }
1430         }
1431
1432         // no answer found
1433         return -1;
1434 }
1435
1436 /*
1437  * for is_privileged_over,
1438  * specify whether we require the calling uid to be root in his
1439  * namespace
1440  */
1441 #define NS_ROOT_REQD true
1442 #define NS_ROOT_OPT false
1443
1444 #define PROCLEN 100
1445
1446 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1447 {
1448         char fpath[PROCLEN];
1449         int ret;
1450         bool answer = false;
1451         uid_t nsuid;
1452
1453         if (victim == -1 || uid == -1)
1454                 return false;
1455
1456         /*
1457          * If the request is one not requiring root in the namespace,
1458          * then having the same uid suffices.  (i.e. uid 1000 has write
1459          * access to files owned by uid 1000
1460          */
1461         if (!req_ns_root && uid == victim)
1462                 return true;
1463
1464         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1465         if (ret < 0 || ret >= PROCLEN)
1466                 return false;
1467         FILE *f = fopen(fpath, "r");
1468         if (!f)
1469                 return false;
1470
1471         /* if caller's not root in his namespace, reject */
1472         nsuid = convert_id_to_ns(f, uid);
1473         if (nsuid)
1474                 goto out;
1475
1476         /*
1477          * If victim is not mapped into caller's ns, reject.
1478          * XXX I'm not sure this check is needed given that fuse
1479          * will be sending requests where the vfs has converted
1480          */
1481         nsuid = convert_id_to_ns(f, victim);
1482         if (nsuid == -1)
1483                 goto out;
1484
1485         answer = true;
1486
1487 out:
1488         fclose(f);
1489         return answer;
1490 }
1491
1492 static bool perms_include(int fmode, mode_t req_mode)
1493 {
1494         mode_t r;
1495
1496         switch (req_mode & O_ACCMODE) {
1497         case O_RDONLY:
1498                 r = S_IROTH;
1499                 break;
1500         case O_WRONLY:
1501                 r = S_IWOTH;
1502                 break;
1503         case O_RDWR:
1504                 r = S_IROTH | S_IWOTH;
1505                 break;
1506         default:
1507                 return false;
1508         }
1509         return ((fmode & r) == r);
1510 }
1511
1512
1513 /*
1514  * taskcg is  a/b/c
1515  * querycg is /a/b/c/d/e
1516  * we return 'd'
1517  */
1518 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1519 {
1520         char *start, *end;
1521
1522         if (strlen(taskcg) <= strlen(querycg)) {
1523                 lxcfs_error("%s\n", "I was fed bad input.");
1524                 return NULL;
1525         }
1526
1527         if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1528                 start =  strdup(taskcg + 1);
1529         else
1530                 start = strdup(taskcg + strlen(querycg) + 1);
1531         if (!start)
1532                 return NULL;
1533         end = strchr(start, '/');
1534         if (end)
1535                 *end = '\0';
1536         return start;
1537 }
1538
1539 char *get_pid_cgroup(pid_t pid, const char *contrl)
1540 {
1541         int cfd;
1542
1543         cfd = get_cgroup_fd(contrl);
1544         if (cfd < 0)
1545                 return false;
1546
1547         if (pure_unified_layout(cgroup_ops))
1548                 return cg_unified_get_current_cgroup(pid);
1549
1550         return cg_legacy_get_current_cgroup(pid, contrl);
1551 }
1552
1553 /*
1554  * check whether a fuse context may access a cgroup dir or file
1555  *
1556  * If file is not null, it is a cgroup file to check under cg.
1557  * If file is null, then we are checking perms on cg itself.
1558  *
1559  * For files we can check the mode of the list_keys result.
1560  * For cgroups, we must make assumptions based on the files under the
1561  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1562  * yet.
1563  */
1564 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1565 {
1566         struct cgfs_files *k = NULL;
1567         bool ret = false;
1568
1569         k = cgfs_get_key(contrl, cg, file);
1570         if (!k)
1571                 return false;
1572
1573         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1574                 if (perms_include(k->mode >> 6, mode)) {
1575                         ret = true;
1576                         goto out;
1577                 }
1578         }
1579         if (fc->gid == k->gid) {
1580                 if (perms_include(k->mode >> 3, mode)) {
1581                         ret = true;
1582                         goto out;
1583                 }
1584         }
1585         ret = perms_include(k->mode, mode);
1586
1587 out:
1588         free_key(k);
1589         return ret;
1590 }
1591
1592 #define INITSCOPE "/init.scope"
1593 void prune_init_slice(char *cg)
1594 {
1595         char *point;
1596         size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1597
1598         if (cg_len < initscope_len)
1599                 return;
1600
1601         point = cg + cg_len - initscope_len;
1602         if (strcmp(point, INITSCOPE) == 0) {
1603                 if (point == cg)
1604                         *(point+1) = '\0';
1605                 else
1606                         *point = '\0';
1607         }
1608 }
1609
1610 /*
1611  * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1612  * If pid is in /a, he may act on /a/b, but not on /b.
1613  * if the answer is false and nextcg is not NULL, then *nextcg will point
1614  * to a string containing the next cgroup directory under cg, which must be
1615  * freed by the caller.
1616  */
1617 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1618 {
1619         bool answer = false;
1620         char *c2 = get_pid_cgroup(pid, contrl);
1621         char *linecmp;
1622
1623         if (!c2)
1624                 return false;
1625         prune_init_slice(c2);
1626
1627         /*
1628          * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1629          * they pass in a cgroup without leading '/'
1630          *
1631          * The original line here was:
1632          *      linecmp = *cg == '/' ? c2 : c2+1;
1633          * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1634          *       Serge, do you know?
1635          */
1636         if (*cg == '/' || !strncmp(cg, "./", 2))
1637                 linecmp = c2;
1638         else
1639                 linecmp = c2 + 1;
1640         if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1641                 if (nextcg) {
1642                         *nextcg = get_next_cgroup_dir(linecmp, cg);
1643                 }
1644                 goto out;
1645         }
1646         answer = true;
1647
1648 out:
1649         free(c2);
1650         return answer;
1651 }
1652
1653 /*
1654  * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1655  */
1656 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1657 {
1658         bool answer = false;
1659         char *c2, *task_cg;
1660         size_t target_len, task_len;
1661
1662         if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1663                 return true;
1664
1665         c2 = get_pid_cgroup(pid, contrl);
1666         if (!c2)
1667                 return false;
1668         prune_init_slice(c2);
1669
1670         task_cg = c2 + 1;
1671         target_len = strlen(cg);
1672         task_len = strlen(task_cg);
1673         if (task_len == 0) {
1674                 /* Task is in the root cg, it can see everything. This case is
1675                  * not handled by the strmcps below, since they test for the
1676                  * last /, but that is the first / that we've chopped off
1677                  * above.
1678                  */
1679                 answer = true;
1680                 goto out;
1681         }
1682         if (strcmp(cg, task_cg) == 0) {
1683                 answer = true;
1684                 goto out;
1685         }
1686         if (target_len < task_len) {
1687                 /* looking up a parent dir */
1688                 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1689                         answer = true;
1690                 goto out;
1691         }
1692         if (target_len > task_len) {
1693                 /* looking up a child dir */
1694                 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1695                         answer = true;
1696                 goto out;
1697         }
1698
1699 out:
1700         free(c2);
1701         return answer;
1702 }
1703
1704 /*
1705  * given /cgroup/freezer/a/b, return "freezer".
1706  * the returned char* should NOT be freed.
1707  */
1708 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1709 {
1710         const char *p1;
1711         char *contr, *slash;
1712
1713         if (strlen(path) < 9) {
1714                 errno = EACCES;
1715                 return NULL;
1716         }
1717         if (*(path + 7) != '/') {
1718                 errno = EINVAL;
1719                 return NULL;
1720         }
1721         p1 = path + 8;
1722         contr = strdupa(p1);
1723         if (!contr) {
1724                 errno = ENOMEM;
1725                 return NULL;
1726         }
1727         slash = strstr(contr, "/");
1728         if (slash)
1729                 *slash = '\0';
1730
1731         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1732                 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1733                         return (*h)->__controllers;
1734         }
1735         errno = ENOENT;
1736         return NULL;
1737 }
1738
1739 /*
1740  * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1741  * Note that the returned value may include files (keynames) etc
1742  */
1743 static const char *find_cgroup_in_path(const char *path)
1744 {
1745         const char *p1;
1746
1747         if (strlen(path) < 9) {
1748                 errno = EACCES;
1749                 return NULL;
1750         }
1751         p1 = strstr(path + 8, "/");
1752         if (!p1) {
1753                 errno = EINVAL;
1754                 return NULL;
1755         }
1756         errno = 0;
1757         return p1 + 1;
1758 }
1759
1760 /*
1761  * split the last path element from the path in @cg.
1762  * @dir is newly allocated and should be freed, @last not
1763 */
1764 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1765 {
1766         char *p;
1767
1768         do {
1769                 *dir = strdup(cg);
1770         } while (!*dir);
1771         *last = strrchr(cg, '/');
1772         if (!*last) {
1773                 *last = NULL;
1774                 return;
1775         }
1776         p = strrchr(*dir, '/');
1777         *p = '\0';
1778 }
1779
1780 /*
1781  * FUSE ops for /cgroup
1782  */
1783
1784 int cg_getattr(const char *path, struct stat *sb)
1785 {
1786         struct timespec now;
1787         struct fuse_context *fc = fuse_get_context();
1788         char * cgdir = NULL;
1789         char *last = NULL, *path1, *path2;
1790         struct cgfs_files *k = NULL;
1791         const char *cgroup;
1792         const char *controller = NULL;
1793         int ret = -ENOENT;
1794
1795
1796         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1797                 return -EIO;
1798
1799         memset(sb, 0, sizeof(struct stat));
1800
1801         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1802                 return -EINVAL;
1803
1804         sb->st_uid = sb->st_gid = 0;
1805         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1806         sb->st_size = 0;
1807
1808         if (strcmp(path, "/cgroup") == 0) {
1809                 sb->st_mode = S_IFDIR | 00755;
1810                 sb->st_nlink = 2;
1811                 return 0;
1812         }
1813
1814         controller = pick_controller_from_path(fc, path);
1815         if (!controller)
1816                 return -errno;
1817         cgroup = find_cgroup_in_path(path);
1818         if (!cgroup) {
1819                 /* this is just /cgroup/controller, return it as a dir */
1820                 sb->st_mode = S_IFDIR | 00755;
1821                 sb->st_nlink = 2;
1822                 return 0;
1823         }
1824
1825         get_cgdir_and_path(cgroup, &cgdir, &last);
1826
1827         if (!last) {
1828                 path1 = "/";
1829                 path2 = cgdir;
1830         } else {
1831                 path1 = cgdir;
1832                 path2 = last;
1833         }
1834
1835         pid_t initpid = lookup_initpid_in_store(fc->pid);
1836         if (initpid <= 1 || is_shared_pidns(initpid))
1837                 initpid = fc->pid;
1838         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1839          * Then check that caller's cgroup is under path if last is a child
1840          * cgroup, or cgdir if last is a file */
1841
1842         if (is_child_cgroup(controller, path1, path2)) {
1843                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1844                         ret = -ENOENT;
1845                         goto out;
1846                 }
1847                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1848                         /* this is just /cgroup/controller, return it as a dir */
1849                         sb->st_mode = S_IFDIR | 00555;
1850                         sb->st_nlink = 2;
1851                         ret = 0;
1852                         goto out;
1853                 }
1854                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1855                         ret = -EACCES;
1856                         goto out;
1857                 }
1858
1859                 // get uid, gid, from '/tasks' file and make up a mode
1860                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1861                 sb->st_mode = S_IFDIR | 00755;
1862                 k = cgfs_get_key(controller, cgroup, NULL);
1863                 if (!k) {
1864                         sb->st_uid = sb->st_gid = 0;
1865                 } else {
1866                         sb->st_uid = k->uid;
1867                         sb->st_gid = k->gid;
1868                 }
1869                 free_key(k);
1870                 sb->st_nlink = 2;
1871                 ret = 0;
1872                 goto out;
1873         }
1874
1875         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1876                 sb->st_mode = S_IFREG | k->mode;
1877                 sb->st_nlink = 1;
1878                 sb->st_uid = k->uid;
1879                 sb->st_gid = k->gid;
1880                 sb->st_size = 0;
1881                 free_key(k);
1882                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1883                         ret = -ENOENT;
1884                         goto out;
1885                 }
1886                 ret = 0;
1887         }
1888
1889 out:
1890         free(cgdir);
1891         return ret;
1892 }
1893
1894 int cg_opendir(const char *path, struct fuse_file_info *fi)
1895 {
1896         struct fuse_context *fc = fuse_get_context();
1897         const char *cgroup;
1898         struct file_info *dir_info;
1899         char *controller = NULL;
1900
1901         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1902                 return -EIO;
1903
1904         if (strcmp(path, "/cgroup") == 0) {
1905                 cgroup = NULL;
1906                 controller = NULL;
1907         } else {
1908                 // return list of keys for the controller, and list of child cgroups
1909                 controller = pick_controller_from_path(fc, path);
1910                 if (!controller)
1911                         return -errno;
1912
1913                 cgroup = find_cgroup_in_path(path);
1914                 if (!cgroup) {
1915                         /* this is just /cgroup/controller, return its contents */
1916                         cgroup = "/";
1917                 }
1918         }
1919
1920         pid_t initpid = lookup_initpid_in_store(fc->pid);
1921         if (initpid <= 1 || is_shared_pidns(initpid))
1922                 initpid = fc->pid;
1923         if (cgroup) {
1924                 if (!caller_may_see_dir(initpid, controller, cgroup))
1925                         return -ENOENT;
1926                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1927                         return -EACCES;
1928         }
1929
1930         /* we'll free this at cg_releasedir */
1931         dir_info = malloc(sizeof(*dir_info));
1932         if (!dir_info)
1933                 return -ENOMEM;
1934         dir_info->controller = must_copy_string(controller);
1935         dir_info->cgroup = must_copy_string(cgroup);
1936         dir_info->type = LXC_TYPE_CGDIR;
1937         dir_info->buf = NULL;
1938         dir_info->file = NULL;
1939         dir_info->buflen = 0;
1940
1941         fi->fh = (unsigned long)dir_info;
1942         return 0;
1943 }
1944
1945 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1946                 struct fuse_file_info *fi)
1947 {
1948         struct file_info *d = (struct file_info *)fi->fh;
1949         struct cgfs_files **list = NULL;
1950         int i, ret;
1951         char *nextcg = NULL;
1952         struct fuse_context *fc = fuse_get_context();
1953         char **clist = NULL;
1954
1955         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1956                 return -EIO;
1957
1958         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1959                 return -EIO;
1960
1961         if (d->type != LXC_TYPE_CGDIR) {
1962                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1963                 return -EIO;
1964         }
1965         if (!d->cgroup && !d->controller) {
1966                 /*
1967                  * ls /var/lib/lxcfs/cgroup - just show list of controllers.
1968                  * This only works with the legacy hierarchy.
1969                  */
1970                 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1971                         if (is_unified_hierarchy(*h))
1972                                 continue;
1973
1974                         if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
1975                                 return -EIO;
1976                 }
1977
1978                 return 0;
1979         }
1980
1981         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1982                 // not a valid cgroup
1983                 ret = -EINVAL;
1984                 goto out;
1985         }
1986
1987         pid_t initpid = lookup_initpid_in_store(fc->pid);
1988         if (initpid <= 1 || is_shared_pidns(initpid))
1989                 initpid = fc->pid;
1990         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1991                 if (nextcg) {
1992                         ret = filler(buf, nextcg,  NULL, 0);
1993                         free(nextcg);
1994                         if (ret != 0) {
1995                                 ret = -EIO;
1996                                 goto out;
1997                         }
1998                 }
1999                 ret = 0;
2000                 goto out;
2001         }
2002
2003         for (i = 0; list && list[i]; i++) {
2004                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2005                         ret = -EIO;
2006                         goto out;
2007                 }
2008         }
2009
2010         // now get the list of child cgroups
2011
2012         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2013                 ret = 0;
2014                 goto out;
2015         }
2016         if (clist) {
2017                 for (i = 0; clist[i]; i++) {
2018                         if (filler(buf, clist[i], NULL, 0) != 0) {
2019                                 ret = -EIO;
2020                                 goto out;
2021                         }
2022                 }
2023         }
2024         ret = 0;
2025
2026 out:
2027         free_keys(list);
2028         if (clist) {
2029                 for (i = 0; clist[i]; i++)
2030                         free(clist[i]);
2031                 free(clist);
2032         }
2033         return ret;
2034 }
2035
2036 void do_release_file_info(struct fuse_file_info *fi)
2037 {
2038         struct file_info *f = (struct file_info *)fi->fh;
2039
2040         if (!f)
2041                 return;
2042
2043         fi->fh = 0;
2044
2045         free_disarm(f->controller);
2046         free_disarm(f->cgroup);
2047         free_disarm(f->file);
2048         free_disarm(f->buf);
2049         free_disarm(f);
2050 }
2051
2052 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2053 {
2054         do_release_file_info(fi);
2055         return 0;
2056 }
2057
2058 int cg_open(const char *path, struct fuse_file_info *fi)
2059 {
2060         const char *cgroup;
2061         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2062         struct cgfs_files *k = NULL;
2063         struct file_info *file_info;
2064         struct fuse_context *fc = fuse_get_context();
2065         int ret;
2066
2067         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2068                 return -EIO;
2069
2070         controller = pick_controller_from_path(fc, path);
2071         if (!controller)
2072                 return -errno;
2073         cgroup = find_cgroup_in_path(path);
2074         if (!cgroup)
2075                 return -errno;
2076
2077         get_cgdir_and_path(cgroup, &cgdir, &last);
2078         if (!last) {
2079                 path1 = "/";
2080                 path2 = cgdir;
2081         } else {
2082                 path1 = cgdir;
2083                 path2 = last;
2084         }
2085
2086         k = cgfs_get_key(controller, path1, path2);
2087         if (!k) {
2088                 ret = -EINVAL;
2089                 goto out;
2090         }
2091         free_key(k);
2092
2093         pid_t initpid = lookup_initpid_in_store(fc->pid);
2094         if (initpid <= 1 || is_shared_pidns(initpid))
2095                 initpid = fc->pid;
2096         if (!caller_may_see_dir(initpid, controller, path1)) {
2097                 ret = -ENOENT;
2098                 goto out;
2099         }
2100         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2101                 ret = -EACCES;
2102                 goto out;
2103         }
2104
2105         /* we'll free this at cg_release */
2106         file_info = malloc(sizeof(*file_info));
2107         if (!file_info) {
2108                 ret = -ENOMEM;
2109                 goto out;
2110         }
2111         file_info->controller = must_copy_string(controller);
2112         file_info->cgroup = must_copy_string(path1);
2113         file_info->file = must_copy_string(path2);
2114         file_info->type = LXC_TYPE_CGFILE;
2115         file_info->buf = NULL;
2116         file_info->buflen = 0;
2117
2118         fi->fh = (unsigned long)file_info;
2119         ret = 0;
2120
2121 out:
2122         free(cgdir);
2123         return ret;
2124 }
2125
2126 int cg_access(const char *path, int mode)
2127 {
2128         int ret;
2129         const char *cgroup;
2130         char *path1, *path2, *controller;
2131         char *last = NULL, *cgdir = NULL;
2132         struct cgfs_files *k = NULL;
2133         struct fuse_context *fc = fuse_get_context();
2134
2135         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2136                 return -EIO;
2137
2138         if (strcmp(path, "/cgroup") == 0)
2139                 return 0;
2140
2141         controller = pick_controller_from_path(fc, path);
2142         if (!controller)
2143                 return -errno;
2144         cgroup = find_cgroup_in_path(path);
2145         if (!cgroup) {
2146                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2147                 if ((mode & W_OK) == 0)
2148                         return 0;
2149                 return -EACCES;
2150         }
2151
2152         get_cgdir_and_path(cgroup, &cgdir, &last);
2153         if (!last) {
2154                 path1 = "/";
2155                 path2 = cgdir;
2156         } else {
2157                 path1 = cgdir;
2158                 path2 = last;
2159         }
2160
2161         k = cgfs_get_key(controller, path1, path2);
2162         if (!k) {
2163                 if ((mode & W_OK) == 0)
2164                         ret = 0;
2165                 else
2166                         ret = -EACCES;
2167                 goto out;
2168         }
2169         free_key(k);
2170
2171         pid_t initpid = lookup_initpid_in_store(fc->pid);
2172         if (initpid <= 1 || is_shared_pidns(initpid))
2173                 initpid = fc->pid;
2174         if (!caller_may_see_dir(initpid, controller, path1)) {
2175                 ret = -ENOENT;
2176                 goto out;
2177         }
2178         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2179                 ret = -EACCES;
2180                 goto out;
2181         }
2182
2183         ret = 0;
2184
2185 out:
2186         free(cgdir);
2187         return ret;
2188 }
2189
2190 int cg_release(const char *path, struct fuse_file_info *fi)
2191 {
2192         do_release_file_info(fi);
2193         return 0;
2194 }
2195
2196 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2197
2198 static bool wait_for_sock(int sock, int timeout)
2199 {
2200         struct epoll_event ev;
2201         int epfd, ret, now, starttime, deltatime, saved_errno;
2202
2203         if ((starttime = time(NULL)) < 0)
2204                 return false;
2205
2206         if ((epfd = epoll_create(1)) < 0) {
2207                 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2208                 return false;
2209         }
2210
2211         ev.events = POLLIN_SET;
2212         ev.data.fd = sock;
2213         if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2214                 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2215                 close(epfd);
2216                 return false;
2217         }
2218
2219 again:
2220         if ((now = time(NULL)) < 0) {
2221                 close(epfd);
2222                 return false;
2223         }
2224
2225         deltatime = (starttime + timeout) - now;
2226         if (deltatime < 0) { // timeout
2227                 errno = 0;
2228                 close(epfd);
2229                 return false;
2230         }
2231         ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2232         if (ret < 0 && errno == EINTR)
2233                 goto again;
2234         saved_errno = errno;
2235         close(epfd);
2236
2237         if (ret <= 0) {
2238                 errno = saved_errno;
2239                 return false;
2240         }
2241         return true;
2242 }
2243
2244 static int msgrecv(int sockfd, void *buf, size_t len)
2245 {
2246         if (!wait_for_sock(sockfd, 2))
2247                 return -1;
2248         return recv(sockfd, buf, len, MSG_DONTWAIT);
2249 }
2250
2251 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2252 {
2253         struct msghdr msg = { 0 };
2254         struct iovec iov;
2255         struct cmsghdr *cmsg;
2256         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2257         char buf[1];
2258         buf[0] = 'p';
2259
2260         if (pingfirst) {
2261                 if (msgrecv(sock, buf, 1) != 1) {
2262                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2263                         return SEND_CREDS_FAIL;
2264                 }
2265         }
2266
2267         msg.msg_control = cmsgbuf;
2268         msg.msg_controllen = sizeof(cmsgbuf);
2269
2270         cmsg = CMSG_FIRSTHDR(&msg);
2271         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2272         cmsg->cmsg_level = SOL_SOCKET;
2273         cmsg->cmsg_type = SCM_CREDENTIALS;
2274         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2275
2276         msg.msg_name = NULL;
2277         msg.msg_namelen = 0;
2278
2279         buf[0] = v;
2280         iov.iov_base = buf;
2281         iov.iov_len = sizeof(buf);
2282         msg.msg_iov = &iov;
2283         msg.msg_iovlen = 1;
2284
2285         if (sendmsg(sock, &msg, 0) < 0) {
2286                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2287                 if (errno == 3)
2288                         return SEND_CREDS_NOTSK;
2289                 return SEND_CREDS_FAIL;
2290         }
2291
2292         return SEND_CREDS_OK;
2293 }
2294
2295 static bool recv_creds(int sock, struct ucred *cred, char *v)
2296 {
2297         struct msghdr msg = { 0 };
2298         struct iovec iov;
2299         struct cmsghdr *cmsg;
2300         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2301         char buf[1];
2302         int ret;
2303         int optval = 1;
2304
2305         *v = '1';
2306
2307         cred->pid = -1;
2308         cred->uid = -1;
2309         cred->gid = -1;
2310
2311         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2312                 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2313                 return false;
2314         }
2315         buf[0] = '1';
2316         if (write(sock, buf, 1) != 1) {
2317                 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2318                 return false;
2319         }
2320
2321         msg.msg_name = NULL;
2322         msg.msg_namelen = 0;
2323         msg.msg_control = cmsgbuf;
2324         msg.msg_controllen = sizeof(cmsgbuf);
2325
2326         iov.iov_base = buf;
2327         iov.iov_len = sizeof(buf);
2328         msg.msg_iov = &iov;
2329         msg.msg_iovlen = 1;
2330
2331         if (!wait_for_sock(sock, 2)) {
2332                 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2333                 return false;
2334         }
2335         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2336         if (ret < 0) {
2337                 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2338                 return false;
2339         }
2340
2341         cmsg = CMSG_FIRSTHDR(&msg);
2342
2343         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2344                         cmsg->cmsg_level == SOL_SOCKET &&
2345                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2346                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2347         }
2348         *v = buf[0];
2349
2350         return true;
2351 }
2352
2353 struct pid_ns_clone_args {
2354         int *cpipe;
2355         int sock;
2356         pid_t tpid;
2357         int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2358 };
2359
2360 /*
2361  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2362  * with clone(). This simply writes '1' as ACK back to the parent
2363  * before calling the actual wrapped function.
2364  */
2365 static int pid_ns_clone_wrapper(void *arg) {
2366         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2367         char b = '1';
2368
2369         close(args->cpipe[0]);
2370         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2371                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2372         close(args->cpipe[1]);
2373         return args->wrapped(args->sock, args->tpid);
2374 }
2375
2376 /*
2377  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2378  * int value back over the socket.  This shifts the pid from the
2379  * sender's pidns into tpid's pidns.
2380  */
2381 static int pid_to_ns(int sock, pid_t tpid)
2382 {
2383         char v = '0';
2384         struct ucred cred;
2385
2386         while (recv_creds(sock, &cred, &v)) {
2387                 if (v == '1')
2388                         return 0;
2389                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2390                         return 1;
2391         }
2392         return 0;
2393 }
2394
2395
2396 /*
2397  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2398  * in your old pidns.  Only children which you clone will be in the target
2399  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2400  * actually convert pids.
2401  *
2402  * Note: glibc's fork() does not respect pidns, which can lead to failed
2403  * assertions inside glibc (and thus failed forks) if the child's pid in
2404  * the pidns and the parent pid outside are identical. Using clone prevents
2405  * this issue.
2406  */
2407 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2408 {
2409         int newnsfd = -1, ret, cpipe[2];
2410         char fnam[100];
2411         pid_t cpid;
2412         char v;
2413
2414         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2415         if (ret < 0 || ret >= sizeof(fnam))
2416                 _exit(1);
2417         newnsfd = open(fnam, O_RDONLY);
2418         if (newnsfd < 0)
2419                 _exit(1);
2420         if (setns(newnsfd, 0) < 0)
2421                 _exit(1);
2422         close(newnsfd);
2423
2424         if (pipe(cpipe) < 0)
2425                 _exit(1);
2426
2427         struct pid_ns_clone_args args = {
2428                 .cpipe = cpipe,
2429                 .sock = sock,
2430                 .tpid = tpid,
2431                 .wrapped = &pid_to_ns
2432         };
2433         size_t stack_size = sysconf(_SC_PAGESIZE);
2434         void *stack = alloca(stack_size);
2435
2436         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2437         if (cpid < 0)
2438                 _exit(1);
2439
2440         // give the child 1 second to be done forking and
2441         // write its ack
2442         if (!wait_for_sock(cpipe[0], 1))
2443                 _exit(1);
2444         ret = read(cpipe[0], &v, 1);
2445         if (ret != sizeof(char) || v != '1')
2446                 _exit(1);
2447
2448         if (!wait_for_pid(cpid))
2449                 _exit(1);
2450         _exit(0);
2451 }
2452
2453 /*
2454  * To read cgroup files with a particular pid, we will setns into the child
2455  * pidns, open a pipe, fork a child - which will be the first to really be in
2456  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2457  */
2458 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2459 {
2460         int sock[2] = {-1, -1};
2461         char *tmpdata = NULL;
2462         int ret;
2463         pid_t qpid, cpid = -1;
2464         bool answer = false;
2465         char v = '0';
2466         struct ucred cred;
2467         size_t sz = 0, asz = 0;
2468
2469         if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
2470                 return false;
2471
2472         /*
2473          * Now we read the pids from returned data one by one, pass
2474          * them into a child in the target namespace, read back the
2475          * translated pids, and put them into our to-return data
2476          */
2477
2478         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2479                 perror("socketpair");
2480                 free(tmpdata);
2481                 return false;
2482         }
2483
2484         cpid = fork();
2485         if (cpid == -1)
2486                 goto out;
2487
2488         if (!cpid) // child - exits when done
2489                 pid_to_ns_wrapper(sock[1], tpid);
2490
2491         char *ptr = tmpdata;
2492         cred.uid = 0;
2493         cred.gid = 0;
2494         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2495                 cred.pid = qpid;
2496                 ret = send_creds(sock[0], &cred, v, true);
2497
2498                 if (ret == SEND_CREDS_NOTSK)
2499                         goto next;
2500                 if (ret == SEND_CREDS_FAIL)
2501                         goto out;
2502
2503                 // read converted results
2504                 if (!wait_for_sock(sock[0], 2)) {
2505                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2506                         goto out;
2507                 }
2508                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2509                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2510                         goto out;
2511                 }
2512                 must_strcat_pid(d, &sz, &asz, qpid);
2513 next:
2514                 ptr = strchr(ptr, '\n');
2515                 if (!ptr)
2516                         break;
2517                 ptr++;
2518         }
2519
2520         cred.pid = getpid();
2521         v = '1';
2522         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2523                 // failed to ask child to exit
2524                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2525                 goto out;
2526         }
2527
2528         answer = true;
2529
2530 out:
2531         free(tmpdata);
2532         if (cpid != -1)
2533                 wait_for_pid(cpid);
2534         if (sock[0] != -1) {
2535                 close(sock[0]);
2536                 close(sock[1]);
2537         }
2538         return answer;
2539 }
2540
2541 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2542                 struct fuse_file_info *fi)
2543 {
2544         struct fuse_context *fc = fuse_get_context();
2545         struct file_info *f = (struct file_info *)fi->fh;
2546         struct cgfs_files *k = NULL;
2547         char *data = NULL;
2548         int ret, s;
2549         bool r;
2550
2551         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2552                 return -EIO;
2553
2554         if (f->type != LXC_TYPE_CGFILE) {
2555                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2556                 return -EIO;
2557         }
2558
2559         if (offset)
2560                 return 0;
2561
2562         if (!f->controller)
2563                 return -EINVAL;
2564
2565         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2566                 return -EINVAL;
2567         }
2568         free_key(k);
2569
2570
2571         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2572                 ret = -EACCES;
2573                 goto out;
2574         }
2575
2576         if (strcmp(f->file, "tasks") == 0 ||
2577                         strcmp(f->file, "/tasks") == 0 ||
2578                         strcmp(f->file, "/cgroup.procs") == 0 ||
2579                         strcmp(f->file, "cgroup.procs") == 0)
2580                 // special case - we have to translate the pids
2581                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2582         else
2583                 r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
2584
2585         if (!r) {
2586                 ret = -EINVAL;
2587                 goto out;
2588         }
2589
2590         if (!data) {
2591                 ret = 0;
2592                 goto out;
2593         }
2594         s = strlen(data);
2595         if (s > size)
2596                 s = size;
2597         memcpy(buf, data, s);
2598         if (s > 0 && s < size && data[s-1] != '\n')
2599                 buf[s++] = '\n';
2600
2601         ret = s;
2602
2603 out:
2604         free(data);
2605         return ret;
2606 }
2607
2608 static int pid_from_ns(int sock, pid_t tpid)
2609 {
2610         pid_t vpid;
2611         struct ucred cred;
2612         char v;
2613         int ret;
2614
2615         cred.uid = 0;
2616         cred.gid = 0;
2617         while (1) {
2618                 if (!wait_for_sock(sock, 2)) {
2619                         lxcfs_error("%s\n", "Timeout reading from parent.");
2620                         return 1;
2621                 }
2622                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2623                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2624                         return 1;
2625                 }
2626                 if (vpid == -1) // done
2627                         break;
2628                 v = '0';
2629                 cred.pid = vpid;
2630                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2631                         v = '1';
2632                         cred.pid = getpid();
2633                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2634                                 return 1;
2635                 }
2636         }
2637         return 0;
2638 }
2639
2640 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2641 {
2642         int newnsfd = -1, ret, cpipe[2];
2643         char fnam[100];
2644         pid_t cpid;
2645         char v;
2646
2647         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2648         if (ret < 0 || ret >= sizeof(fnam))
2649                 _exit(1);
2650         newnsfd = open(fnam, O_RDONLY);
2651         if (newnsfd < 0)
2652                 _exit(1);
2653         if (setns(newnsfd, 0) < 0)
2654                 _exit(1);
2655         close(newnsfd);
2656
2657         if (pipe(cpipe) < 0)
2658                 _exit(1);
2659
2660         struct pid_ns_clone_args args = {
2661                 .cpipe = cpipe,
2662                 .sock = sock,
2663                 .tpid = tpid,
2664                 .wrapped = &pid_from_ns
2665         };
2666         size_t stack_size = sysconf(_SC_PAGESIZE);
2667         void *stack = alloca(stack_size);
2668
2669         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2670         if (cpid < 0)
2671                 _exit(1);
2672
2673         // give the child 1 second to be done forking and
2674         // write its ack
2675         if (!wait_for_sock(cpipe[0], 1))
2676                 _exit(1);
2677         ret = read(cpipe[0], &v, 1);
2678         if (ret != sizeof(char) || v != '1')
2679                 _exit(1);
2680
2681         if (!wait_for_pid(cpid))
2682                 _exit(1);
2683         _exit(0);
2684 }
2685
2686 /*
2687  * Given host @uid, return the uid to which it maps in
2688  * @pid's user namespace, or -1 if none.
2689  */
2690 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2691 {
2692         FILE *f;
2693         char line[400];
2694
2695         sprintf(line, "/proc/%d/uid_map", pid);
2696         if ((f = fopen(line, "r")) == NULL) {
2697                 return false;
2698         }
2699
2700         *answer = convert_id_to_ns(f, uid);
2701         fclose(f);
2702
2703         if (*answer == -1)
2704                 return false;
2705         return true;
2706 }
2707
2708 /*
2709  * get_pid_creds: get the real uid and gid of @pid from
2710  * /proc/$$/status
2711  * (XXX should we use euid here?)
2712  */
2713 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2714 {
2715         char line[400];
2716         uid_t u;
2717         gid_t g;
2718         FILE *f;
2719
2720         *uid = -1;
2721         *gid = -1;
2722         sprintf(line, "/proc/%d/status", pid);
2723         if ((f = fopen(line, "r")) == NULL) {
2724                 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2725                 return;
2726         }
2727         while (fgets(line, 400, f)) {
2728                 if (strncmp(line, "Uid:", 4) == 0) {
2729                         if (sscanf(line+4, "%u", &u) != 1) {
2730                                 lxcfs_error("bad uid line for pid %u\n", pid);
2731                                 fclose(f);
2732                                 return;
2733                         }
2734                         *uid = u;
2735                 } else if (strncmp(line, "Gid:", 4) == 0) {
2736                         if (sscanf(line+4, "%u", &g) != 1) {
2737                                 lxcfs_error("bad gid line for pid %u\n", pid);
2738                                 fclose(f);
2739                                 return;
2740                         }
2741                         *gid = g;
2742                 }
2743         }
2744         fclose(f);
2745 }
2746
2747 /*
2748  * May the requestor @r move victim @v to a new cgroup?
2749  * This is allowed if
2750  *   . they are the same task
2751  *   . they are ownedy by the same uid
2752  *   . @r is root on the host, or
2753  *   . @v's uid is mapped into @r's where @r is root.
2754  */
2755 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2756 {
2757         uid_t v_uid, tmpuid;
2758         gid_t v_gid;
2759
2760         if (r == v)
2761                 return true;
2762         if (r_uid == 0)
2763                 return true;
2764         get_pid_creds(v, &v_uid, &v_gid);
2765         if (r_uid == v_uid)
2766                 return true;
2767         if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2768                         && hostuid_to_ns(v_uid, r, &tmpuid))
2769                 return true;
2770         return false;
2771 }
2772
2773 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2774                 const char *file, const char *buf)
2775 {
2776         int sock[2] = {-1, -1};
2777         pid_t qpid, cpid = -1;
2778         FILE *pids_file = NULL;
2779         bool answer = false, fail = false;
2780
2781         pids_file = open_pids_file(contrl, cg);
2782         if (!pids_file)
2783                 return false;
2784
2785         /*
2786          * write the pids to a socket, have helper in writer's pidns
2787          * call movepid for us
2788          */
2789         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2790                 perror("socketpair");
2791                 goto out;
2792         }
2793
2794         cpid = fork();
2795         if (cpid == -1)
2796                 goto out;
2797
2798         if (!cpid) { // child
2799                 fclose(pids_file);
2800                 pid_from_ns_wrapper(sock[1], tpid);
2801         }
2802
2803         const char *ptr = buf;
2804         while (sscanf(ptr, "%d", &qpid) == 1) {
2805                 struct ucred cred;
2806                 char v;
2807
2808                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2809                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2810                         goto out;
2811                 }
2812
2813                 if (recv_creds(sock[0], &cred, &v)) {
2814                         if (v == '0') {
2815                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2816                                         fail = true;
2817                                         break;
2818                                 }
2819                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2820                                         fail = true;
2821                         }
2822                 }
2823
2824                 ptr = strchr(ptr, '\n');
2825                 if (!ptr)
2826                         break;
2827                 ptr++;
2828         }
2829
2830         /* All good, write the value */
2831         qpid = -1;
2832         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2833                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2834
2835         if (!fail)
2836                 answer = true;
2837
2838 out:
2839         if (cpid != -1)
2840                 wait_for_pid(cpid);
2841         if (sock[0] != -1) {
2842                 close(sock[0]);
2843                 close(sock[1]);
2844         }
2845         if (pids_file) {
2846                 if (fclose(pids_file) != 0)
2847                         answer = false;
2848         }
2849         return answer;
2850 }
2851
2852 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2853              struct fuse_file_info *fi)
2854 {
2855         struct fuse_context *fc = fuse_get_context();
2856         char *localbuf = NULL;
2857         struct cgfs_files *k = NULL;
2858         struct file_info *f = (struct file_info *)fi->fh;
2859         bool r;
2860
2861         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2862                 return -EIO;
2863
2864         if (f->type != LXC_TYPE_CGFILE) {
2865                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2866                 return -EIO;
2867         }
2868
2869         if (offset)
2870                 return 0;
2871
2872         localbuf = alloca(size+1);
2873         localbuf[size] = '\0';
2874         memcpy(localbuf, buf, size);
2875
2876         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2877                 size = -EINVAL;
2878                 goto out;
2879         }
2880
2881         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2882                 size = -EACCES;
2883                 goto out;
2884         }
2885
2886         if (strcmp(f->file, "tasks") == 0 ||
2887                         strcmp(f->file, "/tasks") == 0 ||
2888                         strcmp(f->file, "/cgroup.procs") == 0 ||
2889                         strcmp(f->file, "cgroup.procs") == 0)
2890                 // special case - we have to translate the pids
2891                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2892         else
2893                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2894
2895         if (!r)
2896                 size = -EINVAL;
2897
2898 out:
2899         free_key(k);
2900         return size;
2901 }
2902
2903 int cg_chown(const char *path, uid_t uid, gid_t gid)
2904 {
2905         struct fuse_context *fc = fuse_get_context();
2906         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2907         struct cgfs_files *k = NULL;
2908         const char *cgroup;
2909         int ret;
2910
2911         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2912                 return -EIO;
2913
2914         if (strcmp(path, "/cgroup") == 0)
2915                 return -EPERM;
2916
2917         controller = pick_controller_from_path(fc, path);
2918         if (!controller)
2919                 return errno == ENOENT ? -EPERM : -errno;
2920
2921         cgroup = find_cgroup_in_path(path);
2922         if (!cgroup)
2923                 /* this is just /cgroup/controller */
2924                 return -EPERM;
2925
2926         get_cgdir_and_path(cgroup, &cgdir, &last);
2927
2928         if (!last) {
2929                 path1 = "/";
2930                 path2 = cgdir;
2931         } else {
2932                 path1 = cgdir;
2933                 path2 = last;
2934         }
2935
2936         if (is_child_cgroup(controller, path1, path2)) {
2937                 // get uid, gid, from '/tasks' file and make up a mode
2938                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2939                 k = cgfs_get_key(controller, cgroup, "tasks");
2940
2941         } else
2942                 k = cgfs_get_key(controller, path1, path2);
2943
2944         if (!k) {
2945                 ret = -EINVAL;
2946                 goto out;
2947         }
2948
2949         /*
2950          * This being a fuse request, the uid and gid must be valid
2951          * in the caller's namespace.  So we can just check to make
2952          * sure that the caller is root in his uid, and privileged
2953          * over the file's current owner.
2954          */
2955         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2956                 ret = -EACCES;
2957                 goto out;
2958         }
2959
2960         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2961
2962 out:
2963         free_key(k);
2964         free(cgdir);
2965
2966         return ret;
2967 }
2968
2969 int cg_chmod(const char *path, mode_t mode)
2970 {
2971         struct fuse_context *fc = fuse_get_context();
2972         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2973         struct cgfs_files *k = NULL;
2974         const char *cgroup;
2975         int ret;
2976
2977         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2978                 return -EIO;
2979
2980         if (strcmp(path, "/cgroup") == 0)
2981                 return -EPERM;
2982
2983         controller = pick_controller_from_path(fc, path);
2984         if (!controller)
2985                 return errno == ENOENT ? -EPERM : -errno;
2986
2987         cgroup = find_cgroup_in_path(path);
2988         if (!cgroup)
2989                 /* this is just /cgroup/controller */
2990                 return -EPERM;
2991
2992         get_cgdir_and_path(cgroup, &cgdir, &last);
2993
2994         if (!last) {
2995                 path1 = "/";
2996                 path2 = cgdir;
2997         } else {
2998                 path1 = cgdir;
2999                 path2 = last;
3000         }
3001
3002         if (is_child_cgroup(controller, path1, path2)) {
3003                 // get uid, gid, from '/tasks' file and make up a mode
3004                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3005                 k = cgfs_get_key(controller, cgroup, "tasks");
3006
3007         } else
3008                 k = cgfs_get_key(controller, path1, path2);
3009
3010         if (!k) {
3011                 ret = -EINVAL;
3012                 goto out;
3013         }
3014
3015         /*
3016          * This being a fuse request, the uid and gid must be valid
3017          * in the caller's namespace.  So we can just check to make
3018          * sure that the caller is root in his uid, and privileged
3019          * over the file's current owner.
3020          */
3021         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3022                 ret = -EPERM;
3023                 goto out;
3024         }
3025
3026         if (!cgfs_chmod_file(controller, cgroup, mode)) {
3027                 ret = -EINVAL;
3028                 goto out;
3029         }
3030
3031         ret = 0;
3032 out:
3033         free_key(k);
3034         free(cgdir);
3035         return ret;
3036 }
3037
3038 int cg_mkdir(const char *path, mode_t mode)
3039 {
3040         struct fuse_context *fc = fuse_get_context();
3041         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3042         const char *cgroup;
3043         int ret;
3044
3045         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3046                 return -EIO;
3047
3048         controller = pick_controller_from_path(fc, path);
3049         if (!controller)
3050                 return errno == ENOENT ? -EPERM : -errno;
3051
3052         cgroup = find_cgroup_in_path(path);
3053         if (!cgroup)
3054                 return -errno;
3055
3056         get_cgdir_and_path(cgroup, &cgdir, &last);
3057         if (!last)
3058                 path1 = "/";
3059         else
3060                 path1 = cgdir;
3061
3062         pid_t initpid = lookup_initpid_in_store(fc->pid);
3063         if (initpid <= 1 || is_shared_pidns(initpid))
3064                 initpid = fc->pid;
3065         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3066                 if (!next)
3067                         ret = -EINVAL;
3068                 else if (last && strcmp(next, last) == 0)
3069                         ret = -EEXIST;
3070                 else
3071                         ret = -EPERM;
3072                 goto out;
3073         }
3074
3075         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3076                 ret = -EACCES;
3077                 goto out;
3078         }
3079         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3080                 ret = -EACCES;
3081                 goto out;
3082         }
3083
3084         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3085
3086 out:
3087         free(cgdir);
3088         free(next);
3089         return ret;
3090 }
3091
3092 int cg_rmdir(const char *path)
3093 {
3094         struct fuse_context *fc = fuse_get_context();
3095         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3096         const char *cgroup;
3097         int ret;
3098
3099         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3100                 return -EIO;
3101
3102         controller = pick_controller_from_path(fc, path);
3103         if (!controller) /* Someone's trying to delete "/cgroup". */
3104                 return -EPERM;
3105
3106         cgroup = find_cgroup_in_path(path);
3107         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3108                 return -EPERM;
3109
3110         get_cgdir_and_path(cgroup, &cgdir, &last);
3111         if (!last) {
3112                 /* Someone's trying to delete a cgroup on the same level as the
3113                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3114                  * rmdir "/cgroup/blkio/init.slice".
3115                  */
3116                 ret = -EPERM;
3117                 goto out;
3118         }
3119
3120         pid_t initpid = lookup_initpid_in_store(fc->pid);
3121         if (initpid <= 1 || is_shared_pidns(initpid))
3122                 initpid = fc->pid;
3123         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3124                 if (!last || (next && (strcmp(next, last) == 0)))
3125                         ret = -EBUSY;
3126                 else
3127                         ret = -ENOENT;
3128                 goto out;
3129         }
3130
3131         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3132                 ret = -EACCES;
3133                 goto out;
3134         }
3135         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3136                 ret = -EACCES;
3137                 goto out;
3138         }
3139
3140         if (!cgfs_remove(controller, cgroup)) {
3141                 ret = -EINVAL;
3142                 goto out;
3143         }
3144
3145         ret = 0;
3146
3147 out:
3148         free(cgdir);
3149         free(next);
3150         return ret;
3151 }
3152
3153 static bool startswith(const char *line, const char *pref)
3154 {
3155         if (strncmp(line, pref, strlen(pref)) == 0)
3156                 return true;
3157         return false;
3158 }
3159
3160 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
3161 static void parse_memstat(int version,
3162                           char *memstat,
3163                           unsigned long *cached,
3164                           unsigned long *active_anon,
3165                           unsigned long *inactive_anon,
3166                           unsigned long *active_file,
3167                           unsigned long *inactive_file,
3168                           unsigned long *unevictable,
3169                           unsigned long *shmem)
3170 {
3171         char *eol;
3172
3173         while (*memstat) {
3174                 if (startswith(memstat, is_unified_controller(version)
3175                                             ? "cache"
3176                                             : "total_cache")) {
3177                         sscanf(memstat + 11, "%lu", cached);
3178                         *cached /= 1024;
3179                 } else if (startswith(memstat, is_unified_controller(version)
3180                                                    ? "active_anon"
3181                                                    : "total_active_anon")) {
3182                         sscanf(memstat + 17, "%lu", active_anon);
3183                         *active_anon /= 1024;
3184                 } else if (startswith(memstat, is_unified_controller(version)
3185                                                    ? "inactive_anon"
3186                                                    : "total_inactive_anon")) {
3187                         sscanf(memstat + 19, "%lu", inactive_anon);
3188                         *inactive_anon /= 1024;
3189                 } else if (startswith(memstat, is_unified_controller(version)
3190                                                    ? "active_file"
3191                                                    : "total_active_file")) {
3192                         sscanf(memstat + 17, "%lu", active_file);
3193                         *active_file /= 1024;
3194                 } else if (startswith(memstat, is_unified_controller(version)
3195                                                    ? "inactive_file"
3196                                                    : "total_inactive_file")) {
3197                         sscanf(memstat + 19, "%lu", inactive_file);
3198                         *inactive_file /= 1024;
3199                 } else if (startswith(memstat, is_unified_controller(version)
3200                                                    ? "unevictable"
3201                                                    : "total_unevictable")) {
3202                         sscanf(memstat + 17, "%lu", unevictable);
3203                         *unevictable /= 1024;
3204                 } else if (startswith(memstat, is_unified_controller(version)
3205                                                    ? "shmem"
3206                                                    : "total_shmem")) {
3207                         sscanf(memstat + 11, "%lu", shmem);
3208                         *shmem /= 1024;
3209                 }
3210                 eol = strchr(memstat, '\n');
3211                 if (!eol)
3212                         return;
3213                 memstat = eol+1;
3214         }
3215 }
3216
3217 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3218 {
3219         char *eol;
3220         char key[32];
3221
3222         memset(key, 0, 32);
3223         snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3224
3225         size_t len = strlen(key);
3226         *v = 0;
3227
3228         while (*str) {
3229                 if (startswith(str, key)) {
3230                         sscanf(str + len, "%lu", v);
3231                         return;
3232                 }
3233                 eol = strchr(str, '\n');
3234                 if (!eol)
3235                         return;
3236                 str = eol+1;
3237         }
3238 }
3239
3240 int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
3241 {
3242         __do_free char *line = NULL;
3243         __do_fclose FILE *f = NULL;
3244         size_t linelen = 0, total_len = 0;
3245         char *cache = d->buf;
3246         size_t cache_size = d->buflen;
3247
3248         f = fopen(path, "r");
3249         if (!f)
3250                 return 0;
3251
3252         while (getline(&line, &linelen, f) != -1) {
3253                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3254                 if (l < 0) {
3255                         perror("Error writing to cache");
3256                         return 0;
3257                 }
3258                 if (l >= cache_size) {
3259                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3260                         return 0;
3261                 }
3262                 cache += l;
3263                 cache_size -= l;
3264                 total_len += l;
3265         }
3266
3267         d->size = total_len;
3268         if (total_len > size)
3269                 total_len = size;
3270
3271         /* read from off 0 */
3272         memcpy(buf, d->buf, total_len);
3273
3274         if (d->size > total_len)
3275                 d->cached = d->size - total_len;
3276         return total_len;
3277 }
3278
3279 /*
3280  * FUSE ops for /proc
3281  */
3282
3283 static unsigned long get_memlimit(const char *cgroup, bool swap)
3284 {
3285         int ret;
3286         __do_free char *memlimit_str = NULL;
3287         unsigned long memlimit = -1;
3288
3289         if (swap)
3290                 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3291         else
3292                 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3293         if (ret > 0)
3294                 memlimit = strtoul(memlimit_str, NULL, 10);
3295
3296         return memlimit;
3297 }
3298
3299 static unsigned long get_min_memlimit(const char *cgroup, bool swap)
3300 {
3301         __do_free char *copy = NULL;
3302         unsigned long memlimit = 0;
3303         unsigned long retlimit;
3304
3305         copy = strdup(cgroup);
3306         retlimit = get_memlimit(copy, swap);
3307
3308         while (strcmp(copy, "/") != 0) {
3309                 char *it = copy;
3310
3311                 it = dirname(it);
3312                 memlimit = get_memlimit(it, swap);
3313                 if (memlimit != -1 && memlimit < retlimit)
3314                         retlimit = memlimit;
3315         };
3316
3317         return retlimit;
3318 }
3319
3320 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3321                              struct fuse_file_info *fi)
3322 {
3323         __do_free char *cgroup = NULL, *line = NULL,
3324                        *memusage_str = NULL, *memstat_str = NULL,
3325                        *memswlimit_str = NULL, *memswusage_str = NULL;
3326         __do_fclose FILE *f = NULL;
3327         struct fuse_context *fc = fuse_get_context();
3328         struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
3329         struct file_info *d = (struct file_info *)fi->fh;
3330         unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
3331                       memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
3332                       inactive_anon = 0, active_file = 0, inactive_file = 0,
3333                       unevictable = 0, shmem = 0, hostswtotal = 0;
3334         size_t linelen = 0, total_len = 0;
3335         char *cache = d->buf;
3336         size_t cache_size = d->buflen;
3337         int ret;
3338
3339         if (offset) {
3340                 int left;
3341
3342                 if (offset > d->size)
3343                         return -EINVAL;
3344
3345                 if (!d->cached)
3346                         return 0;
3347
3348                 left = d->size - offset;
3349                 total_len = left > size ? size : left;
3350                 memcpy(buf, cache + offset, total_len);
3351
3352                 return total_len;
3353         }
3354
3355         pid_t initpid = lookup_initpid_in_store(fc->pid);
3356         if (initpid <= 1 || is_shared_pidns(initpid))
3357                 initpid = fc->pid;
3358
3359         cgroup = get_pid_cgroup(initpid, "memory");
3360         if (!cgroup)
3361                 return read_file_fuse("/proc/meminfo", buf, size, d);
3362
3363         prune_init_slice(cgroup);
3364
3365         memlimit = get_min_memlimit(cgroup, false);
3366
3367         ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
3368         if (ret < 0)
3369                 return 0;
3370
3371         ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
3372         if (ret < 0)
3373                 return 0;
3374         parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
3375                       &active_file, &inactive_file, &unevictable, &shmem);
3376
3377         /*
3378          * Following values are allowed to fail, because swapaccount might be
3379          * turned off for current kernel.
3380          */
3381         ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
3382         if (ret >= 0)
3383                 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
3384         if (ret >= 0) {
3385                 memswlimit = get_min_memlimit(cgroup, true);
3386                 memswusage = strtoul(memswusage_str, NULL, 10);
3387                 memswlimit = memswlimit / 1024;
3388                 memswusage = memswusage / 1024;
3389         }
3390
3391         memusage = strtoul(memusage_str, NULL, 10);
3392         memlimit /= 1024;
3393         memusage /= 1024;
3394
3395         f = fopen("/proc/meminfo", "r");
3396         if (!f)
3397                 return 0;
3398
3399         while (getline(&line, &linelen, f) != -1) {
3400                 ssize_t l;
3401                 char *printme, lbuf[100];
3402
3403                 memset(lbuf, 0, 100);
3404                 if (startswith(line, "MemTotal:")) {
3405                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3406                         if (hosttotal < memlimit)
3407                                 memlimit = hosttotal;
3408                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3409                         printme = lbuf;
3410                 } else if (startswith(line, "MemFree:")) {
3411                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3412                         printme = lbuf;
3413                 } else if (startswith(line, "MemAvailable:")) {
3414                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3415                         printme = lbuf;
3416                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
3417                            opts && opts->swap_off == false) {
3418                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3419                         if (hostswtotal < memswlimit)
3420                                 memswlimit = hostswtotal;
3421                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3422                         printme = lbuf;
3423                 } else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
3424                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", 0UL);
3425                         printme = lbuf;
3426                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
3427                            memswusage > 0 && opts && opts->swap_off == false) {
3428                         unsigned long swaptotal = memswlimit,
3429                                       swapusage = memusage > memswusage
3430                                                       ? 0
3431                                                       : memswusage - memusage,
3432                                       swapfree = swapusage < swaptotal
3433                                                      ? swaptotal - swapusage
3434                                                      : 0;
3435                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3436                         printme = lbuf;
3437                 } else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
3438                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", 0UL);
3439                         printme = lbuf;
3440                 } else if (startswith(line, "Slab:")) {
3441                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3442                         printme = lbuf;
3443                 } else if (startswith(line, "Buffers:")) {
3444                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3445                         printme = lbuf;
3446                 } else if (startswith(line, "Cached:")) {
3447                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3448                         printme = lbuf;
3449                 } else if (startswith(line, "SwapCached:")) {
3450                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3451                         printme = lbuf;
3452                 } else if (startswith(line, "Active:")) {
3453                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3454                                         active_anon + active_file);
3455                         printme = lbuf;
3456                 } else if (startswith(line, "Inactive:")) {
3457                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3458                                         inactive_anon + inactive_file);
3459                         printme = lbuf;
3460                 } else if (startswith(line, "Active(anon)")) {
3461                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3462                         printme = lbuf;
3463                 } else if (startswith(line, "Inactive(anon)")) {
3464                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3465                         printme = lbuf;
3466                 } else if (startswith(line, "Active(file)")) {
3467                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3468                         printme = lbuf;
3469                 } else if (startswith(line, "Inactive(file)")) {
3470                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3471                         printme = lbuf;
3472                 } else if (startswith(line, "Unevictable")) {
3473                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3474                         printme = lbuf;
3475                 } else if (startswith(line, "SReclaimable")) {
3476                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3477                         printme = lbuf;
3478                 } else if (startswith(line, "SUnreclaim")) {
3479                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3480                         printme = lbuf;
3481                 } else if (startswith(line, "Shmem:")) {
3482                         snprintf(lbuf, 100, "Shmem:          %8lu kB\n", shmem);
3483                         printme = lbuf;
3484                 } else if (startswith(line, "ShmemHugePages")) {
3485                         snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3486                         printme = lbuf;
3487                 } else if (startswith(line, "ShmemPmdMapped")) {
3488                         snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3489                         printme = lbuf;
3490                 } else
3491                         printme = line;
3492
3493                 l = snprintf(cache, cache_size, "%s", printme);
3494                 if (l < 0) {
3495                         perror("Error writing to cache");
3496                         return 0;
3497
3498                 }
3499                 if (l >= cache_size) {
3500                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3501                         return 0;
3502                 }
3503
3504                 cache += l;
3505                 cache_size -= l;
3506                 total_len += l;
3507         }
3508
3509         d->cached = 1;
3510         d->size = total_len;
3511         if (total_len > size ) total_len = size;
3512         memcpy(buf, d->buf, total_len);
3513
3514         return total_len;
3515 }
3516
3517 /*
3518  * Read the cpuset.cpus for cg
3519  * Return the answer in a newly allocated string which must be freed
3520  */
3521 char *get_cpuset(const char *cg)
3522 {
3523         char *value = NULL;
3524         int ret;
3525
3526         ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3527         if (ret < 0)
3528                 return NULL;
3529
3530         return value;
3531 }
3532
3533 bool cpu_in_cpuset(int cpu, const char *cpuset);
3534
3535 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3536 {
3537         int cpu;
3538
3539         if (sscanf(line, "processor       : %d", &cpu) != 1)
3540                 return false;
3541         return cpu_in_cpuset(cpu, cpuset);
3542 }
3543
3544 /*
3545  * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3546  * depending on `param`. Parameter value is returned throuh `value`.
3547  */
3548 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3549 {
3550         __do_free char *str = NULL;
3551         char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
3552
3553         snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
3554
3555         if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
3556                 return false;
3557
3558         if (sscanf(str, "%ld", value) != 1)
3559                 return false;
3560
3561         return true;
3562 }
3563
3564 /*
3565  * Return the maximum number of visible CPUs based on CPU quotas.
3566  * If there is no quota set, zero is returned.
3567  */
3568 int max_cpu_count(const char *cg)
3569 {
3570         int rv, nprocs;
3571         int64_t cfs_quota, cfs_period;
3572         int nr_cpus_in_cpuset = 0;
3573         char *cpuset = NULL;
3574
3575         if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3576                 return 0;
3577
3578         if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3579                 return 0;
3580
3581         cpuset = get_cpuset(cg);
3582         if (cpuset)
3583                 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3584
3585         if (cfs_quota <= 0 || cfs_period <= 0){
3586                 if (nr_cpus_in_cpuset > 0)
3587                         return nr_cpus_in_cpuset;
3588
3589                 return 0;
3590         }
3591
3592         rv = cfs_quota / cfs_period;
3593
3594         /* In case quota/period does not yield a whole number, add one CPU for
3595          * the remainder.
3596          */
3597         if ((cfs_quota % cfs_period) > 0)
3598                 rv += 1;
3599
3600         nprocs = get_nprocs();
3601
3602         if (rv > nprocs)
3603                 rv = nprocs;
3604
3605         /* use min value in cpu quota and cpuset */
3606         if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3607                 rv = nr_cpus_in_cpuset;
3608
3609         return rv;
3610 }
3611
3612 /*
3613  * Return the exact number of visible CPUs based on CPU quotas.
3614  * If there is no quota set, zero is returned.
3615  */
3616 static double exact_cpu_count(const char *cg)
3617 {
3618         double rv;
3619         int nprocs;
3620         int64_t cfs_quota, cfs_period;
3621
3622         if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3623                 return 0;
3624
3625         if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3626                 return 0;
3627
3628         if (cfs_quota <= 0 || cfs_period <= 0)
3629                 return 0;
3630
3631         rv = (double)cfs_quota / (double)cfs_period;
3632
3633         nprocs = get_nprocs();
3634
3635         if (rv > nprocs)
3636                 rv = nprocs;
3637
3638         return rv;
3639 }
3640
3641 /*
3642  * check whether this is a '^processor" line in /proc/cpuinfo
3643  */
3644 static bool is_processor_line(const char *line)
3645 {
3646         int cpu;
3647
3648         if (sscanf(line, "processor       : %d", &cpu) == 1)
3649                 return true;
3650         return false;
3651 }
3652
3653 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3654                              struct fuse_file_info *fi)
3655 {
3656         __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
3657         __do_fclose FILE *f = NULL;
3658         struct fuse_context *fc = fuse_get_context();
3659         struct file_info *d = (struct file_info *)fi->fh;
3660         size_t linelen = 0, total_len = 0;
3661         bool am_printing = false, firstline = true, is_s390x = false;
3662         int curcpu = -1, cpu, max_cpus = 0;
3663         bool use_view;
3664         char *cache = d->buf;
3665         size_t cache_size = d->buflen;
3666
3667         if (offset){
3668                 int left;
3669
3670                 if (offset > d->size)
3671                         return -EINVAL;
3672
3673                 if (!d->cached)
3674                         return 0;
3675
3676                 left = d->size - offset;
3677                 total_len = left > size ? size: left;
3678                 memcpy(buf, cache + offset, total_len);
3679
3680                 return total_len;
3681         }
3682
3683         pid_t initpid = lookup_initpid_in_store(fc->pid);
3684         if (initpid <= 1 || is_shared_pidns(initpid))
3685                 initpid = fc->pid;
3686         cg = get_pid_cgroup(initpid, "cpuset");
3687         if (!cg)
3688                 return read_file_fuse("proc/cpuinfo", buf, size, d);
3689         prune_init_slice(cg);
3690
3691         cpuset = get_cpuset(cg);
3692         if (!cpuset)
3693                 return 0;
3694
3695         use_view = cgroup_ops->can_use_cpuview(cgroup_ops);
3696         if (use_view)
3697                 max_cpus = max_cpu_count(cg);
3698
3699         f = fopen("/proc/cpuinfo", "r");
3700         if (!f)
3701                 return 0;
3702
3703         while (getline(&line, &linelen, f) != -1) {
3704                 ssize_t l;
3705                 if (firstline) {
3706                         firstline = false;
3707                         if (strstr(line, "IBM/S390") != NULL) {
3708                                 is_s390x = true;
3709                                 am_printing = true;
3710                                 continue;
3711                         }
3712                 }
3713                 if (strncmp(line, "# processors:", 12) == 0)
3714                         continue;
3715                 if (is_processor_line(line)) {
3716                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3717                                 break;
3718                         am_printing = cpuline_in_cpuset(line, cpuset);
3719                         if (am_printing) {
3720                                 curcpu ++;
3721                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3722                                 if (l < 0) {
3723                                         perror("Error writing to cache");
3724                                         return 0;
3725                                 }
3726                                 if (l >= cache_size) {
3727                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3728                                         return 0;
3729                                 }
3730                                 cache += l;
3731                                 cache_size -= l;
3732                                 total_len += l;
3733                         }
3734                         continue;
3735                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3736                         char *p;
3737                         if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3738                                 break;
3739                         if (!cpu_in_cpuset(cpu, cpuset))
3740                                 continue;
3741                         curcpu ++;
3742                         p = strchr(line, ':');
3743                         if (!p || !*p)
3744                                 return 0;
3745                         p++;
3746                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3747                         if (l < 0) {
3748                                 perror("Error writing to cache");
3749                                 return 0;
3750                         }
3751                         if (l >= cache_size) {
3752                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3753                                 return 0;
3754                         }
3755                         cache += l;
3756                         cache_size -= l;
3757                         total_len += l;
3758                         continue;
3759
3760                 }
3761                 if (am_printing) {
3762                         l = snprintf(cache, cache_size, "%s", line);
3763                         if (l < 0) {
3764                                 perror("Error writing to cache");
3765                                 return 0;
3766                         }
3767                         if (l >= cache_size) {
3768                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3769                                 return 0;
3770                         }
3771                         cache += l;
3772                         cache_size -= l;
3773                         total_len += l;
3774                 }
3775         }
3776
3777         if (is_s390x) {
3778                 __do_free char *origcache = d->buf;
3779                 ssize_t l;
3780
3781                 d->buf = malloc(d->buflen);
3782                 if (!d->buf) {
3783                         d->buf = move_ptr(origcache);
3784                         return 0;
3785                 }
3786
3787                 cache = d->buf;
3788                 cache_size = d->buflen;
3789                 total_len = 0;
3790                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3791                 if (l < 0 || l >= cache_size)
3792                         return 0;
3793
3794                 cache_size -= l;
3795                 cache += l;
3796                 total_len += l;
3797                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3798                 if (l < 0 || l >= cache_size)
3799                         return 0;
3800
3801                 cache_size -= l;
3802                 cache += l;
3803                 total_len += l;
3804                 l = snprintf(cache, cache_size, "%s", origcache);
3805                 if (l < 0 || l >= cache_size)
3806                         return 0;
3807                 total_len += l;
3808         }
3809
3810         d->cached = 1;
3811         d->size = total_len;
3812         if (total_len > size ) total_len = size;
3813
3814         /* read from off 0 */
3815         memcpy(buf, d->buf, total_len);
3816         return total_len;
3817 }
3818
3819 static uint64_t get_reaper_start_time(pid_t pid)
3820 {
3821         int ret;
3822         FILE *f;
3823         uint64_t starttime;
3824         /* strlen("/proc/") = 6
3825          * +
3826          * LXCFS_NUMSTRLEN64
3827          * +
3828          * strlen("/stat") = 5
3829          * +
3830          * \0 = 1
3831          * */
3832 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3833         char path[__PROC_PID_STAT_LEN];
3834         pid_t qpid;
3835
3836         qpid = lookup_initpid_in_store(pid);
3837         if (qpid <= 0) {
3838                 /* Caller can check for EINVAL on 0. */
3839                 errno = EINVAL;
3840                 return 0;
3841         }
3842
3843         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3844         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3845                 /* Caller can check for EINVAL on 0. */
3846                 errno = EINVAL;
3847                 return 0;
3848         }
3849
3850         f = fopen(path, "r");
3851         if (!f) {
3852                 /* Caller can check for EINVAL on 0. */
3853                 errno = EINVAL;
3854                 return 0;
3855         }
3856
3857         /* Note that the *scanf() argument supression requires that length
3858          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3859          * at us. It's like telling someone you're not married and then asking
3860          * if you can bring your wife to the party.
3861          */
3862         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3863                         "%*s "      /* (2)  comm        %s   */
3864                         "%*c "      /* (3)  state       %c   */
3865                         "%*d "      /* (4)  ppid        %d   */
3866                         "%*d "      /* (5)  pgrp        %d   */
3867                         "%*d "      /* (6)  session     %d   */
3868                         "%*d "      /* (7)  tty_nr      %d   */
3869                         "%*d "      /* (8)  tpgid       %d   */
3870                         "%*u "      /* (9)  flags       %u   */
3871                         "%*u "      /* (10) minflt      %lu  */
3872                         "%*u "      /* (11) cminflt     %lu  */
3873                         "%*u "      /* (12) majflt      %lu  */
3874                         "%*u "      /* (13) cmajflt     %lu  */
3875                         "%*u "      /* (14) utime       %lu  */
3876                         "%*u "      /* (15) stime       %lu  */
3877                         "%*d "      /* (16) cutime      %ld  */
3878                         "%*d "      /* (17) cstime      %ld  */
3879                         "%*d "      /* (18) priority    %ld  */
3880                         "%*d "      /* (19) nice        %ld  */
3881                         "%*d "      /* (20) num_threads %ld  */
3882                         "%*d "      /* (21) itrealvalue %ld  */
3883                         "%" PRIu64, /* (22) starttime   %llu */
3884                      &starttime);
3885         if (ret != 1) {
3886                 fclose(f);
3887                 /* Caller can check for EINVAL on 0. */
3888                 errno = EINVAL;
3889                 return 0;
3890         }
3891
3892         fclose(f);
3893
3894         errno = 0;
3895         return starttime;
3896 }
3897
3898 static double get_reaper_start_time_in_sec(pid_t pid)
3899 {
3900         uint64_t clockticks, ticks_per_sec;
3901         int64_t ret;
3902         double res = 0;
3903
3904         clockticks = get_reaper_start_time(pid);
3905         if (clockticks == 0 && errno == EINVAL) {
3906                 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3907                 return 0;
3908         }
3909
3910         ret = sysconf(_SC_CLK_TCK);
3911         if (ret < 0 && errno == EINVAL) {
3912                 lxcfs_debug(
3913                     "%s\n",
3914                     "failed to determine number of clock ticks in a second");
3915                 return 0;
3916         }
3917
3918         ticks_per_sec = (uint64_t)ret;
3919         res = (double)clockticks / ticks_per_sec;
3920         return res;
3921 }
3922
3923 static double get_reaper_age(pid_t pid)
3924 {
3925         uint64_t uptime_ms;
3926         double procstart, procage;
3927
3928         /* We need to substract the time the process has started since system
3929          * boot minus the time when the system has started to get the actual
3930          * reaper age.
3931          */
3932         procstart = get_reaper_start_time_in_sec(pid);
3933         procage = procstart;
3934         if (procstart > 0) {
3935                 int ret;
3936                 struct timespec spec;
3937
3938                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3939                 if (ret < 0)
3940                         return 0;
3941
3942                 /* We could make this more precise here by using the tv_nsec
3943                  * field in the timespec struct and convert it to milliseconds
3944                  * and then create a double for the seconds and milliseconds but
3945                  * that seems more work than it is worth.
3946                  */
3947                 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
3948                 procage = (uptime_ms - (procstart * 1000)) / 1000;
3949         }
3950
3951         return procage;
3952 }
3953
3954 /*
3955  * Returns 0 on success.
3956  * It is the caller's responsibility to free `return_usage`, unless this
3957  * function returns an error.
3958  */
3959 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
3960 {
3961         __do_free char *usage_str = NULL;
3962         __do_free struct cpuacct_usage *cpu_usage = NULL;
3963         int cpucount = get_nprocs_conf();
3964         int read_pos = 0, read_cnt=0;
3965         int i, j, ret;
3966         int cg_cpu;
3967         uint64_t cg_user, cg_system;
3968         int64_t ticks_per_sec;
3969
3970         ticks_per_sec = sysconf(_SC_CLK_TCK);
3971
3972         if (ticks_per_sec < 0 && errno == EINVAL) {
3973                 lxcfs_v(
3974                         "%s\n",
3975                         "read_cpuacct_usage_all failed to determine number of clock ticks "
3976                         "in a second");
3977                 return -1;
3978         }
3979
3980         cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3981         if (!cpu_usage)
3982                 return -ENOMEM;
3983
3984         memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
3985         if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3986                 char *data = NULL;
3987                 int i = 0, read_pos = 0, read_cnt=0;
3988                 size_t sz = 0, asz = 0;
3989
3990                 /* read cpuacct.usage_percpu instead. */
3991                 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
3992                 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
3993                         return -1;
3994                 lxcfs_v("usage_str: %s\n", usage_str);
3995
3996                 /* convert cpuacct.usage_percpu into cpuacct.usage_all. */
3997                 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
3998
3999                 must_strcat(&data, &sz, &asz, "cpu user system\n");
4000
4001                 while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
4002                         lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4003                         must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
4004                         i++;
4005                         read_pos += read_cnt;
4006                 }
4007
4008                 usage_str = data;
4009
4010                 lxcfs_v("usage_str: %s\n", usage_str);
4011         }
4012
4013         if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
4014                 lxcfs_error("read_cpuacct_usage_all reading first line from "
4015                                 "%s/cpuacct.usage_all failed.\n", cg);
4016                 return -1;
4017         }
4018
4019         read_pos += read_cnt;
4020
4021         for (i = 0, j = 0; i < cpucount; i++) {
4022                 ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
4023                                 &cg_system, &read_cnt);
4024
4025                 if (ret == EOF)
4026                         break;
4027
4028                 if (ret != 3) {
4029                         lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4030                                         "failed.\n", cg);
4031                         return -1;
4032                 }
4033
4034                 read_pos += read_cnt;
4035
4036                 /* Convert the time from nanoseconds to USER_HZ */
4037                 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4038                 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4039                 j++;
4040         }
4041
4042         *return_usage = move_ptr(cpu_usage);
4043         *size = cpucount;
4044         return 0;
4045 }
4046
4047 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4048 {
4049         int i;
4050         unsigned long sum = 0;
4051
4052         for (i = 0; i < cpu_count; i++) {
4053                 if (!newer[i].online)
4054                         continue;
4055
4056                 /* When cpuset is changed on the fly, the CPUs might get reordered.
4057                  * We could either reset all counters, or check that the substractions
4058                  * below will return expected results.
4059                  */
4060                 if (newer[i].user > older[i].user)
4061                         diff[i].user = newer[i].user - older[i].user;
4062                 else
4063                         diff[i].user = 0;
4064
4065                 if (newer[i].system > older[i].system)
4066                         diff[i].system = newer[i].system - older[i].system;
4067                 else
4068                         diff[i].system = 0;
4069
4070                 if (newer[i].idle > older[i].idle)
4071                         diff[i].idle = newer[i].idle - older[i].idle;
4072                 else
4073                         diff[i].idle = 0;
4074
4075                 sum += diff[i].user;
4076                 sum += diff[i].system;
4077                 sum += diff[i].idle;
4078         }
4079
4080         return sum;
4081 }
4082
4083 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4084 {
4085         unsigned long free_space, to_add;
4086
4087         free_space = threshold - usage->user - usage->system;
4088
4089         if (free_space > usage->idle)
4090                 free_space = usage->idle;
4091
4092         to_add = free_space > *surplus ? *surplus : free_space;
4093
4094         *counter += to_add;
4095         usage->idle -= to_add;
4096         *surplus -= to_add;
4097 }
4098
4099 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4100 {
4101         struct cg_proc_stat *first = NULL, *prev, *tmp;
4102
4103         for (prev = NULL; node; ) {
4104                 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4105                         tmp = node;
4106                         lxcfs_debug("Removing stat node for %s\n", node->cg);
4107
4108                         if (prev)
4109                                 prev->next = node->next;
4110                         else
4111                                 first = node->next;
4112
4113                         node = node->next;
4114                         free_proc_stat_node(tmp);
4115                 } else {
4116                         if (!first)
4117                                 first = node;
4118                         prev = node;
4119                         node = node->next;
4120                 }
4121         }
4122
4123         return first;
4124 }
4125
4126 #define PROC_STAT_PRUNE_INTERVAL 10
4127 static void prune_proc_stat_history(void)
4128 {
4129         int i;
4130         time_t now = time(NULL);
4131
4132         for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4133                 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4134
4135                 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4136                         pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4137                         return;
4138                 }
4139
4140                 if (proc_stat_history[i]->next) {
4141                         proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4142                         proc_stat_history[i]->lastcheck = now;
4143                 }
4144
4145                 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4146         }
4147 }
4148
4149 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4150 {
4151         struct cg_proc_stat *node;
4152
4153         pthread_rwlock_rdlock(&head->lock);
4154
4155         if (!head->next) {
4156                 pthread_rwlock_unlock(&head->lock);
4157                 return NULL;
4158         }
4159
4160         node = head->next;
4161
4162         do {
4163                 if (strcmp(cg, node->cg) == 0)
4164                         goto out;
4165         } while ((node = node->next));
4166
4167         node = NULL;
4168
4169 out:
4170         pthread_rwlock_unlock(&head->lock);
4171         prune_proc_stat_history();
4172         return node;
4173 }
4174
4175 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4176 {
4177         struct cg_proc_stat *node;
4178         int i;
4179
4180         node = malloc(sizeof(struct cg_proc_stat));
4181         if (!node)
4182                 goto err;
4183
4184         node->cg = NULL;
4185         node->usage = NULL;
4186         node->view = NULL;
4187
4188         node->cg = malloc(strlen(cg) + 1);
4189         if (!node->cg)
4190                 goto err;
4191
4192         strcpy(node->cg, cg);
4193
4194         node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4195         if (!node->usage)
4196                 goto err;
4197
4198         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4199
4200         node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4201         if (!node->view)
4202                 goto err;
4203
4204         node->cpu_count = cpu_count;
4205         node->next = NULL;
4206
4207         if (pthread_mutex_init(&node->lock, NULL) != 0) {
4208                 lxcfs_error("%s\n", "Failed to initialize node lock");
4209                 goto err;
4210         }
4211
4212         for (i = 0; i < cpu_count; i++) {
4213                 node->view[i].user = 0;
4214                 node->view[i].system = 0;
4215                 node->view[i].idle = 0;
4216         }
4217
4218         return node;
4219
4220 err:
4221         if (node && node->cg)
4222                 free(node->cg);
4223         if (node && node->usage)
4224                 free(node->usage);
4225         if (node && node->view)
4226                 free(node->view);
4227         if (node)
4228                 free(node);
4229
4230         return NULL;
4231 }
4232
4233 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4234 {
4235         int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4236         struct cg_proc_stat_head *head = proc_stat_history[hash];
4237         struct cg_proc_stat *node, *rv = new_node;
4238
4239         pthread_rwlock_wrlock(&head->lock);
4240
4241         if (!head->next) {
4242                 head->next = new_node;
4243                 goto out;
4244         }
4245
4246         node = head->next;
4247
4248         for (;;) {
4249                 if (strcmp(node->cg, new_node->cg) == 0) {
4250                         /* The node is already present, return it */
4251                         free_proc_stat_node(new_node);
4252                         rv = node;
4253                         goto out;
4254                 }
4255
4256                 if (node->next) {
4257                         node = node->next;
4258                         continue;
4259                 }
4260
4261                 node->next = new_node;
4262                 goto out;
4263         }
4264
4265 out:
4266         pthread_rwlock_unlock(&head->lock);
4267         return rv;
4268 }
4269
4270 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4271 {
4272         __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
4273
4274         /* Allocate new memory */
4275         new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4276         if (!new_usage)
4277                 return false;
4278
4279         new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4280         if (!new_view)
4281                 return false;
4282
4283         /* Copy existing data & initialize new elements */
4284         for (int i = 0; i < cpu_count; i++) {
4285                 if (i < node->cpu_count) {
4286                         new_usage[i].user = node->usage[i].user;
4287                         new_usage[i].system = node->usage[i].system;
4288                         new_usage[i].idle = node->usage[i].idle;
4289
4290                         new_view[i].user = node->view[i].user;
4291                         new_view[i].system = node->view[i].system;
4292                         new_view[i].idle = node->view[i].idle;
4293                 } else {
4294                         new_usage[i].user = 0;
4295                         new_usage[i].system = 0;
4296                         new_usage[i].idle = 0;
4297
4298                         new_view[i].user = 0;
4299                         new_view[i].system = 0;
4300                         new_view[i].idle = 0;
4301                 }
4302         }
4303
4304         free(node->usage);
4305         node->usage = move_ptr(new_usage);
4306
4307         free(node->view);
4308         node->view = move_ptr(new_view);
4309         node->cpu_count = cpu_count;
4310
4311         return true;
4312 }
4313
4314 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4315 {
4316         int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4317         struct cg_proc_stat_head *head = proc_stat_history[hash];
4318         struct cg_proc_stat *node;
4319
4320         node = find_proc_stat_node(head, cg);
4321
4322         if (!node) {
4323                 node = new_proc_stat_node(usage, cpu_count, cg);
4324                 if (!node)
4325                         return NULL;
4326
4327                 node = add_proc_stat_node(node);
4328                 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4329         }
4330
4331         pthread_mutex_lock(&node->lock);
4332
4333         /* If additional CPUs on the host have been enabled, CPU usage counter
4334          * arrays have to be expanded */
4335         if (node->cpu_count < cpu_count) {
4336                 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4337                                 node->cpu_count, cpu_count, cg);
4338
4339                 if (!expand_proc_stat_node(node, cpu_count)) {
4340                         pthread_mutex_unlock(&node->lock);
4341                         lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4342                                         node->cpu_count, cpu_count, cg);
4343                         return NULL;
4344                 }
4345         }
4346
4347         return node;
4348 }
4349
4350 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4351 {
4352         int i;
4353
4354         lxcfs_debug("Resetting stat node for %s\n", node->cg);
4355         memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4356
4357         for (i = 0; i < cpu_count; i++) {
4358                 node->view[i].user = 0;
4359                 node->view[i].system = 0;
4360                 node->view[i].idle = 0;
4361         }
4362
4363         node->cpu_count = cpu_count;
4364 }
4365
4366 static int cpuview_proc_stat(const char *cg, const char *cpuset,
4367                              struct cpuacct_usage *cg_cpu_usage,
4368                              int cg_cpu_usage_size, FILE *f, char *buf,
4369                              size_t buf_size)
4370 {
4371         __do_free char *line = NULL;
4372         __do_free struct cpuacct_usage *diff = NULL;
4373         size_t linelen = 0, total_len = 0, l;
4374         int curcpu = -1; /* cpu numbering starts at 0 */
4375         int physcpu, i;
4376         int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4377         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4378                       irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4379         unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4380         unsigned long user_surplus = 0, system_surplus = 0;
4381         unsigned long total_sum, threshold;
4382         struct cg_proc_stat *stat_node;
4383         int nprocs = get_nprocs_conf();
4384
4385         if (cg_cpu_usage_size < nprocs)
4386                 nprocs = cg_cpu_usage_size;
4387
4388         /* Read all CPU stats and stop when we've encountered other lines */
4389         while (getline(&line, &linelen, f) != -1) {
4390                 int ret;
4391                 char cpu_char[10]; /* That's a lot of cores */
4392                 uint64_t all_used, cg_used;
4393
4394                 if (strlen(line) == 0)
4395                         continue;
4396
4397                 /* not a ^cpuN line containing a number N */
4398                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
4399                         break;
4400
4401                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4402                         continue;
4403
4404                 if (physcpu >= cg_cpu_usage_size)
4405                         continue;
4406
4407                 curcpu ++;
4408                 cpu_cnt ++;
4409
4410                 if (!cpu_in_cpuset(physcpu, cpuset)) {
4411                         for (i = curcpu; i <= physcpu; i++)
4412                                 cg_cpu_usage[i].online = false;
4413                         continue;
4414                 }
4415
4416                 if (curcpu < physcpu) {
4417                         /* Some CPUs may be disabled */
4418                         for (i = curcpu; i < physcpu; i++)
4419                                 cg_cpu_usage[i].online = false;
4420
4421                         curcpu = physcpu;
4422                 }
4423
4424                 cg_cpu_usage[curcpu].online = true;
4425
4426                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4427                            &user,
4428                            &nice,
4429                            &system,
4430                            &idle,
4431                            &iowait,
4432                            &irq,
4433                            &softirq,
4434                            &steal,
4435                            &guest,
4436                            &guest_nice);
4437
4438                 if (ret != 10)
4439                         continue;
4440
4441                 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4442                 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4443
4444                 if (all_used >= cg_used) {
4445                         cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4446
4447                 } else {
4448                         lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4449                                         "%lu in cpuacct.usage_all; unable to determine idle time\n",
4450                                         curcpu, cg, all_used, cg_used);
4451                         cg_cpu_usage[curcpu].idle = idle;
4452                 }
4453         }
4454
4455         /* Cannot use more CPUs than is available due to cpuset */
4456         if (max_cpus > cpu_cnt)
4457                 max_cpus = cpu_cnt;
4458
4459         stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4460
4461         if (!stat_node) {
4462                 lxcfs_error("unable to find/create stat node for %s\n", cg);
4463                 return 0;
4464         }
4465
4466         diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4467         if (!diff) {
4468                 return 0;
4469         }
4470
4471         /*
4472          * If the new values are LOWER than values stored in memory, it means
4473          * the cgroup has been reset/recreated and we should reset too.
4474          */
4475         for (curcpu = 0; curcpu < nprocs; curcpu++) {
4476                 if (!cg_cpu_usage[curcpu].online)
4477                         continue;
4478
4479                 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4480                         reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4481
4482                 break;
4483         }
4484
4485         total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4486
4487         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4488                 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4489
4490                 if (!stat_node->usage[curcpu].online)
4491                         continue;
4492
4493                 i++;
4494
4495                 stat_node->usage[curcpu].user += diff[curcpu].user;
4496                 stat_node->usage[curcpu].system += diff[curcpu].system;
4497                 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4498
4499                 if (max_cpus > 0 && i >= max_cpus) {
4500                         user_surplus += diff[curcpu].user;
4501                         system_surplus += diff[curcpu].system;
4502                 }
4503         }
4504
4505         /* Calculate usage counters of visible CPUs */
4506         if (max_cpus > 0) {
4507                 unsigned long diff_user = 0;
4508                 unsigned long diff_system = 0;
4509                 unsigned long diff_idle = 0;
4510                 unsigned long max_diff_idle = 0;
4511                 unsigned long max_diff_idle_index = 0;
4512                 double exact_cpus;
4513
4514                 /* threshold = maximum usage per cpu, including idle */
4515                 threshold = total_sum / cpu_cnt * max_cpus;
4516
4517                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4518                         if (!stat_node->usage[curcpu].online)
4519                                 continue;
4520
4521                         i++;
4522
4523                         if (i == max_cpus)
4524                                 break;
4525
4526                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4527                                 continue;
4528
4529                         /* Add user */
4530                         add_cpu_usage(&user_surplus, &diff[curcpu],
4531                                       &diff[curcpu].user, threshold);
4532
4533                         if (diff[curcpu].user + diff[curcpu].system >= threshold)
4534                                 continue;
4535
4536                         /* If there is still room, add system */
4537                         add_cpu_usage(&system_surplus, &diff[curcpu],
4538                                       &diff[curcpu].system, threshold);
4539                 }
4540
4541                 if (user_surplus > 0)
4542                         lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4543                 if (system_surplus > 0)
4544                         lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4545
4546                 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4547                         if (!stat_node->usage[curcpu].online)
4548                                 continue;
4549
4550                         i++;
4551
4552                         if (i == max_cpus)
4553                                 break;
4554
4555                         stat_node->view[curcpu].user += diff[curcpu].user;
4556                         stat_node->view[curcpu].system += diff[curcpu].system;
4557                         stat_node->view[curcpu].idle += diff[curcpu].idle;
4558
4559                         user_sum += stat_node->view[curcpu].user;
4560                         system_sum += stat_node->view[curcpu].system;
4561                         idle_sum += stat_node->view[curcpu].idle;
4562
4563                         diff_user += diff[curcpu].user;
4564                         diff_system += diff[curcpu].system;
4565                         diff_idle += diff[curcpu].idle;
4566                         if (diff[curcpu].idle > max_diff_idle) {
4567                                 max_diff_idle = diff[curcpu].idle;
4568                                 max_diff_idle_index = curcpu;
4569                         }
4570
4571                         lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4572                 }
4573                 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4574
4575                 /* revise cpu usage view to support partial cpu case. */
4576                 exact_cpus = exact_cpu_count(cg);
4577                 if (exact_cpus < (double)max_cpus){
4578                         unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4579
4580                         lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4581                         lxcfs_v("delta: %lu\n", delta);
4582                         lxcfs_v("idle_sum before: %lu\n", idle_sum);
4583                         idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4584                         lxcfs_v("idle_sum after: %lu\n", idle_sum);
4585
4586                         curcpu = max_diff_idle_index;
4587                         lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4588                         stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4589                         lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4590                 }
4591         } else {
4592                 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4593                         if (!stat_node->usage[curcpu].online)
4594                                 continue;
4595
4596                         stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4597                         stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4598                         stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4599
4600                         user_sum += stat_node->view[curcpu].user;
4601                         system_sum += stat_node->view[curcpu].system;
4602                         idle_sum += stat_node->view[curcpu].idle;
4603                 }
4604         }
4605
4606         /* Render the file */
4607         /* cpu-all */
4608         l = snprintf(buf, buf_size, "cpu  %lu 0 %lu %lu 0 0 0 0 0 0\n",
4609                         user_sum,
4610                         system_sum,
4611                         idle_sum);
4612         lxcfs_v("cpu-all: %s\n", buf);
4613
4614         if (l < 0) {
4615                 perror("Error writing to cache");
4616                 return 0;
4617         }
4618         if (l >= buf_size) {
4619                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4620                 return 0;
4621         }
4622
4623         buf += l;
4624         buf_size -= l;
4625         total_len += l;
4626
4627         /* Render visible CPUs */
4628         for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4629                 if (!stat_node->usage[curcpu].online)
4630                         continue;
4631
4632                 i++;
4633
4634                 if (max_cpus > 0 && i == max_cpus)
4635                         break;
4636
4637                 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4638                                 i,
4639                                 stat_node->view[curcpu].user,
4640                                 stat_node->view[curcpu].system,
4641                                 stat_node->view[curcpu].idle);
4642                 lxcfs_v("cpu: %s\n", buf);
4643
4644                 if (l < 0) {
4645                         perror("Error writing to cache");
4646                         return 0;
4647
4648                 }
4649                 if (l >= buf_size) {
4650                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4651                         return 0;
4652                 }
4653
4654                 buf += l;
4655                 buf_size -= l;
4656                 total_len += l;
4657         }
4658
4659         /* Pass the rest of /proc/stat, start with the last line read */
4660         l = snprintf(buf, buf_size, "%s", line);
4661
4662         if (l < 0) {
4663                 perror("Error writing to cache");
4664                 return 0;
4665
4666         }
4667         if (l >= buf_size) {
4668                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4669                 return 0;
4670         }
4671
4672         buf += l;
4673         buf_size -= l;
4674         total_len += l;
4675
4676         /* Pass the rest of the host's /proc/stat */
4677         while (getline(&line, &linelen, f) != -1) {
4678                 l = snprintf(buf, buf_size, "%s", line);
4679                 if (l < 0) {
4680                         perror("Error writing to cache");
4681                         return 0;
4682                 }
4683                 if (l >= buf_size) {
4684                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4685                         return 0;
4686                 }
4687                 buf += l;
4688                 buf_size -= l;
4689                 total_len += l;
4690         }
4691
4692         if (stat_node)
4693                 pthread_mutex_unlock(&stat_node->lock);
4694         return total_len;
4695 }
4696
4697 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4698 static int proc_stat_read(char *buf, size_t size, off_t offset,
4699                           struct fuse_file_info *fi)
4700 {
4701         __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
4702         __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
4703         __do_fclose FILE *f = NULL;
4704         struct fuse_context *fc = fuse_get_context();
4705         struct file_info *d = (struct file_info *)fi->fh;
4706         size_t linelen = 0, total_len = 0;
4707         int curcpu = -1; /* cpu numbering starts at 0 */
4708         int physcpu = 0;
4709         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4710                       irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4711         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
4712                       iowait_sum = 0, irq_sum = 0, softirq_sum = 0,
4713                       steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4714         char cpuall[CPUALL_MAX_SIZE];
4715         /* reserve for cpu all */
4716         char *cache = d->buf + CPUALL_MAX_SIZE;
4717         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4718         int cg_cpu_usage_size = 0;
4719
4720         if (offset){
4721                 if (offset > d->size)
4722                         return -EINVAL;
4723                 if (!d->cached)
4724                         return 0;
4725                 int left = d->size - offset;
4726                 total_len = left > size ? size: left;
4727                 memcpy(buf, d->buf + offset, total_len);
4728                 return total_len;
4729         }
4730
4731         pid_t initpid = lookup_initpid_in_store(fc->pid);
4732         lxcfs_v("initpid: %d\n", initpid);
4733         if (initpid <= 0)
4734                 initpid = fc->pid;
4735
4736         /*
4737          * when container run with host pid namespace initpid == 1, cgroup will "/"
4738          * we should return host os's /proc contents.
4739          * in some case cpuacct_usage.all in "/" will larger then /proc/stat
4740          */
4741         if (initpid == 1) {
4742             return read_file_fuse("/proc/stat", buf, size, d);
4743         }
4744
4745         cg = get_pid_cgroup(initpid, "cpuset");
4746         lxcfs_v("cg: %s\n", cg);
4747         if (!cg)
4748                 return read_file_fuse("/proc/stat", buf, size, d);
4749         prune_init_slice(cg);
4750
4751         cpuset = get_cpuset(cg);
4752         if (!cpuset)
4753                 return 0;
4754
4755         /*
4756          * Read cpuacct.usage_all for all CPUs.
4757          * If the cpuacct cgroup is present, it is used to calculate the container's
4758          * CPU usage. If not, values from the host's /proc/stat are used.
4759          */
4760         if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4761                 lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
4762                                 "falling back to the host's /proc/stat");
4763         }
4764
4765         f = fopen("/proc/stat", "r");
4766         if (!f)
4767                 return 0;
4768
4769         //skip first line
4770         if (getline(&line, &linelen, f) < 0) {
4771                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4772                 return 0;
4773         }
4774
4775         if (cgroup_ops->can_use_cpuview(cgroup_ops) && cg_cpu_usage) {
4776                 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4777                                 f, d->buf, d->buflen);
4778                 goto out;
4779         }
4780
4781         while (getline(&line, &linelen, f) != -1) {
4782                 ssize_t l;
4783                 char cpu_char[10]; /* That's a lot of cores */
4784                 char *c;
4785                 uint64_t all_used, cg_used, new_idle;
4786                 int ret;
4787
4788                 if (strlen(line) == 0)
4789                         continue;
4790                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4791                         /* not a ^cpuN line containing a number N, just print it */
4792                         l = snprintf(cache, cache_size, "%s", line);
4793                         if (l < 0) {
4794                                 perror("Error writing to cache");
4795                                 return 0;
4796                         }
4797                         if (l >= cache_size) {
4798                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4799                                 return 0;
4800                         }
4801                         cache += l;
4802                         cache_size -= l;
4803                         total_len += l;
4804                         continue;
4805                 }
4806
4807                 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4808                         continue;
4809                 if (!cpu_in_cpuset(physcpu, cpuset))
4810                         continue;
4811                 curcpu ++;
4812
4813                 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4814                            &user,
4815                            &nice,
4816                            &system,
4817                            &idle,
4818                            &iowait,
4819                            &irq,
4820                            &softirq,
4821                            &steal,
4822                            &guest,
4823                            &guest_nice);
4824
4825                 if (ret != 10 || !cg_cpu_usage) {
4826                         c = strchr(line, ' ');
4827                         if (!c)
4828                                 continue;
4829                         l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
4830                         if (l < 0) {
4831                                 perror("Error writing to cache");
4832                                 return 0;
4833
4834                         }
4835                         if (l >= cache_size) {
4836                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4837                                 return 0;
4838                         }
4839
4840                         cache += l;
4841                         cache_size -= l;
4842                         total_len += l;
4843
4844                         if (ret != 10)
4845                                 continue;
4846                 }
4847
4848                 if (cg_cpu_usage) {
4849                         if (physcpu >= cg_cpu_usage_size)
4850                                 break;
4851
4852                         all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4853                         cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4854
4855                         if (all_used >= cg_used) {
4856                                 new_idle = idle + (all_used - cg_used);
4857
4858                         } else {
4859                                 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4860                                                 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4861                                                 curcpu, cg, all_used, cg_used);
4862                                 new_idle = idle;
4863                         }
4864
4865                         l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4866                                         curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4867                                         new_idle);
4868
4869                         if (l < 0) {
4870                                 perror("Error writing to cache");
4871                                 return 0;
4872
4873                         }
4874                         if (l >= cache_size) {
4875                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4876                                 return 0;
4877                         }
4878
4879                         cache += l;
4880                         cache_size -= l;
4881                         total_len += l;
4882
4883                         user_sum += cg_cpu_usage[physcpu].user;
4884                         system_sum += cg_cpu_usage[physcpu].system;
4885                         idle_sum += new_idle;
4886
4887                 } else {
4888                         user_sum += user;
4889                         nice_sum += nice;
4890                         system_sum += system;
4891                         idle_sum += idle;
4892                         iowait_sum += iowait;
4893                         irq_sum += irq;
4894                         softirq_sum += softirq;
4895                         steal_sum += steal;
4896                         guest_sum += guest;
4897                         guest_nice_sum += guest_nice;
4898                 }
4899         }
4900
4901         cache = d->buf;
4902
4903         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4904                         user_sum,
4905                         nice_sum,
4906                         system_sum,
4907                         idle_sum,
4908                         iowait_sum,
4909                         irq_sum,
4910                         softirq_sum,
4911                         steal_sum,
4912                         guest_sum,
4913                         guest_nice_sum);
4914         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4915                 memcpy(cache, cpuall, cpuall_len);
4916                 cache += cpuall_len;
4917         } else {
4918                 /* shouldn't happen */
4919                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4920                 cpuall_len = 0;
4921         }
4922
4923         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4924         total_len += cpuall_len;
4925
4926 out:
4927         d->cached = 1;
4928         d->size = total_len;
4929         if (total_len > size)
4930                 total_len = size;
4931
4932         memcpy(buf, d->buf, total_len);
4933         return total_len;
4934 }
4935
4936 /* This function retrieves the busy time of a group of tasks by looking at
4937  * cpuacct.usage. Unfortunately, this only makes sense when the container has
4938  * been given it's own cpuacct cgroup. If not, this function will take the busy
4939  * time of all other taks that do not actually belong to the container into
4940  * account as well. If someone has a clever solution for this please send a
4941  * patch!
4942  */
4943 static double get_reaper_busy(pid_t task)
4944 {
4945         __do_free char *cgroup = NULL, *usage_str = NULL;
4946         unsigned long usage = 0;
4947         pid_t initpid;
4948
4949         initpid = lookup_initpid_in_store(task);
4950         if (initpid <= 0)
4951                 return 0;
4952
4953         cgroup = get_pid_cgroup(initpid, "cpuacct");
4954         if (!cgroup)
4955                 return 0;
4956         prune_init_slice(cgroup);
4957         if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage",
4958                              &usage_str))
4959                 return 0;
4960
4961         usage = strtoul(usage_str, NULL, 10);
4962         return ((double)usage / 1000000000);
4963 }
4964
4965 #if RELOADTEST
4966 void iwashere(void)
4967 {
4968         int fd;
4969
4970         fd = creat("/tmp/lxcfs-iwashere", 0644);
4971         if (fd >= 0)
4972                 close(fd);
4973 }
4974 #endif
4975
4976 /*
4977  * We read /proc/uptime and reuse its second field.
4978  * For the first field, we use the mtime for the reaper for
4979  * the calling pid as returned by getreaperage
4980  */
4981 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4982                 struct fuse_file_info *fi)
4983 {
4984         struct fuse_context *fc = fuse_get_context();
4985         struct file_info *d = (struct file_info *)fi->fh;
4986         double busytime = get_reaper_busy(fc->pid);
4987         char *cache = d->buf;
4988         ssize_t total_len = 0;
4989         double idletime, reaperage;
4990
4991 #if RELOADTEST
4992         iwashere();
4993 #endif
4994
4995         if (offset){
4996                 if (!d->cached)
4997                         return 0;
4998                 if (offset > d->size)
4999                         return -EINVAL;
5000                 int left = d->size - offset;
5001                 total_len = left > size ? size: left;
5002                 memcpy(buf, cache + offset, total_len);
5003                 return total_len;
5004         }
5005
5006         reaperage = get_reaper_age(fc->pid);
5007         /* To understand why this is done, please read the comment to the
5008          * get_reaper_busy() function.
5009          */
5010         idletime = reaperage;
5011         if (reaperage >= busytime)
5012                 idletime = reaperage - busytime;
5013
5014         total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
5015         if (total_len < 0 || total_len >=  d->buflen){
5016                 lxcfs_error("%s\n", "failed to write to cache");
5017                 return 0;
5018         }
5019
5020         d->size = (int)total_len;
5021         d->cached = 1;
5022
5023         if (total_len > size) total_len = size;
5024
5025         memcpy(buf, d->buf, total_len);
5026         return total_len;
5027 }
5028
5029 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5030                                struct fuse_file_info *fi)
5031 {
5032         __do_free char *cg = NULL, *io_serviced_str = NULL,
5033                        *io_merged_str = NULL, *io_service_bytes_str = NULL,
5034                        *io_wait_time_str = NULL, *io_service_time_str = NULL,
5035                        *line = NULL;
5036         __do_fclose FILE *f = NULL;
5037         struct fuse_context *fc = fuse_get_context();
5038         struct file_info *d = (struct file_info *)fi->fh;
5039         unsigned long read = 0, write = 0;
5040         unsigned long read_merged = 0, write_merged = 0;
5041         unsigned long read_sectors = 0, write_sectors = 0;
5042         unsigned long read_ticks = 0, write_ticks = 0;
5043         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5044         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5045         char *cache = d->buf;
5046         size_t cache_size = d->buflen;
5047         size_t linelen = 0, total_len = 0;
5048         unsigned int major = 0, minor = 0;
5049         int i = 0;
5050         int ret;
5051         char dev_name[72];
5052
5053         if (offset){
5054                 int left;
5055
5056                 if (offset > d->size)
5057                         return -EINVAL;
5058
5059                 if (!d->cached)
5060                         return 0;
5061
5062                 left = d->size - offset;
5063                 total_len = left > size ? size: left;
5064                 memcpy(buf, cache + offset, total_len);
5065
5066                 return total_len;
5067         }
5068
5069         pid_t initpid = lookup_initpid_in_store(fc->pid);
5070         if (initpid <= 1 || is_shared_pidns(initpid))
5071                 initpid = fc->pid;
5072         cg = get_pid_cgroup(initpid, "blkio");
5073         if (!cg)
5074                 return read_file_fuse("/proc/diskstats", buf, size, d);
5075         prune_init_slice(cg);
5076
5077         ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
5078         if (ret < 0) {
5079                 if (ret == -EOPNOTSUPP)
5080                         return read_file_fuse("/proc/diskstats", buf, size, d);
5081         }
5082
5083         ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
5084         if (ret < 0) {
5085                 if (ret == -EOPNOTSUPP)
5086                         return read_file_fuse("/proc/diskstats", buf, size, d);
5087         }
5088
5089         ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
5090         if (ret < 0) {
5091                 if (ret == -EOPNOTSUPP)
5092                         return read_file_fuse("/proc/diskstats", buf, size, d);
5093         }
5094
5095         ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
5096         if (ret < 0) {
5097                 if (ret == -EOPNOTSUPP)
5098                         return read_file_fuse("/proc/diskstats", buf, size, d);
5099         }
5100
5101         ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
5102         if (ret < 0) {
5103                 if (ret == -EOPNOTSUPP)
5104                         return read_file_fuse("/proc/diskstats", buf, size, d);
5105         }
5106
5107         f = fopen("/proc/diskstats", "r");
5108         if (!f)
5109                 return 0;
5110
5111         while (getline(&line, &linelen, f) != -1) {
5112                 ssize_t l;
5113                 char lbuf[256];
5114
5115                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5116                 if (i != 3)
5117                         continue;
5118
5119                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5120                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5121                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5122                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5123                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5124                 read_sectors = read_sectors/512;
5125                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5126                 write_sectors = write_sectors/512;
5127
5128                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5129                 rd_svctm = rd_svctm/1000000;
5130                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5131                 rd_wait = rd_wait/1000000;
5132                 read_ticks = rd_svctm + rd_wait;
5133
5134                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5135                 wr_svctm =  wr_svctm/1000000;
5136                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5137                 wr_wait =  wr_wait/1000000;
5138                 write_ticks = wr_svctm + wr_wait;
5139
5140                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5141                 tot_ticks =  tot_ticks/1000000;
5142
5143                 memset(lbuf, 0, 256);
5144                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5145                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5146                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5147                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5148                 else
5149                         continue;
5150
5151                 l = snprintf(cache, cache_size, "%s", lbuf);
5152                 if (l < 0) {
5153                         perror("Error writing to fuse buf");
5154                         return 0;
5155                 }
5156                 if (l >= cache_size) {
5157                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5158                         return 0;
5159                 }
5160                 cache += l;
5161                 cache_size -= l;
5162                 total_len += l;
5163         }
5164
5165         d->cached = 1;
5166         d->size = total_len;
5167         if (total_len > size ) total_len = size;
5168         memcpy(buf, d->buf, total_len);
5169
5170         return total_len;
5171 }
5172
5173 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5174                            struct fuse_file_info *fi)
5175 {
5176         __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5177                        *memswusage_str = NULL;
5178         struct fuse_context *fc = fuse_get_context();
5179         struct file_info *d = (struct file_info *)fi->fh;
5180         unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5181                       memswusage = 0, swap_total = 0, swap_free = 0;
5182         ssize_t total_len = 0;
5183         ssize_t l = 0;
5184         char *cache = d->buf;
5185         int ret;
5186
5187         if (offset) {
5188                 int left;
5189
5190                 if (offset > d->size)
5191                         return -EINVAL;
5192
5193                 if (!d->cached)
5194                         return 0;
5195
5196                 left = d->size - offset;
5197                 total_len = left > size ? size: left;
5198                 memcpy(buf, cache + offset, total_len);
5199
5200                 return total_len;
5201         }
5202
5203         pid_t initpid = lookup_initpid_in_store(fc->pid);
5204         if (initpid <= 1 || is_shared_pidns(initpid))
5205                 initpid = fc->pid;
5206         cg = get_pid_cgroup(initpid, "memory");
5207         if (!cg)
5208                 return read_file_fuse("/proc/swaps", buf, size, d);
5209         prune_init_slice(cg);
5210
5211         memlimit = get_min_memlimit(cg, false);
5212
5213         ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5214         if (ret < 0)
5215                 return 0;
5216
5217         memusage = strtoul(memusage_str, NULL, 10);
5218
5219         ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5220         if (ret >= 0)
5221                 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5222         if (ret >= 0) {
5223                 memswlimit = get_min_memlimit(cg, true);
5224                 memswusage = strtoul(memswusage_str, NULL, 10);
5225                 swap_total = (memswlimit - memlimit) / 1024;
5226                 swap_free = (memswusage - memusage) / 1024;
5227         }
5228
5229         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5230
5231         /* When no mem + swap limit is specified or swapaccount=0*/
5232         if (!memswlimit) {
5233                 __do_free char *line = NULL;
5234                 __do_fclose FILE *f = NULL;
5235                 size_t linelen = 0;
5236
5237                 f = fopen("/proc/meminfo", "r");
5238                 if (!f)
5239                         return 0;
5240
5241                 while (getline(&line, &linelen, f) != -1) {
5242                         if (startswith(line, "SwapTotal:"))
5243                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
5244                         else if (startswith(line, "SwapFree:"))
5245                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
5246                 }
5247         }
5248
5249         if (swap_total > 0) {
5250                 l = snprintf(d->buf + total_len, d->size - total_len,
5251                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5252                                 swap_total, swap_free);
5253                 total_len += l;
5254         }
5255
5256         if (total_len < 0 || l < 0) {
5257                 perror("Error writing to cache");
5258                 return 0;
5259         }
5260
5261         d->cached = 1;
5262         d->size = (int)total_len;
5263
5264         if (total_len > size) total_len = size;
5265         memcpy(buf, d->buf, total_len);
5266         return total_len;
5267 }
5268
5269 /*
5270  * Find the process pid from cgroup path.
5271  * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5272  * @pid_buf : put pid to pid_buf.
5273  * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5274  * @depth : the depth of cgroup in container.
5275  * @sum : return the number of pid.
5276  * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5277  */
5278 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5279 {
5280         __do_free char *path = NULL;
5281         __do_close_prot_errno int fd = -EBADF;
5282         __do_fclose FILE *f = NULL;
5283         __do_closedir DIR *dir = NULL;
5284         struct dirent *file;
5285         size_t linelen = 0;
5286         char *line = NULL;
5287         int pd;
5288         char **pid;
5289
5290         /* path = dpath + "/cgroup.procs" + /0 */
5291         path = malloc(strlen(dpath) + 20);
5292         if (!path)
5293                 return sum;
5294
5295         strcpy(path, dpath);
5296         fd = openat(cfd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
5297         if (fd < 0)
5298                 return sum;
5299
5300         dir = fdopendir(move_fd(fd));
5301         if (!dir)
5302                 return sum;
5303
5304         while (((file = readdir(dir)) != NULL) && depth > 0) {
5305                 if (strcmp(file->d_name, ".") == 0)
5306                         continue;
5307
5308                 if (strcmp(file->d_name, "..") == 0)
5309                         continue;
5310
5311                 if (file->d_type == DT_DIR) {
5312                         __do_free char *path_dir = NULL;
5313
5314                         /* path + '/' + d_name +/0 */
5315                         path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5316                         if (!path_dir)
5317                                 return sum;
5318
5319                         strcpy(path_dir, path);
5320                         strcat(path_dir, "/");
5321                         strcat(path_dir, file->d_name);
5322                         pd = depth - 1;
5323                         sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5324                 }
5325         }
5326
5327         strcat(path, "/cgroup.procs");
5328         fd = openat(cfd, path, O_RDONLY);
5329         if (fd < 0)
5330                 return sum;
5331
5332         f = fdopen(move_fd(fd), "r");
5333         if (!f)
5334                 return sum;
5335
5336         while (getline(&line, &linelen, f) != -1) {
5337                 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5338                 if (!pid)
5339                         return sum;
5340                 *pid_buf = pid;
5341
5342                 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5343                 if (!*(*pid_buf + sum))
5344                         return sum;
5345
5346                 strcpy(*(*pid_buf + sum), line);
5347                 sum++;
5348         }
5349
5350         return sum;
5351 }
5352
5353 /*
5354  * calc_load calculates the load according to the following formula:
5355  * load1 = load0 * exp + active * (1 - exp)
5356  *
5357  * @load1: the new loadavg.
5358  * @load0: the former loadavg.
5359  * @active: the total number of running pid at this moment.
5360  * @exp: the fixed-point defined in the beginning.
5361  */
5362 static unsigned long
5363 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5364 {
5365         unsigned long newload;
5366
5367         active = active > 0 ? active * FIXED_1 : 0;
5368         newload = load * exp + active * (FIXED_1 - exp);
5369         if (active >= load)
5370                 newload += FIXED_1 - 1;
5371
5372         return newload / FIXED_1;
5373 }
5374
5375 /*
5376  * Return 0 means that container p->cg is closed.
5377  * Return -1 means that error occurred in refresh.
5378  * Positive num equals the total number of pid.
5379  */
5380 static int refresh_load(struct load_node *p, char *path)
5381 {
5382         __do_free char *line = NULL;
5383         char **idbuf;
5384         char proc_path[256];
5385         int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5386         size_t linelen = 0;
5387         int sum, length;
5388         struct dirent *file;
5389
5390         idbuf = malloc(sizeof(char *));
5391         if (!idbuf)
5392                 return -1;
5393
5394         sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5395         /*  normal exit  */
5396         if (sum == 0)
5397                 goto out;
5398
5399         for (i = 0; i < sum; i++) {
5400                 __do_closedir DIR *dp = NULL;
5401
5402                 /*clean up '\n' */
5403                 length = strlen(idbuf[i])-1;
5404                 idbuf[i][length] = '\0';
5405                 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5406                 if (ret < 0 || ret > 255) {
5407                         lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5408                         i = sum;
5409                         sum = -1;
5410                         goto err_out;
5411                 }
5412
5413                 dp = opendir(proc_path);
5414                 if (!dp) {
5415                         lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5416                         continue;
5417                 }
5418                 while ((file = readdir(dp)) != NULL) {
5419                         __do_fclose FILE *f = NULL;
5420
5421                         if (strncmp(file->d_name, ".", 1) == 0)
5422                                 continue;
5423                         if (strncmp(file->d_name, "..", 1) == 0)
5424                                 continue;
5425                         total_pid++;
5426                         /* We make the biggest pid become last_pid.*/
5427                         ret = atof(file->d_name);
5428                         last_pid = (ret > last_pid) ? ret : last_pid;
5429
5430                         ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5431                         if (ret < 0 || ret > 255) {
5432                                 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5433                                 i = sum;
5434                                 sum = -1;
5435                                 goto err_out;
5436                         }
5437
5438                         f = fopen(proc_path, "r");
5439                         if (f != NULL) {
5440                                 while (getline(&line, &linelen, f) != -1) {
5441                                         /* Find State */
5442                                         if ((line[0] == 'S') && (line[1] == 't'))
5443                                                 break;
5444                                 }
5445
5446                         if ((line[7] == 'R') || (line[7] == 'D'))
5447                                 run_pid++;
5448                         }
5449                 }
5450         }
5451         /*Calculate the loadavg.*/
5452         p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5453         p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5454         p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5455         p->run_pid = run_pid;
5456         p->total_pid = total_pid;
5457         p->last_pid = last_pid;
5458
5459 err_out:
5460         for (; i > 0; i--)
5461                 free(idbuf[i-1]);
5462 out:
5463         free(idbuf);
5464         return sum;
5465 }
5466
5467 /*
5468  * Traverse the hash table and update it.
5469  */
5470 void *load_begin(void *arg)
5471 {
5472
5473         int i, sum, length, ret;
5474         struct load_node *f;
5475         int first_node;
5476         clock_t time1, time2;
5477
5478         while (1) {
5479                 if (loadavg_stop == 1)
5480                         return NULL;
5481
5482                 time1 = clock();
5483                 for (i = 0; i < LOAD_SIZE; i++) {
5484                         pthread_mutex_lock(&load_hash[i].lock);
5485                         if (load_hash[i].next == NULL) {
5486                                 pthread_mutex_unlock(&load_hash[i].lock);
5487                                 continue;
5488                         }
5489                         f = load_hash[i].next;
5490                         first_node = 1;
5491                         while (f) {
5492                                 __do_free char *path = NULL;
5493
5494                                 length = strlen(f->cg) + 2;
5495                                         /* strlen(f->cg) + '.' or '' + \0 */
5496                                 path = malloc(length);
5497                                 if  (!path)
5498                                         goto out;
5499
5500                                 ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
5501                                 if (ret < 0 || ret > length - 1) {
5502                                         /* snprintf failed, ignore the node.*/
5503                                         lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5504                                         goto out;
5505                                 }
5506
5507                                 sum = refresh_load(f, path);
5508                                 if (sum == 0)
5509                                         f = del_node(f, i);
5510                                 else
5511 out:                                    f = f->next;
5512                                 /* load_hash[i].lock locks only on the first node.*/
5513                                 if (first_node == 1) {
5514                                         first_node = 0;
5515                                         pthread_mutex_unlock(&load_hash[i].lock);
5516                                 }
5517                         }
5518                 }
5519
5520                 if (loadavg_stop == 1)
5521                         return NULL;
5522
5523                 time2 = clock();
5524                 usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
5525         }
5526 }
5527
5528 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5529                 struct fuse_file_info *fi)
5530 {
5531         struct fuse_context *fc = fuse_get_context();
5532         struct file_info *d = (struct file_info *)fi->fh;
5533         pid_t initpid;
5534         char *cg;
5535         size_t total_len = 0;
5536         char *cache = d->buf;
5537         struct load_node *n;
5538         int hash;
5539         int cfd, rv = 0;
5540         unsigned long a, b, c;
5541
5542         if (offset) {
5543                 if (offset > d->size)
5544                         return -EINVAL;
5545                 if (!d->cached)
5546                         return 0;
5547                 int left = d->size - offset;
5548                 total_len = left > size ? size : left;
5549                 memcpy(buf, cache + offset, total_len);
5550                 return total_len;
5551         }
5552         if (!loadavg)
5553                 return read_file_fuse("/proc/loadavg", buf, size, d);
5554
5555         initpid = lookup_initpid_in_store(fc->pid);
5556         if (initpid <= 1 || is_shared_pidns(initpid))
5557                 initpid = fc->pid;
5558         cg = get_pid_cgroup(initpid, "cpu");
5559         if (!cg)
5560                 return read_file_fuse("/proc/loadavg", buf, size, d);
5561
5562         prune_init_slice(cg);
5563         hash = calc_hash(cg) % LOAD_SIZE;
5564         n = locate_node(cg, hash);
5565
5566         /* First time */
5567         if (n == NULL) {
5568                 cfd = get_cgroup_fd("cpu");
5569                 if (cfd >= 0) {
5570                         /*
5571                          * In locate_node() above, pthread_rwlock_unlock() isn't used
5572                          * because delete is not allowed before read has ended.
5573                          */
5574                         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5575                         rv = 0;
5576                         goto err;
5577                 }
5578                 do {
5579                         n = malloc(sizeof(struct load_node));
5580                 } while (!n);
5581
5582                 do {
5583                         n->cg = malloc(strlen(cg)+1);
5584                 } while (!n->cg);
5585                 strcpy(n->cg, cg);
5586                 n->avenrun[0] = 0;
5587                 n->avenrun[1] = 0;
5588                 n->avenrun[2] = 0;
5589                 n->run_pid = 0;
5590                 n->total_pid = 1;
5591                 n->last_pid = initpid;
5592                 n->cfd = cfd;
5593                 insert_node(&n, hash);
5594         }
5595         a = n->avenrun[0] + (FIXED_1/200);
5596         b = n->avenrun[1] + (FIXED_1/200);
5597         c = n->avenrun[2] + (FIXED_1/200);
5598         total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5599                 LOAD_INT(a), LOAD_FRAC(a),
5600                 LOAD_INT(b), LOAD_FRAC(b),
5601                 LOAD_INT(c), LOAD_FRAC(c),
5602                 n->run_pid, n->total_pid, n->last_pid);
5603         pthread_rwlock_unlock(&load_hash[hash].rdlock);
5604         if (total_len < 0 || total_len >=  d->buflen) {
5605                 lxcfs_error("%s\n", "Failed to write to cache");
5606                 rv = 0;
5607                 goto err;
5608         }
5609         d->size = (int)total_len;
5610         d->cached = 1;
5611
5612         if (total_len > size)
5613                 total_len = size;
5614         memcpy(buf, d->buf, total_len);
5615         rv = total_len;
5616
5617 err:
5618         free(cg);
5619         return rv;
5620 }
5621 /* Return a positive number on success, return 0 on failure.*/
5622 pthread_t load_daemon(int load_use)
5623 {
5624         int ret;
5625         pthread_t pid;
5626
5627         ret = init_load();
5628         if (ret == -1) {
5629                 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5630                 return 0;
5631         }
5632         ret = pthread_create(&pid, NULL, load_begin, NULL);
5633         if (ret != 0) {
5634                 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5635                 load_free();
5636                 return 0;
5637         }
5638         /* use loadavg, here loadavg = 1*/
5639         loadavg = load_use;
5640         return pid;
5641 }
5642
5643 /* Returns 0 on success. */
5644 int stop_load_daemon(pthread_t pid)
5645 {
5646         int s;
5647
5648         /* Signal the thread to gracefully stop */
5649         loadavg_stop = 1;
5650
5651         s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5652         if (s != 0) {
5653                 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5654                 return -1;
5655         }
5656
5657         load_free();
5658         loadavg_stop = 0;
5659
5660         return 0;
5661 }
5662
5663 static off_t get_procfile_size(const char *which)
5664 {
5665         FILE *f = fopen(which, "r");
5666         char *line = NULL;
5667         size_t len = 0;
5668         ssize_t sz, answer = 0;
5669         if (!f)
5670                 return 0;
5671
5672         while ((sz = getline(&line, &len, f)) != -1)
5673                 answer += sz;
5674         fclose (f);
5675         free(line);
5676
5677         return answer;
5678 }
5679
5680 int proc_getattr(const char *path, struct stat *sb)
5681 {
5682         struct timespec now;
5683
5684         memset(sb, 0, sizeof(struct stat));
5685         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5686                 return -EINVAL;
5687         sb->st_uid = sb->st_gid = 0;
5688         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5689         if (strcmp(path, "/proc") == 0) {
5690                 sb->st_mode = S_IFDIR | 00555;
5691                 sb->st_nlink = 2;
5692                 return 0;
5693         }
5694         if (strcmp(path, "/proc/meminfo") == 0 ||
5695                         strcmp(path, "/proc/cpuinfo") == 0 ||
5696                         strcmp(path, "/proc/uptime") == 0 ||
5697                         strcmp(path, "/proc/stat") == 0 ||
5698                         strcmp(path, "/proc/diskstats") == 0 ||
5699                         strcmp(path, "/proc/swaps") == 0 ||
5700                         strcmp(path, "/proc/loadavg") == 0) {
5701                 sb->st_size = 0;
5702                 sb->st_mode = S_IFREG | 00444;
5703                 sb->st_nlink = 1;
5704                 return 0;
5705         }
5706
5707         return -ENOENT;
5708 }
5709
5710 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5711                 struct fuse_file_info *fi)
5712 {
5713         if (filler(buf, ".", NULL, 0) != 0 ||
5714             filler(buf, "..", NULL, 0) != 0 ||
5715             filler(buf, "cpuinfo", NULL, 0) != 0 ||
5716             filler(buf, "meminfo", NULL, 0) != 0 ||
5717             filler(buf, "stat", NULL, 0) != 0 ||
5718             filler(buf, "uptime", NULL, 0) != 0 ||
5719             filler(buf, "diskstats", NULL, 0) != 0 ||
5720             filler(buf, "swaps", NULL, 0) != 0   ||
5721             filler(buf, "loadavg", NULL, 0) != 0)
5722                 return -EINVAL;
5723         return 0;
5724 }
5725
5726 int proc_open(const char *path, struct fuse_file_info *fi)
5727 {
5728         int type = -1;
5729         struct file_info *info;
5730
5731         if (strcmp(path, "/proc/meminfo") == 0)
5732                 type = LXC_TYPE_PROC_MEMINFO;
5733         else if (strcmp(path, "/proc/cpuinfo") == 0)
5734                 type = LXC_TYPE_PROC_CPUINFO;
5735         else if (strcmp(path, "/proc/uptime") == 0)
5736                 type = LXC_TYPE_PROC_UPTIME;
5737         else if (strcmp(path, "/proc/stat") == 0)
5738                 type = LXC_TYPE_PROC_STAT;
5739         else if (strcmp(path, "/proc/diskstats") == 0)
5740                 type = LXC_TYPE_PROC_DISKSTATS;
5741         else if (strcmp(path, "/proc/swaps") == 0)
5742                 type = LXC_TYPE_PROC_SWAPS;
5743         else if (strcmp(path, "/proc/loadavg") == 0)
5744                 type = LXC_TYPE_PROC_LOADAVG;
5745         if (type == -1)
5746                 return -ENOENT;
5747
5748         info = malloc(sizeof(*info));
5749         if (!info)
5750                 return -ENOMEM;
5751
5752         memset(info, 0, sizeof(*info));
5753         info->type = type;
5754
5755         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5756         do {
5757                 info->buf = malloc(info->buflen);
5758         } while (!info->buf);
5759         memset(info->buf, 0, info->buflen);
5760         /* set actual size to buffer size */
5761         info->size = info->buflen;
5762
5763         fi->fh = (unsigned long)info;
5764         return 0;
5765 }
5766
5767 int proc_access(const char *path, int mask)
5768 {
5769         if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5770                 return 0;
5771
5772         /* these are all read-only */
5773         if ((mask & ~R_OK) != 0)
5774                 return -EACCES;
5775         return 0;
5776 }
5777
5778 int proc_release(const char *path, struct fuse_file_info *fi)
5779 {
5780         do_release_file_info(fi);
5781         return 0;
5782 }
5783
5784 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5785                 struct fuse_file_info *fi)
5786 {
5787         struct file_info *f = (struct file_info *) fi->fh;
5788
5789         switch (f->type) {
5790         case LXC_TYPE_PROC_MEMINFO:
5791                 return proc_meminfo_read(buf, size, offset, fi);
5792         case LXC_TYPE_PROC_CPUINFO:
5793                 return proc_cpuinfo_read(buf, size, offset, fi);
5794         case LXC_TYPE_PROC_UPTIME:
5795                 return proc_uptime_read(buf, size, offset, fi);
5796         case LXC_TYPE_PROC_STAT:
5797                 return proc_stat_read(buf, size, offset, fi);
5798         case LXC_TYPE_PROC_DISKSTATS:
5799                 return proc_diskstats_read(buf, size, offset, fi);
5800         case LXC_TYPE_PROC_SWAPS:
5801                 return proc_swaps_read(buf, size, offset, fi);
5802         case LXC_TYPE_PROC_LOADAVG:
5803                 return proc_loadavg_read(buf, size, offset, fi);
5804         default:
5805                 return -EINVAL;
5806         }
5807 }
5808
5809 /*
5810  * Functions needed to setup cgroups in the __constructor__.
5811  */
5812
5813 static bool umount_if_mounted(void)
5814 {
5815         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5816                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5817                 return false;
5818         }
5819         return true;
5820 }
5821
5822 /* __typeof__ should be safe to use with all compilers. */
5823 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5824 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5825 {
5826         return (fs->f_type == (fs_type_magic)magic_val);
5827 }
5828
5829 /*
5830  * looking at fs/proc_namespace.c, it appears we can
5831  * actually expect the rootfs entry to very specifically contain
5832  * " - rootfs rootfs "
5833  * IIUC, so long as we've chrooted so that rootfs is not our root,
5834  * the rootfs entry should always be skipped in mountinfo contents.
5835  */
5836 static bool is_on_ramfs(void)
5837 {
5838         FILE *f;
5839         char *p, *p2;
5840         char *line = NULL;
5841         size_t len = 0;
5842         int i;
5843
5844         f = fopen("/proc/self/mountinfo", "r");
5845         if (!f)
5846                 return false;
5847
5848         while (getline(&line, &len, f) != -1) {
5849                 for (p = line, i = 0; p && i < 4; i++)
5850                         p = strchr(p + 1, ' ');
5851                 if (!p)
5852                         continue;
5853                 p2 = strchr(p + 1, ' ');
5854                 if (!p2)
5855                         continue;
5856                 *p2 = '\0';
5857                 if (strcmp(p + 1, "/") == 0) {
5858                         // this is '/'.  is it the ramfs?
5859                         p = strchr(p2 + 1, '-');
5860                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5861                                 free(line);
5862                                 fclose(f);
5863                                 return true;
5864                         }
5865                 }
5866         }
5867         free(line);
5868         fclose(f);
5869         return false;
5870 }
5871
5872 static int pivot_enter()
5873 {
5874         int ret = -1, oldroot = -1, newroot = -1;
5875
5876         oldroot = open("/", O_DIRECTORY | O_RDONLY);
5877         if (oldroot < 0) {
5878                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5879                 return ret;
5880         }
5881
5882         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5883         if (newroot < 0) {
5884                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5885                 goto err;
5886         }
5887
5888         /* change into new root fs */
5889         if (fchdir(newroot) < 0) {
5890                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5891                 goto err;
5892         }
5893
5894         /* pivot_root into our new root fs */
5895         if (pivot_root(".", ".") < 0) {
5896                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5897                 goto err;
5898         }
5899
5900         /*
5901          * At this point the old-root is mounted on top of our new-root.
5902          * To unmounted it we must not be chdir'd into it, so escape back
5903          * to the old-root.
5904          */
5905         if (fchdir(oldroot) < 0) {
5906                 lxcfs_error("%s\n", "Failed to enter old root.");
5907                 goto err;
5908         }
5909
5910         if (umount2(".", MNT_DETACH) < 0) {
5911                 lxcfs_error("%s\n", "Failed to detach old root.");
5912                 goto err;
5913         }
5914
5915         if (fchdir(newroot) < 0) {
5916                 lxcfs_error("%s\n", "Failed to re-enter new root.");
5917                 goto err;
5918         }
5919
5920         ret = 0;
5921
5922 err:
5923         if (oldroot > 0)
5924                 close(oldroot);
5925         if (newroot > 0)
5926                 close(newroot);
5927
5928         return ret;
5929 }
5930
5931 static int chroot_enter()
5932 {
5933         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
5934                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
5935                 return -1;
5936         }
5937
5938         if (chroot(".") < 0) {
5939                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5940                 return -1;
5941         }
5942
5943         if (chdir("/") < 0) {
5944                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5945                 return -1;
5946         }
5947
5948         return 0;
5949 }
5950
5951 static int permute_and_enter(void)
5952 {
5953         struct statfs sb;
5954
5955         if (statfs("/", &sb) < 0) {
5956                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5957                 return -1;
5958         }
5959
5960         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
5961          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
5962          * /proc/1/mountinfo. */
5963         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5964                 return chroot_enter();
5965
5966         if (pivot_enter() < 0) {
5967                 lxcfs_error("%s\n", "Could not perform pivot root.");
5968                 return -1;
5969         }
5970
5971         return 0;
5972 }
5973
5974 /* Prepare our new clean root. */
5975 static int permute_prepare(void)
5976 {
5977         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5978                 lxcfs_error("%s\n", "Failed to create directory for new root.");
5979                 return -1;
5980         }
5981
5982         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5983                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5984                 return -1;
5985         }
5986
5987         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
5988                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
5989                 return -1;
5990         }
5991
5992         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
5993                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
5994                 return -1;
5995         }
5996
5997         return 0;
5998 }
5999
6000 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6001 static bool permute_root(void)
6002 {
6003         /* Prepare new root. */
6004         if (permute_prepare() < 0)
6005                 return false;
6006
6007         /* Pivot into new root. */
6008         if (permute_and_enter() < 0)
6009                 return false;
6010
6011         return true;
6012 }
6013
6014 static int preserve_mnt_ns(int pid)
6015 {
6016         int ret;
6017         size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6018         char path[len];
6019
6020         ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6021         if (ret < 0 || (size_t)ret >= len)
6022                 return -1;
6023
6024         return open(path, O_RDONLY | O_CLOEXEC);
6025 }
6026
6027 static bool cgfs_prepare_mounts(void)
6028 {
6029         if (!mkdir_p(BASEDIR, 0700)) {
6030                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6031                 return false;
6032         }
6033
6034         if (!umount_if_mounted()) {
6035                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6036                 return false;
6037         }
6038
6039         if (unshare(CLONE_NEWNS) < 0) {
6040                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6041                 return false;
6042         }
6043
6044         cgroup_ops->mntns_fd = preserve_mnt_ns(getpid());
6045         if (cgroup_ops->mntns_fd < 0) {
6046                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6047                 return false;
6048         }
6049
6050         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6051                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6052                 return false;
6053         }
6054
6055         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6056                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6057                 return false;
6058         }
6059
6060         return true;
6061 }
6062
6063 static bool cgfs_mount_hierarchies(void)
6064 {
6065         if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
6066                 return false;
6067
6068         if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
6069                 return false;
6070
6071         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
6072                 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
6073                 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
6074                 if ((*h)->fd < 0)
6075                         return false;
6076         }
6077
6078         return true;
6079 }
6080
6081 static bool cgfs_setup_controllers(void)
6082 {
6083         if (!cgfs_prepare_mounts())
6084                 return false;
6085
6086         if (!cgfs_mount_hierarchies()) {
6087                 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6088                 return false;
6089         }
6090
6091         if (!permute_root())
6092                 return false;
6093
6094         return true;
6095 }
6096
6097 static void __attribute__((constructor)) lxcfs_init(void)
6098 {
6099         __do_close_prot_errno int init_ns = -EBADF;
6100         char *cret;
6101         char cwd[MAXPATHLEN];
6102
6103         cgroup_ops = cgroup_init();
6104         if (!cgroup_ops)
6105                 log_exit("Failed to initialize cgroup support");
6106
6107         /* Preserve initial namespace. */
6108         init_ns = preserve_mnt_ns(getpid());
6109         if (init_ns < 0)
6110                 log_exit("Failed to preserve initial mount namespace");
6111
6112         cret = getcwd(cwd, MAXPATHLEN);
6113                 log_exit("%s - Could not retrieve current working directory", strerror(errno));
6114
6115         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
6116          * to privately mount lxcfs cgroups. */
6117         if (!cgfs_setup_controllers())
6118                 log_exit("Failed to setup private cgroup mounts for lxcfs");
6119
6120         if (setns(init_ns, 0) < 0)
6121                 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
6122
6123         if (!cret || chdir(cwd) < 0)
6124                 log_exit("%s - Could not change back to original working directory", strerror(errno));
6125
6126         if (!init_cpuview())
6127                 log_exit("Failed to init CPU view");
6128
6129         print_subsystems();
6130 }
6131
6132 static void __attribute__((destructor)) lxcfs_exit(void)
6133 {
6134         lxcfs_debug("%s\n", "Running destructor for liblxcfs");
6135         free_cpuview();
6136         cgroup_exit(cgroup_ops);
6137 }