1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "cgroups/cgroup.h"
42 #include "cgroups/cgroup_utils.h"
43 #include "memory_utils.h"
44 #include "config.h"
45
46 /* Define pivot_root() if missing from the C library */
47 #ifndef HAVE_PIVOT_ROOT
48 static int pivot_root(const char * new_root, const char * put_old)
49 {
50 #ifdef __NR_pivot_root
51 return syscall(__NR_pivot_root, new_root, put_old);
52 #else
53 errno = ENOSYS;
54 return -1;
55 #endif
56 }
57 #else
58 extern int pivot_root(const char * new_root, const char * put_old);
59 #endif
60
61 struct cpuacct_usage {
62 uint64_t user;
63 uint64_t system;
64 uint64_t idle;
65 bool online;
66 };
67
68 /* Constants for the hash table. */
69 #define LOAD_SIZE 100 /* the size of the hash table */
70 #define FLUSH_TIME 5 /* the flush rate */
71 #define DEPTH_DIR 3 /* the search depth per cgroup */
72 /* Constants for calculating loadavg. */
73 #define FSHIFT 11 /* nr of bits of precision */
74 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
75 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
76 #define EXP_5 2014 /* 1/exp(5sec/5min) */
77 #define EXP_15 2037 /* 1/exp(5sec/15min) */
78 #define LOAD_INT(x) ((x) >> FSHIFT)
79 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
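/*
 * Worked example (illustrative): avenrun values are fixed-point with
 * FSHIFT fractional bits, so FIXED_1 is 2048 and a stored value of 3072
 * decodes as LOAD_INT(3072) = 1 and LOAD_FRAC(3072) = 50, i.e. "1.50"
 * when printed the way /proc/loadavg does:
 *
 *	snprintf(buf, sizeof(buf), "%lu.%02lu",
 *		 LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]));
 */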
80 /*
81 * This parameter is used for proc_loadavg_read().
82 * 1 means the loadavg calculation is enabled, 0 means it is disabled.
83 */
84 static int loadavg = 0;
85 static volatile sig_atomic_t loadavg_stop = 0;
86 static int calc_hash(const char *name)
87 {
88 unsigned int hash = 0;
89 unsigned int x = 0;
90 /* ELFHash algorithm. */
91 while (*name) {
92 hash = (hash << 4) + *name++;
93 x = hash & 0xf0000000;
94 if (x != 0)
95 hash ^= (x >> 24);
96 hash &= ~x;
97 }
98 return (hash & 0x7fffffff);
99 }
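/*
 * Sketch (an assumption about intended use, mirroring how the readers of
 * load_hash index it): the 31-bit ELF hash of a cgroup path is reduced
 * modulo LOAD_SIZE to pick a bucket:
 *
 *	int idx = calc_hash(cg) % LOAD_SIZE;
 *	struct load_node *n = locate_node(cg, idx);
 */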
100
101 struct load_node {
102 char *cg; /* cgroup path */
103 unsigned long avenrun[3]; /* Load averages */
104 unsigned int run_pid;
105 unsigned int total_pid;
106 unsigned int last_pid;
107 int cfd; /* The file descriptor of the mounted cgroup */
108 struct load_node *next;
109 struct load_node **pre;
110 };
111
112 struct load_head {
113 /*
114 * The lock serializes inserting and refreshing load_node entries. For the
115 * first load_node of each hash bucket, insert and refresh are mutually
116 * exclusive.
117 */
118 pthread_mutex_t lock;
119 /*
120 * The rdlock serializes reading loadavg against deleting a load_node. Within
121 * each hash bucket, read and delete are mutually exclusive, but concurrent
122 * reads are allowed. This rdlock is at list level.
123 */
124 pthread_rwlock_t rdlock;
125 /*
126 * The rilock serializes reading loadavg against inserting a load_node. For
127 * the first load_node of each hash bucket, read and insert are mutually
128 * exclusive, but concurrent reads are allowed.
129 */
130 pthread_rwlock_t rilock;
131 struct load_node *next;
132 };
133
134 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
135 /*
136 * init_load initializes the hash table.
137 * Return 0 on success, return -1 on failure.
138 */
139 static int init_load(void)
140 {
141 int i;
142 int ret;
143
144 for (i = 0; i < LOAD_SIZE; i++) {
145 load_hash[i].next = NULL;
146 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
147 if (ret != 0) {
148 lxcfs_error("%s\n", "Failed to initialize lock");
149 goto out3;
150 }
151 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
152 if (ret != 0) {
153 lxcfs_error("%s\n", "Failed to initialize rdlock");
154 goto out2;
155 }
156 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
157 if (ret != 0) {
158 lxcfs_error("%s\n", "Failed to initialize rilock");
159 goto out1;
160 }
161 }
162 return 0;
163 out1:
164 pthread_rwlock_destroy(&load_hash[i].rdlock);
165 out2:
166 pthread_mutex_destroy(&load_hash[i].lock);
167 out3:
168 while (i > 0) {
169 i--;
170 pthread_mutex_destroy(&load_hash[i].lock);
171 pthread_rwlock_destroy(&load_hash[i].rdlock);
172 pthread_rwlock_destroy(&load_hash[i].rilock);
173 }
174 return -1;
175 }
176
177 static void insert_node(struct load_node **n, int locate)
178 {
179 struct load_node *f;
180
181 pthread_mutex_lock(&load_hash[locate].lock);
182 pthread_rwlock_wrlock(&load_hash[locate].rilock);
183 f = load_hash[locate].next;
184 load_hash[locate].next = *n;
185
186 (*n)->pre = &(load_hash[locate].next);
187 if (f)
188 f->pre = &((*n)->next);
189 (*n)->next = f;
190 pthread_mutex_unlock(&load_hash[locate].lock);
191 pthread_rwlock_unlock(&load_hash[locate].rilock);
192 }
193 /*
194 * locate_node() finds the node for a given cgroup; a non-NULL return
195 * means success. Note that rdlock is deliberately not released before
196 * returning, because the caller still has to read the node and deletion
197 * must not happen until that read has finished. The rdlock is released
198 * only in proc_loadavg_read().
199 */
200 static struct load_node *locate_node(char *cg, int locate)
201 {
202 struct load_node *f = NULL;
203 int i = 0;
204
205 pthread_rwlock_rdlock(&load_hash[locate].rilock);
206 pthread_rwlock_rdlock(&load_hash[locate].rdlock);
207 if (load_hash[locate].next == NULL) {
208 pthread_rwlock_unlock(&load_hash[locate].rilock);
209 return f;
210 }
211 f = load_hash[locate].next;
212 pthread_rwlock_unlock(&load_hash[locate].rilock);
213 while (f && ((i = strcmp(f->cg, cg)) != 0))
214 f = f->next;
215 return f;
216 }
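/*
 * Sketch of the read path implied by the locking contract above: the
 * caller, not locate_node(), drops rdlock once it has copied the loadavg
 * data out of the node. Note that rdlock is held even when NULL is
 * returned:
 *
 *	struct load_node *n = locate_node(cg, idx);
 *	if (n)
 *		... copy n->avenrun[] etc. ...
 *	pthread_rwlock_unlock(&load_hash[idx].rdlock);
 */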
217
218 /* Delete the load_node n and return the next node of it. */
219 static struct load_node *del_node(struct load_node *n, int locate)
220 {
221 struct load_node *g;
222
223 pthread_rwlock_wrlock(&load_hash[locate].rdlock);
224 if (n->next == NULL) {
225 *(n->pre) = NULL;
226 } else {
227 *(n->pre) = n->next;
228 n->next->pre = n->pre;
229 }
230 g = n->next;
231 free_disarm(n->cg);
232 free_disarm(n);
233 pthread_rwlock_unlock(&load_hash[locate].rdlock);
234 return g;
235 }
236
237 static void load_free(void)
238 {
239 struct load_node *f, *p;
240
241 for (int i = 0; i < LOAD_SIZE; i++) {
242 pthread_mutex_lock(&load_hash[i].lock);
243 pthread_rwlock_wrlock(&load_hash[i].rilock);
244 pthread_rwlock_wrlock(&load_hash[i].rdlock);
245 if (load_hash[i].next == NULL) {
246 pthread_mutex_unlock(&load_hash[i].lock);
247 pthread_mutex_destroy(&load_hash[i].lock);
248 pthread_rwlock_unlock(&load_hash[i].rilock);
249 pthread_rwlock_destroy(&load_hash[i].rilock);
250 pthread_rwlock_unlock(&load_hash[i].rdlock);
251 pthread_rwlock_destroy(&load_hash[i].rdlock);
252 continue;
253 }
254
255 for (f = load_hash[i].next; f;) {
256 free_disarm(f->cg);
257 p = f->next;
258 free_disarm(f);
259 f = p;
260 }
261
262 pthread_mutex_unlock(&load_hash[i].lock);
263 pthread_mutex_destroy(&load_hash[i].lock);
264 pthread_rwlock_unlock(&load_hash[i].rilock);
265 pthread_rwlock_destroy(&load_hash[i].rilock);
266 pthread_rwlock_unlock(&load_hash[i].rdlock);
267 pthread_rwlock_destroy(&load_hash[i].rdlock);
268 }
269 }
270
271 /* Data for CPU view */
272 struct cg_proc_stat {
273 char *cg;
274 struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
275 struct cpuacct_usage *view; // Usage stats reported to the container
276 int cpu_count;
277 pthread_mutex_t lock; // For node manipulation
278 struct cg_proc_stat *next;
279 };
280
281 struct cg_proc_stat_head {
282 struct cg_proc_stat *next;
283 time_t lastcheck;
284
285 /*
286 * For access to the list. Reading can be parallel, pruning is exclusive.
287 */
288 pthread_rwlock_t lock;
289 };
290
291 #define CPUVIEW_HASH_SIZE 100
292 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
293
294 static bool cpuview_init_head(struct cg_proc_stat_head **head)
295 {
296 *head = malloc(sizeof(struct cg_proc_stat_head));
297 if (!(*head)) {
298 lxcfs_error("%s\n", strerror(errno));
299 return false;
300 }
301
302 (*head)->lastcheck = time(NULL);
303 (*head)->next = NULL;
304
305 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
306 lxcfs_error("%s\n", "Failed to initialize list lock");
307 free_disarm(*head);
308 return false;
309 }
310
311 return true;
312 }
313
314 static bool init_cpuview()
315 {
316 int i;
317
318 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
319 proc_stat_history[i] = NULL;
320
321 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
322 if (!cpuview_init_head(&proc_stat_history[i]))
323 goto err;
324 }
325
326 return true;
327
328 err:
329 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
330 if (proc_stat_history[i])
331 free_disarm(proc_stat_history[i]);
332 }
333
334 return false;
335 }
336
337 static void free_proc_stat_node(struct cg_proc_stat *node)
338 {
339 pthread_mutex_destroy(&node->lock);
340 free_disarm(node->cg);
341 free_disarm(node->usage);
342 free_disarm(node->view);
343 free_disarm(node);
344 }
345
346 static void cpuview_free_head(struct cg_proc_stat_head *head)
347 {
348 struct cg_proc_stat *node, *tmp;
349
350 if (head->next) {
351 node = head->next;
352
353 for (;;) {
354 tmp = node;
355 node = node->next;
356 free_proc_stat_node(tmp);
357
358 if (!node)
359 break;
360 }
361 }
362
363 pthread_rwlock_destroy(&head->lock);
364 free_disarm(head);
365 }
366
367 static void free_cpuview()
368 {
369 int i;
370
371 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
372 if (proc_stat_history[i])
373 cpuview_free_head(proc_stat_history[i]);
374 }
375 }
376
377 /*
378 * A table caching which pid is init for a pid namespace.
379 * When looking up which pid is init for $qpid, we:
380 * 1. Stat /proc/$qpid/ns/pid.
381 * 2. Check whether the ino_t is in our store.
382 * a. if not, fork a child in qpid's ns to send us
383 * ucred.pid = 1, and read the initpid. Cache
384 * initpid and creation time for /proc/initpid
385 * in a new store entry.
386 * b. if so, verify that /proc/initpid still matches
387 * what we have saved. If not, clear the store
388 * entry and go back to a. If so, return the
389 * cached initpid.
390 */
391 struct pidns_init_store {
392 ino_t ino; // inode number for /proc/$pid/ns/pid
393 pid_t initpid; // the pid of init in that ns
394 long int ctime; // the time at which /proc/$initpid was created
395 struct pidns_init_store *next;
396 long int lastcheck;
397 };
398
399 /* lol - look at how they are allocated in the kernel */
400 #define PIDNS_HASH_SIZE 4096
401 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
402
403 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
404 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
405 static void lock_mutex(pthread_mutex_t *l)
406 {
407 int ret;
408
409 if ((ret = pthread_mutex_lock(l)) != 0) {
410 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
411 exit(1);
412 }
413 }
414
415 struct cgroup_ops *cgroup_ops;
416
417 static void unlock_mutex(pthread_mutex_t *l)
418 {
419 int ret;
420
421 if ((ret = pthread_mutex_unlock(l)) != 0) {
422 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
423 exit(1);
424 }
425 }
426
427 static void store_lock(void)
428 {
429 lock_mutex(&pidns_store_mutex);
430 }
431
432 static void store_unlock(void)
433 {
434 unlock_mutex(&pidns_store_mutex);
435 }
436
437 /* Must be called under store_lock */
438 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
439 {
440 struct stat initsb;
441 char fnam[100];
442
443 snprintf(fnam, 100, "/proc/%d", e->initpid);
444 if (stat(fnam, &initsb) < 0)
445 return false;
446
447 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
448 initsb.st_ctime, e->initpid);
449
450 if (e->ctime != initsb.st_ctime)
451 return false;
452 return true;
453 }
454
455 /* Must be called under store_lock */
456 static void remove_initpid(struct pidns_init_store *e)
457 {
458 struct pidns_init_store *tmp;
459 int h;
460
461 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
462
463 h = HASH(e->ino);
464 if (pidns_hash_table[h] == e) {
465 pidns_hash_table[h] = e->next;
466 free_disarm(e);
467 return;
468 }
469
470 tmp = pidns_hash_table[h];
471 while (tmp) {
472 if (tmp->next == e) {
473 tmp->next = e->next;
474 free_disarm(e);
475 return;
476 }
477 tmp = tmp->next;
478 }
479 }
480
481 #define PURGE_SECS 5
482 /* Must be called under store_lock */
483 static void prune_initpid_store(void)
484 {
485 static long int last_prune = 0;
486 struct pidns_init_store *e, *prev, *delme;
487 long int now, threshold;
488 int i;
489
490 if (!last_prune) {
491 last_prune = time(NULL);
492 return;
493 }
494 now = time(NULL);
495 if (now < last_prune + PURGE_SECS)
496 return;
497
498 lxcfs_debug("%s\n", "Pruning.");
499
500 last_prune = now;
501 threshold = now - 2 * PURGE_SECS;
502
503 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
504 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
505 if (e->lastcheck < threshold) {
506
507 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
508
509 delme = e;
510 if (prev)
511 prev->next = e->next;
512 else
513 pidns_hash_table[i] = e->next;
514 e = e->next;
515 free_disarm(delme);
516 } else {
517 prev = e;
518 e = e->next;
519 }
520 }
521 }
522 }
523
524 /* Must be called under store_lock */
525 static void save_initpid(struct stat *sb, pid_t pid)
526 {
527 struct pidns_init_store *e;
528 char fpath[100];
529 struct stat procsb;
530 int h;
531
532 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
533
534 snprintf(fpath, 100, "/proc/%d", pid);
535 if (stat(fpath, &procsb) < 0)
536 return;
537 do {
538 e = malloc(sizeof(*e));
539 } while (!e);
540 e->ino = sb->st_ino;
541 e->initpid = pid;
542 e->ctime = procsb.st_ctime;
543 h = HASH(e->ino);
544 e->next = pidns_hash_table[h];
545 e->lastcheck = time(NULL);
546 pidns_hash_table[h] = e;
547 }
548
549 /*
550 * Given the stat(2) info for a nsfd pid inode, look up the pidns_init_store
551 * entry for the inode number and creation time. Verify that the init pid
552 * is still valid. If not, remove it. Return the entry if valid, NULL
553 * otherwise.
554 * Must be called under store_lock
555 */
556 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
557 {
558 int h = HASH(sb->st_ino);
559 struct pidns_init_store *e = pidns_hash_table[h];
560
561 while (e) {
562 if (e->ino == sb->st_ino) {
563 if (initpid_still_valid(e, sb)) {
564 e->lastcheck = time(NULL);
565 return e;
566 }
567 remove_initpid(e);
568 return NULL;
569 }
570 e = e->next;
571 }
572
573 return NULL;
574 }
575
576 static int is_dir(const char *path, int fd)
577 {
578 struct stat statbuf;
579 int ret = fstatat(fd, path, &statbuf, 0);
580 if (ret == 0 && S_ISDIR(statbuf.st_mode))
581 return 1;
582 return 0;
583 }
584
585 static int preserve_ns(const int pid, const char *ns)
586 {
587 int ret;
588 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
589 #define __NS_PATH_LEN 50
590 char path[__NS_PATH_LEN];
591
592 /* This way we can use this function to also check whether namespaces
593 * are supported by the kernel by passing in NULL or the empty
594 * string.
595 */
596 ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
597 !ns || strcmp(ns, "") == 0 ? "" : "/",
598 !ns || strcmp(ns, "") == 0 ? "" : ns);
599 if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
600 errno = EFBIG;
601 return -1;
602 }
603
604 return open(path, O_RDONLY | O_CLOEXEC);
605 }
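/*
 * Example (illustrative): preserve_ns(pid, "pid") opens /proc/<pid>/ns/pid,
 * while preserve_ns(pid, NULL) opens /proc/<pid>/ns itself, which lets a
 * caller probe whether the kernel exposes namespace files at all:
 *
 *	int fd = preserve_ns(getpid(), "pid");
 *	if (fd < 0 && errno == ENOENT)
 *		... kernel lacks support for this namespace ...
 */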
606
607 /**
608 * in_same_namespace - Check whether two processes are in the same namespace.
609 * @pid1 - PID of the first process.
610 * @pid2 - PID of the second process.
611 * @ns - Name of the namespace to check. Must correspond to one of the names
612 * for the namespaces as shown in /proc/<pid>/ns/
613 *
614 * Returns an fd referring to the namespace of the second process (@pid2)
615 * if the two processes are not in the same namespace; returns -EINVAL if
616 * they are in the same namespace, and -1 if an error occurred.
617 */
618 static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
619 {
620 __do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
621 int ret = -1;
622 struct stat ns_st1, ns_st2;
623
624 ns_fd1 = preserve_ns(pid1, ns);
625 if (ns_fd1 < 0) {
626 /* The kernel does not support this namespace. This is not an
627 * error.
628 */
629 if (errno == ENOENT)
630 return -EINVAL;
631
632 return -1;
633 }
634
635 ns_fd2 = preserve_ns(pid2, ns);
636 if (ns_fd2 < 0)
637 return -1;
638
639 ret = fstat(ns_fd1, &ns_st1);
640 if (ret < 0)
641 return -1;
642
643 ret = fstat(ns_fd2, &ns_st2);
644 if (ret < 0)
645 return -1;
646
647 /* processes are in the same namespace */
648 if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
649 return -EINVAL;
650
651 /* processes are in different namespaces */
652 return move_fd(ns_fd2);
653 }
654
655 static bool is_shared_pidns(pid_t pid)
656 {
657 if (pid != 1)
658 return false;
659
660 if (in_same_namespace(pid, getpid(), "pid") == -EINVAL)
661 return true;
662
663 return false;
664 }
665
666 static bool write_string(const char *fnam, const char *string, int fd)
667 {
668 FILE *f;
669 size_t len, ret;
670
671 f = fdopen(fd, "w");
672 if (!f)
673 return false;
674
675 len = strlen(string);
676 ret = fwrite(string, 1, len, f);
677 if (ret != len) {
678 lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
679 strerror(errno), string, fnam);
680 fclose(f);
681 return false;
682 }
683
684 if (fclose(f) < 0) {
685 lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
686 return false;
687 }
688
689 return true;
690 }
691
692 struct cgfs_files {
693 char *name;
694 uint32_t uid, gid;
695 uint32_t mode;
696 };
697
698 static void print_subsystems(void)
699 {
700 int i = 0;
701
702 fprintf(stderr, "mount namespace: %d\n", cgroup_ops->mntns_fd);
703 fprintf(stderr, "hierarchies:\n");
704 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) {
705 __do_free char *controllers = lxc_string_join(",", (const char **)(*h)->controllers, false);
706 fprintf(stderr, " %2d: fd: %3d: %s\n", i, (*h)->fd, controllers ?: "");
707 }
708 }
709
710 /* do we need to do any massaging here? I'm not sure... */
711 /* Return the mounted controller and store the corresponding open file descriptor
712 * referring to the controller mountpoint in the private lxcfs namespace in
713 * @cfd.
714 */
715 static int find_mounted_controller(const char *controller)
716 {
717 struct hierarchy *h;
718
719 h = cgroup_ops->get_hierarchy(cgroup_ops, controller);
720 return h ? h->fd : -EBADF;
721 }
722
723 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
724 const char *value)
725 {
726 int ret, fd, cfd;
727 size_t len;
728 char *fnam;
729
730 cfd = find_mounted_controller(controller);
731 if (cfd < 0)
732 return false;
733
734 /* Make sure we pass a relative path to *at() family of functions.
735 * . + /cgroup + / + file + \0
736 */
737 len = strlen(cgroup) + strlen(file) + 3;
738 fnam = alloca(len);
739 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
740 if (ret < 0 || (size_t)ret >= len)
741 return false;
742
743 fd = openat(cfd, fnam, O_WRONLY);
744 if (fd < 0)
745 return false;
746
747 return write_string(fnam, value, fd);
748 }
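/*
 * Example (illustrative only; the controller, cgroup and file names below
 * are made up, not calls made elsewhere in this file):
 *
 *	cgfs_set_value("memory", "lxc/c1", "memory.limit_in_bytes", "536870912");
 *
 * opens ./lxc/c1/memory.limit_in_bytes relative to the memory controller's
 * mountpoint fd and writes the value through write_string().
 */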
749
750 // Chown all the files in the cgroup directory. We do this when we create
751 // a cgroup on behalf of a user.
752 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
753 {
754 struct dirent *direntp;
755 char path[MAXPATHLEN];
756 size_t len;
757 DIR *d;
758 int fd1, ret;
759
760 len = strlen(dirname);
761 if (len >= MAXPATHLEN) {
762 lxcfs_error("Pathname too long: %s\n", dirname);
763 return;
764 }
765
766 fd1 = openat(fd, dirname, O_DIRECTORY);
767 if (fd1 < 0)
768 return;
769
770 d = fdopendir(fd1);
771 if (!d) {
772 lxcfs_error("Failed to open %s\n", dirname);
773 return;
774 }
775
776 while ((direntp = readdir(d))) {
777 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
778 continue;
779 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
780 if (ret < 0 || ret >= MAXPATHLEN) {
781 lxcfs_error("Pathname too long under %s\n", dirname);
782 continue;
783 }
784 if (fchownat(fd, path, uid, gid, 0) < 0)
785 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
786 }
787 closedir(d);
788 }
789
790 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
791 {
792 int cfd;
793 size_t len;
794 char *dirnam;
795
796 cfd = find_mounted_controller(controller);
797 if (cfd < 0)
798 return -EINVAL;
799
800 /* Make sure we pass a relative path to *at() family of functions.
801 * . + /cg + \0
802 */
803 len = strlen(cg) + 2;
804 dirnam = alloca(len);
805 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
806
807 if (mkdirat(cfd, dirnam, 0755) < 0)
808 return -errno;
809
810 if (uid == 0 && gid == 0)
811 return 0;
812
813 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
814 return -errno;
815
816 chown_all_cgroup_files(dirnam, uid, gid, cfd);
817
818 return 0;
819 }
820
821 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
822 {
823 struct dirent *direntp;
824 DIR *dir;
825 bool ret = false;
826 char pathname[MAXPATHLEN];
827 int dupfd;
828
829 dupfd = dup(fd); // fdopendir() takes ownership of the fd and closedir() will close it, so work on a duplicate.
830 if (dupfd < 0)
831 return false;
832
833 dir = fdopendir(dupfd);
834 if (!dir) {
835 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
836 close(dupfd);
837 return false;
838 }
839
840 while ((direntp = readdir(dir))) {
841 struct stat mystat;
842 int rc;
843
844 if (!strcmp(direntp->d_name, ".") ||
845 !strcmp(direntp->d_name, ".."))
846 continue;
847
848 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
849 if (rc < 0 || rc >= MAXPATHLEN) {
850 lxcfs_error("%s\n", "Pathname too long.");
851 continue;
852 }
853
854 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
855 if (rc) {
856 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
857 continue;
858 }
859 if (S_ISDIR(mystat.st_mode))
860 if (!recursive_rmdir(pathname, fd, cfd))
861 lxcfs_debug("Error removing %s.\n", pathname);
862 }
863
864 ret = true;
865 if (closedir(dir) < 0) {
866 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
867 ret = false;
868 }
869
870 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
871 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
872 ret = false;
873 }
874
875 close(dupfd);
876
877 return ret;
878 }
879
880 bool cgfs_remove(const char *controller, const char *cg)
881 {
882 int fd, cfd;
883 size_t len;
884 char *dirnam;
885 bool bret;
886
887 cfd = find_mounted_controller(controller);
888 if (cfd < 0)
889 return false;
890
891 /* Make sure we pass a relative path to *at() family of functions.
892 * . + /cg + \0
893 */
894 len = strlen(cg) + 2;
895 dirnam = alloca(len);
896 snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
897
898 fd = openat(cfd, dirnam, O_DIRECTORY);
899 if (fd < 0)
900 return false;
901
902 bret = recursive_rmdir(dirnam, fd, cfd);
903 close(fd);
904 return bret;
905 }
906
907 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
908 {
909 int cfd;
910 size_t len;
911 char *pathname;
912
913 cfd = find_mounted_controller(controller);
914 if (cfd < 0)
915 return false;
916
917 /* Make sure we pass a relative path to *at() family of functions.
918 * . + /file + \0
919 */
920 len = strlen(file) + 2;
921 pathname = alloca(len);
922 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
923 if (fchmodat(cfd, pathname, mode, 0) < 0)
924 return false;
925 return true;
926 }
927
928 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
929 {
930 size_t len;
931 char *fname;
932
933 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
934 fname = alloca(len);
935 snprintf(fname, len, "%s/tasks", dirname);
936 if (fchownat(fd, fname, uid, gid, 0) != 0)
937 return -errno;
938 snprintf(fname, len, "%s/cgroup.procs", dirname);
939 if (fchownat(fd, fname, uid, gid, 0) != 0)
940 return -errno;
941 return 0;
942 }
943
944 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
945 {
946 int cfd;
947 size_t len;
948 char *pathname;
949
950 cfd = find_mounted_controller(controller);
951 if (cfd < 0)
952 return false;
953
954 /* Make sure we pass a relative path to *at() family of functions.
955 * . + /file + \0
956 */
957 len = strlen(file) + 2;
958 pathname = alloca(len);
959 snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
960 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
961 return -errno;
962
963 if (is_dir(pathname, cfd))
964 // like cgmanager did, we want to chown the tasks file as well
965 return chown_tasks_files(pathname, uid, gid, cfd);
966
967 return 0;
968 }
969
970 FILE *open_pids_file(const char *controller, const char *cgroup)
971 {
972 int fd, cfd;
973 size_t len;
974 char *pathname;
975
976 cfd = find_mounted_controller(controller);
977 if (cfd < 0)
978 return NULL;
979
980 /* Make sure we pass a relative path to *at() family of functions.
981 * . + /cgroup + / "cgroup.procs" + \0
982 */
983 len = strlen(cgroup) + strlen("cgroup.procs") + 3;
984 pathname = alloca(len);
985 snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);
986
987 fd = openat(cfd, pathname, O_WRONLY);
988 if (fd < 0)
989 return NULL;
990
991 return fdopen(fd, "w");
992 }
993
994 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
995 void ***list, size_t typesize,
996 void* (*iterator)(const char*, const char*, const char*))
997 {
998 int cfd, fd, ret;
999 size_t len;
1000 char *cg;
1001 char pathname[MAXPATHLEN];
1002 size_t sz = 0, asz = 0;
1003 struct dirent *dirent;
1004 DIR *dir;
1005
1006 cfd = find_mounted_controller(controller);
1007 *list = NULL;
1008 if (cfd < 0)
1009 return false;
1010
1011 /* Make sure we pass a relative path to *at() family of functions. */
1012 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1013 cg = alloca(len);
1014 ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
1015 if (ret < 0 || (size_t)ret >= len) {
1016 lxcfs_error("Pathname too long under %s\n", cgroup);
1017 return false;
1018 }
1019
1020 fd = openat(cfd, cg, O_DIRECTORY);
1021 if (fd < 0)
1022 return false;
1023
1024 dir = fdopendir(fd);
1025 if (!dir)
1026 return false;
1027
1028 while ((dirent = readdir(dir))) {
1029 struct stat mystat;
1030
1031 if (!strcmp(dirent->d_name, ".") ||
1032 !strcmp(dirent->d_name, ".."))
1033 continue;
1034
1035 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1036 if (ret < 0 || ret >= MAXPATHLEN) {
1037 lxcfs_error("Pathname too long under %s\n", cg);
1038 continue;
1039 }
1040
1041 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1042 if (ret) {
1043 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1044 continue;
1045 }
1046 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1047 (directories && !S_ISDIR(mystat.st_mode)))
1048 continue;
1049
1050 if (sz+2 >= asz) {
1051 void **tmp;
1052 asz += BATCH_SIZE;
1053 do {
1054 tmp = realloc(*list, asz * typesize);
1055 } while (!tmp);
1056 *list = tmp;
1057 }
1058 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1059 (*list)[sz+1] = NULL;
1060 sz++;
1061 }
1062 if (closedir(dir) < 0) {
1063 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1064 return false;
1065 }
1066 return true;
1067 }
1068
1069 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1070 {
1071 char *dup;
1072 do {
1073 dup = strdup(dir_entry);
1074 } while (!dup);
1075 return dup;
1076 }
1077
1078 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
1079 {
1080 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
1081 }
1082
1083 void free_key(struct cgfs_files *k)
1084 {
1085 if (!k)
1086 return;
1087 free_disarm(k->name);
1088 free_disarm(k);
1089 }
1090
1091 void free_keys(struct cgfs_files **keys)
1092 {
1093 int i;
1094
1095 if (!keys)
1096 return;
1097 for (i = 0; keys[i]; i++) {
1098 free_key(keys[i]);
1099 }
1100 free_disarm(keys);
1101 }
1102
1103 bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
1104 {
1105 int ret, cfd;
1106 size_t len;
1107 char *fnam;
1108
1109 cfd = find_mounted_controller(controller);
1110 if (cfd < 0)
1111 return false;
1112
1113 /* Make sure we pass a relative path to *at() family of functions.
1114 * . + /cgroup + / + file + \0
1115 */
1116 len = strlen(cgroup) + strlen(file) + 3;
1117 fnam = alloca(len);
1118 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
1119 if (ret < 0 || (size_t)ret >= len)
1120 return false;
1121
1122 return (faccessat(cfd, fnam, F_OK, 0) == 0);
1123 }
1124
1125 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1126 {
1127 int ret, cfd;
1128 size_t len;
1129 char *fnam;
1130 struct stat sb;
1131 struct cgfs_files *newkey;
1132
1133 cfd = find_mounted_controller(controller);
1134 if (cfd < 0)
1135 return NULL;
1136
1137 if (file && *file == '/')
1138 file++;
1139
1140 if (file && strchr(file, '/'))
1141 return NULL;
1142
1143 /* Make sure we pass a relative path to *at() family of functions.
1144 * . + /cgroup + / + file + \0
1145 */
1146 len = strlen(cgroup) + 3;
1147 if (file)
1148 len += strlen(file) + 1;
1149 fnam = alloca(len);
1150 snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
1151 file ? "/" : "", file ? file : "");
1152
1153 ret = fstatat(cfd, fnam, &sb, 0);
1154 if (ret < 0)
1155 return NULL;
1156
1157 do {
1158 newkey = malloc(sizeof(struct cgfs_files));
1159 } while (!newkey);
1160 if (file)
1161 newkey->name = must_copy_string(file);
1162 else if (strrchr(cgroup, '/'))
1163 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1164 else
1165 newkey->name = must_copy_string(cgroup);
1166 newkey->uid = sb.st_uid;
1167 newkey->gid = sb.st_gid;
1168 newkey->mode = sb.st_mode;
1169
1170 return newkey;
1171 }
1172
1173 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1174 {
1175 struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
1176 if (!entry) {
1177 lxcfs_error("Error getting files under %s:%s\n", controller,
1178 cgroup);
1179 }
1180 return entry;
1181 }
1182
1183 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1184 {
1185 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1186 }
1187
1188 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
1189 {
1190 int cfd;
1191 size_t len;
1192 char *fnam;
1193 int ret;
1194 struct stat sb;
1195
1196 cfd = find_mounted_controller(controller);
1197 if (cfd < 0)
1198 return false;
1199
1200 /* Make sure we pass a relative path to *at() family of functions.
1201 * . + /cgroup + / + f + \0
1202 */
1203 len = strlen(cgroup) + strlen(f) + 3;
1204 fnam = alloca(len);
1205 ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
1206 if (ret < 0 || (size_t)ret >= len)
1207 return false;
1208
1209 ret = fstatat(cfd, fnam, &sb, 0);
1210 if (ret < 0 || !S_ISDIR(sb.st_mode))
1211 return false;
1212
1213 return true;
1214 }
1215
1216 #define SEND_CREDS_OK 0
1217 #define SEND_CREDS_NOTSK 1
1218 #define SEND_CREDS_FAIL 2
1219 static bool recv_creds(int sock, struct ucred *cred, char *v);
1220 static int wait_for_pid(pid_t pid);
1221 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1222 static int send_creds_clone_wrapper(void *arg);
1223
1224 /*
1225 * clone a task which switches to @task's namespace and writes '1'
1226 * over a unix sock so we can read the task's reaper's pid in our
1227 * namespace.
1228 *
1229 * Note: glibc's fork() does not respect pidns, which can lead to failed
1230 * assertions inside glibc (and thus failed forks) if the child's pid in
1231 * the pidns and the parent pid outside are identical. Using clone prevents
1232 * this issue.
1233 */
1234 static void write_task_init_pid_exit(int sock, pid_t target)
1235 {
1236 char fnam[100];
1237 pid_t pid;
1238 int fd, ret;
1239 size_t stack_size = sysconf(_SC_PAGESIZE);
1240 void *stack = alloca(stack_size);
1241
1242 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1243 if (ret < 0 || (size_t)ret >= sizeof(fnam))
1244 _exit(1);
1245
1246 fd = open(fnam, O_RDONLY);
1247 if (fd < 0) {
1248 perror("write_task_init_pid_exit open of ns/pid");
1249 _exit(1);
1250 }
1251 if (setns(fd, 0)) {
1252 perror("write_task_init_pid_exit setns 1");
1253 close(fd);
1254 _exit(1);
1255 }
1256 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1257 if (pid < 0)
1258 _exit(1);
1259 if (pid != 0) {
1260 if (!wait_for_pid(pid))
1261 _exit(1);
1262 _exit(0);
1263 }
1264 }
1265
1266 static int send_creds_clone_wrapper(void *arg) {
1267 struct ucred cred;
1268 char v;
1269 int sock = *(int *)arg;
1270
1271 /* we are the child */
1272 cred.uid = 0;
1273 cred.gid = 0;
1274 cred.pid = 1;
1275 v = '1';
1276 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1277 return 1;
1278 return 0;
1279 }
1280
1281 static pid_t get_init_pid_for_task(pid_t task)
1282 {
1283 int sock[2];
1284 pid_t pid;
1285 pid_t ret = -1;
1286 char v = '0';
1287 struct ucred cred;
1288
1289 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1290 perror("socketpair");
1291 return -1;
1292 }
1293
1294 pid = fork();
1295 if (pid < 0)
1296 goto out;
1297 if (!pid) {
1298 close(sock[1]);
1299 write_task_init_pid_exit(sock[0], task);
1300 _exit(0);
1301 }
1302
1303 if (!recv_creds(sock[1], &cred, &v))
1304 goto out;
1305 ret = cred.pid;
1306
1307 out:
1308 close(sock[0]);
1309 close(sock[1]);
1310 if (pid > 0)
1311 wait_for_pid(pid);
1312 return ret;
1313 }
1314
1315 pid_t lookup_initpid_in_store(pid_t qpid)
1316 {
1317 pid_t answer = 0;
1318 struct stat sb;
1319 struct pidns_init_store *e;
1320 char fnam[100];
1321
1322 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1323 store_lock();
1324 if (stat(fnam, &sb) < 0)
1325 goto out;
1326 e = lookup_verify_initpid(&sb);
1327 if (e) {
1328 answer = e->initpid;
1329 goto out;
1330 }
1331 answer = get_init_pid_for_task(qpid);
1332 if (answer > 0)
1333 save_initpid(&sb, answer);
1334
1335 out:
1336 /* we prune at end in case we are returning
1337 * the value we were about to return */
1338 prune_initpid_store();
1339 store_unlock();
1340 return answer;
1341 }
1342
1343 static int wait_for_pid(pid_t pid)
1344 {
1345 int status, ret;
1346
1347 if (pid <= 0)
1348 return -1;
1349
1350 again:
1351 ret = waitpid(pid, &status, 0);
1352 if (ret == -1) {
1353 if (errno == EINTR)
1354 goto again;
1355 return -1;
1356 }
1357 if (ret != pid)
1358 goto again;
1359 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
1360 return -1;
1361 return 0;
1362 }
1363
1364 /*
1365 * append the given formatted string to *src.
1366 * src: a pointer to a char* in which to append the formatted string.
1367 * sz: the number of characters printed so far, minus trailing \0.
1368 * asz: the allocated size so far
1369 * format: string format. See printf for details.
1370 * ...: varargs. See printf for details.
1371 */
1372 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1373 {
1374 char tmp[BUF_RESERVE_SIZE];
1375 va_list args;
1376
1377 va_start (args, format);
1378 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1379 va_end(args);
1380
1381 if (!*src || tmplen + *sz + 1 >= *asz) {
1382 char *new_src; /* renamed from tmp to avoid shadowing the buffer above */
1383 do {
1384 new_src = realloc(*src, *asz + BUF_RESERVE_SIZE);
1385 } while (!new_src);
1386 *src = new_src;
1387 *asz += BUF_RESERVE_SIZE;
1388 }
1389 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1390 *sz += tmplen;
1391 }
1392
1393 /*
1394 * append pid to *src.
1395 * src: a pointer to a char* in which to append the pid.
1396 * sz: the number of characters printed so far, minus trailing \0.
1397 * asz: the allocated size so far
1398 * pid: the pid to append
1399 */
1400 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1401 {
1402 must_strcat(src, sz, asz, "%d\n", (int)pid);
1403 }
1404
1405 /*
1406 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
1407 * valid in the caller's namespace, return the id mapped into
1408 * pid's namespace.
1409 * Returns the mapped id, or -1 on error.
1410 */
1411 unsigned int
1412 convert_id_to_ns(FILE *idfile, unsigned int in_id)
1413 {
1414 unsigned int nsuid, // base id for a range in the idfile's namespace
1415 hostuid, // base id for a range in the caller's namespace
1416 count; // number of ids in this range
1417 char line[400];
1418 int ret;
1419
1420 fseek(idfile, 0L, SEEK_SET);
1421 while (fgets(line, 400, idfile)) {
1422 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
1423 if (ret != 3)
1424 continue;
1425 if (hostuid + count < hostuid || nsuid + count < nsuid) {
1426 /*
1427 * uids wrapped around - unexpected as this is a procfile,
1428 * so just bail.
1429 */
1430 lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
1431 nsuid, hostuid, count, line);
1432 return -1;
1433 }
1434 if (hostuid <= in_id && hostuid+count > in_id) {
1435 /*
1436 * now since hostuid <= in_id < hostuid+count, and
1437 * neither hostuid+count nor nsuid+count wraps around,
1438 * we know that nsuid+(in_id-hostuid), which is less
1439 * than nsuid+count, cannot wrap around either
1440 */
1441 return (in_id - hostuid) + nsuid;
1442 }
1443 }
1444
1445 // no answer found
1446 return -1;
1447 }
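/*
 * Worked example: for a uid_map line "0 100000 65536" (nsuid hostuid
 * count), a caller-namespace id of 100042 satisfies
 * hostuid <= in_id < hostuid + count, so convert_id_to_ns() returns
 * (100042 - 100000) + 0 = 42.
 */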
1448
1449 /*
1450 * for is_privileged_over,
1451 * specify whether we require the calling uid to be root in his
1452 * namespace
1453 */
1454 #define NS_ROOT_REQD true
1455 #define NS_ROOT_OPT false
1456
1457 #define PROCLEN 100
1458
1459 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1460 {
1461 char fpath[PROCLEN];
1462 int ret;
1463 bool answer = false;
1464 uid_t nsuid;
1465
1466 if (victim == -1 || uid == -1)
1467 return false;
1468
1469 /*
1470 * If the request is one not requiring root in the namespace,
1471 * then having the same uid suffices. (i.e. uid 1000 has write
1472 * access to files owned by uid 1000).
1473 */
1474 if (!req_ns_root && uid == victim)
1475 return true;
1476
1477 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1478 if (ret < 0 || ret >= PROCLEN)
1479 return false;
1480 FILE *f = fopen(fpath, "r");
1481 if (!f)
1482 return false;
1483
1484 /* if caller's not root in his namespace, reject */
1485 nsuid = convert_id_to_ns(f, uid);
1486 if (nsuid)
1487 goto out;
1488
1489 /*
1490 * If victim is not mapped into caller's ns, reject.
1491 * XXX I'm not sure this check is needed given that fuse
1492 * will be sending requests where the vfs has converted
1493 */
1494 nsuid = convert_id_to_ns(f, victim);
1495 if (nsuid == -1)
1496 goto out;
1497
1498 answer = true;
1499
1500 out:
1501 fclose(f);
1502 return answer;
1503 }
1504
1505 static bool perms_include(int fmode, mode_t req_mode)
1506 {
1507 mode_t r;
1508
1509 switch (req_mode & O_ACCMODE) {
1510 case O_RDONLY:
1511 r = S_IROTH;
1512 break;
1513 case O_WRONLY:
1514 r = S_IWOTH;
1515 break;
1516 case O_RDWR:
1517 r = S_IROTH | S_IWOTH;
1518 break;
1519 default:
1520 return false;
1521 }
1522 return ((fmode & r) == r);
1523 }
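/*
 * Example: for a file with mode 0644, perms_include(0644, O_RDONLY) is
 * true (S_IROTH is set) while perms_include(0644, O_WRONLY) is false.
 * fc_may_access() below shifts the mode right by 6 or 3 first to test the
 * owner or group permission bits with the same helper.
 */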
1524
1525
1526 /*
1527 * taskcg is /a/b/c/d/e
1528 * querycg is /a/b/c
1529 * we return 'd'
1530 */
1531 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
1532 {
1533 char *start, *end;
1534
1535 if (strlen(taskcg) <= strlen(querycg)) {
1536 lxcfs_error("%s\n", "I was fed bad input.");
1537 return NULL;
1538 }
1539
1540 if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
1541 start = strdup(taskcg + 1);
1542 else
1543 start = strdup(taskcg + strlen(querycg) + 1);
1544 if (!start)
1545 return NULL;
1546 end = strchr(start, '/');
1547 if (end)
1548 *end = '\0';
1549 return start;
1550 }
1551
1552 char *get_pid_cgroup(pid_t pid, const char *contrl)
1553 {
1554 int cfd;
1555
1556 cfd = find_mounted_controller(contrl);
1557 if (cfd < 0)
1558 return NULL;
1559
1560 if (pure_unified_layout(cgroup_ops))
1561 return cg_unified_get_current_cgroup(pid);
1562
1563 return cg_legacy_get_current_cgroup(pid, contrl);
1564 }
1565
1566 /*
1567 * check whether a fuse context may access a cgroup dir or file
1568 *
1569 * If file is not null, it is a cgroup file to check under cg.
1570 * If file is null, then we are checking perms on cg itself.
1571 *
1572 * For files we can check the mode of the list_keys result.
1573 * For cgroups, we must make assumptions based on the files under the
1574 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1575 * yet.
1576 */
1577 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1578 {
1579 struct cgfs_files *k = NULL;
1580 bool ret = false;
1581
1582 k = cgfs_get_key(contrl, cg, file);
1583 if (!k)
1584 return false;
1585
1586 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1587 if (perms_include(k->mode >> 6, mode)) {
1588 ret = true;
1589 goto out;
1590 }
1591 }
1592 if (fc->gid == k->gid) {
1593 if (perms_include(k->mode >> 3, mode)) {
1594 ret = true;
1595 goto out;
1596 }
1597 }
1598 ret = perms_include(k->mode, mode);
1599
1600 out:
1601 free_key(k);
1602 return ret;
1603 }
1604
1605 #define INITSCOPE "/init.scope"
1606 void prune_init_slice(char *cg)
1607 {
1608 char *point;
1609 size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE);
1610
1611 if (cg_len < initscope_len)
1612 return;
1613
1614 point = cg + cg_len - initscope_len;
1615 if (strcmp(point, INITSCOPE) == 0) {
1616 if (point == cg)
1617 *(point+1) = '\0';
1618 else
1619 *point = '\0';
1620 }
1621 }
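/*
 * Example: prune_init_slice() turns "/user.slice/init.scope" into
 * "/user.slice", and a bare "/init.scope" into "/", so the ancestry
 * comparisons below are not confused by systemd's init scope.
 */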
1622
1623 /*
1624 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
1625 * If pid is in /a, he may act on /a/b, but not on /b.
1626 * If the answer is false and nextcg is not NULL, then *nextcg will point
1627 * to a string containing the next cgroup directory under cg, which must be
1628 * freed by the caller.
1629 */
1630 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
1631 {
1632 bool answer = false;
1633 char *c2 = get_pid_cgroup(pid, contrl);
1634 char *linecmp;
1635
1636 if (!c2)
1637 return false;
1638 prune_init_slice(c2);
1639
1640 /*
1641 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
1642 * they pass in a cgroup without leading '/'
1643 *
1644 * The original line here was:
1645 * linecmp = *cg == '/' ? c2 : c2+1;
1646 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
1647 * Serge, do you know?
1648 */
1649 if (*cg == '/' || !strncmp(cg, "./", 2))
1650 linecmp = c2;
1651 else
1652 linecmp = c2 + 1;
1653 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
1654 if (nextcg) {
1655 *nextcg = get_next_cgroup_dir(linecmp, cg);
1656 }
1657 goto out;
1658 }
1659 answer = true;
1660
1661 out:
1662 free(c2);
1663 return answer;
1664 }
1665
1666 /*
1667 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
1668 */
1669 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
1670 {
1671 bool answer = false;
1672 char *c2, *task_cg;
1673 size_t target_len, task_len;
1674
1675 if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
1676 return true;
1677
1678 c2 = get_pid_cgroup(pid, contrl);
1679 if (!c2)
1680 return false;
1681 prune_init_slice(c2);
1682
1683 task_cg = c2 + 1;
1684 target_len = strlen(cg);
1685 task_len = strlen(task_cg);
1686 if (task_len == 0) {
1687 /* Task is in the root cg, it can see everything. This case is
1688 * not handled by the strcmps below, since they test for the
1689 * last /, but that is the first / that we've chopped off
1690 * above.
1691 */
1692 answer = true;
1693 goto out;
1694 }
1695 if (strcmp(cg, task_cg) == 0) {
1696 answer = true;
1697 goto out;
1698 }
1699 if (target_len < task_len) {
1700 /* looking up a parent dir */
1701 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
1702 answer = true;
1703 goto out;
1704 }
1705 if (target_len > task_len) {
1706 /* looking up a child dir */
1707 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
1708 answer = true;
1709 goto out;
1710 }
1711
1712 out:
1713 free(c2);
1714 return answer;
1715 }
1716
1717 /*
1718 * given /cgroup/freezer/a/b, return "freezer".
1719 * the returned char* should NOT be freed.
1720 */
1721 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1722 {
1723 const char *p1;
1724 char *contr, *slash;
1725
1726 if (strlen(path) < 9) {
1727 errno = EACCES;
1728 return NULL;
1729 }
1730 if (*(path + 7) != '/') {
1731 errno = EINVAL;
1732 return NULL;
1733 }
1734 p1 = path + 8;
1735 contr = strdupa(p1);
1736 if (!contr) {
1737 errno = ENOMEM;
1738 return NULL;
1739 }
1740 slash = strstr(contr, "/");
1741 if (slash)
1742 *slash = '\0';
1743
1744 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1745 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
1746 return (*h)->__controllers;
1747 }
1748 errno = ENOENT;
1749 return NULL;
1750 }
1751
1752 /*
1753 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
1754 * Note that the returned value may include files (keynames) etc
1755 */
1756 static const char *find_cgroup_in_path(const char *path)
1757 {
1758 const char *p1;
1759
1760 if (strlen(path) < 9) {
1761 errno = EACCES;
1762 return NULL;
1763 }
1764 p1 = strstr(path + 8, "/");
1765 if (!p1) {
1766 errno = EINVAL;
1767 return NULL;
1768 }
1769 errno = 0;
1770 return p1 + 1;
1771 }
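/*
 * Example: for the path "/cgroup/freezer/a/b", pick_controller_from_path()
 * returns "freezer" and find_cgroup_in_path() returns "a/b" (which may
 * still end in a keyname such as freezer.state).
 */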
1772
1773 /*
1774 * split the last path element from the path in @cg.
1775 * @dir is newly allocated and should be freed, @last not
1776 */
1777 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1778 {
1779 char *p;
1780
1781 do {
1782 *dir = strdup(cg);
1783 } while (!*dir);
1784 *last = strrchr(cg, '/');
1785 if (!*last) {
1786 *last = NULL;
1787 return;
1788 }
1789 p = strrchr(*dir, '/');
1790 *p = '\0';
1791 }
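/*
 * Example: get_cgdir_and_path("a/b/c", &dir, &last) sets dir to a freshly
 * allocated "a/b" and last to point at the trailing "/c" inside the
 * caller's string (consumers such as cgfs_get_key() skip the leading '/').
 * For a path with no '/', dir is a copy of the whole string and last is
 * NULL.
 */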
1792
1793 /*
1794 * FUSE ops for /cgroup
1795 */
1796
1797 int cg_getattr(const char *path, struct stat *sb)
1798 {
1799 struct timespec now;
1800 struct fuse_context *fc = fuse_get_context();
1801 char * cgdir = NULL;
1802 char *last = NULL, *path1, *path2;
1803 struct cgfs_files *k = NULL;
1804 const char *cgroup;
1805 const char *controller = NULL;
1806 int ret = -ENOENT;
1807
1808
1809 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1810 return -EIO;
1811
1812 memset(sb, 0, sizeof(struct stat));
1813
1814 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1815 return -EINVAL;
1816
1817 sb->st_uid = sb->st_gid = 0;
1818 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1819 sb->st_size = 0;
1820
1821 if (strcmp(path, "/cgroup") == 0) {
1822 sb->st_mode = S_IFDIR | 00755;
1823 sb->st_nlink = 2;
1824 return 0;
1825 }
1826
1827 controller = pick_controller_from_path(fc, path);
1828 if (!controller)
1829 return -errno;
1830 cgroup = find_cgroup_in_path(path);
1831 if (!cgroup) {
1832 /* this is just /cgroup/controller, return it as a dir */
1833 sb->st_mode = S_IFDIR | 00755;
1834 sb->st_nlink = 2;
1835 return 0;
1836 }
1837
1838 get_cgdir_and_path(cgroup, &cgdir, &last);
1839
1840 if (!last) {
1841 path1 = "/";
1842 path2 = cgdir;
1843 } else {
1844 path1 = cgdir;
1845 path2 = last;
1846 }
1847
1848 pid_t initpid = lookup_initpid_in_store(fc->pid);
1849 if (initpid <= 1 || is_shared_pidns(initpid))
1850 initpid = fc->pid;
1851 /* check that path2 is either a child cgroup of cgdir, or listed in its keys.
1852 * Then check that caller's cgroup is under path if last is a child
1853 * cgroup, or cgdir if last is a file */
1854
1855 if (is_child_cgroup(controller, path1, path2)) {
1856 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1857 ret = -ENOENT;
1858 goto out;
1859 }
1860 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1861 /* this is just /cgroup/controller, return it as a dir */
1862 sb->st_mode = S_IFDIR | 00555;
1863 sb->st_nlink = 2;
1864 ret = 0;
1865 goto out;
1866 }
1867 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1868 ret = -EACCES;
1869 goto out;
1870 }
1871
1872 // get uid, gid, from '/tasks' file and make up a mode
1873 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1874 sb->st_mode = S_IFDIR | 00755;
1875 k = cgfs_get_key(controller, cgroup, NULL);
1876 if (!k) {
1877 sb->st_uid = sb->st_gid = 0;
1878 } else {
1879 sb->st_uid = k->uid;
1880 sb->st_gid = k->gid;
1881 }
1882 free_key(k);
1883 sb->st_nlink = 2;
1884 ret = 0;
1885 goto out;
1886 }
1887
1888 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1889 sb->st_mode = S_IFREG | k->mode;
1890 sb->st_nlink = 1;
1891 sb->st_uid = k->uid;
1892 sb->st_gid = k->gid;
1893 sb->st_size = 0;
1894 free_key(k);
1895 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1896 ret = -ENOENT;
1897 goto out;
1898 }
1899 ret = 0;
1900 }
1901
1902 out:
1903 free(cgdir);
1904 return ret;
1905 }
1906
1907 int cg_opendir(const char *path, struct fuse_file_info *fi)
1908 {
1909 struct fuse_context *fc = fuse_get_context();
1910 const char *cgroup;
1911 struct file_info *dir_info;
1912 char *controller = NULL;
1913
1914 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1915 return -EIO;
1916
1917 if (strcmp(path, "/cgroup") == 0) {
1918 cgroup = NULL;
1919 controller = NULL;
1920 } else {
1921 // return list of keys for the controller, and list of child cgroups
1922 controller = pick_controller_from_path(fc, path);
1923 if (!controller)
1924 return -errno;
1925
1926 cgroup = find_cgroup_in_path(path);
1927 if (!cgroup) {
1928 /* this is just /cgroup/controller, return its contents */
1929 cgroup = "/";
1930 }
1931 }
1932
1933 pid_t initpid = lookup_initpid_in_store(fc->pid);
1934 if (initpid <= 1 || is_shared_pidns(initpid))
1935 initpid = fc->pid;
1936 if (cgroup) {
1937 if (!caller_may_see_dir(initpid, controller, cgroup))
1938 return -ENOENT;
1939 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1940 return -EACCES;
1941 }
1942
1943 /* we'll free this at cg_releasedir */
1944 dir_info = malloc(sizeof(*dir_info));
1945 if (!dir_info)
1946 return -ENOMEM;
1947 dir_info->controller = must_copy_string(controller);
1948 dir_info->cgroup = must_copy_string(cgroup);
1949 dir_info->type = LXC_TYPE_CGDIR;
1950 dir_info->buf = NULL;
1951 dir_info->file = NULL;
1952 dir_info->buflen = 0;
1953
1954 fi->fh = (unsigned long)dir_info;
1955 return 0;
1956 }
1957
1958 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1959 struct fuse_file_info *fi)
1960 {
1961 struct file_info *d = (struct file_info *)fi->fh;
1962 struct cgfs_files **list = NULL;
1963 int i, ret;
1964 char *nextcg = NULL;
1965 struct fuse_context *fc = fuse_get_context();
1966 char **clist = NULL;
1967
1968 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1969 return -EIO;
1970
1971 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1972 return -EIO;
1973
1974 if (d->type != LXC_TYPE_CGDIR) {
1975 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1976 return -EIO;
1977 }
1978 if (!d->cgroup && !d->controller) {
1979 /*
1980 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
1981 * This only works with the legacy hierarchy.
1982 */
1983 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1984 if (is_unified_hierarchy(*h))
1985 continue;
1986
1987 if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
1988 return -EIO;
1989 }
1990
1991 return 0;
1992 }
1993
1994 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1995 // not a valid cgroup
1996 ret = -EINVAL;
1997 goto out;
1998 }
1999
2000 pid_t initpid = lookup_initpid_in_store(fc->pid);
2001 if (initpid <= 1 || is_shared_pidns(initpid))
2002 initpid = fc->pid;
2003 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2004 if (nextcg) {
2005 ret = filler(buf, nextcg, NULL, 0);
2006 free(nextcg);
2007 if (ret != 0) {
2008 ret = -EIO;
2009 goto out;
2010 }
2011 }
2012 ret = 0;
2013 goto out;
2014 }
2015
2016 for (i = 0; list && list[i]; i++) {
2017 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2018 ret = -EIO;
2019 goto out;
2020 }
2021 }
2022
2023 // now get the list of child cgroups
2024
2025 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2026 ret = 0;
2027 goto out;
2028 }
2029 if (clist) {
2030 for (i = 0; clist[i]; i++) {
2031 if (filler(buf, clist[i], NULL, 0) != 0) {
2032 ret = -EIO;
2033 goto out;
2034 }
2035 }
2036 }
2037 ret = 0;
2038
2039 out:
2040 free_keys(list);
2041 if (clist) {
2042 for (i = 0; clist[i]; i++)
2043 free(clist[i]);
2044 free(clist);
2045 }
2046 return ret;
2047 }
2048
2049 void do_release_file_info(struct fuse_file_info *fi)
2050 {
2051 struct file_info *f = (struct file_info *)fi->fh;
2052
2053 if (!f)
2054 return;
2055
2056 fi->fh = 0;
2057
2058 free_disarm(f->controller);
2059 free_disarm(f->cgroup);
2060 free_disarm(f->file);
2061 free_disarm(f->buf);
2062 free_disarm(f);
2063 }
2064
2065 int cg_releasedir(const char *path, struct fuse_file_info *fi)
2066 {
2067 do_release_file_info(fi);
2068 return 0;
2069 }
2070
2071 int cg_open(const char *path, struct fuse_file_info *fi)
2072 {
2073 const char *cgroup;
2074 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2075 struct cgfs_files *k = NULL;
2076 struct file_info *file_info;
2077 struct fuse_context *fc = fuse_get_context();
2078 int ret;
2079
2080 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2081 return -EIO;
2082
2083 controller = pick_controller_from_path(fc, path);
2084 if (!controller)
2085 return -errno;
2086 cgroup = find_cgroup_in_path(path);
2087 if (!cgroup)
2088 return -errno;
2089
2090 get_cgdir_and_path(cgroup, &cgdir, &last);
2091 if (!last) {
2092 path1 = "/";
2093 path2 = cgdir;
2094 } else {
2095 path1 = cgdir;
2096 path2 = last;
2097 }
2098
2099 k = cgfs_get_key(controller, path1, path2);
2100 if (!k) {
2101 ret = -EINVAL;
2102 goto out;
2103 }
2104 free_key(k);
2105
2106 pid_t initpid = lookup_initpid_in_store(fc->pid);
2107 if (initpid <= 1 || is_shared_pidns(initpid))
2108 initpid = fc->pid;
2109 if (!caller_may_see_dir(initpid, controller, path1)) {
2110 ret = -ENOENT;
2111 goto out;
2112 }
2113 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2114 ret = -EACCES;
2115 goto out;
2116 }
2117
2118 /* we'll free this at cg_release */
2119 file_info = malloc(sizeof(*file_info));
2120 if (!file_info) {
2121 ret = -ENOMEM;
2122 goto out;
2123 }
2124 file_info->controller = must_copy_string(controller);
2125 file_info->cgroup = must_copy_string(path1);
2126 file_info->file = must_copy_string(path2);
2127 file_info->type = LXC_TYPE_CGFILE;
2128 file_info->buf = NULL;
2129 file_info->buflen = 0;
2130
2131 fi->fh = (unsigned long)file_info;
2132 ret = 0;
2133
2134 out:
2135 free(cgdir);
2136 return ret;
2137 }
2138
2139 int cg_access(const char *path, int mode)
2140 {
2141 int ret;
2142 const char *cgroup;
2143 char *path1, *path2, *controller;
2144 char *last = NULL, *cgdir = NULL;
2145 struct cgfs_files *k = NULL;
2146 struct fuse_context *fc = fuse_get_context();
2147
2148 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2149 return -EIO;
2150
2151 if (strcmp(path, "/cgroup") == 0)
2152 return 0;
2153
2154 controller = pick_controller_from_path(fc, path);
2155 if (!controller)
2156 return -errno;
2157 cgroup = find_cgroup_in_path(path);
2158 if (!cgroup) {
2159 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2160 if ((mode & W_OK) == 0)
2161 return 0;
2162 return -EACCES;
2163 }
2164
2165 get_cgdir_and_path(cgroup, &cgdir, &last);
2166 if (!last) {
2167 path1 = "/";
2168 path2 = cgdir;
2169 } else {
2170 path1 = cgdir;
2171 path2 = last;
2172 }
2173
2174 k = cgfs_get_key(controller, path1, path2);
2175 if (!k) {
2176 if ((mode & W_OK) == 0)
2177 ret = 0;
2178 else
2179 ret = -EACCES;
2180 goto out;
2181 }
2182 free_key(k);
2183
2184 pid_t initpid = lookup_initpid_in_store(fc->pid);
2185 if (initpid <= 1 || is_shared_pidns(initpid))
2186 initpid = fc->pid;
2187 if (!caller_may_see_dir(initpid, controller, path1)) {
2188 ret = -ENOENT;
2189 goto out;
2190 }
2191 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2192 ret = -EACCES;
2193 goto out;
2194 }
2195
2196 ret = 0;
2197
2198 out:
2199 free(cgdir);
2200 return ret;
2201 }
2202
2203 int cg_release(const char *path, struct fuse_file_info *fi)
2204 {
2205 do_release_file_info(fi);
2206 return 0;
2207 }
2208
2209 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
2210
2211 static bool wait_for_sock(int sock, int timeout)
2212 {
2213 struct epoll_event ev;
2214 int epfd, ret, now, starttime, deltatime, saved_errno;
2215
2216 if ((starttime = time(NULL)) < 0)
2217 return false;
2218
2219 if ((epfd = epoll_create(1)) < 0) {
2220 lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
2221 return false;
2222 }
2223
2224 ev.events = POLLIN_SET;
2225 ev.data.fd = sock;
2226 if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
2227 lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
2228 close(epfd);
2229 return false;
2230 }
2231
2232 again:
2233 if ((now = time(NULL)) < 0) {
2234 close(epfd);
2235 return false;
2236 }
2237
2238 deltatime = (starttime + timeout) - now;
2239 if (deltatime < 0) { // timeout
2240 errno = 0;
2241 close(epfd);
2242 return false;
2243 }
2244 ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
2245 if (ret < 0 && errno == EINTR)
2246 goto again;
2247 saved_errno = errno;
2248 close(epfd);
2249
2250 if (ret <= 0) {
2251 errno = saved_errno;
2252 return false;
2253 }
2254 return true;
2255 }
2256
2257 static int msgrecv(int sockfd, void *buf, size_t len)
2258 {
2259 if (!wait_for_sock(sockfd, 2))
2260 return -1;
2261 return recv(sockfd, buf, len, MSG_DONTWAIT);
2262 }
2263
2264 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2265 {
2266 struct msghdr msg = { 0 };
2267 struct iovec iov;
2268 struct cmsghdr *cmsg;
2269 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2270 char buf[1];
2271 buf[0] = 'p';
2272
2273 if (pingfirst) {
2274 if (msgrecv(sock, buf, 1) != 1) {
2275 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2276 return SEND_CREDS_FAIL;
2277 }
2278 }
2279
2280 msg.msg_control = cmsgbuf;
2281 msg.msg_controllen = sizeof(cmsgbuf);
2282
2283 cmsg = CMSG_FIRSTHDR(&msg);
2284 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2285 cmsg->cmsg_level = SOL_SOCKET;
2286 cmsg->cmsg_type = SCM_CREDENTIALS;
2287 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2288
2289 msg.msg_name = NULL;
2290 msg.msg_namelen = 0;
2291
2292 buf[0] = v;
2293 iov.iov_base = buf;
2294 iov.iov_len = sizeof(buf);
2295 msg.msg_iov = &iov;
2296 msg.msg_iovlen = 1;
2297
2298 if (sendmsg(sock, &msg, 0) < 0) {
2299 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2300 		if (errno == ESRCH) /* no such process */
2301 return SEND_CREDS_NOTSK;
2302 return SEND_CREDS_FAIL;
2303 }
2304
2305 return SEND_CREDS_OK;
2306 }
2307
2308 static bool recv_creds(int sock, struct ucred *cred, char *v)
2309 {
2310 struct msghdr msg = { 0 };
2311 struct iovec iov;
2312 struct cmsghdr *cmsg;
2313 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2314 char buf[1];
2315 int ret;
2316 int optval = 1;
2317
2318 *v = '1';
2319
2320 cred->pid = -1;
2321 cred->uid = -1;
2322 cred->gid = -1;
2323
2324 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2325 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2326 return false;
2327 }
2328 buf[0] = '1';
2329 if (write(sock, buf, 1) != 1) {
2330 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2331 return false;
2332 }
2333
2334 msg.msg_name = NULL;
2335 msg.msg_namelen = 0;
2336 msg.msg_control = cmsgbuf;
2337 msg.msg_controllen = sizeof(cmsgbuf);
2338
2339 iov.iov_base = buf;
2340 iov.iov_len = sizeof(buf);
2341 msg.msg_iov = &iov;
2342 msg.msg_iovlen = 1;
2343
2344 if (!wait_for_sock(sock, 2)) {
2345 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2346 return false;
2347 }
2348 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2349 if (ret < 0) {
2350 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2351 return false;
2352 }
2353
2354 cmsg = CMSG_FIRSTHDR(&msg);
2355
2356 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2357 cmsg->cmsg_level == SOL_SOCKET &&
2358 cmsg->cmsg_type == SCM_CREDENTIALS) {
2359 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2360 }
2361 *v = buf[0];
2362
2363 return true;
2364 }
2365
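/*
 * Illustrative sketch of how send_creds() and recv_creds() pair up across
 * a socketpair (kept under #if 0; a usage sketch, not part of the build).
 * The kernel rewrites the pid in the ucred into the receiver's pid
 * namespace, which is the whole point of the exchange. The ping byte
 * exists because SO_PASSCRED must be enabled on the receiving end before
 * the sender transmits, so the receiver writes one byte first and the
 * sender waits for it (pingfirst = true).
 */
#if 0
static void creds_example(void)
{
	int sv[2];
	struct ucred cred = { .pid = getpid(), .uid = getuid(), .gid = getgid() };

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
		return;

	if (fork() == 0) { /* child: receives and sees the translated pid */
		struct ucred peer;
		char got;

		if (recv_creds(sv[1], &peer, &got))
			printf("sender pid as seen in this pidns: %d\n", peer.pid);
		_exit(0);
	}

	/* parent: wait for the child's ping, then send our credentials */
	send_creds(sv[0], &cred, '0', true);
	wait(NULL);
}
#endif
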
2366 struct pid_ns_clone_args {
2367 int *cpipe;
2368 int sock;
2369 pid_t tpid;
2370 int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
2371 };
2372
2373 /*
2374 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2375 * with clone(). This simply writes '1' as ACK back to the parent
2376 * before calling the actual wrapped function.
2377 */
2378 static int pid_ns_clone_wrapper(void *arg) {
2379 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2380 char b = '1';
2381
2382 close(args->cpipe[0]);
2383 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2384 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2385 close(args->cpipe[1]);
2386 return args->wrapped(args->sock, args->tpid);
2387 }
2388
2389 /*
2390 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2391 * int value back over the socket. This shifts the pid from the
2392 * sender's pidns into tpid's pidns.
2393 */
2394 static int pid_to_ns(int sock, pid_t tpid)
2395 {
2396 char v = '0';
2397 struct ucred cred;
2398
2399 while (recv_creds(sock, &cred, &v)) {
2400 if (v == '1')
2401 return 0;
2402 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2403 return 1;
2404 }
2405 return 0;
2406 }
2407
2409 /*
2410 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2411 * in your old pidns. Only children which you clone will be in the target
2412 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2413 * actually convert pids.
2414 *
2415 * Note: glibc's fork() does not respect pidns, which can lead to failed
2416 * assertions inside glibc (and thus failed forks) if the child's pid in
2417 * the pidns and the parent pid outside are identical. Using clone prevents
2418 * this issue.
2419 */
2420 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2421 {
2422 int newnsfd = -1, ret, cpipe[2];
2423 char fnam[100];
2424 pid_t cpid;
2425 char v;
2426
2427 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2428 if (ret < 0 || ret >= sizeof(fnam))
2429 _exit(1);
2430 newnsfd = open(fnam, O_RDONLY);
2431 if (newnsfd < 0)
2432 _exit(1);
2433 if (setns(newnsfd, 0) < 0)
2434 _exit(1);
2435 close(newnsfd);
2436
2437 if (pipe(cpipe) < 0)
2438 _exit(1);
2439
2440 struct pid_ns_clone_args args = {
2441 .cpipe = cpipe,
2442 .sock = sock,
2443 .tpid = tpid,
2444 .wrapped = &pid_to_ns
2445 };
2446 size_t stack_size = sysconf(_SC_PAGESIZE);
2447 void *stack = alloca(stack_size);
2448
2449 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2450 if (cpid < 0)
2451 _exit(1);
2452
2453 // give the child 1 second to be done forking and
2454 // write its ack
2455 if (!wait_for_sock(cpipe[0], 1))
2456 _exit(1);
2457 ret = read(cpipe[0], &v, 1);
2458 if (ret != sizeof(char) || v != '1')
2459 _exit(1);
2460
2461 if (!wait_for_pid(cpid))
2462 _exit(1);
2463 _exit(0);
2464 }
2465
2466 /*
2467  * To read pid-containing cgroup files for a particular reader, we read the
2468  * data here, then fork a child which setns()s into the reader's pidns and
2469  * clones a helper truly inside that ns to translate each pid over a socketpair.
2470 */
2471 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2472 {
2473 int sock[2] = {-1, -1};
2474 char *tmpdata = NULL;
2475 int ret;
2476 pid_t qpid, cpid = -1;
2477 bool answer = false;
2478 char v = '0';
2479 struct ucred cred;
2480 size_t sz = 0, asz = 0;
2481
2482 if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
2483 return false;
2484
2485 /*
2486 * Now we read the pids from returned data one by one, pass
2487 * them into a child in the target namespace, read back the
2488 * translated pids, and put them into our to-return data
2489 */
2490
2491 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2492 perror("socketpair");
2493 free(tmpdata);
2494 return false;
2495 }
2496
2497 cpid = fork();
2498 if (cpid == -1)
2499 goto out;
2500
2501 if (!cpid) // child - exits when done
2502 pid_to_ns_wrapper(sock[1], tpid);
2503
2504 char *ptr = tmpdata;
2505 cred.uid = 0;
2506 cred.gid = 0;
2507 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2508 cred.pid = qpid;
2509 ret = send_creds(sock[0], &cred, v, true);
2510
2511 if (ret == SEND_CREDS_NOTSK)
2512 goto next;
2513 if (ret == SEND_CREDS_FAIL)
2514 goto out;
2515
2516 // read converted results
2517 if (!wait_for_sock(sock[0], 2)) {
2518 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2519 goto out;
2520 }
2521 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2522 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2523 goto out;
2524 }
2525 must_strcat_pid(d, &sz, &asz, qpid);
2526 next:
2527 ptr = strchr(ptr, '\n');
2528 if (!ptr)
2529 break;
2530 ptr++;
2531 }
2532
2533 cred.pid = getpid();
2534 v = '1';
2535 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2536 // failed to ask child to exit
2537 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2538 goto out;
2539 }
2540
2541 answer = true;
2542
2543 out:
2544 free(tmpdata);
2545 if (cpid != -1)
2546 wait_for_pid(cpid);
2547 if (sock[0] != -1) {
2548 close(sock[0]);
2549 close(sock[1]);
2550 }
2551 return answer;
2552 }
2553
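/*
 * Illustrative usage sketch for do_read_pids() (under #if 0, not part of
 * the build; the controller and cgroup names are hypothetical). On
 * success the translated, newline-separated pids are appended to *d,
 * which the caller must free.
 */
#if 0
static void read_pids_example(pid_t container_init)
{
	char *data = NULL;

	if (do_read_pids(container_init, "freezer", "/lxc/c1", "tasks", &data))
		printf("%s", data ? data : "");
	free(data);
}
#endif
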
2554 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2555 struct fuse_file_info *fi)
2556 {
2557 struct fuse_context *fc = fuse_get_context();
2558 struct file_info *f = (struct file_info *)fi->fh;
2559 struct cgfs_files *k = NULL;
2560 char *data = NULL;
2561 int ret, s;
2562 bool r;
2563
2564 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2565 return -EIO;
2566
2567 if (f->type != LXC_TYPE_CGFILE) {
2568 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2569 return -EIO;
2570 }
2571
2572 if (offset)
2573 return 0;
2574
2575 if (!f->controller)
2576 return -EINVAL;
2577
2578 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2579 return -EINVAL;
2580 }
2581 free_key(k);
2582
2584 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2585 ret = -EACCES;
2586 goto out;
2587 }
2588
2589 if (strcmp(f->file, "tasks") == 0 ||
2590 strcmp(f->file, "/tasks") == 0 ||
2591 strcmp(f->file, "/cgroup.procs") == 0 ||
2592 strcmp(f->file, "cgroup.procs") == 0)
2593 // special case - we have to translate the pids
2594 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2595 else
2596 r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
2597
2598 if (!r) {
2599 ret = -EINVAL;
2600 goto out;
2601 }
2602
2603 if (!data) {
2604 ret = 0;
2605 goto out;
2606 }
2607 s = strlen(data);
2608 if (s > size)
2609 s = size;
2610 memcpy(buf, data, s);
2611 if (s > 0 && s < size && data[s-1] != '\n')
2612 buf[s++] = '\n';
2613
2614 ret = s;
2615
2616 out:
2617 free(data);
2618 return ret;
2619 }
2620
2621 static int pid_from_ns(int sock, pid_t tpid)
2622 {
2623 pid_t vpid;
2624 struct ucred cred;
2625 char v;
2626 int ret;
2627
2628 cred.uid = 0;
2629 cred.gid = 0;
2630 while (1) {
2631 if (!wait_for_sock(sock, 2)) {
2632 lxcfs_error("%s\n", "Timeout reading from parent.");
2633 return 1;
2634 }
2635 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2636 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2637 return 1;
2638 }
2639 if (vpid == -1) // done
2640 break;
2641 v = '0';
2642 cred.pid = vpid;
2643 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2644 v = '1';
2645 cred.pid = getpid();
2646 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2647 return 1;
2648 }
2649 }
2650 return 0;
2651 }
2652
2653 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2654 {
2655 int newnsfd = -1, ret, cpipe[2];
2656 char fnam[100];
2657 pid_t cpid;
2658 char v;
2659
2660 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2661 if (ret < 0 || ret >= sizeof(fnam))
2662 _exit(1);
2663 newnsfd = open(fnam, O_RDONLY);
2664 if (newnsfd < 0)
2665 _exit(1);
2666 if (setns(newnsfd, 0) < 0)
2667 _exit(1);
2668 close(newnsfd);
2669
2670 if (pipe(cpipe) < 0)
2671 _exit(1);
2672
2673 struct pid_ns_clone_args args = {
2674 .cpipe = cpipe,
2675 .sock = sock,
2676 .tpid = tpid,
2677 .wrapped = &pid_from_ns
2678 };
2679 size_t stack_size = sysconf(_SC_PAGESIZE);
2680 void *stack = alloca(stack_size);
2681
2682 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2683 if (cpid < 0)
2684 _exit(1);
2685
2686 // give the child 1 second to be done forking and
2687 // write its ack
2688 if (!wait_for_sock(cpipe[0], 1))
2689 _exit(1);
2690 ret = read(cpipe[0], &v, 1);
2691 if (ret != sizeof(char) || v != '1')
2692 _exit(1);
2693
2694 if (!wait_for_pid(cpid))
2695 _exit(1);
2696 _exit(0);
2697 }
2698
2699 /*
2700  * Given host @uid, look up the uid to which it maps in @pid's user
2701  * namespace and store it in @answer. Returns false if there is no mapping.
2702 */
2703 bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
2704 {
2705 FILE *f;
2706 char line[400];
2707
2708 sprintf(line, "/proc/%d/uid_map", pid);
2709 if ((f = fopen(line, "r")) == NULL) {
2710 return false;
2711 }
2712
2713 *answer = convert_id_to_ns(f, uid);
2714 fclose(f);
2715
2716 if (*answer == -1)
2717 return false;
2718 return true;
2719 }
2720
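/*
 * Worked example for the lookup above: given a /proc/<pid>/uid_map of
 *
 *     0 100000 65536
 *
 * host uid 100042 falls inside [100000, 165536) and maps to ns uid 42,
 * while host uid 1000 is outside every range and yields no mapping. A
 * sketch of that per-range arithmetic (hypothetical helper, not built;
 * convert_id_to_ns() does the actual parsing):
 */
#if 0
static uid_t map_one_range(uid_t hostuid, uid_t ns_start, uid_t host_start,
			   uid_t count)
{
	if (hostuid >= host_start && hostuid - host_start < count)
		return ns_start + (hostuid - host_start);
	return -1; /* no mapping in this range */
}
#endif
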
2721 /*
2722 * get_pid_creds: get the real uid and gid of @pid from
2723 * /proc/$$/status
2724 * (XXX should we use euid here?)
2725 */
2726 void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
2727 {
2728 char line[400];
2729 uid_t u;
2730 gid_t g;
2731 FILE *f;
2732
2733 *uid = -1;
2734 *gid = -1;
2735 sprintf(line, "/proc/%d/status", pid);
2736 if ((f = fopen(line, "r")) == NULL) {
2737 lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
2738 return;
2739 }
2740 while (fgets(line, 400, f)) {
2741 if (strncmp(line, "Uid:", 4) == 0) {
2742 if (sscanf(line+4, "%u", &u) != 1) {
2743 lxcfs_error("bad uid line for pid %u\n", pid);
2744 fclose(f);
2745 return;
2746 }
2747 *uid = u;
2748 } else if (strncmp(line, "Gid:", 4) == 0) {
2749 if (sscanf(line+4, "%u", &g) != 1) {
2750 lxcfs_error("bad gid line for pid %u\n", pid);
2751 fclose(f);
2752 return;
2753 }
2754 *gid = g;
2755 }
2756 }
2757 fclose(f);
2758 }
2759
2760 /*
2761 * May the requestor @r move victim @v to a new cgroup?
2762 * This is allowed if
2763 * . they are the same task
2764  * . they are owned by the same uid
2765 * . @r is root on the host, or
2766 * . @v's uid is mapped into @r's where @r is root.
2767 */
2768 bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
2769 {
2770 uid_t v_uid, tmpuid;
2771 gid_t v_gid;
2772
2773 if (r == v)
2774 return true;
2775 if (r_uid == 0)
2776 return true;
2777 get_pid_creds(v, &v_uid, &v_gid);
2778 if (r_uid == v_uid)
2779 return true;
2780 if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
2781 && hostuid_to_ns(v_uid, r, &tmpuid))
2782 return true;
2783 return false;
2784 }
2785
2786 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2787 const char *file, const char *buf)
2788 {
2789 int sock[2] = {-1, -1};
2790 pid_t qpid, cpid = -1;
2791 FILE *pids_file = NULL;
2792 bool answer = false, fail = false;
2793
2794 pids_file = open_pids_file(contrl, cg);
2795 if (!pids_file)
2796 return false;
2797
2798 /*
2799 * write the pids to a socket, have helper in writer's pidns
2800 * call movepid for us
2801 */
2802 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2803 perror("socketpair");
2804 goto out;
2805 }
2806
2807 cpid = fork();
2808 if (cpid == -1)
2809 goto out;
2810
2811 if (!cpid) { // child
2812 fclose(pids_file);
2813 pid_from_ns_wrapper(sock[1], tpid);
2814 }
2815
2816 const char *ptr = buf;
2817 while (sscanf(ptr, "%d", &qpid) == 1) {
2818 struct ucred cred;
2819 char v;
2820
2821 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2822 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2823 goto out;
2824 }
2825
2826 if (recv_creds(sock[0], &cred, &v)) {
2827 if (v == '0') {
2828 if (!may_move_pid(tpid, tuid, cred.pid)) {
2829 fail = true;
2830 break;
2831 }
2832 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2833 fail = true;
2834 }
2835 }
2836
2837 ptr = strchr(ptr, '\n');
2838 if (!ptr)
2839 break;
2840 ptr++;
2841 }
2842
2843 	/* All done - ask the child to exit by sending it pid -1 */
2844 qpid = -1;
2845 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2846 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2847
2848 if (!fail)
2849 answer = true;
2850
2851 out:
2852 if (cpid != -1)
2853 wait_for_pid(cpid);
2854 if (sock[0] != -1) {
2855 close(sock[0]);
2856 close(sock[1]);
2857 }
2858 if (pids_file) {
2859 if (fclose(pids_file) != 0)
2860 answer = false;
2861 }
2862 return answer;
2863 }
2864
2865 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2866 struct fuse_file_info *fi)
2867 {
2868 struct fuse_context *fc = fuse_get_context();
2869 char *localbuf = NULL;
2870 struct cgfs_files *k = NULL;
2871 struct file_info *f = (struct file_info *)fi->fh;
2872 bool r;
2873
2874 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2875 return -EIO;
2876
2877 if (f->type != LXC_TYPE_CGFILE) {
2878 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2879 return -EIO;
2880 }
2881
2882 if (offset)
2883 return 0;
2884
2885 localbuf = alloca(size+1);
2886 localbuf[size] = '\0';
2887 memcpy(localbuf, buf, size);
2888
2889 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2890 size = -EINVAL;
2891 goto out;
2892 }
2893
2894 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2895 size = -EACCES;
2896 goto out;
2897 }
2898
2899 if (strcmp(f->file, "tasks") == 0 ||
2900 strcmp(f->file, "/tasks") == 0 ||
2901 strcmp(f->file, "/cgroup.procs") == 0 ||
2902 strcmp(f->file, "cgroup.procs") == 0)
2903 // special case - we have to translate the pids
2904 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2905 else
2906 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2907
2908 if (!r)
2909 size = -EINVAL;
2910
2911 out:
2912 free_key(k);
2913 return size;
2914 }
2915
2916 int cg_chown(const char *path, uid_t uid, gid_t gid)
2917 {
2918 struct fuse_context *fc = fuse_get_context();
2919 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2920 struct cgfs_files *k = NULL;
2921 const char *cgroup;
2922 int ret;
2923
2924 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2925 return -EIO;
2926
2927 if (strcmp(path, "/cgroup") == 0)
2928 return -EPERM;
2929
2930 controller = pick_controller_from_path(fc, path);
2931 if (!controller)
2932 return errno == ENOENT ? -EPERM : -errno;
2933
2934 cgroup = find_cgroup_in_path(path);
2935 if (!cgroup)
2936 /* this is just /cgroup/controller */
2937 return -EPERM;
2938
2939 get_cgdir_and_path(cgroup, &cgdir, &last);
2940
2941 if (!last) {
2942 path1 = "/";
2943 path2 = cgdir;
2944 } else {
2945 path1 = cgdir;
2946 path2 = last;
2947 }
2948
2949 if (is_child_cgroup(controller, path1, path2)) {
2950 // get uid, gid, from '/tasks' file and make up a mode
2951 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2952 k = cgfs_get_key(controller, cgroup, "tasks");
2953
2954 } else
2955 k = cgfs_get_key(controller, path1, path2);
2956
2957 if (!k) {
2958 ret = -EINVAL;
2959 goto out;
2960 }
2961
2962 /*
2963 * This being a fuse request, the uid and gid must be valid
2964 * in the caller's namespace. So we can just check to make
2965 * sure that the caller is root in his uid, and privileged
2966 * over the file's current owner.
2967 */
2968 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2969 ret = -EACCES;
2970 goto out;
2971 }
2972
2973 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2974
2975 out:
2976 free_key(k);
2977 free(cgdir);
2978
2979 return ret;
2980 }
2981
2982 int cg_chmod(const char *path, mode_t mode)
2983 {
2984 struct fuse_context *fc = fuse_get_context();
2985 	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2986 struct cgfs_files *k = NULL;
2987 const char *cgroup;
2988 int ret;
2989
2990 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2991 return -EIO;
2992
2993 if (strcmp(path, "/cgroup") == 0)
2994 return -EPERM;
2995
2996 controller = pick_controller_from_path(fc, path);
2997 if (!controller)
2998 return errno == ENOENT ? -EPERM : -errno;
2999
3000 cgroup = find_cgroup_in_path(path);
3001 if (!cgroup)
3002 /* this is just /cgroup/controller */
3003 return -EPERM;
3004
3005 get_cgdir_and_path(cgroup, &cgdir, &last);
3006
3007 if (!last) {
3008 path1 = "/";
3009 path2 = cgdir;
3010 } else {
3011 path1 = cgdir;
3012 path2 = last;
3013 }
3014
3015 if (is_child_cgroup(controller, path1, path2)) {
3016 // get uid, gid, from '/tasks' file and make up a mode
3017 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
3018 k = cgfs_get_key(controller, cgroup, "tasks");
3019
3020 } else
3021 k = cgfs_get_key(controller, path1, path2);
3022
3023 if (!k) {
3024 ret = -EINVAL;
3025 goto out;
3026 }
3027
3028 /*
3029 * This being a fuse request, the uid and gid must be valid
3030 * in the caller's namespace. So we can just check to make
3031 * sure that the caller is root in his uid, and privileged
3032 * over the file's current owner.
3033 */
3034 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
3035 ret = -EPERM;
3036 goto out;
3037 }
3038
3039 if (!cgfs_chmod_file(controller, cgroup, mode)) {
3040 ret = -EINVAL;
3041 goto out;
3042 }
3043
3044 ret = 0;
3045 out:
3046 free_key(k);
3047 free(cgdir);
3048 return ret;
3049 }
3050
3051 int cg_mkdir(const char *path, mode_t mode)
3052 {
3053 struct fuse_context *fc = fuse_get_context();
3054 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
3055 const char *cgroup;
3056 int ret;
3057
3058 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3059 return -EIO;
3060
3061 controller = pick_controller_from_path(fc, path);
3062 if (!controller)
3063 return errno == ENOENT ? -EPERM : -errno;
3064
3065 cgroup = find_cgroup_in_path(path);
3066 if (!cgroup)
3067 return -errno;
3068
3069 get_cgdir_and_path(cgroup, &cgdir, &last);
3070 if (!last)
3071 path1 = "/";
3072 else
3073 path1 = cgdir;
3074
3075 pid_t initpid = lookup_initpid_in_store(fc->pid);
3076 if (initpid <= 1 || is_shared_pidns(initpid))
3077 initpid = fc->pid;
3078 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3079 if (!next)
3080 ret = -EINVAL;
3081 else if (last && strcmp(next, last) == 0)
3082 ret = -EEXIST;
3083 else
3084 ret = -EPERM;
3085 goto out;
3086 }
3087
3088 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3089 ret = -EACCES;
3090 goto out;
3091 }
3092 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3093 ret = -EACCES;
3094 goto out;
3095 }
3096
3097 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3098
3099 out:
3100 free(cgdir);
3101 free(next);
3102 return ret;
3103 }
3104
3105 int cg_rmdir(const char *path)
3106 {
3107 struct fuse_context *fc = fuse_get_context();
3108 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3109 const char *cgroup;
3110 int ret;
3111
3112 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
3113 return -EIO;
3114
3115 controller = pick_controller_from_path(fc, path);
3116 if (!controller) /* Someone's trying to delete "/cgroup". */
3117 return -EPERM;
3118
3119 cgroup = find_cgroup_in_path(path);
3120 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3121 return -EPERM;
3122
3123 get_cgdir_and_path(cgroup, &cgdir, &last);
3124 if (!last) {
3125 /* Someone's trying to delete a cgroup on the same level as the
3126 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3127 * rmdir "/cgroup/blkio/init.slice".
3128 */
3129 ret = -EPERM;
3130 goto out;
3131 }
3132
3133 pid_t initpid = lookup_initpid_in_store(fc->pid);
3134 if (initpid <= 1 || is_shared_pidns(initpid))
3135 initpid = fc->pid;
3136 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3137 if (!last || (next && (strcmp(next, last) == 0)))
3138 ret = -EBUSY;
3139 else
3140 ret = -ENOENT;
3141 goto out;
3142 }
3143
3144 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3145 ret = -EACCES;
3146 goto out;
3147 }
3148 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3149 ret = -EACCES;
3150 goto out;
3151 }
3152
3153 if (!cgfs_remove(controller, cgroup)) {
3154 ret = -EINVAL;
3155 goto out;
3156 }
3157
3158 ret = 0;
3159
3160 out:
3161 free(cgdir);
3162 free(next);
3163 return ret;
3164 }
3165
3166 static bool startswith(const char *line, const char *pref)
3167 {
3168 if (strncmp(line, pref, strlen(pref)) == 0)
3169 return true;
3170 return false;
3171 }
3172
3173 /* Note that "memory.stat" in cgroup2 is hierarchical by default. */
3174 static void parse_memstat(int version,
3175 char *memstat,
3176 unsigned long *cached,
3177 unsigned long *active_anon,
3178 unsigned long *inactive_anon,
3179 unsigned long *active_file,
3180 unsigned long *inactive_file,
3181 unsigned long *unevictable,
3182 unsigned long *shmem)
3183 {
3184 	struct {
3185 		const char *unified;
3186 		const char *legacy;
3187 		unsigned long *value;
3188 	} keys[] = {
3189 		{ "cache",         "total_cache",         cached        },
3190 		{ "active_anon",   "total_active_anon",   active_anon   },
3191 		{ "inactive_anon", "total_inactive_anon", inactive_anon },
3192 		{ "active_file",   "total_active_file",   active_file   },
3193 		{ "inactive_file", "total_inactive_file", inactive_file },
3194 		{ "unevictable",   "total_unevictable",   unevictable   },
3195 		{ "shmem",         "total_shmem",         shmem         },
3196 	};
3197 	char *eol;
3198 	size_t i;
3199
3200 	while (*memstat) {
3201 		for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
3202 			const char *key = is_unified_controller(version)
3203 					      ? keys[i].unified
3204 					      : keys[i].legacy;
3205
3206 			if (!startswith(memstat, key))
3207 				continue;
3208
3209 			/* Skip over the matched key itself so the value is
3210 			 * parsed correctly for both key variants; the old
3211 			 * fixed offsets only fit the legacy "total_*" keys.
3212 			 */
3213 			sscanf(memstat + strlen(key), "%lu", keys[i].value);
3214 			*keys[i].value /= 1024;
3215 			break;
3216 		}
3217
3218 		eol = strchr(memstat, '\n');
3219 		if (!eol)
3220 			return;
3221 		memstat = eol + 1;
3222 	}
3228 }
3229
3230 static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
3231 {
3232 char *eol;
3233 char key[32];
3234
3235 memset(key, 0, 32);
3236 snprintf(key, 32, "%u:%u %s", major, minor, iotype);
3237
3238 size_t len = strlen(key);
3239 *v = 0;
3240
3241 while (*str) {
3242 if (startswith(str, key)) {
3243 sscanf(str + len, "%lu", v);
3244 return;
3245 }
3246 eol = strchr(str, '\n');
3247 if (!eol)
3248 return;
3249 str = eol+1;
3250 }
3251 }
3252
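/*
 * Example of the blkio data this parses, with hypothetical values. For
 *
 *     8:0 Read 1420288
 *     8:0 Write 81920
 *     8:0 Total 1502208
 *     Total 1502208
 *
 * get_blkio_io_value(str, 8, 0, "Read", &v) builds the key "8:0 Read"
 * and sets v to 1420288.
 */
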
3253 int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d)
3254 {
3255 __do_free char *line = NULL;
3256 __do_fclose FILE *f = NULL;
3257 size_t linelen = 0, total_len = 0;
3258 char *cache = d->buf;
3259 size_t cache_size = d->buflen;
3260
3261 f = fopen(path, "r");
3262 if (!f)
3263 return 0;
3264
3265 while (getline(&line, &linelen, f) != -1) {
3266 ssize_t l = snprintf(cache, cache_size, "%s", line);
3267 if (l < 0) {
3268 perror("Error writing to cache");
3269 return 0;
3270 }
3271 if (l >= cache_size) {
3272 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3273 return 0;
3274 }
3275 cache += l;
3276 cache_size -= l;
3277 total_len += l;
3278 }
3279
3280 d->size = total_len;
3281 if (total_len > size)
3282 total_len = size;
3283
3284 /* read from off 0 */
3285 memcpy(buf, d->buf, total_len);
3286
3287 if (d->size > total_len)
3288 d->cached = d->size - total_len;
3289 return total_len;
3290 }
3291
3292 /*
3293 * FUSE ops for /proc
3294 */
3295
3296 static unsigned long get_memlimit(const char *cgroup, bool swap)
3297 {
3298 int ret;
3299 __do_free char *memlimit_str = NULL;
3300 unsigned long memlimit = -1;
3301
3302 if (swap)
3303 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str);
3304 else
3305 ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str);
3306 if (ret > 0)
3307 memlimit = strtoul(memlimit_str, NULL, 10);
3308
3309 return memlimit;
3310 }
3311
3312 static unsigned long get_min_memlimit(const char *cgroup, bool swap)
3313 {
3314 __do_free char *copy = NULL;
3315 unsigned long memlimit = 0;
3316 unsigned long retlimit;
3317
3318 copy = strdup(cgroup);
3319 retlimit = get_memlimit(copy, swap);
3320
3321 while (strcmp(copy, "/") != 0) {
3322 char *it = copy;
3323
3324 it = dirname(it);
3325 memlimit = get_memlimit(it, swap);
3326 if (memlimit != -1 && memlimit < retlimit)
3327 retlimit = memlimit;
3328 	}
3329
3330 return retlimit;
3331 }
3332
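/*
 * Worked example for get_min_memlimit(): for cgroup "/lxc/c1/workload"
 * the limit on the leaf is read first, then dirname() walks up through
 * "/lxc/c1" and "/lxc" to "/", and the smallest limit found on the path
 * wins. So a 1G limit on /lxc/c1 caps a 4G limit set on the leaf.
 */
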
3333 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3334 struct fuse_file_info *fi)
3335 {
3336 __do_free char *cgroup = NULL, *line = NULL,
3337 *memusage_str = NULL, *memstat_str = NULL,
3338 *memswlimit_str = NULL, *memswusage_str = NULL;
3339 __do_fclose FILE *f = NULL;
3340 struct fuse_context *fc = fuse_get_context();
3341 	struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data;
3342 struct file_info *d = (struct file_info *)fi->fh;
3343 unsigned long memlimit = 0, memusage = 0, memswlimit = 0,
3344 memswusage = 0, cached = 0, hosttotal = 0, active_anon = 0,
3345 inactive_anon = 0, active_file = 0, inactive_file = 0,
3346 unevictable = 0, shmem = 0, hostswtotal = 0;
3347 size_t linelen = 0, total_len = 0;
3348 char *cache = d->buf;
3349 size_t cache_size = d->buflen;
3350 int ret;
3351
3352 if (offset) {
3353 int left;
3354
3355 if (offset > d->size)
3356 return -EINVAL;
3357
3358 if (!d->cached)
3359 return 0;
3360
3361 left = d->size - offset;
3362 total_len = left > size ? size : left;
3363 memcpy(buf, cache + offset, total_len);
3364
3365 return total_len;
3366 }
3367
3368 pid_t initpid = lookup_initpid_in_store(fc->pid);
3369 if (initpid <= 1 || is_shared_pidns(initpid))
3370 initpid = fc->pid;
3371
3372 cgroup = get_pid_cgroup(initpid, "memory");
3373 if (!cgroup)
3374 return read_file_fuse("/proc/meminfo", buf, size, d);
3375
3376 prune_init_slice(cgroup);
3377
3378 memlimit = get_min_memlimit(cgroup, false);
3379
3380 ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str);
3381 if (ret < 0)
3382 return 0;
3383
3384 ret = cgroup_ops->get_memory_stats(cgroup_ops, cgroup, &memstat_str);
3385 if (ret < 0)
3386 return 0;
3387 parse_memstat(ret, memstat_str, &cached, &active_anon, &inactive_anon,
3388 &active_file, &inactive_file, &unevictable, &shmem);
3389
3390 /*
3391 	 * The following values are allowed to fail, because swapaccount might
3392 	 * be turned off for the current kernel.
3393 */
3394 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memswlimit_str);
3395 if (ret >= 0)
3396 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str);
3397 if (ret >= 0) {
3398 memswlimit = get_min_memlimit(cgroup, true);
3399 memswusage = strtoul(memswusage_str, NULL, 10);
3400 memswlimit = memswlimit / 1024;
3401 memswusage = memswusage / 1024;
3402 }
3403
3404 memusage = strtoul(memusage_str, NULL, 10);
3405 memlimit /= 1024;
3406 memusage /= 1024;
3407
3408 f = fopen("/proc/meminfo", "r");
3409 if (!f)
3410 return 0;
3411
3412 while (getline(&line, &linelen, f) != -1) {
3413 ssize_t l;
3414 char *printme, lbuf[100];
3415
3416 memset(lbuf, 0, 100);
3417 if (startswith(line, "MemTotal:")) {
3418 sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3419 if (hosttotal < memlimit)
3420 memlimit = hosttotal;
3421 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3422 printme = lbuf;
3423 } else if (startswith(line, "MemFree:")) {
3424 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3425 printme = lbuf;
3426 } else if (startswith(line, "MemAvailable:")) {
3427 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
3428 printme = lbuf;
3429 } else if (startswith(line, "SwapTotal:") && memswlimit > 0 &&
3430 opts && opts->swap_off == false) {
3431 sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3432 if (hostswtotal < memswlimit)
3433 memswlimit = hostswtotal;
3434 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
3435 printme = lbuf;
3436 } else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
3437 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
3438 printme = lbuf;
3439 } else if (startswith(line, "SwapFree:") && memswlimit > 0 &&
3440 memswusage > 0 && opts && opts->swap_off == false) {
3441 unsigned long swaptotal = memswlimit,
3442 swapusage = memusage > memswusage
3443 ? 0
3444 : memswusage - memusage,
3445 swapfree = swapusage < swaptotal
3446 ? swaptotal - swapusage
3447 : 0;
3448 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3449 printme = lbuf;
3450 } else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
3451 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
3452 printme = lbuf;
3453 } else if (startswith(line, "Slab:")) {
3454 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3455 printme = lbuf;
3456 } else if (startswith(line, "Buffers:")) {
3457 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3458 printme = lbuf;
3459 } else if (startswith(line, "Cached:")) {
3460 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3461 printme = lbuf;
3462 } else if (startswith(line, "SwapCached:")) {
3463 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3464 printme = lbuf;
3465 } else if (startswith(line, "Active:")) {
3466 snprintf(lbuf, 100, "Active: %8lu kB\n",
3467 active_anon + active_file);
3468 printme = lbuf;
3469 } else if (startswith(line, "Inactive:")) {
3470 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3471 inactive_anon + inactive_file);
3472 printme = lbuf;
3473 } else if (startswith(line, "Active(anon)")) {
3474 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3475 printme = lbuf;
3476 } else if (startswith(line, "Inactive(anon)")) {
3477 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3478 printme = lbuf;
3479 } else if (startswith(line, "Active(file)")) {
3480 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3481 printme = lbuf;
3482 } else if (startswith(line, "Inactive(file)")) {
3483 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3484 printme = lbuf;
3485 } else if (startswith(line, "Unevictable")) {
3486 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3487 printme = lbuf;
3488 } else if (startswith(line, "SReclaimable")) {
3489 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3490 printme = lbuf;
3491 } else if (startswith(line, "SUnreclaim")) {
3492 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3493 printme = lbuf;
3494 } else if (startswith(line, "Shmem:")) {
3495 snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
3496 printme = lbuf;
3497 } else if (startswith(line, "ShmemHugePages")) {
3498 snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
3499 printme = lbuf;
3500 } else if (startswith(line, "ShmemPmdMapped")) {
3501 snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
3502 printme = lbuf;
3503 } else
3504 printme = line;
3505
3506 l = snprintf(cache, cache_size, "%s", printme);
3507 if (l < 0) {
3508 perror("Error writing to cache");
3509 return 0;
3511 		}
3512 if (l >= cache_size) {
3513 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3514 return 0;
3515 }
3516
3517 cache += l;
3518 cache_size -= l;
3519 total_len += l;
3520 }
3521
3522 d->cached = 1;
3523 d->size = total_len;
3524 	if (total_len > size) total_len = size;
3525 memcpy(buf, d->buf, total_len);
3526
3527 return total_len;
3528 }
3529
3530 /*
3531 * Read the cpuset.cpus for cg
3532 * Return the answer in a newly allocated string which must be freed
3533 */
3534 char *get_cpuset(const char *cg)
3535 {
3536 char *value = NULL;
3537 int ret;
3538
3539 ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value);
3540 if (ret < 0)
3541 return NULL;
3542
3543 return value;
3544 }
3545
3546 bool cpu_in_cpuset(int cpu, const char *cpuset);
3547
3548 static bool cpuline_in_cpuset(const char *line, const char *cpuset)
3549 {
3550 int cpu;
3551
3552 if (sscanf(line, "processor : %d", &cpu) != 1)
3553 return false;
3554 return cpu_in_cpuset(cpu, cpuset);
3555 }
3556
3557 /*
3558 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or `cpu.cfs_period_us`,
3559  * depending on `param`. The parameter value is returned through `value`.
3560 */
3561 static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
3562 {
3563 __do_free char *str = NULL;
3564 char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
3565
3566 snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);
3567
3568 if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str))
3569 return false;
3570
3571 	if (sscanf(str, "%" SCNd64, value) != 1)
3572 return false;
3573
3574 return true;
3575 }
3576
3577 /*
3578 * Return the maximum number of visible CPUs based on CPU quotas.
3579 * If there is no quota set, zero is returned.
3580 */
3581 int max_cpu_count(const char *cg)
3582 {
3583 int rv, nprocs;
3584 int64_t cfs_quota, cfs_period;
3585 int nr_cpus_in_cpuset = 0;
3586 char *cpuset = NULL;
3587
3588 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3589 return 0;
3590
3591 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3592 return 0;
3593
3594 cpuset = get_cpuset(cg);
3595 if (cpuset)
3596 nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
3597
3598 	if (cfs_quota <= 0 || cfs_period <= 0) {
3599 if (nr_cpus_in_cpuset > 0)
3600 return nr_cpus_in_cpuset;
3601
3602 return 0;
3603 }
3604
3605 rv = cfs_quota / cfs_period;
3606
3607 /* In case quota/period does not yield a whole number, add one CPU for
3608 * the remainder.
3609 */
3610 if ((cfs_quota % cfs_period) > 0)
3611 rv += 1;
3612
3613 nprocs = get_nprocs();
3614
3615 if (rv > nprocs)
3616 rv = nprocs;
3617
3618 /* use min value in cpu quota and cpuset */
3619 if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
3620 rv = nr_cpus_in_cpuset;
3621
3622 return rv;
3623 }
3624
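/*
 * Worked example: cfs_quota = 150000 and cfs_period = 100000 grant 1.5
 * CPUs worth of runtime, so max_cpu_count() reports 2 (150000 / 100000
 * = 1, plus one CPU for the remainder), further capped by get_nprocs()
 * and by the number of CPUs in the cpuset.
 */
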
3625 /*
3626 * Return the exact number of visible CPUs based on CPU quotas.
3627 * If there is no quota set, zero is returned.
3628 */
3629 static double exact_cpu_count(const char *cg)
3630 {
3631 double rv;
3632 int nprocs;
3633 int64_t cfs_quota, cfs_period;
3634
3635 if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
3636 return 0;
3637
3638 if (!read_cpu_cfs_param(cg, "period", &cfs_period))
3639 return 0;
3640
3641 if (cfs_quota <= 0 || cfs_period <= 0)
3642 return 0;
3643
3644 rv = (double)cfs_quota / (double)cfs_period;
3645
3646 nprocs = get_nprocs();
3647
3648 if (rv > nprocs)
3649 rv = nprocs;
3650
3651 return rv;
3652 }
3653
3654 /*
3655  * check whether this is a '^processor' line in /proc/cpuinfo
3656 */
3657 static bool is_processor_line(const char *line)
3658 {
3659 int cpu;
3660
3661 if (sscanf(line, "processor : %d", &cpu) == 1)
3662 return true;
3663 return false;
3664 }
3665
3666 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3667 struct fuse_file_info *fi)
3668 {
3669 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
3670 __do_fclose FILE *f = NULL;
3671 struct fuse_context *fc = fuse_get_context();
3672 struct file_info *d = (struct file_info *)fi->fh;
3673 size_t linelen = 0, total_len = 0;
3674 bool am_printing = false, firstline = true, is_s390x = false;
3675 int curcpu = -1, cpu, max_cpus = 0;
3676 bool use_view;
3677 char *cache = d->buf;
3678 size_t cache_size = d->buflen;
3679
3680 	if (offset) {
3681 int left;
3682
3683 if (offset > d->size)
3684 return -EINVAL;
3685
3686 if (!d->cached)
3687 return 0;
3688
3689 left = d->size - offset;
3690 		total_len = left > size ? size : left;
3691 memcpy(buf, cache + offset, total_len);
3692
3693 return total_len;
3694 }
3695
3696 pid_t initpid = lookup_initpid_in_store(fc->pid);
3697 if (initpid <= 1 || is_shared_pidns(initpid))
3698 initpid = fc->pid;
3699 cg = get_pid_cgroup(initpid, "cpuset");
3700 if (!cg)
3701 		return read_file_fuse("/proc/cpuinfo", buf, size, d);
3702 prune_init_slice(cg);
3703
3704 cpuset = get_cpuset(cg);
3705 if (!cpuset)
3706 return 0;
3707
3708 use_view = cgroup_ops->can_use_cpuview(cgroup_ops);
3709 if (use_view)
3710 max_cpus = max_cpu_count(cg);
3711
3712 f = fopen("/proc/cpuinfo", "r");
3713 if (!f)
3714 return 0;
3715
3716 while (getline(&line, &linelen, f) != -1) {
3717 ssize_t l;
3718 if (firstline) {
3719 firstline = false;
3720 if (strstr(line, "IBM/S390") != NULL) {
3721 is_s390x = true;
3722 am_printing = true;
3723 continue;
3724 }
3725 }
3726 if (strncmp(line, "# processors:", 12) == 0)
3727 continue;
3728 if (is_processor_line(line)) {
3729 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3730 break;
3731 am_printing = cpuline_in_cpuset(line, cpuset);
3732 if (am_printing) {
3733 				curcpu++;
3734 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3735 if (l < 0) {
3736 perror("Error writing to cache");
3737 return 0;
3738 }
3739 if (l >= cache_size) {
3740 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3741 return 0;
3742 }
3743 cache += l;
3744 cache_size -= l;
3745 total_len += l;
3746 }
3747 continue;
3748 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3749 char *p;
3750 if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
3751 break;
3752 if (!cpu_in_cpuset(cpu, cpuset))
3753 continue;
3754 			curcpu++;
3755 p = strchr(line, ':');
3756 if (!p || !*p)
3757 return 0;
3758 p++;
3759 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3760 if (l < 0) {
3761 perror("Error writing to cache");
3762 return 0;
3763 }
3764 if (l >= cache_size) {
3765 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3766 return 0;
3767 }
3768 cache += l;
3769 cache_size -= l;
3770 total_len += l;
3771 continue;
3772
3773 }
3774 if (am_printing) {
3775 l = snprintf(cache, cache_size, "%s", line);
3776 if (l < 0) {
3777 perror("Error writing to cache");
3778 return 0;
3779 }
3780 if (l >= cache_size) {
3781 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3782 return 0;
3783 }
3784 cache += l;
3785 cache_size -= l;
3786 total_len += l;
3787 }
3788 }
3789
3790 if (is_s390x) {
3791 __do_free char *origcache = d->buf;
3792 ssize_t l;
3793
3794 d->buf = malloc(d->buflen);
3795 if (!d->buf) {
3796 d->buf = move_ptr(origcache);
3797 return 0;
3798 }
3799
3800 cache = d->buf;
3801 cache_size = d->buflen;
3802 total_len = 0;
3803 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3804 if (l < 0 || l >= cache_size)
3805 return 0;
3806
3807 cache_size -= l;
3808 cache += l;
3809 total_len += l;
3810 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3811 if (l < 0 || l >= cache_size)
3812 return 0;
3813
3814 cache_size -= l;
3815 cache += l;
3816 total_len += l;
3817 l = snprintf(cache, cache_size, "%s", origcache);
3818 if (l < 0 || l >= cache_size)
3819 return 0;
3820 total_len += l;
3821 }
3822
3823 d->cached = 1;
3824 d->size = total_len;
3825 	if (total_len > size) total_len = size;
3826
3827 /* read from off 0 */
3828 memcpy(buf, d->buf, total_len);
3829 return total_len;
3830 }
3831
3832 static uint64_t get_reaper_start_time(pid_t pid)
3833 {
3834 int ret;
3835 FILE *f;
3836 uint64_t starttime;
3837 /* strlen("/proc/") = 6
3838 * +
3839 * LXCFS_NUMSTRLEN64
3840 * +
3841 * strlen("/stat") = 5
3842 * +
3843 * \0 = 1
3844 * */
3845 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3846 char path[__PROC_PID_STAT_LEN];
3847 pid_t qpid;
3848
3849 qpid = lookup_initpid_in_store(pid);
3850 if (qpid <= 0) {
3851 /* Caller can check for EINVAL on 0. */
3852 errno = EINVAL;
3853 return 0;
3854 }
3855
3856 ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3857 if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3858 /* Caller can check for EINVAL on 0. */
3859 errno = EINVAL;
3860 return 0;
3861 }
3862
3863 f = fopen(path, "r");
3864 if (!f) {
3865 /* Caller can check for EINVAL on 0. */
3866 errno = EINVAL;
3867 return 0;
3868 }
3869
3870 	/* Note that the *scanf() argument suppression requires that length
3871 * modifiers such as "l" are omitted. Otherwise some compilers will yell
3872 * at us. It's like telling someone you're not married and then asking
3873 * if you can bring your wife to the party.
3874 */
3875 ret = fscanf(f, "%*d " /* (1) pid %d */
3876 "%*s " /* (2) comm %s */
3877 "%*c " /* (3) state %c */
3878 "%*d " /* (4) ppid %d */
3879 "%*d " /* (5) pgrp %d */
3880 "%*d " /* (6) session %d */
3881 "%*d " /* (7) tty_nr %d */
3882 "%*d " /* (8) tpgid %d */
3883 "%*u " /* (9) flags %u */
3884 "%*u " /* (10) minflt %lu */
3885 "%*u " /* (11) cminflt %lu */
3886 "%*u " /* (12) majflt %lu */
3887 "%*u " /* (13) cmajflt %lu */
3888 "%*u " /* (14) utime %lu */
3889 "%*u " /* (15) stime %lu */
3890 "%*d " /* (16) cutime %ld */
3891 "%*d " /* (17) cstime %ld */
3892 "%*d " /* (18) priority %ld */
3893 "%*d " /* (19) nice %ld */
3894 "%*d " /* (20) num_threads %ld */
3895 "%*d " /* (21) itrealvalue %ld */
3896 "%" PRIu64, /* (22) starttime %llu */
3897 &starttime);
3898 if (ret != 1) {
3899 fclose(f);
3900 /* Caller can check for EINVAL on 0. */
3901 errno = EINVAL;
3902 return 0;
3903 }
3904
3905 fclose(f);
3906
3907 errno = 0;
3908 return starttime;
3909 }
3910
3911 static double get_reaper_start_time_in_sec(pid_t pid)
3912 {
3913 uint64_t clockticks, ticks_per_sec;
3914 int64_t ret;
3915 double res = 0;
3916
3917 clockticks = get_reaper_start_time(pid);
3918 if (clockticks == 0 && errno == EINVAL) {
3919 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
3920 return 0;
3921 }
3922
3923 ret = sysconf(_SC_CLK_TCK);
3924 if (ret < 0 && errno == EINVAL) {
3925 lxcfs_debug(
3926 "%s\n",
3927 "failed to determine number of clock ticks in a second");
3928 return 0;
3929 }
3930
3931 ticks_per_sec = (uint64_t)ret;
3932 res = (double)clockticks / ticks_per_sec;
3933 return res;
3934 }
3935
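/*
 * Worked example: with the common _SC_CLK_TCK of 100, a starttime of
 * 4492 clock ticks read from /proc/<pid>/stat converts to 4492 / 100 =
 * 44.92 seconds after boot.
 */
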
3936 static double get_reaper_age(pid_t pid)
3937 {
3938 uint64_t uptime_ms;
3939 double procstart, procage;
3940
3941 	/* To get the reaper's age we subtract the time at which the process
3942 	 * started (measured from system boot) from the current system
3943 	 * uptime.
3944 	 */
3945 procstart = get_reaper_start_time_in_sec(pid);
3946 procage = procstart;
3947 if (procstart > 0) {
3948 int ret;
3949 struct timespec spec;
3950
3951 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3952 if (ret < 0)
3953 return 0;
3954
3955 /* We could make this more precise here by using the tv_nsec
3956 * field in the timespec struct and convert it to milliseconds
3957 * and then create a double for the seconds and milliseconds but
3958 * that seems more work than it is worth.
3959 */
3960 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
3961 procage = (uptime_ms - (procstart * 1000)) / 1000;
3962 }
3963
3964 return procage;
3965 }
3966
3967 /*
3968 * Returns 0 on success.
3969 * It is the caller's responsibility to free `return_usage`, unless this
3970 * function returns an error.
3971 */
3972 static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
3973 {
3974 __do_free char *usage_str = NULL;
3975 __do_free struct cpuacct_usage *cpu_usage = NULL;
3976 int cpucount = get_nprocs_conf();
3977 	int read_pos = 0, read_cnt = 0;
3978 int i, j, ret;
3979 int cg_cpu;
3980 uint64_t cg_user, cg_system;
3981 int64_t ticks_per_sec;
3982
3983 ticks_per_sec = sysconf(_SC_CLK_TCK);
3984
3985 if (ticks_per_sec < 0 && errno == EINVAL) {
3986 lxcfs_v(
3987 "%s\n",
3988 "read_cpuacct_usage_all failed to determine number of clock ticks "
3989 "in a second");
3990 return -1;
3991 }
3992
3993 cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
3994 if (!cpu_usage)
3995 return -ENOMEM;
3996
3997 memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
3998 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
3999 char *data = NULL;
4000 		int i = 0, read_pos = 0, read_cnt = 0;
4001 size_t sz = 0, asz = 0;
4002
4003 /* read cpuacct.usage_percpu instead. */
4004 lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
4005 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str))
4006 return -1;
4007 lxcfs_v("usage_str: %s\n", usage_str);
4008
4009 /* convert cpuacct.usage_percpu into cpuacct.usage_all. */
4010 lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");
4011
4012 must_strcat(&data, &sz, &asz, "cpu user system\n");
4013
4014 		while (sscanf(usage_str + read_pos, "%" SCNu64 " %n", &cg_user, &read_cnt) > 0) {
4015 			lxcfs_debug("i: %d, cg_user: %" PRIu64 ", read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
4016 			must_strcat(&data, &sz, &asz, "%d %" PRIu64 " 0\n", i, cg_user);
4017 i++;
4018 read_pos += read_cnt;
4019 }
4020
4021 usage_str = data;
4022
4023 lxcfs_v("usage_str: %s\n", usage_str);
4024 }
4025
4026 	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) == EOF || read_cnt == 0) {
4027 lxcfs_error("read_cpuacct_usage_all reading first line from "
4028 "%s/cpuacct.usage_all failed.\n", cg);
4029 return -1;
4030 }
4031
4032 read_pos += read_cnt;
4033
4034 for (i = 0, j = 0; i < cpucount; i++) {
4035 		ret = sscanf(usage_str + read_pos, "%d %" SCNu64 " %" SCNu64 "\n%n",
4036 			     &cg_cpu, &cg_user, &cg_system, &read_cnt);
4037
4038 if (ret == EOF)
4039 break;
4040
4041 if (ret != 3) {
4042 lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
4043 "failed.\n", cg);
4044 return -1;
4045 }
4046
4047 read_pos += read_cnt;
4048
4049 /* Convert the time from nanoseconds to USER_HZ */
4050 cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
4051 cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
4052 j++;
4053 }
4054
4055 *return_usage = move_ptr(cpu_usage);
4056 *size = cpucount;
4057 return 0;
4058 }
4059
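/*
 * Example of the two formats handled above, with hypothetical values.
 * cpuacct.usage_all (per-cpu user and system time in nanoseconds):
 *
 *     cpu user system
 *     0 2840000000 1220000000
 *     1 1950000000 870000000
 *
 * cpuacct.usage_percpu (one total per cpu) is converted into the same
 * layout with the system column set to 0:
 *
 *     4060000000 2820000000
 */
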
4060 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4061 {
4062 int i;
4063 unsigned long sum = 0;
4064
4065 for (i = 0; i < cpu_count; i++) {
4066 if (!newer[i].online)
4067 continue;
4068
4069 /* When cpuset is changed on the fly, the CPUs might get reordered.
4070 * We could either reset all counters, or check that the substractions
4071 * below will return expected results.
4072 */
4073 if (newer[i].user > older[i].user)
4074 diff[i].user = newer[i].user - older[i].user;
4075 else
4076 diff[i].user = 0;
4077
4078 if (newer[i].system > older[i].system)
4079 diff[i].system = newer[i].system - older[i].system;
4080 else
4081 diff[i].system = 0;
4082
4083 if (newer[i].idle > older[i].idle)
4084 diff[i].idle = newer[i].idle - older[i].idle;
4085 else
4086 diff[i].idle = 0;
4087
4088 sum += diff[i].user;
4089 sum += diff[i].system;
4090 sum += diff[i].idle;
4091 }
4092
4093 return sum;
4094 }
4095
4096 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4097 {
4098 unsigned long free_space, to_add;
4099
4100 free_space = threshold - usage->user - usage->system;
4101
4102 if (free_space > usage->idle)
4103 free_space = usage->idle;
4104
4105 to_add = free_space > *surplus ? *surplus : free_space;
4106
4107 *counter += to_add;
4108 usage->idle -= to_add;
4109 *surplus -= to_add;
4110 }
4111
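/*
 * Worked example for add_cpu_usage(): with threshold = 100, a cpu at
 * user = 40 and system = 20 has free_space = 40 ticks, capped by its
 * idle time. With *surplus = 50 and idle = 30 that gives to_add = 30:
 * the counter grows by 30, idle drops to 0, and 20 ticks of surplus
 * remain for the next cpu.
 */
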
4112 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4113 {
4114 struct cg_proc_stat *first = NULL, *prev, *tmp;
4115
4116 for (prev = NULL; node; ) {
4117 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4118 tmp = node;
4119 lxcfs_debug("Removing stat node for %s\n", node->cg);
4120
4121 if (prev)
4122 prev->next = node->next;
4123 else
4124 first = node->next;
4125
4126 node = node->next;
4127 free_proc_stat_node(tmp);
4128 } else {
4129 if (!first)
4130 first = node;
4131 prev = node;
4132 node = node->next;
4133 }
4134 }
4135
4136 return first;
4137 }
4138
4139 #define PROC_STAT_PRUNE_INTERVAL 10
4140 static void prune_proc_stat_history(void)
4141 {
4142 int i;
4143 time_t now = time(NULL);
4144
4145 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4146 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4147
4148 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4149 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4150 return;
4151 }
4152
4153 if (proc_stat_history[i]->next) {
4154 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4155 proc_stat_history[i]->lastcheck = now;
4156 }
4157
4158 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4159 }
4160 }
4161
4162 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4163 {
4164 struct cg_proc_stat *node;
4165
4166 pthread_rwlock_rdlock(&head->lock);
4167
4168 if (!head->next) {
4169 pthread_rwlock_unlock(&head->lock);
4170 return NULL;
4171 }
4172
4173 node = head->next;
4174
4175 do {
4176 if (strcmp(cg, node->cg) == 0)
4177 goto out;
4178 } while ((node = node->next));
4179
4180 node = NULL;
4181
4182 out:
4183 pthread_rwlock_unlock(&head->lock);
4184 prune_proc_stat_history();
4185 return node;
4186 }
4187
4188 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4189 {
4190 struct cg_proc_stat *node;
4191 int i;
4192
4193 node = malloc(sizeof(struct cg_proc_stat));
4194 if (!node)
4195 goto err;
4196
4197 node->cg = NULL;
4198 node->usage = NULL;
4199 node->view = NULL;
4200
4201 node->cg = malloc(strlen(cg) + 1);
4202 if (!node->cg)
4203 goto err;
4204
4205 strcpy(node->cg, cg);
4206
4207 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4208 if (!node->usage)
4209 goto err;
4210
4211 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4212
4213 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4214 if (!node->view)
4215 goto err;
4216
4217 node->cpu_count = cpu_count;
4218 node->next = NULL;
4219
4220 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4221 lxcfs_error("%s\n", "Failed to initialize node lock");
4222 goto err;
4223 }
4224
4225 for (i = 0; i < cpu_count; i++) {
4226 node->view[i].user = 0;
4227 node->view[i].system = 0;
4228 node->view[i].idle = 0;
4229 }
4230
4231 return node;
4232
4233 err:
4234 if (node && node->cg)
4235 free(node->cg);
4236 if (node && node->usage)
4237 free(node->usage);
4238 if (node && node->view)
4239 free(node->view);
4240 if (node)
4241 free(node);
4242
4243 return NULL;
4244 }
4245
4246 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4247 {
4248 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4249 struct cg_proc_stat_head *head = proc_stat_history[hash];
4250 struct cg_proc_stat *node, *rv = new_node;
4251
4252 pthread_rwlock_wrlock(&head->lock);
4253
4254 if (!head->next) {
4255 head->next = new_node;
4256 goto out;
4257 }
4258
4259 node = head->next;
4260
4261 for (;;) {
4262 if (strcmp(node->cg, new_node->cg) == 0) {
4263 /* The node is already present, return it */
4264 free_proc_stat_node(new_node);
4265 rv = node;
4266 goto out;
4267 }
4268
4269 if (node->next) {
4270 node = node->next;
4271 continue;
4272 }
4273
4274 node->next = new_node;
4275 goto out;
4276 }
4277
4278 out:
4279 pthread_rwlock_unlock(&head->lock);
4280 return rv;
4281 }
4282
4283 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4284 {
4285 __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL;
4286
4287 /* Allocate new memory */
4288 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4289 if (!new_usage)
4290 return false;
4291
4292 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4293 if (!new_view)
4294 return false;
4295
4296 /* Copy existing data & initialize new elements */
4297 for (int i = 0; i < cpu_count; i++) {
4298 if (i < node->cpu_count) {
4299 new_usage[i].user = node->usage[i].user;
4300 new_usage[i].system = node->usage[i].system;
4301 new_usage[i].idle = node->usage[i].idle;
4302
4303 new_view[i].user = node->view[i].user;
4304 new_view[i].system = node->view[i].system;
4305 new_view[i].idle = node->view[i].idle;
4306 } else {
4307 new_usage[i].user = 0;
4308 new_usage[i].system = 0;
4309 new_usage[i].idle = 0;
4310
4311 new_view[i].user = 0;
4312 new_view[i].system = 0;
4313 new_view[i].idle = 0;
4314 }
4315 }
4316
4317 free(node->usage);
4318 node->usage = move_ptr(new_usage);
4319
4320 free(node->view);
4321 node->view = move_ptr(new_view);
4322 node->cpu_count = cpu_count;
4323
4324 return true;
4325 }
4326
4327 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4328 {
4329 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4330 struct cg_proc_stat_head *head = proc_stat_history[hash];
4331 struct cg_proc_stat *node;
4332
4333 node = find_proc_stat_node(head, cg);
4334
4335 if (!node) {
4336 node = new_proc_stat_node(usage, cpu_count, cg);
4337 if (!node)
4338 return NULL;
4339
4340 node = add_proc_stat_node(node);
4341 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4342 }
4343
4344 pthread_mutex_lock(&node->lock);
4345
4346 /* If additional CPUs on the host have been enabled, CPU usage counter
4347 * arrays have to be expanded */
4348 if (node->cpu_count < cpu_count) {
4349 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4350 node->cpu_count, cpu_count, cg);
4351
4352 if (!expand_proc_stat_node(node, cpu_count)) {
4353 pthread_mutex_unlock(&node->lock);
4354 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4355 node->cpu_count, cpu_count, cg);
4356 return NULL;
4357 }
4358 }
4359
4360 return node;
4361 }
4362
4363 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4364 {
4365 int i;
4366
4367 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4368 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4369
4370 for (i = 0; i < cpu_count; i++) {
4371 node->view[i].user = 0;
4372 node->view[i].system = 0;
4373 node->view[i].idle = 0;
4374 }
4375
4376 node->cpu_count = cpu_count;
4377 }
4378
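/*
 * cpuview_proc_stat() renders a per-container view of /proc/stat from the
 * cached cpuacct history. As a rough sketch (hypothetical numbers), a
 * container confined to two CPUs would read something shaped like:
 *
 *   cpu  10898 0 2639 4902064 0 0 0 0 0 0
 *   cpu0 5761 0 1446 2450491 0 0 0 0 0 0
 *   cpu1 5137 0 1193 2451573 0 0 0 0 0 0
 *   intr 1462380 ...   <- passed through from the host
 *
 * Only the "cpu"/"cpuN" lines are rewritten; everything after them is
 * copied verbatim from the host's /proc/stat.
 */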
4379 static int cpuview_proc_stat(const char *cg, const char *cpuset,
4380 struct cpuacct_usage *cg_cpu_usage,
4381 int cg_cpu_usage_size, FILE *f, char *buf,
4382 size_t buf_size)
4383 {
4384 __do_free char *line = NULL;
4385 __do_free struct cpuacct_usage *diff = NULL;
size_t linelen = 0, total_len = 0;
ssize_t l;
4387 int curcpu = -1; /* cpu numbering starts at 0 */
4388 int physcpu, i;
4389 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4390 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4391 irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4392 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4393 unsigned long user_surplus = 0, system_surplus = 0;
4394 unsigned long total_sum, threshold;
4395 struct cg_proc_stat *stat_node;
4396 int nprocs = get_nprocs_conf();
4397
4398 if (cg_cpu_usage_size < nprocs)
4399 nprocs = cg_cpu_usage_size;
4400
4401 /* Read all CPU stats and stop when we've encountered other lines */
4402 while (getline(&line, &linelen, f) != -1) {
4403 int ret;
4404 char cpu_char[10]; /* That's a lot of cores */
4405 uint64_t all_used, cg_used;
4406
4407 if (strlen(line) == 0)
4408 continue;
4409
4410 /* not a ^cpuN line containing a number N */
4411 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1)
4412 break;
4413
4414 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4415 continue;
4416
4417 if (physcpu >= cg_cpu_usage_size)
4418 continue;
4419
curcpu++;
cpu_cnt++;
4422
4423 if (!cpu_in_cpuset(physcpu, cpuset)) {
4424 for (i = curcpu; i <= physcpu; i++)
4425 cg_cpu_usage[i].online = false;
4426 continue;
4427 }
4428
4429 if (curcpu < physcpu) {
4430 /* Some CPUs may be disabled */
4431 for (i = curcpu; i < physcpu; i++)
4432 cg_cpu_usage[i].online = false;
4433
4434 curcpu = physcpu;
4435 }
4436
4437 cg_cpu_usage[curcpu].online = true;
4438
4439 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4440 &user,
4441 &nice,
4442 &system,
4443 &idle,
4444 &iowait,
4445 &irq,
4446 &softirq,
4447 &steal,
4448 &guest,
4449 &guest_nice);
4450
4451 if (ret != 10)
4452 continue;
4453
4454 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4455 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4456
4457 if (all_used >= cg_used) {
cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
} else {
4461 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4462 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4463 curcpu, cg, all_used, cg_used);
4464 cg_cpu_usage[curcpu].idle = idle;
4465 }
4466 }
4467
4468 /* Cannot use more CPUs than is available due to cpuset */
4469 if (max_cpus > cpu_cnt)
4470 max_cpus = cpu_cnt;
4471
4472 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4473
4474 if (!stat_node) {
4475 lxcfs_error("unable to find/create stat node for %s\n", cg);
4476 return 0;
4477 }
4478
diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
if (!diff) {
/* Don't leak the node lock taken in find_or_create_proc_stat_node(). */
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4483
4484 /*
4485 * If the new values are LOWER than values stored in memory, it means
4486 * the cgroup has been reset/recreated and we should reset too.
4487 */
4488 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4489 if (!cg_cpu_usage[curcpu].online)
4490 continue;
4491
4492 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4493 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4494
4495 break;
4496 }
4497
4498 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4499
4500 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4501 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4502
4503 if (!stat_node->usage[curcpu].online)
4504 continue;
4505
4506 i++;
4507
4508 stat_node->usage[curcpu].user += diff[curcpu].user;
4509 stat_node->usage[curcpu].system += diff[curcpu].system;
4510 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4511
4512 if (max_cpus > 0 && i >= max_cpus) {
4513 user_surplus += diff[curcpu].user;
4514 system_surplus += diff[curcpu].system;
4515 }
4516 }
4517
4518 /* Calculate usage counters of visible CPUs */
4519 if (max_cpus > 0) {
4520 unsigned long diff_user = 0;
4521 unsigned long diff_system = 0;
4522 unsigned long diff_idle = 0;
4523 unsigned long max_diff_idle = 0;
4524 unsigned long max_diff_idle_index = 0;
4525 double exact_cpus;
4526
4527 /* threshold = maximum usage per cpu, including idle */
4528 threshold = total_sum / cpu_cnt * max_cpus;
4529
4530 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4531 if (!stat_node->usage[curcpu].online)
4532 continue;
4533
4534 i++;
4535
4536 if (i == max_cpus)
4537 break;
4538
4539 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4540 continue;
4541
4542 /* Add user */
4543 add_cpu_usage(&user_surplus, &diff[curcpu],
4544 &diff[curcpu].user, threshold);
4545
4546 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4547 continue;
4548
4549 /* If there is still room, add system */
4550 add_cpu_usage(&system_surplus, &diff[curcpu],
4551 &diff[curcpu].system, threshold);
4552 }
4553
4554 if (user_surplus > 0)
4555 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4556 if (system_surplus > 0)
4557 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4558
4559 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4560 if (!stat_node->usage[curcpu].online)
4561 continue;
4562
4563 i++;
4564
4565 if (i == max_cpus)
4566 break;
4567
4568 stat_node->view[curcpu].user += diff[curcpu].user;
4569 stat_node->view[curcpu].system += diff[curcpu].system;
4570 stat_node->view[curcpu].idle += diff[curcpu].idle;
4571
4572 user_sum += stat_node->view[curcpu].user;
4573 system_sum += stat_node->view[curcpu].system;
4574 idle_sum += stat_node->view[curcpu].idle;
4575
4576 diff_user += diff[curcpu].user;
4577 diff_system += diff[curcpu].system;
4578 diff_idle += diff[curcpu].idle;
4579 if (diff[curcpu].idle > max_diff_idle) {
4580 max_diff_idle = diff[curcpu].idle;
4581 max_diff_idle_index = curcpu;
4582 }
4583
4584 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4585 }
4586 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4587
4588 /* revise cpu usage view to support partial cpu case. */
4589 exact_cpus = exact_cpu_count(cg);
4590 if (exact_cpus < (double)max_cpus){
4591 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
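/*
 * Worked example (hypothetical numbers): with max_cpus = 2 but a quota
 * equivalent to exact_cpus = 1.5, a quarter of the diff total
 * (1 - 1.5/2 = 0.25) is carved out of idle so the rendered view adds
 * up to roughly 1.5 CPUs of time.
 */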
4592
4593 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4594 lxcfs_v("delta: %lu\n", delta);
4595 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4596 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4597 lxcfs_v("idle_sum after: %lu\n", idle_sum);
4598
4599 curcpu = max_diff_idle_index;
4600 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4601 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4602 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4603 }
4604 } else {
4605 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4606 if (!stat_node->usage[curcpu].online)
4607 continue;
4608
4609 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4610 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4611 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4612
4613 user_sum += stat_node->view[curcpu].user;
4614 system_sum += stat_node->view[curcpu].system;
4615 idle_sum += stat_node->view[curcpu].idle;
4616 }
4617 }
4618
4619 /* Render the file */
4620 /* cpu-all */
4621 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4622 user_sum,
4623 system_sum,
4624 idle_sum);
4625 lxcfs_v("cpu-all: %s\n", buf);
4626
if (l < 0) {
perror("Error writing to cache");
/* Drop the node lock taken in find_or_create_proc_stat_node(). */
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4635
4636 buf += l;
4637 buf_size -= l;
4638 total_len += l;
4639
4640 /* Render visible CPUs */
4641 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4642 if (!stat_node->usage[curcpu].online)
4643 continue;
4644
4645 i++;
4646
4647 if (max_cpus > 0 && i == max_cpus)
4648 break;
4649
4650 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4651 i,
4652 stat_node->view[curcpu].user,
4653 stat_node->view[curcpu].system,
4654 stat_node->view[curcpu].idle);
4655 lxcfs_v("cpu: %s\n", buf);
4656
if (l < 0) {
perror("Error writing to cache");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4666
4667 buf += l;
4668 buf_size -= l;
4669 total_len += l;
4670 }
4671
4672 /* Pass the rest of /proc/stat, start with the last line read */
4673 l = snprintf(buf, buf_size, "%s", line);
4674
if (l < 0) {
perror("Error writing to cache");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4684
4685 buf += l;
4686 buf_size -= l;
4687 total_len += l;
4688
4689 /* Pass the rest of the host's /proc/stat */
4690 while (getline(&line, &linelen, f) != -1) {
4691 l = snprintf(buf, buf_size, "%s", line);
if (l < 0) {
perror("Error writing to cache");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
if (l >= buf_size) {
lxcfs_error("%s\n", "Internal error: truncated write to cache.");
pthread_mutex_unlock(&stat_node->lock);
return 0;
}
4700 buf += l;
4701 buf_size -= l;
4702 total_len += l;
4703 }
4704
4705 if (stat_node)
4706 pthread_mutex_unlock(&stat_node->lock);
4707 return total_len;
4708 }
4709
4710 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4711 static int proc_stat_read(char *buf, size_t size, off_t offset,
4712 struct fuse_file_info *fi)
4713 {
4714 __do_free char *cg = NULL, *cpuset = NULL, *line = NULL;
4715 __do_free struct cpuacct_usage *cg_cpu_usage = NULL;
4716 __do_fclose FILE *f = NULL;
4717 struct fuse_context *fc = fuse_get_context();
4718 struct file_info *d = (struct file_info *)fi->fh;
4719 size_t linelen = 0, total_len = 0;
4720 int curcpu = -1; /* cpu numbering starts at 0 */
4721 int physcpu = 0;
4722 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0,
4723 irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4724 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0,
4725 iowait_sum = 0, irq_sum = 0, softirq_sum = 0,
4726 steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4727 char cpuall[CPUALL_MAX_SIZE];
4728 /* reserve for cpu all */
4729 char *cache = d->buf + CPUALL_MAX_SIZE;
4730 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4731 int cg_cpu_usage_size = 0;
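
/*
 * Sketch of the d->buf layout while rendering:
 *
 *   [0, CPUALL_MAX_SIZE)           reserved for the aggregate "cpu" line
 *   [CPUALL_MAX_SIZE, d->buflen)   per-cpu and passthrough lines (cache)
 *
 * Once all lines are collected, the "cpu" summary is written at the front
 * and the cached lines are memmove()d up behind it.
 */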
4732
4733 if (offset){
4734 if (offset > d->size)
4735 return -EINVAL;
4736 if (!d->cached)
4737 return 0;
4738 int left = d->size - offset;
4739 total_len = left > size ? size: left;
4740 memcpy(buf, d->buf + offset, total_len);
4741 return total_len;
4742 }
4743
4744 pid_t initpid = lookup_initpid_in_store(fc->pid);
4745 lxcfs_v("initpid: %d\n", initpid);
4746 if (initpid <= 0)
4747 initpid = fc->pid;
4748
/*
 * When a container runs with the host's pid namespace, initpid == 1 and
 * its cgroup is "/", so we should return the host's /proc contents.
 * In some cases cpuacct.usage_all for "/" will be larger than what
 * /proc/stat reports.
 */
4754 if (initpid == 1) {
4755 return read_file_fuse("/proc/stat", buf, size, d);
4756 }
4757
4758 cg = get_pid_cgroup(initpid, "cpuset");
4759 lxcfs_v("cg: %s\n", cg);
4760 if (!cg)
4761 return read_file_fuse("/proc/stat", buf, size, d);
4762 prune_init_slice(cg);
4763
4764 cpuset = get_cpuset(cg);
4765 if (!cpuset)
4766 return 0;
4767
4768 /*
4769 * Read cpuacct.usage_all for all CPUs.
4770 * If the cpuacct cgroup is present, it is used to calculate the container's
4771 * CPU usage. If not, values from the host's /proc/stat are used.
4772 */
4773 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
4774 lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
4775 "falling back to the host's /proc/stat");
4776 }
4777
4778 f = fopen("/proc/stat", "r");
4779 if (!f)
4780 return 0;
4781
4782 //skip first line
4783 if (getline(&line, &linelen, f) < 0) {
4784 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
4785 return 0;
4786 }
4787
4788 if (cgroup_ops->can_use_cpuview(cgroup_ops) && cg_cpu_usage) {
4789 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
4790 f, d->buf, d->buflen);
4791 goto out;
4792 }
4793
4794 while (getline(&line, &linelen, f) != -1) {
4795 ssize_t l;
4796 char cpu_char[10]; /* That's a lot of cores */
4797 char *c;
4798 uint64_t all_used, cg_used, new_idle;
4799 int ret;
4800
4801 if (strlen(line) == 0)
4802 continue;
4803 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4804 /* not a ^cpuN line containing a number N, just print it */
4805 l = snprintf(cache, cache_size, "%s", line);
4806 if (l < 0) {
4807 perror("Error writing to cache");
4808 return 0;
4809 }
4810 if (l >= cache_size) {
4811 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4812 return 0;
4813 }
4814 cache += l;
4815 cache_size -= l;
4816 total_len += l;
4817 continue;
4818 }
4819
4820 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4821 continue;
4822 if (!cpu_in_cpuset(physcpu, cpuset))
4823 continue;
curcpu++;
4825
4826 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4827 &user,
4828 &nice,
4829 &system,
4830 &idle,
4831 &iowait,
4832 &irq,
4833 &softirq,
4834 &steal,
4835 &guest,
4836 &guest_nice);
4837
4838 if (ret != 10 || !cg_cpu_usage) {
4839 c = strchr(line, ' ');
4840 if (!c)
4841 continue;
4842 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
if (l < 0) {
perror("Error writing to cache");
return 0;
}
4848 if (l >= cache_size) {
4849 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4850 return 0;
4851 }
4852
4853 cache += l;
4854 cache_size -= l;
4855 total_len += l;
4856
4857 if (ret != 10)
4858 continue;
4859 }
4860
4861 if (cg_cpu_usage) {
4862 if (physcpu >= cg_cpu_usage_size)
4863 break;
4864
4865 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4866 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
4867
4868 if (all_used >= cg_used) {
new_idle = idle + (all_used - cg_used);
} else {
4872 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4873 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4874 curcpu, cg, all_used, cg_used);
4875 new_idle = idle;
4876 }
4877
4878 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4879 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
4880 new_idle);
4881
if (l < 0) {
perror("Error writing to cache");
return 0;
}
4887 if (l >= cache_size) {
4888 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4889 return 0;
4890 }
4891
4892 cache += l;
4893 cache_size -= l;
4894 total_len += l;
4895
4896 user_sum += cg_cpu_usage[physcpu].user;
4897 system_sum += cg_cpu_usage[physcpu].system;
4898 idle_sum += new_idle;
4899
4900 } else {
4901 user_sum += user;
4902 nice_sum += nice;
4903 system_sum += system;
4904 idle_sum += idle;
4905 iowait_sum += iowait;
4906 irq_sum += irq;
4907 softirq_sum += softirq;
4908 steal_sum += steal;
4909 guest_sum += guest;
4910 guest_nice_sum += guest_nice;
4911 }
4912 }
4913
4914 cache = d->buf;
4915
4916 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4917 user_sum,
4918 nice_sum,
4919 system_sum,
4920 idle_sum,
4921 iowait_sum,
4922 irq_sum,
4923 softirq_sum,
4924 steal_sum,
4925 guest_sum,
4926 guest_nice_sum);
4927 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
4928 memcpy(cache, cpuall, cpuall_len);
4929 cache += cpuall_len;
4930 } else {
4931 /* shouldn't happen */
4932 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
4933 cpuall_len = 0;
4934 }
4935
4936 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
4937 total_len += cpuall_len;
4938
4939 out:
4940 d->cached = 1;
4941 d->size = total_len;
4942 if (total_len > size)
4943 total_len = size;
4944
4945 memcpy(buf, d->buf, total_len);
4946 return total_len;
4947 }
4948
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 */
4956 static double get_reaper_busy(pid_t task)
4957 {
4958 __do_free char *cgroup = NULL, *usage_str = NULL;
4959 unsigned long usage = 0;
4960 pid_t initpid;
4961
4962 initpid = lookup_initpid_in_store(task);
4963 if (initpid <= 0)
4964 return 0;
4965
4966 cgroup = get_pid_cgroup(initpid, "cpuacct");
4967 if (!cgroup)
4968 return 0;
4969 prune_init_slice(cgroup);
4970 if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage",
4971 &usage_str))
4972 return 0;
4973
4974 usage = strtoul(usage_str, NULL, 10);
4975 return ((double)usage / 1000000000);
4976 }
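
/*
 * For example, a cpuacct.usage value of "2500000000" (nanoseconds) yields a
 * busy time of 2.5 seconds, which proc_uptime_read() below subtracts from
 * the reaper's age to fake the idle field of /proc/uptime.
 */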
4977
4978 #if RELOADTEST
4979 void iwashere(void)
4980 {
4981 int fd;
4982
4983 fd = creat("/tmp/lxcfs-iwashere", 0644);
4984 if (fd >= 0)
4985 close(fd);
4986 }
4987 #endif
4988
/*
 * We read /proc/uptime and reuse its second field.
 * For the first field, we use the age of the reaper for the calling pid,
 * as returned by get_reaper_age().
 */
4994 static int proc_uptime_read(char *buf, size_t size, off_t offset,
4995 struct fuse_file_info *fi)
4996 {
4997 struct fuse_context *fc = fuse_get_context();
4998 struct file_info *d = (struct file_info *)fi->fh;
4999 double busytime = get_reaper_busy(fc->pid);
5000 char *cache = d->buf;
5001 ssize_t total_len = 0;
5002 double idletime, reaperage;
5003
5004 #if RELOADTEST
5005 iwashere();
5006 #endif
5007
5008 if (offset){
5009 if (!d->cached)
5010 return 0;
5011 if (offset > d->size)
5012 return -EINVAL;
5013 int left = d->size - offset;
5014 total_len = left > size ? size: left;
5015 memcpy(buf, cache + offset, total_len);
5016 return total_len;
5017 }
5018
5019 reaperage = get_reaper_age(fc->pid);
5020 /* To understand why this is done, please read the comment to the
5021 * get_reaper_busy() function.
5022 */
5023 idletime = reaperage;
5024 if (reaperage >= busytime)
5025 idletime = reaperage - busytime;
5026
5027 total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
5028 if (total_len < 0 || total_len >= d->buflen){
5029 lxcfs_error("%s\n", "failed to write to cache");
5030 return 0;
5031 }
5032
5033 d->size = (int)total_len;
5034 d->cached = 1;
5035
5036 if (total_len > size) total_len = size;
5037
5038 memcpy(buf, d->buf, total_len);
5039 return total_len;
5040 }
5041
5042 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
5043 struct fuse_file_info *fi)
5044 {
5045 __do_free char *cg = NULL, *io_serviced_str = NULL,
5046 *io_merged_str = NULL, *io_service_bytes_str = NULL,
5047 *io_wait_time_str = NULL, *io_service_time_str = NULL,
5048 *line = NULL;
5049 __do_fclose FILE *f = NULL;
5050 struct fuse_context *fc = fuse_get_context();
5051 struct file_info *d = (struct file_info *)fi->fh;
5052 unsigned long read = 0, write = 0;
5053 unsigned long read_merged = 0, write_merged = 0;
5054 unsigned long read_sectors = 0, write_sectors = 0;
5055 unsigned long read_ticks = 0, write_ticks = 0;
5056 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
5057 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
5058 char *cache = d->buf;
5059 size_t cache_size = d->buflen;
5060 size_t linelen = 0, total_len = 0;
5061 unsigned int major = 0, minor = 0;
5062 int i = 0;
5063 int ret;
5064 char dev_name[72];
5065
5066 if (offset){
5067 int left;
5068
5069 if (offset > d->size)
5070 return -EINVAL;
5071
5072 if (!d->cached)
5073 return 0;
5074
5075 left = d->size - offset;
5076 total_len = left > size ? size: left;
5077 memcpy(buf, cache + offset, total_len);
5078
5079 return total_len;
5080 }
5081
5082 pid_t initpid = lookup_initpid_in_store(fc->pid);
5083 if (initpid <= 1 || is_shared_pidns(initpid))
5084 initpid = fc->pid;
5085 cg = get_pid_cgroup(initpid, "blkio");
5086 if (!cg)
5087 return read_file_fuse("/proc/diskstats", buf, size, d);
5088 prune_init_slice(cg);
5089
5090 ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
5091 if (ret < 0) {
5092 if (ret == -EOPNOTSUPP)
5093 return read_file_fuse("/proc/diskstats", buf, size, d);
5094 }
5095
5096 ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
5097 if (ret < 0) {
5098 if (ret == -EOPNOTSUPP)
5099 return read_file_fuse("/proc/diskstats", buf, size, d);
5100 }
5101
5102 ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
5103 if (ret < 0) {
5104 if (ret == -EOPNOTSUPP)
5105 return read_file_fuse("/proc/diskstats", buf, size, d);
5106 }
5107
5108 ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
5109 if (ret < 0) {
5110 if (ret == -EOPNOTSUPP)
5111 return read_file_fuse("/proc/diskstats", buf, size, d);
5112 }
5113
5114 ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
5115 if (ret < 0) {
5116 if (ret == -EOPNOTSUPP)
5117 return read_file_fuse("/proc/diskstats", buf, size, d);
5118 }
5119
5120 f = fopen("/proc/diskstats", "r");
5121 if (!f)
5122 return 0;
5123
5124 while (getline(&line, &linelen, f) != -1) {
5125 ssize_t l;
5126 char lbuf[256];
5127
5128 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
5129 if (i != 3)
5130 continue;
5131
5132 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
5133 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
5134 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
5135 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
5136 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
5137 read_sectors = read_sectors/512;
5138 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
5139 write_sectors = write_sectors/512;
5140
5141 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
5142 rd_svctm = rd_svctm/1000000;
5143 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
5144 rd_wait = rd_wait/1000000;
5145 read_ticks = rd_svctm + rd_wait;
5146
5147 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
5148 wr_svctm = wr_svctm/1000000;
5149 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
5150 wr_wait = wr_wait/1000000;
5151 write_ticks = wr_svctm + wr_wait;
5152
5153 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
5154 tot_ticks = tot_ticks/1000000;
5155
5156 memset(lbuf, 0, 256);
5157 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
5158 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5159 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
5160 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
5161 else
5162 continue;
5163
5164 l = snprintf(cache, cache_size, "%s", lbuf);
5165 if (l < 0) {
5166 perror("Error writing to fuse buf");
5167 return 0;
5168 }
5169 if (l >= cache_size) {
5170 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5171 return 0;
5172 }
5173 cache += l;
5174 cache_size -= l;
5175 total_len += l;
5176 }
5177
5178 d->cached = 1;
5179 d->size = total_len;
if (total_len > size) total_len = size;
5181 memcpy(buf, d->buf, total_len);
5182
5183 return total_len;
5184 }
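
/*
 * A rendered diskstats line (hypothetical device and counters) looks like:
 *
 *   8 0 sda 1420 0 180 12 95 0 32 8 0 20 0
 *
 * mirroring the host's field order but with the counters taken from the
 * container's blkio cgroup; the in-flight and weighted fields (ios_pgr,
 * rq_ticks) are always rendered as zero.
 */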
5185
5186 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5187 struct fuse_file_info *fi)
5188 {
5189 __do_free char *cg = NULL, *memswlimit_str = NULL, *memusage_str = NULL,
5190 *memswusage_str = NULL;
5191 struct fuse_context *fc = fuse_get_context();
5192 struct file_info *d = (struct file_info *)fi->fh;
5193 unsigned long memswlimit = 0, memlimit = 0, memusage = 0,
5194 memswusage = 0, swap_total = 0, swap_free = 0;
5195 ssize_t total_len = 0;
5196 ssize_t l = 0;
5197 char *cache = d->buf;
5198 int ret;
5199
5200 if (offset) {
5201 int left;
5202
5203 if (offset > d->size)
5204 return -EINVAL;
5205
5206 if (!d->cached)
5207 return 0;
5208
5209 left = d->size - offset;
5210 total_len = left > size ? size: left;
5211 memcpy(buf, cache + offset, total_len);
5212
5213 return total_len;
5214 }
5215
5216 pid_t initpid = lookup_initpid_in_store(fc->pid);
5217 if (initpid <= 1 || is_shared_pidns(initpid))
5218 initpid = fc->pid;
5219 cg = get_pid_cgroup(initpid, "memory");
5220 if (!cg)
5221 return read_file_fuse("/proc/swaps", buf, size, d);
5222 prune_init_slice(cg);
5223
5224 memlimit = get_min_memlimit(cg, false);
5225
5226 ret = cgroup_ops->get_memory_current(cgroup_ops, cg, &memusage_str);
5227 if (ret < 0)
5228 return 0;
5229
5230 memusage = strtoul(memusage_str, NULL, 10);
5231
5232 ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cg, &memswlimit_str);
5233 if (ret >= 0)
5234 ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cg, &memswusage_str);
5235 if (ret >= 0) {
5236 memswlimit = get_min_memlimit(cg, true);
5237 memswusage = strtoul(memswusage_str, NULL, 10);
5238 swap_total = (memswlimit - memlimit) / 1024;
5239 swap_free = (memswusage - memusage) / 1024;
5240 }
5241
total_len = snprintf(d->buf, d->buflen, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");

/* When no mem + swap limit is specified or swapaccount=0 */
5245 if (!memswlimit) {
5246 __do_free char *line = NULL;
5247 __do_fclose FILE *f = NULL;
5248 size_t linelen = 0;
5249
5250 f = fopen("/proc/meminfo", "r");
5251 if (!f)
5252 return 0;
5253
5254 while (getline(&line, &linelen, f) != -1) {
5255 if (startswith(line, "SwapTotal:"))
5256 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5257 else if (startswith(line, "SwapFree:"))
5258 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5259 }
5260 }
5261
5262 if (swap_total > 0) {
l = snprintf(d->buf + total_len, d->buflen - total_len,
"none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
swap_total, swap_free);
5266 total_len += l;
5267 }
5268
5269 if (total_len < 0 || l < 0) {
5270 perror("Error writing to cache");
5271 return 0;
5272 }
5273
5274 d->cached = 1;
5275 d->size = (int)total_len;
5276
5277 if (total_len > size) total_len = size;
5278 memcpy(buf, d->buf, total_len);
5279 return total_len;
5280 }
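
/*
 * Example (hypothetical limits): with a 1 GiB memory limit, a 1.5 GiB
 * memory+swap limit and 200 MiB of swap in use, the container reads
 * something like:
 *
 *   Filename                                Type            Size    Used    Priority
 *   none                                    virtual         524288  204800  0
 */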
5281
/*
 * Find the process pids below a cgroup path,
 * e.g. by reading /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs.
 * @pid_buf : the collected pids are appended to pid_buf.
 * @dpath : the path of the cgroup, e.g. /docker/containerid or /docker/containerid/child-cgroup ...
 * @depth : the depth of the cgroup within the container.
 * @sum : returns the number of pids collected.
 * @cfd : the file descriptor of the mounted cgroup, e.g. /sys/fs/cgroup/cpu
 */
5291 static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
5292 {
5293 __do_free char *path = NULL;
5294 __do_close_prot_errno int fd = -EBADF;
5295 __do_fclose FILE *f = NULL;
5296 __do_closedir DIR *dir = NULL;
5297 struct dirent *file;
5298 size_t linelen = 0;
5299 char *line = NULL;
5300 int pd;
5301 char **pid;
5302
/* path = dpath + "/cgroup.procs" + '\0' */
5304 path = malloc(strlen(dpath) + 20);
5305 if (!path)
5306 return sum;
5307
5308 strcpy(path, dpath);
5309 fd = openat(cfd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW);
5310 if (fd < 0)
5311 return sum;
5312
5313 dir = fdopendir(move_fd(fd));
5314 if (!dir)
5315 return sum;
5316
5317 while (((file = readdir(dir)) != NULL) && depth > 0) {
5318 if (strcmp(file->d_name, ".") == 0)
5319 continue;
5320
5321 if (strcmp(file->d_name, "..") == 0)
5322 continue;
5323
5324 if (file->d_type == DT_DIR) {
5325 __do_free char *path_dir = NULL;
5326
/* path + '/' + d_name + '\0' */
5328 path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
5329 if (!path_dir)
5330 return sum;
5331
5332 strcpy(path_dir, path);
5333 strcat(path_dir, "/");
5334 strcat(path_dir, file->d_name);
5335 pd = depth - 1;
5336 sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
5337 }
5338 }
5339
5340 strcat(path, "/cgroup.procs");
5341 fd = openat(cfd, path, O_RDONLY);
5342 if (fd < 0)
5343 return sum;
5344
5345 f = fdopen(move_fd(fd), "r");
5346 if (!f)
5347 return sum;
5348
5349 while (getline(&line, &linelen, f) != -1) {
5350 pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
5351 if (!pid)
5352 return sum;
5353 *pid_buf = pid;
5354
5355 *(*pid_buf + sum) = malloc(strlen(line) + 1);
5356 if (!*(*pid_buf + sum))
5357 return sum;
5358
5359 strcpy(*(*pid_buf + sum), line);
5360 sum++;
5361 }
5362
5363 return sum;
5364 }
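
/*
 * Usage sketch: for a container cgroup "/docker/<id>" with one child cgroup,
 * calc_pid(&pid_buf, "/docker/<id>", DEPTH_DIR, 0, cfd) descends up to
 * DEPTH_DIR levels, appends every line of each cgroup.procs file it finds to
 * pid_buf, and returns the number of pids collected.
 */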
5365
/*
 * calc_load calculates the load according to the following formula:
 * load1 = load0 * exp + active * (1 - exp)
 *
 * @load1: the new loadavg.
 * @load0: the former loadavg.
 * @active: the total number of running pids at this moment.
 * @exp: the fixed-point constant defined at the top of this file.
 */
5375 static unsigned long
5376 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5377 {
5378 unsigned long newload;
5379
5380 active = active > 0 ? active * FIXED_1 : 0;
5381 newload = load * exp + active * (FIXED_1 - exp);
5382 if (active >= load)
5383 newload += FIXED_1 - 1;
5384
5385 return newload / FIXED_1;
5386 }
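
/*
 * Worked example: with FIXED_1 = 1 << 11 = 2048 and EXP_1 = 1884, a single
 * running pid (active = 2048) on top of a previous load of 0 gives
 *
 *   newload = 0 * 1884 + 2048 * (2048 - 1884) + (FIXED_1 - 1)
 *           = 335872 + 2047 = 337919, and 337919 / 2048 = 165,
 *
 * which LOAD_INT()/LOAD_FRAC() render as "0.08" after one 5-second tick.
 */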
5387
/*
 * Returns 0 when the container p->cg has been closed.
 * Returns -1 when an error occurred during the refresh.
 * A positive number is the total number of pids found.
 */
5393 static int refresh_load(struct load_node *p, char *path)
5394 {
5395 __do_free char *line = NULL;
5396 char **idbuf;
5397 char proc_path[256];
5398 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5399 size_t linelen = 0;
5400 int sum, length;
5401 struct dirent *file;
5402
5403 idbuf = malloc(sizeof(char *));
5404 if (!idbuf)
5405 return -1;
5406
5407 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5408 /* normal exit */
5409 if (sum == 0)
5410 goto out;
5411
5412 for (i = 0; i < sum; i++) {
5413 __do_closedir DIR *dp = NULL;
5414
/* strip the trailing '\n' */
5416 length = strlen(idbuf[i])-1;
5417 idbuf[i][length] = '\0';
5418 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5419 if (ret < 0 || ret > 255) {
5420 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5421 i = sum;
5422 sum = -1;
5423 goto err_out;
5424 }
5425
5426 dp = opendir(proc_path);
5427 if (!dp) {
5428 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5429 continue;
5430 }
5431 while ((file = readdir(dp)) != NULL) {
5432 __do_fclose FILE *f = NULL;
5433
5434 if (strncmp(file->d_name, ".", 1) == 0)
5435 continue;
5436 if (strncmp(file->d_name, "..", 1) == 0)
5437 continue;
5438 total_pid++;
/* We make the biggest pid become last_pid. */
ret = atoi(file->d_name);
5441 last_pid = (ret > last_pid) ? ret : last_pid;
5442
5443 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5444 if (ret < 0 || ret > 255) {
5445 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5446 i = sum;
5447 sum = -1;
5448 goto err_out;
5449 }
5450
5451 f = fopen(proc_path, "r");
5452 if (f != NULL) {
5453 while (getline(&line, &linelen, f) != -1) {
5454 /* Find State */
5455 if ((line[0] == 'S') && (line[1] == 't'))
5456 break;
5457 }
5458
5459 if ((line[7] == 'R') || (line[7] == 'D'))
5460 run_pid++;
5461 }
5462 }
5463 }
/* Calculate the loadavg. */
5465 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5466 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5467 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5468 p->run_pid = run_pid;
5469 p->total_pid = total_pid;
5470 p->last_pid = last_pid;
5471
5472 err_out:
5473 for (; i > 0; i--)
5474 free(idbuf[i-1]);
5475 out:
5476 free(idbuf);
5477 return sum;
5478 }
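
/*
 * The status parsing above relies on the fixed layout of the State line in
 * /proc/<pid>/task/<tid>/status, e.g.
 *
 *   State:\tR (running)
 *
 * so line[7] holds the one-letter state. 'R' (running) and 'D'
 * (uninterruptible sleep) are counted towards the run queue, matching the
 * kernel's own loadavg accounting.
 */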
5479
5480 /*
5481 * Traverse the hash table and update it.
5482 */
5483 void *load_begin(void *arg)
5484 {
int i, sum, length, ret;
5487 struct load_node *f;
5488 int first_node;
5489 clock_t time1, time2;
5490
5491 while (1) {
5492 if (loadavg_stop == 1)
5493 return NULL;
5494
5495 time1 = clock();
5496 for (i = 0; i < LOAD_SIZE; i++) {
5497 pthread_mutex_lock(&load_hash[i].lock);
5498 if (load_hash[i].next == NULL) {
5499 pthread_mutex_unlock(&load_hash[i].lock);
5500 continue;
5501 }
5502 f = load_hash[i].next;
5503 first_node = 1;
5504 while (f) {
5505 __do_free char *path = NULL;
5506
5507 length = strlen(f->cg) + 2;
5508 /* strlen(f->cg) + '.' or '' + \0 */
5509 path = malloc(length);
5510 if (!path)
5511 goto out;
5512
5513 ret = snprintf(path, length, "%s%s", dot_or_empty(f->cg), f->cg);
5514 if (ret < 0 || ret > length - 1) {
5515 /* snprintf failed, ignore the node.*/
5516 lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
5517 goto out;
5518 }
5519
5520 sum = refresh_load(f, path);
5521 if (sum == 0)
5522 f = del_node(f, i);
5523 else
5524 out: f = f->next;
5525 /* load_hash[i].lock locks only on the first node.*/
5526 if (first_node == 1) {
5527 first_node = 0;
5528 pthread_mutex_unlock(&load_hash[i].lock);
5529 }
5530 }
5531 }
5532
5533 if (loadavg_stop == 1)
5534 return NULL;
5535
time2 = clock();
/* Clamp so that a refresh pass longer than FLUSH_TIME cannot underflow
 * into a near-infinite sleep. */
long sleep_time = FLUSH_TIME * 1000000L - (long)((time2 - time1) * 1000000 / CLOCKS_PER_SEC);
if (sleep_time > 0)
usleep(sleep_time);
5538 }
5539 }
5540
5541 static int proc_loadavg_read(char *buf, size_t size, off_t offset,
5542 struct fuse_file_info *fi)
5543 {
5544 struct fuse_context *fc = fuse_get_context();
5545 struct file_info *d = (struct file_info *)fi->fh;
5546 pid_t initpid;
5547 char *cg;
5548 size_t total_len = 0;
5549 char *cache = d->buf;
5550 struct load_node *n;
5551 int hash;
5552 int cfd, rv = 0;
5553 unsigned long a, b, c;
5554
5555 if (offset) {
5556 if (offset > d->size)
5557 return -EINVAL;
5558 if (!d->cached)
5559 return 0;
5560 int left = d->size - offset;
5561 total_len = left > size ? size : left;
5562 memcpy(buf, cache + offset, total_len);
5563 return total_len;
5564 }
5565 if (!loadavg)
5566 return read_file_fuse("/proc/loadavg", buf, size, d);
5567
5568 initpid = lookup_initpid_in_store(fc->pid);
5569 if (initpid <= 1 || is_shared_pidns(initpid))
5570 initpid = fc->pid;
5571 cg = get_pid_cgroup(initpid, "cpu");
5572 if (!cg)
5573 return read_file_fuse("/proc/loadavg", buf, size, d);
5574
5575 prune_init_slice(cg);
5576 hash = calc_hash(cg) % LOAD_SIZE;
5577 n = locate_node(cg, hash);
5578
5579 /* First time */
5580 if (n == NULL) {
5581 cfd = find_mounted_controller("cpu");
if (cfd < 0) {
5583 /*
5584 * In locate_node() above, pthread_rwlock_unlock() isn't used
5585 * because delete is not allowed before read has ended.
5586 */
5587 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5588 rv = 0;
5589 goto err;
5590 }
5591 do {
5592 n = malloc(sizeof(struct load_node));
5593 } while (!n);
5594
5595 do {
5596 n->cg = malloc(strlen(cg)+1);
5597 } while (!n->cg);
5598 strcpy(n->cg, cg);
5599 n->avenrun[0] = 0;
5600 n->avenrun[1] = 0;
5601 n->avenrun[2] = 0;
5602 n->run_pid = 0;
5603 n->total_pid = 1;
5604 n->last_pid = initpid;
5605 n->cfd = cfd;
5606 insert_node(&n, hash);
5607 }
5608 a = n->avenrun[0] + (FIXED_1/200);
5609 b = n->avenrun[1] + (FIXED_1/200);
5610 c = n->avenrun[2] + (FIXED_1/200);
5611 total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
5612 LOAD_INT(a), LOAD_FRAC(a),
5613 LOAD_INT(b), LOAD_FRAC(b),
5614 LOAD_INT(c), LOAD_FRAC(c),
5615 n->run_pid, n->total_pid, n->last_pid);
5616 pthread_rwlock_unlock(&load_hash[hash].rdlock);
5617 if (total_len < 0 || total_len >= d->buflen) {
5618 lxcfs_error("%s\n", "Failed to write to cache");
5619 rv = 0;
5620 goto err;
5621 }
5622 d->size = (int)total_len;
5623 d->cached = 1;
5624
5625 if (total_len > size)
5626 total_len = size;
5627 memcpy(buf, d->buf, total_len);
5628 rv = total_len;
5629
5630 err:
5631 free(cg);
5632 return rv;
5633 }
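
/*
 * The rendered line matches the host's format, e.g. (hypothetical values):
 *
 *   0.08 0.03 0.01 1/112 4567
 *
 * where 1/112 is run_pid/total_pid for the container and 4567 its last_pid.
 */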
/* Return a positive number on success, return 0 on failure. */
5635 pthread_t load_daemon(int load_use)
5636 {
5637 int ret;
5638 pthread_t pid;
5639
5640 ret = init_load();
5641 if (ret == -1) {
lxcfs_error("%s\n", "Initializing the hash table failed in load_daemon!");
5643 return 0;
5644 }
5645 ret = pthread_create(&pid, NULL, load_begin, NULL);
5646 if (ret != 0) {
lxcfs_error("%s\n", "Creating the load thread failed in load_daemon!");
5648 load_free();
5649 return 0;
5650 }
/* use loadavg; here loadavg = 1 */
5652 loadavg = load_use;
5653 return pid;
5654 }
5655
5656 /* Returns 0 on success. */
5657 int stop_load_daemon(pthread_t pid)
5658 {
5659 int s;
5660
5661 /* Signal the thread to gracefully stop */
5662 loadavg_stop = 1;
5663
s = pthread_join(pid, NULL); /* Make sure the load thread has exited. */
5665 if (s != 0) {
5666 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5667 return -1;
5668 }
5669
5670 load_free();
5671 loadavg_stop = 0;
5672
5673 return 0;
5674 }
5675
5676 static off_t get_procfile_size(const char *which)
5677 {
5678 FILE *f = fopen(which, "r");
5679 char *line = NULL;
5680 size_t len = 0;
5681 ssize_t sz, answer = 0;
5682 if (!f)
5683 return 0;
5684
5685 while ((sz = getline(&line, &len, f)) != -1)
5686 answer += sz;
fclose(f);
5688 free(line);
5689
5690 return answer;
5691 }
5692
5693 int proc_getattr(const char *path, struct stat *sb)
5694 {
5695 struct timespec now;
5696
5697 memset(sb, 0, sizeof(struct stat));
5698 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5699 return -EINVAL;
5700 sb->st_uid = sb->st_gid = 0;
5701 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5702 if (strcmp(path, "/proc") == 0) {
5703 sb->st_mode = S_IFDIR | 00555;
5704 sb->st_nlink = 2;
5705 return 0;
5706 }
5707 if (strcmp(path, "/proc/meminfo") == 0 ||
5708 strcmp(path, "/proc/cpuinfo") == 0 ||
5709 strcmp(path, "/proc/uptime") == 0 ||
5710 strcmp(path, "/proc/stat") == 0 ||
5711 strcmp(path, "/proc/diskstats") == 0 ||
5712 strcmp(path, "/proc/swaps") == 0 ||
5713 strcmp(path, "/proc/loadavg") == 0) {
5714 sb->st_size = 0;
5715 sb->st_mode = S_IFREG | 00444;
5716 sb->st_nlink = 1;
5717 return 0;
5718 }
5719
5720 return -ENOENT;
5721 }
5722
5723 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5724 struct fuse_file_info *fi)
5725 {
5726 if (filler(buf, ".", NULL, 0) != 0 ||
5727 filler(buf, "..", NULL, 0) != 0 ||
5728 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5729 filler(buf, "meminfo", NULL, 0) != 0 ||
5730 filler(buf, "stat", NULL, 0) != 0 ||
5731 filler(buf, "uptime", NULL, 0) != 0 ||
5732 filler(buf, "diskstats", NULL, 0) != 0 ||
5733 filler(buf, "swaps", NULL, 0) != 0 ||
5734 filler(buf, "loadavg", NULL, 0) != 0)
5735 return -EINVAL;
5736 return 0;
5737 }
5738
5739 int proc_open(const char *path, struct fuse_file_info *fi)
5740 {
5741 int type = -1;
5742 struct file_info *info;
5743
5744 if (strcmp(path, "/proc/meminfo") == 0)
5745 type = LXC_TYPE_PROC_MEMINFO;
5746 else if (strcmp(path, "/proc/cpuinfo") == 0)
5747 type = LXC_TYPE_PROC_CPUINFO;
5748 else if (strcmp(path, "/proc/uptime") == 0)
5749 type = LXC_TYPE_PROC_UPTIME;
5750 else if (strcmp(path, "/proc/stat") == 0)
5751 type = LXC_TYPE_PROC_STAT;
5752 else if (strcmp(path, "/proc/diskstats") == 0)
5753 type = LXC_TYPE_PROC_DISKSTATS;
5754 else if (strcmp(path, "/proc/swaps") == 0)
5755 type = LXC_TYPE_PROC_SWAPS;
5756 else if (strcmp(path, "/proc/loadavg") == 0)
5757 type = LXC_TYPE_PROC_LOADAVG;
5758 if (type == -1)
5759 return -ENOENT;
5760
5761 info = malloc(sizeof(*info));
5762 if (!info)
5763 return -ENOMEM;
5764
5765 memset(info, 0, sizeof(*info));
5766 info->type = type;
5767
5768 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
5769 do {
5770 info->buf = malloc(info->buflen);
5771 } while (!info->buf);
5772 memset(info->buf, 0, info->buflen);
5773 /* set actual size to buffer size */
5774 info->size = info->buflen;
5775
5776 fi->fh = (unsigned long)info;
5777 return 0;
5778 }
5779
5780 int proc_access(const char *path, int mask)
5781 {
5782 if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
5783 return 0;
5784
5785 /* these are all read-only */
5786 if ((mask & ~R_OK) != 0)
5787 return -EACCES;
5788 return 0;
5789 }
5790
5791 int proc_release(const char *path, struct fuse_file_info *fi)
5792 {
5793 do_release_file_info(fi);
5794 return 0;
5795 }
5796
5797 int proc_read(const char *path, char *buf, size_t size, off_t offset,
5798 struct fuse_file_info *fi)
5799 {
5800 struct file_info *f = (struct file_info *) fi->fh;
5801
5802 switch (f->type) {
5803 case LXC_TYPE_PROC_MEMINFO:
5804 return proc_meminfo_read(buf, size, offset, fi);
5805 case LXC_TYPE_PROC_CPUINFO:
5806 return proc_cpuinfo_read(buf, size, offset, fi);
5807 case LXC_TYPE_PROC_UPTIME:
5808 return proc_uptime_read(buf, size, offset, fi);
5809 case LXC_TYPE_PROC_STAT:
5810 return proc_stat_read(buf, size, offset, fi);
5811 case LXC_TYPE_PROC_DISKSTATS:
5812 return proc_diskstats_read(buf, size, offset, fi);
5813 case LXC_TYPE_PROC_SWAPS:
5814 return proc_swaps_read(buf, size, offset, fi);
5815 case LXC_TYPE_PROC_LOADAVG:
5816 return proc_loadavg_read(buf, size, offset, fi);
5817 default:
5818 return -EINVAL;
5819 }
5820 }
5821
5822 /*
5823 * Functions needed to setup cgroups in the __constructor__.
5824 */
5825
5826 static bool umount_if_mounted(void)
5827 {
5828 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
5829 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
5830 return false;
5831 }
5832 return true;
5833 }
5834
5835 /* __typeof__ should be safe to use with all compilers. */
5836 typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
5837 static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
5838 {
5839 return (fs->f_type == (fs_type_magic)magic_val);
5840 }
5841
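/*
 * Minimal usage sketch:
 *
 *   struct statfs sb;
 *   if (statfs("/", &sb) == 0 && has_fs_type(&sb, RAMFS_MAGIC))
 *           ... we are on a ramfs ...
 *
 * The fs_type_magic cast papers over f_type having a different integer type
 * across libcs and architectures.
 */
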
5842 /*
5843 * looking at fs/proc_namespace.c, it appears we can
5844 * actually expect the rootfs entry to very specifically contain
5845 * " - rootfs rootfs "
5846 * IIUC, so long as we've chrooted so that rootfs is not our root,
5847 * the rootfs entry should always be skipped in mountinfo contents.
5848 */
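/*
 * For reference, a matching mountinfo entry would look roughly like this
 * (hypothetical ids):
 *
 *   1 1 0:1 / / rw - rootfs rootfs rw
 *
 * The parser below skips four space-separated fields, checks that the fifth
 * (the mount point) is "/", then looks for "- rootfs rootfs " after the
 * separator.
 */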
5849 static bool is_on_ramfs(void)
5850 {
5851 FILE *f;
5852 char *p, *p2;
5853 char *line = NULL;
5854 size_t len = 0;
5855 int i;
5856
5857 f = fopen("/proc/self/mountinfo", "r");
5858 if (!f)
5859 return false;
5860
5861 while (getline(&line, &len, f) != -1) {
5862 for (p = line, i = 0; p && i < 4; i++)
5863 p = strchr(p + 1, ' ');
5864 if (!p)
5865 continue;
5866 p2 = strchr(p + 1, ' ');
5867 if (!p2)
5868 continue;
5869 *p2 = '\0';
5870 if (strcmp(p + 1, "/") == 0) {
5871 // this is '/'. is it the ramfs?
5872 p = strchr(p2 + 1, '-');
5873 if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
5874 free(line);
5875 fclose(f);
5876 return true;
5877 }
5878 }
5879 }
5880 free(line);
5881 fclose(f);
5882 return false;
5883 }
5884
5885 static int pivot_enter()
5886 {
5887 int ret = -1, oldroot = -1, newroot = -1;
5888
5889 oldroot = open("/", O_DIRECTORY | O_RDONLY);
5890 if (oldroot < 0) {
5891 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
5892 return ret;
5893 }
5894
5895 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
5896 if (newroot < 0) {
5897 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
5898 goto err;
5899 }
5900
5901 /* change into new root fs */
5902 if (fchdir(newroot) < 0) {
5903 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
5904 goto err;
5905 }
5906
5907 /* pivot_root into our new root fs */
5908 if (pivot_root(".", ".") < 0) {
5909 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
5910 goto err;
5911 }
5912
/*
 * At this point the old root is mounted on top of our new root. To
 * unmount it we must not be chdir'd into it, so escape back to the
 * old root.
 */
5918 if (fchdir(oldroot) < 0) {
5919 lxcfs_error("%s\n", "Failed to enter old root.");
5920 goto err;
5921 }
5922
5923 if (umount2(".", MNT_DETACH) < 0) {
5924 lxcfs_error("%s\n", "Failed to detach old root.");
5925 goto err;
5926 }
5927
5928 if (fchdir(newroot) < 0) {
5929 lxcfs_error("%s\n", "Failed to re-enter new root.");
5930 goto err;
5931 }
5932
5933 ret = 0;
5934
5935 err:
if (oldroot >= 0)
close(oldroot);
if (newroot >= 0)
close(newroot);
5940
5941 return ret;
5942 }
5943
5944 static int chroot_enter()
5945 {
5946 if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
lxcfs_error("Failed to recursively bind-mount %s into /.\n", ROOTDIR);
5948 return -1;
5949 }
5950
5951 if (chroot(".") < 0) {
5952 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
5953 return -1;
5954 }
5955
5956 if (chdir("/") < 0) {
5957 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
5958 return -1;
5959 }
5960
5961 return 0;
5962 }
5963
5964 static int permute_and_enter(void)
5965 {
5966 struct statfs sb;
5967
5968 if (statfs("/", &sb) < 0) {
5969 lxcfs_error("%s\n", "Could not stat / mountpoint.");
5970 return -1;
5971 }
5972
/* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
 * likely report TMPFS_MAGIC. Hence, when it reports no match we still
 * check /proc/self/mountinfo. */
5976 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
5977 return chroot_enter();
5978
5979 if (pivot_enter() < 0) {
5980 lxcfs_error("%s\n", "Could not perform pivot root.");
5981 return -1;
5982 }
5983
5984 return 0;
5985 }
5986
5987 /* Prepare our new clean root. */
5988 static int permute_prepare(void)
5989 {
5990 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
5991 lxcfs_error("%s\n", "Failed to create directory for new root.");
5992 return -1;
5993 }
5994
5995 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
5996 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
5997 return -1;
5998 }
5999
6000 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6001 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6002 return -1;
6003 }
6004
6005 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
lxcfs_error("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6007 return -1;
6008 }
6009
6010 return 0;
6011 }
6012
6013 /* Calls chroot() on ramfs, pivot_root() in all other cases. */
6014 static bool permute_root(void)
6015 {
6016 /* Prepare new root. */
6017 if (permute_prepare() < 0)
6018 return false;
6019
6020 /* Pivot into new root. */
6021 if (permute_and_enter() < 0)
6022 return false;
6023
6024 return true;
6025 }
6026
6027 static int preserve_mnt_ns(int pid)
6028 {
6029 int ret;
6030 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6031 char path[len];
6032
6033 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6034 if (ret < 0 || (size_t)ret >= len)
6035 return -1;
6036
6037 return open(path, O_RDONLY | O_CLOEXEC);
6038 }
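
/*
 * For example, preserve_mnt_ns(getpid()) opens "/proc/<pid>/ns/mnt" and
 * returns an fd that a later setns(fd, 0) can use to re-enter this mount
 * namespace, which is exactly how lxcfs_init() restores the initial
 * namespace below.
 */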
6039
6040 static bool cgfs_prepare_mounts(void)
6041 {
6042 if (!mkdir_p(BASEDIR, 0700)) {
6043 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
6044 return false;
6045 }
6046
6047 if (!umount_if_mounted()) {
6048 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
6049 return false;
6050 }
6051
6052 if (unshare(CLONE_NEWNS) < 0) {
6053 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
6054 return false;
6055 }
6056
6057 cgroup_ops->mntns_fd = preserve_mnt_ns(getpid());
6058 if (cgroup_ops->mntns_fd < 0) {
6059 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
6060 return false;
6061 }
6062
6063 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
6064 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
6065 return false;
6066 }
6067
6068 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
6069 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
6070 return false;
6071 }
6072
6073 return true;
6074 }
6075
6076 static bool cgfs_mount_hierarchies(void)
6077 {
6078 if (!mkdir_p(BASEDIR DEFAULT_CGROUP_MOUNTPOINT, 0755))
6079 return false;
6080
6081 if (!cgroup_ops->mount(cgroup_ops, BASEDIR))
6082 return false;
6083
6084 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
6085 __do_free char *path = must_make_path(BASEDIR, (*h)->mountpoint, NULL);
6086 (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
6087 if ((*h)->fd < 0)
6088 return false;
6089 }
6090
6091 return true;
6092 }
6093
6094 static bool cgfs_setup_controllers(void)
6095 {
6096 if (!cgfs_prepare_mounts())
6097 return false;
6098
6099 if (!cgfs_mount_hierarchies()) {
6100 lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
6101 return false;
6102 }
6103
6104 if (!permute_root())
6105 return false;
6106
6107 return true;
6108 }
6109
6110 static void __attribute__((constructor)) lxcfs_init(void)
6111 {
6112 __do_close_prot_errno int init_ns = -EBADF;
6113 char *cret;
6114 char cwd[MAXPATHLEN];
6115
6116 cgroup_ops = cgroup_init();
6117 if (!cgroup_ops)
6118 log_exit("Failed to initialize cgroup support");
6119
6120 /* Preserve initial namespace. */
6121 init_ns = preserve_mnt_ns(getpid());
6122 if (init_ns < 0)
6123 log_exit("Failed to preserve initial mount namespace");
6124
cret = getcwd(cwd, MAXPATHLEN);
if (!cret)
log_exit("%s - Could not retrieve current working directory", strerror(errno));
6127
/* This function calls unshare(CLONE_NEWNS), leaving our initial mount
 * namespace, in order to privately mount the lxcfs cgroups. */
6130 if (!cgfs_setup_controllers())
6131 log_exit("Failed to setup private cgroup mounts for lxcfs");
6132
6133 if (setns(init_ns, 0) < 0)
6134 log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno));
6135
6136 if (!cret || chdir(cwd) < 0)
6137 log_exit("%s - Could not change back to original working directory", strerror(errno));
6138
6139 if (!init_cpuview())
6140 log_exit("Failed to init CPU view");
6141
6142 print_subsystems();
6143 }
6144
6145 static void __attribute__((destructor)) lxcfs_exit(void)
6146 {
6147 lxcfs_debug("%s\n", "Running destructor for liblxcfs");
6148 free_cpuview();
6149 cgroup_exit(cgroup_ops);
6150 }