bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdbool.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <time.h>
  26 #include <unistd.h>
  27 #include <wait.h>
  28 #include <linux/magic.h>
  29 #include <linux/sched.h>
  30 #include <sys/epoll.h>
  31 #include <sys/mman.h>
  32 #include <sys/mount.h>
  33 #include <sys/param.h>
  34 #include <sys/socket.h>
  35 #include <sys/syscall.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/vfs.h>
  38
  39 #include "bindings.h"
  40 #include "config.h" // for VERSION
  41
/* The largest 64-bit unsigned value, 2^64 - 1 = 18446744073709551615, has 20 decimal digits; 21 presumably leaves room for the terminating NUL byte. */
  43 #define LXCFS_NUMSTRLEN64 21
  44
  45 /* Define pivot_root() if missing from the C library */
  46 #ifndef HAVE_PIVOT_ROOT
  47 static int pivot_root(const char * new_root, const char * put_old)
  48 {
  49 #ifdef __NR_pivot_root
  50 return syscall(__NR_pivot_root, new_root, put_old);
  51 #else
  52 errno = ENOSYS;
  53 return -1;
  54 #endif
  55 }
  56 #else
  57 extern int pivot_root(const char * new_root, const char * put_old);
  58 #endif
  59
  60 enum {
  61         LXC_TYPE_CGDIR,
  62         LXC_TYPE_CGFILE,
  63         LXC_TYPE_PROC_MEMINFO,
  64         LXC_TYPE_PROC_CPUINFO,
  65         LXC_TYPE_PROC_UPTIME,
  66         LXC_TYPE_PROC_STAT,
  67         LXC_TYPE_PROC_DISKSTATS,
  68         LXC_TYPE_PROC_SWAPS,
  69         LXC_TYPE_PROC_LOADAVG,
  70 };
  71
  72 struct file_info {
  73         char *controller;
  74         char *cgroup;
  75         char *file;
  76         int type;
  77         char *buf;  // unused as of yet
  78         int buflen;
  79         int size; //actual data size
  80         int cached;
  81 };
  82
  83 /* The function of hash table.*/
  84 #define LOAD_SIZE 100 /*the size of hash_table */
  85 static int calc_hash(char *name)
  86 {
  87         unsigned int hash = 0;
  88         unsigned int x = 0;
  89         /* ELFHash algorithm. */
  90         while (*name) {
  91                 hash = (hash << 4) + *name++;
  92                 x = hash & 0xf0000000;
  93                 if (x != 0)
  94                         hash ^= (x >> 24);
  95                 hash &= ~x;
  96         }
  97         return ((hash & 0x7fffffff) % LOAD_SIZE);
  98 }
  99
 100 struct load_node {
 101         char *cg;  /*cg */
 102         unsigned long avenrun[3];               /* Load averages */
 103         unsigned int run_pid;
 104         unsigned int total_pid;
 105         unsigned int last_pid;
 106         int cfd; /* The file descriptor of the mounted cgroup */
 107         struct  load_node *next;
 108         struct  load_node **pre;
 109 };
 110
 111 struct load_head {
 112         /*
 113          * The lock is about insert load_node and refresh load_node.To the first
 114          * load_node of each hash bucket, insert and refresh in this hash bucket is
 115          * mutually exclusive.
 116          */
 117         pthread_mutex_t lock;
 118         /*
 119          * The rdlock is about read loadavg and delete load_node.To each hash
 120          * bucket, read and delete is mutually exclusive. But at the same time, we
 121          * allow paratactic read operation. This rdlock is at list level.
 122          */
 123         pthread_rwlock_t rdlock;
 124         /*
 125          * The rilock is about read loadavg and insert load_node.To the first
 126          * load_node of each hash bucket, read and insert is mutually exclusive.
 127          * But at the same time, we allow paratactic read operation.
 128          */
 129         pthread_rwlock_t rilock;
 130         struct load_node *next;
 131 };
 132
 133 static struct load_head load_hash[LOAD_SIZE]; /* hash table */
 134 /*
 135  * init_load initialize the hash table.
 136  * Return 0 on success, return -1 on failure.
 137  */
 138 static int init_load(void)
 139 {
 140         int i;
 141         int ret;
 142
 143         for (i = 0; i < LOAD_SIZE; i++) {
 144                 load_hash[i].next = NULL;
 145                 ret = pthread_mutex_init(&load_hash[i].lock, NULL);
 146                 if (ret != 0) {
 147                         lxcfs_error("%s\n", "Failed to initialize lock");
 148                         goto out3;
 149                 }
 150                 ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
 151                 if (ret != 0) {
 152                         lxcfs_error("%s\n", "Failed to initialize rdlock");
 153                         goto out2;
 154                 }
 155                 ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
 156                 if (ret != 0) {
 157                         lxcfs_error("%s\n", "Failed to initialize rilock");
 158                         goto out1;
 159                 }
 160         }
 161         return 0;
 162 out1:
 163         pthread_rwlock_destroy(&load_hash[i].rdlock);
 164 out2:
 165         pthread_mutex_destroy(&load_hash[i].lock);
 166 out3:
 167         while (i > 0) {
 168                 i--;
 169                 pthread_mutex_destroy(&load_hash[i].lock);
 170                 pthread_rwlock_destroy(&load_hash[i].rdlock);
 171                 pthread_rwlock_destroy(&load_hash[i].rilock);
 172         }
 173         return -1;
 174 }
 175
 176 static void insert_node(struct load_node **n, int locate)
 177 {
 178         struct load_node *f;
 179
 180         pthread_mutex_lock(&load_hash[locate].lock);
 181         pthread_rwlock_wrlock(&load_hash[locate].rilock);
 182         f = load_hash[locate].next;
 183         load_hash[locate].next = *n;
 184
 185         (*n)->pre = &(load_hash[locate].next);
 186         if (f)
 187                 f->pre = &((*n)->next);
 188         (*n)->next = f;
 189         pthread_mutex_unlock(&load_hash[locate].lock);
 190         pthread_rwlock_unlock(&load_hash[locate].rilock);
 191 }
 192 /*
 193  * locate_node() finds special node. Not return NULL means success.
 194  * It should be noted that rdlock isn't unlocked at the end of code
 195  * because this function is used to read special node. Delete is not
 196  * allowed before read has ended.
 197  * unlock rdlock only in proc_loadavg_read().
 198  */
 199 static struct load_node *locate_node(char *cg, int locate)
 200 {
 201         struct load_node *f = NULL;
 202         int i = 0;
 203
 204         pthread_rwlock_rdlock(&load_hash[locate].rilock);
 205         pthread_rwlock_rdlock(&load_hash[locate].rdlock);
 206         if (load_hash[locate].next == NULL) {
 207                 pthread_rwlock_unlock(&load_hash[locate].rilock);
 208                 return f;
 209         }
 210         f = load_hash[locate].next;
 211         pthread_rwlock_unlock(&load_hash[locate].rilock);
 212         while (f && ((i = strcmp(f->cg, cg)) != 0))
 213                 f = f->next;
 214         return f;
 215 }
 216 /* Delete the load_node n and return the next node of it. */
 217 static struct load_node *del_node(struct load_node *n, int locate)
 218 {
 219         struct load_node *g;
 220
 221         pthread_rwlock_wrlock(&load_hash[locate].rdlock);
 222         if (n->next == NULL) {
 223                 *(n->pre) = NULL;
 224         } else {
 225                 *(n->pre) = n->next;
 226                 n->next->pre = n->pre;
 227         }
 228         g = n->next;
 229         free(n->cg);
 230         free(n);
 231         pthread_rwlock_unlock(&load_hash[locate].rdlock);
 232         return g;
 233 }
 234
 235 /* Reserve buffer size to account for file size changes. */
 236 #define BUF_RESERVE_SIZE 512
 237
 238 /*
 239  * A table caching which pid is init for a pid namespace.
 240  * When looking up which pid is init for $qpid, we first
 241  * 1. Stat /proc/$qpid/ns/pid.
 242  * 2. Check whether the ino_t is in our store.
 243  *   a. if not, fork a child in qpid's ns to send us
 244  *       ucred.pid = 1, and read the initpid.  Cache
 245  *       initpid and creation time for /proc/initpid
 246  *       in a new store entry.
 247  *   b. if so, verify that /proc/initpid still matches
 248  *       what we have saved.  If not, clear the store
 249  *       entry and go back to a.  If so, return the
 250  *       cached initpid.
 251  */
 252 struct pidns_init_store {
 253         ino_t ino;          // inode number for /proc/$pid/ns/pid
 254         pid_t initpid;      // the pid of nit in that ns
 255         long int ctime;     // the time at which /proc/$initpid was created
 256         struct pidns_init_store *next;
 257         long int lastcheck;
 258 };
 259
 260 /* lol - look at how they are allocated in the kernel */
 261 #define PIDNS_HASH_SIZE 4096
 262 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 263
 264 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 265 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 266 static void lock_mutex(pthread_mutex_t *l)
 267 {
 268         int ret;
 269
 270         if ((ret = pthread_mutex_lock(l)) != 0) {
 271                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 272                 exit(1);
 273         }
 274 }
 275
 276 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 277  * Number of hierarchies mounted. */
 278 static int num_hierarchies;
 279
 280 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 281  * Hierachies mounted {cpuset, blkio, ...}:
 282  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 283 static char **hierarchies;
 284
 285 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 286  * Open file descriptors:
 287  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 288  * private mount namespace.
 289  * Initialized via __constructor__ collect_and_mount_subsystems().
 290  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 291  * mounts and respective files in the private namespace even when located in
 292  * another namespace using the *at() family of functions
 293  * {openat(), fchownat(), ...}. */
 294 static int *fd_hierarchies;
 295 static int cgroup_mount_ns_fd = -1;
 296
 297 static void unlock_mutex(pthread_mutex_t *l)
 298 {
 299         int ret;
 300
 301         if ((ret = pthread_mutex_unlock(l)) != 0) {
 302                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 303                 exit(1);
 304         }
 305 }
 306
 307 static void store_lock(void)
 308 {
 309         lock_mutex(&pidns_store_mutex);
 310 }
 311
 312 static void store_unlock(void)
 313 {
 314         unlock_mutex(&pidns_store_mutex);
 315 }
 316
 317 /* Must be called under store_lock */
 318 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 319 {
 320         struct stat initsb;
 321         char fnam[100];
 322
 323         snprintf(fnam, 100, "/proc/%d", e->initpid);
 324         if (stat(fnam, &initsb) < 0)
 325                 return false;
 326
 327         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 328                     initsb.st_ctime, e->initpid);
 329
 330         if (e->ctime != initsb.st_ctime)
 331                 return false;
 332         return true;
 333 }
 334
 335 /* Must be called under store_lock */
 336 static void remove_initpid(struct pidns_init_store *e)
 337 {
 338         struct pidns_init_store *tmp;
 339         int h;
 340
 341         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 342
 343         h = HASH(e->ino);
 344         if (pidns_hash_table[h] == e) {
 345                 pidns_hash_table[h] = e->next;
 346                 free(e);
 347                 return;
 348         }
 349
 350         tmp = pidns_hash_table[h];
 351         while (tmp) {
 352                 if (tmp->next == e) {
 353                         tmp->next = e->next;
 354                         free(e);
 355                         return;
 356                 }
 357                 tmp = tmp->next;
 358         }
 359 }
 360
 361 #define PURGE_SECS 5
 362 /* Must be called under store_lock */
 363 static void prune_initpid_store(void)
 364 {
 365         static long int last_prune = 0;
 366         struct pidns_init_store *e, *prev, *delme;
 367         long int now, threshold;
 368         int i;
 369
 370         if (!last_prune) {
 371                 last_prune = time(NULL);
 372                 return;
 373         }
 374         now = time(NULL);
 375         if (now < last_prune + PURGE_SECS)
 376                 return;
 377
 378         lxcfs_debug("%s\n", "Pruning.");
 379
 380         last_prune = now;
 381         threshold = now - 2 * PURGE_SECS;
 382
 383         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 384                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 385                         if (e->lastcheck < threshold) {
 386
 387                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 388
 389                                 delme = e;
 390                                 if (prev)
 391                                         prev->next = e->next;
 392                                 else
 393                                         pidns_hash_table[i] = e->next;
 394                                 e = e->next;
 395                                 free(delme);
 396                         } else {
 397                                 prev = e;
 398                                 e = e->next;
 399                         }
 400                 }
 401         }
 402 }
 403
 404 /* Must be called under store_lock */
 405 static void save_initpid(struct stat *sb, pid_t pid)
 406 {
 407         struct pidns_init_store *e;
 408         char fpath[100];
 409         struct stat procsb;
 410         int h;
 411
 412         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 413
 414         snprintf(fpath, 100, "/proc/%d", pid);
 415         if (stat(fpath, &procsb) < 0)
 416                 return;
 417         do {
 418                 e = malloc(sizeof(*e));
 419         } while (!e);
 420         e->ino = sb->st_ino;
 421         e->initpid = pid;
 422         e->ctime = procsb.st_ctime;
 423         h = HASH(e->ino);
 424         e->next = pidns_hash_table[h];
 425         e->lastcheck = time(NULL);
 426         pidns_hash_table[h] = e;
 427 }
 428
 429 /*
 430  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 431  * entry for the inode number and creation time.  Verify that the init pid
 432  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 433  * otherwise.
 434  * Must be called under store_lock
 435  */
 436 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 437 {
 438         int h = HASH(sb->st_ino);
 439         struct pidns_init_store *e = pidns_hash_table[h];
 440
 441         while (e) {
 442                 if (e->ino == sb->st_ino) {
 443                         if (initpid_still_valid(e, sb)) {
 444                                 e->lastcheck = time(NULL);
 445                                 return e;
 446                         }
 447                         remove_initpid(e);
 448                         return NULL;
 449                 }
 450                 e = e->next;
 451         }
 452
 453         return NULL;
 454 }
 455
 456 static int is_dir(const char *path, int fd)
 457 {
 458         struct stat statbuf;
 459         int ret = fstatat(fd, path, &statbuf, fd);
 460         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 461                 return 1;
 462         return 0;
 463 }
 464
 465 static char *must_copy_string(const char *str)
 466 {
 467         char *dup = NULL;
 468         if (!str)
 469                 return NULL;
 470         do {
 471                 dup = strdup(str);
 472         } while (!dup);
 473
 474         return dup;
 475 }
 476
 477 static inline void drop_trailing_newlines(char *s)
 478 {
 479         int l;
 480
 481         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 482                 s[l-1] = '\0';
 483 }
 484
 485 #define BATCH_SIZE 50
 486 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 487 {
 488         int newbatches = (newlen / BATCH_SIZE) + 1;
 489         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 490
 491         if (!*mem || newbatches > oldbatches) {
 492                 char *tmp;
 493                 do {
 494                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 495                 } while (!tmp);
 496                 *mem = tmp;
 497         }
 498 }
 499 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 500 {
 501         size_t newlen = *len + linelen;
 502         dorealloc(contents, *len, newlen + 1);
 503         memcpy(*contents + *len, line, linelen+1);
 504         *len = newlen;
 505 }
 506
 507 static char *slurp_file(const char *from, int fd)
 508 {
 509         char *line = NULL;
 510         char *contents = NULL;
 511         FILE *f = fdopen(fd, "r");
 512         size_t len = 0, fulllen = 0;
 513         ssize_t linelen;
 514
 515         if (!f)
 516                 return NULL;
 517
 518         while ((linelen = getline(&line, &len, f)) != -1) {
 519                 append_line(&contents, &fulllen, line, linelen);
 520         }
 521         fclose(f);
 522
 523         if (contents)
 524                 drop_trailing_newlines(contents);
 525         free(line);
 526         return contents;
 527 }
 528
 529 static bool write_string(const char *fnam, const char *string, int fd)
 530 {
 531         FILE *f;
 532         size_t len, ret;
 533
 534         if (!(f = fdopen(fd, "w")))
 535                 return false;
 536         len = strlen(string);
 537         ret = fwrite(string, 1, len, f);
 538         if (ret != len) {
 539                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 540                 fclose(f);
 541                 return false;
 542         }
 543         if (fclose(f) < 0) {
 544                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 545                 return false;
 546         }
 547         return true;
 548 }
 549
 550 struct cgfs_files {
 551         char *name;
 552         uint32_t uid, gid;
 553         uint32_t mode;
 554 };
 555
 556 #define ALLOC_NUM 20
 557 static bool store_hierarchy(char *stridx, char *h)
 558 {
 559         if (num_hierarchies % ALLOC_NUM == 0) {
 560                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 561                 n *= ALLOC_NUM;
 562                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 563                 if (!tmp) {
 564                         lxcfs_error("%s\n", strerror(errno));
 565                         exit(1);
 566                 }
 567                 hierarchies = tmp;
 568         }
 569
 570         hierarchies[num_hierarchies++] = must_copy_string(h);
 571         return true;
 572 }
 573
 574 static void print_subsystems(void)
 575 {
 576         int i;
 577
 578         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 579         fprintf(stderr, "hierarchies:\n");
 580         for (i = 0; i < num_hierarchies; i++) {
 581                 if (hierarchies[i])
 582                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 583                                 fd_hierarchies[i], hierarchies[i]);
 584         }
 585 }
 586
 587 static bool in_comma_list(const char *needle, const char *haystack)
 588 {
 589         const char *s = haystack, *e;
 590         size_t nlen = strlen(needle);
 591
 592         while (*s && (e = strchr(s, ','))) {
 593                 if (nlen != e - s) {
 594                         s = e + 1;
 595                         continue;
 596                 }
 597                 if (strncmp(needle, s, nlen) == 0)
 598                         return true;
 599                 s = e + 1;
 600         }
 601         if (strcmp(needle, s) == 0)
 602                 return true;
 603         return false;
 604 }
 605
 606 /* do we need to do any massaging here?  I'm not sure... */
 607 /* Return the mounted controller and store the corresponding open file descriptor
 608  * referring to the controller mountpoint in the private lxcfs namespace in
 609  * @cfd.
 610  */
 611 static char *find_mounted_controller(const char *controller, int *cfd)
 612 {
 613         int i;
 614
 615         for (i = 0; i < num_hierarchies; i++) {
 616                 if (!hierarchies[i])
 617                         continue;
 618                 if (strcmp(hierarchies[i], controller) == 0) {
 619                         *cfd = fd_hierarchies[i];
 620                         return hierarchies[i];
 621                 }
 622                 if (in_comma_list(controller, hierarchies[i])) {
 623                         *cfd = fd_hierarchies[i];
 624                         return hierarchies[i];
 625                 }
 626         }
 627
 628         return NULL;
 629 }
 630
 631 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 632                 const char *value)
 633 {
 634         int ret, fd, cfd;
 635         size_t len;
 636         char *fnam, *tmpc;
 637
 638         tmpc = find_mounted_controller(controller, &cfd);
 639         if (!tmpc)
 640                 return false;
 641
 642         /* Make sure we pass a relative path to *at() family of functions.
 643          * . + /cgroup + / + file + \0
 644          */
 645         len = strlen(cgroup) + strlen(file) + 3;
 646         fnam = alloca(len);
 647         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 648         if (ret < 0 || (size_t)ret >= len)
 649                 return false;
 650
 651         fd = openat(cfd, fnam, O_WRONLY);
 652         if (fd < 0)
 653                 return false;
 654
 655         return write_string(fnam, value, fd);
 656 }
 657
 658 // Chown all the files in the cgroup directory.  We do this when we create
 659 // a cgroup on behalf of a user.
 660 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 661 {
 662         struct dirent *direntp;
 663         char path[MAXPATHLEN];
 664         size_t len;
 665         DIR *d;
 666         int fd1, ret;
 667
 668         len = strlen(dirname);
 669         if (len >= MAXPATHLEN) {
 670                 lxcfs_error("Pathname too long: %s\n", dirname);
 671                 return;
 672         }
 673
 674         fd1 = openat(fd, dirname, O_DIRECTORY);
 675         if (fd1 < 0)
 676                 return;
 677
 678         d = fdopendir(fd1);
 679         if (!d) {
 680                 lxcfs_error("Failed to open %s\n", dirname);
 681                 return;
 682         }
 683
 684         while ((direntp = readdir(d))) {
 685                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 686                         continue;
 687                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 688                 if (ret < 0 || ret >= MAXPATHLEN) {
 689                         lxcfs_error("Pathname too long under %s\n", dirname);
 690                         continue;
 691                 }
 692                 if (fchownat(fd, path, uid, gid, 0) < 0)
 693                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 694         }
 695         closedir(d);
 696 }
 697
 698 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 699 {
 700         int cfd;
 701         size_t len;
 702         char *dirnam, *tmpc;
 703
 704         tmpc = find_mounted_controller(controller, &cfd);
 705         if (!tmpc)
 706                 return -EINVAL;
 707
 708         /* Make sure we pass a relative path to *at() family of functions.
 709          * . + /cg + \0
 710          */
 711         len = strlen(cg) + 2;
 712         dirnam = alloca(len);
 713         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 714
 715         if (mkdirat(cfd, dirnam, 0755) < 0)
 716                 return -errno;
 717
 718         if (uid == 0 && gid == 0)
 719                 return 0;
 720
 721         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 722                 return -errno;
 723
 724         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 725
 726         return 0;
 727 }
 728
 729 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 730 {
 731         struct dirent *direntp;
 732         DIR *dir;
 733         bool ret = false;
 734         char pathname[MAXPATHLEN];
 735         int dupfd;
 736
 737         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 738         if (dupfd < 0)
 739                 return false;
 740
 741         dir = fdopendir(dupfd);
 742         if (!dir) {
 743                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 744                 close(dupfd);
 745                 return false;
 746         }
 747
 748         while ((direntp = readdir(dir))) {
 749                 struct stat mystat;
 750                 int rc;
 751
 752                 if (!strcmp(direntp->d_name, ".") ||
 753                     !strcmp(direntp->d_name, ".."))
 754                         continue;
 755
 756                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 757                 if (rc < 0 || rc >= MAXPATHLEN) {
 758                         lxcfs_error("%s\n", "Pathname too long.");
 759                         continue;
 760                 }
 761
 762                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 763                 if (rc) {
 764                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 765                         continue;
 766                 }
 767                 if (S_ISDIR(mystat.st_mode))
 768                         if (!recursive_rmdir(pathname, fd, cfd))
 769                                 lxcfs_debug("Error removing %s.\n", pathname);
 770         }
 771
 772         ret = true;
 773         if (closedir(dir) < 0) {
 774                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 775                 ret = false;
 776         }
 777
 778         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 779                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 780                 ret = false;
 781         }
 782
 783         close(dupfd);
 784
 785         return ret;
 786 }
 787
 788 bool cgfs_remove(const char *controller, const char *cg)
 789 {
 790         int fd, cfd;
 791         size_t len;
 792         char *dirnam, *tmpc;
 793         bool bret;
 794
 795         tmpc = find_mounted_controller(controller, &cfd);
 796         if (!tmpc)
 797                 return false;
 798
 799         /* Make sure we pass a relative path to *at() family of functions.
 800          * . +  /cg + \0
 801          */
 802         len = strlen(cg) + 2;
 803         dirnam = alloca(len);
 804         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 805
 806         fd = openat(cfd, dirnam, O_DIRECTORY);
 807         if (fd < 0)
 808                 return false;
 809
 810         bret = recursive_rmdir(dirnam, fd, cfd);
 811         close(fd);
 812         return bret;
 813 }
 814
 815 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 816 {
 817         int cfd;
 818         size_t len;
 819         char *pathname, *tmpc;
 820
 821         tmpc = find_mounted_controller(controller, &cfd);
 822         if (!tmpc)
 823                 return false;
 824
 825         /* Make sure we pass a relative path to *at() family of functions.
 826          * . + /file + \0
 827          */
 828         len = strlen(file) + 2;
 829         pathname = alloca(len);
 830         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 831         if (fchmodat(cfd, pathname, mode, 0) < 0)
 832                 return false;
 833         return true;
 834 }
 835
 836 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 837 {
 838         size_t len;
 839         char *fname;
 840
 841         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 842         fname = alloca(len);
 843         snprintf(fname, len, "%s/tasks", dirname);
 844         if (fchownat(fd, fname, uid, gid, 0) != 0)
 845                 return -errno;
 846         snprintf(fname, len, "%s/cgroup.procs", dirname);
 847         if (fchownat(fd, fname, uid, gid, 0) != 0)
 848                 return -errno;
 849         return 0;
 850 }
 851
 852 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 853 {
 854         int cfd;
 855         size_t len;
 856         char *pathname, *tmpc;
 857
 858         tmpc = find_mounted_controller(controller, &cfd);
 859         if (!tmpc)
 860                 return -EINVAL;
 861
 862         /* Make sure we pass a relative path to *at() family of functions.
 863          * . + /file + \0
 864          */
 865         len = strlen(file) + 2;
 866         pathname = alloca(len);
 867         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 868         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 869                 return -errno;
 870
 871         if (is_dir(pathname, cfd))
 872                 // like cgmanager did, we want to chown the tasks file as well
 873                 return chown_tasks_files(pathname, uid, gid, cfd);
 874
 875         return 0;
 876 }
 877
 878 FILE *open_pids_file(const char *controller, const char *cgroup)
 879 {
 880         int fd, cfd;
 881         size_t len;
 882         char *pathname, *tmpc;
 883
 884         tmpc = find_mounted_controller(controller, &cfd);
 885         if (!tmpc)
 886                 return NULL;
 887
 888         /* Make sure we pass a relative path to *at() family of functions.
 889          * . + /cgroup + / "cgroup.procs" + \0
 890          */
 891         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 892         pathname = alloca(len);
 893         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 894
 895         fd = openat(cfd, pathname, O_WRONLY);
 896         if (fd < 0)
 897                 return NULL;
 898
 899         return fdopen(fd, "w");
 900 }
 901
 902 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 903                                 void ***list, size_t typesize,
 904                                 void* (*iterator)(const char*, const char*, const char*))
 905 {
 906         int cfd, fd, ret;
 907         size_t len;
 908         char *cg, *tmpc;
 909         char pathname[MAXPATHLEN];
 910         size_t sz = 0, asz = 0;
 911         struct dirent *dirent;
 912         DIR *dir;
 913
 914         tmpc = find_mounted_controller(controller, &cfd);
 915         *list = NULL;
 916         if (!tmpc)
 917                 return false;
 918
 919         /* Make sure we pass a relative path to *at() family of functions. */
 920         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 921         cg = alloca(len);
 922         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 923         if (ret < 0 || (size_t)ret >= len) {
 924                 lxcfs_error("Pathname too long under %s\n", cgroup);
 925                 return false;
 926         }
 927
 928         fd = openat(cfd, cg, O_DIRECTORY);
 929         if (fd < 0)
 930                 return false;
 931
 932         dir = fdopendir(fd);
 933         if (!dir)
 934                 return false;
 935
 936         while ((dirent = readdir(dir))) {
 937                 struct stat mystat;
 938
 939                 if (!strcmp(dirent->d_name, ".") ||
 940                     !strcmp(dirent->d_name, ".."))
 941                         continue;
 942
 943                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 944                 if (ret < 0 || ret >= MAXPATHLEN) {
 945                         lxcfs_error("Pathname too long under %s\n", cg);
 946                         continue;
 947                 }
 948
 949                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 950                 if (ret) {
 951                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
 952                         continue;
 953                 }
 954                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
 955                     (directories && !S_ISDIR(mystat.st_mode)))
 956                         continue;
 957
 958                 if (sz+2 >= asz) {
 959                         void **tmp;
 960                         asz += BATCH_SIZE;
 961                         do {
 962                                 tmp = realloc(*list, asz * typesize);
 963                         } while  (!tmp);
 964                         *list = tmp;
 965                 }
 966                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
 967                 (*list)[sz+1] = NULL;
 968                 sz++;
 969         }
 970         if (closedir(dir) < 0) {
 971                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
 972                 return false;
 973         }
 974         return true;
 975 }
 976
 977 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 978 {
 979         char *dup;
 980         do {
 981                 dup = strdup(dir_entry);
 982         } while (!dup);
 983         return dup;
 984 }
 985
 986 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
 987 {
 988         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
 989 }
 990
 991 void free_key(struct cgfs_files *k)
 992 {
 993         if (!k)
 994                 return;
 995         free(k->name);
 996         free(k);
 997 }
 998
 999 void free_keys(struct cgfs_files **keys)
1000 {
1001         int i;
1002
1003         if (!keys)
1004                 return;
1005         for (i = 0; keys[i]; i++) {
1006                 free_key(keys[i]);
1007         }
1008         free(keys);
1009 }
1010
/*
 * Read @file inside @cgroup (below the mount of @controller) and store the
 * result of slurp_file() in *@value.
 *
 * Returns true when *@value was set to a non-NULL result; false when the
 * controller cannot be found, the path cannot be built, the file cannot be
 * opened, or slurp_file() returns NULL.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
        int cfd, fd, n;
        size_t pathlen;
        char *relpath;
        const char *dot;

        if (!find_mounted_controller(controller, &cfd))
                return false;

        /* The *at() calls need a relative path:
         * "." + /cgroup + "/" + file + "\0"
         */
        dot = (*cgroup == '/') ? "." : "";
        pathlen = strlen(cgroup) + strlen(file) + 3;
        relpath = alloca(pathlen);
        n = snprintf(relpath, pathlen, "%s%s/%s", dot, cgroup, file);
        if (n < 0 || (size_t)n >= pathlen)
                return false;

        fd = openat(cfd, relpath, O_RDONLY);
        if (fd < 0)
                return false;

        *value = slurp_file(relpath, fd);
        return *value != NULL;
}
1037
1038 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1039 {
1040         int ret, cfd;
1041         size_t len;
1042         char *fnam, *tmpc;
1043         struct stat sb;
1044         struct cgfs_files *newkey;
1045
1046         tmpc = find_mounted_controller(controller, &cfd);
1047         if (!tmpc)
1048                 return false;
1049
1050         if (file && *file == '/')
1051                 file++;
1052
1053         if (file && strchr(file, '/'))
1054                 return NULL;
1055
1056         /* Make sure we pass a relative path to *at() family of functions.
1057          * . + /cgroup + / + file + \0
1058          */
1059         len = strlen(cgroup) + 3;
1060         if (file)
1061                 len += strlen(file) + 1;
1062         fnam = alloca(len);
1063         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1064                  file ? "/" : "", file ? file : "");
1065
1066         ret = fstatat(cfd, fnam, &sb, 0);
1067         if (ret < 0)
1068                 return NULL;
1069
1070         do {
1071                 newkey = malloc(sizeof(struct cgfs_files));
1072         } while (!newkey);
1073         if (file)
1074                 newkey->name = must_copy_string(file);
1075         else if (strrchr(cgroup, '/'))
1076                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1077         else
1078                 newkey->name = must_copy_string(cgroup);
1079         newkey->uid = sb.st_uid;
1080         newkey->gid = sb.st_gid;
1081         newkey->mode = sb.st_mode;
1082
1083         return newkey;
1084 }
1085
/*
 * Entry constructor handed to cgfs_iterate_cgroup() by cgfs_list_keys():
 * looks up the key for @dir_entry inside @cgroup via cgfs_get_key().
 * Logs an error and returns NULL when the lookup fails.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
        struct cgfs_files *key;

        key = cgfs_get_key(controller, cgroup, dir_entry);
        if (key)
                return key;

        lxcfs_error("Error getting files under %s:%s\n", controller,
                     cgroup);
        return NULL;
}
1095
1096 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1097 {
1098         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1099 }
1100
/*
 * Report whether @f is a directory directly inside @cgroup below the mount
 * of @controller. Returns false when the controller cannot be found, the
 * path cannot be built, fstatat() fails, or the entry is not a directory.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
        int cfd, n;
        size_t pathlen;
        char *relpath;
        const char *dot;
        struct stat st;

        if (!find_mounted_controller(controller, &cfd))
                return false;

        /* The *at() calls need a relative path:
         * "." + /cgroup + "/" + f + "\0"
         */
        dot = (*cgroup == '/') ? "." : "";
        pathlen = strlen(cgroup) + strlen(f) + 3;
        relpath = alloca(pathlen);
        n = snprintf(relpath, pathlen, "%s%s/%s", dot, cgroup, f);
        if (n < 0 || (size_t)n >= pathlen)
                return false;

        if (fstatat(cfd, relpath, &st, 0) < 0)
                return false;

        return S_ISDIR(st.st_mode) ? true : false;
}
1128
/*
 * Status codes; the return value of send_creds() is compared against
 * SEND_CREDS_OK by its callers.
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2

/* Forward declarations of helpers that are used before they are defined. */
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int send_creds_clone_wrapper(void *arg);
1136
1137 /*
1138  * clone a task which switches to @task's namespace and writes '1'.
1139  * over a unix sock so we can read the task's reaper's pid in our
1140  * namespace
1141  *
1142  * Note: glibc's fork() does not respect pidns, which can lead to failed
1143  * assertions inside glibc (and thus failed forks) if the child's pid in
1144  * the pidns and the parent pid outside are identical. Using clone prevents
1145  * this issue.
1146  */
1147 static void write_task_init_pid_exit(int sock, pid_t target)
1148 {
1149         char fnam[100];
1150         pid_t pid;
1151         int fd, ret;
1152         size_t stack_size = sysconf(_SC_PAGESIZE);
1153         void *stack = alloca(stack_size);
1154
1155         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1156         if (ret < 0 || ret >= sizeof(fnam))
1157                 _exit(1);
1158
1159         fd = open(fnam, O_RDONLY);
1160         if (fd < 0) {
1161                 perror("write_task_init_pid_exit open of ns/pid");
1162                 _exit(1);
1163         }
1164         if (setns(fd, 0)) {
1165                 perror("write_task_init_pid_exit setns 1");
1166                 close(fd);
1167                 _exit(1);
1168         }
1169         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1170         if (pid < 0)
1171                 _exit(1);
1172         if (pid != 0) {
1173                 if (!wait_for_pid(pid))
1174                         _exit(1);
1175                 _exit(0);
1176         }
1177 }
1178
1179 static int send_creds_clone_wrapper(void *arg) {
1180         struct ucred cred;
1181         char v;
1182         int sock = *(int *)arg;
1183
1184         /* we are the child */
1185         cred.uid = 0;
1186         cred.gid = 0;
1187         cred.pid = 1;
1188         v = '1';
1189         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1190                 return 1;
1191         return 0;
1192 }
1193
/*
 * Determine the pid that is received as credentials from a helper process
 * which runs write_task_init_pid_exit() for @task over a socketpair.
 *
 * Returns the pid carried in the received credentials, or -1 if the
 * socketpair or fork fails or recv_creds() reports failure. The helper
 * process is always reaped before returning.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
        int sock[2];
        pid_t child;
        pid_t initpid = -1;
        char v = '0';
        struct ucred cred;

        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                return -1;
        }

        child = fork();
        if (child == 0) {
                /* helper process: talks over sock[0] only */
                close(sock[1]);
                write_task_init_pid_exit(sock[0], task);
                _exit(0);
        }

        if (child > 0 && recv_creds(sock[1], &cred, &v))
                initpid = cred.pid;

        close(sock[0]);
        close(sock[1]);
        if (child > 0)
                wait_for_pid(child);
        return initpid;
}
1227
1228 static pid_t lookup_initpid_in_store(pid_t qpid)
1229 {
1230         pid_t answer = 0;
1231         struct stat sb;
1232         struct pidns_init_store *e;
1233         char fnam[100];
1234
1235         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1236         store_lock();
1237         if (stat(fnam, &sb) < 0)
1238                 goto out;
1239         e = lookup_verify_initpid(&sb);
1240         if (e) {
1241                 answer = e->initpid;
1242                 goto out;
1243         }
1244         answer = get_init_pid_for_task(qpid);
1245         if (answer > 0)
1246                 save_initpid(&sb, answer);
1247
1248 out:
1249         /* we prune at end in case we are returning
1250          * the value we were about to return */
1251         prune_initpid_store();
1252         store_unlock();
1253         return answer;
1254 }
1255
/*
 * Wait for child @pid to terminate, retrying when waitpid() is interrupted.
 * Returns 0 if the child exited normally with status 0, and -1 if @pid is
 * not positive, waitpid() fails, or the child ended any other way.
 */
static int wait_for_pid(pid_t pid)
{
        int wstatus;
        pid_t reaped;

        if (pid <= 0)
                return -1;

        for (;;) {
                reaped = waitpid(pid, &wstatus, 0);
                if (reaped == pid)
                        break;
                /* Only EINTR (or an unexpected pid) is worth another try. */
                if (reaped == -1 && errno != EINTR)
                        return -1;
        }

        if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0)
                return 0;
        return -1;
}
1276
1277
1278 /*
1279  * append pid to *src.
1280  * src: a pointer to a char* in which ot append the pid.
1281  * sz: the number of characters printed so far, minus trailing \0.
1282  * asz: the allocated size so far
1283  * pid: the pid to append
1284  */
1285 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1286 {
1287         char tmp[30];
1288
1289         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1290
1291         if (!*src || tmplen + *sz + 1 >= *asz) {
1292                 char *tmp;
1293                 do {
1294                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1295                 } while (!tmp);
1296                 *src = tmp;
1297                 *asz += BUF_RESERVE_SIZE;
1298         }
1299         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1300         *sz += tmplen;
1301 }
1302
/*
 * Translate @in_id using the mapping read from @idfile, an open stream on
 * /proc/pid/{u,g}id_map. The stream is rewound first; every line that
 * parses as three unsigned numbers "ns-base host-base count" is examined,
 * and the first range whose host side contains @in_id decides the result.
 *
 * Returns the id mapped into the namespace, or -1 (as unsigned int) when
 * no range matches or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
        unsigned int ns_base,   /* first id of the range inside the namespace */
                     host_base, /* first id of the range on the caller's side */
                     range;     /* number of ids in the range */
        char line[400];

        fseek(idfile, 0L, SEEK_SET);
        while (fgets(line, 400, idfile)) {
                if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
                        continue;

                /* A wrapping range is not expected from a proc file: bail. */
                if (host_base + range < host_base || ns_base + range < ns_base) {
                        lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
                                ns_base, host_base, range, line);
                        return -1;
                }

                if (in_id < host_base || in_id >= host_base + range)
                        continue;

                /* host_base <= in_id < host_base + range and neither sum
                 * wraps, so the offset added to ns_base cannot wrap either. */
                return (in_id - host_base) + ns_base;
        }

        /* no matching range */
        return -1;
}
1346
/*
 * Values for the req_ns_root argument of is_privileged_over(): whether the
 * calling uid is required to be root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the buffers used to build /proc/<pid>/... paths. */
#define PROCLEN 100

/*
 * Decide whether @uid, acting from process @pid, is privileged over the
 * owner @victim.
 *
 * - Either id being -1 is rejected.
 * - When @req_ns_root is false, identical ids are accepted right away.
 * - Otherwise @uid must map to 0 and @victim must be mapped at all,
 *   according to /proc/<pid>/uid_map.
 *
 * Returns false as well when the uid_map file cannot be named or opened.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
        char map_path[PROCLEN];
        FILE *uid_map;
        bool privileged;
        int n;

        if (victim == -1 || uid == -1)
                return false;

        /* Same owner is enough when namespace root is not demanded
         * (e.g. uid 1000 may write to files owned by uid 1000). */
        if (!req_ns_root && uid == victim)
                return true;

        n = snprintf(map_path, PROCLEN, "/proc/%d/uid_map", pid);
        if (n < 0 || n >= PROCLEN)
                return false;

        uid_map = fopen(map_path, "r");
        if (!uid_map)
                return false;

        /* The caller must be root in its namespace, and the victim must be
         * mapped into that namespace. The second lookup only happens when
         * the first one passes. */
        privileged = convert_id_to_ns(uid_map, uid) == 0 &&
                     convert_id_to_ns(uid_map, victim) != -1;

        fclose(uid_map);
        return privileged;
}
1402
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the open(2)-style access mode in @req_mode. Callers shift
 * the mode so that the class of interest lands in the "other" position.
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY or
 * O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
        const int access = req_mode & O_ACCMODE;
        mode_t needed;

        if (access == O_RDONLY)
                needed = S_IROTH;
        else if (access == O_WRONLY)
                needed = S_IWOTH;
        else if (access == O_RDWR)
                needed = S_IROTH | S_IWOTH;
        else
                return false;

        return (fmode & needed) == needed;
}
1422
1423
/*
 * Return the path component of @taskcg that follows the first
 * strlen(@querycg) + 1 characters (or just the first character when
 * @querycg is "/" or "./").
 *
 * Example: taskcg "/a/b/c/d/e" with querycg "/a/b/c" yields "d".
 *
 * @taskcg must be strictly longer than @querycg, otherwise an error is
 * logged and NULL is returned. It is not verified that @querycg actually is
 * a prefix of @taskcg. The result is heap allocated and must be freed by
 * the caller; NULL is also returned if the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
        size_t skip;
        char *next, *slash;

        if (strlen(taskcg) <= strlen(querycg)) {
                lxcfs_error("%s\n", "I was fed bad input.");
                return NULL;
        }

        if (!strcmp(querycg, "/") || !strcmp(querycg, "./"))
                skip = 1;
        else
                skip = strlen(querycg) + 1;

        next = strdup(taskcg + skip);
        if (!next)
                return NULL;

        /* keep only the first remaining component */
        slash = strchr(next, '/');
        if (slash)
                *slash = '\0';
        return next;
}
1449
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
        char *last;

        if (*x == '\0')
                return;

        last = x + strlen(x) - 1;
        if (*last == '\n')
                *last = '\0';
}
1456
1457 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1458 {
1459         int cfd;
1460         char fnam[PROCLEN];
1461         FILE *f;
1462         char *answer = NULL;
1463         char *line = NULL;
1464         size_t len = 0;
1465         int ret;
1466         const char *h = find_mounted_controller(contrl, &cfd);
1467         if (!h)
1468                 return NULL;
1469
1470         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1471         if (ret < 0 || ret >= PROCLEN)
1472                 return NULL;
1473         if (!(f = fopen(fnam, "r")))
1474                 return NULL;
1475
1476         while (getline(&line, &len, f) != -1) {
1477                 char *c1, *c2;
1478                 if (!line[0])
1479                         continue;
1480                 c1 = strchr(line, ':');
1481                 if (!c1)
1482                         goto out;
1483                 c1++;
1484                 c2 = strchr(c1, ':');
1485                 if (!c2)
1486                         goto out;
1487                 *c2 = '\0';
1488                 if (strcmp(c1, h) != 0)
1489                         continue;
1490                 c2++;
1491                 stripnewline(c2);
1492                 do {
1493                         answer = strdup(c2);
1494                 } while (!answer);
1495                 break;
1496         }
1497
1498 out:
1499         fclose(f);
1500         free(line);
1501         return answer;
1502 }
1503
1504 /*
1505  * check whether a fuse context may access a cgroup dir or file
1506  *
1507  * If file is not null, it is a cgroup file to check under cg.
1508  * If file is null, then we are checking perms on cg itself.
1509  *
1510  * For files we can check the mode of the list_keys result.
1511  * For cgroups, we must make assumptions based on the files under the
1512  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1513  * yet.
1514  */
1515 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1516 {
1517         struct cgfs_files *k = NULL;
1518         bool ret = false;
1519
1520         k = cgfs_get_key(contrl, cg, file);
1521         if (!k)
1522                 return false;
1523
1524         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1525                 if (perms_include(k->mode >> 6, mode)) {
1526                         ret = true;
1527                         goto out;
1528                 }
1529         }
1530         if (fc->gid == k->gid) {
1531                 if (perms_include(k->mode >> 3, mode)) {
1532                         ret = true;
1533                         goto out;
1534                 }
1535         }
1536         ret = perms_include(k->mode, mode);
1537
1538 out:
1539         free_key(k);
1540         return ret;
1541 }
1542
#define INITSCOPE "/init.scope"
/*
 * Cut a trailing "/init.scope" off @cg in place. If the whole string is
 * "/init.scope", the leading '/' is kept so that the result is "/".
 * Strings that do not end in INITSCOPE are left untouched.
 */
static void prune_init_slice(char *cg)
{
        const size_t suffix_len = strlen(INITSCOPE);
        const size_t cg_len = strlen(cg);
        char *tail;

        if (cg_len < suffix_len)
                return;

        tail = cg + (cg_len - suffix_len);
        if (strcmp(tail, INITSCOPE) != 0)
                return;

        /* keep the root slash when nothing else would remain */
        if (tail == cg)
                tail++;
        *tail = '\0';
}
1560
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true when the task's cgroup (with a trailing init.scope pruned)
 * is a string prefix of @cg. When the answer is false and @nextcg is not
 * NULL, *@nextcg receives the result of get_next_cgroup_dir(), i.e. the
 * next cgroup directory under @cg, which the caller must free (it may be
 * NULL). Returns false as well when the task's cgroup cannot be read.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
        bool in_ancestor;
        char *taskcg, *cmp;

        taskcg = get_pid_cgroup(pid, contrl);
        if (!taskcg)
                return false;
        prune_init_slice(taskcg);

        /*
         * Callers pass '/' or './' (openat()) for the root cgroup, otherwise
         * a cgroup without leading '/'. In the latter case the first
         * character of the task's cgroup is skipped before comparing.
         * TODO (carried over): the reason for skipping it when *cg != '/'
         * is not documented.
         */
        cmp = taskcg;
        if (*cg != '/' && strncmp(cg, "./", 2) != 0)
                cmp++;

        in_ancestor = strncmp(cmp, cg, strlen(cmp)) == 0;
        if (!in_ancestor && nextcg)
                *nextcg = get_next_cgroup_dir(cmp, cg);

        free(taskcg);
        return in_ancestor;
}
1603
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * Returns true when @cg is the root ("/" or "./"), when the task sits in
 * the root cgroup, or when @cg equals, is a parent of, or is a descendant
 * of the task's cgroup (compared without its first character and with a
 * trailing init.scope pruned). Returns false when the task's cgroup cannot
 * be read.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
        bool visible = false;
        char *full, *task_cg;
        size_t target_len, task_len;

        if (!strcmp(cg, "/") || !strcmp(cg, "./"))
                return true;

        full = get_pid_cgroup(pid, contrl);
        if (!full)
                return false;
        prune_init_slice(full);

        task_cg = full + 1;
        target_len = strlen(cg);
        task_len = strlen(task_cg);

        if (task_len == 0) {
                /* Task is in the root cg and can see everything. The prefix
                 * tests below look for a separating '/', which for the root
                 * is the very character skipped above. */
                visible = true;
        } else if (strcmp(cg, task_cg) == 0) {
                visible = true;
        } else if (target_len < task_len) {
                /* looking up a parent dir */
                visible = strncmp(task_cg, cg, target_len) == 0 &&
                          task_cg[target_len] == '/';
        } else if (target_len > task_len) {
                /* looking up a child dir */
                visible = strncmp(task_cg, cg, task_len) == 0 &&
                          cg[task_len] == '/';
        }

        free(full);
        return visible;
}
1654
1655 /*
1656  * given /cgroup/freezer/a/b, return "freezer".
1657  * the returned char* should NOT be freed.
1658  */
1659 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1660 {
1661         const char *p1;
1662         char *contr, *slash;
1663
1664         if (strlen(path) < 9) {
1665                 errno = EACCES;
1666                 return NULL;
1667         }
1668         if (*(path + 7) != '/') {
1669                 errno = EINVAL;
1670                 return NULL;
1671         }
1672         p1 = path + 8;
1673         contr = strdupa(p1);
1674         if (!contr) {
1675                 errno = ENOMEM;
1676                 return NULL;
1677         }
1678         slash = strstr(contr, "/");
1679         if (slash)
1680                 *slash = '\0';
1681
1682         int i;
1683         for (i = 0; i < num_hierarchies; i++) {
1684                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1685                         return hierarchies[i];
1686         }
1687         errno = ENOENT;
1688         return NULL;
1689 }
1690
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * The returned pointer refers into @path and may include files (key names)
 * at the end.
 *
 * On success errno is reset to 0. On failure NULL is returned with errno
 * set to EACCES (path shorter than 9 characters) or EINVAL (no '/' after
 * the controller name).
 */
static const char *find_cgroup_in_path(const char *path)
{
        const char *sep;

        if (strlen(path) < 9) {
                errno = EACCES;
                return NULL;
        }

        sep = strchr(path + 8, '/');
        if (sep == NULL) {
                errno = EINVAL;
                return NULL;
        }

        errno = 0;
        return sep + 1;
}
1711
/*
 * Split the last path element off the path in @cg.
 *
 * *@dir is a newly allocated copy of @cg cut at its last '/', and must be
 * freed by the caller. *@last points into @cg at that last '/' (the slash
 * is part of it) and must not be freed. When @cg holds no '/', *@dir is a
 * full copy of @cg and *@last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
        char *copy, *cut;

        /* allocation is retried until it succeeds */
        copy = strdup(cg);
        while (!copy)
                copy = strdup(cg);
        *dir = copy;

        *last = strrchr(cg, '/');
        if (*last == NULL)
                return;

        cut = strrchr(copy, '/');
        *cut = '\0';
}
1731
1732 /*
1733  * FUSE ops for /cgroup
1734  */
1735
1736 int cg_getattr(const char *path, struct stat *sb)
1737 {
1738         struct timespec now;
1739         struct fuse_context *fc = fuse_get_context();
1740         char * cgdir = NULL;
1741         char *last = NULL, *path1, *path2;
1742         struct cgfs_files *k = NULL;
1743         const char *cgroup;
1744         const char *controller = NULL;
1745         int ret = -ENOENT;
1746
1747
1748         if (!fc)
1749                 return -EIO;
1750
1751         memset(sb, 0, sizeof(struct stat));
1752
1753         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1754                 return -EINVAL;
1755
1756         sb->st_uid = sb->st_gid = 0;
1757         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1758         sb->st_size = 0;
1759
1760         if (strcmp(path, "/cgroup") == 0) {
1761                 sb->st_mode = S_IFDIR | 00755;
1762                 sb->st_nlink = 2;
1763                 return 0;
1764         }
1765
1766         controller = pick_controller_from_path(fc, path);
1767         if (!controller)
1768                 return -errno;
1769         cgroup = find_cgroup_in_path(path);
1770         if (!cgroup) {
1771                 /* this is just /cgroup/controller, return it as a dir */
1772                 sb->st_mode = S_IFDIR | 00755;
1773                 sb->st_nlink = 2;
1774                 return 0;
1775         }
1776
1777         get_cgdir_and_path(cgroup, &cgdir, &last);
1778
1779         if (!last) {
1780                 path1 = "/";
1781                 path2 = cgdir;
1782         } else {
1783                 path1 = cgdir;
1784                 path2 = last;
1785         }
1786
1787         pid_t initpid = lookup_initpid_in_store(fc->pid);
1788         if (initpid <= 0)
1789                 initpid = fc->pid;
1790         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1791          * Then check that caller's cgroup is under path if last is a child
1792          * cgroup, or cgdir if last is a file */
1793
1794         if (is_child_cgroup(controller, path1, path2)) {
1795                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1796                         ret = -ENOENT;
1797                         goto out;
1798                 }
1799                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1800                         /* this is just /cgroup/controller, return it as a dir */
1801                         sb->st_mode = S_IFDIR | 00555;
1802                         sb->st_nlink = 2;
1803                         ret = 0;
1804                         goto out;
1805                 }
1806                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1807                         ret = -EACCES;
1808                         goto out;
1809                 }
1810
1811                 // get uid, gid, from '/tasks' file and make up a mode
1812                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1813                 sb->st_mode = S_IFDIR | 00755;
1814                 k = cgfs_get_key(controller, cgroup, NULL);
1815                 if (!k) {
1816                         sb->st_uid = sb->st_gid = 0;
1817                 } else {
1818                         sb->st_uid = k->uid;
1819                         sb->st_gid = k->gid;
1820                 }
1821                 free_key(k);
1822                 sb->st_nlink = 2;
1823                 ret = 0;
1824                 goto out;
1825         }
1826
1827         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1828                 sb->st_mode = S_IFREG | k->mode;
1829                 sb->st_nlink = 1;
1830                 sb->st_uid = k->uid;
1831                 sb->st_gid = k->gid;
1832                 sb->st_size = 0;
1833                 free_key(k);
1834                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1835                         ret = -ENOENT;
1836                         goto out;
1837                 }
1838                 ret = 0;
1839         }
1840
1841 out:
1842         free(cgdir);
1843         return ret;
1844 }
1845
1846 int cg_opendir(const char *path, struct fuse_file_info *fi)
1847 {
1848         struct fuse_context *fc = fuse_get_context();
1849         const char *cgroup;
1850         struct file_info *dir_info;
1851         char *controller = NULL;
1852
1853         if (!fc)
1854                 return -EIO;
1855
1856         if (strcmp(path, "/cgroup") == 0) {
1857                 cgroup = NULL;
1858                 controller = NULL;
1859         } else {
1860                 // return list of keys for the controller, and list of child cgroups
1861                 controller = pick_controller_from_path(fc, path);
1862                 if (!controller)
1863                         return -errno;
1864
1865                 cgroup = find_cgroup_in_path(path);
1866                 if (!cgroup) {
1867                         /* this is just /cgroup/controller, return its contents */
1868                         cgroup = "/";
1869                 }
1870         }
1871
1872         pid_t initpid = lookup_initpid_in_store(fc->pid);
1873         if (initpid <= 0)
1874                 initpid = fc->pid;
1875         if (cgroup) {
1876                 if (!caller_may_see_dir(initpid, controller, cgroup))
1877                         return -ENOENT;
1878                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1879                         return -EACCES;
1880         }
1881
1882         /* we'll free this at cg_releasedir */
1883         dir_info = malloc(sizeof(*dir_info));
1884         if (!dir_info)
1885                 return -ENOMEM;
1886         dir_info->controller = must_copy_string(controller);
1887         dir_info->cgroup = must_copy_string(cgroup);
1888         dir_info->type = LXC_TYPE_CGDIR;
1889         dir_info->buf = NULL;
1890         dir_info->file = NULL;
1891         dir_info->buflen = 0;
1892
1893         fi->fh = (unsigned long)dir_info;
1894         return 0;
1895 }
1896
1897 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1898                 struct fuse_file_info *fi)
1899 {
1900         struct file_info *d = (struct file_info *)fi->fh;
1901         struct cgfs_files **list = NULL;
1902         int i, ret;
1903         char *nextcg = NULL;
1904         struct fuse_context *fc = fuse_get_context();
1905         char **clist = NULL;
1906
1907         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1908                 return -EIO;
1909
1910         if (d->type != LXC_TYPE_CGDIR) {
1911                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1912                 return -EIO;
1913         }
1914         if (!d->cgroup && !d->controller) {
1915                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1916                 int i;
1917
1918                 for (i = 0;  i < num_hierarchies; i++) {
1919                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1920                                 return -EIO;
1921                         }
1922                 }
1923                 return 0;
1924         }
1925
1926         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1927                 // not a valid cgroup
1928                 ret = -EINVAL;
1929                 goto out;
1930         }
1931
1932         pid_t initpid = lookup_initpid_in_store(fc->pid);
1933         if (initpid <= 0)
1934                 initpid = fc->pid;
1935         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1936                 if (nextcg) {
1937                         ret = filler(buf, nextcg,  NULL, 0);
1938                         free(nextcg);
1939                         if (ret != 0) {
1940                                 ret = -EIO;
1941                                 goto out;
1942                         }
1943                 }
1944                 ret = 0;
1945                 goto out;
1946         }
1947
1948         for (i = 0; list[i]; i++) {
1949                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1950                         ret = -EIO;
1951                         goto out;
1952                 }
1953         }
1954
1955         // now get the list of child cgroups
1956
1957         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1958                 ret = 0;
1959                 goto out;
1960         }
1961         if (clist) {
1962                 for (i = 0; clist[i]; i++) {
1963                         if (filler(buf, clist[i], NULL, 0) != 0) {
1964                                 ret = -EIO;
1965                                 goto out;
1966                         }
1967                 }
1968         }
1969         ret = 0;
1970
1971 out:
1972         free_keys(list);
1973         if (clist) {
1974                 for (i = 0; clist[i]; i++)
1975                         free(clist[i]);
1976                 free(clist);
1977         }
1978         return ret;
1979 }
1980
1981 static void do_release_file_info(struct fuse_file_info *fi)
1982 {
1983         struct file_info *f = (struct file_info *)fi->fh;
1984
1985         if (!f)
1986                 return;
1987
1988         fi->fh = 0;
1989
1990         free(f->controller);
1991         f->controller = NULL;
1992         free(f->cgroup);
1993         f->cgroup = NULL;
1994         free(f->file);
1995         f->file = NULL;
1996         free(f->buf);
1997         f->buf = NULL;
1998         free(f);
1999 }
2000
/*
 * cg_releasedir - FUSE releasedir handler; frees the handle that
 * cg_opendir() stored in fi->fh.  @path is not used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
2006
2007 int cg_open(const char *path, struct fuse_file_info *fi)
2008 {
2009         const char *cgroup;
2010         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
2011         struct cgfs_files *k = NULL;
2012         struct file_info *file_info;
2013         struct fuse_context *fc = fuse_get_context();
2014         int ret;
2015
2016         if (!fc)
2017                 return -EIO;
2018
2019         controller = pick_controller_from_path(fc, path);
2020         if (!controller)
2021                 return -errno;
2022         cgroup = find_cgroup_in_path(path);
2023         if (!cgroup)
2024                 return -errno;
2025
2026         get_cgdir_and_path(cgroup, &cgdir, &last);
2027         if (!last) {
2028                 path1 = "/";
2029                 path2 = cgdir;
2030         } else {
2031                 path1 = cgdir;
2032                 path2 = last;
2033         }
2034
2035         k = cgfs_get_key(controller, path1, path2);
2036         if (!k) {
2037                 ret = -EINVAL;
2038                 goto out;
2039         }
2040         free_key(k);
2041
2042         pid_t initpid = lookup_initpid_in_store(fc->pid);
2043         if (initpid <= 0)
2044                 initpid = fc->pid;
2045         if (!caller_may_see_dir(initpid, controller, path1)) {
2046                 ret = -ENOENT;
2047                 goto out;
2048         }
2049         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
2050                 ret = -EACCES;
2051                 goto out;
2052         }
2053
2054         /* we'll free this at cg_release */
2055         file_info = malloc(sizeof(*file_info));
2056         if (!file_info) {
2057                 ret = -ENOMEM;
2058                 goto out;
2059         }
2060         file_info->controller = must_copy_string(controller);
2061         file_info->cgroup = must_copy_string(path1);
2062         file_info->file = must_copy_string(path2);
2063         file_info->type = LXC_TYPE_CGFILE;
2064         file_info->buf = NULL;
2065         file_info->buflen = 0;
2066
2067         fi->fh = (unsigned long)file_info;
2068         ret = 0;
2069
2070 out:
2071         free(cgdir);
2072         return ret;
2073 }
2074
2075 int cg_access(const char *path, int mode)
2076 {
2077         int ret;
2078         const char *cgroup;
2079         char *path1, *path2, *controller;
2080         char *last = NULL, *cgdir = NULL;
2081         struct cgfs_files *k = NULL;
2082         struct fuse_context *fc = fuse_get_context();
2083
2084         if (strcmp(path, "/cgroup") == 0)
2085                 return 0;
2086
2087         if (!fc)
2088                 return -EIO;
2089
2090         controller = pick_controller_from_path(fc, path);
2091         if (!controller)
2092                 return -errno;
2093         cgroup = find_cgroup_in_path(path);
2094         if (!cgroup) {
2095                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2096                 if ((mode & W_OK) == 0)
2097                         return 0;
2098                 return -EACCES;
2099         }
2100
2101         get_cgdir_and_path(cgroup, &cgdir, &last);
2102         if (!last) {
2103                 path1 = "/";
2104                 path2 = cgdir;
2105         } else {
2106                 path1 = cgdir;
2107                 path2 = last;
2108         }
2109
2110         k = cgfs_get_key(controller, path1, path2);
2111         if (!k) {
2112                 if ((mode & W_OK) == 0)
2113                         ret = 0;
2114                 else
2115                         ret = -EACCES;
2116                 goto out;
2117         }
2118         free_key(k);
2119
2120         pid_t initpid = lookup_initpid_in_store(fc->pid);
2121         if (initpid <= 0)
2122                 initpid = fc->pid;
2123         if (!caller_may_see_dir(initpid, controller, path1)) {
2124                 ret = -ENOENT;
2125                 goto out;
2126         }
2127         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2128                 ret = -EACCES;
2129                 goto out;
2130         }
2131
2132         ret = 0;
2133
2134 out:
2135         free(cgdir);
2136         return ret;
2137 }
2138
/*
 * cg_release - FUSE release handler; frees the handle that cg_open()
 * stored in fi->fh.  @path is not used.  Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
2144
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock - wait until @sock reports one of the POLLIN_SET events.
 * @sock:    file descriptor to watch
 * @timeout: maximum time to wait, in seconds
 *
 * Interrupted waits (EINTR) are restarted with the remaining time.
 *
 * Returns true when epoll reported an event on @sock.  Returns false on
 * error or timeout; when the deadline has already passed before waiting,
 * errno is set to 0, otherwise errno is the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
        struct epoll_event ev;
        int epfd, ret, deltatime, saved_errno;
        /* time_t rather than int: avoids truncating the value of time() */
        time_t now, starttime;

        starttime = time(NULL);
        if (starttime == (time_t)-1)
                return false;

        epfd = epoll_create(1);
        if (epfd < 0) {
                /* errno has to be formatted here; "%m" inside a "%s" argument is printed verbatim */
                lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
                return false;
        }

        ev.events = POLLIN_SET;
        ev.data.fd = sock;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
                lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
                close(epfd);
                return false;
        }

again:
        now = time(NULL);
        if (now == (time_t)-1) {
                close(epfd);
                return false;
        }

        deltatime = (int)((starttime + timeout) - now);
        if (deltatime < 0) { // timeout
                close(epfd);
                /* set after close() so that close() cannot overwrite it */
                errno = 0;
                return false;
        }

        /* the extra millisecond keeps a zero remainder from polling without waiting */
        ret = epoll_wait(epfd, &ev, 1, 1000 * deltatime + 1);
        if (ret < 0 && errno == EINTR)
                goto again;

        /* close() may change errno, so keep the value from epoll_wait() */
        saved_errno = errno;
        close(epfd);

        if (ret <= 0) {
                errno = saved_errno;
                return false;
        }
        return true;
}
2192
/*
 * msgrecv - receive up to @len bytes from @sockfd into @buf, waiting at
 * most 2 seconds for the socket to become readable.
 *
 * Returns the result of recv() (called with MSG_DONTWAIT), or -1 when
 * wait_for_sock() fails.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
        bool readable = wait_for_sock(sockfd, 2);

        if (readable)
                return recv(sockfd, buf, len, MSG_DONTWAIT);

        return -1;
}
2199
2200 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2201 {
2202         struct msghdr msg = { 0 };
2203         struct iovec iov;
2204         struct cmsghdr *cmsg;
2205         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2206         char buf[1];
2207         buf[0] = 'p';
2208
2209         if (pingfirst) {
2210                 if (msgrecv(sock, buf, 1) != 1) {
2211                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2212                         return SEND_CREDS_FAIL;
2213                 }
2214         }
2215
2216         msg.msg_control = cmsgbuf;
2217         msg.msg_controllen = sizeof(cmsgbuf);
2218
2219         cmsg = CMSG_FIRSTHDR(&msg);
2220         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2221         cmsg->cmsg_level = SOL_SOCKET;
2222         cmsg->cmsg_type = SCM_CREDENTIALS;
2223         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2224
2225         msg.msg_name = NULL;
2226         msg.msg_namelen = 0;
2227
2228         buf[0] = v;
2229         iov.iov_base = buf;
2230         iov.iov_len = sizeof(buf);
2231         msg.msg_iov = &iov;
2232         msg.msg_iovlen = 1;
2233
2234         if (sendmsg(sock, &msg, 0) < 0) {
2235                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2236                 if (errno == 3)
2237                         return SEND_CREDS_NOTSK;
2238                 return SEND_CREDS_FAIL;
2239         }
2240
2241         return SEND_CREDS_OK;
2242 }
2243
2244 static bool recv_creds(int sock, struct ucred *cred, char *v)
2245 {
2246         struct msghdr msg = { 0 };
2247         struct iovec iov;
2248         struct cmsghdr *cmsg;
2249         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2250         char buf[1];
2251         int ret;
2252         int optval = 1;
2253
2254         *v = '1';
2255
2256         cred->pid = -1;
2257         cred->uid = -1;
2258         cred->gid = -1;
2259
2260         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2261                 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2262                 return false;
2263         }
2264         buf[0] = '1';
2265         if (write(sock, buf, 1) != 1) {
2266                 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2267                 return false;
2268         }
2269
2270         msg.msg_name = NULL;
2271         msg.msg_namelen = 0;
2272         msg.msg_control = cmsgbuf;
2273         msg.msg_controllen = sizeof(cmsgbuf);
2274
2275         iov.iov_base = buf;
2276         iov.iov_len = sizeof(buf);
2277         msg.msg_iov = &iov;
2278         msg.msg_iovlen = 1;
2279
2280         if (!wait_for_sock(sock, 2)) {
2281                 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2282                 return false;
2283         }
2284         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2285         if (ret < 0) {
2286                 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2287                 return false;
2288         }
2289
2290         cmsg = CMSG_FIRSTHDR(&msg);
2291
2292         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2293                         cmsg->cmsg_level == SOL_SOCKET &&
2294                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2295                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2296         }
2297         *v = buf[0];
2298
2299         return true;
2300 }
2301
/*
 * Arguments handed to pid_ns_clone_wrapper() through clone().
 */
struct pid_ns_clone_args {
        /* pipe for the child's ACK: the child closes [0] and writes to [1] */
        int *cpipe;

        /* socket passed on to the wrapped function */
        int sock;

        /* pid passed on to the wrapped function */
        pid_t tpid;

        /* function run in the child: pid_from_ns or pid_to_ns */
        int (*wrapped)(int, pid_t);
};
2308
2309 /*
2310  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2311  * with clone(). This simply writes '1' as ACK back to the parent
2312  * before calling the actual wrapped function.
2313  */
2314 static int pid_ns_clone_wrapper(void *arg) {
2315         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2316         char b = '1';
2317
2318         close(args->cpipe[0]);
2319         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2320                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2321         close(args->cpipe[1]);
2322         return args->wrapped(args->sock, args->tpid);
2323 }
2324
2325 /*
2326  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2327  * int value back over the socket.  This shifts the pid from the
2328  * sender's pidns into tpid's pidns.
2329  */
2330 static int pid_to_ns(int sock, pid_t tpid)
2331 {
2332         char v = '0';
2333         struct ucred cred;
2334
2335         while (recv_creds(sock, &cred, &v)) {
2336                 if (v == '1')
2337                         return 0;
2338                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2339                         return 1;
2340         }
2341         return 0;
2342 }
2343
2344
2345 /*
2346  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2347  * in your old pidns.  Only children which you clone will be in the target
2348  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2349  * actually convert pids.
2350  *
2351  * Note: glibc's fork() does not respect pidns, which can lead to failed
2352  * assertions inside glibc (and thus failed forks) if the child's pid in
2353  * the pidns and the parent pid outside are identical. Using clone prevents
2354  * this issue.
2355  */
2356 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2357 {
2358         int newnsfd = -1, ret, cpipe[2];
2359         char fnam[100];
2360         pid_t cpid;
2361         char v;
2362
2363         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2364         if (ret < 0 || ret >= sizeof(fnam))
2365                 _exit(1);
2366         newnsfd = open(fnam, O_RDONLY);
2367         if (newnsfd < 0)
2368                 _exit(1);
2369         if (setns(newnsfd, 0) < 0)
2370                 _exit(1);
2371         close(newnsfd);
2372
2373         if (pipe(cpipe) < 0)
2374                 _exit(1);
2375
2376         struct pid_ns_clone_args args = {
2377                 .cpipe = cpipe,
2378                 .sock = sock,
2379                 .tpid = tpid,
2380                 .wrapped = &pid_to_ns
2381         };
2382         size_t stack_size = sysconf(_SC_PAGESIZE);
2383         void *stack = alloca(stack_size);
2384
2385         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2386         if (cpid < 0)
2387                 _exit(1);
2388
2389         // give the child 1 second to be done forking and
2390         // write its ack
2391         if (!wait_for_sock(cpipe[0], 1))
2392                 _exit(1);
2393         ret = read(cpipe[0], &v, 1);
2394         if (ret != sizeof(char) || v != '1')
2395                 _exit(1);
2396
2397         if (!wait_for_pid(cpid))
2398                 _exit(1);
2399         _exit(0);
2400 }
2401
2402 /*
2403  * To read cgroup files with a particular pid, we will setns into the child
2404  * pidns, open a pipe, fork a child - which will be the first to really be in
2405  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2406  */
2407 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2408 {
2409         int sock[2] = {-1, -1};
2410         char *tmpdata = NULL;
2411         int ret;
2412         pid_t qpid, cpid = -1;
2413         bool answer = false;
2414         char v = '0';
2415         struct ucred cred;
2416         size_t sz = 0, asz = 0;
2417
2418         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2419                 return false;
2420
2421         /*
2422          * Now we read the pids from returned data one by one, pass
2423          * them into a child in the target namespace, read back the
2424          * translated pids, and put them into our to-return data
2425          */
2426
2427         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2428                 perror("socketpair");
2429                 free(tmpdata);
2430                 return false;
2431         }
2432
2433         cpid = fork();
2434         if (cpid == -1)
2435                 goto out;
2436
2437         if (!cpid) // child - exits when done
2438                 pid_to_ns_wrapper(sock[1], tpid);
2439
2440         char *ptr = tmpdata;
2441         cred.uid = 0;
2442         cred.gid = 0;
2443         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2444                 cred.pid = qpid;
2445                 ret = send_creds(sock[0], &cred, v, true);
2446
2447                 if (ret == SEND_CREDS_NOTSK)
2448                         goto next;
2449                 if (ret == SEND_CREDS_FAIL)
2450                         goto out;
2451
2452                 // read converted results
2453                 if (!wait_for_sock(sock[0], 2)) {
2454                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2455                         goto out;
2456                 }
2457                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2458                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2459                         goto out;
2460                 }
2461                 must_strcat_pid(d, &sz, &asz, qpid);
2462 next:
2463                 ptr = strchr(ptr, '\n');
2464                 if (!ptr)
2465                         break;
2466                 ptr++;
2467         }
2468
2469         cred.pid = getpid();
2470         v = '1';
2471         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2472                 // failed to ask child to exit
2473                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2474                 goto out;
2475         }
2476
2477         answer = true;
2478
2479 out:
2480         free(tmpdata);
2481         if (cpid != -1)
2482                 wait_for_pid(cpid);
2483         if (sock[0] != -1) {
2484                 close(sock[0]);
2485                 close(sock[1]);
2486         }
2487         return answer;
2488 }
2489
2490 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2491                 struct fuse_file_info *fi)
2492 {
2493         struct fuse_context *fc = fuse_get_context();
2494         struct file_info *f = (struct file_info *)fi->fh;
2495         struct cgfs_files *k = NULL;
2496         char *data = NULL;
2497         int ret, s;
2498         bool r;
2499
2500         if (f->type != LXC_TYPE_CGFILE) {
2501                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2502                 return -EIO;
2503         }
2504
2505         if (offset)
2506                 return 0;
2507
2508         if (!fc)
2509                 return -EIO;
2510
2511         if (!f->controller)
2512                 return -EINVAL;
2513
2514         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2515                 return -EINVAL;
2516         }
2517         free_key(k);
2518
2519
2520         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2521                 ret = -EACCES;
2522                 goto out;
2523         }
2524
2525         if (strcmp(f->file, "tasks") == 0 ||
2526                         strcmp(f->file, "/tasks") == 0 ||
2527                         strcmp(f->file, "/cgroup.procs") == 0 ||
2528                         strcmp(f->file, "cgroup.procs") == 0)
2529                 // special case - we have to translate the pids
2530                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2531         else
2532                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2533
2534         if (!r) {
2535                 ret = -EINVAL;
2536                 goto out;
2537         }
2538
2539         if (!data) {
2540                 ret = 0;
2541                 goto out;
2542         }
2543         s = strlen(data);
2544         if (s > size)
2545                 s = size;
2546         memcpy(buf, data, s);
2547         if (s > 0 && s < size && data[s-1] != '\n')
2548                 buf[s++] = '\n';
2549
2550         ret = s;
2551
2552 out:
2553         free(data);
2554         return ret;
2555 }
2556
2557 static int pid_from_ns(int sock, pid_t tpid)
2558 {
2559         pid_t vpid;
2560         struct ucred cred;
2561         char v;
2562         int ret;
2563
2564         cred.uid = 0;
2565         cred.gid = 0;
2566         while (1) {
2567                 if (!wait_for_sock(sock, 2)) {
2568                         lxcfs_error("%s\n", "Timeout reading from parent.");
2569                         return 1;
2570                 }
2571                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2572                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2573                         return 1;
2574                 }
2575                 if (vpid == -1) // done
2576                         break;
2577                 v = '0';
2578                 cred.pid = vpid;
2579                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2580                         v = '1';
2581                         cred.pid = getpid();
2582                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2583                                 return 1;
2584                 }
2585         }
2586         return 0;
2587 }
2588
2589 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2590 {
2591         int newnsfd = -1, ret, cpipe[2];
2592         char fnam[100];
2593         pid_t cpid;
2594         char v;
2595
2596         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2597         if (ret < 0 || ret >= sizeof(fnam))
2598                 _exit(1);
2599         newnsfd = open(fnam, O_RDONLY);
2600         if (newnsfd < 0)
2601                 _exit(1);
2602         if (setns(newnsfd, 0) < 0)
2603                 _exit(1);
2604         close(newnsfd);
2605
2606         if (pipe(cpipe) < 0)
2607                 _exit(1);
2608
2609         struct pid_ns_clone_args args = {
2610                 .cpipe = cpipe,
2611                 .sock = sock,
2612                 .tpid = tpid,
2613                 .wrapped = &pid_from_ns
2614         };
2615         size_t stack_size = sysconf(_SC_PAGESIZE);
2616         void *stack = alloca(stack_size);
2617
2618         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2619         if (cpid < 0)
2620                 _exit(1);
2621
2622         // give the child 1 second to be done forking and
2623         // write its ack
2624         if (!wait_for_sock(cpipe[0], 1))
2625                 _exit(1);
2626         ret = read(cpipe[0], &v, 1);
2627         if (ret != sizeof(char) || v != '1')
2628                 _exit(1);
2629
2630         if (!wait_for_pid(cpid))
2631                 _exit(1);
2632         _exit(0);
2633 }
2634
/*
 * hostuid_to_ns - translate host @uid through /proc/@pid/uid_map.
 *
 * The result of convert_id_to_ns() is stored in *@answer.
 *
 * Returns true when a mapping was found; false when the map file cannot
 * be opened or the conversion yields -1 (in which case *@answer is -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
        char mapfile[400];
        FILE *map;

        sprintf(mapfile, "/proc/%d/uid_map", pid);

        map = fopen(mapfile, "r");
        if (map == NULL)
                return false;

        *answer = convert_id_to_ns(map, uid);
        fclose(map);

        return *answer != -1;
}
2656
/*
 * get_pid_creds - look up the uid and gid of @pid in /proc/@pid/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is used.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are preset to -1 and keep that value when the status
 * file cannot be opened or the corresponding line is missing.  Parsing
 * stops at the first malformed "Uid:"/"Gid:" line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
        char line[400];
        uid_t parsed_uid;
        gid_t parsed_gid;
        FILE *status;

        *uid = -1;
        *gid = -1;

        /* line first holds the path, afterwards the lines read from the file */
        sprintf(line, "/proc/%d/status", pid);
        status = fopen(line, "r");
        if (status == NULL) {
                lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
                return;
        }

        while (fgets(line, sizeof(line), status)) {
                if (strncmp(line, "Uid:", 4) == 0) {
                        if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
                                lxcfs_error("bad uid line for pid %u\n", pid);
                                break;
                        }
                        *uid = parsed_uid;
                        continue;
                }

                if (strncmp(line, "Gid:", 4) == 0) {
                        if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
                                lxcfs_error("bad gid line for pid %u\n", pid);
                                break;
                        }
                        *gid = parsed_gid;
                }
        }

        fclose(status);
}
2695
/*
 * may_move_pid - may the requestor @r move victim @v to a new cgroup?
 *
 * This is allowed if
 *   . they are the same task,
 *   . @r is root on the host (@r_uid is 0),
 *   . they are owned by the same uid, or
 *   . @r_uid maps to root in @r's uid map and @v's uid is mapped there
 *     as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
        uid_t v_uid, mapped;
        gid_t v_gid;

        if (r == v || r_uid == 0)
                return true;

        get_pid_creds(v, &v_uid, &v_gid);
        if (v_uid == r_uid)
                return true;

        /* the requestor has to be root inside its own user namespace ... */
        if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
                return false;

        /* ... and the victim's uid has to be mapped into that namespace */
        return hostuid_to_ns(v_uid, r, &mapped);
}
2721
2722 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2723                 const char *file, const char *buf)
2724 {
2725         int sock[2] = {-1, -1};
2726         pid_t qpid, cpid = -1;
2727         FILE *pids_file = NULL;
2728         bool answer = false, fail = false;
2729
2730         pids_file = open_pids_file(contrl, cg);
2731         if (!pids_file)
2732                 return false;
2733
2734         /*
2735          * write the pids to a socket, have helper in writer's pidns
2736          * call movepid for us
2737          */
2738         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2739                 perror("socketpair");
2740                 goto out;
2741         }
2742
2743         cpid = fork();
2744         if (cpid == -1)
2745                 goto out;
2746
2747         if (!cpid) { // child
2748                 fclose(pids_file);
2749                 pid_from_ns_wrapper(sock[1], tpid);
2750         }
2751
2752         const char *ptr = buf;
2753         while (sscanf(ptr, "%d", &qpid) == 1) {
2754                 struct ucred cred;
2755                 char v;
2756
2757                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2758                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2759                         goto out;
2760                 }
2761
2762                 if (recv_creds(sock[0], &cred, &v)) {
2763                         if (v == '0') {
2764                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2765                                         fail = true;
2766                                         break;
2767                                 }
2768                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2769                                         fail = true;
2770                         }
2771                 }
2772
2773                 ptr = strchr(ptr, '\n');
2774                 if (!ptr)
2775                         break;
2776                 ptr++;
2777         }
2778
2779         /* All good, write the value */
2780         qpid = -1;
2781         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2782                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2783
2784         if (!fail)
2785                 answer = true;
2786
2787 out:
2788         if (cpid != -1)
2789                 wait_for_pid(cpid);
2790         if (sock[0] != -1) {
2791                 close(sock[0]);
2792                 close(sock[1]);
2793         }
2794         if (pids_file) {
2795                 if (fclose(pids_file) != 0)
2796                         answer = false;
2797         }
2798         return answer;
2799 }
2800
2801 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2802              struct fuse_file_info *fi)
2803 {
2804         struct fuse_context *fc = fuse_get_context();
2805         char *localbuf = NULL;
2806         struct cgfs_files *k = NULL;
2807         struct file_info *f = (struct file_info *)fi->fh;
2808         bool r;
2809
2810         if (f->type != LXC_TYPE_CGFILE) {
2811                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2812                 return -EIO;
2813         }
2814
2815         if (offset)
2816                 return 0;
2817
2818         if (!fc)
2819                 return -EIO;
2820
2821         localbuf = alloca(size+1);
2822         localbuf[size] = '\0';
2823         memcpy(localbuf, buf, size);
2824
2825         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2826                 size = -EINVAL;
2827                 goto out;
2828         }
2829
2830         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2831                 size = -EACCES;
2832                 goto out;
2833         }
2834
2835         if (strcmp(f->file, "tasks") == 0 ||
2836                         strcmp(f->file, "/tasks") == 0 ||
2837                         strcmp(f->file, "/cgroup.procs") == 0 ||
2838                         strcmp(f->file, "cgroup.procs") == 0)
2839                 // special case - we have to translate the pids
2840                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2841         else
2842                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2843
2844         if (!r)
2845                 size = -EINVAL;
2846
2847 out:
2848         free_key(k);
2849         return size;
2850 }
2851
2852 int cg_chown(const char *path, uid_t uid, gid_t gid)
2853 {
2854         struct fuse_context *fc = fuse_get_context();
2855         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2856         struct cgfs_files *k = NULL;
2857         const char *cgroup;
2858         int ret;
2859
2860         if (!fc)
2861                 return -EIO;
2862
2863         if (strcmp(path, "/cgroup") == 0)
2864                 return -EPERM;
2865
2866         controller = pick_controller_from_path(fc, path);
2867         if (!controller)
2868                 return errno == ENOENT ? -EPERM : -errno;
2869
2870         cgroup = find_cgroup_in_path(path);
2871         if (!cgroup)
2872                 /* this is just /cgroup/controller */
2873                 return -EPERM;
2874
2875         get_cgdir_and_path(cgroup, &cgdir, &last);
2876
2877         if (!last) {
2878                 path1 = "/";
2879                 path2 = cgdir;
2880         } else {
2881                 path1 = cgdir;
2882                 path2 = last;
2883         }
2884
2885         if (is_child_cgroup(controller, path1, path2)) {
2886                 // get uid, gid, from '/tasks' file and make up a mode
2887                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2888                 k = cgfs_get_key(controller, cgroup, "tasks");
2889
2890         } else
2891                 k = cgfs_get_key(controller, path1, path2);
2892
2893         if (!k) {
2894                 ret = -EINVAL;
2895                 goto out;
2896         }
2897
2898         /*
2899          * This being a fuse request, the uid and gid must be valid
2900          * in the caller's namespace.  So we can just check to make
2901          * sure that the caller is root in his uid, and privileged
2902          * over the file's current owner.
2903          */
2904         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2905                 ret = -EACCES;
2906                 goto out;
2907         }
2908
2909         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2910
2911 out:
2912         free_key(k);
2913         free(cgdir);
2914
2915         return ret;
2916 }
2917
2918 int cg_chmod(const char *path, mode_t mode)
2919 {
2920         struct fuse_context *fc = fuse_get_context();
2921         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2922         struct cgfs_files *k = NULL;
2923         const char *cgroup;
2924         int ret;
2925
2926         if (!fc)
2927                 return -EIO;
2928
2929         if (strcmp(path, "/cgroup") == 0)
2930                 return -EPERM;
2931
2932         controller = pick_controller_from_path(fc, path);
2933         if (!controller)
2934                 return errno == ENOENT ? -EPERM : -errno;
2935
2936         cgroup = find_cgroup_in_path(path);
2937         if (!cgroup)
2938                 /* this is just /cgroup/controller */
2939                 return -EPERM;
2940
2941         get_cgdir_and_path(cgroup, &cgdir, &last);
2942
2943         if (!last) {
2944                 path1 = "/";
2945                 path2 = cgdir;
2946         } else {
2947                 path1 = cgdir;
2948                 path2 = last;
2949         }
2950
2951         if (is_child_cgroup(controller, path1, path2)) {
2952                 // get uid, gid, from '/tasks' file and make up a mode
2953                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2954                 k = cgfs_get_key(controller, cgroup, "tasks");
2955
2956         } else
2957                 k = cgfs_get_key(controller, path1, path2);
2958
2959         if (!k) {
2960                 ret = -EINVAL;
2961                 goto out;
2962         }
2963
2964         /*
2965          * This being a fuse request, the uid and gid must be valid
2966          * in the caller's namespace.  So we can just check to make
2967          * sure that the caller is root in his uid, and privileged
2968          * over the file's current owner.
2969          */
2970         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2971                 ret = -EPERM;
2972                 goto out;
2973         }
2974
2975         if (!cgfs_chmod_file(controller, cgroup, mode)) {
2976                 ret = -EINVAL;
2977                 goto out;
2978         }
2979
2980         ret = 0;
2981 out:
2982         free_key(k);
2983         free(cgdir);
2984         return ret;
2985 }
2986
2987 int cg_mkdir(const char *path, mode_t mode)
2988 {
2989         struct fuse_context *fc = fuse_get_context();
2990         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2991         const char *cgroup;
2992         int ret;
2993
2994         if (!fc)
2995                 return -EIO;
2996
2997         controller = pick_controller_from_path(fc, path);
2998         if (!controller)
2999                 return errno == ENOENT ? -EPERM : -errno;
3000
3001         cgroup = find_cgroup_in_path(path);
3002         if (!cgroup)
3003                 return -errno;
3004
3005         get_cgdir_and_path(cgroup, &cgdir, &last);
3006         if (!last)
3007                 path1 = "/";
3008         else
3009                 path1 = cgdir;
3010
3011         pid_t initpid = lookup_initpid_in_store(fc->pid);
3012         if (initpid <= 0)
3013                 initpid = fc->pid;
3014         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
3015                 if (!next)
3016                         ret = -EINVAL;
3017                 else if (last && strcmp(next, last) == 0)
3018                         ret = -EEXIST;
3019                 else
3020                         ret = -EPERM;
3021                 goto out;
3022         }
3023
3024         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
3025                 ret = -EACCES;
3026                 goto out;
3027         }
3028         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
3029                 ret = -EACCES;
3030                 goto out;
3031         }
3032
3033         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
3034
3035 out:
3036         free(cgdir);
3037         free(next);
3038         return ret;
3039 }
3040
3041 int cg_rmdir(const char *path)
3042 {
3043         struct fuse_context *fc = fuse_get_context();
3044         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
3045         const char *cgroup;
3046         int ret;
3047
3048         if (!fc)
3049                 return -EIO;
3050
3051         controller = pick_controller_from_path(fc, path);
3052         if (!controller) /* Someone's trying to delete "/cgroup". */
3053                 return -EPERM;
3054
3055         cgroup = find_cgroup_in_path(path);
3056         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
3057                 return -EPERM;
3058
3059         get_cgdir_and_path(cgroup, &cgdir, &last);
3060         if (!last) {
3061                 /* Someone's trying to delete a cgroup on the same level as the
3062                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
3063                  * rmdir "/cgroup/blkio/init.slice".
3064                  */
3065                 ret = -EPERM;
3066                 goto out;
3067         }
3068
3069         pid_t initpid = lookup_initpid_in_store(fc->pid);
3070         if (initpid <= 0)
3071                 initpid = fc->pid;
3072         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
3073                 if (!last || (next && (strcmp(next, last) == 0)))
3074                         ret = -EBUSY;
3075                 else
3076                         ret = -ENOENT;
3077                 goto out;
3078         }
3079
3080         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
3081                 ret = -EACCES;
3082                 goto out;
3083         }
3084         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
3085                 ret = -EACCES;
3086                 goto out;
3087         }
3088
3089         if (!cgfs_remove(controller, cgroup)) {
3090                 ret = -EINVAL;
3091                 goto out;
3092         }
3093
3094         ret = 0;
3095
3096 out:
3097         free(cgdir);
3098         free(next);
3099         return ret;
3100 }
3101
/* Return true if @line begins with the string @pref (an empty @pref always matches). */
static bool startswith(const char *line, const char *pref)
{
        size_t preflen = strlen(pref);

        return strncmp(line, pref, preflen) == 0;
}
3108
/*
 * Scan the newline-separated text in @memstat for the "total_*" counters
 * listed below. Each counter that is found is parsed as an unsigned long,
 * divided by 1024 and stored through the matching output pointer. Outputs
 * whose key never appears are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable)
{
        const struct {
                const char *key;
                unsigned long *out;
        } fields[] = {
                { "total_cache",         cached        },
                { "total_active_anon",   active_anon   },
                { "total_inactive_anon", inactive_anon },
                { "total_active_file",   active_file   },
                { "total_inactive_file", inactive_file },
                { "total_unevictable",   unevictable   },
        };
        const size_t nfields = sizeof(fields) / sizeof(fields[0]);
        char *cursor, *nl;

        for (cursor = memstat; *cursor; cursor = nl + 1) {
                size_t i;

                /* At most one key can match a given line. */
                for (i = 0; i < nfields; i++) {
                        if (!startswith(cursor, fields[i].key))
                                continue;
                        sscanf(cursor + strlen(fields[i].key), "%lu", fields[i].out);
                        *fields[i].out /= 1024;
                        break;
                }

                nl = strchr(cursor, '\n');
                if (!nl)
                        return;
        }
}
3142
/*
 * Look up the value for "<major>:<minor> <iotype>" in the newline-separated
 * text @str and store it in *@v. *@v is set to 0 first, so it stays 0 when no
 * line starts with that key.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32] = { 0 };
        size_t keylen;
        char *cursor;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        keylen = strlen(key);

        *v = 0;

        for (cursor = str; *cursor; ) {
                char *nl;

                if (startswith(cursor, key)) {
                        /* The number follows directly after the key. */
                        sscanf(cursor + keylen, "%lu", v);
                        return;
                }

                nl = strchr(cursor, '\n');
                if (!nl)
                        return;
                cursor = nl + 1;
        }
}
3165
3166 static int read_file(const char *path, char *buf, size_t size,
3167                      struct file_info *d)
3168 {
3169         size_t linelen = 0, total_len = 0, rv = 0;
3170         char *line = NULL;
3171         char *cache = d->buf;
3172         size_t cache_size = d->buflen;
3173         FILE *f = fopen(path, "r");
3174         if (!f)
3175                 return 0;
3176
3177         while (getline(&line, &linelen, f) != -1) {
3178                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3179                 if (l < 0) {
3180                         perror("Error writing to cache");
3181                         rv = 0;
3182                         goto err;
3183                 }
3184                 if (l >= cache_size) {
3185                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3186                         rv = 0;
3187                         goto err;
3188                 }
3189                 cache += l;
3190                 cache_size -= l;
3191                 total_len += l;
3192         }
3193
3194         d->size = total_len;
3195         if (total_len > size)
3196                 total_len = size;
3197
3198         /* read from off 0 */
3199         memcpy(buf, d->buf, total_len);
3200         rv = total_len;
3201   err:
3202         fclose(f);
3203         free(line);
3204         return rv;
3205 }
3206
3207 /*
3208  * FUSE ops for /proc
3209  */
3210
/*
 * Read @file of the "memory" controller for @cgroup and parse it as a
 * base-10 unsigned long. Returns (unsigned long)-1 if the value cannot be
 * read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
        char *raw = NULL;
        unsigned long limit;

        if (cgfs_get_value("memory", cgroup, file, &raw))
                limit = strtoul(raw, NULL, 10);
        else
                limit = (unsigned long)-1;

        free(raw);

        return limit;
}
3223
/*
 * Return the smallest value of @file found in @cgroup or any of its
 * ancestors, walking up the path with dirname(). The value of @cgroup itself
 * is the starting point (so the result is (unsigned long)-1 if it cannot be
 * read and no ancestor value can be read either); ancestors whose value
 * cannot be read are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
        char *copy = strdupa(cgroup);
        unsigned long memlimit, retlimit;

        retlimit = get_memlimit(copy, file);

        /*
         * dirname() ends at "/" for an absolute path but at "." for an empty
         * or relative one, so stop on either; otherwise a path that does not
         * start with '/' would keep this loop spinning forever.
         */
        while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
                copy = dirname(copy);
                memlimit = get_memlimit(copy, file);
                if (memlimit != (unsigned long)-1 && memlimit < retlimit)
                        retlimit = memlimit;
        }

        return retlimit;
}
3240
3241 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3242                 struct fuse_file_info *fi)
3243 {
3244         struct fuse_context *fc = fuse_get_context();
3245         struct file_info *d = (struct file_info *)fi->fh;
3246         char *cg;
3247         char *memusage_str = NULL, *memstat_str = NULL,
3248                 *memswlimit_str = NULL, *memswusage_str = NULL;
3249         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3250                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3251                 active_file = 0, inactive_file = 0, unevictable = 0,
3252                 hostswtotal = 0;
3253         char *line = NULL;
3254         size_t linelen = 0, total_len = 0, rv = 0;
3255         char *cache = d->buf;
3256         size_t cache_size = d->buflen;
3257         FILE *f = NULL;
3258
3259         if (offset){
3260                 if (offset > d->size)
3261                         return -EINVAL;
3262                 if (!d->cached)
3263                         return 0;
3264                 int left = d->size - offset;
3265                 total_len = left > size ? size: left;
3266                 memcpy(buf, cache + offset, total_len);
3267                 return total_len;
3268         }
3269
3270         pid_t initpid = lookup_initpid_in_store(fc->pid);
3271         if (initpid <= 0)
3272                 initpid = fc->pid;
3273         cg = get_pid_cgroup(initpid, "memory");
3274         if (!cg)
3275                 return read_file("/proc/meminfo", buf, size, d);
3276         prune_init_slice(cg);
3277
3278         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3279         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3280                 goto err;
3281         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3282                 goto err;
3283
3284         // Following values are allowed to fail, because swapaccount might be turned
3285         // off for current kernel
3286         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3287                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3288         {
3289                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3290                 memswusage = strtoul(memswusage_str, NULL, 10);
3291
3292                 memswlimit = memswlimit / 1024;
3293                 memswusage = memswusage / 1024;
3294         }
3295
3296         memusage = strtoul(memusage_str, NULL, 10);
3297         memlimit /= 1024;
3298         memusage /= 1024;
3299
3300         parse_memstat(memstat_str, &cached, &active_anon,
3301                         &inactive_anon, &active_file, &inactive_file,
3302                         &unevictable);
3303
3304         f = fopen("/proc/meminfo", "r");
3305         if (!f)
3306                 goto err;
3307
3308         while (getline(&line, &linelen, f) != -1) {
3309                 ssize_t l;
3310                 char *printme, lbuf[100];
3311
3312                 memset(lbuf, 0, 100);
3313                 if (startswith(line, "MemTotal:")) {
3314                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3315                         if (hosttotal < memlimit)
3316                                 memlimit = hosttotal;
3317                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3318                         printme = lbuf;
3319                 } else if (startswith(line, "MemFree:")) {
3320                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3321                         printme = lbuf;
3322                 } else if (startswith(line, "MemAvailable:")) {
3323                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3324                         printme = lbuf;
3325                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3326                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3327                         if (hostswtotal < memswlimit)
3328                                 memswlimit = hostswtotal;
3329                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3330                         printme = lbuf;
3331                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3332                         unsigned long swaptotal = memswlimit,
3333                                         swapusage = memswusage - memusage,
3334                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3335                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3336                         printme = lbuf;
3337                 } else if (startswith(line, "Slab:")) {
3338                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3339                         printme = lbuf;
3340                 } else if (startswith(line, "Buffers:")) {
3341                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3342                         printme = lbuf;
3343                 } else if (startswith(line, "Cached:")) {
3344                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3345                         printme = lbuf;
3346                 } else if (startswith(line, "SwapCached:")) {
3347                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3348                         printme = lbuf;
3349                 } else if (startswith(line, "Active:")) {
3350                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3351                                         active_anon + active_file);
3352                         printme = lbuf;
3353                 } else if (startswith(line, "Inactive:")) {
3354                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3355                                         inactive_anon + inactive_file);
3356                         printme = lbuf;
3357                 } else if (startswith(line, "Active(anon)")) {
3358                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3359                         printme = lbuf;
3360                 } else if (startswith(line, "Inactive(anon)")) {
3361                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3362                         printme = lbuf;
3363                 } else if (startswith(line, "Active(file)")) {
3364                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3365                         printme = lbuf;
3366                 } else if (startswith(line, "Inactive(file)")) {
3367                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3368                         printme = lbuf;
3369                 } else if (startswith(line, "Unevictable")) {
3370                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3371                         printme = lbuf;
3372                 } else if (startswith(line, "SReclaimable")) {
3373                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3374                         printme = lbuf;
3375                 } else if (startswith(line, "SUnreclaim")) {
3376                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3377                         printme = lbuf;
3378                 } else
3379                         printme = line;
3380
3381                 l = snprintf(cache, cache_size, "%s", printme);
3382                 if (l < 0) {
3383                         perror("Error writing to cache");
3384                         rv = 0;
3385                         goto err;
3386
3387                 }
3388                 if (l >= cache_size) {
3389                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3390                         rv = 0;
3391                         goto err;
3392                 }
3393
3394                 cache += l;
3395                 cache_size -= l;
3396                 total_len += l;
3397         }
3398
3399         d->cached = 1;
3400         d->size = total_len;
3401         if (total_len > size ) total_len = size;
3402         memcpy(buf, d->buf, total_len);
3403
3404         rv = total_len;
3405 err:
3406         if (f)
3407                 fclose(f);
3408         free(line);
3409         free(cg);
3410         free(memusage_str);
3411         free(memswlimit_str);
3412         free(memswusage_str);
3413         free(memstat_str);
3414         return rv;
3415 }
3416
/*
 * Fetch the contents of cpuset.cpus for cgroup @cg.
 * On success the result is a newly allocated string that the caller must
 * free; NULL is returned if the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
        char *cpus = NULL;

        return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3429
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line of /proc/cpuinfo and cpu N
 * is accepted by cpu_in_cpuset() for @cpuset; false for any other line.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int cpu;
        bool parsed = sscanf(line, "processor       : %d", &cpu) == 1;

        return parsed && cpu_in_cpuset(cpu, cpuset);
}
3440
/*
 * Return true if @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. the word "processor", a colon and a decimal number, with any amount
 * of whitespace around the colon.
 */
static bool is_processor_line(const char *line)
{
        int ignored;

        return sscanf(line, "processor       : %d", &ignored) == 1;
}
3452
3453 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3454                 struct fuse_file_info *fi)
3455 {
3456         struct fuse_context *fc = fuse_get_context();
3457         struct file_info *d = (struct file_info *)fi->fh;
3458         char *cg;
3459         char *cpuset = NULL;
3460         char *line = NULL;
3461         size_t linelen = 0, total_len = 0, rv = 0;
3462         bool am_printing = false, firstline = true, is_s390x = false;
3463         int curcpu = -1, cpu;
3464         char *cache = d->buf;
3465         size_t cache_size = d->buflen;
3466         FILE *f = NULL;
3467
3468         if (offset){
3469                 if (offset > d->size)
3470                         return -EINVAL;
3471                 if (!d->cached)
3472                         return 0;
3473                 int left = d->size - offset;
3474                 total_len = left > size ? size: left;
3475                 memcpy(buf, cache + offset, total_len);
3476                 return total_len;
3477         }
3478
3479         pid_t initpid = lookup_initpid_in_store(fc->pid);
3480         if (initpid <= 0)
3481                 initpid = fc->pid;
3482         cg = get_pid_cgroup(initpid, "cpuset");
3483         if (!cg)
3484                 return read_file("proc/cpuinfo", buf, size, d);
3485         prune_init_slice(cg);
3486
3487         cpuset = get_cpuset(cg);
3488         if (!cpuset)
3489                 goto err;
3490
3491         f = fopen("/proc/cpuinfo", "r");
3492         if (!f)
3493                 goto err;
3494
3495         while (getline(&line, &linelen, f) != -1) {
3496                 ssize_t l;
3497                 if (firstline) {
3498                         firstline = false;
3499                         if (strstr(line, "IBM/S390") != NULL) {
3500                                 is_s390x = true;
3501                                 am_printing = true;
3502                                 continue;
3503                         }
3504                 }
3505                 if (strncmp(line, "# processors:", 12) == 0)
3506                         continue;
3507                 if (is_processor_line(line)) {
3508                         am_printing = cpuline_in_cpuset(line, cpuset);
3509                         if (am_printing) {
3510                                 curcpu ++;
3511                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3512                                 if (l < 0) {
3513                                         perror("Error writing to cache");
3514                                         rv = 0;
3515                                         goto err;
3516                                 }
3517                                 if (l >= cache_size) {
3518                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3519                                         rv = 0;
3520                                         goto err;
3521                                 }
3522                                 cache += l;
3523                                 cache_size -= l;
3524                                 total_len += l;
3525                         }
3526                         continue;
3527                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3528                         char *p;
3529                         if (!cpu_in_cpuset(cpu, cpuset))
3530                                 continue;
3531                         curcpu ++;
3532                         p = strchr(line, ':');
3533                         if (!p || !*p)
3534                                 goto err;
3535                         p++;
3536                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3537                         if (l < 0) {
3538                                 perror("Error writing to cache");
3539                                 rv = 0;
3540                                 goto err;
3541                         }
3542                         if (l >= cache_size) {
3543                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3544                                 rv = 0;
3545                                 goto err;
3546                         }
3547                         cache += l;
3548                         cache_size -= l;
3549                         total_len += l;
3550                         continue;
3551
3552                 }
3553                 if (am_printing) {
3554                         l = snprintf(cache, cache_size, "%s", line);
3555                         if (l < 0) {
3556                                 perror("Error writing to cache");
3557                                 rv = 0;
3558                                 goto err;
3559                         }
3560                         if (l >= cache_size) {
3561                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3562                                 rv = 0;
3563                                 goto err;
3564                         }
3565                         cache += l;
3566                         cache_size -= l;
3567                         total_len += l;
3568                 }
3569         }
3570
3571         if (is_s390x) {
3572                 char *origcache = d->buf;
3573                 ssize_t l;
3574                 do {
3575                         d->buf = malloc(d->buflen);
3576                 } while (!d->buf);
3577                 cache = d->buf;
3578                 cache_size = d->buflen;
3579                 total_len = 0;
3580                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3581                 if (l < 0 || l >= cache_size) {
3582                         free(origcache);
3583                         goto err;
3584                 }
3585                 cache_size -= l;
3586                 cache += l;
3587                 total_len += l;
3588                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3589                 if (l < 0 || l >= cache_size) {
3590                         free(origcache);
3591                         goto err;
3592                 }
3593                 cache_size -= l;
3594                 cache += l;
3595                 total_len += l;
3596                 l = snprintf(cache, cache_size, "%s", origcache);
3597                 free(origcache);
3598                 if (l < 0 || l >= cache_size)
3599                         goto err;
3600                 total_len += l;
3601         }
3602
3603         d->cached = 1;
3604         d->size = total_len;
3605         if (total_len > size ) total_len = size;
3606
3607         /* read from off 0 */
3608         memcpy(buf, d->buf, total_len);
3609         rv = total_len;
3610 err:
3611         if (f)
3612                 fclose(f);
3613         free(line);
3614         free(cpuset);
3615         free(cg);
3616         return rv;
3617 }
3618
3619 static uint64_t get_reaper_start_time(pid_t pid)
3620 {
3621         int ret;
3622         FILE *f;
3623         uint64_t starttime;
3624         /* strlen("/proc/") = 6
3625          * +
3626          * LXCFS_NUMSTRLEN64
3627          * +
3628          * strlen("/stat") = 5
3629          * +
3630          * \0 = 1
3631          * */
3632 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3633         char path[__PROC_PID_STAT_LEN];
3634         pid_t qpid;
3635
3636         qpid = lookup_initpid_in_store(pid);
3637         if (qpid <= 0) {
3638                 /* Caller can check for EINVAL on 0. */
3639                 errno = EINVAL;
3640                 return 0;
3641         }
3642
3643         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3644         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3645                 /* Caller can check for EINVAL on 0. */
3646                 errno = EINVAL;
3647                 return 0;
3648         }
3649
3650         f = fopen(path, "r");
3651         if (!f) {
3652                 /* Caller can check for EINVAL on 0. */
3653                 errno = EINVAL;
3654                 return 0;
3655         }
3656
3657         /* Note that the *scanf() argument supression requires that length
3658          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3659          * at us. It's like telling someone you're not married and then asking
3660          * if you can bring your wife to the party.
3661          */
3662         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3663                         "%*s "      /* (2)  comm        %s   */
3664                         "%*c "      /* (3)  state       %c   */
3665                         "%*d "      /* (4)  ppid        %d   */
3666                         "%*d "      /* (5)  pgrp        %d   */
3667                         "%*d "      /* (6)  session     %d   */
3668                         "%*d "      /* (7)  tty_nr      %d   */
3669                         "%*d "      /* (8)  tpgid       %d   */
3670                         "%*u "      /* (9)  flags       %u   */
3671                         "%*u "      /* (10) minflt      %lu  */
3672                         "%*u "      /* (11) cminflt     %lu  */
3673                         "%*u "      /* (12) majflt      %lu  */
3674                         "%*u "      /* (13) cmajflt     %lu  */
3675                         "%*u "      /* (14) utime       %lu  */
3676                         "%*u "      /* (15) stime       %lu  */
3677                         "%*d "      /* (16) cutime      %ld  */
3678                         "%*d "      /* (17) cstime      %ld  */
3679                         "%*d "      /* (18) priority    %ld  */
3680                         "%*d "      /* (19) nice        %ld  */
3681                         "%*d "      /* (20) num_threads %ld  */
3682                         "%*d "      /* (21) itrealvalue %ld  */
3683                         "%" PRIu64, /* (22) starttime   %llu */
3684                      &starttime);
3685         if (ret != 1) {
3686                 fclose(f);
3687                 /* Caller can check for EINVAL on 0. */
3688                 errno = EINVAL;
3689                 return 0;
3690         }
3691
3692         fclose(f);
3693
3694         errno = 0;
3695         return starttime;
3696 }
3697
/*
 * Convert the reaper start time of @pid from clock ticks to whole seconds.
 *
 * Returns 0 when the start time cannot be retrieved or when the number of
 * clock ticks per second cannot be determined.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
	uint64_t clockticks;
	int64_t ticks_per_sec;

	clockticks = get_reaper_start_time(pid);
	if (clockticks == 0 && errno == EINVAL) {
		lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
		return 0;
	}

	/* Reject every non-positive result: -1 signals an error or an
	 * indeterminate value, and 0 would make the division below undefined.
	 */
	ticks_per_sec = sysconf(_SC_CLK_TCK);
	if (ticks_per_sec <= 0) {
		lxcfs_debug(
		    "%s\n",
		    "failed to determine number of clock ticks in a second");
		return 0;
	}

	return clockticks / (uint64_t)ticks_per_sec;
}
3719
/*
 * Return how long, in whole seconds, the reaper of @pid has been running.
 *
 * The age is the time elapsed since boot (CLOCK_BOOTTIME) minus the reaper's
 * start time in seconds. 0 is returned when the start time is unavailable or
 * the clock cannot be read.
 */
static uint64_t get_reaper_age(pid_t pid)
{
	struct timespec now;
	uint64_t started = get_reaper_start_time_in_sec(pid);

	/* No usable start time: hand the 0 straight back. */
	if (started == 0)
		return started;

	if (clock_gettime(CLOCK_BOOTTIME, &now) < 0)
		return 0;

	/* Sub-second precision (tv_nsec) is deliberately ignored. */
	return (uint64_t)now.tv_sec - started;
}
3748
3749 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3750 static int proc_stat_read(char *buf, size_t size, off_t offset,
3751                 struct fuse_file_info *fi)
3752 {
3753         struct fuse_context *fc = fuse_get_context();
3754         struct file_info *d = (struct file_info *)fi->fh;
3755         char *cg;
3756         char *cpuset = NULL;
3757         char *line = NULL;
3758         size_t linelen = 0, total_len = 0, rv = 0;
3759         int curcpu = -1; /* cpu numbering starts at 0 */
3760         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3761         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3762                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3763         char cpuall[CPUALL_MAX_SIZE];
3764         /* reserve for cpu all */
3765         char *cache = d->buf + CPUALL_MAX_SIZE;
3766         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3767         FILE *f = NULL;
3768
3769         if (offset){
3770                 if (offset > d->size)
3771                         return -EINVAL;
3772                 if (!d->cached)
3773                         return 0;
3774                 int left = d->size - offset;
3775                 total_len = left > size ? size: left;
3776                 memcpy(buf, d->buf + offset, total_len);
3777                 return total_len;
3778         }
3779
3780         pid_t initpid = lookup_initpid_in_store(fc->pid);
3781         if (initpid <= 0)
3782                 initpid = fc->pid;
3783         cg = get_pid_cgroup(initpid, "cpuset");
3784         if (!cg)
3785                 return read_file("/proc/stat", buf, size, d);
3786         prune_init_slice(cg);
3787
3788         cpuset = get_cpuset(cg);
3789         if (!cpuset)
3790                 goto err;
3791
3792         f = fopen("/proc/stat", "r");
3793         if (!f)
3794                 goto err;
3795
3796         //skip first line
3797         if (getline(&line, &linelen, f) < 0) {
3798                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3799                 goto err;
3800         }
3801
3802         while (getline(&line, &linelen, f) != -1) {
3803                 ssize_t l;
3804                 int cpu;
3805                 char cpu_char[10]; /* That's a lot of cores */
3806                 char *c;
3807
3808                 if (strlen(line) == 0)
3809                         continue;
3810                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3811                         /* not a ^cpuN line containing a number N, just print it */
3812                         l = snprintf(cache, cache_size, "%s", line);
3813                         if (l < 0) {
3814                                 perror("Error writing to cache");
3815                                 rv = 0;
3816                                 goto err;
3817                         }
3818                         if (l >= cache_size) {
3819                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3820                                 rv = 0;
3821                                 goto err;
3822                         }
3823                         cache += l;
3824                         cache_size -= l;
3825                         total_len += l;
3826                         continue;
3827                 }
3828
3829                 if (sscanf(cpu_char, "%d", &cpu) != 1)
3830                         continue;
3831                 if (!cpu_in_cpuset(cpu, cpuset))
3832                         continue;
3833                 curcpu ++;
3834
3835                 c = strchr(line, ' ');
3836                 if (!c)
3837                         continue;
3838                 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3839                 if (l < 0) {
3840                         perror("Error writing to cache");
3841                         rv = 0;
3842                         goto err;
3843
3844                 }
3845                 if (l >= cache_size) {
3846                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3847                         rv = 0;
3848                         goto err;
3849                 }
3850
3851                 cache += l;
3852                 cache_size -= l;
3853                 total_len += l;
3854
3855                 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3856                            &user,
3857                            &nice,
3858                            &system,
3859                            &idle,
3860                            &iowait,
3861                            &irq,
3862                            &softirq,
3863                            &steal,
3864                            &guest,
3865                            &guest_nice) != 10)
3866                         continue;
3867                 user_sum += user;
3868                 nice_sum += nice;
3869                 system_sum += system;
3870                 idle_sum += idle;
3871                 iowait_sum += iowait;
3872                 irq_sum += irq;
3873                 softirq_sum += softirq;
3874                 steal_sum += steal;
3875                 guest_sum += guest;
3876                 guest_nice_sum += guest_nice;
3877         }
3878
3879         cache = d->buf;
3880
3881         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3882                         user_sum,
3883                         nice_sum,
3884                         system_sum,
3885                         idle_sum,
3886                         iowait_sum,
3887                         irq_sum,
3888                         softirq_sum,
3889                         steal_sum,
3890                         guest_sum,
3891                         guest_nice_sum);
3892         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3893                 memcpy(cache, cpuall, cpuall_len);
3894                 cache += cpuall_len;
3895         } else {
3896                 /* shouldn't happen */
3897                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3898                 cpuall_len = 0;
3899         }
3900
3901         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3902         total_len += cpuall_len;
3903         d->cached = 1;
3904         d->size = total_len;
3905         if (total_len > size)
3906                 total_len = size;
3907
3908         memcpy(buf, d->buf, total_len);
3909         rv = total_len;
3910
3911 err:
3912         if (f)
3913                 fclose(f);
3914         free(line);
3915         free(cpuset);
3916         free(cg);
3917         return rv;
3918 }
3919
/* Retrieve the busy time of a group of tasks from cpuacct.usage, scaled down
 * by 1000000000 (presumably nanoseconds to seconds; confirm against the
 * cpuacct documentation). Unfortunately, this only makes sense when the
 * container has been given its own cpuacct cgroup. If not, the busy time of
 * all other tasks that do not actually belong to the container is taken into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 *
 * Returns 0 when the reaper, its cgroup or the usage value is unavailable.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *raw = NULL;
	unsigned long busy = 0;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (cg) {
		prune_init_slice(cg);
		if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
			busy = strtoul(raw, NULL, 10) / 1000000000;
	}

	free(cg);
	free(raw);
	return busy;
}
3950
#if RELOADTEST
/* Leave a marker file behind; only built when RELOADTEST is enabled. */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;
	close(marker);
}
#endif
3961
3962 /*
3963  * We read /proc/uptime and reuse its second field.
3964  * For the first field, we use the mtime for the reaper for
3965  * the calling pid as returned by getreaperage
3966  */
3967 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3968                 struct fuse_file_info *fi)
3969 {
3970         struct fuse_context *fc = fuse_get_context();
3971         struct file_info *d = (struct file_info *)fi->fh;
3972         unsigned long int busytime = get_reaper_busy(fc->pid);
3973         char *cache = d->buf;
3974         ssize_t total_len = 0;
3975         uint64_t idletime, reaperage;
3976
3977 #if RELOADTEST
3978         iwashere();
3979 #endif
3980
3981         if (offset){
3982                 if (!d->cached)
3983                         return 0;
3984                 if (offset > d->size)
3985                         return -EINVAL;
3986                 int left = d->size - offset;
3987                 total_len = left > size ? size: left;
3988                 memcpy(buf, cache + offset, total_len);
3989                 return total_len;
3990         }
3991
3992         reaperage = get_reaper_age(fc->pid);
3993         /* To understand why this is done, please read the comment to the
3994          * get_reaper_busy() function.
3995          */
3996         idletime = reaperage;
3997         if (reaperage >= busytime)
3998                 idletime = reaperage - busytime;
3999
4000         total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
4001         if (total_len < 0 || total_len >=  d->buflen){
4002                 lxcfs_error("%s\n", "failed to write to cache");
4003                 return 0;
4004         }
4005
4006         d->size = (int)total_len;
4007         d->cached = 1;
4008
4009         if (total_len > size) total_len = size;
4010
4011         memcpy(buf, d->buf, total_len);
4012         return total_len;
4013 }
4014
4015 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
4016                 struct fuse_file_info *fi)
4017 {
4018         char dev_name[72];
4019         struct fuse_context *fc = fuse_get_context();
4020         struct file_info *d = (struct file_info *)fi->fh;
4021         char *cg;
4022         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
4023                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
4024         unsigned long read = 0, write = 0;
4025         unsigned long read_merged = 0, write_merged = 0;
4026         unsigned long read_sectors = 0, write_sectors = 0;
4027         unsigned long read_ticks = 0, write_ticks = 0;
4028         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
4029         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
4030         char *cache = d->buf;
4031         size_t cache_size = d->buflen;
4032         char *line = NULL;
4033         size_t linelen = 0, total_len = 0, rv = 0;
4034         unsigned int major = 0, minor = 0;
4035         int i = 0;
4036         FILE *f = NULL;
4037
4038         if (offset){
4039                 if (offset > d->size)
4040                         return -EINVAL;
4041                 if (!d->cached)
4042                         return 0;
4043                 int left = d->size - offset;
4044                 total_len = left > size ? size: left;
4045                 memcpy(buf, cache + offset, total_len);
4046                 return total_len;
4047         }
4048
4049         pid_t initpid = lookup_initpid_in_store(fc->pid);
4050         if (initpid <= 0)
4051                 initpid = fc->pid;
4052         cg = get_pid_cgroup(initpid, "blkio");
4053         if (!cg)
4054                 return read_file("/proc/diskstats", buf, size, d);
4055         prune_init_slice(cg);
4056
4057         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
4058                 goto err;
4059         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
4060                 goto err;
4061         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
4062                 goto err;
4063         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
4064                 goto err;
4065         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
4066                 goto err;
4067
4068
4069         f = fopen("/proc/diskstats", "r");
4070         if (!f)
4071                 goto err;
4072
4073         while (getline(&line, &linelen, f) != -1) {
4074                 ssize_t l;
4075                 char lbuf[256];
4076
4077                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
4078                 if (i != 3)
4079                         continue;
4080
4081                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
4082                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
4083                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
4084                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
4085                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
4086                 read_sectors = read_sectors/512;
4087                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
4088                 write_sectors = write_sectors/512;
4089
4090                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
4091                 rd_svctm = rd_svctm/1000000;
4092                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
4093                 rd_wait = rd_wait/1000000;
4094                 read_ticks = rd_svctm + rd_wait;
4095
4096                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
4097                 wr_svctm =  wr_svctm/1000000;
4098                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
4099                 wr_wait =  wr_wait/1000000;
4100                 write_ticks = wr_svctm + wr_wait;
4101
4102                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
4103                 tot_ticks =  tot_ticks/1000000;
4104
4105                 memset(lbuf, 0, 256);
4106                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
4107                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
4108                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
4109                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
4110                 else
4111                         continue;
4112
4113                 l = snprintf(cache, cache_size, "%s", lbuf);
4114                 if (l < 0) {
4115                         perror("Error writing to fuse buf");
4116                         rv = 0;
4117                         goto err;
4118                 }
4119                 if (l >= cache_size) {
4120                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4121                         rv = 0;
4122                         goto err;
4123                 }
4124                 cache += l;
4125                 cache_size -= l;
4126                 total_len += l;
4127         }
4128
4129         d->cached = 1;
4130         d->size = total_len;
4131         if (total_len > size ) total_len = size;
4132         memcpy(buf, d->buf, total_len);
4133
4134         rv = total_len;
4135 err:
4136         free(cg);
4137         if (f)
4138                 fclose(f);
4139         free(line);
4140         free(io_serviced_str);
4141         free(io_merged_str);
4142         free(io_service_bytes_str);
4143         free(io_wait_time_str);
4144         free(io_service_time_str);
4145         return rv;
4146 }
4147
4148 static int proc_swaps_read(char *buf, size_t size, off_t offset,
4149                 struct fuse_file_info *fi)
4150 {
4151         struct fuse_context *fc = fuse_get_context();
4152         struct file_info *d = (struct file_info *)fi->fh;
4153         char *cg = NULL;
4154         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4155         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4156         ssize_t total_len = 0, rv = 0;
4157         ssize_t l = 0;
4158         char *cache = d->buf;
4159
4160         if (offset) {
4161                 if (offset > d->size)
4162                         return -EINVAL;
4163                 if (!d->cached)
4164                         return 0;
4165                 int left = d->size - offset;
4166                 total_len = left > size ? size: left;
4167                 memcpy(buf, cache + offset, total_len);
4168                 return total_len;
4169         }
4170
4171         pid_t initpid = lookup_initpid_in_store(fc->pid);
4172         if (initpid <= 0)
4173                 initpid = fc->pid;
4174         cg = get_pid_cgroup(initpid, "memory");
4175         if (!cg)
4176                 return read_file("/proc/swaps", buf, size, d);
4177         prune_init_slice(cg);
4178
4179         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4180
4181         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4182                 goto err;
4183
4184         memusage = strtoul(memusage_str, NULL, 10);
4185
4186         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4187             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4188
4189                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4190                 memswusage = strtoul(memswusage_str, NULL, 10);
4191
4192                 swap_total = (memswlimit - memlimit) / 1024;
4193                 swap_free = (memswusage - memusage) / 1024;
4194         }
4195
4196         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4197
4198         /* When no mem + swap limit is specified or swapaccount=0*/
4199         if (!memswlimit) {
4200                 char *line = NULL;
4201                 size_t linelen = 0;
4202                 FILE *f = fopen("/proc/meminfo", "r");
4203
4204                 if (!f)
4205                         goto err;
4206
4207                 while (getline(&line, &linelen, f) != -1) {
4208                         if (startswith(line, "SwapTotal:")) {
4209                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
4210                         } else if (startswith(line, "SwapFree:")) {
4211                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
4212                         }
4213                 }
4214
4215                 free(line);
4216                 fclose(f);
4217         }
4218
4219         if (swap_total > 0) {
4220                 l = snprintf(d->buf + total_len, d->size - total_len,
4221                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4222                                 swap_total, swap_free);
4223                 total_len += l;
4224         }
4225
4226         if (total_len < 0 || l < 0) {
4227                 perror("Error writing to cache");
4228                 rv = 0;
4229                 goto err;
4230         }
4231
4232         d->cached = 1;
4233         d->size = (int)total_len;
4234
4235         if (total_len > size) total_len = size;
4236         memcpy(buf, d->buf, total_len);
4237         rv = total_len;
4238
4239 err:
4240         free(cg);
4241         free(memswlimit_str);
4242         free(memlimit_str);
4243         free(memusage_str);
4244         free(memswusage_str);
4245         return rv;
4246 }
4247
/*
 * Return the number of bytes that can be read from @which, or 0 when the
 * file cannot be opened. The content is actually read rather than stat()ed.
 */
static off_t get_procfile_size(const char *which)
{
	char chunk[4096];
	size_t got;
	ssize_t total = 0;
	FILE *stream = fopen(which, "r");

	if (!stream)
		return 0;

	while ((got = fread(chunk, 1, sizeof(chunk), stream)) > 0)
		total += got;

	fclose(stream);
	return total;
}
4264
/*
 * Fill @sb for a path below the virtual /proc tree.
 *
 * "/proc" is reported as a 0555 directory; each virtualised file is reported
 * as an empty 0444 regular file. Owner and group are 0 and all three
 * timestamps are set to the current time.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT for
 * any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
		"/proc/loadavg",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(regular_files) / sizeof(regular_files[0]); i++) {
		if (strcmp(path, regular_files[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4294
4295 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4296                 struct fuse_file_info *fi)
4297 {
4298         if (filler(buf, ".", NULL, 0) != 0 ||
4299             filler(buf, "..", NULL, 0) != 0 ||
4300             filler(buf, "cpuinfo", NULL, 0) != 0 ||
4301             filler(buf, "meminfo", NULL, 0) != 0 ||
4302             filler(buf, "stat", NULL, 0) != 0 ||
4303             filler(buf, "uptime", NULL, 0) != 0 ||
4304             filler(buf, "diskstats", NULL, 0) != 0 ||
4305             filler(buf, "swaps", NULL, 0) != 0   ||
4306             filler(buf, "loadavg", NULL, 0) != 0)
4307                 return -EINVAL;
4308         return 0;
4309 }
4310
4311 int proc_open(const char *path, struct fuse_file_info *fi)
4312 {
4313         int type = -1;
4314         struct file_info *info;
4315
4316         if (strcmp(path, "/proc/meminfo") == 0)
4317                 type = LXC_TYPE_PROC_MEMINFO;
4318         else if (strcmp(path, "/proc/cpuinfo") == 0)
4319                 type = LXC_TYPE_PROC_CPUINFO;
4320         else if (strcmp(path, "/proc/uptime") == 0)
4321                 type = LXC_TYPE_PROC_UPTIME;
4322         else if (strcmp(path, "/proc/stat") == 0)
4323                 type = LXC_TYPE_PROC_STAT;
4324         else if (strcmp(path, "/proc/diskstats") == 0)
4325                 type = LXC_TYPE_PROC_DISKSTATS;
4326         else if (strcmp(path, "/proc/swaps") == 0)
4327                 type = LXC_TYPE_PROC_SWAPS;
4328         else if (strcmp(path, "/proc/loadavg") == 0)
4329                 type = LXC_TYPE_PROC_LOADAVG;
4330         if (type == -1)
4331                 return -ENOENT;
4332
4333         info = malloc(sizeof(*info));
4334         if (!info)
4335                 return -ENOMEM;
4336
4337         memset(info, 0, sizeof(*info));
4338         info->type = type;
4339
4340         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4341         do {
4342                 info->buf = malloc(info->buflen);
4343         } while (!info->buf);
4344         memset(info->buf, 0, info->buflen);
4345         /* set actual size to buffer size */
4346         info->size = info->buflen;
4347
4348         fi->fh = (unsigned long)info;
4349         return 0;
4350 }
4351
/*
 * Access check for the virtual /proc tree.
 *
 * "/proc" itself is granted whenever the real /proc is readable. For every
 * other case only R_OK may be requested; any other bit in @mask yields
 * -EACCES.
 */
int proc_access(const char *path, int mask)
{
	int is_proc_dir = (strcmp(path, "/proc") == 0);

	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
4362
/* Release the per-open state stored in @fi; @path is unused. Always 0. */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4368
4369 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4370                 struct fuse_file_info *fi)
4371 {
4372         struct file_info *f = (struct file_info *) fi->fh;
4373
4374         switch (f->type) {
4375         case LXC_TYPE_PROC_MEMINFO:
4376                 return proc_meminfo_read(buf, size, offset, fi);
4377         case LXC_TYPE_PROC_CPUINFO:
4378                 return proc_cpuinfo_read(buf, size, offset, fi);
4379         case LXC_TYPE_PROC_UPTIME:
4380                 return proc_uptime_read(buf, size, offset, fi);
4381         case LXC_TYPE_PROC_STAT:
4382                 return proc_stat_read(buf, size, offset, fi);
4383         case LXC_TYPE_PROC_DISKSTATS:
4384                 return proc_diskstats_read(buf, size, offset, fi);
4385         case LXC_TYPE_PROC_SWAPS:
4386                 return proc_swaps_read(buf, size, offset, fi);
4387         case LXC_TYPE_PROC_LOADAVG:
4388                 return proc_loadavg_read(buf, size, offset, fi);
4389         default:
4390                 return -EINVAL;
4391         }
4392 }
4393
4394 /*
4395  * Functions needed to setup cgroups in the __constructor__.
4396  */
4397
/*
 * Create @dir and all missing parent directories with @mode, like
 * "mkdir -p". Components that already exist are not an error.
 *
 * Returns true on success, false when a path prefix cannot be duplicated or
 * a directory cannot be created.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* Step over a run of slashes, then over one path component;
		 * the prefix created in this round ends where that component
		 * starts.
		 */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		/* For a relative path the very first prefix is empty; skip it
		 * rather than failing on mkdir("").
		 */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
4421
4422 static bool umount_if_mounted(void)
4423 {
4424         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4425                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4426                 return false;
4427         }
4428         return true;
4429 }
4430
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
/* Tell whether the filesystem described by @fs has the magic number @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	const fs_type_magic actual = fs->f_type;

	return actual == magic_val;
}
4437
/*
 * Report whether "/" is the kernel's initial rootfs.
 *
 * Each line of /proc/self/mountinfo is scanned: the text between the fourth
 * and fifth space (skipping the first character of the line) is taken as the
 * mount point. For the entry whose mount point is "/", the first '-' after it
 * must start the literal "- rootfs rootfs " for the answer to be yes.
 *
 * Returns false when the file cannot be opened or no such entry is found.
 */
static bool is_on_ramfs(void)
{
        FILE *mountinfo;
        char *buf = NULL;
        size_t buflen = 0;
        bool on_ramfs = false;

        mountinfo = fopen("/proc/self/mountinfo", "r");
        if (!mountinfo)
                return false;

        while (!on_ramfs && getline(&buf, &buflen, mountinfo) != -1) {
                char *field = buf;
                char *field_end, *sep;
                int skipped;

                /* Walk to the space that precedes the mount point field. */
                for (skipped = 0; field && skipped < 4; skipped++)
                        field = strchr(field + 1, ' ');
                if (!field)
                        continue;

                field_end = strchr(field + 1, ' ');
                if (!field_end)
                        continue;
                *field_end = '\0';

                /* Only the entry mounted on '/' is of interest. */
                if (strcmp(field + 1, "/") != 0)
                        continue;

                sep = strchr(field_end + 1, '-');
                if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
                        on_ramfs = true;
        }

        free(buf);
        fclose(mountinfo);
        return on_ramfs;
}
4480
4481 static int pivot_enter()
4482 {
4483         int ret = -1, oldroot = -1, newroot = -1;
4484
4485         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4486         if (oldroot < 0) {
4487                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4488                 return ret;
4489         }
4490
4491         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4492         if (newroot < 0) {
4493                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4494                 goto err;
4495         }
4496
4497         /* change into new root fs */
4498         if (fchdir(newroot) < 0) {
4499                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4500                 goto err;
4501         }
4502
4503         /* pivot_root into our new root fs */
4504         if (pivot_root(".", ".") < 0) {
4505                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4506                 goto err;
4507         }
4508
4509         /*
4510          * At this point the old-root is mounted on top of our new-root.
4511          * To unmounted it we must not be chdir'd into it, so escape back
4512          * to the old-root.
4513          */
4514         if (fchdir(oldroot) < 0) {
4515                 lxcfs_error("%s\n", "Failed to enter old root.");
4516                 goto err;
4517         }
4518
4519         if (umount2(".", MNT_DETACH) < 0) {
4520                 lxcfs_error("%s\n", "Failed to detach old root.");
4521                 goto err;
4522         }
4523
4524         if (fchdir(newroot) < 0) {
4525                 lxcfs_error("%s\n", "Failed to re-enter new root.");
4526                 goto err;
4527         }
4528
4529         ret = 0;
4530
4531 err:
4532         if (oldroot > 0)
4533                 close(oldroot);
4534         if (newroot > 0)
4535                 close(newroot);
4536
4537         return ret;
4538 }
4539
4540 static int chroot_enter()
4541 {
4542         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4543                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4544                 return -1;
4545         }
4546
4547         if (chroot(".") < 0) {
4548                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4549                 return -1;
4550         }
4551
4552         if (chdir("/") < 0) {
4553                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4554                 return -1;
4555         }
4556
4557         return 0;
4558 }
4559
/*
 * Enter the prepared root, picking the mechanism from the type of "/".
 *
 * chroot_enter() is used when statfs() reports RAMFS_MAGIC for "/" or when
 * is_on_ramfs() says so; otherwise pivot_enter() is used.
 *
 * Returns 0 on success and -1 on failure.
 */
static int permute_and_enter(void)
{
        struct statfs root_stat;
        bool use_chroot;

        if (statfs("/", &root_stat) < 0) {
                lxcfs_error("%s\n", "Could not stat / mountpoint.");
                return -1;
        }

        /* The magic check alone is not reliable: a ramfs that is really a
         * tmpfs will likely report TMPFS_MAGIC. So when it says no, the
         * mountinfo based check in is_on_ramfs() is consulted as well. */
        use_chroot = has_fs_type(&root_stat, RAMFS_MAGIC) || is_on_ramfs();
        if (use_chroot)
                return chroot_enter();

        if (pivot_enter() >= 0)
                return 0;

        lxcfs_error("%s\n", "Could not perform pivot root.");
        return -1;
}
4582
4583 /* Prepare our new clean root. */
4584 static int permute_prepare(void)
4585 {
4586         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4587                 lxcfs_error("%s\n", "Failed to create directory for new root.");
4588                 return -1;
4589         }
4590
4591         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4592                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4593                 return -1;
4594         }
4595
4596         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4597                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
4598                 return -1;
4599         }
4600
4601         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4602                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
4603                 return -1;
4604         }
4605
4606         return 0;
4607 }
4608
/*
 * Build the new root and switch into it.
 *
 * permute_prepare() runs first; only if it succeeds is permute_and_enter()
 * called (chroot() on ramfs, pivot_root() in all other cases).
 * Returns true only when both steps succeed.
 */
static bool permute_root(void)
{
        return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
4622
4623 static int preserve_mnt_ns(int pid)
4624 {
4625         int ret;
4626         size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
4627         char path[len];
4628
4629         ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
4630         if (ret < 0 || (size_t)ret >= len)
4631                 return -1;
4632
4633         return open(path, O_RDONLY | O_CLOEXEC);
4634 }
4635
4636 static bool cgfs_prepare_mounts(void)
4637 {
4638         if (!mkdir_p(BASEDIR, 0700)) {
4639                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
4640                 return false;
4641         }
4642
4643         if (!umount_if_mounted()) {
4644                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
4645                 return false;
4646         }
4647
4648         if (unshare(CLONE_NEWNS) < 0) {
4649                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
4650                 return false;
4651         }
4652
4653         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
4654         if (cgroup_mount_ns_fd < 0) {
4655                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
4656                 return false;
4657         }
4658
4659         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4660                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
4661                 return false;
4662         }
4663
4664         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
4665                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
4666                 return false;
4667         }
4668
4669         return true;
4670 }
4671
4672 static bool cgfs_mount_hierarchies(void)
4673 {
4674         char *target;
4675         size_t clen, len;
4676         int i, ret;
4677
4678         for (i = 0; i < num_hierarchies; i++) {
4679                 char *controller = hierarchies[i];
4680
4681                 clen = strlen(controller);
4682                 len = strlen(BASEDIR) + clen + 2;
4683                 target = malloc(len);
4684                 if (!target)
4685                         return false;
4686
4687                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4688                 if (ret < 0 || ret >= len) {
4689                         free(target);
4690                         return false;
4691                 }
4692                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4693                         free(target);
4694                         return false;
4695                 }
4696                 if (!strcmp(controller, "unified"))
4697                         ret = mount("none", target, "cgroup2", 0, NULL);
4698                 else
4699                         ret = mount(controller, target, "cgroup", 0, controller);
4700                 if (ret < 0) {
4701                         lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
4702                         free(target);
4703                         return false;
4704                 }
4705
4706                 fd_hierarchies[i] = open(target, O_DIRECTORY);
4707                 if (fd_hierarchies[i] < 0) {
4708                         free(target);
4709                         return false;
4710                 }
4711                 free(target);
4712         }
4713         return true;
4714 }
4715
/*
 * Run the three setup stages in order: prepare the mountpoint, mount the
 * hierarchies, then switch root. Stops at the first stage that fails.
 *
 * Returns true only when every stage succeeded.
 */
static bool cgfs_setup_controllers(void)
{
        if (cgfs_prepare_mounts() == false)
                return false;

        if (cgfs_mount_hierarchies() == false) {
                lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
                return false;
        }

        return permute_root();
}
4731
4732 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
4733 {
4734         FILE *f;
4735         char *cret, *line = NULL;
4736         char cwd[MAXPATHLEN];
4737         size_t len = 0;
4738         int i, init_ns = -1;
4739         bool found_unified = false;
4740
4741         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4742                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
4743                 return;
4744         }
4745
4746         while (getline(&line, &len, f) != -1) {
4747                 char *idx, *p, *p2;
4748
4749                 p = strchr(line, ':');
4750                 if (!p)
4751                         goto out;
4752                 idx = line;
4753                 *(p++) = '\0';
4754
4755                 p2 = strrchr(p, ':');
4756                 if (!p2)
4757                         goto out;
4758                 *p2 = '\0';
4759
4760                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4761                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4762                  * because it parses out the empty string "" and later on passes
4763                  * it to mount(). Let's skip such entries.
4764                  */
4765                 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
4766                         found_unified = true;
4767                         p = "unified";
4768                 }
4769
4770                 if (!store_hierarchy(line, p))
4771                         goto out;
4772         }
4773
4774         /* Preserve initial namespace. */
4775         init_ns = preserve_mnt_ns(getpid());
4776         if (init_ns < 0) {
4777                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
4778                 goto out;
4779         }
4780
4781         fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
4782         if (!fd_hierarchies) {
4783                 lxcfs_error("%s\n", strerror(errno));
4784                 goto out;
4785         }
4786
4787         for (i = 0; i < num_hierarchies; i++)
4788                 fd_hierarchies[i] = -1;
4789
4790         cret = getcwd(cwd, MAXPATHLEN);
4791         if (!cret)
4792                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
4793
4794         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4795          * to privately mount lxcfs cgroups. */
4796         if (!cgfs_setup_controllers()) {
4797                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
4798                 goto out;
4799         }
4800
4801         if (setns(init_ns, 0) < 0) {
4802                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
4803                 goto out;
4804         }
4805
4806         if (!cret || chdir(cwd) < 0)
4807                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
4808
4809         print_subsystems();
4810
4811 out:
4812         free(line);
4813         fclose(f);
4814         if (init_ns >= 0)
4815                 close(init_ns);
4816 }
4817
4818 static void __attribute__((destructor)) free_subsystems(void)
4819 {
4820         int i;
4821
4822         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
4823
4824         for (i = 0; i < num_hierarchies; i++) {
4825                 if (hierarchies[i])
4826                         free(hierarchies[i]);
4827                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
4828                         close(fd_hierarchies[i]);
4829         }
4830         free(hierarchies);
4831         free(fd_hierarchies);
4832
4833         if (cgroup_mount_ns_fd >= 0)
4834                 close(cgroup_mount_ns_fd);
4835 }