bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #include <dirent.h>
  12 #include <errno.h>
  13 #include <fcntl.h>
  14 #include <fuse.h>
  15 #include <libgen.h>
  16 #include <pthread.h>
  17 #include <sched.h>
  18 #include <stdbool.h>
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include <string.h>
  22 #include <time.h>
  23 #include <unistd.h>
  24 #include <wait.h>
  25 #include <linux/sched.h>
  26 #include <sys/epoll.h>
  27 #include <sys/mman.h>
  28 #include <sys/mount.h>
  29 #include <sys/param.h>
  30 #include <sys/socket.h>
  31 #include <sys/syscall.h>
  32
  33 #include "bindings.h"
  34 #include "config.h" // for VERSION
  35
  36 /* Define pivot_root() if missing from the C library */
  37 #ifndef HAVE_PIVOT_ROOT
  38 static int pivot_root(const char * new_root, const char * put_old)
  39 {
  40 #ifdef __NR_pivot_root
  41 return syscall(__NR_pivot_root, new_root, put_old);
  42 #else
  43 errno = ENOSYS;
  44 return -1;
  45 #endif
  46 }
  47 #else
  48 extern int pivot_root(const char * new_root, const char * put_old);
  49 #endif
  50
  51 #ifdef DEBUG
  52 #define lxcfs_debug(format, ...)                                               \
  53         do {                                                                   \
  54                 fprintf(stderr, "%s: %d: %s: " format, __FILE__, __LINE__,     \
  55                         __func__, __VA_ARGS__);                                \
  56         } while (false)
  57 #else
  58 #define lxcfs_debug(format, ...)
  59 #endif /* DEBUG */
  60
  61 enum {
  62         LXC_TYPE_CGDIR,
  63         LXC_TYPE_CGFILE,
  64         LXC_TYPE_PROC_MEMINFO,
  65         LXC_TYPE_PROC_CPUINFO,
  66         LXC_TYPE_PROC_UPTIME,
  67         LXC_TYPE_PROC_STAT,
  68         LXC_TYPE_PROC_DISKSTATS,
  69         LXC_TYPE_PROC_SWAPS,
  70 };
  71
  72 struct file_info {
  73         char *controller;
  74         char *cgroup;
  75         char *file;
  76         int type;
  77         char *buf;  // unused as of yet
  78         int buflen;
  79         int size; //actual data size
  80         int cached;
  81 };
  82
  83 /* reserve buffer size, for cpuall in /proc/stat */
  84 #define BUF_RESERVE_SIZE 256
  85
  86 /*
  87  * A table caching which pid is init for a pid namespace.
  88  * When looking up which pid is init for $qpid, we first
  89  * 1. Stat /proc/$qpid/ns/pid.
  90  * 2. Check whether the ino_t is in our store.
  91  *   a. if not, fork a child in qpid's ns to send us
  92  *       ucred.pid = 1, and read the initpid.  Cache
  93  *       initpid and creation time for /proc/initpid
  94  *       in a new store entry.
  95  *   b. if so, verify that /proc/initpid still matches
  96  *       what we have saved.  If not, clear the store
  97  *       entry and go back to a.  If so, return the
  98  *       cached initpid.
  99  */
 100 struct pidns_init_store {
 101         ino_t ino;          // inode number for /proc/$pid/ns/pid
 102         pid_t initpid;      // the pid of nit in that ns
 103         long int ctime;     // the time at which /proc/$initpid was created
 104         struct pidns_init_store *next;
 105         long int lastcheck;
 106 };
 107
 108 /* lol - look at how they are allocated in the kernel */
 109 #define PIDNS_HASH_SIZE 4096
 110 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 111
 112 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 113 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 114 static void lock_mutex(pthread_mutex_t *l)
 115 {
 116         int ret;
 117
 118         if ((ret = pthread_mutex_lock(l)) != 0) {
 119                 fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret));
 120                 exit(1);
 121         }
 122 }
 123
 124 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 125  * Number of hierarchies mounted. */
 126 static int num_hierarchies;
 127
 128 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 129  * Hierachies mounted {cpuset, blkio, ...}:
 130  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 131 static char **hierarchies;
 132
 133 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 134  * Open file descriptors:
 135  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 136  * private mount namespace.
 137  * Initialized via __constructor__ collect_and_mount_subsystems().
 138  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 139  * mounts and respective files in the private namespace even when located in
 140  * another namespace using the *at() family of functions
 141  * {openat(), fchownat(), ...}. */
 142 static int *fd_hierarchies;
 143
 144 static void unlock_mutex(pthread_mutex_t *l)
 145 {
 146         int ret;
 147
 148         if ((ret = pthread_mutex_unlock(l)) != 0) {
 149                 fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", ret, strerror(ret));
 150                 exit(1);
 151         }
 152 }
 153
 154 static void store_lock(void)
 155 {
 156         lock_mutex(&pidns_store_mutex);
 157 }
 158
 159 static void store_unlock(void)
 160 {
 161         unlock_mutex(&pidns_store_mutex);
 162 }
 163
 164 /* Must be called under store_lock */
 165 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 166 {
 167         struct stat initsb;
 168         char fnam[100];
 169
 170         snprintf(fnam, 100, "/proc/%d", e->initpid);
 171         if (stat(fnam, &initsb) < 0)
 172                 return false;
 173
 174         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 175                     initsb.st_ctime, e->initpid);
 176
 177         if (e->ctime != initsb.st_ctime)
 178                 return false;
 179         return true;
 180 }
 181
 182 /* Must be called under store_lock */
 183 static void remove_initpid(struct pidns_init_store *e)
 184 {
 185         struct pidns_init_store *tmp;
 186         int h;
 187
 188         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 189
 190         h = HASH(e->ino);
 191         if (pidns_hash_table[h] == e) {
 192                 pidns_hash_table[h] = e->next;
 193                 free(e);
 194                 return;
 195         }
 196
 197         tmp = pidns_hash_table[h];
 198         while (tmp) {
 199                 if (tmp->next == e) {
 200                         tmp->next = e->next;
 201                         free(e);
 202                         return;
 203                 }
 204                 tmp = tmp->next;
 205         }
 206 }
 207
 208 #define PURGE_SECS 5
 209 /* Must be called under store_lock */
 210 static void prune_initpid_store(void)
 211 {
 212         static long int last_prune = 0;
 213         struct pidns_init_store *e, *prev, *delme;
 214         long int now, threshold;
 215         int i;
 216
 217         if (!last_prune) {
 218                 last_prune = time(NULL);
 219                 return;
 220         }
 221         now = time(NULL);
 222         if (now < last_prune + PURGE_SECS)
 223                 return;
 224
 225         lxcfs_debug("%s\n", "Pruning.");
 226
 227         last_prune = now;
 228         threshold = now - 2 * PURGE_SECS;
 229
 230         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 231                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 232                         if (e->lastcheck < threshold) {
 233
 234                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 235
 236                                 delme = e;
 237                                 if (prev)
 238                                         prev->next = e->next;
 239                                 else
 240                                         pidns_hash_table[i] = e->next;
 241                                 e = e->next;
 242                                 free(delme);
 243                         } else {
 244                                 prev = e;
 245                                 e = e->next;
 246                         }
 247                 }
 248         }
 249 }
 250
 251 /* Must be called under store_lock */
 252 static void save_initpid(struct stat *sb, pid_t pid)
 253 {
 254         struct pidns_init_store *e;
 255         char fpath[100];
 256         struct stat procsb;
 257         int h;
 258
 259         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 260
 261         snprintf(fpath, 100, "/proc/%d", pid);
 262         if (stat(fpath, &procsb) < 0)
 263                 return;
 264         do {
 265                 e = malloc(sizeof(*e));
 266         } while (!e);
 267         e->ino = sb->st_ino;
 268         e->initpid = pid;
 269         e->ctime = procsb.st_ctime;
 270         h = HASH(e->ino);
 271         e->next = pidns_hash_table[h];
 272         e->lastcheck = time(NULL);
 273         pidns_hash_table[h] = e;
 274 }
 275
 276 /*
 277  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 278  * entry for the inode number and creation time.  Verify that the init pid
 279  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 280  * otherwise.
 281  * Must be called under store_lock
 282  */
 283 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 284 {
 285         int h = HASH(sb->st_ino);
 286         struct pidns_init_store *e = pidns_hash_table[h];
 287
 288         while (e) {
 289                 if (e->ino == sb->st_ino) {
 290                         if (initpid_still_valid(e, sb)) {
 291                                 e->lastcheck = time(NULL);
 292                                 return e;
 293                         }
 294                         remove_initpid(e);
 295                         return NULL;
 296                 }
 297                 e = e->next;
 298         }
 299
 300         return NULL;
 301 }
 302
 303 static int is_dir(const char *path, int fd)
 304 {
 305         struct stat statbuf;
 306         int ret = fstatat(fd, path, &statbuf, fd);
 307         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 308                 return 1;
 309         return 0;
 310 }
 311
 312 static char *must_copy_string(const char *str)
 313 {
 314         char *dup = NULL;
 315         if (!str)
 316                 return NULL;
 317         do {
 318                 dup = strdup(str);
 319         } while (!dup);
 320
 321         return dup;
 322 }
 323
 324 static inline void drop_trailing_newlines(char *s)
 325 {
 326         int l;
 327
 328         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 329                 s[l-1] = '\0';
 330 }
 331
 332 #define BATCH_SIZE 50
 333 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 334 {
 335         int newbatches = (newlen / BATCH_SIZE) + 1;
 336         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 337
 338         if (!*mem || newbatches > oldbatches) {
 339                 char *tmp;
 340                 do {
 341                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 342                 } while (!tmp);
 343                 *mem = tmp;
 344         }
 345 }
 346 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 347 {
 348         size_t newlen = *len + linelen;
 349         dorealloc(contents, *len, newlen + 1);
 350         memcpy(*contents + *len, line, linelen+1);
 351         *len = newlen;
 352 }
 353
 354 static char *slurp_file(const char *from, int fd)
 355 {
 356         char *line = NULL;
 357         char *contents = NULL;
 358         FILE *f = fdopen(fd, "r");
 359         size_t len = 0, fulllen = 0;
 360         ssize_t linelen;
 361
 362         if (!f)
 363                 return NULL;
 364
 365         while ((linelen = getline(&line, &len, f)) != -1) {
 366                 append_line(&contents, &fulllen, line, linelen);
 367         }
 368         fclose(f);
 369
 370         if (contents)
 371                 drop_trailing_newlines(contents);
 372         free(line);
 373         return contents;
 374 }
 375
 376 static bool write_string(const char *fnam, const char *string, int fd)
 377 {
 378         FILE *f;
 379         size_t len, ret;
 380
 381         if (!(f = fdopen(fd, "w")))
 382                 return false;
 383         len = strlen(string);
 384         ret = fwrite(string, 1, len, f);
 385         if (ret != len) {
 386                 fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
 387                 fclose(f);
 388                 return false;
 389         }
 390         if (fclose(f) < 0) {
 391                 fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
 392                 return false;
 393         }
 394         return true;
 395 }
 396
 397 struct cgfs_files {
 398         char *name;
 399         uint32_t uid, gid;
 400         uint32_t mode;
 401 };
 402
 403 #define ALLOC_NUM 20
 404 static bool store_hierarchy(char *stridx, char *h)
 405 {
 406         if (num_hierarchies % ALLOC_NUM == 0) {
 407                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 408                 n *= ALLOC_NUM;
 409                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 410                 if (!tmp) {
 411                         fprintf(stderr, "Out of memory\n");
 412                         exit(1);
 413                 }
 414                 hierarchies = tmp;
 415         }
 416
 417         hierarchies[num_hierarchies++] = must_copy_string(h);
 418         return true;
 419 }
 420
 421 static void print_subsystems(void)
 422 {
 423         int i;
 424
 425         fprintf(stderr, "hierarchies:\n");
 426         for (i = 0; i < num_hierarchies; i++) {
 427                 if (hierarchies[i])
 428                         fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
 429         }
 430 }
 431
 432 static bool in_comma_list(const char *needle, const char *haystack)
 433 {
 434         const char *s = haystack, *e;
 435         size_t nlen = strlen(needle);
 436
 437         while (*s && (e = strchr(s, ','))) {
 438                 if (nlen != e - s) {
 439                         s = e + 1;
 440                         continue;
 441                 }
 442                 if (strncmp(needle, s, nlen) == 0)
 443                         return true;
 444                 s = e + 1;
 445         }
 446         if (strcmp(needle, s) == 0)
 447                 return true;
 448         return false;
 449 }
 450
 451 /* do we need to do any massaging here?  I'm not sure... */
 452 /* Return the mounted controller and store the corresponding open file descriptor
 453  * referring to the controller mountpoint in the private lxcfs namespace in
 454  * @cfd.
 455  */
 456 static char *find_mounted_controller(const char *controller, int *cfd)
 457 {
 458         int i;
 459
 460         for (i = 0; i < num_hierarchies; i++) {
 461                 if (!hierarchies[i])
 462                         continue;
 463                 if (strcmp(hierarchies[i], controller) == 0) {
 464                         *cfd = fd_hierarchies[i];
 465                         return hierarchies[i];
 466                 }
 467                 if (in_comma_list(controller, hierarchies[i])) {
 468                         *cfd = fd_hierarchies[i];
 469                         return hierarchies[i];
 470                 }
 471         }
 472
 473         return NULL;
 474 }
 475
 476 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 477                 const char *value)
 478 {
 479         int ret, fd, cfd;
 480         size_t len;
 481         char *fnam, *tmpc;
 482
 483         tmpc = find_mounted_controller(controller, &cfd);
 484         if (!tmpc)
 485                 return false;
 486
 487         /* Make sure we pass a relative path to *at() family of functions.
 488          * . + /cgroup + / + file + \0
 489          */
 490         len = strlen(cgroup) + strlen(file) + 3;
 491         fnam = alloca(len);
 492         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 493         if (ret < 0 || (size_t)ret >= len)
 494                 return false;
 495
 496         fd = openat(cfd, fnam, O_WRONLY);
 497         if (fd < 0)
 498                 return false;
 499
 500         return write_string(fnam, value, fd);
 501 }
 502
 503 // Chown all the files in the cgroup directory.  We do this when we create
 504 // a cgroup on behalf of a user.
 505 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 506 {
 507         struct dirent *direntp;
 508         char path[MAXPATHLEN];
 509         size_t len;
 510         DIR *d;
 511         int fd1, ret;
 512
 513         len = strlen(dirname);
 514         if (len >= MAXPATHLEN) {
 515                 fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname);
 516                 return;
 517         }
 518
 519         fd1 = openat(fd, dirname, O_DIRECTORY);
 520         if (fd1 < 0)
 521                 return;
 522
 523         d = fdopendir(fd1);
 524         if (!d) {
 525                 fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname);
 526                 return;
 527         }
 528
 529         while ((direntp = readdir(d))) {
 530                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 531                         continue;
 532                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 533                 if (ret < 0 || ret >= MAXPATHLEN) {
 534                         fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname);
 535                         continue;
 536                 }
 537                 if (fchownat(fd, path, uid, gid, 0) < 0)
 538                         fprintf(stderr, "Failed to chown file %s to %u:%u", path, uid, gid);
 539         }
 540         closedir(d);
 541 }
 542
 543 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 544 {
 545         int cfd;
 546         size_t len;
 547         char *dirnam, *tmpc;
 548
 549         tmpc = find_mounted_controller(controller, &cfd);
 550         if (!tmpc)
 551                 return -EINVAL;
 552
 553         /* Make sure we pass a relative path to *at() family of functions.
 554          * . + /cg + \0
 555          */
 556         len = strlen(cg) + 2;
 557         dirnam = alloca(len);
 558         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 559
 560         if (mkdirat(cfd, dirnam, 0755) < 0)
 561                 return -errno;
 562
 563         if (uid == 0 && gid == 0)
 564                 return 0;
 565
 566         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 567                 return -errno;
 568
 569         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 570
 571         return 0;
 572 }
 573
 574 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 575 {
 576         struct dirent *direntp;
 577         DIR *dir;
 578         bool ret = false;
 579         char pathname[MAXPATHLEN];
 580         int dupfd;
 581
 582         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 583         if (dupfd < 0)
 584                 return false;
 585
 586         dir = fdopendir(dupfd);
 587         if (!dir) {
 588                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 589                 close(dupfd);
 590                 return false;
 591         }
 592
 593         while ((direntp = readdir(dir))) {
 594                 struct stat mystat;
 595                 int rc;
 596
 597                 if (!strcmp(direntp->d_name, ".") ||
 598                     !strcmp(direntp->d_name, ".."))
 599                         continue;
 600
 601                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 602                 if (rc < 0 || rc >= MAXPATHLEN) {
 603                         fprintf(stderr, "pathname too long\n");
 604                         continue;
 605                 }
 606
 607                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 608                 if (rc) {
 609                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 610                         continue;
 611                 }
 612                 if (S_ISDIR(mystat.st_mode))
 613                         if (!recursive_rmdir(pathname, fd, cfd))
 614                                 lxcfs_debug("Error removing %s.\n", pathname);
 615         }
 616
 617         ret = true;
 618         if (closedir(dir) < 0) {
 619                 fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno));
 620                 ret = false;
 621         }
 622
 623         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 624                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 625                 ret = false;
 626         }
 627
 628         close(dupfd);
 629
 630         return ret;
 631 }
 632
 633 bool cgfs_remove(const char *controller, const char *cg)
 634 {
 635         int fd, cfd;
 636         size_t len;
 637         char *dirnam, *tmpc;
 638         bool bret;
 639
 640         tmpc = find_mounted_controller(controller, &cfd);
 641         if (!tmpc)
 642                 return false;
 643
 644         /* Make sure we pass a relative path to *at() family of functions.
 645          * . +  /cg + \0
 646          */
 647         len = strlen(cg) + 2;
 648         dirnam = alloca(len);
 649         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 650
 651         fd = openat(cfd, dirnam, O_DIRECTORY);
 652         if (fd < 0)
 653                 return false;
 654
 655         bret = recursive_rmdir(dirnam, fd, cfd);
 656         close(fd);
 657         return bret;
 658 }
 659
 660 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 661 {
 662         int cfd;
 663         size_t len;
 664         char *pathname, *tmpc;
 665
 666         tmpc = find_mounted_controller(controller, &cfd);
 667         if (!tmpc)
 668                 return false;
 669
 670         /* Make sure we pass a relative path to *at() family of functions.
 671          * . + /file + \0
 672          */
 673         len = strlen(file) + 2;
 674         pathname = alloca(len);
 675         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 676         if (fchmodat(cfd, pathname, mode, 0) < 0)
 677                 return false;
 678         return true;
 679 }
 680
 681 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 682 {
 683         size_t len;
 684         char *fname;
 685
 686         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 687         fname = alloca(len);
 688         snprintf(fname, len, "%s/tasks", dirname);
 689         if (fchownat(fd, fname, uid, gid, 0) != 0)
 690                 return -errno;
 691         snprintf(fname, len, "%s/cgroup.procs", dirname);
 692         if (fchownat(fd, fname, uid, gid, 0) != 0)
 693                 return -errno;
 694         return 0;
 695 }
 696
 697 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 698 {
 699         int cfd;
 700         size_t len;
 701         char *pathname, *tmpc;
 702
 703         tmpc = find_mounted_controller(controller, &cfd);
 704         if (!tmpc)
 705                 return -EINVAL;
 706
 707         /* Make sure we pass a relative path to *at() family of functions.
 708          * . + /file + \0
 709          */
 710         len = strlen(file) + 2;
 711         pathname = alloca(len);
 712         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 713         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 714                 return -errno;
 715
 716         if (is_dir(pathname, cfd))
 717                 // like cgmanager did, we want to chown the tasks file as well
 718                 return chown_tasks_files(pathname, uid, gid, cfd);
 719
 720         return 0;
 721 }
 722
 723 FILE *open_pids_file(const char *controller, const char *cgroup)
 724 {
 725         int fd, cfd;
 726         size_t len;
 727         char *pathname, *tmpc;
 728
 729         tmpc = find_mounted_controller(controller, &cfd);
 730         if (!tmpc)
 731                 return NULL;
 732
 733         /* Make sure we pass a relative path to *at() family of functions.
 734          * . + /cgroup + / "cgroup.procs" + \0
 735          */
 736         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 737         pathname = alloca(len);
 738         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 739
 740         fd = openat(cfd, pathname, O_WRONLY);
 741         if (fd < 0)
 742                 return NULL;
 743
 744         return fdopen(fd, "w");
 745 }
 746
 747 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 748                                 void ***list, size_t typesize,
 749                                 void* (*iterator)(const char*, const char*, const char*))
 750 {
 751         int cfd, fd, ret;
 752         size_t len;
 753         char *cg, *tmpc;
 754         char pathname[MAXPATHLEN];
 755         size_t sz = 0, asz = 0;
 756         struct dirent *dirent;
 757         DIR *dir;
 758
 759         tmpc = find_mounted_controller(controller, &cfd);
 760         *list = NULL;
 761         if (!tmpc)
 762                 return false;
 763
 764         /* Make sure we pass a relative path to *at() family of functions. */
 765         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 766         cg = alloca(len);
 767         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 768         if (ret < 0 || (size_t)ret >= len) {
 769                 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup);
 770                 return false;
 771         }
 772
 773         fd = openat(cfd, cg, O_DIRECTORY);
 774         if (fd < 0)
 775                 return false;
 776
 777         dir = fdopendir(fd);
 778         if (!dir)
 779                 return false;
 780
 781         while ((dirent = readdir(dir))) {
 782                 struct stat mystat;
 783
 784                 if (!strcmp(dirent->d_name, ".") ||
 785                     !strcmp(dirent->d_name, ".."))
 786                         continue;
 787
 788                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 789                 if (ret < 0 || ret >= MAXPATHLEN) {
 790                         fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg);
 791                         continue;
 792                 }
 793
 794                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 795                 if (ret) {
 796                         fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
 797                         continue;
 798                 }
 799                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
 800                     (directories && !S_ISDIR(mystat.st_mode)))
 801                         continue;
 802
 803                 if (sz+2 >= asz) {
 804                         void **tmp;
 805                         asz += BATCH_SIZE;
 806                         do {
 807                                 tmp = realloc(*list, asz * typesize);
 808                         } while  (!tmp);
 809                         *list = tmp;
 810                 }
 811                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
 812                 (*list)[sz+1] = NULL;
 813                 sz++;
 814         }
 815         if (closedir(dir) < 0) {
 816                 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno));
 817                 return false;
 818         }
 819         return true;
 820 }
 821
 822 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 823 {
 824         char *dup;
 825         do {
 826                 dup = strdup(dir_entry);
 827         } while (!dup);
 828         return dup;
 829 }
 830
 831 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
 832 {
 833         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
 834 }
 835
 836 void free_key(struct cgfs_files *k)
 837 {
 838         if (!k)
 839                 return;
 840         free(k->name);
 841         free(k);
 842 }
 843
 844 void free_keys(struct cgfs_files **keys)
 845 {
 846         int i;
 847
 848         if (!keys)
 849                 return;
 850         for (i = 0; keys[i]; i++) {
 851                 free_key(keys[i]);
 852         }
 853         free(keys);
 854 }
 855
 856 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
 857 {
 858         int ret, fd, cfd;
 859         size_t len;
 860         char *fnam, *tmpc;
 861
 862         tmpc = find_mounted_controller(controller, &cfd);
 863         if (!tmpc)
 864                 return false;
 865
 866         /* Make sure we pass a relative path to *at() family of functions.
 867          * . + /cgroup + / + file + \0
 868          */
 869         len = strlen(cgroup) + strlen(file) + 3;
 870         fnam = alloca(len);
 871         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 872         if (ret < 0 || (size_t)ret >= len)
 873                 return NULL;
 874
 875         fd = openat(cfd, fnam, O_RDONLY);
 876         if (fd < 0)
 877                 return NULL;
 878
 879         *value = slurp_file(fnam, fd);
 880         return *value != NULL;
 881 }
 882
 883 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
 884 {
 885         int ret, cfd;
 886         size_t len;
 887         char *fnam, *tmpc;
 888         struct stat sb;
 889         struct cgfs_files *newkey;
 890
 891         tmpc = find_mounted_controller(controller, &cfd);
 892         if (!tmpc)
 893                 return false;
 894
 895         if (file && *file == '/')
 896                 file++;
 897
 898         if (file && strchr(file, '/'))
 899                 return NULL;
 900
 901         /* Make sure we pass a relative path to *at() family of functions.
 902          * . + /cgroup + / + file + \0
 903          */
 904         len = strlen(cgroup) + 3;
 905         if (file)
 906                 len += strlen(file) + 1;
 907         fnam = alloca(len);
 908         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
 909                  file ? "/" : "", file ? file : "");
 910
 911         ret = fstatat(cfd, fnam, &sb, 0);
 912         if (ret < 0)
 913                 return NULL;
 914
 915         do {
 916                 newkey = malloc(sizeof(struct cgfs_files));
 917         } while (!newkey);
 918         if (file)
 919                 newkey->name = must_copy_string(file);
 920         else if (strrchr(cgroup, '/'))
 921                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
 922         else
 923                 newkey->name = must_copy_string(cgroup);
 924         newkey->uid = sb.st_uid;
 925         newkey->gid = sb.st_gid;
 926         newkey->mode = sb.st_mode;
 927
 928         return newkey;
 929 }
 930
 931 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 932 {
 933         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
 934         if (!entry) {
 935                 fprintf(stderr, "%s: Error getting files under %s:%s\n",
 936                         __func__, controller, cgroup);
 937         }
 938         return entry;
 939 }
 940
 941 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
 942 {
 943         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
 944 }
 945
 946 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 947 {
 948         int cfd;
 949         size_t len;
 950         char *fnam, *tmpc;
 951         int ret;
 952         struct stat sb;
 953
 954         tmpc = find_mounted_controller(controller, &cfd);
 955         if (!tmpc)
 956                 return false;
 957
 958         /* Make sure we pass a relative path to *at() family of functions.
 959          * . + /cgroup + / + f + \0
 960          */
 961         len = strlen(cgroup) + strlen(f) + 3;
 962         fnam = alloca(len);
 963         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
 964         if (ret < 0 || (size_t)ret >= len)
 965                 return false;
 966
 967         ret = fstatat(cfd, fnam, &sb, 0);
 968         if (ret < 0 || !S_ISDIR(sb.st_mode))
 969                 return false;
 970
 971         return true;
 972 }
 973
/*
 * Result codes for send_creds(); the code below only compares its return
 * value against SEND_CREDS_OK.
 * NOTE(review): the exact meaning of NOTSK and FAIL is defined by
 * send_creds(), which is not visible here - confirm there.
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations of static helpers that are called further down
 * before their definitions appear. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
 981
 982 /*
 983  * clone a task which switches to @task's namespace and writes '1'.
 984  * over a unix sock so we can read the task's reaper's pid in our
 985  * namespace
 986  *
 987  * Note: glibc's fork() does not respect pidns, which can lead to failed
 988  * assertions inside glibc (and thus failed forks) if the child's pid in
 989  * the pidns and the parent pid outside are identical. Using clone prevents
 990  * this issue.
 991  */
 992 static void write_task_init_pid_exit(int sock, pid_t target)
 993 {
 994         char fnam[100];
 995         pid_t pid;
 996         int fd, ret;
 997         size_t stack_size = sysconf(_SC_PAGESIZE);
 998         void *stack = alloca(stack_size);
 999
1000         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1001         if (ret < 0 || ret >= sizeof(fnam))
1002                 _exit(1);
1003
1004         fd = open(fnam, O_RDONLY);
1005         if (fd < 0) {
1006                 perror("write_task_init_pid_exit open of ns/pid");
1007                 _exit(1);
1008         }
1009         if (setns(fd, 0)) {
1010                 perror("write_task_init_pid_exit setns 1");
1011                 close(fd);
1012                 _exit(1);
1013         }
1014         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1015         if (pid < 0)
1016                 _exit(1);
1017         if (pid != 0) {
1018                 if (!wait_for_pid(pid))
1019                         _exit(1);
1020                 _exit(0);
1021         }
1022 }
1023
1024 static int send_creds_clone_wrapper(void *arg) {
1025         struct ucred cred;
1026         char v;
1027         int sock = *(int *)arg;
1028
1029         /* we are the child */
1030         cred.uid = 0;
1031         cred.gid = 0;
1032         cred.pid = 1;
1033         v = '1';
1034         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1035                 return 1;
1036         return 0;
1037 }
1038
/*
 * Determine the pid, as seen from our namespace, of the init task of
 * @task's pid namespace.
 *
 * A child is forked which runs write_task_init_pid_exit() on one end of a
 * datagram socketpair; the parent reads the credentials from the other end
 * with recv_creds() and takes the pid from them.
 *
 * Returns that pid, or -1 if the socketpair, the fork or the receive
 * failed.  The child's exit status is collected but not inspected.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
        int sock[2];
        pid_t child;
        pid_t initpid = -1;
        char v = '0';
        struct ucred cred;

        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                return -1;
        }

        child = fork();
        if (child == 0) {
                /* child: keep only our end, never return to the caller */
                close(sock[1]);
                write_task_init_pid_exit(sock[0], task);
                _exit(0);
        }

        if (child > 0 && recv_creds(sock[1], &cred, &v))
                initpid = cred.pid;

        close(sock[0]);
        close(sock[1]);
        if (child > 0)
                wait_for_pid(child);
        return initpid;
}
1072
1073 static pid_t lookup_initpid_in_store(pid_t qpid)
1074 {
1075         pid_t answer = 0;
1076         struct stat sb;
1077         struct pidns_init_store *e;
1078         char fnam[100];
1079
1080         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1081         store_lock();
1082         if (stat(fnam, &sb) < 0)
1083                 goto out;
1084         e = lookup_verify_initpid(&sb);
1085         if (e) {
1086                 answer = e->initpid;
1087                 goto out;
1088         }
1089         answer = get_init_pid_for_task(qpid);
1090         if (answer > 0)
1091                 save_initpid(&sb, answer);
1092
1093 out:
1094         /* we prune at end in case we are returning
1095          * the value we were about to return */
1096         prune_initpid_store();
1097         store_unlock();
1098         return answer;
1099 }
1100
/*
 * Reap child @pid, retrying when waitpid() is interrupted by a signal.
 * Returns 0 if the child exited normally with status 0.
 * Returns -1 if @pid is not positive, waitpid() fails for a reason other
 * than EINTR, or the child did not exit cleanly with status 0.
 */
static int wait_for_pid(pid_t pid)
{
        int status;
        pid_t reaped;

        if (pid <= 0)
                return -1;

        for (;;) {
                reaped = waitpid(pid, &status, 0);
                if (reaped == pid)
                        break;
                if (reaped == -1 && errno != EINTR)
                        return -1;
        }

        return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
1121
1122
1123 /*
1124  * append pid to *src.
1125  * src: a pointer to a char* in which ot append the pid.
1126  * sz: the number of characters printed so far, minus trailing \0.
1127  * asz: the allocated size so far
1128  * pid: the pid to append
1129  */
1130 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1131 {
1132         char tmp[30];
1133
1134         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1135
1136         if (!*src || tmplen + *sz + 1 >= *asz) {
1137                 char *tmp;
1138                 do {
1139                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1140                 } while (!tmp);
1141                 *src = tmp;
1142                 *asz += BUF_RESERVE_SIZE;
1143         }
1144         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1145         *sz += tmplen;
1146 }
1147
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not parse as
 * three unsigned numbers are skipped.  The first range containing @in_id
 * wins.
 *
 * Returns the mapped id, or -1 (as unsigned) if no range contains @in_id
 * or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
        unsigned int ns_base,   /* first id of the range inside the namespace */
                     host_base, /* first id of the range in the caller's namespace */
                     range;     /* number of ids in the range */
        char line[400];

        fseek(idfile, 0L, SEEK_SET);
        while (fgets(line, sizeof(line), idfile) != NULL) {
                if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
                        continue;

                if (host_base + range < host_base || ns_base + range < ns_base) {
                        /* ids wrapped around - unexpected from a procfile,
                         * so give up */
                        fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
                                ns_base, host_base, range, line);
                        return -1;
                }

                if (in_id < host_base || in_id >= host_base + range)
                        continue;

                /* host_base <= in_id < host_base + range and neither range
                 * wraps, so this sum cannot wrap either */
                return ns_base + (in_id - host_base);
        }

        /* no range matched */
        return -1;
}
1191
/*
 * Values for the req_ns_root argument of is_privileged_over():
 * specify whether we require the calling uid to be root in its own
 * namespace (NS_ROOT_REQD) or whether owning the same uid is also
 * sufficient (NS_ROOT_OPT).
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the stack buffers used below to build /proc/<pid>/... paths. */
#define PROCLEN 100
1201
1202 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1203 {
1204         char fpath[PROCLEN];
1205         int ret;
1206         bool answer = false;
1207         uid_t nsuid;
1208
1209         if (victim == -1 || uid == -1)
1210                 return false;
1211
1212         /*
1213          * If the request is one not requiring root in the namespace,
1214          * then having the same uid suffices.  (i.e. uid 1000 has write
1215          * access to files owned by uid 1000
1216          */
1217         if (!req_ns_root && uid == victim)
1218                 return true;
1219
1220         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1221         if (ret < 0 || ret >= PROCLEN)
1222                 return false;
1223         FILE *f = fopen(fpath, "r");
1224         if (!f)
1225                 return false;
1226
1227         /* if caller's not root in his namespace, reject */
1228         nsuid = convert_id_to_ns(f, uid);
1229         if (nsuid)
1230                 goto out;
1231
1232         /*
1233          * If victim is not mapped into caller's ns, reject.
1234          * XXX I'm not sure this check is needed given that fuse
1235          * will be sending requests where the vfs has converted
1236          */
1237         nsuid = convert_id_to_ns(f, victim);
1238         if (nsuid == -1)
1239                 goto out;
1240
1241         answer = true;
1242
1243 out:
1244         fclose(f);
1245         return answer;
1246 }
1247
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the open(2)-style access mode in @req_mode.
 *
 * Callers shift the owner or group bits down into the "other" position
 * before calling.  Returns false for an access mode that is none of
 * O_RDONLY, O_WRONLY or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
        mode_t needed;
        int access = req_mode & O_ACCMODE;

        if (access == O_RDONLY)
                needed = S_IROTH;
        else if (access == O_WRONLY)
                needed = S_IWOTH;
        else if (access == O_RDWR)
                needed = S_IROTH | S_IWOTH;
        else
                return false;

        return (fmode & needed) == needed;
}
1267
1268
/*
 * Return the path component of @taskcg that follows the prefix @querycg.
 *
 * taskcg is  a/b/c/d/e
 * querycg is a/b/c
 * we return 'd'
 *
 * When @querycg is "/" or "./" only the first character of @taskcg is
 * skipped.  @taskcg must be strictly longer than @querycg, otherwise an
 * error is printed and NULL is returned.  The contents of the skipped
 * prefix are not compared against @querycg.
 *
 * The result is newly allocated and must be freed by the caller; NULL is
 * also returned if the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
        size_t skip;
        char *next, *slash;

        if (strlen(taskcg) <= strlen(querycg)) {
                fprintf(stderr, "%s: I was fed bad input\n", __func__);
                return NULL;
        }

        if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
                skip = 1;
        else
                skip = strlen(querycg) + 1;

        next = strdup(taskcg + skip);
        if (!next)
                return NULL;

        /* keep only the first component of what remains */
        slash = strchr(next, '/');
        if (slash)
                *slash = '\0';
        return next;
}
1294
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
        char *end = x + strlen(x);

        if (end != x && end[-1] == '\n')
                end[-1] = '\0';
}
1301
1302 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1303 {
1304         int cfd;
1305         char fnam[PROCLEN];
1306         FILE *f;
1307         char *answer = NULL;
1308         char *line = NULL;
1309         size_t len = 0;
1310         int ret;
1311         const char *h = find_mounted_controller(contrl, &cfd);
1312         if (!h)
1313                 return NULL;
1314
1315         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1316         if (ret < 0 || ret >= PROCLEN)
1317                 return NULL;
1318         if (!(f = fopen(fnam, "r")))
1319                 return NULL;
1320
1321         while (getline(&line, &len, f) != -1) {
1322                 char *c1, *c2;
1323                 if (!line[0])
1324                         continue;
1325                 c1 = strchr(line, ':');
1326                 if (!c1)
1327                         goto out;
1328                 c1++;
1329                 c2 = strchr(c1, ':');
1330                 if (!c2)
1331                         goto out;
1332                 *c2 = '\0';
1333                 if (strcmp(c1, h) != 0)
1334                         continue;
1335                 c2++;
1336                 stripnewline(c2);
1337                 do {
1338                         answer = strdup(c2);
1339                 } while (!answer);
1340                 break;
1341         }
1342
1343 out:
1344         fclose(f);
1345         free(line);
1346         return answer;
1347 }
1348
1349 /*
1350  * check whether a fuse context may access a cgroup dir or file
1351  *
1352  * If file is not null, it is a cgroup file to check under cg.
1353  * If file is null, then we are checking perms on cg itself.
1354  *
1355  * For files we can check the mode of the list_keys result.
1356  * For cgroups, we must make assumptions based on the files under the
1357  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1358  * yet.
1359  */
1360 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1361 {
1362         struct cgfs_files *k = NULL;
1363         bool ret = false;
1364
1365         k = cgfs_get_key(contrl, cg, file);
1366         if (!k)
1367                 return false;
1368
1369         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1370                 if (perms_include(k->mode >> 6, mode)) {
1371                         ret = true;
1372                         goto out;
1373                 }
1374         }
1375         if (fc->gid == k->gid) {
1376                 if (perms_include(k->mode >> 3, mode)) {
1377                         ret = true;
1378                         goto out;
1379                 }
1380         }
1381         ret = perms_include(k->mode, mode);
1382
1383 out:
1384         free_key(k);
1385         return ret;
1386 }
1387
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from the cgroup path @cg in place.
 * If the whole path is "/init.scope" it becomes "/", so the result is never
 * empty.  Paths without that suffix are left untouched.
 */
static void prune_init_slice(char *cg)
{
        size_t cglen = strlen(cg);
        size_t suffixlen = strlen(INITSCOPE);
        char *tail;

        if (cglen < suffixlen)
                return;

        tail = cg + (cglen - suffixlen);
        if (strcmp(tail, INITSCOPE) != 0)
                return;

        if (tail == cg)
                tail[1] = '\0';   /* keep the leading '/' */
        else
                tail[0] = '\0';
}
1405
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true if the cgroup of @pid (after prune_init_slice()) is a string
 * prefix of @cg.  If the answer is false and @nextcg is not NULL, *@nextcg
 * is set to the result of get_next_cgroup_dir(), i.e. the next cgroup
 * directory under @cg on the way to the task's cgroup (or NULL); it must be
 * freed by the caller.  *@nextcg is not written in any other case.
 *
 * Returns false if the cgroup of @pid cannot be determined.
 *
 * NOTE(review): the comparison is a plain prefix match without a check for
 * a '/' boundary, so a task in "a/b" also matches cg "a/bc" - confirm
 * whether that is intended.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
        bool in_ancestor;
        const char *task_rel;
        char *task_cg = get_pid_cgroup(pid, contrl);

        if (!task_cg)
                return false;
        prune_init_slice(task_cg);

        /*
         * Callers pass in '/' or './' (openat()) for the root cgroup,
         * otherwise they pass in a cgroup without leading '/'.  In the
         * latter case the first character of the task's cgroup is skipped
         * (presumably its leading '/') so that both strings have the same
         * form.
         */
        if (cg[0] == '/' || strncmp(cg, "./", 2) == 0)
                task_rel = task_cg;
        else
                task_rel = task_cg + 1;

        in_ancestor = strncmp(task_rel, cg, strlen(task_rel)) == 0;
        if (!in_ancestor && nextcg)
                *nextcg = get_next_cgroup_dir(task_rel, cg);

        free(task_cg);
        return in_ancestor;
}
1448
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * Returns true if @cg is the root ("/" or "./"), if the task is in the root
 * cgroup, or if @cg is equal to, an ancestor of, or a descendant of the
 * task's cgroup (compared on whole path components).  Returns false
 * otherwise, or if the cgroup of @pid cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
        bool visible;
        char *raw_cg;
        const char *task_cg;
        size_t target_len, task_len;

        if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
                return true;

        raw_cg = get_pid_cgroup(pid, contrl);
        if (!raw_cg)
                return false;
        prune_init_slice(raw_cg);

        /* skip the first character so the task path has the same form as cg */
        task_cg = raw_cg + 1;
        target_len = strlen(cg);
        task_len = strlen(task_cg);

        if (task_len == 0) {
                /* Task is in the root cg, it can see everything.  The
                 * comparisons below test for a separating '/', which for the
                 * root was chopped off above. */
                visible = true;
        } else if (strcmp(cg, task_cg) == 0) {
                visible = true;
        } else if (target_len < task_len) {
                /* looking up a parent dir */
                visible = strncmp(task_cg, cg, target_len) == 0 &&
                          task_cg[target_len] == '/';
        } else if (target_len > task_len) {
                /* looking up a child dir */
                visible = strncmp(task_cg, cg, task_len) == 0 &&
                          cg[task_len] == '/';
        } else {
                /* same length but different path */
                visible = false;
        }

        free(raw_cg);
        return visible;
}
1499
1500 /*
1501  * given /cgroup/freezer/a/b, return "freezer".
1502  * the returned char* should NOT be freed.
1503  */
1504 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1505 {
1506         const char *p1;
1507         char *contr, *slash;
1508
1509         if (strlen(path) < 9) {
1510                 errno = EINVAL;
1511                 return NULL;
1512         }
1513         if (*(path + 7) != '/') {
1514                 errno = EINVAL;
1515                 return NULL;
1516         }
1517         p1 = path + 8;
1518         contr = strdupa(p1);
1519         if (!contr) {
1520                 errno = ENOMEM;
1521                 return NULL;
1522         }
1523         slash = strstr(contr, "/");
1524         if (slash)
1525                 *slash = '\0';
1526
1527         int i;
1528         for (i = 0; i < num_hierarchies; i++) {
1529                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1530                         return hierarchies[i];
1531         }
1532         errno = ENOENT;
1533         return NULL;
1534 }
1535
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, or NULL if @path is shorter than 9 characters or has no such
 * '/'.  Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
        const char *sep;

        if (strlen(path) < 9)
                return NULL;

        sep = strchr(path + 8, '/');
        return sep ? sep + 1 : NULL;
}
1551
/*
 * Split the last path element from the path in @cg.
 *
 * *@dir receives a newly allocated copy of @cg, truncated at its last '/'
 * when there is one; it should be freed by the caller.
 * *@last points at the last '/' inside @cg itself (so it includes that
 * slash) and must not be freed; it is NULL when @cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
        char *copy;

        /* the copy is retried until it succeeds */
        while (!(copy = strdup(cg)))
                ;
        *dir = copy;

        *last = strrchr(cg, '/');
        if (*last == NULL)
                return;

        *strrchr(copy, '/') = '\0';
}
1571
1572 /*
1573  * FUSE ops for /cgroup
1574  */
1575
1576 int cg_getattr(const char *path, struct stat *sb)
1577 {
1578         struct timespec now;
1579         struct fuse_context *fc = fuse_get_context();
1580         char * cgdir = NULL;
1581         char *last = NULL, *path1, *path2;
1582         struct cgfs_files *k = NULL;
1583         const char *cgroup;
1584         const char *controller = NULL;
1585         int ret = -ENOENT;
1586
1587
1588         if (!fc)
1589                 return -EIO;
1590
1591         memset(sb, 0, sizeof(struct stat));
1592
1593         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1594                 return -EINVAL;
1595
1596         sb->st_uid = sb->st_gid = 0;
1597         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1598         sb->st_size = 0;
1599
1600         if (strcmp(path, "/cgroup") == 0) {
1601                 sb->st_mode = S_IFDIR | 00755;
1602                 sb->st_nlink = 2;
1603                 return 0;
1604         }
1605
1606         controller = pick_controller_from_path(fc, path);
1607         if (!controller)
1608                 return -errno;
1609         cgroup = find_cgroup_in_path(path);
1610         if (!cgroup) {
1611                 /* this is just /cgroup/controller, return it as a dir */
1612                 sb->st_mode = S_IFDIR | 00755;
1613                 sb->st_nlink = 2;
1614                 return 0;
1615         }
1616
1617         get_cgdir_and_path(cgroup, &cgdir, &last);
1618
1619         if (!last) {
1620                 path1 = "/";
1621                 path2 = cgdir;
1622         } else {
1623                 path1 = cgdir;
1624                 path2 = last;
1625         }
1626
1627         pid_t initpid = lookup_initpid_in_store(fc->pid);
1628         if (initpid <= 0)
1629                 initpid = fc->pid;
1630         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1631          * Then check that caller's cgroup is under path if last is a child
1632          * cgroup, or cgdir if last is a file */
1633
1634         if (is_child_cgroup(controller, path1, path2)) {
1635                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1636                         ret = -ENOENT;
1637                         goto out;
1638                 }
1639                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1640                         /* this is just /cgroup/controller, return it as a dir */
1641                         sb->st_mode = S_IFDIR | 00555;
1642                         sb->st_nlink = 2;
1643                         ret = 0;
1644                         goto out;
1645                 }
1646                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1647                         ret = -EACCES;
1648                         goto out;
1649                 }
1650
1651                 // get uid, gid, from '/tasks' file and make up a mode
1652                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1653                 sb->st_mode = S_IFDIR | 00755;
1654                 k = cgfs_get_key(controller, cgroup, NULL);
1655                 if (!k) {
1656                         sb->st_uid = sb->st_gid = 0;
1657                 } else {
1658                         sb->st_uid = k->uid;
1659                         sb->st_gid = k->gid;
1660                 }
1661                 free_key(k);
1662                 sb->st_nlink = 2;
1663                 ret = 0;
1664                 goto out;
1665         }
1666
1667         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1668                 sb->st_mode = S_IFREG | k->mode;
1669                 sb->st_nlink = 1;
1670                 sb->st_uid = k->uid;
1671                 sb->st_gid = k->gid;
1672                 sb->st_size = 0;
1673                 free_key(k);
1674                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1675                         ret = -ENOENT;
1676                         goto out;
1677                 }
1678                 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1679                         ret = -EACCES;
1680                         goto out;
1681                 }
1682
1683                 ret = 0;
1684         }
1685
1686 out:
1687         free(cgdir);
1688         return ret;
1689 }
1690
1691 int cg_opendir(const char *path, struct fuse_file_info *fi)
1692 {
1693         struct fuse_context *fc = fuse_get_context();
1694         const char *cgroup;
1695         struct file_info *dir_info;
1696         char *controller = NULL;
1697
1698         if (!fc)
1699                 return -EIO;
1700
1701         if (strcmp(path, "/cgroup") == 0) {
1702                 cgroup = NULL;
1703                 controller = NULL;
1704         } else {
1705                 // return list of keys for the controller, and list of child cgroups
1706                 controller = pick_controller_from_path(fc, path);
1707                 if (!controller)
1708                         return -errno;
1709
1710                 cgroup = find_cgroup_in_path(path);
1711                 if (!cgroup) {
1712                         /* this is just /cgroup/controller, return its contents */
1713                         cgroup = "/";
1714                 }
1715         }
1716
1717         pid_t initpid = lookup_initpid_in_store(fc->pid);
1718         if (initpid <= 0)
1719                 initpid = fc->pid;
1720         if (cgroup) {
1721                 if (!caller_may_see_dir(initpid, controller, cgroup))
1722                         return -ENOENT;
1723                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1724                         return -EACCES;
1725         }
1726
1727         /* we'll free this at cg_releasedir */
1728         dir_info = malloc(sizeof(*dir_info));
1729         if (!dir_info)
1730                 return -ENOMEM;
1731         dir_info->controller = must_copy_string(controller);
1732         dir_info->cgroup = must_copy_string(cgroup);
1733         dir_info->type = LXC_TYPE_CGDIR;
1734         dir_info->buf = NULL;
1735         dir_info->file = NULL;
1736         dir_info->buflen = 0;
1737
1738         fi->fh = (unsigned long)dir_info;
1739         return 0;
1740 }
1741
1742 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1743                 struct fuse_file_info *fi)
1744 {
1745         struct file_info *d = (struct file_info *)fi->fh;
1746         struct cgfs_files **list = NULL;
1747         int i, ret;
1748         char *nextcg = NULL;
1749         struct fuse_context *fc = fuse_get_context();
1750         char **clist = NULL;
1751
1752         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1753                 return -EIO;
1754
1755         if (d->type != LXC_TYPE_CGDIR) {
1756                 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1757                 return -EIO;
1758         }
1759         if (!d->cgroup && !d->controller) {
1760                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1761                 int i;
1762
1763                 for (i = 0;  i < num_hierarchies; i++) {
1764                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1765                                 return -EIO;
1766                         }
1767                 }
1768                 return 0;
1769         }
1770
1771         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1772                 // not a valid cgroup
1773                 ret = -EINVAL;
1774                 goto out;
1775         }
1776
1777         pid_t initpid = lookup_initpid_in_store(fc->pid);
1778         if (initpid <= 0)
1779                 initpid = fc->pid;
1780         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1781                 if (nextcg) {
1782                         ret = filler(buf, nextcg,  NULL, 0);
1783                         free(nextcg);
1784                         if (ret != 0) {
1785                                 ret = -EIO;
1786                                 goto out;
1787                         }
1788                 }
1789                 ret = 0;
1790                 goto out;
1791         }
1792
1793         for (i = 0; list[i]; i++) {
1794                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1795                         ret = -EIO;
1796                         goto out;
1797                 }
1798         }
1799
1800         // now get the list of child cgroups
1801
1802         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1803                 ret = 0;
1804                 goto out;
1805         }
1806         if (clist) {
1807                 for (i = 0; clist[i]; i++) {
1808                         if (filler(buf, clist[i], NULL, 0) != 0) {
1809                                 ret = -EIO;
1810                                 goto out;
1811                         }
1812                 }
1813         }
1814         ret = 0;
1815
1816 out:
1817         free_keys(list);
1818         if (clist) {
1819                 for (i = 0; clist[i]; i++)
1820                         free(clist[i]);
1821                 free(clist);
1822         }
1823         return ret;
1824 }
1825
1826 static void do_release_file_info(struct fuse_file_info *fi)
1827 {
1828         struct file_info *f = (struct file_info *)fi->fh;
1829
1830         if (!f)
1831                 return;
1832
1833         fi->fh = 0;
1834
1835         free(f->controller);
1836         f->controller = NULL;
1837         free(f->cgroup);
1838         f->cgroup = NULL;
1839         free(f->file);
1840         f->file = NULL;
1841         free(f->buf);
1842         f->buf = NULL;
1843         free(f);
1844 }
1845
/*
 * FUSE releasedir: frees the file_info that cg_opendir() stored in fi->fh.
 * @path is not used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);
        return 0;
}
1851
1852 int cg_open(const char *path, struct fuse_file_info *fi)
1853 {
1854         const char *cgroup;
1855         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1856         struct cgfs_files *k = NULL;
1857         struct file_info *file_info;
1858         struct fuse_context *fc = fuse_get_context();
1859         int ret;
1860
1861         if (!fc)
1862                 return -EIO;
1863
1864         controller = pick_controller_from_path(fc, path);
1865         if (!controller)
1866                 return -errno;
1867         cgroup = find_cgroup_in_path(path);
1868         if (!cgroup)
1869                 return -EINVAL;
1870
1871         get_cgdir_and_path(cgroup, &cgdir, &last);
1872         if (!last) {
1873                 path1 = "/";
1874                 path2 = cgdir;
1875         } else {
1876                 path1 = cgdir;
1877                 path2 = last;
1878         }
1879
1880         k = cgfs_get_key(controller, path1, path2);
1881         if (!k) {
1882                 ret = -EINVAL;
1883                 goto out;
1884         }
1885         free_key(k);
1886
1887         pid_t initpid = lookup_initpid_in_store(fc->pid);
1888         if (initpid <= 0)
1889                 initpid = fc->pid;
1890         if (!caller_may_see_dir(initpid, controller, path1)) {
1891                 ret = -ENOENT;
1892                 goto out;
1893         }
1894         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1895                 ret = -EACCES;
1896                 goto out;
1897         }
1898
1899         /* we'll free this at cg_release */
1900         file_info = malloc(sizeof(*file_info));
1901         if (!file_info) {
1902                 ret = -ENOMEM;
1903                 goto out;
1904         }
1905         file_info->controller = must_copy_string(controller);
1906         file_info->cgroup = must_copy_string(path1);
1907         file_info->file = must_copy_string(path2);
1908         file_info->type = LXC_TYPE_CGFILE;
1909         file_info->buf = NULL;
1910         file_info->buflen = 0;
1911
1912         fi->fh = (unsigned long)file_info;
1913         ret = 0;
1914
1915 out:
1916         free(cgdir);
1917         return ret;
1918 }
1919
1920 int cg_access(const char *path, int mode)
1921 {
1922         int ret;
1923         const char *cgroup;
1924         char *path1, *path2, *controller;
1925         char *last = NULL, *cgdir = NULL;
1926         struct cgfs_files *k = NULL;
1927         struct fuse_context *fc = fuse_get_context();
1928
1929         if (strcmp(path, "/cgroup") == 0) {
1930                 if ((mode & W_OK) == 0)
1931                         return -EACCES;
1932                 return 0;
1933         }
1934
1935         if (!fc)
1936                 return -EIO;
1937
1938         controller = pick_controller_from_path(fc, path);
1939         if (!controller)
1940                 return -errno;
1941         cgroup = find_cgroup_in_path(path);
1942         if (!cgroup) {
1943                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
1944                 if ((mode & W_OK) == 0)
1945                         return 0;
1946                 return -EACCES;
1947         }
1948
1949         get_cgdir_and_path(cgroup, &cgdir, &last);
1950         if (!last) {
1951                 path1 = "/";
1952                 path2 = cgdir;
1953         } else {
1954                 path1 = cgdir;
1955                 path2 = last;
1956         }
1957
1958         k = cgfs_get_key(controller, path1, path2);
1959         if (!k) {
1960                 if ((mode & W_OK) == 0)
1961                         ret = 0;
1962                 else
1963                         ret = -EACCES;
1964                 goto out;
1965         }
1966         free_key(k);
1967
1968         pid_t initpid = lookup_initpid_in_store(fc->pid);
1969         if (initpid <= 0)
1970                 initpid = fc->pid;
1971         if (!caller_may_see_dir(initpid, controller, path1)) {
1972                 ret = -ENOENT;
1973                 goto out;
1974         }
1975         if (!fc_may_access(fc, controller, path1, path2, mode)) {
1976                 ret = -EACCES;
1977                 goto out;
1978         }
1979
1980         ret = 0;
1981
1982 out:
1983         free(cgdir);
1984         return ret;
1985 }
1986
/*
 * FUSE release handler: frees the struct file_info that cg_open() stored in
 * fi->fh. @path is not consulted. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
1992
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock - wait until @sock is readable or has been hung up.
 *
 * @sock:    file descriptor to watch (socket or pipe)
 * @timeout: maximum time to wait, in whole seconds
 *
 * Waiting is restarted after EINTR with the remaining time recomputed from
 * the wall clock.
 *
 * Returns true when an event is pending on @sock. Returns false on timeout
 * or error; on a timeout errno is set to 0, on an epoll_wait() failure errno
 * holds that failure's code.
 */
static bool wait_for_sock(int sock, int timeout)
{
        struct epoll_event ev;
        time_t starttime, now;
        int epfd, ret, deltatime, saved_errno;

        if ((starttime = time(NULL)) < 0)
                return false;

        if ((epfd = epoll_create(1)) < 0) {
                fprintf(stderr, "Failed to create epoll socket: %m\n");
                return false;
        }

        ev.events = POLLIN_SET;
        ev.data.fd = sock;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
                fprintf(stderr, "Failed adding socket to epoll: %m\n");
                close(epfd);
                return false;
        }

        for (;;) {
                if ((now = time(NULL)) < 0) {
                        close(epfd);
                        return false;
                }

                deltatime = (int)((starttime + timeout) - now);
                if (deltatime < 0) { // timeout
                        errno = 0;
                        close(epfd);
                        return false;
                }

                ret = epoll_wait(epfd, &ev, 1, 1000 * deltatime + 1);
                if (ret < 0 && errno == EINTR)
                        continue;
                break;
        }

        /*
         * epoll_wait() only sets errno when it fails. A plain timeout
         * (ret == 0) must not leak a stale errno to callers that print
         * strerror(errno), so report 0 just like the branch above.
         */
        saved_errno = ret < 0 ? errno : 0;
        close(epfd);

        if (ret <= 0) {
                errno = saved_errno;
                return false;
        }
        return true;
}
2040
/*
 * msgrecv - receive up to @len bytes from @sockfd into @buf, waiting at most
 * two seconds for data to show up.
 *
 * Returns the result of recv(), or -1 when wait_for_sock() reports that
 * nothing became readable in time.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
        bool ready = wait_for_sock(sockfd, 2);

        if (ready)
                return recv(sockfd, buf, len, MSG_DONTWAIT);

        return -1;
}
2047
2048 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2049 {
2050         struct msghdr msg = { 0 };
2051         struct iovec iov;
2052         struct cmsghdr *cmsg;
2053         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2054         char buf[1];
2055         buf[0] = 'p';
2056
2057         if (pingfirst) {
2058                 if (msgrecv(sock, buf, 1) != 1) {
2059                         fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
2060                                   __func__);
2061                         return SEND_CREDS_FAIL;
2062                 }
2063         }
2064
2065         msg.msg_control = cmsgbuf;
2066         msg.msg_controllen = sizeof(cmsgbuf);
2067
2068         cmsg = CMSG_FIRSTHDR(&msg);
2069         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2070         cmsg->cmsg_level = SOL_SOCKET;
2071         cmsg->cmsg_type = SCM_CREDENTIALS;
2072         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2073
2074         msg.msg_name = NULL;
2075         msg.msg_namelen = 0;
2076
2077         buf[0] = v;
2078         iov.iov_base = buf;
2079         iov.iov_len = sizeof(buf);
2080         msg.msg_iov = &iov;
2081         msg.msg_iovlen = 1;
2082
2083         if (sendmsg(sock, &msg, 0) < 0) {
2084                 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
2085                           strerror(errno));
2086                 if (errno == 3)
2087                         return SEND_CREDS_NOTSK;
2088                 return SEND_CREDS_FAIL;
2089         }
2090
2091         return SEND_CREDS_OK;
2092 }
2093
2094 static bool recv_creds(int sock, struct ucred *cred, char *v)
2095 {
2096         struct msghdr msg = { 0 };
2097         struct iovec iov;
2098         struct cmsghdr *cmsg;
2099         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2100         char buf[1];
2101         int ret;
2102         int optval = 1;
2103
2104         *v = '1';
2105
2106         cred->pid = -1;
2107         cred->uid = -1;
2108         cred->gid = -1;
2109
2110         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2111                 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
2112                 return false;
2113         }
2114         buf[0] = '1';
2115         if (write(sock, buf, 1) != 1) {
2116                 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
2117                 return false;
2118         }
2119
2120         msg.msg_name = NULL;
2121         msg.msg_namelen = 0;
2122         msg.msg_control = cmsgbuf;
2123         msg.msg_controllen = sizeof(cmsgbuf);
2124
2125         iov.iov_base = buf;
2126         iov.iov_len = sizeof(buf);
2127         msg.msg_iov = &iov;
2128         msg.msg_iovlen = 1;
2129
2130         if (!wait_for_sock(sock, 2)) {
2131                 fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
2132                           strerror(errno));
2133                 return false;
2134         }
2135         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2136         if (ret < 0) {
2137                 fprintf(stderr, "Failed to receive scm_cred: %s\n",
2138                           strerror(errno));
2139                 return false;
2140         }
2141
2142         cmsg = CMSG_FIRSTHDR(&msg);
2143
2144         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2145                         cmsg->cmsg_level == SOL_SOCKET &&
2146                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2147                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2148         }
2149         *v = buf[0];
2150
2151         return true;
2152 }
2153
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
        /* pipe pair; the cloned child writes its ack to cpipe[1] */
        int *cpipe;
        /* socket passed on to the wrapped function */
        int sock;
        /* pid passed on to the wrapped function */
        pid_t tpid;
        /* function run in the child: pid_from_ns or pid_to_ns */
        int (*wrapped)(int, pid_t);
};
2160
2161 /*
2162  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2163  * with clone(). This simply writes '1' as ACK back to the parent
2164  * before calling the actual wrapped function.
2165  */
2166 static int pid_ns_clone_wrapper(void *arg) {
2167         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2168         char b = '1';
2169
2170         close(args->cpipe[0]);
2171         if (write(args->cpipe[1], &b, sizeof(char)) < 0) {
2172                 fprintf(stderr, "%s (child): error on write: %s\n",
2173                         __func__, strerror(errno));
2174         }
2175         close(args->cpipe[1]);
2176         return args->wrapped(args->sock, args->tpid);
2177 }
2178
2179 /*
2180  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2181  * int value back over the socket.  This shifts the pid from the
2182  * sender's pidns into tpid's pidns.
2183  */
2184 static int pid_to_ns(int sock, pid_t tpid)
2185 {
2186         char v = '0';
2187         struct ucred cred;
2188
2189         while (recv_creds(sock, &cred, &v)) {
2190                 if (v == '1')
2191                         return 0;
2192                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2193                         return 1;
2194         }
2195         return 0;
2196 }
2197
2198
2199 /*
2200  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2201  * in your old pidns.  Only children which you clone will be in the target
2202  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2203  * actually convert pids.
2204  *
2205  * Note: glibc's fork() does not respect pidns, which can lead to failed
2206  * assertions inside glibc (and thus failed forks) if the child's pid in
2207  * the pidns and the parent pid outside are identical. Using clone prevents
2208  * this issue.
2209  */
2210 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2211 {
2212         int newnsfd = -1, ret, cpipe[2];
2213         char fnam[100];
2214         pid_t cpid;
2215         char v;
2216
2217         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2218         if (ret < 0 || ret >= sizeof(fnam))
2219                 _exit(1);
2220         newnsfd = open(fnam, O_RDONLY);
2221         if (newnsfd < 0)
2222                 _exit(1);
2223         if (setns(newnsfd, 0) < 0)
2224                 _exit(1);
2225         close(newnsfd);
2226
2227         if (pipe(cpipe) < 0)
2228                 _exit(1);
2229
2230         struct pid_ns_clone_args args = {
2231                 .cpipe = cpipe,
2232                 .sock = sock,
2233                 .tpid = tpid,
2234                 .wrapped = &pid_to_ns
2235         };
2236         size_t stack_size = sysconf(_SC_PAGESIZE);
2237         void *stack = alloca(stack_size);
2238
2239         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2240         if (cpid < 0)
2241                 _exit(1);
2242
2243         // give the child 1 second to be done forking and
2244         // write its ack
2245         if (!wait_for_sock(cpipe[0], 1))
2246                 _exit(1);
2247         ret = read(cpipe[0], &v, 1);
2248         if (ret != sizeof(char) || v != '1')
2249                 _exit(1);
2250
2251         if (!wait_for_pid(cpid))
2252                 _exit(1);
2253         _exit(0);
2254 }
2255
2256 /*
2257  * To read cgroup files with a particular pid, we will setns into the child
2258  * pidns, open a pipe, fork a child - which will be the first to really be in
2259  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2260  */
2261 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2262 {
2263         int sock[2] = {-1, -1};
2264         char *tmpdata = NULL;
2265         int ret;
2266         pid_t qpid, cpid = -1;
2267         bool answer = false;
2268         char v = '0';
2269         struct ucred cred;
2270         size_t sz = 0, asz = 0;
2271
2272         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2273                 return false;
2274
2275         /*
2276          * Now we read the pids from returned data one by one, pass
2277          * them into a child in the target namespace, read back the
2278          * translated pids, and put them into our to-return data
2279          */
2280
2281         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2282                 perror("socketpair");
2283                 free(tmpdata);
2284                 return false;
2285         }
2286
2287         cpid = fork();
2288         if (cpid == -1)
2289                 goto out;
2290
2291         if (!cpid) // child - exits when done
2292                 pid_to_ns_wrapper(sock[1], tpid);
2293
2294         char *ptr = tmpdata;
2295         cred.uid = 0;
2296         cred.gid = 0;
2297         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2298                 cred.pid = qpid;
2299                 ret = send_creds(sock[0], &cred, v, true);
2300
2301                 if (ret == SEND_CREDS_NOTSK)
2302                         goto next;
2303                 if (ret == SEND_CREDS_FAIL)
2304                         goto out;
2305
2306                 // read converted results
2307                 if (!wait_for_sock(sock[0], 2)) {
2308                         fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
2309                                 __func__, strerror(errno));
2310                         goto out;
2311                 }
2312                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2313                         fprintf(stderr, "%s: error reading pid from child: %s\n",
2314                                 __func__, strerror(errno));
2315                         goto out;
2316                 }
2317                 must_strcat_pid(d, &sz, &asz, qpid);
2318 next:
2319                 ptr = strchr(ptr, '\n');
2320                 if (!ptr)
2321                         break;
2322                 ptr++;
2323         }
2324
2325         cred.pid = getpid();
2326         v = '1';
2327         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2328                 // failed to ask child to exit
2329                 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
2330                         __func__, strerror(errno));
2331                 goto out;
2332         }
2333
2334         answer = true;
2335
2336 out:
2337         free(tmpdata);
2338         if (cpid != -1)
2339                 wait_for_pid(cpid);
2340         if (sock[0] != -1) {
2341                 close(sock[0]);
2342                 close(sock[1]);
2343         }
2344         return answer;
2345 }
2346
2347 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2348                 struct fuse_file_info *fi)
2349 {
2350         struct fuse_context *fc = fuse_get_context();
2351         struct file_info *f = (struct file_info *)fi->fh;
2352         struct cgfs_files *k = NULL;
2353         char *data = NULL;
2354         int ret, s;
2355         bool r;
2356
2357         if (f->type != LXC_TYPE_CGFILE) {
2358                 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
2359                 return -EIO;
2360         }
2361
2362         if (offset)
2363                 return 0;
2364
2365         if (!fc)
2366                 return -EIO;
2367
2368         if (!f->controller)
2369                 return -EINVAL;
2370
2371         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2372                 return -EINVAL;
2373         }
2374         free_key(k);
2375
2376
2377         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2378                 ret = -EACCES;
2379                 goto out;
2380         }
2381
2382         if (strcmp(f->file, "tasks") == 0 ||
2383                         strcmp(f->file, "/tasks") == 0 ||
2384                         strcmp(f->file, "/cgroup.procs") == 0 ||
2385                         strcmp(f->file, "cgroup.procs") == 0)
2386                 // special case - we have to translate the pids
2387                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2388         else
2389                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2390
2391         if (!r) {
2392                 ret = -EINVAL;
2393                 goto out;
2394         }
2395
2396         if (!data) {
2397                 ret = 0;
2398                 goto out;
2399         }
2400         s = strlen(data);
2401         if (s > size)
2402                 s = size;
2403         memcpy(buf, data, s);
2404         if (s > 0 && s < size && data[s-1] != '\n')
2405                 buf[s++] = '\n';
2406
2407         ret = s;
2408
2409 out:
2410         free(data);
2411         return ret;
2412 }
2413
2414 static int pid_from_ns(int sock, pid_t tpid)
2415 {
2416         pid_t vpid;
2417         struct ucred cred;
2418         char v;
2419         int ret;
2420
2421         cred.uid = 0;
2422         cred.gid = 0;
2423         while (1) {
2424                 if (!wait_for_sock(sock, 2)) {
2425                         fprintf(stderr, "%s: timeout reading from parent\n", __func__);
2426                         return 1;
2427                 }
2428                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2429                         fprintf(stderr, "%s: bad read from parent: %s\n",
2430                                 __func__, strerror(errno));
2431                         return 1;
2432                 }
2433                 if (vpid == -1) // done
2434                         break;
2435                 v = '0';
2436                 cred.pid = vpid;
2437                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2438                         v = '1';
2439                         cred.pid = getpid();
2440                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2441                                 return 1;
2442                 }
2443         }
2444         return 0;
2445 }
2446
2447 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2448 {
2449         int newnsfd = -1, ret, cpipe[2];
2450         char fnam[100];
2451         pid_t cpid;
2452         char v;
2453
2454         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2455         if (ret < 0 || ret >= sizeof(fnam))
2456                 _exit(1);
2457         newnsfd = open(fnam, O_RDONLY);
2458         if (newnsfd < 0)
2459                 _exit(1);
2460         if (setns(newnsfd, 0) < 0)
2461                 _exit(1);
2462         close(newnsfd);
2463
2464         if (pipe(cpipe) < 0)
2465                 _exit(1);
2466
2467         struct pid_ns_clone_args args = {
2468                 .cpipe = cpipe,
2469                 .sock = sock,
2470                 .tpid = tpid,
2471                 .wrapped = &pid_from_ns
2472         };
2473         size_t stack_size = sysconf(_SC_PAGESIZE);
2474         void *stack = alloca(stack_size);
2475
2476         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2477         if (cpid < 0)
2478                 _exit(1);
2479
2480         // give the child 1 second to be done forking and
2481         // write its ack
2482         if (!wait_for_sock(cpipe[0], 1))
2483                 _exit(1);
2484         ret = read(cpipe[0], &v, 1);
2485         if (ret != sizeof(char) || v != '1')
2486                 _exit(1);
2487
2488         if (!wait_for_pid(cpid))
2489                 _exit(1);
2490         _exit(0);
2491 }
2492
/*
 * hostuid_to_ns - translate host @uid through /proc/@pid/uid_map.
 *
 * The value produced by convert_id_to_ns() is stored in *answer.
 * Returns true when the map could be opened and that value is not -1,
 * false otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
        char path[400];
        FILE *map;

        sprintf(path, "/proc/%d/uid_map", pid);

        map = fopen(path, "r");
        if (!map)
                return false;

        *answer = convert_id_to_ns(map, uid);
        fclose(map);

        return *answer != (uid_t)-1;
}
2514
/*
 * get_pid_creds - look up the uid and gid of @pid in /proc/@pid/status.
 *
 * The first number on the "Uid:" line goes to *uid and the first number on
 * the "Gid:" line to *gid. Both start out as -1 and keep that value when the
 * file cannot be opened or the corresponding line is not found; parsing
 * stops at the first malformed Uid/Gid line.
 * (XXX should we use euid here?)
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
        char path[400];
        char line[400];
        unsigned int id;
        FILE *status;

        *uid = -1;
        *gid = -1;

        sprintf(path, "/proc/%d/status", pid);
        status = fopen(path, "r");
        if (!status) {
                fprintf(stderr, "Error opening %s: %s\n", path, strerror(errno));
                return;
        }

        while (fgets(line, sizeof(line), status) != NULL) {
                if (strncmp(line, "Uid:", 4) == 0) {
                        if (sscanf(line + 4, "%u", &id) != 1) {
                                fprintf(stderr, "bad uid line for pid %u\n", pid);
                                break;
                        }
                        *uid = id;
                } else if (strncmp(line, "Gid:", 4) == 0) {
                        if (sscanf(line + 4, "%u", &id) != 1) {
                                fprintf(stderr, "bad gid line for pid %u\n", pid);
                                break;
                        }
                        *gid = id;
                }
        }

        fclose(status);
}
2553
/*
 * May the requestor @r (running as host uid @r_uid) move victim @v to a new
 * cgroup? This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r_uid maps to root in @r's user namespace and @v's uid is mapped
 *     into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
        uid_t v_uid, mapped;
        gid_t v_gid;

        if (r == v || r_uid == 0)
                return true;

        get_pid_creds(v, &v_uid, &v_gid);
        if (v_uid == r_uid)
                return true;

        /* The requestor has to be root inside its own namespace ... */
        if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
                return false;

        /* ... and the victim's uid has to be visible from there. */
        return hostuid_to_ns(v_uid, r, &mapped);
}
2579
2580 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2581                 const char *file, const char *buf)
2582 {
2583         int sock[2] = {-1, -1};
2584         pid_t qpid, cpid = -1;
2585         FILE *pids_file = NULL;
2586         bool answer = false, fail = false;
2587
2588         pids_file = open_pids_file(contrl, cg);
2589         if (!pids_file)
2590                 return false;
2591
2592         /*
2593          * write the pids to a socket, have helper in writer's pidns
2594          * call movepid for us
2595          */
2596         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2597                 perror("socketpair");
2598                 goto out;
2599         }
2600
2601         cpid = fork();
2602         if (cpid == -1)
2603                 goto out;
2604
2605         if (!cpid) { // child
2606                 fclose(pids_file);
2607                 pid_from_ns_wrapper(sock[1], tpid);
2608         }
2609
2610         const char *ptr = buf;
2611         while (sscanf(ptr, "%d", &qpid) == 1) {
2612                 struct ucred cred;
2613                 char v;
2614
2615                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2616                         fprintf(stderr, "%s: error writing pid to child: %s\n",
2617                                 __func__, strerror(errno));
2618                         goto out;
2619                 }
2620
2621                 if (recv_creds(sock[0], &cred, &v)) {
2622                         if (v == '0') {
2623                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2624                                         fail = true;
2625                                         break;
2626                                 }
2627                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2628                                         fail = true;
2629                         }
2630                 }
2631
2632                 ptr = strchr(ptr, '\n');
2633                 if (!ptr)
2634                         break;
2635                 ptr++;
2636         }
2637
2638         /* All good, write the value */
2639         qpid = -1;
2640         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2641                 fprintf(stderr, "Warning: failed to ask child to exit\n");
2642
2643         if (!fail)
2644                 answer = true;
2645
2646 out:
2647         if (cpid != -1)
2648                 wait_for_pid(cpid);
2649         if (sock[0] != -1) {
2650                 close(sock[0]);
2651                 close(sock[1]);
2652         }
2653         if (pids_file) {
2654                 if (fclose(pids_file) != 0)
2655                         answer = false;
2656         }
2657         return answer;
2658 }
2659
2660 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2661              struct fuse_file_info *fi)
2662 {
2663         struct fuse_context *fc = fuse_get_context();
2664         char *localbuf = NULL;
2665         struct cgfs_files *k = NULL;
2666         struct file_info *f = (struct file_info *)fi->fh;
2667         bool r;
2668
2669         if (f->type != LXC_TYPE_CGFILE) {
2670                 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
2671                 return -EIO;
2672         }
2673
2674         if (offset)
2675                 return 0;
2676
2677         if (!fc)
2678                 return -EIO;
2679
2680         localbuf = alloca(size+1);
2681         localbuf[size] = '\0';
2682         memcpy(localbuf, buf, size);
2683
2684         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2685                 size = -EINVAL;
2686                 goto out;
2687         }
2688
2689         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2690                 size = -EACCES;
2691                 goto out;
2692         }
2693
2694         if (strcmp(f->file, "tasks") == 0 ||
2695                         strcmp(f->file, "/tasks") == 0 ||
2696                         strcmp(f->file, "/cgroup.procs") == 0 ||
2697                         strcmp(f->file, "cgroup.procs") == 0)
2698                 // special case - we have to translate the pids
2699                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2700         else
2701                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2702
2703         if (!r)
2704                 size = -EINVAL;
2705
2706 out:
2707         free_key(k);
2708         return size;
2709 }
2710
2711 int cg_chown(const char *path, uid_t uid, gid_t gid)
2712 {
2713         struct fuse_context *fc = fuse_get_context();
2714         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2715         struct cgfs_files *k = NULL;
2716         const char *cgroup;
2717         int ret;
2718
2719         if (!fc)
2720                 return -EIO;
2721
2722         if (strcmp(path, "/cgroup") == 0)
2723                 return -EINVAL;
2724
2725         controller = pick_controller_from_path(fc, path);
2726         if (!controller)
2727                 return -errno;
2728         cgroup = find_cgroup_in_path(path);
2729         if (!cgroup)
2730                 /* this is just /cgroup/controller */
2731                 return -EINVAL;
2732
2733         get_cgdir_and_path(cgroup, &cgdir, &last);
2734
2735         if (!last) {
2736                 path1 = "/";
2737                 path2 = cgdir;
2738         } else {
2739                 path1 = cgdir;
2740                 path2 = last;
2741         }
2742
2743         if (is_child_cgroup(controller, path1, path2)) {
2744                 // get uid, gid, from '/tasks' file and make up a mode
2745                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2746                 k = cgfs_get_key(controller, cgroup, "tasks");
2747
2748         } else
2749                 k = cgfs_get_key(controller, path1, path2);
2750
2751         if (!k) {
2752                 ret = -EINVAL;
2753                 goto out;
2754         }
2755
2756         /*
2757          * This being a fuse request, the uid and gid must be valid
2758          * in the caller's namespace.  So we can just check to make
2759          * sure that the caller is root in his uid, and privileged
2760          * over the file's current owner.
2761          */
2762         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2763                 ret = -EACCES;
2764                 goto out;
2765         }
2766
2767         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2768
2769 out:
2770         free_key(k);
2771         free(cgdir);
2772
2773         return ret;
2774 }
2775
2776 int cg_chmod(const char *path, mode_t mode)
2777 {
2778         struct fuse_context *fc = fuse_get_context();
2779         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2780         struct cgfs_files *k = NULL;
2781         const char *cgroup;
2782         int ret;
2783
2784         if (!fc)
2785                 return -EIO;
2786
2787         if (strcmp(path, "/cgroup") == 0)
2788                 return -EINVAL;
2789
2790         controller = pick_controller_from_path(fc, path);
2791         if (!controller)
2792                 return -errno;
2793         cgroup = find_cgroup_in_path(path);
2794         if (!cgroup)
2795                 /* this is just /cgroup/controller */
2796                 return -EINVAL;
2797
2798         get_cgdir_and_path(cgroup, &cgdir, &last);
2799
2800         if (!last) {
2801                 path1 = "/";
2802                 path2 = cgdir;
2803         } else {
2804                 path1 = cgdir;
2805                 path2 = last;
2806         }
2807
2808         if (is_child_cgroup(controller, path1, path2)) {
2809                 // get uid, gid, from '/tasks' file and make up a mode
2810                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2811                 k = cgfs_get_key(controller, cgroup, "tasks");
2812
2813         } else
2814                 k = cgfs_get_key(controller, path1, path2);
2815
2816         if (!k) {
2817                 ret = -EINVAL;
2818                 goto out;
2819         }
2820
2821         /*
2822          * This being a fuse request, the uid and gid must be valid
2823          * in the caller's namespace.  So we can just check to make
2824          * sure that the caller is root in his uid, and privileged
2825          * over the file's current owner.
2826          */
2827         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2828                 ret = -EPERM;
2829                 goto out;
2830         }
2831
2832         if (!cgfs_chmod_file(controller, cgroup, mode)) {
2833                 ret = -EINVAL;
2834                 goto out;
2835         }
2836
2837         ret = 0;
2838 out:
2839         free_key(k);
2840         free(cgdir);
2841         return ret;
2842 }
2843
2844 int cg_mkdir(const char *path, mode_t mode)
2845 {
2846         struct fuse_context *fc = fuse_get_context();
2847         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2848         const char *cgroup;
2849         int ret;
2850
2851         if (!fc)
2852                 return -EIO;
2853
2854
2855         controller = pick_controller_from_path(fc, path);
2856         if (!controller)
2857                 return errno == ENOENT ? -EPERM : -errno;
2858
2859         cgroup = find_cgroup_in_path(path);
2860         if (!cgroup)
2861                 return -EINVAL;
2862
2863         get_cgdir_and_path(cgroup, &cgdir, &last);
2864         if (!last)
2865                 path1 = "/";
2866         else
2867                 path1 = cgdir;
2868
2869         pid_t initpid = lookup_initpid_in_store(fc->pid);
2870         if (initpid <= 0)
2871                 initpid = fc->pid;
2872         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2873                 if (!next)
2874                         ret = -EINVAL;
2875                 else if (last && strcmp(next, last) == 0)
2876                         ret = -EEXIST;
2877                 else
2878                         ret = -EPERM;
2879                 goto out;
2880         }
2881
2882         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2883                 ret = -EACCES;
2884                 goto out;
2885         }
2886         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2887                 ret = -EACCES;
2888                 goto out;
2889         }
2890
2891         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2892
2893 out:
2894         free(cgdir);
2895         free(next);
2896         return ret;
2897 }
2898
2899 int cg_rmdir(const char *path)
2900 {
2901         struct fuse_context *fc = fuse_get_context();
2902         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2903         const char *cgroup;
2904         int ret;
2905
2906         if (!fc)
2907                 return -EIO;
2908
2909         controller = pick_controller_from_path(fc, path);
2910         if (!controller)
2911                 return -errno;
2912
2913         cgroup = find_cgroup_in_path(path);
2914         if (!cgroup)
2915                 return -EINVAL;
2916
2917         get_cgdir_and_path(cgroup, &cgdir, &last);
2918         if (!last) {
2919                 ret = -EINVAL;
2920                 goto out;
2921         }
2922
2923         pid_t initpid = lookup_initpid_in_store(fc->pid);
2924         if (initpid <= 0)
2925                 initpid = fc->pid;
2926         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2927                 if (!last || strcmp(next, last) == 0)
2928                         ret = -EBUSY;
2929                 else
2930                         ret = -ENOENT;
2931                 goto out;
2932         }
2933
2934         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2935                 ret = -EACCES;
2936                 goto out;
2937         }
2938         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2939                 ret = -EACCES;
2940                 goto out;
2941         }
2942
2943         if (!cgfs_remove(controller, cgroup)) {
2944                 ret = -EINVAL;
2945                 goto out;
2946         }
2947
2948         ret = 0;
2949
2950 out:
2951         free(cgdir);
2952         free(next);
2953         return ret;
2954 }
2955
/*
 * Return true if @line begins with the string @pref (an empty @pref
 * always matches).
 */
static bool startswith(const char *line, const char *pref)
{
        size_t preflen = strlen(pref);

        return strncmp(line, pref, preflen) == 0;
}
2962
/*
 * Parse the text of a memory.stat file ("<key> <bytes>\n" per line) and
 * store selected counters, converted from bytes to kB, through the
 * output pointers.
 *
 * Recognised keys: cache, active_anon, inactive_anon, active_file,
 * inactive_file, unevictable.  An output is only written when its line
 * is present and its value parses; otherwise the caller's initial value
 * is left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable)
{
        const struct {
                const char *key;
                unsigned long *dest;
        } fields[] = {
                { "cache",         cached        },
                { "active_anon",   active_anon   },
                { "inactive_anon", inactive_anon },
                { "active_file",   active_file   },
                { "inactive_file", inactive_file },
                { "unevictable",   unevictable   },
        };
        const size_t nfields = sizeof(fields) / sizeof(fields[0]);
        char *eol;
        size_t i;

        while (*memstat) {
                for (i = 0; i < nfields; i++) {
                        /*
                         * Skip exactly the length of the matched key; a
                         * fixed offset is wrong for every key that is not
                         * 11 characters long.
                         */
                        size_t keylen = strlen(fields[i].key);
                        unsigned long bytes;

                        if (strncmp(memstat, fields[i].key, keylen) != 0)
                                continue;
                        /* the key must be the whole first word */
                        if (memstat[keylen] != ' ')
                                continue;
                        if (sscanf(memstat + keylen, "%lu", &bytes) == 1)
                                *fields[i].dest = bytes / 1024;
                        break;
                }
                eol = strchr(memstat, '\n');
                if (!eol)
                        return;
                memstat = eol + 1;
        }
}
2996
/*
 * Search the blkio statistics text @str for the line that starts with
 * "<major>:<minor> <iotype>" and store the number following it in *@v.
 * *@v is set to 0 first, so it stays 0 when no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32] = { 0 };
        size_t keylen;
        char *cur;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        keylen = strlen(key);

        *v = 0;

        for (cur = str; *cur; ) {
                char *newline;

                if (startswith(cur, key)) {
                        sscanf(cur + keylen, "%lu", v);
                        return;
                }

                /* advance to the next line, if there is one */
                newline = strchr(cur, '\n');
                if (newline == NULL)
                        return;
                cur = newline + 1;
        }
}
3019
3020 static int read_file(const char *path, char *buf, size_t size,
3021                      struct file_info *d)
3022 {
3023         size_t linelen = 0, total_len = 0, rv = 0;
3024         char *line = NULL;
3025         char *cache = d->buf;
3026         size_t cache_size = d->buflen;
3027         FILE *f = fopen(path, "r");
3028         if (!f)
3029                 return 0;
3030
3031         while (getline(&line, &linelen, f) != -1) {
3032                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3033                 if (l < 0) {
3034                         perror("Error writing to cache");
3035                         rv = 0;
3036                         goto err;
3037                 }
3038                 if (l >= cache_size) {
3039                         fprintf(stderr, "Internal error: truncated write to cache\n");
3040                         rv = 0;
3041                         goto err;
3042                 }
3043                 cache += l;
3044                 cache_size -= l;
3045                 total_len += l;
3046         }
3047
3048         d->size = total_len;
3049         if (total_len > size)
3050                 total_len = size;
3051
3052         /* read from off 0 */
3053         memcpy(buf, d->buf, total_len);
3054         rv = total_len;
3055   err:
3056         fclose(f);
3057         free(line);
3058         return rv;
3059 }
3060
3061 /*
3062  * FUSE ops for /proc
3063  */
3064
/*
 * Read memory.limit_in_bytes of @cgroup in the "memory" controller.
 * Returns the parsed value, or (unsigned long)-1 if cgfs_get_value()
 * fails.
 */
static unsigned long get_memlimit(const char *cgroup)
{
        char *value = NULL;
        unsigned long limit;

        if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &value))
                limit = strtoul(value, NULL, 10);
        else
                limit = -1;

        free(value);
        return limit;
}
3077
/*
 * Return the smallest memory limit found on @cgroup and its parent
 * directories, walking upwards with dirname() until "/" is reached.
 * Levels for which get_memlimit() reports -1 are ignored while walking
 * up; the starting value is whatever get_memlimit() returns for
 * @cgroup itself.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
        /* dirname() may modify its argument, so work on a stack copy */
        char *cur = strdupa(cgroup);
        unsigned long lowest = get_memlimit(cur);

        while (strcmp(cur, "/") != 0) {
                unsigned long candidate;

                cur = dirname(cur);
                candidate = get_memlimit(cur);
                if (candidate == (unsigned long)-1)
                        continue;
                if (candidate < lowest)
                        lowest = candidate;
        }

        return lowest;
}
3094
3095 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3096                 struct fuse_file_info *fi)
3097 {
3098         struct fuse_context *fc = fuse_get_context();
3099         struct file_info *d = (struct file_info *)fi->fh;
3100         char *cg;
3101         char *memusage_str = NULL, *memstat_str = NULL,
3102                 *memswlimit_str = NULL, *memswusage_str = NULL,
3103                 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3104         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3105                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3106                 active_file = 0, inactive_file = 0, unevictable = 0;
3107         char *line = NULL;
3108         size_t linelen = 0, total_len = 0, rv = 0;
3109         char *cache = d->buf;
3110         size_t cache_size = d->buflen;
3111         FILE *f = NULL;
3112
3113         if (offset){
3114                 if (offset > d->size)
3115                         return -EINVAL;
3116                 if (!d->cached)
3117                         return 0;
3118                 int left = d->size - offset;
3119                 total_len = left > size ? size: left;
3120                 memcpy(buf, cache + offset, total_len);
3121                 return total_len;
3122         }
3123
3124         pid_t initpid = lookup_initpid_in_store(fc->pid);
3125         if (initpid <= 0)
3126                 initpid = fc->pid;
3127         cg = get_pid_cgroup(initpid, "memory");
3128         if (!cg)
3129                 return read_file("/proc/meminfo", buf, size, d);
3130         prune_init_slice(cg);
3131
3132         memlimit = get_min_memlimit(cg);
3133         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3134                 goto err;
3135         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3136                 goto err;
3137
3138         // Following values are allowed to fail, because swapaccount might be turned
3139         // off for current kernel
3140         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3141                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3142         {
3143                 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
3144                 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3145                         goto err;
3146                 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3147                         goto err;
3148
3149                 memswlimit = strtoul(memswlimit_str, NULL, 10);
3150                 memswusage = strtoul(memswusage_str, NULL, 10);
3151
3152                 if (!strcmp(memswlimit_str, memswlimit_default_str))
3153                         memswlimit = 0;
3154                 if (!strcmp(memswusage_str, memswusage_default_str))
3155                         memswusage = 0;
3156
3157                 memswlimit = memswlimit / 1024;
3158                 memswusage = memswusage / 1024;
3159         }
3160
3161         memusage = strtoul(memusage_str, NULL, 10);
3162         memlimit /= 1024;
3163         memusage /= 1024;
3164
3165         parse_memstat(memstat_str, &cached, &active_anon,
3166                         &inactive_anon, &active_file, &inactive_file,
3167                         &unevictable);
3168
3169         f = fopen("/proc/meminfo", "r");
3170         if (!f)
3171                 goto err;
3172
3173         while (getline(&line, &linelen, f) != -1) {
3174                 ssize_t l;
3175                 char *printme, lbuf[100];
3176
3177                 memset(lbuf, 0, 100);
3178                 if (startswith(line, "MemTotal:")) {
3179                         sscanf(line+14, "%lu", &hosttotal);
3180                         if (hosttotal < memlimit)
3181                                 memlimit = hosttotal;
3182                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3183                         printme = lbuf;
3184                 } else if (startswith(line, "MemFree:")) {
3185                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3186                         printme = lbuf;
3187                 } else if (startswith(line, "MemAvailable:")) {
3188                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage);
3189                         printme = lbuf;
3190                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3191                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit - memlimit);
3192                         printme = lbuf;
3193                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3194                         unsigned long swaptotal = memswlimit - memlimit,
3195                                         swapusage = memswusage - memusage,
3196                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3197                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3198                         printme = lbuf;
3199                 } else if (startswith(line, "Slab:")) {
3200                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3201                         printme = lbuf;
3202                 } else if (startswith(line, "Buffers:")) {
3203                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3204                         printme = lbuf;
3205                 } else if (startswith(line, "Cached:")) {
3206                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3207                         printme = lbuf;
3208                 } else if (startswith(line, "SwapCached:")) {
3209                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3210                         printme = lbuf;
3211                 } else if (startswith(line, "Active")) {
3212                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3213                                         active_anon + active_file);
3214                         printme = lbuf;
3215                 } else if (startswith(line, "Inactive")) {
3216                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3217                                         inactive_anon + inactive_file);
3218                         printme = lbuf;
3219                 } else if (startswith(line, "Active(anon)")) {
3220                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3221                         printme = lbuf;
3222                 } else if (startswith(line, "Inactive(anon)")) {
3223                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3224                         printme = lbuf;
3225                 } else if (startswith(line, "Active(file)")) {
3226                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3227                         printme = lbuf;
3228                 } else if (startswith(line, "Inactive(file)")) {
3229                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3230                         printme = lbuf;
3231                 } else if (startswith(line, "Unevictable")) {
3232                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3233                         printme = lbuf;
3234                 } else if (startswith(line, "SReclaimable")) {
3235                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3236                         printme = lbuf;
3237                 } else if (startswith(line, "SUnreclaim")) {
3238                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3239                         printme = lbuf;
3240                 } else
3241                         printme = line;
3242
3243                 l = snprintf(cache, cache_size, "%s", printme);
3244                 if (l < 0) {
3245                         perror("Error writing to cache");
3246                         rv = 0;
3247                         goto err;
3248
3249                 }
3250                 if (l >= cache_size) {
3251                         fprintf(stderr, "Internal error: truncated write to cache\n");
3252                         rv = 0;
3253                         goto err;
3254                 }
3255
3256                 cache += l;
3257                 cache_size -= l;
3258                 total_len += l;
3259         }
3260
3261         d->cached = 1;
3262         d->size = total_len;
3263         if (total_len > size ) total_len = size;
3264         memcpy(buf, d->buf, total_len);
3265
3266         rv = total_len;
3267 err:
3268         if (f)
3269                 fclose(f);
3270         free(line);
3271         free(cg);
3272         free(memusage_str);
3273         free(memswlimit_str);
3274         free(memswusage_str);
3275         free(memstat_str);
3276         free(memswlimit_default_str);
3277         free(memswusage_default_str);
3278         return rv;
3279 }
3280
/*
 * Fetch cpuset.cpus of cgroup @cg from the "cpuset" controller.
 * The result is a newly allocated string owned by the caller (free it
 * when done); NULL is returned if the value could not be read.
 */
static char *get_cpuset(const char *cg)
{
        char *cpus;

        return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3293
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line and cpu N is accepted
 * by cpu_in_cpuset() for @cpuset; false for any other line.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int cpu;

        return sscanf(line, "processor       : %d", &cpu) == 1 &&
               cpu_in_cpuset(cpu, cpuset);
}
3304
/*
 * Tell whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. whether a cpu number can be scanned from it.
 */
static bool is_processor_line(const char *line)
{
        int ignored;

        return sscanf(line, "processor       : %d", &ignored) == 1;
}
3316
3317 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3318                 struct fuse_file_info *fi)
3319 {
3320         struct fuse_context *fc = fuse_get_context();
3321         struct file_info *d = (struct file_info *)fi->fh;
3322         char *cg;
3323         char *cpuset = NULL;
3324         char *line = NULL;
3325         size_t linelen = 0, total_len = 0, rv = 0;
3326         bool am_printing = false, firstline = true, is_s390x = false;
3327         int curcpu = -1, cpu;
3328         char *cache = d->buf;
3329         size_t cache_size = d->buflen;
3330         FILE *f = NULL;
3331
3332         if (offset){
3333                 if (offset > d->size)
3334                         return -EINVAL;
3335                 if (!d->cached)
3336                         return 0;
3337                 int left = d->size - offset;
3338                 total_len = left > size ? size: left;
3339                 memcpy(buf, cache + offset, total_len);
3340                 return total_len;
3341         }
3342
3343         pid_t initpid = lookup_initpid_in_store(fc->pid);
3344         if (initpid <= 0)
3345                 initpid = fc->pid;
3346         cg = get_pid_cgroup(initpid, "cpuset");
3347         if (!cg)
3348                 return read_file("proc/cpuinfo", buf, size, d);
3349         prune_init_slice(cg);
3350
3351         cpuset = get_cpuset(cg);
3352         if (!cpuset)
3353                 goto err;
3354
3355         f = fopen("/proc/cpuinfo", "r");
3356         if (!f)
3357                 goto err;
3358
3359         while (getline(&line, &linelen, f) != -1) {
3360                 ssize_t l;
3361                 if (firstline) {
3362                         firstline = false;
3363                         if (strstr(line, "IBM/S390") != NULL) {
3364                                 is_s390x = true;
3365                                 am_printing = true;
3366                                 continue;
3367                         }
3368                 }
3369                 if (strncmp(line, "# processors:", 12) == 0)
3370                         continue;
3371                 if (is_processor_line(line)) {
3372                         am_printing = cpuline_in_cpuset(line, cpuset);
3373                         if (am_printing) {
3374                                 curcpu ++;
3375                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3376                                 if (l < 0) {
3377                                         perror("Error writing to cache");
3378                                         rv = 0;
3379                                         goto err;
3380                                 }
3381                                 if (l >= cache_size) {
3382                                         fprintf(stderr, "Internal error: truncated write to cache\n");
3383                                         rv = 0;
3384                                         goto err;
3385                                 }
3386                                 cache += l;
3387                                 cache_size -= l;
3388                                 total_len += l;
3389                         }
3390                         continue;
3391                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3392                         char *p;
3393                         if (!cpu_in_cpuset(cpu, cpuset))
3394                                 continue;
3395                         curcpu ++;
3396                         p = strchr(line, ':');
3397                         if (!p || !*p)
3398                                 goto err;
3399                         p++;
3400                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3401                         if (l < 0) {
3402                                 perror("Error writing to cache");
3403                                 rv = 0;
3404                                 goto err;
3405                         }
3406                         if (l >= cache_size) {
3407                                 fprintf(stderr, "Internal error: truncated write to cache\n");
3408                                 rv = 0;
3409                                 goto err;
3410                         }
3411                         cache += l;
3412                         cache_size -= l;
3413                         total_len += l;
3414                         continue;
3415
3416                 }
3417                 if (am_printing) {
3418                         l = snprintf(cache, cache_size, "%s", line);
3419                         if (l < 0) {
3420                                 perror("Error writing to cache");
3421                                 rv = 0;
3422                                 goto err;
3423                         }
3424                         if (l >= cache_size) {
3425                                 fprintf(stderr, "Internal error: truncated write to cache\n");
3426                                 rv = 0;
3427                                 goto err;
3428                         }
3429                         cache += l;
3430                         cache_size -= l;
3431                         total_len += l;
3432                 }
3433         }
3434
3435         if (is_s390x) {
3436                 char *origcache = d->buf;
3437                 ssize_t l;
3438                 do {
3439                         d->buf = malloc(d->buflen);
3440                 } while (!d->buf);
3441                 cache = d->buf;
3442                 cache_size = d->buflen;
3443                 total_len = 0;
3444                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3445                 if (l < 0 || l >= cache_size) {
3446                         free(origcache);
3447                         goto err;
3448                 }
3449                 cache_size -= l;
3450                 cache += l;
3451                 total_len += l;
3452                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3453                 if (l < 0 || l >= cache_size) {
3454                         free(origcache);
3455                         goto err;
3456                 }
3457                 cache_size -= l;
3458                 cache += l;
3459                 total_len += l;
3460                 l = snprintf(cache, cache_size, "%s", origcache);
3461                 free(origcache);
3462                 if (l < 0 || l >= cache_size)
3463                         goto err;
3464                 total_len += l;
3465         }
3466
3467         d->cached = 1;
3468         d->size = total_len;
3469         if (total_len > size ) total_len = size;
3470
3471         /* read from off 0 */
3472         memcpy(buf, d->buf, total_len);
3473         rv = total_len;
3474 err:
3475         if (f)
3476                 fclose(f);
3477         free(line);
3478         free(cpuset);
3479         free(cg);
3480         return rv;
3481 }
3482
3483 static int proc_stat_read(char *buf, size_t size, off_t offset,
3484                 struct fuse_file_info *fi)
3485 {
3486         struct fuse_context *fc = fuse_get_context();
3487         struct file_info *d = (struct file_info *)fi->fh;
3488         char *cg;
3489         char *cpuset = NULL;
3490         char *line = NULL;
3491         size_t linelen = 0, total_len = 0, rv = 0;
3492         int curcpu = -1; /* cpu numbering starts at 0 */
3493         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3494         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3495                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3496 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3497         char cpuall[CPUALL_MAX_SIZE];
3498         /* reserve for cpu all */
3499         char *cache = d->buf + CPUALL_MAX_SIZE;
3500         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3501         FILE *f = NULL;
3502
3503         if (offset){
3504                 if (offset > d->size)
3505                         return -EINVAL;
3506                 if (!d->cached)
3507                         return 0;
3508                 int left = d->size - offset;
3509                 total_len = left > size ? size: left;
3510                 memcpy(buf, d->buf + offset, total_len);
3511                 return total_len;
3512         }
3513
3514         pid_t initpid = lookup_initpid_in_store(fc->pid);
3515         if (initpid <= 0)
3516                 initpid = fc->pid;
3517         cg = get_pid_cgroup(initpid, "cpuset");
3518         if (!cg)
3519                 return read_file("/proc/stat", buf, size, d);
3520         prune_init_slice(cg);
3521
3522         cpuset = get_cpuset(cg);
3523         if (!cpuset)
3524                 goto err;
3525
3526         f = fopen("/proc/stat", "r");
3527         if (!f)
3528                 goto err;
3529
3530         //skip first line
3531         if (getline(&line, &linelen, f) < 0) {
3532                 fprintf(stderr, "proc_stat_read read first line failed\n");
3533                 goto err;
3534         }
3535
3536         while (getline(&line, &linelen, f) != -1) {
3537                 ssize_t l;
3538                 int cpu;
3539                 char cpu_char[10]; /* That's a lot of cores */
3540                 char *c;
3541
3542                 if (strlen(line) == 0)
3543                         continue;
3544                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3545                         /* not a ^cpuN line containing a number N, just print it */
3546                         l = snprintf(cache, cache_size, "%s", line);
3547                         if (l < 0) {
3548                                 perror("Error writing to cache");
3549                                 rv = 0;
3550                                 goto err;
3551                         }
3552                         if (l >= cache_size) {
3553                                 fprintf(stderr, "Internal error: truncated write to cache\n");
3554                                 rv = 0;
3555                                 goto err;
3556                         }
3557                         cache += l;
3558                         cache_size -= l;
3559                         total_len += l;
3560                         continue;
3561                 }
3562
3563                 if (sscanf(cpu_char, "%d", &cpu) != 1)
3564                         continue;
3565                 if (!cpu_in_cpuset(cpu, cpuset))
3566                         continue;
3567                 curcpu ++;
3568
3569                 c = strchr(line, ' ');
3570                 if (!c)
3571                         continue;
3572                 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3573                 if (l < 0) {
3574                         perror("Error writing to cache");
3575                         rv = 0;
3576                         goto err;
3577
3578                 }
3579                 if (l >= cache_size) {
3580                         fprintf(stderr, "Internal error: truncated write to cache\n");
3581                         rv = 0;
3582                         goto err;
3583                 }
3584
3585                 cache += l;
3586                 cache_size -= l;
3587                 total_len += l;
3588
3589                 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3590                         &softirq, &steal, &guest) != 9)
3591                         continue;
3592                 user_sum += user;
3593                 nice_sum += nice;
3594                 system_sum += system;
3595                 idle_sum += idle;
3596                 iowait_sum += iowait;
3597                 irq_sum += irq;
3598                 softirq_sum += softirq;
3599                 steal_sum += steal;
3600                 guest_sum += guest;
3601         }
3602
3603         cache = d->buf;
3604
3605         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3606                 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3607         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3608                 memcpy(cache, cpuall, cpuall_len);
3609                 cache += cpuall_len;
3610         } else{
3611                 /* shouldn't happen */
3612                 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
3613                 cpuall_len = 0;
3614         }
3615
3616         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3617         total_len += cpuall_len;
3618         d->cached = 1;
3619         d->size = total_len;
3620         if (total_len > size ) total_len = size;
3621
3622         memcpy(buf, d->buf, total_len);
3623         rv = total_len;
3624
3625 err:
3626         if (f)
3627                 fclose(f);
3628         free(line);
3629         free(cpuset);
3630         free(cg);
3631         return rv;
3632 }
3633
/*
 * Return how many seconds ago the /proc/<initpid> directory was last
 * changed (st_ctime), where <initpid> is whatever
 * lookup_initpid_in_store() reports for @pid.
 *
 * Returns 0 when the lookup yields no usable pid, when the path cannot be
 * formatted, or when lstat() on it fails.
 */
static long int getreaperage(pid_t pid)
{
        char procdir[100];
        struct stat st;
        pid_t reaper = lookup_initpid_in_store(pid);
        int n;

        if (reaper <= 0)
                return 0;

        n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
        if (n < 0 || n >= (int)sizeof(procdir))
                return 0;

        if (lstat(procdir, &st) < 0)
                return 0;

        return time(NULL) - st.st_ctime;
}
3654
/*
 * Read "cpuacct.usage" from the cpuacct cgroup of the init pid that
 * lookup_initpid_in_store() reports for @task and return it divided by
 * 10^9 (presumably nanoseconds -> seconds; confirm against the cpuacct
 * controller documentation).
 *
 * Returns 0 when the init pid, its cgroup or the usage value cannot be
 * obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
        unsigned long busy = 0;
        char *cg = NULL, *raw = NULL;
        pid_t reaper;

        reaper = lookup_initpid_in_store(task);
        if (reaper <= 0)
                return 0;

        cg = get_pid_cgroup(reaper, "cpuacct");
        if (cg) {
                prune_init_slice(cg);
                if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
                        busy = strtoul(raw, NULL, 10) / 1000000000;
        }

        /* free(NULL) is a no-op, so both are safe on every path */
        free(cg);
        free(raw);
        return busy;
}
3678
#if RELOADTEST
/*
 * Reload-test marker: create (or truncate) /tmp/lxcfs-iwashere with mode
 * 0644.  Failure to create the file is silently ignored.
 */
void iwashere(void)
{
        int marker = creat("/tmp/lxcfs-iwashere", 0644);

        if (marker < 0)
                return;
        close(marker);
}
#endif
3689
3690 /*
3691  * We read /proc/uptime and reuse its second field.
3692  * For the first field, we use the mtime for the reaper for
3693  * the calling pid as returned by getreaperage
3694  */
3695 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3696                 struct fuse_file_info *fi)
3697 {
3698         struct fuse_context *fc = fuse_get_context();
3699         struct file_info *d = (struct file_info *)fi->fh;
3700         long int reaperage = getreaperage(fc->pid);
3701         unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3702         char *cache = d->buf;
3703         ssize_t total_len = 0;
3704
3705 #if RELOADTEST
3706         iwashere();
3707 #endif
3708
3709         if (offset){
3710                 if (offset > d->size)
3711                         return -EINVAL;
3712                 if (!d->cached)
3713                         return 0;
3714                 int left = d->size - offset;
3715                 total_len = left > size ? size: left;
3716                 memcpy(buf, cache + offset, total_len);
3717                 return total_len;
3718         }
3719
3720         idletime = reaperage - busytime;
3721         if (idletime > reaperage)
3722                 idletime = reaperage;
3723
3724         total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3725         if (total_len < 0){
3726                 perror("Error writing to cache");
3727                 return 0;
3728         }
3729
3730         d->size = (int)total_len;
3731         d->cached = 1;
3732
3733         if (total_len > size) total_len = size;
3734
3735         memcpy(buf, d->buf, total_len);
3736         return total_len;
3737 }
3738
3739 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3740                 struct fuse_file_info *fi)
3741 {
3742         char dev_name[72];
3743         struct fuse_context *fc = fuse_get_context();
3744         struct file_info *d = (struct file_info *)fi->fh;
3745         char *cg;
3746         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3747                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
3748         unsigned long read = 0, write = 0;
3749         unsigned long read_merged = 0, write_merged = 0;
3750         unsigned long read_sectors = 0, write_sectors = 0;
3751         unsigned long read_ticks = 0, write_ticks = 0;
3752         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3753         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3754         char *cache = d->buf;
3755         size_t cache_size = d->buflen;
3756         char *line = NULL;
3757         size_t linelen = 0, total_len = 0, rv = 0;
3758         unsigned int major = 0, minor = 0;
3759         int i = 0;
3760         FILE *f = NULL;
3761
3762         if (offset){
3763                 if (offset > d->size)
3764                         return -EINVAL;
3765                 if (!d->cached)
3766                         return 0;
3767                 int left = d->size - offset;
3768                 total_len = left > size ? size: left;
3769                 memcpy(buf, cache + offset, total_len);
3770                 return total_len;
3771         }
3772
3773         pid_t initpid = lookup_initpid_in_store(fc->pid);
3774         if (initpid <= 0)
3775                 initpid = fc->pid;
3776         cg = get_pid_cgroup(initpid, "blkio");
3777         if (!cg)
3778                 return read_file("/proc/diskstats", buf, size, d);
3779         prune_init_slice(cg);
3780
3781         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
3782                 goto err;
3783         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
3784                 goto err;
3785         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
3786                 goto err;
3787         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
3788                 goto err;
3789         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
3790                 goto err;
3791
3792
3793         f = fopen("/proc/diskstats", "r");
3794         if (!f)
3795                 goto err;
3796
3797         while (getline(&line, &linelen, f) != -1) {
3798                 ssize_t l;
3799                 char lbuf[256];
3800
3801                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
3802                 if (i != 3)
3803                         continue;
3804
3805                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3806                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3807                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3808                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3809                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3810                 read_sectors = read_sectors/512;
3811                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3812                 write_sectors = write_sectors/512;
3813
3814                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3815                 rd_svctm = rd_svctm/1000000;
3816                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3817                 rd_wait = rd_wait/1000000;
3818                 read_ticks = rd_svctm + rd_wait;
3819
3820                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3821                 wr_svctm =  wr_svctm/1000000;
3822                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3823                 wr_wait =  wr_wait/1000000;
3824                 write_ticks = wr_svctm + wr_wait;
3825
3826                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3827                 tot_ticks =  tot_ticks/1000000;
3828
3829                 memset(lbuf, 0, 256);
3830                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3831                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3832                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3833                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3834                 else
3835                         continue;
3836
3837                 l = snprintf(cache, cache_size, "%s", lbuf);
3838                 if (l < 0) {
3839                         perror("Error writing to fuse buf");
3840                         rv = 0;
3841                         goto err;
3842                 }
3843                 if (l >= cache_size) {
3844                         fprintf(stderr, "Internal error: truncated write to cache\n");
3845                         rv = 0;
3846                         goto err;
3847                 }
3848                 cache += l;
3849                 cache_size -= l;
3850                 total_len += l;
3851         }
3852
3853         d->cached = 1;
3854         d->size = total_len;
3855         if (total_len > size ) total_len = size;
3856         memcpy(buf, d->buf, total_len);
3857
3858         rv = total_len;
3859 err:
3860         free(cg);
3861         if (f)
3862                 fclose(f);
3863         free(line);
3864         free(io_serviced_str);
3865         free(io_merged_str);
3866         free(io_service_bytes_str);
3867         free(io_wait_time_str);
3868         free(io_service_time_str);
3869         return rv;
3870 }
3871
3872 static int proc_swaps_read(char *buf, size_t size, off_t offset,
3873                 struct fuse_file_info *fi)
3874 {
3875         struct fuse_context *fc = fuse_get_context();
3876         struct file_info *d = (struct file_info *)fi->fh;
3877         char *cg = NULL;
3878         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
3879              *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3880         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
3881         ssize_t total_len = 0, rv = 0;
3882         ssize_t l = 0;
3883         char *cache = d->buf;
3884
3885         if (offset) {
3886                 if (offset > d->size)
3887                         return -EINVAL;
3888                 if (!d->cached)
3889                         return 0;
3890                 int left = d->size - offset;
3891                 total_len = left > size ? size: left;
3892                 memcpy(buf, cache + offset, total_len);
3893                 return total_len;
3894         }
3895
3896         pid_t initpid = lookup_initpid_in_store(fc->pid);
3897         if (initpid <= 0)
3898                 initpid = fc->pid;
3899         cg = get_pid_cgroup(initpid, "memory");
3900         if (!cg)
3901                 return read_file("/proc/swaps", buf, size, d);
3902         prune_init_slice(cg);
3903
3904         if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
3905                 goto err;
3906
3907         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3908                 goto err;
3909
3910         memlimit = strtoul(memlimit_str, NULL, 10);
3911         memusage = strtoul(memusage_str, NULL, 10);
3912
3913         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
3914             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
3915
3916                 /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
3917                 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3918                     goto err;
3919                 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3920                     goto err;
3921
3922                 memswlimit = strtoul(memswlimit_str, NULL, 10);
3923                 memswusage = strtoul(memswusage_str, NULL, 10);
3924
3925                 if (!strcmp(memswlimit_str, memswlimit_default_str))
3926                     memswlimit = 0;
3927                 if (!strcmp(memswusage_str, memswusage_default_str))
3928                     memswusage = 0;
3929
3930                 swap_total = (memswlimit - memlimit) / 1024;
3931                 swap_free = (memswusage - memusage) / 1024;
3932         }
3933
3934         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
3935
3936         /* When no mem + swap limit is specified or swapaccount=0*/
3937         if (!memswlimit) {
3938                 char *line = NULL;
3939                 size_t linelen = 0;
3940                 FILE *f = fopen("/proc/meminfo", "r");
3941
3942                 if (!f)
3943                         goto err;
3944
3945                 while (getline(&line, &linelen, f) != -1) {
3946                         if (startswith(line, "SwapTotal:")) {
3947                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
3948                         } else if (startswith(line, "SwapFree:")) {
3949                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
3950                         }
3951                 }
3952
3953                 free(line);
3954                 fclose(f);
3955         }
3956
3957         if (swap_total > 0) {
3958                 l = snprintf(d->buf + total_len, d->size - total_len,
3959                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
3960                                 swap_total, swap_free);
3961                 total_len += l;
3962         }
3963
3964         if (total_len < 0 || l < 0) {
3965                 perror("Error writing to cache");
3966                 rv = 0;
3967                 goto err;
3968         }
3969
3970         d->cached = 1;
3971         d->size = (int)total_len;
3972
3973         if (total_len > size) total_len = size;
3974         memcpy(buf, d->buf, total_len);
3975         rv = total_len;
3976
3977 err:
3978         free(cg);
3979         free(memswlimit_str);
3980         free(memlimit_str);
3981         free(memusage_str);
3982         free(memswusage_str);
3983         free(memswusage_default_str);
3984         free(memswlimit_default_str);
3985         return rv;
3986 }
3987
/*
 * Return the number of bytes in the file @which, determined by reading it
 * line by line and adding up the line lengths.  Returns 0 when the file
 * cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
        char *linebuf = NULL;
        size_t cap = 0;
        ssize_t total = 0;
        ssize_t got;
        FILE *fp;

        fp = fopen(which, "r");
        if (fp == NULL)
                return 0;

        for (;;) {
                got = getline(&linebuf, &cap, fp);
                if (got == -1)
                        break;
                total += got;
        }

        fclose(fp);
        free(linebuf);

        return total;
}
4004
/*
 * Fill @sb for one of the paths served below /proc.
 *
 * "/proc" is reported as a directory (mode 0555, two links); the six
 * virtualised files are reported as regular files (mode 0444, one link,
 * size 0).  Owner and group are 0 and all three timestamps are set to the
 * current CLOCK_REALTIME.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
        static const char *const regular_files[] = {
                "/proc/meminfo",
                "/proc/cpuinfo",
                "/proc/uptime",
                "/proc/stat",
                "/proc/diskstats",
                "/proc/swaps",
        };
        struct timespec stamp;
        size_t idx;

        memset(sb, 0, sizeof(*sb));
        if (clock_gettime(CLOCK_REALTIME, &stamp) < 0)
                return -EINVAL;

        sb->st_uid = 0;
        sb->st_gid = 0;
        sb->st_atim = stamp;
        sb->st_mtim = stamp;
        sb->st_ctim = stamp;

        if (!strcmp(path, "/proc")) {
                sb->st_mode = S_IFDIR | 00555;
                sb->st_nlink = 2;
                return 0;
        }

        for (idx = 0; idx < sizeof(regular_files) / sizeof(regular_files[0]); idx++) {
                if (strcmp(path, regular_files[idx]) != 0)
                        continue;
                sb->st_size = 0;
                sb->st_mode = S_IFREG | 00444;
                sb->st_nlink = 1;
                return 0;
        }

        return -ENOENT;
}
4033
4034 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4035                 struct fuse_file_info *fi)
4036 {
4037         if (filler(buf, ".", NULL, 0) != 0 ||
4038             filler(buf, "..", NULL, 0) != 0 ||
4039             filler(buf, "cpuinfo", NULL, 0) != 0 ||
4040             filler(buf, "meminfo", NULL, 0) != 0 ||
4041             filler(buf, "stat", NULL, 0) != 0 ||
4042             filler(buf, "uptime", NULL, 0) != 0 ||
4043             filler(buf, "diskstats", NULL, 0) != 0 ||
4044             filler(buf, "swaps", NULL, 0) != 0)
4045                 return -EINVAL;
4046         return 0;
4047 }
4048
4049 int proc_open(const char *path, struct fuse_file_info *fi)
4050 {
4051         int type = -1;
4052         struct file_info *info;
4053
4054         if (strcmp(path, "/proc/meminfo") == 0)
4055                 type = LXC_TYPE_PROC_MEMINFO;
4056         else if (strcmp(path, "/proc/cpuinfo") == 0)
4057                 type = LXC_TYPE_PROC_CPUINFO;
4058         else if (strcmp(path, "/proc/uptime") == 0)
4059                 type = LXC_TYPE_PROC_UPTIME;
4060         else if (strcmp(path, "/proc/stat") == 0)
4061                 type = LXC_TYPE_PROC_STAT;
4062         else if (strcmp(path, "/proc/diskstats") == 0)
4063                 type = LXC_TYPE_PROC_DISKSTATS;
4064         else if (strcmp(path, "/proc/swaps") == 0)
4065                 type = LXC_TYPE_PROC_SWAPS;
4066         if (type == -1)
4067                 return -ENOENT;
4068
4069         info = malloc(sizeof(*info));
4070         if (!info)
4071                 return -ENOMEM;
4072
4073         memset(info, 0, sizeof(*info));
4074         info->type = type;
4075
4076         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4077         do {
4078                 info->buf = malloc(info->buflen);
4079         } while (!info->buf);
4080         memset(info->buf, 0, info->buflen);
4081         /* set actual size to buffer size */
4082         info->size = info->buflen;
4083
4084         fi->fh = (unsigned long)info;
4085         return 0;
4086 }
4087
/*
 * Access check for the paths served below /proc.
 *
 * "/proc" itself is granted whenever access(2) reports it as readable.
 * For everything else (and for "/proc" when that check fails) any
 * requested mode bit other than R_OK is refused with -EACCES; otherwise
 * 0 is returned.
 */
int proc_access(const char *path, int mask)
{
        int is_proc_dir = (strcmp(path, "/proc") == 0);

        if (is_proc_dir && access(path, R_OK) == 0)
                return 0;

        /* these are all read-only */
        return (mask & ~R_OK) ? -EACCES : 0;
}
4098
/*
 * Release handler for the /proc files: hands @fi to
 * do_release_file_info() and always reports success.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;     /* the path is not consulted */

        do_release_file_info(fi);

        return 0;
}
4104
4105 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4106                 struct fuse_file_info *fi)
4107 {
4108         struct file_info *f = (struct file_info *) fi->fh;
4109
4110         switch (f->type) {
4111         case LXC_TYPE_PROC_MEMINFO:
4112                 return proc_meminfo_read(buf, size, offset, fi);
4113         case LXC_TYPE_PROC_CPUINFO:
4114                 return proc_cpuinfo_read(buf, size, offset, fi);
4115         case LXC_TYPE_PROC_UPTIME:
4116                 return proc_uptime_read(buf, size, offset, fi);
4117         case LXC_TYPE_PROC_STAT:
4118                 return proc_stat_read(buf, size, offset, fi);
4119         case LXC_TYPE_PROC_DISKSTATS:
4120                 return proc_diskstats_read(buf, size, offset, fi);
4121         case LXC_TYPE_PROC_SWAPS:
4122                 return proc_swaps_read(buf, size, offset, fi);
4123         default:
4124                 return -EINVAL;
4125         }
4126 }
4127
/*
 * Functions needed to set up cgroups in the constructor.
 */
4131
/*
 * Create @dir and every missing parent directory with permissions @mode,
 * like "mkdir -p".  Components that already exist are not an error.
 * Both absolute and relative paths are accepted.
 *
 * Returns true when all components exist afterwards, false when @dir is
 * empty, when memory cannot be allocated or when a mkdir() call fails for
 * a reason other than EEXIST (the failure is reported on stderr).
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
        const char *tmp = dir;
        const char *orig = dir;
        size_t full_len = strlen(orig);
        bool ok = true;
        char *path;

        if (full_len == 0)
                return false;

        /* One writable copy that is cut at each component boundary in turn. */
        path = malloc(full_len + 1);
        if (!path)
                return false;
        memcpy(path, orig, full_len + 1);

        do {
                size_t prefix_len;
                char saved;

                dir = tmp + strspn(tmp, "/");
                tmp = dir + strcspn(dir, "/");
                prefix_len = (size_t)(dir - orig);

                /*
                 * A relative path has an empty first prefix; there is
                 * nothing to create for it and mkdir("") would fail.
                 */
                if (prefix_len == 0)
                        continue;

                saved = path[prefix_len];
                path[prefix_len] = '\0';
                if (mkdir(path, mode) && errno != EEXIST) {
                        fprintf(stderr, "failed to create directory '%s': %s\n",
                                path, strerror(errno));
                        ok = false;
                        break;
                }
                path[prefix_len] = saved;
        } while(tmp != dir);

        free(path);
        return ok;
}
4155
4156 static bool umount_if_mounted(void)
4157 {
4158         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4159                 fprintf(stderr, "failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4160                 return false;
4161         }
4162         return true;
4163 }
4164
4165 static int pivot_enter(void)
4166 {
4167         int ret = -1, oldroot = -1, newroot = -1;
4168
4169         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4170         if (oldroot < 0) {
4171                 fprintf(stderr, "%s: Failed to open old root for fchdir.\n", __func__);
4172                 return ret;
4173         }
4174
4175         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4176         if (newroot < 0) {
4177                 fprintf(stderr, "%s: Failed to open new root for fchdir.\n", __func__);
4178                 goto err;
4179         }
4180
4181         /* change into new root fs */
4182         if (fchdir(newroot) < 0) {
4183                 fprintf(stderr, "%s: Failed to change directory to new rootfs: %s.\n", __func__, ROOTDIR);
4184                 goto err;
4185         }
4186
4187         /* pivot_root into our new root fs */
4188         if (pivot_root(".", ".") < 0) {
4189                 fprintf(stderr, "%s: pivot_root() syscall failed: %s.\n", __func__, strerror(errno));
4190                 goto err;
4191         }
4192
4193         /*
4194          * At this point the old-root is mounted on top of our new-root.
4195          * To unmounted it we must not be chdir'd into it, so escape back
4196          * to the old-root.
4197          */
4198         if (fchdir(oldroot) < 0) {
4199                 fprintf(stderr, "%s: Failed to enter old root.\n", __func__);
4200                 goto err;
4201         }
4202         if (umount2(".", MNT_DETACH) < 0) {
4203                 fprintf(stderr, "%s: Failed to detach old root.\n", __func__);
4204                 goto err;
4205         }
4206
4207         if (fchdir(newroot) < 0) {
4208                 fprintf(stderr, "%s: Failed to re-enter new root.\n", __func__);
4209                 goto err;
4210         }
4211
4212         ret = 0;
4213
4214 err:
4215         if (oldroot > 0)
4216                 close(oldroot);
4217         if (newroot > 0)
4218                 close(newroot);
4219         return ret;
4220 }
4221
4222 /* Prepare our new clean root. */
4223 static int pivot_prepare(void)
4224 {
4225         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4226                 fprintf(stderr, "%s: Failed to create directory for new root.\n", __func__);
4227                 return -1;
4228         }
4229
4230         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4231                 fprintf(stderr, "%s: Failed to bind-mount / for new root: %s.\n", __func__, strerror(errno));
4232                 return -1;
4233         }
4234
4235         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4236                 fprintf(stderr, "%s: Failed to bind-mount /run into new root: %s.\n", __func__, strerror(errno));
4237                 return -1;
4238         }
4239
4240         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4241                 printf("%s: failed to move " BASEDIR " into new root: %s.\n", __func__, strerror(errno));
4242                 return -1;
4243         }
4244
4245         return 0;
4246 }
4247
/*
 * Build the new root with pivot_prepare() and switch to it with
 * pivot_enter().  pivot_enter() is only attempted when the preparation
 * succeeded.  Returns true when both steps succeed.
 */
static bool pivot_new_root(void)
{
        return pivot_prepare() >= 0 && pivot_enter() >= 0;
}
4260
4261 static bool setup_cgfs_dir(void)
4262 {
4263         if (!mkdir_p(BASEDIR, 0700)) {
4264                 fprintf(stderr, "Failed to create lxcfs cgroup mountpoint.\n");
4265                 return false;
4266         }
4267
4268         if (!umount_if_mounted()) {
4269                 fprintf(stderr, "Failed to clean up old lxcfs cgroup mountpoint.\n");
4270                 return false;
4271         }
4272
4273         if (unshare(CLONE_NEWNS) < 0) {
4274                 fprintf(stderr, "%s: Failed to unshare mount namespace: %s.\n", __func__, strerror(errno));
4275                 return false;
4276         }
4277
4278         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4279                 fprintf(stderr, "%s: Failed to remount / private: %s.\n", __func__, strerror(errno));
4280                 return false;
4281         }
4282
4283         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
4284                 fprintf(stderr, "Failed to mount tmpfs over lxcfs cgroup mountpoint.\n");
4285                 return false;
4286         }
4287
4288         return true;
4289 }
4290
4291 static bool do_mount_cgroups(void)
4292 {
4293         char *target;
4294         size_t clen, len;
4295         int i, ret;
4296
4297         for (i = 0; i < num_hierarchies; i++) {
4298                 char *controller = hierarchies[i];
4299                 clen = strlen(controller);
4300                 len = strlen(BASEDIR) + clen + 2;
4301                 target = malloc(len);
4302                 if (!target)
4303                         return false;
4304                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4305                 if (ret < 0 || ret >= len) {
4306                         free(target);
4307                         return false;
4308                 }
4309                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4310                         free(target);
4311                         return false;
4312                 }
4313                 if (mount(controller, target, "cgroup", 0, controller) < 0) {
4314                         fprintf(stderr, "Failed mounting cgroup %s\n", controller);
4315                         free(target);
4316                         return false;
4317                 }
4318
4319                 fd_hierarchies[i] = open(target, O_DIRECTORY);
4320                 if (fd_hierarchies[i] < 0) {
4321                         free(target);
4322                         return false;
4323                 }
4324                 free(target);
4325         }
4326         return true;
4327 }
4328
/*
 * Set up the private cgroup mounts: prepare BASEDIR with setup_cgfs_dir(),
 * mount the hierarchies with do_mount_cgroups() and then switch to the new
 * root with pivot_new_root().  Each step runs only if the previous one
 * succeeded.  Returns true when all three succeed.
 */
static bool cgfs_setup_controllers(void)
{
        if (!setup_cgfs_dir())
                return false;

        if (do_mount_cgroups())
                return pivot_new_root();

        fprintf(stderr, "Failed to set up private lxcfs cgroup mounts.\n");
        return false;
}
4344
/*
 * Open /proc/<pid>/ns/mnt read-only with O_CLOEXEC.
 *
 * Returns the descriptor from open() (negative when open() fails), or -1
 * when the path cannot be formatted.
 */
static int preserve_ns(int pid)
{
        /* "/proc" + "/<int as string>" + "/ns/mnt" + terminating NUL */
        enum { NS_PATH_LEN = 5 + 21 + 7 + 1 };
        char nspath[NS_PATH_LEN];
        int n;

        n = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/mnt", pid);
        if (n < 0)
                return -1;
        if ((size_t)n >= sizeof(nspath))
                return -1;

        return open(nspath, O_RDONLY | O_CLOEXEC);
}
4357
4358 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
4359 {
4360         FILE *f;
4361         char *line = NULL;
4362         size_t len = 0;
4363         int i, init_ns = -1;
4364
4365         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4366                 fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno));
4367                 return;
4368         }
4369         while (getline(&line, &len, f) != -1) {
4370                 char *p, *p2;
4371
4372                 p = strchr(line, ':');
4373                 if (!p)
4374                         goto out;
4375                 *(p++) = '\0';
4376
4377                 p2 = strrchr(p, ':');
4378                 if (!p2)
4379                         goto out;
4380                 *p2 = '\0';
4381
4382                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4383                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4384                  * because it parses out the empty string "" and later on passes
4385                  * it to mount(). Let's skip such entries.
4386                  */
4387                 if (!strcmp(p, ""))
4388                         continue;
4389
4390                 if (!store_hierarchy(line, p))
4391                         goto out;
4392         }
4393
4394         /* Preserve initial namespace. */
4395         init_ns = preserve_ns(getpid());
4396         if (init_ns < 0)
4397                 goto out;
4398
4399         fd_hierarchies = malloc(sizeof(int *) * num_hierarchies);
4400         if (!fd_hierarchies)
4401                 goto out;
4402
4403         for (i = 0; i < num_hierarchies; i++)
4404                 fd_hierarchies[i] = -1;
4405
4406         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4407          * to privately mount lxcfs cgroups. */
4408         if (!cgfs_setup_controllers())
4409                 goto out;
4410
4411         if (setns(init_ns, 0) < 0)
4412                 goto out;
4413
4414         print_subsystems();
4415
4416 out:
4417         free(line);
4418         fclose(f);
4419         if (init_ns >= 0)
4420                 close(init_ns);
4421 }
4422
4423 static void __attribute__((destructor)) free_subsystems(void)
4424 {
4425         int i;
4426
4427         for (i = 0; i < num_hierarchies; i++) {
4428                 if (hierarchies[i])
4429                         free(hierarchies[i]);
4430                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
4431                         close(fd_hierarchies[i]);
4432         }
4433         free(hierarchies);
4434         free(fd_hierarchies);
4435 }