bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #include <dirent.h>
  12 #include <errno.h>
  13 #include <fcntl.h>
  14 #include <fuse.h>
  15 #include <libgen.h>
  16 #include <pthread.h>
  17 #include <sched.h>
  18 #include <stdbool.h>
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include <string.h>
  22 #include <time.h>
  23 #include <unistd.h>
  24 #include <wait.h>
  25 #include <linux/sched.h>
  26 #include <sys/epoll.h>
  27 #include <sys/mman.h>
  28 #include <sys/mount.h>
  29 #include <sys/param.h>
  30 #include <sys/socket.h>
  31 #include <sys/syscall.h>
  32
  33 #include "bindings.h"
  34 #include "config.h" // for VERSION
  35
  36 /* Define pivot_root() if missing from the C library */
  37 #ifndef HAVE_PIVOT_ROOT
  38 static int pivot_root(const char * new_root, const char * put_old)
  39 {
  40 #ifdef __NR_pivot_root
  41 return syscall(__NR_pivot_root, new_root, put_old);
  42 #else
  43 errno = ENOSYS;
  44 return -1;
  45 #endif
  46 }
  47 #else
  48 extern int pivot_root(const char * new_root, const char * put_old);
  49 #endif
  50
  51 enum {
  52         LXC_TYPE_CGDIR,
  53         LXC_TYPE_CGFILE,
  54         LXC_TYPE_PROC_MEMINFO,
  55         LXC_TYPE_PROC_CPUINFO,
  56         LXC_TYPE_PROC_UPTIME,
  57         LXC_TYPE_PROC_STAT,
  58         LXC_TYPE_PROC_DISKSTATS,
  59         LXC_TYPE_PROC_SWAPS,
  60 };
  61
  62 struct file_info {
  63         char *controller;
  64         char *cgroup;
  65         char *file;
  66         int type;
  67         char *buf;  // unused as of yet
  68         int buflen;
  69         int size; //actual data size
  70         int cached;
  71 };
  72
  73 /* reserve buffer size, for cpuall in /proc/stat */
  74 #define BUF_RESERVE_SIZE 256
  75
  76 /*
  77  * A table caching which pid is init for a pid namespace.
  78  * When looking up which pid is init for $qpid, we first
  79  * 1. Stat /proc/$qpid/ns/pid.
  80  * 2. Check whether the ino_t is in our store.
  81  *   a. if not, fork a child in qpid's ns to send us
  82  *       ucred.pid = 1, and read the initpid.  Cache
  83  *       initpid and creation time for /proc/initpid
  84  *       in a new store entry.
  85  *   b. if so, verify that /proc/initpid still matches
  86  *       what we have saved.  If not, clear the store
  87  *       entry and go back to a.  If so, return the
  88  *       cached initpid.
  89  */
  90 struct pidns_init_store {
  91         ino_t ino;          // inode number for /proc/$pid/ns/pid
  92         pid_t initpid;      // the pid of nit in that ns
  93         long int ctime;     // the time at which /proc/$initpid was created
  94         struct pidns_init_store *next;
  95         long int lastcheck;
  96 };
  97
  98 /* lol - look at how they are allocated in the kernel */
  99 #define PIDNS_HASH_SIZE 4096
 100 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 101
 102 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 103 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 104 static void lock_mutex(pthread_mutex_t *l)
 105 {
 106         int ret;
 107
 108         if ((ret = pthread_mutex_lock(l)) != 0) {
 109                 fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", ret, strerror(ret));
 110                 exit(1);
 111         }
 112 }
 113
 114 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 115  * Number of hierarchies mounted. */
 116 static int num_hierarchies;
 117
 118 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 119  * Hierachies mounted {cpuset, blkio, ...}:
 120  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 121 static char **hierarchies;
 122
 123 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 124  * Open file descriptors:
 125  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 126  * private mount namespace.
 127  * Initialized via __constructor__ collect_and_mount_subsystems().
 128  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 129  * mounts and respective files in the private namespace even when located in
 130  * another namespace using the *at() family of functions
 131  * {openat(), fchownat(), ...}. */
 132 static int *fd_hierarchies;
 133
 134 static void unlock_mutex(pthread_mutex_t *l)
 135 {
 136         int ret;
 137
 138         if ((ret = pthread_mutex_unlock(l)) != 0) {
 139                 fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", ret, strerror(ret));
 140                 exit(1);
 141         }
 142 }
 143
 144 static void store_lock(void)
 145 {
 146         lock_mutex(&pidns_store_mutex);
 147 }
 148
 149 static void store_unlock(void)
 150 {
 151         unlock_mutex(&pidns_store_mutex);
 152 }
 153
 154 /* Must be called under store_lock */
 155 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 156 {
 157         struct stat initsb;
 158         char fnam[100];
 159
 160         snprintf(fnam, 100, "/proc/%d", e->initpid);
 161         if (stat(fnam, &initsb) < 0)
 162                 return false;
 163 #if DEBUG
 164         fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
 165                 e->ctime, initsb.st_ctime, e->initpid);
 166 #endif
 167         if (e->ctime != initsb.st_ctime)
 168                 return false;
 169         return true;
 170 }
 171
 172 /* Must be called under store_lock */
 173 static void remove_initpid(struct pidns_init_store *e)
 174 {
 175         struct pidns_init_store *tmp;
 176         int h;
 177
 178 #if DEBUG
 179         fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
 180 #endif
 181         h = HASH(e->ino);
 182         if (pidns_hash_table[h] == e) {
 183                 pidns_hash_table[h] = e->next;
 184                 free(e);
 185                 return;
 186         }
 187
 188         tmp = pidns_hash_table[h];
 189         while (tmp) {
 190                 if (tmp->next == e) {
 191                         tmp->next = e->next;
 192                         free(e);
 193                         return;
 194                 }
 195                 tmp = tmp->next;
 196         }
 197 }
 198
 199 #define PURGE_SECS 5
 200 /* Must be called under store_lock */
 201 static void prune_initpid_store(void)
 202 {
 203         static long int last_prune = 0;
 204         struct pidns_init_store *e, *prev, *delme;
 205         long int now, threshold;
 206         int i;
 207
 208         if (!last_prune) {
 209                 last_prune = time(NULL);
 210                 return;
 211         }
 212         now = time(NULL);
 213         if (now < last_prune + PURGE_SECS)
 214                 return;
 215 #if DEBUG
 216         fprintf(stderr, "pruning\n");
 217 #endif
 218         last_prune = now;
 219         threshold = now - 2 * PURGE_SECS;
 220
 221         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 222                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 223                         if (e->lastcheck < threshold) {
 224 #if DEBUG
 225                                 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
 226 #endif
 227                                 delme = e;
 228                                 if (prev)
 229                                         prev->next = e->next;
 230                                 else
 231                                         pidns_hash_table[i] = e->next;
 232                                 e = e->next;
 233                                 free(delme);
 234                         } else {
 235                                 prev = e;
 236                                 e = e->next;
 237                         }
 238                 }
 239         }
 240 }
 241
 242 /* Must be called under store_lock */
 243 static void save_initpid(struct stat *sb, pid_t pid)
 244 {
 245         struct pidns_init_store *e;
 246         char fpath[100];
 247         struct stat procsb;
 248         int h;
 249
 250 #if DEBUG
 251         fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
 252 #endif
 253         snprintf(fpath, 100, "/proc/%d", pid);
 254         if (stat(fpath, &procsb) < 0)
 255                 return;
 256         do {
 257                 e = malloc(sizeof(*e));
 258         } while (!e);
 259         e->ino = sb->st_ino;
 260         e->initpid = pid;
 261         e->ctime = procsb.st_ctime;
 262         h = HASH(e->ino);
 263         e->next = pidns_hash_table[h];
 264         e->lastcheck = time(NULL);
 265         pidns_hash_table[h] = e;
 266 }
 267
 268 /*
 269  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 270  * entry for the inode number and creation time.  Verify that the init pid
 271  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 272  * otherwise.
 273  * Must be called under store_lock
 274  */
 275 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 276 {
 277         int h = HASH(sb->st_ino);
 278         struct pidns_init_store *e = pidns_hash_table[h];
 279
 280         while (e) {
 281                 if (e->ino == sb->st_ino) {
 282                         if (initpid_still_valid(e, sb)) {
 283                                 e->lastcheck = time(NULL);
 284                                 return e;
 285                         }
 286                         remove_initpid(e);
 287                         return NULL;
 288                 }
 289                 e = e->next;
 290         }
 291
 292         return NULL;
 293 }
 294
 295 static int is_dir(const char *path, int fd)
 296 {
 297         struct stat statbuf;
 298         int ret = fstatat(fd, path, &statbuf, fd);
 299         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 300                 return 1;
 301         return 0;
 302 }
 303
 304 static char *must_copy_string(const char *str)
 305 {
 306         char *dup = NULL;
 307         if (!str)
 308                 return NULL;
 309         do {
 310                 dup = strdup(str);
 311         } while (!dup);
 312
 313         return dup;
 314 }
 315
 316 static inline void drop_trailing_newlines(char *s)
 317 {
 318         int l;
 319
 320         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 321                 s[l-1] = '\0';
 322 }
 323
 324 #define BATCH_SIZE 50
 325 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 326 {
 327         int newbatches = (newlen / BATCH_SIZE) + 1;
 328         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 329
 330         if (!*mem || newbatches > oldbatches) {
 331                 char *tmp;
 332                 do {
 333                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 334                 } while (!tmp);
 335                 *mem = tmp;
 336         }
 337 }
 338 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 339 {
 340         size_t newlen = *len + linelen;
 341         dorealloc(contents, *len, newlen + 1);
 342         memcpy(*contents + *len, line, linelen+1);
 343         *len = newlen;
 344 }
 345
 346 static char *slurp_file(const char *from, int fd)
 347 {
 348         char *line = NULL;
 349         char *contents = NULL;
 350         FILE *f = fdopen(fd, "r");
 351         size_t len = 0, fulllen = 0;
 352         ssize_t linelen;
 353
 354         if (!f)
 355                 return NULL;
 356
 357         while ((linelen = getline(&line, &len, f)) != -1) {
 358                 append_line(&contents, &fulllen, line, linelen);
 359         }
 360         fclose(f);
 361
 362         if (contents)
 363                 drop_trailing_newlines(contents);
 364         free(line);
 365         return contents;
 366 }
 367
 368 static bool write_string(const char *fnam, const char *string, int fd)
 369 {
 370         FILE *f;
 371         size_t len, ret;
 372
 373         if (!(f = fdopen(fd, "w")))
 374                 return false;
 375         len = strlen(string);
 376         ret = fwrite(string, 1, len, f);
 377         if (ret != len) {
 378                 fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
 379                 fclose(f);
 380                 return false;
 381         }
 382         if (fclose(f) < 0) {
 383                 fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
 384                 return false;
 385         }
 386         return true;
 387 }
 388
 389 struct cgfs_files {
 390         char *name;
 391         uint32_t uid, gid;
 392         uint32_t mode;
 393 };
 394
 395 #define ALLOC_NUM 20
 396 static bool store_hierarchy(char *stridx, char *h)
 397 {
 398         if (num_hierarchies % ALLOC_NUM == 0) {
 399                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 400                 n *= ALLOC_NUM;
 401                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 402                 if (!tmp) {
 403                         fprintf(stderr, "Out of memory\n");
 404                         exit(1);
 405                 }
 406                 hierarchies = tmp;
 407         }
 408
 409         hierarchies[num_hierarchies++] = must_copy_string(h);
 410         return true;
 411 }
 412
 413 static void print_subsystems(void)
 414 {
 415         int i;
 416
 417         fprintf(stderr, "hierarchies:\n");
 418         for (i = 0; i < num_hierarchies; i++) {
 419                 if (hierarchies[i])
 420                         fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
 421         }
 422 }
 423
 424 static bool in_comma_list(const char *needle, const char *haystack)
 425 {
 426         const char *s = haystack, *e;
 427         size_t nlen = strlen(needle);
 428
 429         while (*s && (e = strchr(s, ','))) {
 430                 if (nlen != e - s) {
 431                         s = e + 1;
 432                         continue;
 433                 }
 434                 if (strncmp(needle, s, nlen) == 0)
 435                         return true;
 436                 s = e + 1;
 437         }
 438         if (strcmp(needle, s) == 0)
 439                 return true;
 440         return false;
 441 }
 442
 443 /* do we need to do any massaging here?  I'm not sure... */
 444 /* Return the mounted controller and store the corresponding open file descriptor
 445  * referring to the controller mountpoint in the private lxcfs namespace in
 446  * @cfd.
 447  */
 448 static char *find_mounted_controller(const char *controller, int *cfd)
 449 {
 450         int i;
 451
 452         for (i = 0; i < num_hierarchies; i++) {
 453                 if (!hierarchies[i])
 454                         continue;
 455                 if (strcmp(hierarchies[i], controller) == 0) {
 456                         *cfd = fd_hierarchies[i];
 457                         return hierarchies[i];
 458                 }
 459                 if (in_comma_list(controller, hierarchies[i])) {
 460                         *cfd = fd_hierarchies[i];
 461                         return hierarchies[i];
 462                 }
 463         }
 464
 465         return NULL;
 466 }
 467
 468 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 469                 const char *value)
 470 {
 471         int ret, fd, cfd;
 472         size_t len;
 473         char *fnam, *tmpc;
 474
 475         tmpc = find_mounted_controller(controller, &cfd);
 476         if (!tmpc)
 477                 return false;
 478
 479         /* Make sure we pass a relative path to *at() family of functions.
 480          * . + /cgroup + / + file + \0
 481          */
 482         len = strlen(cgroup) + strlen(file) + 3;
 483         fnam = alloca(len);
 484         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 485         if (ret < 0 || (size_t)ret >= len)
 486                 return false;
 487
 488         fd = openat(cfd, fnam, O_WRONLY);
 489         if (fd < 0)
 490                 return false;
 491
 492         return write_string(fnam, value, fd);
 493 }
 494
 495 // Chown all the files in the cgroup directory.  We do this when we create
 496 // a cgroup on behalf of a user.
 497 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 498 {
 499         struct dirent *direntp;
 500         char path[MAXPATHLEN];
 501         size_t len;
 502         DIR *d;
 503         int fd1, ret;
 504
 505         len = strlen(dirname);
 506         if (len >= MAXPATHLEN) {
 507                 fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname);
 508                 return;
 509         }
 510
 511         fd1 = openat(fd, dirname, O_DIRECTORY);
 512         if (fd1 < 0)
 513                 return;
 514
 515         d = fdopendir(fd1);
 516         if (!d) {
 517                 fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname);
 518                 return;
 519         }
 520
 521         while ((direntp = readdir(d))) {
 522                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 523                         continue;
 524                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 525                 if (ret < 0 || ret >= MAXPATHLEN) {
 526                         fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname);
 527                         continue;
 528                 }
 529                 if (fchownat(fd, path, uid, gid, 0) < 0)
 530                         fprintf(stderr, "Failed to chown file %s to %u:%u", path, uid, gid);
 531         }
 532         closedir(d);
 533 }
 534
 535 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 536 {
 537         int cfd;
 538         size_t len;
 539         char *dirnam, *tmpc;
 540
 541         tmpc = find_mounted_controller(controller, &cfd);
 542         if (!tmpc)
 543                 return -EINVAL;
 544
 545         /* Make sure we pass a relative path to *at() family of functions.
 546          * . + /cg + \0
 547          */
 548         len = strlen(cg) + 2;
 549         dirnam = alloca(len);
 550         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 551
 552         if (mkdirat(cfd, dirnam, 0755) < 0)
 553                 return -errno;
 554
 555         if (uid == 0 && gid == 0)
 556                 return 0;
 557
 558         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 559                 return -errno;
 560
 561         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 562
 563         return 0;
 564 }
 565
 566 static bool recursive_rmdir(const char *dirname, int fd)
 567 {
 568         struct dirent *direntp;
 569         DIR *dir;
 570         bool ret = false;
 571         char pathname[MAXPATHLEN];
 572         int dupfd;
 573
 574         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 575         if (dupfd < 0)
 576                 return false;
 577
 578         dir = fdopendir(dupfd);
 579         if (!dir) {
 580 #if DEBUG
 581                 fprintf(stderr, "%s: failed to open %s: %s\n", __func__, dirname, strerror(errno));
 582 #endif
 583                 return false;
 584         }
 585
 586         while ((direntp = readdir(dir))) {
 587                 struct stat mystat;
 588                 int rc;
 589
 590                 if (!direntp)
 591                         break;
 592
 593                 if (!strcmp(direntp->d_name, ".") ||
 594                     !strcmp(direntp->d_name, ".."))
 595                         continue;
 596
 597                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 598                 if (rc < 0 || rc >= MAXPATHLEN) {
 599                         fprintf(stderr, "pathname too long\n");
 600                         continue;
 601                 }
 602
 603                 ret = fstatat(fd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 604                 if (ret) {
 605 #if DEBUG
 606                         fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
 607 #endif
 608                         continue;
 609                 }
 610                 if (S_ISDIR(mystat.st_mode)) {
 611                         if (!recursive_rmdir(pathname, fd)) {
 612 #if DEBUG
 613                                 fprintf(stderr, "Error removing %s\n", pathname);
 614 #endif
 615                         }
 616                 }
 617         }
 618
 619         ret = true;
 620         if (closedir(dir) < 0) {
 621                 fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno));
 622                 ret = false;
 623         }
 624
 625         if (unlinkat(fd, dirname, AT_REMOVEDIR) < 0) {
 626 #if DEBUG
 627                 fprintf(stderr, "%s: failed to delete %s: %s\n", __func__, dirname, strerror(errno));
 628 #endif
 629                 ret = false;
 630         }
 631         close(fd);
 632
 633         return ret;
 634 }
 635
 636 bool cgfs_remove(const char *controller, const char *cg)
 637 {
 638         int fd, cfd;
 639         size_t len;
 640         char *dirnam, *tmpc;
 641
 642         tmpc = find_mounted_controller(controller, &cfd);
 643         if (!tmpc)
 644                 return false;
 645
 646         /* Make sure we pass a relative path to *at() family of functions.
 647          * . +  /cg + \0
 648          */
 649         len = strlen(cg) + 2;
 650         dirnam = alloca(len);
 651         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 652
 653         fd = openat(cfd, dirnam, O_DIRECTORY);
 654         if (fd < 0)
 655                 return false;
 656
 657         return recursive_rmdir(dirnam, fd);
 658 }
 659
 660 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 661 {
 662         int cfd;
 663         size_t len;
 664         char *pathname, *tmpc;
 665
 666         tmpc = find_mounted_controller(controller, &cfd);
 667         if (!tmpc)
 668                 return false;
 669
 670         /* Make sure we pass a relative path to *at() family of functions.
 671          * . + /file + \0
 672          */
 673         len = strlen(file) + 2;
 674         pathname = alloca(len);
 675         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 676         if (fchmodat(cfd, pathname, mode, 0) < 0)
 677                 return false;
 678         return true;
 679 }
 680
 681 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 682 {
 683         size_t len;
 684         char *fname;
 685
 686         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 687         fname = alloca(len);
 688         snprintf(fname, len, "%s/tasks", dirname);
 689         if (fchownat(fd, fname, uid, gid, 0) != 0)
 690                 return -errno;
 691         snprintf(fname, len, "%s/cgroup.procs", dirname);
 692         if (fchownat(fd, fname, uid, gid, 0) != 0)
 693                 return -errno;
 694         return 0;
 695 }
 696
 697 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 698 {
 699         int cfd;
 700         size_t len;
 701         char *pathname, *tmpc;
 702
 703         tmpc = find_mounted_controller(controller, &cfd);
 704         if (!tmpc)
 705                 return -EINVAL;
 706
 707         /* Make sure we pass a relative path to *at() family of functions.
 708          * . + /file + \0
 709          */
 710         len = strlen(file) + 2;
 711         pathname = alloca(len);
 712         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 713         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 714                 return -errno;
 715
 716         if (is_dir(pathname, cfd))
 717                 // like cgmanager did, we want to chown the tasks file as well
 718                 return chown_tasks_files(pathname, uid, gid, cfd);
 719
 720         return 0;
 721 }
 722
 723 FILE *open_pids_file(const char *controller, const char *cgroup)
 724 {
 725         int fd, cfd;
 726         size_t len;
 727         char *pathname, *tmpc;
 728
 729         tmpc = find_mounted_controller(controller, &cfd);
 730         if (!tmpc)
 731                 return NULL;
 732
 733         /* Make sure we pass a relative path to *at() family of functions.
 734          * . + /cgroup + / "cgroup.procs" + \0
 735          */
 736         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 737         pathname = alloca(len);
 738         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 739
 740         fd = openat(cfd, pathname, O_WRONLY);
 741         if (fd < 0)
 742                 return NULL;
 743
 744         return fdopen(fd, "w");
 745 }
 746
 747 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 748                                 void ***list, size_t typesize,
 749                                 void* (*iterator)(const char*, const char*, const char*))
 750 {
 751         int cfd, fd, ret;
 752         size_t len;
 753         char *cg, *tmpc;
 754         char pathname[MAXPATHLEN];
 755         size_t sz = 0, asz = 0;
 756         struct dirent *dirent;
 757         DIR *dir;
 758
 759         tmpc = find_mounted_controller(controller, &cfd);
 760         *list = NULL;
 761         if (!tmpc)
 762                 return false;
 763
 764         /* Make sure we pass a relative path to *at() family of functions. */
 765         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 766         cg = alloca(len);
 767         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 768         if (ret < 0 || (size_t)ret >= len) {
 769                 fprintf(stderr, "%s: pathname too long under %s\n", __func__, cgroup);
 770                 return false;
 771         }
 772
 773         fd = openat(cfd, cg, O_DIRECTORY);
 774         if (fd < 0)
 775                 return false;
 776
 777         dir = fdopendir(fd);
 778         if (!dir)
 779                 return false;
 780
 781         while ((dirent = readdir(dir))) {
 782                 struct stat mystat;
 783
 784                 if (!strcmp(dirent->d_name, ".") ||
 785                     !strcmp(dirent->d_name, ".."))
 786                         continue;
 787
 788                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 789                 if (ret < 0 || ret >= MAXPATHLEN) {
 790                         fprintf(stderr, "%s: pathname too long under %s\n", __func__, cg);
 791                         continue;
 792                 }
 793
 794                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 795                 if (ret) {
 796                         fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
 797                         continue;
 798                 }
 799                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
 800                     (directories && !S_ISDIR(mystat.st_mode)))
 801                         continue;
 802
 803                 if (sz+2 >= asz) {
 804                         void **tmp;
 805                         asz += BATCH_SIZE;
 806                         do {
 807                                 tmp = realloc(*list, asz * typesize);
 808                         } while  (!tmp);
 809                         *list = tmp;
 810                 }
 811                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
 812                 (*list)[sz+1] = NULL;
 813                 sz++;
 814         }
 815         if (closedir(dir) < 0) {
 816                 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, cgroup, strerror(errno));
 817                 return false;
 818         }
 819         return true;
 820 }
 821
 822 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 823 {
 824         char *dup;
 825         do {
 826                 dup = strdup(dir_entry);
 827         } while (!dup);
 828         return dup;
 829 }
 830
 831 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
 832 {
 833         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
 834 }
 835
 836 void free_key(struct cgfs_files *k)
 837 {
 838         if (!k)
 839                 return;
 840         free(k->name);
 841         free(k);
 842 }
 843
 844 void free_keys(struct cgfs_files **keys)
 845 {
 846         int i;
 847
 848         if (!keys)
 849                 return;
 850         for (i = 0; keys[i]; i++) {
 851                 free_key(keys[i]);
 852         }
 853         free(keys);
 854 }
 855
 856 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
 857 {
 858         int ret, fd, cfd;
 859         size_t len;
 860         char *fnam, *tmpc;
 861
 862         tmpc = find_mounted_controller(controller, &cfd);
 863         if (!tmpc)
 864                 return false;
 865
 866         /* Make sure we pass a relative path to *at() family of functions.
 867          * . + /cgroup + / + file + \0
 868          */
 869         len = strlen(cgroup) + strlen(file) + 3;
 870         fnam = alloca(len);
 871         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 872         if (ret < 0 || (size_t)ret >= len)
 873                 return NULL;
 874
 875         fd = openat(cfd, fnam, O_RDONLY);
 876         if (fd < 0)
 877                 return NULL;
 878
 879         *value = slurp_file(fnam, fd);
 880         return *value != NULL;
 881 }
 882
 883 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
 884 {
 885         int ret, cfd;
 886         size_t len;
 887         char *fnam, *tmpc;
 888         struct stat sb;
 889         struct cgfs_files *newkey;
 890
 891         tmpc = find_mounted_controller(controller, &cfd);
 892         if (!tmpc)
 893                 return false;
 894
 895         if (file && *file == '/')
 896                 file++;
 897
 898         if (file && strchr(file, '/'))
 899                 return NULL;
 900
 901         /* Make sure we pass a relative path to *at() family of functions.
 902          * . + /cgroup + / + file + \0
 903          */
 904         len = strlen(cgroup) + 3;
 905         if (file)
 906                 len += strlen(file) + 1;
 907         fnam = alloca(len);
 908         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
 909                  file ? "/" : "", file ? file : "");
 910
 911         ret = fstatat(cfd, fnam, &sb, 0);
 912         if (ret < 0)
 913                 return NULL;
 914
 915         do {
 916                 newkey = malloc(sizeof(struct cgfs_files));
 917         } while (!newkey);
 918         if (file)
 919                 newkey->name = must_copy_string(file);
 920         else if (strrchr(cgroup, '/'))
 921                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
 922         else
 923                 newkey->name = must_copy_string(cgroup);
 924         newkey->uid = sb.st_uid;
 925         newkey->gid = sb.st_gid;
 926         newkey->mode = sb.st_mode;
 927
 928         return newkey;
 929 }
 930
 931 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 932 {
 933         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
 934         if (!entry) {
 935                 fprintf(stderr, "%s: Error getting files under %s:%s\n",
 936                         __func__, controller, cgroup);
 937         }
 938         return entry;
 939 }
 940
 941 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
 942 {
 943         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
 944 }
 945
 946 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 947 {
 948         int cfd;
 949         size_t len;
 950         char *fnam, *tmpc;
 951         int ret;
 952         struct stat sb;
 953
 954         tmpc = find_mounted_controller(controller, &cfd);
 955         if (!tmpc)
 956                 return false;
 957
 958         /* Make sure we pass a relative path to *at() family of functions.
 959          * . + /cgroup + / + f + \0
 960          */
 961         len = strlen(cgroup) + strlen(f) + 3;
 962         fnam = alloca(len);
 963         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
 964         if (ret < 0 || (size_t)ret >= len)
 965                 return false;
 966
 967         ret = fstatat(cfd, fnam, &sb, 0);
 968         if (ret < 0 || !S_ISDIR(sb.st_mode))
 969                 return false;
 970
 971         return true;
 972 }
 973
/*
 * Status codes compared against the return value of send_creds();
 * send_creds_clone_wrapper() treats anything but SEND_CREDS_OK as failure.
 * NOTE(review): NOTSK/FAIL presumably distinguish "peer is not a socket"
 * from other errors - confirm against send_creds().
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations: these helpers are called below before their
 * definitions appear in the file. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
 981
 982 /*
 983  * clone a task which switches to @task's namespace and writes '1'.
 984  * over a unix sock so we can read the task's reaper's pid in our
 985  * namespace
 986  *
 987  * Note: glibc's fork() does not respect pidns, which can lead to failed
 988  * assertions inside glibc (and thus failed forks) if the child's pid in
 989  * the pidns and the parent pid outside are identical. Using clone prevents
 990  * this issue.
 991  */
 992 static void write_task_init_pid_exit(int sock, pid_t target)
 993 {
 994         char fnam[100];
 995         pid_t pid;
 996         int fd, ret;
 997         size_t stack_size = sysconf(_SC_PAGESIZE);
 998         void *stack = alloca(stack_size);
 999
1000         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1001         if (ret < 0 || ret >= sizeof(fnam))
1002                 _exit(1);
1003
1004         fd = open(fnam, O_RDONLY);
1005         if (fd < 0) {
1006                 perror("write_task_init_pid_exit open of ns/pid");
1007                 _exit(1);
1008         }
1009         if (setns(fd, 0)) {
1010                 perror("write_task_init_pid_exit setns 1");
1011                 close(fd);
1012                 _exit(1);
1013         }
1014         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1015         if (pid < 0)
1016                 _exit(1);
1017         if (pid != 0) {
1018                 if (!wait_for_pid(pid))
1019                         _exit(1);
1020                 _exit(0);
1021         }
1022 }
1023
1024 static int send_creds_clone_wrapper(void *arg) {
1025         struct ucred cred;
1026         char v;
1027         int sock = *(int *)arg;
1028
1029         /* we are the child */
1030         cred.uid = 0;
1031         cred.gid = 0;
1032         cred.pid = 1;
1033         v = '1';
1034         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1035                 return 1;
1036         return 0;
1037 }
1038
1039 static pid_t get_init_pid_for_task(pid_t task)
1040 {
1041         int sock[2];
1042         pid_t pid;
1043         pid_t ret = -1;
1044         char v = '0';
1045         struct ucred cred;
1046
1047         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1048                 perror("socketpair");
1049                 return -1;
1050         }
1051
1052         pid = fork();
1053         if (pid < 0)
1054                 goto out;
1055         if (!pid) {
1056                 close(sock[1]);
1057                 write_task_init_pid_exit(sock[0], task);
1058                 _exit(0);
1059         }
1060
1061         if (!recv_creds(sock[1], &cred, &v))
1062                 goto out;
1063         ret = cred.pid;
1064
1065 out:
1066         close(sock[0]);
1067         close(sock[1]);
1068         if (pid > 0)
1069                 wait_for_pid(pid);
1070         return ret;
1071 }
1072
1073 static pid_t lookup_initpid_in_store(pid_t qpid)
1074 {
1075         pid_t answer = 0;
1076         struct stat sb;
1077         struct pidns_init_store *e;
1078         char fnam[100];
1079
1080         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1081         store_lock();
1082         if (stat(fnam, &sb) < 0)
1083                 goto out;
1084         e = lookup_verify_initpid(&sb);
1085         if (e) {
1086                 answer = e->initpid;
1087                 goto out;
1088         }
1089         answer = get_init_pid_for_task(qpid);
1090         if (answer > 0)
1091                 save_initpid(&sb, answer);
1092
1093 out:
1094         /* we prune at end in case we are returning
1095          * the value we were about to return */
1096         prune_initpid_store();
1097         store_unlock();
1098         return answer;
1099 }
1100
/*
 * Reap child @pid, restarting waitpid() when it is interrupted by a signal.
 *
 * Returns 0 if the child exited normally with status 0.  Returns -1 for a
 * non-positive @pid, a waitpid() error other than EINTR, or any other way
 * of terminating.
 */
static int wait_for_pid(pid_t pid)
{
        int status;
        pid_t reaped;

        if (pid <= 0)
                return -1;

        for (;;) {
                reaped = waitpid(pid, &status, 0);
                if (reaped == pid)
                        break;
                if (reaped == -1 && errno != EINTR)
                        return -1;
        }

        if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
                return 0;
        return -1;
}
1121
1122
/*
 * Append "<pid>\n" to the growable string *src.
 * src: a pointer to a char* to which the pid is appended; may point to NULL.
 * sz: the number of characters stored so far, not counting the trailing \0.
 * asz: the size allocated so far.
 * pid: the pid to append.
 *
 * The buffer grows by BUF_RESERVE_SIZE when needed; the allocation is
 * retried until it succeeds.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
        char digits[30];
        int ndigits = sprintf(digits, "%d\n", (int)pid);

        if (!*src || ndigits + *sz + 1 >= *asz) {
                char *grown;

                do {
                        grown = realloc(*src, *asz + BUF_RESERVE_SIZE);
                } while (!grown);
                *src = grown;
                *asz += BUF_RESERVE_SIZE;
        }
        memcpy(*src + *sz, digits, ndigits + 1); /* include the \0 */
        *sz += ndigits;
}
1147
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not hold
 * three unsigned numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned) when no range covers @in_id or
 * an entry would wrap around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
        char line[400];

        fseek(idfile, 0L, SEEK_SET);
        while (fgets(line, 400, idfile)) {
                unsigned int ns_base,   // first id of the range in the idfile's namespace
                             host_base, // first id of the range in the caller's namespace
                             range;     // number of ids in the range

                if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
                        continue;

                if (host_base + range < host_base || ns_base + range < ns_base) {
                        /*
                         * A range that wraps around is not expected from a
                         * procfile, so give up right away.
                         */
                        fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
                                ns_base, host_base, range, line);
                        return -1;
                }

                if (in_id < host_base || in_id >= host_base + range)
                        continue;

                /*
                 * host_base <= in_id < host_base+range and neither range
                 * end wraps, so ns_base+(in_id-host_base) cannot wrap
                 * either.
                 */
                return (in_id - host_base) + ns_base;
        }

        // no range matched
        return -1;
}
1191
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 *
 * With NS_ROOT_OPT, is_privileged_over() already succeeds when the calling
 * uid equals the victim uid; with NS_ROOT_REQD that shortcut is skipped.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the stack buffers used to format "/proc/<pid>/..." paths. */
#define PROCLEN 100
1201
1202 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1203 {
1204         char fpath[PROCLEN];
1205         int ret;
1206         bool answer = false;
1207         uid_t nsuid;
1208
1209         if (victim == -1 || uid == -1)
1210                 return false;
1211
1212         /*
1213          * If the request is one not requiring root in the namespace,
1214          * then having the same uid suffices.  (i.e. uid 1000 has write
1215          * access to files owned by uid 1000
1216          */
1217         if (!req_ns_root && uid == victim)
1218                 return true;
1219
1220         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1221         if (ret < 0 || ret >= PROCLEN)
1222                 return false;
1223         FILE *f = fopen(fpath, "r");
1224         if (!f)
1225                 return false;
1226
1227         /* if caller's not root in his namespace, reject */
1228         nsuid = convert_id_to_ns(f, uid);
1229         if (nsuid)
1230                 goto out;
1231
1232         /*
1233          * If victim is not mapped into caller's ns, reject.
1234          * XXX I'm not sure this check is needed given that fuse
1235          * will be sending requests where the vfs has converted
1236          */
1237         nsuid = convert_id_to_ns(f, victim);
1238         if (nsuid == -1)
1239                 goto out;
1240
1241         answer = true;
1242
1243 out:
1244         fclose(f);
1245         return answer;
1246 }
1247
/*
 * Check whether the "other" permission bits of @fmode grant the access mode
 * requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR after masking with
 * O_ACCMODE).  Callers shift the owner or group bits down into the "other"
 * position before calling.
 *
 * Returns false for any other access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
        const mode_t acc = req_mode & O_ACCMODE;
        mode_t needed;

        if (acc == O_RDONLY)
                needed = S_IROTH;
        else if (acc == O_WRONLY)
                needed = S_IWOTH;
        else if (acc == O_RDWR)
                needed = S_IROTH | S_IWOTH;
        else
                return false;

        return (fmode & needed) == needed;
}
1267
1268
/*
 * Return the path component of @taskcg that lies directly below @querycg.
 *
 * taskcg is  a/b/c/d/e
 * querycg is a/b/c
 * we return 'd'
 *
 * When @querycg is "/" or "./", only the first character of @taskcg is
 * skipped.  @taskcg must be strictly longer than @querycg, otherwise a
 * complaint is printed and NULL is returned.  No check is made that
 * @querycg really is a prefix of @taskcg.
 *
 * The result is allocated with strdup() and must be freed by the caller;
 * NULL is also returned when that allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
        size_t skip;
        char *next;

        if (strlen(taskcg) <= strlen(querycg)) {
                fprintf(stderr, "%s: I was fed bad input\n", __func__);
                return NULL;
        }

        if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
                skip = 1;
        else
                skip = strlen(querycg) + 1;

        next = strdup(taskcg + skip);
        if (!next)
                return NULL;

        /* cut at the first '/', or rewrite the terminator if there is none */
        next[strcspn(next, "/")] = '\0';
        return next;
}
1294
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
        char *last;

        if (*x == '\0')
                return;

        last = x + strlen(x) - 1;
        if (*last == '\n')
                *last = '\0';
}
1301
1302 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1303 {
1304         int cfd;
1305         char fnam[PROCLEN];
1306         FILE *f;
1307         char *answer = NULL;
1308         char *line = NULL;
1309         size_t len = 0;
1310         int ret;
1311         const char *h = find_mounted_controller(contrl, &cfd);
1312         if (!h)
1313                 return NULL;
1314
1315         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1316         if (ret < 0 || ret >= PROCLEN)
1317                 return NULL;
1318         if (!(f = fopen(fnam, "r")))
1319                 return NULL;
1320
1321         while (getline(&line, &len, f) != -1) {
1322                 char *c1, *c2;
1323                 if (!line[0])
1324                         continue;
1325                 c1 = strchr(line, ':');
1326                 if (!c1)
1327                         goto out;
1328                 c1++;
1329                 c2 = strchr(c1, ':');
1330                 if (!c2)
1331                         goto out;
1332                 *c2 = '\0';
1333                 if (strcmp(c1, h) != 0)
1334                         continue;
1335                 c2++;
1336                 stripnewline(c2);
1337                 do {
1338                         answer = strdup(c2);
1339                 } while (!answer);
1340                 break;
1341         }
1342
1343 out:
1344         fclose(f);
1345         free(line);
1346         return answer;
1347 }
1348
1349 /*
1350  * check whether a fuse context may access a cgroup dir or file
1351  *
1352  * If file is not null, it is a cgroup file to check under cg.
1353  * If file is null, then we are checking perms on cg itself.
1354  *
1355  * For files we can check the mode of the list_keys result.
1356  * For cgroups, we must make assumptions based on the files under the
1357  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1358  * yet.
1359  */
1360 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1361 {
1362         struct cgfs_files *k = NULL;
1363         bool ret = false;
1364
1365         k = cgfs_get_key(contrl, cg, file);
1366         if (!k)
1367                 return false;
1368
1369         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1370                 if (perms_include(k->mode >> 6, mode)) {
1371                         ret = true;
1372                         goto out;
1373                 }
1374         }
1375         if (fc->gid == k->gid) {
1376                 if (perms_include(k->mode >> 3, mode)) {
1377                         ret = true;
1378                         goto out;
1379                 }
1380         }
1381         ret = perms_include(k->mode, mode);
1382
1383 out:
1384         free_key(k);
1385         return ret;
1386 }
1387
#define INITSCOPE "/init.scope"
/*
 * Cut a trailing "/init.scope" off @cg in place.  If @cg consists of nothing
 * but "/init.scope", it is reduced to "/".  Any other string is left alone.
 */
static void prune_init_slice(char *cg)
{
        const size_t total = strlen(cg);
        const size_t suffix = strlen(INITSCOPE);
        char *tail;

        if (total < suffix)
                return;

        tail = cg + (total - suffix);
        if (strcmp(tail, INITSCOPE) != 0)
                return;

        /* keep the leading '/' when nothing else would remain */
        if (tail == cg)
                tail++;
        *tail = '\0';
}
1405
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller (it is NULL if get_next_cgroup_dir() rejects the
 * pair).  *nextcg is left untouched when the answer is true or when the
 * cgroup of @pid cannot be determined.
 *
 * The cgroup of @pid counts as an ancestor of @cg only if it is a prefix of
 * @cg that ends on a path-component boundary, so a task in a/b is not
 * considered to be above its sibling a/bc.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
        bool answer = false;
        char *c2 = get_pid_cgroup(pid, contrl);
        char *linecmp;
        size_t n;

        if (!c2)
                return false;
        prune_init_slice(c2);

        /*
         * callers pass in '/' or './' (openat()) for root cgroup, otherwise
         * they pass in a cgroup without leading '/'
         *
         * The first character of the task's cgroup is therefore skipped in
         * the latter case so that both strings are compared in the same
         * form (presumably the path read from /proc always starts with
         * '/').
         */
        if (*cg == '/' || !strncmp(cg, "./", 2))
                linecmp = c2;
        else
                linecmp = c2 + 1;

        /*
         * After n matching characters cg[n] is within bounds.  The match is
         * on a component boundary if the task cgroup is empty (root), ends
         * in '/' itself, or cg ends or continues with '/' right there.
         */
        n = strlen(linecmp);
        if (strncmp(linecmp, cg, n) == 0 &&
            (n == 0 || linecmp[n - 1] == '/' || cg[n] == '\0' || cg[n] == '/')) {
                answer = true;
                goto out;
        }

        if (nextcg)
                *nextcg = get_next_cgroup_dir(linecmp, cg);

out:
        free(c2);
        return answer;
}
1448
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is visible to everybody.  Otherwise @cg is visible
 * when the task sits in the root cgroup, when @cg is the task's cgroup, or
 * when one of the two is a parent of the other along '/' boundaries.
 * Returns false when the cgroup of @pid cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
        bool visible;
        char *raw, *task_cg;
        size_t target_len, task_len;

        if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
                return true;

        raw = get_pid_cgroup(pid, contrl);
        if (!raw)
                return false;
        prune_init_slice(raw);

        /* drop the first character so the task path has the same form as cg */
        task_cg = raw + 1;
        target_len = strlen(cg);
        task_len = strlen(task_cg);

        if (task_len == 0) {
                /* Task is in the root cg and sees everything.  The prefix
                 * tests below cannot cover this: they look for a '/'
                 * separator, and the only one was just chopped off.
                 */
                visible = true;
        } else if (strcmp(cg, task_cg) == 0) {
                visible = true;
        } else if (target_len < task_len) {
                /* looking up a parent dir */
                visible = strncmp(task_cg, cg, target_len) == 0 &&
                          task_cg[target_len] == '/';
        } else if (target_len > task_len) {
                /* looking up a child dir */
                visible = strncmp(task_cg, cg, task_len) == 0 &&
                          cg[task_len] == '/';
        } else {
                /* same length but different names */
                visible = false;
        }

        free(raw);
        return visible;
}
1499
1500 /*
1501  * given /cgroup/freezer/a/b, return "freezer".
1502  * the returned char* should NOT be freed.
1503  */
1504 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1505 {
1506         const char *p1;
1507         char *contr, *slash;
1508
1509         if (strlen(path) < 9)
1510                 return NULL;
1511         if (*(path+7) != '/')
1512                 return NULL;
1513         p1 = path+8;
1514         contr = strdupa(p1);
1515         if (!contr)
1516                 return NULL;
1517         slash = strstr(contr, "/");
1518         if (slash)
1519                 *slash = '\0';
1520
1521         int i;
1522         for (i = 0;  i < num_hierarchies;  i++) {
1523                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1524                         return hierarchies[i];
1525         }
1526         return NULL;
1527 }
1528
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * The result points into @path, just past the first '/' found at or after
 * offset 8.  Returns NULL if @path is shorter than 9 characters or has no
 * such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
        const char *sep;

        if (strlen(path) < 9)
                return NULL;

        sep = strchr(path + 8, '/');
        return sep ? sep + 1 : NULL;
}
1544
/*
 * split the last path element from the path in @cg.
 * @dir is newly allocated and should be freed, @last not
 *
 * *last points into @cg at its final '/' (the '/' is part of it), and *dir
 * is a copy of @cg cut off at that position.  If @cg has no '/', *last is
 * NULL and *dir is a full copy of @cg.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
        char *copy;
        char *sep;

        /* the copy is retried until it succeeds */
        do {
                copy = strdup(cg);
        } while (!copy);
        *dir = copy;

        sep = strrchr(cg, '/');
        *last = sep;
        if (!sep)
                return;

        /* the final '/' sits at the same offset in the copy */
        copy[sep - cg] = '\0';
}
1564
1565 /*
1566  * FUSE ops for /cgroup
1567  */
1568
1569 int cg_getattr(const char *path, struct stat *sb)
1570 {
1571         struct timespec now;
1572         struct fuse_context *fc = fuse_get_context();
1573         char * cgdir = NULL;
1574         char *last = NULL, *path1, *path2;
1575         struct cgfs_files *k = NULL;
1576         const char *cgroup;
1577         const char *controller = NULL;
1578         int ret = -ENOENT;
1579
1580
1581         if (!fc)
1582                 return -EIO;
1583
1584         memset(sb, 0, sizeof(struct stat));
1585
1586         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1587                 return -EINVAL;
1588
1589         sb->st_uid = sb->st_gid = 0;
1590         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1591         sb->st_size = 0;
1592
1593         if (strcmp(path, "/cgroup") == 0) {
1594                 sb->st_mode = S_IFDIR | 00755;
1595                 sb->st_nlink = 2;
1596                 return 0;
1597         }
1598
1599         controller = pick_controller_from_path(fc, path);
1600         if (!controller)
1601                 return -EIO;
1602         cgroup = find_cgroup_in_path(path);
1603         if (!cgroup) {
1604                 /* this is just /cgroup/controller, return it as a dir */
1605                 sb->st_mode = S_IFDIR | 00755;
1606                 sb->st_nlink = 2;
1607                 return 0;
1608         }
1609
1610         get_cgdir_and_path(cgroup, &cgdir, &last);
1611
1612         if (!last) {
1613                 path1 = "/";
1614                 path2 = cgdir;
1615         } else {
1616                 path1 = cgdir;
1617                 path2 = last;
1618         }
1619
1620         pid_t initpid = lookup_initpid_in_store(fc->pid);
1621         if (initpid <= 0)
1622                 initpid = fc->pid;
1623         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1624          * Then check that caller's cgroup is under path if last is a child
1625          * cgroup, or cgdir if last is a file */
1626
1627         if (is_child_cgroup(controller, path1, path2)) {
1628                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1629                         ret = -ENOENT;
1630                         goto out;
1631                 }
1632                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1633                         /* this is just /cgroup/controller, return it as a dir */
1634                         sb->st_mode = S_IFDIR | 00555;
1635                         sb->st_nlink = 2;
1636                         ret = 0;
1637                         goto out;
1638                 }
1639                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1640                         ret = -EACCES;
1641                         goto out;
1642                 }
1643
1644                 // get uid, gid, from '/tasks' file and make up a mode
1645                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1646                 sb->st_mode = S_IFDIR | 00755;
1647                 k = cgfs_get_key(controller, cgroup, NULL);
1648                 if (!k) {
1649                         sb->st_uid = sb->st_gid = 0;
1650                 } else {
1651                         sb->st_uid = k->uid;
1652                         sb->st_gid = k->gid;
1653                 }
1654                 free_key(k);
1655                 sb->st_nlink = 2;
1656                 ret = 0;
1657                 goto out;
1658         }
1659
1660         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1661                 sb->st_mode = S_IFREG | k->mode;
1662                 sb->st_nlink = 1;
1663                 sb->st_uid = k->uid;
1664                 sb->st_gid = k->gid;
1665                 sb->st_size = 0;
1666                 free_key(k);
1667                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1668                         ret = -ENOENT;
1669                         goto out;
1670                 }
1671                 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1672                         ret = -EACCES;
1673                         goto out;
1674                 }
1675
1676                 ret = 0;
1677         }
1678
1679 out:
1680         free(cgdir);
1681         return ret;
1682 }
1683
1684 int cg_opendir(const char *path, struct fuse_file_info *fi)
1685 {
1686         struct fuse_context *fc = fuse_get_context();
1687         const char *cgroup;
1688         struct file_info *dir_info;
1689         char *controller = NULL;
1690
1691         if (!fc)
1692                 return -EIO;
1693
1694         if (strcmp(path, "/cgroup") == 0) {
1695                 cgroup = NULL;
1696                 controller = NULL;
1697         } else {
1698                 // return list of keys for the controller, and list of child cgroups
1699                 controller = pick_controller_from_path(fc, path);
1700                 if (!controller)
1701                         return -EIO;
1702
1703                 cgroup = find_cgroup_in_path(path);
1704                 if (!cgroup) {
1705                         /* this is just /cgroup/controller, return its contents */
1706                         cgroup = "/";
1707                 }
1708         }
1709
1710         pid_t initpid = lookup_initpid_in_store(fc->pid);
1711         if (initpid <= 0)
1712                 initpid = fc->pid;
1713         if (cgroup) {
1714                 if (!caller_may_see_dir(initpid, controller, cgroup))
1715                         return -ENOENT;
1716                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1717                         return -EACCES;
1718         }
1719
1720         /* we'll free this at cg_releasedir */
1721         dir_info = malloc(sizeof(*dir_info));
1722         if (!dir_info)
1723                 return -ENOMEM;
1724         dir_info->controller = must_copy_string(controller);
1725         dir_info->cgroup = must_copy_string(cgroup);
1726         dir_info->type = LXC_TYPE_CGDIR;
1727         dir_info->buf = NULL;
1728         dir_info->file = NULL;
1729         dir_info->buflen = 0;
1730
1731         fi->fh = (unsigned long)dir_info;
1732         return 0;
1733 }
1734
1735 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1736                 struct fuse_file_info *fi)
1737 {
1738         struct file_info *d = (struct file_info *)fi->fh;
1739         struct cgfs_files **list = NULL;
1740         int i, ret;
1741         char *nextcg = NULL;
1742         struct fuse_context *fc = fuse_get_context();
1743         char **clist = NULL;
1744
1745         if (d->type != LXC_TYPE_CGDIR) {
1746                 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1747                 return -EIO;
1748         }
1749         if (!d->cgroup && !d->controller) {
1750                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1751                 int i;
1752
1753                 for (i = 0;  i < num_hierarchies; i++) {
1754                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1755                                 return -EIO;
1756                         }
1757                 }
1758                 return 0;
1759         }
1760
1761         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1762                 // not a valid cgroup
1763                 ret = -EINVAL;
1764                 goto out;
1765         }
1766
1767         pid_t initpid = lookup_initpid_in_store(fc->pid);
1768         if (initpid <= 0)
1769                 initpid = fc->pid;
1770         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1771                 if (nextcg) {
1772                         ret = filler(buf, nextcg,  NULL, 0);
1773                         free(nextcg);
1774                         if (ret != 0) {
1775                                 ret = -EIO;
1776                                 goto out;
1777                         }
1778                 }
1779                 ret = 0;
1780                 goto out;
1781         }
1782
1783         for (i = 0; list[i]; i++) {
1784                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1785                         ret = -EIO;
1786                         goto out;
1787                 }
1788         }
1789
1790         // now get the list of child cgroups
1791
1792         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1793                 ret = 0;
1794                 goto out;
1795         }
1796         if (clist) {
1797                 for (i = 0; clist[i]; i++) {
1798                         if (filler(buf, clist[i], NULL, 0) != 0) {
1799                                 ret = -EIO;
1800                                 goto out;
1801                         }
1802                 }
1803         }
1804         ret = 0;
1805
1806 out:
1807         free_keys(list);
1808         if (clist) {
1809                 for (i = 0; clist[i]; i++)
1810                         free(clist[i]);
1811                 free(clist);
1812         }
1813         return ret;
1814 }
1815
1816 static void do_release_file_info(struct fuse_file_info *fi)
1817 {
1818         struct file_info *f = (struct file_info *)fi->fh;
1819
1820         if (!f)
1821                 return;
1822
1823         fi->fh = 0;
1824
1825         free(f->controller);
1826         f->controller = NULL;
1827         free(f->cgroup);
1828         f->cgroup = NULL;
1829         free(f->file);
1830         f->file = NULL;
1831         free(f->buf);
1832         f->buf = NULL;
1833         free(f);
1834 }
1835
/*
 * releasedir handler: drop the struct file_info attached to @fi.
 * @path is not used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
1841
1842 int cg_open(const char *path, struct fuse_file_info *fi)
1843 {
1844         const char *cgroup;
1845         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1846         struct cgfs_files *k = NULL;
1847         struct file_info *file_info;
1848         struct fuse_context *fc = fuse_get_context();
1849         int ret;
1850
1851         if (!fc)
1852                 return -EIO;
1853
1854         controller = pick_controller_from_path(fc, path);
1855         if (!controller)
1856                 return -EIO;
1857         cgroup = find_cgroup_in_path(path);
1858         if (!cgroup)
1859                 return -EINVAL;
1860
1861         get_cgdir_and_path(cgroup, &cgdir, &last);
1862         if (!last) {
1863                 path1 = "/";
1864                 path2 = cgdir;
1865         } else {
1866                 path1 = cgdir;
1867                 path2 = last;
1868         }
1869
1870         k = cgfs_get_key(controller, path1, path2);
1871         if (!k) {
1872                 ret = -EINVAL;
1873                 goto out;
1874         }
1875         free_key(k);
1876
1877         pid_t initpid = lookup_initpid_in_store(fc->pid);
1878         if (initpid <= 0)
1879                 initpid = fc->pid;
1880         if (!caller_may_see_dir(initpid, controller, path1)) {
1881                 ret = -ENOENT;
1882                 goto out;
1883         }
1884         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1885                 ret = -EACCES;
1886                 goto out;
1887         }
1888
1889         /* we'll free this at cg_release */
1890         file_info = malloc(sizeof(*file_info));
1891         if (!file_info) {
1892                 ret = -ENOMEM;
1893                 goto out;
1894         }
1895         file_info->controller = must_copy_string(controller);
1896         file_info->cgroup = must_copy_string(path1);
1897         file_info->file = must_copy_string(path2);
1898         file_info->type = LXC_TYPE_CGFILE;
1899         file_info->buf = NULL;
1900         file_info->buflen = 0;
1901
1902         fi->fh = (unsigned long)file_info;
1903         ret = 0;
1904
1905 out:
1906         free(cgdir);
1907         return ret;
1908 }
1909
1910 int cg_access(const char *path, int mode)
1911 {
1912         const char *cgroup;
1913         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1914         struct cgfs_files *k = NULL;
1915         struct fuse_context *fc = fuse_get_context();
1916         int ret;
1917
1918         if (!fc)
1919                 return -EIO;
1920
1921         controller = pick_controller_from_path(fc, path);
1922         if (!controller)
1923                 return -EIO;
1924         cgroup = find_cgroup_in_path(path);
1925         if (!cgroup) {
1926                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
1927                 if ((mode & W_OK) == 0)
1928                         return 0;
1929                 return -EACCES;
1930         }
1931
1932         get_cgdir_and_path(cgroup, &cgdir, &last);
1933         if (!last) {
1934                 path1 = "/";
1935                 path2 = cgdir;
1936         } else {
1937                 path1 = cgdir;
1938                 path2 = last;
1939         }
1940
1941         k = cgfs_get_key(controller, path1, path2);
1942         if (!k) {
1943                 if ((mode & W_OK) == 0)
1944                         ret = 0;
1945                 else
1946                         ret = -EACCES;
1947                 goto out;
1948         }
1949         free_key(k);
1950
1951         pid_t initpid = lookup_initpid_in_store(fc->pid);
1952         if (initpid <= 0)
1953                 initpid = fc->pid;
1954         if (!caller_may_see_dir(initpid, controller, path1)) {
1955                 ret = -ENOENT;
1956                 goto out;
1957         }
1958         if (!fc_may_access(fc, controller, path1, path2, mode)) {
1959                 ret = -EACCES;
1960                 goto out;
1961         }
1962
1963         ret = 0;
1964
1965 out:
1966         free(cgdir);
1967         return ret;
1968 }
1969
/*
 * cg_release - release callback for cgroup files.
 *
 * Hands fi to do_release_file_info(), which disposes of the state kept in
 * fi->fh (cg_open stores its file_info there).  @path is not consulted.
 * Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
1975
/* epoll events that count as "there is something to read (or the peer is gone)" */
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock - wait until @sock becomes readable or is hung up.
 *
 * @sock:    file descriptor to watch
 * @timeout: maximum time to wait, in seconds (measured with time(NULL))
 *
 * Returns true when an event from POLLIN_SET was reported for @sock.
 * Returns false on error or timeout.  On timeout errno is set to 0, so a
 * caller printing strerror(errno) does not show an unrelated stale error;
 * when epoll_wait() fails, errno holds its error code.
 */
static bool wait_for_sock(int sock, int timeout)
{
        struct epoll_event ev;
        int epfd, ret, now, starttime, deltatime, saved_errno;

        if ((starttime = time(NULL)) < 0)
                return false;

        if ((epfd = epoll_create(1)) < 0) {
                fprintf(stderr, "Failed to create epoll socket: %m\n");
                return false;
        }

        ev.events = POLLIN_SET;
        ev.data.fd = sock;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
                fprintf(stderr, "Failed adding socket to epoll: %m\n");
                close(epfd);
                return false;
        }

again:
        if ((now = time(NULL)) < 0) {
                close(epfd);
                return false;
        }

        /* recomputed on every pass so an EINTR restart cannot extend the wait */
        deltatime = (starttime + timeout) - now;
        if (deltatime < 0) { // timeout
                errno = 0;
                close(epfd);
                return false;
        }
        ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
        if (ret < 0 && errno == EINTR)
                goto again;
        /*
         * A return value of 0 is a timeout as well; epoll_wait() does not
         * touch errno in that case, so report 0 just like the path above.
         */
        saved_errno = (ret < 0) ? errno : 0;
        close(epfd);

        if (ret <= 0) {
                errno = saved_errno;
                return false;
        }
        return true;
}
2023
/*
 * msgrecv - receive up to @len bytes from @sockfd into @buf, giving the
 * peer at most two seconds to send something.
 *
 * Returns -1 when wait_for_sock() reports no data, otherwise whatever the
 * non-blocking recv() returns.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
        const int timeout_secs = 2;

        if (wait_for_sock(sockfd, timeout_secs))
                return recv(sockfd, buf, len, MSG_DONTWAIT);

        return -1;
}
2030
2031 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2032 {
2033         struct msghdr msg = { 0 };
2034         struct iovec iov;
2035         struct cmsghdr *cmsg;
2036         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2037         char buf[1];
2038         buf[0] = 'p';
2039
2040         if (pingfirst) {
2041                 if (msgrecv(sock, buf, 1) != 1) {
2042                         fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
2043                                   __func__);
2044                         return SEND_CREDS_FAIL;
2045                 }
2046         }
2047
2048         msg.msg_control = cmsgbuf;
2049         msg.msg_controllen = sizeof(cmsgbuf);
2050
2051         cmsg = CMSG_FIRSTHDR(&msg);
2052         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2053         cmsg->cmsg_level = SOL_SOCKET;
2054         cmsg->cmsg_type = SCM_CREDENTIALS;
2055         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2056
2057         msg.msg_name = NULL;
2058         msg.msg_namelen = 0;
2059
2060         buf[0] = v;
2061         iov.iov_base = buf;
2062         iov.iov_len = sizeof(buf);
2063         msg.msg_iov = &iov;
2064         msg.msg_iovlen = 1;
2065
2066         if (sendmsg(sock, &msg, 0) < 0) {
2067                 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
2068                           strerror(errno));
2069                 if (errno == 3)
2070                         return SEND_CREDS_NOTSK;
2071                 return SEND_CREDS_FAIL;
2072         }
2073
2074         return SEND_CREDS_OK;
2075 }
2076
2077 static bool recv_creds(int sock, struct ucred *cred, char *v)
2078 {
2079         struct msghdr msg = { 0 };
2080         struct iovec iov;
2081         struct cmsghdr *cmsg;
2082         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2083         char buf[1];
2084         int ret;
2085         int optval = 1;
2086
2087         *v = '1';
2088
2089         cred->pid = -1;
2090         cred->uid = -1;
2091         cred->gid = -1;
2092
2093         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2094                 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
2095                 return false;
2096         }
2097         buf[0] = '1';
2098         if (write(sock, buf, 1) != 1) {
2099                 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
2100                 return false;
2101         }
2102
2103         msg.msg_name = NULL;
2104         msg.msg_namelen = 0;
2105         msg.msg_control = cmsgbuf;
2106         msg.msg_controllen = sizeof(cmsgbuf);
2107
2108         iov.iov_base = buf;
2109         iov.iov_len = sizeof(buf);
2110         msg.msg_iov = &iov;
2111         msg.msg_iovlen = 1;
2112
2113         if (!wait_for_sock(sock, 2)) {
2114                 fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
2115                           strerror(errno));
2116                 return false;
2117         }
2118         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2119         if (ret < 0) {
2120                 fprintf(stderr, "Failed to receive scm_cred: %s\n",
2121                           strerror(errno));
2122                 return false;
2123         }
2124
2125         cmsg = CMSG_FIRSTHDR(&msg);
2126
2127         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2128                         cmsg->cmsg_level == SOL_SOCKET &&
2129                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2130                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2131         }
2132         *v = buf[0];
2133
2134         return true;
2135 }
2136
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
        /* pipe pair; the clone child writes its ACK byte to cpipe[1] */
        int *cpipe;
        /* passed as first argument to @wrapped */
        int sock;
        /* passed as second argument to @wrapped */
        pid_t tpid;
        /* the real worker: pid_from_ns or pid_to_ns */
        int (*wrapped)(int sock, pid_t tpid);
};
2143
2144 /*
2145  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2146  * with clone(). This simply writes '1' as ACK back to the parent
2147  * before calling the actual wrapped function.
2148  */
2149 static int pid_ns_clone_wrapper(void *arg) {
2150         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2151         char b = '1';
2152
2153         close(args->cpipe[0]);
2154         if (write(args->cpipe[1], &b, sizeof(char)) < 0) {
2155                 fprintf(stderr, "%s (child): error on write: %s\n",
2156                         __func__, strerror(errno));
2157         }
2158         close(args->cpipe[1]);
2159         return args->wrapped(args->sock, args->tpid);
2160 }
2161
2162 /*
2163  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2164  * int value back over the socket.  This shifts the pid from the
2165  * sender's pidns into tpid's pidns.
2166  */
2167 static int pid_to_ns(int sock, pid_t tpid)
2168 {
2169         char v = '0';
2170         struct ucred cred;
2171
2172         while (recv_creds(sock, &cred, &v)) {
2173                 if (v == '1')
2174                         return 0;
2175                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2176                         return 1;
2177         }
2178         return 0;
2179 }
2180
2181
2182 /*
2183  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2184  * in your old pidns.  Only children which you clone will be in the target
2185  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2186  * actually convert pids.
2187  *
2188  * Note: glibc's fork() does not respect pidns, which can lead to failed
2189  * assertions inside glibc (and thus failed forks) if the child's pid in
2190  * the pidns and the parent pid outside are identical. Using clone prevents
2191  * this issue.
2192  */
2193 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2194 {
2195         int newnsfd = -1, ret, cpipe[2];
2196         char fnam[100];
2197         pid_t cpid;
2198         char v;
2199
2200         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2201         if (ret < 0 || ret >= sizeof(fnam))
2202                 _exit(1);
2203         newnsfd = open(fnam, O_RDONLY);
2204         if (newnsfd < 0)
2205                 _exit(1);
2206         if (setns(newnsfd, 0) < 0)
2207                 _exit(1);
2208         close(newnsfd);
2209
2210         if (pipe(cpipe) < 0)
2211                 _exit(1);
2212
2213         struct pid_ns_clone_args args = {
2214                 .cpipe = cpipe,
2215                 .sock = sock,
2216                 .tpid = tpid,
2217                 .wrapped = &pid_to_ns
2218         };
2219         size_t stack_size = sysconf(_SC_PAGESIZE);
2220         void *stack = alloca(stack_size);
2221
2222         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2223         if (cpid < 0)
2224                 _exit(1);
2225
2226         // give the child 1 second to be done forking and
2227         // write its ack
2228         if (!wait_for_sock(cpipe[0], 1))
2229                 _exit(1);
2230         ret = read(cpipe[0], &v, 1);
2231         if (ret != sizeof(char) || v != '1')
2232                 _exit(1);
2233
2234         if (!wait_for_pid(cpid))
2235                 _exit(1);
2236         _exit(0);
2237 }
2238
2239 /*
2240  * To read cgroup files with a particular pid, we will setns into the child
2241  * pidns, open a pipe, fork a child - which will be the first to really be in
2242  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2243  */
2244 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2245 {
2246         int sock[2] = {-1, -1};
2247         char *tmpdata = NULL;
2248         int ret;
2249         pid_t qpid, cpid = -1;
2250         bool answer = false;
2251         char v = '0';
2252         struct ucred cred;
2253         size_t sz = 0, asz = 0;
2254
2255         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2256                 return false;
2257
2258         /*
2259          * Now we read the pids from returned data one by one, pass
2260          * them into a child in the target namespace, read back the
2261          * translated pids, and put them into our to-return data
2262          */
2263
2264         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2265                 perror("socketpair");
2266                 free(tmpdata);
2267                 return false;
2268         }
2269
2270         cpid = fork();
2271         if (cpid == -1)
2272                 goto out;
2273
2274         if (!cpid) // child - exits when done
2275                 pid_to_ns_wrapper(sock[1], tpid);
2276
2277         char *ptr = tmpdata;
2278         cred.uid = 0;
2279         cred.gid = 0;
2280         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2281                 cred.pid = qpid;
2282                 ret = send_creds(sock[0], &cred, v, true);
2283
2284                 if (ret == SEND_CREDS_NOTSK)
2285                         goto next;
2286                 if (ret == SEND_CREDS_FAIL)
2287                         goto out;
2288
2289                 // read converted results
2290                 if (!wait_for_sock(sock[0], 2)) {
2291                         fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
2292                                 __func__, strerror(errno));
2293                         goto out;
2294                 }
2295                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2296                         fprintf(stderr, "%s: error reading pid from child: %s\n",
2297                                 __func__, strerror(errno));
2298                         goto out;
2299                 }
2300                 must_strcat_pid(d, &sz, &asz, qpid);
2301 next:
2302                 ptr = strchr(ptr, '\n');
2303                 if (!ptr)
2304                         break;
2305                 ptr++;
2306         }
2307
2308         cred.pid = getpid();
2309         v = '1';
2310         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2311                 // failed to ask child to exit
2312                 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
2313                         __func__, strerror(errno));
2314                 goto out;
2315         }
2316
2317         answer = true;
2318
2319 out:
2320         free(tmpdata);
2321         if (cpid != -1)
2322                 wait_for_pid(cpid);
2323         if (sock[0] != -1) {
2324                 close(sock[0]);
2325                 close(sock[1]);
2326         }
2327         return answer;
2328 }
2329
2330 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2331                 struct fuse_file_info *fi)
2332 {
2333         struct fuse_context *fc = fuse_get_context();
2334         struct file_info *f = (struct file_info *)fi->fh;
2335         struct cgfs_files *k = NULL;
2336         char *data = NULL;
2337         int ret, s;
2338         bool r;
2339
2340         if (f->type != LXC_TYPE_CGFILE) {
2341                 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
2342                 return -EIO;
2343         }
2344
2345         if (offset)
2346                 return 0;
2347
2348         if (!fc)
2349                 return -EIO;
2350
2351         if (!f->controller)
2352                 return -EINVAL;
2353
2354         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2355                 return -EINVAL;
2356         }
2357         free_key(k);
2358
2359
2360         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2361                 ret = -EACCES;
2362                 goto out;
2363         }
2364
2365         if (strcmp(f->file, "tasks") == 0 ||
2366                         strcmp(f->file, "/tasks") == 0 ||
2367                         strcmp(f->file, "/cgroup.procs") == 0 ||
2368                         strcmp(f->file, "cgroup.procs") == 0)
2369                 // special case - we have to translate the pids
2370                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2371         else
2372                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2373
2374         if (!r) {
2375                 ret = -EINVAL;
2376                 goto out;
2377         }
2378
2379         if (!data) {
2380                 ret = 0;
2381                 goto out;
2382         }
2383         s = strlen(data);
2384         if (s > size)
2385                 s = size;
2386         memcpy(buf, data, s);
2387         if (s > 0 && s < size && data[s-1] != '\n')
2388                 buf[s++] = '\n';
2389
2390         ret = s;
2391
2392 out:
2393         free(data);
2394         return ret;
2395 }
2396
2397 static int pid_from_ns(int sock, pid_t tpid)
2398 {
2399         pid_t vpid;
2400         struct ucred cred;
2401         char v;
2402         int ret;
2403
2404         cred.uid = 0;
2405         cred.gid = 0;
2406         while (1) {
2407                 if (!wait_for_sock(sock, 2)) {
2408                         fprintf(stderr, "%s: timeout reading from parent\n", __func__);
2409                         return 1;
2410                 }
2411                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2412                         fprintf(stderr, "%s: bad read from parent: %s\n",
2413                                 __func__, strerror(errno));
2414                         return 1;
2415                 }
2416                 if (vpid == -1) // done
2417                         break;
2418                 v = '0';
2419                 cred.pid = vpid;
2420                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2421                         v = '1';
2422                         cred.pid = getpid();
2423                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2424                                 return 1;
2425                 }
2426         }
2427         return 0;
2428 }
2429
2430 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2431 {
2432         int newnsfd = -1, ret, cpipe[2];
2433         char fnam[100];
2434         pid_t cpid;
2435         char v;
2436
2437         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2438         if (ret < 0 || ret >= sizeof(fnam))
2439                 _exit(1);
2440         newnsfd = open(fnam, O_RDONLY);
2441         if (newnsfd < 0)
2442                 _exit(1);
2443         if (setns(newnsfd, 0) < 0)
2444                 _exit(1);
2445         close(newnsfd);
2446
2447         if (pipe(cpipe) < 0)
2448                 _exit(1);
2449
2450         struct pid_ns_clone_args args = {
2451                 .cpipe = cpipe,
2452                 .sock = sock,
2453                 .tpid = tpid,
2454                 .wrapped = &pid_from_ns
2455         };
2456         size_t stack_size = sysconf(_SC_PAGESIZE);
2457         void *stack = alloca(stack_size);
2458
2459         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2460         if (cpid < 0)
2461                 _exit(1);
2462
2463         // give the child 1 second to be done forking and
2464         // write its ack
2465         if (!wait_for_sock(cpipe[0], 1))
2466                 _exit(1);
2467         ret = read(cpipe[0], &v, 1);
2468         if (ret != sizeof(char) || v != '1')
2469                 _exit(1);
2470
2471         if (!wait_for_pid(cpid))
2472                 _exit(1);
2473         _exit(0);
2474 }
2475
/*
 * hostuid_to_ns - translate host @uid through /proc/@pid/uid_map.
 *
 * The value produced by convert_id_to_ns() is stored in *@answer.
 *
 * Returns true when that value is not -1.  Returns false when it is -1 or
 * when the uid_map file cannot be opened; in the latter case *@answer is
 * left untouched.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
        char path[400];
        FILE *map;

        snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);

        map = fopen(path, "r");
        if (!map)
                return false;

        *answer = convert_id_to_ns(map, uid);
        fclose(map);

        return *answer != -1;
}
2497
/*
 * get_pid_creds: look up the uid and gid of @pid by parsing the first
 * number on the "Uid:" and "Gid:" lines of /proc/@pid/status (the real
 * ids).
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are preset to -1 and keep that value when the file
 * cannot be opened or the corresponding line is missing.  A malformed line
 * stops the scan immediately.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
        char line[400];
        FILE *status;
        uid_t parsed_uid;
        gid_t parsed_gid;

        *uid = -1;
        *gid = -1;

        /* the buffer first holds the path, later the lines read from it */
        sprintf(line, "/proc/%d/status", pid);
        status = fopen(line, "r");
        if (!status) {
                fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
                return;
        }

        while (fgets(line, sizeof(line), status)) {
                if (!strncmp(line, "Uid:", 4)) {
                        if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
                                fprintf(stderr, "bad uid line for pid %u\n", pid);
                                break;
                        }
                        *uid = parsed_uid;
                        continue;
                }

                if (!strncmp(line, "Gid:", 4)) {
                        if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
                                fprintf(stderr, "bad gid line for pid %u\n", pid);
                                break;
                        }
                        *gid = parsed_gid;
                }
        }

        fclose(status);
}
2536
/*
 * May the requestor @r (running as host uid @r_uid) move victim @v to a
 * new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host
 *   . they are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
        uid_t v_uid, mapped;
        gid_t v_gid;

        if (r == v || r_uid == 0)
                return true;

        get_pid_creds(v, &v_uid, &v_gid);
        if (v_uid == r_uid)
                return true;

        /* the requestor must be root according to its own uid map ... */
        if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
                return false;

        /* ... and the victim's uid must have a mapping in that map */
        return hostuid_to_ns(v_uid, r, &mapped);
}
2562
2563 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2564                 const char *file, const char *buf)
2565 {
2566         int sock[2] = {-1, -1};
2567         pid_t qpid, cpid = -1;
2568         FILE *pids_file = NULL;
2569         bool answer = false, fail = false;
2570
2571         pids_file = open_pids_file(contrl, cg);
2572         if (!pids_file)
2573                 return false;
2574
2575         /*
2576          * write the pids to a socket, have helper in writer's pidns
2577          * call movepid for us
2578          */
2579         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2580                 perror("socketpair");
2581                 goto out;
2582         }
2583
2584         cpid = fork();
2585         if (cpid == -1)
2586                 goto out;
2587
2588         if (!cpid) { // child
2589                 fclose(pids_file);
2590                 pid_from_ns_wrapper(sock[1], tpid);
2591         }
2592
2593         const char *ptr = buf;
2594         while (sscanf(ptr, "%d", &qpid) == 1) {
2595                 struct ucred cred;
2596                 char v;
2597
2598                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2599                         fprintf(stderr, "%s: error writing pid to child: %s\n",
2600                                 __func__, strerror(errno));
2601                         goto out;
2602                 }
2603
2604                 if (recv_creds(sock[0], &cred, &v)) {
2605                         if (v == '0') {
2606                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2607                                         fail = true;
2608                                         break;
2609                                 }
2610                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2611                                         fail = true;
2612                         }
2613                 }
2614
2615                 ptr = strchr(ptr, '\n');
2616                 if (!ptr)
2617                         break;
2618                 ptr++;
2619         }
2620
2621         /* All good, write the value */
2622         qpid = -1;
2623         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2624                 fprintf(stderr, "Warning: failed to ask child to exit\n");
2625
2626         if (!fail)
2627                 answer = true;
2628
2629 out:
2630         if (cpid != -1)
2631                 wait_for_pid(cpid);
2632         if (sock[0] != -1) {
2633                 close(sock[0]);
2634                 close(sock[1]);
2635         }
2636         if (pids_file) {
2637                 if (fclose(pids_file) != 0)
2638                         answer = false;
2639         }
2640         return answer;
2641 }
2642
2643 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2644              struct fuse_file_info *fi)
2645 {
2646         struct fuse_context *fc = fuse_get_context();
2647         char *localbuf = NULL;
2648         struct cgfs_files *k = NULL;
2649         struct file_info *f = (struct file_info *)fi->fh;
2650         bool r;
2651
2652         if (f->type != LXC_TYPE_CGFILE) {
2653                 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
2654                 return -EIO;
2655         }
2656
2657         if (offset)
2658                 return 0;
2659
2660         if (!fc)
2661                 return -EIO;
2662
2663         localbuf = alloca(size+1);
2664         localbuf[size] = '\0';
2665         memcpy(localbuf, buf, size);
2666
2667         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2668                 size = -EINVAL;
2669                 goto out;
2670         }
2671
2672         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2673                 size = -EACCES;
2674                 goto out;
2675         }
2676
2677         if (strcmp(f->file, "tasks") == 0 ||
2678                         strcmp(f->file, "/tasks") == 0 ||
2679                         strcmp(f->file, "/cgroup.procs") == 0 ||
2680                         strcmp(f->file, "cgroup.procs") == 0)
2681                 // special case - we have to translate the pids
2682                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2683         else
2684                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2685
2686         if (!r)
2687                 size = -EINVAL;
2688
2689 out:
2690         free_key(k);
2691         return size;
2692 }
2693
2694 int cg_chown(const char *path, uid_t uid, gid_t gid)
2695 {
2696         struct fuse_context *fc = fuse_get_context();
2697         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2698         struct cgfs_files *k = NULL;
2699         const char *cgroup;
2700         int ret;
2701
2702         if (!fc)
2703                 return -EIO;
2704
2705         if (strcmp(path, "/cgroup") == 0)
2706                 return -EINVAL;
2707
2708         controller = pick_controller_from_path(fc, path);
2709         if (!controller)
2710                 return -EINVAL;
2711         cgroup = find_cgroup_in_path(path);
2712         if (!cgroup)
2713                 /* this is just /cgroup/controller */
2714                 return -EINVAL;
2715
2716         get_cgdir_and_path(cgroup, &cgdir, &last);
2717
2718         if (!last) {
2719                 path1 = "/";
2720                 path2 = cgdir;
2721         } else {
2722                 path1 = cgdir;
2723                 path2 = last;
2724         }
2725
2726         if (is_child_cgroup(controller, path1, path2)) {
2727                 // get uid, gid, from '/tasks' file and make up a mode
2728                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2729                 k = cgfs_get_key(controller, cgroup, "tasks");
2730
2731         } else
2732                 k = cgfs_get_key(controller, path1, path2);
2733
2734         if (!k) {
2735                 ret = -EINVAL;
2736                 goto out;
2737         }
2738
2739         /*
2740          * This being a fuse request, the uid and gid must be valid
2741          * in the caller's namespace.  So we can just check to make
2742          * sure that the caller is root in his uid, and privileged
2743          * over the file's current owner.
2744          */
2745         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2746                 ret = -EACCES;
2747                 goto out;
2748         }
2749
2750         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2751
2752 out:
2753         free_key(k);
2754         free(cgdir);
2755
2756         return ret;
2757 }
2758
2759 int cg_chmod(const char *path, mode_t mode)
2760 {
2761         struct fuse_context *fc = fuse_get_context();
2762         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2763         struct cgfs_files *k = NULL;
2764         const char *cgroup;
2765         int ret;
2766
2767         if (!fc)
2768                 return -EIO;
2769
2770         if (strcmp(path, "/cgroup") == 0)
2771                 return -EINVAL;
2772
2773         controller = pick_controller_from_path(fc, path);
2774         if (!controller)
2775                 return -EINVAL;
2776         cgroup = find_cgroup_in_path(path);
2777         if (!cgroup)
2778                 /* this is just /cgroup/controller */
2779                 return -EINVAL;
2780
2781         get_cgdir_and_path(cgroup, &cgdir, &last);
2782
2783         if (!last) {
2784                 path1 = "/";
2785                 path2 = cgdir;
2786         } else {
2787                 path1 = cgdir;
2788                 path2 = last;
2789         }
2790
2791         if (is_child_cgroup(controller, path1, path2)) {
2792                 // get uid, gid, from '/tasks' file and make up a mode
2793                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2794                 k = cgfs_get_key(controller, cgroup, "tasks");
2795
2796         } else
2797                 k = cgfs_get_key(controller, path1, path2);
2798
2799         if (!k) {
2800                 ret = -EINVAL;
2801                 goto out;
2802         }
2803
2804         /*
2805          * This being a fuse request, the uid and gid must be valid
2806          * in the caller's namespace.  So we can just check to make
2807          * sure that the caller is root in his uid, and privileged
2808          * over the file's current owner.
2809          */
2810         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2811                 ret = -EPERM;
2812                 goto out;
2813         }
2814
2815         if (!cgfs_chmod_file(controller, cgroup, mode)) {
2816                 ret = -EINVAL;
2817                 goto out;
2818         }
2819
2820         ret = 0;
2821 out:
2822         free_key(k);
2823         free(cgdir);
2824         return ret;
2825 }
2826
2827 int cg_mkdir(const char *path, mode_t mode)
2828 {
2829         struct fuse_context *fc = fuse_get_context();
2830         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2831         const char *cgroup;
2832         int ret;
2833
2834         if (!fc)
2835                 return -EIO;
2836
2837
2838         controller = pick_controller_from_path(fc, path);
2839         if (!controller)
2840                 return -EINVAL;
2841
2842         cgroup = find_cgroup_in_path(path);
2843         if (!cgroup)
2844                 return -EINVAL;
2845
2846         get_cgdir_and_path(cgroup, &cgdir, &last);
2847         if (!last)
2848                 path1 = "/";
2849         else
2850                 path1 = cgdir;
2851
2852         pid_t initpid = lookup_initpid_in_store(fc->pid);
2853         if (initpid <= 0)
2854                 initpid = fc->pid;
2855         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2856                 if (!next)
2857                         ret = -EINVAL;
2858                 else if (last && strcmp(next, last) == 0)
2859                         ret = -EEXIST;
2860                 else
2861                         ret = -ENOENT;
2862                 goto out;
2863         }
2864
2865         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2866                 ret = -EACCES;
2867                 goto out;
2868         }
2869         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2870                 ret = -EACCES;
2871                 goto out;
2872         }
2873
2874         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2875
2876 out:
2877         free(cgdir);
2878         free(next);
2879         return ret;
2880 }
2881
2882 int cg_rmdir(const char *path)
2883 {
2884         struct fuse_context *fc = fuse_get_context();
2885         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2886         const char *cgroup;
2887         int ret;
2888
2889         if (!fc)
2890                 return -EIO;
2891
2892         controller = pick_controller_from_path(fc, path);
2893         if (!controller)
2894                 return -EINVAL;
2895
2896         cgroup = find_cgroup_in_path(path);
2897         if (!cgroup)
2898                 return -EINVAL;
2899
2900         get_cgdir_and_path(cgroup, &cgdir, &last);
2901         if (!last) {
2902                 ret = -EINVAL;
2903                 goto out;
2904         }
2905
2906         pid_t initpid = lookup_initpid_in_store(fc->pid);
2907         if (initpid <= 0)
2908                 initpid = fc->pid;
2909         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2910                 if (!last || strcmp(next, last) == 0)
2911                         ret = -EBUSY;
2912                 else
2913                         ret = -ENOENT;
2914                 goto out;
2915         }
2916
2917         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2918                 ret = -EACCES;
2919                 goto out;
2920         }
2921         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2922                 ret = -EACCES;
2923                 goto out;
2924         }
2925
2926         if (!cgfs_remove(controller, cgroup)) {
2927                 ret = -EINVAL;
2928                 goto out;
2929         }
2930
2931         ret = 0;
2932
2933 out:
2934         free(cgdir);
2935         free(next);
2936         return ret;
2937 }
2938
/* Return true when the string @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
        size_t preflen = strlen(pref);

        return strncmp(line, pref, preflen) == 0;
}
2945
/*
 * Pick selected counters out of the text of a memory.stat file.
 *
 * @memstat is scanned line by line. A line is taken when it starts with
 * one of the keys below followed by a blank; the number after the key is
 * divided by 1024 and stored through the matching output pointer. Output
 * values whose key is absent (or whose number cannot be parsed) are left
 * untouched, so callers should pre-initialise them.
 *
 * The value is located from the length of the key that matched. A fixed
 * offset is wrong here because the keys have different lengths ("cache"
 * is 5 characters, "inactive_anon" is 13).
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable)
{
        const struct {
                const char *key;
                unsigned long *dest;
        } fields[] = {
                { "cache",         cached        },
                { "active_anon",   active_anon   },
                { "inactive_anon", inactive_anon },
                { "active_file",   active_file   },
                { "inactive_file", inactive_file },
                { "unevictable",   unevictable   },
        };
        const size_t nfields = sizeof(fields) / sizeof(fields[0]);

        while (*memstat) {
                char *eol;
                size_t i;

                for (i = 0; i < nfields; i++) {
                        size_t keylen = strlen(fields[i].key);
                        unsigned long value;

                        if (strncmp(memstat, fields[i].key, keylen) != 0)
                                continue;
                        /* Require a delimiter so "cache" != "cache_xyz". */
                        if (memstat[keylen] != ' ' && memstat[keylen] != '\t')
                                continue;

                        if (sscanf(memstat + keylen, "%lu", &value) == 1)
                                *fields[i].dest = value / 1024;
                        break;
                }

                eol = strchr(memstat, '\n');
                if (!eol)
                        return;
                memstat = eol + 1;
        }
}
2979
/*
 * Search the multi-line text @str for the first line that starts with
 * "<major>:<minor> <iotype>" and parse the unsigned number following that
 * key into *@v. *@v is reset to 0 first, so it stays 0 when no line
 * matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32] = { 0 };
        size_t keylen;
        char *cursor;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        keylen = strlen(key);

        *v = 0;

        for (cursor = str; *cursor; ) {
                char *newline;

                if (startswith(cursor, key)) {
                        sscanf(cursor + keylen, "%lu", v);
                        return;
                }

                /* Advance to the start of the next line, if there is one. */
                newline = strchr(cursor, '\n');
                if (!newline)
                        return;
                cursor = newline + 1;
        }
}
3002
3003 static int read_file(const char *path, char *buf, size_t size,
3004                      struct file_info *d)
3005 {
3006         size_t linelen = 0, total_len = 0, rv = 0;
3007         char *line = NULL;
3008         char *cache = d->buf;
3009         size_t cache_size = d->buflen;
3010         FILE *f = fopen(path, "r");
3011         if (!f)
3012                 return 0;
3013
3014         while (getline(&line, &linelen, f) != -1) {
3015                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3016                 if (l < 0) {
3017                         perror("Error writing to cache");
3018                         rv = 0;
3019                         goto err;
3020                 }
3021                 if (l >= cache_size) {
3022                         fprintf(stderr, "Internal error: truncated write to cache\n");
3023                         rv = 0;
3024                         goto err;
3025                 }
3026                 cache += l;
3027                 cache_size -= l;
3028                 total_len += l;
3029         }
3030
3031         d->size = total_len;
3032         if (total_len > size)
3033                 total_len = size;
3034
3035         /* read from off 0 */
3036         memcpy(buf, d->buf, total_len);
3037         rv = total_len;
3038   err:
3039         fclose(f);
3040         free(line);
3041         return rv;
3042 }
3043
3044 /*
3045  * FUSE ops for /proc
3046  */
3047
/*
 * Read "memory.limit_in_bytes" of @cgroup in the "memory" controller and
 * return it parsed as a base-10 unsigned long. When the value cannot be
 * read, (unsigned long)-1 is returned.
 */
static unsigned long get_memlimit(const char *cgroup)
{
        char *raw = NULL;
        unsigned long limit;

        if (!cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw)) {
                free(raw);
                return (unsigned long)-1;
        }

        limit = strtoul(raw, NULL, 10);
        free(raw);
        return limit;
}
3060
/*
 * Return the smallest memory limit found on @cgroup or any of its
 * ancestors: get_memlimit() is queried for @cgroup itself and then for
 * each dirname() of it until "/" has been queried. Ancestors for which
 * get_memlimit() reports (unsigned long)-1 are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
        /* dirname() edits its argument, so work on a stack copy. */
        char *dir = strdupa(cgroup);
        unsigned long lowest = get_memlimit(dir);

        while (strcmp(dir, "/") != 0) {
                unsigned long cur;

                dir = dirname(dir);
                cur = get_memlimit(dir);
                if (cur == (unsigned long)-1)
                        continue;
                if (cur < lowest)
                        lowest = cur;
        }

        return lowest;
}
3077
3078 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3079                 struct fuse_file_info *fi)
3080 {
3081         struct fuse_context *fc = fuse_get_context();
3082         struct file_info *d = (struct file_info *)fi->fh;
3083         char *cg;
3084         char *memusage_str = NULL, *memstat_str = NULL,
3085                 *memswlimit_str = NULL, *memswusage_str = NULL,
3086                 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3087         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3088                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3089                 active_file = 0, inactive_file = 0, unevictable = 0;
3090         char *line = NULL;
3091         size_t linelen = 0, total_len = 0, rv = 0;
3092         char *cache = d->buf;
3093         size_t cache_size = d->buflen;
3094         FILE *f = NULL;
3095
3096         if (offset){
3097                 if (offset > d->size)
3098                         return -EINVAL;
3099                 if (!d->cached)
3100                         return 0;
3101                 int left = d->size - offset;
3102                 total_len = left > size ? size: left;
3103                 memcpy(buf, cache + offset, total_len);
3104                 return total_len;
3105         }
3106
3107         pid_t initpid = lookup_initpid_in_store(fc->pid);
3108         if (initpid <= 0)
3109                 initpid = fc->pid;
3110         cg = get_pid_cgroup(initpid, "memory");
3111         if (!cg)
3112                 return read_file("/proc/meminfo", buf, size, d);
3113         prune_init_slice(cg);
3114
3115         memlimit = get_min_memlimit(cg);
3116         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3117                 goto err;
3118         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3119                 goto err;
3120
3121         // Following values are allowed to fail, because swapaccount might be turned
3122         // off for current kernel
3123         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3124                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3125         {
3126                 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
3127                 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3128                         goto err;
3129                 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3130                         goto err;
3131
3132                 memswlimit = strtoul(memswlimit_str, NULL, 10);
3133                 memswusage = strtoul(memswusage_str, NULL, 10);
3134
3135                 if (!strcmp(memswlimit_str, memswlimit_default_str))
3136                         memswlimit = 0;
3137                 if (!strcmp(memswusage_str, memswusage_default_str))
3138                         memswusage = 0;
3139
3140                 memswlimit = memswlimit / 1024;
3141                 memswusage = memswusage / 1024;
3142         }
3143
3144         memusage = strtoul(memusage_str, NULL, 10);
3145         memlimit /= 1024;
3146         memusage /= 1024;
3147
3148         parse_memstat(memstat_str, &cached, &active_anon,
3149                         &inactive_anon, &active_file, &inactive_file,
3150                         &unevictable);
3151
3152         f = fopen("/proc/meminfo", "r");
3153         if (!f)
3154                 goto err;
3155
3156         while (getline(&line, &linelen, f) != -1) {
3157                 ssize_t l;
3158                 char *printme, lbuf[100];
3159
3160                 memset(lbuf, 0, 100);
3161                 if (startswith(line, "MemTotal:")) {
3162                         sscanf(line+14, "%lu", &hosttotal);
3163                         if (hosttotal < memlimit)
3164                                 memlimit = hosttotal;
3165                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3166                         printme = lbuf;
3167                 } else if (startswith(line, "MemFree:")) {
3168                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3169                         printme = lbuf;
3170                 } else if (startswith(line, "MemAvailable:")) {
3171                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage);
3172                         printme = lbuf;
3173                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3174                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit - memlimit);
3175                         printme = lbuf;
3176                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3177                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n",
3178                                 (memswlimit - memlimit) - (memswusage - memusage));
3179                         printme = lbuf;
3180                 } else if (startswith(line, "Slab:")) {
3181                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3182                         printme = lbuf;
3183                 } else if (startswith(line, "Buffers:")) {
3184                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3185                         printme = lbuf;
3186                 } else if (startswith(line, "Cached:")) {
3187                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3188                         printme = lbuf;
3189                 } else if (startswith(line, "SwapCached:")) {
3190                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3191                         printme = lbuf;
3192                 } else if (startswith(line, "Active")) {
3193                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3194                                         active_anon + active_file);
3195                         printme = lbuf;
3196                 } else if (startswith(line, "Inactive")) {
3197                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3198                                         inactive_anon + inactive_file);
3199                         printme = lbuf;
3200                 } else if (startswith(line, "Active(anon)")) {
3201                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3202                         printme = lbuf;
3203                 } else if (startswith(line, "Inactive(anon)")) {
3204                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3205                         printme = lbuf;
3206                 } else if (startswith(line, "Active(file)")) {
3207                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3208                         printme = lbuf;
3209                 } else if (startswith(line, "Inactive(file)")) {
3210                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3211                         printme = lbuf;
3212                 } else if (startswith(line, "Unevictable")) {
3213                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3214                         printme = lbuf;
3215                 } else if (startswith(line, "SReclaimable")) {
3216                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3217                         printme = lbuf;
3218                 } else if (startswith(line, "SUnreclaim")) {
3219                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3220                         printme = lbuf;
3221                 } else
3222                         printme = line;
3223
3224                 l = snprintf(cache, cache_size, "%s", printme);
3225                 if (l < 0) {
3226                         perror("Error writing to cache");
3227                         rv = 0;
3228                         goto err;
3229
3230                 }
3231                 if (l >= cache_size) {
3232                         fprintf(stderr, "Internal error: truncated write to cache\n");
3233                         rv = 0;
3234                         goto err;
3235                 }
3236
3237                 cache += l;
3238                 cache_size -= l;
3239                 total_len += l;
3240         }
3241
3242         d->cached = 1;
3243         d->size = total_len;
3244         if (total_len > size ) total_len = size;
3245         memcpy(buf, d->buf, total_len);
3246
3247         rv = total_len;
3248 err:
3249         if (f)
3250                 fclose(f);
3251         free(line);
3252         free(cg);
3253         free(memusage_str);
3254         free(memswlimit_str);
3255         free(memswusage_str);
3256         free(memstat_str);
3257         free(memswlimit_default_str);
3258         free(memswusage_default_str);
3259         return rv;
3260 }
3261
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg in the "cpuset" controller.
 * On success the string obtained from cgfs_get_value() is returned and
 * the caller must free it; on failure NULL is returned.
 */
static char *get_cpuset(const char *cg)
{
        char *cpus;

        return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3274
3275 bool cpu_in_cpuset(int cpu, const char *cpuset);
3276
/*
 * Parse the CPU number out of a "processor : N" line and return what
 * cpu_in_cpuset() reports for it against @cpuset. Lines that do not
 * parse yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int cpu;

        return sscanf(line, "processor       : %d", &cpu) == 1 &&
               cpu_in_cpuset(cpu, cpuset);
}
3285
/*
 * Check whether @line is a "processor : N" line (N being an integer) as
 * found in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
        int ignored;

        return sscanf(line, "processor       : %d", &ignored) == 1;
}
3297
3298 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3299                 struct fuse_file_info *fi)
3300 {
3301         struct fuse_context *fc = fuse_get_context();
3302         struct file_info *d = (struct file_info *)fi->fh;
3303         char *cg;
3304         char *cpuset = NULL;
3305         char *line = NULL;
3306         size_t linelen = 0, total_len = 0, rv = 0;
3307         bool am_printing = false, firstline = true, is_s390x = false;
3308         int curcpu = -1, cpu;
3309         char *cache = d->buf;
3310         size_t cache_size = d->buflen;
3311         FILE *f = NULL;
3312
3313         if (offset){
3314                 if (offset > d->size)
3315                         return -EINVAL;
3316                 if (!d->cached)
3317                         return 0;
3318                 int left = d->size - offset;
3319                 total_len = left > size ? size: left;
3320                 memcpy(buf, cache + offset, total_len);
3321                 return total_len;
3322         }
3323
3324         pid_t initpid = lookup_initpid_in_store(fc->pid);
3325         if (initpid <= 0)
3326                 initpid = fc->pid;
3327         cg = get_pid_cgroup(initpid, "cpuset");
3328         if (!cg)
3329                 return read_file("proc/cpuinfo", buf, size, d);
3330         prune_init_slice(cg);
3331
3332         cpuset = get_cpuset(cg);
3333         if (!cpuset)
3334                 goto err;
3335
3336         f = fopen("/proc/cpuinfo", "r");
3337         if (!f)
3338                 goto err;
3339
3340         while (getline(&line, &linelen, f) != -1) {
3341                 ssize_t l;
3342                 if (firstline) {
3343                         firstline = false;
3344                         if (strstr(line, "IBM/S390") != NULL) {
3345                                 is_s390x = true;
3346                                 am_printing = true;
3347                                 continue;
3348                         }
3349                 }
3350                 if (strncmp(line, "# processors:", 12) == 0)
3351                         continue;
3352                 if (is_processor_line(line)) {
3353                         am_printing = cpuline_in_cpuset(line, cpuset);
3354                         if (am_printing) {
3355                                 curcpu ++;
3356                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3357                                 if (l < 0) {
3358                                         perror("Error writing to cache");
3359                                         rv = 0;
3360                                         goto err;
3361                                 }
3362                                 if (l >= cache_size) {
3363                                         fprintf(stderr, "Internal error: truncated write to cache\n");
3364                                         rv = 0;
3365                                         goto err;
3366                                 }
3367                                 cache += l;
3368                                 cache_size -= l;
3369                                 total_len += l;
3370                         }
3371                         continue;
3372                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3373                         char *p;
3374                         if (!cpu_in_cpuset(cpu, cpuset))
3375                                 continue;
3376                         curcpu ++;
3377                         p = strchr(line, ':');
3378                         if (!p || !*p)
3379                                 goto err;
3380                         p++;
3381                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3382                         if (l < 0) {
3383                                 perror("Error writing to cache");
3384                                 rv = 0;
3385                                 goto err;
3386                         }
3387                         if (l >= cache_size) {
3388                                 fprintf(stderr, "Internal error: truncated write to cache\n");
3389                                 rv = 0;
3390                                 goto err;
3391                         }
3392                         cache += l;
3393                         cache_size -= l;
3394                         total_len += l;
3395                         continue;
3396
3397                 }
3398                 if (am_printing) {
3399                         l = snprintf(cache, cache_size, "%s", line);
3400                         if (l < 0) {
3401                                 perror("Error writing to cache");
3402                                 rv = 0;
3403                                 goto err;
3404                         }
3405                         if (l >= cache_size) {
3406                                 fprintf(stderr, "Internal error: truncated write to cache\n");
3407                                 rv = 0;
3408                                 goto err;
3409                         }
3410                         cache += l;
3411                         cache_size -= l;
3412                         total_len += l;
3413                 }
3414         }
3415
3416         if (is_s390x) {
3417                 char *origcache = d->buf;
3418                 ssize_t l;
3419                 do {
3420                         d->buf = malloc(d->buflen);
3421                 } while (!d->buf);
3422                 cache = d->buf;
3423                 cache_size = d->buflen;
3424                 total_len = 0;
3425                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3426                 if (l < 0 || l >= cache_size) {
3427                         free(origcache);
3428                         goto err;
3429                 }
3430                 cache_size -= l;
3431                 cache += l;
3432                 total_len += l;
3433                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3434                 if (l < 0 || l >= cache_size) {
3435                         free(origcache);
3436                         goto err;
3437                 }
3438                 cache_size -= l;
3439                 cache += l;
3440                 total_len += l;
3441                 l = snprintf(cache, cache_size, "%s", origcache);
3442                 free(origcache);
3443                 if (l < 0 || l >= cache_size)
3444                         goto err;
3445                 total_len += l;
3446         }
3447
3448         d->cached = 1;
3449         d->size = total_len;
3450         if (total_len > size ) total_len = size;
3451
3452         /* read from off 0 */
3453         memcpy(buf, d->buf, total_len);
3454         rv = total_len;
3455 err:
3456         if (f)
3457                 fclose(f);
3458         free(line);
3459         free(cpuset);
3460         free(cg);
3461         return rv;
3462 }
3463
/*
 * FUSE read handler producing a cpuset-aware view of /proc/stat.
 *
 * A read at a non-zero offset is served from the cache in d->buf that a
 * previous offset-0 read filled in (-EINVAL when offset lies beyond the
 * cached size, 0 when nothing is cached).
 *
 * A read at offset 0 regenerates the cache from the host's /proc/stat:
 * the first line of the file is discarded, "cpuN" lines are kept only for
 * CPUs accepted by cpu_in_cpuset() and are renumbered from 0, every other
 * line is copied verbatim, and a new first line holding the sums over the
 * kept CPUs is placed in front. When the caller's cgroup cannot be
 * determined the host file is passed through via read_file().
 *
 * Returns the number of bytes copied into buf; 0 on internal failure.
 */
static int proc_stat_read(char *buf, size_t size, off_t offset,
                struct fuse_file_info *fi)
{
        struct fuse_context *fc = fuse_get_context();
        struct file_info *d = (struct file_info *)fi->fh;
        char *cg;
        char *cpuset = NULL;
        char *line = NULL;
        size_t linelen = 0, total_len = 0, rv = 0;
        int curcpu = -1; /* cpu numbering starts at 0 */
        unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
        unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
                                        irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
        char cpuall[CPUALL_MAX_SIZE];
        /*
         * Reserve the first CPUALL_MAX_SIZE bytes of d->buf for the summed
         * "cpu" line, which can only be written once all per-cpu lines
         * have been read; the other lines are written behind that gap.
         * NOTE(review): assumes d->buflen >= CPUALL_MAX_SIZE - confirm at
         * the allocation site.
         */
        char *cache = d->buf + CPUALL_MAX_SIZE;
        size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
        FILE *f = NULL;

        /* Continuation of an earlier read: serve from the cache. */
        if (offset){
                if (offset > d->size)
                        return -EINVAL;
                if (!d->cached)
                        return 0;
                int left = d->size - offset;
                total_len = left > size ? size: left;
                memcpy(buf, d->buf + offset, total_len);
                return total_len;
        }

        /* Fall back to the caller's own pid when no init pid is stored. */
        pid_t initpid = lookup_initpid_in_store(fc->pid);
        if (initpid <= 0)
                initpid = fc->pid;
        cg = get_pid_cgroup(initpid, "cpuset");
        if (!cg)
                return read_file("/proc/stat", buf, size, d);
        prune_init_slice(cg);

        cpuset = get_cpuset(cg);
        if (!cpuset)
                goto err;

        f = fopen("/proc/stat", "r");
        if (!f)
                goto err;

        /* Skip the first line; a replacement is synthesised after the loop. */
        if (getline(&line, &linelen, f) < 0) {
                fprintf(stderr, "proc_stat_read read first line failed\n");
                goto err;
        }

        while (getline(&line, &linelen, f) != -1) {
                ssize_t l;
                int cpu;
                char cpu_char[10]; /* That's a lot of cores */
                char *c;

                if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
                        /* not a ^cpuN line containing a number N, just print it */
                        l = snprintf(cache, cache_size, "%s", line);
                        if (l < 0) {
                                perror("Error writing to cache");
                                rv = 0;
                                goto err;
                        }
                        if (l >= cache_size) {
                                fprintf(stderr, "Internal error: truncated write to cache\n");
                                rv = 0;
                                goto err;
                        }
                        cache += l;
                        cache_size -= l;
                        total_len += l;
                        continue;
                }

                /* Drop cpuN lines whose N is unparsable or rejected by cpu_in_cpuset(). */
                if (sscanf(cpu_char, "%d", &cpu) != 1)
                        continue;
                if (!cpu_in_cpuset(cpu, cpuset))
                        continue;
                curcpu ++;

                /* Keep everything after the "cpuN" token, but renumber the CPU. */
                c = strchr(line, ' ');
                if (!c)
                        continue;
                l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
                if (l < 0) {
                        perror("Error writing to cache");
                        rv = 0;
                        goto err;

                }
                if (l >= cache_size) {
                        fprintf(stderr, "Internal error: truncated write to cache\n");
                        rv = 0;
                        goto err;
                }

                cache += l;
                cache_size -= l;
                total_len += l;

                /* Accumulate the first nine counters; lines with fewer are not summed. */
                if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
                        &softirq, &steal, &guest) != 9)
                        continue;
                user_sum += user;
                nice_sum += nice;
                system_sum += system;
                idle_sum += idle;
                iowait_sum += iowait;
                irq_sum += irq;
                softirq_sum += softirq;
                steal_sum += steal;
                guest_sum += guest;
        }

        /* Rewind to the start of the buffer to place the summed line there. */
        cache = d->buf;

        int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
                "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
        if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
                memcpy(cache, cpuall, cpuall_len);
                cache += cpuall_len;
        } else{
                /* shouldn't happen */
                fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
                cpuall_len = 0;
        }

        /*
         * Close the unused part of the reserved gap: slide the lines that
         * were written behind it down so they directly follow the summed
         * line. The regions may overlap, hence memmove.
         */
        memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
        total_len += cpuall_len;
        /* Cache is complete; later reads with an offset are served from it. */
        d->cached = 1;
        d->size = total_len;
        if (total_len > size ) total_len = size;

        memcpy(buf, d->buf, total_len);
        rv = total_len;

err:
        if (f)
                fclose(f);
        free(line);
        free(cpuset);
        free(cg);
        return rv;
}
3612
/*
 * Return the number of seconds between now and the st_ctime of the
 * /proc/<initpid> entry, where <initpid> is what
 * lookup_initpid_in_store() yields for @pid. Returns 0 when no init pid
 * is known or the entry cannot be stat'ed.
 */
static long int getreaperage(pid_t pid)
{
        char procdir[100];
        struct stat st;
        pid_t reaper;
        int len;

        reaper = lookup_initpid_in_store(pid);
        if (reaper <= 0)
                return 0;

        len = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
        if (len < 0 || len >= (int)sizeof(procdir))
                return 0;

        if (lstat(procdir, &st) < 0)
                return 0;

        return time(NULL) - st.st_ctime;
}
3633
/*
 * Return the value of cpuacct.usage for the cpuacct cgroup of @task's
 * reaper, divided by 10^9 (presumably nanoseconds -> whole seconds).
 *
 * Returns 0 when the reaper cannot be found, its cgroup cannot be
 * determined, or the usage file cannot be read.
 */
static unsigned long get_reaper_busy(pid_t task)
{
        char *cg = NULL, *raw = NULL;
        unsigned long busy = 0;
        pid_t reaper;

        reaper = lookup_initpid_in_store(task);
        if (reaper <= 0)
                return 0;

        cg = get_pid_cgroup(reaper, "cpuacct");
        if (cg) {
                prune_init_slice(cg);
                if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
                        busy = strtoul(raw, NULL, 10) / 1000000000;
        }

        /* free(NULL) is a no-op, so both pointers can be released blindly. */
        free(cg);
        free(raw);
        return busy;
}
3657
#if RELOADTEST
/*
 * Create (or truncate) the marker file /tmp/lxcfs-iwashere.
 * Only built when RELOADTEST is set; presumably lets the reload test
 * observe that this copy of the code was executed.
 */
void iwashere(void)
{
        int marker = creat("/tmp/lxcfs-iwashere", 0644);

        if (marker < 0)
                return;
        close(marker);
}
#endif
3668
3669 /*
3670  * We read /proc/uptime and reuse its second field.
3671  * For the first field, we use the mtime for the reaper for
3672  * the calling pid as returned by getreaperage
3673  */
3674 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3675                 struct fuse_file_info *fi)
3676 {
3677         struct fuse_context *fc = fuse_get_context();
3678         struct file_info *d = (struct file_info *)fi->fh;
3679         long int reaperage = getreaperage(fc->pid);
3680         unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3681         char *cache = d->buf;
3682         ssize_t total_len = 0;
3683
3684 #if RELOADTEST
3685         iwashere();
3686 #endif
3687
3688         if (offset){
3689                 if (offset > d->size)
3690                         return -EINVAL;
3691                 if (!d->cached)
3692                         return 0;
3693                 int left = d->size - offset;
3694                 total_len = left > size ? size: left;
3695                 memcpy(buf, cache + offset, total_len);
3696                 return total_len;
3697         }
3698
3699         idletime = reaperage - busytime;
3700         if (idletime > reaperage)
3701                 idletime = reaperage;
3702
3703         total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3704         if (total_len < 0){
3705                 perror("Error writing to cache");
3706                 return 0;
3707         }
3708
3709         d->size = (int)total_len;
3710         d->cached = 1;
3711
3712         if (total_len > size) total_len = size;
3713
3714         memcpy(buf, d->buf, total_len);
3715         return total_len;
3716 }
3717
3718 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3719                 struct fuse_file_info *fi)
3720 {
3721         char dev_name[72];
3722         struct fuse_context *fc = fuse_get_context();
3723         struct file_info *d = (struct file_info *)fi->fh;
3724         char *cg;
3725         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3726                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
3727         unsigned long read = 0, write = 0;
3728         unsigned long read_merged = 0, write_merged = 0;
3729         unsigned long read_sectors = 0, write_sectors = 0;
3730         unsigned long read_ticks = 0, write_ticks = 0;
3731         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3732         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3733         char *cache = d->buf;
3734         size_t cache_size = d->buflen;
3735         char *line = NULL;
3736         size_t linelen = 0, total_len = 0, rv = 0;
3737         unsigned int major = 0, minor = 0;
3738         int i = 0;
3739         FILE *f = NULL;
3740
3741         if (offset){
3742                 if (offset > d->size)
3743                         return -EINVAL;
3744                 if (!d->cached)
3745                         return 0;
3746                 int left = d->size - offset;
3747                 total_len = left > size ? size: left;
3748                 memcpy(buf, cache + offset, total_len);
3749                 return total_len;
3750         }
3751
3752         pid_t initpid = lookup_initpid_in_store(fc->pid);
3753         if (initpid <= 0)
3754                 initpid = fc->pid;
3755         cg = get_pid_cgroup(initpid, "blkio");
3756         if (!cg)
3757                 return read_file("/proc/diskstats", buf, size, d);
3758         prune_init_slice(cg);
3759
3760         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
3761                 goto err;
3762         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
3763                 goto err;
3764         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
3765                 goto err;
3766         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
3767                 goto err;
3768         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
3769                 goto err;
3770
3771
3772         f = fopen("/proc/diskstats", "r");
3773         if (!f)
3774                 goto err;
3775
3776         while (getline(&line, &linelen, f) != -1) {
3777                 ssize_t l;
3778                 char lbuf[256];
3779
3780                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
3781                 if (i != 3)
3782                         continue;
3783
3784                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3785                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3786                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3787                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3788                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3789                 read_sectors = read_sectors/512;
3790                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3791                 write_sectors = write_sectors/512;
3792
3793                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3794                 rd_svctm = rd_svctm/1000000;
3795                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3796                 rd_wait = rd_wait/1000000;
3797                 read_ticks = rd_svctm + rd_wait;
3798
3799                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3800                 wr_svctm =  wr_svctm/1000000;
3801                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3802                 wr_wait =  wr_wait/1000000;
3803                 write_ticks = wr_svctm + wr_wait;
3804
3805                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3806                 tot_ticks =  tot_ticks/1000000;
3807
3808                 memset(lbuf, 0, 256);
3809                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3810                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3811                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3812                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3813                 else
3814                         continue;
3815
3816                 l = snprintf(cache, cache_size, "%s", lbuf);
3817                 if (l < 0) {
3818                         perror("Error writing to fuse buf");
3819                         rv = 0;
3820                         goto err;
3821                 }
3822                 if (l >= cache_size) {
3823                         fprintf(stderr, "Internal error: truncated write to cache\n");
3824                         rv = 0;
3825                         goto err;
3826                 }
3827                 cache += l;
3828                 cache_size -= l;
3829                 total_len += l;
3830         }
3831
3832         d->cached = 1;
3833         d->size = total_len;
3834         if (total_len > size ) total_len = size;
3835         memcpy(buf, d->buf, total_len);
3836
3837         rv = total_len;
3838 err:
3839         free(cg);
3840         if (f)
3841                 fclose(f);
3842         free(line);
3843         free(io_serviced_str);
3844         free(io_merged_str);
3845         free(io_service_bytes_str);
3846         free(io_wait_time_str);
3847         free(io_service_time_str);
3848         return rv;
3849 }
3850
3851 static int proc_swaps_read(char *buf, size_t size, off_t offset,
3852                 struct fuse_file_info *fi)
3853 {
3854         struct fuse_context *fc = fuse_get_context();
3855         struct file_info *d = (struct file_info *)fi->fh;
3856         char *cg = NULL;
3857         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
3858              *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3859         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
3860         ssize_t total_len = 0, rv = 0;
3861         ssize_t l = 0;
3862         char *cache = d->buf;
3863
3864         if (offset) {
3865                 if (offset > d->size)
3866                         return -EINVAL;
3867                 if (!d->cached)
3868                         return 0;
3869                 int left = d->size - offset;
3870                 total_len = left > size ? size: left;
3871                 memcpy(buf, cache + offset, total_len);
3872                 return total_len;
3873         }
3874
3875         pid_t initpid = lookup_initpid_in_store(fc->pid);
3876         if (initpid <= 0)
3877                 initpid = fc->pid;
3878         cg = get_pid_cgroup(initpid, "memory");
3879         if (!cg)
3880                 return read_file("/proc/swaps", buf, size, d);
3881         prune_init_slice(cg);
3882
3883         if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
3884                 goto err;
3885
3886         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3887                 goto err;
3888
3889         memlimit = strtoul(memlimit_str, NULL, 10);
3890         memusage = strtoul(memusage_str, NULL, 10);
3891
3892         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
3893             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
3894
3895                 /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
3896                 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3897                     goto err;
3898                 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3899                     goto err;
3900
3901                 memswlimit = strtoul(memswlimit_str, NULL, 10);
3902                 memswusage = strtoul(memswusage_str, NULL, 10);
3903
3904                 if (!strcmp(memswlimit_str, memswlimit_default_str))
3905                     memswlimit = 0;
3906                 if (!strcmp(memswusage_str, memswusage_default_str))
3907                     memswusage = 0;
3908
3909                 swap_total = (memswlimit - memlimit) / 1024;
3910                 swap_free = (memswusage - memusage) / 1024;
3911         }
3912
3913         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
3914
3915         /* When no mem + swap limit is specified or swapaccount=0*/
3916         if (!memswlimit) {
3917                 char *line = NULL;
3918                 size_t linelen = 0;
3919                 FILE *f = fopen("/proc/meminfo", "r");
3920
3921                 if (!f)
3922                         goto err;
3923
3924                 while (getline(&line, &linelen, f) != -1) {
3925                         if (startswith(line, "SwapTotal:")) {
3926                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
3927                         } else if (startswith(line, "SwapFree:")) {
3928                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
3929                         }
3930                 }
3931
3932                 free(line);
3933                 fclose(f);
3934         }
3935
3936         if (swap_total > 0) {
3937                 l = snprintf(d->buf + total_len, d->size - total_len,
3938                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
3939                                 swap_total, swap_free);
3940                 total_len += l;
3941         }
3942
3943         if (total_len < 0 || l < 0) {
3944                 perror("Error writing to cache");
3945                 rv = 0;
3946                 goto err;
3947         }
3948
3949         d->cached = 1;
3950         d->size = (int)total_len;
3951
3952         if (total_len > size) total_len = size;
3953         memcpy(buf, d->buf, total_len);
3954         rv = total_len;
3955
3956 err:
3957         free(cg);
3958         free(memswlimit_str);
3959         free(memlimit_str);
3960         free(memusage_str);
3961         free(memswusage_str);
3962         free(memswusage_default_str);
3963         free(memswlimit_default_str);
3964         return rv;
3965 }
3966
/*
 * Return the number of bytes obtained by reading @which line by line until
 * getline() reports end-of-file or an error.  Returns 0 if the file cannot
 * be opened.
 */
static off_t get_procfile_size(const char *which)
{
        char *line = NULL;
        size_t cap = 0;
        ssize_t total = 0;
        ssize_t n;
        FILE *fp;

        fp = fopen(which, "r");
        if (fp == NULL)
                return 0;

        for (;;) {
                n = getline(&line, &cap, fp);
                if (n == -1)
                        break;
                total += n;
        }

        fclose(fp);
        free(line);

        return total;
}
3983
/*
 * Fill @sb for one of the emulated /proc paths.
 *
 * "/proc" is reported as a 0555 directory with two links; each emulated
 * file is reported as a 0444 regular file of size 0 with one link.  Owner
 * and group are 0 and all three timestamps are the current time.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read, or -ENOENT for
 * any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
        static const char *const files[] = {
                "/proc/meminfo",
                "/proc/cpuinfo",
                "/proc/uptime",
                "/proc/stat",
                "/proc/diskstats",
                "/proc/swaps",
        };
        struct timespec now;
        size_t i;

        memset(sb, 0, sizeof(*sb));
        if (clock_gettime(CLOCK_REALTIME, &now) < 0)
                return -EINVAL;

        sb->st_uid = 0;
        sb->st_gid = 0;
        sb->st_atim = now;
        sb->st_mtim = now;
        sb->st_ctim = now;

        if (!strcmp(path, "/proc")) {
                sb->st_mode = S_IFDIR | 00555;
                sb->st_nlink = 2;
                return 0;
        }

        for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
                if (strcmp(path, files[i]) != 0)
                        continue;
                sb->st_size = 0;
                sb->st_mode = S_IFREG | 00444;
                sb->st_nlink = 1;
                return 0;
        }

        return -ENOENT;
}
4012
4013 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4014                 struct fuse_file_info *fi)
4015 {
4016         if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
4017                                 filler(buf, "meminfo", NULL, 0) != 0 ||
4018                                 filler(buf, "stat", NULL, 0) != 0 ||
4019                                 filler(buf, "uptime", NULL, 0) != 0 ||
4020                                 filler(buf, "diskstats", NULL, 0) != 0 ||
4021                                 filler(buf, "swaps", NULL, 0) != 0)
4022                 return -EINVAL;
4023         return 0;
4024 }
4025
4026 int proc_open(const char *path, struct fuse_file_info *fi)
4027 {
4028         int type = -1;
4029         struct file_info *info;
4030
4031         if (strcmp(path, "/proc/meminfo") == 0)
4032                 type = LXC_TYPE_PROC_MEMINFO;
4033         else if (strcmp(path, "/proc/cpuinfo") == 0)
4034                 type = LXC_TYPE_PROC_CPUINFO;
4035         else if (strcmp(path, "/proc/uptime") == 0)
4036                 type = LXC_TYPE_PROC_UPTIME;
4037         else if (strcmp(path, "/proc/stat") == 0)
4038                 type = LXC_TYPE_PROC_STAT;
4039         else if (strcmp(path, "/proc/diskstats") == 0)
4040                 type = LXC_TYPE_PROC_DISKSTATS;
4041         else if (strcmp(path, "/proc/swaps") == 0)
4042                 type = LXC_TYPE_PROC_SWAPS;
4043         if (type == -1)
4044                 return -ENOENT;
4045
4046         info = malloc(sizeof(*info));
4047         if (!info)
4048                 return -ENOMEM;
4049
4050         memset(info, 0, sizeof(*info));
4051         info->type = type;
4052
4053         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4054         do {
4055                 info->buf = malloc(info->buflen);
4056         } while (!info->buf);
4057         memset(info->buf, 0, info->buflen);
4058         /* set actual size to buffer size */
4059         info->size = info->buflen;
4060
4061         fi->fh = (unsigned long)info;
4062         return 0;
4063 }
4064
/*
 * Access check for the emulated /proc entries: they are read-only, so any
 * requested bit other than R_OK is refused.  @path is not consulted.
 *
 * Returns 0 if @mask asks for nothing beyond R_OK, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
        int beyond_read = mask & ~R_OK;

        return beyond_read ? -EACCES : 0;
}
4072
/*
 * Release handler for the emulated /proc files: hands @fi to
 * do_release_file_info() and always reports success.  @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
4078
4079 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4080                 struct fuse_file_info *fi)
4081 {
4082         struct file_info *f = (struct file_info *) fi->fh;
4083
4084         switch (f->type) {
4085         case LXC_TYPE_PROC_MEMINFO:
4086                 return proc_meminfo_read(buf, size, offset, fi);
4087         case LXC_TYPE_PROC_CPUINFO:
4088                 return proc_cpuinfo_read(buf, size, offset, fi);
4089         case LXC_TYPE_PROC_UPTIME:
4090                 return proc_uptime_read(buf, size, offset, fi);
4091         case LXC_TYPE_PROC_STAT:
4092                 return proc_stat_read(buf, size, offset, fi);
4093         case LXC_TYPE_PROC_DISKSTATS:
4094                 return proc_diskstats_read(buf, size, offset, fi);
4095         case LXC_TYPE_PROC_SWAPS:
4096                 return proc_swaps_read(buf, size, offset, fi);
4097         default:
4098                 return -EINVAL;
4099         }
4100 }
4101
/*
 * Functions needed to set up cgroups in the __constructor__.
 */
4105
/*
 * Create @dir and every missing parent directory, like `mkdir -p`.
 *
 * The path is walked one component at a time; each successively longer
 * prefix is passed to mkdir() with @mode, and EEXIST is not treated as an
 * error.  Both absolute and relative paths are accepted.
 *
 * Returns true on success, false if an allocation or a mkdir() fails (a
 * diagnostic is written to stderr for the latter).
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
        const char *tmp = dir;
        const char *orig = dir;
        char *makeme;

        do {
                /* dir: start of the next component; tmp: the '/' (or NUL) after it */
                dir = tmp + strspn(tmp, "/");
                tmp = dir + strcspn(dir, "/");

                /*
                 * For a relative path the first prefix is the empty string;
                 * mkdir("") fails with ENOENT, so skip it.
                 */
                if (dir == orig)
                        continue;

                makeme = strndup(orig, dir - orig);
                if (!makeme)
                        return false;
                if (mkdir(makeme, mode) && errno != EEXIST) {
                        fprintf(stderr, "failed to create directory '%s': %s\n",
                                makeme, strerror(errno));
                        free(makeme);
                        return false;
                }
                free(makeme);
        } while(tmp != dir);

        return true;
}
4129
4130 static bool umount_if_mounted(void)
4131 {
4132         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4133                 fprintf(stderr, "failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4134                 return false;
4135         }
4136         return true;
4137 }
4138
4139 static int pivot_enter(void)
4140 {
4141         int ret = -1, oldroot = -1, newroot = -1;
4142
4143         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4144         if (oldroot < 0) {
4145                 fprintf(stderr, "%s: Failed to open old root for fchdir.\n", __func__);
4146                 return ret;
4147         }
4148
4149         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4150         if (newroot < 0) {
4151                 fprintf(stderr, "%s: Failed to open new root for fchdir.\n", __func__);
4152                 goto err;
4153         }
4154
4155         /* change into new root fs */
4156         if (fchdir(newroot) < 0) {
4157                 fprintf(stderr, "%s: Failed to change directory to new rootfs: %s.\n", __func__, ROOTDIR);
4158                 goto err;
4159         }
4160
4161         /* pivot_root into our new root fs */
4162         if (pivot_root(".", ".") < 0) {
4163                 fprintf(stderr, "%s: pivot_root() syscall failed: %s.\n", __func__, strerror(errno));
4164                 goto err;
4165         }
4166
4167         /*
4168          * At this point the old-root is mounted on top of our new-root.
4169          * To unmounted it we must not be chdir'd into it, so escape back
4170          * to the old-root.
4171          */
4172         if (fchdir(oldroot) < 0) {
4173                 fprintf(stderr, "%s: Failed to enter old root.\n", __func__);
4174                 goto err;
4175         }
4176         if (umount2(".", MNT_DETACH) < 0) {
4177                 fprintf(stderr, "%s: Failed to detach old root.\n", __func__);
4178                 goto err;
4179         }
4180
4181         if (fchdir(newroot) < 0) {
4182                 fprintf(stderr, "%s: Failed to re-enter new root.\n", __func__);
4183                 goto err;
4184         }
4185
4186         ret = 0;
4187
4188 err:
4189         if (oldroot > 0)
4190                 close(oldroot);
4191         if (newroot > 0)
4192                 close(newroot);
4193         return ret;
4194 }
4195
4196 /* Prepare our new clean root. */
4197 static int pivot_prepare(void)
4198 {
4199         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4200                 fprintf(stderr, "%s: Failed to create directory for new root.\n", __func__);
4201                 return -1;
4202         }
4203
4204         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4205                 fprintf(stderr, "%s: Failed to bind-mount / for new root: %s.\n", __func__, strerror(errno));
4206                 return -1;
4207         }
4208
4209         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4210                 fprintf(stderr, "%s: Failed to bind-mount /run into new root: %s.\n", __func__, strerror(errno));
4211                 return -1;
4212         }
4213
4214         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4215                 printf("%s: failed to move " BASEDIR " into new root: %s.\n", __func__, strerror(errno));
4216                 return -1;
4217         }
4218
4219         return 0;
4220 }
4221
/*
 * Build the new root with pivot_prepare() and, if that worked, switch into
 * it with pivot_enter().  Returns true only when both steps succeed.
 */
static bool pivot_new_root(void)
{
        return pivot_prepare() >= 0 && pivot_enter() >= 0;
}
4234
4235 static bool setup_cgfs_dir(void)
4236 {
4237         if (!mkdir_p(BASEDIR, 0700)) {
4238                 fprintf(stderr, "Failed to create lxcfs cgroup mountpoint.\n");
4239                 return false;
4240         }
4241
4242         if (!umount_if_mounted()) {
4243                 fprintf(stderr, "Failed to clean up old lxcfs cgroup mountpoint.\n");
4244                 return false;
4245         }
4246
4247         if (unshare(CLONE_NEWNS) < 0) {
4248                 fprintf(stderr, "%s: Failed to unshare mount namespace: %s.\n", __func__, strerror(errno));
4249                 return false;
4250         }
4251
4252         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4253                 fprintf(stderr, "%s: Failed to remount / private: %s.\n", __func__, strerror(errno));
4254                 return false;
4255         }
4256
4257         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
4258                 fprintf(stderr, "Failed to mount tmpfs over lxcfs cgroup mountpoint.\n");
4259                 return false;
4260         }
4261
4262         return true;
4263 }
4264
4265 static bool do_mount_cgroups(void)
4266 {
4267         char *target;
4268         size_t clen, len;
4269         int i, ret;
4270
4271         for (i = 0; i < num_hierarchies; i++) {
4272                 char *controller = hierarchies[i];
4273                 clen = strlen(controller);
4274                 len = strlen(BASEDIR) + clen + 2;
4275                 target = malloc(len);
4276                 if (!target)
4277                         return false;
4278                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4279                 if (ret < 0 || ret >= len) {
4280                         free(target);
4281                         return false;
4282                 }
4283                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4284                         free(target);
4285                         return false;
4286                 }
4287                 if (mount(controller, target, "cgroup", 0, controller) < 0) {
4288                         fprintf(stderr, "Failed mounting cgroup %s\n", controller);
4289                         free(target);
4290                         return false;
4291                 }
4292
4293                 fd_hierarchies[i] = open(target, O_DIRECTORY);
4294                 if (fd_hierarchies[i] < 0) {
4295                         free(target);
4296                         return false;
4297                 }
4298                 free(target);
4299         }
4300         return true;
4301 }
4302
/*
 * Set up the private cgroup mounts: prepare BASEDIR, mount all hierarchies
 * below it, then pivot into the new root.
 *
 * Returns true only if all three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
        if (!setup_cgfs_dir())
                return false;

        if (!do_mount_cgroups()) {
                fprintf(stderr, "Failed to set up private lxcfs cgroup mounts.\n");
                return false;
        }

        return pivot_new_root();
}
4318
/*
 * Open /proc/<pid>/ns/mnt read-only with close-on-exec set.
 *
 * Returns the open descriptor, or -1 if the path could not be formatted or
 * open() failed.
 */
static int preserve_ns(int pid)
{
        /* "/proc" + "/<int as string>" + "/ns/mnt" + NUL */
        char nspath[5 + 21 + 7 + 1];
        int n;

        n = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/mnt", pid);
        if (n < 0 || (size_t)n >= sizeof(nspath))
                return -1;

        return open(nspath, O_RDONLY | O_CLOEXEC);
}
4331
4332 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
4333 {
4334         FILE *f;
4335         char *line = NULL;
4336         size_t len = 0;
4337         int i, init_ns = -1;
4338
4339         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4340                 fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno));
4341                 return;
4342         }
4343         while (getline(&line, &len, f) != -1) {
4344                 char *p, *p2;
4345
4346                 p = strchr(line, ':');
4347                 if (!p)
4348                         goto out;
4349                 *(p++) = '\0';
4350
4351                 p2 = strrchr(p, ':');
4352                 if (!p2)
4353                         goto out;
4354                 *p2 = '\0';
4355
4356                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4357                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4358                  * because it parses out the empty string "" and later on passes
4359                  * it to mount(). Let's skip such entries.
4360                  */
4361                 if (!strcmp(p, ""))
4362                         continue;
4363
4364                 if (!store_hierarchy(line, p))
4365                         goto out;
4366         }
4367
4368         /* Preserve initial namespace. */
4369         init_ns = preserve_ns(getpid());
4370         if (init_ns < 0)
4371                 goto out;
4372
4373         fd_hierarchies = malloc(sizeof(int *) * num_hierarchies);
4374         if (!fd_hierarchies)
4375                 goto out;
4376
4377         for (i = 0; i < num_hierarchies; i++)
4378                 fd_hierarchies[i] = -1;
4379
4380         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4381          * to privately mount lxcfs cgroups. */
4382         if (!cgfs_setup_controllers())
4383                 goto out;
4384
4385         if (setns(init_ns, 0) < 0)
4386                 goto out;
4387
4388         print_subsystems();
4389
4390 out:
4391         free(line);
4392         fclose(f);
4393         if (init_ns >= 0)
4394                 close(init_ns);
4395 }
4396
4397 static void __attribute__((destructor)) free_subsystems(void)
4398 {
4399         int i;
4400
4401         for (i = 0; i < num_hierarchies; i++) {
4402                 if (hierarchies[i])
4403                         free(hierarchies[i]);
4404                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
4405                         close(fd_hierarchies[i]);
4406         }
4407         free(hierarchies);
4408         free(fd_hierarchies);
4409 }