bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #include <dirent.h>
  12 #include <errno.h>
  13 #include <fcntl.h>
  14 #include <fuse.h>
  15 #include <libgen.h>
  16 #include <pthread.h>
  17 #include <sched.h>
  18 #include <stdbool.h>
  19 #include <stdio.h>
  20 #include <stdlib.h>
  21 #include <string.h>
  22 #include <time.h>
  23 #include <unistd.h>
  24 #include <wait.h>
  25 #include <linux/sched.h>
  26 #include <sys/epoll.h>
  27 #include <sys/mman.h>
  28 #include <sys/mount.h>
  29 #include <sys/param.h>
  30 #include <sys/socket.h>
  31 #include <sys/syscall.h>
  32
  33 #include "bindings.h"
  34 #include "config.h" // for VERSION
  35
  36 /* Define pivot_root() if missing from the C library */
  37 #ifndef HAVE_PIVOT_ROOT
  38 static int pivot_root(const char * new_root, const char * put_old)
  39 {
  40 #ifdef __NR_pivot_root
  41 return syscall(__NR_pivot_root, new_root, put_old);
  42 #else
  43 errno = ENOSYS;
  44 return -1;
  45 #endif
  46 }
  47 #else
  48 extern int pivot_root(const char * new_root, const char * put_old);
  49 #endif
  50
  51 enum {
  52         LXC_TYPE_CGDIR,
  53         LXC_TYPE_CGFILE,
  54         LXC_TYPE_PROC_MEMINFO,
  55         LXC_TYPE_PROC_CPUINFO,
  56         LXC_TYPE_PROC_UPTIME,
  57         LXC_TYPE_PROC_STAT,
  58         LXC_TYPE_PROC_DISKSTATS,
  59         LXC_TYPE_PROC_SWAPS,
  60 };
  61
  62 struct file_info {
  63         char *controller;
  64         char *cgroup;
  65         char *file;
  66         int type;
  67         char *buf;  // unused as of yet
  68         int buflen;
  69         int size; //actual data size
  70         int cached;
  71 };
  72
  73 /* reserve buffer size, for cpuall in /proc/stat */
  74 #define BUF_RESERVE_SIZE 256
  75
  76 /*
  77  * A table caching which pid is init for a pid namespace.
  78  * When looking up which pid is init for $qpid, we first
  79  * 1. Stat /proc/$qpid/ns/pid.
  80  * 2. Check whether the ino_t is in our store.
  81  *   a. if not, fork a child in qpid's ns to send us
  82  *       ucred.pid = 1, and read the initpid.  Cache
  83  *       initpid and creation time for /proc/initpid
  84  *       in a new store entry.
  85  *   b. if so, verify that /proc/initpid still matches
  86  *       what we have saved.  If not, clear the store
  87  *       entry and go back to a.  If so, return the
  88  *       cached initpid.
  89  */
  90 struct pidns_init_store {
  91         ino_t ino;          // inode number for /proc/$pid/ns/pid
  92         pid_t initpid;      // the pid of nit in that ns
  93         long int ctime;     // the time at which /proc/$initpid was created
  94         struct pidns_init_store *next;
  95         long int lastcheck;
  96 };
  97
  98 /* lol - look at how they are allocated in the kernel */
  99 #define PIDNS_HASH_SIZE 4096
 100 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 101
 102 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 103 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 104 static void lock_mutex(pthread_mutex_t *l)
 105 {
 106         int ret;
 107
 108         if ((ret = pthread_mutex_lock(l)) != 0) {
 109                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 110                 exit(1);
 111         }
 112 }
 113
 114 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 115  * Number of hierarchies mounted. */
 116 static int num_hierarchies;
 117
 118 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 119  * Hierachies mounted {cpuset, blkio, ...}:
 120  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 121 static char **hierarchies;
 122
 123 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 124  * Open file descriptors:
 125  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 126  * private mount namespace.
 127  * Initialized via __constructor__ collect_and_mount_subsystems().
 128  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 129  * mounts and respective files in the private namespace even when located in
 130  * another namespace using the *at() family of functions
 131  * {openat(), fchownat(), ...}. */
 132 static int *fd_hierarchies;
 133
 134 static void unlock_mutex(pthread_mutex_t *l)
 135 {
 136         int ret;
 137
 138         if ((ret = pthread_mutex_unlock(l)) != 0) {
 139                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 140                 exit(1);
 141         }
 142 }
 143
 144 static void store_lock(void)
 145 {
 146         lock_mutex(&pidns_store_mutex);
 147 }
 148
 149 static void store_unlock(void)
 150 {
 151         unlock_mutex(&pidns_store_mutex);
 152 }
 153
 154 /* Must be called under store_lock */
 155 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 156 {
 157         struct stat initsb;
 158         char fnam[100];
 159
 160         snprintf(fnam, 100, "/proc/%d", e->initpid);
 161         if (stat(fnam, &initsb) < 0)
 162                 return false;
 163
 164         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 165                     initsb.st_ctime, e->initpid);
 166
 167         if (e->ctime != initsb.st_ctime)
 168                 return false;
 169         return true;
 170 }
 171
 172 /* Must be called under store_lock */
 173 static void remove_initpid(struct pidns_init_store *e)
 174 {
 175         struct pidns_init_store *tmp;
 176         int h;
 177
 178         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 179
 180         h = HASH(e->ino);
 181         if (pidns_hash_table[h] == e) {
 182                 pidns_hash_table[h] = e->next;
 183                 free(e);
 184                 return;
 185         }
 186
 187         tmp = pidns_hash_table[h];
 188         while (tmp) {
 189                 if (tmp->next == e) {
 190                         tmp->next = e->next;
 191                         free(e);
 192                         return;
 193                 }
 194                 tmp = tmp->next;
 195         }
 196 }
 197
 198 #define PURGE_SECS 5
 199 /* Must be called under store_lock */
 200 static void prune_initpid_store(void)
 201 {
 202         static long int last_prune = 0;
 203         struct pidns_init_store *e, *prev, *delme;
 204         long int now, threshold;
 205         int i;
 206
 207         if (!last_prune) {
 208                 last_prune = time(NULL);
 209                 return;
 210         }
 211         now = time(NULL);
 212         if (now < last_prune + PURGE_SECS)
 213                 return;
 214
 215         lxcfs_debug("%s\n", "Pruning.");
 216
 217         last_prune = now;
 218         threshold = now - 2 * PURGE_SECS;
 219
 220         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 221                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 222                         if (e->lastcheck < threshold) {
 223
 224                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 225
 226                                 delme = e;
 227                                 if (prev)
 228                                         prev->next = e->next;
 229                                 else
 230                                         pidns_hash_table[i] = e->next;
 231                                 e = e->next;
 232                                 free(delme);
 233                         } else {
 234                                 prev = e;
 235                                 e = e->next;
 236                         }
 237                 }
 238         }
 239 }
 240
 241 /* Must be called under store_lock */
 242 static void save_initpid(struct stat *sb, pid_t pid)
 243 {
 244         struct pidns_init_store *e;
 245         char fpath[100];
 246         struct stat procsb;
 247         int h;
 248
 249         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 250
 251         snprintf(fpath, 100, "/proc/%d", pid);
 252         if (stat(fpath, &procsb) < 0)
 253                 return;
 254         do {
 255                 e = malloc(sizeof(*e));
 256         } while (!e);
 257         e->ino = sb->st_ino;
 258         e->initpid = pid;
 259         e->ctime = procsb.st_ctime;
 260         h = HASH(e->ino);
 261         e->next = pidns_hash_table[h];
 262         e->lastcheck = time(NULL);
 263         pidns_hash_table[h] = e;
 264 }
 265
 266 /*
 267  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 268  * entry for the inode number and creation time.  Verify that the init pid
 269  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 270  * otherwise.
 271  * Must be called under store_lock
 272  */
 273 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 274 {
 275         int h = HASH(sb->st_ino);
 276         struct pidns_init_store *e = pidns_hash_table[h];
 277
 278         while (e) {
 279                 if (e->ino == sb->st_ino) {
 280                         if (initpid_still_valid(e, sb)) {
 281                                 e->lastcheck = time(NULL);
 282                                 return e;
 283                         }
 284                         remove_initpid(e);
 285                         return NULL;
 286                 }
 287                 e = e->next;
 288         }
 289
 290         return NULL;
 291 }
 292
 293 static int is_dir(const char *path, int fd)
 294 {
 295         struct stat statbuf;
 296         int ret = fstatat(fd, path, &statbuf, fd);
 297         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 298                 return 1;
 299         return 0;
 300 }
 301
 302 static char *must_copy_string(const char *str)
 303 {
 304         char *dup = NULL;
 305         if (!str)
 306                 return NULL;
 307         do {
 308                 dup = strdup(str);
 309         } while (!dup);
 310
 311         return dup;
 312 }
 313
 314 static inline void drop_trailing_newlines(char *s)
 315 {
 316         int l;
 317
 318         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 319                 s[l-1] = '\0';
 320 }
 321
 322 #define BATCH_SIZE 50
 323 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 324 {
 325         int newbatches = (newlen / BATCH_SIZE) + 1;
 326         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 327
 328         if (!*mem || newbatches > oldbatches) {
 329                 char *tmp;
 330                 do {
 331                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 332                 } while (!tmp);
 333                 *mem = tmp;
 334         }
 335 }
 336 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 337 {
 338         size_t newlen = *len + linelen;
 339         dorealloc(contents, *len, newlen + 1);
 340         memcpy(*contents + *len, line, linelen+1);
 341         *len = newlen;
 342 }
 343
 344 static char *slurp_file(const char *from, int fd)
 345 {
 346         char *line = NULL;
 347         char *contents = NULL;
 348         FILE *f = fdopen(fd, "r");
 349         size_t len = 0, fulllen = 0;
 350         ssize_t linelen;
 351
 352         if (!f)
 353                 return NULL;
 354
 355         while ((linelen = getline(&line, &len, f)) != -1) {
 356                 append_line(&contents, &fulllen, line, linelen);
 357         }
 358         fclose(f);
 359
 360         if (contents)
 361                 drop_trailing_newlines(contents);
 362         free(line);
 363         return contents;
 364 }
 365
 366 static bool write_string(const char *fnam, const char *string, int fd)
 367 {
 368         FILE *f;
 369         size_t len, ret;
 370
 371         if (!(f = fdopen(fd, "w")))
 372                 return false;
 373         len = strlen(string);
 374         ret = fwrite(string, 1, len, f);
 375         if (ret != len) {
 376                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 377                 fclose(f);
 378                 return false;
 379         }
 380         if (fclose(f) < 0) {
 381                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 382                 return false;
 383         }
 384         return true;
 385 }
 386
 387 struct cgfs_files {
 388         char *name;
 389         uint32_t uid, gid;
 390         uint32_t mode;
 391 };
 392
 393 #define ALLOC_NUM 20
 394 static bool store_hierarchy(char *stridx, char *h)
 395 {
 396         if (num_hierarchies % ALLOC_NUM == 0) {
 397                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 398                 n *= ALLOC_NUM;
 399                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 400                 if (!tmp) {
 401                         lxcfs_error("%s\n", strerror(errno));
 402                         exit(1);
 403                 }
 404                 hierarchies = tmp;
 405         }
 406
 407         hierarchies[num_hierarchies++] = must_copy_string(h);
 408         return true;
 409 }
 410
 411 static void print_subsystems(void)
 412 {
 413         int i;
 414
 415         fprintf(stderr, "hierarchies:\n");
 416         for (i = 0; i < num_hierarchies; i++) {
 417                 if (hierarchies[i])
 418                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 419                                 fd_hierarchies[i], hierarchies[i]);
 420         }
 421 }
 422
 423 static bool in_comma_list(const char *needle, const char *haystack)
 424 {
 425         const char *s = haystack, *e;
 426         size_t nlen = strlen(needle);
 427
 428         while (*s && (e = strchr(s, ','))) {
 429                 if (nlen != e - s) {
 430                         s = e + 1;
 431                         continue;
 432                 }
 433                 if (strncmp(needle, s, nlen) == 0)
 434                         return true;
 435                 s = e + 1;
 436         }
 437         if (strcmp(needle, s) == 0)
 438                 return true;
 439         return false;
 440 }
 441
 442 /* do we need to do any massaging here?  I'm not sure... */
 443 /* Return the mounted controller and store the corresponding open file descriptor
 444  * referring to the controller mountpoint in the private lxcfs namespace in
 445  * @cfd.
 446  */
 447 static char *find_mounted_controller(const char *controller, int *cfd)
 448 {
 449         int i;
 450
 451         for (i = 0; i < num_hierarchies; i++) {
 452                 if (!hierarchies[i])
 453                         continue;
 454                 if (strcmp(hierarchies[i], controller) == 0) {
 455                         *cfd = fd_hierarchies[i];
 456                         return hierarchies[i];
 457                 }
 458                 if (in_comma_list(controller, hierarchies[i])) {
 459                         *cfd = fd_hierarchies[i];
 460                         return hierarchies[i];
 461                 }
 462         }
 463
 464         return NULL;
 465 }
 466
 467 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 468                 const char *value)
 469 {
 470         int ret, fd, cfd;
 471         size_t len;
 472         char *fnam, *tmpc;
 473
 474         tmpc = find_mounted_controller(controller, &cfd);
 475         if (!tmpc)
 476                 return false;
 477
 478         /* Make sure we pass a relative path to *at() family of functions.
 479          * . + /cgroup + / + file + \0
 480          */
 481         len = strlen(cgroup) + strlen(file) + 3;
 482         fnam = alloca(len);
 483         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 484         if (ret < 0 || (size_t)ret >= len)
 485                 return false;
 486
 487         fd = openat(cfd, fnam, O_WRONLY);
 488         if (fd < 0)
 489                 return false;
 490
 491         return write_string(fnam, value, fd);
 492 }
 493
 494 // Chown all the files in the cgroup directory.  We do this when we create
 495 // a cgroup on behalf of a user.
 496 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 497 {
 498         struct dirent *direntp;
 499         char path[MAXPATHLEN];
 500         size_t len;
 501         DIR *d;
 502         int fd1, ret;
 503
 504         len = strlen(dirname);
 505         if (len >= MAXPATHLEN) {
 506                 lxcfs_error("Pathname too long: %s\n", dirname);
 507                 return;
 508         }
 509
 510         fd1 = openat(fd, dirname, O_DIRECTORY);
 511         if (fd1 < 0)
 512                 return;
 513
 514         d = fdopendir(fd1);
 515         if (!d) {
 516                 lxcfs_error("Failed to open %s\n", dirname);
 517                 return;
 518         }
 519
 520         while ((direntp = readdir(d))) {
 521                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 522                         continue;
 523                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 524                 if (ret < 0 || ret >= MAXPATHLEN) {
 525                         lxcfs_error("Pathname too long under %s\n", dirname);
 526                         continue;
 527                 }
 528                 if (fchownat(fd, path, uid, gid, 0) < 0)
 529                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 530         }
 531         closedir(d);
 532 }
 533
 534 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 535 {
 536         int cfd;
 537         size_t len;
 538         char *dirnam, *tmpc;
 539
 540         tmpc = find_mounted_controller(controller, &cfd);
 541         if (!tmpc)
 542                 return -EINVAL;
 543
 544         /* Make sure we pass a relative path to *at() family of functions.
 545          * . + /cg + \0
 546          */
 547         len = strlen(cg) + 2;
 548         dirnam = alloca(len);
 549         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 550
 551         if (mkdirat(cfd, dirnam, 0755) < 0)
 552                 return -errno;
 553
 554         if (uid == 0 && gid == 0)
 555                 return 0;
 556
 557         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 558                 return -errno;
 559
 560         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 561
 562         return 0;
 563 }
 564
 565 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 566 {
 567         struct dirent *direntp;
 568         DIR *dir;
 569         bool ret = false;
 570         char pathname[MAXPATHLEN];
 571         int dupfd;
 572
 573         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 574         if (dupfd < 0)
 575                 return false;
 576
 577         dir = fdopendir(dupfd);
 578         if (!dir) {
 579                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 580                 close(dupfd);
 581                 return false;
 582         }
 583
 584         while ((direntp = readdir(dir))) {
 585                 struct stat mystat;
 586                 int rc;
 587
 588                 if (!strcmp(direntp->d_name, ".") ||
 589                     !strcmp(direntp->d_name, ".."))
 590                         continue;
 591
 592                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 593                 if (rc < 0 || rc >= MAXPATHLEN) {
 594                         lxcfs_error("%s\n", "Pathname too long.");
 595                         continue;
 596                 }
 597
 598                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 599                 if (rc) {
 600                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 601                         continue;
 602                 }
 603                 if (S_ISDIR(mystat.st_mode))
 604                         if (!recursive_rmdir(pathname, fd, cfd))
 605                                 lxcfs_debug("Error removing %s.\n", pathname);
 606         }
 607
 608         ret = true;
 609         if (closedir(dir) < 0) {
 610                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 611                 ret = false;
 612         }
 613
 614         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 615                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 616                 ret = false;
 617         }
 618
 619         close(dupfd);
 620
 621         return ret;
 622 }
 623
 624 bool cgfs_remove(const char *controller, const char *cg)
 625 {
 626         int fd, cfd;
 627         size_t len;
 628         char *dirnam, *tmpc;
 629         bool bret;
 630
 631         tmpc = find_mounted_controller(controller, &cfd);
 632         if (!tmpc)
 633                 return false;
 634
 635         /* Make sure we pass a relative path to *at() family of functions.
 636          * . +  /cg + \0
 637          */
 638         len = strlen(cg) + 2;
 639         dirnam = alloca(len);
 640         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 641
 642         fd = openat(cfd, dirnam, O_DIRECTORY);
 643         if (fd < 0)
 644                 return false;
 645
 646         bret = recursive_rmdir(dirnam, fd, cfd);
 647         close(fd);
 648         return bret;
 649 }
 650
 651 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 652 {
 653         int cfd;
 654         size_t len;
 655         char *pathname, *tmpc;
 656
 657         tmpc = find_mounted_controller(controller, &cfd);
 658         if (!tmpc)
 659                 return false;
 660
 661         /* Make sure we pass a relative path to *at() family of functions.
 662          * . + /file + \0
 663          */
 664         len = strlen(file) + 2;
 665         pathname = alloca(len);
 666         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 667         if (fchmodat(cfd, pathname, mode, 0) < 0)
 668                 return false;
 669         return true;
 670 }
 671
 672 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 673 {
 674         size_t len;
 675         char *fname;
 676
 677         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 678         fname = alloca(len);
 679         snprintf(fname, len, "%s/tasks", dirname);
 680         if (fchownat(fd, fname, uid, gid, 0) != 0)
 681                 return -errno;
 682         snprintf(fname, len, "%s/cgroup.procs", dirname);
 683         if (fchownat(fd, fname, uid, gid, 0) != 0)
 684                 return -errno;
 685         return 0;
 686 }
 687
 688 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 689 {
 690         int cfd;
 691         size_t len;
 692         char *pathname, *tmpc;
 693
 694         tmpc = find_mounted_controller(controller, &cfd);
 695         if (!tmpc)
 696                 return -EINVAL;
 697
 698         /* Make sure we pass a relative path to *at() family of functions.
 699          * . + /file + \0
 700          */
 701         len = strlen(file) + 2;
 702         pathname = alloca(len);
 703         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 704         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 705                 return -errno;
 706
 707         if (is_dir(pathname, cfd))
 708                 // like cgmanager did, we want to chown the tasks file as well
 709                 return chown_tasks_files(pathname, uid, gid, cfd);
 710
 711         return 0;
 712 }
 713
 714 FILE *open_pids_file(const char *controller, const char *cgroup)
 715 {
 716         int fd, cfd;
 717         size_t len;
 718         char *pathname, *tmpc;
 719
 720         tmpc = find_mounted_controller(controller, &cfd);
 721         if (!tmpc)
 722                 return NULL;
 723
 724         /* Make sure we pass a relative path to *at() family of functions.
 725          * . + /cgroup + / "cgroup.procs" + \0
 726          */
 727         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 728         pathname = alloca(len);
 729         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 730
 731         fd = openat(cfd, pathname, O_WRONLY);
 732         if (fd < 0)
 733                 return NULL;
 734
 735         return fdopen(fd, "w");
 736 }
 737
 738 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 739                                 void ***list, size_t typesize,
 740                                 void* (*iterator)(const char*, const char*, const char*))
 741 {
 742         int cfd, fd, ret;
 743         size_t len;
 744         char *cg, *tmpc;
 745         char pathname[MAXPATHLEN];
 746         size_t sz = 0, asz = 0;
 747         struct dirent *dirent;
 748         DIR *dir;
 749
 750         tmpc = find_mounted_controller(controller, &cfd);
 751         *list = NULL;
 752         if (!tmpc)
 753                 return false;
 754
 755         /* Make sure we pass a relative path to *at() family of functions. */
 756         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 757         cg = alloca(len);
 758         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 759         if (ret < 0 || (size_t)ret >= len) {
 760                 lxcfs_error("Pathname too long under %s\n", cgroup);
 761                 return false;
 762         }
 763
 764         fd = openat(cfd, cg, O_DIRECTORY);
 765         if (fd < 0)
 766                 return false;
 767
 768         dir = fdopendir(fd);
 769         if (!dir)
 770                 return false;
 771
 772         while ((dirent = readdir(dir))) {
 773                 struct stat mystat;
 774
 775                 if (!strcmp(dirent->d_name, ".") ||
 776                     !strcmp(dirent->d_name, ".."))
 777                         continue;
 778
 779                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 780                 if (ret < 0 || ret >= MAXPATHLEN) {
 781                         lxcfs_error("Pathname too long under %s\n", cg);
 782                         continue;
 783                 }
 784
 785                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 786                 if (ret) {
 787                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
 788                         continue;
 789                 }
 790                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
 791                     (directories && !S_ISDIR(mystat.st_mode)))
 792                         continue;
 793
 794                 if (sz+2 >= asz) {
 795                         void **tmp;
 796                         asz += BATCH_SIZE;
 797                         do {
 798                                 tmp = realloc(*list, asz * typesize);
 799                         } while  (!tmp);
 800                         *list = tmp;
 801                 }
 802                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
 803                 (*list)[sz+1] = NULL;
 804                 sz++;
 805         }
 806         if (closedir(dir) < 0) {
 807                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
 808                 return false;
 809         }
 810         return true;
 811 }
 812
 813 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 814 {
 815         char *dup;
 816         do {
 817                 dup = strdup(dir_entry);
 818         } while (!dup);
 819         return dup;
 820 }
 821
 822 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
 823 {
 824         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
 825 }
 826
 827 void free_key(struct cgfs_files *k)
 828 {
 829         if (!k)
 830                 return;
 831         free(k->name);
 832         free(k);
 833 }
 834
 835 void free_keys(struct cgfs_files **keys)
 836 {
 837         int i;
 838
 839         if (!keys)
 840                 return;
 841         for (i = 0; keys[i]; i++) {
 842                 free_key(keys[i]);
 843         }
 844         free(keys);
 845 }
 846
 847 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
 848 {
 849         int ret, fd, cfd;
 850         size_t len;
 851         char *fnam, *tmpc;
 852
 853         tmpc = find_mounted_controller(controller, &cfd);
 854         if (!tmpc)
 855                 return false;
 856
 857         /* Make sure we pass a relative path to *at() family of functions.
 858          * . + /cgroup + / + file + \0
 859          */
 860         len = strlen(cgroup) + strlen(file) + 3;
 861         fnam = alloca(len);
 862         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 863         if (ret < 0 || (size_t)ret >= len)
 864                 return NULL;
 865
 866         fd = openat(cfd, fnam, O_RDONLY);
 867         if (fd < 0)
 868                 return NULL;
 869
 870         *value = slurp_file(fnam, fd);
 871         return *value != NULL;
 872 }
 873
 874 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
 875 {
 876         int ret, cfd;
 877         size_t len;
 878         char *fnam, *tmpc;
 879         struct stat sb;
 880         struct cgfs_files *newkey;
 881
 882         tmpc = find_mounted_controller(controller, &cfd);
 883         if (!tmpc)
 884                 return false;
 885
 886         if (file && *file == '/')
 887                 file++;
 888
 889         if (file && strchr(file, '/'))
 890                 return NULL;
 891
 892         /* Make sure we pass a relative path to *at() family of functions.
 893          * . + /cgroup + / + file + \0
 894          */
 895         len = strlen(cgroup) + 3;
 896         if (file)
 897                 len += strlen(file) + 1;
 898         fnam = alloca(len);
 899         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
 900                  file ? "/" : "", file ? file : "");
 901
 902         ret = fstatat(cfd, fnam, &sb, 0);
 903         if (ret < 0)
 904                 return NULL;
 905
 906         do {
 907                 newkey = malloc(sizeof(struct cgfs_files));
 908         } while (!newkey);
 909         if (file)
 910                 newkey->name = must_copy_string(file);
 911         else if (strrchr(cgroup, '/'))
 912                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
 913         else
 914                 newkey->name = must_copy_string(cgroup);
 915         newkey->uid = sb.st_uid;
 916         newkey->gid = sb.st_gid;
 917         newkey->mode = sb.st_mode;
 918
 919         return newkey;
 920 }
 921
 922 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 923 {
 924         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
 925         if (!entry) {
 926                 lxcfs_error("Error getting files under %s:%s\n", controller,
 927                              cgroup);
 928         }
 929         return entry;
 930 }
 931
 932 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
 933 {
 934         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
 935 }
 936
 937 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 938 {
 939         int cfd;
 940         size_t len;
 941         char *fnam, *tmpc;
 942         int ret;
 943         struct stat sb;
 944
 945         tmpc = find_mounted_controller(controller, &cfd);
 946         if (!tmpc)
 947                 return false;
 948
 949         /* Make sure we pass a relative path to *at() family of functions.
 950          * . + /cgroup + / + f + \0
 951          */
 952         len = strlen(cgroup) + strlen(f) + 3;
 953         fnam = alloca(len);
 954         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
 955         if (ret < 0 || (size_t)ret >= len)
 956                 return false;
 957
 958         ret = fstatat(cfd, fnam, &sb, 0);
 959         if (ret < 0 || !S_ISDIR(sb.st_mode))
 960                 return false;
 961
 962         return true;
 963 }
 964
/*
 * Status codes for send_creds(); callers below compare its return value
 * against SEND_CREDS_OK.
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations: these helpers are used below before they are defined. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
 972
 973 /*
 974  * clone a task which switches to @task's namespace and writes '1'.
 975  * over a unix sock so we can read the task's reaper's pid in our
 976  * namespace
 977  *
 978  * Note: glibc's fork() does not respect pidns, which can lead to failed
 979  * assertions inside glibc (and thus failed forks) if the child's pid in
 980  * the pidns and the parent pid outside are identical. Using clone prevents
 981  * this issue.
 982  */
 983 static void write_task_init_pid_exit(int sock, pid_t target)
 984 {
 985         char fnam[100];
 986         pid_t pid;
 987         int fd, ret;
 988         size_t stack_size = sysconf(_SC_PAGESIZE);
 989         void *stack = alloca(stack_size);
 990
 991         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
 992         if (ret < 0 || ret >= sizeof(fnam))
 993                 _exit(1);
 994
 995         fd = open(fnam, O_RDONLY);
 996         if (fd < 0) {
 997                 perror("write_task_init_pid_exit open of ns/pid");
 998                 _exit(1);
 999         }
1000         if (setns(fd, 0)) {
1001                 perror("write_task_init_pid_exit setns 1");
1002                 close(fd);
1003                 _exit(1);
1004         }
1005         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1006         if (pid < 0)
1007                 _exit(1);
1008         if (pid != 0) {
1009                 if (!wait_for_pid(pid))
1010                         _exit(1);
1011                 _exit(0);
1012         }
1013 }
1014
1015 static int send_creds_clone_wrapper(void *arg) {
1016         struct ucred cred;
1017         char v;
1018         int sock = *(int *)arg;
1019
1020         /* we are the child */
1021         cred.uid = 0;
1022         cred.gid = 0;
1023         cred.pid = 1;
1024         v = '1';
1025         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1026                 return 1;
1027         return 0;
1028 }
1029
1030 static pid_t get_init_pid_for_task(pid_t task)
1031 {
1032         int sock[2];
1033         pid_t pid;
1034         pid_t ret = -1;
1035         char v = '0';
1036         struct ucred cred;
1037
1038         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1039                 perror("socketpair");
1040                 return -1;
1041         }
1042
1043         pid = fork();
1044         if (pid < 0)
1045                 goto out;
1046         if (!pid) {
1047                 close(sock[1]);
1048                 write_task_init_pid_exit(sock[0], task);
1049                 _exit(0);
1050         }
1051
1052         if (!recv_creds(sock[1], &cred, &v))
1053                 goto out;
1054         ret = cred.pid;
1055
1056 out:
1057         close(sock[0]);
1058         close(sock[1]);
1059         if (pid > 0)
1060                 wait_for_pid(pid);
1061         return ret;
1062 }
1063
1064 static pid_t lookup_initpid_in_store(pid_t qpid)
1065 {
1066         pid_t answer = 0;
1067         struct stat sb;
1068         struct pidns_init_store *e;
1069         char fnam[100];
1070
1071         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1072         store_lock();
1073         if (stat(fnam, &sb) < 0)
1074                 goto out;
1075         e = lookup_verify_initpid(&sb);
1076         if (e) {
1077                 answer = e->initpid;
1078                 goto out;
1079         }
1080         answer = get_init_pid_for_task(qpid);
1081         if (answer > 0)
1082                 save_initpid(&sb, answer);
1083
1084 out:
1085         /* we prune at end in case we are returning
1086          * the value we were about to return */
1087         prune_initpid_store();
1088         store_unlock();
1089         return answer;
1090 }
1091
/*
 * Reap child @pid, retrying when waitpid() is interrupted by a signal.
 * Returns 0 if the child exited normally with status 0, and -1 otherwise
 * (non-positive @pid, waitpid() failure, abnormal termination or a
 * non-zero exit status).
 */
static int wait_for_pid(pid_t pid)
{
        int status;
        pid_t reaped;

        if (pid <= 0)
                return -1;

        for (;;) {
                reaped = waitpid(pid, &status, 0);
                if (reaped == pid)
                        break;
                /* anything but EINTR is a hard failure */
                if (reaped == -1 && errno != EINTR)
                        return -1;
        }

        if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
                return 0;
        return -1;
}
1112
1113
1114 /*
1115  * append pid to *src.
1116  * src: a pointer to a char* in which ot append the pid.
1117  * sz: the number of characters printed so far, minus trailing \0.
1118  * asz: the allocated size so far
1119  * pid: the pid to append
1120  */
1121 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1122 {
1123         char tmp[30];
1124
1125         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1126
1127         if (!*src || tmplen + *sz + 1 >= *asz) {
1128                 char *tmp;
1129                 do {
1130                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1131                 } while (!tmp);
1132                 *src = tmp;
1133                 *asz += BUF_RESERVE_SIZE;
1134         }
1135         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1136         *sz += tmplen;
1137 }
1138
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.
 * The file is rewound first; lines that do not parse as three unsigned
 * numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned) when no range matches or a
 * range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
        unsigned int ns_base,   // first id of the range in the idfile's namespace
                     host_base, // first id of the range in the caller's namespace
                     range;     // number of ids in the range
        char line[400];

        fseek(idfile, 0L, SEEK_SET);
        while (fgets(line, 400, idfile)) {
                if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
                        continue;

                if (host_base + range < host_base || ns_base + range < ns_base) {
                        /*
                         * uids wrapped around - unexpected as this is a procfile,
                         * so just bail.
                         */
                        lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
                                ns_base, host_base, range, line);
                        return -1;
                }

                /* not inside [host_base, host_base + range): try the next line */
                if (in_id < host_base || in_id >= host_base + range)
                        continue;

                /*
                 * host_base <= in_id < host_base + range and neither sum
                 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
                 */
                return (in_id - host_base) + ns_base;
        }

        // no answer found
        return -1;
}
1182
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether @uid is privileged over @victim, judged through the
 * uid_map of process @pid.
 *
 * With @req_ns_root false, identical uids are sufficient. Otherwise @uid
 * must map to 0 in @pid's user namespace and @victim must be mapped there
 * at all. Either id being -1, or /proc/<pid>/uid_map being unreadable,
 * yields false.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
        char map_path[PROCLEN];
        FILE *map;
        bool privileged;
        int n;

        if (victim == -1 || uid == -1)
                return false;

        /*
         * Without the namespace-root requirement, owning the object is
         * enough (uid 1000 may write files owned by uid 1000).
         */
        if (!req_ns_root && uid == victim)
                return true;

        n = snprintf(map_path, PROCLEN, "/proc/%d/uid_map", pid);
        if (n < 0 || n >= PROCLEN)
                return false;

        map = fopen(map_path, "r");
        if (!map)
                return false;

        /*
         * The caller has to be root in his namespace, and the victim has
         * to be mapped into it. The second lookup only runs when the
         * first one succeeded.
         * XXX the victim check may be unnecessary given that fuse sends
         * requests where the vfs has already converted the ids.
         */
        privileged = convert_id_to_ns(map, uid) == 0 &&
                     convert_id_to_ns(map, victim) != -1;

        fclose(map);
        return privileged;
}
1238
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * mode requested in @req_mode (only its O_ACCMODE bits are examined).
 * Callers shift owner or group bits down into the "other" position before
 * calling. Returns false for an access mode that is none of O_RDONLY,
 * O_WRONLY or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
        const mode_t accmode = req_mode & O_ACCMODE;
        mode_t needed;

        if (accmode == O_RDONLY)
                needed = S_IROTH;
        else if (accmode == O_WRONLY)
                needed = S_IWOTH;
        else if (accmode == O_RDWR)
                needed = S_IROTH | S_IWOTH;
        else
                return false;

        return (fmode & needed) == needed;
}
1258
1259
/*
 * Return the path component of @taskcg that follows the prefix @querycg.
 *
 * taskcg is  a/b/c/d/e
 * querycg is a/b/c
 * we return 'd'
 *
 * For querycg "/" or "./" the first component of taskcg (after its first
 * character) is returned. @taskcg must be longer than @querycg, otherwise
 * an error is logged and NULL is returned. The result is newly allocated
 * and must be freed by the caller; NULL is also returned if the
 * allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
        const char *rest;
        char *component, *slash;

        if (strlen(taskcg) <= strlen(querycg)) {
                lxcfs_error("%s\n", "I was fed bad input.");
                return NULL;
        }

        if (!strcmp(querycg, "/") || !strcmp(querycg, "./"))
                rest = taskcg + 1;
        else
                rest = taskcg + strlen(querycg) + 1;

        component = strdup(rest);
        if (!component)
                return NULL;

        /* keep only the first component */
        slash = strchr(component, '/');
        if (slash)
                *slash = '\0';
        return component;
}
1285
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
        size_t len = strlen(x);

        if (len == 0)
                return;
        if (x[len - 1] != '\n')
                return;
        x[len - 1] = '\0';
}
1292
1293 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1294 {
1295         int cfd;
1296         char fnam[PROCLEN];
1297         FILE *f;
1298         char *answer = NULL;
1299         char *line = NULL;
1300         size_t len = 0;
1301         int ret;
1302         const char *h = find_mounted_controller(contrl, &cfd);
1303         if (!h)
1304                 return NULL;
1305
1306         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1307         if (ret < 0 || ret >= PROCLEN)
1308                 return NULL;
1309         if (!(f = fopen(fnam, "r")))
1310                 return NULL;
1311
1312         while (getline(&line, &len, f) != -1) {
1313                 char *c1, *c2;
1314                 if (!line[0])
1315                         continue;
1316                 c1 = strchr(line, ':');
1317                 if (!c1)
1318                         goto out;
1319                 c1++;
1320                 c2 = strchr(c1, ':');
1321                 if (!c2)
1322                         goto out;
1323                 *c2 = '\0';
1324                 if (strcmp(c1, h) != 0)
1325                         continue;
1326                 c2++;
1327                 stripnewline(c2);
1328                 do {
1329                         answer = strdup(c2);
1330                 } while (!answer);
1331                 break;
1332         }
1333
1334 out:
1335         fclose(f);
1336         free(line);
1337         return answer;
1338 }
1339
1340 /*
1341  * check whether a fuse context may access a cgroup dir or file
1342  *
1343  * If file is not null, it is a cgroup file to check under cg.
1344  * If file is null, then we are checking perms on cg itself.
1345  *
1346  * For files we can check the mode of the list_keys result.
1347  * For cgroups, we must make assumptions based on the files under the
1348  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1349  * yet.
1350  */
1351 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1352 {
1353         struct cgfs_files *k = NULL;
1354         bool ret = false;
1355
1356         k = cgfs_get_key(contrl, cg, file);
1357         if (!k)
1358                 return false;
1359
1360         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1361                 if (perms_include(k->mode >> 6, mode)) {
1362                         ret = true;
1363                         goto out;
1364                 }
1365         }
1366         if (fc->gid == k->gid) {
1367                 if (perms_include(k->mode >> 3, mode)) {
1368                         ret = true;
1369                         goto out;
1370                 }
1371         }
1372         ret = perms_include(k->mode, mode);
1373
1374 out:
1375         free_key(k);
1376         return ret;
1377 }
1378
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place. When @cg consists of
 * nothing but "/init.scope" it is reduced to "/". Strings that do not end
 * in the suffix are left untouched.
 */
static void prune_init_slice(char *cg)
{
        const size_t suffix_len = strlen(INITSCOPE);
        size_t len = strlen(cg);
        char *tail;

        if (len < suffix_len)
                return;

        tail = cg + (len - suffix_len);
        if (strcmp(tail, INITSCOPE) != 0)
                return;

        /* keep the leading '/' when nothing else would remain */
        if (tail == cg)
                tail++;
        *tail = '\0';
}
1396
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller. (*nextcg may also be set to NULL, e.g. when no
 * next directory can be determined.)
 *
 * The match is done on whole path components: a caller in /a/b is not
 * considered an ancestor of the sibling cgroup a/bc.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
        bool answer = false;
        char *c2 = get_pid_cgroup(pid, contrl);
        char *linecmp;
        size_t len;

        if (!c2)
                return false;
        prune_init_slice(c2);

        /*
         * callers pass in '/' or './' (openat()) for root cgroup, otherwise
         * they pass in a cgroup without leading '/'
         *
         * The original line here was:
         *      linecmp = *cg == '/' ? c2 : c2+1;
         * TODO: I'm not sure why you'd want to increment when *cg != '/'?
         *       Serge, do you know?
         */
        if (*cg == '/' || !strncmp(cg, "./", 2))
                linecmp = c2;
        else
                linecmp = c2 + 1;

        len = strlen(linecmp);
        if (strncmp(linecmp, cg, len) != 0) {
                if (nextcg) {
                        *nextcg = get_next_cgroup_dir(linecmp, cg);
                }
                goto out;
        }

        /*
         * The caller's cgroup is a string prefix of cg. That only makes it
         * an ancestor if the prefix ends on a component boundary: the
         * prefix is empty or ends in '/', or cg continues with '/' or ends
         * right there. cg[len] is in bounds because the first len
         * characters of cg matched and therefore contain no '\0'.
         */
        if (len > 0 && linecmp[len - 1] != '/' &&
            cg[len] != '\0' && cg[len] != '/') {
                /* sibling with a common name prefix: no next directory */
                if (nextcg)
                        *nextcg = NULL;
                goto out;
        }
        answer = true;

out:
        free(c2);
        return answer;
}
1439
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is always visible, and a task living in the root
 * cgroup sees everything. Otherwise @cg is visible when it equals the
 * task's cgroup, or when one of the two is a whole-component prefix of
 * the other. Returns false if the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
        char *pidcg;
        const char *own;
        size_t want_len, own_len;
        bool visible;

        if (!strcmp(cg, "/") || !strcmp(cg, "./"))
                return true;

        pidcg = get_pid_cgroup(pid, contrl);
        if (!pidcg)
                return false;
        prune_init_slice(pidcg);

        /* compare without the first character of the task's path */
        own = pidcg + 1;
        want_len = strlen(cg);
        own_len = strlen(own);

        if (own_len == 0) {
                /* Task is in the root cg, it can see everything. The
                 * prefix tests below look for a '/' separator, which
                 * was chopped off above, so this needs its own case.
                 */
                visible = true;
        } else if (want_len < own_len) {
                /* looking up a parent dir */
                visible = strncmp(own, cg, want_len) == 0 && own[want_len] == '/';
        } else if (want_len > own_len) {
                /* looking up a child dir */
                visible = strncmp(own, cg, own_len) == 0 && cg[own_len] == '/';
        } else {
                /* same length: only the task's own cgroup qualifies */
                visible = strcmp(cg, own) == 0;
        }

        free(pidcg);
        return visible;
}
1490
1491 /*
1492  * given /cgroup/freezer/a/b, return "freezer".
1493  * the returned char* should NOT be freed.
1494  */
1495 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1496 {
1497         const char *p1;
1498         char *contr, *slash;
1499
1500         if (strlen(path) < 9) {
1501                 errno = EACCES;
1502                 return NULL;
1503         }
1504         if (*(path + 7) != '/') {
1505                 errno = EINVAL;
1506                 return NULL;
1507         }
1508         p1 = path + 8;
1509         contr = strdupa(p1);
1510         if (!contr) {
1511                 errno = ENOMEM;
1512                 return NULL;
1513         }
1514         slash = strstr(contr, "/");
1515         if (slash)
1516                 *slash = '\0';
1517
1518         int i;
1519         for (i = 0; i < num_hierarchies; i++) {
1520                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1521                         return hierarchies[i];
1522         }
1523         errno = ENOENT;
1524         return NULL;
1525 }
1526
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * The result points into @path, just past the first '/' found at or
 * after offset 8, and errno is reset to 0. On failure NULL is returned
 * with errno set to EACCES (path shorter than 9 characters) or EINVAL
 * (no such '/').
 */
static const char *find_cgroup_in_path(const char *path)
{
        const char *slash;

        if (strlen(path) < 9) {
                errno = EACCES;
                return NULL;
        }

        slash = strchr(path + 8, '/');
        if (slash == NULL) {
                errno = EINVAL;
                return NULL;
        }

        errno = 0;
        return slash + 1;
}
1547
/*
 * split the last path element from the path in @cg.
 * @dir receives a newly allocated copy of @cg, cut at its last '/', and
 * should be freed by the caller. @last is set to the last '/' inside @cg
 * itself (so it keeps the leading '/') and must not be freed; it is NULL
 * when @cg contains no '/', in which case @dir is a full copy of @cg.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
        size_t nbytes = strlen(cg) + 1;
        char *copy, *sep;

        /* retry until the allocation succeeds */
        do {
                copy = malloc(nbytes);
        } while (!copy);
        memcpy(copy, cg, nbytes);
        *dir = copy;

        sep = strrchr(cg, '/');
        *last = sep;
        if (!sep)
                return;

        /* cut the copy at the offset of the last '/' in @cg */
        copy[sep - cg] = '\0';
}
1567
1568 /*
1569  * FUSE ops for /cgroup
1570  */
1571
1572 int cg_getattr(const char *path, struct stat *sb)
1573 {
1574         struct timespec now;
1575         struct fuse_context *fc = fuse_get_context();
1576         char * cgdir = NULL;
1577         char *last = NULL, *path1, *path2;
1578         struct cgfs_files *k = NULL;
1579         const char *cgroup;
1580         const char *controller = NULL;
1581         int ret = -ENOENT;
1582
1583
1584         if (!fc)
1585                 return -EIO;
1586
1587         memset(sb, 0, sizeof(struct stat));
1588
1589         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1590                 return -EINVAL;
1591
1592         sb->st_uid = sb->st_gid = 0;
1593         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1594         sb->st_size = 0;
1595
1596         if (strcmp(path, "/cgroup") == 0) {
1597                 sb->st_mode = S_IFDIR | 00755;
1598                 sb->st_nlink = 2;
1599                 return 0;
1600         }
1601
1602         controller = pick_controller_from_path(fc, path);
1603         if (!controller)
1604                 return -errno;
1605         cgroup = find_cgroup_in_path(path);
1606         if (!cgroup) {
1607                 /* this is just /cgroup/controller, return it as a dir */
1608                 sb->st_mode = S_IFDIR | 00755;
1609                 sb->st_nlink = 2;
1610                 return 0;
1611         }
1612
1613         get_cgdir_and_path(cgroup, &cgdir, &last);
1614
1615         if (!last) {
1616                 path1 = "/";
1617                 path2 = cgdir;
1618         } else {
1619                 path1 = cgdir;
1620                 path2 = last;
1621         }
1622
1623         pid_t initpid = lookup_initpid_in_store(fc->pid);
1624         if (initpid <= 0)
1625                 initpid = fc->pid;
1626         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1627          * Then check that caller's cgroup is under path if last is a child
1628          * cgroup, or cgdir if last is a file */
1629
1630         if (is_child_cgroup(controller, path1, path2)) {
1631                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1632                         ret = -ENOENT;
1633                         goto out;
1634                 }
1635                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1636                         /* this is just /cgroup/controller, return it as a dir */
1637                         sb->st_mode = S_IFDIR | 00555;
1638                         sb->st_nlink = 2;
1639                         ret = 0;
1640                         goto out;
1641                 }
1642                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1643                         ret = -EACCES;
1644                         goto out;
1645                 }
1646
1647                 // get uid, gid, from '/tasks' file and make up a mode
1648                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1649                 sb->st_mode = S_IFDIR | 00755;
1650                 k = cgfs_get_key(controller, cgroup, NULL);
1651                 if (!k) {
1652                         sb->st_uid = sb->st_gid = 0;
1653                 } else {
1654                         sb->st_uid = k->uid;
1655                         sb->st_gid = k->gid;
1656                 }
1657                 free_key(k);
1658                 sb->st_nlink = 2;
1659                 ret = 0;
1660                 goto out;
1661         }
1662
1663         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1664                 sb->st_mode = S_IFREG | k->mode;
1665                 sb->st_nlink = 1;
1666                 sb->st_uid = k->uid;
1667                 sb->st_gid = k->gid;
1668                 sb->st_size = 0;
1669                 free_key(k);
1670                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1671                         ret = -ENOENT;
1672                         goto out;
1673                 }
1674                 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1675                         ret = -EACCES;
1676                         goto out;
1677                 }
1678
1679                 ret = 0;
1680         }
1681
1682 out:
1683         free(cgdir);
1684         return ret;
1685 }
1686
1687 int cg_opendir(const char *path, struct fuse_file_info *fi)
1688 {
1689         struct fuse_context *fc = fuse_get_context();
1690         const char *cgroup;
1691         struct file_info *dir_info;
1692         char *controller = NULL;
1693
1694         if (!fc)
1695                 return -EIO;
1696
1697         if (strcmp(path, "/cgroup") == 0) {
1698                 cgroup = NULL;
1699                 controller = NULL;
1700         } else {
1701                 // return list of keys for the controller, and list of child cgroups
1702                 controller = pick_controller_from_path(fc, path);
1703                 if (!controller)
1704                         return -errno;
1705
1706                 cgroup = find_cgroup_in_path(path);
1707                 if (!cgroup) {
1708                         /* this is just /cgroup/controller, return its contents */
1709                         cgroup = "/";
1710                 }
1711         }
1712
1713         pid_t initpid = lookup_initpid_in_store(fc->pid);
1714         if (initpid <= 0)
1715                 initpid = fc->pid;
1716         if (cgroup) {
1717                 if (!caller_may_see_dir(initpid, controller, cgroup))
1718                         return -ENOENT;
1719                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1720                         return -EACCES;
1721         }
1722
1723         /* we'll free this at cg_releasedir */
1724         dir_info = malloc(sizeof(*dir_info));
1725         if (!dir_info)
1726                 return -ENOMEM;
1727         dir_info->controller = must_copy_string(controller);
1728         dir_info->cgroup = must_copy_string(cgroup);
1729         dir_info->type = LXC_TYPE_CGDIR;
1730         dir_info->buf = NULL;
1731         dir_info->file = NULL;
1732         dir_info->buflen = 0;
1733
1734         fi->fh = (unsigned long)dir_info;
1735         return 0;
1736 }
1737
1738 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1739                 struct fuse_file_info *fi)
1740 {
1741         struct file_info *d = (struct file_info *)fi->fh;
1742         struct cgfs_files **list = NULL;
1743         int i, ret;
1744         char *nextcg = NULL;
1745         struct fuse_context *fc = fuse_get_context();
1746         char **clist = NULL;
1747
1748         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1749                 return -EIO;
1750
1751         if (d->type != LXC_TYPE_CGDIR) {
1752                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1753                 return -EIO;
1754         }
1755         if (!d->cgroup && !d->controller) {
1756                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1757                 int i;
1758
1759                 for (i = 0;  i < num_hierarchies; i++) {
1760                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1761                                 return -EIO;
1762                         }
1763                 }
1764                 return 0;
1765         }
1766
1767         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1768                 // not a valid cgroup
1769                 ret = -EINVAL;
1770                 goto out;
1771         }
1772
1773         pid_t initpid = lookup_initpid_in_store(fc->pid);
1774         if (initpid <= 0)
1775                 initpid = fc->pid;
1776         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1777                 if (nextcg) {
1778                         ret = filler(buf, nextcg,  NULL, 0);
1779                         free(nextcg);
1780                         if (ret != 0) {
1781                                 ret = -EIO;
1782                                 goto out;
1783                         }
1784                 }
1785                 ret = 0;
1786                 goto out;
1787         }
1788
1789         for (i = 0; list[i]; i++) {
1790                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1791                         ret = -EIO;
1792                         goto out;
1793                 }
1794         }
1795
1796         // now get the list of child cgroups
1797
1798         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1799                 ret = 0;
1800                 goto out;
1801         }
1802         if (clist) {
1803                 for (i = 0; clist[i]; i++) {
1804                         if (filler(buf, clist[i], NULL, 0) != 0) {
1805                                 ret = -EIO;
1806                                 goto out;
1807                         }
1808                 }
1809         }
1810         ret = 0;
1811
1812 out:
1813         free_keys(list);
1814         if (clist) {
1815                 for (i = 0; clist[i]; i++)
1816                         free(clist[i]);
1817                 free(clist);
1818         }
1819         return ret;
1820 }
1821
1822 static void do_release_file_info(struct fuse_file_info *fi)
1823 {
1824         struct file_info *f = (struct file_info *)fi->fh;
1825
1826         if (!f)
1827                 return;
1828
1829         fi->fh = 0;
1830
1831         free(f->controller);
1832         f->controller = NULL;
1833         free(f->cgroup);
1834         f->cgroup = NULL;
1835         free(f->file);
1836         f->file = NULL;
1837         free(f->buf);
1838         f->buf = NULL;
1839         free(f);
1840 }
1841
/*
 * FUSE releasedir handler: releases the file_info kept in fi->fh via
 * do_release_file_info(). Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        (void)path; /* not needed, everything lives in fi->fh */

        do_release_file_info(fi);
        return 0;
}
1847
1848 int cg_open(const char *path, struct fuse_file_info *fi)
1849 {
1850         const char *cgroup;
1851         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1852         struct cgfs_files *k = NULL;
1853         struct file_info *file_info;
1854         struct fuse_context *fc = fuse_get_context();
1855         int ret;
1856
1857         if (!fc)
1858                 return -EIO;
1859
1860         controller = pick_controller_from_path(fc, path);
1861         if (!controller)
1862                 return -errno;
1863         cgroup = find_cgroup_in_path(path);
1864         if (!cgroup)
1865                 return -errno;
1866
1867         get_cgdir_and_path(cgroup, &cgdir, &last);
1868         if (!last) {
1869                 path1 = "/";
1870                 path2 = cgdir;
1871         } else {
1872                 path1 = cgdir;
1873                 path2 = last;
1874         }
1875
1876         k = cgfs_get_key(controller, path1, path2);
1877         if (!k) {
1878                 ret = -EINVAL;
1879                 goto out;
1880         }
1881         free_key(k);
1882
1883         pid_t initpid = lookup_initpid_in_store(fc->pid);
1884         if (initpid <= 0)
1885                 initpid = fc->pid;
1886         if (!caller_may_see_dir(initpid, controller, path1)) {
1887                 ret = -ENOENT;
1888                 goto out;
1889         }
1890         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1891                 ret = -EACCES;
1892                 goto out;
1893         }
1894
1895         /* we'll free this at cg_release */
1896         file_info = malloc(sizeof(*file_info));
1897         if (!file_info) {
1898                 ret = -ENOMEM;
1899                 goto out;
1900         }
1901         file_info->controller = must_copy_string(controller);
1902         file_info->cgroup = must_copy_string(path1);
1903         file_info->file = must_copy_string(path2);
1904         file_info->type = LXC_TYPE_CGFILE;
1905         file_info->buf = NULL;
1906         file_info->buflen = 0;
1907
1908         fi->fh = (unsigned long)file_info;
1909         ret = 0;
1910
1911 out:
1912         free(cgdir);
1913         return ret;
1914 }
1915
1916 int cg_access(const char *path, int mode)
1917 {
1918         int ret;
1919         const char *cgroup;
1920         char *path1, *path2, *controller;
1921         char *last = NULL, *cgdir = NULL;
1922         struct cgfs_files *k = NULL;
1923         struct fuse_context *fc = fuse_get_context();
1924
1925         if (strcmp(path, "/cgroup") == 0)
1926                 return 0;
1927
1928         if (!fc)
1929                 return -EIO;
1930
1931         controller = pick_controller_from_path(fc, path);
1932         if (!controller)
1933                 return -errno;
1934         cgroup = find_cgroup_in_path(path);
1935         if (!cgroup) {
1936                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
1937                 if ((mode & W_OK) == 0)
1938                         return 0;
1939                 return -EACCES;
1940         }
1941
1942         get_cgdir_and_path(cgroup, &cgdir, &last);
1943         if (!last) {
1944                 path1 = "/";
1945                 path2 = cgdir;
1946         } else {
1947                 path1 = cgdir;
1948                 path2 = last;
1949         }
1950
1951         k = cgfs_get_key(controller, path1, path2);
1952         if (!k) {
1953                 if ((mode & W_OK) == 0)
1954                         ret = 0;
1955                 else
1956                         ret = -EACCES;
1957                 goto out;
1958         }
1959         free_key(k);
1960
1961         pid_t initpid = lookup_initpid_in_store(fc->pid);
1962         if (initpid <= 0)
1963                 initpid = fc->pid;
1964         if (!caller_may_see_dir(initpid, controller, path1)) {
1965                 ret = -ENOENT;
1966                 goto out;
1967         }
1968         if (!fc_may_access(fc, controller, path1, path2, mode)) {
1969                 ret = -EACCES;
1970                 goto out;
1971         }
1972
1973         ret = 0;
1974
1975 out:
1976         free(cgdir);
1977         return ret;
1978 }
1979
/*
 * cg_release - release callback for file handles in the cgroup tree.
 *
 * Passes @fi to do_release_file_info(), which disposes of the per-open state
 * stored in @fi->fh.  @path is not consulted.  Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
1985
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock - wait until an event from POLLIN_SET is reported on @sock
 * or until @timeout seconds have passed.
 *
 * Interrupted waits (EINTR) are restarted with the remaining time.
 *
 * Returns true when epoll reported an event for @sock.  Returns false
 * otherwise; errno is 0 if the deadline had already passed before waiting,
 * else it holds the value left behind by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, saved_errno;
	/* time_t rather than int: avoids truncating the clock value. */
	time_t now, starttime, deltatime;

	if ((starttime = time(NULL)) == (time_t)-1)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* %m is only expanded inside a format string, so spell it out. */
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(errno));
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(errno));
		close(epfd);
		return false;
	}

again:
	if ((now = time(NULL)) == (time_t)-1) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	/* +1ms so that a zero remainder still polls rather than busy-loops. */
	ret = epoll_wait(epfd, &ev, 1, (int)(1000 * deltatime + 1));
	if (ret < 0 && errno == EINTR)
		goto again;
	/* close() may overwrite errno; preserve epoll_wait()'s value. */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2033
/*
 * msgrecv - receive up to @len bytes from @sockfd into @buf, waiting at most
 * two seconds for the socket to become ready.
 *
 * Returns -1 when wait_for_sock() reports failure, otherwise the result of
 * the non-blocking recv().
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	const int timeout_secs = 2;

	if (wait_for_sock(sockfd, timeout_secs))
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
2040
2041 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2042 {
2043         struct msghdr msg = { 0 };
2044         struct iovec iov;
2045         struct cmsghdr *cmsg;
2046         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2047         char buf[1];
2048         buf[0] = 'p';
2049
2050         if (pingfirst) {
2051                 if (msgrecv(sock, buf, 1) != 1) {
2052                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2053                         return SEND_CREDS_FAIL;
2054                 }
2055         }
2056
2057         msg.msg_control = cmsgbuf;
2058         msg.msg_controllen = sizeof(cmsgbuf);
2059
2060         cmsg = CMSG_FIRSTHDR(&msg);
2061         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2062         cmsg->cmsg_level = SOL_SOCKET;
2063         cmsg->cmsg_type = SCM_CREDENTIALS;
2064         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2065
2066         msg.msg_name = NULL;
2067         msg.msg_namelen = 0;
2068
2069         buf[0] = v;
2070         iov.iov_base = buf;
2071         iov.iov_len = sizeof(buf);
2072         msg.msg_iov = &iov;
2073         msg.msg_iovlen = 1;
2074
2075         if (sendmsg(sock, &msg, 0) < 0) {
2076                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2077                 if (errno == 3)
2078                         return SEND_CREDS_NOTSK;
2079                 return SEND_CREDS_FAIL;
2080         }
2081
2082         return SEND_CREDS_OK;
2083 }
2084
2085 static bool recv_creds(int sock, struct ucred *cred, char *v)
2086 {
2087         struct msghdr msg = { 0 };
2088         struct iovec iov;
2089         struct cmsghdr *cmsg;
2090         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2091         char buf[1];
2092         int ret;
2093         int optval = 1;
2094
2095         *v = '1';
2096
2097         cred->pid = -1;
2098         cred->uid = -1;
2099         cred->gid = -1;
2100
2101         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2102                 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2103                 return false;
2104         }
2105         buf[0] = '1';
2106         if (write(sock, buf, 1) != 1) {
2107                 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2108                 return false;
2109         }
2110
2111         msg.msg_name = NULL;
2112         msg.msg_namelen = 0;
2113         msg.msg_control = cmsgbuf;
2114         msg.msg_controllen = sizeof(cmsgbuf);
2115
2116         iov.iov_base = buf;
2117         iov.iov_len = sizeof(buf);
2118         msg.msg_iov = &iov;
2119         msg.msg_iovlen = 1;
2120
2121         if (!wait_for_sock(sock, 2)) {
2122                 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2123                 return false;
2124         }
2125         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2126         if (ret < 0) {
2127                 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2128                 return false;
2129         }
2130
2131         cmsg = CMSG_FIRSTHDR(&msg);
2132
2133         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2134                         cmsg->cmsg_level == SOL_SOCKET &&
2135                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2136                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2137         }
2138         *v = buf[0];
2139
2140         return true;
2141 }
2142
/* Argument bundle passed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* Pipe fd pair; the wrapper closes [0] and writes its ack to [1]. */
	int *cpipe;
	/* Forwarded as the first argument of ->wrapped. */
	int sock;
	/* Forwarded as the second argument of ->wrapped. */
	pid_t tpid;
	/* Function to run after the ack: pid_from_ns or pid_to_ns. */
	int (*wrapped)(int, pid_t);
};
2149
2150 /*
2151  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2152  * with clone(). This simply writes '1' as ACK back to the parent
2153  * before calling the actual wrapped function.
2154  */
2155 static int pid_ns_clone_wrapper(void *arg) {
2156         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2157         char b = '1';
2158
2159         close(args->cpipe[0]);
2160         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2161                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2162         close(args->cpipe[1]);
2163         return args->wrapped(args->sock, args->tpid);
2164 }
2165
2166 /*
2167  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2168  * int value back over the socket.  This shifts the pid from the
2169  * sender's pidns into tpid's pidns.
2170  */
2171 static int pid_to_ns(int sock, pid_t tpid)
2172 {
2173         char v = '0';
2174         struct ucred cred;
2175
2176         while (recv_creds(sock, &cred, &v)) {
2177                 if (v == '1')
2178                         return 0;
2179                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2180                         return 1;
2181         }
2182         return 0;
2183 }
2184
2185
2186 /*
2187  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2188  * in your old pidns.  Only children which you clone will be in the target
2189  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2190  * actually convert pids.
2191  *
2192  * Note: glibc's fork() does not respect pidns, which can lead to failed
2193  * assertions inside glibc (and thus failed forks) if the child's pid in
2194  * the pidns and the parent pid outside are identical. Using clone prevents
2195  * this issue.
2196  */
2197 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2198 {
2199         int newnsfd = -1, ret, cpipe[2];
2200         char fnam[100];
2201         pid_t cpid;
2202         char v;
2203
2204         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2205         if (ret < 0 || ret >= sizeof(fnam))
2206                 _exit(1);
2207         newnsfd = open(fnam, O_RDONLY);
2208         if (newnsfd < 0)
2209                 _exit(1);
2210         if (setns(newnsfd, 0) < 0)
2211                 _exit(1);
2212         close(newnsfd);
2213
2214         if (pipe(cpipe) < 0)
2215                 _exit(1);
2216
2217         struct pid_ns_clone_args args = {
2218                 .cpipe = cpipe,
2219                 .sock = sock,
2220                 .tpid = tpid,
2221                 .wrapped = &pid_to_ns
2222         };
2223         size_t stack_size = sysconf(_SC_PAGESIZE);
2224         void *stack = alloca(stack_size);
2225
2226         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2227         if (cpid < 0)
2228                 _exit(1);
2229
2230         // give the child 1 second to be done forking and
2231         // write its ack
2232         if (!wait_for_sock(cpipe[0], 1))
2233                 _exit(1);
2234         ret = read(cpipe[0], &v, 1);
2235         if (ret != sizeof(char) || v != '1')
2236                 _exit(1);
2237
2238         if (!wait_for_pid(cpid))
2239                 _exit(1);
2240         _exit(0);
2241 }
2242
2243 /*
2244  * To read cgroup files with a particular pid, we will setns into the child
2245  * pidns, open a pipe, fork a child - which will be the first to really be in
2246  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2247  */
2248 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2249 {
2250         int sock[2] = {-1, -1};
2251         char *tmpdata = NULL;
2252         int ret;
2253         pid_t qpid, cpid = -1;
2254         bool answer = false;
2255         char v = '0';
2256         struct ucred cred;
2257         size_t sz = 0, asz = 0;
2258
2259         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2260                 return false;
2261
2262         /*
2263          * Now we read the pids from returned data one by one, pass
2264          * them into a child in the target namespace, read back the
2265          * translated pids, and put them into our to-return data
2266          */
2267
2268         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2269                 perror("socketpair");
2270                 free(tmpdata);
2271                 return false;
2272         }
2273
2274         cpid = fork();
2275         if (cpid == -1)
2276                 goto out;
2277
2278         if (!cpid) // child - exits when done
2279                 pid_to_ns_wrapper(sock[1], tpid);
2280
2281         char *ptr = tmpdata;
2282         cred.uid = 0;
2283         cred.gid = 0;
2284         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2285                 cred.pid = qpid;
2286                 ret = send_creds(sock[0], &cred, v, true);
2287
2288                 if (ret == SEND_CREDS_NOTSK)
2289                         goto next;
2290                 if (ret == SEND_CREDS_FAIL)
2291                         goto out;
2292
2293                 // read converted results
2294                 if (!wait_for_sock(sock[0], 2)) {
2295                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2296                         goto out;
2297                 }
2298                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2299                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2300                         goto out;
2301                 }
2302                 must_strcat_pid(d, &sz, &asz, qpid);
2303 next:
2304                 ptr = strchr(ptr, '\n');
2305                 if (!ptr)
2306                         break;
2307                 ptr++;
2308         }
2309
2310         cred.pid = getpid();
2311         v = '1';
2312         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2313                 // failed to ask child to exit
2314                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2315                 goto out;
2316         }
2317
2318         answer = true;
2319
2320 out:
2321         free(tmpdata);
2322         if (cpid != -1)
2323                 wait_for_pid(cpid);
2324         if (sock[0] != -1) {
2325                 close(sock[0]);
2326                 close(sock[1]);
2327         }
2328         return answer;
2329 }
2330
2331 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2332                 struct fuse_file_info *fi)
2333 {
2334         struct fuse_context *fc = fuse_get_context();
2335         struct file_info *f = (struct file_info *)fi->fh;
2336         struct cgfs_files *k = NULL;
2337         char *data = NULL;
2338         int ret, s;
2339         bool r;
2340
2341         if (f->type != LXC_TYPE_CGFILE) {
2342                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2343                 return -EIO;
2344         }
2345
2346         if (offset)
2347                 return 0;
2348
2349         if (!fc)
2350                 return -EIO;
2351
2352         if (!f->controller)
2353                 return -EINVAL;
2354
2355         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2356                 return -EINVAL;
2357         }
2358         free_key(k);
2359
2360
2361         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2362                 ret = -EACCES;
2363                 goto out;
2364         }
2365
2366         if (strcmp(f->file, "tasks") == 0 ||
2367                         strcmp(f->file, "/tasks") == 0 ||
2368                         strcmp(f->file, "/cgroup.procs") == 0 ||
2369                         strcmp(f->file, "cgroup.procs") == 0)
2370                 // special case - we have to translate the pids
2371                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2372         else
2373                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2374
2375         if (!r) {
2376                 ret = -EINVAL;
2377                 goto out;
2378         }
2379
2380         if (!data) {
2381                 ret = 0;
2382                 goto out;
2383         }
2384         s = strlen(data);
2385         if (s > size)
2386                 s = size;
2387         memcpy(buf, data, s);
2388         if (s > 0 && s < size && data[s-1] != '\n')
2389                 buf[s++] = '\n';
2390
2391         ret = s;
2392
2393 out:
2394         free(data);
2395         return ret;
2396 }
2397
2398 static int pid_from_ns(int sock, pid_t tpid)
2399 {
2400         pid_t vpid;
2401         struct ucred cred;
2402         char v;
2403         int ret;
2404
2405         cred.uid = 0;
2406         cred.gid = 0;
2407         while (1) {
2408                 if (!wait_for_sock(sock, 2)) {
2409                         lxcfs_error("%s\n", "Timeout reading from parent.");
2410                         return 1;
2411                 }
2412                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2413                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2414                         return 1;
2415                 }
2416                 if (vpid == -1) // done
2417                         break;
2418                 v = '0';
2419                 cred.pid = vpid;
2420                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2421                         v = '1';
2422                         cred.pid = getpid();
2423                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2424                                 return 1;
2425                 }
2426         }
2427         return 0;
2428 }
2429
2430 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2431 {
2432         int newnsfd = -1, ret, cpipe[2];
2433         char fnam[100];
2434         pid_t cpid;
2435         char v;
2436
2437         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2438         if (ret < 0 || ret >= sizeof(fnam))
2439                 _exit(1);
2440         newnsfd = open(fnam, O_RDONLY);
2441         if (newnsfd < 0)
2442                 _exit(1);
2443         if (setns(newnsfd, 0) < 0)
2444                 _exit(1);
2445         close(newnsfd);
2446
2447         if (pipe(cpipe) < 0)
2448                 _exit(1);
2449
2450         struct pid_ns_clone_args args = {
2451                 .cpipe = cpipe,
2452                 .sock = sock,
2453                 .tpid = tpid,
2454                 .wrapped = &pid_from_ns
2455         };
2456         size_t stack_size = sysconf(_SC_PAGESIZE);
2457         void *stack = alloca(stack_size);
2458
2459         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2460         if (cpid < 0)
2461                 _exit(1);
2462
2463         // give the child 1 second to be done forking and
2464         // write its ack
2465         if (!wait_for_sock(cpipe[0], 1))
2466                 _exit(1);
2467         ret = read(cpipe[0], &v, 1);
2468         if (ret != sizeof(char) || v != '1')
2469                 _exit(1);
2470
2471         if (!wait_for_pid(cpid))
2472                 _exit(1);
2473         _exit(0);
2474 }
2475
/*
 * hostuid_to_ns - translate host @uid through the uid map of @pid.
 *
 * Opens /proc/@pid/uid_map and stores the result of convert_id_to_ns() in
 * *@answer.
 *
 * Returns false when the map cannot be opened or when the conversion
 * produced -1 (in which case *@answer holds that -1); true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	snprintf(map_path, sizeof(map_path), "/proc/%d/uid_map", pid);

	map = fopen(map_path, "r");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
2497
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid start out as -1 and are overwritten with the first number
 * following "Uid:" and "Gid:" respectively.  If the file cannot be opened,
 * or one of those lines does not parse, the function logs the problem and
 * returns with whatever has been stored up to that point.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char buf[400];
	FILE *status;

	*uid = -1;
	*gid = -1;

	/* buf first holds the path, then serves as the line buffer. */
	sprintf(buf, "/proc/%d/status", pid);
	status = fopen(buf, "r");
	if (!status) {
		lxcfs_error("Error opening %s: %s\n", buf, strerror(errno));
		return;
	}

	while (fgets(buf, sizeof(buf), status)) {
		if (!strncmp(buf, "Uid:", 4)) {
			uid_t parsed_uid;

			if (sscanf(buf + 4, "%u", &parsed_uid) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
			continue;
		}

		if (!strncmp(buf, "Gid:", 4)) {
			gid_t parsed_gid;

			if (sscanf(buf + 4, "%u", &parsed_gid) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
2536
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . they are owned by the same uid
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	/* Same task, or requestor is uid 0 on the host. */
	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (victim_uid == r_uid)
		return true;

	/* The requestor must map to uid 0 in its own uid map ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped there as well. */
	return hostuid_to_ns(victim_uid, r, &mapped);
}
2562
2563 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2564                 const char *file, const char *buf)
2565 {
2566         int sock[2] = {-1, -1};
2567         pid_t qpid, cpid = -1;
2568         FILE *pids_file = NULL;
2569         bool answer = false, fail = false;
2570
2571         pids_file = open_pids_file(contrl, cg);
2572         if (!pids_file)
2573                 return false;
2574
2575         /*
2576          * write the pids to a socket, have helper in writer's pidns
2577          * call movepid for us
2578          */
2579         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2580                 perror("socketpair");
2581                 goto out;
2582         }
2583
2584         cpid = fork();
2585         if (cpid == -1)
2586                 goto out;
2587
2588         if (!cpid) { // child
2589                 fclose(pids_file);
2590                 pid_from_ns_wrapper(sock[1], tpid);
2591         }
2592
2593         const char *ptr = buf;
2594         while (sscanf(ptr, "%d", &qpid) == 1) {
2595                 struct ucred cred;
2596                 char v;
2597
2598                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2599                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2600                         goto out;
2601                 }
2602
2603                 if (recv_creds(sock[0], &cred, &v)) {
2604                         if (v == '0') {
2605                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2606                                         fail = true;
2607                                         break;
2608                                 }
2609                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2610                                         fail = true;
2611                         }
2612                 }
2613
2614                 ptr = strchr(ptr, '\n');
2615                 if (!ptr)
2616                         break;
2617                 ptr++;
2618         }
2619
2620         /* All good, write the value */
2621         qpid = -1;
2622         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2623                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2624
2625         if (!fail)
2626                 answer = true;
2627
2628 out:
2629         if (cpid != -1)
2630                 wait_for_pid(cpid);
2631         if (sock[0] != -1) {
2632                 close(sock[0]);
2633                 close(sock[1]);
2634         }
2635         if (pids_file) {
2636                 if (fclose(pids_file) != 0)
2637                         answer = false;
2638         }
2639         return answer;
2640 }
2641
2642 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2643              struct fuse_file_info *fi)
2644 {
2645         struct fuse_context *fc = fuse_get_context();
2646         char *localbuf = NULL;
2647         struct cgfs_files *k = NULL;
2648         struct file_info *f = (struct file_info *)fi->fh;
2649         bool r;
2650
2651         if (f->type != LXC_TYPE_CGFILE) {
2652                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2653                 return -EIO;
2654         }
2655
2656         if (offset)
2657                 return 0;
2658
2659         if (!fc)
2660                 return -EIO;
2661
2662         localbuf = alloca(size+1);
2663         localbuf[size] = '\0';
2664         memcpy(localbuf, buf, size);
2665
2666         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2667                 size = -EINVAL;
2668                 goto out;
2669         }
2670
2671         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2672                 size = -EACCES;
2673                 goto out;
2674         }
2675
2676         if (strcmp(f->file, "tasks") == 0 ||
2677                         strcmp(f->file, "/tasks") == 0 ||
2678                         strcmp(f->file, "/cgroup.procs") == 0 ||
2679                         strcmp(f->file, "cgroup.procs") == 0)
2680                 // special case - we have to translate the pids
2681                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2682         else
2683                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2684
2685         if (!r)
2686                 size = -EINVAL;
2687
2688 out:
2689         free_key(k);
2690         return size;
2691 }
2692
2693 int cg_chown(const char *path, uid_t uid, gid_t gid)
2694 {
2695         struct fuse_context *fc = fuse_get_context();
2696         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2697         struct cgfs_files *k = NULL;
2698         const char *cgroup;
2699         int ret;
2700
2701         if (!fc)
2702                 return -EIO;
2703
2704         if (strcmp(path, "/cgroup") == 0)
2705                 return -EPERM;
2706
2707         controller = pick_controller_from_path(fc, path);
2708         if (!controller)
2709                 return errno == ENOENT ? -EPERM : -errno;
2710
2711         cgroup = find_cgroup_in_path(path);
2712         if (!cgroup)
2713                 /* this is just /cgroup/controller */
2714                 return -EPERM;
2715
2716         get_cgdir_and_path(cgroup, &cgdir, &last);
2717
2718         if (!last) {
2719                 path1 = "/";
2720                 path2 = cgdir;
2721         } else {
2722                 path1 = cgdir;
2723                 path2 = last;
2724         }
2725
2726         if (is_child_cgroup(controller, path1, path2)) {
2727                 // get uid, gid, from '/tasks' file and make up a mode
2728                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2729                 k = cgfs_get_key(controller, cgroup, "tasks");
2730
2731         } else
2732                 k = cgfs_get_key(controller, path1, path2);
2733
2734         if (!k) {
2735                 ret = -EINVAL;
2736                 goto out;
2737         }
2738
2739         /*
2740          * This being a fuse request, the uid and gid must be valid
2741          * in the caller's namespace.  So we can just check to make
2742          * sure that the caller is root in his uid, and privileged
2743          * over the file's current owner.
2744          */
2745         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2746                 ret = -EACCES;
2747                 goto out;
2748         }
2749
2750         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2751
2752 out:
2753         free_key(k);
2754         free(cgdir);
2755
2756         return ret;
2757 }
2758
2759 int cg_chmod(const char *path, mode_t mode)
2760 {
2761         struct fuse_context *fc = fuse_get_context();
2762         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2763         struct cgfs_files *k = NULL;
2764         const char *cgroup;
2765         int ret;
2766
2767         if (!fc)
2768                 return -EIO;
2769
2770         if (strcmp(path, "/cgroup") == 0)
2771                 return -EPERM;
2772
2773         controller = pick_controller_from_path(fc, path);
2774         if (!controller)
2775                 return errno == ENOENT ? -EPERM : -errno;
2776
2777         cgroup = find_cgroup_in_path(path);
2778         if (!cgroup)
2779                 /* this is just /cgroup/controller */
2780                 return -EPERM;
2781
2782         get_cgdir_and_path(cgroup, &cgdir, &last);
2783
2784         if (!last) {
2785                 path1 = "/";
2786                 path2 = cgdir;
2787         } else {
2788                 path1 = cgdir;
2789                 path2 = last;
2790         }
2791
2792         if (is_child_cgroup(controller, path1, path2)) {
2793                 // get uid, gid, from '/tasks' file and make up a mode
2794                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2795                 k = cgfs_get_key(controller, cgroup, "tasks");
2796
2797         } else
2798                 k = cgfs_get_key(controller, path1, path2);
2799
2800         if (!k) {
2801                 ret = -EINVAL;
2802                 goto out;
2803         }
2804
2805         /*
2806          * This being a fuse request, the uid and gid must be valid
2807          * in the caller's namespace.  So we can just check to make
2808          * sure that the caller is root in his uid, and privileged
2809          * over the file's current owner.
2810          */
2811         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2812                 ret = -EPERM;
2813                 goto out;
2814         }
2815
2816         if (!cgfs_chmod_file(controller, cgroup, mode)) {
2817                 ret = -EINVAL;
2818                 goto out;
2819         }
2820
2821         ret = 0;
2822 out:
2823         free_key(k);
2824         free(cgdir);
2825         return ret;
2826 }
2827
2828 int cg_mkdir(const char *path, mode_t mode)
2829 {
2830         struct fuse_context *fc = fuse_get_context();
2831         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2832         const char *cgroup;
2833         int ret;
2834
2835         if (!fc)
2836                 return -EIO;
2837
2838         controller = pick_controller_from_path(fc, path);
2839         if (!controller)
2840                 return errno == ENOENT ? -EPERM : -errno;
2841
2842         cgroup = find_cgroup_in_path(path);
2843         if (!cgroup)
2844                 return -errno;
2845
2846         get_cgdir_and_path(cgroup, &cgdir, &last);
2847         if (!last)
2848                 path1 = "/";
2849         else
2850                 path1 = cgdir;
2851
2852         pid_t initpid = lookup_initpid_in_store(fc->pid);
2853         if (initpid <= 0)
2854                 initpid = fc->pid;
2855         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2856                 if (!next)
2857                         ret = -EINVAL;
2858                 else if (last && strcmp(next, last) == 0)
2859                         ret = -EEXIST;
2860                 else
2861                         ret = -EPERM;
2862                 goto out;
2863         }
2864
2865         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2866                 ret = -EACCES;
2867                 goto out;
2868         }
2869         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2870                 ret = -EACCES;
2871                 goto out;
2872         }
2873
2874         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2875
2876 out:
2877         free(cgdir);
2878         free(next);
2879         return ret;
2880 }
2881
2882 int cg_rmdir(const char *path)
2883 {
2884         struct fuse_context *fc = fuse_get_context();
2885         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2886         const char *cgroup;
2887         int ret;
2888
2889         if (!fc)
2890                 return -EIO;
2891
2892         controller = pick_controller_from_path(fc, path);
2893         if (!controller) /* Someone's trying to delete "/cgroup". */
2894                 return -EPERM;
2895
2896         cgroup = find_cgroup_in_path(path);
2897         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
2898                 return -EPERM;
2899
2900         get_cgdir_and_path(cgroup, &cgdir, &last);
2901         if (!last) {
2902                 /* Someone's trying to delete a cgroup on the same level as the
2903                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
2904                  * rmdir "/cgroup/blkio/init.slice".
2905                  */
2906                 ret = -EPERM;
2907                 goto out;
2908         }
2909
2910         pid_t initpid = lookup_initpid_in_store(fc->pid);
2911         if (initpid <= 0)
2912                 initpid = fc->pid;
2913         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2914                 if (!last || strcmp(next, last) == 0)
2915                         ret = -EBUSY;
2916                 else
2917                         ret = -ENOENT;
2918                 goto out;
2919         }
2920
2921         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2922                 ret = -EACCES;
2923                 goto out;
2924         }
2925         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2926                 ret = -EACCES;
2927                 goto out;
2928         }
2929
2930         if (!cgfs_remove(controller, cgroup)) {
2931                 ret = -EINVAL;
2932                 goto out;
2933         }
2934
2935         ret = 0;
2936
2937 out:
2938         free(cgdir);
2939         free(next);
2940         return ret;
2941 }
2942
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
        return strncmp(line, pref, strlen(pref)) == 0;
}
2949
/*
 * Extract selected counters from the text of a cgroup memory.stat file.
 *
 * @memstat is scanned line by line.  A line of the form "<key> <number>"
 * whose key is one of cache, active_anon, inactive_anon, active_file,
 * inactive_file or unevictable has its number divided by 1024 and stored
 * through the matching output pointer.  Output values whose key does not
 * appear, or whose number cannot be parsed, are left untouched, so callers
 * should initialise them.
 *
 * The value is parsed right after the key, using the key's own length.  A
 * fixed offset would only be correct for the 11 character keys and would
 * misread "cache", "inactive_anon" and "inactive_file".
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable)
{
        const struct {
                const char *key;
                unsigned long *dst;
        } fields[] = {
                { "cache",         cached        },
                { "active_anon",   active_anon   },
                { "inactive_anon", inactive_anon },
                { "active_file",   active_file   },
                { "inactive_file", inactive_file },
                { "unevictable",   unevictable   },
        };
        char *eol;

        while (*memstat) {
                for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
                        size_t klen = strlen(fields[i].key);

                        /* The key must be the whole first word of the line. */
                        if (strncmp(memstat, fields[i].key, klen) != 0 ||
                            memstat[klen] != ' ')
                                continue;

                        if (sscanf(memstat + klen, "%lu", fields[i].dst) == 1)
                                *fields[i].dst /= 1024;
                        break;
                }

                eol = strchr(memstat, '\n');
                if (!eol)
                        return;
                memstat = eol + 1;
        }
}
2983
/*
 * Look up one value in blkio-style text of "<major>:<minor> <iotype> <value>"
 * lines.
 *
 * *@v is set to 0, then @str is searched line by line for the first line
 * starting with "<major>:<minor> <iotype>"; the unsigned number following
 * that prefix is stored in *@v.  If no line matches, *@v stays 0.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32] = { 0 };
        size_t keylen;
        char *cur;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        keylen = strlen(key);

        *v = 0;

        for (cur = str; *cur; ) {
                char *nl;

                if (startswith(cur, key)) {
                        sscanf(cur + keylen, "%lu", v);
                        return;
                }

                nl = strchr(cur, '\n');
                if (!nl)
                        return;
                cur = nl + 1;
        }
}
3006
3007 static int read_file(const char *path, char *buf, size_t size,
3008                      struct file_info *d)
3009 {
3010         size_t linelen = 0, total_len = 0, rv = 0;
3011         char *line = NULL;
3012         char *cache = d->buf;
3013         size_t cache_size = d->buflen;
3014         FILE *f = fopen(path, "r");
3015         if (!f)
3016                 return 0;
3017
3018         while (getline(&line, &linelen, f) != -1) {
3019                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3020                 if (l < 0) {
3021                         perror("Error writing to cache");
3022                         rv = 0;
3023                         goto err;
3024                 }
3025                 if (l >= cache_size) {
3026                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3027                         rv = 0;
3028                         goto err;
3029                 }
3030                 cache += l;
3031                 cache_size -= l;
3032                 total_len += l;
3033         }
3034
3035         d->size = total_len;
3036         if (total_len > size)
3037                 total_len = size;
3038
3039         /* read from off 0 */
3040         memcpy(buf, d->buf, total_len);
3041         rv = total_len;
3042   err:
3043         fclose(f);
3044         free(line);
3045         return rv;
3046 }
3047
3048 /*
3049  * FUSE ops for /proc
3050  */
3051
/*
 * Read memory.limit_in_bytes of @cgroup in the "memory" controller.
 *
 * Returns the parsed value, or (unsigned long)-1 if it cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
        char *raw = NULL;
        unsigned long limit = (unsigned long)-1;
        bool ok;

        ok = cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw);
        if (ok)
                limit = strtoul(raw, NULL, 10);

        free(raw);
        return limit;
}
3064
/*
 * Return the smallest memory limit found on @cgroup or any of its
 * ancestors up to and including "/".
 *
 * Starts from the limit of @cgroup itself; an ancestor whose limit cannot
 * be read (get_memlimit() returned -1) is ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
        /* dirname() may modify its argument, so walk a stack copy. */
        char *cur = strdupa(cgroup);
        unsigned long lowest = get_memlimit(cur);

        while (strcmp(cur, "/") != 0) {
                unsigned long limit;

                cur = dirname(cur);
                limit = get_memlimit(cur);
                if (limit == (unsigned long)-1)
                        continue;
                if (limit < lowest)
                        lowest = limit;
        }

        return lowest;
}
3081
3082 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3083                 struct fuse_file_info *fi)
3084 {
3085         struct fuse_context *fc = fuse_get_context();
3086         struct file_info *d = (struct file_info *)fi->fh;
3087         char *cg;
3088         char *memusage_str = NULL, *memstat_str = NULL,
3089                 *memswlimit_str = NULL, *memswusage_str = NULL,
3090                 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3091         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3092                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3093                 active_file = 0, inactive_file = 0, unevictable = 0;
3094         char *line = NULL;
3095         size_t linelen = 0, total_len = 0, rv = 0;
3096         char *cache = d->buf;
3097         size_t cache_size = d->buflen;
3098         FILE *f = NULL;
3099
3100         if (offset){
3101                 if (offset > d->size)
3102                         return -EINVAL;
3103                 if (!d->cached)
3104                         return 0;
3105                 int left = d->size - offset;
3106                 total_len = left > size ? size: left;
3107                 memcpy(buf, cache + offset, total_len);
3108                 return total_len;
3109         }
3110
3111         pid_t initpid = lookup_initpid_in_store(fc->pid);
3112         if (initpid <= 0)
3113                 initpid = fc->pid;
3114         cg = get_pid_cgroup(initpid, "memory");
3115         if (!cg)
3116                 return read_file("/proc/meminfo", buf, size, d);
3117         prune_init_slice(cg);
3118
3119         memlimit = get_min_memlimit(cg);
3120         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3121                 goto err;
3122         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3123                 goto err;
3124
3125         // Following values are allowed to fail, because swapaccount might be turned
3126         // off for current kernel
3127         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3128                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3129         {
3130                 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
3131                 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3132                         goto err;
3133                 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3134                         goto err;
3135
3136                 memswlimit = strtoul(memswlimit_str, NULL, 10);
3137                 memswusage = strtoul(memswusage_str, NULL, 10);
3138
3139                 if (!strcmp(memswlimit_str, memswlimit_default_str))
3140                         memswlimit = 0;
3141                 if (!strcmp(memswusage_str, memswusage_default_str))
3142                         memswusage = 0;
3143
3144                 memswlimit = memswlimit / 1024;
3145                 memswusage = memswusage / 1024;
3146         }
3147
3148         memusage = strtoul(memusage_str, NULL, 10);
3149         memlimit /= 1024;
3150         memusage /= 1024;
3151
3152         parse_memstat(memstat_str, &cached, &active_anon,
3153                         &inactive_anon, &active_file, &inactive_file,
3154                         &unevictable);
3155
3156         f = fopen("/proc/meminfo", "r");
3157         if (!f)
3158                 goto err;
3159
3160         while (getline(&line, &linelen, f) != -1) {
3161                 ssize_t l;
3162                 char *printme, lbuf[100];
3163
3164                 memset(lbuf, 0, 100);
3165                 if (startswith(line, "MemTotal:")) {
3166                         sscanf(line+14, "%lu", &hosttotal);
3167                         if (hosttotal < memlimit)
3168                                 memlimit = hosttotal;
3169                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3170                         printme = lbuf;
3171                 } else if (startswith(line, "MemFree:")) {
3172                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3173                         printme = lbuf;
3174                 } else if (startswith(line, "MemAvailable:")) {
3175                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage);
3176                         printme = lbuf;
3177                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3178                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit - memlimit);
3179                         printme = lbuf;
3180                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3181                         unsigned long swaptotal = memswlimit - memlimit,
3182                                         swapusage = memswusage - memusage,
3183                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3184                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3185                         printme = lbuf;
3186                 } else if (startswith(line, "Slab:")) {
3187                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3188                         printme = lbuf;
3189                 } else if (startswith(line, "Buffers:")) {
3190                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3191                         printme = lbuf;
3192                 } else if (startswith(line, "Cached:")) {
3193                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3194                         printme = lbuf;
3195                 } else if (startswith(line, "SwapCached:")) {
3196                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3197                         printme = lbuf;
3198                 } else if (startswith(line, "Active")) {
3199                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3200                                         active_anon + active_file);
3201                         printme = lbuf;
3202                 } else if (startswith(line, "Inactive")) {
3203                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3204                                         inactive_anon + inactive_file);
3205                         printme = lbuf;
3206                 } else if (startswith(line, "Active(anon)")) {
3207                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3208                         printme = lbuf;
3209                 } else if (startswith(line, "Inactive(anon)")) {
3210                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3211                         printme = lbuf;
3212                 } else if (startswith(line, "Active(file)")) {
3213                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3214                         printme = lbuf;
3215                 } else if (startswith(line, "Inactive(file)")) {
3216                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3217                         printme = lbuf;
3218                 } else if (startswith(line, "Unevictable")) {
3219                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3220                         printme = lbuf;
3221                 } else if (startswith(line, "SReclaimable")) {
3222                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3223                         printme = lbuf;
3224                 } else if (startswith(line, "SUnreclaim")) {
3225                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3226                         printme = lbuf;
3227                 } else
3228                         printme = line;
3229
3230                 l = snprintf(cache, cache_size, "%s", printme);
3231                 if (l < 0) {
3232                         perror("Error writing to cache");
3233                         rv = 0;
3234                         goto err;
3235
3236                 }
3237                 if (l >= cache_size) {
3238                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3239                         rv = 0;
3240                         goto err;
3241                 }
3242
3243                 cache += l;
3244                 cache_size -= l;
3245                 total_len += l;
3246         }
3247
3248         d->cached = 1;
3249         d->size = total_len;
3250         if (total_len > size ) total_len = size;
3251         memcpy(buf, d->buf, total_len);
3252
3253         rv = total_len;
3254 err:
3255         if (f)
3256                 fclose(f);
3257         free(line);
3258         free(cg);
3259         free(memusage_str);
3260         free(memswlimit_str);
3261         free(memswusage_str);
3262         free(memstat_str);
3263         free(memswlimit_default_str);
3264         free(memswusage_default_str);
3265         return rv;
3266 }
3267
/*
 * Fetch the contents of cpuset.cpus for cgroup @cg in the "cpuset"
 * controller.
 *
 * Returns a newly allocated string that the caller must free, or NULL if
 * the value could not be read.
 */
static char *get_cpuset(const char *cg)
{
        char *cpus;

        return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3280
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true when @line is a "processor : N" line whose cpu number N is
 * reported by cpu_in_cpuset() as a member of @cpuset.  Lines that do not
 * have that form yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int cpunum;

        return sscanf(line, "processor       : %d", &cpunum) == 1 &&
               cpu_in_cpuset(cpunum, cpuset);
}
3291
/*
 * Check whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. the word "processor", a colon and a decimal cpu number.
 */
static bool is_processor_line(const char *line)
{
        int ignored;

        return sscanf(line, "processor       : %d", &ignored) == 1;
}
3303
3304 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3305                 struct fuse_file_info *fi)
3306 {
3307         struct fuse_context *fc = fuse_get_context();
3308         struct file_info *d = (struct file_info *)fi->fh;
3309         char *cg;
3310         char *cpuset = NULL;
3311         char *line = NULL;
3312         size_t linelen = 0, total_len = 0, rv = 0;
3313         bool am_printing = false, firstline = true, is_s390x = false;
3314         int curcpu = -1, cpu;
3315         char *cache = d->buf;
3316         size_t cache_size = d->buflen;
3317         FILE *f = NULL;
3318
3319         if (offset){
3320                 if (offset > d->size)
3321                         return -EINVAL;
3322                 if (!d->cached)
3323                         return 0;
3324                 int left = d->size - offset;
3325                 total_len = left > size ? size: left;
3326                 memcpy(buf, cache + offset, total_len);
3327                 return total_len;
3328         }
3329
3330         pid_t initpid = lookup_initpid_in_store(fc->pid);
3331         if (initpid <= 0)
3332                 initpid = fc->pid;
3333         cg = get_pid_cgroup(initpid, "cpuset");
3334         if (!cg)
3335                 return read_file("proc/cpuinfo", buf, size, d);
3336         prune_init_slice(cg);
3337
3338         cpuset = get_cpuset(cg);
3339         if (!cpuset)
3340                 goto err;
3341
3342         f = fopen("/proc/cpuinfo", "r");
3343         if (!f)
3344                 goto err;
3345
3346         while (getline(&line, &linelen, f) != -1) {
3347                 ssize_t l;
3348                 if (firstline) {
3349                         firstline = false;
3350                         if (strstr(line, "IBM/S390") != NULL) {
3351                                 is_s390x = true;
3352                                 am_printing = true;
3353                                 continue;
3354                         }
3355                 }
3356                 if (strncmp(line, "# processors:", 12) == 0)
3357                         continue;
3358                 if (is_processor_line(line)) {
3359                         am_printing = cpuline_in_cpuset(line, cpuset);
3360                         if (am_printing) {
3361                                 curcpu ++;
3362                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3363                                 if (l < 0) {
3364                                         perror("Error writing to cache");
3365                                         rv = 0;
3366                                         goto err;
3367                                 }
3368                                 if (l >= cache_size) {
3369                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3370                                         rv = 0;
3371                                         goto err;
3372                                 }
3373                                 cache += l;
3374                                 cache_size -= l;
3375                                 total_len += l;
3376                         }
3377                         continue;
3378                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3379                         char *p;
3380                         if (!cpu_in_cpuset(cpu, cpuset))
3381                                 continue;
3382                         curcpu ++;
3383                         p = strchr(line, ':');
3384                         if (!p || !*p)
3385                                 goto err;
3386                         p++;
3387                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3388                         if (l < 0) {
3389                                 perror("Error writing to cache");
3390                                 rv = 0;
3391                                 goto err;
3392                         }
3393                         if (l >= cache_size) {
3394                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3395                                 rv = 0;
3396                                 goto err;
3397                         }
3398                         cache += l;
3399                         cache_size -= l;
3400                         total_len += l;
3401                         continue;
3402
3403                 }
3404                 if (am_printing) {
3405                         l = snprintf(cache, cache_size, "%s", line);
3406                         if (l < 0) {
3407                                 perror("Error writing to cache");
3408                                 rv = 0;
3409                                 goto err;
3410                         }
3411                         if (l >= cache_size) {
3412                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3413                                 rv = 0;
3414                                 goto err;
3415                         }
3416                         cache += l;
3417                         cache_size -= l;
3418                         total_len += l;
3419                 }
3420         }
3421
3422         if (is_s390x) {
3423                 char *origcache = d->buf;
3424                 ssize_t l;
3425                 do {
3426                         d->buf = malloc(d->buflen);
3427                 } while (!d->buf);
3428                 cache = d->buf;
3429                 cache_size = d->buflen;
3430                 total_len = 0;
3431                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3432                 if (l < 0 || l >= cache_size) {
3433                         free(origcache);
3434                         goto err;
3435                 }
3436                 cache_size -= l;
3437                 cache += l;
3438                 total_len += l;
3439                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3440                 if (l < 0 || l >= cache_size) {
3441                         free(origcache);
3442                         goto err;
3443                 }
3444                 cache_size -= l;
3445                 cache += l;
3446                 total_len += l;
3447                 l = snprintf(cache, cache_size, "%s", origcache);
3448                 free(origcache);
3449                 if (l < 0 || l >= cache_size)
3450                         goto err;
3451                 total_len += l;
3452         }
3453
3454         d->cached = 1;
3455         d->size = total_len;
3456         if (total_len > size ) total_len = size;
3457
3458         /* read from off 0 */
3459         memcpy(buf, d->buf, total_len);
3460         rv = total_len;
3461 err:
3462         if (f)
3463                 fclose(f);
3464         free(line);
3465         free(cpuset);
3466         free(cg);
3467         return rv;
3468 }
3469
3470 static int proc_stat_read(char *buf, size_t size, off_t offset,
3471                 struct fuse_file_info *fi)
3472 {
3473         struct fuse_context *fc = fuse_get_context();
3474         struct file_info *d = (struct file_info *)fi->fh;
3475         char *cg;
3476         char *cpuset = NULL;
3477         char *line = NULL;
3478         size_t linelen = 0, total_len = 0, rv = 0;
3479         int curcpu = -1; /* cpu numbering starts at 0 */
3480         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3481         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3482                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3483 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3484         char cpuall[CPUALL_MAX_SIZE];
3485         /* reserve for cpu all */
3486         char *cache = d->buf + CPUALL_MAX_SIZE;
3487         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3488         FILE *f = NULL;
3489
3490         if (offset){
3491                 if (offset > d->size)
3492                         return -EINVAL;
3493                 if (!d->cached)
3494                         return 0;
3495                 int left = d->size - offset;
3496                 total_len = left > size ? size: left;
3497                 memcpy(buf, d->buf + offset, total_len);
3498                 return total_len;
3499         }
3500
3501         pid_t initpid = lookup_initpid_in_store(fc->pid);
3502         if (initpid <= 0)
3503                 initpid = fc->pid;
3504         cg = get_pid_cgroup(initpid, "cpuset");
3505         if (!cg)
3506                 return read_file("/proc/stat", buf, size, d);
3507         prune_init_slice(cg);
3508
3509         cpuset = get_cpuset(cg);
3510         if (!cpuset)
3511                 goto err;
3512
3513         f = fopen("/proc/stat", "r");
3514         if (!f)
3515                 goto err;
3516
3517         //skip first line
3518         if (getline(&line, &linelen, f) < 0) {
3519                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3520                 goto err;
3521         }
3522
3523         while (getline(&line, &linelen, f) != -1) {
3524                 ssize_t l;
3525                 int cpu;
3526                 char cpu_char[10]; /* That's a lot of cores */
3527                 char *c;
3528
3529                 if (strlen(line) == 0)
3530                         continue;
3531                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3532                         /* not a ^cpuN line containing a number N, just print it */
3533                         l = snprintf(cache, cache_size, "%s", line);
3534                         if (l < 0) {
3535                                 perror("Error writing to cache");
3536                                 rv = 0;
3537                                 goto err;
3538                         }
3539                         if (l >= cache_size) {
3540                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3541                                 rv = 0;
3542                                 goto err;
3543                         }
3544                         cache += l;
3545                         cache_size -= l;
3546                         total_len += l;
3547                         continue;
3548                 }
3549
3550                 if (sscanf(cpu_char, "%d", &cpu) != 1)
3551                         continue;
3552                 if (!cpu_in_cpuset(cpu, cpuset))
3553                         continue;
3554                 curcpu ++;
3555
3556                 c = strchr(line, ' ');
3557                 if (!c)
3558                         continue;
3559                 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3560                 if (l < 0) {
3561                         perror("Error writing to cache");
3562                         rv = 0;
3563                         goto err;
3564
3565                 }
3566                 if (l >= cache_size) {
3567                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3568                         rv = 0;
3569                         goto err;
3570                 }
3571
3572                 cache += l;
3573                 cache_size -= l;
3574                 total_len += l;
3575
3576                 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3577                         &softirq, &steal, &guest) != 9)
3578                         continue;
3579                 user_sum += user;
3580                 nice_sum += nice;
3581                 system_sum += system;
3582                 idle_sum += idle;
3583                 iowait_sum += iowait;
3584                 irq_sum += irq;
3585                 softirq_sum += softirq;
3586                 steal_sum += steal;
3587                 guest_sum += guest;
3588         }
3589
3590         cache = d->buf;
3591
3592         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3593                 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3594         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3595                 memcpy(cache, cpuall, cpuall_len);
3596                 cache += cpuall_len;
3597         } else{
3598                 /* shouldn't happen */
3599                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3600                 cpuall_len = 0;
3601         }
3602
3603         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3604         total_len += cpuall_len;
3605         d->cached = 1;
3606         d->size = total_len;
3607         if (total_len > size ) total_len = size;
3608
3609         memcpy(buf, d->buf, total_len);
3610         rv = total_len;
3611
3612 err:
3613         if (f)
3614                 fclose(f);
3615         free(line);
3616         free(cpuset);
3617         free(cg);
3618         return rv;
3619 }
3620
/*
 * Return how long, in seconds, the init process of @pid's pid namespace
 * has existed, measured as "now" minus the st_ctime of its /proc/<pid>
 * directory.
 *
 * Returns 0 when the init pid cannot be looked up, the path cannot be
 * formatted, or the /proc entry cannot be stat'ed.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = lookup_initpid_in_store(pid);
	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
3641
/*
 * Return the "cpuacct.usage" value of the cpuacct cgroup that the init
 * process of @task's pid namespace lives in, divided by 1000000000.
 * (Presumably this converts nanoseconds to seconds - confirm against the
 * cpuacct controller documentation.)
 *
 * Returns 0 if the init pid, its cgroup, or the usage value cannot be
 * obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *raw = NULL;
	unsigned long busy = 0;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (cg) {
		prune_init_slice(cg);
		if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
			busy = strtoul(raw, NULL, 10) / 1000000000;
	}

	/* free(NULL) is a no-op, so both pointers can be released blindly. */
	free(cg);
	free(raw);
	return busy;
}
3665
#if RELOADTEST
/*
 * Reload-test helper: create (or truncate) the marker file
 * /tmp/lxcfs-iwashere. Failure to create it is silently ignored.
 */
void iwashere(void)
{
	int marker = creat("/tmp/lxcfs-iwashere", 0644);

	if (marker < 0)
		return;

	close(marker);
}
#endif
3676
3677 /*
3678  * We read /proc/uptime and reuse its second field.
3679  * For the first field, we use the mtime for the reaper for
3680  * the calling pid as returned by getreaperage
3681  */
3682 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3683                 struct fuse_file_info *fi)
3684 {
3685         struct fuse_context *fc = fuse_get_context();
3686         struct file_info *d = (struct file_info *)fi->fh;
3687         long int reaperage = getreaperage(fc->pid);
3688         unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3689         char *cache = d->buf;
3690         ssize_t total_len = 0;
3691
3692 #if RELOADTEST
3693         iwashere();
3694 #endif
3695
3696         if (offset){
3697                 if (offset > d->size)
3698                         return -EINVAL;
3699                 if (!d->cached)
3700                         return 0;
3701                 int left = d->size - offset;
3702                 total_len = left > size ? size: left;
3703                 memcpy(buf, cache + offset, total_len);
3704                 return total_len;
3705         }
3706
3707         idletime = reaperage - busytime;
3708         if (idletime > reaperage)
3709                 idletime = reaperage;
3710
3711         total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3712         if (total_len < 0){
3713                 perror("Error writing to cache");
3714                 return 0;
3715         }
3716
3717         d->size = (int)total_len;
3718         d->cached = 1;
3719
3720         if (total_len > size) total_len = size;
3721
3722         memcpy(buf, d->buf, total_len);
3723         return total_len;
3724 }
3725
3726 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3727                 struct fuse_file_info *fi)
3728 {
3729         char dev_name[72];
3730         struct fuse_context *fc = fuse_get_context();
3731         struct file_info *d = (struct file_info *)fi->fh;
3732         char *cg;
3733         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3734                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
3735         unsigned long read = 0, write = 0;
3736         unsigned long read_merged = 0, write_merged = 0;
3737         unsigned long read_sectors = 0, write_sectors = 0;
3738         unsigned long read_ticks = 0, write_ticks = 0;
3739         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3740         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3741         char *cache = d->buf;
3742         size_t cache_size = d->buflen;
3743         char *line = NULL;
3744         size_t linelen = 0, total_len = 0, rv = 0;
3745         unsigned int major = 0, minor = 0;
3746         int i = 0;
3747         FILE *f = NULL;
3748
3749         if (offset){
3750                 if (offset > d->size)
3751                         return -EINVAL;
3752                 if (!d->cached)
3753                         return 0;
3754                 int left = d->size - offset;
3755                 total_len = left > size ? size: left;
3756                 memcpy(buf, cache + offset, total_len);
3757                 return total_len;
3758         }
3759
3760         pid_t initpid = lookup_initpid_in_store(fc->pid);
3761         if (initpid <= 0)
3762                 initpid = fc->pid;
3763         cg = get_pid_cgroup(initpid, "blkio");
3764         if (!cg)
3765                 return read_file("/proc/diskstats", buf, size, d);
3766         prune_init_slice(cg);
3767
3768         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
3769                 goto err;
3770         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
3771                 goto err;
3772         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
3773                 goto err;
3774         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
3775                 goto err;
3776         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
3777                 goto err;
3778
3779
3780         f = fopen("/proc/diskstats", "r");
3781         if (!f)
3782                 goto err;
3783
3784         while (getline(&line, &linelen, f) != -1) {
3785                 ssize_t l;
3786                 char lbuf[256];
3787
3788                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
3789                 if (i != 3)
3790                         continue;
3791
3792                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3793                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3794                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3795                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3796                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3797                 read_sectors = read_sectors/512;
3798                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3799                 write_sectors = write_sectors/512;
3800
3801                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3802                 rd_svctm = rd_svctm/1000000;
3803                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3804                 rd_wait = rd_wait/1000000;
3805                 read_ticks = rd_svctm + rd_wait;
3806
3807                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3808                 wr_svctm =  wr_svctm/1000000;
3809                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3810                 wr_wait =  wr_wait/1000000;
3811                 write_ticks = wr_svctm + wr_wait;
3812
3813                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3814                 tot_ticks =  tot_ticks/1000000;
3815
3816                 memset(lbuf, 0, 256);
3817                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3818                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3819                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3820                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3821                 else
3822                         continue;
3823
3824                 l = snprintf(cache, cache_size, "%s", lbuf);
3825                 if (l < 0) {
3826                         perror("Error writing to fuse buf");
3827                         rv = 0;
3828                         goto err;
3829                 }
3830                 if (l >= cache_size) {
3831                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3832                         rv = 0;
3833                         goto err;
3834                 }
3835                 cache += l;
3836                 cache_size -= l;
3837                 total_len += l;
3838         }
3839
3840         d->cached = 1;
3841         d->size = total_len;
3842         if (total_len > size ) total_len = size;
3843         memcpy(buf, d->buf, total_len);
3844
3845         rv = total_len;
3846 err:
3847         free(cg);
3848         if (f)
3849                 fclose(f);
3850         free(line);
3851         free(io_serviced_str);
3852         free(io_merged_str);
3853         free(io_service_bytes_str);
3854         free(io_wait_time_str);
3855         free(io_service_time_str);
3856         return rv;
3857 }
3858
3859 static int proc_swaps_read(char *buf, size_t size, off_t offset,
3860                 struct fuse_file_info *fi)
3861 {
3862         struct fuse_context *fc = fuse_get_context();
3863         struct file_info *d = (struct file_info *)fi->fh;
3864         char *cg = NULL;
3865         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
3866              *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3867         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
3868         ssize_t total_len = 0, rv = 0;
3869         ssize_t l = 0;
3870         char *cache = d->buf;
3871
3872         if (offset) {
3873                 if (offset > d->size)
3874                         return -EINVAL;
3875                 if (!d->cached)
3876                         return 0;
3877                 int left = d->size - offset;
3878                 total_len = left > size ? size: left;
3879                 memcpy(buf, cache + offset, total_len);
3880                 return total_len;
3881         }
3882
3883         pid_t initpid = lookup_initpid_in_store(fc->pid);
3884         if (initpid <= 0)
3885                 initpid = fc->pid;
3886         cg = get_pid_cgroup(initpid, "memory");
3887         if (!cg)
3888                 return read_file("/proc/swaps", buf, size, d);
3889         prune_init_slice(cg);
3890
3891         if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
3892                 goto err;
3893
3894         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3895                 goto err;
3896
3897         memlimit = strtoul(memlimit_str, NULL, 10);
3898         memusage = strtoul(memusage_str, NULL, 10);
3899
3900         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
3901             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
3902
3903                 /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
3904                 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3905                     goto err;
3906                 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3907                     goto err;
3908
3909                 memswlimit = strtoul(memswlimit_str, NULL, 10);
3910                 memswusage = strtoul(memswusage_str, NULL, 10);
3911
3912                 if (!strcmp(memswlimit_str, memswlimit_default_str))
3913                     memswlimit = 0;
3914                 if (!strcmp(memswusage_str, memswusage_default_str))
3915                     memswusage = 0;
3916
3917                 swap_total = (memswlimit - memlimit) / 1024;
3918                 swap_free = (memswusage - memusage) / 1024;
3919         }
3920
3921         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
3922
3923         /* When no mem + swap limit is specified or swapaccount=0*/
3924         if (!memswlimit) {
3925                 char *line = NULL;
3926                 size_t linelen = 0;
3927                 FILE *f = fopen("/proc/meminfo", "r");
3928
3929                 if (!f)
3930                         goto err;
3931
3932                 while (getline(&line, &linelen, f) != -1) {
3933                         if (startswith(line, "SwapTotal:")) {
3934                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
3935                         } else if (startswith(line, "SwapFree:")) {
3936                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
3937                         }
3938                 }
3939
3940                 free(line);
3941                 fclose(f);
3942         }
3943
3944         if (swap_total > 0) {
3945                 l = snprintf(d->buf + total_len, d->size - total_len,
3946                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
3947                                 swap_total, swap_free);
3948                 total_len += l;
3949         }
3950
3951         if (total_len < 0 || l < 0) {
3952                 perror("Error writing to cache");
3953                 rv = 0;
3954                 goto err;
3955         }
3956
3957         d->cached = 1;
3958         d->size = (int)total_len;
3959
3960         if (total_len > size) total_len = size;
3961         memcpy(buf, d->buf, total_len);
3962         rv = total_len;
3963
3964 err:
3965         free(cg);
3966         free(memswlimit_str);
3967         free(memlimit_str);
3968         free(memusage_str);
3969         free(memswusage_str);
3970         free(memswusage_default_str);
3971         free(memswlimit_default_str);
3972         return rv;
3973 }
3974
/*
 * Return the number of bytes in the file @which, determined by reading it
 * line by line and summing the line lengths. Returns 0 if the file cannot
 * be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t n, total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (!fp)
		return 0;

	for (;;) {
		n = getline(&line, &cap, fp);
		if (n == -1)
			break;
		total += n;
	}

	fclose(fp);
	free(line);

	return total;
}
3991
/*
 * Fill @sb with attributes for @path inside the virtual /proc tree.
 *
 * "/proc" is reported as a read/execute-only directory (mode 0555, two
 * links); the six emulated files are reported as read-only regular files
 * (mode 0444, one link, size 0). All timestamps are set to the current
 * real time and the owner is uid/gid 0.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read, or -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const proc_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
	};
	struct timespec now;
	size_t idx;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(proc_files) / sizeof(proc_files[0]); idx++) {
		if (strcmp(path, proc_files[idx]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4020
4021 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4022                 struct fuse_file_info *fi)
4023 {
4024         if (filler(buf, ".", NULL, 0) != 0 ||
4025             filler(buf, "..", NULL, 0) != 0 ||
4026             filler(buf, "cpuinfo", NULL, 0) != 0 ||
4027             filler(buf, "meminfo", NULL, 0) != 0 ||
4028             filler(buf, "stat", NULL, 0) != 0 ||
4029             filler(buf, "uptime", NULL, 0) != 0 ||
4030             filler(buf, "diskstats", NULL, 0) != 0 ||
4031             filler(buf, "swaps", NULL, 0) != 0)
4032                 return -EINVAL;
4033         return 0;
4034 }
4035
4036 int proc_open(const char *path, struct fuse_file_info *fi)
4037 {
4038         int type = -1;
4039         struct file_info *info;
4040
4041         if (strcmp(path, "/proc/meminfo") == 0)
4042                 type = LXC_TYPE_PROC_MEMINFO;
4043         else if (strcmp(path, "/proc/cpuinfo") == 0)
4044                 type = LXC_TYPE_PROC_CPUINFO;
4045         else if (strcmp(path, "/proc/uptime") == 0)
4046                 type = LXC_TYPE_PROC_UPTIME;
4047         else if (strcmp(path, "/proc/stat") == 0)
4048                 type = LXC_TYPE_PROC_STAT;
4049         else if (strcmp(path, "/proc/diskstats") == 0)
4050                 type = LXC_TYPE_PROC_DISKSTATS;
4051         else if (strcmp(path, "/proc/swaps") == 0)
4052                 type = LXC_TYPE_PROC_SWAPS;
4053         if (type == -1)
4054                 return -ENOENT;
4055
4056         info = malloc(sizeof(*info));
4057         if (!info)
4058                 return -ENOMEM;
4059
4060         memset(info, 0, sizeof(*info));
4061         info->type = type;
4062
4063         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4064         do {
4065                 info->buf = malloc(info->buflen);
4066         } while (!info->buf);
4067         memset(info->buf, 0, info->buflen);
4068         /* set actual size to buffer size */
4069         info->size = info->buflen;
4070
4071         fi->fh = (unsigned long)info;
4072         return 0;
4073 }
4074
/*
 * Access check for the virtual /proc tree.
 *
 * "/proc" itself is granted whenever the real access(2) check for reading
 * it succeeds. Otherwise everything is treated as read-only: any request
 * bit other than R_OK yields -EACCES, anything else yields 0.
 */
int proc_access(const char *path, int mask)
{
	int is_proc_dir = (strcmp(path, "/proc") == 0);

	/* access() is only consulted for the directory itself. */
	if (is_proc_dir && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
4085
/*
 * Release handler for the emulated /proc files: hands @fi to
 * do_release_file_info() and always returns 0. @path is not used.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4091
4092 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4093                 struct fuse_file_info *fi)
4094 {
4095         struct file_info *f = (struct file_info *) fi->fh;
4096
4097         switch (f->type) {
4098         case LXC_TYPE_PROC_MEMINFO:
4099                 return proc_meminfo_read(buf, size, offset, fi);
4100         case LXC_TYPE_PROC_CPUINFO:
4101                 return proc_cpuinfo_read(buf, size, offset, fi);
4102         case LXC_TYPE_PROC_UPTIME:
4103                 return proc_uptime_read(buf, size, offset, fi);
4104         case LXC_TYPE_PROC_STAT:
4105                 return proc_stat_read(buf, size, offset, fi);
4106         case LXC_TYPE_PROC_DISKSTATS:
4107                 return proc_diskstats_read(buf, size, offset, fi);
4108         case LXC_TYPE_PROC_SWAPS:
4109                 return proc_swaps_read(buf, size, offset, fi);
4110         default:
4111                 return -EINVAL;
4112         }
4113 }
4114
4115 /*
4116  * Functions needed to setup cgroups in the __constructor__.
4117  */
4118
/*
 * Create @dir and every missing parent directory, each with @mode
 * (like "mkdir -p"). Components that already exist are not an error.
 *
 * Returns true on success, false if memory runs out or a directory
 * cannot be created for a reason other than EEXIST.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* dir: start of the next component; tmp: the slash after it. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		/* Prefix of the path up to (not including) that component. */
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;

		/*
		 * For a relative path the first prefix is the empty string;
		 * mkdir("") fails with ENOENT, so skip it rather than abort.
		 */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while (tmp != dir);

	return true;
}
4142
4143 static bool umount_if_mounted(void)
4144 {
4145         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4146                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4147                 return false;
4148         }
4149         return true;
4150 }
4151
4152 static int pivot_enter(void)
4153 {
4154         int ret = -1, oldroot = -1, newroot = -1;
4155
4156         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4157         if (oldroot < 0) {
4158                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4159                 return ret;
4160         }
4161
4162         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4163         if (newroot < 0) {
4164                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4165                 goto err;
4166         }
4167
4168         /* change into new root fs */
4169         if (fchdir(newroot) < 0) {
4170                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4171                 goto err;
4172         }
4173
4174         /* pivot_root into our new root fs */
4175         if (pivot_root(".", ".") < 0) {
4176                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4177                 goto err;
4178         }
4179
4180         /*
4181          * At this point the old-root is mounted on top of our new-root.
4182          * To unmounted it we must not be chdir'd into it, so escape back
4183          * to the old-root.
4184          */
4185         if (fchdir(oldroot) < 0) {
4186                 lxcfs_error("%s\n", "Failed to enter old root.");
4187                 goto err;
4188         }
4189         if (umount2(".", MNT_DETACH) < 0) {
4190                 lxcfs_error("%s\n", "Failed to detach old root.");
4191                 goto err;
4192         }
4193
4194         if (fchdir(newroot) < 0) {
4195                 lxcfs_error("%s\n", "Failed to re-enter new root.");
4196                 goto err;
4197         }
4198
4199         ret = 0;
4200
4201 err:
4202         if (oldroot > 0)
4203                 close(oldroot);
4204         if (newroot > 0)
4205                 close(newroot);
4206         return ret;
4207 }
4208
4209 /* Prepare our new clean root. */
4210 static int pivot_prepare(void)
4211 {
4212         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4213                 lxcfs_error("%s\n", "Failed to create directory for new root.");
4214                 return -1;
4215         }
4216
4217         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4218                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4219                 return -1;
4220         }
4221
4222         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4223                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
4224                 return -1;
4225         }
4226
4227         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4228                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
4229                 return -1;
4230         }
4231
4232         return 0;
4233 }
4234
/*
 * Build the new root with pivot_prepare() and, only if that succeeded,
 * switch into it with pivot_enter(). Returns true when both steps succeed.
 */
static bool pivot_new_root(void)
{
	/* && short-circuits: pivot_enter() runs only after a good prepare. */
	return pivot_prepare() >= 0 && pivot_enter() >= 0;
}
4247
4248 static bool setup_cgfs_dir(void)
4249 {
4250         if (!mkdir_p(BASEDIR, 0700)) {
4251                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
4252                 return false;
4253         }
4254
4255         if (!umount_if_mounted()) {
4256                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
4257                 return false;
4258         }
4259
4260         if (unshare(CLONE_NEWNS) < 0) {
4261                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
4262                 return false;
4263         }
4264
4265         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4266                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
4267                 return false;
4268         }
4269
4270         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
4271                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
4272                 return false;
4273         }
4274
4275         return true;
4276 }
4277
4278 static bool do_mount_cgroups(void)
4279 {
4280         char *target;
4281         size_t clen, len;
4282         int i, ret;
4283
4284         for (i = 0; i < num_hierarchies; i++) {
4285                 char *controller = hierarchies[i];
4286                 clen = strlen(controller);
4287                 len = strlen(BASEDIR) + clen + 2;
4288                 target = malloc(len);
4289                 if (!target)
4290                         return false;
4291                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4292                 if (ret < 0 || ret >= len) {
4293                         free(target);
4294                         return false;
4295                 }
4296                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4297                         free(target);
4298                         return false;
4299                 }
4300                 if (mount(controller, target, "cgroup", 0, controller) < 0) {
4301                         lxcfs_error("Failed mounting cgroup %s\n", controller);
4302                         free(target);
4303                         return false;
4304                 }
4305
4306                 fd_hierarchies[i] = open(target, O_DIRECTORY);
4307                 if (fd_hierarchies[i] < 0) {
4308                         free(target);
4309                         return false;
4310                 }
4311                 free(target);
4312         }
4313         return true;
4314 }
4315
/*
 * Set up lxcfs' private cgroup mounts: prepare the mountpoint, mount all
 * hierarchies, then pivot into the new root. Stops at the first step that
 * fails and returns false; returns true when all three succeed.
 */
static bool cgfs_setup_controllers(void)
{
	bool mounted;

	if (!setup_cgfs_dir())
		return false;

	mounted = do_mount_cgroups();
	if (!mounted) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return pivot_new_root();
}
4331
/*
 * Open /proc/<pid>/ns/mnt read-only with close-on-exec set.
 *
 * Returns the descriptor on success, or -1 if the path cannot be
 * formatted or opened.
 */
static int preserve_ns(int pid)
{
	/* "/proc" + "/<int as string>" + "/ns/mnt" + terminating NUL */
	char nspath[5 + 21 + 7 + 1];
	int n;

	n = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/mnt", pid);
	if (n < 0 || (size_t)n >= sizeof(nspath))
		return -1;

	return open(nspath, O_RDONLY | O_CLOEXEC);
}
4344
4345 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
4346 {
4347         FILE *f;
4348         char *cret, *line = NULL;
4349         char cwd[MAXPATHLEN];
4350         size_t len = 0;
4351         int i, init_ns = -1;
4352
4353         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4354                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
4355                 return;
4356         }
4357
4358         while (getline(&line, &len, f) != -1) {
4359                 char *p, *p2;
4360
4361                 p = strchr(line, ':');
4362                 if (!p)
4363                         goto out;
4364                 *(p++) = '\0';
4365
4366                 p2 = strrchr(p, ':');
4367                 if (!p2)
4368                         goto out;
4369                 *p2 = '\0';
4370
4371                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4372                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4373                  * because it parses out the empty string "" and later on passes
4374                  * it to mount(). Let's skip such entries.
4375                  */
4376                 if (!strcmp(p, ""))
4377                         continue;
4378
4379                 if (!store_hierarchy(line, p))
4380                         goto out;
4381         }
4382
4383         /* Preserve initial namespace. */
4384         init_ns = preserve_ns(getpid());
4385         if (init_ns < 0) {
4386                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
4387                 goto out;
4388         }
4389
4390         fd_hierarchies = malloc(sizeof(int *) * num_hierarchies);
4391         if (!fd_hierarchies) {
4392                 lxcfs_error("%s\n", strerror(errno));
4393                 goto out;
4394         }
4395
4396         for (i = 0; i < num_hierarchies; i++)
4397                 fd_hierarchies[i] = -1;
4398
4399         cret = getcwd(cwd, MAXPATHLEN);
4400         if (!cret)
4401                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
4402
4403         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4404          * to privately mount lxcfs cgroups. */
4405         if (!cgfs_setup_controllers()) {
4406                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
4407                 goto out;
4408         }
4409
4410         if (setns(init_ns, 0) < 0) {
4411                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
4412                 goto out;
4413         }
4414
4415         if (!cret || chdir(cwd) < 0)
4416                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
4417
4418         print_subsystems();
4419
4420 out:
4421         free(line);
4422         fclose(f);
4423         if (init_ns >= 0)
4424                 close(init_ns);
4425 }
4426
4427 static void __attribute__((destructor)) free_subsystems(void)
4428 {
4429         int i;
4430
4431         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
4432
4433         for (i = 0; i < num_hierarchies; i++) {
4434                 if (hierarchies[i])
4435                         free(hierarchies[i]);
4436                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
4437                         close(fd_hierarchies[i]);
4438         }
4439         free(hierarchies);
4440         free(fd_hierarchies);
4441 }