bindings.c

   1 /* lxcfs
   2  *
   3  * Copyright © 2014-2016 Canonical, Inc
   4  * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
   5  *
   6  * See COPYING file for details.
   7  */
   8
   9 #define FUSE_USE_VERSION 26
  10
  11 #define __STDC_FORMAT_MACROS
  12 #include <dirent.h>
  13 #include <errno.h>
  14 #include <fcntl.h>
  15 #include <fuse.h>
  16 #include <inttypes.h>
  17 #include <libgen.h>
  18 #include <pthread.h>
  19 #include <sched.h>
  20 #include <stdbool.h>
  21 #include <stdint.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <time.h>
  26 #include <unistd.h>
  27 #include <wait.h>
  28 #include <linux/magic.h>
  29 #include <linux/sched.h>
  30 #include <sys/epoll.h>
  31 #include <sys/mman.h>
  32 #include <sys/mount.h>
  33 #include <sys/param.h>
  34 #include <sys/socket.h>
  35 #include <sys/syscall.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/vfs.h>
  38
  39 #include "bindings.h"
  40 #include "config.h" // for VERSION
  41
  42 /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */
  43 #define LXCFS_NUMSTRLEN64 21
  44
  45 /* Define pivot_root() if missing from the C library */
  46 #ifndef HAVE_PIVOT_ROOT
  47 static int pivot_root(const char * new_root, const char * put_old)
  48 {
  49 #ifdef __NR_pivot_root
  50 return syscall(__NR_pivot_root, new_root, put_old);
  51 #else
  52 errno = ENOSYS;
  53 return -1;
  54 #endif
  55 }
  56 #else
  57 extern int pivot_root(const char * new_root, const char * put_old);
  58 #endif
  59
  60 enum {
  61         LXC_TYPE_CGDIR,
  62         LXC_TYPE_CGFILE,
  63         LXC_TYPE_PROC_MEMINFO,
  64         LXC_TYPE_PROC_CPUINFO,
  65         LXC_TYPE_PROC_UPTIME,
  66         LXC_TYPE_PROC_STAT,
  67         LXC_TYPE_PROC_DISKSTATS,
  68         LXC_TYPE_PROC_SWAPS,
  69 };
  70
  71 struct file_info {
  72         char *controller;
  73         char *cgroup;
  74         char *file;
  75         int type;
  76         char *buf;  // unused as of yet
  77         int buflen;
  78         int size; //actual data size
  79         int cached;
  80 };
  81
  82 /* Reserve buffer size to account for file size changes. */
  83 #define BUF_RESERVE_SIZE 512
  84
  85 /*
  86  * A table caching which pid is init for a pid namespace.
  87  * When looking up which pid is init for $qpid, we first
  88  * 1. Stat /proc/$qpid/ns/pid.
  89  * 2. Check whether the ino_t is in our store.
  90  *   a. if not, fork a child in qpid's ns to send us
  91  *       ucred.pid = 1, and read the initpid.  Cache
  92  *       initpid and creation time for /proc/initpid
  93  *       in a new store entry.
  94  *   b. if so, verify that /proc/initpid still matches
  95  *       what we have saved.  If not, clear the store
  96  *       entry and go back to a.  If so, return the
  97  *       cached initpid.
  98  */
  99 struct pidns_init_store {
 100         ino_t ino;          // inode number for /proc/$pid/ns/pid
 101         pid_t initpid;      // the pid of nit in that ns
 102         long int ctime;     // the time at which /proc/$initpid was created
 103         struct pidns_init_store *next;
 104         long int lastcheck;
 105 };
 106
 107 /* lol - look at how they are allocated in the kernel */
 108 #define PIDNS_HASH_SIZE 4096
 109 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
 110
 111 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
 112 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
 113 static void lock_mutex(pthread_mutex_t *l)
 114 {
 115         int ret;
 116
 117         if ((ret = pthread_mutex_lock(l)) != 0) {
 118                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 119                 exit(1);
 120         }
 121 }
 122
 123 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 124  * Number of hierarchies mounted. */
 125 static int num_hierarchies;
 126
 127 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 128  * Hierachies mounted {cpuset, blkio, ...}:
 129  * Initialized via __constructor__ collect_and_mount_subsystems(). */
 130 static char **hierarchies;
 131
 132 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 133  * Open file descriptors:
 134  * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 135  * private mount namespace.
 136  * Initialized via __constructor__ collect_and_mount_subsystems().
 137  * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 138  * mounts and respective files in the private namespace even when located in
 139  * another namespace using the *at() family of functions
 140  * {openat(), fchownat(), ...}. */
 141 static int *fd_hierarchies;
 142 static int cgroup_mount_ns_fd = -1;
 143
 144 static void unlock_mutex(pthread_mutex_t *l)
 145 {
 146         int ret;
 147
 148         if ((ret = pthread_mutex_unlock(l)) != 0) {
 149                 lxcfs_error("returned:%d %s\n", ret, strerror(ret));
 150                 exit(1);
 151         }
 152 }
 153
 154 static void store_lock(void)
 155 {
 156         lock_mutex(&pidns_store_mutex);
 157 }
 158
 159 static void store_unlock(void)
 160 {
 161         unlock_mutex(&pidns_store_mutex);
 162 }
 163
 164 /* Must be called under store_lock */
 165 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
 166 {
 167         struct stat initsb;
 168         char fnam[100];
 169
 170         snprintf(fnam, 100, "/proc/%d", e->initpid);
 171         if (stat(fnam, &initsb) < 0)
 172                 return false;
 173
 174         lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
 175                     initsb.st_ctime, e->initpid);
 176
 177         if (e->ctime != initsb.st_ctime)
 178                 return false;
 179         return true;
 180 }
 181
 182 /* Must be called under store_lock */
 183 static void remove_initpid(struct pidns_init_store *e)
 184 {
 185         struct pidns_init_store *tmp;
 186         int h;
 187
 188         lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
 189
 190         h = HASH(e->ino);
 191         if (pidns_hash_table[h] == e) {
 192                 pidns_hash_table[h] = e->next;
 193                 free(e);
 194                 return;
 195         }
 196
 197         tmp = pidns_hash_table[h];
 198         while (tmp) {
 199                 if (tmp->next == e) {
 200                         tmp->next = e->next;
 201                         free(e);
 202                         return;
 203                 }
 204                 tmp = tmp->next;
 205         }
 206 }
 207
 208 #define PURGE_SECS 5
 209 /* Must be called under store_lock */
 210 static void prune_initpid_store(void)
 211 {
 212         static long int last_prune = 0;
 213         struct pidns_init_store *e, *prev, *delme;
 214         long int now, threshold;
 215         int i;
 216
 217         if (!last_prune) {
 218                 last_prune = time(NULL);
 219                 return;
 220         }
 221         now = time(NULL);
 222         if (now < last_prune + PURGE_SECS)
 223                 return;
 224
 225         lxcfs_debug("%s\n", "Pruning.");
 226
 227         last_prune = now;
 228         threshold = now - 2 * PURGE_SECS;
 229
 230         for (i = 0; i < PIDNS_HASH_SIZE; i++) {
 231                 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
 232                         if (e->lastcheck < threshold) {
 233
 234                                 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
 235
 236                                 delme = e;
 237                                 if (prev)
 238                                         prev->next = e->next;
 239                                 else
 240                                         pidns_hash_table[i] = e->next;
 241                                 e = e->next;
 242                                 free(delme);
 243                         } else {
 244                                 prev = e;
 245                                 e = e->next;
 246                         }
 247                 }
 248         }
 249 }
 250
 251 /* Must be called under store_lock */
 252 static void save_initpid(struct stat *sb, pid_t pid)
 253 {
 254         struct pidns_init_store *e;
 255         char fpath[100];
 256         struct stat procsb;
 257         int h;
 258
 259         lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
 260
 261         snprintf(fpath, 100, "/proc/%d", pid);
 262         if (stat(fpath, &procsb) < 0)
 263                 return;
 264         do {
 265                 e = malloc(sizeof(*e));
 266         } while (!e);
 267         e->ino = sb->st_ino;
 268         e->initpid = pid;
 269         e->ctime = procsb.st_ctime;
 270         h = HASH(e->ino);
 271         e->next = pidns_hash_table[h];
 272         e->lastcheck = time(NULL);
 273         pidns_hash_table[h] = e;
 274 }
 275
 276 /*
 277  * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
 278  * entry for the inode number and creation time.  Verify that the init pid
 279  * is still valid.  If not, remove it.  Return the entry if valid, NULL
 280  * otherwise.
 281  * Must be called under store_lock
 282  */
 283 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
 284 {
 285         int h = HASH(sb->st_ino);
 286         struct pidns_init_store *e = pidns_hash_table[h];
 287
 288         while (e) {
 289                 if (e->ino == sb->st_ino) {
 290                         if (initpid_still_valid(e, sb)) {
 291                                 e->lastcheck = time(NULL);
 292                                 return e;
 293                         }
 294                         remove_initpid(e);
 295                         return NULL;
 296                 }
 297                 e = e->next;
 298         }
 299
 300         return NULL;
 301 }
 302
 303 static int is_dir(const char *path, int fd)
 304 {
 305         struct stat statbuf;
 306         int ret = fstatat(fd, path, &statbuf, fd);
 307         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 308                 return 1;
 309         return 0;
 310 }
 311
 312 static char *must_copy_string(const char *str)
 313 {
 314         char *dup = NULL;
 315         if (!str)
 316                 return NULL;
 317         do {
 318                 dup = strdup(str);
 319         } while (!dup);
 320
 321         return dup;
 322 }
 323
 324 static inline void drop_trailing_newlines(char *s)
 325 {
 326         int l;
 327
 328         for (l=strlen(s); l>0 && s[l-1] == '\n'; l--)
 329                 s[l-1] = '\0';
 330 }
 331
 332 #define BATCH_SIZE 50
 333 static void dorealloc(char **mem, size_t oldlen, size_t newlen)
 334 {
 335         int newbatches = (newlen / BATCH_SIZE) + 1;
 336         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 337
 338         if (!*mem || newbatches > oldbatches) {
 339                 char *tmp;
 340                 do {
 341                         tmp = realloc(*mem, newbatches * BATCH_SIZE);
 342                 } while (!tmp);
 343                 *mem = tmp;
 344         }
 345 }
 346 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
 347 {
 348         size_t newlen = *len + linelen;
 349         dorealloc(contents, *len, newlen + 1);
 350         memcpy(*contents + *len, line, linelen+1);
 351         *len = newlen;
 352 }
 353
 354 static char *slurp_file(const char *from, int fd)
 355 {
 356         char *line = NULL;
 357         char *contents = NULL;
 358         FILE *f = fdopen(fd, "r");
 359         size_t len = 0, fulllen = 0;
 360         ssize_t linelen;
 361
 362         if (!f)
 363                 return NULL;
 364
 365         while ((linelen = getline(&line, &len, f)) != -1) {
 366                 append_line(&contents, &fulllen, line, linelen);
 367         }
 368         fclose(f);
 369
 370         if (contents)
 371                 drop_trailing_newlines(contents);
 372         free(line);
 373         return contents;
 374 }
 375
 376 static bool write_string(const char *fnam, const char *string, int fd)
 377 {
 378         FILE *f;
 379         size_t len, ret;
 380
 381         if (!(f = fdopen(fd, "w")))
 382                 return false;
 383         len = strlen(string);
 384         ret = fwrite(string, 1, len, f);
 385         if (ret != len) {
 386                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 387                 fclose(f);
 388                 return false;
 389         }
 390         if (fclose(f) < 0) {
 391                 lxcfs_error("Error writing to file: %s\n", strerror(errno));
 392                 return false;
 393         }
 394         return true;
 395 }
 396
 397 struct cgfs_files {
 398         char *name;
 399         uint32_t uid, gid;
 400         uint32_t mode;
 401 };
 402
 403 #define ALLOC_NUM 20
 404 static bool store_hierarchy(char *stridx, char *h)
 405 {
 406         if (num_hierarchies % ALLOC_NUM == 0) {
 407                 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
 408                 n *= ALLOC_NUM;
 409                 char **tmp = realloc(hierarchies, n * sizeof(char *));
 410                 if (!tmp) {
 411                         lxcfs_error("%s\n", strerror(errno));
 412                         exit(1);
 413                 }
 414                 hierarchies = tmp;
 415         }
 416
 417         hierarchies[num_hierarchies++] = must_copy_string(h);
 418         return true;
 419 }
 420
 421 static void print_subsystems(void)
 422 {
 423         int i;
 424
 425         fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
 426         fprintf(stderr, "hierarchies:\n");
 427         for (i = 0; i < num_hierarchies; i++) {
 428                 if (hierarchies[i])
 429                         fprintf(stderr, " %2d: fd: %3d: %s\n", i,
 430                                 fd_hierarchies[i], hierarchies[i]);
 431         }
 432 }
 433
 434 static bool in_comma_list(const char *needle, const char *haystack)
 435 {
 436         const char *s = haystack, *e;
 437         size_t nlen = strlen(needle);
 438
 439         while (*s && (e = strchr(s, ','))) {
 440                 if (nlen != e - s) {
 441                         s = e + 1;
 442                         continue;
 443                 }
 444                 if (strncmp(needle, s, nlen) == 0)
 445                         return true;
 446                 s = e + 1;
 447         }
 448         if (strcmp(needle, s) == 0)
 449                 return true;
 450         return false;
 451 }
 452
 453 /* do we need to do any massaging here?  I'm not sure... */
 454 /* Return the mounted controller and store the corresponding open file descriptor
 455  * referring to the controller mountpoint in the private lxcfs namespace in
 456  * @cfd.
 457  */
 458 static char *find_mounted_controller(const char *controller, int *cfd)
 459 {
 460         int i;
 461
 462         for (i = 0; i < num_hierarchies; i++) {
 463                 if (!hierarchies[i])
 464                         continue;
 465                 if (strcmp(hierarchies[i], controller) == 0) {
 466                         *cfd = fd_hierarchies[i];
 467                         return hierarchies[i];
 468                 }
 469                 if (in_comma_list(controller, hierarchies[i])) {
 470                         *cfd = fd_hierarchies[i];
 471                         return hierarchies[i];
 472                 }
 473         }
 474
 475         return NULL;
 476 }
 477
 478 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
 479                 const char *value)
 480 {
 481         int ret, fd, cfd;
 482         size_t len;
 483         char *fnam, *tmpc;
 484
 485         tmpc = find_mounted_controller(controller, &cfd);
 486         if (!tmpc)
 487                 return false;
 488
 489         /* Make sure we pass a relative path to *at() family of functions.
 490          * . + /cgroup + / + file + \0
 491          */
 492         len = strlen(cgroup) + strlen(file) + 3;
 493         fnam = alloca(len);
 494         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 495         if (ret < 0 || (size_t)ret >= len)
 496                 return false;
 497
 498         fd = openat(cfd, fnam, O_WRONLY);
 499         if (fd < 0)
 500                 return false;
 501
 502         return write_string(fnam, value, fd);
 503 }
 504
 505 // Chown all the files in the cgroup directory.  We do this when we create
 506 // a cgroup on behalf of a user.
 507 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 508 {
 509         struct dirent *direntp;
 510         char path[MAXPATHLEN];
 511         size_t len;
 512         DIR *d;
 513         int fd1, ret;
 514
 515         len = strlen(dirname);
 516         if (len >= MAXPATHLEN) {
 517                 lxcfs_error("Pathname too long: %s\n", dirname);
 518                 return;
 519         }
 520
 521         fd1 = openat(fd, dirname, O_DIRECTORY);
 522         if (fd1 < 0)
 523                 return;
 524
 525         d = fdopendir(fd1);
 526         if (!d) {
 527                 lxcfs_error("Failed to open %s\n", dirname);
 528                 return;
 529         }
 530
 531         while ((direntp = readdir(d))) {
 532                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 533                         continue;
 534                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 535                 if (ret < 0 || ret >= MAXPATHLEN) {
 536                         lxcfs_error("Pathname too long under %s\n", dirname);
 537                         continue;
 538                 }
 539                 if (fchownat(fd, path, uid, gid, 0) < 0)
 540                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 541         }
 542         closedir(d);
 543 }
 544
 545 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 546 {
 547         int cfd;
 548         size_t len;
 549         char *dirnam, *tmpc;
 550
 551         tmpc = find_mounted_controller(controller, &cfd);
 552         if (!tmpc)
 553                 return -EINVAL;
 554
 555         /* Make sure we pass a relative path to *at() family of functions.
 556          * . + /cg + \0
 557          */
 558         len = strlen(cg) + 2;
 559         dirnam = alloca(len);
 560         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 561
 562         if (mkdirat(cfd, dirnam, 0755) < 0)
 563                 return -errno;
 564
 565         if (uid == 0 && gid == 0)
 566                 return 0;
 567
 568         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 569                 return -errno;
 570
 571         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 572
 573         return 0;
 574 }
 575
 576 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 577 {
 578         struct dirent *direntp;
 579         DIR *dir;
 580         bool ret = false;
 581         char pathname[MAXPATHLEN];
 582         int dupfd;
 583
 584         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 585         if (dupfd < 0)
 586                 return false;
 587
 588         dir = fdopendir(dupfd);
 589         if (!dir) {
 590                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 591                 close(dupfd);
 592                 return false;
 593         }
 594
 595         while ((direntp = readdir(dir))) {
 596                 struct stat mystat;
 597                 int rc;
 598
 599                 if (!strcmp(direntp->d_name, ".") ||
 600                     !strcmp(direntp->d_name, ".."))
 601                         continue;
 602
 603                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 604                 if (rc < 0 || rc >= MAXPATHLEN) {
 605                         lxcfs_error("%s\n", "Pathname too long.");
 606                         continue;
 607                 }
 608
 609                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 610                 if (rc) {
 611                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 612                         continue;
 613                 }
 614                 if (S_ISDIR(mystat.st_mode))
 615                         if (!recursive_rmdir(pathname, fd, cfd))
 616                                 lxcfs_debug("Error removing %s.\n", pathname);
 617         }
 618
 619         ret = true;
 620         if (closedir(dir) < 0) {
 621                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 622                 ret = false;
 623         }
 624
 625         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 626                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 627                 ret = false;
 628         }
 629
 630         close(dupfd);
 631
 632         return ret;
 633 }
 634
 635 bool cgfs_remove(const char *controller, const char *cg)
 636 {
 637         int fd, cfd;
 638         size_t len;
 639         char *dirnam, *tmpc;
 640         bool bret;
 641
 642         tmpc = find_mounted_controller(controller, &cfd);
 643         if (!tmpc)
 644                 return false;
 645
 646         /* Make sure we pass a relative path to *at() family of functions.
 647          * . +  /cg + \0
 648          */
 649         len = strlen(cg) + 2;
 650         dirnam = alloca(len);
 651         snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
 652
 653         fd = openat(cfd, dirnam, O_DIRECTORY);
 654         if (fd < 0)
 655                 return false;
 656
 657         bret = recursive_rmdir(dirnam, fd, cfd);
 658         close(fd);
 659         return bret;
 660 }
 661
 662 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 663 {
 664         int cfd;
 665         size_t len;
 666         char *pathname, *tmpc;
 667
 668         tmpc = find_mounted_controller(controller, &cfd);
 669         if (!tmpc)
 670                 return false;
 671
 672         /* Make sure we pass a relative path to *at() family of functions.
 673          * . + /file + \0
 674          */
 675         len = strlen(file) + 2;
 676         pathname = alloca(len);
 677         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 678         if (fchmodat(cfd, pathname, mode, 0) < 0)
 679                 return false;
 680         return true;
 681 }
 682
 683 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 684 {
 685         size_t len;
 686         char *fname;
 687
 688         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 689         fname = alloca(len);
 690         snprintf(fname, len, "%s/tasks", dirname);
 691         if (fchownat(fd, fname, uid, gid, 0) != 0)
 692                 return -errno;
 693         snprintf(fname, len, "%s/cgroup.procs", dirname);
 694         if (fchownat(fd, fname, uid, gid, 0) != 0)
 695                 return -errno;
 696         return 0;
 697 }
 698
 699 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
 700 {
 701         int cfd;
 702         size_t len;
 703         char *pathname, *tmpc;
 704
 705         tmpc = find_mounted_controller(controller, &cfd);
 706         if (!tmpc)
 707                 return -EINVAL;
 708
 709         /* Make sure we pass a relative path to *at() family of functions.
 710          * . + /file + \0
 711          */
 712         len = strlen(file) + 2;
 713         pathname = alloca(len);
 714         snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
 715         if (fchownat(cfd, pathname, uid, gid, 0) < 0)
 716                 return -errno;
 717
 718         if (is_dir(pathname, cfd))
 719                 // like cgmanager did, we want to chown the tasks file as well
 720                 return chown_tasks_files(pathname, uid, gid, cfd);
 721
 722         return 0;
 723 }
 724
 725 FILE *open_pids_file(const char *controller, const char *cgroup)
 726 {
 727         int fd, cfd;
 728         size_t len;
 729         char *pathname, *tmpc;
 730
 731         tmpc = find_mounted_controller(controller, &cfd);
 732         if (!tmpc)
 733                 return NULL;
 734
 735         /* Make sure we pass a relative path to *at() family of functions.
 736          * . + /cgroup + / "cgroup.procs" + \0
 737          */
 738         len = strlen(cgroup) + strlen("cgroup.procs") + 3;
 739         pathname = alloca(len);
 740         snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);
 741
 742         fd = openat(cfd, pathname, O_WRONLY);
 743         if (fd < 0)
 744                 return NULL;
 745
 746         return fdopen(fd, "w");
 747 }
 748
 749 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
 750                                 void ***list, size_t typesize,
 751                                 void* (*iterator)(const char*, const char*, const char*))
 752 {
 753         int cfd, fd, ret;
 754         size_t len;
 755         char *cg, *tmpc;
 756         char pathname[MAXPATHLEN];
 757         size_t sz = 0, asz = 0;
 758         struct dirent *dirent;
 759         DIR *dir;
 760
 761         tmpc = find_mounted_controller(controller, &cfd);
 762         *list = NULL;
 763         if (!tmpc)
 764                 return false;
 765
 766         /* Make sure we pass a relative path to *at() family of functions. */
 767         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
 768         cg = alloca(len);
 769         ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
 770         if (ret < 0 || (size_t)ret >= len) {
 771                 lxcfs_error("Pathname too long under %s\n", cgroup);
 772                 return false;
 773         }
 774
 775         fd = openat(cfd, cg, O_DIRECTORY);
 776         if (fd < 0)
 777                 return false;
 778
 779         dir = fdopendir(fd);
 780         if (!dir)
 781                 return false;
 782
 783         while ((dirent = readdir(dir))) {
 784                 struct stat mystat;
 785
 786                 if (!strcmp(dirent->d_name, ".") ||
 787                     !strcmp(dirent->d_name, ".."))
 788                         continue;
 789
 790                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
 791                 if (ret < 0 || ret >= MAXPATHLEN) {
 792                         lxcfs_error("Pathname too long under %s\n", cg);
 793                         continue;
 794                 }
 795
 796                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 797                 if (ret) {
 798                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
 799                         continue;
 800                 }
 801                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
 802                     (directories && !S_ISDIR(mystat.st_mode)))
 803                         continue;
 804
 805                 if (sz+2 >= asz) {
 806                         void **tmp;
 807                         asz += BATCH_SIZE;
 808                         do {
 809                                 tmp = realloc(*list, asz * typesize);
 810                         } while  (!tmp);
 811                         *list = tmp;
 812                 }
 813                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
 814                 (*list)[sz+1] = NULL;
 815                 sz++;
 816         }
 817         if (closedir(dir) < 0) {
 818                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
 819                 return false;
 820         }
 821         return true;
 822 }
 823
 824 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 825 {
 826         char *dup;
 827         do {
 828                 dup = strdup(dir_entry);
 829         } while (!dup);
 830         return dup;
 831 }
 832
 833 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
 834 {
 835         return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
 836 }
 837
 838 void free_key(struct cgfs_files *k)
 839 {
 840         if (!k)
 841                 return;
 842         free(k->name);
 843         free(k);
 844 }
 845
 846 void free_keys(struct cgfs_files **keys)
 847 {
 848         int i;
 849
 850         if (!keys)
 851                 return;
 852         for (i = 0; keys[i]; i++) {
 853                 free_key(keys[i]);
 854         }
 855         free(keys);
 856 }
 857
 858 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
 859 {
 860         int ret, fd, cfd;
 861         size_t len;
 862         char *fnam, *tmpc;
 863
 864         tmpc = find_mounted_controller(controller, &cfd);
 865         if (!tmpc)
 866                 return false;
 867
 868         /* Make sure we pass a relative path to *at() family of functions.
 869          * . + /cgroup + / + file + \0
 870          */
 871         len = strlen(cgroup) + strlen(file) + 3;
 872         fnam = alloca(len);
 873         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
 874         if (ret < 0 || (size_t)ret >= len)
 875                 return false;
 876
 877         fd = openat(cfd, fnam, O_RDONLY);
 878         if (fd < 0)
 879                 return false;
 880
 881         *value = slurp_file(fnam, fd);
 882         return *value != NULL;
 883 }
 884
 885 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
 886 {
 887         int ret, cfd;
 888         size_t len;
 889         char *fnam, *tmpc;
 890         struct stat sb;
 891         struct cgfs_files *newkey;
 892
 893         tmpc = find_mounted_controller(controller, &cfd);
 894         if (!tmpc)
 895                 return false;
 896
 897         if (file && *file == '/')
 898                 file++;
 899
 900         if (file && strchr(file, '/'))
 901                 return NULL;
 902
 903         /* Make sure we pass a relative path to *at() family of functions.
 904          * . + /cgroup + / + file + \0
 905          */
 906         len = strlen(cgroup) + 3;
 907         if (file)
 908                 len += strlen(file) + 1;
 909         fnam = alloca(len);
 910         snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
 911                  file ? "/" : "", file ? file : "");
 912
 913         ret = fstatat(cfd, fnam, &sb, 0);
 914         if (ret < 0)
 915                 return NULL;
 916
 917         do {
 918                 newkey = malloc(sizeof(struct cgfs_files));
 919         } while (!newkey);
 920         if (file)
 921                 newkey->name = must_copy_string(file);
 922         else if (strrchr(cgroup, '/'))
 923                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
 924         else
 925                 newkey->name = must_copy_string(cgroup);
 926         newkey->uid = sb.st_uid;
 927         newkey->gid = sb.st_gid;
 928         newkey->mode = sb.st_mode;
 929
 930         return newkey;
 931 }
 932
 933 static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
 934 {
 935         struct cgfs_files *entry = cgfs_get_key(controller, cgroup, dir_entry);
 936         if (!entry) {
 937                 lxcfs_error("Error getting files under %s:%s\n", controller,
 938                              cgroup);
 939         }
 940         return entry;
 941 }
 942
 943 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
 944 {
 945         return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
 946 }
 947
 948 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 949 {
 950         int cfd;
 951         size_t len;
 952         char *fnam, *tmpc;
 953         int ret;
 954         struct stat sb;
 955
 956         tmpc = find_mounted_controller(controller, &cfd);
 957         if (!tmpc)
 958                 return false;
 959
 960         /* Make sure we pass a relative path to *at() family of functions.
 961          * . + /cgroup + / + f + \0
 962          */
 963         len = strlen(cgroup) + strlen(f) + 3;
 964         fnam = alloca(len);
 965         ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
 966         if (ret < 0 || (size_t)ret >= len)
 967                 return false;
 968
 969         ret = fstatat(cfd, fnam, &sb, 0);
 970         if (ret < 0 || !S_ISDIR(sb.st_mode))
 971                 return false;
 972
 973         return true;
 974 }
 975
/*
 * Status codes compared against the return value of send_creds().
 * NOTE(review): NOTSK presumably means "no such task" and FAIL any other
 * failure - confirm against send_creds().
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations: these helpers are used below before they are defined. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static int send_creds_clone_wrapper(void *arg);
 983
 984 /*
 985  * clone a task which switches to @task's namespace and writes '1'.
 986  * over a unix sock so we can read the task's reaper's pid in our
 987  * namespace
 988  *
 989  * Note: glibc's fork() does not respect pidns, which can lead to failed
 990  * assertions inside glibc (and thus failed forks) if the child's pid in
 991  * the pidns and the parent pid outside are identical. Using clone prevents
 992  * this issue.
 993  */
 994 static void write_task_init_pid_exit(int sock, pid_t target)
 995 {
 996         char fnam[100];
 997         pid_t pid;
 998         int fd, ret;
 999         size_t stack_size = sysconf(_SC_PAGESIZE);
1000         void *stack = alloca(stack_size);
1001
1002         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
1003         if (ret < 0 || ret >= sizeof(fnam))
1004                 _exit(1);
1005
1006         fd = open(fnam, O_RDONLY);
1007         if (fd < 0) {
1008                 perror("write_task_init_pid_exit open of ns/pid");
1009                 _exit(1);
1010         }
1011         if (setns(fd, 0)) {
1012                 perror("write_task_init_pid_exit setns 1");
1013                 close(fd);
1014                 _exit(1);
1015         }
1016         pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1017         if (pid < 0)
1018                 _exit(1);
1019         if (pid != 0) {
1020                 if (!wait_for_pid(pid))
1021                         _exit(1);
1022                 _exit(0);
1023         }
1024 }
1025
1026 static int send_creds_clone_wrapper(void *arg) {
1027         struct ucred cred;
1028         char v;
1029         int sock = *(int *)arg;
1030
1031         /* we are the child */
1032         cred.uid = 0;
1033         cred.gid = 0;
1034         cred.pid = 1;
1035         v = '1';
1036         if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1037                 return 1;
1038         return 0;
1039 }
1040
1041 static pid_t get_init_pid_for_task(pid_t task)
1042 {
1043         int sock[2];
1044         pid_t pid;
1045         pid_t ret = -1;
1046         char v = '0';
1047         struct ucred cred;
1048
1049         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1050                 perror("socketpair");
1051                 return -1;
1052         }
1053
1054         pid = fork();
1055         if (pid < 0)
1056                 goto out;
1057         if (!pid) {
1058                 close(sock[1]);
1059                 write_task_init_pid_exit(sock[0], task);
1060                 _exit(0);
1061         }
1062
1063         if (!recv_creds(sock[1], &cred, &v))
1064                 goto out;
1065         ret = cred.pid;
1066
1067 out:
1068         close(sock[0]);
1069         close(sock[1]);
1070         if (pid > 0)
1071                 wait_for_pid(pid);
1072         return ret;
1073 }
1074
1075 static pid_t lookup_initpid_in_store(pid_t qpid)
1076 {
1077         pid_t answer = 0;
1078         struct stat sb;
1079         struct pidns_init_store *e;
1080         char fnam[100];
1081
1082         snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1083         store_lock();
1084         if (stat(fnam, &sb) < 0)
1085                 goto out;
1086         e = lookup_verify_initpid(&sb);
1087         if (e) {
1088                 answer = e->initpid;
1089                 goto out;
1090         }
1091         answer = get_init_pid_for_task(qpid);
1092         if (answer > 0)
1093                 save_initpid(&sb, answer);
1094
1095 out:
1096         /* we prune at end in case we are returning
1097          * the value we were about to return */
1098         prune_initpid_store();
1099         store_unlock();
1100         return answer;
1101 }
1102
/*
 * Wait for child @pid to terminate.
 *
 * Returns 0 when the child exited normally with status 0.  Returns -1 when
 * @pid is not positive, waitpid() fails with anything other than EINTR, or
 * the child did not exit with status 0.
 */
static int wait_for_pid(pid_t pid)
{
        int status;
        pid_t reaped;

        if (pid <= 0)
                return -1;

        for (;;) {
                reaped = waitpid(pid, &status, 0);
                if (reaped == pid)
                        break;
                /* retry on EINTR or on an unexpected pid */
                if (reaped == -1 && errno != EINTR)
                        return -1;
        }

        if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
                return 0;
        return -1;
}
1123
1124
1125 /*
1126  * append pid to *src.
1127  * src: a pointer to a char* in which ot append the pid.
1128  * sz: the number of characters printed so far, minus trailing \0.
1129  * asz: the allocated size so far
1130  * pid: the pid to append
1131  */
1132 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1133 {
1134         char tmp[30];
1135
1136         int tmplen = sprintf(tmp, "%d\n", (int)pid);
1137
1138         if (!*src || tmplen + *sz + 1 >= *asz) {
1139                 char *tmp;
1140                 do {
1141                         tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1142                 } while (!tmp);
1143                 *src = tmp;
1144                 *asz += BUF_RESERVE_SIZE;
1145         }
1146         memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1147         *sz += tmplen;
1148 }
1149
/*
 * Translate @in_id through the id map readable from @idfile (an open
 * /proc/pid/{u,g}id_map).
 *
 * The file is read from the start.  Each line that parses as three
 * unsigned numbers "ns-base host-base count" describes a range; lines that
 * do not parse are skipped.  If @in_id lies in [host-base, host-base+count)
 * the result is ns-base + (in_id - host-base).
 *
 * Returns the mapped id, or -1 (as unsigned) when no range matches or a
 * range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
        char entry[400];

        fseek(idfile, 0L, SEEK_SET);
        while (fgets(entry, sizeof(entry), idfile)) {
                unsigned int ns_base,   /* first id of the range in the file's namespace */
                             host_base, /* first id of the range in the caller's namespace */
                             span;      /* number of ids in the range */

                if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &span) != 3)
                        continue;

                if (host_base + span < host_base || ns_base + span < ns_base) {
                        /* range wraps around - unexpected for a proc file, bail */
                        lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
                                ns_base, host_base, span, entry);
                        return -1;
                }

                if (in_id < host_base || in_id >= host_base + span)
                        continue;

                /*
                 * host_base <= in_id < host_base + span and neither end of
                 * the range wraps, so this sum cannot wrap either.
                 */
                return ns_base + (in_id - host_base);
        }

        /* no range contained in_id */
        return -1;
}
1193
/*
 * Values for the req_ns_root argument of is_privileged_over():
 * with NS_ROOT_OPT a caller whose uid equals the victim's uid is accepted
 * outright; with NS_ROOT_REQD the caller must map to uid 0 in its own
 * namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the stack buffers used to build /proc/<pid>/... paths. */
#define PROCLEN 100
1203
1204 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1205 {
1206         char fpath[PROCLEN];
1207         int ret;
1208         bool answer = false;
1209         uid_t nsuid;
1210
1211         if (victim == -1 || uid == -1)
1212                 return false;
1213
1214         /*
1215          * If the request is one not requiring root in the namespace,
1216          * then having the same uid suffices.  (i.e. uid 1000 has write
1217          * access to files owned by uid 1000
1218          */
1219         if (!req_ns_root && uid == victim)
1220                 return true;
1221
1222         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1223         if (ret < 0 || ret >= PROCLEN)
1224                 return false;
1225         FILE *f = fopen(fpath, "r");
1226         if (!f)
1227                 return false;
1228
1229         /* if caller's not root in his namespace, reject */
1230         nsuid = convert_id_to_ns(f, uid);
1231         if (nsuid)
1232                 goto out;
1233
1234         /*
1235          * If victim is not mapped into caller's ns, reject.
1236          * XXX I'm not sure this check is needed given that fuse
1237          * will be sending requests where the vfs has converted
1238          */
1239         nsuid = convert_id_to_ns(f, victim);
1240         if (nsuid == -1)
1241                 goto out;
1242
1243         answer = true;
1244
1245 out:
1246         fclose(f);
1247         return answer;
1248 }
1249
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode.  Callers shift the owner
 * or group bits down into the "other" position before calling.
 *
 * O_RDONLY needs S_IROTH, O_WRONLY needs S_IWOTH, O_RDWR needs both; any
 * other access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
        mode_t needed;
        mode_t access = req_mode & O_ACCMODE;

        if (access == O_RDONLY)
                needed = S_IROTH;
        else if (access == O_WRONLY)
                needed = S_IWOTH;
        else if (access == O_RDWR)
                needed = S_IROTH | S_IWOTH;
        else
                return false;

        return (fmode & needed) == needed;
}
1269
1270
/*
 * Return the first path component of @taskcg that follows @querycg.
 *
 * Example: taskcg "a/b/c/d/e", querycg "a/b/c" -> "d".
 * When @querycg is "/" or "./", only the first character of @taskcg is
 * skipped: taskcg "/a/b" -> "a".
 *
 * @taskcg must be longer than @querycg; otherwise an error is logged and
 * NULL is returned.  NULL is also returned when strdup() fails.  The
 * result is allocated and must be freed by the caller.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
        size_t skip;
        char *component, *slash;

        if (strlen(taskcg) <= strlen(querycg)) {
                lxcfs_error("%s\n", "I was fed bad input.");
                return NULL;
        }

        if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
                skip = 1;
        else
                skip = strlen(querycg) + 1;

        component = strdup(taskcg + skip);
        if (!component)
                return NULL;

        /* cut at the end of the first component */
        slash = strchr(component, '/');
        if (slash)
                *slash = '\0';
        return component;
}
1296
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
        size_t n = strlen(x);

        if (n == 0)
                return;
        if (x[n - 1] != '\n')
                return;
        x[n - 1] = '\0';
}
1303
1304 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1305 {
1306         int cfd;
1307         char fnam[PROCLEN];
1308         FILE *f;
1309         char *answer = NULL;
1310         char *line = NULL;
1311         size_t len = 0;
1312         int ret;
1313         const char *h = find_mounted_controller(contrl, &cfd);
1314         if (!h)
1315                 return NULL;
1316
1317         ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1318         if (ret < 0 || ret >= PROCLEN)
1319                 return NULL;
1320         if (!(f = fopen(fnam, "r")))
1321                 return NULL;
1322
1323         while (getline(&line, &len, f) != -1) {
1324                 char *c1, *c2;
1325                 if (!line[0])
1326                         continue;
1327                 c1 = strchr(line, ':');
1328                 if (!c1)
1329                         goto out;
1330                 c1++;
1331                 c2 = strchr(c1, ':');
1332                 if (!c2)
1333                         goto out;
1334                 *c2 = '\0';
1335                 if (strcmp(c1, h) != 0)
1336                         continue;
1337                 c2++;
1338                 stripnewline(c2);
1339                 do {
1340                         answer = strdup(c2);
1341                 } while (!answer);
1342                 break;
1343         }
1344
1345 out:
1346         fclose(f);
1347         free(line);
1348         return answer;
1349 }
1350
1351 /*
1352  * check whether a fuse context may access a cgroup dir or file
1353  *
1354  * If file is not null, it is a cgroup file to check under cg.
1355  * If file is null, then we are checking perms on cg itself.
1356  *
1357  * For files we can check the mode of the list_keys result.
1358  * For cgroups, we must make assumptions based on the files under the
1359  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1360  * yet.
1361  */
1362 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1363 {
1364         struct cgfs_files *k = NULL;
1365         bool ret = false;
1366
1367         k = cgfs_get_key(contrl, cg, file);
1368         if (!k)
1369                 return false;
1370
1371         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1372                 if (perms_include(k->mode >> 6, mode)) {
1373                         ret = true;
1374                         goto out;
1375                 }
1376         }
1377         if (fc->gid == k->gid) {
1378                 if (perms_include(k->mode >> 3, mode)) {
1379                         ret = true;
1380                         goto out;
1381                 }
1382         }
1383         ret = perms_include(k->mode, mode);
1384
1385 out:
1386         free_key(k);
1387         return ret;
1388 }
1389
/*
 * Strip a trailing "/init.scope" from @cg in place.  When @cg consists of
 * nothing but that suffix, the leading '/' is kept so the result is "/".
 * Strings that do not end in the suffix are left untouched.
 */
#define INITSCOPE "/init.scope"
static void prune_init_slice(char *cg)
{
        size_t cg_len = strlen(cg);
        size_t suffix_len = strlen(INITSCOPE);
        char *tail;

        if (cg_len < suffix_len)
                return;

        tail = cg + (cg_len - suffix_len);
        if (strcmp(tail, INITSCOPE) != 0)
                return;

        /* the whole string is the suffix: keep its leading '/' */
        if (tail == cg)
                tail++;
        *tail = '\0';
}
1407
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b - and not on /ab
 * either: the match must end on a path-component boundary.
 *
 * @pid:    task whose cgroup (from get_pid_cgroup(), with a trailing
 *          "/init.scope" pruned) is compared
 * @contrl: controller to look the task up in
 * @cg:     cgroup being acted on
 * @nextcg: optional; if the answer is false and nextcg is not NULL, then
 *          *nextcg is set to the result of get_next_cgroup_dir(), i.e. the
 *          next cgroup directory under cg on the way to the task's cgroup
 *          (possibly NULL).  It must be freed by the caller.
 *
 * Returns true when cg equals the task's cgroup or lies below it, false
 * otherwise or when the task's cgroup cannot be determined (in which case
 * *nextcg is not written).
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
        bool answer = false;
        char *c2 = get_pid_cgroup(pid, contrl);
        char *linecmp;
        size_t linelen;

        if (!c2)
                return false;
        prune_init_slice(c2);

        /*
         * callers pass in '/' or './' (openat()) for root cgroup, otherwise
         * they pass in a cgroup without leading '/'.  In the latter case the
         * leading '/' of the task's cgroup is skipped so both sides have the
         * same form.  (An older version had the two cases the other way
         * round; the reason is not recorded.)
         */
        if (*cg == '/' || !strncmp(cg, "./", 2))
                linecmp = c2;
        else
                linecmp = *c2 ? c2 + 1 : c2; /* never step past the terminator */

        /*
         * cg must start with the task's cgroup, and the match must stop at a
         * component boundary: the task's cgroup is empty or ends in '/', or
         * cg ends or continues with '/' right after the prefix.
         */
        linelen = strlen(linecmp);
        if (strncmp(linecmp, cg, linelen) == 0 &&
            (linelen == 0 || linecmp[linelen - 1] == '/' ||
             cg[linelen] == '\0' || cg[linelen] == '/')) {
                answer = true;
                goto out;
        }

        if (nextcg)
                *nextcg = get_next_cgroup_dir(linecmp, cg);

out:
        free(c2);
        return answer;
}
1450
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * The root ("/" or "./") is always visible.  Otherwise @cg is compared
 * with the task's cgroup (leading '/' dropped, trailing "/init.scope"
 * pruned): @cg is visible when the task is in the root cgroup, when both
 * are equal, or when one is a whole-component prefix of the other.
 *
 * Returns false when the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
        bool visible;
        char *taskpath;
        const char *task_cg, *longer;
        size_t cg_len, task_len, common;

        if (!strcmp(cg, "/") || !strcmp(cg, "./"))
                return true;

        taskpath = get_pid_cgroup(pid, contrl);
        if (!taskpath)
                return false;
        prune_init_slice(taskpath);

        task_cg = taskpath + 1;
        cg_len = strlen(cg);
        task_len = strlen(task_cg);

        if (task_len == 0 || strcmp(cg, task_cg) == 0) {
                /*
                 * Either the task is in the root cgroup and sees everything
                 * (the prefix test below cannot express that, since the only
                 * '/' was skipped above), or cg is the task's own cgroup.
                 */
                visible = true;
        } else if (cg_len == task_len) {
                /* same length but different: unrelated */
                visible = false;
        } else {
                /*
                 * cg shorter: looking up a parent dir.
                 * cg longer:  looking up a child dir.
                 * Either way the shorter path must be a prefix of the longer
                 * one, followed there by '/'.
                 */
                if (cg_len < task_len) {
                        longer = task_cg;
                        common = cg_len;
                } else {
                        longer = cg;
                        common = task_len;
                }
                visible = strncmp(task_cg, cg, common) == 0 &&
                          longer[common] == '/';
        }

        free(taskpath);
        return visible;
}
1501
1502 /*
1503  * given /cgroup/freezer/a/b, return "freezer".
1504  * the returned char* should NOT be freed.
1505  */
1506 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1507 {
1508         const char *p1;
1509         char *contr, *slash;
1510
1511         if (strlen(path) < 9) {
1512                 errno = EACCES;
1513                 return NULL;
1514         }
1515         if (*(path + 7) != '/') {
1516                 errno = EINVAL;
1517                 return NULL;
1518         }
1519         p1 = path + 8;
1520         contr = strdupa(p1);
1521         if (!contr) {
1522                 errno = ENOMEM;
1523                 return NULL;
1524         }
1525         slash = strstr(contr, "/");
1526         if (slash)
1527                 *slash = '\0';
1528
1529         int i;
1530         for (i = 0; i < num_hierarchies; i++) {
1531                 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1532                         return hierarchies[i];
1533         }
1534         errno = ENOENT;
1535         return NULL;
1536 }
1537
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, with errno cleared.  Returns NULL with errno EACCES when @path
 * is shorter than 9 characters, or with errno EINVAL when there is no such
 * '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
        const char *sep;

        if (strlen(path) < 9) {
                errno = EACCES;
                return NULL;
        }

        sep = strchr(path + 8, '/');
        if (sep) {
                errno = 0;
                return sep + 1;
        }

        errno = EINVAL;
        return NULL;
}
1558
/*
 * split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg cut at its last '/', and
 * should be freed by the caller.  @last receives a pointer into @cg at that
 * last '/' (so it starts with the '/'), and must not be freed.
 *
 * When @cg contains no '/', @dir is a full copy of @cg and @last is NULL.
 * The copy is retried until it succeeds.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
        char *copy, *tail;

        while (!(copy = strdup(cg)))
                ;
        *dir = copy;

        tail = strrchr(cg, '/');
        *last = tail;
        if (!tail)
                return;

        /* the last '/' sits at the same offset in the copy */
        copy[tail - cg] = '\0';
}
1578
1579 /*
1580  * FUSE ops for /cgroup
1581  */
1582
1583 int cg_getattr(const char *path, struct stat *sb)
1584 {
1585         struct timespec now;
1586         struct fuse_context *fc = fuse_get_context();
1587         char * cgdir = NULL;
1588         char *last = NULL, *path1, *path2;
1589         struct cgfs_files *k = NULL;
1590         const char *cgroup;
1591         const char *controller = NULL;
1592         int ret = -ENOENT;
1593
1594
1595         if (!fc)
1596                 return -EIO;
1597
1598         memset(sb, 0, sizeof(struct stat));
1599
1600         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1601                 return -EINVAL;
1602
1603         sb->st_uid = sb->st_gid = 0;
1604         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1605         sb->st_size = 0;
1606
1607         if (strcmp(path, "/cgroup") == 0) {
1608                 sb->st_mode = S_IFDIR | 00755;
1609                 sb->st_nlink = 2;
1610                 return 0;
1611         }
1612
1613         controller = pick_controller_from_path(fc, path);
1614         if (!controller)
1615                 return -errno;
1616         cgroup = find_cgroup_in_path(path);
1617         if (!cgroup) {
1618                 /* this is just /cgroup/controller, return it as a dir */
1619                 sb->st_mode = S_IFDIR | 00755;
1620                 sb->st_nlink = 2;
1621                 return 0;
1622         }
1623
1624         get_cgdir_and_path(cgroup, &cgdir, &last);
1625
1626         if (!last) {
1627                 path1 = "/";
1628                 path2 = cgdir;
1629         } else {
1630                 path1 = cgdir;
1631                 path2 = last;
1632         }
1633
1634         pid_t initpid = lookup_initpid_in_store(fc->pid);
1635         if (initpid <= 0)
1636                 initpid = fc->pid;
1637         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1638          * Then check that caller's cgroup is under path if last is a child
1639          * cgroup, or cgdir if last is a file */
1640
1641         if (is_child_cgroup(controller, path1, path2)) {
1642                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1643                         ret = -ENOENT;
1644                         goto out;
1645                 }
1646                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1647                         /* this is just /cgroup/controller, return it as a dir */
1648                         sb->st_mode = S_IFDIR | 00555;
1649                         sb->st_nlink = 2;
1650                         ret = 0;
1651                         goto out;
1652                 }
1653                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1654                         ret = -EACCES;
1655                         goto out;
1656                 }
1657
1658                 // get uid, gid, from '/tasks' file and make up a mode
1659                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1660                 sb->st_mode = S_IFDIR | 00755;
1661                 k = cgfs_get_key(controller, cgroup, NULL);
1662                 if (!k) {
1663                         sb->st_uid = sb->st_gid = 0;
1664                 } else {
1665                         sb->st_uid = k->uid;
1666                         sb->st_gid = k->gid;
1667                 }
1668                 free_key(k);
1669                 sb->st_nlink = 2;
1670                 ret = 0;
1671                 goto out;
1672         }
1673
1674         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1675                 sb->st_mode = S_IFREG | k->mode;
1676                 sb->st_nlink = 1;
1677                 sb->st_uid = k->uid;
1678                 sb->st_gid = k->gid;
1679                 sb->st_size = 0;
1680                 free_key(k);
1681                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1682                         ret = -ENOENT;
1683                         goto out;
1684                 }
1685                 ret = 0;
1686         }
1687
1688 out:
1689         free(cgdir);
1690         return ret;
1691 }
1692
1693 int cg_opendir(const char *path, struct fuse_file_info *fi)
1694 {
1695         struct fuse_context *fc = fuse_get_context();
1696         const char *cgroup;
1697         struct file_info *dir_info;
1698         char *controller = NULL;
1699
1700         if (!fc)
1701                 return -EIO;
1702
1703         if (strcmp(path, "/cgroup") == 0) {
1704                 cgroup = NULL;
1705                 controller = NULL;
1706         } else {
1707                 // return list of keys for the controller, and list of child cgroups
1708                 controller = pick_controller_from_path(fc, path);
1709                 if (!controller)
1710                         return -errno;
1711
1712                 cgroup = find_cgroup_in_path(path);
1713                 if (!cgroup) {
1714                         /* this is just /cgroup/controller, return its contents */
1715                         cgroup = "/";
1716                 }
1717         }
1718
1719         pid_t initpid = lookup_initpid_in_store(fc->pid);
1720         if (initpid <= 0)
1721                 initpid = fc->pid;
1722         if (cgroup) {
1723                 if (!caller_may_see_dir(initpid, controller, cgroup))
1724                         return -ENOENT;
1725                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1726                         return -EACCES;
1727         }
1728
1729         /* we'll free this at cg_releasedir */
1730         dir_info = malloc(sizeof(*dir_info));
1731         if (!dir_info)
1732                 return -ENOMEM;
1733         dir_info->controller = must_copy_string(controller);
1734         dir_info->cgroup = must_copy_string(cgroup);
1735         dir_info->type = LXC_TYPE_CGDIR;
1736         dir_info->buf = NULL;
1737         dir_info->file = NULL;
1738         dir_info->buflen = 0;
1739
1740         fi->fh = (unsigned long)dir_info;
1741         return 0;
1742 }
1743
1744 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1745                 struct fuse_file_info *fi)
1746 {
1747         struct file_info *d = (struct file_info *)fi->fh;
1748         struct cgfs_files **list = NULL;
1749         int i, ret;
1750         char *nextcg = NULL;
1751         struct fuse_context *fc = fuse_get_context();
1752         char **clist = NULL;
1753
1754         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1755                 return -EIO;
1756
1757         if (d->type != LXC_TYPE_CGDIR) {
1758                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1759                 return -EIO;
1760         }
1761         if (!d->cgroup && !d->controller) {
1762                 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1763                 int i;
1764
1765                 for (i = 0;  i < num_hierarchies; i++) {
1766                         if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1767                                 return -EIO;
1768                         }
1769                 }
1770                 return 0;
1771         }
1772
1773         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1774                 // not a valid cgroup
1775                 ret = -EINVAL;
1776                 goto out;
1777         }
1778
1779         pid_t initpid = lookup_initpid_in_store(fc->pid);
1780         if (initpid <= 0)
1781                 initpid = fc->pid;
1782         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1783                 if (nextcg) {
1784                         ret = filler(buf, nextcg,  NULL, 0);
1785                         free(nextcg);
1786                         if (ret != 0) {
1787                                 ret = -EIO;
1788                                 goto out;
1789                         }
1790                 }
1791                 ret = 0;
1792                 goto out;
1793         }
1794
1795         for (i = 0; list[i]; i++) {
1796                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1797                         ret = -EIO;
1798                         goto out;
1799                 }
1800         }
1801
1802         // now get the list of child cgroups
1803
1804         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1805                 ret = 0;
1806                 goto out;
1807         }
1808         if (clist) {
1809                 for (i = 0; clist[i]; i++) {
1810                         if (filler(buf, clist[i], NULL, 0) != 0) {
1811                                 ret = -EIO;
1812                                 goto out;
1813                         }
1814                 }
1815         }
1816         ret = 0;
1817
1818 out:
1819         free_keys(list);
1820         if (clist) {
1821                 for (i = 0; clist[i]; i++)
1822                         free(clist[i]);
1823                 free(clist);
1824         }
1825         return ret;
1826 }
1827
1828 static void do_release_file_info(struct fuse_file_info *fi)
1829 {
1830         struct file_info *f = (struct file_info *)fi->fh;
1831
1832         if (!f)
1833                 return;
1834
1835         fi->fh = 0;
1836
1837         free(f->controller);
1838         f->controller = NULL;
1839         free(f->cgroup);
1840         f->cgroup = NULL;
1841         free(f->file);
1842         f->file = NULL;
1843         free(f->buf);
1844         f->buf = NULL;
1845         free(f);
1846 }
1847
/*
 * FUSE releasedir handler for /cgroup directories: frees the
 * struct file_info that cg_opendir() stored in fi->fh, via
 * do_release_file_info().  @path is not used.  Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        do_release_file_info(fi);
        return 0;
}
1853
1854 int cg_open(const char *path, struct fuse_file_info *fi)
1855 {
1856         const char *cgroup;
1857         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1858         struct cgfs_files *k = NULL;
1859         struct file_info *file_info;
1860         struct fuse_context *fc = fuse_get_context();
1861         int ret;
1862
1863         if (!fc)
1864                 return -EIO;
1865
1866         controller = pick_controller_from_path(fc, path);
1867         if (!controller)
1868                 return -errno;
1869         cgroup = find_cgroup_in_path(path);
1870         if (!cgroup)
1871                 return -errno;
1872
1873         get_cgdir_and_path(cgroup, &cgdir, &last);
1874         if (!last) {
1875                 path1 = "/";
1876                 path2 = cgdir;
1877         } else {
1878                 path1 = cgdir;
1879                 path2 = last;
1880         }
1881
1882         k = cgfs_get_key(controller, path1, path2);
1883         if (!k) {
1884                 ret = -EINVAL;
1885                 goto out;
1886         }
1887         free_key(k);
1888
1889         pid_t initpid = lookup_initpid_in_store(fc->pid);
1890         if (initpid <= 0)
1891                 initpid = fc->pid;
1892         if (!caller_may_see_dir(initpid, controller, path1)) {
1893                 ret = -ENOENT;
1894                 goto out;
1895         }
1896         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1897                 ret = -EACCES;
1898                 goto out;
1899         }
1900
1901         /* we'll free this at cg_release */
1902         file_info = malloc(sizeof(*file_info));
1903         if (!file_info) {
1904                 ret = -ENOMEM;
1905                 goto out;
1906         }
1907         file_info->controller = must_copy_string(controller);
1908         file_info->cgroup = must_copy_string(path1);
1909         file_info->file = must_copy_string(path2);
1910         file_info->type = LXC_TYPE_CGFILE;
1911         file_info->buf = NULL;
1912         file_info->buflen = 0;
1913
1914         fi->fh = (unsigned long)file_info;
1915         ret = 0;
1916
1917 out:
1918         free(cgdir);
1919         return ret;
1920 }
1921
1922 int cg_access(const char *path, int mode)
1923 {
1924         int ret;
1925         const char *cgroup;
1926         char *path1, *path2, *controller;
1927         char *last = NULL, *cgdir = NULL;
1928         struct cgfs_files *k = NULL;
1929         struct fuse_context *fc = fuse_get_context();
1930
1931         if (strcmp(path, "/cgroup") == 0)
1932                 return 0;
1933
1934         if (!fc)
1935                 return -EIO;
1936
1937         controller = pick_controller_from_path(fc, path);
1938         if (!controller)
1939                 return -errno;
1940         cgroup = find_cgroup_in_path(path);
1941         if (!cgroup) {
1942                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
1943                 if ((mode & W_OK) == 0)
1944                         return 0;
1945                 return -EACCES;
1946         }
1947
1948         get_cgdir_and_path(cgroup, &cgdir, &last);
1949         if (!last) {
1950                 path1 = "/";
1951                 path2 = cgdir;
1952         } else {
1953                 path1 = cgdir;
1954                 path2 = last;
1955         }
1956
1957         k = cgfs_get_key(controller, path1, path2);
1958         if (!k) {
1959                 if ((mode & W_OK) == 0)
1960                         ret = 0;
1961                 else
1962                         ret = -EACCES;
1963                 goto out;
1964         }
1965         free_key(k);
1966
1967         pid_t initpid = lookup_initpid_in_store(fc->pid);
1968         if (initpid <= 0)
1969                 initpid = fc->pid;
1970         if (!caller_may_see_dir(initpid, controller, path1)) {
1971                 ret = -ENOENT;
1972                 goto out;
1973         }
1974         if (!fc_may_access(fc, controller, path1, path2, mode)) {
1975                 ret = -EACCES;
1976                 goto out;
1977         }
1978
1979         ret = 0;
1980
1981 out:
1982         free(cgdir);
1983         return ret;
1984 }
1985
/*
 * cg_release - FUSE release handler for cgroup files.
 *
 * Frees whatever do_release_file_info() finds in fi->fh. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;	/* only the handle stored in @fi is needed */

	do_release_file_info(fi);

	return 0;
}
1991
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * wait_for_sock - wait until @sock has one of the POLLIN_SET events pending.
 *
 * @sock:    file descriptor to watch
 * @timeout: how long to wait, in seconds
 *
 * Returns true when epoll_wait() reported an event, false on error or
 * timeout. When the deadline has already passed before waiting, errno is
 * set to 0; on the other failure paths errno is left at the value of the
 * failing call so callers can report it.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/* Logging may clobber errno; keep it for the message and the caller. */
		saved_errno = errno;
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(saved_errno));
		errno = saved_errno;
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		saved_errno = errno;
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(saved_errno));
		close(epfd);
		errno = saved_errno;
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	/* Recompute the remaining time on every (re)try after EINTR. */
	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;

	/* close() must not hide epoll_wait()'s errno from the caller. */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2039
/*
 * msgrecv - receive from @sockfd once it becomes readable.
 *
 * Waits up to 2 seconds via wait_for_sock(); returns -1 if that fails,
 * otherwise the result of a non-blocking recv() of at most @len bytes
 * into @buf.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (wait_for_sock(sockfd, 2))
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
2046
2047 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2048 {
2049         struct msghdr msg = { 0 };
2050         struct iovec iov;
2051         struct cmsghdr *cmsg;
2052         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2053         char buf[1];
2054         buf[0] = 'p';
2055
2056         if (pingfirst) {
2057                 if (msgrecv(sock, buf, 1) != 1) {
2058                         lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2059                         return SEND_CREDS_FAIL;
2060                 }
2061         }
2062
2063         msg.msg_control = cmsgbuf;
2064         msg.msg_controllen = sizeof(cmsgbuf);
2065
2066         cmsg = CMSG_FIRSTHDR(&msg);
2067         cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2068         cmsg->cmsg_level = SOL_SOCKET;
2069         cmsg->cmsg_type = SCM_CREDENTIALS;
2070         memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2071
2072         msg.msg_name = NULL;
2073         msg.msg_namelen = 0;
2074
2075         buf[0] = v;
2076         iov.iov_base = buf;
2077         iov.iov_len = sizeof(buf);
2078         msg.msg_iov = &iov;
2079         msg.msg_iovlen = 1;
2080
2081         if (sendmsg(sock, &msg, 0) < 0) {
2082                 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2083                 if (errno == 3)
2084                         return SEND_CREDS_NOTSK;
2085                 return SEND_CREDS_FAIL;
2086         }
2087
2088         return SEND_CREDS_OK;
2089 }
2090
2091 static bool recv_creds(int sock, struct ucred *cred, char *v)
2092 {
2093         struct msghdr msg = { 0 };
2094         struct iovec iov;
2095         struct cmsghdr *cmsg;
2096         char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2097         char buf[1];
2098         int ret;
2099         int optval = 1;
2100
2101         *v = '1';
2102
2103         cred->pid = -1;
2104         cred->uid = -1;
2105         cred->gid = -1;
2106
2107         if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
2108                 lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
2109                 return false;
2110         }
2111         buf[0] = '1';
2112         if (write(sock, buf, 1) != 1) {
2113                 lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
2114                 return false;
2115         }
2116
2117         msg.msg_name = NULL;
2118         msg.msg_namelen = 0;
2119         msg.msg_control = cmsgbuf;
2120         msg.msg_controllen = sizeof(cmsgbuf);
2121
2122         iov.iov_base = buf;
2123         iov.iov_len = sizeof(buf);
2124         msg.msg_iov = &iov;
2125         msg.msg_iovlen = 1;
2126
2127         if (!wait_for_sock(sock, 2)) {
2128                 lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
2129                 return false;
2130         }
2131         ret = recvmsg(sock, &msg, MSG_DONTWAIT);
2132         if (ret < 0) {
2133                 lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
2134                 return false;
2135         }
2136
2137         cmsg = CMSG_FIRSTHDR(&msg);
2138
2139         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
2140                         cmsg->cmsg_level == SOL_SOCKET &&
2141                         cmsg->cmsg_type == SCM_CREDENTIALS) {
2142                 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
2143         }
2144         *v = buf[0];
2145
2146         return true;
2147 }
2148
/* Argument bundle handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* Pipe pair: the wrapper closes [0] and writes its ack byte to [1]. */
	int *cpipe;
	/* Forwarded as first argument to @wrapped. */
	int sock;
	/* Forwarded as second argument to @wrapped. */
	pid_t tpid;
	/* Worker to run after the ack: pid_from_ns or pid_to_ns. */
	int (*wrapped)(int sock, pid_t tpid);
};
2155
2156 /*
2157  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2158  * with clone(). This simply writes '1' as ACK back to the parent
2159  * before calling the actual wrapped function.
2160  */
2161 static int pid_ns_clone_wrapper(void *arg) {
2162         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2163         char b = '1';
2164
2165         close(args->cpipe[0]);
2166         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2167                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2168         close(args->cpipe[1]);
2169         return args->wrapped(args->sock, args->tpid);
2170 }
2171
2172 /*
2173  * pid_to_ns - reads pids from a ucred over a socket, then writes the
2174  * int value back over the socket.  This shifts the pid from the
2175  * sender's pidns into tpid's pidns.
2176  */
2177 static int pid_to_ns(int sock, pid_t tpid)
2178 {
2179         char v = '0';
2180         struct ucred cred;
2181
2182         while (recv_creds(sock, &cred, &v)) {
2183                 if (v == '1')
2184                         return 0;
2185                 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2186                         return 1;
2187         }
2188         return 0;
2189 }
2190
2191
2192 /*
2193  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2194  * in your old pidns.  Only children which you clone will be in the target
2195  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
2196  * actually convert pids.
2197  *
2198  * Note: glibc's fork() does not respect pidns, which can lead to failed
2199  * assertions inside glibc (and thus failed forks) if the child's pid in
2200  * the pidns and the parent pid outside are identical. Using clone prevents
2201  * this issue.
2202  */
2203 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2204 {
2205         int newnsfd = -1, ret, cpipe[2];
2206         char fnam[100];
2207         pid_t cpid;
2208         char v;
2209
2210         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2211         if (ret < 0 || ret >= sizeof(fnam))
2212                 _exit(1);
2213         newnsfd = open(fnam, O_RDONLY);
2214         if (newnsfd < 0)
2215                 _exit(1);
2216         if (setns(newnsfd, 0) < 0)
2217                 _exit(1);
2218         close(newnsfd);
2219
2220         if (pipe(cpipe) < 0)
2221                 _exit(1);
2222
2223         struct pid_ns_clone_args args = {
2224                 .cpipe = cpipe,
2225                 .sock = sock,
2226                 .tpid = tpid,
2227                 .wrapped = &pid_to_ns
2228         };
2229         size_t stack_size = sysconf(_SC_PAGESIZE);
2230         void *stack = alloca(stack_size);
2231
2232         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2233         if (cpid < 0)
2234                 _exit(1);
2235
2236         // give the child 1 second to be done forking and
2237         // write its ack
2238         if (!wait_for_sock(cpipe[0], 1))
2239                 _exit(1);
2240         ret = read(cpipe[0], &v, 1);
2241         if (ret != sizeof(char) || v != '1')
2242                 _exit(1);
2243
2244         if (!wait_for_pid(cpid))
2245                 _exit(1);
2246         _exit(0);
2247 }
2248
2249 /*
2250  * To read cgroup files with a particular pid, we will setns into the child
2251  * pidns, open a pipe, fork a child - which will be the first to really be in
2252  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2253  */
2254 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2255 {
2256         int sock[2] = {-1, -1};
2257         char *tmpdata = NULL;
2258         int ret;
2259         pid_t qpid, cpid = -1;
2260         bool answer = false;
2261         char v = '0';
2262         struct ucred cred;
2263         size_t sz = 0, asz = 0;
2264
2265         if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2266                 return false;
2267
2268         /*
2269          * Now we read the pids from returned data one by one, pass
2270          * them into a child in the target namespace, read back the
2271          * translated pids, and put them into our to-return data
2272          */
2273
2274         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2275                 perror("socketpair");
2276                 free(tmpdata);
2277                 return false;
2278         }
2279
2280         cpid = fork();
2281         if (cpid == -1)
2282                 goto out;
2283
2284         if (!cpid) // child - exits when done
2285                 pid_to_ns_wrapper(sock[1], tpid);
2286
2287         char *ptr = tmpdata;
2288         cred.uid = 0;
2289         cred.gid = 0;
2290         while (sscanf(ptr, "%d\n", &qpid) == 1) {
2291                 cred.pid = qpid;
2292                 ret = send_creds(sock[0], &cred, v, true);
2293
2294                 if (ret == SEND_CREDS_NOTSK)
2295                         goto next;
2296                 if (ret == SEND_CREDS_FAIL)
2297                         goto out;
2298
2299                 // read converted results
2300                 if (!wait_for_sock(sock[0], 2)) {
2301                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2302                         goto out;
2303                 }
2304                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2305                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2306                         goto out;
2307                 }
2308                 must_strcat_pid(d, &sz, &asz, qpid);
2309 next:
2310                 ptr = strchr(ptr, '\n');
2311                 if (!ptr)
2312                         break;
2313                 ptr++;
2314         }
2315
2316         cred.pid = getpid();
2317         v = '1';
2318         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2319                 // failed to ask child to exit
2320                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2321                 goto out;
2322         }
2323
2324         answer = true;
2325
2326 out:
2327         free(tmpdata);
2328         if (cpid != -1)
2329                 wait_for_pid(cpid);
2330         if (sock[0] != -1) {
2331                 close(sock[0]);
2332                 close(sock[1]);
2333         }
2334         return answer;
2335 }
2336
2337 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2338                 struct fuse_file_info *fi)
2339 {
2340         struct fuse_context *fc = fuse_get_context();
2341         struct file_info *f = (struct file_info *)fi->fh;
2342         struct cgfs_files *k = NULL;
2343         char *data = NULL;
2344         int ret, s;
2345         bool r;
2346
2347         if (f->type != LXC_TYPE_CGFILE) {
2348                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2349                 return -EIO;
2350         }
2351
2352         if (offset)
2353                 return 0;
2354
2355         if (!fc)
2356                 return -EIO;
2357
2358         if (!f->controller)
2359                 return -EINVAL;
2360
2361         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2362                 return -EINVAL;
2363         }
2364         free_key(k);
2365
2366
2367         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2368                 ret = -EACCES;
2369                 goto out;
2370         }
2371
2372         if (strcmp(f->file, "tasks") == 0 ||
2373                         strcmp(f->file, "/tasks") == 0 ||
2374                         strcmp(f->file, "/cgroup.procs") == 0 ||
2375                         strcmp(f->file, "cgroup.procs") == 0)
2376                 // special case - we have to translate the pids
2377                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2378         else
2379                 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2380
2381         if (!r) {
2382                 ret = -EINVAL;
2383                 goto out;
2384         }
2385
2386         if (!data) {
2387                 ret = 0;
2388                 goto out;
2389         }
2390         s = strlen(data);
2391         if (s > size)
2392                 s = size;
2393         memcpy(buf, data, s);
2394         if (s > 0 && s < size && data[s-1] != '\n')
2395                 buf[s++] = '\n';
2396
2397         ret = s;
2398
2399 out:
2400         free(data);
2401         return ret;
2402 }
2403
2404 static int pid_from_ns(int sock, pid_t tpid)
2405 {
2406         pid_t vpid;
2407         struct ucred cred;
2408         char v;
2409         int ret;
2410
2411         cred.uid = 0;
2412         cred.gid = 0;
2413         while (1) {
2414                 if (!wait_for_sock(sock, 2)) {
2415                         lxcfs_error("%s\n", "Timeout reading from parent.");
2416                         return 1;
2417                 }
2418                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2419                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2420                         return 1;
2421                 }
2422                 if (vpid == -1) // done
2423                         break;
2424                 v = '0';
2425                 cred.pid = vpid;
2426                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2427                         v = '1';
2428                         cred.pid = getpid();
2429                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2430                                 return 1;
2431                 }
2432         }
2433         return 0;
2434 }
2435
2436 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2437 {
2438         int newnsfd = -1, ret, cpipe[2];
2439         char fnam[100];
2440         pid_t cpid;
2441         char v;
2442
2443         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2444         if (ret < 0 || ret >= sizeof(fnam))
2445                 _exit(1);
2446         newnsfd = open(fnam, O_RDONLY);
2447         if (newnsfd < 0)
2448                 _exit(1);
2449         if (setns(newnsfd, 0) < 0)
2450                 _exit(1);
2451         close(newnsfd);
2452
2453         if (pipe(cpipe) < 0)
2454                 _exit(1);
2455
2456         struct pid_ns_clone_args args = {
2457                 .cpipe = cpipe,
2458                 .sock = sock,
2459                 .tpid = tpid,
2460                 .wrapped = &pid_from_ns
2461         };
2462         size_t stack_size = sysconf(_SC_PAGESIZE);
2463         void *stack = alloca(stack_size);
2464
2465         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2466         if (cpid < 0)
2467                 _exit(1);
2468
2469         // give the child 1 second to be done forking and
2470         // write its ack
2471         if (!wait_for_sock(cpipe[0], 1))
2472                 _exit(1);
2473         ret = read(cpipe[0], &v, 1);
2474         if (ret != sizeof(char) || v != '1')
2475                 _exit(1);
2476
2477         if (!wait_for_pid(cpid))
2478                 _exit(1);
2479         _exit(0);
2480 }
2481
/*
 * hostuid_to_ns - map host @uid through /proc/@pid/uid_map.
 *
 * The result of convert_id_to_ns() on that file is stored in *@answer.
 *
 * Returns true when a mapping was stored, false when the uid_map file
 * cannot be opened or the conversion yields -1 (in which case *@answer
 * holds that -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);

	map = fopen(map_path, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
2503
/*
 * get_pid_creds - look up uid and gid of @pid in /proc/@pid/status.
 *
 * *@uid and *@gid receive the first id listed on the "Uid:" and "Gid:"
 * lines (the real ids, per the original author's note; XXX should we use
 * euid here?). Both are preset to -1 and keep that value when the status
 * file cannot be opened or the corresponding line is missing. Parsing
 * stops at the first malformed Uid:/Gid: line, leaving later values at
 * whatever was set so far.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;

	/* @line first holds the path, then is reused as the read buffer. */
	snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				/* pid_t is signed: print it with %d, not %u. */
				lxcfs_error("bad uid line for pid %d\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %d\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2542
/*
 * may_move_pid - may requestor @r (host uid @r_uid) move victim @v to a
 * new cgroup?
 *
 * Allowed when
 *   . requestor and victim are the same task,
 *   . the requestor is uid 0 on the host,
 *   . both are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's uid_map and the victim's uid is mapped
 *     there as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* The requestor must be root inside its own namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped into that namespace. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
2568
2569 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2570                 const char *file, const char *buf)
2571 {
2572         int sock[2] = {-1, -1};
2573         pid_t qpid, cpid = -1;
2574         FILE *pids_file = NULL;
2575         bool answer = false, fail = false;
2576
2577         pids_file = open_pids_file(contrl, cg);
2578         if (!pids_file)
2579                 return false;
2580
2581         /*
2582          * write the pids to a socket, have helper in writer's pidns
2583          * call movepid for us
2584          */
2585         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2586                 perror("socketpair");
2587                 goto out;
2588         }
2589
2590         cpid = fork();
2591         if (cpid == -1)
2592                 goto out;
2593
2594         if (!cpid) { // child
2595                 fclose(pids_file);
2596                 pid_from_ns_wrapper(sock[1], tpid);
2597         }
2598
2599         const char *ptr = buf;
2600         while (sscanf(ptr, "%d", &qpid) == 1) {
2601                 struct ucred cred;
2602                 char v;
2603
2604                 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2605                         lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2606                         goto out;
2607                 }
2608
2609                 if (recv_creds(sock[0], &cred, &v)) {
2610                         if (v == '0') {
2611                                 if (!may_move_pid(tpid, tuid, cred.pid)) {
2612                                         fail = true;
2613                                         break;
2614                                 }
2615                                 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2616                                         fail = true;
2617                         }
2618                 }
2619
2620                 ptr = strchr(ptr, '\n');
2621                 if (!ptr)
2622                         break;
2623                 ptr++;
2624         }
2625
2626         /* All good, write the value */
2627         qpid = -1;
2628         if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2629                 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2630
2631         if (!fail)
2632                 answer = true;
2633
2634 out:
2635         if (cpid != -1)
2636                 wait_for_pid(cpid);
2637         if (sock[0] != -1) {
2638                 close(sock[0]);
2639                 close(sock[1]);
2640         }
2641         if (pids_file) {
2642                 if (fclose(pids_file) != 0)
2643                         answer = false;
2644         }
2645         return answer;
2646 }
2647
2648 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2649              struct fuse_file_info *fi)
2650 {
2651         struct fuse_context *fc = fuse_get_context();
2652         char *localbuf = NULL;
2653         struct cgfs_files *k = NULL;
2654         struct file_info *f = (struct file_info *)fi->fh;
2655         bool r;
2656
2657         if (f->type != LXC_TYPE_CGFILE) {
2658                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2659                 return -EIO;
2660         }
2661
2662         if (offset)
2663                 return 0;
2664
2665         if (!fc)
2666                 return -EIO;
2667
2668         localbuf = alloca(size+1);
2669         localbuf[size] = '\0';
2670         memcpy(localbuf, buf, size);
2671
2672         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2673                 size = -EINVAL;
2674                 goto out;
2675         }
2676
2677         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2678                 size = -EACCES;
2679                 goto out;
2680         }
2681
2682         if (strcmp(f->file, "tasks") == 0 ||
2683                         strcmp(f->file, "/tasks") == 0 ||
2684                         strcmp(f->file, "/cgroup.procs") == 0 ||
2685                         strcmp(f->file, "cgroup.procs") == 0)
2686                 // special case - we have to translate the pids
2687                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2688         else
2689                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2690
2691         if (!r)
2692                 size = -EINVAL;
2693
2694 out:
2695         free_key(k);
2696         return size;
2697 }
2698
2699 int cg_chown(const char *path, uid_t uid, gid_t gid)
2700 {
2701         struct fuse_context *fc = fuse_get_context();
2702         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2703         struct cgfs_files *k = NULL;
2704         const char *cgroup;
2705         int ret;
2706
2707         if (!fc)
2708                 return -EIO;
2709
2710         if (strcmp(path, "/cgroup") == 0)
2711                 return -EPERM;
2712
2713         controller = pick_controller_from_path(fc, path);
2714         if (!controller)
2715                 return errno == ENOENT ? -EPERM : -errno;
2716
2717         cgroup = find_cgroup_in_path(path);
2718         if (!cgroup)
2719                 /* this is just /cgroup/controller */
2720                 return -EPERM;
2721
2722         get_cgdir_and_path(cgroup, &cgdir, &last);
2723
2724         if (!last) {
2725                 path1 = "/";
2726                 path2 = cgdir;
2727         } else {
2728                 path1 = cgdir;
2729                 path2 = last;
2730         }
2731
2732         if (is_child_cgroup(controller, path1, path2)) {
2733                 // get uid, gid, from '/tasks' file and make up a mode
2734                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2735                 k = cgfs_get_key(controller, cgroup, "tasks");
2736
2737         } else
2738                 k = cgfs_get_key(controller, path1, path2);
2739
2740         if (!k) {
2741                 ret = -EINVAL;
2742                 goto out;
2743         }
2744
2745         /*
2746          * This being a fuse request, the uid and gid must be valid
2747          * in the caller's namespace.  So we can just check to make
2748          * sure that the caller is root in his uid, and privileged
2749          * over the file's current owner.
2750          */
2751         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2752                 ret = -EACCES;
2753                 goto out;
2754         }
2755
2756         ret = cgfs_chown_file(controller, cgroup, uid, gid);
2757
2758 out:
2759         free_key(k);
2760         free(cgdir);
2761
2762         return ret;
2763 }
2764
2765 int cg_chmod(const char *path, mode_t mode)
2766 {
2767         struct fuse_context *fc = fuse_get_context();
2768         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2769         struct cgfs_files *k = NULL;
2770         const char *cgroup;
2771         int ret;
2772
2773         if (!fc)
2774                 return -EIO;
2775
2776         if (strcmp(path, "/cgroup") == 0)
2777                 return -EPERM;
2778
2779         controller = pick_controller_from_path(fc, path);
2780         if (!controller)
2781                 return errno == ENOENT ? -EPERM : -errno;
2782
2783         cgroup = find_cgroup_in_path(path);
2784         if (!cgroup)
2785                 /* this is just /cgroup/controller */
2786                 return -EPERM;
2787
2788         get_cgdir_and_path(cgroup, &cgdir, &last);
2789
2790         if (!last) {
2791                 path1 = "/";
2792                 path2 = cgdir;
2793         } else {
2794                 path1 = cgdir;
2795                 path2 = last;
2796         }
2797
2798         if (is_child_cgroup(controller, path1, path2)) {
2799                 // get uid, gid, from '/tasks' file and make up a mode
2800                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2801                 k = cgfs_get_key(controller, cgroup, "tasks");
2802
2803         } else
2804                 k = cgfs_get_key(controller, path1, path2);
2805
2806         if (!k) {
2807                 ret = -EINVAL;
2808                 goto out;
2809         }
2810
2811         /*
2812          * This being a fuse request, the uid and gid must be valid
2813          * in the caller's namespace.  So we can just check to make
2814          * sure that the caller is root in his uid, and privileged
2815          * over the file's current owner.
2816          */
2817         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2818                 ret = -EPERM;
2819                 goto out;
2820         }
2821
2822         if (!cgfs_chmod_file(controller, cgroup, mode)) {
2823                 ret = -EINVAL;
2824                 goto out;
2825         }
2826
2827         ret = 0;
2828 out:
2829         free_key(k);
2830         free(cgdir);
2831         return ret;
2832 }
2833
2834 int cg_mkdir(const char *path, mode_t mode)
2835 {
2836         struct fuse_context *fc = fuse_get_context();
2837         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2838         const char *cgroup;
2839         int ret;
2840
2841         if (!fc)
2842                 return -EIO;
2843
2844         controller = pick_controller_from_path(fc, path);
2845         if (!controller)
2846                 return errno == ENOENT ? -EPERM : -errno;
2847
2848         cgroup = find_cgroup_in_path(path);
2849         if (!cgroup)
2850                 return -errno;
2851
2852         get_cgdir_and_path(cgroup, &cgdir, &last);
2853         if (!last)
2854                 path1 = "/";
2855         else
2856                 path1 = cgdir;
2857
2858         pid_t initpid = lookup_initpid_in_store(fc->pid);
2859         if (initpid <= 0)
2860                 initpid = fc->pid;
2861         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2862                 if (!next)
2863                         ret = -EINVAL;
2864                 else if (last && strcmp(next, last) == 0)
2865                         ret = -EEXIST;
2866                 else
2867                         ret = -EPERM;
2868                 goto out;
2869         }
2870
2871         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2872                 ret = -EACCES;
2873                 goto out;
2874         }
2875         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2876                 ret = -EACCES;
2877                 goto out;
2878         }
2879
2880         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2881
2882 out:
2883         free(cgdir);
2884         free(next);
2885         return ret;
2886 }
2887
2888 int cg_rmdir(const char *path)
2889 {
2890         struct fuse_context *fc = fuse_get_context();
2891         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2892         const char *cgroup;
2893         int ret;
2894
2895         if (!fc)
2896                 return -EIO;
2897
2898         controller = pick_controller_from_path(fc, path);
2899         if (!controller) /* Someone's trying to delete "/cgroup". */
2900                 return -EPERM;
2901
2902         cgroup = find_cgroup_in_path(path);
2903         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
2904                 return -EPERM;
2905
2906         get_cgdir_and_path(cgroup, &cgdir, &last);
2907         if (!last) {
2908                 /* Someone's trying to delete a cgroup on the same level as the
2909                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
2910                  * rmdir "/cgroup/blkio/init.slice".
2911                  */
2912                 ret = -EPERM;
2913                 goto out;
2914         }
2915
2916         pid_t initpid = lookup_initpid_in_store(fc->pid);
2917         if (initpid <= 0)
2918                 initpid = fc->pid;
2919         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2920                 if (!last || (next && (strcmp(next, last) == 0)))
2921                         ret = -EBUSY;
2922                 else
2923                         ret = -ENOENT;
2924                 goto out;
2925         }
2926
2927         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2928                 ret = -EACCES;
2929                 goto out;
2930         }
2931         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2932                 ret = -EACCES;
2933                 goto out;
2934         }
2935
2936         if (!cgfs_remove(controller, cgroup)) {
2937                 ret = -EINVAL;
2938                 goto out;
2939         }
2940
2941         ret = 0;
2942
2943 out:
2944         free(cgdir);
2945         free(next);
2946         return ret;
2947 }
2948
/*
 * Report whether @line begins with the prefix @pref.
 * An empty @pref matches every @line.
 */
static bool startswith(const char *line, const char *pref)
{
        return strncmp(line, pref, strlen(pref)) == 0;
}
2955
/*
 * Scan the newline-separated text in @memstat and pick out the "total_*"
 * counters listed below. Each value found is parsed as an unsigned long and
 * divided by 1024 before being stored through the matching output pointer.
 * Output pointers whose key never appears are left untouched.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
                unsigned long *active_anon, unsigned long *inactive_anon,
                unsigned long *active_file, unsigned long *inactive_file,
                unsigned long *unevictable)
{
        const struct {
                const char *key;
                unsigned long *dest;
        } fields[] = {
                { "total_cache",         cached        },
                { "total_active_anon",   active_anon   },
                { "total_inactive_anon", inactive_anon },
                { "total_active_file",   active_file   },
                { "total_inactive_file", inactive_file },
                { "total_unevictable",   unevictable   },
        };
        const size_t nfields = sizeof(fields) / sizeof(fields[0]);
        char *cursor = memstat;

        while (*cursor) {
                char *nl;
                size_t i;

                /* At most one key is handled per line: the first that matches. */
                for (i = 0; i < nfields; i++) {
                        if (!startswith(cursor, fields[i].key))
                                continue;
                        sscanf(cursor + strlen(fields[i].key), "%lu",
                               fields[i].dest);
                        *fields[i].dest /= 1024;
                        break;
                }

                /* Advance to the next line; stop when there is none. */
                nl = strchr(cursor, '\n');
                if (!nl)
                        return;
                cursor = nl + 1;
        }
}
2989
/*
 * Look up the line of @str that starts with "<major>:<minor> <iotype>" and
 * store the unsigned number following that key in *@v.
 *
 * *@v is set to 0 first, so it stays 0 when no line matches. Only the first
 * matching line is considered.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
        char key[32] = { 0 };
        char *cursor = str;
        size_t keylen;

        snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
        keylen = strlen(key);

        *v = 0;

        while (*cursor) {
                char *nl;

                if (startswith(cursor, key)) {
                        sscanf(cursor + keylen, "%lu", v);
                        return;
                }

                nl = strchr(cursor, '\n');
                if (!nl)
                        return;
                cursor = nl + 1;
        }
}
3012
3013 static int read_file(const char *path, char *buf, size_t size,
3014                      struct file_info *d)
3015 {
3016         size_t linelen = 0, total_len = 0, rv = 0;
3017         char *line = NULL;
3018         char *cache = d->buf;
3019         size_t cache_size = d->buflen;
3020         FILE *f = fopen(path, "r");
3021         if (!f)
3022                 return 0;
3023
3024         while (getline(&line, &linelen, f) != -1) {
3025                 ssize_t l = snprintf(cache, cache_size, "%s", line);
3026                 if (l < 0) {
3027                         perror("Error writing to cache");
3028                         rv = 0;
3029                         goto err;
3030                 }
3031                 if (l >= cache_size) {
3032                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3033                         rv = 0;
3034                         goto err;
3035                 }
3036                 cache += l;
3037                 cache_size -= l;
3038                 total_len += l;
3039         }
3040
3041         d->size = total_len;
3042         if (total_len > size)
3043                 total_len = size;
3044
3045         /* read from off 0 */
3046         memcpy(buf, d->buf, total_len);
3047         rv = total_len;
3048   err:
3049         fclose(f);
3050         free(line);
3051         return rv;
3052 }
3053
3054 /*
3055  * FUSE ops for /proc
3056  */
3057
/*
 * Read @file of the "memory" controller for @cgroup and parse it as a
 * base-10 unsigned long.
 *
 * Returns the parsed value, or (unsigned long)-1 when the value cannot be
 * fetched.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
        char *raw = NULL;
        unsigned long limit;

        if (cgfs_get_value("memory", cgroup, file, &raw))
                limit = strtoul(raw, NULL, 10);
        else
                limit = (unsigned long)-1;

        free(raw);
        return limit;
}
3070
/*
 * Return the smallest value of @file found in @cgroup or any of its
 * ancestors, walking up with dirname() until "/" has been queried.
 *
 * Ancestors for which get_memlimit() reports failure ((unsigned long)-1)
 * are ignored. If the lookup for @cgroup itself fails and no ancestor
 * provides a value, (unsigned long)-1 is returned.
 *
 * The walk also stops when dirname() yields ".", which is what it converges
 * to for a relative or empty path; without that guard such input would
 * never reach "/" and the loop would not terminate.
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
        /* dirname() may modify its argument, so work on a stack copy. */
        char *copy = strdupa(cgroup);
        unsigned long memlimit, retlimit;

        retlimit = get_memlimit(copy, file);

        while (strcmp(copy, "/") != 0) {
                copy = dirname(copy);
                if (strcmp(copy, ".") == 0)
                        break;

                memlimit = get_memlimit(copy, file);
                if (memlimit != (unsigned long)-1 && memlimit < retlimit)
                        retlimit = memlimit;
        }

        return retlimit;
}
3087
3088 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3089                 struct fuse_file_info *fi)
3090 {
3091         struct fuse_context *fc = fuse_get_context();
3092         struct file_info *d = (struct file_info *)fi->fh;
3093         char *cg;
3094         char *memusage_str = NULL, *memstat_str = NULL,
3095                 *memswlimit_str = NULL, *memswusage_str = NULL;
3096         unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3097                 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3098                 active_file = 0, inactive_file = 0, unevictable = 0,
3099                 hostswtotal = 0;
3100         char *line = NULL;
3101         size_t linelen = 0, total_len = 0, rv = 0;
3102         char *cache = d->buf;
3103         size_t cache_size = d->buflen;
3104         FILE *f = NULL;
3105
3106         if (offset){
3107                 if (offset > d->size)
3108                         return -EINVAL;
3109                 if (!d->cached)
3110                         return 0;
3111                 int left = d->size - offset;
3112                 total_len = left > size ? size: left;
3113                 memcpy(buf, cache + offset, total_len);
3114                 return total_len;
3115         }
3116
3117         pid_t initpid = lookup_initpid_in_store(fc->pid);
3118         if (initpid <= 0)
3119                 initpid = fc->pid;
3120         cg = get_pid_cgroup(initpid, "memory");
3121         if (!cg)
3122                 return read_file("/proc/meminfo", buf, size, d);
3123         prune_init_slice(cg);
3124
3125         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
3126         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3127                 goto err;
3128         if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3129                 goto err;
3130
3131         // Following values are allowed to fail, because swapaccount might be turned
3132         // off for current kernel
3133         if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3134                 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3135         {
3136                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
3137                 memswusage = strtoul(memswusage_str, NULL, 10);
3138
3139                 memswlimit = memswlimit / 1024;
3140                 memswusage = memswusage / 1024;
3141         }
3142
3143         memusage = strtoul(memusage_str, NULL, 10);
3144         memlimit /= 1024;
3145         memusage /= 1024;
3146
3147         parse_memstat(memstat_str, &cached, &active_anon,
3148                         &inactive_anon, &active_file, &inactive_file,
3149                         &unevictable);
3150
3151         f = fopen("/proc/meminfo", "r");
3152         if (!f)
3153                 goto err;
3154
3155         while (getline(&line, &linelen, f) != -1) {
3156                 ssize_t l;
3157                 char *printme, lbuf[100];
3158
3159                 memset(lbuf, 0, 100);
3160                 if (startswith(line, "MemTotal:")) {
3161                         sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
3162                         if (hosttotal < memlimit)
3163                                 memlimit = hosttotal;
3164                         snprintf(lbuf, 100, "MemTotal:       %8lu kB\n", memlimit);
3165                         printme = lbuf;
3166                 } else if (startswith(line, "MemFree:")) {
3167                         snprintf(lbuf, 100, "MemFree:        %8lu kB\n", memlimit - memusage);
3168                         printme = lbuf;
3169                 } else if (startswith(line, "MemAvailable:")) {
3170                         snprintf(lbuf, 100, "MemAvailable:   %8lu kB\n", memlimit - memusage + cached);
3171                         printme = lbuf;
3172                 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3173                         sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
3174                         if (hostswtotal < memswlimit)
3175                                 memswlimit = hostswtotal;
3176                         snprintf(lbuf, 100, "SwapTotal:      %8lu kB\n", memswlimit);
3177                         printme = lbuf;
3178                 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3179                         unsigned long swaptotal = memswlimit,
3180                                         swapusage = memswusage - memusage,
3181                                         swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3182                         snprintf(lbuf, 100, "SwapFree:       %8lu kB\n", swapfree);
3183                         printme = lbuf;
3184                 } else if (startswith(line, "Slab:")) {
3185                         snprintf(lbuf, 100, "Slab:        %8lu kB\n", 0UL);
3186                         printme = lbuf;
3187                 } else if (startswith(line, "Buffers:")) {
3188                         snprintf(lbuf, 100, "Buffers:        %8lu kB\n", 0UL);
3189                         printme = lbuf;
3190                 } else if (startswith(line, "Cached:")) {
3191                         snprintf(lbuf, 100, "Cached:         %8lu kB\n", cached);
3192                         printme = lbuf;
3193                 } else if (startswith(line, "SwapCached:")) {
3194                         snprintf(lbuf, 100, "SwapCached:     %8lu kB\n", 0UL);
3195                         printme = lbuf;
3196                 } else if (startswith(line, "Active:")) {
3197                         snprintf(lbuf, 100, "Active:         %8lu kB\n",
3198                                         active_anon + active_file);
3199                         printme = lbuf;
3200                 } else if (startswith(line, "Inactive:")) {
3201                         snprintf(lbuf, 100, "Inactive:       %8lu kB\n",
3202                                         inactive_anon + inactive_file);
3203                         printme = lbuf;
3204                 } else if (startswith(line, "Active(anon)")) {
3205                         snprintf(lbuf, 100, "Active(anon):   %8lu kB\n", active_anon);
3206                         printme = lbuf;
3207                 } else if (startswith(line, "Inactive(anon)")) {
3208                         snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3209                         printme = lbuf;
3210                 } else if (startswith(line, "Active(file)")) {
3211                         snprintf(lbuf, 100, "Active(file):   %8lu kB\n", active_file);
3212                         printme = lbuf;
3213                 } else if (startswith(line, "Inactive(file)")) {
3214                         snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3215                         printme = lbuf;
3216                 } else if (startswith(line, "Unevictable")) {
3217                         snprintf(lbuf, 100, "Unevictable:    %8lu kB\n", unevictable);
3218                         printme = lbuf;
3219                 } else if (startswith(line, "SReclaimable")) {
3220                         snprintf(lbuf, 100, "SReclaimable:   %8lu kB\n", 0UL);
3221                         printme = lbuf;
3222                 } else if (startswith(line, "SUnreclaim")) {
3223                         snprintf(lbuf, 100, "SUnreclaim:     %8lu kB\n", 0UL);
3224                         printme = lbuf;
3225                 } else
3226                         printme = line;
3227
3228                 l = snprintf(cache, cache_size, "%s", printme);
3229                 if (l < 0) {
3230                         perror("Error writing to cache");
3231                         rv = 0;
3232                         goto err;
3233
3234                 }
3235                 if (l >= cache_size) {
3236                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3237                         rv = 0;
3238                         goto err;
3239                 }
3240
3241                 cache += l;
3242                 cache_size -= l;
3243                 total_len += l;
3244         }
3245
3246         d->cached = 1;
3247         d->size = total_len;
3248         if (total_len > size ) total_len = size;
3249         memcpy(buf, d->buf, total_len);
3250
3251         rv = total_len;
3252 err:
3253         if (f)
3254                 fclose(f);
3255         free(line);
3256         free(cg);
3257         free(memusage_str);
3258         free(memswlimit_str);
3259         free(memswusage_str);
3260         free(memstat_str);
3261         return rv;
3262 }
3263
/*
 * Fetch the "cpuset.cpus" value of the cpuset controller for cgroup @cg.
 *
 * Returns a newly allocated string that the caller must free, or NULL when
 * the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
        char *cpus = NULL;

        return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3276
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Parse the cpu number out of a "processor : N" line and report whether
 * cpu_in_cpuset() accepts it for @cpuset. Lines that do not parse yield
 * false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
        int cpu;
        bool parsed = sscanf(line, "processor       : %d", &cpu) == 1;

        return parsed && cpu_in_cpuset(cpu, cpuset);
}
3287
/*
 * Report whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. the word "processor", a colon and a decimal number.
 */
static bool is_processor_line(const char *line)
{
        int ignored;

        return sscanf(line, "processor       : %d", &ignored) == 1;
}
3299
3300 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3301                 struct fuse_file_info *fi)
3302 {
3303         struct fuse_context *fc = fuse_get_context();
3304         struct file_info *d = (struct file_info *)fi->fh;
3305         char *cg;
3306         char *cpuset = NULL;
3307         char *line = NULL;
3308         size_t linelen = 0, total_len = 0, rv = 0;
3309         bool am_printing = false, firstline = true, is_s390x = false;
3310         int curcpu = -1, cpu;
3311         char *cache = d->buf;
3312         size_t cache_size = d->buflen;
3313         FILE *f = NULL;
3314
3315         if (offset){
3316                 if (offset > d->size)
3317                         return -EINVAL;
3318                 if (!d->cached)
3319                         return 0;
3320                 int left = d->size - offset;
3321                 total_len = left > size ? size: left;
3322                 memcpy(buf, cache + offset, total_len);
3323                 return total_len;
3324         }
3325
3326         pid_t initpid = lookup_initpid_in_store(fc->pid);
3327         if (initpid <= 0)
3328                 initpid = fc->pid;
3329         cg = get_pid_cgroup(initpid, "cpuset");
3330         if (!cg)
3331                 return read_file("proc/cpuinfo", buf, size, d);
3332         prune_init_slice(cg);
3333
3334         cpuset = get_cpuset(cg);
3335         if (!cpuset)
3336                 goto err;
3337
3338         f = fopen("/proc/cpuinfo", "r");
3339         if (!f)
3340                 goto err;
3341
3342         while (getline(&line, &linelen, f) != -1) {
3343                 ssize_t l;
3344                 if (firstline) {
3345                         firstline = false;
3346                         if (strstr(line, "IBM/S390") != NULL) {
3347                                 is_s390x = true;
3348                                 am_printing = true;
3349                                 continue;
3350                         }
3351                 }
3352                 if (strncmp(line, "# processors:", 12) == 0)
3353                         continue;
3354                 if (is_processor_line(line)) {
3355                         am_printing = cpuline_in_cpuset(line, cpuset);
3356                         if (am_printing) {
3357                                 curcpu ++;
3358                                 l = snprintf(cache, cache_size, "processor      : %d\n", curcpu);
3359                                 if (l < 0) {
3360                                         perror("Error writing to cache");
3361                                         rv = 0;
3362                                         goto err;
3363                                 }
3364                                 if (l >= cache_size) {
3365                                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3366                                         rv = 0;
3367                                         goto err;
3368                                 }
3369                                 cache += l;
3370                                 cache_size -= l;
3371                                 total_len += l;
3372                         }
3373                         continue;
3374                 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3375                         char *p;
3376                         if (!cpu_in_cpuset(cpu, cpuset))
3377                                 continue;
3378                         curcpu ++;
3379                         p = strchr(line, ':');
3380                         if (!p || !*p)
3381                                 goto err;
3382                         p++;
3383                         l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3384                         if (l < 0) {
3385                                 perror("Error writing to cache");
3386                                 rv = 0;
3387                                 goto err;
3388                         }
3389                         if (l >= cache_size) {
3390                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3391                                 rv = 0;
3392                                 goto err;
3393                         }
3394                         cache += l;
3395                         cache_size -= l;
3396                         total_len += l;
3397                         continue;
3398
3399                 }
3400                 if (am_printing) {
3401                         l = snprintf(cache, cache_size, "%s", line);
3402                         if (l < 0) {
3403                                 perror("Error writing to cache");
3404                                 rv = 0;
3405                                 goto err;
3406                         }
3407                         if (l >= cache_size) {
3408                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3409                                 rv = 0;
3410                                 goto err;
3411                         }
3412                         cache += l;
3413                         cache_size -= l;
3414                         total_len += l;
3415                 }
3416         }
3417
3418         if (is_s390x) {
3419                 char *origcache = d->buf;
3420                 ssize_t l;
3421                 do {
3422                         d->buf = malloc(d->buflen);
3423                 } while (!d->buf);
3424                 cache = d->buf;
3425                 cache_size = d->buflen;
3426                 total_len = 0;
3427                 l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
3428                 if (l < 0 || l >= cache_size) {
3429                         free(origcache);
3430                         goto err;
3431                 }
3432                 cache_size -= l;
3433                 cache += l;
3434                 total_len += l;
3435                 l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
3436                 if (l < 0 || l >= cache_size) {
3437                         free(origcache);
3438                         goto err;
3439                 }
3440                 cache_size -= l;
3441                 cache += l;
3442                 total_len += l;
3443                 l = snprintf(cache, cache_size, "%s", origcache);
3444                 free(origcache);
3445                 if (l < 0 || l >= cache_size)
3446                         goto err;
3447                 total_len += l;
3448         }
3449
3450         d->cached = 1;
3451         d->size = total_len;
3452         if (total_len > size ) total_len = size;
3453
3454         /* read from off 0 */
3455         memcpy(buf, d->buf, total_len);
3456         rv = total_len;
3457 err:
3458         if (f)
3459                 fclose(f);
3460         free(line);
3461         free(cpuset);
3462         free(cg);
3463         return rv;
3464 }
3465
3466 static uint64_t get_reaper_start_time(pid_t pid)
3467 {
3468         int ret;
3469         FILE *f;
3470         uint64_t starttime;
3471         /* strlen("/proc/") = 6
3472          * +
3473          * LXCFS_NUMSTRLEN64
3474          * +
3475          * strlen("/stat") = 5
3476          * +
3477          * \0 = 1
3478          * */
3479 #define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
3480         char path[__PROC_PID_STAT_LEN];
3481         pid_t qpid;
3482
3483         qpid = lookup_initpid_in_store(pid);
3484         if (qpid <= 0) {
3485                 /* Caller can check for EINVAL on 0. */
3486                 errno = EINVAL;
3487                 return 0;
3488         }
3489
3490         ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
3491         if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
3492                 /* Caller can check for EINVAL on 0. */
3493                 errno = EINVAL;
3494                 return 0;
3495         }
3496
3497         f = fopen(path, "r");
3498         if (!f) {
3499                 /* Caller can check for EINVAL on 0. */
3500                 errno = EINVAL;
3501                 return 0;
3502         }
3503
3504         /* Note that the *scanf() argument supression requires that length
3505          * modifiers such as "l" are omitted. Otherwise some compilers will yell
3506          * at us. It's like telling someone you're not married and then asking
3507          * if you can bring your wife to the party.
3508          */
3509         ret = fscanf(f, "%*d "      /* (1)  pid         %d   */
3510                         "%*s "      /* (2)  comm        %s   */
3511                         "%*c "      /* (3)  state       %c   */
3512                         "%*d "      /* (4)  ppid        %d   */
3513                         "%*d "      /* (5)  pgrp        %d   */
3514                         "%*d "      /* (6)  session     %d   */
3515                         "%*d "      /* (7)  tty_nr      %d   */
3516                         "%*d "      /* (8)  tpgid       %d   */
3517                         "%*u "      /* (9)  flags       %u   */
3518                         "%*u "      /* (10) minflt      %lu  */
3519                         "%*u "      /* (11) cminflt     %lu  */
3520                         "%*u "      /* (12) majflt      %lu  */
3521                         "%*u "      /* (13) cmajflt     %lu  */
3522                         "%*u "      /* (14) utime       %lu  */
3523                         "%*u "      /* (15) stime       %lu  */
3524                         "%*d "      /* (16) cutime      %ld  */
3525                         "%*d "      /* (17) cstime      %ld  */
3526                         "%*d "      /* (18) priority    %ld  */
3527                         "%*d "      /* (19) nice        %ld  */
3528                         "%*d "      /* (20) num_threads %ld  */
3529                         "%*d "      /* (21) itrealvalue %ld  */
3530                         "%" PRIu64, /* (22) starttime   %llu */
3531                      &starttime);
3532         if (ret != 1) {
3533                 fclose(f);
3534                 /* Caller can check for EINVAL on 0. */
3535                 errno = EINVAL;
3536                 return 0;
3537         }
3538
3539         fclose(f);
3540
3541         errno = 0;
3542         return starttime;
3543 }
3544
/*
 * Return the start time reported by get_reaper_start_time() for @pid,
 * converted from clock ticks to whole seconds.
 *
 * Returns 0 when the start time cannot be retrieved or when the number of
 * clock ticks per second cannot be determined. Any non-positive sysconf()
 * result is treated as a failure so that the division below can neither
 * divide by zero nor by a negative value converted to unsigned.
 */
static uint64_t get_reaper_start_time_in_sec(pid_t pid)
{
        uint64_t clockticks;
        int64_t ticks_per_sec;

        clockticks = get_reaper_start_time(pid);
        if (clockticks == 0 && errno == EINVAL) {
                lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
                return 0;
        }

        ticks_per_sec = sysconf(_SC_CLK_TCK);
        if (ticks_per_sec <= 0) {
                lxcfs_debug(
                    "%s\n",
                    "failed to determine number of clock ticks in a second");
                return 0;
        }

        return clockticks / (uint64_t)ticks_per_sec;
}
3566
3567 static uint64_t get_reaper_age(pid_t pid)
3568 {
3569         uint64_t procstart, uptime, procage;
3570
3571         /* We need to substract the time the process has started since system
3572          * boot minus the time when the system has started to get the actual
3573          * reaper age.
3574          */
3575         procstart = get_reaper_start_time_in_sec(pid);
3576         procage = procstart;
3577         if (procstart > 0) {
3578                 int ret;
3579                 struct timespec spec;
3580
3581                 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
3582                 if (ret < 0)
3583                         return 0;
3584                 /* We could make this more precise here by using the tv_nsec
3585                  * field in the timespec struct and convert it to milliseconds
3586                  * and then create a double for the seconds and milliseconds but
3587                  * that seems more work than it is worth.
3588                  */
3589                 uptime = spec.tv_sec;
3590                 procage = uptime - procstart;
3591         }
3592
3593         return procage;
3594 }
3595
3596 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
3597 static int proc_stat_read(char *buf, size_t size, off_t offset,
3598                 struct fuse_file_info *fi)
3599 {
3600         struct fuse_context *fc = fuse_get_context();
3601         struct file_info *d = (struct file_info *)fi->fh;
3602         char *cg;
3603         char *cpuset = NULL;
3604         char *line = NULL;
3605         size_t linelen = 0, total_len = 0, rv = 0;
3606         int curcpu = -1; /* cpu numbering starts at 0 */
3607         unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
3608         unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3609                                         irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
3610         char cpuall[CPUALL_MAX_SIZE];
3611         /* reserve for cpu all */
3612         char *cache = d->buf + CPUALL_MAX_SIZE;
3613         size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3614         FILE *f = NULL;
3615
3616         if (offset){
3617                 if (offset > d->size)
3618                         return -EINVAL;
3619                 if (!d->cached)
3620                         return 0;
3621                 int left = d->size - offset;
3622                 total_len = left > size ? size: left;
3623                 memcpy(buf, d->buf + offset, total_len);
3624                 return total_len;
3625         }
3626
3627         pid_t initpid = lookup_initpid_in_store(fc->pid);
3628         if (initpid <= 0)
3629                 initpid = fc->pid;
3630         cg = get_pid_cgroup(initpid, "cpuset");
3631         if (!cg)
3632                 return read_file("/proc/stat", buf, size, d);
3633         prune_init_slice(cg);
3634
3635         cpuset = get_cpuset(cg);
3636         if (!cpuset)
3637                 goto err;
3638
3639         f = fopen("/proc/stat", "r");
3640         if (!f)
3641                 goto err;
3642
3643         //skip first line
3644         if (getline(&line, &linelen, f) < 0) {
3645                 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3646                 goto err;
3647         }
3648
3649         while (getline(&line, &linelen, f) != -1) {
3650                 ssize_t l;
3651                 int cpu;
3652                 char cpu_char[10]; /* That's a lot of cores */
3653                 char *c;
3654
3655                 if (strlen(line) == 0)
3656                         continue;
3657                 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3658                         /* not a ^cpuN line containing a number N, just print it */
3659                         l = snprintf(cache, cache_size, "%s", line);
3660                         if (l < 0) {
3661                                 perror("Error writing to cache");
3662                                 rv = 0;
3663                                 goto err;
3664                         }
3665                         if (l >= cache_size) {
3666                                 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3667                                 rv = 0;
3668                                 goto err;
3669                         }
3670                         cache += l;
3671                         cache_size -= l;
3672                         total_len += l;
3673                         continue;
3674                 }
3675
3676                 if (sscanf(cpu_char, "%d", &cpu) != 1)
3677                         continue;
3678                 if (!cpu_in_cpuset(cpu, cpuset))
3679                         continue;
3680                 curcpu ++;
3681
3682                 c = strchr(line, ' ');
3683                 if (!c)
3684                         continue;
3685                 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3686                 if (l < 0) {
3687                         perror("Error writing to cache");
3688                         rv = 0;
3689                         goto err;
3690
3691                 }
3692                 if (l >= cache_size) {
3693                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3694                         rv = 0;
3695                         goto err;
3696                 }
3697
3698                 cache += l;
3699                 cache_size -= l;
3700                 total_len += l;
3701
3702                 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
3703                            &user,
3704                            &nice,
3705                            &system,
3706                            &idle,
3707                            &iowait,
3708                            &irq,
3709                            &softirq,
3710                            &steal,
3711                            &guest,
3712                            &guest_nice) != 10)
3713                         continue;
3714                 user_sum += user;
3715                 nice_sum += nice;
3716                 system_sum += system;
3717                 idle_sum += idle;
3718                 iowait_sum += iowait;
3719                 irq_sum += irq;
3720                 softirq_sum += softirq;
3721                 steal_sum += steal;
3722                 guest_sum += guest;
3723                 guest_nice_sum += guest_nice;
3724         }
3725
3726         cache = d->buf;
3727
3728         int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu  %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3729                         user_sum,
3730                         nice_sum,
3731                         system_sum,
3732                         idle_sum,
3733                         iowait_sum,
3734                         irq_sum,
3735                         softirq_sum,
3736                         steal_sum,
3737                         guest_sum,
3738                         guest_nice_sum);
3739         if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
3740                 memcpy(cache, cpuall, cpuall_len);
3741                 cache += cpuall_len;
3742         } else {
3743                 /* shouldn't happen */
3744                 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3745                 cpuall_len = 0;
3746         }
3747
3748         memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3749         total_len += cpuall_len;
3750         d->cached = 1;
3751         d->size = total_len;
3752         if (total_len > size)
3753                 total_len = size;
3754
3755         memcpy(buf, d->buf, total_len);
3756         rv = total_len;
3757
3758 err:
3759         if (f)
3760                 fclose(f);
3761         free(line);
3762         free(cpuset);
3763         free(cg);
3764         return rv;
3765 }
3766
/* This function retrieves the busy time, in whole seconds, of a group of
 * tasks by looking at cpuacct.usage. Unfortunately, this only makes sense when
 * the container has been given its own cpuacct cgroup. If not, this function
 * will take the busy time of all other tasks that do not actually belong to
 * the container into account as well. If someone has a clever solution for
 * this please send a patch!
 *
 * Returns 0 when the reaper of @task cannot be found or the value cannot be
 * read.
 */
static unsigned long get_reaper_busy(pid_t task)
{
        pid_t initpid = lookup_initpid_in_store(task);
        char *cgroup = NULL, *usage_str = NULL;
        unsigned long long usage_ns;
        unsigned long usage = 0;

        if (initpid <= 0)
                return 0;

        cgroup = get_pid_cgroup(initpid, "cpuacct");
        if (!cgroup)
                goto out;
        prune_init_slice(cgroup);
        if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
                goto out;

        /* Parse with 64 bits before scaling down: the raw counter outgrows a
         * 32-bit unsigned long long before the division brings it back into
         * range.
         */
        usage_ns = strtoull(usage_str, NULL, 10);
        usage = (unsigned long)(usage_ns / 1000000000ULL);

out:
        free(cgroup);
        free(usage_str);
        return usage;
}
3797
3798 #if RELOADTEST
/* Drop a marker file so the reload test can tell this code path was run. */
void iwashere(void)
{
        const int marker = creat("/tmp/lxcfs-iwashere", 0644);

        if (marker < 0)
                return;

        close(marker);
}
3807 #endif
3808
3809 /*
3810  * We read /proc/uptime and reuse its second field.
3811  * For the first field, we use the mtime for the reaper for
3812  * the calling pid as returned by getreaperage
3813  */
3814 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3815                 struct fuse_file_info *fi)
3816 {
3817         struct fuse_context *fc = fuse_get_context();
3818         struct file_info *d = (struct file_info *)fi->fh;
3819         unsigned long int busytime = get_reaper_busy(fc->pid);
3820         char *cache = d->buf;
3821         ssize_t total_len = 0;
3822         uint64_t idletime, reaperage;
3823
3824 #if RELOADTEST
3825         iwashere();
3826 #endif
3827
3828         if (offset){
3829                 if (!d->cached)
3830                         return 0;
3831                 if (offset > d->size)
3832                         return -EINVAL;
3833                 int left = d->size - offset;
3834                 total_len = left > size ? size: left;
3835                 memcpy(buf, cache + offset, total_len);
3836                 return total_len;
3837         }
3838
3839         reaperage = get_reaper_age(fc->pid);
3840         /* To understand why this is done, please read the comment to the
3841          * get_reaper_busy() function.
3842          */
3843         idletime = reaperage;
3844         if (reaperage >= busytime)
3845                 idletime = reaperage - busytime;
3846
3847         total_len = snprintf(d->buf, d->buflen, "%"PRIu64".00 %"PRIu64".00\n", reaperage, idletime);
3848         if (total_len < 0 || total_len >=  d->buflen){
3849                 lxcfs_error("%s\n", "failed to write to cache");
3850                 return 0;
3851         }
3852
3853         d->size = (int)total_len;
3854         d->cached = 1;
3855
3856         if (total_len > size) total_len = size;
3857
3858         memcpy(buf, d->buf, total_len);
3859         return total_len;
3860 }
3861
3862 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3863                 struct fuse_file_info *fi)
3864 {
3865         char dev_name[72];
3866         struct fuse_context *fc = fuse_get_context();
3867         struct file_info *d = (struct file_info *)fi->fh;
3868         char *cg;
3869         char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3870                         *io_wait_time_str = NULL, *io_service_time_str = NULL;
3871         unsigned long read = 0, write = 0;
3872         unsigned long read_merged = 0, write_merged = 0;
3873         unsigned long read_sectors = 0, write_sectors = 0;
3874         unsigned long read_ticks = 0, write_ticks = 0;
3875         unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3876         unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3877         char *cache = d->buf;
3878         size_t cache_size = d->buflen;
3879         char *line = NULL;
3880         size_t linelen = 0, total_len = 0, rv = 0;
3881         unsigned int major = 0, minor = 0;
3882         int i = 0;
3883         FILE *f = NULL;
3884
3885         if (offset){
3886                 if (offset > d->size)
3887                         return -EINVAL;
3888                 if (!d->cached)
3889                         return 0;
3890                 int left = d->size - offset;
3891                 total_len = left > size ? size: left;
3892                 memcpy(buf, cache + offset, total_len);
3893                 return total_len;
3894         }
3895
3896         pid_t initpid = lookup_initpid_in_store(fc->pid);
3897         if (initpid <= 0)
3898                 initpid = fc->pid;
3899         cg = get_pid_cgroup(initpid, "blkio");
3900         if (!cg)
3901                 return read_file("/proc/diskstats", buf, size, d);
3902         prune_init_slice(cg);
3903
3904         if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
3905                 goto err;
3906         if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
3907                 goto err;
3908         if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
3909                 goto err;
3910         if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
3911                 goto err;
3912         if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
3913                 goto err;
3914
3915
3916         f = fopen("/proc/diskstats", "r");
3917         if (!f)
3918                 goto err;
3919
3920         while (getline(&line, &linelen, f) != -1) {
3921                 ssize_t l;
3922                 char lbuf[256];
3923
3924                 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
3925                 if (i != 3)
3926                         continue;
3927
3928                 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3929                 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3930                 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3931                 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3932                 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3933                 read_sectors = read_sectors/512;
3934                 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3935                 write_sectors = write_sectors/512;
3936
3937                 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3938                 rd_svctm = rd_svctm/1000000;
3939                 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3940                 rd_wait = rd_wait/1000000;
3941                 read_ticks = rd_svctm + rd_wait;
3942
3943                 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3944                 wr_svctm =  wr_svctm/1000000;
3945                 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3946                 wr_wait =  wr_wait/1000000;
3947                 write_ticks = wr_svctm + wr_wait;
3948
3949                 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3950                 tot_ticks =  tot_ticks/1000000;
3951
3952                 memset(lbuf, 0, 256);
3953                 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3954                         snprintf(lbuf, 256, "%u       %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3955                                 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3956                                 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3957                 else
3958                         continue;
3959
3960                 l = snprintf(cache, cache_size, "%s", lbuf);
3961                 if (l < 0) {
3962                         perror("Error writing to fuse buf");
3963                         rv = 0;
3964                         goto err;
3965                 }
3966                 if (l >= cache_size) {
3967                         lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3968                         rv = 0;
3969                         goto err;
3970                 }
3971                 cache += l;
3972                 cache_size -= l;
3973                 total_len += l;
3974         }
3975
3976         d->cached = 1;
3977         d->size = total_len;
3978         if (total_len > size ) total_len = size;
3979         memcpy(buf, d->buf, total_len);
3980
3981         rv = total_len;
3982 err:
3983         free(cg);
3984         if (f)
3985                 fclose(f);
3986         free(line);
3987         free(io_serviced_str);
3988         free(io_merged_str);
3989         free(io_service_bytes_str);
3990         free(io_wait_time_str);
3991         free(io_service_time_str);
3992         return rv;
3993 }
3994
3995 static int proc_swaps_read(char *buf, size_t size, off_t offset,
3996                 struct fuse_file_info *fi)
3997 {
3998         struct fuse_context *fc = fuse_get_context();
3999         struct file_info *d = (struct file_info *)fi->fh;
4000         char *cg = NULL;
4001         char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
4002         unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
4003         ssize_t total_len = 0, rv = 0;
4004         ssize_t l = 0;
4005         char *cache = d->buf;
4006
4007         if (offset) {
4008                 if (offset > d->size)
4009                         return -EINVAL;
4010                 if (!d->cached)
4011                         return 0;
4012                 int left = d->size - offset;
4013                 total_len = left > size ? size: left;
4014                 memcpy(buf, cache + offset, total_len);
4015                 return total_len;
4016         }
4017
4018         pid_t initpid = lookup_initpid_in_store(fc->pid);
4019         if (initpid <= 0)
4020                 initpid = fc->pid;
4021         cg = get_pid_cgroup(initpid, "memory");
4022         if (!cg)
4023                 return read_file("/proc/swaps", buf, size, d);
4024         prune_init_slice(cg);
4025
4026         memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
4027
4028         if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
4029                 goto err;
4030
4031         memusage = strtoul(memusage_str, NULL, 10);
4032
4033         if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
4034             cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
4035
4036                 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
4037                 memswusage = strtoul(memswusage_str, NULL, 10);
4038
4039                 swap_total = (memswlimit - memlimit) / 1024;
4040                 swap_free = (memswusage - memusage) / 1024;
4041         }
4042
4043         total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
4044
4045         /* When no mem + swap limit is specified or swapaccount=0*/
4046         if (!memswlimit) {
4047                 char *line = NULL;
4048                 size_t linelen = 0;
4049                 FILE *f = fopen("/proc/meminfo", "r");
4050
4051                 if (!f)
4052                         goto err;
4053
4054                 while (getline(&line, &linelen, f) != -1) {
4055                         if (startswith(line, "SwapTotal:")) {
4056                                 sscanf(line, "SwapTotal:      %8lu kB", &swap_total);
4057                         } else if (startswith(line, "SwapFree:")) {
4058                                 sscanf(line, "SwapFree:      %8lu kB", &swap_free);
4059                         }
4060                 }
4061
4062                 free(line);
4063                 fclose(f);
4064         }
4065
4066         if (swap_total > 0) {
4067                 l = snprintf(d->buf + total_len, d->size - total_len,
4068                                 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
4069                                 swap_total, swap_free);
4070                 total_len += l;
4071         }
4072
4073         if (total_len < 0 || l < 0) {
4074                 perror("Error writing to cache");
4075                 rv = 0;
4076                 goto err;
4077         }
4078
4079         d->cached = 1;
4080         d->size = (int)total_len;
4081
4082         if (total_len > size) total_len = size;
4083         memcpy(buf, d->buf, total_len);
4084         rv = total_len;
4085
4086 err:
4087         free(cg);
4088         free(memswlimit_str);
4089         free(memlimit_str);
4090         free(memusage_str);
4091         free(memswusage_str);
4092         return rv;
4093 }
4094
/*
 * Return the number of bytes obtained by reading @which line by line until
 * getline() fails, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
        char *linebuf = NULL;
        size_t capacity = 0;
        ssize_t total = 0;
        FILE *fp;

        fp = fopen(which, "r");
        if (fp == NULL)
                return 0;

        for (;;) {
                ssize_t got = getline(&linebuf, &capacity, fp);

                if (got == -1)
                        break;
                total += got;
        }

        free(linebuf);
        fclose(fp);

        return total;
}
4111
/*
 * Fill @sb for a path below the emulated /proc tree.
 * "/proc" is reported as a read/search-only directory, the emulated files as
 * read-only regular files of size 0; all timestamps are set to the current
 * time and the owner is root.
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT for
 * any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
        static const char *const emulated_files[] = {
                "/proc/meminfo",
                "/proc/cpuinfo",
                "/proc/uptime",
                "/proc/stat",
                "/proc/diskstats",
                "/proc/swaps",
        };
        struct timespec stamp;
        size_t idx;

        memset(sb, 0, sizeof(struct stat));

        if (clock_gettime(CLOCK_REALTIME, &stamp) < 0)
                return -EINVAL;

        sb->st_gid = 0;
        sb->st_uid = 0;
        sb->st_ctim = stamp;
        sb->st_mtim = stamp;
        sb->st_atim = stamp;

        if (strcmp(path, "/proc") == 0) {
                sb->st_mode = S_IFDIR | 00555;
                sb->st_nlink = 2;
                return 0;
        }

        for (idx = 0; idx < sizeof(emulated_files) / sizeof(emulated_files[0]); idx++) {
                if (strcmp(path, emulated_files[idx]) != 0)
                        continue;

                sb->st_size = 0;
                sb->st_mode = S_IFREG | 00444;
                sb->st_nlink = 1;
                return 0;
        }

        return -ENOENT;
}
4140
4141 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4142                 struct fuse_file_info *fi)
4143 {
4144         if (filler(buf, ".", NULL, 0) != 0 ||
4145             filler(buf, "..", NULL, 0) != 0 ||
4146             filler(buf, "cpuinfo", NULL, 0) != 0 ||
4147             filler(buf, "meminfo", NULL, 0) != 0 ||
4148             filler(buf, "stat", NULL, 0) != 0 ||
4149             filler(buf, "uptime", NULL, 0) != 0 ||
4150             filler(buf, "diskstats", NULL, 0) != 0 ||
4151             filler(buf, "swaps", NULL, 0) != 0)
4152                 return -EINVAL;
4153         return 0;
4154 }
4155
4156 int proc_open(const char *path, struct fuse_file_info *fi)
4157 {
4158         int type = -1;
4159         struct file_info *info;
4160
4161         if (strcmp(path, "/proc/meminfo") == 0)
4162                 type = LXC_TYPE_PROC_MEMINFO;
4163         else if (strcmp(path, "/proc/cpuinfo") == 0)
4164                 type = LXC_TYPE_PROC_CPUINFO;
4165         else if (strcmp(path, "/proc/uptime") == 0)
4166                 type = LXC_TYPE_PROC_UPTIME;
4167         else if (strcmp(path, "/proc/stat") == 0)
4168                 type = LXC_TYPE_PROC_STAT;
4169         else if (strcmp(path, "/proc/diskstats") == 0)
4170                 type = LXC_TYPE_PROC_DISKSTATS;
4171         else if (strcmp(path, "/proc/swaps") == 0)
4172                 type = LXC_TYPE_PROC_SWAPS;
4173         if (type == -1)
4174                 return -ENOENT;
4175
4176         info = malloc(sizeof(*info));
4177         if (!info)
4178                 return -ENOMEM;
4179
4180         memset(info, 0, sizeof(*info));
4181         info->type = type;
4182
4183         info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4184         do {
4185                 info->buf = malloc(info->buflen);
4186         } while (!info->buf);
4187         memset(info->buf, 0, info->buflen);
4188         /* set actual size to buffer size */
4189         info->size = info->buflen;
4190
4191         fi->fh = (unsigned long)info;
4192         return 0;
4193 }
4194
/*
 * Access check for the emulated /proc tree.
 * "/proc" itself is granted whenever the host's /proc is readable; for
 * everything else (and for "/proc" if that check fails) only requests that
 * ask for nothing beyond read permission succeed, anything else yields
 * -EACCES.
 */
int proc_access(const char *path, int mask)
{
        const int is_proc_dir = (strcmp(path, "/proc") == 0);

        if (is_proc_dir && access(path, R_OK) == 0)
                return 0;

        /* these are all read-only */
        return (mask & ~R_OK) ? -EACCES : 0;
}
4205
/*
 * Release handler for the emulated /proc files: hands the per-open state
 * referenced by @fi to do_release_file_info(). @path is not used.
 * Always returns 0.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
        (void)path;

        do_release_file_info(fi);

        return 0;
}
4211
4212 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4213                 struct fuse_file_info *fi)
4214 {
4215         struct file_info *f = (struct file_info *) fi->fh;
4216
4217         switch (f->type) {
4218         case LXC_TYPE_PROC_MEMINFO:
4219                 return proc_meminfo_read(buf, size, offset, fi);
4220         case LXC_TYPE_PROC_CPUINFO:
4221                 return proc_cpuinfo_read(buf, size, offset, fi);
4222         case LXC_TYPE_PROC_UPTIME:
4223                 return proc_uptime_read(buf, size, offset, fi);
4224         case LXC_TYPE_PROC_STAT:
4225                 return proc_stat_read(buf, size, offset, fi);
4226         case LXC_TYPE_PROC_DISKSTATS:
4227                 return proc_diskstats_read(buf, size, offset, fi);
4228         case LXC_TYPE_PROC_SWAPS:
4229                 return proc_swaps_read(buf, size, offset, fi);
4230         default:
4231                 return -EINVAL;
4232         }
4233 }
4234
4235 /*
4236  * Functions needed to setup cgroups in the __constructor__.
4237  */
4238
/*
 * Create @dir including all missing leading directories, each with @mode.
 * Components that already exist are not treated as an error.
 * Returns true on success, false if a directory could not be created or
 * memory ran out.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
        const char *tmp = dir;
        const char *orig = dir;
        char *makeme;

        do {
                /* Skip separators, then move tmp to the end of the next
                 * path component; orig..dir is the prefix to create.
                 */
                dir = tmp + strspn(tmp, "/");
                tmp = dir + strcspn(dir, "/");
                makeme = strndup(orig, dir - orig);
                if (!makeme)
                        return false;
                /* For a relative path the first prefix is empty; there is
                 * nothing to create for it and mkdir("") would fail.
                 */
                if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
                        lxcfs_error("Failed to create directory '%s': %s.\n",
                                makeme, strerror(errno));
                        free(makeme);
                        return false;
                }
                free(makeme);
        } while (tmp != dir);

        return true;
}
4262
4263 static bool umount_if_mounted(void)
4264 {
4265         if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4266                 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4267                 return false;
4268         }
4269         return true;
4270 }
4271
/* Take the type of the magic number from struct statfs itself instead of
 * spelling it out. __typeof__ should be safe to use with all compilers.
 */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Tell whether the filesystem described by @fs carries the magic @magic_val. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
        const fs_type_magic actual = fs->f_type;

        return actual == magic_val;
}
4278
/*
 * Check whether "/" is the initial rootfs by scanning /proc/self/mountinfo
 * for an entry whose mount point is "/" and whose filesystem part reads
 * " - rootfs rootfs ".
 *
 * Looking at fs/proc_namespace.c, it appears we can expect the rootfs entry
 * to contain exactly that string. IIUC, so long as we've chrooted so that
 * rootfs is not our root, the rootfs entry should always be skipped in the
 * mountinfo contents.
 *
 * Returns false if the file cannot be opened or no such entry is found.
 */
static bool is_on_ramfs(void)
{
        FILE *mounts;
        char *entry = NULL;
        size_t capacity = 0;
        bool found = false;

        mounts = fopen("/proc/self/mountinfo", "r");
        if (!mounts)
                return false;

        while (!found && getline(&entry, &capacity, mounts) != -1) {
                char *field = entry;
                char *field_end, *separator;
                int skipped;

                /* Advance to the blank in front of the fifth field. */
                for (skipped = 0; field && skipped < 4; skipped++)
                        field = strchr(field + 1, ' ');
                if (!field)
                        continue;

                field_end = strchr(field + 1, ' ');
                if (!field_end)
                        continue;

                /* Terminate the fifth field so it can be compared. */
                *field_end = '\0';
                if (strcmp(field + 1, "/") != 0)
                        continue;

                // this is '/'.  is it the ramfs?
                separator = strchr(field_end + 1, '-');
                if (separator && strncmp(separator, "- rootfs rootfs ", 16) == 0)
                        found = true;
        }

        free(entry);
        fclose(mounts);
        return found;
}
4321
4322 static int pivot_enter()
4323 {
4324         int ret = -1, oldroot = -1, newroot = -1;
4325
4326         oldroot = open("/", O_DIRECTORY | O_RDONLY);
4327         if (oldroot < 0) {
4328                 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4329                 return ret;
4330         }
4331
4332         newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4333         if (newroot < 0) {
4334                 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4335                 goto err;
4336         }
4337
4338         /* change into new root fs */
4339         if (fchdir(newroot) < 0) {
4340                 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4341                 goto err;
4342         }
4343
4344         /* pivot_root into our new root fs */
4345         if (pivot_root(".", ".") < 0) {
4346                 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4347                 goto err;
4348         }
4349
4350         /*
4351          * At this point the old-root is mounted on top of our new-root.
4352          * To unmounted it we must not be chdir'd into it, so escape back
4353          * to the old-root.
4354          */
4355         if (fchdir(oldroot) < 0) {
4356                 lxcfs_error("%s\n", "Failed to enter old root.");
4357                 goto err;
4358         }
4359
4360         if (umount2(".", MNT_DETACH) < 0) {
4361                 lxcfs_error("%s\n", "Failed to detach old root.");
4362                 goto err;
4363         }
4364
4365         if (fchdir(newroot) < 0) {
4366                 lxcfs_error("%s\n", "Failed to re-enter new root.");
4367                 goto err;
4368         }
4369
4370         ret = 0;
4371
4372 err:
4373         if (oldroot > 0)
4374                 close(oldroot);
4375         if (newroot > 0)
4376                 close(newroot);
4377
4378         return ret;
4379 }
4380
4381 static int chroot_enter()
4382 {
4383         if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
4384                 lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
4385                 return -1;
4386         }
4387
4388         if (chroot(".") < 0) {
4389                 lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
4390                 return -1;
4391         }
4392
4393         if (chdir("/") < 0) {
4394                 lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
4395                 return -1;
4396         }
4397
4398         return 0;
4399 }
4400
4401 static int permute_and_enter(void)
4402 {
4403         struct statfs sb;
4404
4405         if (statfs("/", &sb) < 0) {
4406                 lxcfs_error("%s\n", "Could not stat / mountpoint.");
4407                 return -1;
4408         }
4409
4410         /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
4411          * likely report TMPFS_MAGIC. Hence, when it reports no we still check
4412          * /proc/1/mountinfo. */
4413         if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
4414                 return chroot_enter();
4415
4416         if (pivot_enter() < 0) {
4417                 lxcfs_error("%s\n", "Could not perform pivot root.");
4418                 return -1;
4419         }
4420
4421         return 0;
4422 }
4423
4424 /* Prepare our new clean root. */
4425 static int permute_prepare(void)
4426 {
4427         if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4428                 lxcfs_error("%s\n", "Failed to create directory for new root.");
4429                 return -1;
4430         }
4431
4432         if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4433                 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4434                 return -1;
4435         }
4436
4437         if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4438                 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
4439                 return -1;
4440         }
4441
4442         if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4443                 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
4444                 return -1;
4445         }
4446
4447         return 0;
4448 }
4449
/*
 * Set up the new root and switch into it. The switch is done with chroot()
 * on ramfs and with pivot_root() in all other cases.
 * Returns true only if both the preparation and the switch succeeded; the
 * switch is not attempted when the preparation fails.
 */
static bool permute_root(void)
{
        return permute_prepare() >= 0 && permute_and_enter() >= 0;
}
4463
/*
 * Open /proc/<pid>/ns/mnt read-only with the close-on-exec flag set.
 *
 * Returns the file descriptor produced by open() (which is -1 if open()
 * fails), or -1 when the path could not be formatted into the buffer.
 */
static int preserve_mnt_ns(int pid)
{
        /* Both literals are counted including their NUL, plus 21 bytes for the pid. */
        char ns_path[sizeof("/proc/") + 21 + sizeof("/ns/mnt")];
        int written;

        written = snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/mnt", pid);
        if (written < 0)
                return -1;
        if ((size_t)written >= sizeof(ns_path))
                return -1;

        return open(ns_path, O_RDONLY | O_CLOEXEC);
}
4476
4477 static bool cgfs_prepare_mounts(void)
4478 {
4479         if (!mkdir_p(BASEDIR, 0700)) {
4480                 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
4481                 return false;
4482         }
4483
4484         if (!umount_if_mounted()) {
4485                 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
4486                 return false;
4487         }
4488
4489         if (unshare(CLONE_NEWNS) < 0) {
4490                 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
4491                 return false;
4492         }
4493
4494         cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
4495         if (cgroup_mount_ns_fd < 0) {
4496                 lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
4497                 return false;
4498         }
4499
4500         if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4501                 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
4502                 return false;
4503         }
4504
4505         if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
4506                 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
4507                 return false;
4508         }
4509
4510         return true;
4511 }
4512
4513 static bool cgfs_mount_hierarchies(void)
4514 {
4515         char *target;
4516         size_t clen, len;
4517         int i, ret;
4518
4519         for (i = 0; i < num_hierarchies; i++) {
4520                 char *controller = hierarchies[i];
4521
4522                 clen = strlen(controller);
4523                 len = strlen(BASEDIR) + clen + 2;
4524                 target = malloc(len);
4525                 if (!target)
4526                         return false;
4527
4528                 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4529                 if (ret < 0 || ret >= len) {
4530                         free(target);
4531                         return false;
4532                 }
4533                 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4534                         free(target);
4535                         return false;
4536                 }
4537                 if (!strcmp(controller, "unified"))
4538                         ret = mount("none", target, "cgroup2", 0, NULL);
4539                 else
4540                         ret = mount(controller, target, "cgroup", 0, controller);
4541                 if (ret < 0) {
4542                         lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
4543                         free(target);
4544                         return false;
4545                 }
4546
4547                 fd_hierarchies[i] = open(target, O_DIRECTORY);
4548                 if (fd_hierarchies[i] < 0) {
4549                         free(target);
4550                         return false;
4551                 }
4552                 free(target);
4553         }
4554         return true;
4555 }
4556
/*
 * Run the three stages needed for lxcfs' private cgroup mounts: prepare the
 * mount namespace, mount the hierarchies, then switch into the new root.
 * A failing stage stops the sequence. Returns true when all stages succeed.
 */
static bool cgfs_setup_controllers(void)
{
        bool mounted;

        if (!cgfs_prepare_mounts())
                return false;

        mounted = cgfs_mount_hierarchies();
        if (!mounted) {
                lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
                return false;
        }

        return permute_root();
}
4572
4573 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
4574 {
4575         FILE *f;
4576         char *cret, *line = NULL;
4577         char cwd[MAXPATHLEN];
4578         size_t len = 0;
4579         int i, init_ns = -1;
4580         bool found_unified = false;
4581
4582         if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4583                 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
4584                 return;
4585         }
4586
4587         while (getline(&line, &len, f) != -1) {
4588                 char *idx, *p, *p2;
4589
4590                 p = strchr(line, ':');
4591                 if (!p)
4592                         goto out;
4593                 idx = line;
4594                 *(p++) = '\0';
4595
4596                 p2 = strrchr(p, ':');
4597                 if (!p2)
4598                         goto out;
4599                 *p2 = '\0';
4600
4601                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4602                  * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4603                  * because it parses out the empty string "" and later on passes
4604                  * it to mount(). Let's skip such entries.
4605                  */
4606                 if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
4607                         found_unified = true;
4608                         p = "unified";
4609                 }
4610
4611                 if (!store_hierarchy(line, p))
4612                         goto out;
4613         }
4614
4615         /* Preserve initial namespace. */
4616         init_ns = preserve_mnt_ns(getpid());
4617         if (init_ns < 0) {
4618                 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
4619                 goto out;
4620         }
4621
4622         fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
4623         if (!fd_hierarchies) {
4624                 lxcfs_error("%s\n", strerror(errno));
4625                 goto out;
4626         }
4627
4628         for (i = 0; i < num_hierarchies; i++)
4629                 fd_hierarchies[i] = -1;
4630
4631         cret = getcwd(cwd, MAXPATHLEN);
4632         if (!cret)
4633                 lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));
4634
4635         /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4636          * to privately mount lxcfs cgroups. */
4637         if (!cgfs_setup_controllers()) {
4638                 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
4639                 goto out;
4640         }
4641
4642         if (setns(init_ns, 0) < 0) {
4643                 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
4644                 goto out;
4645         }
4646
4647         if (!cret || chdir(cwd) < 0)
4648                 lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));
4649
4650         print_subsystems();
4651
4652 out:
4653         free(line);
4654         fclose(f);
4655         if (init_ns >= 0)
4656                 close(init_ns);
4657 }
4658
4659 static void __attribute__((destructor)) free_subsystems(void)
4660 {
4661         int i;
4662
4663         lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
4664
4665         for (i = 0; i < num_hierarchies; i++) {
4666                 if (hierarchies[i])
4667                         free(hierarchies[i]);
4668                 if (fd_hierarchies && fd_hierarchies[i] >= 0)
4669                         close(fd_hierarchies[i]);
4670         }
4671         free(hierarchies);
4672         free(fd_hierarchies);
4673
4674         if (cgroup_mount_ns_fd >= 0)
4675                 close(cgroup_mount_ns_fd);
4676 }