src/cgroup_fuse.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE
   5 #endif
   6
   7 #ifndef FUSE_USE_VERSION
   8 #define FUSE_USE_VERSION 26
   9 #endif
  10
  11 #define _FILE_OFFSET_BITS 64
  12
  13 #define __STDC_FORMAT_MACROS
  14 #include <dirent.h>
  15 #include <errno.h>
  16 #include <fcntl.h>
  17 #include <fuse.h>
  18 #include <inttypes.h>
  19 #include <libgen.h>
  20 #include <pthread.h>
  21 #include <sched.h>
  22 #include <stdarg.h>
  23 #include <stdbool.h>
  24 #include <stdint.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <time.h>
  29 #include <unistd.h>
  30 #include <wait.h>
  31 #include <linux/magic.h>
  32 #include <linux/sched.h>
  33 #include <sys/epoll.h>
  34 #include <sys/mman.h>
  35 #include <sys/mount.h>
  36 #include <sys/param.h>
  37 #include <sys/socket.h>
  38 #include <sys/syscall.h>
  39 #include <sys/sysinfo.h>
  40 #include <sys/vfs.h>
  41
  42 #include "bindings.h"
  43 #include "config.h"
  44 #include "cgroups/cgroup.h"
  45 #include "cgroups/cgroup_utils.h"
  46 #include "memory_utils.h"
  47 #include "utils.h"
  48
  49 struct cgfs_files {
  50         char *name;
  51         uint32_t uid, gid;
  52         uint32_t mode;
  53 };
  54
  55 struct pid_ns_clone_args {
  56         int *cpipe;
  57         int sock;
  58         pid_t tpid;
  59         /* pid_from_ns or pid_to_ns. */
  60         int (*wrapped) (int, pid_t);
  61 };
  62
  63 /*
  64  * given /cgroup/freezer/a/b, return "freezer".
  65  * the returned char* should NOT be freed.
  66  */
  67 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
  68 {
  69         const char *p1;
  70         char *contr, *slash;
  71
  72         if (strlen(path) < 9) {
  73                 errno = EACCES;
  74                 return NULL;
  75         }
  76         if (*(path + 7) != '/') {
  77                 errno = EINVAL;
  78                 return NULL;
  79         }
  80         p1 = path + 8;
  81         contr = strdupa(p1);
  82         if (!contr) {
  83                 errno = ENOMEM;
  84                 return NULL;
  85         }
  86         slash = strstr(contr, "/");
  87         if (slash)
  88                 *slash = '\0';
  89
  90         for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
  91                 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
  92                         return (*h)->__controllers;
  93         }
  94         errno = ENOENT;
  95         return NULL;
  96 }
  97
  98 /*
  99  * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 100  * Note that the returned value may include files (keynames) etc
 101  */
 102 static const char *find_cgroup_in_path(const char *path)
 103 {
 104         const char *p1;
 105
 106         if (strlen(path) < 9) {
 107                 errno = EACCES;
 108                 return NULL;
 109         }
 110         p1 = strstr(path + 8, "/");
 111         if (!p1) {
 112                 errno = EINVAL;
 113                 return NULL;
 114         }
 115         errno = 0;
 116         return p1 + 1;
 117 }
 118
 119 /*
 120  * split the last path element from the path in @cg.
 121  * @dir is newly allocated and should be freed, @last not
 122 */
 123 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
 124 {
 125         char *p;
 126
 127         do {
 128                 *dir = strdup(cg);
 129         } while (!*dir);
 130         *last = strrchr(cg, '/');
 131         if (!*last) {
 132                 *last = NULL;
 133                 return;
 134         }
 135         p = strrchr(*dir, '/');
 136         *p = '\0';
 137 }
 138
 139 static bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
 140 {
 141         int cfd;
 142         size_t len;
 143         char *fnam;
 144         int ret;
 145         struct stat sb;
 146
 147         cfd = get_cgroup_fd(controller);
 148         if (cfd < 0)
 149                 return false;
 150
 151         /* Make sure we pass a relative path to *at() family of functions.
 152          * . + /cgroup + / + f + \0
 153          */
 154         len = strlen(cgroup) + strlen(f) + 3;
 155         fnam = alloca(len);
 156         ret = snprintf(fnam, len, "%s%s/%s", dot_or_empty(cgroup), cgroup, f);
 157         if (ret < 0 || (size_t)ret >= len)
 158                 return false;
 159
 160         ret = fstatat(cfd, fnam, &sb, 0);
 161         if (ret < 0 || !S_ISDIR(sb.st_mode))
 162                 return false;
 163
 164         return true;
 165 }
 166
 167 /*
 168  * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 169  */
 170 static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
 171 {
 172         bool answer = false;
 173         char *c2, *task_cg;
 174         size_t target_len, task_len;
 175
 176         if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
 177                 return true;
 178
 179         c2 = get_pid_cgroup(pid, contrl);
 180         if (!c2)
 181                 return false;
 182         prune_init_slice(c2);
 183
 184         task_cg = c2 + 1;
 185         target_len = strlen(cg);
 186         task_len = strlen(task_cg);
 187         if (task_len == 0) {
 188                 /* Task is in the root cg, it can see everything. This case is
 189                  * not handled by the strmcps below, since they test for the
 190                  * last /, but that is the first / that we've chopped off
 191                  * above.
 192                  */
 193                 answer = true;
 194                 goto out;
 195         }
 196         if (strcmp(cg, task_cg) == 0) {
 197                 answer = true;
 198                 goto out;
 199         }
 200         if (target_len < task_len) {
 201                 /* looking up a parent dir */
 202                 if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
 203                         answer = true;
 204                 goto out;
 205         }
 206         if (target_len > task_len) {
 207                 /* looking up a child dir */
 208                 if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
 209                         answer = true;
 210                 goto out;
 211         }
 212
 213 out:
 214         free(c2);
 215         return answer;
 216 }
 217
 218 /*
 219  * taskcg is  a/b/c
 220  * querycg is /a/b/c/d/e
 221  * we return 'd'
 222  */
 223 static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
 224 {
 225         char *start, *end;
 226
 227         if (strlen(taskcg) <= strlen(querycg)) {
 228                 lxcfs_error("%s\n", "I was fed bad input.");
 229                 return NULL;
 230         }
 231
 232         if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0))
 233                 start =  strdup(taskcg + 1);
 234         else
 235                 start = strdup(taskcg + strlen(querycg) + 1);
 236         if (!start)
 237                 return NULL;
 238         end = strchr(start, '/');
 239         if (end)
 240                 *end = '\0';
 241         return start;
 242 }
 243
 244 /*
 245  * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 246  * If pid is in /a, he may act on /a/b, but not on /b.
 247  * if the answer is false and nextcg is not NULL, then *nextcg will point
 248  * to a string containing the next cgroup directory under cg, which must be
 249  * freed by the caller.
 250  */
 251 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
 252 {
 253         bool answer = false;
 254         char *c2 = get_pid_cgroup(pid, contrl);
 255         char *linecmp;
 256
 257         if (!c2)
 258                 return false;
 259         prune_init_slice(c2);
 260
 261         /*
 262          * callers pass in '/' or './' (openat()) for root cgroup, otherwise
 263          * they pass in a cgroup without leading '/'
 264          *
 265          * The original line here was:
 266          *      linecmp = *cg == '/' ? c2 : c2+1;
 267          * TODO: I'm not sure why you'd want to increment when *cg != '/'?
 268          *       Serge, do you know?
 269          */
 270         if (*cg == '/' || !strncmp(cg, "./", 2))
 271                 linecmp = c2;
 272         else
 273                 linecmp = c2 + 1;
 274         if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
 275                 if (nextcg) {
 276                         *nextcg = get_next_cgroup_dir(linecmp, cg);
 277                 }
 278                 goto out;
 279         }
 280         answer = true;
 281
 282 out:
 283         free(c2);
 284         return answer;
 285 }
 286
 287 static struct cgfs_files *cgfs_get_key(const char *controller,
 288                                        const char *cgroup, const char *file)
 289 {
 290         int ret, cfd;
 291         size_t len;
 292         char *fnam;
 293         struct stat sb;
 294         struct cgfs_files *newkey;
 295
 296         cfd = get_cgroup_fd(controller);
 297         if (cfd < 0)
 298                 return false;
 299
 300         if (file && *file == '/')
 301                 file++;
 302
 303         if (file && strchr(file, '/'))
 304                 return NULL;
 305
 306         /* Make sure we pass a relative path to *at() family of functions.
 307          * . + /cgroup + / + file + \0
 308          */
 309         len = strlen(cgroup) + 3;
 310         if (file)
 311                 len += strlen(file) + 1;
 312         fnam = alloca(len);
 313         snprintf(fnam, len, "%s%s%s%s", dot_or_empty(cgroup), cgroup,
 314                  file ? "/" : "", file ? file : "");
 315
 316         ret = fstatat(cfd, fnam, &sb, 0);
 317         if (ret < 0)
 318                 return NULL;
 319
 320         do {
 321                 newkey = malloc(sizeof(struct cgfs_files));
 322         } while (!newkey);
 323         if (file)
 324                 newkey->name = must_copy_string(file);
 325         else if (strrchr(cgroup, '/'))
 326                 newkey->name = must_copy_string(strrchr(cgroup, '/'));
 327         else
 328                 newkey->name = must_copy_string(cgroup);
 329         newkey->uid = sb.st_uid;
 330         newkey->gid = sb.st_gid;
 331         newkey->mode = sb.st_mode;
 332
 333         return newkey;
 334 }
 335
 336 /*
 337  * Given a open file * to /proc/pid/{u,g}id_map, and an id
 338  * valid in the caller's namespace, return the id mapped into
 339  * pid's namespace.
 340  * Returns the mapped id, or -1 on error.
 341  */
 342 static unsigned int convert_id_to_ns(FILE *idfile, unsigned int in_id)
 343 {
 344         unsigned int nsuid,   // base id for a range in the idfile's namespace
 345                      hostuid, // base id for a range in the caller's namespace
 346                      count;   // number of ids in this range
 347         char line[400];
 348         int ret;
 349
 350         fseek(idfile, 0L, SEEK_SET);
 351         while (fgets(line, 400, idfile)) {
 352                 ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
 353                 if (ret != 3)
 354                         continue;
 355                 if (hostuid + count < hostuid || nsuid + count < nsuid) {
 356                         /*
 357                          * uids wrapped around - unexpected as this is a procfile,
 358                          * so just bail.
 359                          */
 360                         lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
 361                                 nsuid, hostuid, count, line);
 362                         return -1;
 363                 }
 364                 if (hostuid <= in_id && hostuid+count > in_id) {
 365                         /*
 366                          * now since hostuid <= in_id < hostuid+count, and
 367                          * hostuid+count and nsuid+count do not wrap around,
 368                          * we know that nsuid+(in_id-hostuid) which must be
 369                          * less that nsuid+(count) must not wrap around
 370                          */
 371                         return (in_id - hostuid) + nsuid;
 372                 }
 373         }
 374
 375         // no answer found
 376         return -1;
 377 }
 378
 379 /*
 380  * for is_privileged_over,
 381  * specify whether we require the calling uid to be root in his
 382  * namespace
 383  */
 384 #define NS_ROOT_REQD true
 385 #define NS_ROOT_OPT false
 386
 387 #define PROCLEN 100
 388
 389 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
 390 {
 391         char fpath[PROCLEN];
 392         int ret;
 393         bool answer = false;
 394         uid_t nsuid;
 395
 396         if (victim == -1 || uid == -1)
 397                 return false;
 398
 399         /*
 400          * If the request is one not requiring root in the namespace,
 401          * then having the same uid suffices.  (i.e. uid 1000 has write
 402          * access to files owned by uid 1000
 403          */
 404         if (!req_ns_root && uid == victim)
 405                 return true;
 406
 407         ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
 408         if (ret < 0 || ret >= PROCLEN)
 409                 return false;
 410         FILE *f = fopen(fpath, "re");
 411         if (!f)
 412                 return false;
 413
 414         /* if caller's not root in his namespace, reject */
 415         nsuid = convert_id_to_ns(f, uid);
 416         if (nsuid)
 417                 goto out;
 418
 419         /*
 420          * If victim is not mapped into caller's ns, reject.
 421          * XXX I'm not sure this check is needed given that fuse
 422          * will be sending requests where the vfs has converted
 423          */
 424         nsuid = convert_id_to_ns(f, victim);
 425         if (nsuid == -1)
 426                 goto out;
 427
 428         answer = true;
 429
 430 out:
 431         fclose(f);
 432         return answer;
 433 }
 434
 435 static bool perms_include(int fmode, mode_t req_mode)
 436 {
 437         mode_t r;
 438
 439         switch (req_mode & O_ACCMODE) {
 440         case O_RDONLY:
 441                 r = S_IROTH;
 442                 break;
 443         case O_WRONLY:
 444                 r = S_IWOTH;
 445                 break;
 446         case O_RDWR:
 447                 r = S_IROTH | S_IWOTH;
 448                 break;
 449         default:
 450                 return false;
 451         }
 452         return ((fmode & r) == r);
 453 }
 454
 455 static void free_key(struct cgfs_files *k)
 456 {
 457         if (!k)
 458                 return;
 459         free_disarm(k->name);
 460         free_disarm(k);
 461 }
 462
 463 /*
 464  * check whether a fuse context may access a cgroup dir or file
 465  *
 466  * If file is not null, it is a cgroup file to check under cg.
 467  * If file is null, then we are checking perms on cg itself.
 468  *
 469  * For files we can check the mode of the list_keys result.
 470  * For cgroups, we must make assumptions based on the files under the
 471  * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
 472  * yet.
 473  */
 474 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
 475 {
 476         struct cgfs_files *k = NULL;
 477         bool ret = false;
 478
 479         k = cgfs_get_key(contrl, cg, file);
 480         if (!k)
 481                 return false;
 482
 483         if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
 484                 if (perms_include(k->mode >> 6, mode)) {
 485                         ret = true;
 486                         goto out;
 487                 }
 488         }
 489         if (fc->gid == k->gid) {
 490                 if (perms_include(k->mode >> 3, mode)) {
 491                         ret = true;
 492                         goto out;
 493                 }
 494         }
 495         ret = perms_include(k->mode, mode);
 496
 497 out:
 498         free_key(k);
 499         return ret;
 500 }
 501
 502 int cg_getattr(const char *path, struct stat *sb)
 503 {
 504         struct timespec now;
 505         struct fuse_context *fc = fuse_get_context();
 506         char * cgdir = NULL;
 507         char *last = NULL, *path1, *path2;
 508         struct cgfs_files *k = NULL;
 509         const char *cgroup;
 510         const char *controller = NULL;
 511         int ret = -ENOENT;
 512
 513
 514         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 515                 return -EIO;
 516
 517         memset(sb, 0, sizeof(struct stat));
 518
 519         if (clock_gettime(CLOCK_REALTIME, &now) < 0)
 520                 return -EINVAL;
 521
 522         sb->st_uid = sb->st_gid = 0;
 523         sb->st_atim = sb->st_mtim = sb->st_ctim = now;
 524         sb->st_size = 0;
 525
 526         if (strcmp(path, "/cgroup") == 0) {
 527                 sb->st_mode = S_IFDIR | 00755;
 528                 sb->st_nlink = 2;
 529                 return 0;
 530         }
 531
 532         controller = pick_controller_from_path(fc, path);
 533         if (!controller)
 534                 return -errno;
 535         cgroup = find_cgroup_in_path(path);
 536         if (!cgroup) {
 537                 /* this is just /cgroup/controller, return it as a dir */
 538                 sb->st_mode = S_IFDIR | 00755;
 539                 sb->st_nlink = 2;
 540                 return 0;
 541         }
 542
 543         get_cgdir_and_path(cgroup, &cgdir, &last);
 544
 545         if (!last) {
 546                 path1 = "/";
 547                 path2 = cgdir;
 548         } else {
 549                 path1 = cgdir;
 550                 path2 = last;
 551         }
 552
 553         pid_t initpid = lookup_initpid_in_store(fc->pid);
 554         if (initpid <= 1 || is_shared_pidns(initpid))
 555                 initpid = fc->pid;
 556         /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
 557          * Then check that caller's cgroup is under path if last is a child
 558          * cgroup, or cgdir if last is a file */
 559
 560         if (is_child_cgroup(controller, path1, path2)) {
 561                 if (!caller_may_see_dir(initpid, controller, cgroup)) {
 562                         ret = -ENOENT;
 563                         goto out;
 564                 }
 565                 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
 566                         /* this is just /cgroup/controller, return it as a dir */
 567                         sb->st_mode = S_IFDIR | 00555;
 568                         sb->st_nlink = 2;
 569                         ret = 0;
 570                         goto out;
 571                 }
 572                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
 573                         ret = -EACCES;
 574                         goto out;
 575                 }
 576
 577                 // get uid, gid, from '/tasks' file and make up a mode
 578                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
 579                 sb->st_mode = S_IFDIR | 00755;
 580                 k = cgfs_get_key(controller, cgroup, NULL);
 581                 if (!k) {
 582                         sb->st_uid = sb->st_gid = 0;
 583                 } else {
 584                         sb->st_uid = k->uid;
 585                         sb->st_gid = k->gid;
 586                 }
 587                 free_key(k);
 588                 sb->st_nlink = 2;
 589                 ret = 0;
 590                 goto out;
 591         }
 592
 593         if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
 594                 sb->st_mode = S_IFREG | k->mode;
 595                 sb->st_nlink = 1;
 596                 sb->st_uid = k->uid;
 597                 sb->st_gid = k->gid;
 598                 sb->st_size = 0;
 599                 free_key(k);
 600                 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
 601                         ret = -ENOENT;
 602                         goto out;
 603                 }
 604                 ret = 0;
 605         }
 606
 607 out:
 608         free(cgdir);
 609         return ret;
 610 }
 611
 612 /*
 613  * Chown all the files in the cgroup directory.  We do this when we create a
 614  * cgroup on behalf of a user.
 615  */
 616 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 617 {
 618         struct dirent *direntp;
 619         char path[MAXPATHLEN];
 620         size_t len;
 621         DIR *d;
 622         int fd1, ret;
 623
 624         len = strlen(dirname);
 625         if (len >= MAXPATHLEN) {
 626                 lxcfs_error("Pathname too long: %s\n", dirname);
 627                 return;
 628         }
 629
 630         fd1 = openat(fd, dirname, O_DIRECTORY);
 631         if (fd1 < 0)
 632                 return;
 633
 634         d = fdopendir(fd1);
 635         if (!d) {
 636                 lxcfs_error("Failed to open %s\n", dirname);
 637                 return;
 638         }
 639
 640         while ((direntp = readdir(d))) {
 641                 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
 642                         continue;
 643                 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 644                 if (ret < 0 || ret >= MAXPATHLEN) {
 645                         lxcfs_error("Pathname too long under %s\n", dirname);
 646                         continue;
 647                 }
 648                 if (fchownat(fd, path, uid, gid, 0) < 0)
 649                         lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
 650         }
 651         closedir(d);
 652 }
 653
 654 static int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
 655 {
 656         int cfd;
 657         size_t len;
 658         char *dirnam;
 659
 660         cfd = get_cgroup_fd(controller);
 661         if (cfd < 0)
 662                 return -EINVAL;
 663
 664         /* Make sure we pass a relative path to *at() family of functions.
 665          * . + /cg + \0
 666          */
 667         len = strlen(cg) + 2;
 668         dirnam = alloca(len);
 669         snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
 670
 671         if (mkdirat(cfd, dirnam, 0755) < 0)
 672                 return -errno;
 673
 674         if (uid == 0 && gid == 0)
 675                 return 0;
 676
 677         if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
 678                 return -errno;
 679
 680         chown_all_cgroup_files(dirnam, uid, gid, cfd);
 681
 682         return 0;
 683 }
 684
 685 int cg_mkdir(const char *path, mode_t mode)
 686 {
 687         struct fuse_context *fc = fuse_get_context();
 688         char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
 689         const char *cgroup;
 690         int ret;
 691
 692         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 693                 return -EIO;
 694
 695         controller = pick_controller_from_path(fc, path);
 696         if (!controller)
 697                 return errno == ENOENT ? -EPERM : -errno;
 698
 699         cgroup = find_cgroup_in_path(path);
 700         if (!cgroup)
 701                 return -errno;
 702
 703         get_cgdir_and_path(cgroup, &cgdir, &last);
 704         if (!last)
 705                 path1 = "/";
 706         else
 707                 path1 = cgdir;
 708
 709         pid_t initpid = lookup_initpid_in_store(fc->pid);
 710         if (initpid <= 1 || is_shared_pidns(initpid))
 711                 initpid = fc->pid;
 712         if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
 713                 if (!next)
 714                         ret = -EINVAL;
 715                 else if (last && strcmp(next, last) == 0)
 716                         ret = -EEXIST;
 717                 else
 718                         ret = -EPERM;
 719                 goto out;
 720         }
 721
 722         if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
 723                 ret = -EACCES;
 724                 goto out;
 725         }
 726         if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
 727                 ret = -EACCES;
 728                 goto out;
 729         }
 730
 731         ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
 732
 733 out:
 734         free(cgdir);
 735         free(next);
 736         return ret;
 737 }
 738
 739 static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
 740 {
 741         struct dirent *direntp;
 742         DIR *dir;
 743         bool ret = false;
 744         char pathname[MAXPATHLEN];
 745         int dupfd;
 746
 747         dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
 748         if (dupfd < 0)
 749                 return false;
 750
 751         dir = fdopendir(dupfd);
 752         if (!dir) {
 753                 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
 754                 close(dupfd);
 755                 return false;
 756         }
 757
 758         while ((direntp = readdir(dir))) {
 759                 struct stat mystat;
 760                 int rc;
 761
 762                 if (!strcmp(direntp->d_name, ".") ||
 763                     !strcmp(direntp->d_name, ".."))
 764                         continue;
 765
 766                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 767                 if (rc < 0 || rc >= MAXPATHLEN) {
 768                         lxcfs_error("%s\n", "Pathname too long.");
 769                         continue;
 770                 }
 771
 772                 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
 773                 if (rc) {
 774                         lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
 775                         continue;
 776                 }
 777                 if (S_ISDIR(mystat.st_mode))
 778                         if (!recursive_rmdir(pathname, fd, cfd))
 779                                 lxcfs_debug("Error removing %s.\n", pathname);
 780         }
 781
 782         ret = true;
 783         if (closedir(dir) < 0) {
 784                 lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
 785                 ret = false;
 786         }
 787
 788         if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
 789                 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
 790                 ret = false;
 791         }
 792
 793         close(dupfd);
 794
 795         return ret;
 796 }
 797
 798 static bool cgfs_remove(const char *controller, const char *cg)
 799 {
 800         int fd, cfd;
 801         size_t len;
 802         char *dirnam;
 803         bool bret;
 804
 805         cfd = get_cgroup_fd(controller);
 806         if (cfd < 0)
 807                 return false;
 808
 809         /* Make sure we pass a relative path to *at() family of functions.
 810          * . +  /cg + \0
 811          */
 812         len = strlen(cg) + 2;
 813         dirnam = alloca(len);
 814         snprintf(dirnam, len, "%s%s", dot_or_empty(cg), cg);
 815
 816         fd = openat(cfd, dirnam, O_DIRECTORY);
 817         if (fd < 0)
 818                 return false;
 819
 820         bret = recursive_rmdir(dirnam, fd, cfd);
 821         close(fd);
 822         return bret;
 823 }
 824
 825 int cg_rmdir(const char *path)
 826 {
 827         struct fuse_context *fc = fuse_get_context();
 828         char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
 829         const char *cgroup;
 830         int ret;
 831
 832         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 833                 return -EIO;
 834
 835         controller = pick_controller_from_path(fc, path);
 836         if (!controller) /* Someone's trying to delete "/cgroup". */
 837                 return -EPERM;
 838
 839         cgroup = find_cgroup_in_path(path);
 840         if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
 841                 return -EPERM;
 842
 843         get_cgdir_and_path(cgroup, &cgdir, &last);
 844         if (!last) {
 845                 /* Someone's trying to delete a cgroup on the same level as the
 846                  * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
 847                  * rmdir "/cgroup/blkio/init.slice".
 848                  */
 849                 ret = -EPERM;
 850                 goto out;
 851         }
 852
 853         pid_t initpid = lookup_initpid_in_store(fc->pid);
 854         if (initpid <= 1 || is_shared_pidns(initpid))
 855                 initpid = fc->pid;
 856         if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
 857                 if (!last || (next && (strcmp(next, last) == 0)))
 858                         ret = -EBUSY;
 859                 else
 860                         ret = -ENOENT;
 861                 goto out;
 862         }
 863
 864         if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
 865                 ret = -EACCES;
 866                 goto out;
 867         }
 868         if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
 869                 ret = -EACCES;
 870                 goto out;
 871         }
 872
 873         if (!cgfs_remove(controller, cgroup)) {
 874                 ret = -EINVAL;
 875                 goto out;
 876         }
 877
 878         ret = 0;
 879
 880 out:
 881         free(cgdir);
 882         free(next);
 883         return ret;
 884 }
 885
 886 static bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
 887 {
 888         int cfd;
 889         size_t len;
 890         char *pathname;
 891
 892         cfd = get_cgroup_fd(controller);
 893         if (cfd < 0)
 894                 return false;
 895
 896         /* Make sure we pass a relative path to *at() family of functions.
 897          * . + /file + \0
 898          */
 899         len = strlen(file) + 2;
 900         pathname = alloca(len);
 901         snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
 902         if (fchmodat(cfd, pathname, mode, 0) < 0)
 903                 return false;
 904         return true;
 905 }
 906
 907 int cg_chmod(const char *path, mode_t mode)
 908 {
 909         struct fuse_context *fc = fuse_get_context();
 910         char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
 911         struct cgfs_files *k = NULL;
 912         const char *cgroup;
 913         int ret;
 914
 915         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
 916                 return -EIO;
 917
 918         if (strcmp(path, "/cgroup") == 0)
 919                 return -EPERM;
 920
 921         controller = pick_controller_from_path(fc, path);
 922         if (!controller)
 923                 return errno == ENOENT ? -EPERM : -errno;
 924
 925         cgroup = find_cgroup_in_path(path);
 926         if (!cgroup)
 927                 /* this is just /cgroup/controller */
 928                 return -EPERM;
 929
 930         get_cgdir_and_path(cgroup, &cgdir, &last);
 931
 932         if (!last) {
 933                 path1 = "/";
 934                 path2 = cgdir;
 935         } else {
 936                 path1 = cgdir;
 937                 path2 = last;
 938         }
 939
 940         if (is_child_cgroup(controller, path1, path2)) {
 941                 // get uid, gid, from '/tasks' file and make up a mode
 942                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
 943                 k = cgfs_get_key(controller, cgroup, "tasks");
 944
 945         } else
 946                 k = cgfs_get_key(controller, path1, path2);
 947
 948         if (!k) {
 949                 ret = -EINVAL;
 950                 goto out;
 951         }
 952
 953         /*
 954          * This being a fuse request, the uid and gid must be valid
 955          * in the caller's namespace.  So we can just check to make
 956          * sure that the caller is root in his uid, and privileged
 957          * over the file's current owner.
 958          */
 959         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
 960                 ret = -EPERM;
 961                 goto out;
 962         }
 963
 964         if (!cgfs_chmod_file(controller, cgroup, mode)) {
 965                 ret = -EINVAL;
 966                 goto out;
 967         }
 968
 969         ret = 0;
 970 out:
 971         free_key(k);
 972         free(cgdir);
 973         return ret;
 974 }
 975
 976 static int is_dir(const char *path, int fd)
 977 {
 978         struct stat statbuf;
 979         int ret = fstatat(fd, path, &statbuf, fd);
 980         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 981                 return 1;
 982         return 0;
 983 }
 984
 985 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
 986 {
 987         size_t len;
 988         char *fname;
 989
 990         len = strlen(dirname) + strlen("/cgroup.procs") + 1;
 991         fname = alloca(len);
 992         snprintf(fname, len, "%s/tasks", dirname);
 993         if (fchownat(fd, fname, uid, gid, 0) != 0)
 994                 return -errno;
 995         snprintf(fname, len, "%s/cgroup.procs", dirname);
 996         if (fchownat(fd, fname, uid, gid, 0) != 0)
 997                 return -errno;
 998         return 0;
 999 }
1000
/*
 * Change the owner of @file, resolved relative to the fd that
 * get_cgroup_fd() returns for @controller, to @uid:@gid. If @file is a
 * directory, its "tasks" and "cgroup.procs" files are chowned as well.
 *
 * Returns 0 on success or a negative errno value:
 *   -EINVAL  no fd is available for @controller
 *   -errno   fchownat() failed on @file or on one of the tasks files
 */
static int cgfs_chown_file(const char *controller, const char *file, uid_t uid,
                           gid_t gid)
{
        int cfd;
        size_t len;
        char *pathname;

        cfd = get_cgroup_fd(controller);
        if (cfd < 0)
                /*
                 * 0 means success for this function, so "false" must not
                 * be returned on failure; report a real error instead.
                 */
                return -EINVAL;

        /* Make sure we pass a relative path to *at() family of functions.
         * . + /file + \0
         */
        len = strlen(file) + 2;
        pathname = alloca(len);
        snprintf(pathname, len, "%s%s", dot_or_empty(file), file);
        if (fchownat(cfd, pathname, uid, gid, 0) < 0)
                return -errno;

        if (is_dir(pathname, cfd))
                return chown_tasks_files(pathname, uid, gid, cfd);

        return 0;
}
1026
1027 int cg_chown(const char *path, uid_t uid, gid_t gid)
1028 {
1029         struct fuse_context *fc = fuse_get_context();
1030         char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1031         struct cgfs_files *k = NULL;
1032         const char *cgroup;
1033         int ret;
1034
1035         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1036                 return -EIO;
1037
1038         if (strcmp(path, "/cgroup") == 0)
1039                 return -EPERM;
1040
1041         controller = pick_controller_from_path(fc, path);
1042         if (!controller)
1043                 return errno == ENOENT ? -EPERM : -errno;
1044
1045         cgroup = find_cgroup_in_path(path);
1046         if (!cgroup)
1047                 /* this is just /cgroup/controller */
1048                 return -EPERM;
1049
1050         get_cgdir_and_path(cgroup, &cgdir, &last);
1051
1052         if (!last) {
1053                 path1 = "/";
1054                 path2 = cgdir;
1055         } else {
1056                 path1 = cgdir;
1057                 path2 = last;
1058         }
1059
1060         if (is_child_cgroup(controller, path1, path2)) {
1061                 // get uid, gid, from '/tasks' file and make up a mode
1062                 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1063                 k = cgfs_get_key(controller, cgroup, "tasks");
1064
1065         } else
1066                 k = cgfs_get_key(controller, path1, path2);
1067
1068         if (!k) {
1069                 ret = -EINVAL;
1070                 goto out;
1071         }
1072
1073         /*
1074          * This being a fuse request, the uid and gid must be valid
1075          * in the caller's namespace.  So we can just check to make
1076          * sure that the caller is root in his uid, and privileged
1077          * over the file's current owner.
1078          */
1079         if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1080                 ret = -EACCES;
1081                 goto out;
1082         }
1083
1084         ret = cgfs_chown_file(controller, cgroup, uid, gid);
1085
1086 out:
1087         free_key(k);
1088         free(cgdir);
1089
1090         return ret;
1091 }
1092
1093 int cg_open(const char *path, struct fuse_file_info *fi)
1094 {
1095         const char *cgroup;
1096         char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1097         struct cgfs_files *k = NULL;
1098         struct file_info *file_info;
1099         struct fuse_context *fc = fuse_get_context();
1100         int ret;
1101
1102         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1103                 return -EIO;
1104
1105         controller = pick_controller_from_path(fc, path);
1106         if (!controller)
1107                 return -errno;
1108         cgroup = find_cgroup_in_path(path);
1109         if (!cgroup)
1110                 return -errno;
1111
1112         get_cgdir_and_path(cgroup, &cgdir, &last);
1113         if (!last) {
1114                 path1 = "/";
1115                 path2 = cgdir;
1116         } else {
1117                 path1 = cgdir;
1118                 path2 = last;
1119         }
1120
1121         k = cgfs_get_key(controller, path1, path2);
1122         if (!k) {
1123                 ret = -EINVAL;
1124                 goto out;
1125         }
1126         free_key(k);
1127
1128         pid_t initpid = lookup_initpid_in_store(fc->pid);
1129         if (initpid <= 1 || is_shared_pidns(initpid))
1130                 initpid = fc->pid;
1131         if (!caller_may_see_dir(initpid, controller, path1)) {
1132                 ret = -ENOENT;
1133                 goto out;
1134         }
1135         if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1136                 ret = -EACCES;
1137                 goto out;
1138         }
1139
1140         /* we'll free this at cg_release */
1141         file_info = malloc(sizeof(*file_info));
1142         if (!file_info) {
1143                 ret = -ENOMEM;
1144                 goto out;
1145         }
1146         file_info->controller = must_copy_string(controller);
1147         file_info->cgroup = must_copy_string(path1);
1148         file_info->file = must_copy_string(path2);
1149         file_info->type = LXC_TYPE_CGFILE;
1150         file_info->buf = NULL;
1151         file_info->buflen = 0;
1152
1153         fi->fh = PTR_TO_UINT64(file_info);
1154         ret = 0;
1155
1156 out:
1157         free(cgdir);
1158         return ret;
1159 }
1160
1161 #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )
1162
/*
 * pid_to_ns - receive credential messages on @sock and, for each one, write
 * the received cred.pid back over the socket as a raw pid_t.  This shifts
 * the pid from the sender's pidns into the pidns this process runs in.
 *
 * A message whose flag byte is '1' asks us to stop.  @tpid is not used
 * here; it is part of the signature shared with pid_from_ns.
 *
 * Returns 0 when asked to stop or when recv_creds() reports failure, and 1
 * if writing a pid back fails.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
        struct ucred cred;
        char flag = '0';

        for (;;) {
                if (!recv_creds(sock, &cred, &flag))
                        break;

                if (flag == '1')
                        break;

                if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
                        return 1;
        }

        return 0;
}
1183
1184 /*
1185  * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
1186  * with clone(). This simply writes '1' as ACK back to the parent
1187  * before calling the actual wrapped function.
1188  */
1189 static int pid_ns_clone_wrapper(void *arg) {
1190         struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
1191         char b = '1';
1192
1193         close(args->cpipe[0]);
1194         if (write(args->cpipe[1], &b, sizeof(char)) < 0)
1195                 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
1196         close(args->cpipe[1]);
1197         return args->wrapped(args->sock, args->tpid);
1198 }
1199
1200 /*
1201  * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
1202  * in your old pidns.  Only children which you clone will be in the target
1203  * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
1204  * actually convert pids.
1205  *
1206  * Note: glibc's fork() does not respect pidns, which can lead to failed
1207  * assertions inside glibc (and thus failed forks) if the child's pid in
1208  * the pidns and the parent pid outside are identical. Using clone prevents
1209  * this issue.
1210  */
1211 static void pid_to_ns_wrapper(int sock, pid_t tpid)
1212 {
1213         int newnsfd = -1, ret, cpipe[2];
1214         char fnam[100];
1215         pid_t cpid;
1216         char v;
1217
1218         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
1219         if (ret < 0 || ret >= sizeof(fnam))
1220                 _exit(1);
1221         newnsfd = open(fnam, O_RDONLY);
1222         if (newnsfd < 0)
1223                 _exit(1);
1224         if (setns(newnsfd, 0) < 0)
1225                 _exit(1);
1226         close(newnsfd);
1227
1228         if (pipe(cpipe) < 0)
1229                 _exit(1);
1230
1231         struct pid_ns_clone_args args = {
1232                 .cpipe = cpipe,
1233                 .sock = sock,
1234                 .tpid = tpid,
1235                 .wrapped = &pid_to_ns
1236         };
1237         size_t stack_size = sysconf(_SC_PAGESIZE);
1238         void *stack = alloca(stack_size);
1239
1240         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
1241         if (cpid < 0)
1242                 _exit(1);
1243
1244         /* Give the child 1 second to be done forking and write its ack. */
1245         if (!wait_for_sock(cpipe[0], 1))
1246                 _exit(1);
1247         ret = read(cpipe[0], &v, 1);
1248         if (ret != sizeof(char) || v != '1')
1249                 _exit(1);
1250
1251         if (!wait_for_pid(cpid))
1252                 _exit(1);
1253         _exit(0);
1254 }
1255
/*
 * Append @pid, formatted as a decimal number followed by a newline, to *src.
 * src: a pointer to a char* to which the pid is appended.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far.
 * pid: the pid to append.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
        const int value = (int)pid;

        must_strcat(src, sz, asz, "%d\n", value);
}
1267
1268 /*
1269  * To read cgroup files with a particular pid, we will setns into the child
1270  * pidns, open a pipe, fork a child - which will be the first to really be in
1271  * the child ns - which does the cgfs_get_value and writes the data to the pipe.
1272  */
1273 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg,
1274                          const char *file, char **d)
1275 {
1276         int sock[2] = {-1, -1};
1277         char *tmpdata = NULL;
1278         int ret;
1279         pid_t qpid, cpid = -1;
1280         bool answer = false;
1281         char v = '0';
1282         struct ucred cred;
1283         size_t sz = 0, asz = 0;
1284
1285         if (!cgroup_ops->get(cgroup_ops, contrl, cg, file, &tmpdata))
1286                 return false;
1287
1288         /*
1289          * Now we read the pids from returned data one by one, pass
1290          * them into a child in the target namespace, read back the
1291          * translated pids, and put them into our to-return data
1292          */
1293
1294         if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1295                 perror("socketpair");
1296                 free(tmpdata);
1297                 return false;
1298         }
1299
1300         cpid = fork();
1301         if (cpid == -1)
1302                 goto out;
1303
1304         if (!cpid) // child - exits when done
1305                 pid_to_ns_wrapper(sock[1], tpid);
1306
1307         char *ptr = tmpdata;
1308         cred.uid = 0;
1309         cred.gid = 0;
1310         while (sscanf(ptr, "%d\n", &qpid) == 1) {
1311                 cred.pid = qpid;
1312                 ret = send_creds(sock[0], &cred, v, true);
1313
1314                 if (ret == SEND_CREDS_NOTSK)
1315                         goto next;
1316                 if (ret == SEND_CREDS_FAIL)
1317                         goto out;
1318
1319                 // read converted results
1320                 if (!wait_for_sock(sock[0], 2)) {
1321                         lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
1322                         goto out;
1323                 }
1324                 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1325                         lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
1326                         goto out;
1327                 }
1328                 must_strcat_pid(d, &sz, &asz, qpid);
1329 next:
1330                 ptr = strchr(ptr, '\n');
1331                 if (!ptr)
1332                         break;
1333                 ptr++;
1334         }
1335
1336         cred.pid = getpid();
1337         v = '1';
1338         if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1339                 // failed to ask child to exit
1340                 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
1341                 goto out;
1342         }
1343
1344         answer = true;
1345
1346 out:
1347         free(tmpdata);
1348         if (cpid != -1)
1349                 wait_for_pid(cpid);
1350         if (sock[0] != -1) {
1351                 close(sock[0]);
1352                 close(sock[1]);
1353         }
1354         return answer;
1355 }
1356
1357 int cg_read(const char *path, char *buf, size_t size, off_t offset,
1358             struct fuse_file_info *fi)
1359 {
1360         struct fuse_context *fc = fuse_get_context();
1361         struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1362         struct cgfs_files *k = NULL;
1363         char *data = NULL;
1364         int ret, s;
1365         bool r;
1366
1367         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1368                 return -EIO;
1369
1370         if (f->type != LXC_TYPE_CGFILE) {
1371                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
1372                 return -EIO;
1373         }
1374
1375         if (offset)
1376                 return 0;
1377
1378         if (!f->controller)
1379                 return -EINVAL;
1380
1381         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1382                 return -EINVAL;
1383         }
1384         free_key(k);
1385
1386
1387         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
1388                 ret = -EACCES;
1389                 goto out;
1390         }
1391
1392         if (strcmp(f->file, "tasks") == 0 ||
1393                         strcmp(f->file, "/tasks") == 0 ||
1394                         strcmp(f->file, "/cgroup.procs") == 0 ||
1395                         strcmp(f->file, "cgroup.procs") == 0)
1396                 // special case - we have to translate the pids
1397                 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1398         else
1399                 r = cgroup_ops->get(cgroup_ops, f->controller, f->cgroup, f->file, &data);
1400
1401         if (!r) {
1402                 ret = -EINVAL;
1403                 goto out;
1404         }
1405
1406         if (!data) {
1407                 ret = 0;
1408                 goto out;
1409         }
1410         s = strlen(data);
1411         if (s > size)
1412                 s = size;
1413         memcpy(buf, data, s);
1414         if (s > 0 && s < size && data[s-1] != '\n')
1415                 buf[s++] = '\n';
1416
1417         ret = s;
1418
1419 out:
1420         free(data);
1421         return ret;
1422 }
1423
1424 int cg_opendir(const char *path, struct fuse_file_info *fi)
1425 {
1426         struct fuse_context *fc = fuse_get_context();
1427         const char *cgroup;
1428         struct file_info *dir_info;
1429         char *controller = NULL;
1430
1431         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1432                 return -EIO;
1433
1434         if (strcmp(path, "/cgroup") == 0) {
1435                 cgroup = NULL;
1436                 controller = NULL;
1437         } else {
1438                 // return list of keys for the controller, and list of child cgroups
1439                 controller = pick_controller_from_path(fc, path);
1440                 if (!controller)
1441                         return -errno;
1442
1443                 cgroup = find_cgroup_in_path(path);
1444                 if (!cgroup) {
1445                         /* this is just /cgroup/controller, return its contents */
1446                         cgroup = "/";
1447                 }
1448         }
1449
1450         pid_t initpid = lookup_initpid_in_store(fc->pid);
1451         if (initpid <= 1 || is_shared_pidns(initpid))
1452                 initpid = fc->pid;
1453         if (cgroup) {
1454                 if (!caller_may_see_dir(initpid, controller, cgroup))
1455                         return -ENOENT;
1456                 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1457                         return -EACCES;
1458         }
1459
1460         /* we'll free this at cg_releasedir */
1461         dir_info = malloc(sizeof(*dir_info));
1462         if (!dir_info)
1463                 return -ENOMEM;
1464         dir_info->controller = must_copy_string(controller);
1465         dir_info->cgroup = must_copy_string(cgroup);
1466         dir_info->type = LXC_TYPE_CGDIR;
1467         dir_info->buf = NULL;
1468         dir_info->file = NULL;
1469         dir_info->buflen = 0;
1470
1471         fi->fh = PTR_TO_UINT64(dir_info);
1472         return 0;
1473 }
1474
/*
 * release handler for cgroup files: hands @fi to do_release_file_info()
 * and always returns 0.  @path is not used.
 *
 * NOTE(review): presumably this frees the file_info that cg_open stored in
 * fi->fh - confirm against do_release_file_info().
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
        do_release_file_info(fi);
        return 0;
}
1480
/*
 * releasedir handler for cgroup directories: hands @fi to
 * do_release_file_info() and always returns 0.  @path is not used.
 *
 * NOTE(review): presumably this frees the file_info that cg_opendir stored
 * in fi->fh - confirm against do_release_file_info().
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
        do_release_file_info(fi);
        return 0;
}
1486
/*
 * Open the "cgroup.procs" file of @cgroup for writing, resolved relative
 * to the fd that get_cgroup_fd() returns for @controller.
 *
 * Returns a stdio stream the caller must fclose(), or NULL if no fd is
 * available for @controller, the file cannot be opened, or no stream can
 * be created for it.
 */
static FILE *open_pids_file(const char *controller, const char *cgroup)
{
        int fd, cfd;
        size_t len;
        char *pathname;
        FILE *f;

        cfd = get_cgroup_fd(controller);
        if (cfd < 0)
                return NULL;

        /* Make sure we pass a relative path to *at() family of functions.
         * . + /cgroup + / "cgroup.procs" + \0
         */
        len = strlen(cgroup) + strlen("cgroup.procs") + 3;
        pathname = alloca(len);
        snprintf(pathname, len, "%s%s/cgroup.procs", dot_or_empty(cgroup), cgroup);

        fd = openat(cfd, pathname, O_WRONLY);
        if (fd < 0)
                return NULL;

        /*
         * fdopen() only takes ownership of the descriptor when it
         * succeeds; close it ourselves on failure so it does not leak.
         */
        f = fdopen(fd, "w");
        if (!f)
                close(fd);

        return f;
}
1510
1511 static int pid_from_ns(int sock, pid_t tpid)
1512 {
1513         pid_t vpid;
1514         struct ucred cred;
1515         char v;
1516         int ret;
1517
1518         cred.uid = 0;
1519         cred.gid = 0;
1520         while (1) {
1521                 if (!wait_for_sock(sock, 2)) {
1522                         lxcfs_error("%s\n", "Timeout reading from parent.");
1523                         return 1;
1524                 }
1525                 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1526                         lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
1527                         return 1;
1528                 }
1529                 if (vpid == -1) // done
1530                         break;
1531                 v = '0';
1532                 cred.pid = vpid;
1533                 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1534                         v = '1';
1535                         cred.pid = getpid();
1536                         if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1537                                 return 1;
1538                 }
1539         }
1540         return 0;
1541 }
1542
1543 static void pid_from_ns_wrapper(int sock, pid_t tpid)
1544 {
1545         int newnsfd = -1, ret, cpipe[2];
1546         char fnam[100];
1547         pid_t cpid;
1548         char v;
1549
1550         ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
1551         if (ret < 0 || ret >= sizeof(fnam))
1552                 _exit(1);
1553         newnsfd = open(fnam, O_RDONLY);
1554         if (newnsfd < 0)
1555                 _exit(1);
1556         if (setns(newnsfd, 0) < 0)
1557                 _exit(1);
1558         close(newnsfd);
1559
1560         if (pipe(cpipe) < 0)
1561                 _exit(1);
1562
1563         struct pid_ns_clone_args args = {
1564                 .cpipe = cpipe,
1565                 .sock = sock,
1566                 .tpid = tpid,
1567                 .wrapped = &pid_from_ns
1568         };
1569         size_t stack_size = sysconf(_SC_PAGESIZE);
1570         void *stack = alloca(stack_size);
1571
1572         cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
1573         if (cpid < 0)
1574                 _exit(1);
1575
1576         // give the child 1 second to be done forking and
1577         // write its ack
1578         if (!wait_for_sock(cpipe[0], 1))
1579                 _exit(1);
1580         ret = read(cpipe[0], &v, 1);
1581         if (ret != sizeof(char) || v != '1')
1582                 _exit(1);
1583
1584         if (!wait_for_pid(cpid))
1585                 _exit(1);
1586         _exit(0);
1587 }
1588
/*
 * get_pid_creds: read the first value of the "Uid:" and "Gid:" lines of
 * /proc/@pid/status into *uid and *gid.
 * (XXX should we use euid here?)
 *
 * Both outputs are preset to -1 and keep that value if the file cannot be
 * opened or the corresponding line is missing.  Parsing stops at the
 * first malformed line; a value stored before that point is kept.
 */
static void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
        char line[400];
        uid_t parsed_uid;
        gid_t parsed_gid;
        FILE *status;

        *uid = -1;
        *gid = -1;

        sprintf(line, "/proc/%d/status", pid);
        status = fopen(line, "re");
        if (!status) {
                lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
                return;
        }

        while (fgets(line, sizeof(line), status)) {
                if (!strncmp(line, "Uid:", 4)) {
                        if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
                                lxcfs_error("bad uid line for pid %u\n", pid);
                                break;
                        }
                        *uid = parsed_uid;
                } else if (!strncmp(line, "Gid:", 4)) {
                        if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
                                lxcfs_error("bad gid line for pid %u\n", pid);
                                break;
                        }
                        *gid = parsed_gid;
                }
        }

        fclose(status);
}
1627
/*
 * Given host @uid, store in *answer the value convert_id_to_ns() yields
 * for it from /proc/@pid/uid_map.
 *
 * Returns true if a mapping was found.  Returns false if the map file
 * cannot be opened (in which case *answer is left untouched) or if the
 * conversion yields -1.
 */
static bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
        char mapfile[400];
        FILE *map;

        sprintf(mapfile, "/proc/%d/uid_map", pid);
        map = fopen(mapfile, "re");
        if (!map)
                return false;

        *answer = convert_id_to_ns(map, uid);
        fclose(map);

        return *answer != -1;
}
1649
/*
 * May the requestor @r (running as @r_uid) move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r_uid maps to root in @r's user namespace and @v's uid is mapped
 *     into that namespace as well.
 */
static bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
        uid_t v_uid, mapped;
        gid_t v_gid;

        if (r == v || r_uid == 0)
                return true;

        get_pid_creds(v, &v_uid, &v_gid);
        if (r_uid == v_uid)
                return true;

        /* The requestor has to be root inside its own user namespace ... */
        if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
                return false;

        /* ... and the victim's uid has to be mapped there too. */
        return hostuid_to_ns(v_uid, r, &mapped);
}
1675
/*
 * Move the pids listed in @buf (one per line, as seen in the pid namespace
 * of the writer @tpid) into cgroup @cg of controller @contrl.
 *
 * A helper process is forked; it setns()es into @tpid's pidns and clones a
 * child there (see pid_from_ns_wrapper).  Each pid from @buf is sent to
 * that child, which returns it as socket credentials, i.e. as the pid
 * valid in our own namespace.  If may_move_pid() allows it for
 * @tpid/@tuid, that pid is written to the cgroup's cgroup.procs file.
 *
 * @file is not used: open_pids_file() always opens "cgroup.procs".
 *
 * Returns true if every pid that was translated could be written, false
 * otherwise.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl,
                          const char *cg, const char *file, const char *buf)
{
        int sock[2] = {-1, -1};
        pid_t qpid, cpid = -1;
        FILE *pids_file = NULL;
        bool answer = false, fail = false;
        const char *ptr;

        pids_file = open_pids_file(contrl, cg);
        if (!pids_file)
                return false;

        /*
         * write the pids to a socket, have helper in writer's pidns
         * call movepid for us
         */
        if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
                perror("socketpair");
                goto out;
        }

        cpid = fork();
        if (cpid == -1)
                goto out;

        if (!cpid) { /* child - never returns from the wrapper */
                fclose(pids_file);
                pid_from_ns_wrapper(sock[1], tpid);
        }

        ptr = buf;
        while (sscanf(ptr, "%d", &qpid) == 1) {
                struct ucred cred;
                char v;

                if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
                        lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
                        goto out;
                }

                if (recv_creds(sock[0], &cred, &v)) {
                        if (v == '0') {
                                if (!may_move_pid(tpid, tuid, cred.pid)) {
                                        fail = true;
                                        break;
                                }
                                /*
                                 * The kernel attaches one task per write.
                                 * Flush after every pid so that each one
                                 * reaches the file as its own write()
                                 * instead of several pids being glued
                                 * together in the stdio buffer.
                                 */
                                if (fprintf(pids_file, "%d", (int) cred.pid) < 0 ||
                                    fflush(pids_file) != 0)
                                        fail = true;
                        }
                }

                ptr = strchr(ptr, '\n');
                if (!ptr)
                        break;
                ptr++;
        }

        /* A pid of -1 tells the child that we are done. */
        qpid = -1;
        if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
                lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

        if (!fail)
                answer = true;

out:
        if (cpid != -1)
                wait_for_pid(cpid);
        if (sock[0] != -1) {
                close(sock[0]);
                close(sock[1]);
        }
        if (pids_file) {
                if (fclose(pids_file) != 0)
                        answer = false;
        }
        return answer;
}
1754
/*
 * Write @string to the already opened descriptor @fd through a stdio
 * stream.  @fnam is only used in error messages.
 *
 * This function takes ownership of @fd: it is closed on every path,
 * success or failure.
 *
 * Returns true if the whole string was written and the stream was closed
 * without error, false otherwise.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
        FILE *f;
        size_t len, ret;

        f = fdopen(fd, "w");
        if (!f) {
                /*
                 * fdopen() leaves the descriptor open when it fails, so it
                 * has to be closed here to avoid leaking it.
                 */
                close(fd);
                return false;
        }

        len = strlen(string);
        ret = fwrite(string, 1, len, f);
        if (ret != len) {
                lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
                            strerror(errno), string, fnam);
                fclose(f);
                return false;
        }

        /* fclose() flushes, so a failed write can surface only here. */
        if (fclose(f) < 0) {
                lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
                return false;
        }

        return true;
}
1780
/*
 * Write @value to @file of cgroup @cgroup, resolved relative to the fd
 * that get_cgroup_fd() returns for @controller.
 *
 * Returns true on success; false if no fd is available for @controller,
 * the path cannot be built, the file cannot be opened for writing, or
 * write_string() fails.
 */
static bool cgfs_set_value(const char *controller, const char *cgroup,
                           const char *file, const char *value)
{
        const int mount_fd = get_cgroup_fd(controller);
        size_t bufsz;
        char *relpath;
        int n, fd;

        if (mount_fd < 0)
                return false;

        /*
         * The *at() family of functions must be handed a relative path:
         * optional "." + /cgroup + / + file + \0.
         */
        bufsz = strlen(cgroup) + strlen(file) + 3;
        relpath = alloca(bufsz);
        n = snprintf(relpath, bufsz, "%s%s/%s", dot_or_empty(cgroup), cgroup, file);
        if (n < 0 || (size_t)n >= bufsz)
                return false;

        fd = openat(mount_fd, relpath, O_WRONLY);
        if (fd < 0)
                return false;

        /* write_string() is handed the descriptor from here on. */
        return write_string(relpath, value, fd);
}
1807
1808 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1809              struct fuse_file_info *fi)
1810 {
1811         struct fuse_context *fc = fuse_get_context();
1812         char *localbuf = NULL;
1813         struct cgfs_files *k = NULL;
1814         struct file_info *f = INTTYPE_TO_PTR(fi->fh);
1815         bool r;
1816
1817         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1818                 return -EIO;
1819
1820         if (f->type != LXC_TYPE_CGFILE) {
1821                 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
1822                 return -EIO;
1823         }
1824
1825         if (offset)
1826                 return 0;
1827
1828         localbuf = alloca(size+1);
1829         localbuf[size] = '\0';
1830         memcpy(localbuf, buf, size);
1831
1832         if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1833                 size = -EINVAL;
1834                 goto out;
1835         }
1836
1837         if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1838                 size = -EACCES;
1839                 goto out;
1840         }
1841
1842         if (strcmp(f->file, "tasks") == 0 ||
1843                         strcmp(f->file, "/tasks") == 0 ||
1844                         strcmp(f->file, "/cgroup.procs") == 0 ||
1845                         strcmp(f->file, "cgroup.procs") == 0)
1846                 // special case - we have to translate the pids
1847                 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
1848         else
1849                 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
1850
1851         if (!r)
1852                 size = -EINVAL;
1853
1854 out:
1855         free_key(k);
1856         return size;
1857 }
1858
1859 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup,
1860                                 bool directories, void ***list, size_t typesize,
1861                                 void *(*iterator)(const char *, const char *, const char *))
1862 {
1863         int cfd, fd, ret;
1864         size_t len;
1865         char *cg;
1866         char pathname[MAXPATHLEN];
1867         size_t sz = 0, asz = 0;
1868         struct dirent *dirent;
1869         DIR *dir;
1870
1871         cfd = get_cgroup_fd(controller);
1872         *list = NULL;
1873         if (cfd < 0)
1874                 return false;
1875
1876         /* Make sure we pass a relative path to *at() family of functions. */
1877         len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1878         cg = alloca(len);
1879         ret = snprintf(cg, len, "%s%s", dot_or_empty(cgroup), cgroup);
1880         if (ret < 0 || (size_t)ret >= len) {
1881                 lxcfs_error("Pathname too long under %s\n", cgroup);
1882                 return false;
1883         }
1884
1885         fd = openat(cfd, cg, O_DIRECTORY);
1886         if (fd < 0)
1887                 return false;
1888
1889         dir = fdopendir(fd);
1890         if (!dir)
1891                 return false;
1892
1893         while ((dirent = readdir(dir))) {
1894                 struct stat mystat;
1895
1896                 if (!strcmp(dirent->d_name, ".") ||
1897                     !strcmp(dirent->d_name, ".."))
1898                         continue;
1899
1900                 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1901                 if (ret < 0 || ret >= MAXPATHLEN) {
1902                         lxcfs_error("Pathname too long under %s\n", cg);
1903                         continue;
1904                 }
1905
1906                 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1907                 if (ret) {
1908                         lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1909                         continue;
1910                 }
1911                 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1912                     (directories && !S_ISDIR(mystat.st_mode)))
1913                         continue;
1914
1915                 if (sz+2 >= asz) {
1916                         void **tmp;
1917                         asz += BATCH_SIZE;
1918                         do {
1919                                 tmp = realloc(*list, asz * typesize);
1920                         } while  (!tmp);
1921                         *list = tmp;
1922                 }
1923                 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1924                 (*list)[sz+1] = NULL;
1925                 sz++;
1926         }
1927         if (closedir(dir) < 0) {
1928                 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1929                 return false;
1930         }
1931         return true;
1932 }
1933
/*
 * Iterator callback for cgfs_iterate_cgroup(): look up the key description
 * for @dir_entry inside @controller:@cgroup via cgfs_get_key().
 *
 * Returns the key, or NULL (after logging) when the lookup failed.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup,
                                 const char *dir_entry)
{
        struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

        if (key)
                return key;

        lxcfs_error("Failed to retrieve files under %s:%s\n",
                    controller, cgroup);
        return NULL;
}
1945
1946 static bool cgfs_list_keys(const char *controller, const char *cgroup,
1947                            struct cgfs_files ***keys)
1948 {
1949         return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys,
1950                                    sizeof(*keys), &make_key_list_entry);
1951 }
1952
/*
 * Iterator callback for cgfs_iterate_cgroup(): return a heap copy of the
 * directory entry name. @controller and @cgroup are not used.
 *
 * Returns the strdup()ed name, or NULL if strdup() fails; the caller owns
 * the copy.
 */
static void *make_children_list_entry(const char *controller,
                                      const char *cgroup, const char *dir_entry)
{
        char *name_copy = strdup(dir_entry);

        return name_copy;
}
1958
1959 static bool cgfs_list_children(const char *controller, const char *cgroup,
1960                                char ***list)
1961 {
1962         return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list,
1963                                    sizeof(*list), &make_children_list_entry);
1964 }
1965
/*
 * Release a NULL-terminated array of keys: every element is handed to
 * free_key(), then the array itself is released with free_disarm().
 * A NULL @keys is accepted and ignored.
 */
static void free_keys(struct cgfs_files **keys)
{
        struct cgfs_files **cursor;

        if (keys == NULL)
                return;

        cursor = keys;
        while (*cursor != NULL) {
                free_key(*cursor);
                cursor++;
        }

        free_disarm(keys);
}
1976
1977 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
1978                off_t offset, struct fuse_file_info *fi)
1979 {
1980         struct file_info *d = INTTYPE_TO_PTR(fi->fh);
1981         struct cgfs_files **list = NULL;
1982         int i, ret;
1983         char *nextcg = NULL;
1984         struct fuse_context *fc = fuse_get_context();
1985         char **clist = NULL;
1986
1987         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1988                 return -EIO;
1989
1990         if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1991                 return -EIO;
1992
1993         if (d->type != LXC_TYPE_CGDIR) {
1994                 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1995                 return -EIO;
1996         }
1997         if (!d->cgroup && !d->controller) {
1998                 /*
1999                  * ls /var/lib/lxcfs/cgroup - just show list of controllers.
2000                  * This only works with the legacy hierarchy.
2001                  */
2002                 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
2003                         if (is_unified_hierarchy(*h))
2004                                 continue;
2005
2006                         if ((*h)->__controllers && filler(buf, (*h)->__controllers, NULL, 0))
2007                                 return -EIO;
2008                 }
2009
2010                 return 0;
2011         }
2012
2013         if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
2014                 // not a valid cgroup
2015                 ret = -EINVAL;
2016                 goto out;
2017         }
2018
2019         pid_t initpid = lookup_initpid_in_store(fc->pid);
2020         if (initpid <= 1 || is_shared_pidns(initpid))
2021                 initpid = fc->pid;
2022         if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
2023                 if (nextcg) {
2024                         ret = filler(buf, nextcg,  NULL, 0);
2025                         free(nextcg);
2026                         if (ret != 0) {
2027                                 ret = -EIO;
2028                                 goto out;
2029                         }
2030                 }
2031                 ret = 0;
2032                 goto out;
2033         }
2034
2035         for (i = 0; list && list[i]; i++) {
2036                 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2037                         ret = -EIO;
2038                         goto out;
2039                 }
2040         }
2041
2042         // now get the list of child cgroups
2043
2044         if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2045                 ret = 0;
2046                 goto out;
2047         }
2048         if (clist) {
2049                 for (i = 0; clist[i]; i++) {
2050                         if (filler(buf, clist[i], NULL, 0) != 0) {
2051                                 ret = -EIO;
2052                                 goto out;
2053                         }
2054                 }
2055         }
2056         ret = 0;
2057
2058 out:
2059         free_keys(list);
2060         if (clist) {
2061                 for (i = 0; clist[i]; i++)
2062                         free(clist[i]);
2063                 free(clist);
2064         }
2065         return ret;
2066 }
2067
2068 int cg_access(const char *path, int mode)
2069 {
2070         int ret;
2071         const char *cgroup;
2072         char *path1, *path2, *controller;
2073         char *last = NULL, *cgdir = NULL;
2074         struct cgfs_files *k = NULL;
2075         struct fuse_context *fc = fuse_get_context();
2076
2077         if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2078                 return -EIO;
2079
2080         if (strcmp(path, "/cgroup") == 0)
2081                 return 0;
2082
2083         controller = pick_controller_from_path(fc, path);
2084         if (!controller)
2085                 return -errno;
2086         cgroup = find_cgroup_in_path(path);
2087         if (!cgroup) {
2088                 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2089                 if ((mode & W_OK) == 0)
2090                         return 0;
2091                 return -EACCES;
2092         }
2093
2094         get_cgdir_and_path(cgroup, &cgdir, &last);
2095         if (!last) {
2096                 path1 = "/";
2097                 path2 = cgdir;
2098         } else {
2099                 path1 = cgdir;
2100                 path2 = last;
2101         }
2102
2103         k = cgfs_get_key(controller, path1, path2);
2104         if (!k) {
2105                 if ((mode & W_OK) == 0)
2106                         ret = 0;
2107                 else
2108                         ret = -EACCES;
2109                 goto out;
2110         }
2111         free_key(k);
2112
2113         pid_t initpid = lookup_initpid_in_store(fc->pid);
2114         if (initpid <= 1 || is_shared_pidns(initpid))
2115                 initpid = fc->pid;
2116         if (!caller_may_see_dir(initpid, controller, path1)) {
2117                 ret = -ENOENT;
2118                 goto out;
2119         }
2120         if (!fc_may_access(fc, controller, path1, path2, mode)) {
2121                 ret = -EACCES;
2122                 goto out;
2123         }
2124
2125         ret = 0;
2126
2127 out:
2128         free(cgdir);
2129         return ret;
2130 }