src/lxc/utils.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE 1
   5 #endif
   6 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
   7 #include <ctype.h>
   8 #include <dirent.h>
   9 #include <errno.h>
  10 #include <fcntl.h>
  11 #include <grp.h>
  12 #include <inttypes.h>
  13 #include <libgen.h>
  14 #include <pthread.h>
  15 #include <signal.h>
  16 #include <stddef.h>
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <sys/mman.h>
  21 #include <sys/mount.h>
  22 /* Needs to be after sys/mount.h header */
  23 #include <linux/fs.h>
  24 #include <sys/param.h>
  25 #include <sys/prctl.h>
  26 #include <sys/stat.h>
  27 #include <sys/types.h>
  28 #include <sys/wait.h>
  29 #include <unistd.h>
  30
  31 #include "config.h"
  32 #include "log.h"
  33 #include "lsm/lsm.h"
  34 #include "lxclock.h"
  35 #include "memory_utils.h"
  36 #include "namespace.h"
  37 #include "parse.h"
  38 #include "raw_syscalls.h"
  39 #include "syscall_wrappers.h"
  40 #include "utils.h"
  41
  42 #ifndef HAVE_STRLCPY
  43 #include "include/strlcpy.h"
  44 #endif
  45
  46 #ifndef HAVE_STRLCAT
  47 #include "include/strlcat.h"
  48 #endif
  49
  50 #ifndef O_PATH
  51 #define O_PATH      010000000
  52 #endif
  53
  54 #ifndef O_NOFOLLOW
  55 #define O_NOFOLLOW  00400000
  56 #endif
  57
  58 lxc_log_define(utils, lxc);
  59
/*
 * If path is btrfs, try to remove it and any subvolumes beneath it.
 */
  63 extern bool btrfs_try_remove_subvol(const char *path);
  64
/*
 * Recursively delete everything below @dirname and then @dirname itself.
 *
 * @dirname: directory to remove.
 * @pdev:    device number that each entry's st_dev is compared against when
 *           @onedev is set.
 * @exclude: optional entry name. It is only honoured at @level 0: a matching
 *           entry is removed with rmdir() (or unlink() if it is not a
 *           directory) but is never descended into.
 * @level:   recursion depth; 0 for the outermost call.
 * @onedev:  when true, entries whose st_dev differs from @pdev are neither
 *           descended into nor unlinked; only btrfs_try_remove_subvol() is
 *           tried on them.
 *
 * Returns 0 if nothing failed and -1 otherwise. A failure on one entry is
 * recorded but does not stop the processing of the remaining entries.
 */
static int _recursive_rmdir(const char *dirname, dev_t pdev,
                            const char *exclude, int level, bool onedev)
{
        __do_closedir DIR *dir = NULL;
        int failed = 0;
        bool hadexclude = false;
        int ret;
        struct dirent *direntp;
        char pathname[PATH_MAX];

        dir = opendir(dirname);
        if (!dir)
                return log_error(-1, "Failed to open \"%s\"", dirname);

        while ((direntp = readdir(dir))) {
                int rc;
                struct stat mystat;

                if (!strcmp(direntp->d_name, ".") ||
                    !strcmp(direntp->d_name, ".."))
                        continue;

                /* Build "<dirname>/<entry>"; an overlong path is a failure. */
                rc = snprintf(pathname, PATH_MAX, "%s/%s", dirname, direntp->d_name);
                if (rc < 0 || rc >= PATH_MAX) {
                        ERROR("The name of path is too long");
                        failed = 1;
                        continue;
                }

                /*
                 * The excluded entry (top level only) is removed only if
                 * that can be done without recursing into it.
                 */
                if (!level && exclude && !strcmp(direntp->d_name, exclude)) {
                        ret = rmdir(pathname);
                        if (ret < 0) {
                                switch (errno) {
                                case ENOTEMPTY:
                                        /*
                                         * Left in place on purpose; remember
                                         * it so that the failing rmdir() of
                                         * @dirname below is not an error.
                                         */
                                        INFO("Not deleting snapshot \"%s\"", pathname);
                                        hadexclude = true;
                                        break;
                                case ENOTDIR:
                                        /* Not a directory: a failed unlink() is only logged. */
                                        ret = unlink(pathname);
                                        if (ret)
                                                INFO("Failed to remove \"%s\"", pathname);
                                        break;
                                default:
                                        SYSERROR("Failed to rmdir \"%s\"", pathname);
                                        failed = 1;
                                        break;
                                }
                        }

                        continue;
                }

                /* lstat() so that symlinks are unlinked, not followed. */
                ret = lstat(pathname, &mystat);
                if (ret) {
                        SYSERROR("Failed to stat \"%s\"", pathname);
                        failed = 1;
                        continue;
                }

                /*
                 * Entry is on another device: try to remove it as a btrfs
                 * subvolume and skip it either way, without marking failure.
                 */
                if (onedev && mystat.st_dev != pdev) {
                        if (btrfs_try_remove_subvol(pathname))
                                INFO("Removed btrfs subvolume at \"%s\"", pathname);
                        continue;
                }

                if (S_ISDIR(mystat.st_mode)) {
                        if (_recursive_rmdir(pathname, pdev, exclude, level + 1, onedev) < 0)
                                failed = 1;
                } else {
                        ret = unlink(pathname);
                        if (ret < 0) {
                                __do_close int fd = -EBADF;

                                /* First unlink() failed: clear the immutable flag and retry. */
                                fd = open(pathname, O_RDONLY | O_CLOEXEC | O_NONBLOCK);
                                if (fd >= 0) {
                                        /* The file might be marked immutable. */
                                        int attr = 0;
                                        ret = ioctl(fd, FS_IOC_GETFLAGS, &attr);
                                        if (ret < 0)
                                                SYSERROR("Failed to retrieve file flags");
                                        /*
                                         * NOTE(review): if FS_IOC_GETFLAGS
                                         * failed, attr is still 0 and the
                                         * FS_IOC_SETFLAGS below is attempted
                                         * anyway with all flags cleared.
                                         */
                                        attr &= ~FS_IMMUTABLE_FL;
                                        ret = ioctl(fd, FS_IOC_SETFLAGS, &attr);
                                        if (ret < 0)
                                                SYSERROR("Failed to set file flags");
                                }

                                /* Only the result of this second attempt counts. */
                                ret = unlink(pathname);
                                if (ret < 0) {
                                        SYSERROR("Failed to delete \"%s\"", pathname);
                                        failed = 1;
                                }
                        }
                }
        }

        /*
         * Remove the now (hopefully) empty directory. If rmdir() fails, fall
         * back to btrfs subvolume removal; a failure of both is tolerated
         * when the excluded entry was deliberately left behind.
         */
        if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
                SYSERROR("Failed to delete \"%s\"", dirname);
                failed = 1;
        }

        return failed ? -1 : 0;
}
 167
 168 /*
 169  * In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
 170  * lxc_rmdir_onedev().
 171  */
 172 static inline bool is_native_overlayfs(const char *path)
 173 {
 174         return has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
 175                has_fs_type(path, OVERLAYFS_SUPER_MAGIC);
 176 }
 177
 178 /* returns 0 on success, -1 if there were any failures */
 179 extern int lxc_rmdir_onedev(const char *path, const char *exclude)
 180 {
 181         struct stat mystat;
 182         bool onedev = true;
 183
 184         if (is_native_overlayfs(path))
 185                 onedev = false;
 186
 187         if (lstat(path, &mystat) < 0) {
 188                 if (errno == ENOENT)
 189                         return 0;
 190
 191                 return log_error_errno(-1, errno, "Failed to stat \"%s\"", path);
 192         }
 193
 194         return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev);
 195 }
 196
 197 /* borrowed from iproute2 */
 198 extern int get_u16(unsigned short *val, const char *arg, int base)
 199 {
 200         unsigned long res;
 201         char *ptr;
 202
 203         if (!arg || !*arg)
 204                 return -1;
 205
 206         errno = 0;
 207         res = strtoul(arg, &ptr, base);
 208         if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0)
 209                 return -1;
 210
 211         *val = res;
 212
 213         return 0;
 214 }
 215
 216 int mkdir_p(const char *dir, mode_t mode)
 217 {
 218         const char *tmp = dir;
 219         const char *orig = dir;
 220
 221         do {
 222                 __do_free char *makeme = NULL;
 223                 int ret;
 224
 225                 dir = tmp + strspn(tmp, "/");
 226                 tmp = dir + strcspn(dir, "/");
 227
 228                 makeme = strndup(orig, dir - orig);
 229                 if (!makeme)
 230                         return ret_set_errno(-1, ENOMEM);
 231
 232                 ret = mkdir(makeme, mode);
 233                 if (ret < 0 && errno != EEXIST)
 234                         return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
 235
 236         } while (tmp != dir);
 237
 238         return 0;
 239 }
 240
 241 char *get_rundir()
 242 {
 243         char *rundir;
 244         size_t len;
 245         const char *homedir;
 246         struct stat sb;
 247
 248         if (stat(RUNTIME_PATH, &sb) < 0)
 249                 return NULL;
 250
 251         if (geteuid() == sb.st_uid || getegid() == sb.st_gid)
 252                 return strdup(RUNTIME_PATH);
 253
 254         rundir = getenv("XDG_RUNTIME_DIR");
 255         if (rundir)
 256                 return strdup(rundir);
 257
 258         INFO("XDG_RUNTIME_DIR isn't set in the environment");
 259         homedir = getenv("HOME");
 260         if (!homedir)
 261                 return log_error(NULL, "HOME isn't set in the environment");
 262
 263         len = strlen(homedir) + 17;
 264         rundir = malloc(sizeof(char) * len);
 265         if (!rundir)
 266                 return NULL;
 267
 268         snprintf(rundir, len, "%s/.cache/lxc/run/", homedir);
 269         return rundir;
 270 }
 271
 272 int wait_for_pid(pid_t pid)
 273 {
 274         int status, ret;
 275
 276 again:
 277         ret = waitpid(pid, &status, 0);
 278         if (ret == -1) {
 279                 if (errno == EINTR)
 280                         goto again;
 281
 282                 return -1;
 283         }
 284
 285         if (ret != pid)
 286                 goto again;
 287
 288         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
 289                 return -1;
 290
 291         return 0;
 292 }
 293
 294 int wait_for_pidfd(int pidfd)
 295 {
 296         int ret;
 297         siginfo_t info = {
 298                 .si_signo = 0,
 299         };
 300
 301         do {
 302                 ret = waitid(P_PIDFD, pidfd, &info, __WALL | WEXITED);
 303         } while (ret < 0 && errno == EINTR);
 304
 305         return !ret && WIFEXITED(info.si_status) && WEXITSTATUS(info.si_status) == 0;
 306 }
 307
 308 int lxc_wait_for_pid_status(pid_t pid)
 309 {
 310         int status, ret;
 311
 312 again:
 313         ret = waitpid(pid, &status, 0);
 314         if (ret == -1) {
 315                 if (errno == EINTR)
 316                         goto again;
 317
 318                 return -1;
 319         }
 320
 321         if (ret != pid)
 322                 goto again;
 323
 324         return status;
 325 }
 326
 327 #ifdef HAVE_OPENSSL
 328 #include <openssl/evp.h>
 329
 330 static int do_sha1_hash(const char *buf, int buflen, unsigned char *md_value,
 331                         unsigned int *md_len)
 332 {
 333         EVP_MD_CTX *mdctx;
 334         const EVP_MD *md;
 335
 336         md = EVP_get_digestbyname("sha1");
 337         if (!md)
 338                 return log_error(-1, "Unknown message digest: sha1\n");
 339
 340         mdctx = EVP_MD_CTX_create();
 341         EVP_DigestInit_ex(mdctx, md, NULL);
 342         EVP_DigestUpdate(mdctx, buf, buflen);
 343         EVP_DigestFinal_ex(mdctx, md_value, md_len);
 344         EVP_MD_CTX_destroy(mdctx);
 345
 346         return 0;
 347 }
 348
 349 int sha1sum_file(char *fnam, unsigned char *digest, unsigned int *md_len)
 350 {
 351         __do_free char *buf = NULL;
 352         __do_fclose FILE *f = NULL;
 353         int ret;
 354         long flen;
 355
 356         if (!fnam)
 357                 return -1;
 358
 359         f = fopen_cloexec(fnam, "r");
 360         if (!f)
 361                 return log_error_errno(-1, errno, "Failed to open template \"%s\"", fnam);
 362
 363         if (fseek(f, 0, SEEK_END) < 0)
 364                 return log_error_errno(-1, errno, "Failed to seek to end of template");
 365
 366         flen = ftell(f);
 367         if (flen < 0)
 368                 return log_error_errno(-1, errno, "Failed to tell size of template");
 369
 370         if (fseek(f, 0, SEEK_SET) < 0)
 371                 return log_error_errno(-1, errno, "Failed to seek to start of template");
 372
 373         buf = malloc(flen + 1);
 374         if (!buf)
 375                 return log_error_errno(-1, ENOMEM, "Out of memory");
 376
 377         if (fread(buf, 1, flen, f) != flen)
 378                 return log_error_errno(-1, errno, "Failed to read template");
 379
 380         buf[flen] = '\0';
 381         ret = do_sha1_hash(buf, flen, (void *)digest, md_len);
 382         return ret;
 383 }
 384 #endif
 385
 386 struct lxc_popen_FILE *lxc_popen(const char *command)
 387 {
 388         int ret;
 389         int pipe_fds[2];
 390         pid_t child_pid;
 391         struct lxc_popen_FILE *fp = NULL;
 392
 393         ret = pipe2(pipe_fds, O_CLOEXEC);
 394         if (ret < 0)
 395                 return NULL;
 396
 397         child_pid = fork();
 398         if (child_pid < 0)
 399                 goto on_error;
 400
 401         if (!child_pid) {
 402                 sigset_t mask;
 403
 404                 close(pipe_fds[0]);
 405
 406                 /* duplicate stdout */
 407                 if (pipe_fds[1] != STDOUT_FILENO)
 408                         ret = dup2(pipe_fds[1], STDOUT_FILENO);
 409                 else
 410                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 411                 if (ret < 0) {
 412                         close(pipe_fds[1]);
 413                         _exit(EXIT_FAILURE);
 414                 }
 415
 416                 /* duplicate stderr */
 417                 if (pipe_fds[1] != STDERR_FILENO)
 418                         ret = dup2(pipe_fds[1], STDERR_FILENO);
 419                 else
 420                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 421                 close(pipe_fds[1]);
 422                 if (ret < 0)
 423                         _exit(EXIT_FAILURE);
 424
 425                 /* unblock all signals */
 426                 ret = sigfillset(&mask);
 427                 if (ret < 0)
 428                         _exit(EXIT_FAILURE);
 429
 430                 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
 431                 if (ret < 0)
 432                         _exit(EXIT_FAILURE);
 433
 434                 /* check if /bin/sh exist, otherwise try Android location /system/bin/sh */
 435                 if (file_exists("/bin/sh"))
 436                         execl("/bin/sh", "sh", "-c", command, (char *)NULL);
 437                 else
 438                         execl("/system/bin/sh", "sh", "-c", command, (char *)NULL);
 439
 440                 _exit(127);
 441         }
 442
 443         close(pipe_fds[1]);
 444         pipe_fds[1] = -1;
 445
 446         fp = malloc(sizeof(*fp));
 447         if (!fp)
 448                 goto on_error;
 449
 450         memset(fp, 0, sizeof(*fp));
 451
 452         fp->child_pid = child_pid;
 453         fp->pipe = pipe_fds[0];
 454
 455         /* From now on, closing fp->f will also close fp->pipe. So only ever
 456          * call fclose(fp->f).
 457          */
 458         fp->f = fdopen(pipe_fds[0], "r");
 459         if (!fp->f)
 460                 goto on_error;
 461
 462         return fp;
 463
 464 on_error:
 465         /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
 466          * called yet. Otherwise the fd belongs to the file opened by fdopen()
 467          * since it isn't dup()ed.
 468          */
 469         if (fp && !fp->f && pipe_fds[0] >= 0)
 470                 close(pipe_fds[0]);
 471
 472         if (pipe_fds[1] >= 0)
 473                 close(pipe_fds[1]);
 474
 475         if (fp && fp->f)
 476                 fclose(fp->f);
 477
 478         if (fp)
 479                 free(fp);
 480
 481         return NULL;
 482 }
 483
 484 int lxc_pclose(struct lxc_popen_FILE *fp)
 485 {
 486         pid_t wait_pid;
 487         int wstatus = 0;
 488
 489         if (!fp)
 490                 return -1;
 491
 492         do {
 493                 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
 494         } while (wait_pid < 0 && errno == EINTR);
 495
 496         fclose(fp->f);
 497         free(fp);
 498
 499         if (wait_pid < 0)
 500                 return -1;
 501
 502         return wstatus;
 503 }
 504
 505 int randseed(bool srand_it)
 506 {
 507         __do_fclose FILE *f = NULL;
 508         /*
 509          * srand pre-seed function based on /dev/urandom
 510          */
 511         unsigned int seed = time(NULL) + getpid();
 512
 513         f = fopen("/dev/urandom", "re");
 514         if (f) {
 515                 int ret = fread(&seed, sizeof(seed), 1, f);
 516                 if (ret != 1)
 517                         SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
 518         }
 519
 520         if (srand_it)
 521                 srand(seed);
 522
 523         return seed;
 524 }
 525
 526 uid_t get_ns_uid(uid_t orig)
 527 {
 528         __do_free char *line = NULL;
 529         __do_fclose FILE *f = NULL;
 530         size_t sz = 0;
 531         uid_t nsid, hostid, range;
 532
 533         f = fopen("/proc/self/uid_map", "re");
 534         if (!f)
 535                 return log_error_errno(0, errno, "Failed to open uid_map");
 536
 537         while (getline(&line, &sz, f) != -1) {
 538                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 539                         continue;
 540
 541                 if (hostid <= orig && hostid + range > orig)
 542                         return nsid += orig - hostid;
 543         }
 544
 545         return LXC_INVALID_UID;
 546 }
 547
 548 gid_t get_ns_gid(gid_t orig)
 549 {
 550         __do_free char *line = NULL;
 551         __do_fclose FILE *f = NULL;
 552         size_t sz = 0;
 553         gid_t nsid, hostid, range;
 554
 555         f = fopen("/proc/self/gid_map", "re");
 556         if (!f)
 557                 return log_error_errno(0, errno, "Failed to open gid_map");
 558
 559         while (getline(&line, &sz, f) != -1) {
 560                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 561                         continue;
 562
 563                 if (hostid <= orig && hostid + range > orig)
 564                         return nsid += orig - hostid;
 565         }
 566
 567         return LXC_INVALID_GID;
 568 }
 569
 570 bool dir_exists(const char *path)
 571 {
 572         struct stat sb;
 573         int ret;
 574
 575         ret = stat(path, &sb);
 576         if (ret < 0)
 577                 /* Could be something other than eexist, just say "no". */
 578                 return false;
 579
 580         return S_ISDIR(sb.st_mode);
 581 }
 582
 583 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 584  * FNV has good anti collision properties and we're not worried
 585  * about pre-image resistance or one-way-ness, we're just trying to make
 586  * the name unique in the 108 bytes of space we have.
 587  */
 588 uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
 589 {
 590         unsigned char *bp;
 591
 592         for(bp = buf; bp < (unsigned char *)buf + len; bp++) {
 593                 /* xor the bottom with the current octet */
 594                 hval ^= (uint64_t)*bp;
 595
 596                 /* gcc optimised:
 597                  * multiply by the 64 bit FNV magic prime mod 2^64
 598                  */
 599                 hval += (hval << 1) + (hval << 4) + (hval << 5) +
 600                         (hval << 7) + (hval << 8) + (hval << 40);
 601         }
 602
 603         return hval;
 604 }
 605
 606 bool is_shared_mountpoint(const char *path)
 607 {
 608         __do_fclose FILE *f = NULL;
 609         __do_free char *line = NULL;
 610         int i;
 611         size_t len = 0;
 612
 613         f = fopen("/proc/self/mountinfo", "re");
 614         if (!f)
 615                 return 0;
 616
 617         while (getline(&line, &len, f) > 0) {
 618                 char *slider1, *slider2;
 619
 620                 for (slider1 = line, i = 0; slider1 && i < 4; i++)
 621                         slider1 = strchr(slider1 + 1, ' ');
 622
 623                 if (!slider1)
 624                         continue;
 625
 626                 slider2 = strchr(slider1 + 1, ' ');
 627                 if (!slider2)
 628                         continue;
 629
 630                 *slider2 = '\0';
 631                 if (strcmp(slider1 + 1, path) == 0) {
 632                         /* This is the path. Is it shared? */
 633                         slider1 = strchr(slider2 + 1, ' ');
 634                         if (slider1 && strstr(slider1, "shared:"))
 635                                 return true;
 636                 }
 637         }
 638
 639         return false;
 640 }
 641
 642 /*
 643  * Detect whether / is mounted MS_SHARED.  The only way I know of to
 644  * check that is through /proc/self/mountinfo.
 645  * I'm only checking for /.  If the container rootfs or mount location
 646  * is MS_SHARED, but not '/', then you're out of luck - figuring that
 647  * out would be too much work to be worth it.
 648  */
 649 int detect_shared_rootfs(void)
 650 {
 651         if (is_shared_mountpoint("/"))
 652                 return 1;
 653
 654         return 0;
 655 }
 656
 657 bool switch_to_ns(pid_t pid, const char *ns)
 658 {
 659         __do_close int fd = -EBADF;
 660         int ret;
 661         char nspath[STRLITERALLEN("/proc//ns/")
 662                     + INTTYPE_TO_STRLEN(pid_t)
 663                     + LXC_NAMESPACE_NAME_MAX];
 664
 665         /* Switch to new ns */
 666         ret = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/%s", pid, ns);
 667         if (ret < 0 || ret >= sizeof(nspath))
 668                 return false;
 669
 670         fd = open(nspath, O_RDONLY | O_CLOEXEC);
 671         if (fd < 0)
 672                 return log_error_errno(false, errno, "Failed to open \"%s\"", nspath);
 673
 674         ret = setns(fd, 0);
 675         if (ret)
 676                 return log_error_errno(false, errno, "Failed to set process %d to \"%s\" of %d", pid, ns, fd);
 677
 678         return true;
 679 }
 680
 681 /*
 682  * looking at fs/proc_namespace.c, it appears we can
 683  * actually expect the rootfs entry to very specifically contain
 684  * " - rootfs rootfs "
 685  * IIUC, so long as we've chrooted so that rootfs is not our root,
 686  * the rootfs entry should always be skipped in mountinfo contents.
 687  */
 688 bool detect_ramfs_rootfs(void)
 689 {
 690         __do_free char *line = NULL;
 691         __do_free void *fopen_cache = NULL;
 692         __do_fclose FILE *f = NULL;
 693         size_t len = 0;
 694
 695         f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
 696         if (!f)
 697                 return false;
 698
 699         while (getline(&line, &len, f) != -1) {
 700                 int i;
 701                 char *p, *p2;
 702
 703                 for (p = line, i = 0; p && i < 4; i++)
 704                         p = strchr(p + 1, ' ');
 705                 if (!p)
 706                         continue;
 707
 708                 p2 = strchr(p + 1, ' ');
 709                 if (!p2)
 710                         continue;
 711                 *p2 = '\0';
 712                 if (strcmp(p + 1, "/") == 0) {
 713                         /* This is '/'. Is it the ramfs? */
 714                         p = strchr(p2 + 1, '-');
 715                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0)
 716                                 return true;
 717                 }
 718         }
 719
 720         return false;
 721 }
 722
 723 char *on_path(const char *cmd, const char *rootfs)
 724 {
 725         __do_free char *path = NULL;
 726         char *entry = NULL;
 727         char cmdpath[PATH_MAX];
 728         int ret;
 729
 730         path = getenv("PATH");
 731         if (!path)
 732                 return NULL;
 733
 734         path = strdup(path);
 735         if (!path)
 736                 return NULL;
 737
 738         lxc_iterate_parts(entry, path, ":") {
 739                 if (rootfs)
 740                         ret = snprintf(cmdpath, PATH_MAX, "%s/%s/%s", rootfs,
 741                                        entry, cmd);
 742                 else
 743                         ret = snprintf(cmdpath, PATH_MAX, "%s/%s", entry, cmd);
 744                 if (ret < 0 || ret >= PATH_MAX)
 745                         continue;
 746
 747                 if (access(cmdpath, X_OK) == 0)
 748                         return strdup(cmdpath);
 749         }
 750
 751         return NULL;
 752 }
 753
 754 bool cgns_supported(void)
 755 {
 756         return file_exists("/proc/self/ns/cgroup");
 757 }
 758
 759 /* historically lxc-init has been under /usr/lib/lxc and under
 760  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
 761  */
 762 char *choose_init(const char *rootfs)
 763 {
 764         char *retv = NULL;
 765         const char *empty = "",
 766                    *tmp;
 767         int ret, env_set = 0;
 768
 769         if (!getenv("PATH")) {
 770                 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
 771                         SYSERROR("Failed to setenv");
 772
 773                 env_set = 1;
 774         }
 775
 776         retv = on_path("init.lxc", rootfs);
 777
 778         if (env_set)
 779                 if (unsetenv("PATH"))
 780                         SYSERROR("Failed to unsetenv");
 781
 782         if (retv)
 783                 return retv;
 784
 785         retv = malloc(PATH_MAX);
 786         if (!retv)
 787                 return NULL;
 788
 789         if (rootfs)
 790                 tmp = rootfs;
 791         else
 792                 tmp = empty;
 793
 794         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
 795         if (ret < 0 || ret >= PATH_MAX) {
 796                 ERROR("The name of path is too long");
 797                 goto out1;
 798         }
 799
 800         if (access(retv, X_OK) == 0)
 801                 return retv;
 802
 803         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
 804         if (ret < 0 || ret >= PATH_MAX) {
 805                 ERROR("The name of path is too long");
 806                 goto out1;
 807         }
 808
 809         if (access(retv, X_OK) == 0)
 810                 return retv;
 811
 812         ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
 813         if (ret < 0 || ret >= PATH_MAX) {
 814                 ERROR("The name of path is too long");
 815                 goto out1;
 816         }
 817
 818         if (access(retv, X_OK) == 0)
 819                 return retv;
 820
 821         ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
 822         if (ret < 0 || ret >= PATH_MAX) {
 823                 ERROR("The name of path is too long");
 824                 goto out1;
 825         }
 826
 827         if (access(retv, X_OK) == 0)
 828                 return retv;
 829
 830         /*
 831          * Last resort, look for the statically compiled init.lxc which we
 832          * hopefully bind-mounted in.
 833          * If we are called during container setup, and we get to this point,
 834          * then the init.lxc.static from the host will need to be bind-mounted
 835          * in.  So we return NULL here to indicate that.
 836          */
 837         if (rootfs)
 838                 goto out1;
 839
 840         ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
 841         if (ret < 0 || ret >= PATH_MAX) {
 842                 WARN("Nonsense - name /lxc.init.static too long");
 843                 goto out1;
 844         }
 845
 846         if (access(retv, X_OK) == 0)
 847                 return retv;
 848
 849 out1:
 850         free(retv);
 851         return NULL;
 852 }
 853
 854 /*
 855  * Given the '-t' template option to lxc-create, figure out what to
 856  * do.  If the template is a full executable path, use that.  If it
 857  * is something like 'sshd', then return $templatepath/lxc-sshd.
 858  * On success return the template, on error return NULL.
 859  */
 860 char *get_template_path(const char *t)
 861 {
 862         int ret, len;
 863         char *tpath;
 864
 865         if (t[0] == '/') {
 866                 if (access(t, X_OK) == 0) {
 867                         return strdup(t);
 868                 } else {
 869                         SYSERROR("Bad template pathname: %s", t);
 870                         return NULL;
 871                 }
 872         }
 873
 874         len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
 875
 876         tpath = malloc(len);
 877         if (!tpath)
 878                 return NULL;
 879
 880         ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
 881         if (ret < 0 || ret >= len) {
 882                 free(tpath);
 883                 return NULL;
 884         }
 885
 886         if (access(tpath, X_OK) < 0) {
 887                 SYSERROR("bad template: %s", t);
 888                 free(tpath);
 889                 return NULL;
 890         }
 891
 892         return tpath;
 893 }
 894
 895 /*
 896  * @path:    a pathname where / replaced with '\0'.
 897  * @offsetp: pointer to int showing which path segment was last seen.
 898  *           Updated on return to reflect the next segment.
 899  * @fulllen: full original path length.
 900  * Returns a pointer to the next path segment, or NULL if done.
 901  */
 902 static char *get_nextpath(char *path, int *offsetp, int fulllen)
 903 {
 904         int offset = *offsetp;
 905
 906         if (offset >= fulllen)
 907                 return NULL;
 908
 909         while (offset < fulllen && path[offset] != '\0')
 910                 offset++;
 911
 912         while (offset < fulllen && path[offset] == '\0')
 913                 offset++;
 914
 915         *offsetp = offset;
 916
 917         return (offset < fulllen) ? &path[offset] : NULL;
 918 }
 919
 920 /*
 921  * Check that @subdir is a subdir of @dir.  @len is the length of
 922  * @dir (to avoid having to recalculate it).
 923  */
 924 static bool is_subdir(const char *subdir, const char *dir, size_t len)
 925 {
 926         size_t subdirlen = strlen(subdir);
 927
 928         if (subdirlen < len)
 929                 return false;
 930
 931         if (strncmp(subdir, dir, len) != 0)
 932                 return false;
 933
 934         if (dir[len-1] == '/')
 935                 return true;
 936
 937         if (subdir[len] == '/' || subdirlen == len)
 938                 return true;
 939
 940         return false;
 941 }
 942
 943 /*
 944  * Check if the open fd is a symlink.  Return -ELOOP if it is.  Return
 945  * -ENOENT if we couldn't fstat.  Return 0 if the fd is ok.
 946  */
 947 static int check_symlink(int fd)
 948 {
 949         struct stat sb;
 950         int ret;
 951
 952         ret = fstat(fd, &sb);
 953         if (ret < 0)
 954                 return -ENOENT;
 955
 956         if (S_ISLNK(sb.st_mode))
 957                 return -ELOOP;
 958
 959         return 0;
 960 }
 961
 962 /*
 963  * Open a file or directory, provided that it contains no symlinks.
 964  *
 965  * CAVEAT: This function must not be used for other purposes than container
 966  * setup before executing the container's init
 967  */
 968 static int open_if_safe(int dirfd, const char *nextpath)
 969 {
 970         int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
 971         if (newfd >= 0) /* Was not a symlink, all good. */
 972                 return newfd;
 973
 974         if (errno == ELOOP)
 975                 return newfd;
 976
 977         if (errno == EPERM || errno == EACCES) {
 978                 /* We're not root (cause we got EPERM) so try opening with
 979                  * O_PATH.
 980                  */
 981                 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
 982                 if (newfd >= 0) {
 983                         /* O_PATH will return an fd for symlinks. We know
 984                          * nextpath wasn't a symlink at last openat, so if fd is
 985                          * now a link, then something * fishy is going on.
 986                          */
 987                         int ret = check_symlink(newfd);
 988                         if (ret < 0) {
 989                                 close(newfd);
 990                                 newfd = ret;
 991                         }
 992                 }
 993         }
 994
 995         return newfd;
 996 }
 997
 998 /*
 999  * Open a path intending for mounting, ensuring that the final path
1000  * is inside the container's rootfs.
1001  *
1002  * CAVEAT: This function must not be used for other purposes than container
1003  * setup before executing the container's init
1004  *
1005  * @target: path to be opened
1006  * @prefix_skip: a part of @target in which to ignore symbolic links.  This
1007  * would be the container's rootfs.
1008  *
1009  * Return an open fd for the path, or <0 on error.
1010  */
1011 static int open_without_symlink(const char *target, const char *prefix_skip)
1012 {
1013         int curlen = 0, dirfd, fulllen, i;
1014         char *dup;
1015
1016         fulllen = strlen(target);
1017
1018         /* make sure prefix-skip makes sense */
1019         if (prefix_skip && strlen(prefix_skip) > 0) {
1020                 curlen = strlen(prefix_skip);
1021                 if (!is_subdir(target, prefix_skip, curlen)) {
1022                         ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
1023                               target, prefix_skip);
1024                         return -EINVAL;
1025                 }
1026
1027                 /*
1028                  * get_nextpath() expects the curlen argument to be
1029                  * on a  (turned into \0) / or before it, so decrement
1030                  * curlen to make sure that happens
1031                  */
1032                 if (curlen)
1033                         curlen--;
1034         } else {
1035                 prefix_skip = "/";
1036                 curlen = 0;
1037         }
1038
1039         /* Make a copy of target which we can hack up, and tokenize it */
1040         if ((dup = strdup(target)) == NULL) {
1041                 ERROR("Out of memory checking for symbolic link");
1042                 return -ENOMEM;
1043         }
1044
1045         for (i = 0; i < fulllen; i++) {
1046                 if (dup[i] == '/')
1047                         dup[i] = '\0';
1048         }
1049
1050         dirfd = open(prefix_skip, O_RDONLY);
1051         if (dirfd < 0) {
1052                 SYSERROR("Failed to open path \"%s\"", prefix_skip);
1053                 goto out;
1054         }
1055
1056         for (;;) {
1057                 int newfd, saved_errno;
1058                 char *nextpath;
1059
1060                 if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
1061                         goto out;
1062
1063                 newfd = open_if_safe(dirfd, nextpath);
1064                 saved_errno = errno;
1065                 close(dirfd);
1066
1067                 dirfd = newfd;
1068                 if (newfd < 0) {
1069                         errno = saved_errno;
1070                         if (errno == ELOOP)
1071                                 SYSERROR("%s in %s was a symbolic link!", nextpath, target);
1072
1073                         goto out;
1074                 }
1075         }
1076
1077 out:
1078         free(dup);
1079         return dirfd;
1080 }
1081
/*
 * Mount @src onto @dest after opening @dest (and, for relative bind mounts,
 * @src) via open_without_symlink(). The mount itself is performed on the
 * /proc/self/fd/<fd> paths of the opened descriptors. @rootfs is the prefix
 * of @dest in which symlinks are tolerated; a NULL @rootfs is treated as "".
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * Returns 0 on success and a negative value on failure.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
                unsigned long flags, const void *data, const char *rootfs)
{
        /* Only needs enough for /proc/self/fd/<fd>. */
        char src_proc[50], dest_proc[50];
        const char *source = src;
        int src_fd = -1;
        int dest_fd, len, mount_ret, mount_errno;

        if (!rootfs)
                rootfs = "";

        /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
        if ((flags & MS_BIND) && src && src[0] != '/') {
                INFO("This is a relative bind mount");

                src_fd = open_without_symlink(src, NULL);
                if (src_fd < 0)
                        return src_fd;

                len = snprintf(src_proc, sizeof(src_proc), "/proc/self/fd/%d", src_fd);
                if (len < 0 || len >= (int)sizeof(src_proc)) {
                        close(src_fd);
                        ERROR("Out of memory");
                        return -EINVAL;
                }

                /* From here on the source is addressed through its fd. */
                source = src_proc;
        }

        dest_fd = open_without_symlink(dest, rootfs);
        if (dest_fd < 0) {
                if (src_fd != -1) {
                        /* Keep the errno of the failed open across close(). */
                        int open_errno = errno;

                        close(src_fd);
                        errno = open_errno;
                }

                return dest_fd;
        }

        len = snprintf(dest_proc, sizeof(dest_proc), "/proc/self/fd/%d", dest_fd);
        if (len < 0 || len >= (int)sizeof(dest_proc)) {
                if (src_fd != -1)
                        close(src_fd);

                close(dest_fd);
                ERROR("Out of memory");
                return -EINVAL;
        }

        mount_ret = mount(source, dest_proc, fstype, flags, data);
        mount_errno = errno;

        /* Neither fd is needed anymore, whether or not the mount worked. */
        if (src_fd != -1)
                close(src_fd);

        close(dest_fd);

        if (mount_ret >= 0)
                return 0;

        errno = mount_errno;
        SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
        return mount_ret;
}
1154
1155 /*
1156  * Mount a proc under @rootfs if proc self points to a pid other than
1157  * my own.  This is needed to have a known-good proc mount for setting
1158  * up LSMs both at container startup and attach.
1159  *
1160  * @rootfs : the rootfs where proc should be mounted
1161  *
1162  * Returns < 0 on failure, 0 if the correct proc was already mounted
1163  * and 1 if a new proc was mounted.
1164  *
1165  * NOTE: not to be called from inside the container namespace!
1166  */
1167 int lxc_mount_proc_if_needed(const char *rootfs)
1168 {
1169         char path[PATH_MAX] = {0};
1170         int link_to_pid, linklen, mypid, ret;
1171         char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
1172
1173         ret = snprintf(path, PATH_MAX, "%s/proc/self", rootfs);
1174         if (ret < 0 || ret >= PATH_MAX) {
1175                 SYSERROR("The name of proc path is too long");
1176                 return -1;
1177         }
1178
1179         linklen = readlink(path, link, sizeof(link));
1180
1181         ret = snprintf(path, PATH_MAX, "%s/proc", rootfs);
1182         if (ret < 0 || ret >= PATH_MAX) {
1183                 SYSERROR("The name of proc path is too long");
1184                 return -1;
1185         }
1186
1187         /* /proc not mounted */
1188         if (linklen < 0) {
1189                 if (mkdir(path, 0755) && errno != EEXIST)
1190                         return -1;
1191
1192                 goto domount;
1193         } else if (linklen >= sizeof(link)) {
1194                 link[linklen - 1] = '\0';
1195                 ERROR("Readlink returned truncated content: \"%s\"", link);
1196                 return -1;
1197         }
1198
1199         mypid = lxc_raw_getpid();
1200         INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1201
1202         if (lxc_safe_int(link, &link_to_pid) < 0)
1203                 return -1;
1204
1205         /* correct procfs is already mounted */
1206         if (link_to_pid == mypid)
1207                 return 0;
1208
1209         ret = umount2(path, MNT_DETACH);
1210         if (ret < 0)
1211                 SYSWARN("Failed to umount \"%s\" with MNT_DETACH", path);
1212
1213 domount:
1214         /* rootfs is NULL */
1215         if (!strcmp(rootfs, ""))
1216                 ret = mount("proc", path, "proc", 0, NULL);
1217         else
1218                 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
1219         if (ret < 0)
1220                 return -1;
1221
1222         INFO("Mounted /proc in container for security transition");
1223         return 1;
1224 }
1225
/*
 * Open "/dev/null" for reading and writing.
 *
 * Returns the new fd, or the negative result of open() after logging the
 * failure.
 */
int open_devnull(void)
{
        int fd;

        fd = open("/dev/null", O_RDWR);
        if (fd >= 0)
                return fd;

        SYSERROR("Can't open /dev/null");
        return fd;
}
1234
/*
 * Duplicate @fd onto stdin, stdout and stderr, in that order.
 *
 * Returns 0 on success and -1 if @fd is negative or any dup2() fails. @fd
 * itself is left open.
 */
int set_stdfds(int fd)
{
        static const int std_fds[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};

        if (fd < 0)
                return -1;

        for (size_t i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
                if (dup2(fd, std_fds[i]) < 0)
                        return -1;
        }

        return 0;
}
1256
/*
 * Redirect stdin, stdout and stderr to "/dev/null".
 *
 * Returns the result of set_stdfds(), or -1 if "/dev/null" could not be
 * opened.
 */
int null_stdfds(void)
{
        int fd, ret;

        fd = open_devnull();
        if (fd < 0)
                return -1;

        ret = set_stdfds(fd);
        /* The standard fds now hold their own duplicates. */
        close(fd);

        return ret;
}
1270
1271 /* Check whether a signal is blocked by a process. */
1272 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1273 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1274 bool task_blocks_signal(pid_t pid, int signal)
1275 {
1276         __do_free char *line = NULL;
1277         __do_fclose FILE *f = NULL;
1278         int ret;
1279         char status[__PROC_STATUS_LEN] = {0};
1280         uint64_t sigblk = 0, one = 1;
1281         size_t n = 0;
1282         bool bret = false;
1283
1284         ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1285         if (ret < 0 || ret >= __PROC_STATUS_LEN)
1286                 return bret;
1287
1288         f = fopen(status, "re");
1289         if (!f)
1290                 return false;
1291
1292         while (getline(&line, &n, f) != -1) {
1293                 char *numstr;
1294
1295                 if (strncmp(line, "SigBlk:", 7))
1296                         continue;
1297
1298                 numstr = lxc_trim_whitespace_in_place(line + 7);
1299                 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1300                 if (ret < 0)
1301                         return false;
1302
1303                 break;
1304         }
1305
1306         if (sigblk & (one << (signal - 1)))
1307                 bret = true;
1308
1309         return bret;
1310 }
1311
/*
 * Open "/proc/<pid>/ns/<ns>" read-only with O_CLOEXEC.
 *
 * When @ns is NULL or the empty string the "/proc/<pid>/ns" directory itself
 * is opened, which allows callers to check whether namespaces are supported
 * by the kernel at all.
 *
 * Returns the fd from open(), or -1 with errno set to EFBIG if the path does
 * not fit the internal buffer.
 */
int lxc_preserve_ns(const int pid, const char *ns)
{
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
        char path[__NS_PATH_LEN];
        const char *sep = "";
        const char *name = "";
        int len;

        /* Only append "/<ns>" when a namespace name was actually given. */
        if (ns && ns[0] != '\0') {
                sep = "/";
                name = ns;
        }

        len = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid, sep, name);
        if (len < 0 || (size_t)len >= __NS_PATH_LEN) {
                errno = EFBIG;
                return -1;
        }

        return open(path, O_RDONLY | O_CLOEXEC);
}
1333
1334 bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
1335 {
1336         int ret = 0;
1337
1338         if (gid != LXC_INVALID_GID) {
1339                 ret = setgid(gid);
1340                 if (ret < 0) {
1341                         SYSERROR("Failed to switch to gid %d", gid);
1342                         return false;
1343                 }
1344                 NOTICE("Switched to gid %d", gid);
1345         }
1346
1347         if (uid != LXC_INVALID_UID) {
1348                 ret = setuid(uid);
1349                 if (ret < 0) {
1350                         SYSERROR("Failed to switch to uid %d", uid);
1351                         return false;
1352                 }
1353                 NOTICE("Switched to uid %d", uid);
1354         }
1355
1356         return true;
1357 }
1358
/*
 * Thin wrapper around setgroups() that logs the outcome in a uniform way.
 *
 * Returns true on success and false if setgroups() failed.
 */
bool lxc_setgroups(int size, gid_t list[])
{
        int ret;

        ret = setgroups(size, list);
        if (ret >= 0) {
                NOTICE("Dropped additional groups");
                return true;
        }

        SYSERROR("Failed to setgroups()");
        return false;
}
1370
1371 static int lxc_get_unused_loop_dev_legacy(char *loop_name)
1372 {
1373         struct dirent *dp;
1374         struct loop_info64 lo64;
1375         DIR *dir;
1376         int dfd = -1, fd = -1, ret = -1;
1377
1378         dir = opendir("/dev");
1379         if (!dir) {
1380                 SYSERROR("Failed to open \"/dev\"");
1381                 return -1;
1382         }
1383
1384         while ((dp = readdir(dir))) {
1385                 if (strncmp(dp->d_name, "loop", 4) != 0)
1386                         continue;
1387
1388                 dfd = dirfd(dir);
1389                 if (dfd < 0)
1390                         continue;
1391
1392                 fd = openat(dfd, dp->d_name, O_RDWR);
1393                 if (fd < 0)
1394                         continue;
1395
1396                 ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
1397                 if (ret < 0) {
1398                         if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 ||
1399                             errno != ENXIO) {
1400                                 close(fd);
1401                                 fd = -1;
1402                                 continue;
1403                         }
1404                 }
1405
1406                 ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
1407                 if (ret < 0 || ret >= LO_NAME_SIZE) {
1408                         close(fd);
1409                         fd = -1;
1410                         continue;
1411                 }
1412
1413                 break;
1414         }
1415
1416         closedir(dir);
1417
1418         if (fd < 0)
1419                 return -1;
1420
1421         return fd;
1422 }
1423
1424 static int lxc_get_unused_loop_dev(char *name_loop)
1425 {
1426         int loop_nr, ret;
1427         int fd_ctl = -1, fd_tmp = -1;
1428
1429         fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
1430         if (fd_ctl < 0) {
1431                 SYSERROR("Failed to open loop control");
1432                 return -ENODEV;
1433         }
1434
1435         loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE);
1436         if (loop_nr < 0) {
1437                 SYSERROR("Failed to get loop control");
1438                 goto on_error;
1439         }
1440
1441         ret = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr);
1442         if (ret < 0 || ret >= LO_NAME_SIZE)
1443                 goto on_error;
1444
1445         fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1446         if (fd_tmp < 0) {
1447                 /* on Android loop devices are moved under /dev/block, give it a shot */
1448                 ret = snprintf(name_loop, LO_NAME_SIZE, "/dev/block/loop%d", loop_nr);
1449                 if (ret < 0 || ret >= LO_NAME_SIZE)
1450                         goto on_error;
1451
1452                 fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1453                 if (fd_tmp < 0)
1454                         SYSERROR("Failed to open loop \"%s\"", name_loop);
1455         }
1456
1457 on_error:
1458         close(fd_ctl);
1459         return fd_tmp;
1460 }
1461
1462 int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
1463 {
1464         int ret;
1465         struct loop_info64 lo64;
1466         int fd_img = -1, fret = -1, fd_loop = -1;
1467
1468         fd_loop = lxc_get_unused_loop_dev(loop_dev);
1469         if (fd_loop < 0) {
1470                 if (fd_loop != -ENODEV)
1471                         goto on_error;
1472
1473                 fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);
1474                 if (fd_loop < 0)
1475                         goto on_error;
1476         }
1477
1478         fd_img = open(source, O_RDWR | O_CLOEXEC);
1479         if (fd_img < 0) {
1480                 SYSERROR("Failed to open source \"%s\"", source);
1481                 goto on_error;
1482         }
1483
1484         ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
1485         if (ret < 0) {
1486                 SYSERROR("Failed to set loop fd");
1487                 goto on_error;
1488         }
1489
1490         memset(&lo64, 0, sizeof(lo64));
1491         lo64.lo_flags = flags;
1492
1493         strlcpy((char *)lo64.lo_file_name, source, LO_NAME_SIZE);
1494
1495         ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
1496         if (ret < 0) {
1497                 SYSERROR("Failed to set loop status64");
1498                 goto on_error;
1499         }
1500
1501         fret = 0;
1502
1503 on_error:
1504         if (fd_img >= 0)
1505                 close(fd_img);
1506
1507         if (fret < 0 && fd_loop >= 0) {
1508                 close(fd_loop);
1509                 fd_loop = -1;
1510         }
1511
1512         return fd_loop;
1513 }
1514
/*
 * Unmount everything that is stacked on @path by calling umount2() until it
 * fails. @lazy selects MNT_DETACH.
 *
 * Returns the number of successful unmounts (capped at INT_MAX) once
 * umount2() fails with EINVAL, or -errno if it fails with anything else.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
        const int flags = lazy ? MNT_DETACH : 0;
        int umounts = 0;

        /* Each success may have uncovered another mount underneath. */
        while (umount2(path, flags) >= 0) {
                /* Just stop counting rather than overflow the counter. */
                if (umounts < INT_MAX)
                        umounts++;
        }

        /* We consider anything else than EINVAL deadly. Treating EINVAL as
         * the regular end of the stack avoids having to parse
         * /proc/self/mountinfo over and over.
         */
        if (errno != EINVAL)
                return -errno;

        return umounts;
}
1546
/*
 * Run @child_fn(@args) in a new process with its stdout and stderr connected
 * to a pipe, and wait for it to finish.
 *
 * @buf/@buf_size: optional buffer receiving the result of a single read from
 *                 the pipe. It is always NUL-terminated; note that the last
 *                 byte that was read is overwritten by the terminator.
 * @wait_status:   if true the child is reaped with lxc_wait_for_pid_status(),
 *                 otherwise with wait_for_pid().
 *
 * Returns -1 if the pipe or the process could not be created, otherwise the
 * return value of the selected wait function.
 */
int run_command_internal(char *buf, size_t buf_size, int (*child_fn)(void *), void *args, bool wait_status)
{
        int pipefd[2];
        int status;
        pid_t pid;

        /* Make sure our callers do not receive uninitialized memory. */
        if (buf && buf_size > 0)
                buf[0] = '\0';

        if (pipe(pipefd) < 0) {
                SYSERROR("Failed to create pipe");
                return -1;
        }

        pid = lxc_raw_clone(0, NULL);
        if (pid < 0) {
                close(pipefd[0]);
                close(pipefd[1]);
                SYSERROR("Failed to create new process");
                return -1;
        }

        if (pid == 0) {
                int ret;

                /* The child only writes; drop the read-end. */
                close(pipefd[0]);

                /* Point stdout, then stderr, at the write-end of the pipe. */
                ret = dup2(pipefd[1], STDOUT_FILENO);
                if (ret >= 0)
                        ret = dup2(pipefd[1], STDERR_FILENO);

                /* The duplicates keep the pipe open; the original can go. */
                close(pipefd[1]);

                if (ret < 0) {
                        SYSERROR("Failed to duplicate std{err,out} file descriptor");
                        _exit(EXIT_FAILURE);
                }

                /* Does not return. */
                child_fn(args);
                ERROR("Failed to exec command");
                _exit(EXIT_FAILURE);
        }

        /* The parent only reads; drop the write-end. */
        close(pipefd[1]);

        if (buf && buf_size > 0) {
                ssize_t nread;

                nread = lxc_read_nointr(pipefd[0], buf, buf_size - 1);
                if (nread > 0)
                        buf[nread - 1] = '\0';
        }

        status = wait_status ? lxc_wait_for_pid_status(pid) : wait_for_pid(pid);

        /* close the read-end of the pipe */
        close(pipefd[0]);

        return status;
}
1614
/*
 * Convenience wrapper: run_command_internal() with wait_status disabled.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        const bool wait_status = false;

        return run_command_internal(buf, buf_size, child_fn, args, wait_status);
}
1619
/*
 * Convenience wrapper: run_command_internal() with wait_status enabled.
 */
int run_command_status(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        const bool wait_status = true;

        return run_command_internal(buf, buf_size, child_fn, args, wait_status);
}
1624
/*
 * Check whether the network interface @nic exists by stat()ing
 * "/sys/class/net/<nic>".
 *
 * The special name "none" is always reported as existing. A name that does
 * not fit the path buffer is reported as not existing.
 */
bool lxc_nic_exists(char *nic)
{
/* strlen("/sys/class/net/") + interface name + \0; parenthesized so the
 * macro expands safely inside any expression.
 */
#define __LXC_SYS_CLASS_NET_LEN (15 + IFNAMSIZ + 1)
        char path[__LXC_SYS_CLASS_NET_LEN];
        int ret;
        struct stat sb;

        if (!strcmp(nic, "none"))
                return true;

        ret = snprintf(path, __LXC_SYS_CLASS_NET_LEN, "/sys/class/net/%s", nic);
        if (ret < 0 || (size_t)ret >= __LXC_SYS_CLASS_NET_LEN)
                return false;

        ret = stat(path, &sb);
        if (ret < 0)
                return false;

        return true;
}
1645
/*
 * Round @n up to the nearest power of two.
 *
 * Returns @n itself if it already is a power of two. Returns 0 for @n == 0
 * (0 is not a valid input and not a valid power of two) and also when the
 * result would not fit into 64 bits.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
        uint64_t top = n;

        if (n == 0)
                return 0;

        /* Clear the lowest set bit until only the highest one is left. */
        while (top & (top - 1))
                top &= top - 1;

        /* Nothing was cleared: n was a power of two to begin with. */
        if (top == n)
                return n;

        return top << 1;
}
1663
1664 static int process_dead(/* takes */ int status_fd)
1665 {
1666         __do_close int dupfd = -EBADF;
1667         __do_free char *line = NULL;
1668         __do_fclose FILE *f = NULL;
1669         int ret = 0;
1670         size_t n = 0;
1671
1672         dupfd = dup(status_fd);
1673         if (dupfd < 0)
1674                 return -1;
1675
1676         if (fd_cloexec(dupfd, true) < 0)
1677                 return -1;
1678
1679         f = fdopen(dupfd, "re");
1680         if (!f)
1681                 return -1;
1682
1683         /* Transfer ownership of fd. */
1684         move_fd(dupfd);
1685
1686         ret = 0;
1687         while (getline(&line, &n, f) != -1) {
1688                 char *state;
1689
1690                 if (strncmp(line, "State:", 6))
1691                         continue;
1692
1693                 state = lxc_trim_whitespace_in_place(line + 6);
1694                 /* only check whether process is dead or zombie for now */
1695                 if (*state == 'X' || *state == 'Z')
1696                         ret = 1;
1697         }
1698
1699         return ret;
1700 }
1701
/*
 * Arrange for @signal to be delivered to the caller when its parent dies,
 * and kill the caller right away if the parent is already gone.
 *
 * @parent:           pid the caller expects as its parent
 * @parent_status_fd: fd of the parent's status file, consulted when
 *                    getppid() reports 0; may be negative
 *
 * Returns 0 on success, -1 if prctl() failed, or the result of
 * raise(SIGKILL) when the parent was found to be gone.
 */
int lxc_set_death_signal(int signal, pid_t parent, int parent_status_fd)
{
        int prctl_ret;
        pid_t cur_parent;

        prctl_ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
                          prctl_arg(0), prctl_arg(0));

        /* verify that we haven't been orphaned in the meantime */
        cur_parent = (pid_t)syscall(SYS_getppid);
        if (cur_parent == 0) {
                /* parent outside our pidns */
                if (parent_status_fd < 0)
                        return 0;

                if (process_dead(parent_status_fd) == 1)
                        return raise(SIGKILL);
        } else if (cur_parent != parent) {
                return raise(SIGKILL);
        }

        return prctl_ret < 0 ? -1 : 0;
}
1727
/*
 * Set (@cloexec true) or clear (@cloexec false) the FD_CLOEXEC flag on @fd.
 *
 * The flags are only written back when they actually change.
 *
 * Returns 0 on success and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
        int cur, want;

        cur = fcntl(fd, F_GETFD, 0);
        if (cur < 0)
                return -errno;

        want = cloexec ? (cur | FD_CLOEXEC) : (cur & ~FD_CLOEXEC);

        if (want != cur && fcntl(fd, F_SETFD, want) < 0)
                return -errno;

        return 0;
}
1749
1750 int lxc_rm_rf(const char *dirname)
1751 {
1752         __do_closedir DIR *dir = NULL;
1753         int fret = 0;
1754         int ret;
1755         struct dirent *direntp;
1756
1757         dir = opendir(dirname);
1758         if (!dir)
1759                 return log_error_errno(-1, errno, "Failed to open dir \"%s\"", dirname);
1760
1761         while ((direntp = readdir(dir))) {
1762                 __do_free char *pathname = NULL;
1763                 struct stat mystat;
1764
1765                 if (!strcmp(direntp->d_name, ".") ||
1766                     !strcmp(direntp->d_name, ".."))
1767                         continue;
1768
1769                 pathname = must_make_path(dirname, direntp->d_name, NULL);
1770                 ret = lstat(pathname, &mystat);
1771                 if (ret < 0) {
1772                         if (!fret)
1773                                 SYSWARN("Failed to stat \"%s\"", pathname);
1774
1775                         fret = -1;
1776                         continue;
1777                 }
1778
1779                 if (!S_ISDIR(mystat.st_mode))
1780                         continue;
1781
1782                 ret = lxc_rm_rf(pathname);
1783                 if (ret < 0)
1784                         fret = -1;
1785         }
1786
1787         ret = rmdir(dirname);
1788         if (ret < 0)
1789                 return log_warn_errno(-1, errno, "Failed to delete \"%s\"", dirname);
1790
1791         return fret;
1792 }
1793
1794 int lxc_setup_keyring(char *keyring_label)
1795 {
1796         key_serial_t keyring;
1797         int ret = 0;
1798
1799         if (keyring_label) {
1800                 if (lsm_keyring_label_set(keyring_label) < 0) {
1801                         ERROR("Couldn't set keyring label");
1802                 }
1803         }
1804
1805         /* Try to allocate a new session keyring for the container to prevent
1806          * information leaks.
1807          */
1808         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
1809                          prctl_arg(0), prctl_arg(0), prctl_arg(0));
1810         if (keyring < 0) {
1811                 switch (errno) {
1812                 case ENOSYS:
1813                         DEBUG("The keyctl() syscall is not supported or blocked");
1814                         break;
1815                 case EACCES:
1816                         __fallthrough;
1817                 case EPERM:
1818                         DEBUG("Failed to access kernel keyring. Continuing...");
1819                         break;
1820                 default:
1821                         SYSERROR("Failed to create kernel keyring");
1822                         break;
1823                 }
1824         }
1825
1826         return ret;
1827 }
1828
1829 bool lxc_can_use_pidfd(int pidfd)
1830 {
1831         int ret;
1832
1833         if (pidfd < 0)
1834                 return log_error(false, "Kernel does not support pidfds");
1835
1836         /*
1837          * We don't care whether or not children were in a waitable state. We
1838          * just care whether waitid() recognizes P_PIDFD.
1839          *
1840          * Btw, while I have your attention, the above waitid() code is an
1841          * excellent example of how _not_ to do flag-based kernel APIs. So if
1842          * you ever go into kernel development or are already and you add this
1843          * kind of flag potpourri even though you have read this comment shame
1844          * on you. May the gods of operating system development have mercy on
1845          * your soul because I won't.
1846          */
1847         ret = waitid(P_PIDFD, pidfd, NULL,
1848                     /* Type of children to wait for. */
1849                     __WALL |
1850                     /* How to wait for them. */
1851                     WNOHANG | WNOWAIT |
1852                     /* What state to wait for. */
1853                     WEXITED | WSTOPPED | WCONTINUED);
1854         if (ret < 0)
1855                 return log_error_errno(false, errno, "Kernel does not support waiting on processes through pidfds");
1856
1857         ret = lxc_raw_pidfd_send_signal(pidfd, 0, NULL, 0);
1858         if (ret)
1859                 return log_error_errno(false, errno, "Kernel does not support sending singals through pidfds");
1860
1861         return log_trace(true, "Kernel supports pidfds");
1862 }
1863
1864 int fix_stdio_permissions(uid_t uid)
1865 {
1866         __do_close int devnull_fd = -EBADF;
1867         int fret = 0;
1868         int std_fds[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};
1869         int ret;
1870         struct stat st, st_null;
1871
1872         devnull_fd = open_devnull();
1873         if (devnull_fd < 0)
1874                 return log_warn_errno(-1, errno, "Failed to open \"/dev/null\"");
1875
1876         ret = fstat(devnull_fd, &st_null);
1877         if (ret)
1878                 return log_warn_errno(-errno, errno, "Failed to stat \"/dev/null\"");
1879
1880         for (int i = 0; i < ARRAY_SIZE(std_fds); i++) {
1881                 ret = fstat(std_fds[i], &st);
1882                 if (ret) {
1883                         SYSWARN("Failed to stat standard I/O file descriptor %d", std_fds[i]);
1884                         fret = -1;
1885                         continue;
1886                 }
1887
1888                 if (st.st_rdev == st_null.st_rdev)
1889                         continue;
1890
1891                 ret = fchown(std_fds[i], uid, st.st_gid);
1892                 if (ret) {
1893                         SYSWARN("Failed to chown standard I/O file descriptor %d to uid %d and gid %d",
1894                                 std_fds[i], uid, st.st_gid);
1895                         fret = -1;
1896                 }
1897
1898                 ret = fchmod(std_fds[i], 0700);
1899                 if (ret) {
1900                         SYSWARN("Failed to chmod standard I/O file descriptor %d", std_fds[i]);
1901                         fret = -1;
1902                 }
1903         }
1904
1905         return fret;
1906 }