src/lxc/utils.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE 1
   5 #endif
   6 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
   7 #include <ctype.h>
   8 #include <dirent.h>
   9 #include <errno.h>
  10 #include <fcntl.h>
  11 #include <grp.h>
  12 #include <inttypes.h>
  13 #include <libgen.h>
  14 #include <pthread.h>
  15 #include <signal.h>
  16 #include <stddef.h>
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <sys/mman.h>
  21 #include <sys/mount.h>
  22 /* Needs to be after sys/mount.h header */
  23 #include <linux/fs.h>
  24 #include <sys/param.h>
  25 #include <sys/prctl.h>
  26 #include <sys/stat.h>
  27 #include <sys/types.h>
  28 #include <sys/wait.h>
  29 #include <unistd.h>
  30
  31 #include "config.h"
  32 #include "log.h"
  33 #include "lsm/lsm.h"
  34 #include "lxclock.h"
  35 #include "memory_utils.h"
  36 #include "namespace.h"
  37 #include "parse.h"
  38 #include "process_utils.h"
  39 #include "syscall_wrappers.h"
  40 #include "utils.h"
  41
  42 #ifndef HAVE_STRLCPY
  43 #include "include/strlcpy.h"
  44 #endif
  45
  46 #ifndef HAVE_STRLCAT
  47 #include "include/strlcat.h"
  48 #endif
  49
  50 #ifndef O_PATH
  51 #define O_PATH      010000000
  52 #endif
  53
  54 #ifndef O_NOFOLLOW
  55 #define O_NOFOLLOW  00400000
  56 #endif
  57
  58 lxc_log_define(utils, lxc);
  59
  60 /*
  61  * if path is btrfs, tries to remove it and any subvolumes beneath it
  62  */
  63 extern bool btrfs_try_remove_subvol(const char *path);
  64
  65 static int _recursive_rmdir(const char *dirname, dev_t pdev,
  66                             const char *exclude, int level, bool onedev)
  67 {
  68         __do_closedir DIR *dir = NULL;
  69         int failed = 0;
  70         bool hadexclude = false;
  71         int ret;
  72         struct dirent *direntp;
  73         char pathname[PATH_MAX];
  74
  75         dir = opendir(dirname);
  76         if (!dir)
  77                 return log_error(-1, "Failed to open \"%s\"", dirname);
  78
  79         while ((direntp = readdir(dir))) {
  80                 int rc;
  81                 struct stat mystat;
  82
  83                 if (strequal(direntp->d_name, ".") ||
  84                     strequal(direntp->d_name, ".."))
  85                         continue;
  86
  87                 rc = strnprintf(pathname, sizeof(pathname), "%s/%s", dirname, direntp->d_name);
  88                 if (rc < 0) {
  89                         ERROR("The name of path is too long");
  90                         failed = 1;
  91                         continue;
  92                 }
  93
  94                 if (!level && exclude && strequal(direntp->d_name, exclude)) {
  95                         ret = rmdir(pathname);
  96                         if (ret < 0) {
  97                                 switch (errno) {
  98                                 case ENOTEMPTY:
  99                                         INFO("Not deleting snapshot \"%s\"", pathname);
 100                                         hadexclude = true;
 101                                         break;
 102                                 case ENOTDIR:
 103                                         ret = unlink(pathname);
 104                                         if (ret)
 105                                                 INFO("Failed to remove \"%s\"", pathname);
 106                                         break;
 107                                 default:
 108                                         SYSERROR("Failed to rmdir \"%s\"", pathname);
 109                                         failed = 1;
 110                                         break;
 111                                 }
 112                         }
 113
 114                         continue;
 115                 }
 116
 117                 ret = lstat(pathname, &mystat);
 118                 if (ret) {
 119                         SYSERROR("Failed to stat \"%s\"", pathname);
 120                         failed = 1;
 121                         continue;
 122                 }
 123
 124                 if (onedev && mystat.st_dev != pdev) {
 125                         if (btrfs_try_remove_subvol(pathname))
 126                                 INFO("Removed btrfs subvolume at \"%s\"", pathname);
 127                         continue;
 128                 }
 129
 130                 if (S_ISDIR(mystat.st_mode)) {
 131                         if (_recursive_rmdir(pathname, pdev, exclude, level + 1, onedev) < 0)
 132                                 failed = 1;
 133                 } else {
 134                         ret = unlink(pathname);
 135                         if (ret < 0) {
 136                                 __do_close int fd = -EBADF;
 137
 138                                 fd = open(pathname, O_RDONLY | O_CLOEXEC | O_NONBLOCK);
 139                                 if (fd >= 0) {
 140                                         /* The file might be marked immutable. */
 141                                         int attr = 0;
 142                                         ret = ioctl(fd, FS_IOC_GETFLAGS, &attr);
 143                                         if (ret < 0)
 144                                                 SYSERROR("Failed to retrieve file flags");
 145                                         attr &= ~FS_IMMUTABLE_FL;
 146                                         ret = ioctl(fd, FS_IOC_SETFLAGS, &attr);
 147                                         if (ret < 0)
 148                                                 SYSERROR("Failed to set file flags");
 149                                 }
 150
 151                                 ret = unlink(pathname);
 152                                 if (ret < 0) {
 153                                         SYSERROR("Failed to delete \"%s\"", pathname);
 154                                         failed = 1;
 155                                 }
 156                         }
 157                 }
 158         }
 159
 160         if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
 161                 SYSERROR("Failed to delete \"%s\"", dirname);
 162                 failed = 1;
 163         }
 164
 165         return failed ? -1 : 0;
 166 }
 167
 168 /*
 169  * In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
 170  * lxc_rmdir_onedev().
 171  */
 172 static inline bool is_native_overlayfs(const char *path)
 173 {
 174         return has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
 175                has_fs_type(path, OVERLAYFS_SUPER_MAGIC);
 176 }
 177
 178 /* returns 0 on success, -1 if there were any failures */
 179 extern int lxc_rmdir_onedev(const char *path, const char *exclude)
 180 {
 181         struct stat mystat;
 182         bool onedev = true;
 183
 184         if (is_native_overlayfs(path))
 185                 onedev = false;
 186
 187         if (lstat(path, &mystat) < 0) {
 188                 if (errno == ENOENT)
 189                         return 0;
 190
 191                 return log_error_errno(-1, errno, "Failed to stat \"%s\"", path);
 192         }
 193
 194         return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev);
 195 }
 196
 197 /* borrowed from iproute2 */
 198 extern int get_u16(unsigned short *val, const char *arg, int base)
 199 {
 200         unsigned long res;
 201         char *ptr;
 202
 203         if (!arg || !*arg)
 204                 return ret_errno(EINVAL);
 205
 206         errno = 0;
 207         res = strtoul(arg, &ptr, base);
 208         if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0)
 209                 return ret_errno(ERANGE);
 210
 211         *val = res;
 212
 213         return 0;
 214 }
 215
 216 int mkdir_p(const char *dir, mode_t mode)
 217 {
 218         const char *tmp = dir;
 219         const char *orig = dir;
 220
 221         do {
 222                 __do_free char *makeme = NULL;
 223                 int ret;
 224
 225                 dir = tmp + strspn(tmp, "/");
 226                 tmp = dir + strcspn(dir, "/");
 227
 228                 makeme = strndup(orig, dir - orig);
 229                 if (!makeme)
 230                         return ret_set_errno(-1, ENOMEM);
 231
 232                 ret = mkdir(makeme, mode);
 233                 if (ret < 0 && errno != EEXIST)
 234                         return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
 235
 236         } while (tmp != dir);
 237
 238         return 0;
 239 }
 240
 241 char *get_rundir(void)
 242 {
 243         __do_free char *rundir = NULL;
 244         char *static_rundir;
 245         int ret;
 246         size_t len;
 247         const char *homedir;
 248         struct stat sb;
 249
 250         if (stat(RUNTIME_PATH, &sb) < 0)
 251                 return NULL;
 252
 253         if (geteuid() == sb.st_uid || getegid() == sb.st_gid)
 254                 return strdup(RUNTIME_PATH);
 255
 256         static_rundir = getenv("XDG_RUNTIME_DIR");
 257         if (static_rundir)
 258                 return strdup(static_rundir);
 259
 260         INFO("XDG_RUNTIME_DIR isn't set in the environment");
 261         homedir = getenv("HOME");
 262         if (!homedir)
 263                 return log_error(NULL, "HOME isn't set in the environment");
 264
 265         len = strlen(homedir) + 17;
 266         rundir = malloc(sizeof(char) * len);
 267         if (!rundir)
 268                 return NULL;
 269
 270         ret = strnprintf(rundir, len, "%s/.cache/lxc/run/", homedir);
 271         if (ret < 0)
 272                 return ret_set_errno(NULL, EIO);
 273
 274         return move_ptr(rundir);
 275 }
 276
 277 int wait_for_pid(pid_t pid)
 278 {
 279         int status, ret;
 280
 281 again:
 282         ret = waitpid(pid, &status, 0);
 283         if (ret == -1) {
 284                 if (errno == EINTR)
 285                         goto again;
 286
 287                 return -1;
 288         }
 289
 290         if (ret != pid)
 291                 goto again;
 292
 293         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
 294                 return -1;
 295
 296         return 0;
 297 }
 298
 299 int wait_for_pidfd(int pidfd)
 300 {
 301         int ret;
 302         siginfo_t info = {
 303                 .si_signo = 0,
 304         };
 305
 306         do {
 307                 ret = waitid(P_PIDFD, pidfd, &info, __WALL | WEXITED);
 308         } while (ret < 0 && errno == EINTR);
 309
 310         return !ret && WIFEXITED(info.si_status) && WEXITSTATUS(info.si_status) == 0;
 311 }
 312
 313 int lxc_wait_for_pid_status(pid_t pid)
 314 {
 315         int status, ret;
 316
 317 again:
 318         ret = waitpid(pid, &status, 0);
 319         if (ret == -1) {
 320                 if (errno == EINTR)
 321                         goto again;
 322
 323                 return -1;
 324         }
 325
 326         if (ret != pid)
 327                 goto again;
 328
 329         return status;
 330 }
 331
 332 #ifdef HAVE_OPENSSL
 333 #include <openssl/evp.h>
 334
 335 static int do_sha1_hash(const char *buf, int buflen, unsigned char *md_value,
 336                         unsigned int *md_len)
 337 {
 338         EVP_MD_CTX *mdctx;
 339         const EVP_MD *md;
 340
 341         md = EVP_get_digestbyname("sha1");
 342         if (!md)
 343                 return log_error(-1, "Unknown message digest: sha1\n");
 344
 345         mdctx = EVP_MD_CTX_create();
 346         EVP_DigestInit_ex(mdctx, md, NULL);
 347         EVP_DigestUpdate(mdctx, buf, buflen);
 348         EVP_DigestFinal_ex(mdctx, md_value, md_len);
 349         EVP_MD_CTX_destroy(mdctx);
 350
 351         return 0;
 352 }
 353
 354 int sha1sum_file(char *fnam, unsigned char *digest, unsigned int *md_len)
 355 {
 356         __do_free char *buf = NULL;
 357         __do_fclose FILE *f = NULL;
 358         int ret;
 359         long flen;
 360
 361         if (!fnam)
 362                 return -1;
 363
 364         f = fopen_cloexec(fnam, "r");
 365         if (!f)
 366                 return log_error_errno(-1, errno, "Failed to open template \"%s\"", fnam);
 367
 368         if (fseek(f, 0, SEEK_END) < 0)
 369                 return log_error_errno(-1, errno, "Failed to seek to end of template");
 370
 371         flen = ftell(f);
 372         if (flen < 0)
 373                 return log_error_errno(-1, errno, "Failed to tell size of template");
 374
 375         if (fseek(f, 0, SEEK_SET) < 0)
 376                 return log_error_errno(-1, errno, "Failed to seek to start of template");
 377
 378         buf = malloc(flen + 1);
 379         if (!buf)
 380                 return log_error_errno(-1, ENOMEM, "Out of memory");
 381
 382         if (fread(buf, 1, flen, f) != flen)
 383                 return log_error_errno(-1, errno, "Failed to read template");
 384
 385         buf[flen] = '\0';
 386         ret = do_sha1_hash(buf, flen, (void *)digest, md_len);
 387         return ret;
 388 }
 389 #endif
 390
 391 struct lxc_popen_FILE *lxc_popen(const char *command)
 392 {
 393         int ret;
 394         int pipe_fds[2];
 395         pid_t child_pid;
 396         struct lxc_popen_FILE *fp = NULL;
 397
 398         ret = pipe2(pipe_fds, O_CLOEXEC);
 399         if (ret < 0)
 400                 return NULL;
 401
 402         child_pid = fork();
 403         if (child_pid < 0)
 404                 goto on_error;
 405
 406         if (!child_pid) {
 407                 sigset_t mask;
 408
 409                 close(pipe_fds[0]);
 410
 411                 /* duplicate stdout */
 412                 if (pipe_fds[1] != STDOUT_FILENO)
 413                         ret = dup2(pipe_fds[1], STDOUT_FILENO);
 414                 else
 415                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 416                 if (ret < 0) {
 417                         close(pipe_fds[1]);
 418                         _exit(EXIT_FAILURE);
 419                 }
 420
 421                 /* duplicate stderr */
 422                 if (pipe_fds[1] != STDERR_FILENO)
 423                         ret = dup2(pipe_fds[1], STDERR_FILENO);
 424                 else
 425                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 426                 close(pipe_fds[1]);
 427                 if (ret < 0)
 428                         _exit(EXIT_FAILURE);
 429
 430                 /* unblock all signals */
 431                 ret = sigfillset(&mask);
 432                 if (ret < 0)
 433                         _exit(EXIT_FAILURE);
 434
 435                 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
 436                 if (ret < 0)
 437                         _exit(EXIT_FAILURE);
 438
 439                 /* check if /bin/sh exist, otherwise try Android location /system/bin/sh */
 440                 if (file_exists("/bin/sh"))
 441                         execl("/bin/sh", "sh", "-c", command, (char *)NULL);
 442                 else
 443                         execl("/system/bin/sh", "sh", "-c", command, (char *)NULL);
 444
 445                 _exit(127);
 446         }
 447
 448         close(pipe_fds[1]);
 449         pipe_fds[1] = -1;
 450
 451         fp = malloc(sizeof(*fp));
 452         if (!fp)
 453                 goto on_error;
 454
 455         memset(fp, 0, sizeof(*fp));
 456
 457         fp->child_pid = child_pid;
 458         fp->pipe = pipe_fds[0];
 459
 460         /* From now on, closing fp->f will also close fp->pipe. So only ever
 461          * call fclose(fp->f).
 462          */
 463         fp->f = fdopen(pipe_fds[0], "r");
 464         if (!fp->f)
 465                 goto on_error;
 466
 467         return fp;
 468
 469 on_error:
 470         /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
 471          * called yet. Otherwise the fd belongs to the file opened by fdopen()
 472          * since it isn't dup()ed.
 473          */
 474         if (fp && !fp->f && pipe_fds[0] >= 0)
 475                 close(pipe_fds[0]);
 476
 477         if (pipe_fds[1] >= 0)
 478                 close(pipe_fds[1]);
 479
 480         if (fp && fp->f)
 481                 fclose(fp->f);
 482
 483         if (fp)
 484                 free(fp);
 485
 486         return NULL;
 487 }
 488
 489 int lxc_pclose(struct lxc_popen_FILE *fp)
 490 {
 491         pid_t wait_pid;
 492         int wstatus = 0;
 493
 494         if (!fp)
 495                 return -1;
 496
 497         do {
 498                 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
 499         } while (wait_pid < 0 && errno == EINTR);
 500
 501         fclose(fp->f);
 502         free(fp);
 503
 504         if (wait_pid < 0)
 505                 return -1;
 506
 507         return wstatus;
 508 }
 509
 510 int randseed(bool srand_it)
 511 {
 512         __do_fclose FILE *f = NULL;
 513         /*
 514          * srand pre-seed function based on /dev/urandom
 515          */
 516         unsigned int seed = time(NULL) + getpid();
 517
 518         f = fopen("/dev/urandom", "re");
 519         if (f) {
 520                 int ret = fread(&seed, sizeof(seed), 1, f);
 521                 if (ret != 1)
 522                         SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
 523         }
 524
 525         if (srand_it)
 526                 srand(seed);
 527
 528         return seed;
 529 }
 530
 531 uid_t get_ns_uid(uid_t orig)
 532 {
 533         __do_free char *line = NULL;
 534         __do_fclose FILE *f = NULL;
 535         size_t sz = 0;
 536         uid_t nsid, hostid, range;
 537
 538         f = fopen("/proc/self/uid_map", "re");
 539         if (!f)
 540                 return log_error_errno(0, errno, "Failed to open uid_map");
 541
 542         while (getline(&line, &sz, f) != -1) {
 543                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 544                         continue;
 545
 546                 if (hostid <= orig && hostid + range > orig)
 547                         return nsid += orig - hostid;
 548         }
 549
 550         return LXC_INVALID_UID;
 551 }
 552
 553 gid_t get_ns_gid(gid_t orig)
 554 {
 555         __do_free char *line = NULL;
 556         __do_fclose FILE *f = NULL;
 557         size_t sz = 0;
 558         gid_t nsid, hostid, range;
 559
 560         f = fopen("/proc/self/gid_map", "re");
 561         if (!f)
 562                 return log_error_errno(0, errno, "Failed to open gid_map");
 563
 564         while (getline(&line, &sz, f) != -1) {
 565                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 566                         continue;
 567
 568                 if (hostid <= orig && hostid + range > orig)
 569                         return nsid += orig - hostid;
 570         }
 571
 572         return LXC_INVALID_GID;
 573 }
 574
/*
 * Thin wrapper around exists_dir_at() that passes -1 as the directory fd.
 * NOTE(review): presumably returns true when @path is an existing directory;
 * confirm against exists_dir_at().
 */
bool dir_exists(const char *path)
{
        return exists_dir_at(-1, path);
}
 579
 580 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 581  * FNV has good anti collision properties and we're not worried
 582  * about pre-image resistance or one-way-ness, we're just trying to make
 583  * the name unique in the 108 bytes of space we have.
 584  */
 585 uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
 586 {
 587         unsigned char *bp;
 588
 589         for(bp = buf; bp < (unsigned char *)buf + len; bp++) {
 590                 /* xor the bottom with the current octet */
 591                 hval ^= (uint64_t)*bp;
 592
 593                 /* gcc optimised:
 594                  * multiply by the 64 bit FNV magic prime mod 2^64
 595                  */
 596                 hval += (hval << 1) + (hval << 4) + (hval << 5) +
 597                         (hval << 7) + (hval << 8) + (hval << 40);
 598         }
 599
 600         return hval;
 601 }
 602
 603 bool is_shared_mountpoint(const char *path)
 604 {
 605         __do_fclose FILE *f = NULL;
 606         __do_free char *line = NULL;
 607         int i;
 608         size_t len = 0;
 609
 610         f = fopen("/proc/self/mountinfo", "re");
 611         if (!f)
 612                 return 0;
 613
 614         while (getline(&line, &len, f) > 0) {
 615                 char *slider1, *slider2;
 616
 617                 for (slider1 = line, i = 0; slider1 && i < 4; i++)
 618                         slider1 = strchr(slider1 + 1, ' ');
 619
 620                 if (!slider1)
 621                         continue;
 622
 623                 slider2 = strchr(slider1 + 1, ' ');
 624                 if (!slider2)
 625                         continue;
 626
 627                 *slider2 = '\0';
 628                 if (strequal(slider1 + 1, path)) {
 629                         /* This is the path. Is it shared? */
 630                         slider1 = strchr(slider2 + 1, ' ');
 631                         if (slider1 && strstr(slider1, "shared:"))
 632                                 return true;
 633                 }
 634         }
 635
 636         return false;
 637 }
 638
 639 /*
 640  * Detect whether / is mounted MS_SHARED.  The only way I know of to
 641  * check that is through /proc/self/mountinfo.
 642  * I'm only checking for /.  If the container rootfs or mount location
 643  * is MS_SHARED, but not '/', then you're out of luck - figuring that
 644  * out would be too much work to be worth it.
 645  */
 646 int detect_shared_rootfs(void)
 647 {
 648         if (is_shared_mountpoint("/"))
 649                 return 1;
 650
 651         return 0;
 652 }
 653
 654 bool switch_to_ns(pid_t pid, const char *ns)
 655 {
 656         __do_close int fd = -EBADF;
 657         int ret;
 658         char nspath[STRLITERALLEN("/proc//ns/")
 659                     + INTTYPE_TO_STRLEN(pid_t)
 660                     + LXC_NAMESPACE_NAME_MAX];
 661
 662         /* Switch to new ns */
 663         ret = strnprintf(nspath, sizeof(nspath), "/proc/%d/ns/%s", pid, ns);
 664         if (ret < 0)
 665                 return false;
 666
 667         fd = open(nspath, O_RDONLY | O_CLOEXEC);
 668         if (fd < 0)
 669                 return log_error_errno(false, errno, "Failed to open \"%s\"", nspath);
 670
 671         ret = setns(fd, 0);
 672         if (ret)
 673                 return log_error_errno(false, errno, "Failed to set process %d to \"%s\" of %d", pid, ns, fd);
 674
 675         return true;
 676 }
 677
 678 /*
 679  * looking at fs/proc_namespace.c, it appears we can
 680  * actually expect the rootfs entry to very specifically contain
 681  * " - rootfs rootfs "
 682  * IIUC, so long as we've chrooted so that rootfs is not our root,
 683  * the rootfs entry should always be skipped in mountinfo contents.
 684  */
 685 bool detect_ramfs_rootfs(void)
 686 {
 687         __do_free char *line = NULL;
 688         __do_free void *fopen_cache = NULL;
 689         __do_fclose FILE *f = NULL;
 690         size_t len = 0;
 691
 692         f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
 693         if (!f)
 694                 return false;
 695
 696         while (getline(&line, &len, f) != -1) {
 697                 int i;
 698                 char *p, *p2;
 699
 700                 for (p = line, i = 0; p && i < 4; i++)
 701                         p = strchr(p + 1, ' ');
 702                 if (!p)
 703                         continue;
 704
 705                 p2 = strchr(p + 1, ' ');
 706                 if (!p2)
 707                         continue;
 708                 *p2 = '\0';
 709                 if (strequal(p + 1, "/")) {
 710                         /* This is '/'. Is it the ramfs? */
 711                         p = strchr(p2 + 1, '-');
 712                         if (p && strnequal(p, "- rootfs ", 9))
 713                                 return true;
 714                 }
 715         }
 716
 717         return false;
 718 }
 719
 720 char *on_path(const char *cmd, const char *rootfs)
 721 {
 722         __do_free char *path = NULL;
 723         char *entry = NULL;
 724         char cmdpath[PATH_MAX];
 725         int ret;
 726
 727         path = getenv("PATH");
 728         if (!path)
 729                 return NULL;
 730
 731         path = strdup(path);
 732         if (!path)
 733                 return NULL;
 734
 735         lxc_iterate_parts(entry, path, ":") {
 736                 if (rootfs)
 737                         ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s/%s", rootfs, entry, cmd);
 738                 else
 739                         ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s", entry, cmd);
 740                 if (ret < 0)
 741                         continue;
 742
 743                 if (access(cmdpath, X_OK) == 0)
 744                         return strdup(cmdpath);
 745         }
 746
 747         return NULL;
 748 }
 749
 750 /* historically lxc-init has been under /usr/lib/lxc and under
 751  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
 752  */
 753 char *choose_init(const char *rootfs)
 754 {
 755         char *retv = NULL;
 756         const char *empty = "",
 757                    *tmp;
 758         int ret, env_set = 0;
 759
 760         if (!getenv("PATH")) {
 761                 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
 762                         SYSERROR("Failed to setenv");
 763
 764                 env_set = 1;
 765         }
 766
 767         retv = on_path("init.lxc", rootfs);
 768
 769         if (env_set)
 770                 if (unsetenv("PATH"))
 771                         SYSERROR("Failed to unsetenv");
 772
 773         if (retv)
 774                 return retv;
 775
 776         retv = malloc(PATH_MAX);
 777         if (!retv)
 778                 return NULL;
 779
 780         if (rootfs)
 781                 tmp = rootfs;
 782         else
 783                 tmp = empty;
 784
 785         ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
 786         if (ret < 0) {
 787                 ERROR("The name of path is too long");
 788                 goto out1;
 789         }
 790
 791         if (access(retv, X_OK) == 0)
 792                 return retv;
 793
 794         ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
 795         if (ret < 0) {
 796                 ERROR("The name of path is too long");
 797                 goto out1;
 798         }
 799
 800         if (access(retv, X_OK) == 0)
 801                 return retv;
 802
 803         ret = strnprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
 804         if (ret < 0) {
 805                 ERROR("The name of path is too long");
 806                 goto out1;
 807         }
 808
 809         if (access(retv, X_OK) == 0)
 810                 return retv;
 811
 812         ret = strnprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
 813         if (ret < 0) {
 814                 ERROR("The name of path is too long");
 815                 goto out1;
 816         }
 817
 818         if (access(retv, X_OK) == 0)
 819                 return retv;
 820
 821         /*
 822          * Last resort, look for the statically compiled init.lxc which we
 823          * hopefully bind-mounted in.
 824          * If we are called during container setup, and we get to this point,
 825          * then the init.lxc.static from the host will need to be bind-mounted
 826          * in.  So we return NULL here to indicate that.
 827          */
 828         if (rootfs)
 829                 goto out1;
 830
 831         ret = strnprintf(retv, PATH_MAX, "/init.lxc.static");
 832         if (ret < 0) {
 833                 WARN("Nonsense - name /lxc.init.static too long");
 834                 goto out1;
 835         }
 836
 837         if (access(retv, X_OK) == 0)
 838                 return retv;
 839
 840 out1:
 841         free(retv);
 842         return NULL;
 843 }
 844
 845 /*
 846  * Given the '-t' template option to lxc-create, figure out what to
 847  * do.  If the template is a full executable path, use that.  If it
 848  * is something like 'sshd', then return $templatepath/lxc-sshd.
 849  * On success return the template, on error return NULL.
 850  */
 851 char *get_template_path(const char *t)
 852 {
 853         int ret, len;
 854         char *tpath;
 855
 856         if (t[0] == '/') {
 857                 if (access(t, X_OK) == 0) {
 858                         return strdup(t);
 859                 } else {
 860                         SYSERROR("Bad template pathname: %s", t);
 861                         return NULL;
 862                 }
 863         }
 864
 865         len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
 866
 867         tpath = malloc(len);
 868         if (!tpath)
 869                 return NULL;
 870
 871         ret = strnprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
 872         if (ret < 0) {
 873                 free(tpath);
 874                 return NULL;
 875         }
 876
 877         if (access(tpath, X_OK) < 0) {
 878                 SYSERROR("bad template: %s", t);
 879                 free(tpath);
 880                 return NULL;
 881         }
 882
 883         return tpath;
 884 }
 885
 886 /*
 887  * @path:    a pathname where / replaced with '\0'.
 888  * @offsetp: pointer to int showing which path segment was last seen.
 889  *           Updated on return to reflect the next segment.
 890  * @fulllen: full original path length.
 891  * Returns a pointer to the next path segment, or NULL if done.
 892  */
 893 static char *get_nextpath(char *path, int *offsetp, int fulllen)
 894 {
 895         int offset = *offsetp;
 896
 897         if (offset >= fulllen)
 898                 return NULL;
 899
 900         while (offset < fulllen && path[offset] != '\0')
 901                 offset++;
 902
 903         while (offset < fulllen && path[offset] == '\0')
 904                 offset++;
 905
 906         *offsetp = offset;
 907
 908         return (offset < fulllen) ? &path[offset] : NULL;
 909 }
 910
 911 /*
 912  * Check that @subdir is a subdir of @dir.  @len is the length of
 913  * @dir (to avoid having to recalculate it).
 914  */
 915 static bool is_subdir(const char *subdir, const char *dir, size_t len)
 916 {
 917         size_t subdirlen = strlen(subdir);
 918
 919         if (subdirlen < len)
 920                 return false;
 921
 922         if (!strnequal(subdir, dir, len))
 923                 return false;
 924
 925         if (dir[len-1] == '/')
 926                 return true;
 927
 928         if (subdir[len] == '/' || subdirlen == len)
 929                 return true;
 930
 931         return false;
 932 }
 933
 934 /*
 935  * Check if the open fd is a symlink.  Return -ELOOP if it is.  Return
 936  * -ENOENT if we couldn't fstat.  Return 0 if the fd is ok.
 937  */
 938 static int check_symlink(int fd)
 939 {
 940         struct stat sb;
 941         int ret;
 942
 943         ret = fstat(fd, &sb);
 944         if (ret < 0)
 945                 return -ENOENT;
 946
 947         if (S_ISLNK(sb.st_mode))
 948                 return -ELOOP;
 949
 950         return 0;
 951 }
 952
 953 /*
 954  * Open a file or directory, provided that it contains no symlinks.
 955  *
 956  * CAVEAT: This function must not be used for other purposes than container
 957  * setup before executing the container's init
 958  */
 959 static int open_if_safe(int dirfd, const char *nextpath)
 960 {
 961         int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
 962         if (newfd >= 0) /* Was not a symlink, all good. */
 963                 return newfd;
 964
 965         if (errno == ELOOP)
 966                 return newfd;
 967
 968         if (errno == EPERM || errno == EACCES) {
 969                 /* We're not root (cause we got EPERM) so try opening with
 970                  * O_PATH.
 971                  */
 972                 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
 973                 if (newfd >= 0) {
 974                         /* O_PATH will return an fd for symlinks. We know
 975                          * nextpath wasn't a symlink at last openat, so if fd is
 976                          * now a link, then something * fishy is going on.
 977                          */
 978                         int ret = check_symlink(newfd);
 979                         if (ret < 0) {
 980                                 close(newfd);
 981                                 newfd = ret;
 982                         }
 983                 }
 984         }
 985
 986         return newfd;
 987 }
 988
/*
 * Open a path intended for mounting, walking it one component at a time so
 * that no component after the trusted prefix is a symbolic link.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: a part of @target in which to ignore symbolic links.  This
 * would be the container's rootfs.  NULL or "" means "/" is used as the
 * starting directory and the whole of @target is walked.
 *
 * Return an open fd for the path, or <0 on error: -EINVAL if @target does
 * not start with @prefix_skip, -ENOMEM if the working copy cannot be
 * allocated, otherwise the negative value produced by open() or
 * open_if_safe().
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
        int curlen = 0, dirfd, fulllen, i;
        char *dup;

        fulllen = strlen(target);

        /* make sure prefix-skip makes sense */
        if (prefix_skip && strlen(prefix_skip) > 0) {
                curlen = strlen(prefix_skip);
                if (!is_subdir(target, prefix_skip, curlen)) {
                        ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
                              target, prefix_skip);
                        return -EINVAL;
                }

                /*
                 * get_nextpath() expects the curlen argument to be
                 * on a '/' (turned into '\0') or before it, so decrement
                 * curlen to make sure that happens
                 */
                if (curlen)
                        curlen--;
        } else {
                prefix_skip = "/";
                curlen = 0;
        }

        /* Make a copy of target which we can hack up, and tokenize it */
        if ((dup = strdup(target)) == NULL) {
                ERROR("Out of memory checking for symbolic link");
                return -ENOMEM;
        }

        /* Turn every '/' into '\0' so each component is its own C string. */
        for (i = 0; i < fulllen; i++) {
                if (dup[i] == '/')
                        dup[i] = '\0';
        }

        /* The prefix itself is trusted and opened in one go. */
        dirfd = open(prefix_skip, O_RDONLY);
        if (dirfd < 0) {
                SYSERROR("Failed to open path \"%s\"", prefix_skip);
                goto out;
        }

        for (;;) {
                int newfd, saved_errno;
                char *nextpath;

                /* NULL means no component is left: dirfd is the final fd. */
                if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
                        goto out;

                /*
                 * Descend one level. errno is saved before close() because
                 * close() may overwrite it and it is needed below.
                 */
                newfd = open_if_safe(dirfd, nextpath);
                saved_errno = errno;
                close(dirfd);

                dirfd = newfd;
                if (newfd < 0) {
                        errno = saved_errno;
                        if (errno == ELOOP)
                                SYSERROR("%s in %s was a symbolic link!", nextpath, target);

                        goto out;
                }
        }

out:
        /* dirfd holds either the final descriptor or the negative error. */
        free(dup);
        return dirfd;
}
1072
1073 int __safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
1074                             unsigned int flags, const void *data)
1075 {
1076         __do_close int source_fd = -EBADF, target_fd = -EBADF;
1077         struct lxc_open_how how = {
1078                 .flags          = PROTECT_OPATH_DIRECTORY,
1079                 .resolve        = PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS,
1080         };
1081         int ret;
1082         char src_buf[LXC_PROC_PID_FD_LEN], tgt_buf[LXC_PROC_PID_FD_LEN];
1083
1084         if (beneath_fd < 0)
1085                 return -EINVAL;
1086
1087         if ((flags & MS_BIND) && src && src[0] != '/') {
1088                 source_fd = openat2(beneath_fd, src, &how, sizeof(how));
1089                 if (source_fd < 0)
1090                         return -errno;
1091                 ret = strnprintf(src_buf, sizeof(src_buf), "/proc/self/fd/%d", source_fd);
1092                 if (ret < 0)
1093                         return -EIO;
1094         } else {
1095                 src_buf[0] = '\0';
1096         }
1097
1098         target_fd = openat2(beneath_fd, dst, &how, sizeof(how));
1099         if (target_fd < 0)
1100                 return log_error_errno(-errno, errno, "Failed to open %d(%s)", beneath_fd, dst);
1101         ret = strnprintf(tgt_buf, sizeof(tgt_buf), "/proc/self/fd/%d", target_fd);
1102         if (ret < 0)
1103                 return -EIO;
1104
1105         if (!is_empty_string(src_buf))
1106                 ret = mount(src_buf, tgt_buf, fstype, flags, data);
1107         else
1108                 ret = mount(src, tgt_buf, fstype, flags, data);
1109
1110         return ret;
1111 }
1112
1113 int safe_mount_beneath(const char *beneath, const char *src, const char *dst, const char *fstype,
1114                        unsigned int flags, const void *data)
1115 {
1116         __do_close int beneath_fd = -EBADF;
1117         const char *path = beneath ? beneath : "/";
1118
1119         beneath_fd = openat(-1, path, PROTECT_OPATH_DIRECTORY);
1120         if (beneath_fd < 0)
1121                 return log_error_errno(-errno, errno, "Failed to open %s", path);
1122
1123         return __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
1124 }
1125
/*
 * Public entry point that forwards every argument unchanged to
 * __safe_mount_beneath_at() and returns its result.
 */
int safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
                          unsigned int flags, const void *data)
{
        return __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
}
1131
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs.  (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * The target (and a relative bind-mount source) are opened component by
 * component via open_without_symlink() and then passed to mount() as
 * /proc/self/fd/<fd> paths.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * Returns 0 on success and a negative value on failure; when mount() itself
 * fails, errno is restored to the value mount() set.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
                unsigned long flags, const void *data, const char *rootfs)
{
        /* Only needs enough for /proc/self/fd/<fd>. */
        char src_proc[50], dest_proc[50];
        const char *mount_source = src;
        int src_fd = -1;
        int dest_fd, rc, err;

        if (!rootfs)
                rootfs = "";

        /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
        if ((flags & MS_BIND) && src && src[0] != '/') {
                INFO("This is a relative bind mount");

                src_fd = open_without_symlink(src, NULL);
                if (src_fd < 0)
                        return src_fd;

                rc = strnprintf(src_proc, sizeof(src_proc), "/proc/self/fd/%d", src_fd);
                if (rc < 0) {
                        close(src_fd);
                        ERROR("Out of memory");
                        return -EINVAL;
                }

                mount_source = src_proc;
        }

        dest_fd = open_without_symlink(dest, rootfs);
        if (dest_fd < 0) {
                /* Keep the errno from the failed open across close(). */
                if (src_fd != -1) {
                        err = errno;
                        close(src_fd);
                        errno = err;
                }

                return dest_fd;
        }

        rc = strnprintf(dest_proc, sizeof(dest_proc), "/proc/self/fd/%d", dest_fd);
        if (rc < 0) {
                if (src_fd != -1)
                        close(src_fd);

                close(dest_fd);
                ERROR("Out of memory");
                return -EINVAL;
        }

        rc = mount(mount_source, dest_proc, fstype, flags, data);
        err = errno;

        /* Both descriptors are only needed for the duration of mount(). */
        if (src_fd != -1)
                close(src_fd);

        close(dest_fd);

        if (rc >= 0)
                return 0;

        errno = err;
        SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
        return rc;
}
1204
/*
 * Open /dev/null for reading and writing.
 *
 * Returns the new file descriptor, or the negative value from open() after
 * logging the failure.
 */
int open_devnull(void)
{
        int fd;

        fd = open("/dev/null", O_RDWR);
        if (fd >= 0)
                return fd;

        SYSERROR("Can't open /dev/null");
        return fd;
}
1213
/*
 * Duplicate @fd onto stdin, stdout and stderr, in that order.
 *
 * Returns 0 when all three dup2() calls succeed, -1 if @fd is negative or
 * as soon as one dup2() fails.
 */
int set_stdfds(int fd)
{
        static const int std_targets[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };

        if (fd < 0)
                return -1;

        for (size_t i = 0; i < sizeof(std_targets) / sizeof(std_targets[0]); i++) {
                if (dup2(fd, std_targets[i]) < 0)
                        return -1;
        }

        return 0;
}
1235
/*
 * Point stdin, stdout and stderr at /dev/null.
 *
 * Returns -1 if /dev/null cannot be opened, otherwise the result of
 * set_stdfds(). The temporary descriptor is closed before returning.
 */
int null_stdfds(void)
{
        int devnull, ret;

        devnull = open_devnull();
        if (devnull < 0)
                return -1;

        ret = set_stdfds(devnull);
        close(devnull);

        return ret;
}
1249
1250 /* Check whether a signal is blocked by a process. */
1251 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1252 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1253 bool task_blocks_signal(pid_t pid, int signal)
1254 {
1255         __do_free char *line = NULL;
1256         __do_fclose FILE *f = NULL;
1257         int ret;
1258         char status[__PROC_STATUS_LEN] = {0};
1259         uint64_t sigblk = 0, one = 1;
1260         size_t n = 0;
1261         bool bret = false;
1262
1263         ret = strnprintf(status, sizeof(status), "/proc/%d/status", pid);
1264         if (ret < 0)
1265                 return bret;
1266
1267         f = fopen(status, "re");
1268         if (!f)
1269                 return false;
1270
1271         while (getline(&line, &n, f) != -1) {
1272                 char *numstr;
1273
1274                 if (!strnequal(line, "SigBlk:", 7))
1275                         continue;
1276
1277                 numstr = lxc_trim_whitespace_in_place(line + 7);
1278                 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1279                 if (ret < 0)
1280                         return false;
1281
1282                 break;
1283         }
1284
1285         if (sigblk & (one << (signal - 1)))
1286                 bret = true;
1287
1288         return bret;
1289 }
1290
1291 int lxc_preserve_ns(const int pid, const char *ns)
1292 {
1293         int ret;
1294 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
1295 #define __NS_PATH_LEN 50
1296         char path[__NS_PATH_LEN];
1297
1298         /* This way we can use this function to also check whether namespaces
1299          * are supported by the kernel by passing in the NULL or the empty
1300          * string.
1301          */
1302         ret = strnprintf(path, sizeof(path), "/proc/%d/ns%s%s", pid,
1303                          !ns || strequal(ns, "") ? "" : "/",
1304                          !ns || strequal(ns, "") ? "" : ns);
1305         if (ret < 0)
1306                 return ret_errno(EIO);
1307
1308         return open(path, O_RDONLY | O_CLOEXEC);
1309 }
1310
1311 bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
1312 {
1313         int ret = 0;
1314
1315         if (gid != LXC_INVALID_GID) {
1316                 ret = setresgid(gid, gid, gid);
1317                 if (ret < 0) {
1318                         SYSERROR("Failed to switch to gid %d", gid);
1319                         return false;
1320                 }
1321                 NOTICE("Switched to gid %d", gid);
1322         }
1323
1324         if (uid != LXC_INVALID_UID) {
1325                 ret = setresuid(uid, uid, uid);
1326                 if (ret < 0) {
1327                         SYSERROR("Failed to switch to uid %d", uid);
1328                         return false;
1329                 }
1330                 NOTICE("Switched to uid %d", uid);
1331         }
1332
1333         return true;
1334 }
1335
1336 /* Simple convenience function which enables uniform logging. */
1337 bool lxc_drop_groups(void)
1338 {
1339         int ret;
1340
1341         ret = setgroups(0, NULL);
1342         if (ret)
1343                 return log_error_errno(false, errno, "Failed to drop supplimentary groups");
1344
1345         NOTICE("Dropped supplimentary groups");
1346         return ret == 0;
1347 }
1348
1349 bool lxc_setgroups(gid_t list[], size_t size)
1350 {
1351         int ret;
1352
1353         ret = setgroups(size, list);
1354         if (ret)
1355                 return log_error_errno(false, errno, "Failed to set supplimentary groups");
1356
1357         if (size > 0 && lxc_log_trace()) {
1358                 for (size_t i = 0; i < size; i++)
1359                         TRACE("Setting supplimentary group %d", list[i]);
1360         }
1361
1362         NOTICE("Set supplimentary groups");
1363         return true;
1364 }
1365
1366 static int lxc_get_unused_loop_dev_legacy(char *loop_name)
1367 {
1368         struct dirent *dp;
1369         struct loop_info64 lo64;
1370         DIR *dir;
1371         int dfd = -1, fd = -1, ret = -1;
1372
1373         dir = opendir("/dev");
1374         if (!dir) {
1375                 SYSERROR("Failed to open \"/dev\"");
1376                 return -1;
1377         }
1378
1379         while ((dp = readdir(dir))) {
1380                 if (!strnequal(dp->d_name, "loop", 4))
1381                         continue;
1382
1383                 dfd = dirfd(dir);
1384                 if (dfd < 0)
1385                         continue;
1386
1387                 fd = openat(dfd, dp->d_name, O_RDWR);
1388                 if (fd < 0)
1389                         continue;
1390
1391                 ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
1392                 if (ret < 0) {
1393                         if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 ||
1394                             errno != ENXIO) {
1395                                 close(fd);
1396                                 fd = -1;
1397                                 continue;
1398                         }
1399                 }
1400
1401                 ret = strnprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
1402                 if (ret < 0) {
1403                         close(fd);
1404                         fd = -1;
1405                         continue;
1406                 }
1407
1408                 break;
1409         }
1410
1411         closedir(dir);
1412
1413         if (fd < 0)
1414                 return -1;
1415
1416         return fd;
1417 }
1418
1419 static int lxc_get_unused_loop_dev(char *name_loop)
1420 {
1421         int loop_nr, ret;
1422         int fd_ctl = -1, fd_tmp = -1;
1423
1424         fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
1425         if (fd_ctl < 0) {
1426                 SYSERROR("Failed to open loop control");
1427                 return -ENODEV;
1428         }
1429
1430         loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE);
1431         if (loop_nr < 0) {
1432                 SYSERROR("Failed to get loop control");
1433                 goto on_error;
1434         }
1435
1436         ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr);
1437         if (ret < 0)
1438                 goto on_error;
1439
1440         fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1441         if (fd_tmp < 0) {
1442                 /* on Android loop devices are moved under /dev/block, give it a shot */
1443                 ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/block/loop%d", loop_nr);
1444                 if (ret < 0)
1445                         goto on_error;
1446
1447                 fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1448                 if (fd_tmp < 0)
1449                         SYSERROR("Failed to open loop \"%s\"", name_loop);
1450         }
1451
1452 on_error:
1453         close(fd_ctl);
1454         return fd_tmp;
1455 }
1456
1457 int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
1458 {
1459         int ret;
1460         struct loop_info64 lo64;
1461         int fd_img = -1, fret = -1, fd_loop = -1;
1462
1463         fd_loop = lxc_get_unused_loop_dev(loop_dev);
1464         if (fd_loop < 0) {
1465                 if (fd_loop != -ENODEV)
1466                         goto on_error;
1467
1468                 fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);
1469                 if (fd_loop < 0)
1470                         goto on_error;
1471         }
1472
1473         fd_img = open(source, O_RDWR | O_CLOEXEC);
1474         if (fd_img < 0) {
1475                 SYSERROR("Failed to open source \"%s\"", source);
1476                 goto on_error;
1477         }
1478
1479         ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
1480         if (ret < 0) {
1481                 SYSERROR("Failed to set loop fd");
1482                 goto on_error;
1483         }
1484
1485         memset(&lo64, 0, sizeof(lo64));
1486         lo64.lo_flags = flags;
1487
1488         strlcpy((char *)lo64.lo_file_name, source, LO_NAME_SIZE);
1489
1490         ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
1491         if (ret < 0) {
1492                 SYSERROR("Failed to set loop status64");
1493                 goto on_error;
1494         }
1495
1496         fret = 0;
1497
1498 on_error:
1499         if (fd_img >= 0)
1500                 close(fd_img);
1501
1502         if (fret < 0 && fd_loop >= 0) {
1503                 close(fd_loop);
1504                 fd_loop = -1;
1505         }
1506
1507         return fd_loop;
1508 }
1509
/*
 * Unmount @path repeatedly until nothing is mounted there any more.
 *
 * @path: mountpoint to pop
 * @lazy: use MNT_DETACH for each umount2() call
 *
 * Returns the number of successful unmounts (saturating at INT_MAX) once
 * umount2() fails with EINVAL, or -errno for any other failure.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
        const int umount_flags = lazy ? MNT_DETACH : 0;
        int popped = 0;

        /* Every success may have uncovered another mount stacked underneath,
         * so keep going until umount2() refuses.
         */
        while (umount2(path, umount_flags) >= 0) {
                /* Just stop counting at INT_MAX instead of overflowing. */
                if (popped != INT_MAX)
                        popped++;
        }

        /* We consider anything else than EINVAL deadly to prevent going
         * into an infinite loop. (The other alternative is constantly
         * parsing /proc/self/mountinfo which is yucky and probably racy.)
         */
        if (errno != EINVAL)
                return -errno;

        return popped;
}
1541
/*
 * Run @child_fn(@args) in a child process with its stdout and stderr
 * redirected into a pipe, optionally capture that output, and wait for the
 * child.
 *
 * @buf:         optional buffer that receives the child's output; may be NULL
 * @buf_size:    size of @buf in bytes
 * @child_fn:    function executed in the child; it is expected not to return
 * @args:        opaque argument handed to @child_fn
 * @wait_status: selects lxc_wait_for_pid_status() (true) or wait_for_pid()
 *               (false) for reaping the child
 *
 * Returns -1 if the pipe or the child cannot be created, otherwise the value
 * returned by the selected wait helper.
 */
static int run_command_internal(char *buf, size_t buf_size, int (*child_fn)(void *), void *args, bool wait_status)
{
        pid_t child;
        int ret, fret, pipefd[2];
        ssize_t bytes;

        /* Make sure our callers do not receive uninitialized memory. */
        if (buf_size > 0 && buf)
                buf[0] = '\0';

        if (pipe(pipefd) < 0) {
                SYSERROR("Failed to create pipe");
                return -1;
        }

        child = lxc_raw_clone(0, NULL);
        if (child < 0) {
                close(pipefd[0]);
                close(pipefd[1]);
                SYSERROR("Failed to create new process");
                return -1;
        }

        if (child == 0) {
                /* Close the read-end of the pipe. */
                close(pipefd[0]);

                /* Redirect std{err,out} to write-end of the
                 * pipe.
                 */
                ret = dup2(pipefd[1], STDOUT_FILENO);
                if (ret >= 0)
                        ret = dup2(pipefd[1], STDERR_FILENO);

                /* Close the write-end of the pipe. */
                close(pipefd[1]);

                if (ret < 0) {
                        SYSERROR("Failed to duplicate std{err,out} file descriptor");
                        _exit(EXIT_FAILURE);
                }

                /* Does not return. */
                child_fn(args);
                ERROR("Failed to exec command");
                _exit(EXIT_FAILURE);
        }

        /* close the write-end of the pipe */
        close(pipefd[1]);

        if (buf && buf_size > 0) {
                /* A single read of at most buf_size - 1 bytes is performed. */
                bytes = lxc_read_nointr(pipefd[0], buf, buf_size - 1);
                /* The last byte read is replaced by the terminator
                 * (presumably a trailing newline - verify against callers).
                 */
                if (bytes > 0)
                        buf[bytes - 1] = '\0';
        }

        if (wait_status)
                fret = lxc_wait_for_pid_status(child);
        else
                fret = wait_for_pid(child);

        /* close the read-end of the pipe */
        close(pipefd[0]);

        return fret;
}
1609
/*
 * Run @child_fn(@args) in a child process, capturing its output in @buf.
 * Forwards to run_command_internal() with wait_status set to false and
 * returns its result.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
    return run_command_internal(buf, buf_size, child_fn, args, false);
}
1614
/*
 * Same as run_command(), but forwards to run_command_internal() with
 * wait_status set to true and returns its result.
 */
int run_command_status(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
    return run_command_internal(buf, buf_size, child_fn, args, true);
}
1619
1620 bool lxc_nic_exists(char *nic)
1621 {
1622 #define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
1623         char path[__LXC_SYS_CLASS_NET_LEN];
1624         int ret;
1625         struct stat sb;
1626
1627         if (strequal(nic, "none"))
1628                 return true;
1629
1630         ret = strnprintf(path, sizeof(path), "/sys/class/net/%s", nic);
1631         if (ret < 0)
1632                 return false;
1633
1634         ret = stat(path, &sb);
1635         if (ret < 0)
1636                 return false;
1637
1638         return true;
1639 }
1640
/*
 * Round @n up to the nearest power of two.
 *
 * Returns @n itself if it already is a power of two, and 0 for an input of 0
 * (0 is not a valid power of two). If the next power of two does not fit in
 * 64 bits the shift wraps and 0 is returned.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
        uint64_t top_bit = n;

        if (n == 0)
                return 0;

        /* Clear the lowest set bit until only the highest one is left. */
        while (top_bit & (top_bit - 1))
                top_bit &= top_bit - 1;

        /* A single set bit means n already was a power of two. */
        if (top_bit == n)
                return n;

        return top_bit << 1;
}
1658
1659 static int process_dead(/* takes */ int status_fd)
1660 {
1661         __do_close int dupfd = -EBADF;
1662         __do_free char *line = NULL;
1663         __do_fclose FILE *f = NULL;
1664         int ret = 0;
1665         size_t n = 0;
1666
1667         dupfd = dup(status_fd);
1668         if (dupfd < 0)
1669                 return -1;
1670
1671         if (fd_cloexec(dupfd, true) < 0)
1672                 return -1;
1673
1674         f = fdopen(dupfd, "re");
1675         if (!f)
1676                 return -1;
1677
1678         /* Transfer ownership of fd. */
1679         move_fd(dupfd);
1680
1681         ret = 0;
1682         while (getline(&line, &n, f) != -1) {
1683                 char *state;
1684
1685                 if (!strnequal(line, "State:", 6))
1686                         continue;
1687
1688                 state = lxc_trim_whitespace_in_place(line + 6);
1689                 /* only check whether process is dead or zombie for now */
1690                 if (*state == 'X' || *state == 'Z')
1691                         ret = 1;
1692         }
1693
1694         return ret;
1695 }
1696
/*
 * Arrange for @signal to be delivered to the caller when its parent dies and
 * make sure the parent has not already gone away in the meantime.
 *
 * @signal:           signal passed to prctl(PR_SET_PDEATHSIG)
 * @parent:           pid the caller expects as its parent
 * @parent_status_fd: fd for the parent's status file, consulted when
 *                    getppid() returns 0; may be negative
 *
 * Returns 0 on success, -1 if prctl() failed, or the result of
 * raise(SIGKILL) when the parent is found to be gone.
 */
int lxc_set_death_signal(int signal, pid_t parent, int parent_status_fd)
{
        const int prctl_ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
                                    prctl_arg(0), prctl_arg(0));
        /* verify that we haven't been orphaned in the meantime */
        const pid_t cur_parent = (pid_t)syscall(SYS_getppid);
        bool orphaned;

        if (cur_parent != 0) {
                orphaned = (cur_parent != parent);
        } else {
                /* parent outside our pidns */
                if (parent_status_fd < 0)
                        return 0;

                orphaned = (process_dead(parent_status_fd) == 1);
        }

        if (orphaned)
                return raise(SIGKILL);

        return prctl_ret < 0 ? -1 : 0;
}
1722
1723 int lxc_rm_rf(const char *dirname)
1724 {
1725         __do_closedir DIR *dir = NULL;
1726         int fret = 0;
1727         int ret;
1728         struct dirent *direntp;
1729
1730         dir = opendir(dirname);
1731         if (!dir)
1732                 return log_error_errno(-1, errno, "Failed to open dir \"%s\"", dirname);
1733
1734         while ((direntp = readdir(dir))) {
1735                 __do_free char *pathname = NULL;
1736                 struct stat mystat;
1737
1738                 if (strequal(direntp->d_name, ".") ||
1739                     strequal(direntp->d_name, ".."))
1740                         continue;
1741
1742                 pathname = must_make_path(dirname, direntp->d_name, NULL);
1743                 ret = lstat(pathname, &mystat);
1744                 if (ret < 0) {
1745                         if (!fret)
1746                                 SYSWARN("Failed to stat \"%s\"", pathname);
1747
1748                         fret = -1;
1749                         continue;
1750                 }
1751
1752                 if (!S_ISDIR(mystat.st_mode))
1753                         continue;
1754
1755                 ret = lxc_rm_rf(pathname);
1756                 if (ret < 0)
1757                         fret = -1;
1758         }
1759
1760         ret = rmdir(dirname);
1761         if (ret < 0)
1762                 return log_warn_errno(-1, errno, "Failed to delete \"%s\"", dirname);
1763
1764         return fret;
1765 }
1766
1767 bool lxc_can_use_pidfd(int pidfd)
1768 {
1769         int ret;
1770
1771         if (pidfd < 0)
1772                 return log_error(false, "Kernel does not support pidfds");
1773
1774         /*
1775          * We don't care whether or not children were in a waitable state. We
1776          * just care whether waitid() recognizes P_PIDFD.
1777          *
1778          * Btw, while I have your attention, the above waitid() code is an
1779          * excellent example of how _not_ to do flag-based kernel APIs. So if
1780          * you ever go into kernel development or are already and you add this
1781          * kind of flag potpourri even though you have read this comment shame
1782          * on you. May the gods of operating system development have mercy on
1783          * your soul because I won't.
1784          */
1785         ret = waitid(P_PIDFD, pidfd, NULL,
1786                     /* Type of children to wait for. */
1787                     __WALL |
1788                     /* How to wait for them. */
1789                     WNOHANG | WNOWAIT |
1790                     /* What state to wait for. */
1791                     WEXITED | WSTOPPED | WCONTINUED);
1792         if (ret < 0)
1793                 return log_error_errno(false, errno, "Kernel does not support waiting on processes through pidfds");
1794
1795         ret = lxc_raw_pidfd_send_signal(pidfd, 0, NULL, 0);
1796         if (ret)
1797                 return log_error_errno(false, errno, "Kernel does not support sending singals through pidfds");
1798
1799         return log_trace(true, "Kernel supports pidfds");
1800 }
1801
1802 int fix_stdio_permissions(uid_t uid)
1803 {
1804         __do_close int devnull_fd = -EBADF;
1805         int fret = 0;
1806         int std_fds[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};
1807         int ret;
1808         struct stat st, st_null;
1809
1810         devnull_fd = open_devnull();
1811         if (devnull_fd < 0)
1812                 return log_trace_errno(-1, errno, "Failed to open \"/dev/null\"");
1813
1814         ret = fstat(devnull_fd, &st_null);
1815         if (ret)
1816                 return log_trace_errno(-errno, errno, "Failed to stat \"/dev/null\"");
1817
1818         for (int i = 0; i < ARRAY_SIZE(std_fds); i++) {
1819                 ret = fstat(std_fds[i], &st);
1820                 if (ret) {
1821                         SYSWARN("Failed to stat standard I/O file descriptor %d", std_fds[i]);
1822                         fret = -1;
1823                         continue;
1824                 }
1825
1826                 if (st.st_rdev == st_null.st_rdev)
1827                         continue;
1828
1829                 ret = fchown(std_fds[i], uid, st.st_gid);
1830                 if (ret) {
1831                         SYSTRACE("Failed to chown standard I/O file descriptor %d to uid %d and gid %d",
1832                                  std_fds[i], uid, st.st_gid);
1833                         fret = -1;
1834                         continue;
1835                 }
1836
1837                 ret = fchmod(std_fds[i], 0700);
1838                 if (ret) {
1839                         SYSTRACE("Failed to chmod standard I/O file descriptor %d", std_fds[i]);
1840                         fret = -1;
1841                 }
1842         }
1843
1844         return fret;
1845 }
1846
/*
 * Multiply @base by @mult and store the product in @res if it fits into an
 * int64_t.
 *
 * Returns true when the product is representable (and *res has been
 * written), false when the multiplication would overflow (*res is then left
 * untouched).
 */
bool multiply_overflow(int64_t base, uint64_t mult, int64_t *res)
{
        /* |INT64_MIN| is 2^63, which is only representable as unsigned. */
        const uint64_t neg_limit = (uint64_t)INT64_MAX + 1;
        uint64_t magnitude, product;

        /* Handles mult == 0 up front so the divisions below are safe. */
        if (base == 0 || mult == 0) {
                *res = 0;
                return true;
        }

        if (base > 0) {
                magnitude = (uint64_t)base;
                if (magnitude > (uint64_t)INT64_MAX / mult)
                        return false;

                *res = (int64_t)(magnitude * mult);
                return true;
        }

        /*
         * Negative base: work on the magnitude in unsigned arithmetic.
         * Negating in uint64_t is well defined even for INT64_MIN.
         */
        magnitude = (uint64_t)0 - (uint64_t)base;
        if (magnitude > neg_limit / mult)
                return false;

        product = magnitude * mult;
        /* 2^63 cannot be negated as int64_t, so map it to INT64_MIN directly. */
        *res = (product == neg_limit) ? INT64_MIN : -(int64_t)product;
        return true;
}
1858
1859 int print_r(int fd, const char *path)
1860 {
1861         __do_close int dfd = -EBADF, dfd_dup = -EBADF;
1862         __do_closedir DIR *dir = NULL;
1863         int ret = 0;
1864         struct dirent *direntp;
1865         struct stat st;
1866
1867         if (is_empty_string(path)) {
1868                 char buf[LXC_PROC_SELF_FD_LEN];
1869
1870                 ret = strnprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
1871                 if (ret < 0)
1872                         return ret_errno(EIO);
1873
1874                 /*
1875                  * O_PATH file descriptors can't be used so we need to re-open
1876                  * just in case.
1877                  */
1878                 dfd = openat(-EBADF, buf, O_CLOEXEC | O_DIRECTORY, 0);
1879         } else {
1880                 dfd = openat(fd, path, O_CLOEXEC | O_DIRECTORY, 0);
1881         }
1882         if (dfd < 0)
1883                 return -1;
1884
1885         dfd_dup = dup_cloexec(dfd);
1886         if (dfd_dup < 0)
1887                 return -1;
1888
1889         dir = fdopendir(dfd);
1890         if (!dir)
1891                 return -1;
1892         /* Transfer ownership to fdopendir(). */
1893         move_fd(dfd);
1894
1895         while ((direntp = readdir(dir))) {
1896                 if (!strcmp(direntp->d_name, ".") ||
1897                     !strcmp(direntp->d_name, ".."))
1898                         continue;
1899
1900                 ret = fstatat(dfd_dup, direntp->d_name, &st, AT_SYMLINK_NOFOLLOW);
1901                 if (ret < 0 && errno != ENOENT)
1902                         break;
1903
1904                 ret = 0;
1905                 if (S_ISDIR(st.st_mode))
1906                         ret = print_r(dfd_dup, direntp->d_name);
1907                 else
1908                         INFO("mode(%o):uid(%d):gid(%d) -> %d/%s\n",
1909                              (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, dfd_dup,
1910                              direntp->d_name);
1911                 if (ret < 0 && errno != ENOENT)
1912                         break;
1913         }
1914
1915         if (is_empty_string(path))
1916                 ret = fstatat(fd, "", &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH);
1917         else
1918                 ret = fstatat(fd, path, &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW);
1919         if (ret)
1920                 return -1;
1921         else
1922                 INFO("mode(%o):uid(%d):gid(%d) -> %s",
1923                      (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, maybe_empty(path));
1924         return ret;
1925 }