src/lxc/utils.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #ifndef _GNU_SOURCE
   4 #define _GNU_SOURCE 1
   5 #endif
   6 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
   7 #include <ctype.h>
   8 #include <dirent.h>
   9 #include <errno.h>
  10 #include <fcntl.h>
  11 #include <grp.h>
  12 #include <inttypes.h>
  13 #include <libgen.h>
  14 #include <pthread.h>
  15 #include <signal.h>
  16 #include <stddef.h>
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <sys/mman.h>
  21 #include <sys/mount.h>
  22 #include <sys/param.h>
  23 #include <sys/prctl.h>
  24 #include <sys/stat.h>
  25 #include <sys/types.h>
  26 #include <sys/wait.h>
  27 #include <unistd.h>
  28
  29 #include "config.h"
  30 #include "log.h"
  31 #include "lsm/lsm.h"
  32 #include "lxclock.h"
  33 #include "memory_utils.h"
  34 #include "namespace.h"
  35 #include "open_utils.h"
  36 #include "parse.h"
  37 #include "process_utils.h"
  38 #include "syscall_wrappers.h"
  39 #include "utils.h"
  40
  41 #if !HAVE_STRLCPY
  42 #include "strlcpy.h"
  43 #endif
  44
  45 #if !HAVE_STRLCAT
  46 #include "strlcat.h"
  47 #endif
  48
  49 #ifndef O_PATH
  50 #define O_PATH      010000000
  51 #endif
  52
  53 #ifndef O_NOFOLLOW
  54 #define O_NOFOLLOW  00400000
  55 #endif
  56
  57 lxc_log_define(utils, lxc);
  58
  59 /*
  60  * if path is btrfs, tries to remove it and any subvolumes beneath it
  61  */
  62 extern bool btrfs_try_remove_subvol(const char *path);
  63
  64 static int _recursive_rmdir(const char *dirname, dev_t pdev,
  65                             const char *exclude, int level, bool onedev)
  66 {
  67         __do_closedir DIR *dir = NULL;
  68         int failed = 0;
  69         bool hadexclude = false;
  70         int ret;
  71         struct dirent *direntp;
  72         char pathname[PATH_MAX];
  73
  74         dir = opendir(dirname);
  75         if (!dir)
  76                 return log_error(-1, "Failed to open \"%s\"", dirname);
  77
  78         while ((direntp = readdir(dir))) {
  79                 int rc;
  80                 struct stat mystat;
  81
  82                 if (strequal(direntp->d_name, ".") ||
  83                     strequal(direntp->d_name, ".."))
  84                         continue;
  85
  86                 rc = strnprintf(pathname, sizeof(pathname), "%s/%s", dirname, direntp->d_name);
  87                 if (rc < 0) {
  88                         ERROR("The name of path is too long");
  89                         failed = 1;
  90                         continue;
  91                 }
  92
  93                 if (!level && exclude && strequal(direntp->d_name, exclude)) {
  94                         ret = rmdir(pathname);
  95                         if (ret < 0) {
  96                                 switch (errno) {
  97                                 case ENOTEMPTY:
  98                                         INFO("Not deleting snapshot \"%s\"", pathname);
  99                                         hadexclude = true;
 100                                         break;
 101                                 case ENOTDIR:
 102                                         ret = unlink(pathname);
 103                                         if (ret)
 104                                                 INFO("Failed to remove \"%s\"", pathname);
 105                                         break;
 106                                 default:
 107                                         SYSERROR("Failed to rmdir \"%s\"", pathname);
 108                                         failed = 1;
 109                                         break;
 110                                 }
 111                         }
 112
 113                         continue;
 114                 }
 115
 116                 ret = lstat(pathname, &mystat);
 117                 if (ret) {
 118                         SYSERROR("Failed to stat \"%s\"", pathname);
 119                         failed = 1;
 120                         continue;
 121                 }
 122
 123                 if (onedev && mystat.st_dev != pdev) {
 124                         if (btrfs_try_remove_subvol(pathname))
 125                                 INFO("Removed btrfs subvolume at \"%s\"", pathname);
 126                         continue;
 127                 }
 128
 129                 if (S_ISDIR(mystat.st_mode)) {
 130                         if (_recursive_rmdir(pathname, pdev, exclude, level + 1, onedev) < 0)
 131                                 failed = 1;
 132                 } else {
 133                         ret = unlink(pathname);
 134                         if (ret < 0) {
 135                                 __do_close int fd = -EBADF;
 136
 137                                 fd = open(pathname, O_RDONLY | O_CLOEXEC | O_NONBLOCK);
 138                                 if (fd >= 0) {
 139                                         /* The file might be marked immutable. */
 140                                         int attr = 0;
 141                                         ret = ioctl(fd, FS_IOC_GETFLAGS, &attr);
 142                                         if (ret < 0)
 143                                                 SYSERROR("Failed to retrieve file flags");
 144                                         attr &= ~FS_IMMUTABLE_FL;
 145                                         ret = ioctl(fd, FS_IOC_SETFLAGS, &attr);
 146                                         if (ret < 0)
 147                                                 SYSERROR("Failed to set file flags");
 148                                 }
 149
 150                                 ret = unlink(pathname);
 151                                 if (ret < 0) {
 152                                         SYSERROR("Failed to delete \"%s\"", pathname);
 153                                         failed = 1;
 154                                 }
 155                         }
 156                 }
 157         }
 158
 159         if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
 160                 SYSERROR("Failed to delete \"%s\"", dirname);
 161                 failed = 1;
 162         }
 163
 164         return failed ? -1 : 0;
 165 }
 166
 167 /*
 168  * In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
 169  * lxc_rmdir_onedev().
 170  */
 171 static inline bool is_native_overlayfs(const char *path)
 172 {
 173         return has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
 174                has_fs_type(path, OVERLAYFS_SUPER_MAGIC);
 175 }
 176
 177 /* returns 0 on success, -1 if there were any failures */
 178 extern int lxc_rmdir_onedev(const char *path, const char *exclude)
 179 {
 180         struct stat mystat;
 181         bool onedev = true;
 182
 183         if (is_native_overlayfs(path))
 184                 onedev = false;
 185
 186         if (lstat(path, &mystat) < 0) {
 187                 if (errno == ENOENT)
 188                         return 0;
 189
 190                 return log_error_errno(-1, errno, "Failed to stat \"%s\"", path);
 191         }
 192
 193         return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev);
 194 }
 195
 196 /* borrowed from iproute2 */
 197 extern int get_u16(unsigned short *val, const char *arg, int base)
 198 {
 199         unsigned long res;
 200         char *ptr;
 201
 202         if (!arg || !*arg)
 203                 return ret_errno(EINVAL);
 204
 205         errno = 0;
 206         res = strtoul(arg, &ptr, base);
 207         if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0)
 208                 return ret_errno(ERANGE);
 209
 210         *val = res;
 211
 212         return 0;
 213 }
 214
 215 int mkdir_p(const char *dir, mode_t mode)
 216 {
 217         const char *tmp = dir;
 218         const char *orig = dir;
 219
 220         if (access(dir, F_OK) != -1)
 221                 return 0;
 222
 223         do {
 224                 __do_free char *makeme = NULL;
 225                 int ret;
 226
 227                 dir = tmp + strspn(tmp, "/");
 228                 tmp = dir + strcspn(dir, "/");
 229
 230                 makeme = strndup(orig, dir - orig);
 231                 if (!makeme)
 232                         return ret_set_errno(-1, ENOMEM);
 233
 234                 ret = mkdir(makeme, mode);
 235                 if (ret < 0 && errno != EEXIST)
 236                         return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
 237
 238         } while (tmp != dir);
 239
 240         return 0;
 241 }
 242
 243 char *get_rundir(void)
 244 {
 245         __do_free char *rundir = NULL;
 246         char *static_rundir;
 247         int ret;
 248         size_t len;
 249         const char *homedir;
 250         struct stat sb;
 251
 252         if (stat(RUNTIME_PATH, &sb) < 0)
 253                 return NULL;
 254
 255         if (geteuid() == sb.st_uid || getegid() == sb.st_gid)
 256                 return strdup(RUNTIME_PATH);
 257
 258         static_rundir = getenv("XDG_RUNTIME_DIR");
 259         if (static_rundir)
 260                 return strdup(static_rundir);
 261
 262         INFO("XDG_RUNTIME_DIR isn't set in the environment");
 263         homedir = getenv("HOME");
 264         if (!homedir)
 265                 return log_error(NULL, "HOME isn't set in the environment");
 266
 267         len = strlen(homedir) + 17;
 268         rundir = malloc(sizeof(char) * len);
 269         if (!rundir)
 270                 return NULL;
 271
 272         ret = strnprintf(rundir, len, "%s/.cache/lxc/run/", homedir);
 273         if (ret < 0)
 274                 return ret_set_errno(NULL, EIO);
 275
 276         return move_ptr(rundir);
 277 }
 278
 279 int wait_for_pid(pid_t pid)
 280 {
 281         int status, ret;
 282
 283 again:
 284         ret = waitpid(pid, &status, 0);
 285         if (ret == -1) {
 286                 if (errno == EINTR)
 287                         goto again;
 288
 289                 return -1;
 290         }
 291
 292         if (ret != pid)
 293                 goto again;
 294
 295         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
 296                 return -1;
 297
 298         return 0;
 299 }
 300
 301 int wait_for_pidfd(int pidfd)
 302 {
 303         int ret;
 304         siginfo_t info = {
 305                 .si_signo = 0,
 306         };
 307
 308         do {
 309                 ret = waitid(P_PIDFD, pidfd, &info, __WALL | WEXITED);
 310         } while (ret < 0 && errno == EINTR);
 311
 312         return !ret && WIFEXITED(info.si_status) && WEXITSTATUS(info.si_status) == 0;
 313 }
 314
 315 int lxc_wait_for_pid_status(pid_t pid)
 316 {
 317         int status, ret;
 318
 319 again:
 320         ret = waitpid(pid, &status, 0);
 321         if (ret == -1) {
 322                 if (errno == EINTR)
 323                         goto again;
 324
 325                 return -1;
 326         }
 327
 328         if (ret != pid)
 329                 goto again;
 330
 331         return status;
 332 }
 333
 334 bool wait_exited(pid_t pid)
 335 {
 336         int status;
 337
 338         status = lxc_wait_for_pid_status(pid);
 339         if (status < 0)
 340                 return log_error(false, "Failed to reap on child process %d", pid);
 341         if (WIFSIGNALED(status))
 342                 return log_error(false, "Child process %d terminated by signal %d", pid, WTERMSIG(status));
 343         if (!WIFEXITED(status))
 344                 return log_error(false, "Child did not termiate correctly");
 345         if (WEXITSTATUS(status))
 346                 return log_error(false, "Child terminated with error %d", WEXITSTATUS(status));
 347
 348         TRACE("Reaped child process %d", pid);
 349         return true;
 350 }
 351
 352 #if HAVE_OPENSSL
 353 #include <openssl/evp.h>
 354
 355 static int do_sha1_hash(const char *buf, int buflen, unsigned char *md_value,
 356                         unsigned int *md_len)
 357 {
 358         EVP_MD_CTX *mdctx;
 359         const EVP_MD *md;
 360
 361         md = EVP_get_digestbyname("sha1");
 362         if (!md)
 363                 return log_error(-1, "Unknown message digest: sha1\n");
 364
 365         mdctx = EVP_MD_CTX_create();
 366         EVP_DigestInit_ex(mdctx, md, NULL);
 367         EVP_DigestUpdate(mdctx, buf, buflen);
 368         EVP_DigestFinal_ex(mdctx, md_value, md_len);
 369         EVP_MD_CTX_destroy(mdctx);
 370
 371         return 0;
 372 }
 373
 374 int sha1sum_file(char *fnam, unsigned char *digest, unsigned int *md_len)
 375 {
 376         __do_free char *buf = NULL;
 377         __do_fclose FILE *f = NULL;
 378         int ret;
 379         ssize_t flen;
 380         ssize_t nbytes;
 381
 382         if (!fnam)
 383                 return -1;
 384
 385         f = fopen_cloexec(fnam, "r");
 386         if (!f)
 387                 return log_error_errno(-1, errno, "Failed to open template \"%s\"", fnam);
 388
 389         if (fseek(f, 0, SEEK_END) < 0)
 390                 return log_error_errno(-1, errno, "Failed to seek to end of template");
 391
 392         flen = ftell(f);
 393         if (flen < 0)
 394                 return log_error_errno(-1, errno, "Failed to tell size of template");
 395
 396         if (fseek(f, 0, SEEK_SET) < 0)
 397                 return log_error_errno(-1, errno, "Failed to seek to start of template");
 398
 399         buf = malloc(flen + 1);
 400         if (!buf)
 401                 return log_error_errno(-1, ENOMEM, "Out of memory");
 402
 403         nbytes = fread(buf, 1, flen, f);
 404         if (nbytes < 0 || nbytes != flen)
 405                 return log_error_errno(-1, errno, "Failed to read template");
 406
 407         buf[flen] = '\0';
 408         ret = do_sha1_hash(buf, flen, (void *)digest, md_len);
 409         return ret;
 410 }
 411 #endif
 412
 413 struct lxc_popen_FILE *lxc_popen(const char *command)
 414 {
 415         int ret;
 416         int pipe_fds[2];
 417         pid_t child_pid;
 418         struct lxc_popen_FILE *fp = NULL;
 419
 420         ret = pipe2(pipe_fds, O_CLOEXEC);
 421         if (ret < 0)
 422                 return NULL;
 423
 424         child_pid = fork();
 425         if (child_pid < 0)
 426                 goto on_error;
 427
 428         if (!child_pid) {
 429                 sigset_t mask;
 430
 431                 close(pipe_fds[0]);
 432
 433                 /* duplicate stdout */
 434                 if (pipe_fds[1] != STDOUT_FILENO)
 435                         ret = dup2(pipe_fds[1], STDOUT_FILENO);
 436                 else
 437                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 438                 if (ret < 0) {
 439                         close(pipe_fds[1]);
 440                         _exit(EXIT_FAILURE);
 441                 }
 442
 443                 /* duplicate stderr */
 444                 if (pipe_fds[1] != STDERR_FILENO)
 445                         ret = dup2(pipe_fds[1], STDERR_FILENO);
 446                 else
 447                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 448                 close(pipe_fds[1]);
 449                 if (ret < 0)
 450                         _exit(EXIT_FAILURE);
 451
 452                 /* unblock all signals */
 453                 ret = sigfillset(&mask);
 454                 if (ret < 0)
 455                         _exit(EXIT_FAILURE);
 456
 457                 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
 458                 if (ret < 0)
 459                         _exit(EXIT_FAILURE);
 460
 461                 /* check if /bin/sh exist, otherwise try Android location /system/bin/sh */
 462                 if (file_exists("/bin/sh"))
 463                         execl("/bin/sh", "sh", "-c", command, (char *)NULL);
 464                 else
 465                         execl("/system/bin/sh", "sh", "-c", command, (char *)NULL);
 466
 467                 _exit(127);
 468         }
 469
 470         close(pipe_fds[1]);
 471         pipe_fds[1] = -1;
 472
 473         fp = malloc(sizeof(*fp));
 474         if (!fp)
 475                 goto on_error;
 476
 477         memset(fp, 0, sizeof(*fp));
 478
 479         fp->child_pid = child_pid;
 480         fp->pipe = pipe_fds[0];
 481
 482         /* From now on, closing fp->f will also close fp->pipe. So only ever
 483          * call fclose(fp->f).
 484          */
 485         fp->f = fdopen(pipe_fds[0], "r");
 486         if (!fp->f)
 487                 goto on_error;
 488
 489         return fp;
 490
 491 on_error:
 492         /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
 493          * called yet. Otherwise the fd belongs to the file opened by fdopen()
 494          * since it isn't dup()ed.
 495          */
 496         if (fp && !fp->f && pipe_fds[0] >= 0)
 497                 close(pipe_fds[0]);
 498
 499         if (pipe_fds[1] >= 0)
 500                 close(pipe_fds[1]);
 501
 502         if (fp && fp->f)
 503                 fclose(fp->f);
 504
 505         if (fp)
 506                 free(fp);
 507
 508         return NULL;
 509 }
 510
 511 int lxc_pclose(struct lxc_popen_FILE *fp)
 512 {
 513         pid_t wait_pid;
 514         int wstatus = 0;
 515
 516         if (!fp)
 517                 return -1;
 518
 519         do {
 520                 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
 521         } while (wait_pid < 0 && errno == EINTR);
 522
 523         fclose(fp->f);
 524         free(fp);
 525
 526         if (wait_pid < 0)
 527                 return -1;
 528
 529         return wstatus;
 530 }
 531
 532 int randseed(bool srand_it)
 533 {
 534         __do_fclose FILE *f = NULL;
 535         /*
 536          * srand pre-seed function based on /dev/urandom
 537          */
 538         unsigned int seed = time(NULL) + getpid();
 539
 540         f = fopen("/dev/urandom", "re");
 541         if (f) {
 542                 int ret = fread(&seed, sizeof(seed), 1, f);
 543                 if (ret != 1)
 544                         SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
 545         }
 546
 547         if (srand_it)
 548                 srand(seed);
 549
 550         return seed;
 551 }
 552
 553 uid_t get_ns_uid(uid_t orig)
 554 {
 555         __do_free char *line = NULL;
 556         __do_fclose FILE *f = NULL;
 557         size_t sz = 0;
 558         uid_t nsid, hostid, range;
 559
 560         f = fopen("/proc/self/uid_map", "re");
 561         if (!f)
 562                 return log_error_errno(0, errno, "Failed to open uid_map");
 563
 564         while (getline(&line, &sz, f) != -1) {
 565                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 566                         continue;
 567
 568                 if (hostid <= orig && hostid + range > orig)
 569                         return nsid += orig - hostid;
 570         }
 571
 572         return LXC_INVALID_UID;
 573 }
 574
 575 gid_t get_ns_gid(gid_t orig)
 576 {
 577         __do_free char *line = NULL;
 578         __do_fclose FILE *f = NULL;
 579         size_t sz = 0;
 580         gid_t nsid, hostid, range;
 581
 582         f = fopen("/proc/self/gid_map", "re");
 583         if (!f)
 584                 return log_error_errno(0, errno, "Failed to open gid_map");
 585
 586         while (getline(&line, &sz, f) != -1) {
 587                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 588                         continue;
 589
 590                 if (hostid <= orig && hostid + range > orig)
 591                         return nsid += orig - hostid;
 592         }
 593
 594         return LXC_INVALID_GID;
 595 }
 596
 597 bool dir_exists(const char *path)
 598 {
 599         return exists_dir_at(-1, path);
 600 }
 601
 602 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 603  * FNV has good anti collision properties and we're not worried
 604  * about pre-image resistance or one-way-ness, we're just trying to make
 605  * the name unique in the 108 bytes of space we have.
 606  */
 607 uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
 608 {
 609         unsigned char *bp;
 610
 611         for(bp = buf; bp < (unsigned char *)buf + len; bp++) {
 612                 /* xor the bottom with the current octet */
 613                 hval ^= (uint64_t)*bp;
 614
 615                 /* gcc optimised:
 616                  * multiply by the 64 bit FNV magic prime mod 2^64
 617                  */
 618                 hval += (hval << 1) + (hval << 4) + (hval << 5) +
 619                         (hval << 7) + (hval << 8) + (hval << 40);
 620         }
 621
 622         return hval;
 623 }
 624
 625 bool is_shared_mountpoint(const char *path)
 626 {
 627         __do_fclose FILE *f = NULL;
 628         __do_free char *line = NULL;
 629         int i;
 630         size_t len = 0;
 631
 632         f = fopen("/proc/self/mountinfo", "re");
 633         if (!f)
 634                 return 0;
 635
 636         while (getline(&line, &len, f) > 0) {
 637                 char *slider1, *slider2;
 638
 639                 for (slider1 = line, i = 0; slider1 && i < 4; i++)
 640                         slider1 = strchr(slider1 + 1, ' ');
 641
 642                 if (!slider1)
 643                         continue;
 644
 645                 slider2 = strchr(slider1 + 1, ' ');
 646                 if (!slider2)
 647                         continue;
 648
 649                 *slider2 = '\0';
 650                 if (strequal(slider1 + 1, path)) {
 651                         /* This is the path. Is it shared? */
 652                         slider1 = strchr(slider2 + 1, ' ');
 653                         if (slider1 && strstr(slider1, "shared:"))
 654                                 return true;
 655                 }
 656         }
 657
 658         return false;
 659 }
 660
 661 /*
 662  * Detect whether / is mounted MS_SHARED.  The only way I know of to
 663  * check that is through /proc/self/mountinfo.
 664  * I'm only checking for /.  If the container rootfs or mount location
 665  * is MS_SHARED, but not '/', then you're out of luck - figuring that
 666  * out would be too much work to be worth it.
 667  */
 668 int detect_shared_rootfs(void)
 669 {
 670         if (is_shared_mountpoint("/"))
 671                 return 1;
 672
 673         return 0;
 674 }
 675
 676 bool switch_to_ns(pid_t pid, const char *ns)
 677 {
 678         __do_close int fd = -EBADF;
 679         int ret;
 680         char nspath[STRLITERALLEN("/proc//ns/")
 681                     + INTTYPE_TO_STRLEN(pid_t)
 682                     + LXC_NAMESPACE_NAME_MAX];
 683
 684         /* Switch to new ns */
 685         ret = strnprintf(nspath, sizeof(nspath), "/proc/%d/ns/%s", pid, ns);
 686         if (ret < 0)
 687                 return false;
 688
 689         fd = open(nspath, O_RDONLY | O_CLOEXEC);
 690         if (fd < 0)
 691                 return log_error_errno(false, errno, "Failed to open \"%s\"", nspath);
 692
 693         ret = setns(fd, 0);
 694         if (ret)
 695                 return log_error_errno(false, errno, "Failed to set process %d to \"%s\" of %d", pid, ns, fd);
 696
 697         return true;
 698 }
 699
 700 /*
 701  * looking at fs/proc_namespace.c, it appears we can
 702  * actually expect the rootfs entry to very specifically contain
 703  * " - rootfs rootfs "
 704  * IIUC, so long as we've chrooted so that rootfs is not our root,
 705  * the rootfs entry should always be skipped in mountinfo contents.
 706  */
 707 bool detect_ramfs_rootfs(void)
 708 {
 709         __do_free char *line = NULL;
 710         __do_free void *fopen_cache = NULL;
 711         __do_fclose FILE *f = NULL;
 712         size_t len = 0;
 713
 714         f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
 715         if (!f)
 716                 return false;
 717
 718         while (getline(&line, &len, f) != -1) {
 719                 int i;
 720                 char *p, *p2;
 721
 722                 for (p = line, i = 0; p && i < 4; i++)
 723                         p = strchr(p + 1, ' ');
 724                 if (!p)
 725                         continue;
 726
 727                 p2 = strchr(p + 1, ' ');
 728                 if (!p2)
 729                         continue;
 730                 *p2 = '\0';
 731                 if (strequal(p + 1, "/")) {
 732                         /* This is '/'. Is it the ramfs? */
 733                         p = strchr(p2 + 1, '-');
 734                         if (p && strnequal(p, "- rootfs ", 9))
 735                                 return true;
 736                 }
 737         }
 738
 739         return false;
 740 }
 741
 742 char *on_path(const char *cmd, const char *rootfs)
 743 {
 744         __do_free char *path = NULL;
 745         char *entry = NULL;
 746         char cmdpath[PATH_MAX];
 747         int ret;
 748
 749         path = getenv("PATH");
 750         if (!path)
 751                 return NULL;
 752
 753         path = strdup(path);
 754         if (!path)
 755                 return NULL;
 756
 757         lxc_iterate_parts(entry, path, ":") {
 758                 if (rootfs)
 759                         ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s/%s", rootfs, entry, cmd);
 760                 else
 761                         ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s", entry, cmd);
 762                 if (ret < 0)
 763                         continue;
 764
 765                 if (access(cmdpath, X_OK) == 0)
 766                         return strdup(cmdpath);
 767         }
 768
 769         return NULL;
 770 }
 771
 772 /* historically lxc-init has been under /usr/lib/lxc and under
 773  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
 774  */
 775 char *choose_init(const char *rootfs)
 776 {
 777         char *retv = NULL;
 778         const char *empty = "",
 779                    *tmp;
 780         int ret, env_set = 0;
 781
 782         if (!getenv("PATH")) {
 783                 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
 784                         SYSERROR("Failed to setenv");
 785
 786                 env_set = 1;
 787         }
 788
 789         retv = on_path("init.lxc", rootfs);
 790
 791         if (env_set)
 792                 if (unsetenv("PATH"))
 793                         SYSERROR("Failed to unsetenv");
 794
 795         if (retv)
 796                 return retv;
 797
 798         retv = malloc(PATH_MAX);
 799         if (!retv)
 800                 return NULL;
 801
 802         if (rootfs)
 803                 tmp = rootfs;
 804         else
 805                 tmp = empty;
 806
 807         ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
 808         if (ret < 0) {
 809                 ERROR("The name of path is too long");
 810                 goto out1;
 811         }
 812
 813         if (access(retv, X_OK) == 0)
 814                 return retv;
 815
 816         ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
 817         if (ret < 0) {
 818                 ERROR("The name of path is too long");
 819                 goto out1;
 820         }
 821
 822         if (access(retv, X_OK) == 0)
 823                 return retv;
 824
 825         ret = strnprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
 826         if (ret < 0) {
 827                 ERROR("The name of path is too long");
 828                 goto out1;
 829         }
 830
 831         if (access(retv, X_OK) == 0)
 832                 return retv;
 833
 834         ret = strnprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
 835         if (ret < 0) {
 836                 ERROR("The name of path is too long");
 837                 goto out1;
 838         }
 839
 840         if (access(retv, X_OK) == 0)
 841                 return retv;
 842
 843         /*
 844          * Last resort, look for the statically compiled init.lxc which we
 845          * hopefully bind-mounted in.
 846          * If we are called during container setup, and we get to this point,
 847          * then the init.lxc.static from the host will need to be bind-mounted
 848          * in.  So we return NULL here to indicate that.
 849          */
 850         if (rootfs)
 851                 goto out1;
 852
 853         ret = strnprintf(retv, PATH_MAX, "/init.lxc.static");
 854         if (ret < 0) {
 855                 WARN("Nonsense - name /lxc.init.static too long");
 856                 goto out1;
 857         }
 858
 859         if (access(retv, X_OK) == 0)
 860                 return retv;
 861
 862 out1:
 863         free(retv);
 864         return NULL;
 865 }
 866
 867 /*
 868  * Given the '-t' template option to lxc-create, figure out what to
 869  * do.  If the template is a full executable path, use that.  If it
 870  * is something like 'sshd', then return $templatepath/lxc-sshd.
 871  * On success return the template, on error return NULL.
 872  */
 873 char *get_template_path(const char *t)
 874 {
 875         int ret, len;
 876         char *tpath;
 877
 878         if (t[0] == '/') {
 879                 if (access(t, X_OK) == 0) {
 880                         return strdup(t);
 881                 } else {
 882                         SYSERROR("Bad template pathname: %s", t);
 883                         return NULL;
 884                 }
 885         }
 886
 887         len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
 888
 889         tpath = malloc(len);
 890         if (!tpath)
 891                 return NULL;
 892
 893         ret = strnprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
 894         if (ret < 0) {
 895                 free(tpath);
 896                 return NULL;
 897         }
 898
 899         if (access(tpath, X_OK) < 0) {
 900                 SYSERROR("bad template: %s", t);
 901                 free(tpath);
 902                 return NULL;
 903         }
 904
 905         return tpath;
 906 }
 907
 908 /*
 909  * @path:    a pathname where / replaced with '\0'.
 910  * @offsetp: pointer to int showing which path segment was last seen.
 911  *           Updated on return to reflect the next segment.
 912  * @fulllen: full original path length.
 913  * Returns a pointer to the next path segment, or NULL if done.
 914  */
 915 static char *get_nextpath(char *path, int *offsetp, int fulllen)
 916 {
 917         int offset = *offsetp;
 918
 919         if (offset >= fulllen)
 920                 return NULL;
 921
 922         while (offset < fulllen && path[offset] != '\0')
 923                 offset++;
 924
 925         while (offset < fulllen && path[offset] == '\0')
 926                 offset++;
 927
 928         *offsetp = offset;
 929
 930         return (offset < fulllen) ? &path[offset] : NULL;
 931 }
 932
 933 /*
 934  * Check that @subdir is a subdir of @dir.  @len is the length of
 935  * @dir (to avoid having to recalculate it).
 936  */
 937 static bool is_subdir(const char *subdir, const char *dir, size_t len)
 938 {
 939         size_t subdirlen = strlen(subdir);
 940
 941         if (subdirlen < len)
 942                 return false;
 943
 944         if (!strnequal(subdir, dir, len))
 945                 return false;
 946
 947         if (dir[len-1] == '/')
 948                 return true;
 949
 950         if (subdir[len] == '/' || subdirlen == len)
 951                 return true;
 952
 953         return false;
 954 }
 955
 956 /*
 957  * Check if the open fd is a symlink.  Return -ELOOP if it is.  Return
 958  * -ENOENT if we couldn't fstat.  Return 0 if the fd is ok.
 959  */
 960 static int check_symlink(int fd)
 961 {
 962         struct stat sb;
 963         int ret;
 964
 965         ret = fstat(fd, &sb);
 966         if (ret < 0)
 967                 return -ENOENT;
 968
 969         if (S_ISLNK(sb.st_mode))
 970                 return -ELOOP;
 971
 972         return 0;
 973 }
 974
 975 /*
 976  * Open a file or directory, provided that it contains no symlinks.
 977  *
 978  * CAVEAT: This function must not be used for other purposes than container
 979  * setup before executing the container's init
 980  */
 981 static int open_if_safe(int dirfd, const char *nextpath)
 982 {
 983         int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
 984         if (newfd >= 0) /* Was not a symlink, all good. */
 985                 return newfd;
 986
 987         if (errno == ELOOP)
 988                 return newfd;
 989
 990         if (errno == EPERM || errno == EACCES) {
 991                 /* We're not root (cause we got EPERM) so try opening with
 992                  * O_PATH.
 993                  */
 994                 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
 995                 if (newfd >= 0) {
 996                         /* O_PATH will return an fd for symlinks. We know
 997                          * nextpath wasn't a symlink at last openat, so if fd is
 998                          * now a link, then something * fishy is going on.
 999                          */
1000                         int ret = check_symlink(newfd);
1001                         if (ret < 0) {
1002                                 close(newfd);
1003                                 newfd = ret;
1004                         }
1005                 }
1006         }
1007
1008         return newfd;
1009 }
1010
/*
 * Open @target one path component at a time so that no component after
 * @prefix_skip is reached through a symbolic link.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target:      path to be opened.
 * @prefix_skip: leading part of @target that is opened directly, i.e. in
 *               which symbolic links are not checked. This would be the
 *               container's rootfs. When NULL or empty, "/" is used.
 *
 * Returns an open fd for @target, or a negative value on error: -EINVAL when
 * @target does not start with @prefix_skip, -ENOMEM when the working copy
 * cannot be allocated, otherwise the negative value produced by open() or
 * open_if_safe().
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
        int offset = 0, fd, target_len;
        char *segments, *component;

        target_len = strlen(target);

        /* make sure prefix-skip makes sense */
        if (prefix_skip && strlen(prefix_skip) > 0) {
                offset = strlen(prefix_skip);
                if (!is_subdir(target, prefix_skip, offset)) {
                        ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
                              target, prefix_skip);
                        return -EINVAL;
                }

                /*
                 * get_nextpath() wants its offset to sit on, or before, a
                 * separator (already turned into '\0'), so step back by one.
                 */
                if (offset)
                        offset--;
        } else {
                prefix_skip = "/";
                offset = 0;
        }

        /* Work on a private copy of target that can be cut into pieces. */
        segments = strdup(target);
        if (!segments) {
                ERROR("Out of memory checking for symbolic link");
                return -ENOMEM;
        }

        for (char *p = segments; p < segments + target_len; p++) {
                if (*p == '/')
                        *p = '\0';
        }

        fd = open(prefix_skip, O_RDONLY);
        if (fd < 0) {
                SYSERROR("Failed to open path \"%s\"", prefix_skip);
                free(segments);
                return fd;
        }

        /* Descend component by component, always relative to the last fd. */
        while ((component = get_nextpath(segments, &offset, target_len))) {
                int saved_errno;
                int next_fd = open_if_safe(fd, component);

                /* close() may clobber errno, so remember it first. */
                saved_errno = errno;
                close(fd);

                fd = next_fd;
                if (fd < 0) {
                        errno = saved_errno;
                        if (errno == ELOOP)
                                SYSERROR("%s in %s was a symbolic link!", component, target);

                        break;
                }
        }

        free(segments);
        return fd;
}
1094
1095 int __safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
1096                             unsigned int flags, const void *data)
1097 {
1098         __do_close int source_fd = -EBADF, target_fd = -EBADF;
1099         struct open_how how = {
1100                 .flags          = PROTECT_OPATH_DIRECTORY,
1101                 .resolve        = PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS,
1102         };
1103         int ret;
1104         char src_buf[LXC_PROC_PID_FD_LEN], tgt_buf[LXC_PROC_PID_FD_LEN];
1105
1106         if (beneath_fd < 0)
1107                 return -EINVAL;
1108
1109         if ((flags & MS_BIND) && src && src[0] != '/') {
1110                 source_fd = openat2(beneath_fd, src, &how, sizeof(how));
1111                 if (source_fd < 0)
1112                         return -errno;
1113                 ret = strnprintf(src_buf, sizeof(src_buf), "/proc/self/fd/%d", source_fd);
1114                 if (ret < 0)
1115                         return -EIO;
1116         } else {
1117                 src_buf[0] = '\0';
1118         }
1119
1120         target_fd = openat2(beneath_fd, dst, &how, sizeof(how));
1121         if (target_fd < 0)
1122                 return log_error_errno(-errno, errno, "Failed to open %d(%s)", beneath_fd, dst);
1123         ret = strnprintf(tgt_buf, sizeof(tgt_buf), "/proc/self/fd/%d", target_fd);
1124         if (ret < 0)
1125                 return -EIO;
1126
1127         if (!is_empty_string(src_buf))
1128                 ret = mount(src_buf, tgt_buf, fstype, flags, data);
1129         else
1130                 ret = mount(src, tgt_buf, fstype, flags, data);
1131
1132         return ret;
1133 }
1134
1135 int safe_mount_beneath(const char *beneath, const char *src, const char *dst, const char *fstype,
1136                        unsigned int flags, const void *data)
1137 {
1138         __do_close int beneath_fd = -EBADF;
1139         const char *path = beneath ? beneath : "/";
1140
1141         beneath_fd = openat(-1, path, PROTECT_OPATH_DIRECTORY);
1142         if (beneath_fd < 0)
1143                 return log_error_errno(-errno, errno, "Failed to open %s", path);
1144
1145         return __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
1146 }
1147
/*
 * Public entry point that forwards all arguments unchanged to
 * __safe_mount_beneath_at() and returns its result.
 */
int safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
                          unsigned int flags, const void *data)
{
        int ret;

        ret = __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);

        return ret;
}
1153
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs.  (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * The target (and a relative bind-mount source) is opened with
 * open_without_symlink() and the mount is issued on "/proc/self/fd/<fd>"
 * paths for the fds obtained that way.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * Returns 0 on success. On failure returns the negative value from
 * open_without_symlink(), -EINVAL when a proc path cannot be formatted, or
 * the return value of mount() with errno restored to mount()'s error.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
                unsigned long flags, const void *data, const char *rootfs)
{
        /* Only needs enough for /proc/self/fd/<fd>. */
        char srcbuf[50], destbuf[50];
        const char *mntsrc = src;
        int srcfd = -1;
        int destfd, ret, saved_errno;

        if (!rootfs)
                rootfs = "";

        /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
        if ((flags & MS_BIND) && src && src[0] != '/') {
                INFO("This is a relative bind mount");

                srcfd = open_without_symlink(src, NULL);
                if (srcfd < 0)
                        return srcfd;

                ret = strnprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
                if (ret < 0) {
                        close(srcfd);
                        ERROR("Out of memory");
                        return -EINVAL;
                }

                mntsrc = srcbuf;
        }

        destfd = open_without_symlink(dest, rootfs);
        if (destfd < 0) {
                if (srcfd >= 0) {
                        /* Keep the errno of the failed open for the caller. */
                        saved_errno = errno;
                        close(srcfd);
                        errno = saved_errno;
                }

                return destfd;
        }

        ret = strnprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
        if (ret < 0) {
                if (srcfd >= 0)
                        close(srcfd);

                close(destfd);
                ERROR("Out of memory");
                return -EINVAL;
        }

        ret = mount(mntsrc, destbuf, fstype, flags, data);

        /* Closing the fds must not hide the errno reported by mount(). */
        saved_errno = errno;
        if (srcfd >= 0)
                close(srcfd);

        close(destfd);

        if (ret >= 0)
                return 0;

        errno = saved_errno;
        SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
        return ret;
}
1226
/*
 * Open "/dev/null" for reading and writing.
 *
 * Returns the new file descriptor, or the negative result of open() after
 * logging the failure.
 */
int open_devnull(void)
{
        int fd;

        fd = open("/dev/null", O_RDWR);
        if (fd >= 0)
                return fd;

        SYSERROR("Can't open /dev/null");
        return fd;
}
1235
/*
 * Make stdin, stdout and stderr (in that order) duplicates of @fd.
 *
 * Returns 0 on success and -1 when @fd is negative or any dup2() fails;
 * descriptors already replaced before the failure stay replaced.
 */
int set_stdfds(int fd)
{
        static const int std_targets[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };

        if (fd < 0)
                return -1;

        for (size_t i = 0; i < sizeof(std_targets) / sizeof(std_targets[0]); i++) {
                if (dup2(fd, std_targets[i]) < 0)
                        return -1;
        }

        return 0;
}
1257
/*
 * Redirect stdin, stdout and stderr to "/dev/null".
 *
 * Returns -1 when "/dev/null" cannot be opened, otherwise the result of
 * set_stdfds(). The temporary descriptor is closed in either case.
 */
int null_stdfds(void)
{
        int fd, ret;

        fd = open_devnull();
        if (fd < 0)
                return -1;

        ret = set_stdfds(fd);
        close(fd);

        return ret;
}
1271
1272 /* Check whether a signal is blocked by a process. */
1273 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1274 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1275 bool task_blocks_signal(pid_t pid, int signal)
1276 {
1277         __do_free char *line = NULL;
1278         __do_fclose FILE *f = NULL;
1279         int ret;
1280         char status[__PROC_STATUS_LEN] = {0};
1281         uint64_t sigblk = 0, one = 1;
1282         size_t n = 0;
1283         bool bret = false;
1284
1285         ret = strnprintf(status, sizeof(status), "/proc/%d/status", pid);
1286         if (ret < 0)
1287                 return bret;
1288
1289         f = fopen(status, "re");
1290         if (!f)
1291                 return false;
1292
1293         while (getline(&line, &n, f) != -1) {
1294                 char *numstr;
1295
1296                 if (!strnequal(line, "SigBlk:", 7))
1297                         continue;
1298
1299                 numstr = lxc_trim_whitespace_in_place(line + 7);
1300                 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1301                 if (ret < 0)
1302                         return false;
1303
1304                 break;
1305         }
1306
1307         if (sigblk & (one << (signal - 1)))
1308                 bret = true;
1309
1310         return bret;
1311 }
1312
1313 int lxc_preserve_ns(const int pid, const char *ns)
1314 {
1315         int ret;
1316 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
1317 #define __NS_PATH_LEN 50
1318         char path[__NS_PATH_LEN];
1319
1320         /* This way we can use this function to also check whether namespaces
1321          * are supported by the kernel by passing in the NULL or the empty
1322          * string.
1323          */
1324         ret = strnprintf(path, sizeof(path), "/proc/%d/ns%s%s", pid,
1325                          !ns || strequal(ns, "") ? "" : "/",
1326                          !ns || strequal(ns, "") ? "" : ns);
1327         if (ret < 0)
1328                 return ret_errno(EIO);
1329
1330         return open(path, O_RDONLY | O_CLOEXEC);
1331 }
1332
1333 bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
1334 {
1335         int ret = 0;
1336
1337         if (gid != LXC_INVALID_GID) {
1338                 ret = setresgid(gid, gid, gid);
1339                 if (ret < 0) {
1340                         SYSERROR("Failed to switch to gid %d", gid);
1341                         return false;
1342                 }
1343                 NOTICE("Switched to gid %d", gid);
1344         }
1345
1346         if (uid != LXC_INVALID_UID) {
1347                 ret = setresuid(uid, uid, uid);
1348                 if (ret < 0) {
1349                         SYSERROR("Failed to switch to uid %d", uid);
1350                         return false;
1351                 }
1352                 NOTICE("Switched to uid %d", uid);
1353         }
1354
1355         return true;
1356 }
1357
1358 /* Simple convenience function which enables uniform logging. */
1359 bool lxc_drop_groups(void)
1360 {
1361         int ret;
1362
1363         ret = setgroups(0, NULL);
1364         if (ret)
1365                 return log_error_errno(false, errno, "Failed to drop supplimentary groups");
1366
1367         NOTICE("Dropped supplimentary groups");
1368         return ret == 0;
1369 }
1370
1371 bool lxc_setgroups(gid_t list[], size_t size)
1372 {
1373         int ret;
1374
1375         ret = setgroups(size, list);
1376         if (ret)
1377                 return log_error_errno(false, errno, "Failed to set supplimentary groups");
1378
1379         if (size > 0 && lxc_log_trace()) {
1380                 for (size_t i = 0; i < size; i++)
1381                         TRACE("Setting supplimentary group %d", list[i]);
1382         }
1383
1384         NOTICE("Set supplimentary groups");
1385         return true;
1386 }
1387
1388 static int lxc_get_unused_loop_dev_legacy(char *loop_name)
1389 {
1390         struct dirent *dp;
1391         struct loop_info64 lo64;
1392         DIR *dir;
1393         int dfd = -1, fd = -1, ret = -1;
1394
1395         dir = opendir("/dev");
1396         if (!dir) {
1397                 SYSERROR("Failed to open \"/dev\"");
1398                 return -1;
1399         }
1400
1401         while ((dp = readdir(dir))) {
1402                 if (!strnequal(dp->d_name, "loop", 4))
1403                         continue;
1404
1405                 dfd = dirfd(dir);
1406                 if (dfd < 0)
1407                         continue;
1408
1409                 fd = openat(dfd, dp->d_name, O_RDWR);
1410                 if (fd < 0)
1411                         continue;
1412
1413                 ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
1414                 if (ret < 0) {
1415                         if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 ||
1416                             errno != ENXIO) {
1417                                 close(fd);
1418                                 fd = -1;
1419                                 continue;
1420                         }
1421                 }
1422
1423                 ret = strnprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
1424                 if (ret < 0) {
1425                         close(fd);
1426                         fd = -1;
1427                         continue;
1428                 }
1429
1430                 break;
1431         }
1432
1433         closedir(dir);
1434
1435         if (fd < 0)
1436                 return -1;
1437
1438         return fd;
1439 }
1440
1441 static int lxc_get_unused_loop_dev(char *name_loop)
1442 {
1443         int loop_nr, ret;
1444         int fd_ctl = -1, fd_tmp = -1;
1445
1446         fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
1447         if (fd_ctl < 0) {
1448                 SYSERROR("Failed to open loop control");
1449                 return -ENODEV;
1450         }
1451
1452         loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE);
1453         if (loop_nr < 0) {
1454                 SYSERROR("Failed to get loop control");
1455                 goto on_error;
1456         }
1457
1458         ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr);
1459         if (ret < 0)
1460                 goto on_error;
1461
1462         fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1463         if (fd_tmp < 0) {
1464                 /* on Android loop devices are moved under /dev/block, give it a shot */
1465                 ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/block/loop%d", loop_nr);
1466                 if (ret < 0)
1467                         goto on_error;
1468
1469                 fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1470                 if (fd_tmp < 0)
1471                         SYSERROR("Failed to open loop \"%s\"", name_loop);
1472         }
1473
1474 on_error:
1475         close(fd_ctl);
1476         return fd_tmp;
1477 }
1478
1479 int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
1480 {
1481         int ret;
1482         struct loop_info64 lo64;
1483         int fd_img = -1, fret = -1, fd_loop = -1;
1484
1485         fd_loop = lxc_get_unused_loop_dev(loop_dev);
1486         if (fd_loop < 0) {
1487                 if (fd_loop != -ENODEV)
1488                         goto on_error;
1489
1490                 fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);
1491                 if (fd_loop < 0)
1492                         goto on_error;
1493         }
1494
1495         fd_img = open(source, O_RDWR | O_CLOEXEC);
1496         if (fd_img < 0) {
1497                 SYSERROR("Failed to open source \"%s\"", source);
1498                 goto on_error;
1499         }
1500
1501         ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
1502         if (ret < 0) {
1503                 SYSERROR("Failed to set loop fd");
1504                 goto on_error;
1505         }
1506
1507         memset(&lo64, 0, sizeof(lo64));
1508         lo64.lo_flags = flags;
1509
1510         strlcpy((char *)lo64.lo_file_name, source, LO_NAME_SIZE);
1511
1512         ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
1513         if (ret < 0) {
1514                 SYSERROR("Failed to set loop status64");
1515                 goto on_error;
1516         }
1517
1518         fret = 0;
1519
1520 on_error:
1521         if (fd_img >= 0)
1522                 close(fd_img);
1523
1524         if (fret < 0 && fd_loop >= 0) {
1525                 close(fd_loop);
1526                 fd_loop = -1;
1527         }
1528
1529         return fd_loop;
1530 }
1531
/*
 * Unmount everything that is stacked on @path by calling umount2() until it
 * fails.
 *
 * @path: mountpoint to unstack.
 * @lazy: pass MNT_DETACH to umount2() when true, no flags otherwise.
 *
 * Returns the number of successful unmounts (capped at INT_MAX) once
 * umount2() fails with EINVAL, or -errno for any other failure.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
        int umounts = 0;

        for (;;) {
                if (umount2(path, lazy ? MNT_DETACH : 0) < 0) {
                        /* Anything other than EINVAL is treated as fatal so
                         * that this loop cannot spin forever. (The
                         * alternative would be to keep parsing
                         * /proc/self/mountinfo which is yucky and probably
                         * racy.)
                         */
                        if (errno != EINVAL)
                                return -errno;

                        break;
                }

                /* One more layer is gone; stop counting at INT_MAX instead
                 * of overflowing, then look for another mount underneath.
                 */
                if (umounts != INT_MAX)
                        umounts++;
        }

        return umounts;
}
1563
/*
 * Run @child_fn(@args) in a child process whose stdout and stderr are
 * connected to a pipe, and optionally capture what it writes.
 *
 * @buf, @buf_size: capture buffer; ignored when @buf is NULL or @buf_size
 *                  is 0. At most @buf_size - 1 bytes are read in a single
 *                  lxc_read_nointr() call, and the last byte read is then
 *                  overwritten with '\0'.
 * @child_fn:       function executed in the child; it is not expected to
 *                  return.
 * @args:           opaque argument handed to @child_fn.
 * @wait_status:    selects lxc_wait_for_pid_status() (true) or
 *                  wait_for_pid() (false) for reaping the child.
 *
 * Returns -1 when the pipe or the child cannot be created, otherwise the
 * value returned by the selected wait helper.
 */
static int run_command_internal(char *buf, size_t buf_size, int (*child_fn)(void *), void *args, bool wait_status)
{
        const bool capture = buf && buf_size > 0;
        int pipefd[2], status;
        pid_t pid;

        /* Make sure our callers do not receive uninitialized memory. */
        if (capture)
                buf[0] = '\0';

        if (pipe(pipefd) < 0) {
                SYSERROR("Failed to create pipe");
                return -1;
        }

        pid = lxc_raw_clone(0, NULL);
        if (pid < 0) {
                close(pipefd[0]);
                close(pipefd[1]);
                SYSERROR("Failed to create new process");
                return -1;
        }

        if (pid == 0) {
                int ret;

                /* The child only writes, so drop the read end. */
                close(pipefd[0]);

                /* Point stdout and then stderr at the write end. */
                ret = dup2(pipefd[1], STDOUT_FILENO);
                if (ret >= 0)
                        ret = dup2(pipefd[1], STDERR_FILENO);

                /* The original write end is no longer needed. */
                close(pipefd[1]);

                if (ret < 0) {
                        SYSERROR("Failed to duplicate std{err,out} file descriptor");
                        _exit(EXIT_FAILURE);
                }

                /* Does not return. */
                child_fn(args);
                ERROR("Failed to exec command");
                _exit(EXIT_FAILURE);
        }

        /* The parent only reads, so drop the write end. */
        close(pipefd[1]);

        if (capture) {
                ssize_t bytes = lxc_read_nointr(pipefd[0], buf, buf_size - 1);

                if (bytes > 0)
                        buf[bytes - 1] = '\0';
        }

        status = wait_status ? lxc_wait_for_pid_status(pid) : wait_for_pid(pid);

        /* Done with the read end as well. */
        close(pipefd[0]);

        return status;
}
1631
/*
 * Run @child_fn(@args) in a child process, capturing its output in @buf, and
 * reap it with wait_for_pid() (run_command_internal() with wait_status set
 * to false).
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        const bool wait_status = false;

        return run_command_internal(buf, buf_size, child_fn, args, wait_status);
}
1636
/*
 * Run @child_fn(@args) in a child process, capturing its output in @buf, and
 * reap it with lxc_wait_for_pid_status() (run_command_internal() with
 * wait_status set to true).
 */
int run_command_status(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        const bool wait_status = true;

        return run_command_internal(buf, buf_size, child_fn, args, wait_status);
}
1641
1642 bool lxc_nic_exists(char *nic)
1643 {
1644 #define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
1645         char path[__LXC_SYS_CLASS_NET_LEN];
1646         int ret;
1647         struct stat sb;
1648
1649         if (strequal(nic, "none"))
1650                 return true;
1651
1652         ret = strnprintf(path, sizeof(path), "/sys/class/net/%s", nic);
1653         if (ret < 0)
1654                 return false;
1655
1656         ret = stat(path, &sb);
1657         if (ret < 0)
1658                 return false;
1659
1660         return true;
1661 }
1662
/*
 * Round @n up to the nearest power of two.
 *
 * Returns @n itself when it already is a power of two. Returns 0 when @n is
 * 0 (0 is neither valid input nor a power of two) and also when the result
 * would not fit into 64 bits, because the final left shift wraps to 0.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
        uint64_t top_bit = n;

        if (n == 0)
                return 0;

        /* Clear the lowest set bit until only the highest one remains. */
        while (top_bit & (top_bit - 1))
                top_bit &= top_bit - 1;

        /* Exactly one bit was set to begin with: already a power of two. */
        if (top_bit == n)
                return n;

        return top_bit << 1;
}
1680
1681 static int process_dead(/* takes */ int status_fd)
1682 {
1683         __do_close int dupfd = -EBADF;
1684         __do_free char *line = NULL;
1685         __do_fclose FILE *f = NULL;
1686         int ret = 0;
1687         size_t n = 0;
1688
1689         dupfd = dup(status_fd);
1690         if (dupfd < 0)
1691                 return -1;
1692
1693         if (fd_cloexec(dupfd, true) < 0)
1694                 return -1;
1695
1696         f = fdopen(dupfd, "re");
1697         if (!f)
1698                 return -1;
1699
1700         /* Transfer ownership of fd. */
1701         move_fd(dupfd);
1702
1703         ret = 0;
1704         while (getline(&line, &n, f) != -1) {
1705                 char *state;
1706
1707                 if (!strnequal(line, "State:", 6))
1708                         continue;
1709
1710                 state = lxc_trim_whitespace_in_place(line + 6);
1711                 /* only check whether process is dead or zombie for now */
1712                 if (*state == 'X' || *state == 'Z')
1713                         ret = 1;
1714         }
1715
1716         return ret;
1717 }
1718
/*
 * Ask the kernel to deliver @signal to the calling process when its parent
 * dies, then verify that the parent has not already gone away.
 *
 * @signal:           signal passed to prctl(PR_SET_PDEATHSIG).
 * @parent:           pid the caller expects getppid() to report.
 * @parent_status_fd: fd handed to process_dead() when getppid() returns 0;
 *                    a negative value skips that check.
 *
 * Returns the result of raise(SIGKILL) when the parent is found to be gone,
 * 0 when getppid() returns 0 and no status fd was supplied, -1 when prctl()
 * failed, and 0 otherwise.
 */
int lxc_set_death_signal(int signal, pid_t parent, int parent_status_fd)
{
        int prctl_ret;
        pid_t cur_parent;

        prctl_ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
                          prctl_arg(0), prctl_arg(0));

        /* verify that we haven't been orphaned in the meantime */
        cur_parent = (pid_t)syscall(SYS_getppid);
        if (cur_parent != 0) {
                if (cur_parent != parent)
                        return raise(SIGKILL);
        } else {
                /* parent outside our pidns */
                if (parent_status_fd < 0)
                        return 0;

                if (process_dead(parent_status_fd) == 1)
                        return raise(SIGKILL);
        }

        return prctl_ret < 0 ? -1 : 0;
}
1744
1745 int lxc_rm_rf(const char *dirname)
1746 {
1747         __do_closedir DIR *dir = NULL;
1748         int fret = 0;
1749         int ret;
1750         struct dirent *direntp;
1751
1752         dir = opendir(dirname);
1753         if (!dir)
1754                 return log_error_errno(-1, errno, "Failed to open dir \"%s\"", dirname);
1755
1756         while ((direntp = readdir(dir))) {
1757                 __do_free char *pathname = NULL;
1758                 struct stat mystat;
1759
1760                 if (strequal(direntp->d_name, ".") ||
1761                     strequal(direntp->d_name, ".."))
1762                         continue;
1763
1764                 pathname = must_make_path(dirname, direntp->d_name, NULL);
1765                 ret = lstat(pathname, &mystat);
1766                 if (ret < 0) {
1767                         if (!fret)
1768                                 SYSWARN("Failed to stat \"%s\"", pathname);
1769
1770                         fret = -1;
1771                         continue;
1772                 }
1773
1774                 if (!S_ISDIR(mystat.st_mode))
1775                         continue;
1776
1777                 ret = lxc_rm_rf(pathname);
1778                 if (ret < 0)
1779                         fret = -1;
1780         }
1781
1782         ret = rmdir(dirname);
1783         if (ret < 0)
1784                 return log_warn_errno(-1, errno, "Failed to delete \"%s\"", dirname);
1785
1786         return fret;
1787 }
1788
1789 bool lxc_can_use_pidfd(int pidfd)
1790 {
1791         int ret;
1792
1793         if (pidfd < 0)
1794                 return log_trace(false, "Kernel does not support pidfds");
1795
1796         /*
1797          * We don't care whether or not children were in a waitable state. We
1798          * just care whether waitid() recognizes P_PIDFD.
1799          *
1800          * Btw, while I have your attention, the above waitid() code is an
1801          * excellent example of how _not_ to do flag-based kernel APIs. So if
1802          * you ever go into kernel development or are already and you add this
1803          * kind of flag potpourri even though you have read this comment shame
1804          * on you. May the gods of operating system development have mercy on
1805          * your soul because I won't.
1806          */
1807         ret = waitid(P_PIDFD, pidfd, NULL,
1808                     /* Type of children to wait for. */
1809                     __WALL |
1810                     /* How to wait for them. */
1811                     WNOHANG | WNOWAIT |
1812                     /* What state to wait for. */
1813                     WEXITED | WSTOPPED | WCONTINUED);
1814         if (ret < 0)
1815                 return log_error_errno(false, errno, "Kernel does not support waiting on processes through pidfds");
1816
1817         ret = lxc_raw_pidfd_send_signal(pidfd, 0, NULL, 0);
1818         if (ret)
1819                 return log_error_errno(false, errno, "Kernel does not support sending singals through pidfds");
1820
1821         return log_trace(true, "Kernel supports pidfds");
1822 }
1823
1824 int fix_stdio_permissions(uid_t uid)
1825 {
1826         __do_close int devnull_fd = -EBADF;
1827         int fret = 0;
1828         int std_fds[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};
1829         int ret;
1830         struct stat st, st_null;
1831
1832         devnull_fd = open_devnull();
1833         if (devnull_fd < 0)
1834                 return log_trace_errno(-1, errno, "Failed to open \"/dev/null\"");
1835
1836         ret = fstat(devnull_fd, &st_null);
1837         if (ret)
1838                 return log_trace_errno(-errno, errno, "Failed to stat \"/dev/null\"");
1839
1840         for (size_t i = 0; i < ARRAY_SIZE(std_fds); i++) {
1841                 ret = fstat(std_fds[i], &st);
1842                 if (ret) {
1843                         SYSWARN("Failed to stat standard I/O file descriptor %d", std_fds[i]);
1844                         fret = -1;
1845                         continue;
1846                 }
1847
1848                 if (st.st_rdev == st_null.st_rdev)
1849                         continue;
1850
1851                 ret = fchown(std_fds[i], uid, st.st_gid);
1852                 if (ret) {
1853                         SYSTRACE("Failed to chown standard I/O file descriptor %d to uid %d and gid %d",
1854                                  std_fds[i], uid, st.st_gid);
1855                         fret = -1;
1856                         continue;
1857                 }
1858
1859                 ret = fchmod(std_fds[i], 0700);
1860                 if (ret) {
1861                         SYSTRACE("Failed to chmod standard I/O file descriptor %d", std_fds[i]);
1862                         fret = -1;
1863                 }
1864         }
1865
1866         return fret;
1867 }
1868
/*
 * Multiply @base by @mult with overflow detection.
 *
 * @base: signed multiplicand.
 * @mult: unsigned multiplier.
 * @res:  receives the product when it is representable as int64_t.
 *
 * Returns true and stores the product in @res when @base * @mult fits into
 * an int64_t. Returns false and leaves @res untouched when the product would
 * overflow.
 */
bool multiply_overflow(int64_t base, uint64_t mult, int64_t *res)
{
        /* A zero factor cannot overflow and must not be used as a divisor. */
        if (base == 0 || mult == 0) {
                *res = 0;
                return true;
        }

        if (base > 0) {
                /* Largest representable product is INT64_MAX. */
                if ((uint64_t)base > (uint64_t)INT64_MAX / mult)
                        return false;
        } else {
                /*
                 * Compare magnitudes in unsigned arithmetic. The magnitude
                 * of INT64_MIN is INT64_MAX + 1, and 0 - (uint64_t)base
                 * yields |base| without negating a signed value.
                 */
                uint64_t limit = (uint64_t)INT64_MAX + 1;
                uint64_t magnitude = 0 - (uint64_t)base;

                if (magnitude > limit / mult)
                        return false;
        }

        /* In range: the wrapped unsigned product converts back exactly. */
        *res = (int64_t)((uint64_t)base * mult);
        return true;
}
1880
1881 int print_r(int fd, const char *path)
1882 {
1883         __do_close int dfd = -EBADF, dfd_dup = -EBADF;
1884         __do_closedir DIR *dir = NULL;
1885         int ret = 0;
1886         struct dirent *direntp;
1887         struct stat st;
1888
1889         if (is_empty_string(path)) {
1890                 char buf[LXC_PROC_SELF_FD_LEN];
1891
1892                 ret = strnprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
1893                 if (ret < 0)
1894                         return ret_errno(EIO);
1895
1896                 /*
1897                  * O_PATH file descriptors can't be used so we need to re-open
1898                  * just in case.
1899                  */
1900                 dfd = openat(-EBADF, buf, O_CLOEXEC | O_DIRECTORY, 0);
1901         } else {
1902                 dfd = openat(fd, path, O_CLOEXEC | O_DIRECTORY, 0);
1903         }
1904         if (dfd < 0)
1905                 return -1;
1906
1907         dfd_dup = dup_cloexec(dfd);
1908         if (dfd_dup < 0)
1909                 return -1;
1910
1911         dir = fdopendir(dfd);
1912         if (!dir)
1913                 return -1;
1914         /* Transfer ownership to fdopendir(). */
1915         move_fd(dfd);
1916
1917         while ((direntp = readdir(dir))) {
1918                 if (!strcmp(direntp->d_name, ".") ||
1919                     !strcmp(direntp->d_name, ".."))
1920                         continue;
1921
1922                 ret = fstatat(dfd_dup, direntp->d_name, &st, AT_SYMLINK_NOFOLLOW);
1923                 if (ret < 0 && errno != ENOENT)
1924                         break;
1925
1926                 ret = 0;
1927                 if (S_ISDIR(st.st_mode))
1928                         ret = print_r(dfd_dup, direntp->d_name);
1929                 else
1930                         INFO("mode(%o):uid(%d):gid(%d) -> %d/%s\n",
1931                              (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, dfd_dup,
1932                              direntp->d_name);
1933                 if (ret < 0 && errno != ENOENT)
1934                         break;
1935         }
1936
1937         if (is_empty_string(path))
1938                 ret = fstatat(fd, "", &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH);
1939         else
1940                 ret = fstatat(fd, path, &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW);
1941         if (ret)
1942                 return -1;
1943         else
1944                 INFO("mode(%o):uid(%d):gid(%d) -> %s",
1945                      (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, maybe_empty(path));
1946         return ret;
1947 }