src/lxc/utils.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #ifndef _GNU_SOURCE
  25 #define _GNU_SOURCE 1
  26 #endif
  27 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
  28 #include <ctype.h>
  29 #include <dirent.h>
  30 #include <errno.h>
  31 #include <fcntl.h>
  32 #include <grp.h>
  33 #include <inttypes.h>
  34 #include <libgen.h>
  35 #include <pthread.h>
  36 #include <stddef.h>
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <sys/mman.h>
  41 #include <sys/mount.h>
  42 #include <sys/param.h>
  43 #include <sys/prctl.h>
  44 #include <sys/stat.h>
  45 #include <sys/types.h>
  46 #include <sys/wait.h>
  47 #include <unistd.h>
  48
  49 #include "config.h"
  50 #include "log.h"
  51 #include "lxclock.h"
  52 #include "namespace.h"
  53 #include "parse.h"
  54 #include "raw_syscalls.h"
  55 #include "syscall_wrappers.h"
  56 #include "utils.h"
  57
  58 #ifndef HAVE_STRLCPY
  59 #include "include/strlcpy.h"
  60 #endif
  61
  62 #ifndef HAVE_STRLCAT
  63 #include "include/strlcat.h"
  64 #endif
  65
  66 #ifndef O_PATH
  67 #define O_PATH      010000000
  68 #endif
  69
  70 #ifndef O_NOFOLLOW
  71 #define O_NOFOLLOW  00400000
  72 #endif
  73
  74 lxc_log_define(utils, lxc);
  75
  76 /*
  77  * if path is btrfs, tries to remove it and any subvolumes beneath it
  78  */
  79 extern bool btrfs_try_remove_subvol(const char *path);
  80
  81 static int _recursive_rmdir(const char *dirname, dev_t pdev,
  82                             const char *exclude, int level, bool onedev)
  83 {
  84         struct dirent *direntp;
  85         DIR *dir;
  86         int ret, failed = 0;
  87         char pathname[PATH_MAX];
  88         bool hadexclude = false;
  89
  90         dir = opendir(dirname);
  91         if (!dir) {
  92                 ERROR("Failed to open \"%s\"", dirname);
  93                 return -1;
  94         }
  95
  96         while ((direntp = readdir(dir))) {
  97                 struct stat mystat;
  98                 int rc;
  99
 100                 if (!strcmp(direntp->d_name, ".") ||
 101                     !strcmp(direntp->d_name, ".."))
 102                         continue;
 103
 104                 rc = snprintf(pathname, PATH_MAX, "%s/%s", dirname, direntp->d_name);
 105                 if (rc < 0 || rc >= PATH_MAX) {
 106                         ERROR("The name of path is too long");
 107                         failed=1;
 108                         continue;
 109                 }
 110
 111                 if (!level && exclude && !strcmp(direntp->d_name, exclude)) {
 112                         ret = rmdir(pathname);
 113                         if (ret < 0) {
 114                                 switch(errno) {
 115                                 case ENOTEMPTY:
 116                                         INFO("Not deleting snapshot \"%s\"", pathname);
 117                                         hadexclude = true;
 118                                         break;
 119                                 case ENOTDIR:
 120                                         ret = unlink(pathname);
 121                                         if (ret)
 122                                                 INFO("Failed to remove \"%s\"", pathname);
 123                                         break;
 124                                 default:
 125                                         SYSERROR("Failed to rmdir \"%s\"", pathname);
 126                                         failed = 1;
 127                                         break;
 128                                 }
 129                         }
 130
 131                         continue;
 132                 }
 133
 134                 ret = lstat(pathname, &mystat);
 135                 if (ret) {
 136                         SYSERROR("Failed to stat \"%s\"", pathname);
 137                         failed = 1;
 138                         continue;
 139                 }
 140
 141                 if (onedev && mystat.st_dev != pdev) {
 142                         /* TODO should we be checking /proc/self/mountinfo for
 143                          * pathname and not doing this if found? */
 144                         if (btrfs_try_remove_subvol(pathname))
 145                                 INFO("Removed btrfs subvolume at \"%s\"", pathname);
 146                         continue;
 147                 }
 148
 149                 if (S_ISDIR(mystat.st_mode)) {
 150                         if (_recursive_rmdir(pathname, pdev, exclude, level+1, onedev) < 0)
 151                                 failed=1;
 152                 } else {
 153                         if (unlink(pathname) < 0) {
 154                                 SYSERROR("Failed to delete \"%s\"", pathname);
 155                                 failed=1;
 156                         }
 157                 }
 158         }
 159
 160         if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
 161                 SYSERROR("Failed to delete \"%s\"", dirname);
 162                 failed=1;
 163         }
 164
 165         ret = closedir(dir);
 166         if (ret) {
 167                 SYSERROR("Failed to close directory \"%s\"", dirname);
 168                 failed=1;
 169         }
 170
 171         return failed ? -1 : 0;
 172 }
 173
 174 /* In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
 175  * lxc_rmdir_onedev()
 176  */
 177 static bool is_native_overlayfs(const char *path)
 178 {
 179         if (has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
 180             has_fs_type(path, OVERLAYFS_SUPER_MAGIC))
 181                 return true;
 182
 183         return false;
 184 }
 185
 186 /* returns 0 on success, -1 if there were any failures */
 187 extern int lxc_rmdir_onedev(const char *path, const char *exclude)
 188 {
 189         struct stat mystat;
 190         bool onedev = true;
 191
 192         if (is_native_overlayfs(path))
 193                 onedev = false;
 194
 195         if (lstat(path, &mystat) < 0) {
 196                 if (errno == ENOENT)
 197                         return 0;
 198
 199                 SYSERROR("Failed to stat \"%s\"", path);
 200                 return -1;
 201         }
 202
 203         return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev);
 204 }
 205
 206 /* borrowed from iproute2 */
 207 extern int get_u16(unsigned short *val, const char *arg, int base)
 208 {
 209         unsigned long res;
 210         char *ptr;
 211
 212         if (!arg || !*arg)
 213                 return -1;
 214
 215         errno = 0;
 216         res = strtoul(arg, &ptr, base);
 217         if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0)
 218                 return -1;
 219
 220         *val = res;
 221
 222         return 0;
 223 }
 224
 225 int mkdir_p(const char *dir, mode_t mode)
 226 {
 227         const char *tmp = dir;
 228         const char *orig = dir;
 229
 230         do {
 231                 int ret;
 232                 char *makeme;
 233
 234                 dir = tmp + strspn(tmp, "/");
 235                 tmp = dir + strcspn(dir, "/");
 236
 237                 errno = ENOMEM;
 238                 makeme = strndup(orig, dir - orig);
 239                 if (!makeme)
 240                         return -1;
 241
 242                 ret = mkdir(makeme, mode);
 243                 if (ret < 0 && errno != EEXIST) {
 244                         SYSERROR("Failed to create directory \"%s\"", makeme);
 245                         free(makeme);
 246                         return -1;
 247                 }
 248
 249                 free(makeme);
 250         } while (tmp != dir);
 251
 252         return 0;
 253 }
 254
 255 char *get_rundir()
 256 {
 257         char *rundir;
 258         const char *homedir;
 259         struct stat sb;
 260
 261         if (stat(RUNTIME_PATH, &sb) < 0)
 262                 return NULL;
 263
 264         if (geteuid() == sb.st_uid || getegid() == sb.st_gid) {
 265                 rundir = strdup(RUNTIME_PATH);
 266                 return rundir;
 267         }
 268
 269         rundir = getenv("XDG_RUNTIME_DIR");
 270         if (rundir) {
 271                 rundir = strdup(rundir);
 272                 return rundir;
 273         }
 274
 275         INFO("XDG_RUNTIME_DIR isn't set in the environment");
 276         homedir = getenv("HOME");
 277         if (!homedir) {
 278                 ERROR("HOME isn't set in the environment");
 279                 return NULL;
 280         }
 281
 282         rundir = malloc(sizeof(char) * (17 + strlen(homedir)));
 283         if (!rundir)
 284                 return NULL;
 285
 286         sprintf(rundir, "%s/.cache/lxc/run/", homedir);
 287
 288         return rundir;
 289 }
 290
 291 int wait_for_pid(pid_t pid)
 292 {
 293         int status, ret;
 294
 295 again:
 296         ret = waitpid(pid, &status, 0);
 297         if (ret == -1) {
 298                 if (errno == EINTR)
 299                         goto again;
 300
 301                 return -1;
 302         }
 303
 304         if (ret != pid)
 305                 goto again;
 306
 307         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
 308                 return -1;
 309
 310         return 0;
 311 }
 312
 313 int lxc_wait_for_pid_status(pid_t pid)
 314 {
 315         int status, ret;
 316
 317 again:
 318         ret = waitpid(pid, &status, 0);
 319         if (ret == -1) {
 320                 if (errno == EINTR)
 321                         goto again;
 322
 323                 return -1;
 324         }
 325
 326         if (ret != pid)
 327                 goto again;
 328
 329         return status;
 330 }
 331
 332 #if HAVE_LIBGNUTLS
 333 #include <gnutls/gnutls.h>
 334 #include <gnutls/crypto.h>
 335
 336 __attribute__((constructor))
 337 static void gnutls_lxc_init(void)
 338 {
 339         gnutls_global_init();
 340 }
 341
 342 int sha1sum_file(char *fnam, unsigned char *digest)
 343 {
 344         char *buf;
 345         int ret;
 346         FILE *f;
 347         long flen;
 348
 349         if (!fnam)
 350                 return -1;
 351
 352         f = fopen_cloexec(fnam, "r");
 353         if (!f) {
 354                 SYSERROR("Failed to open template \"%s\"", fnam);
 355                 return -1;
 356         }
 357
 358         if (fseek(f, 0, SEEK_END) < 0) {
 359                 SYSERROR("Failed to seek to end of template");
 360                 fclose(f);
 361                 return -1;
 362         }
 363
 364         if ((flen = ftell(f)) < 0) {
 365                 SYSERROR("Failed to tell size of template");
 366                 fclose(f);
 367                 return -1;
 368         }
 369
 370         if (fseek(f, 0, SEEK_SET) < 0) {
 371                 SYSERROR("Failed to seek to start of template");
 372                 fclose(f);
 373                 return -1;
 374         }
 375
 376         if ((buf = malloc(flen+1)) == NULL) {
 377                 SYSERROR("Out of memory");
 378                 fclose(f);
 379                 return -1;
 380         }
 381
 382         if (fread(buf, 1, flen, f) != flen) {
 383                 SYSERROR("Failed to read template");
 384                 free(buf);
 385                 fclose(f);
 386                 return -1;
 387         }
 388
 389         if (fclose(f) < 0) {
 390                 SYSERROR("Failed to close template");
 391                 free(buf);
 392                 return -1;
 393         }
 394
 395         buf[flen] = '\0';
 396         ret = gnutls_hash_fast(GNUTLS_DIG_SHA1, buf, flen, (void *)digest);
 397         free(buf);
 398         return ret;
 399 }
 400 #endif
 401
 402 struct lxc_popen_FILE *lxc_popen(const char *command)
 403 {
 404         int ret;
 405         int pipe_fds[2];
 406         pid_t child_pid;
 407         struct lxc_popen_FILE *fp = NULL;
 408
 409         ret = pipe2(pipe_fds, O_CLOEXEC);
 410         if (ret < 0)
 411                 return NULL;
 412
 413         child_pid = fork();
 414         if (child_pid < 0)
 415                 goto on_error;
 416
 417         if (!child_pid) {
 418                 sigset_t mask;
 419
 420                 close(pipe_fds[0]);
 421
 422                 /* duplicate stdout */
 423                 if (pipe_fds[1] != STDOUT_FILENO)
 424                         ret = dup2(pipe_fds[1], STDOUT_FILENO);
 425                 else
 426                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 427                 if (ret < 0) {
 428                         close(pipe_fds[1]);
 429                         _exit(EXIT_FAILURE);
 430                 }
 431
 432                 /* duplicate stderr */
 433                 if (pipe_fds[1] != STDERR_FILENO)
 434                         ret = dup2(pipe_fds[1], STDERR_FILENO);
 435                 else
 436                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 437                 close(pipe_fds[1]);
 438                 if (ret < 0)
 439                         _exit(EXIT_FAILURE);
 440
 441                 /* unblock all signals */
 442                 ret = sigfillset(&mask);
 443                 if (ret < 0)
 444                         _exit(EXIT_FAILURE);
 445
 446                 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
 447                 if (ret < 0)
 448                         _exit(EXIT_FAILURE);
 449
 450                 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
 451                 _exit(127);
 452         }
 453
 454         close(pipe_fds[1]);
 455         pipe_fds[1] = -1;
 456
 457         fp = malloc(sizeof(*fp));
 458         if (!fp)
 459                 goto on_error;
 460
 461         memset(fp, 0, sizeof(*fp));
 462
 463         fp->child_pid = child_pid;
 464         fp->pipe = pipe_fds[0];
 465
 466         /* From now on, closing fp->f will also close fp->pipe. So only ever
 467          * call fclose(fp->f).
 468          */
 469         fp->f = fdopen(pipe_fds[0], "r");
 470         if (!fp->f)
 471                 goto on_error;
 472
 473         return fp;
 474
 475 on_error:
 476         /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
 477          * called yet. Otherwise the fd belongs to the file opened by fdopen()
 478          * since it isn't dup()ed.
 479          */
 480         if (fp && !fp->f && pipe_fds[0] >= 0)
 481                 close(pipe_fds[0]);
 482
 483         if (pipe_fds[1] >= 0)
 484                 close(pipe_fds[1]);
 485
 486         if (fp && fp->f)
 487                 fclose(fp->f);
 488
 489         if (fp)
 490                 free(fp);
 491
 492         return NULL;
 493 }
 494
 495 int lxc_pclose(struct lxc_popen_FILE *fp)
 496 {
 497         pid_t wait_pid;
 498         int wstatus = 0;
 499
 500         if (!fp)
 501                 return -1;
 502
 503         do {
 504                 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
 505         } while (wait_pid < 0 && errno == EINTR);
 506
 507         fclose(fp->f);
 508         free(fp);
 509
 510         if (wait_pid < 0)
 511                 return -1;
 512
 513         return wstatus;
 514 }
 515
 516 int randseed(bool srand_it)
 517 {
 518         FILE *f;
 519         /*
 520          * srand pre-seed function based on /dev/urandom
 521          */
 522         unsigned int seed = time(NULL) + getpid();
 523
 524         f = fopen("/dev/urandom", "r");
 525         if (f) {
 526                 int ret = fread(&seed, sizeof(seed), 1, f);
 527                 if (ret != 1)
 528                         SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
 529
 530                 fclose(f);
 531         }
 532
 533         if (srand_it)
 534                 srand(seed);
 535
 536         return seed;
 537 }
 538
 539 uid_t get_ns_uid(uid_t orig)
 540 {
 541         char *line = NULL;
 542         size_t sz = 0;
 543         uid_t nsid, hostid, range;
 544         FILE *f;
 545
 546         f = fopen("/proc/self/uid_map", "r");
 547         if (!f) {
 548                 SYSERROR("Failed to open uid_map");
 549                 return 0;
 550         }
 551
 552         while (getline(&line, &sz, f) != -1) {
 553                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 554                         continue;
 555
 556                 if (hostid <= orig && hostid + range > orig) {
 557                         nsid += orig - hostid;
 558                         goto found;
 559                 }
 560         }
 561
 562         nsid = LXC_INVALID_UID;
 563
 564 found:
 565         fclose(f);
 566         free(line);
 567         return nsid;
 568 }
 569
 570 gid_t get_ns_gid(gid_t orig)
 571 {
 572         char *line = NULL;
 573         size_t sz = 0;
 574         gid_t nsid, hostid, range;
 575         FILE *f;
 576
 577         f = fopen("/proc/self/gid_map", "r");
 578         if (!f) {
 579                 SYSERROR("Failed to open gid_map");
 580                 return 0;
 581         }
 582
 583         while (getline(&line, &sz, f) != -1) {
 584                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 585                         continue;
 586
 587                 if (hostid <= orig && hostid + range > orig) {
 588                         nsid += orig - hostid;
 589                         goto found;
 590                 }
 591         }
 592
 593         nsid = LXC_INVALID_GID;
 594
 595 found:
 596         fclose(f);
 597         free(line);
 598         return nsid;
 599 }
 600
 601 bool dir_exists(const char *path)
 602 {
 603         struct stat sb;
 604         int ret;
 605
 606         ret = stat(path, &sb);
 607         if (ret < 0)
 608                 /* Could be something other than eexist, just say "no". */
 609                 return false;
 610
 611         return S_ISDIR(sb.st_mode);
 612 }
 613
 614 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 615  * FNV has good anti collision properties and we're not worried
 616  * about pre-image resistance or one-way-ness, we're just trying to make
 617  * the name unique in the 108 bytes of space we have.
 618  */
 619 uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
 620 {
 621         unsigned char *bp;
 622
 623         for(bp = buf; bp < (unsigned char *)buf + len; bp++) {
 624                 /* xor the bottom with the current octet */
 625                 hval ^= (uint64_t)*bp;
 626
 627                 /* gcc optimised:
 628                  * multiply by the 64 bit FNV magic prime mod 2^64
 629                  */
 630                 hval += (hval << 1) + (hval << 4) + (hval << 5) +
 631                         (hval << 7) + (hval << 8) + (hval << 40);
 632         }
 633
 634         return hval;
 635 }
 636
 637 bool is_shared_mountpoint(const char *path)
 638 {
 639         char buf[LXC_LINELEN];
 640         FILE *f;
 641         int i;
 642         char *p, *p2;
 643
 644         f = fopen("/proc/self/mountinfo", "r");
 645         if (!f)
 646                 return 0;
 647
 648         while (fgets(buf, LXC_LINELEN, f)) {
 649                 for (p = buf, i = 0; p && i < 4; i++)
 650                         p = strchr(p + 1, ' ');
 651                 if (!p)
 652                         continue;
 653
 654                 p2 = strchr(p + 1, ' ');
 655                 if (!p2)
 656                         continue;
 657
 658                 *p2 = '\0';
 659                 if (strcmp(p + 1, path) == 0) {
 660                         /* This is the path. Is it shared? */
 661                         p = strchr(p2 + 1, ' ');
 662                         if (p && strstr(p, "shared:")) {
 663                                 fclose(f);
 664                                 return true;
 665                         }
 666                 }
 667         }
 668
 669         fclose(f);
 670         return false;
 671 }
 672
 673 /*
 674  * Detect whether / is mounted MS_SHARED.  The only way I know of to
 675  * check that is through /proc/self/mountinfo.
 676  * I'm only checking for /.  If the container rootfs or mount location
 677  * is MS_SHARED, but not '/', then you're out of luck - figuring that
 678  * out would be too much work to be worth it.
 679  */
 680 int detect_shared_rootfs(void)
 681 {
 682         if (is_shared_mountpoint("/"))
 683                 return 1;
 684
 685         return 0;
 686 }
 687
 688 bool switch_to_ns(pid_t pid, const char *ns)
 689 {
 690         int fd, ret;
 691         char nspath[PATH_MAX];
 692
 693         /* Switch to new ns */
 694         ret = snprintf(nspath, PATH_MAX, "/proc/%d/ns/%s", pid, ns);
 695         if (ret < 0 || ret >= PATH_MAX)
 696                 return false;
 697
 698         fd = open(nspath, O_RDONLY);
 699         if (fd < 0) {
 700                 SYSERROR("Failed to open \"%s\"", nspath);
 701                 return false;
 702         }
 703
 704         ret = setns(fd, 0);
 705         if (ret) {
 706                 SYSERROR("Failed to set process %d to \"%s\" of %d.", pid, ns, fd);
 707                 close(fd);
 708                 return false;
 709         }
 710
 711         close(fd);
 712         return true;
 713 }
 714
 715 /*
 716  * looking at fs/proc_namespace.c, it appears we can
 717  * actually expect the rootfs entry to very specifically contain
 718  * " - rootfs rootfs "
 719  * IIUC, so long as we've chrooted so that rootfs is not our root,
 720  * the rootfs entry should always be skipped in mountinfo contents.
 721  */
 722 bool detect_ramfs_rootfs(void)
 723 {
 724         FILE *f;
 725         char *p, *p2;
 726         char *line = NULL;
 727         size_t len = 0;
 728         int i;
 729
 730         f = fopen("/proc/self/mountinfo", "r");
 731         if (!f) {
 732                 SYSERROR("Failed to open mountinfo");
 733                 return false;
 734         }
 735
 736         while (getline(&line, &len, f) != -1) {
 737                 for (p = line, i = 0; p && i < 4; i++)
 738                         p = strchr(p + 1, ' ');
 739                 if (!p)
 740                         continue;
 741
 742                 p2 = strchr(p + 1, ' ');
 743                 if (!p2)
 744                         continue;
 745
 746                 *p2 = '\0';
 747                 if (strcmp(p + 1, "/") == 0) {
 748                         /* This is '/'. Is it the ramfs? */
 749                         p = strchr(p2 + 1, '-');
 750                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
 751                                 free(line);
 752                                 fclose(f);
 753                                 INFO("Rootfs is located on ramfs");
 754                                 return true;
 755                         }
 756                 }
 757         }
 758
 759         free(line);
 760         fclose(f);
 761         return false;
 762 }
 763
 764 char *on_path(const char *cmd, const char *rootfs)
 765 {
 766         char *entry = NULL, *path = NULL;
 767         char cmdpath[PATH_MAX];
 768         int ret;
 769
 770         path = getenv("PATH");
 771         if (!path)
 772                 return NULL;
 773
 774         path = strdup(path);
 775         if (!path)
 776                 return NULL;
 777
 778         lxc_iterate_parts (entry, path, ":") {
 779                 if (rootfs)
 780                         ret = snprintf(cmdpath, PATH_MAX, "%s/%s/%s", rootfs,
 781                                        entry, cmd);
 782                 else
 783                         ret = snprintf(cmdpath, PATH_MAX, "%s/%s", entry, cmd);
 784                 if (ret < 0 || ret >= PATH_MAX)
 785                         continue;
 786
 787                 if (access(cmdpath, X_OK) == 0) {
 788                         free(path);
 789                         return strdup(cmdpath);
 790                 }
 791         }
 792
 793         free(path);
 794         return NULL;
 795 }
 796
 797 bool cgns_supported(void)
 798 {
 799         return file_exists("/proc/self/ns/cgroup");
 800 }
 801
 802 /* historically lxc-init has been under /usr/lib/lxc and under
 803  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
 804  */
 805 char *choose_init(const char *rootfs)
 806 {
 807         char *retv = NULL;
 808         const char *empty = "",
 809                    *tmp;
 810         int ret, env_set = 0;
 811
 812         if (!getenv("PATH")) {
 813                 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
 814                         SYSERROR("Failed to setenv");
 815
 816                 env_set = 1;
 817         }
 818
 819         retv = on_path("init.lxc", rootfs);
 820
 821         if (env_set)
 822                 if (unsetenv("PATH"))
 823                         SYSERROR("Failed to unsetenv");
 824
 825         if (retv)
 826                 return retv;
 827
 828         retv = malloc(PATH_MAX);
 829         if (!retv)
 830                 return NULL;
 831
 832         if (rootfs)
 833                 tmp = rootfs;
 834         else
 835                 tmp = empty;
 836
 837         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
 838         if (ret < 0 || ret >= PATH_MAX) {
 839                 ERROR("The name of path is too long");
 840                 goto out1;
 841         }
 842
 843         if (access(retv, X_OK) == 0)
 844                 return retv;
 845
 846         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
 847         if (ret < 0 || ret >= PATH_MAX) {
 848                 ERROR("The name of path is too long");
 849                 goto out1;
 850         }
 851
 852         if (access(retv, X_OK) == 0)
 853                 return retv;
 854
 855         ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
 856         if (ret < 0 || ret >= PATH_MAX) {
 857                 ERROR("The name of path is too long");
 858                 goto out1;
 859         }
 860
 861         if (access(retv, X_OK) == 0)
 862                 return retv;
 863
 864         ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
 865         if (ret < 0 || ret >= PATH_MAX) {
 866                 ERROR("The name of path is too long");
 867                 goto out1;
 868         }
 869
 870         if (access(retv, X_OK) == 0)
 871                 return retv;
 872
 873         /*
 874          * Last resort, look for the statically compiled init.lxc which we
 875          * hopefully bind-mounted in.
 876          * If we are called during container setup, and we get to this point,
 877          * then the init.lxc.static from the host will need to be bind-mounted
 878          * in.  So we return NULL here to indicate that.
 879          */
 880         if (rootfs)
 881                 goto out1;
 882
 883         ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
 884         if (ret < 0 || ret >= PATH_MAX) {
 885                 WARN("Nonsense - name /lxc.init.static too long");
 886                 goto out1;
 887         }
 888
 889         if (access(retv, X_OK) == 0)
 890                 return retv;
 891
 892 out1:
 893         free(retv);
 894         return NULL;
 895 }
 896
 897 /*
 898  * Given the '-t' template option to lxc-create, figure out what to
 899  * do.  If the template is a full executable path, use that.  If it
 900  * is something like 'sshd', then return $templatepath/lxc-sshd.
 901  * On success return the template, on error return NULL.
 902  */
 903 char *get_template_path(const char *t)
 904 {
 905         int ret, len;
 906         char *tpath;
 907
 908         if (t[0] == '/' && access(t, X_OK) == 0) {
 909                 tpath = strdup(t);
 910                 return tpath;
 911         }
 912
 913         len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
 914
 915         tpath = malloc(len);
 916         if (!tpath)
 917                 return NULL;
 918
 919         ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
 920         if (ret < 0 || ret >= len) {
 921                 free(tpath);
 922                 return NULL;
 923         }
 924
 925         if (access(tpath, X_OK) < 0) {
 926                 SYSERROR("bad template: %s", t);
 927                 free(tpath);
 928                 return NULL;
 929         }
 930
 931         return tpath;
 932 }
 933
 934 /*
 935  * @path:    a pathname where / replaced with '\0'.
 936  * @offsetp: pointer to int showing which path segment was last seen.
 937  *           Updated on return to reflect the next segment.
 938  * @fulllen: full original path length.
 939  * Returns a pointer to the next path segment, or NULL if done.
 940  */
 941 static char *get_nextpath(char *path, int *offsetp, int fulllen)
 942 {
 943         int offset = *offsetp;
 944
 945         if (offset >= fulllen)
 946                 return NULL;
 947
 948         while (offset < fulllen && path[offset] != '\0')
 949                 offset++;
 950
 951         while (offset < fulllen && path[offset] == '\0')
 952                 offset++;
 953
 954         *offsetp = offset;
 955
 956         return (offset < fulllen) ? &path[offset] : NULL;
 957 }
 958
 959 /*
 960  * Check that @subdir is a subdir of @dir.  @len is the length of
 961  * @dir (to avoid having to recalculate it).
 962  */
 963 static bool is_subdir(const char *subdir, const char *dir, size_t len)
 964 {
 965         size_t subdirlen = strlen(subdir);
 966
 967         if (subdirlen < len)
 968                 return false;
 969
 970         if (strncmp(subdir, dir, len) != 0)
 971                 return false;
 972
 973         if (dir[len-1] == '/')
 974                 return true;
 975
 976         if (subdir[len] == '/' || subdirlen == len)
 977                 return true;
 978
 979         return false;
 980 }
 981
 982 /*
 983  * Check if the open fd is a symlink.  Return -ELOOP if it is.  Return
 984  * -ENOENT if we couldn't fstat.  Return 0 if the fd is ok.
 985  */
 986 static int check_symlink(int fd)
 987 {
 988         struct stat sb;
 989         int ret;
 990
 991         ret = fstat(fd, &sb);
 992         if (ret < 0)
 993                 return -ENOENT;
 994
 995         if (S_ISLNK(sb.st_mode))
 996                 return -ELOOP;
 997
 998         return 0;
 999 }
1000
/*
 * Open a file or directory, provided that it is not a symlink.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @dirfd:    directory file descriptor @nextpath is resolved against
 * @nextpath: single path component to open
 *
 * Returns an open fd on success.  On failure a negative value is returned
 * and errno describes the problem; in particular errno is ELOOP whenever
 * @nextpath turned out to be a symbolic link.
 */
static int open_if_safe(int dirfd, const char *nextpath)
{
        int fd, ret;

        fd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
        if (fd >= 0) /* Was not a symlink, all good. */
                return fd;

        /* Only a permission problem is worth a second attempt; ELOOP (a
         * symlink) and every other error are reported as they are.
         */
        if (errno != EPERM && errno != EACCES)
                return fd;

        /* We're not root (cause we got EPERM) so try opening with O_PATH. */
        fd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
        if (fd < 0)
                return fd;

        /* O_PATH will return an fd for symlinks. We know nextpath wasn't a
         * symlink at last openat, so if fd is now a link, then something
         * fishy is going on.
         */
        ret = check_symlink(fd);
        if (ret < 0) {
                close(fd);
                /* Keep errno in sync with the returned code so that callers
                 * testing errno == ELOOP see the symlink.
                 */
                errno = -ret;
                return ret;
        }

        return fd;
}
1036
/*
 * Open @target one path component at a time, refusing to traverse any
 * symbolic link that appears after @prefix_skip.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: leading part of @target in which symbolic links are
 * ignored (this would be the container's rootfs).  NULL or "" means "/".
 *
 * Returns an open fd for the final component, or <0 on error.
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
        int fd, pos = 0;
        int target_len = strlen(target);
        char *components, *p, *end;

        if (!prefix_skip || strlen(prefix_skip) == 0) {
                prefix_skip = "/";
        } else {
                /* make sure prefix-skip makes sense */
                pos = strlen(prefix_skip);
                if (!is_subdir(target, prefix_skip, pos)) {
                        ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
                              target, prefix_skip);
                        return -EINVAL;
                }

                /* get_nextpath() wants its offset on, or just before, a
                 * separator that has been turned into \0, so step back one.
                 */
                if (pos != 0)
                        pos--;
        }

        /* Work on a private copy in which every '/' becomes a terminator. */
        components = strdup(target);
        if (!components) {
                ERROR("Out of memory checking for symbolic link");
                return -ENOMEM;
        }

        end = components + target_len;
        for (p = components; p < end; p++) {
                if (*p == '/')
                        *p = '\0';
        }

        fd = open(prefix_skip, O_RDONLY);
        if (fd < 0) {
                SYSERROR("Failed to open path \"%s\"", prefix_skip);
                goto out;
        }

        for (;;) {
                int next_fd, open_errno;
                char *component;

                component = get_nextpath(components, &pos, target_len);
                if (!component)
                        break;

                /* Descend one level; the parent fd is no longer needed. */
                next_fd = open_if_safe(fd, component);
                open_errno = errno;
                close(fd);

                fd = next_fd;
                if (fd >= 0)
                        continue;

                errno = open_errno;
                if (errno == ELOOP)
                        SYSERROR("%s in %s was a symbolic link!", component, target);

                break;
        }

out:
        free(components);
        return fd;
}
1120
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs.  (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * The destination and, for relative bind mounts, the source are opened with
 * open_without_symlink(); the mount is then performed on the resulting
 * /proc/self/fd/<fd> paths.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @src:    mount source, may be NULL
 * @dest:   mount target
 * @fstype: filesystem type handed to mount()
 * @flags:  mount flags handed to mount()
 * @data:   filesystem specific data handed to mount()
 * @rootfs: container rootfs @dest has to live under, or NULL
 *
 * Returns 0 on success and a negative value on failure.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
               unsigned long flags, const void *data, const char *rootfs)
{
        int destfd, ret, saved_errno;
        /* Only needs enough for /proc/self/fd/<fd>. */
        char srcbuf[50], destbuf[50];
        int srcfd = -1;
        const char *mntsrc = src;

        if (!rootfs)
                rootfs = "";

        /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
        if ((flags & MS_BIND) && src && src[0] != '/') {
                INFO("This is a relative bind mount");

                srcfd = open_without_symlink(src, NULL);
                if (srcfd < 0)
                        return srcfd;

                /* A return value equal to the buffer size already means the
                 * output was truncated, hence ">=".
                 */
                ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
                if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
                        close(srcfd);
                        ERROR("Out of memory");
                        return -EINVAL;
                }
                mntsrc = srcbuf;
        }

        destfd = open_without_symlink(dest, rootfs);
        if (destfd < 0) {
                if (srcfd != -1) {
                        /* Do not let close() clobber the open error. */
                        saved_errno = errno;
                        close(srcfd);
                        errno = saved_errno;
                }

                return destfd;
        }

        ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
        if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
                if (srcfd != -1)
                        close(srcfd);

                close(destfd);
                ERROR("Out of memory");
                return -EINVAL;
        }

        ret = mount(mntsrc, destbuf, fstype, flags, data);
        saved_errno = errno;
        if (srcfd != -1)
                close(srcfd);

        close(destfd);
        if (ret < 0) {
                errno = saved_errno;
                SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
                return ret;
        }

        return 0;
}
1193
1194 /*
1195  * Mount a proc under @rootfs if proc self points to a pid other than
1196  * my own.  This is needed to have a known-good proc mount for setting
1197  * up LSMs both at container startup and attach.
1198  *
1199  * @rootfs : the rootfs where proc should be mounted
1200  *
1201  * Returns < 0 on failure, 0 if the correct proc was already mounted
1202  * and 1 if a new proc was mounted.
1203  *
1204  * NOTE: not to be called from inside the container namespace!
1205  */
1206 int lxc_mount_proc_if_needed(const char *rootfs)
1207 {
1208         char path[PATH_MAX] = {0};
1209         int link_to_pid, linklen, mypid, ret;
1210         char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
1211
1212         ret = snprintf(path, PATH_MAX, "%s/proc/self", rootfs);
1213         if (ret < 0 || ret >= PATH_MAX) {
1214                 SYSERROR("The name of proc path is too long");
1215                 return -1;
1216         }
1217
1218         linklen = readlink(path, link, sizeof(link));
1219
1220         ret = snprintf(path, PATH_MAX, "%s/proc", rootfs);
1221         if (ret < 0 || ret >= PATH_MAX) {
1222                 SYSERROR("The name of proc path is too long");
1223                 return -1;
1224         }
1225
1226         /* /proc not mounted */
1227         if (linklen < 0) {
1228                 if (mkdir(path, 0755) && errno != EEXIST)
1229                         return -1;
1230
1231                 goto domount;
1232         } else if (linklen >= sizeof(link)) {
1233                 link[linklen - 1] = '\0';
1234                 ERROR("Readlink returned truncated content: \"%s\"", link);
1235                 return -1;
1236         }
1237
1238         mypid = lxc_raw_getpid();
1239         INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1240
1241         if (lxc_safe_int(link, &link_to_pid) < 0)
1242                 return -1;
1243
1244         /* correct procfs is already mounted */
1245         if (link_to_pid == mypid)
1246                 return 0;
1247
1248         ret = umount2(path, MNT_DETACH);
1249         if (ret < 0)
1250                 SYSWARN("Failed to umount \"%s\" with MNT_DETACH", path);
1251
1252 domount:
1253         /* rootfs is NULL */
1254         if (!strcmp(rootfs, ""))
1255                 ret = mount("proc", path, "proc", 0, NULL);
1256         else
1257                 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
1258         if (ret < 0)
1259                 return -1;
1260
1261         INFO("Mounted /proc in container for security transition");
1262         return 1;
1263 }
1264
/*
 * Open /dev/null for reading and writing.
 *
 * Returns the new fd, or the negative result of open() after logging the
 * error.
 */
int open_devnull(void)
{
        int nullfd;

        nullfd = open("/dev/null", O_RDWR);
        if (nullfd >= 0)
                return nullfd;

        SYSERROR("Can't open /dev/null");
        return nullfd;
}
1273
/*
 * Duplicate @fd onto stdin, stdout and stderr, in that order.
 *
 * Returns 0 on success and -1 if @fd is negative or any dup2() fails; a
 * failure stops at the first descriptor that could not be replaced.
 */
int set_stdfds(int fd)
{
        static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
        size_t i;

        if (fd < 0)
                return -1;

        for (i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
                if (dup2(fd, std_fds[i]) < 0)
                        return -1;
        }

        return 0;
}
1295
/*
 * Redirect stdin, stdout and stderr to /dev/null.
 *
 * Returns the result of set_stdfds(), or -1 if /dev/null cannot be opened.
 */
int null_stdfds(void)
{
        int nullfd, ret;

        nullfd = open_devnull();
        if (nullfd < 0)
                return -1;

        ret = set_stdfds(nullfd);
        /* The standard descriptors hold their own duplicates now. */
        close(nullfd);

        return ret;
}
1309
1310 /* Check whether a signal is blocked by a process. */
1311 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1312 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1313 bool task_blocks_signal(pid_t pid, int signal)
1314 {
1315         int ret;
1316         char status[__PROC_STATUS_LEN] = {0};
1317         FILE *f;
1318         uint64_t sigblk = 0, one = 1;
1319         size_t n = 0;
1320         bool bret = false;
1321         char *line = NULL;
1322
1323         ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1324         if (ret < 0 || ret >= __PROC_STATUS_LEN)
1325                 return bret;
1326
1327         f = fopen(status, "r");
1328         if (!f)
1329                 return bret;
1330
1331         while (getline(&line, &n, f) != -1) {
1332                 char *numstr;
1333
1334                 if (strncmp(line, "SigBlk:", 7))
1335                         continue;
1336
1337                 numstr = lxc_trim_whitespace_in_place(line + 7);
1338                 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1339                 if (ret < 0)
1340                         goto out;
1341
1342                 break;
1343         }
1344
1345         if (sigblk & (one << (signal - 1)))
1346                 bret = true;
1347
1348 out:
1349         free(line);
1350         fclose(f);
1351         return bret;
1352 }
1353
/*
 * Open /proc/<pid>/ns/<ns>, or the /proc/<pid>/ns directory itself when @ns
 * is NULL or empty.  The latter makes it possible to use this function to
 * check whether namespaces are supported by the kernel.
 *
 * Returns the fd from open() (close-on-exec), or -1 with errno set to EFBIG
 * when the path does not fit into the internal buffer.
 */
int lxc_preserve_ns(const int pid, const char *ns)
{
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
        char path[__NS_PATH_LEN];
        const char *separator = "";
        const char *name = "";
        int len;

        if (ns && strcmp(ns, "") != 0) {
                separator = "/";
                name = ns;
        }

        len = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid, separator, name);
        if (len < 0 || (size_t)len >= __NS_PATH_LEN) {
                errno = EFBIG;
                return -1;
        }

        return open(path, O_RDONLY | O_CLOEXEC);
}
1375
1376 bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
1377 {
1378         int ret = 0;
1379
1380         if (gid != LXC_INVALID_GID) {
1381                 ret = setgid(gid);
1382                 if (ret < 0) {
1383                         SYSERROR("Failed to switch to gid %d", gid);
1384                         return false;
1385                 }
1386                 NOTICE("Switched to gid %d", gid);
1387         }
1388
1389         if (uid != LXC_INVALID_UID) {
1390                 ret = setuid(uid);
1391                 if (ret < 0) {
1392                         SYSERROR("Failed to switch to uid %d", uid);
1393                         return false;
1394                 }
1395                 NOTICE("Switched to uid %d", uid);
1396         }
1397
1398         return true;
1399 }
1400
/*
 * Thin wrapper around setgroups() that logs the outcome uniformly.
 *
 * Returns true if setgroups(@size, @list) succeeded, false otherwise.
 */
bool lxc_setgroups(int size, gid_t list[])
{
        int ret;

        ret = setgroups(size, list);
        if (ret >= 0) {
                NOTICE("Dropped additional groups");
                return true;
        }

        SYSERROR("Failed to setgroups()");
        return false;
}
1412
1413 static int lxc_get_unused_loop_dev_legacy(char *loop_name)
1414 {
1415         struct dirent *dp;
1416         struct loop_info64 lo64;
1417         DIR *dir;
1418         int dfd = -1, fd = -1, ret = -1;
1419
1420         dir = opendir("/dev");
1421         if (!dir) {
1422                 SYSERROR("Failed to open \"/dev\"");
1423                 return -1;
1424         }
1425
1426         while ((dp = readdir(dir))) {
1427                 if (strncmp(dp->d_name, "loop", 4) != 0)
1428                         continue;
1429
1430                 dfd = dirfd(dir);
1431                 if (dfd < 0)
1432                         continue;
1433
1434                 fd = openat(dfd, dp->d_name, O_RDWR);
1435                 if (fd < 0)
1436                         continue;
1437
1438                 ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
1439                 if (ret < 0) {
1440                         if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 ||
1441                             errno != ENXIO) {
1442                                 close(fd);
1443                                 fd = -1;
1444                                 continue;
1445                         }
1446                 }
1447
1448                 ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
1449                 if (ret < 0 || ret >= LO_NAME_SIZE) {
1450                         close(fd);
1451                         fd = -1;
1452                         continue;
1453                 }
1454
1455                 break;
1456         }
1457
1458         closedir(dir);
1459
1460         if (fd < 0)
1461                 return -1;
1462
1463         return fd;
1464 }
1465
1466 static int lxc_get_unused_loop_dev(char *name_loop)
1467 {
1468         int loop_nr, ret;
1469         int fd_ctl = -1, fd_tmp = -1;
1470
1471         fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
1472         if (fd_ctl < 0) {
1473                 SYSERROR("Failed to open loop control");
1474                 return -ENODEV;
1475         }
1476
1477         loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE);
1478         if (loop_nr < 0) {
1479                 SYSERROR("Failed to get loop control");
1480                 goto on_error;
1481         }
1482
1483         ret = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr);
1484         if (ret < 0 || ret >= LO_NAME_SIZE)
1485                 goto on_error;
1486
1487         fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1488         if (fd_tmp < 0)
1489                 SYSERROR("Failed to open loop \"%s\"", name_loop);
1490
1491 on_error:
1492         close(fd_ctl);
1493         return fd_tmp;
1494 }
1495
1496 int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
1497 {
1498         int ret;
1499         struct loop_info64 lo64;
1500         int fd_img = -1, fret = -1, fd_loop = -1;
1501
1502         fd_loop = lxc_get_unused_loop_dev(loop_dev);
1503         if (fd_loop < 0) {
1504                 if (fd_loop != -ENODEV)
1505                         goto on_error;
1506
1507                 fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);
1508                 if (fd_loop < 0)
1509                         goto on_error;
1510         }
1511
1512         fd_img = open(source, O_RDWR | O_CLOEXEC);
1513         if (fd_img < 0) {
1514                 SYSERROR("Failed to open source \"%s\"", source);
1515                 goto on_error;
1516         }
1517
1518         ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
1519         if (ret < 0) {
1520                 SYSERROR("Failed to set loop fd");
1521                 goto on_error;
1522         }
1523
1524         memset(&lo64, 0, sizeof(lo64));
1525         lo64.lo_flags = flags;
1526
1527         ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
1528         if (ret < 0) {
1529                 SYSERROR("Failed to set loop status64");
1530                 goto on_error;
1531         }
1532
1533         fret = 0;
1534
1535 on_error:
1536         if (fd_img >= 0)
1537                 close(fd_img);
1538
1539         if (fret < 0 && fd_loop >= 0) {
1540                 close(fd_loop);
1541                 fd_loop = -1;
1542         }
1543
1544         return fd_loop;
1545 }
1546
/*
 * Unmount @path repeatedly until nothing is stacked on it any more.
 *
 * @path: mountpoint to unstack
 * @lazy: use MNT_DETACH for every umount2() call
 *
 * Returns the number of successful unmounts (capped at INT_MAX), or -errno
 * if umount2() fails with anything other than EINVAL.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
        const int umount_flags = lazy ? MNT_DETACH : 0;
        int count = 0;

        /* Every success may have uncovered another mount underneath. */
        while (umount2(path, umount_flags) >= 0) {
                /* Just stop counting at INT_MAX. That'd just be so stupid
                 * that we won't even bother trying to report back the
                 * correct value anymore.
                 */
                if (count < INT_MAX)
                        count++;
        }

        /* We consider anything else than EINVAL deadly to prevent going into
         * an infinite loop. (The other alternative is constantly parsing
         * /proc/self/mountinfo which is yucky and probably racy.)
         */
        if (errno != EINVAL)
                return -errno;

        return count;
}
1578
/*
 * Run @child_fn(@args) in a child process whose stdout and stderr are
 * connected to a pipe, and optionally capture its output.
 *
 * @buf:      buffer for the output, may be NULL
 * @buf_size: size of @buf; nothing is captured when it is 0
 * @child_fn: function executed in the child; it is not expected to return
 * @args:     argument passed to @child_fn
 *
 * At most @buf_size - 1 bytes are read with a single read call, and the last
 * byte read is overwritten with the terminator (presumably to drop a
 * trailing newline - confirm with callers).
 *
 * Returns -1 if the pipe or the child cannot be created, otherwise the
 * result of wait_for_pid() for the child.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        const bool capture = buf && buf_size > 0;
        int pipefd[2];
        int status;
        pid_t pid;

        /* Make sure our callers do not receive uninitialized memory. */
        if (capture)
                buf[0] = '\0';

        if (pipe(pipefd) < 0) {
                SYSERROR("Failed to create pipe");
                return -1;
        }

        pid = lxc_raw_clone(0);
        if (pid < 0) {
                close(pipefd[0]);
                close(pipefd[1]);
                SYSERROR("Failed to create new process");
                return -1;
        }

        if (pid == 0) {
                int dup_ret;

                /* The child only writes. */
                close(pipefd[0]);

                /* Send both stdout and stderr into the pipe. */
                dup_ret = dup2(pipefd[1], STDOUT_FILENO);
                if (dup_ret >= 0)
                        dup_ret = dup2(pipefd[1], STDERR_FILENO);

                /* The original write-end is not needed past this point. */
                close(pipefd[1]);

                if (dup_ret < 0) {
                        SYSERROR("Failed to duplicate std{err,out} file descriptor");
                        _exit(EXIT_FAILURE);
                }

                /* Does not return. */
                child_fn(args);
                ERROR("Failed to exec command");
                _exit(EXIT_FAILURE);
        }

        /* The parent only reads. */
        close(pipefd[1]);

        if (capture) {
                ssize_t nread;

                nread = lxc_read_nointr(pipefd[0], buf, buf_size - 1);
                if (nread > 0)
                        buf[nread - 1] = '\0';
        }

        status = wait_for_pid(pid);
        close(pipefd[0]);

        return status;
}
1642
/*
 * Check whether the network interface @nic exists.
 *
 * @nic: interface name; the special name "none" is always accepted
 *
 * Returns true if @nic is "none" or /sys/class/net/<nic> can be stat()ed,
 * false otherwise, including when the name does not fit into the path
 * buffer.
 */
bool lxc_nic_exists(char *nic)
{
/* strlen("/sys/class/net/") + interface name + \0; parenthesized so the
 * macro expands safely inside larger expressions.
 */
#define __LXC_SYS_CLASS_NET_LEN (15 + IFNAMSIZ + 1)
        char path[__LXC_SYS_CLASS_NET_LEN];
        int ret;
        struct stat sb;

        if (!strcmp(nic, "none"))
                return true;

        ret = snprintf(path, sizeof(path), "/sys/class/net/%s", nic);
        if (ret < 0 || (size_t)ret >= sizeof(path))
                return false;

        return stat(path, &sb) == 0;
}
1663
/*
 * Round @n up to the next power of two.
 *
 * Returns @n itself if it already is a power of two, and 0 for @n == 0
 * (0 is not a valid power of two).  The result is also 0 when the next
 * power of two does not fit into 64 bits.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
        uint64_t top;

        if (n == 0)
                return 0;

        /* Clear the lowest set bit until only the highest one is left. */
        for (top = n; top & (top - 1); top &= top - 1)
                ;

        /* A single set bit means @n was a power of two to begin with. */
        return top == n ? n : top << 1;
}
1681
/*
 * Request @signal to be delivered to the caller when its parent dies.
 *
 * @signal: signal passed to PR_SET_PDEATHSIG
 * @parent: pid the caller expects as its parent
 *
 * If the current parent pid differs from @parent the caller has already
 * been orphaned and sends itself SIGKILL.
 *
 * Returns 0 on success and -1 on failure.
 */
int lxc_set_death_signal(int signal, pid_t parent)
{
        int ret;

        ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
                    prctl_arg(0), prctl_arg(0));

        /* Check whether we have been orphaned. */
        if ((pid_t)syscall(SYS_getppid) != parent)
                ret = raise(SIGKILL);

        return ret < 0 ? -1 : 0;
}
1703
/*
 * Set or clear the close-on-exec flag of @fd.
 *
 * @fd:      file descriptor to modify
 * @cloexec: true to set FD_CLOEXEC, false to clear it
 *
 * Returns 0 on success (including when the flag already has the requested
 * state) and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
        int current, wanted;

        current = fcntl(fd, F_GETFD, 0);
        if (current < 0)
                return -errno;

        wanted = cloexec ? (current | FD_CLOEXEC) : (current & ~FD_CLOEXEC);

        /* Only touch the descriptor when the flag actually changes. */
        if (wanted != current && fcntl(fd, F_SETFD, wanted) < 0)
                return -errno;

        return 0;
}
1725
/*
 * Remove the directory tree rooted at @dirname.
 *
 * Only directories are removed: every subdirectory is destroyed recursively
 * and then rmdir() is called on @dirname itself.  Entries that are not
 * directories are left alone, so rmdir() fails for a directory that still
 * contains them.
 *
 * Failures do not stop the walk.  Only the first failure in each call is
 * logged.
 *
 * Returns 0 on success and -1 if any step failed.
 */
int recursive_destroy(char *dirname)
{
        int ret;
        struct dirent *direntp;
        DIR *dir;
        int r = 0;

        dir = opendir(dirname);
        if (!dir) {
                SYSERROR("Failed to open dir \"%s\"", dirname);
                return -1;
        }

        while ((direntp = readdir(dir))) {
                char *pathname;
                struct stat mystat;

                if (!strcmp(direntp->d_name, ".") ||
                    !strcmp(direntp->d_name, ".."))
                        continue;

                pathname = must_make_path(dirname, direntp->d_name, NULL);

                /* lstat() so that symlinks are never followed. */
                ret = lstat(pathname, &mystat);
                if (ret < 0) {
                        if (!r)
                                SYSWARN("Failed to stat \"%s\"", pathname);

                        r = -1;
                } else if (S_ISDIR(mystat.st_mode)) {
                        if (recursive_destroy(pathname) < 0)
                                r = -1;
                }

                free(pathname);
        }

        ret = rmdir(dirname);
        if (ret < 0) {
                if (!r)
                        SYSWARN("Failed to delete \"%s\"", dirname);

                r = -1;
        }

        ret = closedir(dir);
        if (ret < 0) {
                if (!r)
                        SYSWARN("Failed to close directory \"%s\"", dirname);

                r = -1;
        }

        return r;
}
1787
1788 int lxc_setup_keyring(void)
1789 {
1790         key_serial_t keyring;
1791         int ret = 0;
1792
1793         /* Try to allocate a new session keyring for the container to prevent
1794          * information leaks.
1795          */
1796         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
1797                          prctl_arg(0), prctl_arg(0), prctl_arg(0));
1798         if (keyring < 0) {
1799                 switch (errno) {
1800                 case ENOSYS:
1801                         DEBUG("The keyctl() syscall is not supported or blocked");
1802                         break;
1803                 case EACCES:
1804                         __fallthrough;
1805                 case EPERM:
1806                         DEBUG("Failed to access kernel keyring. Continuing...");
1807                         break;
1808                 default:
1809                         SYSERROR("Failed to create kernel keyring");
1810                         break;
1811                 }
1812         }
1813
1814         return ret;
1815 }