src/lxc/utils.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #ifndef _GNU_SOURCE
  25 #define _GNU_SOURCE 1
  26 #endif
  27 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
  28 #include <ctype.h>
  29 #include <dirent.h>
  30 #include <errno.h>
  31 #include <fcntl.h>
  32 #include <grp.h>
  33 #include <inttypes.h>
  34 #include <libgen.h>
  35 #include <pthread.h>
  36 #include <stddef.h>
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <sys/mman.h>
  41 #include <sys/mount.h>
  42 #include <sys/param.h>
  43 #include <sys/prctl.h>
  44 #include <sys/stat.h>
  45 #include <sys/types.h>
  46 #include <sys/wait.h>
  47 #include <unistd.h>
  48
  49 #include "config.h"
  50 #include "log.h"
  51 #include "lxclock.h"
  52 #include "namespace.h"
  53 #include "parse.h"
  54 #include "raw_syscalls.h"
  55 #include "syscall_wrappers.h"
  56 #include "utils.h"
  57
  58 #ifndef HAVE_STRLCPY
  59 #include "include/strlcpy.h"
  60 #endif
  61
  62 #ifndef HAVE_STRLCAT
  63 #include "include/strlcat.h"
  64 #endif
  65
  66 #ifndef O_PATH
  67 #define O_PATH      010000000
  68 #endif
  69
  70 #ifndef O_NOFOLLOW
  71 #define O_NOFOLLOW  00400000
  72 #endif
  73
  74 lxc_log_define(utils, lxc);
  75
  76 /*
  77  * if path is btrfs, tries to remove it and any subvolumes beneath it
  78  */
  79 extern bool btrfs_try_remove_subvol(const char *path);
  80
  81 static int _recursive_rmdir(const char *dirname, dev_t pdev,
  82                             const char *exclude, int level, bool onedev)
  83 {
  84         struct dirent *direntp;
  85         DIR *dir;
  86         int ret, failed=0;
  87         char pathname[PATH_MAX];
  88         bool hadexclude = false;
  89
  90         dir = opendir(dirname);
  91         if (!dir) {
  92                 ERROR("failed to open %s", dirname);
  93                 return -1;
  94         }
  95
  96         while ((direntp = readdir(dir))) {
  97                 struct stat mystat;
  98                 int rc;
  99
 100                 if (!strcmp(direntp->d_name, ".") ||
 101                     !strcmp(direntp->d_name, ".."))
 102                         continue;
 103
 104                 rc = snprintf(pathname, PATH_MAX, "%s/%s", dirname, direntp->d_name);
 105                 if (rc < 0 || rc >= PATH_MAX) {
 106                         ERROR("pathname too long");
 107                         failed=1;
 108                         continue;
 109                 }
 110
 111                 if (!level && exclude && !strcmp(direntp->d_name, exclude)) {
 112                         ret = rmdir(pathname);
 113                         if (ret < 0) {
 114                                 switch(errno) {
 115                                 case ENOTEMPTY:
 116                                         INFO("Not deleting snapshot %s", pathname);
 117                                         hadexclude = true;
 118                                         break;
 119                                 case ENOTDIR:
 120                                         ret = unlink(pathname);
 121                                         if (ret)
 122                                                 INFO("Failed to remove %s", pathname);
 123                                         break;
 124                                 default:
 125                                         SYSERROR("Failed to rmdir %s", pathname);
 126                                         failed = 1;
 127                                         break;
 128                                 }
 129                         }
 130                         continue;
 131                 }
 132
 133                 ret = lstat(pathname, &mystat);
 134                 if (ret) {
 135                         ERROR("Failed to stat %s", pathname);
 136                         failed = 1;
 137                         continue;
 138                 }
 139
 140                 if (onedev && mystat.st_dev != pdev) {
 141                         /* TODO should we be checking /proc/self/mountinfo for
 142                          * pathname and not doing this if found? */
 143                         if (btrfs_try_remove_subvol(pathname))
 144                                 INFO("Removed btrfs subvolume at %s\n", pathname);
 145                         continue;
 146                 }
 147
 148                 if (S_ISDIR(mystat.st_mode)) {
 149                         if (_recursive_rmdir(pathname, pdev, exclude, level+1, onedev) < 0)
 150                                 failed=1;
 151                 } else {
 152                         if (unlink(pathname) < 0) {
 153                                 SYSERROR("Failed to delete %s", pathname);
 154                                 failed=1;
 155                         }
 156                 }
 157         }
 158
 159         if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
 160                 ERROR("Failed to delete %s", dirname);
 161                 failed=1;
 162         }
 163
 164         ret = closedir(dir);
 165         if (ret) {
 166                 ERROR("Failed to close directory %s", dirname);
 167                 failed=1;
 168         }
 169
 170         return failed ? -1 : 0;
 171 }
 172
 173 /* In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
 174  * lxc_rmdir_onedev()
 175  */
 176 static bool is_native_overlayfs(const char *path)
 177 {
 178         if (has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
 179             has_fs_type(path, OVERLAYFS_SUPER_MAGIC))
 180                 return true;
 181
 182         return false;
 183 }
 184
 185 /* returns 0 on success, -1 if there were any failures */
 186 extern int lxc_rmdir_onedev(const char *path, const char *exclude)
 187 {
 188         struct stat mystat;
 189         bool onedev = true;
 190
 191         if (is_native_overlayfs(path))
 192                 onedev = false;
 193
 194         if (lstat(path, &mystat) < 0) {
 195                 if (errno == ENOENT)
 196                         return 0;
 197
 198                 ERROR("Failed to stat %s", path);
 199                 return -1;
 200         }
 201
 202         return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev);
 203 }
 204
 205 /* borrowed from iproute2 */
 206 extern int get_u16(unsigned short *val, const char *arg, int base)
 207 {
 208         unsigned long res;
 209         char *ptr;
 210
 211         if (!arg || !*arg)
 212                 return -1;
 213
 214         errno = 0;
 215         res = strtoul(arg, &ptr, base);
 216         if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0)
 217                 return -1;
 218
 219         *val = res;
 220
 221         return 0;
 222 }
 223
 224 int mkdir_p(const char *dir, mode_t mode)
 225 {
 226         const char *tmp = dir;
 227         const char *orig = dir;
 228         do {
 229                 int ret;
 230                 char *makeme;
 231
 232                 dir = tmp + strspn(tmp, "/");
 233                 tmp = dir + strcspn(dir, "/");
 234
 235                 errno = ENOMEM;
 236                 makeme = strndup(orig, dir - orig);
 237                 if (!makeme)
 238                         return -1;
 239
 240                 ret = mkdir(makeme, mode);
 241                 if (ret < 0 && errno != EEXIST) {
 242                         SYSERROR("Failed to create directory \"%s\"", makeme);
 243                         free(makeme);
 244                         return -1;
 245                 }
 246                 free(makeme);
 247
 248         } while (tmp != dir);
 249
 250         return 0;
 251 }
 252
 253 char *get_rundir()
 254 {
 255         char *rundir;
 256         const char *homedir;
 257         struct stat sb;
 258
 259         if (stat(RUNTIME_PATH, &sb) < 0)
 260                 return NULL;
 261
 262         if (geteuid() == sb.st_uid || getegid() == sb.st_gid) {
 263                 rundir = strdup(RUNTIME_PATH);
 264                 return rundir;
 265         }
 266
 267         rundir = getenv("XDG_RUNTIME_DIR");
 268         if (rundir) {
 269                 rundir = strdup(rundir);
 270                 return rundir;
 271         }
 272
 273         INFO("XDG_RUNTIME_DIR isn't set in the environment.");
 274         homedir = getenv("HOME");
 275         if (!homedir) {
 276                 ERROR("HOME isn't set in the environment.");
 277                 return NULL;
 278         }
 279
 280         rundir = malloc(sizeof(char) * (17 + strlen(homedir)));
 281         if (!rundir)
 282                 return NULL;
 283
 284         sprintf(rundir, "%s/.cache/lxc/run/", homedir);
 285
 286         return rundir;
 287 }
 288
 289 int wait_for_pid(pid_t pid)
 290 {
 291         int status, ret;
 292
 293 again:
 294         ret = waitpid(pid, &status, 0);
 295         if (ret == -1) {
 296                 if (errno == EINTR)
 297                         goto again;
 298
 299                 return -1;
 300         }
 301
 302         if (ret != pid)
 303                 goto again;
 304
 305         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
 306                 return -1;
 307
 308         return 0;
 309 }
 310
 311 int lxc_wait_for_pid_status(pid_t pid)
 312 {
 313         int status, ret;
 314
 315 again:
 316         ret = waitpid(pid, &status, 0);
 317         if (ret == -1) {
 318                 if (errno == EINTR)
 319                         goto again;
 320
 321                 return -1;
 322         }
 323
 324         if (ret != pid)
 325                 goto again;
 326
 327         return status;
 328 }
 329
 330 #if HAVE_LIBGNUTLS
 331 #include <gnutls/gnutls.h>
 332 #include <gnutls/crypto.h>
 333
 334 __attribute__((constructor))
 335 static void gnutls_lxc_init(void)
 336 {
 337         gnutls_global_init();
 338 }
 339
 340 int sha1sum_file(char *fnam, unsigned char *digest)
 341 {
 342         char *buf;
 343         int ret;
 344         FILE *f;
 345         long flen;
 346
 347         if (!fnam)
 348                 return -1;
 349
 350         f = fopen_cloexec(fnam, "r");
 351         if (!f) {
 352                 SYSERROR("Error opening template");
 353                 return -1;
 354         }
 355
 356         if (fseek(f, 0, SEEK_END) < 0) {
 357                 SYSERROR("Error seeking to end of template");
 358                 fclose(f);
 359                 return -1;
 360         }
 361
 362         if ((flen = ftell(f)) < 0) {
 363                 SYSERROR("Error telling size of template");
 364                 fclose(f);
 365                 return -1;
 366         }
 367
 368         if (fseek(f, 0, SEEK_SET) < 0) {
 369                 SYSERROR("Error seeking to start of template");
 370                 fclose(f);
 371                 return -1;
 372         }
 373
 374         if ((buf = malloc(flen+1)) == NULL) {
 375                 SYSERROR("Out of memory");
 376                 fclose(f);
 377                 return -1;
 378         }
 379
 380         if (fread(buf, 1, flen, f) != flen) {
 381                 SYSERROR("Failure reading template");
 382                 free(buf);
 383                 fclose(f);
 384                 return -1;
 385         }
 386
 387         if (fclose(f) < 0) {
 388                 SYSERROR("Failre closing template");
 389                 free(buf);
 390                 return -1;
 391         }
 392
 393         buf[flen] = '\0';
 394         ret = gnutls_hash_fast(GNUTLS_DIG_SHA1, buf, flen, (void *)digest);
 395         free(buf);
 396         return ret;
 397 }
 398 #endif
 399
 400 struct lxc_popen_FILE *lxc_popen(const char *command)
 401 {
 402         int ret;
 403         int pipe_fds[2];
 404         pid_t child_pid;
 405         struct lxc_popen_FILE *fp = NULL;
 406
 407         ret = pipe2(pipe_fds, O_CLOEXEC);
 408         if (ret < 0)
 409                 return NULL;
 410
 411         child_pid = fork();
 412         if (child_pid < 0)
 413                 goto on_error;
 414
 415         if (!child_pid) {
 416                 sigset_t mask;
 417
 418                 close(pipe_fds[0]);
 419
 420                 /* duplicate stdout */
 421                 if (pipe_fds[1] != STDOUT_FILENO)
 422                         ret = dup2(pipe_fds[1], STDOUT_FILENO);
 423                 else
 424                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 425                 if (ret < 0) {
 426                         close(pipe_fds[1]);
 427                         _exit(EXIT_FAILURE);
 428                 }
 429
 430                 /* duplicate stderr */
 431                 if (pipe_fds[1] != STDERR_FILENO)
 432                         ret = dup2(pipe_fds[1], STDERR_FILENO);
 433                 else
 434                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 435                 close(pipe_fds[1]);
 436                 if (ret < 0)
 437                         _exit(EXIT_FAILURE);
 438
 439                 /* unblock all signals */
 440                 ret = sigfillset(&mask);
 441                 if (ret < 0)
 442                         _exit(EXIT_FAILURE);
 443
 444                 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
 445                 if (ret < 0)
 446                         _exit(EXIT_FAILURE);
 447
 448                 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
 449                 _exit(127);
 450         }
 451
 452         close(pipe_fds[1]);
 453         pipe_fds[1] = -1;
 454
 455         fp = malloc(sizeof(*fp));
 456         if (!fp)
 457                 goto on_error;
 458
 459         memset(fp, 0, sizeof(*fp));
 460
 461         fp->child_pid = child_pid;
 462         fp->pipe = pipe_fds[0];
 463
 464         /* From now on, closing fp->f will also close fp->pipe. So only ever
 465          * call fclose(fp->f).
 466          */
 467         fp->f = fdopen(pipe_fds[0], "r");
 468         if (!fp->f)
 469                 goto on_error;
 470
 471         return fp;
 472
 473 on_error:
 474         /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
 475          * called yet. Otherwise the fd belongs to the file opened by fdopen()
 476          * since it isn't dup()ed.
 477          */
 478         if (fp && !fp->f && pipe_fds[0] >= 0)
 479                 close(pipe_fds[0]);
 480
 481         if (pipe_fds[1] >= 0)
 482                 close(pipe_fds[1]);
 483
 484         if (fp && fp->f)
 485                 fclose(fp->f);
 486
 487         if (fp)
 488                 free(fp);
 489
 490         return NULL;
 491 }
 492
 493 int lxc_pclose(struct lxc_popen_FILE *fp)
 494 {
 495         pid_t wait_pid;
 496         int wstatus = 0;
 497
 498         if (!fp)
 499                 return -1;
 500
 501         do {
 502                 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
 503         } while (wait_pid < 0 && errno == EINTR);
 504
 505         fclose(fp->f);
 506         free(fp);
 507
 508         if (wait_pid < 0)
 509                 return -1;
 510
 511         return wstatus;
 512 }
 513
 514 int randseed(bool srand_it)
 515 {
 516         /*
 517            srand pre-seed function based on /dev/urandom
 518            */
 519         unsigned int seed = time(NULL) + getpid();
 520
 521         FILE *f;
 522         f = fopen("/dev/urandom", "r");
 523         if (f) {
 524                 int ret = fread(&seed, sizeof(seed), 1, f);
 525                 if (ret != 1)
 526                         SYSDEBUG("unable to fread /dev/urandom, fallback to time+pid rand seed");
 527
 528                 fclose(f);
 529         }
 530
 531         if (srand_it)
 532                 srand(seed);
 533
 534         return seed;
 535 }
 536
 537 uid_t get_ns_uid(uid_t orig)
 538 {
 539         char *line = NULL;
 540         size_t sz = 0;
 541         uid_t nsid, hostid, range;
 542         FILE *f = fopen("/proc/self/uid_map", "r");
 543         if (!f)
 544                 return 0;
 545
 546         while (getline(&line, &sz, f) != -1) {
 547                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 548                         continue;
 549
 550                 if (hostid <= orig && hostid + range > orig) {
 551                         nsid += orig - hostid;
 552                         goto found;
 553                 }
 554         }
 555
 556         nsid = LXC_INVALID_UID;
 557
 558 found:
 559         fclose(f);
 560         free(line);
 561         return nsid;
 562 }
 563
 564 gid_t get_ns_gid(gid_t orig)
 565 {
 566         char *line = NULL;
 567         size_t sz = 0;
 568         gid_t nsid, hostid, range;
 569         FILE *f = fopen("/proc/self/gid_map", "r");
 570         if (!f)
 571                 return 0;
 572
 573         while (getline(&line, &sz, f) != -1) {
 574                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 575                         continue;
 576
 577                 if (hostid <= orig && hostid + range > orig) {
 578                         nsid += orig - hostid;
 579                         goto found;
 580                 }
 581         }
 582
 583         nsid = LXC_INVALID_GID;
 584
 585 found:
 586         fclose(f);
 587         free(line);
 588         return nsid;
 589 }
 590
 591 bool dir_exists(const char *path)
 592 {
 593         struct stat sb;
 594         int ret;
 595
 596         ret = stat(path, &sb);
 597         if (ret < 0)
 598                 /* Could be something other than eexist, just say "no". */
 599                 return false;
 600
 601         return S_ISDIR(sb.st_mode);
 602 }
 603
 604 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 605  * FNV has good anti collision properties and we're not worried
 606  * about pre-image resistance or one-way-ness, we're just trying to make
 607  * the name unique in the 108 bytes of space we have.
 608  */
 609 uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
 610 {
 611         unsigned char *bp;
 612
 613         for(bp = buf; bp < (unsigned char *)buf + len; bp++)
 614         {
 615                 /* xor the bottom with the current octet */
 616                 hval ^= (uint64_t)*bp;
 617
 618                 /* gcc optimised:
 619                  * multiply by the 64 bit FNV magic prime mod 2^64
 620                  */
 621                 hval += (hval << 1) + (hval << 4) + (hval << 5) +
 622                         (hval << 7) + (hval << 8) + (hval << 40);
 623         }
 624
 625         return hval;
 626 }
 627
 628 bool is_shared_mountpoint(const char *path)
 629 {
 630         char buf[LXC_LINELEN];
 631         FILE *f;
 632         int i;
 633         char *p, *p2;
 634
 635         f = fopen("/proc/self/mountinfo", "r");
 636         if (!f)
 637                 return 0;
 638
 639         while (fgets(buf, LXC_LINELEN, f)) {
 640                 for (p = buf, i = 0; p && i < 4; i++)
 641                         p = strchr(p + 1, ' ');
 642                 if (!p)
 643                         continue;
 644
 645                 p2 = strchr(p + 1, ' ');
 646                 if (!p2)
 647                         continue;
 648
 649                 *p2 = '\0';
 650                 if (strcmp(p + 1, path) == 0) {
 651                         /* This is the path. Is it shared? */
 652                         p = strchr(p2 + 1, ' ');
 653                         if (p && strstr(p, "shared:")) {
 654                                 fclose(f);
 655                                 return true;
 656                         }
 657                 }
 658         }
 659
 660         fclose(f);
 661         return false;
 662 }
 663
 664 /*
 665  * Detect whether / is mounted MS_SHARED.  The only way I know of to
 666  * check that is through /proc/self/mountinfo.
 667  * I'm only checking for /.  If the container rootfs or mount location
 668  * is MS_SHARED, but not '/', then you're out of luck - figuring that
 669  * out would be too much work to be worth it.
 670  */
 671 int detect_shared_rootfs(void)
 672 {
 673         if (is_shared_mountpoint("/"))
 674                 return 1;
 675         return 0;
 676 }
 677
 678 bool switch_to_ns(pid_t pid, const char *ns)
 679 {
 680         int fd, ret;
 681         char nspath[PATH_MAX];
 682
 683         /* Switch to new ns */
 684         ret = snprintf(nspath, PATH_MAX, "/proc/%d/ns/%s", pid, ns);
 685         if (ret < 0 || ret >= PATH_MAX)
 686                 return false;
 687
 688         fd = open(nspath, O_RDONLY);
 689         if (fd < 0) {
 690                 SYSERROR("Failed to open %s", nspath);
 691                 return false;
 692         }
 693
 694         ret = setns(fd, 0);
 695         if (ret) {
 696                 SYSERROR("Failed to set process %d to %s of %d.", pid, ns, fd);
 697                 close(fd);
 698                 return false;
 699         }
 700
 701         close(fd);
 702         return true;
 703 }
 704
 705 /*
 706  * looking at fs/proc_namespace.c, it appears we can
 707  * actually expect the rootfs entry to very specifically contain
 708  * " - rootfs rootfs "
 709  * IIUC, so long as we've chrooted so that rootfs is not our root,
 710  * the rootfs entry should always be skipped in mountinfo contents.
 711  */
 712 bool detect_ramfs_rootfs(void)
 713 {
 714         FILE *f;
 715         char *p, *p2;
 716         char *line = NULL;
 717         size_t len = 0;
 718         int i;
 719
 720         f = fopen("/proc/self/mountinfo", "r");
 721         if (!f)
 722                 return false;
 723
 724         while (getline(&line, &len, f) != -1) {
 725                 for (p = line, i = 0; p && i < 4; i++)
 726                         p = strchr(p + 1, ' ');
 727                 if (!p)
 728                         continue;
 729
 730                 p2 = strchr(p + 1, ' ');
 731                 if (!p2)
 732                         continue;
 733
 734                 *p2 = '\0';
 735                 if (strcmp(p + 1, "/") == 0) {
 736                         /* This is '/'. Is it the ramfs? */
 737                         p = strchr(p2 + 1, '-');
 738                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
 739                                 free(line);
 740                                 fclose(f);
 741                                 INFO("Rootfs is located on ramfs");
 742                                 return true;
 743                         }
 744                 }
 745         }
 746
 747         free(line);
 748         fclose(f);
 749         return false;
 750 }
 751
 752 char *on_path(const char *cmd, const char *rootfs)
 753 {
 754         char *entry = NULL, *path = NULL;
 755         char cmdpath[PATH_MAX];
 756         int ret;
 757
 758         path = getenv("PATH");
 759         if (!path)
 760                 return NULL;
 761
 762         path = strdup(path);
 763         if (!path)
 764                 return NULL;
 765
 766         lxc_iterate_parts (entry, path, ":") {
 767                 if (rootfs)
 768                         ret = snprintf(cmdpath, PATH_MAX, "%s/%s/%s", rootfs,
 769                                        entry, cmd);
 770                 else
 771                         ret = snprintf(cmdpath, PATH_MAX, "%s/%s", entry, cmd);
 772                 if (ret < 0 || ret >= PATH_MAX)
 773                         continue;
 774
 775                 if (access(cmdpath, X_OK) == 0) {
 776                         free(path);
 777                         return strdup(cmdpath);
 778                 }
 779         }
 780
 781         free(path);
 782         return NULL;
 783 }
 784
 785 bool cgns_supported(void)
 786 {
 787         return file_exists("/proc/self/ns/cgroup");
 788 }
 789
 790 /* historically lxc-init has been under /usr/lib/lxc and under
 791  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
 792  */
 793 char *choose_init(const char *rootfs)
 794 {
 795         char *retv = NULL;
 796         const char *empty = "",
 797                    *tmp;
 798         int ret, env_set = 0;
 799
 800         if (!getenv("PATH")) {
 801                 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
 802                         SYSERROR("Failed to setenv");
 803
 804                 env_set = 1;
 805         }
 806
 807         retv = on_path("init.lxc", rootfs);
 808
 809         if (env_set) {
 810                 if (unsetenv("PATH"))
 811                         SYSERROR("Failed to unsetenv");
 812         }
 813
 814         if (retv)
 815                 return retv;
 816
 817         retv = malloc(PATH_MAX);
 818         if (!retv)
 819                 return NULL;
 820
 821         if (rootfs)
 822                 tmp = rootfs;
 823         else
 824                 tmp = empty;
 825
 826         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
 827         if (ret < 0 || ret >= PATH_MAX) {
 828                 ERROR("pathname too long");
 829                 goto out1;
 830         }
 831
 832         if (access(retv, X_OK) == 0)
 833                 return retv;
 834
 835         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
 836         if (ret < 0 || ret >= PATH_MAX) {
 837                 ERROR("pathname too long");
 838                 goto out1;
 839         }
 840
 841         if (access(retv, X_OK) == 0)
 842                 return retv;
 843
 844         ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
 845         if (ret < 0 || ret >= PATH_MAX) {
 846                 ERROR("pathname too long");
 847                 goto out1;
 848         }
 849
 850         if (access(retv, X_OK) == 0)
 851                 return retv;
 852
 853         ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
 854         if (ret < 0 || ret >= PATH_MAX) {
 855                 ERROR("pathname too long");
 856                 goto out1;
 857         }
 858
 859         if (access(retv, X_OK) == 0)
 860                 return retv;
 861
 862         /*
 863          * Last resort, look for the statically compiled init.lxc which we
 864          * hopefully bind-mounted in.
 865          * If we are called during container setup, and we get to this point,
 866          * then the init.lxc.static from the host will need to be bind-mounted
 867          * in.  So we return NULL here to indicate that.
 868          */
 869         if (rootfs)
 870                 goto out1;
 871
 872         ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
 873         if (ret < 0 || ret >= PATH_MAX) {
 874                 WARN("Nonsense - name /lxc.init.static too long");
 875                 goto out1;
 876         }
 877
 878         if (access(retv, X_OK) == 0)
 879                 return retv;
 880
 881 out1:
 882         free(retv);
 883         return NULL;
 884 }
 885
 886 /*
 887  * Given the '-t' template option to lxc-create, figure out what to
 888  * do.  If the template is a full executable path, use that.  If it
 889  * is something like 'sshd', then return $templatepath/lxc-sshd.
 890  * On success return the template, on error return NULL.
 891  */
 892 char *get_template_path(const char *t)
 893 {
 894         int ret, len;
 895         char *tpath;
 896
 897         if (t[0] == '/' && access(t, X_OK) == 0) {
 898                 tpath = strdup(t);
 899                 return tpath;
 900         }
 901
 902         len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
 903
 904         tpath = malloc(len);
 905         if (!tpath)
 906                 return NULL;
 907
 908         ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
 909         if (ret < 0 || ret >= len) {
 910                 free(tpath);
 911                 return NULL;
 912         }
 913
 914         if (access(tpath, X_OK) < 0) {
 915                 SYSERROR("bad template: %s", t);
 916                 free(tpath);
 917                 return NULL;
 918         }
 919
 920         return tpath;
 921 }
 922
 923 /*
 924  * @path:    a pathname where / replaced with '\0'.
 925  * @offsetp: pointer to int showing which path segment was last seen.
 926  *           Updated on return to reflect the next segment.
 927  * @fulllen: full original path length.
 928  * Returns a pointer to the next path segment, or NULL if done.
 929  */
 930 static char *get_nextpath(char *path, int *offsetp, int fulllen)
 931 {
 932         int offset = *offsetp;
 933
 934         if (offset >= fulllen)
 935                 return NULL;
 936
 937         while (offset < fulllen && path[offset] != '\0')
 938                 offset++;
 939
 940         while (offset < fulllen && path[offset] == '\0')
 941                 offset++;
 942
 943         *offsetp = offset;
 944         return (offset < fulllen) ? &path[offset] : NULL;
 945 }
 946
 947 /*
 948  * Check that @subdir is a subdir of @dir.  @len is the length of
 949  * @dir (to avoid having to recalculate it).
 950  */
 951 static bool is_subdir(const char *subdir, const char *dir, size_t len)
 952 {
 953         size_t subdirlen = strlen(subdir);
 954
 955         if (subdirlen < len)
 956                 return false;
 957
 958         if (strncmp(subdir, dir, len) != 0)
 959                 return false;
 960
 961         if (dir[len-1] == '/')
 962                 return true;
 963
 964         if (subdir[len] == '/' || subdirlen == len)
 965                 return true;
 966
 967         return false;
 968 }
 969
 970 /*
 971  * Check if the open fd is a symlink.  Return -ELOOP if it is.  Return
 972  * -ENOENT if we couldn't fstat.  Return 0 if the fd is ok.
 973  */
 974 static int check_symlink(int fd)
 975 {
 976         struct stat sb;
 977         int ret;
 978
 979         ret = fstat(fd, &sb);
 980         if (ret < 0)
 981                 return -ENOENT;
 982
 983         if (S_ISLNK(sb.st_mode))
 984                 return -ELOOP;
 985
 986         return 0;
 987 }
 988
 989 /*
 990  * Open a file or directory, provided that it contains no symlinks.
 991  *
 992  * CAVEAT: This function must not be used for other purposes than container
 993  * setup before executing the container's init
 994  */
 995 static int open_if_safe(int dirfd, const char *nextpath)
 996 {
 997         int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
 998         if (newfd >= 0) /* Was not a symlink, all good. */
 999                 return newfd;
1000
1001         if (errno == ELOOP)
1002                 return newfd;
1003
1004         if (errno == EPERM || errno == EACCES) {
1005                 /* We're not root (cause we got EPERM) so try opening with
1006                  * O_PATH.
1007                  */
1008                 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
1009                 if (newfd >= 0) {
1010                         /* O_PATH will return an fd for symlinks. We know
1011                          * nextpath wasn't a symlink at last openat, so if fd is
1012                          * now a link, then something * fishy is going on.
1013                          */
1014                         int ret = check_symlink(newfd);
1015                         if (ret < 0) {
1016                                 close(newfd);
1017                                 newfd = ret;
1018                         }
1019                 }
1020         }
1021
1022         return newfd;
1023 }
1024
/*
 * Open @target for mounting, walking it one path component at a time so
 * that no component below @prefix_skip may be a symbolic link.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: leading part of @target in which symbolic links are
 * tolerated (this would be the container's rootfs). When NULL or empty the
 * walk starts at "/".
 *
 * Returns an open fd for the path, or <0 on error.
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
        int fd, offset = 0, target_len;
        char *components, *p;

        target_len = strlen(target);

        /* make sure prefix-skip makes sense */
        if (prefix_skip && prefix_skip[0] != '\0') {
                offset = strlen(prefix_skip);
                if (!is_subdir(target, prefix_skip, offset)) {
                        ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
                                target, prefix_skip);
                        return -EINVAL;
                }

                /*
                 * get_nextpath() expects the offset to sit on a '/' (which
                 * is turned into '\0' below) or just before one, so step
                 * back by one. The prefix is non-empty here, hence
                 * offset >= 1.
                 */
                offset--;
        } else {
                prefix_skip = "/";
                offset = 0;
        }

        /* Work on a private copy of target that can be split up in place. */
        components = strdup(target);
        if (!components) {
                SYSERROR("Out of memory checking for symbolic link");
                return -ENOMEM;
        }

        /* Replace every separator by '\0' so each component is a string. */
        for (p = components; p < components + target_len; p++) {
                if (*p == '/')
                        *p = '\0';
        }

        fd = open(prefix_skip, O_RDONLY);
        while (fd >= 0) {
                char *component;
                int next_fd, open_errno;

                /* No component left: fd refers to the final path. */
                component = get_nextpath(components, &offset, target_len);
                if (!component)
                        break;

                next_fd = open_if_safe(fd, component);
                /* close() may clobber errno, so remember it first. */
                open_errno = errno;
                close(fd);

                fd = next_fd;
                if (fd < 0) {
                        errno = open_errno;
                        if (errno == ELOOP)
                                SYSERROR("%s in %s was a symbolic link!", component, target);
                }
        }

        free(components);
        return fd;
}
1106
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs.  (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * The destination (and, for relative bind mounts, the source) is opened via
 * open_without_symlink() and the mount is then performed through the
 * corresponding /proc/self/fd/<fd> path.
 *
 * @src, @dest, @fstype, @flags, @data: passed on to mount(2), with @src and
 * @dest replaced by /proc/self/fd paths as described above.
 * @rootfs: prefix of @dest in which symbolic links are tolerated.
 *
 * Returns 0 on success and a negative value on failure.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
                unsigned long flags, const void *data, const char *rootfs)
{
        int destfd, ret, saved_errno;
        /* Only needs enough for /proc/self/fd/<fd>. */
        char srcbuf[50], destbuf[50];
        int srcfd = -1;
        const char *mntsrc = src;

        if (!rootfs)
                rootfs = "";

        /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
        if (flags & MS_BIND && src && src[0] != '/') {
                INFO("this is a relative bind mount");

                srcfd = open_without_symlink(src, NULL);
                if (srcfd < 0)
                        return srcfd;

                /* snprintf() truncated the output if ret >= buffer size. */
                ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
                if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
                        close(srcfd);
                        ERROR("Out of memory");
                        return -EINVAL;
                }
                mntsrc = srcbuf;
        }

        destfd = open_without_symlink(dest, rootfs);
        if (destfd < 0) {
                if (srcfd != -1) {
                        /* Keep the errno of the failed open for the caller. */
                        saved_errno = errno;
                        close(srcfd);
                        errno = saved_errno;
                }

                return destfd;
        }

        ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
        if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
                if (srcfd != -1)
                        close(srcfd);

                close(destfd);
                ERROR("Out of memory");
                return -EINVAL;
        }

        ret = mount(mntsrc, destbuf, fstype, flags, data);
        /* close() below may overwrite errno; SYSERROR needs mount()'s value. */
        saved_errno = errno;
        if (srcfd != -1)
                close(srcfd);

        close(destfd);
        if (ret < 0) {
                errno = saved_errno;
                SYSERROR("Failed to mount %s onto %s", src ? src : "(null)", dest);
                return ret;
        }

        return 0;
}
1179
1180 /*
1181  * Mount a proc under @rootfs if proc self points to a pid other than
1182  * my own.  This is needed to have a known-good proc mount for setting
1183  * up LSMs both at container startup and attach.
1184  *
1185  * @rootfs : the rootfs where proc should be mounted
1186  *
1187  * Returns < 0 on failure, 0 if the correct proc was already mounted
1188  * and 1 if a new proc was mounted.
1189  *
1190  * NOTE: not to be called from inside the container namespace!
1191  */
1192 int lxc_mount_proc_if_needed(const char *rootfs)
1193 {
1194         char path[PATH_MAX];
1195         int link_to_pid, linklen, mypid, ret;
1196         char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
1197
1198         ret = snprintf(path, PATH_MAX, "%s/proc/self", rootfs);
1199         if (ret < 0 || ret >= PATH_MAX) {
1200                 SYSERROR("proc path name too long");
1201                 return -1;
1202         }
1203
1204         linklen = readlink(path, link, sizeof(link));
1205
1206         ret = snprintf(path, PATH_MAX, "%s/proc", rootfs);
1207         if (ret < 0 || ret >= PATH_MAX) {
1208                 SYSERROR("proc path name too long");
1209                 return -1;
1210         }
1211
1212         /* /proc not mounted */
1213         if (linklen < 0) {
1214                 if (mkdir(path, 0755) && errno != EEXIST)
1215                         return -1;
1216
1217                 goto domount;
1218         } else if (linklen >= sizeof(link)) {
1219                 link[linklen - 1] = '\0';
1220                 ERROR("readlink returned truncated content: \"%s\"", link);
1221                 return -1;
1222         }
1223
1224         mypid = lxc_raw_getpid();
1225         INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1226
1227         if (lxc_safe_int(link, &link_to_pid) < 0)
1228                 return -1;
1229
1230         /* correct procfs is already mounted */
1231         if (link_to_pid == mypid)
1232                 return 0;
1233
1234         ret = umount2(path, MNT_DETACH);
1235         if (ret < 0)
1236                 WARN("failed to umount \"%s\" with MNT_DETACH", path);
1237
1238 domount:
1239         /* rootfs is NULL */
1240         if (!strcmp(rootfs, ""))
1241                 ret = mount("proc", path, "proc", 0, NULL);
1242         else
1243                 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
1244         if (ret < 0)
1245                 return -1;
1246
1247         INFO("mounted /proc in container for security transition");
1248         return 1;
1249 }
1250
/*
 * Open /dev/null for reading and writing.
 *
 * Returns the new file descriptor, or the negative value returned by
 * open() after logging the failure.
 */
int open_devnull(void)
{
        int nullfd;

        nullfd = open("/dev/null", O_RDWR);
        if (nullfd >= 0)
                return nullfd;

        SYSERROR("Can't open /dev/null");
        return nullfd;
}
1260
/*
 * Make stdin, stdout and stderr (in that order) duplicates of @fd.
 *
 * Returns 0 on success and -1 if @fd is negative or any dup2() fails;
 * descriptors already replaced at that point stay replaced.
 */
int set_stdfds(int fd)
{
        static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
        size_t i;

        if (fd < 0)
                return -1;

        for (i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
                if (dup2(fd, std_fds[i]) < 0)
                        return -1;
        }

        return 0;
}
1282
/*
 * Redirect stdin, stdout and stderr to /dev/null.
 *
 * Returns the result of set_stdfds(), or -1 if /dev/null could not be
 * opened.
 */
int null_stdfds(void)
{
        int nullfd, ret;

        nullfd = open_devnull();
        if (nullfd < 0)
                return -1;

        ret = set_stdfds(nullfd);
        /* The standard descriptors are duplicates; the original is not needed. */
        close(nullfd);

        return ret;
}
1296
1297 /* Check whether a signal is blocked by a process. */
1298 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1299 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1300 bool task_blocks_signal(pid_t pid, int signal)
1301 {
1302         int ret;
1303         char status[__PROC_STATUS_LEN];
1304         FILE *f;
1305         uint64_t sigblk = 0, one = 1;
1306         size_t n = 0;
1307         bool bret = false;
1308         char *line = NULL;
1309
1310         ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1311         if (ret < 0 || ret >= __PROC_STATUS_LEN)
1312                 return bret;
1313
1314         f = fopen(status, "r");
1315         if (!f)
1316                 return bret;
1317
1318         while (getline(&line, &n, f) != -1) {
1319                 char *numstr;
1320
1321                 if (strncmp(line, "SigBlk:", 7))
1322                         continue;
1323
1324                 numstr = lxc_trim_whitespace_in_place(line + 7);
1325                 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1326                 if (ret < 0)
1327                         goto out;
1328
1329                 break;
1330         }
1331
1332         if (sigblk & (one << (signal - 1)))
1333                 bret = true;
1334
1335 out:
1336         free(line);
1337         fclose(f);
1338         return bret;
1339 }
1340
/*
 * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC.
 *
 * When @ns is NULL or the empty string the directory /proc/<pid>/ns itself
 * is opened, which lets callers probe whether namespaces are supported by
 * the kernel at all.
 *
 * Returns the file descriptor from open(), or -1 with errno set to EFBIG
 * if the path does not fit into the internal buffer.
 */
int lxc_preserve_ns(const int pid, const char *ns)
{
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
        char path[__NS_PATH_LEN];
        const char *sep = "/";
        int len;

        /* No namespace name given: drop both the separator and the name. */
        if (!ns || ns[0] == '\0') {
                sep = "";
                ns = "";
        }

        len = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid, sep, ns);
        if (len < 0 || (size_t)len >= __NS_PATH_LEN) {
                errno = EFBIG;
                return -1;
        }

        return open(path, O_RDONLY | O_CLOEXEC);
}
1362
1363 bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
1364 {
1365         int ret = 0;
1366
1367         if (gid != LXC_INVALID_GID) {
1368                 ret = setgid(gid);
1369                 if (ret < 0) {
1370                         SYSERROR("Failed to switch to gid %d", gid);
1371                         return false;
1372                 }
1373                 NOTICE("Switched to gid %d", gid);
1374         }
1375
1376         if (uid != LXC_INVALID_UID) {
1377                 ret = setuid(uid);
1378                 if (ret < 0) {
1379                         SYSERROR("Failed to switch to uid %d", uid);
1380                         return false;
1381                 }
1382                 NOTICE("Switched to uid %d", uid);
1383         }
1384
1385         return true;
1386 }
1387
/*
 * Thin wrapper around setgroups() that logs the outcome uniformly.
 *
 * Returns true on success and false if setgroups() failed.
 */
bool lxc_setgroups(int size, gid_t list[])
{
        int ret;

        ret = setgroups(size, list);
        if (ret >= 0) {
                NOTICE("Dropped additional groups");
                return true;
        }

        SYSERROR("Failed to setgroups()");
        return false;
}
1399
/*
 * Find an unused loop device by scanning /dev for "loop*" entries.
 *
 * A device counts as unused only when LOOP_GET_STATUS64 fails with ENXIO,
 * i.e. no backing file is attached. A successful status query means the
 * device is already in use, and any other error means the entry cannot be
 * used as a loop device; both are skipped.
 *
 * @loop_name: buffer of at least LO_NAME_SIZE bytes that receives the
 *             "/dev/<name>" path of the device that was picked
 *
 * Returns an open read-write file descriptor for the device, or -1 if none
 * was found.
 */
static int lxc_get_unused_loop_dev_legacy(char *loop_name)
{
        struct dirent *dp;
        struct loop_info64 lo64;
        DIR *dir;
        int dfd, fd = -1, ret;

        dir = opendir("/dev");
        if (!dir)
                return -1;

        dfd = dirfd(dir);
        if (dfd < 0) {
                closedir(dir);
                return -1;
        }

        while ((dp = readdir(dir))) {
                if (strncmp(dp->d_name, "loop", 4) != 0)
                        continue;

                fd = openat(dfd, dp->d_name, O_RDWR);
                if (fd < 0)
                        continue;

                /* Only ENXIO tells us that nothing is attached yet. */
                ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
                if (ret < 0 && errno == ENXIO) {
                        ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
                        if (ret >= 0 && ret < LO_NAME_SIZE)
                                break;
                }

                /* In use, not a loop device, or name too long: try the next. */
                close(fd);
                fd = -1;
        }

        closedir(dir);

        return fd;
}
1450
/*
 * Ask /dev/loop-control for a free loop device and open it.
 *
 * @name_loop: buffer of at least LO_NAME_SIZE bytes that receives the
 *             "/dev/loop<N>" path
 *
 * Returns an open file descriptor for the loop device, -ENODEV if
 * /dev/loop-control could not be opened, and -1 on any other failure.
 */
static int lxc_get_unused_loop_dev(char *name_loop)
{
        int ctl_fd, free_nr, len;
        int loop_fd = -1;

        ctl_fd = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
        if (ctl_fd < 0)
                return -ENODEV;

        free_nr = ioctl(ctl_fd, LOOP_CTL_GET_FREE);
        if (free_nr >= 0) {
                len = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", free_nr);
                if (len >= 0 && len < LO_NAME_SIZE)
                        loop_fd = open(name_loop, O_RDWR | O_CLOEXEC);
        }

        /* The control device is only needed for the lookup itself. */
        close(ctl_fd);
        return loop_fd;
}
1476
/*
 * Attach the image @source to a free loop device.
 *
 * The loop device is looked up through /dev/loop-control first; only when
 * that control device is unavailable (-ENODEV) is /dev scanned instead.
 *
 * @source:   path of the backing file, opened read-write
 * @loop_dev: buffer of at least LO_NAME_SIZE bytes that receives the path
 *            of the loop device
 * @flags:    stored in lo_flags of the loop_info64 passed to
 *            LOOP_SET_STATUS64
 *
 * Returns an open file descriptor for the configured loop device, or -1 on
 * failure. On failure errno is that of the step that failed, and the image
 * is detached again if it had already been attached.
 */
int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
{
        int ret, saved_errno;
        struct loop_info64 lo64;
        int fd_img = -1, fd_loop = -1;

        fd_loop = lxc_get_unused_loop_dev(loop_dev);
        if (fd_loop == -ENODEV)
                fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);

        /* Without a loop device there is nothing to attach the image to. */
        if (fd_loop < 0)
                return -1;

        fd_img = open(source, O_RDWR | O_CLOEXEC);
        if (fd_img < 0)
                goto on_error;

        ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
        if (ret < 0)
                goto on_error;

        memset(&lo64, 0, sizeof(lo64));
        lo64.lo_flags = flags;

        ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
        if (ret < 0) {
                /* The image is attached already; closing the descriptor
                 * alone would leave the loop device bound to it.
                 */
                saved_errno = errno;
                (void)ioctl(fd_loop, LOOP_CLR_FD, 0);
                errno = saved_errno;
                goto on_error;
        }

        /* The loop device holds its own reference to the image now. */
        close(fd_img);
        return fd_loop;

on_error:
        saved_errno = errno;
        if (fd_img >= 0)
                close(fd_img);

        close(fd_loop);
        errno = saved_errno;

        return -1;
}
1519
/*
 * Unmount everything stacked on @path, one mount at a time.
 *
 * @lazy selects MNT_DETACH for every umount2() call.
 *
 * Returns the number of successful unmounts (capped at INT_MAX), or -errno
 * if umount2() failed with anything but EINVAL.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
        int flags = lazy ? MNT_DETACH : 0;
        int umounts = 0;

        /* Keep unmounting for as long as it succeeds: another mountpoint may
         * be stacked underneath the one just removed.
         */
        while (umount2(path, flags) >= 0) {
                /* Just stop counting when this happens. That'd just be so
                 * stupid that we won't even bother trying to report back the
                 * correct value anymore.
                 */
                if (umounts != INT_MAX)
                        umounts++;
        }

        /* We consider anything else than EINVAL deadly to prevent going
         * into an infinite loop. (The other alternative is constantly
         * parsing /proc/self/mountinfo which is yucky and probably
         * racy.)
         */
        if (errno != EINVAL)
                return -errno;

        return umounts;
}
1551
/*
 * Run @child_fn(@args) in a child process and capture its output.
 *
 * The child's stdout and stderr are redirected into a pipe. If @buf is
 * non-NULL and @buf_size is non-zero, the parent performs a single read of
 * up to @buf_size - 1 bytes into @buf and overwrites the last byte read
 * with '\0'. @buf always starts out as the empty string, so callers never
 * see uninitialized memory.
 *
 * @child_fn is expected not to return; if it does, the child logs an error
 * and exits with EXIT_FAILURE.
 *
 * Returns -1 if the pipe or the child could not be created, otherwise the
 * result of wait_for_pid() for the child.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        int pipefd[2];
        int status;
        pid_t pid;
        bool capture = buf_size > 0 && buf;

        /* Make sure our callers do not receive uninitialized memory. */
        if (capture)
                buf[0] = '\0';

        if (pipe(pipefd) < 0) {
                SYSERROR("failed to create pipe");
                return -1;
        }

        pid = lxc_raw_clone(0);
        if (pid < 0) {
                close(pipefd[0]);
                close(pipefd[1]);
                SYSERROR("failed to create new process");
                return -1;
        }

        if (pid == 0) {
                int dup_ret;

                /* The child only writes. */
                close(pipefd[0]);

                /* Point stdout and then stderr at the write-end. */
                dup_ret = dup2(pipefd[1], STDOUT_FILENO);
                if (dup_ret >= 0)
                        dup_ret = dup2(pipefd[1], STDERR_FILENO);

                /* The duplicates keep the pipe open; drop the original. */
                close(pipefd[1]);

                if (dup_ret < 0) {
                        SYSERROR("failed to duplicate std{err,out} file descriptor");
                        _exit(EXIT_FAILURE);
                }

                /* Does not return. */
                child_fn(args);
                ERROR("failed to exec command");
                _exit(EXIT_FAILURE);
        }

        /* The parent only reads. */
        close(pipefd[1]);

        if (capture) {
                ssize_t nread;

                nread = lxc_read_nointr(pipefd[0], buf, buf_size - 1);
                if (nread > 0)
                        buf[nread - 1] = '\0';
        }

        status = wait_for_pid(pid);
        close(pipefd[0]);

        return status;
}
1615
/*
 * Check whether the network interface @nic exists by stat()ing
 * /sys/class/net/<nic>.
 *
 * The special name "none" is always reported as existing. Names that do
 * not fit into the path buffer are reported as missing.
 */
bool lxc_nic_exists(char *nic)
{
#define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
        char path[__LXC_SYS_CLASS_NET_LEN];
        struct stat sb;
        int len;

        if (strcmp(nic, "none") == 0)
                return true;

        len = snprintf(path, __LXC_SYS_CLASS_NET_LEN, "/sys/class/net/%s", nic);
        if (len < 0 || (size_t)len >= __LXC_SYS_CLASS_NET_LEN)
                return false;

        return stat(path, &sb) >= 0;
}
1636
/*
 * Round @n up to the next power of two.
 *
 * Returns @n itself if it already is a power of two. Returns 0 for n == 0
 * (0 is neither valid input nor a valid power of two) and also when the
 * result would not fit into 64 bits, because the final shift wraps to 0.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
        uint64_t msb;

        if (n == 0)
                return 0;

        /* Clear the lowest set bit until only the highest one is left. */
        for (msb = n; (msb & (msb - 1)) != 0; msb &= msb - 1)
                ;

        /* Nothing was cleared: n was a power of two to begin with. */
        if (msb == n)
                return n;

        return msb << 1;
}
1654
/*
 * Request that @signal be delivered to the caller when its parent dies.
 *
 * After arming PR_SET_PDEATHSIG the current parent pid is compared with
 * @parent; if they differ the caller sends itself SIGKILL.
 *
 * Returns 0 on success and -1 on failure.
 */
int lxc_set_death_signal(int signal, pid_t parent)
{
        int ret;

        ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
                    prctl_arg(0), prctl_arg(0));

        /* Check whether we have been orphaned. */
        if ((pid_t)syscall(SYS_getppid) != parent) {
                ret = raise(SIGKILL);
                if (ret < 0)
                        return -1;
        }

        if (ret >= 0)
                return 0;

        SYSERROR("Failed to set PR_SET_PDEATHSIG to %d", signal);
        return -1;
}
1678
/*
 * Set or clear the close-on-exec flag of @fd.
 *
 * @cloexec: true to set FD_CLOEXEC, false to clear it
 *
 * Returns 0 on success (including when the flag already has the requested
 * state) and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
        int cur, want;

        cur = fcntl(fd, F_GETFD, 0);
        if (cur < 0)
                return -errno;

        want = cloexec ? (cur | FD_CLOEXEC) : (cur & ~FD_CLOEXEC);

        /* Only touch the descriptor when the flags actually change. */
        if (want != cur && fcntl(fd, F_SETFD, want) < 0)
                return -errno;

        return 0;
}
1700
/*
 * Remove the directory tree rooted at @dirname.
 *
 * Every subdirectory is removed recursively and @dirname itself is removed
 * with rmdir(). Entries that are not directories are left alone, so the
 * final rmdir() fails for trees that still contain such entries.
 *
 * Returns 0 on success and -1 if the directory could not be opened or any
 * step failed. Only the first failure within a directory is logged.
 */
int recursive_destroy(char *dirname)
{
        int ret;
        struct dirent *direntp;
        DIR *dir;
        int r = 0;

        dir = opendir(dirname);
        if (!dir)
                return -1;

        while ((direntp = readdir(dir))) {
                char *pathname;
                struct stat mystat;

                if (!strcmp(direntp->d_name, ".") ||
                    !strcmp(direntp->d_name, ".."))
                        continue;

                pathname = must_make_path(dirname, direntp->d_name, NULL);

                /* lstat() so that symlinks are never followed into. */
                ret = lstat(pathname, &mystat);
                if (ret < 0) {
                        if (!r)
                                WARN("Failed to stat \"%s\"", pathname);

                        r = -1;
                        goto next;
                }

                if (!S_ISDIR(mystat.st_mode))
                        goto next;

                ret = recursive_destroy(pathname);
                if (ret < 0)
                        r = -1;

        next:
                free(pathname);
        }

        ret = rmdir(dirname);
        if (ret < 0) {
                if (!r)
                        SYSWARN("Failed to delete \"%s\"", dirname);

                r = -1;
        }

        ret = closedir(dir);
        if (ret < 0) {
                /* This is a closedir() failure, not a deletion failure. */
                if (!r)
                        SYSWARN("Failed to close directory \"%s\"", dirname);

                r = -1;
        }

        return r;
}
1760
1761 int lxc_setup_keyring(void)
1762 {
1763         key_serial_t keyring;
1764         int ret = 0;
1765
1766         /* Try to allocate a new session keyring for the container to prevent
1767          * information leaks.
1768          */
1769         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
1770                          prctl_arg(0), prctl_arg(0), prctl_arg(0));
1771         if (keyring < 0) {
1772                 switch (errno) {
1773                 case ENOSYS:
1774                         DEBUG("The keyctl() syscall is not supported or blocked");
1775                         break;
1776                 case EACCES:
1777                         __fallthrough;
1778                 case EPERM:
1779                         DEBUG("Failed to access kernel keyring. Continuing...");
1780                         break;
1781                 default:
1782                         SYSERROR("Failed to create kernel keyring");
1783                         ret = -1;
1784                         break;
1785                 }
1786         }
1787
1788         return ret;
1789 }