src/lxc/utils.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #ifndef _GNU_SOURCE
  25 #define _GNU_SOURCE 1
  26 #endif
  27 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
  28 #include <ctype.h>
  29 #include <dirent.h>
  30 #include <errno.h>
  31 #include <fcntl.h>
  32 #include <grp.h>
  33 #include <inttypes.h>
  34 #include <libgen.h>
  35 #include <pthread.h>
  36 #include <stddef.h>
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <sys/mman.h>
  41 #include <sys/mount.h>
  42 #include <sys/param.h>
  43 #include <sys/prctl.h>
  44 #include <sys/stat.h>
  45 #include <sys/types.h>
  46 #include <sys/wait.h>
  47 #include <unistd.h>
  48
  49 #include "config.h"
  50 #include "log.h"
  51 #include "lxclock.h"
  52 #include "namespace.h"
  53 #include "parse.h"
  54 #include "raw_syscalls.h"
  55 #include "syscall_wrappers.h"
  56 #include "utils.h"
  57
  58 #ifndef HAVE_STRLCPY
  59 #include "include/strlcpy.h"
  60 #endif
  61
  62 #ifndef HAVE_STRLCAT
  63 #include "include/strlcat.h"
  64 #endif
  65
  66 #ifndef O_PATH
  67 #define O_PATH      010000000
  68 #endif
  69
  70 #ifndef O_NOFOLLOW
  71 #define O_NOFOLLOW  00400000
  72 #endif
  73
  74 lxc_log_define(utils, lxc);
  75
  76 /*
  77  * if path is btrfs, tries to remove it and any subvolumes beneath it
  78  */
  79 extern bool btrfs_try_remove_subvol(const char *path);
  80
  81 static int _recursive_rmdir(const char *dirname, dev_t pdev,
  82                             const char *exclude, int level, bool onedev)
  83 {
  84         struct dirent *direntp;
  85         DIR *dir;
  86         int ret, failed=0;
  87         char pathname[MAXPATHLEN];
  88         bool hadexclude = false;
  89
  90         dir = opendir(dirname);
  91         if (!dir) {
  92                 ERROR("failed to open %s", dirname);
  93                 return -1;
  94         }
  95
  96         while ((direntp = readdir(dir))) {
  97                 struct stat mystat;
  98                 int rc;
  99
 100                 if (!strcmp(direntp->d_name, ".") ||
 101                     !strcmp(direntp->d_name, ".."))
 102                         continue;
 103
 104                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 105                 if (rc < 0 || rc >= MAXPATHLEN) {
 106                         ERROR("pathname too long");
 107                         failed=1;
 108                         continue;
 109                 }
 110
 111                 if (!level && exclude && !strcmp(direntp->d_name, exclude)) {
 112                         ret = rmdir(pathname);
 113                         if (ret < 0) {
 114                                 switch(errno) {
 115                                 case ENOTEMPTY:
 116                                         INFO("Not deleting snapshot %s", pathname);
 117                                         hadexclude = true;
 118                                         break;
 119                                 case ENOTDIR:
 120                                         ret = unlink(pathname);
 121                                         if (ret)
 122                                                 INFO("Failed to remove %s", pathname);
 123                                         break;
 124                                 default:
 125                                         SYSERROR("Failed to rmdir %s", pathname);
 126                                         failed = 1;
 127                                         break;
 128                                 }
 129                         }
 130                         continue;
 131                 }
 132
 133                 ret = lstat(pathname, &mystat);
 134                 if (ret) {
 135                         ERROR("Failed to stat %s", pathname);
 136                         failed = 1;
 137                         continue;
 138                 }
 139
 140                 if (onedev && mystat.st_dev != pdev) {
 141                         /* TODO should we be checking /proc/self/mountinfo for
 142                          * pathname and not doing this if found? */
 143                         if (btrfs_try_remove_subvol(pathname))
 144                                 INFO("Removed btrfs subvolume at %s\n", pathname);
 145                         continue;
 146                 }
 147
 148                 if (S_ISDIR(mystat.st_mode)) {
 149                         if (_recursive_rmdir(pathname, pdev, exclude, level+1, onedev) < 0)
 150                                 failed=1;
 151                 } else {
 152                         if (unlink(pathname) < 0) {
 153                                 SYSERROR("Failed to delete %s", pathname);
 154                                 failed=1;
 155                         }
 156                 }
 157         }
 158
 159         if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) {
 160                 ERROR("Failed to delete %s", dirname);
 161                 failed=1;
 162         }
 163
 164         ret = closedir(dir);
 165         if (ret) {
 166                 ERROR("Failed to close directory %s", dirname);
 167                 failed=1;
 168         }
 169
 170         return failed ? -1 : 0;
 171 }
 172
 173 /* In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
 174  * lxc_rmdir_onedev()
 175  */
 176 static bool is_native_overlayfs(const char *path)
 177 {
 178         if (has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
 179             has_fs_type(path, OVERLAYFS_SUPER_MAGIC))
 180                 return true;
 181
 182         return false;
 183 }
 184
 185 /* returns 0 on success, -1 if there were any failures */
 186 extern int lxc_rmdir_onedev(const char *path, const char *exclude)
 187 {
 188         struct stat mystat;
 189         bool onedev = true;
 190
 191         if (is_native_overlayfs(path))
 192                 onedev = false;
 193
 194         if (lstat(path, &mystat) < 0) {
 195                 if (errno == ENOENT)
 196                         return 0;
 197
 198                 ERROR("Failed to stat %s", path);
 199                 return -1;
 200         }
 201
 202         return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev);
 203 }
 204
 205 /* borrowed from iproute2 */
 206 extern int get_u16(unsigned short *val, const char *arg, int base)
 207 {
 208         unsigned long res;
 209         char *ptr;
 210
 211         if (!arg || !*arg)
 212                 return -1;
 213
 214         errno = 0;
 215         res = strtoul(arg, &ptr, base);
 216         if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0)
 217                 return -1;
 218
 219         *val = res;
 220
 221         return 0;
 222 }
 223
 224 extern int mkdir_p(const char *dir, mode_t mode)
 225 {
 226         const char *tmp = dir;
 227         const char *orig = dir;
 228         char *makeme;
 229
 230         do {
 231                 dir = tmp + strspn(tmp, "/");
 232                 tmp = dir + strcspn(dir, "/");
 233
 234                 makeme = strndup(orig, dir - orig);
 235                 if (*makeme) {
 236                         if (mkdir(makeme, mode) && errno != EEXIST) {
 237                                 SYSERROR("failed to create directory '%s'", makeme);
 238                                 free(makeme);
 239                                 return -1;
 240                         }
 241                 }
 242                 free(makeme);
 243         } while(tmp != dir);
 244
 245         return 0;
 246 }
 247
 248 char *get_rundir()
 249 {
 250         char *rundir;
 251         const char *homedir;
 252         struct stat sb;
 253
 254         if (stat(RUNTIME_PATH, &sb) < 0)
 255                 return NULL;
 256
 257         if (geteuid() == sb.st_uid || getegid() == sb.st_gid) {
 258                 rundir = strdup(RUNTIME_PATH);
 259                 return rundir;
 260         }
 261
 262         rundir = getenv("XDG_RUNTIME_DIR");
 263         if (rundir) {
 264                 rundir = strdup(rundir);
 265                 return rundir;
 266         }
 267
 268         INFO("XDG_RUNTIME_DIR isn't set in the environment.");
 269         homedir = getenv("HOME");
 270         if (!homedir) {
 271                 ERROR("HOME isn't set in the environment.");
 272                 return NULL;
 273         }
 274
 275         rundir = malloc(sizeof(char) * (17 + strlen(homedir)));
 276         if (!rundir)
 277                 return NULL;
 278
 279         sprintf(rundir, "%s/.cache/lxc/run/", homedir);
 280
 281         return rundir;
 282 }
 283
 284 int wait_for_pid(pid_t pid)
 285 {
 286         int status, ret;
 287
 288 again:
 289         ret = waitpid(pid, &status, 0);
 290         if (ret == -1) {
 291                 if (errno == EINTR)
 292                         goto again;
 293
 294                 return -1;
 295         }
 296
 297         if (ret != pid)
 298                 goto again;
 299
 300         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
 301                 return -1;
 302
 303         return 0;
 304 }
 305
 306 int lxc_wait_for_pid_status(pid_t pid)
 307 {
 308         int status, ret;
 309
 310 again:
 311         ret = waitpid(pid, &status, 0);
 312         if (ret == -1) {
 313                 if (errno == EINTR)
 314                         goto again;
 315
 316                 return -1;
 317         }
 318
 319         if (ret != pid)
 320                 goto again;
 321
 322         return status;
 323 }
 324
 325 #if HAVE_LIBGNUTLS
 326 #include <gnutls/gnutls.h>
 327 #include <gnutls/crypto.h>
 328
 329 __attribute__((constructor))
 330 static void gnutls_lxc_init(void)
 331 {
 332         gnutls_global_init();
 333 }
 334
 335 int sha1sum_file(char *fnam, unsigned char *digest)
 336 {
 337         char *buf;
 338         int ret;
 339         FILE *f;
 340         long flen;
 341
 342         if (!fnam)
 343                 return -1;
 344
 345         f = fopen_cloexec(fnam, "r");
 346         if (!f) {
 347                 SYSERROR("Error opening template");
 348                 return -1;
 349         }
 350
 351         if (fseek(f, 0, SEEK_END) < 0) {
 352                 SYSERROR("Error seeking to end of template");
 353                 fclose(f);
 354                 return -1;
 355         }
 356
 357         if ((flen = ftell(f)) < 0) {
 358                 SYSERROR("Error telling size of template");
 359                 fclose(f);
 360                 return -1;
 361         }
 362
 363         if (fseek(f, 0, SEEK_SET) < 0) {
 364                 SYSERROR("Error seeking to start of template");
 365                 fclose(f);
 366                 return -1;
 367         }
 368
 369         if ((buf = malloc(flen+1)) == NULL) {
 370                 SYSERROR("Out of memory");
 371                 fclose(f);
 372                 return -1;
 373         }
 374
 375         if (fread(buf, 1, flen, f) != flen) {
 376                 SYSERROR("Failure reading template");
 377                 free(buf);
 378                 fclose(f);
 379                 return -1;
 380         }
 381
 382         if (fclose(f) < 0) {
 383                 SYSERROR("Failre closing template");
 384                 free(buf);
 385                 return -1;
 386         }
 387
 388         buf[flen] = '\0';
 389         ret = gnutls_hash_fast(GNUTLS_DIG_SHA1, buf, flen, (void *)digest);
 390         free(buf);
 391         return ret;
 392 }
 393 #endif
 394
 395 struct lxc_popen_FILE *lxc_popen(const char *command)
 396 {
 397         int ret;
 398         int pipe_fds[2];
 399         pid_t child_pid;
 400         struct lxc_popen_FILE *fp = NULL;
 401
 402         ret = pipe2(pipe_fds, O_CLOEXEC);
 403         if (ret < 0)
 404                 return NULL;
 405
 406         child_pid = fork();
 407         if (child_pid < 0)
 408                 goto on_error;
 409
 410         if (!child_pid) {
 411                 sigset_t mask;
 412
 413                 close(pipe_fds[0]);
 414
 415                 /* duplicate stdout */
 416                 if (pipe_fds[1] != STDOUT_FILENO)
 417                         ret = dup2(pipe_fds[1], STDOUT_FILENO);
 418                 else
 419                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 420                 if (ret < 0) {
 421                         close(pipe_fds[1]);
 422                         _exit(EXIT_FAILURE);
 423                 }
 424
 425                 /* duplicate stderr */
 426                 if (pipe_fds[1] != STDERR_FILENO)
 427                         ret = dup2(pipe_fds[1], STDERR_FILENO);
 428                 else
 429                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 430                 close(pipe_fds[1]);
 431                 if (ret < 0)
 432                         _exit(EXIT_FAILURE);
 433
 434                 /* unblock all signals */
 435                 ret = sigfillset(&mask);
 436                 if (ret < 0)
 437                         _exit(EXIT_FAILURE);
 438
 439                 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
 440                 if (ret < 0)
 441                         _exit(EXIT_FAILURE);
 442
 443                 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
 444                 _exit(127);
 445         }
 446
 447         close(pipe_fds[1]);
 448         pipe_fds[1] = -1;
 449
 450         fp = malloc(sizeof(*fp));
 451         if (!fp)
 452                 goto on_error;
 453
 454         memset(fp, 0, sizeof(*fp));
 455
 456         fp->child_pid = child_pid;
 457         fp->pipe = pipe_fds[0];
 458
 459         /* From now on, closing fp->f will also close fp->pipe. So only ever
 460          * call fclose(fp->f).
 461          */
 462         fp->f = fdopen(pipe_fds[0], "r");
 463         if (!fp->f)
 464                 goto on_error;
 465
 466         return fp;
 467
 468 on_error:
 469         /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
 470          * called yet. Otherwise the fd belongs to the file opened by fdopen()
 471          * since it isn't dup()ed.
 472          */
 473         if (fp && !fp->f && pipe_fds[0] >= 0)
 474                 close(pipe_fds[0]);
 475
 476         if (pipe_fds[1] >= 0)
 477                 close(pipe_fds[1]);
 478
 479         if (fp && fp->f)
 480                 fclose(fp->f);
 481
 482         if (fp)
 483                 free(fp);
 484
 485         return NULL;
 486 }
 487
 488 int lxc_pclose(struct lxc_popen_FILE *fp)
 489 {
 490         pid_t wait_pid;
 491         int wstatus = 0;
 492
 493         if (!fp)
 494                 return -1;
 495
 496         do {
 497                 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
 498         } while (wait_pid < 0 && errno == EINTR);
 499
 500         fclose(fp->f);
 501         free(fp);
 502
 503         if (wait_pid < 0)
 504                 return -1;
 505
 506         return wstatus;
 507 }
 508
 509 int randseed(bool srand_it)
 510 {
 511         /*
 512            srand pre-seed function based on /dev/urandom
 513            */
 514         unsigned int seed = time(NULL) + getpid();
 515
 516         FILE *f;
 517         f = fopen("/dev/urandom", "r");
 518         if (f) {
 519                 int ret = fread(&seed, sizeof(seed), 1, f);
 520                 if (ret != 1)
 521                         SYSDEBUG("unable to fread /dev/urandom, fallback to time+pid rand seed");
 522
 523                 fclose(f);
 524         }
 525
 526         if (srand_it)
 527                 srand(seed);
 528
 529         return seed;
 530 }
 531
 532 uid_t get_ns_uid(uid_t orig)
 533 {
 534         char *line = NULL;
 535         size_t sz = 0;
 536         uid_t nsid, hostid, range;
 537         FILE *f = fopen("/proc/self/uid_map", "r");
 538         if (!f)
 539                 return 0;
 540
 541         while (getline(&line, &sz, f) != -1) {
 542                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 543                         continue;
 544
 545                 if (hostid <= orig && hostid + range > orig) {
 546                         nsid += orig - hostid;
 547                         goto found;
 548                 }
 549         }
 550
 551         nsid = LXC_INVALID_UID;
 552
 553 found:
 554         fclose(f);
 555         free(line);
 556         return nsid;
 557 }
 558
 559 gid_t get_ns_gid(gid_t orig)
 560 {
 561         char *line = NULL;
 562         size_t sz = 0;
 563         gid_t nsid, hostid, range;
 564         FILE *f = fopen("/proc/self/gid_map", "r");
 565         if (!f)
 566                 return 0;
 567
 568         while (getline(&line, &sz, f) != -1) {
 569                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 570                         continue;
 571
 572                 if (hostid <= orig && hostid + range > orig) {
 573                         nsid += orig - hostid;
 574                         goto found;
 575                 }
 576         }
 577
 578         nsid = LXC_INVALID_GID;
 579
 580 found:
 581         fclose(f);
 582         free(line);
 583         return nsid;
 584 }
 585
 586 bool dir_exists(const char *path)
 587 {
 588         struct stat sb;
 589         int ret;
 590
 591         ret = stat(path, &sb);
 592         if (ret < 0)
 593                 /* Could be something other than eexist, just say "no". */
 594                 return false;
 595
 596         return S_ISDIR(sb.st_mode);
 597 }
 598
 599 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 600  * FNV has good anti collision properties and we're not worried
 601  * about pre-image resistance or one-way-ness, we're just trying to make
 602  * the name unique in the 108 bytes of space we have.
 603  */
 604 uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
 605 {
 606         unsigned char *bp;
 607
 608         for(bp = buf; bp < (unsigned char *)buf + len; bp++)
 609         {
 610                 /* xor the bottom with the current octet */
 611                 hval ^= (uint64_t)*bp;
 612
 613                 /* gcc optimised:
 614                  * multiply by the 64 bit FNV magic prime mod 2^64
 615                  */
 616                 hval += (hval << 1) + (hval << 4) + (hval << 5) +
 617                         (hval << 7) + (hval << 8) + (hval << 40);
 618         }
 619
 620         return hval;
 621 }
 622
 623 bool is_shared_mountpoint(const char *path)
 624 {
 625         char buf[LXC_LINELEN];
 626         FILE *f;
 627         int i;
 628         char *p, *p2;
 629
 630         f = fopen("/proc/self/mountinfo", "r");
 631         if (!f)
 632                 return 0;
 633
 634         while (fgets(buf, LXC_LINELEN, f)) {
 635                 for (p = buf, i = 0; p && i < 4; i++)
 636                         p = strchr(p + 1, ' ');
 637                 if (!p)
 638                         continue;
 639
 640                 p2 = strchr(p + 1, ' ');
 641                 if (!p2)
 642                         continue;
 643
 644                 *p2 = '\0';
 645                 if (strcmp(p + 1, path) == 0) {
 646                         /* This is the path. Is it shared? */
 647                         p = strchr(p2 + 1, ' ');
 648                         if (p && strstr(p, "shared:")) {
 649                                 fclose(f);
 650                                 return true;
 651                         }
 652                 }
 653         }
 654
 655         fclose(f);
 656         return false;
 657 }
 658
 659 /*
 660  * Detect whether / is mounted MS_SHARED.  The only way I know of to
 661  * check that is through /proc/self/mountinfo.
 662  * I'm only checking for /.  If the container rootfs or mount location
 663  * is MS_SHARED, but not '/', then you're out of luck - figuring that
 664  * out would be too much work to be worth it.
 665  */
 666 int detect_shared_rootfs(void)
 667 {
 668         if (is_shared_mountpoint("/"))
 669                 return 1;
 670         return 0;
 671 }
 672
 673 bool switch_to_ns(pid_t pid, const char *ns)
 674 {
 675         int fd, ret;
 676         char nspath[MAXPATHLEN];
 677
 678         /* Switch to new ns */
 679         ret = snprintf(nspath, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns);
 680         if (ret < 0 || ret >= MAXPATHLEN)
 681                 return false;
 682
 683         fd = open(nspath, O_RDONLY);
 684         if (fd < 0) {
 685                 SYSERROR("Failed to open %s", nspath);
 686                 return false;
 687         }
 688
 689         ret = setns(fd, 0);
 690         if (ret) {
 691                 SYSERROR("Failed to set process %d to %s of %d.", pid, ns, fd);
 692                 close(fd);
 693                 return false;
 694         }
 695
 696         close(fd);
 697         return true;
 698 }
 699
 700 /*
 701  * looking at fs/proc_namespace.c, it appears we can
 702  * actually expect the rootfs entry to very specifically contain
 703  * " - rootfs rootfs "
 704  * IIUC, so long as we've chrooted so that rootfs is not our root,
 705  * the rootfs entry should always be skipped in mountinfo contents.
 706  */
 707 bool detect_ramfs_rootfs(void)
 708 {
 709         FILE *f;
 710         char *p, *p2;
 711         char *line = NULL;
 712         size_t len = 0;
 713         int i;
 714
 715         f = fopen("/proc/self/mountinfo", "r");
 716         if (!f)
 717                 return false;
 718
 719         while (getline(&line, &len, f) != -1) {
 720                 for (p = line, i = 0; p && i < 4; i++)
 721                         p = strchr(p + 1, ' ');
 722                 if (!p)
 723                         continue;
 724
 725                 p2 = strchr(p + 1, ' ');
 726                 if (!p2)
 727                         continue;
 728
 729                 *p2 = '\0';
 730                 if (strcmp(p + 1, "/") == 0) {
 731                         /* This is '/'. Is it the ramfs? */
 732                         p = strchr(p2 + 1, '-');
 733                         if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
 734                                 free(line);
 735                                 fclose(f);
 736                                 INFO("Rootfs is located on ramfs");
 737                                 return true;
 738                         }
 739                 }
 740         }
 741
 742         free(line);
 743         fclose(f);
 744         return false;
 745 }
 746
 747 char *on_path(const char *cmd, const char *rootfs)
 748 {
 749         char *entry = NULL, *path = NULL;
 750         char cmdpath[MAXPATHLEN];
 751         int ret;
 752
 753         path = getenv("PATH");
 754         if (!path)
 755                 return NULL;
 756
 757         path = strdup(path);
 758         if (!path)
 759                 return NULL;
 760
 761         lxc_iterate_parts (entry, path, ":") {
 762                 if (rootfs)
 763                         ret = snprintf(cmdpath, MAXPATHLEN, "%s/%s/%s", rootfs,
 764                                        entry, cmd);
 765                 else
 766                         ret = snprintf(cmdpath, MAXPATHLEN, "%s/%s", entry, cmd);
 767                 if (ret < 0 || ret >= MAXPATHLEN)
 768                         continue;
 769
 770                 if (access(cmdpath, X_OK) == 0) {
 771                         free(path);
 772                         return strdup(cmdpath);
 773                 }
 774         }
 775
 776         free(path);
 777         return NULL;
 778 }
 779
 780 bool cgns_supported(void)
 781 {
 782         return file_exists("/proc/self/ns/cgroup");
 783 }
 784
 785 /* historically lxc-init has been under /usr/lib/lxc and under
 786  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
 787  */
 788 char *choose_init(const char *rootfs)
 789 {
 790         char *retv = NULL;
 791         const char *empty = "",
 792                    *tmp;
 793         int ret, env_set = 0;
 794
 795         if (!getenv("PATH")) {
 796                 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
 797                         SYSERROR("Failed to setenv");
 798
 799                 env_set = 1;
 800         }
 801
 802         retv = on_path("init.lxc", rootfs);
 803
 804         if (env_set) {
 805                 if (unsetenv("PATH"))
 806                         SYSERROR("Failed to unsetenv");
 807         }
 808
 809         if (retv)
 810                 return retv;
 811
 812         retv = malloc(PATH_MAX);
 813         if (!retv)
 814                 return NULL;
 815
 816         if (rootfs)
 817                 tmp = rootfs;
 818         else
 819                 tmp = empty;
 820
 821         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
 822         if (ret < 0 || ret >= PATH_MAX) {
 823                 ERROR("pathname too long");
 824                 goto out1;
 825         }
 826
 827         if (access(retv, X_OK) == 0)
 828                 return retv;
 829
 830         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
 831         if (ret < 0 || ret >= PATH_MAX) {
 832                 ERROR("pathname too long");
 833                 goto out1;
 834         }
 835
 836         if (access(retv, X_OK) == 0)
 837                 return retv;
 838
 839         ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
 840         if (ret < 0 || ret >= PATH_MAX) {
 841                 ERROR("pathname too long");
 842                 goto out1;
 843         }
 844
 845         if (access(retv, X_OK) == 0)
 846                 return retv;
 847
 848         ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
 849         if (ret < 0 || ret >= PATH_MAX) {
 850                 ERROR("pathname too long");
 851                 goto out1;
 852         }
 853
 854         if (access(retv, X_OK) == 0)
 855                 return retv;
 856
 857         /*
 858          * Last resort, look for the statically compiled init.lxc which we
 859          * hopefully bind-mounted in.
 860          * If we are called during container setup, and we get to this point,
 861          * then the init.lxc.static from the host will need to be bind-mounted
 862          * in.  So we return NULL here to indicate that.
 863          */
 864         if (rootfs)
 865                 goto out1;
 866
 867         ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
 868         if (ret < 0 || ret >= PATH_MAX) {
 869                 WARN("Nonsense - name /lxc.init.static too long");
 870                 goto out1;
 871         }
 872
 873         if (access(retv, X_OK) == 0)
 874                 return retv;
 875
 876 out1:
 877         free(retv);
 878         return NULL;
 879 }
 880
 881 /*
 882  * Given the '-t' template option to lxc-create, figure out what to
 883  * do.  If the template is a full executable path, use that.  If it
 884  * is something like 'sshd', then return $templatepath/lxc-sshd.
 885  * On success return the template, on error return NULL.
 886  */
 887 char *get_template_path(const char *t)
 888 {
 889         int ret, len;
 890         char *tpath;
 891
 892         if (t[0] == '/' && access(t, X_OK) == 0) {
 893                 tpath = strdup(t);
 894                 return tpath;
 895         }
 896
 897         len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
 898
 899         tpath = malloc(len);
 900         if (!tpath)
 901                 return NULL;
 902
 903         ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
 904         if (ret < 0 || ret >= len) {
 905                 free(tpath);
 906                 return NULL;
 907         }
 908
 909         if (access(tpath, X_OK) < 0) {
 910                 SYSERROR("bad template: %s", t);
 911                 free(tpath);
 912                 return NULL;
 913         }
 914
 915         return tpath;
 916 }
 917
 918 /*
 919  * @path:    a pathname where / replaced with '\0'.
 920  * @offsetp: pointer to int showing which path segment was last seen.
 921  *           Updated on return to reflect the next segment.
 922  * @fulllen: full original path length.
 923  * Returns a pointer to the next path segment, or NULL if done.
 924  */
 925 static char *get_nextpath(char *path, int *offsetp, int fulllen)
 926 {
 927         int offset = *offsetp;
 928
 929         if (offset >= fulllen)
 930                 return NULL;
 931
 932         while (offset < fulllen && path[offset] != '\0')
 933                 offset++;
 934
 935         while (offset < fulllen && path[offset] == '\0')
 936                 offset++;
 937
 938         *offsetp = offset;
 939         return (offset < fulllen) ? &path[offset] : NULL;
 940 }
 941
 942 /*
 943  * Check that @subdir is a subdir of @dir.  @len is the length of
 944  * @dir (to avoid having to recalculate it).
 945  */
 946 static bool is_subdir(const char *subdir, const char *dir, size_t len)
 947 {
 948         size_t subdirlen = strlen(subdir);
 949
 950         if (subdirlen < len)
 951                 return false;
 952
 953         if (strncmp(subdir, dir, len) != 0)
 954                 return false;
 955
 956         if (dir[len-1] == '/')
 957                 return true;
 958
 959         if (subdir[len] == '/' || subdirlen == len)
 960                 return true;
 961
 962         return false;
 963 }
 964
 965 /*
 966  * Check if the open fd is a symlink.  Return -ELOOP if it is.  Return
 967  * -ENOENT if we couldn't fstat.  Return 0 if the fd is ok.
 968  */
 969 static int check_symlink(int fd)
 970 {
 971         struct stat sb;
 972         int ret;
 973
 974         ret = fstat(fd, &sb);
 975         if (ret < 0)
 976                 return -ENOENT;
 977
 978         if (S_ISLNK(sb.st_mode))
 979                 return -ELOOP;
 980
 981         return 0;
 982 }
 983
 984 /*
 985  * Open a file or directory, provided that it contains no symlinks.
 986  *
 987  * CAVEAT: This function must not be used for other purposes than container
 988  * setup before executing the container's init
 989  */
 990 static int open_if_safe(int dirfd, const char *nextpath)
 991 {
 992         int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
 993         if (newfd >= 0) /* Was not a symlink, all good. */
 994                 return newfd;
 995
 996         if (errno == ELOOP)
 997                 return newfd;
 998
 999         if (errno == EPERM || errno == EACCES) {
1000                 /* We're not root (cause we got EPERM) so try opening with
1001                  * O_PATH.
1002                  */
1003                 newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
1004                 if (newfd >= 0) {
1005                         /* O_PATH will return an fd for symlinks. We know
1006                          * nextpath wasn't a symlink at last openat, so if fd is
1007                          * now a link, then something * fishy is going on.
1008                          */
1009                         int ret = check_symlink(newfd);
1010                         if (ret < 0) {
1011                                 close(newfd);
1012                                 newfd = ret;
1013                         }
1014                 }
1015         }
1016
1017         return newfd;
1018 }
1019
/*
 * Open a path that is about to be used for mounting, walking it one
 * component at a time with open_if_safe() so that no symbolic link is
 * followed below @prefix_skip.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: leading part of @target in which symbolic links are ignored
 * (this would be the container's rootfs).  When NULL or empty the walk
 * starts at "/".
 *
 * Returns an open fd for the path, or <0 on error (-EINVAL if @target does
 * not start with @prefix_skip, -ENOMEM if the path could not be duplicated,
 * otherwise the negative result of the failed open).
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
        char *components = NULL;
        int fd, offset, pos;
        int targetlen = strlen(target);

        if (!prefix_skip || strlen(prefix_skip) == 0) {
                prefix_skip = "/";
                offset = 0;
        } else {
                /* make sure prefix-skip makes sense */
                offset = strlen(prefix_skip);
                if (!is_subdir(target, prefix_skip, offset)) {
                        ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
                                target, prefix_skip);
                        return -EINVAL;
                }

                /*
                 * get_nextpath() expects the offset to sit on a '/' (which
                 * is turned into \0 below) or before it, so step back by
                 * one to guarantee that.
                 */
                if (offset)
                        offset--;
        }

        /* Work on a private copy in which every '/' becomes a terminator. */
        components = strdup(target);
        if (!components) {
                SYSERROR("Out of memory checking for symbolic link");
                return -ENOMEM;
        }

        for (pos = 0; pos < targetlen; pos++) {
                if (components[pos] == '/')
                        components[pos] = '\0';
        }

        fd = open(prefix_skip, O_RDONLY);

        /* Descend until the path is exhausted or an open fails. */
        while (fd >= 0) {
                char *component;
                int nextfd, open_errno;

                component = get_nextpath(components, &offset, targetlen);
                if (!component)
                        break;

                nextfd = open_if_safe(fd, component);
                /* close() below may clobber errno, keep the interesting one. */
                open_errno = errno;
                close(fd);

                fd = nextfd;
                if (fd < 0) {
                        errno = open_errno;
                        if (errno == ELOOP)
                                SYSERROR("%s in %s was a symbolic link!", component, target);
                }
        }

        free(components);
        return fd;
}
1101
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs.  (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * The destination (and, for a relative bind mount, the source) is opened
 * with open_without_symlink() and the mount is then performed through the
 * corresponding /proc/self/fd/<fd> path.
 *
 * @src, @dest, @fstype, @flags, @data: passed on to mount(2)
 * @rootfs: prefix of @dest in which symbolic links are ignored
 *
 * Returns 0 on success.  On failure returns a negative value: the error
 * from open_without_symlink(), -EINVAL if a /proc/self/fd path could not be
 * formatted, or the result of the failed mount().
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
                unsigned long flags, const void *data, const char *rootfs)
{
        int destfd, ret, saved_errno;
        /* Only needs enough for /proc/self/fd/<fd>. */
        char srcbuf[50], destbuf[50];
        int srcfd = -1;
        const char *mntsrc = src;

        if (!rootfs)
                rootfs = "";

        /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
        if ((flags & MS_BIND) && src && src[0] != '/') {
                INFO("this is a relative bind mount");

                srcfd = open_without_symlink(src, NULL);
                if (srcfd < 0)
                        return srcfd;

                /* A return value equal to the buffer size already means the
                 * output was truncated, hence ">=".
                 */
                ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
                if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
                        close(srcfd);
                        ERROR("Out of memory");
                        return -EINVAL;
                }
                mntsrc = srcbuf;
        }

        destfd = open_without_symlink(dest, rootfs);
        if (destfd < 0) {
                if (srcfd != -1) {
                        /* Keep the errno of the failed open for the caller. */
                        saved_errno = errno;
                        close(srcfd);
                        errno = saved_errno;
                }

                return destfd;
        }

        ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
        if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
                if (srcfd != -1)
                        close(srcfd);

                close(destfd);
                ERROR("Out of memory");
                return -EINVAL;
        }

        ret = mount(mntsrc, destbuf, fstype, flags, data);
        /* The close() calls below must not hide mount()'s errno. */
        saved_errno = errno;
        if (srcfd != -1)
                close(srcfd);

        close(destfd);
        if (ret < 0) {
                errno = saved_errno;
                SYSERROR("Failed to mount %s onto %s", src ? src : "(null)", dest);
                return ret;
        }

        return 0;
}
1174
1175 /*
1176  * Mount a proc under @rootfs if proc self points to a pid other than
1177  * my own.  This is needed to have a known-good proc mount for setting
1178  * up LSMs both at container startup and attach.
1179  *
1180  * @rootfs : the rootfs where proc should be mounted
1181  *
1182  * Returns < 0 on failure, 0 if the correct proc was already mounted
1183  * and 1 if a new proc was mounted.
1184  *
1185  * NOTE: not to be called from inside the container namespace!
1186  */
1187 int lxc_mount_proc_if_needed(const char *rootfs)
1188 {
1189         char path[MAXPATHLEN];
1190         int link_to_pid, linklen, mypid, ret;
1191         char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
1192
1193         ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
1194         if (ret < 0 || ret >= MAXPATHLEN) {
1195                 SYSERROR("proc path name too long");
1196                 return -1;
1197         }
1198
1199         linklen = readlink(path, link, sizeof(link));
1200
1201         ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
1202         if (ret < 0 || ret >= MAXPATHLEN) {
1203                 SYSERROR("proc path name too long");
1204                 return -1;
1205         }
1206
1207         /* /proc not mounted */
1208         if (linklen < 0) {
1209                 if (mkdir(path, 0755) && errno != EEXIST)
1210                         return -1;
1211
1212                 goto domount;
1213         } else if (linklen >= sizeof(link)) {
1214                 link[linklen - 1] = '\0';
1215                 ERROR("readlink returned truncated content: \"%s\"", link);
1216                 return -1;
1217         }
1218
1219         mypid = lxc_raw_getpid();
1220         INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1221
1222         if (lxc_safe_int(link, &link_to_pid) < 0)
1223                 return -1;
1224
1225         /* correct procfs is already mounted */
1226         if (link_to_pid == mypid)
1227                 return 0;
1228
1229         ret = umount2(path, MNT_DETACH);
1230         if (ret < 0)
1231                 WARN("failed to umount \"%s\" with MNT_DETACH", path);
1232
1233 domount:
1234         /* rootfs is NULL */
1235         if (!strcmp(rootfs, ""))
1236                 ret = mount("proc", path, "proc", 0, NULL);
1237         else
1238                 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
1239         if (ret < 0)
1240                 return -1;
1241
1242         INFO("mounted /proc in container for security transition");
1243         return 1;
1244 }
1245
/*
 * Open /dev/null for reading and writing.
 *
 * Returns the new fd, or the negative result of open() after logging the
 * failure.
 */
int open_devnull(void)
{
        int nullfd;

        nullfd = open("/dev/null", O_RDWR);
        if (nullfd >= 0)
                return nullfd;

        SYSERROR("Can't open /dev/null");
        return nullfd;
}
1255
/*
 * Duplicate @fd onto stdin, stdout and stderr (in that order).
 *
 * Returns 0 on success and -1 if @fd is negative or any dup2() fails.
 * @fd itself is left open.
 */
int set_stdfds(int fd)
{
        static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
        size_t i;

        if (fd < 0)
                return -1;

        for (i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
                if (dup2(fd, std_fds[i]) < 0)
                        return -1;
        }

        return 0;
}
1277
/*
 * Point stdin, stdout and stderr at /dev/null.
 *
 * Returns the result of set_stdfds(), or -1 if /dev/null could not be
 * opened.
 */
int null_stdfds(void)
{
        int nullfd, ret;

        nullfd = open_devnull();
        if (nullfd < 0)
                return -1;

        ret = set_stdfds(nullfd);
        /* The standard fds hold their own duplicates now. */
        close(nullfd);

        return ret;
}
1291
1292 /* Check whether a signal is blocked by a process. */
1293 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1294 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1295 bool task_blocks_signal(pid_t pid, int signal)
1296 {
1297         int ret;
1298         char status[__PROC_STATUS_LEN];
1299         FILE *f;
1300         uint64_t sigblk = 0, one = 1;
1301         size_t n = 0;
1302         bool bret = false;
1303         char *line = NULL;
1304
1305         ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1306         if (ret < 0 || ret >= __PROC_STATUS_LEN)
1307                 return bret;
1308
1309         f = fopen(status, "r");
1310         if (!f)
1311                 return bret;
1312
1313         while (getline(&line, &n, f) != -1) {
1314                 char *numstr;
1315
1316                 if (strncmp(line, "SigBlk:", 7))
1317                         continue;
1318
1319                 numstr = lxc_trim_whitespace_in_place(line + 7);
1320                 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1321                 if (ret < 0)
1322                         goto out;
1323
1324                 break;
1325         }
1326
1327         if (sigblk & (one << (signal - 1)))
1328                 bret = true;
1329
1330 out:
1331         free(line);
1332         fclose(f);
1333         return bret;
1334 }
1335
1336 int lxc_preserve_ns(const int pid, const char *ns)
1337 {
1338         int ret;
1339 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
1340 #define __NS_PATH_LEN 50
1341         char path[__NS_PATH_LEN];
1342
1343         /* This way we can use this function to also check whether namespaces
1344          * are supported by the kernel by passing in the NULL or the empty
1345          * string.
1346          */
1347         ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
1348                        !ns || strcmp(ns, "") == 0 ? "" : "/",
1349                        !ns || strcmp(ns, "") == 0 ? "" : ns);
1350         if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
1351                 errno = EFBIG;
1352                 return -1;
1353         }
1354
1355         return open(path, O_RDONLY | O_CLOEXEC);
1356 }
1357
1358 bool lxc_switch_uid_gid(uid_t uid, gid_t gid)
1359 {
1360         int ret = 0;
1361
1362         if (gid != LXC_INVALID_GID) {
1363                 ret = setgid(gid);
1364                 if (ret < 0) {
1365                         SYSERROR("Failed to switch to gid %d", gid);
1366                         return false;
1367                 }
1368                 NOTICE("Switched to gid %d", gid);
1369         }
1370
1371         if (uid != LXC_INVALID_UID) {
1372                 ret = setuid(uid);
1373                 if (ret < 0) {
1374                         SYSERROR("Failed to switch to uid %d", uid);
1375                         return false;
1376                 }
1377                 NOTICE("Switched to uid %d", uid);
1378         }
1379
1380         return true;
1381 }
1382
/*
 * Simple convenience wrapper around setgroups() which enables uniform
 * logging.
 *
 * Returns true on success and false if setgroups() failed.
 */
bool lxc_setgroups(int size, gid_t list[])
{
        int ret;

        ret = setgroups(size, list);
        if (ret < 0) {
                SYSERROR("Failed to setgroups()");
                return false;
        }

        NOTICE("Dropped additional groups");
        return true;
}
1394
1395 static int lxc_get_unused_loop_dev_legacy(char *loop_name)
1396 {
1397         struct dirent *dp;
1398         struct loop_info64 lo64;
1399         DIR *dir;
1400         int dfd = -1, fd = -1, ret = -1;
1401
1402         dir = opendir("/dev");
1403         if (!dir)
1404                 return -1;
1405
1406         while ((dp = readdir(dir))) {
1407                 if (strncmp(dp->d_name, "loop", 4) != 0)
1408                         continue;
1409
1410                 dfd = dirfd(dir);
1411                 if (dfd < 0)
1412                         continue;
1413
1414                 fd = openat(dfd, dp->d_name, O_RDWR);
1415                 if (fd < 0)
1416                         continue;
1417
1418                 ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
1419                 if (ret < 0) {
1420                         if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 ||
1421                             errno != ENXIO) {
1422                                 close(fd);
1423                                 fd = -1;
1424                                 continue;
1425                         }
1426                 }
1427
1428                 ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
1429                 if (ret < 0 || ret >= LO_NAME_SIZE) {
1430                         close(fd);
1431                         fd = -1;
1432                         continue;
1433                 }
1434
1435                 break;
1436         }
1437
1438         closedir(dir);
1439
1440         if (fd < 0)
1441                 return -1;
1442
1443         return fd;
1444 }
1445
1446 static int lxc_get_unused_loop_dev(char *name_loop)
1447 {
1448         int loop_nr, ret;
1449         int fd_ctl = -1, fd_tmp = -1;
1450
1451         fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
1452         if (fd_ctl < 0)
1453                 return -ENODEV;
1454
1455         loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE);
1456         if (loop_nr < 0)
1457                 goto on_error;
1458
1459         ret = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr);
1460         if (ret < 0 || ret >= LO_NAME_SIZE)
1461                 goto on_error;
1462
1463         fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC);
1464         if (fd_tmp < 0)
1465                 goto on_error;
1466
1467 on_error:
1468         close(fd_ctl);
1469         return fd_tmp;
1470 }
1471
1472 int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
1473 {
1474         int ret;
1475         struct loop_info64 lo64;
1476         int fd_img = -1, fret = -1, fd_loop = -1;
1477
1478         fd_loop = lxc_get_unused_loop_dev(loop_dev);
1479         if (fd_loop < 0) {
1480                 if (fd_loop == -ENODEV)
1481                         fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);
1482                 else
1483                         goto on_error;
1484         }
1485
1486         fd_img = open(source, O_RDWR | O_CLOEXEC);
1487         if (fd_img < 0)
1488                 goto on_error;
1489
1490         ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
1491         if (ret < 0)
1492                 goto on_error;
1493
1494         memset(&lo64, 0, sizeof(lo64));
1495         lo64.lo_flags = flags;
1496
1497         ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
1498         if (ret < 0)
1499                 goto on_error;
1500
1501         fret = 0;
1502
1503 on_error:
1504         if (fd_img >= 0)
1505                 close(fd_img);
1506
1507         if (fret < 0 && fd_loop >= 0) {
1508                 close(fd_loop);
1509                 fd_loop = -1;
1510         }
1511
1512         return fd_loop;
1513 }
1514
/*
 * Unmount everything that is stacked on @path by calling umount2() until it
 * fails.
 *
 * @path: mountpoint to unstack
 * @lazy: use MNT_DETACH for every umount2() call
 *
 * Returns the number of successful unmounts (capped at INT_MAX) once
 * umount2() fails with EINVAL, or -errno for any other failure.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
        const int umount_flags = lazy ? MNT_DETACH : 0;
        int count = 0;

        for (;;) {
                if (umount2(path, umount_flags) < 0) {
                        /* We consider anything else than EINVAL deadly to
                         * prevent going into an infinite loop. (The other
                         * alternative is constantly parsing
                         * /proc/self/mountinfo which is yucky and probably
                         * racy.)
                         */
                        if (errno != EINVAL)
                                return -errno;

                        break;
                }

                /* Just stop counting when the counter is saturated. That'd
                 * be so stupid that we won't even bother trying to report
                 * back the correct value anymore.
                 */
                if (count != INT_MAX)
                        count++;

                /* We succeeded in umounting. Go around again to make sure
                 * that there's no other mountpoint stacked underneath.
                 */
        }

        return count;
}
1546
/*
 * Run @child_fn(@args) in a child process and capture what it writes to
 * stdout and stderr.
 *
 * @buf, @buf_size: optional output buffer.  When given, a single read of at
 *                  most @buf_size - 1 bytes is taken from the child's
 *                  output and the last byte read is replaced by the
 *                  terminating \0.  The buffer is always initialized to the
 *                  empty string first.
 * @child_fn:       function executed in the child; it is expected not to
 *                  return
 * @args:           argument passed to @child_fn
 *
 * Returns the result of wait_for_pid() for the child, or -1 if the pipe or
 * the child process could not be created.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        pid_t pid;
        int fds[2];
        int ret, status;
        ssize_t nread;
        const bool capture = buf && buf_size > 0;

        /* Make sure our callers do not receive uninitialized memory. */
        if (capture)
                buf[0] = '\0';

        if (pipe(fds) < 0) {
                SYSERROR("failed to create pipe");
                return -1;
        }

        pid = lxc_raw_clone(0);
        if (pid < 0) {
                close(fds[0]);
                close(fds[1]);
                SYSERROR("failed to create new process");
                return -1;
        }

        if (pid == 0) {
                /* Child: the read-end belongs to the parent. */
                close(fds[0]);

                /* Send both stdout and stderr into the write-end of the
                 * pipe.
                 */
                ret = dup2(fds[1], STDOUT_FILENO);
                if (ret >= 0)
                        ret = dup2(fds[1], STDERR_FILENO);

                /* The duplicates keep the pipe open from here on. */
                close(fds[1]);

                if (ret < 0) {
                        SYSERROR("failed to duplicate std{err,out} file descriptor");
                        _exit(EXIT_FAILURE);
                }

                /* Does not return. */
                child_fn(args);
                ERROR("failed to exec command");
                _exit(EXIT_FAILURE);
        }

        /* Parent: the write-end belongs to the child. */
        close(fds[1]);

        if (capture) {
                nread = lxc_read_nointr(fds[0], buf, buf_size - 1);
                if (nread > 0)
                        buf[nread - 1] = '\0';
        }

        status = wait_for_pid(pid);
        /* close the read-end of the pipe */
        close(fds[0]);

        return status;
}
1610
1611 bool lxc_nic_exists(char *nic)
1612 {
1613 #define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
1614         char path[__LXC_SYS_CLASS_NET_LEN];
1615         int ret;
1616         struct stat sb;
1617
1618         if (!strcmp(nic, "none"))
1619                 return true;
1620
1621         ret = snprintf(path, __LXC_SYS_CLASS_NET_LEN, "/sys/class/net/%s", nic);
1622         if (ret < 0 || (size_t)ret >= __LXC_SYS_CLASS_NET_LEN)
1623                 return false;
1624
1625         ret = stat(path, &sb);
1626         if (ret < 0)
1627                 return false;
1628
1629         return true;
1630 }
1631
/*
 * Round @n up to a power of two.
 *
 * Returns @n itself if it already is a power of two, otherwise twice the
 * value of its highest set bit.  Returns 0 for @n == 0 since 0 is not valid
 * input and not a valid power of two; the result also wraps to 0 when the
 * doubled value no longer fits into 64 bits.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
        uint64_t top = n;

        if (n == 0)
                return 0;

        /* Clear the lowest set bit until only the highest one is left. */
        while (top & (top - 1))
                top &= top - 1;

        /* Nothing was cleared: @n had a single bit set already. */
        if (top == n)
                return n;

        return top << 1;
}
1649
/*
 * Request that @signal is delivered to the caller when its parent dies
 * (PR_SET_PDEATHSIG).
 *
 * After the request the parent pid is checked: if it is already 1 the
 * caller sends SIGKILL to itself.
 *
 * Returns 0 on success and -1 on failure.
 */
int lxc_set_death_signal(int signal)
{
        pid_t parent;
        int ret;

        ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
                    prctl_arg(0), prctl_arg(0));

        /* Check whether we have been orphaned. */
        parent = (pid_t)syscall(SYS_getppid);
        if (parent == 1) {
                ret = kill(lxc_raw_getpid(), SIGKILL);
                if (ret < 0)
                        return -1;
        }

        if (ret >= 0)
                return 0;

        SYSERROR("Failed to set PR_SET_PDEATHSIG to %d", signal);
        return -1;
}
1676
/*
 * Set or clear the FD_CLOEXEC flag on @fd.
 *
 * @cloexec: true to set the flag, false to clear it
 *
 * Returns 0 on success (including when the flag already has the requested
 * state) and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
        int cur, want;

        cur = fcntl(fd, F_GETFD, 0);
        if (cur < 0)
                return -errno;

        want = cloexec ? (cur | FD_CLOEXEC) : (cur & ~FD_CLOEXEC);

        /* Only touch the fd when the flags actually change. */
        if (want != cur && fcntl(fd, F_SETFD, want) < 0)
                return -errno;

        return 0;
}
1698
/*
 * Remove the directory tree rooted at @dirname.
 *
 * Subdirectories are descended into and removed with rmdir().  Entries that
 * are not directories are skipped, not unlinked, so rmdir() fails for any
 * directory that still contains such entries.
 *
 * Only the first problem is logged; later ones just keep the error status.
 *
 * Returns 0 on success and -1 if anything failed.
 */
int recursive_destroy(char *dirname)
{
        int ret;
        struct dirent *direntp;
        DIR *dir;
        int r = 0;

        dir = opendir(dirname);
        if (!dir)
                return -1;

        while ((direntp = readdir(dir))) {
                char *pathname;
                struct stat mystat;

                if (!strcmp(direntp->d_name, ".") ||
                    !strcmp(direntp->d_name, ".."))
                        continue;

                pathname = must_make_path(dirname, direntp->d_name, NULL);

                /* lstat() so that a symlink to a directory is not followed. */
                ret = lstat(pathname, &mystat);
                if (ret < 0) {
                        if (!r)
                                WARN("Failed to stat \"%s\"", pathname);

                        r = -1;
                        goto next;
                }

                if (!S_ISDIR(mystat.st_mode))
                        goto next;

                ret = recursive_destroy(pathname);
                if (ret < 0)
                        r = -1;

        next:
                free(pathname);
        }

        ret = rmdir(dirname);
        if (ret < 0) {
                if (!r)
                        SYSWARN("Failed to delete \"%s\"", dirname);

                r = -1;
        }

        ret = closedir(dir);
        if (ret < 0) {
                /* This is a close failure, not a delete failure. */
                if (!r)
                        SYSWARN("Failed to close directory \"%s\"", dirname);

                r = -1;
        }

        return r;
}
1758
1759 int lxc_setup_keyring(void)
1760 {
1761         key_serial_t keyring;
1762         int ret = 0;
1763
1764         /* Try to allocate a new session keyring for the container to prevent
1765          * information leaks.
1766          */
1767         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, prctl_arg(0),
1768                          prctl_arg(0), prctl_arg(0), prctl_arg(0));
1769         if (keyring < 0) {
1770                 switch (errno) {
1771                 case ENOSYS:
1772                         DEBUG("The keyctl() syscall is not supported or blocked");
1773                         break;
1774                 case EACCES:
1775                         __fallthrough;
1776                 case EPERM:
1777                         DEBUG("Failed to access kernel keyring. Continuing...");
1778                         break;
1779                 default:
1780                         SYSERROR("Failed to create kernel keyring");
1781                         ret = -1;
1782                         break;
1783                 }
1784         }
1785
1786         return ret;
1787 }