src/lxc/utils.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 #include "config.h"
  25
  26 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
  27 #include <ctype.h>
  28 #include <dirent.h>
  29 #include <errno.h>
  30 #include <fcntl.h>
  31 #include <grp.h>
  32 #include <inttypes.h>
  33 #include <libgen.h>
  34 #include <pthread.h>
  35 #include <stddef.h>
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38 #include <string.h>
  39 #include <unistd.h>
  40 #include <sys/mman.h>
  41 #include <sys/mount.h>
  42 #include <sys/param.h>
  43 #include <sys/prctl.h>
  44 #include <sys/stat.h>
  45 #include <sys/types.h>
  46 #include <sys/wait.h>
  47
  48 #include "log.h"
  49 #include "lxclock.h"
  50 #include "namespace.h"
  51 #include "parse.h"
  52 #include "utils.h"
  53
  54 #ifndef HAVE_STRLCPY
  55 #include "include/strlcpy.h"
  56 #endif
  57
  58 #ifndef HAVE_STRLCAT
  59 #include "include/strlcat.h"
  60 #endif
  61
  62 #ifndef O_PATH
  63 #define O_PATH      010000000
  64 #endif
  65
  66 #ifndef O_NOFOLLOW
  67 #define O_NOFOLLOW  00400000
  68 #endif
  69
  70 lxc_log_define(utils, lxc);
  71
  72 /*
  73  * if path is btrfs, tries to remove it and any subvolumes beneath it
  74  */
  75 extern bool btrfs_try_remove_subvol(const char *path);
  76
  /*
   * Delete everything below @dirname and then @dirname itself.
   *
   * @dirname: directory to remove
   * @pdev:    device number the walk is compared against when @onedev is set
   * @exclude: entry name that, at @level 0 only, is removed with a plain
   *           rmdir() (or unlink() if it is not a directory); when that
   *           rmdir() fails with ENOTEMPTY the entry is left in place
   * @level:   recursion depth, 0 for the initial call
   * @onedev:  when true, entries whose st_dev differs from @pdev are not
   *           descended into; only a btrfs subvolume removal is attempted
   *
   * Returns 0 if nothing failed, -1 otherwise.  Failures on individual entries
   * do not stop the walk.
   */
  static int _recursive_rmdir(const char *dirname, dev_t pdev,
                              const char *exclude, int level, bool onedev)
  {
          char child[MAXPATHLEN];
          struct dirent *entry;
          bool kept_exclude = false;
          int failed = 0;
          DIR *dp;

          dp = opendir(dirname);
          if (!dp) {
                  ERROR("failed to open %s", dirname);
                  return -1;
          }

          for (entry = readdir(dp); entry; entry = readdir(dp)) {
                  const char *name = entry->d_name;
                  struct stat st;
                  int len;

                  if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
                          continue;

                  len = snprintf(child, MAXPATHLEN, "%s/%s", dirname, name);
                  if (len < 0 || len >= MAXPATHLEN) {
                          ERROR("pathname too long");
                          failed = 1;
                          continue;
                  }

                  /* The excluded entry is only honoured at the top level. */
                  if (level == 0 && exclude && strcmp(name, exclude) == 0) {
                          if (rmdir(child) < 0) {
                                  if (errno == ENOTEMPTY) {
                                          INFO("Not deleting snapshot %s", child);
                                          kept_exclude = true;
                                  } else if (errno == ENOTDIR) {
                                          if (unlink(child))
                                                  INFO("Failed to remove %s", child);
                                  } else {
                                          SYSERROR("Failed to rmdir %s", child);
                                          failed = 1;
                                  }
                          }
                          continue;
                  }

                  if (lstat(child, &st)) {
                          ERROR("Failed to stat %s", child);
                          failed = 1;
                          continue;
                  }

                  if (onedev && st.st_dev != pdev) {
                          /* TODO should we be checking /proc/self/mountinfo for
                           * the entry and not doing this if found? */
                          if (btrfs_try_remove_subvol(child))
                                  INFO("Removed btrfs subvolume at %s\n", child);
                          continue;
                  }

                  if (!S_ISDIR(st.st_mode)) {
                          if (unlink(child) < 0) {
                                  SYSERROR("Failed to delete %s", child);
                                  failed = 1;
                          }
                          continue;
                  }

                  if (_recursive_rmdir(child, pdev, exclude, level + 1, onedev) < 0)
                          failed = 1;
          }

          /* A directory that still holds the kept exclude entry is expected to
           * be non-empty, so that case is not reported as a failure. */
          if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !kept_exclude) {
                  ERROR("Failed to delete %s", dirname);
                  failed = 1;
          }

          if (closedir(dp)) {
                  ERROR("Failed to close directory %s", dirname);
                  failed = 1;
          }

          return failed ? -1 : 0;
  }
 168
 169 /* In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
 170  * lxc_rmdir_onedev()
 171  */
 172 static bool is_native_overlayfs(const char *path)
 173 {
 174         if (has_fs_type(path, OVERLAY_SUPER_MAGIC) ||
 175             has_fs_type(path, OVERLAYFS_SUPER_MAGIC))
 176                 return true;
 177
 178         return false;
 179 }
 180
 /*
  * Recursively remove @path, passing @exclude through to the recursive walk.
  * The walk is restricted to the device @path lives on unless @path is on
  * overlayfs.
  *
  * Returns 0 on success (a missing @path counts as success), -1 if there were
  * any failures.
  */
 extern int lxc_rmdir_onedev(const char *path, const char *exclude)
 {
         bool same_dev_only = !is_native_overlayfs(path);
         struct stat st;

         if (lstat(path, &st) == 0)
                 return _recursive_rmdir(path, st.st_dev, exclude, 0, same_dev_only);

         if (errno == ENOENT)
                 return 0;

         ERROR("Failed to stat %s", path);
         return -1;
 }
 200
 /*
  * Parse @arg as an unsigned 16-bit number in @base (borrowed from iproute2).
  *
  * The whole string must be consumed by strtoul() and the value must fit in
  * 0..0xFFFF.  On success the value is stored in @val and 0 is returned; on
  * any failure -1 is returned and @val is left untouched.
  */
 extern int get_u16(unsigned short *val, const char *arg, int base)
 {
         unsigned long parsed;
         char *end = NULL;

         if (arg == NULL || *arg == '\0')
                 return -1;

         errno = 0;
         parsed = strtoul(arg, &end, base);

         /* Reject empty conversions and trailing garbage. */
         if (!end || end == arg || *end != '\0')
                 return -1;

         /* Reject out-of-range values and conversion errors. */
         if (parsed > 0xFFFF || errno != 0)
                 return -1;

         *val = parsed;
         return 0;
 }
 219
 /*
  * Create @dir and all of its missing leading directories, each with @mode.
  *
  * The path is walked one component at a time; for every prefix of @dir that
  * ends at a component boundary a mkdir() is attempted, and EEXIST is not
  * treated as an error.
  *
  * Returns 0 on success, -1 if a mkdir() fails for any other reason or if the
  * temporary prefix copy cannot be allocated.
  */
 extern int mkdir_p(const char *dir, mode_t mode)
 {
         const char *tmp = dir;
         const char *orig = dir;
         char *makeme;

         do {
                 /* dir: start of the next component, tmp: one past its end */
                 dir = tmp + strspn(tmp, "/");
                 tmp = dir + strcspn(dir, "/");

                 /* Everything before the current component is the prefix to
                  * create on this round. */
                 makeme = strndup(orig, dir - orig);
                 if (!makeme)
                         return -1;

                 if (*makeme) {
                         if (mkdir(makeme, mode) && errno != EEXIST) {
                                 SYSERROR("failed to create directory '%s'", makeme);
                                 free(makeme);
                                 return -1;
                         }
                 }
                 free(makeme);
         } while (tmp != dir);

         return 0;
 }
 243
 244 char *get_rundir()
 245 {
 246         char *rundir;
 247         const char *homedir;
 248         struct stat sb;
 249
 250         if (stat(RUNTIME_PATH, &sb) < 0)
 251                 return NULL;
 252
 253         if (geteuid() == sb.st_uid || getegid() == sb.st_gid) {
 254                 rundir = strdup(RUNTIME_PATH);
 255                 return rundir;
 256         }
 257
 258         rundir = getenv("XDG_RUNTIME_DIR");
 259         if (rundir) {
 260                 rundir = strdup(rundir);
 261                 return rundir;
 262         }
 263
 264         INFO("XDG_RUNTIME_DIR isn't set in the environment.");
 265         homedir = getenv("HOME");
 266         if (!homedir) {
 267                 ERROR("HOME isn't set in the environment.");
 268                 return NULL;
 269         }
 270
 271         rundir = malloc(sizeof(char) * (17 + strlen(homedir)));
 272         if (!rundir)
 273                 return NULL;
 274
 275         sprintf(rundir, "%s/.cache/lxc/run/", homedir);
 276
 277         return rundir;
 278 }
 279
 /*
  * Wait for child @pid to terminate, retrying when waitpid() is interrupted
  * by a signal.
  *
  * Returns 0 only if the child exited normally with status 0.  Returns -1 if
  * waitpid() fails for a reason other than EINTR, or if the child did not
  * exit normally or exited with a non-zero status.
  */
 int wait_for_pid(pid_t pid)
 {
         int status;

         for (;;) {
                 int reaped = waitpid(pid, &status, 0);

                 if (reaped == -1) {
                         if (errno == EINTR)
                                 continue;

                         return -1;
                 }

                 if (reaped == pid)
                         break;
         }

         return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
 }
 301
 /*
  * Wait for child @pid to terminate, retrying when waitpid() is interrupted
  * by a signal.
  *
  * Returns the raw wait status as filled in by waitpid() (to be decoded with
  * the W* macros), or -1 if waitpid() fails for a reason other than EINTR.
  */
 int lxc_wait_for_pid_status(pid_t pid)
 {
         int status;

         for (;;) {
                 int reaped = waitpid(pid, &status, 0);

                 if (reaped == -1) {
                         if (errno == EINTR)
                                 continue;

                         return -1;
                 }

                 if (reaped == pid)
                         return status;
         }
 }
 320
 321 #if HAVE_LIBGNUTLS
 322 #include <gnutls/gnutls.h>
 323 #include <gnutls/crypto.h>
 324
 /*
  * Constructor: calls gnutls_global_init() before main() runs.  Its return
  * value is ignored.
  */
 __attribute__((constructor)) static void gnutls_lxc_init(void)
 {
         (void)gnutls_global_init();
 }
 330
 331 int sha1sum_file(char *fnam, unsigned char *digest)
 332 {
 333         char *buf;
 334         int ret;
 335         FILE *f;
 336         long flen;
 337
 338         if (!fnam)
 339                 return -1;
 340
 341         f = fopen_cloexec(fnam, "r");
 342         if (!f) {
 343                 SYSERROR("Error opening template");
 344                 return -1;
 345         }
 346
 347         if (fseek(f, 0, SEEK_END) < 0) {
 348                 SYSERROR("Error seeking to end of template");
 349                 fclose(f);
 350                 return -1;
 351         }
 352
 353         if ((flen = ftell(f)) < 0) {
 354                 SYSERROR("Error telling size of template");
 355                 fclose(f);
 356                 return -1;
 357         }
 358
 359         if (fseek(f, 0, SEEK_SET) < 0) {
 360                 SYSERROR("Error seeking to start of template");
 361                 fclose(f);
 362                 return -1;
 363         }
 364
 365         if ((buf = malloc(flen+1)) == NULL) {
 366                 SYSERROR("Out of memory");
 367                 fclose(f);
 368                 return -1;
 369         }
 370
 371         if (fread(buf, 1, flen, f) != flen) {
 372                 SYSERROR("Failure reading template");
 373                 free(buf);
 374                 fclose(f);
 375                 return -1;
 376         }
 377
 378         if (fclose(f) < 0) {
 379                 SYSERROR("Failre closing template");
 380                 free(buf);
 381                 return -1;
 382         }
 383
 384         buf[flen] = '\0';
 385         ret = gnutls_hash_fast(GNUTLS_DIG_SHA1, buf, flen, (void *)digest);
 386         free(buf);
 387         return ret;
 388 }
 389 #endif
 390
 391 struct lxc_popen_FILE *lxc_popen(const char *command)
 392 {
 393         int ret;
 394         int pipe_fds[2];
 395         pid_t child_pid;
 396         struct lxc_popen_FILE *fp = NULL;
 397
 398         ret = pipe2(pipe_fds, O_CLOEXEC);
 399         if (ret < 0)
 400                 return NULL;
 401
 402         child_pid = fork();
 403         if (child_pid < 0)
 404                 goto on_error;
 405
 406         if (!child_pid) {
 407                 sigset_t mask;
 408
 409                 close(pipe_fds[0]);
 410
 411                 /* duplicate stdout */
 412                 if (pipe_fds[1] != STDOUT_FILENO)
 413                         ret = dup2(pipe_fds[1], STDOUT_FILENO);
 414                 else
 415                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 416                 if (ret < 0) {
 417                         close(pipe_fds[1]);
 418                         _exit(EXIT_FAILURE);
 419                 }
 420
 421                 /* duplicate stderr */
 422                 if (pipe_fds[1] != STDERR_FILENO)
 423                         ret = dup2(pipe_fds[1], STDERR_FILENO);
 424                 else
 425                         ret = fcntl(pipe_fds[1], F_SETFD, 0);
 426                 close(pipe_fds[1]);
 427                 if (ret < 0)
 428                         _exit(EXIT_FAILURE);
 429
 430                 /* unblock all signals */
 431                 ret = sigfillset(&mask);
 432                 if (ret < 0)
 433                         _exit(EXIT_FAILURE);
 434
 435                 ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL);
 436                 if (ret < 0)
 437                         _exit(EXIT_FAILURE);
 438
 439                 execl("/bin/sh", "sh", "-c", command, (char *)NULL);
 440                 _exit(127);
 441         }
 442
 443         close(pipe_fds[1]);
 444         pipe_fds[1] = -1;
 445
 446         fp = malloc(sizeof(*fp));
 447         if (!fp)
 448                 goto on_error;
 449
 450         memset(fp, 0, sizeof(*fp));
 451
 452         fp->child_pid = child_pid;
 453         fp->pipe = pipe_fds[0];
 454
 455         /* From now on, closing fp->f will also close fp->pipe. So only ever
 456          * call fclose(fp->f).
 457          */
 458         fp->f = fdopen(pipe_fds[0], "r");
 459         if (!fp->f)
 460                 goto on_error;
 461
 462         return fp;
 463
 464 on_error:
 465         /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
 466          * called yet. Otherwise the fd belongs to the file opened by fdopen()
 467          * since it isn't dup()ed.
 468          */
 469         if (fp && !fp->f && pipe_fds[0] >= 0)
 470                 close(pipe_fds[0]);
 471
 472         if (pipe_fds[1] >= 0)
 473                 close(pipe_fds[1]);
 474
 475         if (fp && fp->f)
 476                 fclose(fp->f);
 477
 478         if (fp)
 479                 free(fp);
 480
 481         return NULL;
 482 }
 483
 484 int lxc_pclose(struct lxc_popen_FILE *fp)
 485 {
 486         pid_t wait_pid;
 487         int wstatus = 0;
 488
 489         if (!fp)
 490                 return -1;
 491
 492         do {
 493                 wait_pid = waitpid(fp->child_pid, &wstatus, 0);
 494         } while (wait_pid < 0 && errno == EINTR);
 495
 496         fclose(fp->f);
 497         free(fp);
 498
 499         if (wait_pid < 0)
 500                 return -1;
 501
 502         return wstatus;
 503 }
 504
 /*
  * srand pre-seed function based on /dev/urandom.
  *
  * The seed starts out as time + pid and is overwritten with bytes read from
  * /dev/urandom when that file can be opened.  When @srand_it is true the
  * seed is also handed to srand().  Returns the seed.
  */
 int randseed(bool srand_it)
 {
         unsigned int seed = time(NULL) + getpid();
         FILE *urandom = fopen("/dev/urandom", "r");

         if (urandom) {
                 if (fread(&seed, sizeof(seed), 1, urandom) != 1)
                         SYSDEBUG("unable to fread /dev/urandom, fallback to time+pid rand seed");

                 fclose(urandom);
         }

         if (srand_it)
                 srand(seed);

         return seed;
 }
 527
 528 uid_t get_ns_uid(uid_t orig)
 529 {
 530         char *line = NULL;
 531         size_t sz = 0;
 532         uid_t nsid, hostid, range;
 533         FILE *f = fopen("/proc/self/uid_map", "r");
 534         if (!f)
 535                 return 0;
 536
 537         while (getline(&line, &sz, f) != -1) {
 538                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 539                         continue;
 540
 541                 if (hostid <= orig && hostid + range > orig) {
 542                         nsid += orig - hostid;
 543                         goto found;
 544                 }
 545         }
 546
 547         nsid = LXC_INVALID_UID;
 548
 549 found:
 550         fclose(f);
 551         free(line);
 552         return nsid;
 553 }
 554
 555 gid_t get_ns_gid(gid_t orig)
 556 {
 557         char *line = NULL;
 558         size_t sz = 0;
 559         gid_t nsid, hostid, range;
 560         FILE *f = fopen("/proc/self/gid_map", "r");
 561         if (!f)
 562                 return 0;
 563
 564         while (getline(&line, &sz, f) != -1) {
 565                 if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3)
 566                         continue;
 567
 568                 if (hostid <= orig && hostid + range > orig) {
 569                         nsid += orig - hostid;
 570                         goto found;
 571                 }
 572         }
 573
 574         nsid = LXC_INVALID_GID;
 575
 576 found:
 577         fclose(f);
 578         free(line);
 579         return nsid;
 580 }
 581
 /*
  * Report whether @path names a directory.  stat() is used, so symbolic
  * links are followed.  Any stat() failure, whatever its cause, is reported
  * as "no".
  */
 bool dir_exists(const char *path)
 {
         struct stat st;

         return stat(path, &st) >= 0 && S_ISDIR(st.st_mode);
 }
 594
 /*
  * 64-bit FNV-1a style hash of the @len bytes at @buf, continuing from @hval.
  *
  * Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
  * FNV has good anti collision properties and we're not worried about
  * pre-image resistance or one-way-ness, we're just trying to make the name
  * unique in the 108 bytes of space we have.
  *
  * Returns the updated hash value; with @len == 0 that is @hval unchanged.
  */
 uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
 {
         const unsigned char *bytes = buf;
         size_t i;

         for (i = 0; i < len; i++) {
                 /* xor the bottom with the current octet */
                 hval ^= (uint64_t)bytes[i];

                 /* multiply by the 64 bit FNV magic prime mod 2^64
                  * (0x100000001b3 == 1 + 2^1 + 2^4 + 2^5 + 2^7 + 2^8 + 2^40)
                  */
                 hval *= UINT64_C(0x100000001b3);
         }

         return hval;
 }
 618
 619 bool is_shared_mountpoint(const char *path)
 620 {
 621         char buf[LXC_LINELEN];
 622         FILE *f;
 623         int i;
 624         char *p, *p2;
 625
 626         f = fopen("/proc/self/mountinfo", "r");
 627         if (!f)
 628                 return 0;
 629
 630         while (fgets(buf, LXC_LINELEN, f)) {
 631                 for (p = buf, i = 0; p && i < 4; i++)
 632                         p = strchr(p + 1, ' ');
 633                 if (!p)
 634                         continue;
 635
 636                 p2 = strchr(p + 1, ' ');
 637                 if (!p2)
 638                         continue;
 639
 640                 *p2 = '\0';
 641                 if (strcmp(p + 1, path) == 0) {
 642                         /* This is the path. Is it shared? */
 643                         p = strchr(p2 + 1, ' ');
 644                         if (p && strstr(p, "shared:")) {
 645                                 fclose(f);
 646                                 return true;
 647                         }
 648                 }
 649         }
 650
 651         fclose(f);
 652         return false;
 653 }
 654
 /*
  * Detect whether / is mounted MS_SHARED.  The only way I know of to
  * check that is through /proc/self/mountinfo.
  * Only / is checked.  If the container rootfs or mount location is
  * MS_SHARED, but not '/', then you're out of luck - figuring that out
  * would be too much work to be worth it.
  *
  * Returns 1 if / is a shared mount point, 0 otherwise.
  */
 int detect_shared_rootfs(void)
 {
         return is_shared_mountpoint("/") ? 1 : 0;
 }
 668
 /*
  * Move the calling process into namespace @ns of process @pid by opening
  * /proc/<pid>/ns/<ns> and passing the descriptor to setns().
  *
  * Returns true on success, false if the path does not fit into the buffer
  * or if open() or setns() fails.
  */
 bool switch_to_ns(pid_t pid, const char *ns)
 {
         char nspath[MAXPATHLEN];
         int len, nsfd;
         bool joined;

         /* Switch to new ns */
         len = snprintf(nspath, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns);
         if (len < 0 || len >= MAXPATHLEN)
                 return false;

         nsfd = open(nspath, O_RDONLY);
         if (nsfd < 0) {
                 SYSERROR("Failed to open %s", nspath);
                 return false;
         }

         joined = !setns(nsfd, 0);
         if (!joined)
                 SYSERROR("Failed to set process %d to %s of %d.", pid, ns, nsfd);

         close(nsfd);
         return joined;
 }
 695
 /*
  * Report whether the mountinfo entry for "/" is the initial rootfs.
  *
  * Looking at fs/proc_namespace.c, it appears we can actually expect the
  * rootfs entry to very specifically contain
  * " - rootfs rootfs "
  * IIUC, so long as we've chrooted so that rootfs is not our root,
  * the rootfs entry should always be skipped in mountinfo contents.
  *
  * Returns false if /proc/self/mountinfo cannot be opened or no matching
  * entry is found.
  */
 bool detect_ramfs_rootfs(void)
 {
         bool on_ramfs = false;
         char *line = NULL;
         size_t cap = 0;
         FILE *mountinfo;

         mountinfo = fopen("/proc/self/mountinfo", "r");
         if (!mountinfo)
                 return false;

         while (!on_ramfs && getline(&line, &cap, mountinfo) != -1) {
                 char *mnt = line, *mnt_end, *sep;
                 int skipped;

                 /* Advance to the space in front of the fifth field. */
                 for (skipped = 0; mnt && skipped < 4; skipped++)
                         mnt = strchr(mnt + 1, ' ');
                 if (!mnt)
                         continue;

                 mnt_end = strchr(mnt + 1, ' ');
                 if (!mnt_end)
                         continue;

                 *mnt_end = '\0';
                 if (strcmp(mnt + 1, "/") != 0)
                         continue;

                 /* This is '/'. Is it the ramfs? */
                 sep = strchr(mnt_end + 1, '-');
                 if (sep && strncmp(sep, "- rootfs rootfs ", 16) == 0)
                         on_ramfs = true;
         }

         free(line);
         fclose(mountinfo);

         if (on_ramfs)
                 INFO("Rootfs is located on ramfs");

         return on_ramfs;
 }
 742
 743 char *on_path(const char *cmd, const char *rootfs)
 744 {
 745         char *entry = NULL, *path = NULL;
 746         char cmdpath[MAXPATHLEN];
 747         int ret;
 748
 749         path = getenv("PATH");
 750         if (!path)
 751                 return NULL;
 752
 753         path = strdup(path);
 754         if (!path)
 755                 return NULL;
 756
 757         lxc_iterate_parts (entry, path, ":") {
 758                 if (rootfs)
 759                         ret = snprintf(cmdpath, MAXPATHLEN, "%s/%s/%s", rootfs,
 760                                        entry, cmd);
 761                 else
 762                         ret = snprintf(cmdpath, MAXPATHLEN, "%s/%s", entry, cmd);
 763                 if (ret < 0 || ret >= MAXPATHLEN)
 764                         continue;
 765
 766                 if (access(cmdpath, X_OK) == 0) {
 767                         free(path);
 768                         return strdup(cmdpath);
 769                 }
 770         }
 771
 772         free(path);
 773         return NULL;
 774 }
 775
 /* Report whether /proc/self/ns/cgroup exists according to file_exists(). */
 bool cgns_supported(void)
 {
         const char *cgroup_ns = "/proc/self/ns/cgroup";

         return file_exists(cgroup_ns);
 }
 780
 781 /* historically lxc-init has been under /usr/lib/lxc and under
 782  * /usr/lib/$ARCH/lxc.  It now lives as $prefix/sbin/init.lxc.
 783  */
 784 char *choose_init(const char *rootfs)
 785 {
 786         char *retv = NULL;
 787         const char *empty = "",
 788                    *tmp;
 789         int ret, env_set = 0;
 790
 791         if (!getenv("PATH")) {
 792                 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
 793                         SYSERROR("Failed to setenv");
 794
 795                 env_set = 1;
 796         }
 797
 798         retv = on_path("init.lxc", rootfs);
 799
 800         if (env_set) {
 801                 if (unsetenv("PATH"))
 802                         SYSERROR("Failed to unsetenv");
 803         }
 804
 805         if (retv)
 806                 return retv;
 807
 808         retv = malloc(PATH_MAX);
 809         if (!retv)
 810                 return NULL;
 811
 812         if (rootfs)
 813                 tmp = rootfs;
 814         else
 815                 tmp = empty;
 816
 817         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
 818         if (ret < 0 || ret >= PATH_MAX) {
 819                 ERROR("pathname too long");
 820                 goto out1;
 821         }
 822
 823         if (access(retv, X_OK) == 0)
 824                 return retv;
 825
 826         ret = snprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
 827         if (ret < 0 || ret >= PATH_MAX) {
 828                 ERROR("pathname too long");
 829                 goto out1;
 830         }
 831
 832         if (access(retv, X_OK) == 0)
 833                 return retv;
 834
 835         ret = snprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
 836         if (ret < 0 || ret >= PATH_MAX) {
 837                 ERROR("pathname too long");
 838                 goto out1;
 839         }
 840
 841         if (access(retv, X_OK) == 0)
 842                 return retv;
 843
 844         ret = snprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
 845         if (ret < 0 || ret >= PATH_MAX) {
 846                 ERROR("pathname too long");
 847                 goto out1;
 848         }
 849
 850         if (access(retv, X_OK) == 0)
 851                 return retv;
 852
 853         /*
 854          * Last resort, look for the statically compiled init.lxc which we
 855          * hopefully bind-mounted in.
 856          * If we are called during container setup, and we get to this point,
 857          * then the init.lxc.static from the host will need to be bind-mounted
 858          * in.  So we return NULL here to indicate that.
 859          */
 860         if (rootfs)
 861                 goto out1;
 862
 863         ret = snprintf(retv, PATH_MAX, "/init.lxc.static");
 864         if (ret < 0 || ret >= PATH_MAX) {
 865                 WARN("Nonsense - name /lxc.init.static too long");
 866                 goto out1;
 867         }
 868
 869         if (access(retv, X_OK) == 0)
 870                 return retv;
 871
 872 out1:
 873         free(retv);
 874         return NULL;
 875 }
 876
 877 /*
 878  * Given the '-t' template option to lxc-create, figure out what to
 879  * do.  If the template is a full executable path, use that.  If it
 880  * is something like 'sshd', then return $templatepath/lxc-sshd.
 881  * On success return the template, on error return NULL.
 882  */
 883 char *get_template_path(const char *t)
 884 {
 885         int ret, len;
 886         char *tpath;
 887
 888         if (t[0] == '/' && access(t, X_OK) == 0) {
 889                 tpath = strdup(t);
 890                 return tpath;
 891         }
 892
 893         len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;
 894
 895         tpath = malloc(len);
 896         if (!tpath)
 897                 return NULL;
 898
 899         ret = snprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
 900         if (ret < 0 || ret >= len) {
 901                 free(tpath);
 902                 return NULL;
 903         }
 904
 905         if (access(tpath, X_OK) < 0) {
 906                 SYSERROR("bad template: %s", t);
 907                 free(tpath);
 908                 return NULL;
 909         }
 910
 911         return tpath;
 912 }
 913
 /*
  * Step to the next segment of a path whose '/' separators have been
  * replaced with '\0'.
  *
  * @path:    the tokenized pathname.
  * @offsetp: pointer to int showing which path segment was last seen.
  *           Updated on return to reflect the next segment (left untouched
  *           when it already is at or past @fulllen).
  * @fulllen: full original path length.
  *
  * Returns a pointer to the next path segment, or NULL if done.
  */
 static char *get_nextpath(char *path, int *offsetp, int fulllen)
 {
         int pos = *offsetp;

         if (pos >= fulllen)
                 return NULL;

         /* Run to the end of the current segment ... */
         for (; pos < fulllen && path[pos] != '\0'; pos++)
                 ;

         /* ... then over the separator(s) that follow it. */
         for (; pos < fulllen && path[pos] == '\0'; pos++)
                 ;

         *offsetp = pos;
         if (pos >= fulllen)
                 return NULL;

         return path + pos;
 }
 937
 /*
  * Check that @subdir is a subdir of @dir.  @len is the length of @dir (to
  * avoid having to recalculate it) and is expected to be non-zero, since
  * dir[len - 1] is inspected.
  *
  * @subdir qualifies when it starts with @dir and the match ends on a path
  * boundary: @dir ends in '/', or @subdir continues with '/', or both
  * strings are equal.
  */
 static bool is_subdir(const char *subdir, const char *dir, size_t len)
 {
         size_t sublen = strlen(subdir);

         if (sublen < len || strncmp(subdir, dir, len) != 0)
                 return false;

         return dir[len - 1] == '/' || subdir[len] == '/' || sublen == len;
 }
 960
 /*
  * Check if the open fd is a symlink.  Return -ELOOP if it is.  Return
  * -ENOENT if we couldn't fstat.  Return 0 if the fd is ok.
  */
 static int check_symlink(int fd)
 {
         struct stat st;

         if (fstat(fd, &st) < 0)
                 return -ENOENT;

         return S_ISLNK(st.st_mode) ? -ELOOP : 0;
 }
 979
 /*
  * Open a file or directory relative to @dirfd, provided that @nextpath is
  * not a symlink.
  *
  * CAVEAT: This function must not be used for other purposes than container
  * setup before executing the container's init
  *
  * Returns the new fd on success.  On failure returns the negative result of
  * openat() (with errno set), or the negative code from check_symlink() when
  * the O_PATH fallback turned out to refer to a symlink.
  */
 static int open_if_safe(int dirfd, const char *nextpath)
 {
         int fd, rc;

         fd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);

         /* Either it opened (so it was not a symlink) or it was refused for
          * being one; both results are passed straight back. */
         if (fd >= 0 || errno == ELOOP)
                 return fd;

         if (errno != EPERM && errno != EACCES)
                 return fd;

         /* We're not root (cause we got EPERM) so try opening with O_PATH. */
         fd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
         if (fd < 0)
                 return fd;

         /* O_PATH will return an fd for symlinks. We know nextpath wasn't a
          * symlink at last openat, so if fd is now a link, then something
          * fishy is going on.
          */
         rc = check_symlink(fd);
         if (rc < 0) {
                 close(fd);
                 return rc;
         }

         return fd;
 }
1015
/*
 * Open @target one path component at a time so that no component below
 * @prefix_skip can be a symbolic link. Intended for opening mount paths that
 * must stay inside the container's rootfs.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * @target: path to be opened
 * @prefix_skip: leading part of @target in which symbolic links are ignored
 * (this would be the container's rootfs). NULL or "" means "/".
 *
 * Return an open fd for the path, or <0 on error.
 */
static int open_without_symlink(const char *target, const char *prefix_skip)
{
        char *copy, *p;
        int cur_fd;
        int offset = 0;
        int target_len = strlen(target);

        if (!prefix_skip || prefix_skip[0] == '\0') {
                prefix_skip = "/";
        } else {
                /* make sure prefix-skip makes sense */
                offset = strlen(prefix_skip);
                if (!is_subdir(target, prefix_skip, offset)) {
                        ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
                                target, prefix_skip);
                        return -EINVAL;
                }

                /* get_nextpath() expects the offset to sit on a '/' (turned
                 * into \0) or before it, so step back by one. The prefix is
                 * non-empty here, hence offset is at least 1.
                 */
                offset--;
        }

        /* Work on a private copy in which every '/' becomes a terminator. */
        copy = strdup(target);
        if (!copy) {
                SYSERROR("Out of memory checking for symbolic link");
                return -ENOMEM;
        }

        for (p = copy; p < copy + target_len; p++) {
                if (*p == '/')
                        *p = '\0';
        }

        cur_fd = open(prefix_skip, O_RDONLY);
        while (cur_fd >= 0) {
                char *component;
                int next_fd, open_errno;

                /* NULL means no component is left: cur_fd is the result. */
                component = get_nextpath(copy, &offset, target_len);
                if (!component)
                        break;

                next_fd = open_if_safe(cur_fd, component);
                open_errno = errno;
                close(cur_fd);
                cur_fd = next_fd;

                if (next_fd < 0) {
                        /* close() may have touched errno; put it back. */
                        errno = open_errno;
                        if (errno == ELOOP)
                                SYSERROR("%s in %s was a symbolic link!", component, target);
                }
        }

        free(copy);
        return cur_fd;
}
1097
/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs.  (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * The destination (and, for relative bind mounts, the source) is opened with
 * open_without_symlink() and the mount is performed through the matching
 * /proc/self/fd/<fd> path.
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * Returns 0 on success and a negative value on failure. When mount() itself
 * fails, errno is restored to the value mount() set.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
                unsigned long flags, const void *data, const char *rootfs)
{
        int destfd, ret, saved_errno;
        /* Only needs enough for /proc/self/fd/<fd>. */
        char srcbuf[50], destbuf[50];
        int srcfd = -1;
        const char *mntsrc = src;

        if (!rootfs)
                rootfs = "";

        /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
        if ((flags & MS_BIND) && src && src[0] != '/') {
                INFO("this is a relative bind mount");

                srcfd = open_without_symlink(src, NULL);
                if (srcfd < 0)
                        return srcfd;

                /* snprintf() returning >= the buffer size means truncation. */
                ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
                if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
                        close(srcfd);
                        ERROR("Out of memory");
                        return -EINVAL;
                }
                mntsrc = srcbuf;
        }

        destfd = open_without_symlink(dest, rootfs);
        if (destfd < 0) {
                if (srcfd != -1) {
                        /* Keep the errno of the failed open for the caller. */
                        saved_errno = errno;
                        close(srcfd);
                        errno = saved_errno;
                }

                return destfd;
        }

        ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
        if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
                if (srcfd != -1)
                        close(srcfd);

                close(destfd);
                ERROR("Out of memory");
                return -EINVAL;
        }

        ret = mount(mntsrc, destbuf, fstype, flags, data);
        saved_errno = errno;
        if (srcfd != -1)
                close(srcfd);

        close(destfd);
        if (ret < 0) {
                errno = saved_errno;
                SYSERROR("Failed to mount %s onto %s", src ? src : "(null)", dest);
                return ret;
        }

        return 0;
}
1170
1171 /*
1172  * Mount a proc under @rootfs if proc self points to a pid other than
1173  * my own.  This is needed to have a known-good proc mount for setting
1174  * up LSMs both at container startup and attach.
1175  *
1176  * @rootfs : the rootfs where proc should be mounted
1177  *
1178  * Returns < 0 on failure, 0 if the correct proc was already mounted
1179  * and 1 if a new proc was mounted.
1180  *
1181  * NOTE: not to be called from inside the container namespace!
1182  */
1183 int lxc_mount_proc_if_needed(const char *rootfs)
1184 {
1185         char path[MAXPATHLEN];
1186         int link_to_pid, linklen, mypid, ret;
1187         char link[INTTYPE_TO_STRLEN(pid_t)] = {0};
1188
1189         ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
1190         if (ret < 0 || ret >= MAXPATHLEN) {
1191                 SYSERROR("proc path name too long");
1192                 return -1;
1193         }
1194
1195         linklen = readlink(path, link, sizeof(link));
1196
1197         ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
1198         if (ret < 0 || ret >= MAXPATHLEN) {
1199                 SYSERROR("proc path name too long");
1200                 return -1;
1201         }
1202
1203         /* /proc not mounted */
1204         if (linklen < 0) {
1205                 if (mkdir(path, 0755) && errno != EEXIST)
1206                         return -1;
1207
1208                 goto domount;
1209         } else if (linklen >= sizeof(link)) {
1210                 link[linklen - 1] = '\0';
1211                 ERROR("readlink returned truncated content: \"%s\"", link);
1212                 return -1;
1213         }
1214
1215         mypid = lxc_raw_getpid();
1216         INFO("I am %d, /proc/self points to \"%s\"", mypid, link);
1217
1218         if (lxc_safe_int(link, &link_to_pid) < 0)
1219                 return -1;
1220
1221         /* correct procfs is already mounted */
1222         if (link_to_pid == mypid)
1223                 return 0;
1224
1225         ret = umount2(path, MNT_DETACH);
1226         if (ret < 0)
1227                 WARN("failed to umount \"%s\" with MNT_DETACH", path);
1228
1229 domount:
1230         /* rootfs is NULL */
1231         if (!strcmp(rootfs, ""))
1232                 ret = mount("proc", path, "proc", 0, NULL);
1233         else
1234                 ret = safe_mount("proc", path, "proc", 0, NULL, rootfs);
1235         if (ret < 0)
1236                 return -1;
1237
1238         INFO("mounted /proc in container for security transition");
1239         return 1;
1240 }
1241
/*
 * Open /dev/null for reading and writing.
 *
 * Returns the new fd, or the negative result of open() after logging the
 * failure.
 */
int open_devnull(void)
{
        int fd;

        fd = open("/dev/null", O_RDWR);
        if (fd >= 0)
                return fd;

        SYSERROR("Can't open /dev/null");
        return fd;
}
1251
/*
 * Duplicate @fd onto stdin, stdout and stderr (in that order).
 *
 * Returns 0 on success, -1 if @fd is negative or any dup2() fails. Streams
 * already replaced before a failure stay replaced.
 */
int set_stdfds(int fd)
{
        static const int std_fds[] = { STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO };
        size_t i;

        if (fd < 0)
                return -1;

        for (i = 0; i < sizeof(std_fds) / sizeof(std_fds[0]); i++) {
                if (dup2(fd, std_fds[i]) < 0)
                        return -1;
        }

        return 0;
}
1273
/*
 * Point stdin, stdout and stderr at /dev/null.
 *
 * Returns the result of set_stdfds(), or -1 if /dev/null cannot be opened.
 */
int null_stdfds(void)
{
        int ret;
        int fd = open_devnull();

        if (fd < 0)
                return -1;

        ret = set_stdfds(fd);
        /* The standard streams hold their own duplicates now. */
        close(fd);

        return ret;
}
1287
1288 /* Check whether a signal is blocked by a process. */
1289 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1290 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1291 bool task_blocks_signal(pid_t pid, int signal)
1292 {
1293         int ret;
1294         char status[__PROC_STATUS_LEN];
1295         FILE *f;
1296         uint64_t sigblk = 0, one = 1;
1297         size_t n = 0;
1298         bool bret = false;
1299         char *line = NULL;
1300
1301         ret = snprintf(status, __PROC_STATUS_LEN, "/proc/%d/status", pid);
1302         if (ret < 0 || ret >= __PROC_STATUS_LEN)
1303                 return bret;
1304
1305         f = fopen(status, "r");
1306         if (!f)
1307                 return bret;
1308
1309         while (getline(&line, &n, f) != -1) {
1310                 char *numstr;
1311
1312                 if (strncmp(line, "SigBlk:", 7))
1313                         continue;
1314
1315                 numstr = lxc_trim_whitespace_in_place(line + 7);
1316                 ret = lxc_safe_uint64(numstr, &sigblk, 16);
1317                 if (ret < 0)
1318                         goto out;
1319
1320                 break;
1321         }
1322
1323         if (sigblk & (one << (signal - 1)))
1324                 bret = true;
1325
1326 out:
1327         free(line);
1328         fclose(f);
1329         return bret;
1330 }
1331
/*
 * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC.
 *
 * Passing NULL or "" for @ns opens the /proc/<pid>/ns directory itself,
 * which lets callers probe whether the kernel supports namespaces at all.
 *
 * Returns the fd from open(), or -1 with errno set to EFBIG if the path does
 * not fit into the local buffer.
 */
int lxc_preserve_ns(const int pid, const char *ns)
{
/* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
#define __NS_PATH_LEN 50
        char path[__NS_PATH_LEN];
        const char *sep = "", *name = "";
        int len;

        if (ns && ns[0] != '\0') {
                sep = "/";
                name = ns;
        }

        len = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid, sep, name);
        if (len < 0 || (size_t)len >= __NS_PATH_LEN) {
                errno = EFBIG;
                return -1;
        }

        return open(path, O_RDONLY | O_CLOEXEC);
}
1353
1354 int lxc_switch_uid_gid(uid_t uid, gid_t gid)
1355 {
1356         int ret = 0;
1357
1358         if (gid != LXC_INVALID_GID) {
1359                 ret = setgid(gid);
1360                 if (ret < 0) {
1361                         SYSERROR("Failed to switch to gid %d", gid);
1362                         return -1;
1363                 }
1364                 NOTICE("Switched to gid %d", gid);
1365         }
1366
1367         if (uid != LXC_INVALID_UID) {
1368                 ret = setuid(uid);
1369                 if (ret < 0) {
1370                         SYSERROR("Failed to switch to uid %d", uid);
1371                         return -1;
1372                 }
1373                 NOTICE("Switched to uid %d", uid);
1374         }
1375
1376         return ret;
1377 }
1378
/*
 * Simple convenience function which enables uniform logging around
 * setgroups().
 *
 * Returns 0 on success and the negated errno of setgroups() on failure.
 */
int lxc_setgroups(int size, gid_t list[])
{
        if (setgroups(size, list) < 0) {
                /* Capture errno before logging: the logging call may change
                 * it, which would make us return the wrong code (or even 0).
                 */
                int saved_errno = errno;

                SYSERROR("Failed to setgroups().");
                return -saved_errno;
        }
        NOTICE("Dropped additional groups.");

        return 0;
}
1390
/*
 * Find a free loop device by scanning /dev for "loop*" entries. Used when
 * /dev/loop-control is not available.
 *
 * @loop_name: buffer of at least LO_NAME_SIZE bytes that receives the
 *             "/dev/<name>" path of the chosen device
 *
 * Returns an open (O_RDWR | O_CLOEXEC) fd for the device, or -1 if none was
 * found.
 */
static int lxc_get_unused_loop_dev_legacy(char *loop_name)
{
        struct dirent *dp;
        struct loop_info64 lo64;
        DIR *dir;
        int dfd, ret;
        int fd = -1;

        dir = opendir("/dev");
        if (!dir)
                return -1;

        /* The directory fd does not change between entries; fetch it once. */
        dfd = dirfd(dir);
        if (dfd < 0) {
                closedir(dir);
                return -1;
        }

        while ((dp = readdir(dir))) {
                if (strncmp(dp->d_name, "loop", 4) != 0)
                        continue;

                fd = openat(dfd, dp->d_name, O_RDWR | O_CLOEXEC);
                if (fd < 0)
                        continue;

                /* Only a device whose status query fails with ENXIO is taken
                 * as unused. Success means the device is already bound to a
                 * backing file; any other error means the entry is not a
                 * usable loop device. Skip both.
                 */
                ret = ioctl(fd, LOOP_GET_STATUS64, &lo64);
                if (ret == 0 || errno != ENXIO) {
                        close(fd);
                        fd = -1;
                        continue;
                }

                ret = snprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name);
                if (ret < 0 || ret >= LO_NAME_SIZE) {
                        close(fd);
                        fd = -1;
                        continue;
                }

                break;
        }

        closedir(dir);

        /* fd is -1 here unless the loop was left through the break above. */
        return fd;
}
1441
/*
 * Ask /dev/loop-control for a free loop device and open it.
 *
 * @name_loop: buffer of at least LO_NAME_SIZE bytes that receives the
 *             "/dev/loop<N>" path
 *
 * Returns an open fd for the device, -ENODEV if /dev/loop-control cannot be
 * opened, and -1 on any other failure.
 */
static int lxc_get_unused_loop_dev(char *name_loop)
{
        int ctl_fd, len, nr;
        int loop_fd = -1;

        ctl_fd = open("/dev/loop-control", O_RDWR | O_CLOEXEC);
        if (ctl_fd < 0)
                return -ENODEV;

        nr = ioctl(ctl_fd, LOOP_CTL_GET_FREE);
        if (nr >= 0) {
                len = snprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", nr);
                if (len >= 0 && len < LO_NAME_SIZE)
                        loop_fd = open(name_loop, O_RDWR | O_CLOEXEC);
        }

        close(ctl_fd);
        return loop_fd;
}
1467
/*
 * Attach the image file @source to a free loop device.
 *
 * @source:   path of the backing file, opened O_RDWR
 * @loop_dev: buffer of at least LO_NAME_SIZE bytes that receives the path of
 *            the loop device
 * @flags:    value stored in lo_flags of the device status
 *
 * The device is looked up through /dev/loop-control first; the /dev scan is
 * used only if the control node cannot be opened.
 *
 * Returns an open fd for the configured loop device, or -1 on failure.
 */
int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags)
{
        int ret;
        struct loop_info64 lo64;
        int fd_img = -1, fret = -1, fd_loop = -1;

        fd_loop = lxc_get_unused_loop_dev(loop_dev);
        if (fd_loop == -ENODEV)
                fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev);

        /* Without a loop device there is nothing to set up. */
        if (fd_loop < 0)
                return -1;

        fd_img = open(source, O_RDWR | O_CLOEXEC);
        if (fd_img < 0)
                goto on_error;

        ret = ioctl(fd_loop, LOOP_SET_FD, fd_img);
        if (ret < 0)
                goto on_error;

        memset(&lo64, 0, sizeof(lo64));
        lo64.lo_flags = flags;

        ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64);
        if (ret < 0) {
                /* The image is already attached at this point. Detach it
                 * again so a failed setup does not leave the device bound,
                 * and keep the errno of the failed ioctl.
                 */
                int saved_errno = errno;

                (void)ioctl(fd_loop, LOOP_CLR_FD, 0);
                errno = saved_errno;
                goto on_error;
        }

        fret = 0;

on_error:
        if (fd_img >= 0)
                close(fd_img);

        if (fret < 0 && fd_loop >= 0) {
                close(fd_loop);
                fd_loop = -1;
        }

        return fd_loop;
}
1510
/*
 * Unmount everything stacked on @path by calling umount2() until it fails.
 *
 * @lazy: use MNT_DETACH for every umount2() call
 *
 * Returns the number of successful unmounts (capped at INT_MAX) if the
 * terminating failure was EINVAL, and -errno for any other failure.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
        const int umount_flags = lazy ? MNT_DETACH : 0;
        int count = 0;

        for (;;) {
                if (umount2(path, umount_flags) < 0) {
                        /* Anything other than EINVAL is treated as fatal so
                         * that the loop cannot spin forever. (The alternative
                         * would be parsing /proc/self/mountinfo repeatedly,
                         * which is yucky and probably racy.)
                         */
                        if (errno != EINVAL)
                                return -errno;

                        return count;
                }

                /* One layer removed; stop counting at INT_MAX rather than
                 * overflow, then look for another mount underneath.
                 */
                if (count != INT_MAX)
                        count++;
        }
}
1542
/*
 * Run @child_fn(@args) in a child process with stdout and stderr redirected
 * into a pipe, and optionally capture what the child wrote.
 *
 * @buf, @buf_size: if @buf is non-NULL and @buf_size > 0, a single read of at
 *                  most @buf_size - 1 bytes is stored in @buf; the last byte
 *                  read is then overwritten with \0 (presumably to drop a
 *                  trailing newline). @buf is an empty string if nothing
 *                  was read.
 * @child_fn:       executed in the child; expected not to return
 *
 * Returns -1 if the pipe or the child cannot be created, otherwise the
 * result of wait_for_pid() for the child.
 */
int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args)
{
        const bool capture = buf && buf_size > 0;
        int pipefd[2];
        int status;
        pid_t pid;

        /* Make sure our callers do not receive uninitialized memory. */
        if (capture)
                buf[0] = '\0';

        if (pipe(pipefd) < 0) {
                SYSERROR("failed to create pipe");
                return -1;
        }

        pid = lxc_raw_clone(0);
        if (pid < 0) {
                close(pipefd[0]);
                close(pipefd[1]);
                SYSERROR("failed to create new process");
                return -1;
        }

        if (pid == 0) {
                int rc;

                /* The child only writes: drop the read end. */
                close(pipefd[0]);

                /* Send std{out,err} into the write end of the pipe. */
                rc = dup2(pipefd[1], STDOUT_FILENO);
                if (rc >= 0)
                        rc = dup2(pipefd[1], STDERR_FILENO);

                /* The original write end is no longer needed. */
                close(pipefd[1]);

                if (rc < 0) {
                        SYSERROR("failed to duplicate std{err,out} file descriptor");
                        _exit(EXIT_FAILURE);
                }

                /* Does not return. */
                child_fn(args);
                ERROR("failed to exec command");
                _exit(EXIT_FAILURE);
        }

        /* The parent only reads: drop the write end. */
        close(pipefd[1]);

        if (capture) {
                ssize_t nread;

                nread = lxc_read_nointr(pipefd[0], buf, buf_size - 1);
                if (nread > 0)
                        buf[nread - 1] = '\0';
        }

        status = wait_for_pid(pid);
        /* close the read-end of the pipe */
        close(pipefd[0]);

        return status;
}
1606
/*
 * Check whether the network interface @nic exists by stat()ing
 * /sys/class/net/<nic>.
 *
 * The special name "none" is always reported as existing. Names too long for
 * the local path buffer are reported as missing.
 */
bool lxc_nic_exists(char *nic)
{
#define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
        char path[__LXC_SYS_CLASS_NET_LEN];
        struct stat sb;
        int len;

        if (strcmp(nic, "none") == 0)
                return true;

        len = snprintf(path, __LXC_SYS_CLASS_NET_LEN, "/sys/class/net/%s", nic);
        if (len < 0 || (size_t)len >= __LXC_SYS_CLASS_NET_LEN)
                return false;

        return stat(path, &sb) >= 0;
}
1627
/*
 * Round @n up to the next power of two.
 *
 * Returns @n itself if it already is a power of two, and 0 for n == 0 (0 is
 * neither valid input nor a valid power of two). For values above 2^63 the
 * result does not fit into 64 bits and wraps around to 0.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
        uint64_t top = n;

        if (n == 0)
                return 0;

        /* Clear the lowest set bit until only the highest one is left. */
        while (top & (top - 1))
                top &= top - 1;

        /* Nothing was cleared: n was a power of two already. */
        if (top == n)
                return n;

        return top << 1;
}
1645
/*
 * Request that @signal be delivered to the calling process when its parent
 * dies (PR_SET_PDEATHSIG).
 *
 * If the parent pid is already 1 after the request, the process is taken to
 * be orphaned and sends SIGKILL to itself.
 *
 * Returns 0 on success and -1 if prctl() (or the self-kill) fails.
 */
int lxc_set_death_signal(int signal)
{
        int rc;

        rc = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0),
                   prctl_arg(0), prctl_arg(0));

        /* Check whether we have been orphaned. */
        if ((pid_t)syscall(SYS_getppid) == 1) {
                rc = kill(lxc_raw_getpid(), SIGKILL);
                if (rc < 0)
                        return -1;
        }

        if (rc >= 0)
                return 0;

        SYSERROR("Failed to set PR_SET_PDEATHSIG to %d", signal);
        return -1;
}
1672
/*
 * Set (@cloexec == true) or clear (@cloexec == false) the FD_CLOEXEC flag on
 * @fd. The flags are only written back if they actually change.
 *
 * Returns 0 on success and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
        int cur, want;

        cur = fcntl(fd, F_GETFD, 0);
        if (cur < 0)
                return -errno;

        want = cloexec ? (cur | FD_CLOEXEC) : (cur & ~FD_CLOEXEC);
        if (want != cur && fcntl(fd, F_SETFD, want) < 0)
                return -errno;

        return 0;
}
1694
/*
 * Remove the directory tree rooted at @dirname.
 *
 * Only directories are removed (with rmdir()); entries that are not
 * directories are left untouched, so a directory still holding such entries
 * cannot be removed and is reported as a failure.
 *
 * Returns 0 if everything was removed and -1 if any step failed. Removal
 * continues after a failure; only the first failure on each level is logged.
 */
int recursive_destroy(char *dirname)
{
        int ret;
        struct dirent *direntp;
        DIR *dir;
        int r = 0;

        dir = opendir(dirname);
        if (!dir)
                return -1;

        while ((direntp = readdir(dir))) {
                char *pathname;
                struct stat mystat;

                if (!strcmp(direntp->d_name, ".") ||
                    !strcmp(direntp->d_name, ".."))
                        continue;

                pathname = must_make_path(dirname, direntp->d_name, NULL);

                /* lstat(): a symlink to a directory is not descended into. */
                ret = lstat(pathname, &mystat);
                if (ret < 0) {
                        if (!r)
                                WARN("Failed to stat \"%s\"", pathname);

                        r = -1;
                        goto next;
                }

                if (!S_ISDIR(mystat.st_mode))
                        goto next;

                ret = recursive_destroy(pathname);
                if (ret < 0)
                        r = -1;

        next:
                free(pathname);
        }

        ret = rmdir(dirname);
        if (ret < 0) {
                if (!r)
                        SYSWARN("Failed to delete \"%s\"", dirname);

                r = -1;
        }

        ret = closedir(dir);
        if (ret < 0) {
                /* Report the operation that actually failed. */
                if (!r)
                        SYSWARN("Failed to close directory \"%s\"", dirname);

                r = -1;
        }

        return r;
}