src/lxc/bdev.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
/*
 * this is all just a first shot for experiment.  If we go this route, much
 * should change.  bdev should be a directory with per-bdev file.  Things which
 * I'm doing by calling out to userspace should sometimes be done through
 * libraries like liblvm2
 */
  30 #define _GNU_SOURCE
  31 #include <stdio.h>
  32 #include <stdint.h>
  33 #include <inttypes.h>
  34 #include <sys/types.h>
  35 #include <grp.h>
  36 #include <unistd.h>
  37 #include <errno.h>
  38 #include <sched.h>
  39 #include <sys/mount.h>
  40 #include <sys/wait.h>
  41 #include <libgen.h>
  42 #include <linux/loop.h>
  43 #include <dirent.h>
  44
  45 #include "lxc.h"
  46 #include "config.h"
  47 #include "conf.h"
  48 #include "bdev.h"
  49 #include "log.h"
  50 #include "error.h"
  51 #include "utils.h"
  52 #include "namespace.h"
  53 #include "parse.h"
  54 #include "lxclock.h"
  55
  56 #ifndef BLKGETSIZE64
  57 #define BLKGETSIZE64 _IOR(0x12,114,size_t)
  58 #endif
  59
  60 #ifndef LO_FLAGS_AUTOCLEAR
  61 #define LO_FLAGS_AUTOCLEAR 4
  62 #endif
  63
  64 #define DEFAULT_FS_SIZE 1073741824
  65 #define DEFAULT_FSTYPE "ext3"
  66
  67 lxc_log_define(bdev, lxc);
  68
  69 static int do_rsync(const char *src, const char *dest)
  70 {
  71         // call out to rsync
  72         pid_t pid;
  73         char *s;
  74         size_t l;
  75
  76         pid = fork();
  77         if (pid < 0)
  78                 return -1;
  79         if (pid > 0)
  80                 return wait_for_pid(pid);
  81
  82         l = strlen(src) + 2;
  83         s = malloc(l);
  84         if (!s)
  85                 exit(1);
  86         strcpy(s, src);
  87         s[l-2] = '/';
  88         s[l-1] = '\0';
  89
  90         execlp("rsync", "rsync", "-a", s, dest, (char *)NULL);
  91         exit(1);
  92 }
  93
  94 /*
  95  * return block size of dev->src in units of bytes
  96  */
  97 static int blk_getsize(struct bdev *bdev, uint64_t *size)
  98 {
  99         int fd, ret;
 100         char *path = bdev->src;
 101
 102         if (strcmp(bdev->type, "loop") == 0)
 103                 path = bdev->src + 5;
 104
 105         fd = open(path, O_RDONLY);
 106         if (fd < 0)
 107                 return -1;
 108
 109         ret = ioctl(fd, BLKGETSIZE64, size); // size of device in bytes
 110         close(fd);
 111         return ret;
 112 }
 113
 114 /*
 115  * These are copied from conf.c.  However as conf.c will be moved to using
 116  * the callback system, they can be pulled from there eventually, so we
 117  * don't need to pollute utils.c with these low level functions
 118  */
 119 static int find_fstype_cb(char* buffer, void *data)
 120 {
 121         struct cbarg {
 122                 const char *rootfs;
 123                 const char *target;
 124                 const char *options;
 125         } *cbarg = data;
 126
 127         unsigned long mntflags;
 128         char *mntdata;
 129         char *fstype;
 130
 131         /* we don't try 'nodev' entries */
 132         if (strstr(buffer, "nodev"))
 133                 return 0;
 134
 135         fstype = buffer;
 136         fstype += lxc_char_left_gc(fstype, strlen(fstype));
 137         fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';
 138
 139         DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
 140               cbarg->rootfs, cbarg->target, fstype);
 141
 142         if (parse_mntopts(cbarg->options, &mntflags, &mntdata) < 0) {
 143                 free(mntdata);
 144                 return 0;
 145         }
 146
 147         if (mount(cbarg->rootfs, cbarg->target, fstype, mntflags, mntdata)) {
 148                 DEBUG("mount failed with error: %s", strerror(errno));
 149                 free(mntdata);
 150                 return 0;
 151         }
 152
 153         free(mntdata);
 154
 155         INFO("mounted '%s' on '%s', with fstype '%s'",
 156              cbarg->rootfs, cbarg->target, fstype);
 157
 158         return 1;
 159 }
 160
 161 static int mount_unknown_fs(const char *rootfs, const char *target,
 162                                         const char *options)
 163 {
 164         int i;
 165
 166         struct cbarg {
 167                 const char *rootfs;
 168                 const char *target;
 169                 const char *options;
 170         } cbarg = {
 171                 .rootfs = rootfs,
 172                 .target = target,
 173                 .options = options,
 174         };
 175
 176         /*
 177          * find the filesystem type with brute force:
 178          * first we check with /etc/filesystems, in case the modules
 179          * are auto-loaded and fall back to the supported kernel fs
 180          */
 181         char *fsfile[] = {
 182                 "/etc/filesystems",
 183                 "/proc/filesystems",
 184         };
 185
 186         for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
 187
 188                 int ret;
 189
 190                 if (access(fsfile[i], F_OK))
 191                         continue;
 192
 193                 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
 194                 if (ret < 0) {
 195                         ERROR("failed to parse '%s'", fsfile[i]);
 196                         return -1;
 197                 }
 198
 199                 if (ret)
 200                         return 0;
 201         }
 202
 203         ERROR("failed to determine fs type for '%s'", rootfs);
 204         return -1;
 205 }
 206
 207 static int do_mkfs(const char *path, const char *fstype)
 208 {
 209         pid_t pid;
 210
 211         if ((pid = fork()) < 0) {
 212                 ERROR("error forking");
 213                 return -1;
 214         }
 215         if (pid > 0)
 216                 return wait_for_pid(pid);
 217
 218         // If the file is not a block device, we don't want mkfs to ask
 219         // us about whether to proceed.
 220         close(0);
 221         close(1);
 222         close(2);
 223         open("/dev/zero", O_RDONLY);
 224         open("/dev/null", O_RDWR);
 225         open("/dev/null", O_RDWR);
 226         execlp("mkfs", "mkfs", "-t", fstype, path, NULL);
 227         exit(1);
 228 }
 229
 230 static char *linkderef(char *path, char *dest)
 231 {
 232         struct stat sbuf;
 233         ssize_t ret;
 234
 235         ret = stat(path, &sbuf);
 236         if (ret < 0)
 237                 return NULL;
 238         if (!S_ISLNK(sbuf.st_mode))
 239                 return path;
 240         ret = readlink(path, dest, MAXPATHLEN);
 241         if (ret < 0) {
 242                 SYSERROR("error reading link %s", path);
 243                 return NULL;
 244         } else if (ret >= MAXPATHLEN) {
 245                 ERROR("link in %s too long", path);
 246                 return NULL;
 247         }
 248         dest[ret] = '\0';
 249         return dest;
 250 }
 251
 252 /*
 253  * Given a bdev (presumably blockdev-based), detect the fstype
 254  * by trying mounting (in a private mntns) it.
 255  * @bdev: bdev to investigate
 256  * @type: preallocated char* in which to write the fstype
 257  * @len: length of passed in char*
 258  * Returns length of fstype, of -1 on error
 259  */
 260 static int detect_fs(struct bdev *bdev, char *type, int len)
 261 {
 262         int  p[2], ret;
 263         size_t linelen;
 264         pid_t pid;
 265         FILE *f;
 266         char *sp1, *sp2, *sp3, *line = NULL;
 267         char *srcdev;
 268
 269         if (!bdev || !bdev->src || !bdev->dest)
 270                 return -1;
 271
 272         srcdev = bdev->src;
 273         if (strcmp(bdev->type, "loop") == 0)
 274                 srcdev = bdev->src + 5;
 275
 276         ret = pipe(p);
 277         if (ret < 0)
 278                 return -1;
 279         if ((pid = fork()) < 0)
 280                 return -1;
 281         if (pid > 0) {
 282                 int status;
 283                 close(p[1]);
 284                 memset(type, 0, len);
 285                 ret = read(p[0], type, len-1);
 286                 close(p[0]);
 287                 if (ret < 0) {
 288                         SYSERROR("error reading from pipe");
 289                         wait(&status);
 290                         return -1;
 291                 } else if (ret == 0) {
 292                         ERROR("child exited early - fstype not found");
 293                         wait(&status);
 294                         return -1;
 295                 }
 296                 wait(&status);
 297                 type[len-1] = '\0';
 298                 INFO("detected fstype %s for %s", type, srcdev);
 299                 return ret;
 300         }
 301
 302         if (unshare(CLONE_NEWNS) < 0)
 303                 exit(1);
 304
 305         ret = mount_unknown_fs(srcdev, bdev->dest, bdev->mntopts);
 306         if (ret < 0) {
 307                 ERROR("failed mounting %s onto %s to detect fstype", srcdev, bdev->dest);
 308                 exit(1);
 309         }
 310         // if symlink, get the real dev name
 311         char devpath[MAXPATHLEN];
 312         char *l = linkderef(srcdev, devpath);
 313         if (!l)
 314                 exit(1);
 315         f = fopen("/proc/self/mounts", "r");
 316         if (!f)
 317                 exit(1);
 318         while (getline(&line, &linelen, f) != -1) {
 319                 sp1 = index(line, ' ');
 320                 if (!sp1)
 321                         exit(1);
 322                 *sp1 = '\0';
 323                 if (strcmp(line, l))
 324                         continue;
 325                 sp2 = index(sp1+1, ' ');
 326                 if (!sp2)
 327                         exit(1);
 328                 *sp2 = '\0';
 329                 sp3 = index(sp2+1, ' ');
 330                 if (!sp3)
 331                         exit(1);
 332                 *sp3 = '\0';
 333                 sp2++;
 334                 if (write(p[1], sp2, strlen(sp2)) != strlen(sp2))
 335                         exit(1);
 336                 exit(0);
 337         }
 338         exit(1);
 339 }
 340
/* Pairs a backing-store type name with the operations table for it. */
struct bdev_type {
        const char *name;
        const struct bdev_ops *ops;
};
 345
 346 static int is_dir(const char *path)
 347 {
 348         struct stat statbuf;
 349         int ret = stat(path, &statbuf);
 350         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 351                 return 1;
 352         return 0;
 353 }
 354
 355 static int dir_detect(const char *path)
 356 {
 357         if (strncmp(path, "dir:", 4) == 0)
 358                 return 1; // take their word for it
 359         if (is_dir(path))
 360                 return 1;
 361         return 0;
 362 }
 363
 364 //
 365 // XXXXXXX plain directory bind mount ops
 366 //
 367 static int dir_mount(struct bdev *bdev)
 368 {
 369         unsigned long mntflags;
 370         char *mntdata;
 371         int ret;
 372
 373         if (strcmp(bdev->type, "dir"))
 374                 return -22;
 375         if (!bdev->src || !bdev->dest)
 376                 return -22;
 377
 378         if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
 379                 free(mntdata);
 380                 return -22;
 381         }
 382
 383         ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
 384         free(mntdata);
 385         return ret;
 386 }
 387
 388 static int dir_umount(struct bdev *bdev)
 389 {
 390         if (strcmp(bdev->type, "dir"))
 391                 return -22;
 392         if (!bdev->src || !bdev->dest)
 393                 return -22;
 394         return umount(bdev->dest);
 395 }
 396
 397 /* the bulk of this needs to become a common helper */
 398 static char *dir_new_path(char *src, const char *oldname, const char *name,
 399                         const char *oldpath, const char *lxcpath)
 400 {
 401         char *ret, *p, *p2;
 402         int l1, l2, nlen;
 403
 404         nlen = strlen(src) + 1;
 405         l1 = strlen(oldpath);
 406         p = src;
 407         /* if src starts with oldpath, look for oldname only after
 408          * that path */
 409         if (strncmp(src, oldpath, l1) == 0) {
 410                 p += l1;
 411                 nlen += (strlen(lxcpath) - l1);
 412         }
 413         l2 = strlen(oldname);
 414         while ((p = strstr(p, oldname)) != NULL) {
 415                 p += l2;
 416                 nlen += strlen(name) - l2;
 417         }
 418
 419         ret = malloc(nlen);
 420         if (!ret)
 421                 return NULL;
 422
 423         p = ret;
 424         if (strncmp(src, oldpath, l1) == 0) {
 425                 p += sprintf(p, "%s", lxcpath);
 426                 src += l1;
 427         }
 428
 429         while ((p2 = strstr(src, oldname)) != NULL) {
 430                 strncpy(p, src, p2-src); // copy text up to oldname
 431                 p += p2-src; // move target pointer (p)
 432                 p += sprintf(p, "%s", name); // print new name in place of oldname
 433                 src = p2 + l2;  // move src to end of oldname
 434         }
 435         sprintf(p, "%s", src);  // copy the rest of src
 436         return ret;
 437 }
 438
 439 /*
 440  * for a simple directory bind mount, we substitute the old container
 441  * name and paths for the new
 442  */
 443 static int dir_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
 444                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
 445                 uint64_t newsize)
 446 {
 447         int len, ret;
 448
 449         if (snap) {
 450                 ERROR("directories cannot be snapshotted.  Try overlayfs.");
 451                 return -1;
 452         }
 453
 454         if (!orig->dest || !orig->src)
 455                 return -1;
 456
 457         len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
 458         new->src = malloc(len);
 459         if (!new->src)
 460                 return -1;
 461         ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
 462         if (ret < 0 || ret >= len)
 463                 return -1;
 464         if ((new->dest = strdup(new->src)) == NULL)
 465                 return -1;
 466
 467         return 0;
 468 }
 469
 470 static int dir_destroy(struct bdev *orig)
 471 {
 472         if (lxc_rmdir_onedev(orig->src) < 0)
 473                 return -1;
 474         return 0;
 475 }
 476
 477 static int dir_create(struct bdev *bdev, const char *dest, const char *n,
 478                         struct bdev_specs *specs)
 479 {
 480         bdev->src = strdup(dest);
 481         bdev->dest = strdup(dest);
 482         if (!bdev->src || !bdev->dest) {
 483                 ERROR("Out of memory");
 484                 return -1;
 485         }
 486
 487         if (mkdir_p(bdev->src, 0755) < 0) {
 488                 ERROR("Error creating %s", bdev->src);
 489                 return -1;
 490         }
 491         if (mkdir_p(bdev->dest, 0755) < 0) {
 492                 ERROR("Error creating %s", bdev->dest);
 493                 return -1;
 494         }
 495
 496         return 0;
 497 }
 498
 499 static const struct bdev_ops dir_ops = {
 500         .detect = &dir_detect,
 501         .mount = &dir_mount,
 502         .umount = &dir_umount,
 503         .clone_paths = &dir_clonepaths,
 504         .destroy = &dir_destroy,
 505         .create = &dir_create,
 506         .can_snapshot = false,
 507 };
 508
 509
 510 //
 511 // XXXXXXX zfs ops
 512 // There are two ways we could do this.  We could always specify the
 513 // 'zfs device' (i.e. tank/lxc lxc/container) as rootfs.  But instead
 514 // (at least right now) we have lxc-create specify $lxcpath/$lxcname/rootfs
 515 // as the mountpoint, so that it is always mounted.
 516 //
 517 // That means 'mount' is really never needed and could be noop, but for the
 518 // sake of flexibility let's always bind-mount.
 519 //
 520
 521 static int zfs_list_entry(const char *path, char *output, size_t inlen)
 522 {
 523         struct lxc_popen_FILE *f;
 524         int found=0;
 525
 526         f = lxc_popen("zfs list 2> /dev/null");
 527         if (f == NULL) {
 528                 SYSERROR("popen failed");
 529                 return 0;
 530         }
 531         while (fgets(output, inlen, f->f)) {
 532                 if (strstr(output, path)) {
 533                         found = 1;
 534                         break;
 535                 }
 536         }
 537         (void) lxc_pclose(f);
 538
 539         return found;
 540 }
 541
 542 static int zfs_detect(const char *path)
 543 {
 544         char *output = malloc(LXC_LOG_BUFFER_SIZE);
 545         int found;
 546
 547         if (!output) {
 548                 ERROR("out of memory");
 549                 return 0;
 550         }
 551         found = zfs_list_entry(path, output, LXC_LOG_BUFFER_SIZE);
 552         free(output);
 553         return found;
 554 }
 555
 556 static int zfs_mount(struct bdev *bdev)
 557 {
 558         unsigned long mntflags;
 559         char *mntdata;
 560         int ret;
 561
 562         if (strcmp(bdev->type, "zfs"))
 563                 return -22;
 564         if (!bdev->src || !bdev->dest)
 565                 return -22;
 566
 567         if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
 568                 free(mntdata);
 569                 return -22;
 570         }
 571
 572         ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
 573         free(mntdata);
 574         return ret;
 575 }
 576
 577 static int zfs_umount(struct bdev *bdev)
 578 {
 579         if (strcmp(bdev->type, "zfs"))
 580                 return -22;
 581         if (!bdev->src || !bdev->dest)
 582                 return -22;
 583         return umount(bdev->dest);
 584 }
 585
 586 static int zfs_clone(const char *opath, const char *npath, const char *oname,
 587                         const char *nname, const char *lxcpath, int snapshot)
 588 {
 589         // use the 'zfs list | grep opath' entry to get the zfsroot
 590         char output[MAXPATHLEN], option[MAXPATHLEN], *p;
 591         const char *zfsroot = output;
 592         int ret;
 593         pid_t pid;
 594
 595         if (zfs_list_entry(opath, output, MAXPATHLEN)) {
 596                 // zfsroot is output up to ' '
 597                 if ((p = index(output, ' ')) == NULL)
 598                         return -1;
 599                 *p = '\0';
 600                 if ((p = strrchr(output, '/')) == NULL)
 601                         return -1;
 602                 *p = '\0';
 603         } else
 604                 zfsroot = lxc_global_config_value("lxc.bdev.zfs.root");
 605
 606         ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s/%s/rootfs",
 607                 lxcpath, nname);
 608         if (ret < 0  || ret >= MAXPATHLEN)
 609                 return -1;
 610
 611         // zfs create -omountpoint=$lxcpath/$lxcname $zfsroot/$nname
 612         if (!snapshot) {
 613                 if ((pid = fork()) < 0)
 614                         return -1;
 615                 if (!pid) {
 616                         char dev[MAXPATHLEN];
 617
 618                         ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, nname);
 619                         if (ret < 0  || ret >= MAXPATHLEN)
 620                                 exit(1);
 621                         execlp("zfs", "zfs", "create", option, dev, NULL);
 622                         exit(1);
 623                 }
 624                 return wait_for_pid(pid);
 625         } else {
 626                 // if snapshot, do
 627                 // 'zfs snapshot zfsroot/oname@nname
 628                 // zfs clone zfsroot/oname@nname zfsroot/nname
 629                 char path1[MAXPATHLEN], path2[MAXPATHLEN];
 630
 631                 ret = snprintf(path1, MAXPATHLEN, "%s/%s@%s", zfsroot,
 632                         oname, nname);
 633                 if (ret < 0 || ret >= MAXPATHLEN)
 634                         return -1;
 635                 (void) snprintf(path2, MAXPATHLEN, "%s/%s", zfsroot, nname);
 636
 637                 // if the snapshot exists, delete it
 638                 if ((pid = fork()) < 0)
 639                         return -1;
 640                 if (!pid) {
 641                         execlp("zfs", "zfs", "destroy", path1, NULL);
 642                         exit(1);
 643                 }
 644                 // it probably doesn't exist so destroy probably will fail.
 645                 (void) wait_for_pid(pid);
 646
 647                 // run first (snapshot) command
 648                 if ((pid = fork()) < 0)
 649                         return -1;
 650                 if (!pid) {
 651                         execlp("zfs", "zfs", "snapshot", path1, NULL);
 652                         exit(1);
 653                 }
 654                 if (wait_for_pid(pid) < 0)
 655                         return -1;
 656
 657                 // run second (clone) command
 658                 if ((pid = fork()) < 0)
 659                         return -1;
 660                 if (!pid) {
 661                         execlp("zfs", "zfs", "clone", option, path1, path2, NULL);
 662                         exit(1);
 663                 }
 664                 return wait_for_pid(pid);
 665         }
 666 }
 667
 668 static int zfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
 669                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
 670                 uint64_t newsize)
 671 {
 672         int len, ret;
 673
 674         if (!orig->src || !orig->dest)
 675                 return -1;
 676
 677         if (snap && strcmp(orig->type, "zfs")) {
 678                 ERROR("zfs snapshot from %s backing store is not supported",
 679                         orig->type);
 680                 return -1;
 681         }
 682
 683         len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
 684         new->src = malloc(len);
 685         if (!new->src)
 686                 return -1;
 687         ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
 688         if (ret < 0 || ret >= len)
 689                 return -1;
 690         if ((new->dest = strdup(new->src)) == NULL)
 691                 return -1;
 692
 693         return zfs_clone(orig->src, new->src, oldname, cname, lxcpath, snap);
 694 }
 695
 696 /*
 697  * TODO: detect whether this was a clone, and if so then also delete the
 698  * snapshot it was based on, so that we don't hold the original
 699  * container busy.
 700  */
 701 static int zfs_destroy(struct bdev *orig)
 702 {
 703         pid_t pid;
 704         char output[MAXPATHLEN], *p;
 705
 706         if ((pid = fork()) < 0)
 707                 return -1;
 708         if (pid)
 709                 return wait_for_pid(pid);
 710
 711         if (!zfs_list_entry(orig->src, output, MAXPATHLEN)) {
 712                 ERROR("Error: zfs entry for %s not found", orig->src);
 713                 return -1;
 714         }
 715
 716         // zfs mount is output up to ' '
 717         if ((p = index(output, ' ')) == NULL)
 718                 return -1;
 719         *p = '\0';
 720
 721         execlp("zfs", "zfs", "destroy", output, NULL);
 722         exit(1);
 723 }
 724
 725 static int zfs_create(struct bdev *bdev, const char *dest, const char *n,
 726                         struct bdev_specs *specs)
 727 {
 728         const char *zfsroot;
 729         char option[MAXPATHLEN];
 730         int ret;
 731         pid_t pid;
 732
 733         if (!specs || !specs->zfs.zfsroot)
 734                 zfsroot = lxc_global_config_value("lxc.bdev.zfs.root");
 735         else
 736                 zfsroot = specs->zfs.zfsroot;
 737
 738         if (!(bdev->dest = strdup(dest))) {
 739                 ERROR("No mount target specified or out of memory");
 740                 return -1;
 741         }
 742         if (!(bdev->src = strdup(bdev->dest))) {
 743                 ERROR("out of memory");
 744                 return -1;
 745         }
 746
 747         ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s", bdev->dest);
 748         if (ret < 0  || ret >= MAXPATHLEN)
 749                 return -1;
 750         if ((pid = fork()) < 0)
 751                 return -1;
 752         if (pid)
 753                 return wait_for_pid(pid);
 754
 755         char dev[MAXPATHLEN];
 756         ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, n);
 757         if (ret < 0  || ret >= MAXPATHLEN)
 758                 exit(1);
 759         execlp("zfs", "zfs", "create", option, dev, NULL);
 760         exit(1);
 761 }
 762
 763 static const struct bdev_ops zfs_ops = {
 764         .detect = &zfs_detect,
 765         .mount = &zfs_mount,
 766         .umount = &zfs_umount,
 767         .clone_paths = &zfs_clonepaths,
 768         .destroy = &zfs_destroy,
 769         .create = &zfs_create,
 770         .can_snapshot = true,
 771 };
 772
 773 //
 774 // LVM ops
 775 //
 776
 777 /*
 778  * Look at /sys/dev/block/maj:min/dm/uuid.  If it contains the hardcoded LVM
 779  * prefix "LVM-", then this is an lvm2 LV
 780  */
 781 static int lvm_detect(const char *path)
 782 {
 783         char devp[MAXPATHLEN], buf[4];
 784         FILE *fout;
 785         int ret;
 786         struct stat statbuf;
 787
 788         if (strncmp(path, "lvm:", 4) == 0)
 789                 return 1; // take their word for it
 790
 791         ret = stat(path, &statbuf);
 792         if (ret != 0)
 793                 return 0;
 794         if (!S_ISBLK(statbuf.st_mode))
 795                 return 0;
 796
 797         ret = snprintf(devp, MAXPATHLEN, "/sys/dev/block/%d:%d/dm/uuid",
 798                         major(statbuf.st_rdev), minor(statbuf.st_rdev));
 799         if (ret < 0 || ret >= MAXPATHLEN) {
 800                 ERROR("lvm uuid pathname too long");
 801                 return 0;
 802         }
 803         fout = fopen(devp, "r");
 804         if (!fout)
 805                 return 0;
 806         ret = fread(buf, 1, 4, fout);
 807         fclose(fout);
 808         if (ret != 4 || strncmp(buf, "LVM-", 4) != 0)
 809                 return 0;
 810         return 1;
 811 }
 812
 813 static int lvm_mount(struct bdev *bdev)
 814 {
 815         if (strcmp(bdev->type, "lvm"))
 816                 return -22;
 817         if (!bdev->src || !bdev->dest)
 818                 return -22;
 819         /* if we might pass in data sometime, then we'll have to enrich
 820          * mount_unknown_fs */
 821         return mount_unknown_fs(bdev->src, bdev->dest, bdev->mntopts);
 822 }
 823
 824 static int lvm_umount(struct bdev *bdev)
 825 {
 826         if (strcmp(bdev->type, "lvm"))
 827                 return -22;
 828         if (!bdev->src || !bdev->dest)
 829                 return -22;
 830         return umount(bdev->dest);
 831 }
 832
 833 static int lvm_compare_lv_attr(const char *path, int pos, const char expected) {
 834         struct lxc_popen_FILE *f;
 835         int ret, len, status, start=0;
 836         char *cmd, output[12];
 837         const char *lvscmd = "lvs --unbuffered --noheadings -o lv_attr %s 2>/dev/null";
 838
 839         len = strlen(lvscmd) + strlen(path) - 1;
 840         cmd = alloca(len);
 841
 842         ret = snprintf(cmd, len, lvscmd, path);
 843         if (ret < 0 || ret >= len)
 844                 return -1;
 845
 846         f = lxc_popen(cmd);
 847
 848         if (f == NULL) {
 849                 SYSERROR("popen failed");
 850                 return -1;
 851         }
 852
 853         ret = fgets(output, 12, f->f) == NULL;
 854
 855         status = lxc_pclose(f);
 856
 857         if (ret || WEXITSTATUS(status))
 858                 // Assume either vg or lvs do not exist, default
 859                 // comparison to false.
 860                 return 0;
 861
 862         len = strlen(output);
 863         while(start < len && output[start] == ' ') start++;
 864
 865         if (start + pos < len && output[start + pos] == expected)
 866                 return 1;
 867
 868         return 0;
 869 }
 870
 871 static int lvm_is_thin_volume(const char *path)
 872 {
 873         return lvm_compare_lv_attr(path, 6, 't');
 874 }
 875
 876 static int lvm_is_thin_pool(const char *path)
 877 {
 878         return lvm_compare_lv_attr(path, 0, 't');
 879 }
 880
 881 /*
 882  * path must be '/dev/$vg/$lv', $vg must be an existing VG, and $lv must not
 883  * yet exist.  This function will attempt to create /dev/$vg/$lv of size
 884  * $size. If thinpool is specified, we'll check for it's existence and if it's
 885  * a valid thin pool, and if so, we'll create the requested lv from that thin
 886  * pool.
 887  */
 888 static int do_lvm_create(const char *path, uint64_t size, const char *thinpool)
 889 {
 890         int ret, pid, len;
 891         char sz[24], *pathdup, *vg, *lv, *tp = NULL;
 892
 893         if ((pid = fork()) < 0) {
 894                 SYSERROR("failed fork");
 895                 return -1;
 896         }
 897         if (pid > 0)
 898                 return wait_for_pid(pid);
 899
 900         // specify bytes to lvcreate
 901         ret = snprintf(sz, 24, "%"PRIu64"b", size);
 902         if (ret < 0 || ret >= 24)
 903                 exit(1);
 904
 905         pathdup = strdup(path);
 906         if (!pathdup)
 907                 exit(1);
 908
 909         lv = strrchr(pathdup, '/');
 910         if (!lv)
 911                 exit(1);
 912
 913         *lv = '\0';
 914         lv++;
 915
 916         vg = strrchr(pathdup, '/');
 917         if (!vg)
 918                 exit(1);
 919         vg++;
 920
 921         if (thinpool) {
 922                 len = strlen(pathdup) + strlen(thinpool) + 2;
 923                 tp = alloca(len);
 924
 925                 ret = snprintf(tp, len, "%s/%s", pathdup, thinpool);
 926                 if (ret < 0 || ret >= len)
 927                         exit(1);
 928
 929                 ret = lvm_is_thin_pool(tp);
 930                 INFO("got %d for thin pool at path: %s", ret, tp);
 931                 if (ret < 0)
 932                         exit(1);
 933
 934                 if (!ret)
 935                         tp = NULL;
 936         }
 937
 938         if (!tp)
 939             execlp("lvcreate", "lvcreate", "-L", sz, vg, "-n", lv, (char *)NULL);
 940         else
 941             execlp("lvcreate", "lvcreate", "--thinpool", tp, "-V", sz, vg, "-n", lv, (char *)NULL);
 942
 943         SYSERROR("execlp");
 944         exit(1);
 945 }
 946
 947 static int lvm_snapshot(const char *orig, const char *path, uint64_t size)
 948 {
 949         int ret, pid;
 950         char sz[24], *pathdup, *lv;
 951
 952         if ((pid = fork()) < 0) {
 953                 SYSERROR("failed fork");
 954                 return -1;
 955         }
 956         if (pid > 0)
 957                 return wait_for_pid(pid);
 958
 959         // specify bytes to lvcreate
 960         ret = snprintf(sz, 24, "%"PRIu64"b", size);
 961         if (ret < 0 || ret >= 24)
 962                 exit(1);
 963
 964         pathdup = strdup(path);
 965         if (!pathdup)
 966                 exit(1);
 967         lv = strrchr(pathdup, '/');
 968         if (!lv) {
 969                 free(pathdup);
 970                 exit(1);
 971         }
 972         *lv = '\0';
 973         lv++;
 974
 975         // check if the original lv is backed by a thin pool, in which case we
 976         // cannot specify a size that's different from the original size.
 977         ret = lvm_is_thin_volume(orig);
 978         if (ret == -1) {
 979                 free(pathdup);
 980                 return -1;
 981         }
 982
 983         if (!ret) {
 984                 ret = execlp("lvcreate", "lvcreate", "-s", "-L", sz, "-n", lv, orig, (char *)NULL);
 985         } else {
 986                 ret = execlp("lvcreate", "lvcreate", "-s", "-n", lv, orig, (char *)NULL);
 987         }
 988
 989         free(pathdup);
 990         exit(1);
 991 }
 992
 993 // this will return 1 for physical disks, qemu-nbd, loop, etc
 994 // right now only lvm is a block device
 995 static int is_blktype(struct bdev *b)
 996 {
 997         if (strcmp(b->type, "lvm") == 0)
 998                 return 1;
 999         return 0;
1000 }
1001
1002 static int lvm_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1003                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1004                 uint64_t newsize)
1005 {
1006         char fstype[100];
1007         uint64_t size = newsize;
1008         int len, ret;
1009
1010         if (!orig->src || !orig->dest)
1011                 return -1;
1012
1013         if (strcmp(orig->type, "lvm")) {
1014                 const char *vg;
1015
1016                 if (snap) {
1017                         ERROR("LVM snapshot from %s backing store is not supported",
1018                                 orig->type);
1019                         return -1;
1020                 }
1021                 vg = lxc_global_config_value("lxc.bdev.lvm.vg");
1022                 len = strlen("/dev/") + strlen(vg) + strlen(cname) + 2;
1023                 if ((new->src = malloc(len)) == NULL)
1024                         return -1;
1025                 ret = snprintf(new->src, len, "/dev/%s/%s", vg, cname);
1026                 if (ret < 0 || ret >= len)
1027                         return -1;
1028         } else {
1029                 new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath);
1030                 if (!new->src)
1031                         return -1;
1032         }
1033
1034         if (orig->mntopts) {
1035                 new->mntopts = strdup(orig->mntopts);
1036                 if (!new->mntopts)
1037                         return -1;
1038         }
1039
1040         len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1041         new->dest = malloc(len);
1042         if (!new->dest)
1043                 return -1;
1044         ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
1045         if (ret < 0 || ret >= len)
1046                 return -1;
1047         if (mkdir_p(new->dest, 0755) < 0)
1048                 return -1;
1049
1050         if (is_blktype(orig)) {
1051                 if (!newsize && blk_getsize(orig, &size) < 0) {
1052                         ERROR("Error getting size of %s", orig->src);
1053                         return -1;
1054                 }
1055                 if (detect_fs(orig, fstype, 100) < 0) {
1056                         INFO("could not find fstype for %s, using ext3", orig->src);
1057                         return -1;
1058                 }
1059         } else {
1060                 sprintf(fstype, "ext3");
1061                 if (!newsize)
1062                         size = DEFAULT_FS_SIZE;
1063         }
1064
1065         if (snap) {
1066                 if (lvm_snapshot(orig->src, new->src, size) < 0) {
1067                         ERROR("could not create %s snapshot of %s", new->src, orig->src);
1068                         return -1;
1069                 }
1070         } else {
1071                 if (do_lvm_create(new->src, size, lxc_global_config_value("lxc.bdev.lvm.thin_pool")) < 0) {
1072                         ERROR("Error creating new lvm blockdev");
1073                         return -1;
1074                 }
1075                 if (do_mkfs(new->src, fstype) < 0) {
1076                         ERROR("Error creating filesystem type %s on %s", fstype,
1077                                 new->src);
1078                         return -1;
1079                 }
1080         }
1081
1082         return 0;
1083 }
1084
1085 static int lvm_destroy(struct bdev *orig)
1086 {
1087         pid_t pid;
1088
1089         if ((pid = fork()) < 0)
1090                 return -1;
1091         if (!pid) {
1092                 execlp("lvremove", "lvremove", "-f", orig->src, NULL);
1093                 exit(1);
1094         }
1095         return wait_for_pid(pid);
1096 }
1097
1098 static int lvm_create(struct bdev *bdev, const char *dest, const char *n,
1099                         struct bdev_specs *specs)
1100 {
1101         const char *vg, *thinpool, *fstype, *lv = n;
1102         uint64_t sz;
1103         int ret, len;
1104
1105         if (!specs)
1106                 return -1;
1107
1108         vg = specs->lvm.vg;
1109         if (!vg)
1110                 vg = lxc_global_config_value("lxc.bdev.lvm.vg");
1111
1112         thinpool = specs->lvm.thinpool;
1113         if (!thinpool)
1114                 thinpool = lxc_global_config_value("lxc.bdev.lvm.thin_pool");
1115
1116         /* /dev/$vg/$lv */
1117         if (specs->lvm.lv)
1118                 lv = specs->lvm.lv;
1119
1120         len = strlen(vg) + strlen(lv) + 7;
1121         bdev->src = malloc(len);
1122         if (!bdev->src)
1123                 return -1;
1124
1125         ret = snprintf(bdev->src, len, "/dev/%s/%s", vg, lv);
1126         if (ret < 0 || ret >= len)
1127                 return -1;
1128
1129         // fssize is in bytes.
1130         sz = specs->fssize;
1131         if (!sz)
1132                 sz = DEFAULT_FS_SIZE;
1133
1134         if (do_lvm_create(bdev->src, sz, thinpool) < 0) {
1135                 ERROR("Error creating new lvm blockdev %s size %"PRIu64" bytes", bdev->src, sz);
1136                 return -1;
1137         }
1138
1139         fstype = specs->fstype;
1140         if (!fstype)
1141                 fstype = DEFAULT_FSTYPE;
1142         if (do_mkfs(bdev->src, fstype) < 0) {
1143                 ERROR("Error creating filesystem type %s on %s", fstype,
1144                         bdev->src);
1145                 return -1;
1146         }
1147         if (!(bdev->dest = strdup(dest)))
1148                 return -1;
1149
1150         if (mkdir_p(bdev->dest, 0755) < 0) {
1151                 ERROR("Error creating %s", bdev->dest);
1152                 return -1;
1153         }
1154
1155         return 0;
1156 }
1157
1158 static const struct bdev_ops lvm_ops = {
1159         .detect = &lvm_detect,
1160         .mount = &lvm_mount,
1161         .umount = &lvm_umount,
1162         .clone_paths = &lvm_clonepaths,
1163         .destroy = &lvm_destroy,
1164         .create = &lvm_create,
1165         .can_snapshot = true,
1166 };
1167
1168 //
1169 // btrfs ops
1170 //
1171
/*
 * Local copies of the btrfs ioctl argument layouts used below, so that the
 * file does not depend on the btrfs headers being installed.
 */

/* One entry of the space information array. */
struct btrfs_ioctl_space_info {
	unsigned long long flags;
	unsigned long long total_bytes;
	unsigned long long used_bytes;
};

/*
 * Argument of BTRFS_IOC_SPACE_INFO: a fixed header followed by a variable
 * number of entries.  The trailing array is a C99 flexible array member, so
 * sizeof() covers the header only.
 */
struct btrfs_ioctl_space_args {
	unsigned long long space_slots;
	unsigned long long total_spaces;
	struct btrfs_ioctl_space_info spaces[];
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, unsigned long long)
#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
                                    struct btrfs_ioctl_space_args)
1188
1189 static bool is_btrfs_fs(const char *path)
1190 {
1191         int fd, ret;
1192         struct btrfs_ioctl_space_args sargs;
1193
1194         // make sure this is a btrfs filesystem
1195         fd = open(path, O_RDONLY);
1196         if (fd < 0)
1197                 return false;
1198         sargs.space_slots = 0;
1199         sargs.total_spaces = 0;
1200         ret = ioctl(fd, BTRFS_IOC_SPACE_INFO, &sargs);
1201         close(fd);
1202         if (ret < 0)
1203                 return false;
1204
1205         return true;
1206 }
1207
/*
 * Detect whether @path is a btrfs subvolume.
 *
 * The path must be on a btrfs filesystem (see is_btrfs_fs()) and must be a
 * directory whose inode number is 256, which this code uses as the mark of a
 * subvolume.
 *
 * Returns 1 if so, 0 otherwise (including when stat() fails).
 */
static int btrfs_detect(const char *path)
{
	struct stat sb;

	if (!is_btrfs_fs(path))
		return 0;

	if (stat(path, &sb) < 0)
		return 0;

	return (sb.st_ino == 256 && S_ISDIR(sb.st_mode)) ? 1 : 0;
}
1226
1227 static int btrfs_mount(struct bdev *bdev)
1228 {
1229         unsigned long mntflags;
1230         char *mntdata;
1231         int ret;
1232
1233         if (strcmp(bdev->type, "btrfs"))
1234                 return -22;
1235         if (!bdev->src || !bdev->dest)
1236                 return -22;
1237
1238         if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
1239                 free(mntdata);
1240                 return -22;
1241         }
1242
1243         ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
1244         free(mntdata);
1245         return ret;
1246 }
1247
1248 static int btrfs_umount(struct bdev *bdev)
1249 {
1250         if (strcmp(bdev->type, "btrfs"))
1251                 return -22;
1252         if (!bdev->src || !bdev->dest)
1253                 return -22;
1254         return umount(bdev->dest);
1255 }
1256
/*
 * Local copies of the btrfs subvolume/snapshot ioctl definitions used by the
 * functions below.
 */

/* Size limits of the name buffers (excluding the terminating NUL). */
#define BTRFS_SUBVOL_NAME_MAX 4039
#define BTRFS_PATH_NAME_MAX 4087

#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)

/* Argument of BTRFS_IOC_SUBVOL_CREATE and BTRFS_IOC_SNAP_DESTROY. */
struct btrfs_ioctl_vol_args {
	signed long long fd;
	char name[BTRFS_PATH_NAME_MAX + 1];
};

/* Argument of the *_V2 subvolume/snapshot creation ioctls. */
struct btrfs_ioctl_vol_args_v2 {
	signed long long fd;
	unsigned long long transid;
	unsigned long long flags;
	union {
		struct {
			unsigned long long size;
			/* declared as void * so that struct
			 * btrfs_qgroup_inherit need not be defined here */
			void *qgroup_inherit;
		};
		unsigned long long unused[4];
	};
	char name[BTRFS_SUBVOL_NAME_MAX + 1];
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
                                   struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
                                   struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
                                   struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
                                   struct btrfs_ioctl_vol_args_v2)
1291
1292 static int btrfs_subvolume_create(const char *path)
1293 {
1294         int ret, fd = -1;
1295         struct btrfs_ioctl_vol_args  args;
1296         char *p, *newfull = strdup(path);
1297
1298         if (!newfull) {
1299                 ERROR("Error: out of memory");
1300                 return -1;
1301         }
1302
1303         p = strrchr(newfull, '/');
1304         if (!p) {
1305                 ERROR("bad path: %s", path);
1306                 free(newfull);
1307                 return -1;
1308         }
1309         *p = '\0';
1310
1311         fd = open(newfull, O_RDONLY);
1312         if (fd < 0) {
1313                 ERROR("Error opening %s", newfull);
1314                 free(newfull);
1315                 return -1;
1316         }
1317
1318         memset(&args, 0, sizeof(args));
1319         strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1320         args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1321         ret = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, &args);
1322         INFO("btrfs: snapshot create ioctl returned %d", ret);
1323
1324         free(newfull);
1325         close(fd);
1326         return ret;
1327 }
1328
1329 static int btrfs_snapshot(const char *orig, const char *new)
1330 {
1331         int fd = -1, fddst = -1, ret = -1;
1332         struct btrfs_ioctl_vol_args_v2  args;
1333         char *newdir, *newname, *newfull = NULL;
1334
1335         newfull = strdup(new);
1336         if (!newfull) {
1337                 ERROR("Error: out of memory");
1338                 goto out;
1339         }
1340         // make sure the directory doesn't already exist
1341         if (rmdir(newfull) < 0 && errno != -ENOENT) {
1342                 SYSERROR("Error removing empty new rootfs");
1343                 goto out;
1344         }
1345         newname = basename(newfull);
1346         newdir = dirname(newfull);
1347         fd = open(orig, O_RDONLY);
1348         if (fd < 0) {
1349                 SYSERROR("Error opening original rootfs %s", orig);
1350                 goto out;
1351         }
1352         fddst = open(newdir, O_RDONLY);
1353         if (fddst < 0) {
1354                 SYSERROR("Error opening new container dir %s", newdir);
1355                 goto out;
1356         }
1357
1358         memset(&args, 0, sizeof(args));
1359         args.fd = fd;
1360         strncpy(args.name, newname, BTRFS_SUBVOL_NAME_MAX);
1361         args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1362         ret = ioctl(fddst, BTRFS_IOC_SNAP_CREATE_V2, &args);
1363         INFO("btrfs: snapshot create ioctl returned %d", ret);
1364
1365 out:
1366         if (fddst != -1)
1367                 close(fddst);
1368         if (fd != -1)
1369                 close(fd);
1370         if (newfull)
1371                 free(newfull);
1372         return ret;
1373 }
1374
1375 static int btrfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1376                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1377                 uint64_t newsize)
1378 {
1379         if (!orig->dest || !orig->src)
1380                 return -1;
1381
1382         if (strcmp(orig->type, "btrfs")) {
1383                 int len, ret;
1384                 if (snap) {
1385                         ERROR("btrfs snapshot from %s backing store is not supported",
1386                                 orig->type);
1387                         return -1;
1388                 }
1389                 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1390                 new->src = malloc(len);
1391                 if (!new->src)
1392                         return -1;
1393                 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
1394                 if (ret < 0 || ret >= len)
1395                         return -1;
1396         } else {
1397                 // in case rootfs is in custom path, reuse it
1398                 if ((new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath)) == NULL)
1399                         return -1;
1400
1401         }
1402
1403         if ((new->dest = strdup(new->src)) == NULL)
1404                 return -1;
1405
1406         if (orig->mntopts && (new->mntopts = strdup(orig->mntopts)) == NULL)
1407                 return -1;
1408
1409         if (snap)
1410                 return btrfs_snapshot(orig->dest, new->dest);
1411
1412         if (rmdir(new->dest) < 0 && errno != -ENOENT) {
1413                 SYSERROR("removing %s", new->dest);
1414                 return -1;
1415         }
1416
1417         return btrfs_subvolume_create(new->dest);
1418 }
1419
1420 static int btrfs_destroy(struct bdev *orig)
1421 {
1422         int ret, fd = -1;
1423         struct btrfs_ioctl_vol_args  args;
1424         char *path = orig->src;
1425         char *p, *newfull = strdup(path);
1426
1427         if (!newfull) {
1428                 ERROR("Error: out of memory");
1429                 return -1;
1430         }
1431
1432         p = strrchr(newfull, '/');
1433         if (!p) {
1434                 ERROR("bad path: %s", path);
1435                 free(newfull);
1436                 return -1;
1437         }
1438         *p = '\0';
1439
1440         fd = open(newfull, O_RDONLY);
1441         if (fd < 0) {
1442                 ERROR("Error opening %s", newfull);
1443                 free(newfull);
1444                 return -1;
1445         }
1446
1447         memset(&args, 0, sizeof(args));
1448         strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1449         args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1450         ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args);
1451         INFO("btrfs: snapshot create ioctl returned %d", ret);
1452
1453         free(newfull);
1454         close(fd);
1455         return ret;
1456 }
1457
1458 static int btrfs_create(struct bdev *bdev, const char *dest, const char *n,
1459                         struct bdev_specs *specs)
1460 {
1461         bdev->src = strdup(dest);
1462         bdev->dest = strdup(dest);
1463         if (!bdev->src || !bdev->dest)
1464                 return -1;
1465         return btrfs_subvolume_create(bdev->dest);
1466 }
1467
1468 static const struct bdev_ops btrfs_ops = {
1469         .detect = &btrfs_detect,
1470         .mount = &btrfs_mount,
1471         .umount = &btrfs_umount,
1472         .clone_paths = &btrfs_clonepaths,
1473         .destroy = &btrfs_destroy,
1474         .create = &btrfs_create,
1475         .can_snapshot = true,
1476 };
1477
1478 //
1479 // loopback dev ops
1480 //
/*
 * Detect a loopback store by its "loop:" source prefix.
 *
 * Returns 1 if @path starts with "loop:", 0 otherwise.
 */
static int loop_detect(const char *path)
{
	static const char prefix[] = "loop:";

	return strncmp(path, prefix, sizeof(prefix) - 1) == 0 ? 1 : 0;
}
1487
/*
 * Find an unused loop device under /dev.
 *
 * Every /dev entry whose name starts with "loop" is opened read-write and
 * queried with LOOP_GET_STATUS64; a device counts as free when that ioctl
 * fails with ENXIO.
 *
 * @retfd: on success, receives an open read-write descriptor of the device
 * @namep: on success, receives "/dev/<name>"; the caller must provide at
 *         least 100 bytes
 *
 * Returns 0 on success, -1 if /dev cannot be opened or no free device is
 * found.
 */
static int find_free_loopdev(int *retfd, char *namep)
{
	struct dirent *direntp;
	struct loop_info64 lo;
	DIR *dir;
	int fd = -1;

	dir = opendir("/dev");
	if (!dir) {
		SYSERROR("Error opening /dev");
		return -1;
	}
	// readdir() instead of the deprecated readdir_r(): the stream is
	// private to this call and no fixed-size dirent buffer is needed
	while ((direntp = readdir(dir)) != NULL) {
		if (strncmp(direntp->d_name, "loop", 4) != 0)
			continue;
		fd = openat(dirfd(dir), direntp->d_name, O_RDWR);
		if (fd < 0)
			continue;
		// anything but ENXIO means the device is in use or unusable
		if (ioctl(fd, LOOP_GET_STATUS64, &lo) == 0 || errno != ENXIO) {
			close(fd);
			fd = -1;
			continue;
		}
		// We can use this fd
		snprintf(namep, 100, "/dev/%s", direntp->d_name);
		break;
	}
	closedir(dir);
	if (fd == -1) {
		ERROR("No loop device found");
		return -1;
	}

	*retfd = fd;
	return 0;
}
1527
1528 static int loop_mount(struct bdev *bdev)
1529 {
1530         int lfd, ffd = -1, ret = -1;
1531         struct loop_info64 lo;
1532         char loname[100];
1533
1534         if (strcmp(bdev->type, "loop"))
1535                 return -22;
1536         if (!bdev->src || !bdev->dest)
1537                 return -22;
1538         if (find_free_loopdev(&lfd, loname) < 0)
1539                 return -22;
1540
1541         ffd = open(bdev->src + 5, O_RDWR);
1542         if (ffd < 0) {
1543                 SYSERROR("Error opening backing file %s", bdev->src);
1544                 goto out;
1545         }
1546
1547         if (ioctl(lfd, LOOP_SET_FD, ffd) < 0) {
1548                 SYSERROR("Error attaching backing file to loop dev");
1549                 goto out;
1550         }
1551         memset(&lo, 0, sizeof(lo));
1552         lo.lo_flags = LO_FLAGS_AUTOCLEAR;
1553         if (ioctl(lfd, LOOP_SET_STATUS64, &lo) < 0) {
1554                 SYSERROR("Error setting autoclear on loop dev");
1555                 goto out;
1556         }
1557
1558         ret = mount_unknown_fs(loname, bdev->dest, bdev->mntopts);
1559         if (ret < 0)
1560                 ERROR("Error mounting %s", bdev->src);
1561         else
1562                 bdev->lofd = lfd;
1563
1564 out:
1565         if (ffd > -1)
1566                 close(ffd);
1567         if (ret < 0) {
1568                 close(lfd);
1569                 bdev->lofd = -1;
1570         }
1571         return ret;
1572 }
1573
1574 static int loop_umount(struct bdev *bdev)
1575 {
1576         int ret;
1577
1578         if (strcmp(bdev->type, "loop"))
1579                 return -22;
1580         if (!bdev->src || !bdev->dest)
1581                 return -22;
1582         ret = umount(bdev->dest);
1583         if (bdev->lofd >= 0) {
1584                 close(bdev->lofd);
1585                 bdev->lofd = -1;
1586         }
1587         return ret;
1588 }
1589
/*
 * Create the backing file of a loopback store and put a filesystem on it.
 *
 * The file is created at @path with owner read/write permission, extended by
 * seeking to offset @size and writing a single byte there, and then handed to
 * do_mkfs() with @fstype.
 *
 * Returns 0 on success, -1 on failure.  A file that was already created is
 * left in place on failure.
 */
static int do_loop_create(const char *path, uint64_t size, const char *fstype)
{
	int loopfile;

	// create the new loopback file.
	loopfile = creat(path, S_IRUSR|S_IWUSR);
	if (loopfile < 0)
		return -1;

	if (lseek(loopfile, size, SEEK_SET) < 0) {
		SYSERROR("Error seeking to set new loop file size");
		close(loopfile);
		return -1;
	}

	if (write(loopfile, "1", 1) != 1) {
		SYSERROR("Error creating new loop file");
		close(loopfile);
		return -1;
	}

	if (close(loopfile) < 0) {
		SYSERROR("Error closing new loop file");
		return -1;
	}

	// create an fs in the loopback file
	if (do_mkfs(path, fstype) < 0) {
		ERROR("Error creating filesystem type %s on %s", fstype,
			path);
		return -1;
	}

	return 0;
}
1622
1623 /*
1624  * No idea what the original blockdev will be called, but the copy will be
1625  * called $lxcpath/$lxcname/rootdev
1626  */
1627 static int loop_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1628                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1629                 uint64_t newsize)
1630 {
1631         char fstype[100];
1632         uint64_t size = newsize;
1633         int len, ret;
1634         char *srcdev;
1635
1636         if (snap) {
1637                 ERROR("loop devices cannot be snapshotted.");
1638                 return -1;
1639         }
1640
1641         if (!orig->dest || !orig->src)
1642                 return -1;
1643
1644         len = strlen(lxcpath) + strlen(cname) + strlen("rootdev") + 3;
1645         srcdev = alloca(len);
1646         ret = snprintf(srcdev, len, "%s/%s/rootdev", lxcpath, cname);
1647         if (ret < 0 || ret >= len)
1648                 return -1;
1649
1650         new->src = malloc(len + 5);
1651         if (!new->src)
1652                 return -1;
1653         ret = snprintf(new->src, len + 5, "loop:%s", srcdev);
1654         if (ret < 0 || ret >= len + 5)
1655                 return -1;
1656
1657         new->dest = malloc(len);
1658         if (!new->dest)
1659                 return -1;
1660         ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
1661         if (ret < 0 || ret >= len)
1662                 return -1;
1663
1664         // it's tempting to say: if orig->src == loopback and !newsize, then
1665         // copy the loopback file.  However, we'd have to make sure to
1666         // correctly keep holes!  So punt for now.
1667
1668         if (is_blktype(orig)) {
1669                 if (!newsize && blk_getsize(orig, &size) < 0) {
1670                         ERROR("Error getting size of %s", orig->src);
1671                         return -1;
1672                 }
1673                 if (detect_fs(orig, fstype, 100) < 0) {
1674                         INFO("could not find fstype for %s, using %s", orig->src,
1675                                 DEFAULT_FSTYPE);
1676                         return -1;
1677                 }
1678         } else {
1679                 sprintf(fstype, "%s", DEFAULT_FSTYPE);
1680                 if (!newsize)
1681                         size = DEFAULT_FS_SIZE;
1682         }
1683         return do_loop_create(srcdev, size, fstype);
1684 }
1685
1686 static int loop_create(struct bdev *bdev, const char *dest, const char *n,
1687                         struct bdev_specs *specs)
1688 {
1689         const char *fstype;
1690         uint64_t sz;
1691         int ret, len;
1692         char *srcdev;
1693
1694         if (!specs)
1695                 return -1;
1696
1697         // dest is passed in as $lxcpath / $lxcname / rootfs
1698         // srcdev will be:      $lxcpath / $lxcname / rootdev
1699         // src will be 'loop:$srcdev'
1700         len = strlen(dest) + 2;
1701         srcdev = alloca(len);
1702
1703         ret = snprintf(srcdev, len, "%s", dest);
1704         if (ret < 0 || ret >= len)
1705                 return -1;
1706         sprintf(srcdev + len - 4, "dev");
1707
1708         bdev->src = malloc(len + 5);
1709         if (!bdev->src)
1710                 return -1;
1711         ret = snprintf(bdev->src, len + 5, "loop:%s", srcdev);
1712         if (ret < 0 || ret >= len + 5)
1713                 return -1;
1714
1715         sz = specs->fssize;
1716         if (!sz)
1717                 sz = DEFAULT_FS_SIZE;
1718
1719         fstype = specs->fstype;
1720         if (!fstype)
1721                 fstype = DEFAULT_FSTYPE;
1722
1723         if (!(bdev->dest = strdup(dest)))
1724                 return -1;
1725
1726         if (mkdir_p(bdev->dest, 0755) < 0) {
1727                 ERROR("Error creating %s", bdev->dest);
1728                 return -1;
1729         }
1730
1731         return do_loop_create(srcdev, sz, fstype);
1732 }
1733
1734 static int loop_destroy(struct bdev *orig)
1735 {
1736         return unlink(orig->src + 5);
1737 }
1738
1739 static const struct bdev_ops loop_ops = {
1740         .detect = &loop_detect,
1741         .mount = &loop_mount,
1742         .umount = &loop_umount,
1743         .clone_paths = &loop_clonepaths,
1744         .destroy = &loop_destroy,
1745         .create = &loop_create,
1746         .can_snapshot = false,
1747 };
1748
1749 //
1750 // overlayfs ops
1751 //
1752
/*
 * Detect an overlayfs store by its "overlayfs:" source prefix.  Nothing
 * beyond the prefix is verified.
 *
 * Returns 1 if @path starts with "overlayfs:", 0 otherwise.
 */
static int overlayfs_detect(const char *path)
{
	static const char prefix[] = "overlayfs:";

	// take their word for it
	return strncmp(path, prefix, sizeof(prefix) - 1) == 0 ? 1 : 0;
}
1759
//
// overlayfs mount, umount and clone ops
//
1763 static int overlayfs_mount(struct bdev *bdev)
1764 {
1765         char *options, *dup, *lower, *upper;
1766         int len;
1767         unsigned long mntflags;
1768         char *mntdata;
1769         int ret;
1770
1771         if (strcmp(bdev->type, "overlayfs"))
1772                 return -22;
1773         if (!bdev->src || !bdev->dest)
1774                 return -22;
1775
1776         //  separately mount it first
1777         //  mount -t overlayfs -oupperdir=${upper},lowerdir=${lower} lower dest
1778         dup = alloca(strlen(bdev->src)+1);
1779         strcpy(dup, bdev->src);
1780         if (!(lower = index(dup, ':')))
1781                 return -22;
1782         if (!(upper = index(++lower, ':')))
1783                 return -22;
1784         *upper = '\0';
1785         upper++;
1786
1787         if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
1788                 free(mntdata);
1789                 return -22;
1790         }
1791
1792         // TODO We should check whether bdev->src is a blockdev, and if so
1793         // but for now, only support overlays of a basic directory
1794
1795         if (mntdata) {
1796                 len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=,") + strlen(mntdata) + 1;
1797                 options = alloca(len);
1798                 ret = snprintf(options, len, "upperdir=%s,lowerdir=%s,%s", upper, lower, mntdata);
1799         }
1800         else {
1801                 len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=") + 1;
1802                 options = alloca(len);
1803                 ret = snprintf(options, len, "upperdir=%s,lowerdir=%s", upper, lower);
1804         }
1805         if (ret < 0 || ret >= len) {
1806                 free(mntdata);
1807                 return -1;
1808         }
1809
1810         ret = mount(lower, bdev->dest, "overlayfs", MS_MGC_VAL | mntflags, options);
1811         if (ret < 0)
1812                 SYSERROR("overlayfs: error mounting %s onto %s options %s",
1813                         lower, bdev->dest, options);
1814         else
1815                 INFO("overlayfs: mounted %s onto %s options %s",
1816                         lower, bdev->dest, options);
1817         return ret;
1818 }
1819
1820 static int overlayfs_umount(struct bdev *bdev)
1821 {
1822         if (strcmp(bdev->type, "overlayfs"))
1823                 return -22;
1824         if (!bdev->src || !bdev->dest)
1825                 return -22;
1826         return umount(bdev->dest);
1827 }
1828
1829 static int overlayfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1830                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1831                 uint64_t newsize)
1832 {
1833         if (!snap) {
1834                 ERROR("overlayfs is only for snapshot clones");
1835                 return -22;
1836         }
1837
1838         if (!orig->src || !orig->dest)
1839                 return -1;
1840
1841         new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
1842         if (!new->dest)
1843                 return -1;
1844         if (mkdir_p(new->dest, 0755) < 0)
1845                 return -1;
1846
1847         if (strcmp(orig->type, "dir") == 0) {
1848                 char *delta;
1849                 int ret, len;
1850
1851                 // if we have /var/lib/lxc/c2/rootfs, then delta will be
1852                 //            /var/lib/lxc/c2/delta0
1853                 delta = strdup(new->dest);
1854                 if (!delta) {
1855                         return -1;
1856                 }
1857                 if (strlen(delta) < 6) {
1858                         free(delta);
1859                         return -22;
1860                 }
1861                 strcpy(&delta[strlen(delta)-6], "delta0");
1862                 if ((ret = mkdir(delta, 0755)) < 0) {
1863                         SYSERROR("error: mkdir %s", delta);
1864                         free(delta);
1865                         return -1;
1866                 }
1867
1868                 // the src will be 'overlayfs:lowerdir:upperdir'
1869                 len = strlen(delta) + strlen(orig->src) + 12;
1870                 new->src = malloc(len);
1871                 if (!new->src) {
1872                         free(delta);
1873                         return -ENOMEM;
1874                 }
1875                 ret = snprintf(new->src, len, "overlayfs:%s:%s", orig->src, delta);
1876                 free(delta);
1877                 if (ret < 0 || ret >= len)
1878                         return -ENOMEM;
1879         } else if (strcmp(orig->type, "overlayfs") == 0) {
1880                 // What exactly do we want to do here?
1881                 // I think we want to use the original lowerdir, with a
1882                 // private delta which is originally rsynced from the
1883                 // original delta
1884                 char *osrc, *odelta, *nsrc, *ndelta;
1885                 int len, ret;
1886                 if (!(osrc = strdup(orig->src)))
1887                         return -22;
1888                 nsrc = index(osrc, ':') + 1;
1889                 if (nsrc != osrc + 10 || (odelta = index(nsrc, ':')) == NULL) {
1890                         free(osrc);
1891                         return -22;
1892                 }
1893                 *odelta = '\0';
1894                 odelta++;
1895                 ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath);
1896                 if (!ndelta) {
1897                         free(osrc);
1898                         return -ENOMEM;
1899                 }
1900                 if (do_rsync(odelta, ndelta) < 0) {
1901                         free(osrc);
1902                         free(ndelta);
1903                         ERROR("copying overlayfs delta");
1904                         return -1;
1905                 }
1906                 len = strlen(nsrc) + strlen(ndelta) + 12;
1907                 new->src = malloc(len);
1908                 if (!new->src) {
1909                         free(osrc);
1910                         free(ndelta);
1911                         return -ENOMEM;
1912                 }
1913                 ret = snprintf(new->src, len, "overlayfs:%s:%s", nsrc, ndelta);
1914                 free(osrc);
1915                 free(ndelta);
1916                 if (ret < 0 || ret >= len)
1917                         return -ENOMEM;
1918         } else {
1919                 ERROR("overlayfs clone of %s container is not yet supported",
1920                         orig->type);
1921                 // Note, supporting this will require overlayfs_mount supporting
1922                 // mounting of the underlay.  No big deal, just needs to be done.
1923                 return -1;
1924         }
1925
1926         return 0;
1927 }
1928
1929 static int overlayfs_destroy(struct bdev *orig)
1930 {
1931         char *upper;
1932
1933         if (strncmp(orig->src, "overlayfs:", 10) != 0)
1934                 return -22;
1935         upper = index(orig->src + 10, ':');
1936         if (!upper)
1937                 return -22;
1938         upper++;
1939         return lxc_rmdir_onedev(upper);
1940 }
1941
1942 /*
1943  * to say 'lxc-create -t ubuntu -n o1 -B overlayfs' means you want
1944  * $lxcpath/$lxcname/rootfs to have the created container, while all
1945  * changes after starting the container are written to
1946  * $lxcpath/$lxcname/delta0
1947  */
1948 static int overlayfs_create(struct bdev *bdev, const char *dest, const char *n,
1949                         struct bdev_specs *specs)
1950 {
1951         char *delta;
1952         int ret, len = strlen(dest), newlen;
1953
1954         if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0)
1955                 return -1;
1956
1957         if (!(bdev->dest = strdup(dest))) {
1958                 ERROR("Out of memory");
1959                 return -1;
1960         }
1961
1962         delta = alloca(strlen(dest)+1);
1963         strcpy(delta, dest);
1964         strcpy(delta+len-6, "delta0");
1965
1966         if (mkdir_p(delta, 0755) < 0) {
1967                 ERROR("Error creating %s", delta);
1968                 return -1;
1969         }
1970
1971         /* overlayfs:lower:upper */
1972         newlen = (2 * len) + strlen("overlayfs:") + 2;
1973         bdev->src = malloc(newlen);
1974         if (!bdev->src) {
1975                 ERROR("Out of memory");
1976                 return -1;
1977         }
1978         ret = snprintf(bdev->src, newlen, "overlayfs:%s:%s", dest, delta);
1979         if (ret < 0 || ret >= newlen)
1980                 return -1;
1981
1982         if (mkdir_p(bdev->dest, 0755) < 0) {
1983                 ERROR("Error creating %s", bdev->dest);
1984                 return -1;
1985         }
1986
1987         return 0;
1988 }
1989
1990 static const struct bdev_ops overlayfs_ops = {
1991         .detect = &overlayfs_detect,
1992         .mount = &overlayfs_mount,
1993         .umount = &overlayfs_umount,
1994         .clone_paths = &overlayfs_clonepaths,
1995         .destroy = &overlayfs_destroy,
1996         .create = &overlayfs_create,
1997         .can_snapshot = true,
1998 };
1999
2000 static const struct bdev_type bdevs[] = {
2001         {.name = "zfs", .ops = &zfs_ops,},
2002         {.name = "lvm", .ops = &lvm_ops,},
2003         {.name = "btrfs", .ops = &btrfs_ops,},
2004         {.name = "dir", .ops = &dir_ops,},
2005         {.name = "overlayfs", .ops = &overlayfs_ops,},
2006         {.name = "loop", .ops = &loop_ops,},
2007 };
2008
2009 static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type);
2010
2011 void bdev_put(struct bdev *bdev)
2012 {
2013         if (bdev->mntopts)
2014                 free(bdev->mntopts);
2015         if (bdev->src)
2016                 free(bdev->src);
2017         if (bdev->dest)
2018                 free(bdev->dest);
2019         free(bdev);
2020 }
2021
2022 struct bdev *bdev_get(const char *type)
2023 {
2024         int i;
2025         struct bdev *bdev;
2026
2027         for (i=0; i<numbdevs; i++) {
2028                 if (strcmp(bdevs[i].name, type) == 0)
2029                         break;
2030         }
2031         if (i == numbdevs)
2032                 return NULL;
2033         bdev = malloc(sizeof(struct bdev));
2034         if (!bdev)
2035                 return NULL;
2036         memset(bdev, 0, sizeof(struct bdev));
2037         bdev->ops = bdevs[i].ops;
2038         bdev->type = bdevs[i].name;
2039         return bdev;
2040 }
2041
2042 struct bdev *bdev_init(const char *src, const char *dst, const char *mntopts)
2043 {
2044         int i;
2045         struct bdev *bdev;
2046
2047         for (i=0; i<numbdevs; i++) {
2048                 int r;
2049                 r = bdevs[i].ops->detect(src);
2050                 if (r)
2051                         break;
2052         }
2053
2054         if (i == numbdevs)
2055                 return NULL;
2056         bdev = malloc(sizeof(struct bdev));
2057         if (!bdev)
2058                 return NULL;
2059         memset(bdev, 0, sizeof(struct bdev));
2060         bdev->ops = bdevs[i].ops;
2061         bdev->type = bdevs[i].name;
2062         if (mntopts)
2063                 bdev->mntopts = strdup(mntopts);
2064         if (src)
2065                 bdev->src = strdup(src);
2066         if (dst)
2067                 bdev->dest = strdup(dst);
2068
2069         return bdev;
2070 }
2071
/*
 * Argument bundle for rsync_rootfs(): the bdev to copy from (orig) and
 * the bdev to copy to (new).
 */
struct rsync_data {
	struct bdev *orig, *new;
};
2076
2077 static int rsync_rootfs(struct rsync_data *data)
2078 {
2079         struct bdev *orig = data->orig,
2080                     *new = data->new;
2081
2082         if (unshare(CLONE_NEWNS) < 0) {
2083                 SYSERROR("unshare CLONE_NEWNS");
2084                 return -1;
2085         }
2086
2087         // If not a snapshot, copy the fs.
2088         if (orig->ops->mount(orig) < 0) {
2089                 ERROR("failed mounting %s onto %s", orig->src, orig->dest);
2090                 return -1;
2091         }
2092         if (new->ops->mount(new) < 0) {
2093                 ERROR("failed mounting %s onto %s", new->src, new->dest);
2094                 return -1;
2095         }
2096         if (setgid(0) < 0) {
2097                 ERROR("Failed to setgid to 0");
2098                 return -1;
2099         }
2100         if (setgroups(0, NULL) < 0)
2101                 WARN("Failed to clear groups");
2102         if (setuid(0) < 0) {
2103                 ERROR("Failed to setuid to 0");
2104                 return -1;
2105         }
2106         if (do_rsync(orig->dest, new->dest) < 0) {
2107                 ERROR("rsyncing %s to %s", orig->src, new->src);
2108                 return -1;
2109         }
2110
2111         return 0;
2112 }
2113
/*
 * rsync_rootfs_wrapper: adapter that lets rsync_rootfs() be used as a
 * callback taking a void * argument (bdev_copy() hands it to
 * userns_exec_1()).  @data must point to a struct rsync_data.
 */
static int rsync_rootfs_wrapper(void *data)
{
	return rsync_rootfs((struct rsync_data *)data);
}
2119 /*
2120  * If we're not snaphotting, then bdev_copy becomes a simple case of mount
2121  * the original, mount the new, and rsync the contents.
2122  */
2123 struct bdev *bdev_copy(struct lxc_container *c0, const char *cname,
2124                         const char *lxcpath, const char *bdevtype,
2125                         int flags, const char *bdevdata, uint64_t newsize,
2126                         int *needs_rdep)
2127 {
2128         struct bdev *orig, *new;
2129         pid_t pid;
2130         int ret;
2131         bool snap = flags & LXC_CLONE_SNAPSHOT;
2132         bool maybe_snap = flags & LXC_CLONE_MAYBE_SNAPSHOT;
2133         bool keepbdevtype = flags & LXC_CLONE_KEEPBDEVTYPE;
2134         const char *src = c0->lxc_conf->rootfs.path;
2135         const char *oldname = c0->name;
2136         const char *oldpath = c0->config_path;
2137         struct rsync_data data;
2138
2139         /* if the container name doesn't show up in the rootfs path, then
2140          * we don't know how to come up with a new name
2141          */
2142         if (strstr(src, oldname) == NULL) {
2143                 ERROR("original rootfs path %s doesn't include container name %s",
2144                         src, oldname);
2145                 return NULL;
2146         }
2147
2148         orig = bdev_init(src, NULL, NULL);
2149         if (!orig) {
2150                 ERROR("failed to detect blockdev type for %s", src);
2151                 return NULL;
2152         }
2153
2154         if (!orig->dest) {
2155                 int ret;
2156                 orig->dest = malloc(MAXPATHLEN);
2157                 if (!orig->dest) {
2158                         ERROR("out of memory");
2159                         bdev_put(orig);
2160                         return NULL;
2161                 }
2162                 ret = snprintf(orig->dest, MAXPATHLEN, "%s/%s/rootfs", oldpath, oldname);
2163                 if (ret < 0 || ret >= MAXPATHLEN) {
2164                         ERROR("rootfs path too long");
2165                         bdev_put(orig);
2166                         return NULL;
2167                 }
2168         }
2169
2170         /* check for privilege */
2171         if (am_unpriv()) {
2172                 if (snap && !maybe_snap) {
2173                         ERROR("Unprivileged users cannot snapshot");
2174                         bdev_put(orig);
2175                         return NULL;
2176                 }
2177                 if (bdevtype && strcmp(bdevtype, "dir") != 0) {
2178                         ERROR("Unprivileged users can only make dir copy-clones");
2179                         bdev_put(orig);
2180                         return NULL;
2181                 }
2182                 if (strcmp(orig->type, "dir") != 0) {
2183                         ERROR("Unprivileged users can only make dir copy-clones");
2184                         bdev_put(orig);
2185                         return NULL;
2186                 }
2187         }
2188
2189
2190         /*
2191          * special case for snapshot - if caller requested maybe_snapshot and
2192          * keepbdevtype and backing store is directory, then proceed with a copy
2193          * clone rather than returning error
2194          */
2195         if (maybe_snap && keepbdevtype && !bdevtype && !orig->ops->can_snapshot)
2196                 snap = false;
2197
2198         /*
2199          * If newtype is NULL and snapshot is set, then use overlayfs
2200          */
2201         if (!bdevtype && !keepbdevtype && snap && strcmp(orig->type , "dir") == 0)
2202                 bdevtype = "overlayfs";
2203
2204         *needs_rdep = 0;
2205         if (bdevtype && strcmp(orig->type, "dir") == 0 &&
2206                         strcmp(bdevtype, "overlayfs") == 0)
2207                 *needs_rdep = 1;
2208
2209         new = bdev_get(bdevtype ? bdevtype : orig->type);
2210         if (!new) {
2211                 ERROR("no such block device type: %s", bdevtype ? bdevtype : orig->type);
2212                 bdev_put(orig);
2213                 return NULL;
2214         }
2215
2216         if (new->ops->clone_paths(orig, new, oldname, cname, oldpath, lxcpath, snap, newsize) < 0) {
2217                 ERROR("failed getting pathnames for cloned storage: %s", src);
2218                 bdev_put(orig);
2219                 bdev_put(new);
2220                 return NULL;
2221         }
2222         if (snap)
2223                 return new;
2224
2225         pid = fork();
2226         if (pid < 0) {
2227                 SYSERROR("fork");
2228                 bdev_put(orig);
2229                 bdev_put(new);
2230                 return NULL;
2231         }
2232
2233         if (pid > 0) {
2234                 int ret = wait_for_pid(pid);
2235                 bdev_put(orig);
2236                 if (ret < 0) {
2237                         bdev_put(new);
2238                         return NULL;
2239                 }
2240                 return new;
2241         }
2242
2243         data.orig = orig;
2244         data.new = new;
2245         if (am_unpriv())
2246                 ret = userns_exec_1(c0->lxc_conf, rsync_rootfs_wrapper, &data);
2247         else
2248                 ret = rsync_rootfs(&data);
2249
2250         exit(ret == 0 ? 0 : 1);
2251 }
2252
2253 static struct bdev * do_bdev_create(const char *dest, const char *type,
2254                         const char *cname, struct bdev_specs *specs)
2255 {
2256         struct bdev *bdev = bdev_get(type);
2257         if (!bdev) {
2258                 return NULL;
2259         }
2260
2261         if (bdev->ops->create(bdev, dest, cname, specs) < 0) {
2262                  bdev_put(bdev);
2263                  return NULL;
2264         }
2265
2266         return bdev;
2267 }
2268
/*
 * bdev_create:
 * Create a backing store for a container.
 * If successful, return a struct bdev *; the caller releases it with
 * bdev_put().
 * NOTE(review): earlier documentation described the returned bdev as
 * mounted and asked callers to run the umount op first; whether that holds
 * depends on the per-type create op - confirm before relying on it.
 * @dest: the mountpoint (i.e. /var/lib/lxc/$name/rootfs)
 * @type: the bdevtype (dir, btrfs, zfs, etc).  NULL means "dir", "best"
 *        tries btrfs, zfs, lvm and dir in that order, and a comma
 *        separated list (e.g. "lvm,dir") tries each entry in turn
 * @cname: the container name
 * @specs: details about the backing store to create, like fstype
 * Returns NULL if no backing store could be created.
 */
struct bdev *bdev_create(const char *dest, const char *type,
			const char *cname, struct bdev_specs *specs)
{
	char *best_options[] = {"btrfs", "zfs", "lvm", "dir", NULL};
	struct bdev *result;

	if (type == NULL)
		return do_bdev_create(dest, "dir", cname, specs);

	if (strcmp(type, "best") == 0) {
		char **opt;

		// try for the best backing store type, according to our
		// opinionated preferences
		for (opt = best_options; *opt != NULL; opt++) {
			result = do_bdev_create(dest, *opt, cname, specs);
			if (result)
				return result;
		}
		return NULL;  // 'dir' should never fail, so this shouldn't happen
	}

	// -B lvm,dir
	if (strchr(type, ',') != NULL) {
		size_t bytes = strlen(type) + 1;
		char *list = alloca(bytes);
		char *state, *tok;

		// strtok_r() modifies its input, so work on a private copy
		memcpy(list, type, bytes);
		tok = strtok_r(list, ",", &state);
		while (tok) {
			result = do_bdev_create(dest, tok, cname, specs);
			if (result)
				return result;
			tok = strtok_r(NULL, ",", &state);
		}
	}

	return do_bdev_create(dest, type, cname, specs);
}
2313
/*
 * overlayfs_getlower: cut the string @p off at its first ':' and return @p.
 *
 * The string is modified in place; when it contains no ':' it is left
 * untouched.
 */
char *overlayfs_getlower(char *p)
{
	char *cursor;

	for (cursor = p; *cursor != '\0'; cursor++) {
		if (*cursor == ':') {
			*cursor = '\0';
			break;
		}
	}
	return p;
}