src/lxc/bdev.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23
  24 /*
  25  * this is all just a first shot for experiment.  If we go this route, much
   26  * should change.  bdev should be a directory with per-bdev file.  Things which
  27  * I'm doing by calling out to userspace should sometimes be done through
  28  * libraries like liblvm2
  29  */
  30 #define _GNU_SOURCE
  31 #include <stdio.h>
  32 #include <unistd.h>
  33 #include <errno.h>
  34 #include <sched.h>
  35 #include <sys/mount.h>
  36 #include <sys/wait.h>
  37 #include <libgen.h>
  38 #include <linux/loop.h>
  39 #include <dirent.h>
  40 #include "lxc.h"
  41 #include "config.h"
  42 #include "conf.h"
  43 #include "bdev.h"
  44 #include "log.h"
  45 #include "error.h"
  46 #include "utils.h"
  47 #include "namespace.h"
  48 #include "parse.h"
  49 #include "utils.h"
  50
  51 #ifndef BLKGETSIZE64
  52 #define BLKGETSIZE64 _IOR(0x12,114,size_t)
  53 #endif
  54
  55 #ifndef LO_FLAGS_AUTOCLEAR
  56 #define LO_FLAGS_AUTOCLEAR 4
  57 #endif
  58
  59 lxc_log_define(bdev, lxc);
  60
  61 static int do_rsync(const char *src, const char *dest)
  62 {
  63         // call out to rsync
  64         pid_t pid;
  65         char *s;
  66         size_t l;
  67
  68         pid = fork();
  69         if (pid < 0)
  70                 return -1;
  71         if (pid > 0)
  72                 return wait_for_pid(pid);
  73         l = strlen(src) + 2;
  74         s = malloc(l);
  75         if (!s)
  76                 exit(1);
  77         strcpy(s, src);
  78         s[l-2] = '/';
  79         s[l-1] = '\0';
  80
  81         execlp("rsync", "rsync", "-a", s, dest, (char *)NULL);
  82         exit(1);
  83 }
  84
  85 /*
  86  * return block size of dev->src
  87  */
  88 static int blk_getsize(struct bdev *bdev, unsigned long *size)
  89 {
  90         int fd, ret;
  91         char *path = bdev->src;
  92
  93         if (strcmp(bdev->type, "loop") == 0)
  94                 path = bdev->src + 5;
  95
  96         fd = open(path, O_RDONLY);
  97         if (fd < 0)
  98                 return -1;
  99         ret = ioctl(fd, BLKGETSIZE64, size);
 100         close(fd);
 101         return ret;
 102 }
 103
 104 /*
 105  * These are copied from conf.c.  However as conf.c will be moved to using
 106  * the callback system, they can be pulled from there eventually, so we
 107  * don't need to pollute utils.c with these low level functions
 108  */
 109 static int find_fstype_cb(char* buffer, void *data)
 110 {
 111         struct cbarg {
 112                 const char *rootfs;
 113                 const char *target;
 114                 int mntopt;
 115         } *cbarg = data;
 116
 117         char *fstype;
 118
 119         /* we don't try 'nodev' entries */
 120         if (strstr(buffer, "nodev"))
 121                 return 0;
 122
 123         fstype = buffer;
 124         fstype += lxc_char_left_gc(fstype, strlen(fstype));
 125         fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';
 126
 127         DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
 128               cbarg->rootfs, cbarg->target, fstype);
 129
 130         if (mount(cbarg->rootfs, cbarg->target, fstype, cbarg->mntopt, NULL)) {
 131                 DEBUG("mount failed with error: %s", strerror(errno));
 132                 return 0;
 133         }
 134
 135         INFO("mounted '%s' on '%s', with fstype '%s'",
 136              cbarg->rootfs, cbarg->target, fstype);
 137
 138         return 1;
 139 }
 140
 141 static int mount_unknow_fs(const char *rootfs, const char *target, int mntopt)
 142 {
 143         int i;
 144
 145         struct cbarg {
 146                 const char *rootfs;
 147                 const char *target;
 148                 int mntopt;
 149         } cbarg = {
 150                 .rootfs = rootfs,
 151                 .target = target,
 152                 .mntopt = mntopt,
 153         };
 154
 155         /*
 156          * find the filesystem type with brute force:
 157          * first we check with /etc/filesystems, in case the modules
 158          * are auto-loaded and fall back to the supported kernel fs
 159          */
 160         char *fsfile[] = {
 161                 "/etc/filesystems",
 162                 "/proc/filesystems",
 163         };
 164
 165         for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
 166
 167                 int ret;
 168
 169                 if (access(fsfile[i], F_OK))
 170                         continue;
 171
 172                 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
 173                 if (ret < 0) {
 174                         ERROR("failed to parse '%s'", fsfile[i]);
 175                         return -1;
 176                 }
 177
 178                 if (ret)
 179                         return 0;
 180         }
 181
 182         ERROR("failed to determine fs type for '%s'", rootfs);
 183         return -1;
 184 }
 185
 186 static int do_mkfs(const char *path, const char *fstype)
 187 {
 188         pid_t pid;
 189
 190         if ((pid = fork()) < 0) {
 191                 ERROR("error forking");
 192                 return -1;
 193         }
 194         if (pid > 0)
 195                 return wait_for_pid(pid);
 196
 197         // If the file is not a block device, we don't want mkfs to ask
 198         // us about whether to proceed.
 199         close(0);
 200         close(1);
 201         close(2);
 202         open("/dev/zero", O_RDONLY);
 203         open("/dev/null", O_RDWR);
 204         open("/dev/null", O_RDWR);
 205         execlp("mkfs", "mkfs", "-t", fstype, path, NULL);
 206         exit(1);
 207 }
 208
 209 static char *linkderef(char *path, char *dest)
 210 {
 211         struct stat sbuf;
 212         ssize_t ret;
 213
 214         ret = stat(path, &sbuf);
 215         if (ret < 0)
 216                 return NULL;
 217         if (!S_ISLNK(sbuf.st_mode))
 218                 return path;
 219         ret = readlink(path, dest, MAXPATHLEN);
 220         if (ret < 0) {
 221                 SYSERROR("error reading link %s", path);
 222                 return NULL;
 223         } else if (ret >= MAXPATHLEN) {
 224                 ERROR("link in %s too long", path);
 225                 return NULL;
 226         }
 227         dest[ret] = '\0';
 228         return dest;
 229 }
 230
 231 /*
 232  * Given a bdev (presumably blockdev-based), detect the fstype
 233  * by trying mounting (in a private mntns) it.
 234  * @bdev: bdev to investigate
 235  * @type: preallocated char* in which to write the fstype
 236  * @len: length of passed in char*
 237  * Returns length of fstype, of -1 on error
 238  */
 239 static int detect_fs(struct bdev *bdev, char *type, int len)
 240 {
 241         int  p[2], ret;
 242         size_t linelen;
 243         pid_t pid;
 244         FILE *f;
 245         char *sp1, *sp2, *sp3, *line = NULL;
 246         char *srcdev;
 247
 248         if (!bdev || !bdev->src || !bdev->dest)
 249                 return -1;
 250
 251         srcdev = bdev->src;
 252         if (strcmp(bdev->type, "loop") == 0)
 253                 srcdev = bdev->src + 5;
 254
 255         if (pipe(p) < 0)
 256                 return -1;
 257         if ((pid = fork()) < 0)
 258                 return -1;
 259         if (pid > 0) {
 260                 int status;
 261                 close(p[1]);
 262                 memset(type, 0, len);
 263                 ret = read(p[0], type, len-1);
 264                 close(p[0]);
 265                 if (ret < 0) {
 266                         SYSERROR("error reading from pipe");
 267                         wait(&status);
 268                         return -1;
 269                 } else if (ret == 0) {
 270                         ERROR("child exited early - fstype not found");
 271                         wait(&status);
 272                         return -1;
 273                 }
 274                 wait(&status);
 275                 type[len-1] = '\0';
 276                 INFO("detected fstype %s for %s", type, srcdev);
 277                 return ret;
 278         }
 279
 280         if (unshare(CLONE_NEWNS) < 0)
 281                 exit(1);
 282
 283         ret = mount_unknow_fs(srcdev, bdev->dest, 0);
 284         if (ret < 0) {
 285                 ERROR("failed mounting %s onto %s to detect fstype", srcdev, bdev->dest);
 286                 exit(1);
 287         }
 288         // if symlink, get the real dev name
 289         char devpath[MAXPATHLEN];
 290         char *l = linkderef(srcdev, devpath);
 291         if (!l)
 292                 exit(1);
 293         f = fopen("/proc/self/mounts", "r");
 294         if (!f)
 295                 exit(1);
 296         while (getline(&line, &linelen, f) != -1) {
 297                 sp1 = index(line, ' ');
 298                 if (!sp1)
 299                         exit(1);
 300                 *sp1 = '\0';
 301                 if (strcmp(line, l))
 302                         continue;
 303                 sp2 = index(sp1+1, ' ');
 304                 if (!sp2)
 305                         exit(1);
 306                 *sp2 = '\0';
 307                 sp3 = index(sp2+1, ' ');
 308                 if (!sp3)
 309                         exit(1);
 310                 *sp3 = '\0';
 311                 sp2++;
 312                 if (write(p[1], sp2, strlen(sp2)) != strlen(sp2))
 313                         exit(1);
 314                 exit(0);
 315         }
 316         exit(1);
 317 }
 318
 319 struct bdev_type {
 320         char *name;
 321         struct bdev_ops *ops;
 322 };
 323
 324 static int is_dir(const char *path)
 325 {
 326         struct stat statbuf;
 327         int ret = stat(path, &statbuf);
 328         if (ret == 0 && S_ISDIR(statbuf.st_mode))
 329                 return 1;
 330         return 0;
 331 }
 332
 333 static int dir_detect(const char *path)
 334 {
 335         if (strncmp(path, "dir:", 4) == 0)
 336                 return 1; // take their word for it
 337         if (is_dir(path))
 338                 return 1;
 339         return 0;
 340 }
 341
 342 //
 343 // XXXXXXX plain directory bind mount ops
 344 //
 345 static int dir_mount(struct bdev *bdev)
 346 {
 347         if (strcmp(bdev->type, "dir"))
 348                 return -22;
 349         if (!bdev->src || !bdev->dest)
 350                 return -22;
 351         return mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC, NULL);
 352 }
 353
 354 static int dir_umount(struct bdev *bdev)
 355 {
 356         if (strcmp(bdev->type, "dir"))
 357                 return -22;
 358         if (!bdev->src || !bdev->dest)
 359                 return -22;
 360         return umount(bdev->dest);
 361 }
 362
 363 /* the bulk of this needs to become a common helper */
 364 static char *dir_new_path(char *src, const char *oldname, const char *name,
 365                         const char *oldpath, const char *lxcpath)
 366 {
 367         char *ret, *p, *p2;
 368         int l1, l2, nlen;
 369
 370         nlen = strlen(src) + 1;
 371         l1 = strlen(oldpath);
 372         p = src;
 373         /* if src starts with oldpath, look for oldname only after
 374          * that path */
 375         if (strncmp(src, oldpath, l1) == 0) {
 376                 p += l1;
 377                 nlen += (strlen(lxcpath) - l1);
 378         }
 379         l2 = strlen(oldname);
 380         while ((p = strstr(p, oldname)) != NULL) {
 381                 p += l2;
 382                 nlen += strlen(name) - l2;
 383         }
 384
 385         ret = malloc(nlen);
 386         if (!ret)
 387                 return NULL;
 388
 389         p = ret;
 390         if (strncmp(src, oldpath, l1) == 0) {
 391                 p += sprintf(p, "%s", lxcpath);
 392                 src += l1;
 393         }
 394
 395         while ((p2 = strstr(src, oldname)) != NULL) {
 396                 strncpy(p, src, p2-src); // copy text up to oldname
 397                 p += p2-src; // move target pointer (p)
 398                 p += sprintf(p, "%s", name); // print new name in place of oldname
 399                 src = p2 + l2;  // move src to end of oldname
 400         }
 401         sprintf(p, "%s", src);  // copy the rest of src
 402         return ret;
 403 }
 404
 405 /*
 406  * for a simple directory bind mount, we substitute the old container
 407  * name and paths for the new
 408  */
 409 static int dir_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
 410                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
 411                 unsigned long newsize)
 412 {
 413         int len, ret;
 414
 415         if (snap) {
 416                 ERROR("directories cannot be snapshotted.  Try overlayfs.");
 417                 return -1;
 418         }
 419
 420         if (!orig->dest || !orig->src)
 421                 return -1;
 422
 423         len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
 424         new->src = malloc(len);
 425         if (!new->src)
 426                 return -1;
 427         ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
 428         if (ret < 0 || ret >= len)
 429                 return -1;
 430         if ((new->dest = strdup(new->src)) == NULL)
 431                 return -1;
 432
 433         return 0;
 434 }
 435
 436 static int dir_destroy(struct bdev *orig)
 437 {
 438         if (!lxc_rmdir_onedev(orig->src))
 439                 return -1;
 440         return 0;
 441 }
 442
 443 static int dir_create(struct bdev *bdev, const char *dest, const char *n,
 444                         struct bdev_specs *specs)
 445 {
 446         bdev->src = strdup(dest);
 447         bdev->dest = strdup(dest);
 448         if (!bdev->src || !bdev->dest) {
 449                 ERROR("Out of memory");
 450                 return -1;
 451         }
 452
 453         if (mkdir_p(bdev->src, 0755) < 0) {
 454                 ERROR("Error creating %s\n", bdev->src);
 455                 return -1;
 456         }
 457         if (mkdir_p(bdev->dest, 0755) < 0) {
 458                 ERROR("Error creating %s\n", bdev->dest);
 459                 return -1;
 460         }
 461
 462         return 0;
 463 }
 464
 465 struct bdev_ops dir_ops = {
 466         .detect = &dir_detect,
 467         .mount = &dir_mount,
 468         .umount = &dir_umount,
 469         .clone_paths = &dir_clonepaths,
 470         .destroy = &dir_destroy,
 471         .create = &dir_create,
 472 };
 473
 474
 475 //
 476 // XXXXXXX zfs ops
 477 // There are two ways we could do this.  We could always specify the
 478 // 'zfs device' (i.e. tank/lxc lxc/container) as rootfs.  But instead
 479 // (at least right now) we have lxc-create specify $lxcpath/$lxcname/rootfs
 480 // as the mountpoint, so that it is always mounted.
 481 //
 482 // That means 'mount' is really never needed and could be noop, but for the
 483 // sake of flexibility let's always bind-mount.
 484 //
 485
 486 static int zfs_list_entry(const char *path, char *output, size_t inlen)
 487 {
 488         FILE *f;
 489         int found=0;
 490
 491         if ((f = popen("zfs list 2> /dev/null", "r")) == NULL) {
 492                 SYSERROR("popen failed");
 493                 return 0;
 494         }
 495         while (fgets(output, inlen, f)) {
 496                 if (strstr(output, path)) {
 497                         found = 1;
 498                         break;
 499                 }
 500         }
 501         (void) pclose(f);
 502
 503         return found;
 504 }
 505
 506 static int zfs_detect(const char *path)
 507 {
 508         char *output = malloc(LXC_LOG_BUFFER_SIZE);
 509         int found;
 510
 511         if (!output) {
 512                 ERROR("out of memory");
 513                 return 0;
 514         }
 515         found = zfs_list_entry(path, output, LXC_LOG_BUFFER_SIZE);
 516         free(output);
 517         return found;
 518 }
 519
 520 static int zfs_mount(struct bdev *bdev)
 521 {
 522         if (strcmp(bdev->type, "zfs"))
 523                 return -22;
 524         if (!bdev->src || !bdev->dest)
 525                 return -22;
 526         return mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC, NULL);
 527 }
 528
 529 static int zfs_umount(struct bdev *bdev)
 530 {
 531         if (strcmp(bdev->type, "zfs"))
 532                 return -22;
 533         if (!bdev->src || !bdev->dest)
 534                 return -22;
 535         return umount(bdev->dest);
 536 }
 537
 538 static int zfs_clone(const char *opath, const char *npath, const char *oname,
 539                         const char *nname, const char *lxcpath, int snapshot)
 540 {
 541         // use the 'zfs list | grep opath' entry to get the zfsroot
 542         char output[MAXPATHLEN], option[MAXPATHLEN], *p;
 543         const char *zfsroot = output;
 544         int ret;
 545         pid_t pid;
 546
 547         if (zfs_list_entry(opath, output, MAXPATHLEN)) {
 548                 // zfsroot is output up to ' '
 549                 if ((p = index(output, ' ')) == NULL)
 550                         return -1;
 551                 *p = '\0';
 552                 if ((p = strrchr(output, '/')) == NULL)
 553                         return -1;
 554                 *p = '\0';
 555         } else
 556                 zfsroot = default_zfs_root();
 557
 558         ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s/%s/rootfs",
 559                 lxcpath, nname);
 560         if (ret < 0  || ret >= MAXPATHLEN)
 561                 return -1;
 562
 563         // zfs create -omountpoint=$lxcpath/$lxcname $zfsroot/$nname
 564         if (!snapshot) {
 565                 if ((pid = fork()) < 0)
 566                         return -1;
 567                 if (!pid) {
 568                         char dev[MAXPATHLEN];
 569                         ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, nname);
 570                         if (ret < 0  || ret >= MAXPATHLEN)
 571                                 exit(1);
 572                         execlp("zfs", "zfs", "create", option, dev, NULL);
 573                         exit(1);
 574                 }
 575                 return wait_for_pid(pid);
 576         } else {
 577                 // if snapshot, do
 578                 // 'zfs snapshot zfsroot/oname@nname
 579                 // zfs clone zfsroot/oname@nname zfsroot/nname
 580                 char path1[MAXPATHLEN], path2[MAXPATHLEN];
 581
 582                 ret = snprintf(path1, MAXPATHLEN, "%s/%s@%s", zfsroot,
 583                         oname, nname);
 584                 if (ret < 0 || ret >= MAXPATHLEN)
 585                         return -1;
 586                 (void) snprintf(path2, MAXPATHLEN, "%s/%s", zfsroot, nname);
 587
 588                 // if the snapshot exists, delete it
 589                 if ((pid = fork()) < 0)
 590                         return -1;
 591                 if (!pid) {
 592                         execlp("zfs", "zfs", "destroy", path1, NULL);
 593                         exit(1);
 594                 }
 595                 // it probably doesn't exist so destroy probably will fail.
 596                 (void) wait_for_pid(pid);
 597
 598                 // run first (snapshot) command
 599                 if ((pid = fork()) < 0)
 600                         return -1;
 601                 if (!pid) {
 602                         execlp("zfs", "zfs", "snapshot", path1, NULL);
 603                         exit(1);
 604                 }
 605                 if (wait_for_pid(pid) < 0)
 606                         return -1;
 607
 608                 // run second (clone) command
 609                 if ((pid = fork()) < 0)
 610                         return -1;
 611                 if (!pid) {
 612                         execlp("zfs", "zfs", "clone", option, path1, path2, NULL);
 613                         exit(1);
 614                 }
 615                 return wait_for_pid(pid);
 616         }
 617 }
 618
 619 static int zfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
 620                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
 621                 unsigned long newsize)
 622 {
 623         int len, ret;
 624
 625         if (!orig->src || !orig->dest)
 626                 return -1;
 627
 628         if (snap && strcmp(orig->type, "zfs")) {
 629                 ERROR("zfs snapshot from %s backing store is not supported",
 630                         orig->type);
 631                 return -1;
 632         }
 633
 634         len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
 635         new->src = malloc(len);
 636         if (!new->src)
 637                 return -1;
 638         ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
 639         if (ret < 0 || ret >= len)
 640                 return -1;
 641         if ((new->dest = strdup(new->src)) == NULL)
 642                 return -1;
 643
 644         return zfs_clone(orig->src, new->src, oldname, cname, lxcpath, snap);
 645 }
 646
 647 /*
 648  * TODO: detect whether this was a clone, and if so then also delete the
 649  * snapshot it was based on, so that we don't hold the original
 650  * container busy.
 651  */
 652 static int zfs_destroy(struct bdev *orig)
 653 {
 654         pid_t pid;
 655         char output[MAXPATHLEN], *p;
 656
 657         if ((pid = fork()) < 0)
 658                 return -1;
 659         if (pid)
 660                 return wait_for_pid(pid);
 661
 662         if (!zfs_list_entry(orig->src, output, MAXPATHLEN)) {
 663                 ERROR("Error: zfs entry for %s not found", orig->src);
 664                 return -1;
 665         }
 666
 667         // zfs mount is output up to ' '
 668         if ((p = index(output, ' ')) == NULL)
 669                 return -1;
 670         *p = '\0';
 671
 672         execlp("zfs", "zfs", "destroy", output, NULL);
 673         exit(1);
 674 }
 675
 676 static int zfs_create(struct bdev *bdev, const char *dest, const char *n,
 677                         struct bdev_specs *specs)
 678 {
 679         const char *zfsroot;
 680         char option[MAXPATHLEN];
 681         int ret;
 682         pid_t pid;
 683
 684         if (!specs || !specs->u.zfs.zfsroot)
 685                 zfsroot = default_zfs_root();
 686         else
 687                 zfsroot = specs->u.zfs.zfsroot;
 688
 689         if (!(bdev->dest = strdup(dest))) {
 690                 ERROR("No mount target specified or out of memory");
 691                 return -1;
 692         }
 693         if (!(bdev->src = strdup(bdev->dest))) {
 694                 ERROR("out of memory");
 695                 return -1;
 696         }
 697
 698         ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s", bdev->dest);
 699         if (ret < 0  || ret >= MAXPATHLEN)
 700                 return -1;
 701         if ((pid = fork()) < 0)
 702                 return -1;
 703         if (pid)
 704                 return wait_for_pid(pid);
 705
 706         char dev[MAXPATHLEN];
 707         ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, n);
 708         if (ret < 0  || ret >= MAXPATHLEN)
 709                 exit(1);
 710         execlp("zfs", "zfs", "create", option, dev, NULL);
 711         exit(1);
 712 }
 713
 714 struct bdev_ops zfs_ops = {
 715         .detect = &zfs_detect,
 716         .mount = &zfs_mount,
 717         .umount = &zfs_umount,
 718         .clone_paths = &zfs_clonepaths,
 719         .destroy = &zfs_destroy,
 720         .create = &zfs_create,
 721 };
 722
 723 //
 724 // LVM ops
 725 //
 726
 727 /*
 728  * Look at /sys/dev/block/maj:min/dm/uuid.  If it contains the hardcoded LVM
 729  * prefix "LVM-", then this is an lvm2 LV
 730  */
 731 static int lvm_detect(const char *path)
 732 {
 733         char devp[MAXPATHLEN], buf[4];
 734         FILE *fout;
 735         int ret;
 736         struct stat statbuf;
 737
 738         if (strncmp(path, "lvm:", 4) == 0)
 739                 return 1; // take their word for it
 740
 741         ret = stat(path, &statbuf);
 742         if (ret != 0)
 743                 return 0;
 744         if (!S_ISBLK(statbuf.st_mode))
 745                 return 0;
 746
 747         ret = snprintf(devp, MAXPATHLEN, "/sys/dev/block/%d:%d/dm/uuid",
 748                         major(statbuf.st_rdev), minor(statbuf.st_rdev));
 749         if (ret < 0 || ret >= MAXPATHLEN) {
 750                 ERROR("lvm uuid pathname too long");
 751                 return 0;
 752         }
 753         fout = fopen(devp, "r");
 754         if (!fout)
 755                 return 0;
 756         ret = fread(buf, 1, 4, fout);
 757         fclose(fout);
 758         if (ret != 4 || strncmp(buf, "LVM-", 4) != 0)
 759                 return 0;
 760         return 1;
 761 }
 762
 763 static int lvm_mount(struct bdev *bdev)
 764 {
 765         if (strcmp(bdev->type, "lvm"))
 766                 return -22;
 767         if (!bdev->src || !bdev->dest)
 768                 return -22;
 769         /* if we might pass in data sometime, then we'll have to enrich
 770          * mount_unknow_fs */
 771         return mount_unknow_fs(bdev->src, bdev->dest, 0);
 772 }
 773
 774 static int lvm_umount(struct bdev *bdev)
 775 {
 776         if (strcmp(bdev->type, "lvm"))
 777                 return -22;
 778         if (!bdev->src || !bdev->dest)
 779                 return -22;
 780         return umount(bdev->dest);
 781 }
 782
 783 /*
 784  * path must be '/dev/$vg/$lv', $vg must be an existing VG, and $lv must
 785  * not yet exist.  This function will attempt to create /dev/$vg/$lv of
 786  * size $size.
 787  */
 788 static int do_lvm_create(const char *path, unsigned long size)
 789 {
 790         int ret, pid;
 791         char sz[24], *pathdup, *vg, *lv;
 792
 793         if ((pid = fork()) < 0) {
 794                 SYSERROR("failed fork");
 795                 return -1;
 796         }
 797         if (pid > 0)
 798                 return wait_for_pid(pid);
 799
 800         // lvcreate default size is in M, not bytes.
 801         ret = snprintf(sz, 24, "%lu", size/1000000);
 802         if (ret < 0 || ret >= 24)
 803                 exit(1);
 804
 805         pathdup = strdup(path);
 806         if (!pathdup)
 807                 exit(1);
 808         lv = strrchr(pathdup, '/');
 809         if (!lv) {
 810                 free(pathdup);
 811                 exit(1);
 812         }
 813         *lv = '\0';
 814         lv++;
 815         vg = strrchr(pathdup, '/');
 816         if (!vg)
 817                 exit(1);
 818         vg++;
 819         execlp("lvcreate", "lvcreate", "-L", sz, vg, "-n", lv, (char *)NULL);
 820         free(pathdup);
 821         exit(1);
 822 }
 823
 824 static int lvm_snapshot(const char *orig, const char *path, unsigned long size)
 825 {
 826         int ret, pid;
 827         char sz[24], *pathdup, *lv;
 828
 829         if ((pid = fork()) < 0) {
 830                 SYSERROR("failed fork");
 831                 return -1;
 832         }
 833         if (pid > 0)
 834                 return wait_for_pid(pid);
 835         // lvcreate default size is in M, not bytes.
 836         ret = snprintf(sz, 24, "%lu", size/1000000);
 837         if (ret < 0 || ret >= 24)
 838                 exit(1);
 839
 840         pathdup = strdup(path);
 841         if (!pathdup)
 842                 exit(1);
 843         lv = strrchr(pathdup, '/');
 844         if (!lv) {
 845                 free(pathdup);
 846                 exit(1);
 847         }
 848         *lv = '\0';
 849         lv++;
 850
 851         ret = execlp("lvcreate", "lvcreate", "-s", "-L", sz, "-n", lv, orig, (char *)NULL);
 852         free(pathdup);
 853         exit(1);
 854 }
 855
 856 // this will return 1 for physical disks, qemu-nbd, loop, etc
 857 // right now only lvm is a block device
 858 static int is_blktype(struct bdev *b)
 859 {
 860         if (strcmp(b->type, "lvm") == 0)
 861                 return 1;
 862         return 0;
 863 }
 864
 865 static int lvm_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
 866                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
 867                 unsigned long newsize)
 868 {
 869         char fstype[100];
 870         unsigned long size = newsize;
 871         int len, ret;
 872
 873         if (!orig->src || !orig->dest)
 874                 return -1;
 875
 876         if (strcmp(orig->type, "lvm")) {
 877                 const char *vg;
 878
 879                 if (snap) {
 880                         ERROR("LVM snapshot from %s backing store is not supported",
 881                                 orig->type);
 882                         return -1;
 883                 }
 884                 vg = default_lvm_vg();
 885                 len = strlen("/dev/") + strlen(vg) + strlen(cname) + 2;
 886                 if ((new->src = malloc(len)) == NULL)
 887                         return -1;
 888                 ret = snprintf(new->src, len, "/dev/%s/%s", vg, cname);
 889                 if (ret < 0 || ret >= len)
 890                         return -1;
 891         } else {
 892                 new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath);
 893                 if (!new->src)
 894                         return -1;
 895         }
 896
 897         if (orig->data) {
 898                 new->data = strdup(orig->data);
 899                 if (!new->data)
 900                         return -1;
 901         }
 902
 903         len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
 904         new->dest = malloc(len);
 905         if (!new->dest)
 906                 return -1;
 907         ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
 908         if (ret < 0 || ret >= len)
 909                 return -1;
 910         if (mkdir_p(new->dest, 0755) < 0)
 911                 return -1;
 912
 913         if (is_blktype(orig)) {
 914                 if (!newsize && blk_getsize(orig, &size) < 0) {
 915                         ERROR("Error getting size of %s", orig->src);
 916                         return -1;
 917                 }
 918                 if (detect_fs(orig, fstype, 100) < 0) {
 919                         INFO("could not find fstype for %s, using ext3", orig->src);
 920                         return -1;
 921                 }
 922         } else {
 923                 sprintf(fstype, "ext3");
 924                 if (!newsize)
 925                         size = 1000000000; // default to 1G
 926         }
 927
 928         if (snap) {
 929                 if (lvm_snapshot(orig->src, new->src, size) < 0) {
 930                         ERROR("could not create %s snapshot of %s", new->src, orig->src);
 931                         return -1;
 932                 }
 933         } else {
 934                 if (do_lvm_create(new->src, size) < 0) {
 935                         ERROR("Error creating new lvm blockdev");
 936                         return -1;
 937                 }
 938                 if (do_mkfs(new->src, fstype) < 0) {
 939                         ERROR("Error creating filesystem type %s on %s", fstype,
 940                                 new->src);
 941                         return -1;
 942                 }
 943         }
 944
 945         return 0;
 946 }
 947
 948 static int lvm_destroy(struct bdev *orig)
 949 {
 950         pid_t pid;
 951
 952         if ((pid = fork()) < 0)
 953                 return -1;
 954         if (!pid) {
 955                 execlp("lvremove", "lvremove", "-f", orig->src, NULL);
 956                 exit(1);
 957         }
 958         return wait_for_pid(pid);
 959 }
 960
 961 #define DEFAULT_FS_SIZE 1024000000
 962 #define DEFAULT_FSTYPE "ext3"
 963 static int lvm_create(struct bdev *bdev, const char *dest, const char *n,
 964                         struct bdev_specs *specs)
 965 {
 966         const char *vg, *fstype, *lv = n;
 967         unsigned long sz;
 968         int ret, len;
 969
 970         if (!specs)
 971                 return -1;
 972
 973         vg = specs->u.lvm.vg;
 974         if (!vg)
 975                 vg = default_lvm_vg();
 976
 977         /* /dev/$vg/$lv */
 978         if (specs->u.lvm.lv)
 979                 lv = specs->u.lvm.lv;
 980         len = strlen(vg) + strlen(lv) + 7;
 981         bdev->src = malloc(len);
 982         if (!bdev->src)
 983                 return -1;
 984
 985         ret = snprintf(bdev->src, len, "/dev/%s/%s", vg, lv);
 986         if (ret < 0 || ret >= len)
 987                 return -1;
 988
 989         // lvm.fssize is in bytes.
 990         sz = specs->u.lvm.fssize;
 991         if (!sz)
 992                 sz = DEFAULT_FS_SIZE;
 993
 994         INFO("Error creating new lvm blockdev %s size %lu", bdev->src, sz);
 995         if (do_lvm_create(bdev->src, sz) < 0) {
 996                 ERROR("Error creating new lvm blockdev %s size %lu", bdev->src, sz);
 997                 return -1;
 998         }
 999
1000         fstype = specs->u.lvm.fstype;
1001         if (!fstype)
1002                 fstype = DEFAULT_FSTYPE;
1003         if (do_mkfs(bdev->src, fstype) < 0) {
1004                 ERROR("Error creating filesystem type %s on %s", fstype,
1005                         bdev->src);
1006                 return -1;
1007         }
1008         if (!(bdev->dest = strdup(dest)))
1009                 return -1;
1010
1011         if (mkdir_p(bdev->dest, 0755) < 0) {
1012                 ERROR("Error creating %s\n", bdev->dest);
1013                 return -1;
1014         }
1015
1016         return 0;
1017 }
1018
1019 struct bdev_ops lvm_ops = {
1020         .detect = &lvm_detect,
1021         .mount = &lvm_mount,
1022         .umount = &lvm_umount,
1023         .clone_paths = &lvm_clonepaths,
1024         .destroy = &lvm_destroy,
1025         .create = &lvm_create,
1026 };
1027
1028 //
1029 // btrfs ops
1030 //
1031
/*
 * Argument layout for BTRFS_IOC_SPACE_INFO, declared locally.
 * NOTE(review): presumably mirrors the kernel's btrfs ioctl definitions;
 * keep field order and types in sync with them.
 */

/* One space-info record as it would appear in the trailing array. */
struct btrfs_ioctl_space_info {
        unsigned long long flags;
        unsigned long long total_bytes;
        unsigned long long used_bytes;
};

/*
 * Header followed by a variable number of records.  The trailing array is
 * a C99 flexible array member; it adds nothing to sizeof(), so the ioctl
 * number built from this type below is unchanged.
 */
struct btrfs_ioctl_space_args {
        unsigned long long space_slots;
        unsigned long long total_spaces;
        struct btrfs_ioctl_space_info spaces[];
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, unsigned long long)
#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
                                    struct btrfs_ioctl_space_args)
1048
1049 static bool is_btrfs_fs(const char *path)
1050 {
1051         int fd, ret;
1052         struct btrfs_ioctl_space_args sargs;
1053
1054         // make sure this is a btrfs filesystem
1055         fd = open(path, O_RDONLY);
1056         if (fd < 0)
1057                 return false;
1058         sargs.space_slots = 0;
1059         sargs.total_spaces = 0;
1060         ret = ioctl(fd, BTRFS_IOC_SPACE_INFO, &sargs);
1061         close(fd);
1062         if (ret < 0)
1063                 return false;
1064
1065         return true;
1066 }
1067
/*
 * Detect whether @path is a btrfs subvolume.
 *
 * Returns 1 when @path is on btrfs (see is_btrfs_fs()) and is a directory
 * whose inode number is 256, which this code treats as the mark of a
 * subvolume; returns 0 otherwise, including when stat() fails.
 */
static int btrfs_detect(const char *path)
{
        struct stat sb;

        if (!is_btrfs_fs(path))
                return 0;

        if (stat(path, &sb) < 0)
                return 0;

        if (sb.st_ino != 256 || !S_ISDIR(sb.st_mode))
                return 0;

        return 1;
}
1086
1087 static int btrfs_mount(struct bdev *bdev)
1088 {
1089         if (strcmp(bdev->type, "btrfs"))
1090                 return -22;
1091         if (!bdev->src || !bdev->dest)
1092                 return -22;
1093         return mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC, NULL);
1094 }
1095
1096 static int btrfs_umount(struct bdev *bdev)
1097 {
1098         if (strcmp(bdev->type, "btrfs"))
1099                 return -22;
1100         if (!bdev->src || !bdev->dest)
1101                 return -22;
1102         return umount(bdev->dest);
1103 }
1104
/*
 * Argument layouts and request numbers for the btrfs subvolume/snapshot
 * ioctls used below, declared locally.
 * NOTE(review): presumably these mirror the kernel's btrfs ioctl
 * definitions; field order, types and sizes must not be changed.
 */
#define BTRFS_SUBVOL_NAME_MAX 4039
#define BTRFS_PATH_NAME_MAX 4087

/* Argument of the v1 requests (SUBVOL_CREATE, SNAP_DESTROY). */
struct btrfs_ioctl_vol_args {
        signed long long fd;
        char name[BTRFS_PATH_NAME_MAX + 1];
};

#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)

/* Argument of the v2 requests (SUBVOL_CREATE_V2, SNAP_CREATE_V2). */
struct btrfs_ioctl_vol_args_v2 {
        signed long long fd;
        unsigned long long transid;
        unsigned long long flags;
        union {
                struct {
                        unsigned long long size;
                        /* stands in for a struct btrfs_qgroup_inherit pointer */
                        void *qgroup_inherit;
                };
                unsigned long long unused[4];
        };
        char name[BTRFS_SUBVOL_NAME_MAX + 1];
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
                                   struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
                                   struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
                                   struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
                                   struct btrfs_ioctl_vol_args)
1139
1140 static int btrfs_subvolume_create(const char *path)
1141 {
1142         int ret, fd = -1;
1143         struct btrfs_ioctl_vol_args  args;
1144         char *p, *newfull = strdup(path);
1145
1146         if (!newfull) {
1147                 ERROR("Error: out of memory");
1148                 return -1;
1149         }
1150
1151         p = strrchr(newfull, '/');
1152         if (!p) {
1153                 ERROR("bad path: %s", path);
1154                 return -1;
1155         }
1156         *p = '\0';
1157
1158         if ((fd = open(newfull, O_RDONLY)) < 0) {
1159                 ERROR("Error opening %s", newfull);
1160                 free(newfull);
1161                 return -1;
1162         }
1163
1164         memset(&args, 0, sizeof(args));
1165         strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1166         args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1167         ret = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, &args);
1168         INFO("btrfs: snapshot create ioctl returned %d", ret);
1169
1170         free(newfull);
1171         close(fd);
1172         return ret;
1173 }
1174
1175 static int btrfs_snapshot(const char *orig, const char *new)
1176 {
1177         int fd = -1, fddst = -1, ret = -1;
1178         struct btrfs_ioctl_vol_args_v2  args;
1179         char *newdir, *newname, *newfull = NULL;
1180
1181         newfull = strdup(new);
1182         if (!newfull) {
1183                 ERROR("Error: out of memory");
1184                 goto out;
1185         }
1186         // make sure the directory doesn't already exist
1187         if (rmdir(newfull) < 0 && errno != -ENOENT) {
1188                 SYSERROR("Error removing empty new rootfs");
1189                 goto out;
1190         }
1191         newname = basename(newfull);
1192         newdir = dirname(newfull);
1193         fd = open(orig, O_RDONLY);
1194         if (fd < 0) {
1195                 SYSERROR("Error opening original rootfs %s", orig);
1196                 goto out;
1197         }
1198         fddst = open(newdir, O_RDONLY);
1199         if (fddst < 0) {
1200                 SYSERROR("Error opening new container dir %s", newdir);
1201                 goto out;
1202         }
1203
1204         memset(&args, 0, sizeof(args));
1205         args.fd = fd;
1206         strncpy(args.name, newname, BTRFS_SUBVOL_NAME_MAX);
1207         args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1208         ret = ioctl(fddst, BTRFS_IOC_SNAP_CREATE_V2, &args);
1209         INFO("btrfs: snapshot create ioctl returned %d", ret);
1210
1211 out:
1212         if (fddst != -1)
1213                 close(fddst);
1214         if (fd != -1)
1215                 close(fd);
1216         if (newfull)
1217                 free(newfull);
1218         return ret;
1219 }
1220
1221 static int btrfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1222                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1223                 unsigned long newsize)
1224 {
1225         if (!orig->dest || !orig->src)
1226                 return -1;
1227
1228         if (strcmp(orig->type, "btrfs")) {
1229                 int len, ret;
1230                 if (snap) {
1231                         ERROR("btrfs snapshot from %s backing store is not supported",
1232                                 orig->type);
1233                         return -1;
1234                 }
1235                 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1236                 new->src = malloc(len);
1237                 if (!new->src)
1238                         return -1;
1239                 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
1240                 if (ret < 0 || ret >= len)
1241                         return -1;
1242         } else {
1243                 // in case rootfs is in custom path, reuse it
1244                 if ((new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath)) == NULL)
1245                         return -1;
1246
1247         }
1248
1249         if ((new->dest = strdup(new->src)) == NULL)
1250                 return -1;
1251
1252         if (orig->data && (new->data = strdup(orig->data)) == NULL)
1253                 return -1;
1254
1255         if (snap)
1256                 return btrfs_snapshot(orig->dest, new->dest);
1257
1258         if (rmdir(new->dest) < 0 && errno != -ENOENT) {
1259                 SYSERROR("removing %s\n", new->dest);
1260                 return -1;
1261         }
1262
1263         return btrfs_subvolume_create(new->dest);
1264 }
1265
1266 static int btrfs_destroy(struct bdev *orig)
1267 {
1268         int ret, fd = -1;
1269         struct btrfs_ioctl_vol_args  args;
1270         char *path = orig->src;
1271         char *p, *newfull = strdup(path);
1272
1273         if (!newfull) {
1274                 ERROR("Error: out of memory");
1275                 return -1;
1276         }
1277
1278         p = strrchr(newfull, '/');
1279         if (!p) {
1280                 ERROR("bad path: %s", path);
1281                 return -1;
1282         }
1283         *p = '\0';
1284
1285         if ((fd = open(newfull, O_RDONLY)) < 0) {
1286                 ERROR("Error opening %s", newfull);
1287                 free(newfull);
1288                 return -1;
1289         }
1290
1291         memset(&args, 0, sizeof(args));
1292         strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1293         args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1294         ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args);
1295         INFO("btrfs: snapshot create ioctl returned %d", ret);
1296
1297         free(newfull);
1298         close(fd);
1299         return ret;
1300 }
1301
1302 static int btrfs_create(struct bdev *bdev, const char *dest, const char *n,
1303                         struct bdev_specs *specs)
1304 {
1305         bdev->src = strdup(dest);
1306         bdev->dest = strdup(dest);
1307         if (!bdev->src || !bdev->dest)
1308                 return -1;
1309         return btrfs_subvolume_create(bdev->dest);
1310 }
1311
1312 struct bdev_ops btrfs_ops = {
1313         .detect = &btrfs_detect,
1314         .mount = &btrfs_mount,
1315         .umount = &btrfs_umount,
1316         .clone_paths = &btrfs_clonepaths,
1317         .destroy = &btrfs_destroy,
1318         .create = &btrfs_create,
1319 };
1320
1321 //
1322 // loopback dev ops
1323 //
/*
 * Detect a loop-backed rootfs specification.
 *
 * Returns 1 when @path starts with the "loop:" prefix, 0 otherwise.
 */
static int loop_detect(const char *path)
{
        return strncmp(path, "loop:", 5) == 0 ? 1 : 0;
}
1330
/*
 * Find an unused loop device under /dev and open it.
 *
 * Every /dev entry whose name starts with "loop" is opened read-write and
 * probed with LOOP_GET_STATUS64; an entry is taken as free only when that
 * ioctl fails with ENXIO.
 *
 * @retfd: receives the open descriptor of the chosen device
 * @namep: receives "/dev/<name>"; the caller must supply at least 100 bytes
 *
 * Returns 0 on success, -1 when /dev cannot be opened or no free device
 * is found.
 */
static int find_free_loopdev(int *retfd, char *namep)
{
        struct dirent *direntp;
        struct loop_info64 lo;
        DIR *dir;
        int fd = -1, ret;

        if (!(dir = opendir("/dev"))) {
                SYSERROR("Error opening /dev");
                return -1;
        }
        /*
         * readdir() replaces the deprecated readdir_r(); the stream is
         * private to this call.  NULL ends the scan on end-of-directory
         * and on error alike.
         */
        while ((direntp = readdir(dir)) != NULL) {
                if (strncmp(direntp->d_name, "loop", 4) != 0)
                        continue;
                if ((fd = openat(dirfd(dir), direntp->d_name, O_RDWR)) < 0)
                        continue;
                if (ioctl(fd, LOOP_GET_STATUS64, &lo) == 0 || errno != ENXIO) {
                        close(fd);
                        fd = -1;
                        continue;
                }
                // We can use this fd
                ret = snprintf(namep, 100, "/dev/%s", direntp->d_name);
                if (ret < 0 || ret >= 100) {
                        /* a truncated name would not refer to this device */
                        close(fd);
                        fd = -1;
                        continue;
                }
                break;
        }
        closedir(dir);
        if (fd == -1) {
                ERROR("No loop device found");
                return -1;
        }

        *retfd = fd;
        return 0;
}
1368
1369 static int loop_mount(struct bdev *bdev)
1370 {
1371         int lfd, ffd = -1, ret = -1;
1372         struct loop_info64 lo;
1373         char loname[100];
1374
1375         if (strcmp(bdev->type, "loop"))
1376                 return -22;
1377         if (!bdev->src || !bdev->dest)
1378                 return -22;
1379         if (find_free_loopdev(&lfd, loname) < 0)
1380                 return -22;
1381
1382         if ((ffd = open(bdev->src + 5, O_RDWR)) < 0) {
1383                 SYSERROR("Error opening backing file %s\n", bdev->src);
1384                 goto out;
1385         }
1386
1387         if (ioctl(lfd, LOOP_SET_FD, ffd) < 0) {
1388                 SYSERROR("Error attaching backing file to loop dev");
1389                 goto out;
1390         }
1391         memset(&lo, 0, sizeof(lo));
1392         lo.lo_flags = LO_FLAGS_AUTOCLEAR;
1393         if (ioctl(lfd, LOOP_SET_STATUS64, &lo) < 0) {
1394                 SYSERROR("Error setting autoclear on loop dev\n");
1395                 goto out;
1396         }
1397
1398         ret = mount_unknow_fs(loname, bdev->dest, 0);
1399         if (ret < 0)
1400                 ERROR("Error mounting %s\n", bdev->src);
1401         else
1402                 bdev->lofd = lfd;
1403
1404 out:
1405         if (ffd > -1)
1406                 close(ffd);
1407         if (ret < 0) {
1408                 close(lfd);
1409                 bdev->lofd = -1;
1410         }
1411         return ret;
1412 }
1413
1414 static int loop_umount(struct bdev *bdev)
1415 {
1416         int ret;
1417
1418         if (strcmp(bdev->type, "loop"))
1419                 return -22;
1420         if (!bdev->src || !bdev->dest)
1421                 return -22;
1422         ret = umount(bdev->dest);
1423         if (bdev->lofd >= 0) {
1424                 close(bdev->lofd);
1425                 bdev->lofd = -1;
1426         }
1427         return ret;
1428 }
1429
/*
 * Create the backing file for a loop device and put a filesystem in it.
 *
 * @path:   file to create (mode 0600)
 * @size:   offset at which a single byte is written, which extends the
 *          file to @size + 1 bytes
 * @fstype: filesystem type handed to do_mkfs()
 *
 * Returns 0 on success, -1 on failure.  A file that was already created
 * is not removed on failure.
 */
static int do_loop_create(const char *path, unsigned long size, const char *fstype)
{
        int imgfd = creat(path, S_IRUSR|S_IWUSR);

        if (imgfd < 0)
                return -1;

        /* set the length by seeking to the end and writing one byte */
        if (lseek(imgfd, size, SEEK_SET) < 0) {
                SYSERROR("Error seeking to set new loop file size");
                goto fail_close;
        }
        if (write(imgfd, "1", 1) != 1) {
                SYSERROR("Error creating new loop file");
                goto fail_close;
        }
        if (close(imgfd) < 0) {
                SYSERROR("Error closing new loop file");
                return -1;
        }

        /* the filesystem is made directly in the regular file */
        if (do_mkfs(path, fstype) < 0) {
                ERROR("Error creating filesystem type %s on %s", fstype,
                        path);
                return -1;
        }

        return 0;

fail_close:
        close(imgfd);
        return -1;
}
1461
1462 /*
1463  * No idea what the original blockdev will be called, but the copy will be
1464  * called $lxcpath/$lxcname/rootdev
1465  */
1466 static int loop_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1467                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1468                 unsigned long newsize)
1469 {
1470         char fstype[100];
1471         unsigned long size = newsize;
1472         int len, ret;
1473         char *srcdev;
1474
1475         if (snap) {
1476                 ERROR("loop devices cannot be snapshotted.");
1477                 return -1;
1478         }
1479
1480         if (!orig->dest || !orig->src)
1481                 return -1;
1482
1483         len = strlen(lxcpath) + strlen(cname) + strlen("rootdev") + 3;
1484         srcdev = alloca(len);
1485         ret = snprintf(srcdev, len, "%s/%s/rootdev", lxcpath, cname);
1486         if (ret < 0 || ret >= len)
1487                 return -1;
1488
1489         new->src = malloc(len + 5);
1490         if (!new->src)
1491                 return -1;
1492         ret = snprintf(new->src, len + 5, "loop:%s", srcdev);
1493         if (ret < 0 || ret >= len + 5)
1494                 return -1;
1495
1496         new->dest = malloc(len);
1497         if (!new->dest)
1498                 return -1;
1499         ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
1500         if (ret < 0 || ret >= len)
1501                 return -1;
1502
1503         // it's tempting to say: if orig->src == loopback and !newsize, then
1504         // copy the loopback file.  However, we'd have to make sure to
1505         // correctly keep holes!  So punt for now.
1506
1507         if (is_blktype(orig)) {
1508                 if (!newsize && blk_getsize(orig, &size) < 0) {
1509                         ERROR("Error getting size of %s", orig->src);
1510                         return -1;
1511                 }
1512                 if (detect_fs(orig, fstype, 100) < 0) {
1513                         INFO("could not find fstype for %s, using %s", orig->src,
1514                                 DEFAULT_FSTYPE);
1515                         return -1;
1516                 }
1517         } else {
1518                 sprintf(fstype, "%s", DEFAULT_FSTYPE);
1519                 if (!newsize)
1520                         size = DEFAULT_FS_SIZE; // default to 1G
1521         }
1522         return do_loop_create(srcdev, size, fstype);
1523 }
1524
1525 static int loop_create(struct bdev *bdev, const char *dest, const char *n,
1526                         struct bdev_specs *specs)
1527 {
1528         const char *fstype;
1529         unsigned long sz;
1530         int ret, len;
1531         char *srcdev;
1532
1533         if (!specs)
1534                 return -1;
1535
1536         // dest is passed in as $lxcpath / $lxcname / rootfs
1537         // srcdev will be:      $lxcpath / $lxcname / rootdev
1538         // src will be 'loop:$srcdev'
1539         len = strlen(dest) + 2;
1540         srcdev = alloca(len);
1541
1542         ret = snprintf(srcdev, len, "%s", dest);
1543         if (ret < 0 || ret >= len)
1544                 return -1;
1545         sprintf(srcdev + len - 4, "dev");
1546
1547         bdev->src = malloc(len + 5);
1548         if (!bdev->src)
1549                 return -1;
1550         ret = snprintf(bdev->src, len + 5, "loop:%s", srcdev);
1551         if (ret < 0 || ret >= len + 5)
1552                 return -1;
1553
1554         sz = specs->u.loop.fssize;
1555         if (!sz)
1556                 sz = DEFAULT_FS_SIZE;
1557
1558         fstype = specs->u.loop.fstype;
1559         if (!fstype)
1560                 fstype = DEFAULT_FSTYPE;
1561
1562         if (!(bdev->dest = strdup(dest)))
1563                 return -1;
1564
1565         if (mkdir_p(bdev->dest, 0755) < 0) {
1566                 ERROR("Error creating %s\n", bdev->dest);
1567                 return -1;
1568         }
1569
1570         return do_loop_create(srcdev, sz, fstype);
1571 }
1572
1573 static int loop_destroy(struct bdev *orig)
1574 {
1575         return unlink(orig->src + 5);
1576 }
1577
1578 struct bdev_ops loop_ops = {
1579         .detect = &loop_detect,
1580         .mount = &loop_mount,
1581         .umount = &loop_umount,
1582         .clone_paths = &loop_clonepaths,
1583         .destroy = &loop_destroy,
1584         .create = &loop_create,
1585 };
1586
1587 //
1588 // overlayfs ops
1589 //
1590
/*
 * Detect an overlayfs rootfs specification.
 *
 * Only the "overlayfs:" prefix of @path is checked; the rest of the
 * string is taken on trust.  Returns 1 on a match, 0 otherwise.
 */
static int overlayfs_detect(const char *path)
{
        return strncmp(path, "overlayfs:", 10) == 0 ? 1 : 0;
}
1597
1598 //
1599 // XXXXXXX plain directory bind mount ops
1600 //
1601 static int overlayfs_mount(struct bdev *bdev)
1602 {
1603         char *options, *dup, *lower, *upper;
1604         int len;
1605         int ret;
1606
1607         if (strcmp(bdev->type, "overlayfs"))
1608                 return -22;
1609         if (!bdev->src || !bdev->dest)
1610                 return -22;
1611
1612         //  separately mount it first
1613         //  mount -t overlayfs -oupperdir=${upper},lowerdir=${lower} lower dest
1614         dup = alloca(strlen(bdev->src)+1);
1615         strcpy(dup, bdev->src);
1616         if (!(lower = index(dup, ':')))
1617                 return -22;
1618         if (!(upper = index(++lower, ':')))
1619                 return -22;
1620         *upper = '\0';
1621         upper++;
1622
1623         // TODO We should check whether bdev->src is a blockdev, and if so
1624         // but for now, only support overlays of a basic directory
1625
1626         len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=") + 1;
1627         options = alloca(len);
1628         ret = snprintf(options, len, "upperdir=%s,lowerdir=%s", upper, lower);
1629         if (ret < 0 || ret >= len)
1630                 return -1;
1631         ret = mount(lower, bdev->dest, "overlayfs", MS_MGC_VAL, options);
1632         if (ret < 0)
1633                 SYSERROR("overlayfs: error mounting %s onto %s options %s",
1634                         lower, bdev->dest, options);
1635         else
1636                 INFO("overlayfs: mounted %s onto %s options %s",
1637                         lower, bdev->dest, options);
1638         return ret;
1639 }
1640
1641 static int overlayfs_umount(struct bdev *bdev)
1642 {
1643         if (strcmp(bdev->type, "overlayfs"))
1644                 return -22;
1645         if (!bdev->src || !bdev->dest)
1646                 return -22;
1647         return umount(bdev->dest);
1648 }
1649
1650 static int overlayfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1651                 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1652                 unsigned long newsize)
1653 {
1654         if (!snap) {
1655                 ERROR("overlayfs is only for snapshot clones");
1656                 return -22;
1657         }
1658
1659         if (!orig->src || !orig->dest)
1660                 return -1;
1661
1662         new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
1663         if (!new->dest)
1664                 return -1;
1665         if (mkdir_p(new->dest, 0755) < 0)
1666                 return -1;
1667
1668         if (strcmp(orig->type, "dir") == 0) {
1669                 char *delta;
1670                 int ret, len;
1671
1672                 // if we have /var/lib/lxc/c2/rootfs, then delta will be
1673                 //            /var/lib/lxc/c2/delta0
1674                 delta = strdup(new->dest);
1675                 if (!delta) {
1676                         return -1;
1677                 }
1678                 if (strlen(delta) < 6) {
1679                         free(delta);
1680                         return -22;
1681                 }
1682                 strcpy(&delta[strlen(delta)-6], "delta0");
1683                 if ((ret = mkdir(delta, 0755)) < 0) {
1684                         SYSERROR("error: mkdir %s", delta);
1685                         free(delta);
1686                         return -1;
1687                 }
1688
1689                 // the src will be 'overlayfs:lowerdir:upperdir'
1690                 len = strlen(delta) + strlen(orig->src) + 12;
1691                 new->src = malloc(len);
1692                 if (!new->src) {
1693                         free(delta);
1694                         return -ENOMEM;
1695                 }
1696                 ret = snprintf(new->src, len, "overlayfs:%s:%s", orig->src, delta);
1697                 free(delta);
1698                 if (ret < 0 || ret >= len)
1699                         return -ENOMEM;
1700         } else if (strcmp(orig->type, "overlayfs") == 0) {
1701                 // What exactly do we want to do here?
1702                 // I think we want to use the original lowerdir, with a
1703                 // private delta which is originally rsynced from the
1704                 // original delta
1705                 char *osrc, *odelta, *nsrc, *ndelta;
1706                 int len, ret;
1707                 if (!(osrc = strdup(orig->src)))
1708                         return -22;
1709                 nsrc = index(osrc, ':') + 1;
1710                 if (nsrc != osrc + 10 || (odelta = index(nsrc, ':')) == NULL) {
1711                         free(osrc);
1712                         return -22;
1713                 }
1714                 *odelta = '\0';
1715                 odelta++;
1716                 ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath);
1717                 if (!ndelta) {
1718                         free(osrc);
1719                         return -ENOMEM;
1720                 }
1721                 if (do_rsync(odelta, ndelta) < 0) {
1722                         free(osrc);
1723                         free(ndelta);
1724                         ERROR("copying overlayfs delta");
1725                         return -1;
1726                 }
1727                 len = strlen(nsrc) + strlen(ndelta) + 12;
1728                 new->src = malloc(len);
1729                 if (!new->src) {
1730                         free(osrc);
1731                         free(ndelta);
1732                         return -ENOMEM;
1733                 }
1734                 ret = snprintf(new->src, len, "overlayfs:%s:%s", nsrc, ndelta);
1735                 free(osrc);
1736                 free(ndelta);
1737                 if (ret < 0 || ret >= len)
1738                         return -ENOMEM;
1739         } else {
1740                 ERROR("overlayfs clone of %s container is not yet supported",
1741                         orig->type);
1742                 // Note, supporting this will require overlayfs_mount supporting
1743                 // mounting of the underlay.  No big deal, just needs to be done.
1744                 return -1;
1745         }
1746
1747         return 0;
1748 }
1749
1750 int overlayfs_destroy(struct bdev *orig)
1751 {
1752         char *upper;
1753
1754         if (strncmp(orig->src, "overlayfs:", 10) != 0)
1755                 return -22;
1756         upper = index(orig->src + 10, ':');
1757         if (!upper)
1758                 return -22;
1759         upper++;
1760         return lxc_rmdir_onedev(upper);
1761 }
1762
1763 /*
1764  * to say 'lxc-create -t ubuntu -n o1 -B overlayfs' means you want
1765  * $lxcpath/$lxcname/rootfs to have the created container, while all
1766  * changes after starting the container are written to
1767  * $lxcpath/$lxcname/delta0
1768  */
1769 static int overlayfs_create(struct bdev *bdev, const char *dest, const char *n,
1770                         struct bdev_specs *specs)
1771 {
1772         char *delta;
1773         int ret, len = strlen(dest), newlen;
1774
1775         if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0)
1776                 return -1;
1777
1778         if (!(bdev->dest = strdup(dest))) {
1779                 ERROR("Out of memory");
1780                 return -1;
1781         }
1782
1783         delta = alloca(strlen(dest)+1);
1784         strcpy(delta, dest);
1785         strcpy(delta+len-6, "delta0");
1786
1787         if (mkdir_p(delta, 0755) < 0) {
1788                 ERROR("Error creating %s\n", delta);
1789                 return -1;
1790         }
1791
1792         /* overlayfs:lower:upper */
1793         newlen = (2 * len) + strlen("overlayfs:") + 2;
1794         bdev->src = malloc(newlen);
1795         if (!bdev->src) {
1796                 ERROR("Out of memory");
1797                 return -1;
1798         }
1799         ret = snprintf(bdev->src, newlen, "overlayfs:%s:%s", dest, delta);
1800         if (ret < 0 || ret >= newlen)
1801                 return -1;
1802
1803         if (mkdir_p(bdev->dest, 0755) < 0) {
1804                 ERROR("Error creating %s\n", bdev->dest);
1805                 return -1;
1806         }
1807
1808         return 0;
1809 }
1810
1811 struct bdev_ops overlayfs_ops = {
1812         .detect = &overlayfs_detect,
1813         .mount = &overlayfs_mount,
1814         .umount = &overlayfs_umount,
1815         .clone_paths = &overlayfs_clonepaths,
1816         .destroy = &overlayfs_destroy,
1817         .create = &overlayfs_create,
1818 };
1819
1820 struct bdev_type bdevs[] = {
1821         {.name = "zfs", .ops = &zfs_ops,},
1822         {.name = "lvm", .ops = &lvm_ops,},
1823         {.name = "btrfs", .ops = &btrfs_ops,},
1824         {.name = "dir", .ops = &dir_ops,},
1825         {.name = "overlayfs", .ops = &overlayfs_ops,},
1826         {.name = "loop", .ops = &loop_ops,},
1827 };
1828
1829 static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type);
1830
1831 void bdev_put(struct bdev *bdev)
1832 {
1833         if (bdev->data)
1834                 free(bdev->data);
1835         if (bdev->src)
1836                 free(bdev->src);
1837         if (bdev->dest)
1838                 free(bdev->dest);
1839         free(bdev);
1840 }
1841
1842 struct bdev *bdev_get(const char *type)
1843 {
1844         int i;
1845         struct bdev *bdev;
1846
1847         for (i=0; i<numbdevs; i++) {
1848                 if (strcmp(bdevs[i].name, type) == 0)
1849                         break;
1850         }
1851         if (i == numbdevs)
1852                 return NULL;
1853         bdev = malloc(sizeof(struct bdev));
1854         if (!bdev)
1855                 return NULL;
1856         memset(bdev, 0, sizeof(struct bdev));
1857         bdev->ops = bdevs[i].ops;
1858         bdev->type = bdevs[i].name;
1859         return bdev;
1860 }
1861
1862 struct bdev *bdev_init(const char *src, const char *dst, const char *data)
1863 {
1864         int i;
1865         struct bdev *bdev;
1866
1867         for (i=0; i<numbdevs; i++) {
1868                 int r;
1869                 r = bdevs[i].ops->detect(src);
1870                 if (r)
1871                         break;
1872         }
1873
1874         if (i == numbdevs)
1875                 return NULL;
1876         bdev = malloc(sizeof(struct bdev));
1877         if (!bdev)
1878                 return NULL;
1879         memset(bdev, 0, sizeof(struct bdev));
1880         bdev->ops = bdevs[i].ops;
1881         bdev->type = bdevs[i].name;
1882         if (data)
1883                 bdev->data = strdup(data);
1884         if (src)
1885                 bdev->src = strdup(src);
1886         if (dst)
1887                 bdev->dest = strdup(dst);
1888
1889         return bdev;
1890 }
1891
1892 /*
1893  * If we're not snaphotting, then bdev_copy becomes a simple case of mount
1894  * the original, mount the new, and rsync the contents.
1895  */
1896 struct bdev *bdev_copy(const char *src, const char *oldname, const char *cname,
1897                         const char *oldpath, const char *lxcpath, const char *bdevtype,
1898                         int snap, const char *bdevdata, unsigned long newsize,
1899                         int *needs_rdep)
1900 {
1901         struct bdev *orig, *new;
1902         pid_t pid;
1903
1904         /* if the container name doesn't show up in the rootfs path, then
1905          * we don't know how to come up with a new name
1906          */
1907         if (strstr(src, oldname) == NULL) {
1908                 ERROR("original rootfs path %s doesn't include container name %s",
1909                         src, oldname);
1910                 return NULL;
1911         }
1912
1913         orig = bdev_init(src, NULL, NULL);
1914         if (!orig) {
1915                 ERROR("failed to detect blockdev type for %s\n", src);
1916                 return NULL;
1917         }
1918
1919         if (!orig->dest) {
1920                 int ret;
1921                 orig->dest = malloc(MAXPATHLEN);
1922                 if (!orig->dest) {
1923                         ERROR("out of memory");
1924                         bdev_put(orig);
1925                         return NULL;
1926                 }
1927                 ret = snprintf(orig->dest, MAXPATHLEN, "%s/%s/rootfs", oldpath, oldname);
1928                 if (ret < 0 || ret >= MAXPATHLEN) {
1929                         ERROR("rootfs path too long");
1930                         bdev_put(orig);
1931                         return NULL;
1932                 }
1933         }
1934
1935         /*
1936          * If newtype is NULL and snapshot is set, then use overlayfs
1937          */
1938         if (!bdevtype && snap && strcmp(orig->type , "dir") == 0)
1939                 bdevtype = "overlayfs";
1940
1941         *needs_rdep = 0;
1942         if (strcmp(orig->type, "dir") == 0 &&
1943                         strcmp(bdevtype, "overlayfs") == 0)
1944                 *needs_rdep = 1;
1945
1946         new = bdev_get(bdevtype ? bdevtype : orig->type);
1947         if (!new) {
1948                 ERROR("no such block device type: %s", bdevtype ? bdevtype : orig->type);
1949                 bdev_put(orig);
1950                 return NULL;
1951         }
1952
1953         if (new->ops->clone_paths(orig, new, oldname, cname, oldpath, lxcpath, snap, newsize) < 0) {
1954                 ERROR("failed getting pathnames for cloned storage: %s\n", src);
1955                 bdev_put(orig);
1956                 bdev_put(new);
1957                 return NULL;
1958         }
1959
1960         pid = fork();
1961         if (pid < 0) {
1962                 SYSERROR("fork");
1963                 bdev_put(orig);
1964                 bdev_put(new);
1965                 return NULL;
1966         }
1967
1968         if (pid > 0) {
1969                 int ret = wait_for_pid(pid);
1970                 bdev_put(orig);
1971                 if (ret < 0) {
1972                         bdev_put(new);
1973                         return NULL;
1974                 }
1975                 return new;
1976         }
1977
1978         if (unshare(CLONE_NEWNS) < 0) {
1979                 SYSERROR("unshare CLONE_NEWNS");
1980                 exit(1);
1981         }
1982         if (snap)
1983                 exit(0);
1984
1985         // If not a snapshot, copy the fs.
1986         if (orig->ops->mount(orig) < 0) {
1987                 ERROR("failed mounting %s onto %s\n", src, orig->dest);
1988                 exit(1);
1989         }
1990         if (new->ops->mount(new) < 0) {
1991                 ERROR("failed mounting %s onto %s\n", new->src, new->dest);
1992                 exit(1);
1993         }
1994         if (do_rsync(orig->dest, new->dest) < 0) {
1995                 ERROR("rsyncing %s to %s\n", orig->src, new->src);
1996                 exit(1);
1997         }
1998         // don't bother umounting, ns exit will do that
1999
2000         exit(0);
2001 }
2002
2003 static struct bdev * do_bdev_create(const char *dest, const char *type,
2004                         const char *cname, struct bdev_specs *specs)
2005 {
2006         struct bdev *bdev = bdev_get(type);
2007         if (!bdev) {
2008                 return NULL;
2009         }
2010
2011         if (bdev->ops->create(bdev, dest, cname, specs) < 0) {
2012                  bdev_put(bdev);
2013                  return NULL;
2014         }
2015
2016         return bdev;
2017 }
2018
/*
 * bdev_create:
 * Create a backing store for a container.
 * If successful, return a struct bdev *, with the bdev mounted and ready
 * for use.  Before completing, the caller will need to call the
 * umount operation and bdev_put().
 * @dest: the mountpoint (i.e. /var/lib/lxc/$name/rootfs)
 * @type: the bdevtype (dir, btrfs, zfs, etc).  NULL means "dir"; "best"
 *        tries btrfs, zfs, lvm and dir in that order; a comma-separated
 *        list (e.g. "lvm,dir") tries each listed type in order.
 * @cname: the container name
 * @specs: details about the backing store to create, like fstype
 *
 * Returns NULL if no requested type could be created or on out of memory.
 */
struct bdev *bdev_create(const char *dest, const char *type,
                        const char *cname, struct bdev_specs *specs)
{
        static const char *const best_options[] = {"btrfs", "zfs", "lvm", "dir", NULL};
        struct bdev *bdev = NULL;
        char *list, *saveptr = NULL, *token;

        if (!type)
                return do_bdev_create(dest, "dir", cname, specs);

        if (strcmp(type, "best") == 0) {
                int i;
                // try for the best backing store type, according to our
                // opinionated preferences
                for (i = 0; best_options[i]; i++) {
                        bdev = do_bdev_create(dest, best_options[i], cname, specs);
                        if (bdev)
                                return bdev;
                }
                return NULL;  // 'dir' should never fail, so this shouldn't happen
        }

        /* a single type name */
        if (strchr(type, ',') == NULL)
                return do_bdev_create(dest, type, cname, specs);

        // -B lvm,dir
        /*
         * strtok_r modifies its input, so work on a private heap copy.
         * Once every token has failed there is nothing left to try: the
         * full comma-joined string can never name a backing store type.
         */
        list = strdup(type);
        if (!list)
                return NULL;
        for (token = strtok_r(list, ",", &saveptr); token;
                        token = strtok_r(NULL, ",", &saveptr)) {
                bdev = do_bdev_create(dest, token, cname, specs);
                if (bdev)
                        break;
        }
        free(list);

        return bdev;
}
2063
/*
 * overlayfs_getlower: cut a string off at its first ':' and return it.
 * @p: string to truncate IN PLACE; the first ':' (if any) is overwritten
 *     with a NUL terminator.  May be NULL.
 *
 * Returns @p itself (now ending before the first ':'), or NULL if @p is NULL.
 * A string without ':' is returned unchanged.
 */
char *overlayfs_getlower(char *p)
{
        char *sep;

        if (!p)
                return NULL;

        /* strchr is the standard replacement for the legacy index() */
        sep = strchr(p, ':');
        if (sep)
                *sep = '\0';
        return p;
}