]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/bdev.c
log: Drop trailing \n from log messages
[mirror_lxc.git] / src / lxc / bdev.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 /*
25 * this is all just a first shot for experiment. If we go this route, much
26 * should change. bdev should be a directory with per-bdev files. Things which
27 * I'm doing by calling out to userspace should sometimes be done through
28 * libraries like liblvm2
29 */
30 #define _GNU_SOURCE
31 #include <stdio.h>
32 #include <stdint.h>
33 #include <inttypes.h>
34 #include <sys/types.h>
35 #include <grp.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <sched.h>
39 #include <sys/mount.h>
40 #include <sys/wait.h>
41 #include <libgen.h>
42 #include <linux/loop.h>
43 #include <dirent.h>
44
45 #include "lxc.h"
46 #include "config.h"
47 #include "conf.h"
48 #include "bdev.h"
49 #include "log.h"
50 #include "error.h"
51 #include "utils.h"
52 #include "namespace.h"
53 #include "parse.h"
54 #include "lxclock.h"
55
56 #ifndef BLKGETSIZE64
57 #define BLKGETSIZE64 _IOR(0x12,114,size_t)
58 #endif
59
60 #ifndef LO_FLAGS_AUTOCLEAR
61 #define LO_FLAGS_AUTOCLEAR 4
62 #endif
63
64 #define DEFAULT_FS_SIZE 1073741824
65 #define DEFAULT_FSTYPE "ext3"
66
67 lxc_log_define(bdev, lxc);
68
/*
 * Copy the contents of @src into @dest by running "rsync -a" in a child
 * process. A trailing '/' is appended to @src before it is handed to rsync.
 *
 * Returns -1 if fork() fails, otherwise whatever wait_for_pid() reports for
 * the child.
 */
static int do_rsync(const char *src, const char *dest)
{
	pid_t child;
	char *srcdir;
	size_t srclen;

	child = fork();
	if (child < 0)
		return -1;
	if (child != 0)
		return wait_for_pid(child);

	/* child: build "<src>/" and exec rsync */
	srclen = strlen(src);
	srcdir = malloc(srclen + 2);
	if (!srcdir)
		exit(1);
	memcpy(srcdir, src, srclen);
	srcdir[srclen] = '/';
	srcdir[srclen + 1] = '\0';

	execlp("rsync", "rsync", "-a", srcdir, dest, (char *)NULL);
	/* only reached when the exec itself failed */
	exit(1);
}
93
94 /*
95 * return block size of dev->src in units of bytes
96 */
97 static int blk_getsize(struct bdev *bdev, uint64_t *size)
98 {
99 int fd, ret;
100 char *path = bdev->src;
101
102 if (strcmp(bdev->type, "loop") == 0)
103 path = bdev->src + 5;
104
105 fd = open(path, O_RDONLY);
106 if (fd < 0)
107 return -1;
108
109 ret = ioctl(fd, BLKGETSIZE64, size); // size of device in bytes
110 close(fd);
111 return ret;
112 }
113
/*
 * These are copied from conf.c. However as conf.c will be moved to using
 * the callback system, they can be pulled from there eventually, so we
 * don't need to pollute utils.c with these low level functions
 */

/*
 * Per-line callback used by mount_unknown_fs(): @buffer holds one line of a
 * filesystem list, @data points at the rootfs/target/options triple.
 *
 * Lines containing "nodev" are skipped. Otherwise the line is trimmed with
 * lxc_char_left_gc()/lxc_char_right_gc() and used as the fstype for a mount
 * attempt.
 *
 * Returns 1 once a mount succeeded, 0 in every other case.
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		const char *options;
	} *arg = data;

	unsigned long flags;
	char *optdata;
	char *name;
	int mounted;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev") != NULL)
		return 0;

	name = buffer + lxc_char_left_gc(buffer, strlen(buffer));
	name[lxc_char_right_gc(name, strlen(name))] = '\0';

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      arg->rootfs, arg->target, name);

	if (parse_mntopts(arg->options, &flags, &optdata) < 0) {
		free(optdata);
		return 0;
	}

	mounted = mount(arg->rootfs, arg->target, name, flags, optdata) == 0;
	if (!mounted)
		DEBUG("mount failed with error: %s", strerror(errno));

	free(optdata);
	if (!mounted)
		return 0;

	INFO("mounted '%s' on '%s', with fstype '%s'",
	     arg->rootfs, arg->target, name);

	return 1;
}
160
161 static int mount_unknown_fs(const char *rootfs, const char *target,
162 const char *options)
163 {
164 int i;
165
166 struct cbarg {
167 const char *rootfs;
168 const char *target;
169 const char *options;
170 } cbarg = {
171 .rootfs = rootfs,
172 .target = target,
173 .options = options,
174 };
175
176 /*
177 * find the filesystem type with brute force:
178 * first we check with /etc/filesystems, in case the modules
179 * are auto-loaded and fall back to the supported kernel fs
180 */
181 char *fsfile[] = {
182 "/etc/filesystems",
183 "/proc/filesystems",
184 };
185
186 for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
187
188 int ret;
189
190 if (access(fsfile[i], F_OK))
191 continue;
192
193 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
194 if (ret < 0) {
195 ERROR("failed to parse '%s'", fsfile[i]);
196 return -1;
197 }
198
199 if (ret)
200 return 0;
201 }
202
203 ERROR("failed to determine fs type for '%s'", rootfs);
204 return -1;
205 }
206
/*
 * Create a filesystem of type @fstype on @path by running "mkfs -t" in a
 * child process.
 *
 * Returns -1 if fork() fails, otherwise whatever wait_for_pid() reports for
 * the child.
 */
static int do_mkfs(const char *path, const char *fstype)
{
	pid_t pid;

	if ((pid = fork()) < 0) {
		ERROR("error forking");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	// If the file is not a block device, we don't want mkfs to ask
	// us about whether to proceed.
	close(0);
	close(1);
	close(2);
	// Re-populate fds 0, 1 and 2 in that order; give up rather than run
	// mkfs with a missing standard descriptor.
	if (open("/dev/zero", O_RDONLY) < 0 ||
	    open("/dev/null", O_RDWR) < 0 ||
	    open("/dev/null", O_RDWR) < 0)
		exit(1);

	// The variadic terminator must be a null *pointer*, hence the cast.
	execlp("mkfs", "mkfs", "-t", fstype, path, (char *)NULL);
	exit(1);
}
229
/*
 * Resolve @path one level if it is a symbolic link.
 *
 * @dest must provide room for at least MAXPATHLEN bytes. Returns @path
 * itself when it is not reported as a symlink, @dest (NUL-terminated link
 * target) when it is, and NULL on error.
 */
static char *linkderef(char *path, char *dest)
{
	struct stat st;
	ssize_t n;

	if (stat(path, &st) < 0)
		return NULL;

	/*
	 * NOTE(review): stat() follows symlinks, so st_mode describes the
	 * link target and this test looks like it can never see S_IFLNK.
	 * Confirm whether lstat() was intended before changing it; callers
	 * currently rely on getting @path back unchanged.
	 */
	if (!S_ISLNK(st.st_mode))
		return path;

	n = readlink(path, dest, MAXPATHLEN);
	if (n < 0) {
		SYSERROR("error reading link %s", path);
		return NULL;
	}
	if (n >= MAXPATHLEN) {
		ERROR("link in %s too long", path);
		return NULL;
	}

	dest[n] = '\0';
	return dest;
}
251
252 /*
253 * Given a bdev (presumably blockdev-based), detect the fstype
254 * by trying mounting (in a private mntns) it.
255 * @bdev: bdev to investigate
256 * @type: preallocated char* in which to write the fstype
257 * @len: length of passed in char*
258 * Returns length of fstype, of -1 on error
259 */
260 static int detect_fs(struct bdev *bdev, char *type, int len)
261 {
262 int p[2], ret;
263 size_t linelen;
264 pid_t pid;
265 FILE *f;
266 char *sp1, *sp2, *sp3, *line = NULL;
267 char *srcdev;
268
269 if (!bdev || !bdev->src || !bdev->dest)
270 return -1;
271
272 srcdev = bdev->src;
273 if (strcmp(bdev->type, "loop") == 0)
274 srcdev = bdev->src + 5;
275
276 ret = pipe(p);
277 if (ret < 0)
278 return -1;
279 if ((pid = fork()) < 0)
280 return -1;
281 if (pid > 0) {
282 int status;
283 close(p[1]);
284 memset(type, 0, len);
285 ret = read(p[0], type, len-1);
286 close(p[0]);
287 if (ret < 0) {
288 SYSERROR("error reading from pipe");
289 wait(&status);
290 return -1;
291 } else if (ret == 0) {
292 ERROR("child exited early - fstype not found");
293 wait(&status);
294 return -1;
295 }
296 wait(&status);
297 type[len-1] = '\0';
298 INFO("detected fstype %s for %s", type, srcdev);
299 return ret;
300 }
301
302 if (unshare(CLONE_NEWNS) < 0)
303 exit(1);
304
305 ret = mount_unknown_fs(srcdev, bdev->dest, bdev->mntopts);
306 if (ret < 0) {
307 ERROR("failed mounting %s onto %s to detect fstype", srcdev, bdev->dest);
308 exit(1);
309 }
310 // if symlink, get the real dev name
311 char devpath[MAXPATHLEN];
312 char *l = linkderef(srcdev, devpath);
313 if (!l)
314 exit(1);
315 f = fopen("/proc/self/mounts", "r");
316 if (!f)
317 exit(1);
318 while (getline(&line, &linelen, f) != -1) {
319 sp1 = index(line, ' ');
320 if (!sp1)
321 exit(1);
322 *sp1 = '\0';
323 if (strcmp(line, l))
324 continue;
325 sp2 = index(sp1+1, ' ');
326 if (!sp2)
327 exit(1);
328 *sp2 = '\0';
329 sp3 = index(sp2+1, ' ');
330 if (!sp3)
331 exit(1);
332 *sp3 = '\0';
333 sp2++;
334 if (write(p[1], sp2, strlen(sp2)) != strlen(sp2))
335 exit(1);
336 exit(0);
337 }
338 exit(1);
339 }
340
/*
 * Pairs a backing-store name with the operations table implementing it.
 * NOTE(review): presumably the name is what gets matched against
 * bdev->type ("dir", "zfs", "lvm", ...) -- the lookup table is not in view.
 */
struct bdev_type {
	const char *name;		/* backing-store name */
	const struct bdev_ops *ops;	/* callbacks for that backing store */
};
345
/* Return 1 if @path can be stat()ed and is a directory, 0 otherwise. */
static int is_dir(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return 0;

	return S_ISDIR(st.st_mode) ? 1 : 0;
}
354
/*
 * Decide whether @path names a plain-directory backing store: either it
 * carries an explicit "dir:" prefix or it is an existing directory.
 * Returns 1 if so, 0 otherwise.
 */
static int dir_detect(const char *path)
{
	/* an explicit prefix is trusted without looking at the filesystem */
	int explicit_dir = strncmp(path, "dir:", 4) == 0;

	return (explicit_dir || is_dir(path)) ? 1 : 0;
}
363
364 //
365 // XXXXXXX plain directory bind mount ops
366 //
367 static int dir_mount(struct bdev *bdev)
368 {
369 unsigned long mntflags;
370 char *mntdata;
371 int ret;
372
373 if (strcmp(bdev->type, "dir"))
374 return -22;
375 if (!bdev->src || !bdev->dest)
376 return -22;
377
378 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
379 free(mntdata);
380 return -22;
381 }
382
383 ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
384 free(mntdata);
385 return ret;
386 }
387
388 static int dir_umount(struct bdev *bdev)
389 {
390 if (strcmp(bdev->type, "dir"))
391 return -22;
392 if (!bdev->src || !bdev->dest)
393 return -22;
394 return umount(bdev->dest);
395 }
396
/* the bulk of this needs to become a common helper */
/*
 * Build the path a clone should use from the original's path @src.
 *
 * If @src starts with @oldpath, that prefix is replaced by @lxcpath. In the
 * remainder of @src every occurrence of @oldname is replaced by @name. An
 * empty @oldname means "nothing to substitute" (it used to loop forever).
 *
 * Returns a newly malloc()ed string the caller must free, or NULL when out
 * of memory. @src is not modified.
 */
static char *dir_new_path(char *src, const char *oldname, const char *name,
		const char *oldpath, const char *lxcpath)
{
	const size_t oldpath_len = strlen(oldpath);
	const size_t oldname_len = strlen(oldname);
	const size_t name_len = strlen(name);
	const size_t lxcpath_len = strlen(lxcpath);
	const int has_prefix = strncmp(src, oldpath, oldpath_len) == 0;
	/* oldname is only looked for after the replaced prefix */
	const char *rest = has_prefix ? src + oldpath_len : src;
	const char *scan, *hit;
	size_t hits = 0, total;
	char *ret, *out;

	if (oldname_len > 0) {
		scan = rest;
		while ((hit = strstr(scan, oldname)) != NULL) {
			hits++;
			scan = hit + oldname_len;
		}
	}

	/* each hit removes oldname_len bytes and adds name_len bytes */
	total = (has_prefix ? lxcpath_len : 0)
		+ (strlen(rest) - hits * oldname_len)
		+ hits * name_len
		+ 1;

	ret = malloc(total);
	if (!ret)
		return NULL;

	out = ret;
	if (has_prefix) {
		memcpy(out, lxcpath, lxcpath_len);
		out += lxcpath_len;
	}

	if (oldname_len > 0) {
		while ((hit = strstr(rest, oldname)) != NULL) {
			size_t chunk = (size_t)(hit - rest);

			memcpy(out, rest, chunk);	// copy text up to oldname
			out += chunk;
			memcpy(out, name, name_len);	// new name in place of oldname
			out += name_len;
			rest = hit + oldname_len;	// continue after oldname
		}
	}
	strcpy(out, rest);	// copy the rest of src, including the terminator
	return ret;
}
438
439 /*
440 * for a simple directory bind mount, we substitute the old container
441 * name and paths for the new
442 */
443 static int dir_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
444 const char *cname, const char *oldpath, const char *lxcpath, int snap,
445 uint64_t newsize)
446 {
447 int len, ret;
448
449 if (snap) {
450 ERROR("directories cannot be snapshotted. Try overlayfs.");
451 return -1;
452 }
453
454 if (!orig->dest || !orig->src)
455 return -1;
456
457 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
458 new->src = malloc(len);
459 if (!new->src)
460 return -1;
461 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
462 if (ret < 0 || ret >= len)
463 return -1;
464 if ((new->dest = strdup(new->src)) == NULL)
465 return -1;
466
467 return 0;
468 }
469
470 static int dir_destroy(struct bdev *orig)
471 {
472 if (lxc_rmdir_onedev(orig->src) < 0)
473 return -1;
474 return 0;
475 }
476
477 static int dir_create(struct bdev *bdev, const char *dest, const char *n,
478 struct bdev_specs *specs)
479 {
480 bdev->src = strdup(dest);
481 bdev->dest = strdup(dest);
482 if (!bdev->src || !bdev->dest) {
483 ERROR("Out of memory");
484 return -1;
485 }
486
487 if (mkdir_p(bdev->src, 0755) < 0) {
488 ERROR("Error creating %s", bdev->src);
489 return -1;
490 }
491 if (mkdir_p(bdev->dest, 0755) < 0) {
492 ERROR("Error creating %s", bdev->dest);
493 return -1;
494 }
495
496 return 0;
497 }
498
499 static const struct bdev_ops dir_ops = {
500 .detect = &dir_detect,
501 .mount = &dir_mount,
502 .umount = &dir_umount,
503 .clone_paths = &dir_clonepaths,
504 .destroy = &dir_destroy,
505 .create = &dir_create,
506 .can_snapshot = false,
507 };
508
509
510 //
511 // XXXXXXX zfs ops
512 // There are two ways we could do this. We could always specify the
513 // 'zfs device' (i.e. tank/lxc lxc/container) as rootfs. But instead
514 // (at least right now) we have lxc-create specify $lxcpath/$lxcname/rootfs
515 // as the mountpoint, so that it is always mounted.
516 //
517 // That means 'mount' is really never needed and could be noop, but for the
518 // sake of flexibility let's always bind-mount.
519 //
520
521 static int zfs_list_entry(const char *path, char *output, size_t inlen)
522 {
523 struct lxc_popen_FILE *f;
524 int found=0;
525
526 f = lxc_popen("zfs list 2> /dev/null");
527 if (f == NULL) {
528 SYSERROR("popen failed");
529 return 0;
530 }
531 while (fgets(output, inlen, f->f)) {
532 if (strstr(output, path)) {
533 found = 1;
534 break;
535 }
536 }
537 (void) lxc_pclose(f);
538
539 return found;
540 }
541
542 static int zfs_detect(const char *path)
543 {
544 char *output = malloc(LXC_LOG_BUFFER_SIZE);
545 int found;
546
547 if (!output) {
548 ERROR("out of memory");
549 return 0;
550 }
551 found = zfs_list_entry(path, output, LXC_LOG_BUFFER_SIZE);
552 free(output);
553 return found;
554 }
555
556 static int zfs_mount(struct bdev *bdev)
557 {
558 unsigned long mntflags;
559 char *mntdata;
560 int ret;
561
562 if (strcmp(bdev->type, "zfs"))
563 return -22;
564 if (!bdev->src || !bdev->dest)
565 return -22;
566
567 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
568 free(mntdata);
569 return -22;
570 }
571
572 ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
573 free(mntdata);
574 return ret;
575 }
576
577 static int zfs_umount(struct bdev *bdev)
578 {
579 if (strcmp(bdev->type, "zfs"))
580 return -22;
581 if (!bdev->src || !bdev->dest)
582 return -22;
583 return umount(bdev->dest);
584 }
585
/*
 * Create the zfs dataset for a cloned container.
 *
 * The zfs root is taken from the "zfs list" line that mentions @opath (the
 * first column with its last path component cut off); if no line matches,
 * the "lxc.bdev.zfs.root" global config value is used.
 *
 * Without @snapshot a fresh dataset <zfsroot>/<nname> is created. With
 * @snapshot the snapshot <zfsroot>/<oname>@<nname> is (re)created and then
 * cloned to <zfsroot>/<nname>. In both cases the mountpoint is set to
 * <lxcpath>/<nname>/rootfs. @npath is not used.
 *
 * Returns -1 on failure, otherwise the wait_for_pid() result of the last
 * zfs command run.
 */
static int zfs_clone(const char *opath, const char *npath, const char *oname,
			const char *nname, const char *lxcpath, int snapshot)
{
	// use the 'zfs list | grep opath' entry to get the zfsroot
	char output[MAXPATHLEN], option[MAXPATHLEN], *p;
	const char *zfsroot = output;
	int ret;
	pid_t pid;

	if (zfs_list_entry(opath, output, MAXPATHLEN)) {
		// zfsroot is output up to ' '
		if ((p = strchr(output, ' ')) == NULL)
			return -1;
		*p = '\0';
		if ((p = strrchr(output, '/')) == NULL)
			return -1;
		*p = '\0';
	} else {
		zfsroot = lxc_global_config_value("lxc.bdev.zfs.root");
		// never hand a NULL pointer to "%s" below
		if (!zfsroot)
			return -1;
	}

	ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s/%s/rootfs",
		lxcpath, nname);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	// zfs create -omountpoint=$lxcpath/$lxcname $zfsroot/$nname
	if (!snapshot) {
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			char dev[MAXPATHLEN];

			ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, nname);
			if (ret < 0 || ret >= MAXPATHLEN)
				exit(1);
			execlp("zfs", "zfs", "create", option, dev, (char *)NULL);
			exit(1);
		}
		return wait_for_pid(pid);
	} else {
		// if snapshot, do
		// 'zfs snapshot zfsroot/oname@nname
		// zfs clone zfsroot/oname@nname zfsroot/nname
		char path1[MAXPATHLEN], path2[MAXPATHLEN];

		ret = snprintf(path1, MAXPATHLEN, "%s/%s@%s", zfsroot,
				oname, nname);
		if (ret < 0 || ret >= MAXPATHLEN)
			return -1;
		// a truncated clone target would name the wrong dataset
		ret = snprintf(path2, MAXPATHLEN, "%s/%s", zfsroot, nname);
		if (ret < 0 || ret >= MAXPATHLEN)
			return -1;

		// if the snapshot exists, delete it
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "destroy", path1, (char *)NULL);
			exit(1);
		}
		// it probably doesn't exist so destroy probably will fail.
		(void) wait_for_pid(pid);

		// run first (snapshot) command
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "snapshot", path1, (char *)NULL);
			exit(1);
		}
		if (wait_for_pid(pid) < 0)
			return -1;

		// run second (clone) command
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "clone", option, path1, path2, (char *)NULL);
			exit(1);
		}
		return wait_for_pid(pid);
	}
}
667
668 static int zfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
669 const char *cname, const char *oldpath, const char *lxcpath, int snap,
670 uint64_t newsize)
671 {
672 int len, ret;
673
674 if (!orig->src || !orig->dest)
675 return -1;
676
677 if (snap && strcmp(orig->type, "zfs")) {
678 ERROR("zfs snapshot from %s backing store is not supported",
679 orig->type);
680 return -1;
681 }
682
683 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
684 new->src = malloc(len);
685 if (!new->src)
686 return -1;
687 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
688 if (ret < 0 || ret >= len)
689 return -1;
690 if ((new->dest = strdup(new->src)) == NULL)
691 return -1;
692
693 return zfs_clone(orig->src, new->src, oldname, cname, lxcpath, snap);
694 }
695
696 /*
697 * TODO: detect whether this was a clone, and if so then also delete the
698 * snapshot it was based on, so that we don't hold the original
699 * container busy.
700 */
701 static int zfs_destroy(struct bdev *orig)
702 {
703 pid_t pid;
704 char output[MAXPATHLEN], *p;
705
706 if ((pid = fork()) < 0)
707 return -1;
708 if (pid)
709 return wait_for_pid(pid);
710
711 if (!zfs_list_entry(orig->src, output, MAXPATHLEN)) {
712 ERROR("Error: zfs entry for %s not found", orig->src);
713 return -1;
714 }
715
716 // zfs mount is output up to ' '
717 if ((p = index(output, ' ')) == NULL)
718 return -1;
719 *p = '\0';
720
721 execlp("zfs", "zfs", "destroy", output, NULL);
722 exit(1);
723 }
724
725 static int zfs_create(struct bdev *bdev, const char *dest, const char *n,
726 struct bdev_specs *specs)
727 {
728 const char *zfsroot;
729 char option[MAXPATHLEN];
730 int ret;
731 pid_t pid;
732
733 if (!specs || !specs->zfs.zfsroot)
734 zfsroot = lxc_global_config_value("lxc.bdev.zfs.root");
735 else
736 zfsroot = specs->zfs.zfsroot;
737
738 if (!(bdev->dest = strdup(dest))) {
739 ERROR("No mount target specified or out of memory");
740 return -1;
741 }
742 if (!(bdev->src = strdup(bdev->dest))) {
743 ERROR("out of memory");
744 return -1;
745 }
746
747 ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s", bdev->dest);
748 if (ret < 0 || ret >= MAXPATHLEN)
749 return -1;
750 if ((pid = fork()) < 0)
751 return -1;
752 if (pid)
753 return wait_for_pid(pid);
754
755 char dev[MAXPATHLEN];
756 ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, n);
757 if (ret < 0 || ret >= MAXPATHLEN)
758 exit(1);
759 execlp("zfs", "zfs", "create", option, dev, NULL);
760 exit(1);
761 }
762
763 static const struct bdev_ops zfs_ops = {
764 .detect = &zfs_detect,
765 .mount = &zfs_mount,
766 .umount = &zfs_umount,
767 .clone_paths = &zfs_clonepaths,
768 .destroy = &zfs_destroy,
769 .create = &zfs_create,
770 .can_snapshot = true,
771 };
772
773 //
774 // LVM ops
775 //
776
/*
 * Look at /sys/dev/block/maj:min/dm/uuid. If it contains the hardcoded LVM
 * prefix "LVM-", then this is an lvm2 LV
 *
 * A path with an explicit "lvm:" prefix is accepted without any check.
 * Returns 1 for an LVM volume, 0 otherwise.
 */
static int lvm_detect(const char *path)
{
	char sysfs_path[MAXPATHLEN], magic[4];
	struct stat st;
	FILE *uuid_file;
	int n;

	if (strncmp(path, "lvm:", 4) == 0)
		return 1; // take their word for it

	/* only block devices can be logical volumes */
	if (stat(path, &st) != 0 || !S_ISBLK(st.st_mode))
		return 0;

	n = snprintf(sysfs_path, MAXPATHLEN, "/sys/dev/block/%d:%d/dm/uuid",
			major(st.st_rdev), minor(st.st_rdev));
	if (n < 0 || n >= MAXPATHLEN) {
		ERROR("lvm uuid pathname too long");
		return 0;
	}

	uuid_file = fopen(sysfs_path, "r");
	if (uuid_file == NULL)
		return 0;
	n = fread(magic, 1, 4, uuid_file);
	fclose(uuid_file);

	return (n == 4 && strncmp(magic, "LVM-", 4) == 0) ? 1 : 0;
}
812
813 static int lvm_mount(struct bdev *bdev)
814 {
815 if (strcmp(bdev->type, "lvm"))
816 return -22;
817 if (!bdev->src || !bdev->dest)
818 return -22;
819 /* if we might pass in data sometime, then we'll have to enrich
820 * mount_unknown_fs */
821 return mount_unknown_fs(bdev->src, bdev->dest, bdev->mntopts);
822 }
823
824 static int lvm_umount(struct bdev *bdev)
825 {
826 if (strcmp(bdev->type, "lvm"))
827 return -22;
828 if (!bdev->src || !bdev->dest)
829 return -22;
830 return umount(bdev->dest);
831 }
832
833 static int lvm_compare_lv_attr(const char *path, int pos, const char expected) {
834 struct lxc_popen_FILE *f;
835 int ret, len, status, start=0;
836 char *cmd, output[12];
837 const char *lvscmd = "lvs --unbuffered --noheadings -o lv_attr %s 2>/dev/null";
838
839 len = strlen(lvscmd) + strlen(path) - 1;
840 cmd = alloca(len);
841
842 ret = snprintf(cmd, len, lvscmd, path);
843 if (ret < 0 || ret >= len)
844 return -1;
845
846 f = lxc_popen(cmd);
847
848 if (f == NULL) {
849 SYSERROR("popen failed");
850 return -1;
851 }
852
853 ret = fgets(output, 12, f->f) == NULL;
854
855 status = lxc_pclose(f);
856
857 if (ret || WEXITSTATUS(status))
858 // Assume either vg or lvs do not exist, default
859 // comparison to false.
860 return 0;
861
862 len = strlen(output);
863 while(start < len && output[start] == ' ') start++;
864
865 if (start + pos < len && output[start + pos] == expected)
866 return 1;
867
868 return 0;
869 }
870
/*
 * Check whether the lv_attr string of @path has 't' at offset 6
 * (presumably the flag lvs uses for thin volumes -- confirm against lvs(8)).
 * Returns the result of lvm_compare_lv_attr(): 1, 0 or -1.
 */
static int lvm_is_thin_volume(const char *path)
{
	const int attr_offset = 6;
	const char wanted = 't';

	return lvm_compare_lv_attr(path, attr_offset, wanted);
}
875
/*
 * Check whether the lv_attr string of @path starts with 't'
 * (presumably the volume-type flag lvs uses for thin pools -- confirm
 * against lvs(8)).
 * Returns the result of lvm_compare_lv_attr(): 1, 0 or -1.
 */
static int lvm_is_thin_pool(const char *path)
{
	const int attr_offset = 0;
	const char wanted = 't';

	return lvm_compare_lv_attr(path, attr_offset, wanted);
}
880
/*
 * path must be '/dev/$vg/$lv', $vg must be an existing VG, and $lv must not
 * yet exist.  This function will attempt to create /dev/$vg/$lv of size
 * $size. If thinpool is specified, we'll check for its existence and if it's
 * a valid thin pool, and if so, we'll create the requested lv from that thin
 * pool.
 *
 * The work is done by "lvcreate" in a forked child. Returns -1 if fork()
 * fails, otherwise whatever wait_for_pid() reports for the child.
 */
static int do_lvm_create(const char *path, uint64_t size, const char *thinpool)
{
	char sizearg[24];
	char *devdir, *vgname, *lvname, *pool = NULL;
	int rc, child, poollen;

	child = fork();
	if (child < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (child > 0)
		return wait_for_pid(child);

	// specify bytes to lvcreate
	rc = snprintf(sizearg, 24, "%"PRIu64"b", size);
	if (rc < 0 || rc >= 24)
		exit(1);

	devdir = strdup(path);
	if (devdir == NULL)
		exit(1);

	/* cut "/dev/$vg/$lv" into "/dev/$vg" and "$lv" */
	lvname = strrchr(devdir, '/');
	if (lvname == NULL)
		exit(1);
	*lvname++ = '\0';

	/* the VG name is the last component of what is left */
	vgname = strrchr(devdir, '/');
	if (vgname == NULL)
		exit(1);
	vgname++;

	if (thinpool) {
		poollen = strlen(devdir) + strlen(thinpool) + 2;
		pool = alloca(poollen);

		rc = snprintf(pool, poollen, "%s/%s", devdir, thinpool);
		if (rc < 0 || rc >= poollen)
			exit(1);

		rc = lvm_is_thin_pool(pool);
		INFO("got %d for thin pool at path: %s", rc, pool);
		if (rc < 0)
			exit(1);

		/* not a thin pool: fall back to a regular LV */
		if (rc == 0)
			pool = NULL;
	}

	if (pool)
		execlp("lvcreate", "lvcreate", "--thinpool", pool, "-V", sizearg, vgname, "-n", lvname, (char *)NULL);
	else
		execlp("lvcreate", "lvcreate", "-L", sizearg, vgname, "-n", lvname, (char *)NULL);

	SYSERROR("execlp");
	exit(1);
}
946
/*
 * Create an LVM snapshot of @orig whose LV name is the last path component
 * of @path. @size (bytes) is passed to lvcreate unless @orig is a thin
 * volume, in which case no size is given.
 *
 * The work is done by "lvcreate -s" in a forked child; every failure in the
 * child ends with exit(1) so the child can never return into the caller's
 * code. Returns -1 if fork() fails, otherwise whatever wait_for_pid()
 * reports for the child.
 */
static int lvm_snapshot(const char *orig, const char *path, uint64_t size)
{
	int ret, pid;
	char sz[24], *pathdup, *lv;

	if ((pid = fork()) < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	/* child from here on: leave only via exec or exit */

	// specify bytes to lvcreate
	ret = snprintf(sz, 24, "%"PRIu64"b", size);
	if (ret < 0 || ret >= 24)
		exit(1);

	pathdup = strdup(path);
	if (!pathdup)
		exit(1);
	lv = strrchr(pathdup, '/');
	if (!lv) {
		free(pathdup);
		exit(1);
	}
	*lv = '\0';
	lv++;

	// check if the original lv is backed by a thin pool, in which case we
	// cannot specify a size that's different from the original size.
	ret = lvm_is_thin_volume(orig);
	if (ret == -1) {
		free(pathdup);
		exit(1);
	}

	if (!ret)
		execlp("lvcreate", "lvcreate", "-s", "-L", sz, "-n", lv, orig, (char *)NULL);
	else
		execlp("lvcreate", "lvcreate", "-s", "-n", lv, orig, (char *)NULL);

	/* only reached when the exec itself failed */
	free(pathdup);
	exit(1);
}
992
993 // this will return 1 for physical disks, qemu-nbd, loop, etc
994 // right now only lvm is a block device
995 static int is_blktype(struct bdev *b)
996 {
997 if (strcmp(b->type, "lvm") == 0)
998 return 1;
999 return 0;
1000 }
1001
1002 static int lvm_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1003 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1004 uint64_t newsize)
1005 {
1006 char fstype[100];
1007 uint64_t size = newsize;
1008 int len, ret;
1009
1010 if (!orig->src || !orig->dest)
1011 return -1;
1012
1013 if (strcmp(orig->type, "lvm")) {
1014 const char *vg;
1015
1016 if (snap) {
1017 ERROR("LVM snapshot from %s backing store is not supported",
1018 orig->type);
1019 return -1;
1020 }
1021 vg = lxc_global_config_value("lxc.bdev.lvm.vg");
1022 len = strlen("/dev/") + strlen(vg) + strlen(cname) + 2;
1023 if ((new->src = malloc(len)) == NULL)
1024 return -1;
1025 ret = snprintf(new->src, len, "/dev/%s/%s", vg, cname);
1026 if (ret < 0 || ret >= len)
1027 return -1;
1028 } else {
1029 new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath);
1030 if (!new->src)
1031 return -1;
1032 }
1033
1034 if (orig->mntopts) {
1035 new->mntopts = strdup(orig->mntopts);
1036 if (!new->mntopts)
1037 return -1;
1038 }
1039
1040 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1041 new->dest = malloc(len);
1042 if (!new->dest)
1043 return -1;
1044 ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
1045 if (ret < 0 || ret >= len)
1046 return -1;
1047 if (mkdir_p(new->dest, 0755) < 0)
1048 return -1;
1049
1050 if (is_blktype(orig)) {
1051 if (!newsize && blk_getsize(orig, &size) < 0) {
1052 ERROR("Error getting size of %s", orig->src);
1053 return -1;
1054 }
1055 if (detect_fs(orig, fstype, 100) < 0) {
1056 INFO("could not find fstype for %s, using ext3", orig->src);
1057 return -1;
1058 }
1059 } else {
1060 sprintf(fstype, "ext3");
1061 if (!newsize)
1062 size = DEFAULT_FS_SIZE;
1063 }
1064
1065 if (snap) {
1066 if (lvm_snapshot(orig->src, new->src, size) < 0) {
1067 ERROR("could not create %s snapshot of %s", new->src, orig->src);
1068 return -1;
1069 }
1070 } else {
1071 if (do_lvm_create(new->src, size, lxc_global_config_value("lxc.bdev.lvm.thin_pool")) < 0) {
1072 ERROR("Error creating new lvm blockdev");
1073 return -1;
1074 }
1075 if (do_mkfs(new->src, fstype) < 0) {
1076 ERROR("Error creating filesystem type %s on %s", fstype,
1077 new->src);
1078 return -1;
1079 }
1080 }
1081
1082 return 0;
1083 }
1084
1085 static int lvm_destroy(struct bdev *orig)
1086 {
1087 pid_t pid;
1088
1089 if ((pid = fork()) < 0)
1090 return -1;
1091 if (!pid) {
1092 execlp("lvremove", "lvremove", "-f", orig->src, NULL);
1093 exit(1);
1094 }
1095 return wait_for_pid(pid);
1096 }
1097
1098 static int lvm_create(struct bdev *bdev, const char *dest, const char *n,
1099 struct bdev_specs *specs)
1100 {
1101 const char *vg, *thinpool, *fstype, *lv = n;
1102 uint64_t sz;
1103 int ret, len;
1104
1105 if (!specs)
1106 return -1;
1107
1108 vg = specs->lvm.vg;
1109 if (!vg)
1110 vg = lxc_global_config_value("lxc.bdev.lvm.vg");
1111
1112 thinpool = specs->lvm.thinpool;
1113 if (!thinpool)
1114 thinpool = lxc_global_config_value("lxc.bdev.lvm.thin_pool");
1115
1116 /* /dev/$vg/$lv */
1117 if (specs->lvm.lv)
1118 lv = specs->lvm.lv;
1119
1120 len = strlen(vg) + strlen(lv) + 7;
1121 bdev->src = malloc(len);
1122 if (!bdev->src)
1123 return -1;
1124
1125 ret = snprintf(bdev->src, len, "/dev/%s/%s", vg, lv);
1126 if (ret < 0 || ret >= len)
1127 return -1;
1128
1129 // fssize is in bytes.
1130 sz = specs->fssize;
1131 if (!sz)
1132 sz = DEFAULT_FS_SIZE;
1133
1134 if (do_lvm_create(bdev->src, sz, thinpool) < 0) {
1135 ERROR("Error creating new lvm blockdev %s size %"PRIu64" bytes", bdev->src, sz);
1136 return -1;
1137 }
1138
1139 fstype = specs->fstype;
1140 if (!fstype)
1141 fstype = DEFAULT_FSTYPE;
1142 if (do_mkfs(bdev->src, fstype) < 0) {
1143 ERROR("Error creating filesystem type %s on %s", fstype,
1144 bdev->src);
1145 return -1;
1146 }
1147 if (!(bdev->dest = strdup(dest)))
1148 return -1;
1149
1150 if (mkdir_p(bdev->dest, 0755) < 0) {
1151 ERROR("Error creating %s", bdev->dest);
1152 return -1;
1153 }
1154
1155 return 0;
1156 }
1157
1158 static const struct bdev_ops lvm_ops = {
1159 .detect = &lvm_detect,
1160 .mount = &lvm_mount,
1161 .umount = &lvm_umount,
1162 .clone_paths = &lvm_clonepaths,
1163 .destroy = &lvm_destroy,
1164 .create = &lvm_create,
1165 .can_snapshot = true,
1166 };
1167
1168 //
1169 // btrfs ops
1170 //
1171
/*
 * Local copies of the structures and request numbers used with the btrfs
 * space-info ioctl.
 * NOTE(review): presumably these mirror the kernel's btrfs ioctl
 * definitions -- verify the layout against the kernel header before
 * changing anything here.
 */

/* Element type of btrfs_ioctl_space_args.spaces[]. */
struct btrfs_ioctl_space_info {
	unsigned long long flags;
	unsigned long long total_bytes;
	unsigned long long used_bytes;
};

/*
 * Argument of BTRFS_IOC_SPACE_INFO. is_btrfs_fs() passes it with both
 * counters set to 0 and only looks at whether the ioctl succeeds.
 */
struct btrfs_ioctl_space_args {
	unsigned long long space_slots;
	unsigned long long total_spaces;
	struct btrfs_ioctl_space_info spaces[0];	/* trailing variable-length array */
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, unsigned long long)
#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
				struct btrfs_ioctl_space_args)
1188
1189 static bool is_btrfs_fs(const char *path)
1190 {
1191 int fd, ret;
1192 struct btrfs_ioctl_space_args sargs;
1193
1194 // make sure this is a btrfs filesystem
1195 fd = open(path, O_RDONLY);
1196 if (fd < 0)
1197 return false;
1198 sargs.space_slots = 0;
1199 sargs.total_spaces = 0;
1200 ret = ioctl(fd, BTRFS_IOC_SPACE_INFO, &sargs);
1201 close(fd);
1202 if (ret < 0)
1203 return false;
1204
1205 return true;
1206 }
1207
/*
 * Decide whether @path is a btrfs subvolume: it must be on a btrfs
 * filesystem and be a directory with inode number 256.
 * Returns 1 if so, 0 otherwise.
 */
static int btrfs_detect(const char *path)
{
	struct stat st;

	if (!is_btrfs_fs(path))
		return 0;

	// and make sure it's a subvolume.
	if (stat(path, &st) < 0)
		return 0;

	return (st.st_ino == 256 && S_ISDIR(st.st_mode)) ? 1 : 0;
}
1226
1227 static int btrfs_mount(struct bdev *bdev)
1228 {
1229 unsigned long mntflags;
1230 char *mntdata;
1231 int ret;
1232
1233 if (strcmp(bdev->type, "btrfs"))
1234 return -22;
1235 if (!bdev->src || !bdev->dest)
1236 return -22;
1237
1238 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
1239 free(mntdata);
1240 return -22;
1241 }
1242
1243 ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
1244 free(mntdata);
1245 return ret;
1246 }
1247
1248 static int btrfs_umount(struct bdev *bdev)
1249 {
1250 if (strcmp(bdev->type, "btrfs"))
1251 return -22;
1252 if (!bdev->src || !bdev->dest)
1253 return -22;
1254 return umount(bdev->dest);
1255 }
1256
/*
 * Local copies of the argument structures and request numbers for the btrfs
 * subvolume/snapshot ioctls.
 * NOTE(review): presumably these mirror the kernel's btrfs ioctl
 * definitions -- verify the layout against the kernel header before
 * changing anything here.
 */

/* Longest name accepted in btrfs_ioctl_vol_args_v2.name (excluding the NUL). */
#define BTRFS_SUBVOL_NAME_MAX 4039
/* Longest name accepted in btrfs_ioctl_vol_args.name (excluding the NUL). */
#define BTRFS_PATH_NAME_MAX 4087

/* Argument of BTRFS_IOC_SUBVOL_CREATE and BTRFS_IOC_SNAP_DESTROY. */
struct btrfs_ioctl_vol_args {
	signed long long fd;
	char name[BTRFS_PATH_NAME_MAX + 1];
};

/* Same value as the earlier definition in this file (identical redefinition). */
#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
				   struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
				   struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
				   struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
				   struct btrfs_ioctl_vol_args)

#define BTRFS_QGROUP_INHERIT_SET_LIMITS	(1ULL << 0)

/* Argument of the *_V2 subvolume/snapshot ioctls. */
struct btrfs_ioctl_vol_args_v2 {
	signed long long fd;
	unsigned long long transid;
	unsigned long long flags;
	union {
		struct {
			unsigned long long size;
			//struct btrfs_qgroup_inherit *qgroup_inherit;
			/* declared as void * because the pointed-to type is not defined here */
			void *qgroup_inherit;
		};
		unsigned long long unused[4];
	};
	char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
1291
1292 static int btrfs_subvolume_create(const char *path)
1293 {
1294 int ret, fd = -1;
1295 struct btrfs_ioctl_vol_args args;
1296 char *p, *newfull = strdup(path);
1297
1298 if (!newfull) {
1299 ERROR("Error: out of memory");
1300 return -1;
1301 }
1302
1303 p = strrchr(newfull, '/');
1304 if (!p) {
1305 ERROR("bad path: %s", path);
1306 free(newfull);
1307 return -1;
1308 }
1309 *p = '\0';
1310
1311 fd = open(newfull, O_RDONLY);
1312 if (fd < 0) {
1313 ERROR("Error opening %s", newfull);
1314 free(newfull);
1315 return -1;
1316 }
1317
1318 memset(&args, 0, sizeof(args));
1319 strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1320 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1321 ret = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, &args);
1322 INFO("btrfs: snapshot create ioctl returned %d", ret);
1323
1324 free(newfull);
1325 close(fd);
1326 return ret;
1327 }
1328
1329 static int btrfs_snapshot(const char *orig, const char *new)
1330 {
1331 int fd = -1, fddst = -1, ret = -1;
1332 struct btrfs_ioctl_vol_args_v2 args;
1333 char *newdir, *newname, *newfull = NULL;
1334
1335 newfull = strdup(new);
1336 if (!newfull) {
1337 ERROR("Error: out of memory");
1338 goto out;
1339 }
1340 // make sure the directory doesn't already exist
1341 if (rmdir(newfull) < 0 && errno != -ENOENT) {
1342 SYSERROR("Error removing empty new rootfs");
1343 goto out;
1344 }
1345 newname = basename(newfull);
1346 newdir = dirname(newfull);
1347 fd = open(orig, O_RDONLY);
1348 if (fd < 0) {
1349 SYSERROR("Error opening original rootfs %s", orig);
1350 goto out;
1351 }
1352 fddst = open(newdir, O_RDONLY);
1353 if (fddst < 0) {
1354 SYSERROR("Error opening new container dir %s", newdir);
1355 goto out;
1356 }
1357
1358 memset(&args, 0, sizeof(args));
1359 args.fd = fd;
1360 strncpy(args.name, newname, BTRFS_SUBVOL_NAME_MAX);
1361 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1362 ret = ioctl(fddst, BTRFS_IOC_SNAP_CREATE_V2, &args);
1363 INFO("btrfs: snapshot create ioctl returned %d", ret);
1364
1365 out:
1366 if (fddst != -1)
1367 close(fddst);
1368 if (fd != -1)
1369 close(fd);
1370 if (newfull)
1371 free(newfull);
1372 return ret;
1373 }
1374
1375 static int btrfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1376 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1377 uint64_t newsize)
1378 {
1379 if (!orig->dest || !orig->src)
1380 return -1;
1381
1382 if (strcmp(orig->type, "btrfs")) {
1383 int len, ret;
1384 if (snap) {
1385 ERROR("btrfs snapshot from %s backing store is not supported",
1386 orig->type);
1387 return -1;
1388 }
1389 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1390 new->src = malloc(len);
1391 if (!new->src)
1392 return -1;
1393 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
1394 if (ret < 0 || ret >= len)
1395 return -1;
1396 } else {
1397 // in case rootfs is in custom path, reuse it
1398 if ((new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath)) == NULL)
1399 return -1;
1400
1401 }
1402
1403 if ((new->dest = strdup(new->src)) == NULL)
1404 return -1;
1405
1406 if (orig->mntopts && (new->mntopts = strdup(orig->mntopts)) == NULL)
1407 return -1;
1408
1409 if (snap)
1410 return btrfs_snapshot(orig->dest, new->dest);
1411
1412 if (rmdir(new->dest) < 0 && errno != -ENOENT) {
1413 SYSERROR("removing %s", new->dest);
1414 return -1;
1415 }
1416
1417 return btrfs_subvolume_create(new->dest);
1418 }
1419
1420 static int btrfs_destroy(struct bdev *orig)
1421 {
1422 int ret, fd = -1;
1423 struct btrfs_ioctl_vol_args args;
1424 char *path = orig->src;
1425 char *p, *newfull = strdup(path);
1426
1427 if (!newfull) {
1428 ERROR("Error: out of memory");
1429 return -1;
1430 }
1431
1432 p = strrchr(newfull, '/');
1433 if (!p) {
1434 ERROR("bad path: %s", path);
1435 free(newfull);
1436 return -1;
1437 }
1438 *p = '\0';
1439
1440 fd = open(newfull, O_RDONLY);
1441 if (fd < 0) {
1442 ERROR("Error opening %s", newfull);
1443 free(newfull);
1444 return -1;
1445 }
1446
1447 memset(&args, 0, sizeof(args));
1448 strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1449 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1450 ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args);
1451 INFO("btrfs: snapshot create ioctl returned %d", ret);
1452
1453 free(newfull);
1454 close(fd);
1455 return ret;
1456 }
1457
1458 static int btrfs_create(struct bdev *bdev, const char *dest, const char *n,
1459 struct bdev_specs *specs)
1460 {
1461 bdev->src = strdup(dest);
1462 bdev->dest = strdup(dest);
1463 if (!bdev->src || !bdev->dest)
1464 return -1;
1465 return btrfs_subvolume_create(bdev->dest);
1466 }
1467
1468 static const struct bdev_ops btrfs_ops = {
1469 .detect = &btrfs_detect,
1470 .mount = &btrfs_mount,
1471 .umount = &btrfs_umount,
1472 .clone_paths = &btrfs_clonepaths,
1473 .destroy = &btrfs_destroy,
1474 .create = &btrfs_create,
1475 .can_snapshot = true,
1476 };
1477
1478 //
1479 // loopback dev ops
1480 //
/*
 * loop_detect: a path belongs to the loop backend when it starts with
 * "loop:".  Returns 1 on a match and 0 otherwise.
 */
static int loop_detect(const char *path)
{
	return strncmp(path, "loop:", 5) == 0 ? 1 : 0;
}
1487
/*
 * find_free_loopdev: scan /dev for an unused loop device.
 *
 * Every entry whose name starts with "loop" is opened read-write.  A device
 * counts as free when LOOP_GET_STATUS64 fails with ENXIO.  On success the
 * open descriptor is stored in *@retfd and "/dev/<name>" is written to
 * @namep (at most 100 bytes).
 *
 * Returns 0 on success, -1 if /dev cannot be opened or no device qualifies.
 */
static int find_free_loopdev(int *retfd, char *namep)
{
	struct dirent entry, *cur;
	struct loop_info64 info;
	DIR *devdir;
	int candidate = -1;

	devdir = opendir("/dev");
	if (!devdir) {
		SYSERROR("Error opening /dev");
		return -1;
	}

	for (;;) {
		/* stop on a read error or at the end of the directory */
		if (readdir_r(devdir, &entry, &cur) != 0 || !cur)
			break;

		if (strncmp(cur->d_name, "loop", 4) != 0)
			continue;

		candidate = openat(dirfd(devdir), cur->d_name, O_RDWR);
		if (candidate < 0)
			continue;

		/* anything other than "fails with ENXIO" means: not usable */
		if (ioctl(candidate, LOOP_GET_STATUS64, &info) == 0 || errno != ENXIO) {
			close(candidate);
			candidate = -1;
			continue;
		}

		/* this descriptor is the one handed back to the caller */
		snprintf(namep, 100, "/dev/%s", cur->d_name);
		break;
	}
	closedir(devdir);

	if (candidate == -1) {
		ERROR("No loop device found");
		return -1;
	}

	*retfd = candidate;
	return 0;
}
1527
1528 static int loop_mount(struct bdev *bdev)
1529 {
1530 int lfd, ffd = -1, ret = -1;
1531 struct loop_info64 lo;
1532 char loname[100];
1533
1534 if (strcmp(bdev->type, "loop"))
1535 return -22;
1536 if (!bdev->src || !bdev->dest)
1537 return -22;
1538 if (find_free_loopdev(&lfd, loname) < 0)
1539 return -22;
1540
1541 ffd = open(bdev->src + 5, O_RDWR);
1542 if (ffd < 0) {
1543 SYSERROR("Error opening backing file %s", bdev->src);
1544 goto out;
1545 }
1546
1547 if (ioctl(lfd, LOOP_SET_FD, ffd) < 0) {
1548 SYSERROR("Error attaching backing file to loop dev");
1549 goto out;
1550 }
1551 memset(&lo, 0, sizeof(lo));
1552 lo.lo_flags = LO_FLAGS_AUTOCLEAR;
1553 if (ioctl(lfd, LOOP_SET_STATUS64, &lo) < 0) {
1554 SYSERROR("Error setting autoclear on loop dev");
1555 goto out;
1556 }
1557
1558 ret = mount_unknown_fs(loname, bdev->dest, bdev->mntopts);
1559 if (ret < 0)
1560 ERROR("Error mounting %s", bdev->src);
1561 else
1562 bdev->lofd = lfd;
1563
1564 out:
1565 if (ffd > -1)
1566 close(ffd);
1567 if (ret < 0) {
1568 close(lfd);
1569 bdev->lofd = -1;
1570 }
1571 return ret;
1572 }
1573
1574 static int loop_umount(struct bdev *bdev)
1575 {
1576 int ret;
1577
1578 if (strcmp(bdev->type, "loop"))
1579 return -22;
1580 if (!bdev->src || !bdev->dest)
1581 return -22;
1582 ret = umount(bdev->dest);
1583 if (bdev->lofd >= 0) {
1584 close(bdev->lofd);
1585 bdev->lofd = -1;
1586 }
1587 return ret;
1588 }
1589
/*
 * do_loop_create: create the backing file for a loop device and put a
 * filesystem on it.
 *
 * The file at @path is created with mode 0600, extended by seeking to offset
 * @size and writing a single byte, closed, and then formatted by do_mkfs()
 * as @fstype.
 *
 * Returns 0 on success, -1 on any failure.
 */
static int do_loop_create(const char *path, uint64_t size, const char *fstype)
{
	int imgfd = creat(path, S_IRUSR|S_IWUSR);

	if (imgfd < 0)
		return -1;

	/* seek to the requested size, then write one byte to extend the file */
	if (lseek(imgfd, size, SEEK_SET) < 0) {
		SYSERROR("Error seeking to set new loop file size");
		close(imgfd);
		return -1;
	}
	if (write(imgfd, "1", 1) != 1) {
		SYSERROR("Error creating new loop file");
		close(imgfd);
		return -1;
	}
	if (close(imgfd) < 0) {
		SYSERROR("Error closing new loop file");
		return -1;
	}

	/* finally create the filesystem inside the new file */
	if (do_mkfs(path, fstype) < 0) {
		ERROR("Error creating filesystem type %s on %s", fstype,
			path);
		return -1;
	}

	return 0;
}
1622
1623 /*
1624 * No idea what the original blockdev will be called, but the copy will be
1625 * called $lxcpath/$lxcname/rootdev
1626 */
1627 static int loop_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1628 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1629 uint64_t newsize)
1630 {
1631 char fstype[100];
1632 uint64_t size = newsize;
1633 int len, ret;
1634 char *srcdev;
1635
1636 if (snap) {
1637 ERROR("loop devices cannot be snapshotted.");
1638 return -1;
1639 }
1640
1641 if (!orig->dest || !orig->src)
1642 return -1;
1643
1644 len = strlen(lxcpath) + strlen(cname) + strlen("rootdev") + 3;
1645 srcdev = alloca(len);
1646 ret = snprintf(srcdev, len, "%s/%s/rootdev", lxcpath, cname);
1647 if (ret < 0 || ret >= len)
1648 return -1;
1649
1650 new->src = malloc(len + 5);
1651 if (!new->src)
1652 return -1;
1653 ret = snprintf(new->src, len + 5, "loop:%s", srcdev);
1654 if (ret < 0 || ret >= len + 5)
1655 return -1;
1656
1657 new->dest = malloc(len);
1658 if (!new->dest)
1659 return -1;
1660 ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
1661 if (ret < 0 || ret >= len)
1662 return -1;
1663
1664 // it's tempting to say: if orig->src == loopback and !newsize, then
1665 // copy the loopback file. However, we'd have to make sure to
1666 // correctly keep holes! So punt for now.
1667
1668 if (is_blktype(orig)) {
1669 if (!newsize && blk_getsize(orig, &size) < 0) {
1670 ERROR("Error getting size of %s", orig->src);
1671 return -1;
1672 }
1673 if (detect_fs(orig, fstype, 100) < 0) {
1674 INFO("could not find fstype for %s, using %s", orig->src,
1675 DEFAULT_FSTYPE);
1676 return -1;
1677 }
1678 } else {
1679 sprintf(fstype, "%s", DEFAULT_FSTYPE);
1680 if (!newsize)
1681 size = DEFAULT_FS_SIZE;
1682 }
1683 return do_loop_create(srcdev, size, fstype);
1684 }
1685
1686 static int loop_create(struct bdev *bdev, const char *dest, const char *n,
1687 struct bdev_specs *specs)
1688 {
1689 const char *fstype;
1690 uint64_t sz;
1691 int ret, len;
1692 char *srcdev;
1693
1694 if (!specs)
1695 return -1;
1696
1697 // dest is passed in as $lxcpath / $lxcname / rootfs
1698 // srcdev will be: $lxcpath / $lxcname / rootdev
1699 // src will be 'loop:$srcdev'
1700 len = strlen(dest) + 2;
1701 srcdev = alloca(len);
1702
1703 ret = snprintf(srcdev, len, "%s", dest);
1704 if (ret < 0 || ret >= len)
1705 return -1;
1706 sprintf(srcdev + len - 4, "dev");
1707
1708 bdev->src = malloc(len + 5);
1709 if (!bdev->src)
1710 return -1;
1711 ret = snprintf(bdev->src, len + 5, "loop:%s", srcdev);
1712 if (ret < 0 || ret >= len + 5)
1713 return -1;
1714
1715 sz = specs->fssize;
1716 if (!sz)
1717 sz = DEFAULT_FS_SIZE;
1718
1719 fstype = specs->fstype;
1720 if (!fstype)
1721 fstype = DEFAULT_FSTYPE;
1722
1723 if (!(bdev->dest = strdup(dest)))
1724 return -1;
1725
1726 if (mkdir_p(bdev->dest, 0755) < 0) {
1727 ERROR("Error creating %s", bdev->dest);
1728 return -1;
1729 }
1730
1731 return do_loop_create(srcdev, sz, fstype);
1732 }
1733
1734 static int loop_destroy(struct bdev *orig)
1735 {
1736 return unlink(orig->src + 5);
1737 }
1738
1739 static const struct bdev_ops loop_ops = {
1740 .detect = &loop_detect,
1741 .mount = &loop_mount,
1742 .umount = &loop_umount,
1743 .clone_paths = &loop_clonepaths,
1744 .destroy = &loop_destroy,
1745 .create = &loop_create,
1746 .can_snapshot = false,
1747 };
1748
1749 //
1750 // overlayfs ops
1751 //
1752
/*
 * overlayfs_detect: a source is overlayfs when it carries the "overlayfs:"
 * prefix.  The rest of the string is not inspected; the caller's word is
 * taken for it.  Returns 1 on a match and 0 otherwise.
 */
static int overlayfs_detect(const char *path)
{
	return strncmp(path, "overlayfs:", 10) == 0 ? 1 : 0;
}
1759
1760 //
1761 // XXXXXXX plain directory bind mount ops
1762 //
1763 static int overlayfs_mount(struct bdev *bdev)
1764 {
1765 char *options, *dup, *lower, *upper;
1766 int len;
1767 unsigned long mntflags;
1768 char *mntdata;
1769 int ret;
1770
1771 if (strcmp(bdev->type, "overlayfs"))
1772 return -22;
1773 if (!bdev->src || !bdev->dest)
1774 return -22;
1775
1776 // separately mount it first
1777 // mount -t overlayfs -oupperdir=${upper},lowerdir=${lower} lower dest
1778 dup = alloca(strlen(bdev->src)+1);
1779 strcpy(dup, bdev->src);
1780 if (!(lower = index(dup, ':')))
1781 return -22;
1782 if (!(upper = index(++lower, ':')))
1783 return -22;
1784 *upper = '\0';
1785 upper++;
1786
1787 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
1788 free(mntdata);
1789 return -22;
1790 }
1791
1792 // TODO We should check whether bdev->src is a blockdev, and if so
1793 // but for now, only support overlays of a basic directory
1794
1795 if (mntdata) {
1796 len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=,") + strlen(mntdata) + 1;
1797 options = alloca(len);
1798 ret = snprintf(options, len, "upperdir=%s,lowerdir=%s,%s", upper, lower, mntdata);
1799 }
1800 else {
1801 len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=") + 1;
1802 options = alloca(len);
1803 ret = snprintf(options, len, "upperdir=%s,lowerdir=%s", upper, lower);
1804 }
1805 if (ret < 0 || ret >= len) {
1806 free(mntdata);
1807 return -1;
1808 }
1809
1810 ret = mount(lower, bdev->dest, "overlayfs", MS_MGC_VAL | mntflags, options);
1811 if (ret < 0)
1812 SYSERROR("overlayfs: error mounting %s onto %s options %s",
1813 lower, bdev->dest, options);
1814 else
1815 INFO("overlayfs: mounted %s onto %s options %s",
1816 lower, bdev->dest, options);
1817 return ret;
1818 }
1819
1820 static int overlayfs_umount(struct bdev *bdev)
1821 {
1822 if (strcmp(bdev->type, "overlayfs"))
1823 return -22;
1824 if (!bdev->src || !bdev->dest)
1825 return -22;
1826 return umount(bdev->dest);
1827 }
1828
1829 static int overlayfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1830 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1831 uint64_t newsize)
1832 {
1833 if (!snap) {
1834 ERROR("overlayfs is only for snapshot clones");
1835 return -22;
1836 }
1837
1838 if (!orig->src || !orig->dest)
1839 return -1;
1840
1841 new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
1842 if (!new->dest)
1843 return -1;
1844 if (mkdir_p(new->dest, 0755) < 0)
1845 return -1;
1846
1847 if (strcmp(orig->type, "dir") == 0) {
1848 char *delta;
1849 int ret, len;
1850
1851 // if we have /var/lib/lxc/c2/rootfs, then delta will be
1852 // /var/lib/lxc/c2/delta0
1853 delta = strdup(new->dest);
1854 if (!delta) {
1855 return -1;
1856 }
1857 if (strlen(delta) < 6) {
1858 free(delta);
1859 return -22;
1860 }
1861 strcpy(&delta[strlen(delta)-6], "delta0");
1862 if ((ret = mkdir(delta, 0755)) < 0) {
1863 SYSERROR("error: mkdir %s", delta);
1864 free(delta);
1865 return -1;
1866 }
1867
1868 // the src will be 'overlayfs:lowerdir:upperdir'
1869 len = strlen(delta) + strlen(orig->src) + 12;
1870 new->src = malloc(len);
1871 if (!new->src) {
1872 free(delta);
1873 return -ENOMEM;
1874 }
1875 ret = snprintf(new->src, len, "overlayfs:%s:%s", orig->src, delta);
1876 free(delta);
1877 if (ret < 0 || ret >= len)
1878 return -ENOMEM;
1879 } else if (strcmp(orig->type, "overlayfs") == 0) {
1880 // What exactly do we want to do here?
1881 // I think we want to use the original lowerdir, with a
1882 // private delta which is originally rsynced from the
1883 // original delta
1884 char *osrc, *odelta, *nsrc, *ndelta;
1885 int len, ret;
1886 if (!(osrc = strdup(orig->src)))
1887 return -22;
1888 nsrc = index(osrc, ':') + 1;
1889 if (nsrc != osrc + 10 || (odelta = index(nsrc, ':')) == NULL) {
1890 free(osrc);
1891 return -22;
1892 }
1893 *odelta = '\0';
1894 odelta++;
1895 ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath);
1896 if (!ndelta) {
1897 free(osrc);
1898 return -ENOMEM;
1899 }
1900 if (do_rsync(odelta, ndelta) < 0) {
1901 free(osrc);
1902 free(ndelta);
1903 ERROR("copying overlayfs delta");
1904 return -1;
1905 }
1906 len = strlen(nsrc) + strlen(ndelta) + 12;
1907 new->src = malloc(len);
1908 if (!new->src) {
1909 free(osrc);
1910 free(ndelta);
1911 return -ENOMEM;
1912 }
1913 ret = snprintf(new->src, len, "overlayfs:%s:%s", nsrc, ndelta);
1914 free(osrc);
1915 free(ndelta);
1916 if (ret < 0 || ret >= len)
1917 return -ENOMEM;
1918 } else {
1919 ERROR("overlayfs clone of %s container is not yet supported",
1920 orig->type);
1921 // Note, supporting this will require overlayfs_mount supporting
1922 // mounting of the underlay. No big deal, just needs to be done.
1923 return -1;
1924 }
1925
1926 return 0;
1927 }
1928
1929 static int overlayfs_destroy(struct bdev *orig)
1930 {
1931 char *upper;
1932
1933 if (strncmp(orig->src, "overlayfs:", 10) != 0)
1934 return -22;
1935 upper = index(orig->src + 10, ':');
1936 if (!upper)
1937 return -22;
1938 upper++;
1939 return lxc_rmdir_onedev(upper);
1940 }
1941
1942 /*
1943 * to say 'lxc-create -t ubuntu -n o1 -B overlayfs' means you want
1944 * $lxcpath/$lxcname/rootfs to have the created container, while all
1945 * changes after starting the container are written to
1946 * $lxcpath/$lxcname/delta0
1947 */
1948 static int overlayfs_create(struct bdev *bdev, const char *dest, const char *n,
1949 struct bdev_specs *specs)
1950 {
1951 char *delta;
1952 int ret, len = strlen(dest), newlen;
1953
1954 if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0)
1955 return -1;
1956
1957 if (!(bdev->dest = strdup(dest))) {
1958 ERROR("Out of memory");
1959 return -1;
1960 }
1961
1962 delta = alloca(strlen(dest)+1);
1963 strcpy(delta, dest);
1964 strcpy(delta+len-6, "delta0");
1965
1966 if (mkdir_p(delta, 0755) < 0) {
1967 ERROR("Error creating %s", delta);
1968 return -1;
1969 }
1970
1971 /* overlayfs:lower:upper */
1972 newlen = (2 * len) + strlen("overlayfs:") + 2;
1973 bdev->src = malloc(newlen);
1974 if (!bdev->src) {
1975 ERROR("Out of memory");
1976 return -1;
1977 }
1978 ret = snprintf(bdev->src, newlen, "overlayfs:%s:%s", dest, delta);
1979 if (ret < 0 || ret >= newlen)
1980 return -1;
1981
1982 if (mkdir_p(bdev->dest, 0755) < 0) {
1983 ERROR("Error creating %s", bdev->dest);
1984 return -1;
1985 }
1986
1987 return 0;
1988 }
1989
1990 static const struct bdev_ops overlayfs_ops = {
1991 .detect = &overlayfs_detect,
1992 .mount = &overlayfs_mount,
1993 .umount = &overlayfs_umount,
1994 .clone_paths = &overlayfs_clonepaths,
1995 .destroy = &overlayfs_destroy,
1996 .create = &overlayfs_create,
1997 .can_snapshot = true,
1998 };
1999
2000 static const struct bdev_type bdevs[] = {
2001 {.name = "zfs", .ops = &zfs_ops,},
2002 {.name = "lvm", .ops = &lvm_ops,},
2003 {.name = "btrfs", .ops = &btrfs_ops,},
2004 {.name = "dir", .ops = &dir_ops,},
2005 {.name = "overlayfs", .ops = &overlayfs_ops,},
2006 {.name = "loop", .ops = &loop_ops,},
2007 };
2008
2009 static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type);
2010
2011 void bdev_put(struct bdev *bdev)
2012 {
2013 if (bdev->mntopts)
2014 free(bdev->mntopts);
2015 if (bdev->src)
2016 free(bdev->src);
2017 if (bdev->dest)
2018 free(bdev->dest);
2019 free(bdev);
2020 }
2021
2022 struct bdev *bdev_get(const char *type)
2023 {
2024 int i;
2025 struct bdev *bdev;
2026
2027 for (i=0; i<numbdevs; i++) {
2028 if (strcmp(bdevs[i].name, type) == 0)
2029 break;
2030 }
2031 if (i == numbdevs)
2032 return NULL;
2033 bdev = malloc(sizeof(struct bdev));
2034 if (!bdev)
2035 return NULL;
2036 memset(bdev, 0, sizeof(struct bdev));
2037 bdev->ops = bdevs[i].ops;
2038 bdev->type = bdevs[i].name;
2039 return bdev;
2040 }
2041
2042 struct bdev *bdev_init(const char *src, const char *dst, const char *mntopts)
2043 {
2044 int i;
2045 struct bdev *bdev;
2046
2047 for (i=0; i<numbdevs; i++) {
2048 int r;
2049 r = bdevs[i].ops->detect(src);
2050 if (r)
2051 break;
2052 }
2053
2054 if (i == numbdevs)
2055 return NULL;
2056 bdev = malloc(sizeof(struct bdev));
2057 if (!bdev)
2058 return NULL;
2059 memset(bdev, 0, sizeof(struct bdev));
2060 bdev->ops = bdevs[i].ops;
2061 bdev->type = bdevs[i].name;
2062 if (mntopts)
2063 bdev->mntopts = strdup(mntopts);
2064 if (src)
2065 bdev->src = strdup(src);
2066 if (dst)
2067 bdev->dest = strdup(dst);
2068
2069 return bdev;
2070 }
2071
/* the pair of bdevs handed to rsync_rootfs() */
struct rsync_data {
	struct bdev *orig;	/* store that is copied from */
	struct bdev *new;	/* store that is copied to */
};
2076
2077 static int rsync_rootfs(struct rsync_data *data)
2078 {
2079 struct bdev *orig = data->orig,
2080 *new = data->new;
2081
2082 if (unshare(CLONE_NEWNS) < 0) {
2083 SYSERROR("unshare CLONE_NEWNS");
2084 return -1;
2085 }
2086
2087 // If not a snapshot, copy the fs.
2088 if (orig->ops->mount(orig) < 0) {
2089 ERROR("failed mounting %s onto %s", orig->src, orig->dest);
2090 return -1;
2091 }
2092 if (new->ops->mount(new) < 0) {
2093 ERROR("failed mounting %s onto %s", new->src, new->dest);
2094 return -1;
2095 }
2096 if (setgid(0) < 0) {
2097 ERROR("Failed to setgid to 0");
2098 return -1;
2099 }
2100 if (setgroups(0, NULL) < 0)
2101 WARN("Failed to clear groups");
2102 if (setuid(0) < 0) {
2103 ERROR("Failed to setuid to 0");
2104 return -1;
2105 }
2106 if (do_rsync(orig->dest, new->dest) < 0) {
2107 ERROR("rsyncing %s to %s", orig->src, new->src);
2108 return -1;
2109 }
2110
2111 return 0;
2112 }
2113
/*
 * rsync_rootfs_wrapper: adapter giving rsync_rootfs() an
 * int (*)(void *) signature; @data must point to a struct rsync_data.
 */
static int rsync_rootfs_wrapper(void *data)
{
	return rsync_rootfs((struct rsync_data *)data);
}
2119 /*
2120 * If we're not snaphotting, then bdev_copy becomes a simple case of mount
2121 * the original, mount the new, and rsync the contents.
2122 */
2123 struct bdev *bdev_copy(struct lxc_container *c0, const char *cname,
2124 const char *lxcpath, const char *bdevtype,
2125 int flags, const char *bdevdata, uint64_t newsize,
2126 int *needs_rdep)
2127 {
2128 struct bdev *orig, *new;
2129 pid_t pid;
2130 int ret;
2131 bool snap = flags & LXC_CLONE_SNAPSHOT;
2132 bool maybe_snap = flags & LXC_CLONE_MAYBE_SNAPSHOT;
2133 bool keepbdevtype = flags & LXC_CLONE_KEEPBDEVTYPE;
2134 const char *src = c0->lxc_conf->rootfs.path;
2135 const char *oldname = c0->name;
2136 const char *oldpath = c0->config_path;
2137 struct rsync_data data;
2138
2139 /* if the container name doesn't show up in the rootfs path, then
2140 * we don't know how to come up with a new name
2141 */
2142 if (strstr(src, oldname) == NULL) {
2143 ERROR("original rootfs path %s doesn't include container name %s",
2144 src, oldname);
2145 return NULL;
2146 }
2147
2148 orig = bdev_init(src, NULL, NULL);
2149 if (!orig) {
2150 ERROR("failed to detect blockdev type for %s", src);
2151 return NULL;
2152 }
2153
2154 if (!orig->dest) {
2155 int ret;
2156 orig->dest = malloc(MAXPATHLEN);
2157 if (!orig->dest) {
2158 ERROR("out of memory");
2159 bdev_put(orig);
2160 return NULL;
2161 }
2162 ret = snprintf(orig->dest, MAXPATHLEN, "%s/%s/rootfs", oldpath, oldname);
2163 if (ret < 0 || ret >= MAXPATHLEN) {
2164 ERROR("rootfs path too long");
2165 bdev_put(orig);
2166 return NULL;
2167 }
2168 }
2169
2170 /* check for privilege */
2171 if (am_unpriv()) {
2172 if (snap && !maybe_snap) {
2173 ERROR("Unprivileged users cannot snapshot");
2174 bdev_put(orig);
2175 return NULL;
2176 }
2177 if (bdevtype && strcmp(bdevtype, "dir") != 0) {
2178 ERROR("Unprivileged users can only make dir copy-clones");
2179 bdev_put(orig);
2180 return NULL;
2181 }
2182 if (strcmp(orig->type, "dir") != 0) {
2183 ERROR("Unprivileged users can only make dir copy-clones");
2184 bdev_put(orig);
2185 return NULL;
2186 }
2187 }
2188
2189
2190 /*
2191 * special case for snapshot - if caller requested maybe_snapshot and
2192 * keepbdevtype and backing store is directory, then proceed with a copy
2193 * clone rather than returning error
2194 */
2195 if (maybe_snap && keepbdevtype && !bdevtype && !orig->ops->can_snapshot)
2196 snap = false;
2197
2198 /*
2199 * If newtype is NULL and snapshot is set, then use overlayfs
2200 */
2201 if (!bdevtype && !keepbdevtype && snap && strcmp(orig->type , "dir") == 0)
2202 bdevtype = "overlayfs";
2203
2204 *needs_rdep = 0;
2205 if (bdevtype && strcmp(orig->type, "dir") == 0 &&
2206 strcmp(bdevtype, "overlayfs") == 0)
2207 *needs_rdep = 1;
2208
2209 new = bdev_get(bdevtype ? bdevtype : orig->type);
2210 if (!new) {
2211 ERROR("no such block device type: %s", bdevtype ? bdevtype : orig->type);
2212 bdev_put(orig);
2213 return NULL;
2214 }
2215
2216 if (new->ops->clone_paths(orig, new, oldname, cname, oldpath, lxcpath, snap, newsize) < 0) {
2217 ERROR("failed getting pathnames for cloned storage: %s", src);
2218 bdev_put(orig);
2219 bdev_put(new);
2220 return NULL;
2221 }
2222 if (snap)
2223 return new;
2224
2225 pid = fork();
2226 if (pid < 0) {
2227 SYSERROR("fork");
2228 bdev_put(orig);
2229 bdev_put(new);
2230 return NULL;
2231 }
2232
2233 if (pid > 0) {
2234 int ret = wait_for_pid(pid);
2235 bdev_put(orig);
2236 if (ret < 0) {
2237 bdev_put(new);
2238 return NULL;
2239 }
2240 return new;
2241 }
2242
2243 data.orig = orig;
2244 data.new = new;
2245 if (am_unpriv())
2246 ret = userns_exec_1(c0->lxc_conf, rsync_rootfs_wrapper, &data);
2247 else
2248 ret = rsync_rootfs(&data);
2249
2250 exit(ret == 0 ? 0 : 1);
2251 }
2252
2253 static struct bdev * do_bdev_create(const char *dest, const char *type,
2254 const char *cname, struct bdev_specs *specs)
2255 {
2256 struct bdev *bdev = bdev_get(type);
2257 if (!bdev) {
2258 return NULL;
2259 }
2260
2261 if (bdev->ops->create(bdev, dest, cname, specs) < 0) {
2262 bdev_put(bdev);
2263 return NULL;
2264 }
2265
2266 return bdev;
2267 }
2268
/*
 * bdev_create:
 * Create a backing store for a container.
 * If successful, return a struct bdev *, with the bdev mounted and ready
 * for use.  Before completing, the caller will need to call the
 * umount operation and bdev_put().
 * @dest: the mountpoint (i.e. /var/lib/lxc/$name/rootfs)
 * @type: the bdevtype (dir, btrfs, zfs, etc).  NULL selects "dir", "best"
 *        tries btrfs, zfs, lvm and dir in that order, and a comma-separated
 *        list is tried from left to right.
 * @cname: the container name
 * @specs: details about the backing store to create, like fstype
 */
struct bdev *bdev_create(const char *dest, const char *type,
			const char *cname, struct bdev_specs *specs)
{
	struct bdev *bdev;
	char *best_options[] = {"btrfs", "zfs", "lvm", "dir", NULL};

	if (type == NULL)
		return do_bdev_create(dest, "dir", cname, specs);

	if (strcmp(type, "best") == 0) {
		char **candidate;

		// try for the best backing store type, according to our
		// opinionated preferences
		for (candidate = best_options; *candidate; candidate++) {
			bdev = do_bdev_create(dest, *candidate, cname, specs);
			if (bdev)
				return bdev;
		}
		return NULL;  // 'dir' should never fail, so this shouldn't happen
	}

	// -B lvm,dir
	if (index(type, ',') != NULL) {
		char *list = alloca(strlen(type)+1), *state, *item;

		strcpy(list, type);
		item = strtok_r(list, ",", &state);
		while (item) {
			bdev = do_bdev_create(dest, item, cname, specs);
			if (bdev)
				return bdev;
			item = strtok_r(NULL, ",", &state);
		}
	}

	return do_bdev_create(dest, type, cname, specs);
}
2313
/*
 * overlayfs_getlower: cut @p at its first ':' (if there is one) by writing
 * a NUL terminator over it, and return @p.  The string is modified in place.
 */
char *overlayfs_getlower(char *p)
{
	char *colon = index(p, ':');

	if (colon != NULL)
		*colon = '\0';

	return p;
}