]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/bdev.c
licensing: Add missing headers and FSF address
[mirror_lxc.git] / src / lxc / bdev.c
CommitLineData
9be53773
SH
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9be53773
SH
22 */
23
24/*
25 * this is all just a first shot for experiment. If we go this route, much
26 * should change. bdev should be a directory with per-bdev file. Things which
27 * I'm doing by calling out to userspace should sometimes be done through
28 * libraries like liblvm2
29 */
30#define _GNU_SOURCE
31#include <stdio.h>
32#include <unistd.h>
33#include <errno.h>
34#include <sched.h>
35#include <sys/mount.h>
36#include <sys/wait.h>
37#include <libgen.h>
eddaaafd
SH
38#include <linux/loop.h>
39#include <dirent.h>
9be53773
SH
40#include "lxc.h"
41#include "config.h"
42#include "conf.h"
43#include "bdev.h"
44#include "log.h"
45#include "error.h"
46#include "utils.h"
47#include "namespace.h"
48#include "parse.h"
6a44839f 49#include "utils.h"
9be53773 50
bff13ba2
SG
51#ifndef BLKGETSIZE64
52#define BLKGETSIZE64 _IOR(0x12,114,size_t)
53#endif
54
55#ifndef LO_FLAGS_AUTOCLEAR
56#define LO_FLAGS_AUTOCLEAR 4
57#endif
58
9be53773
SH
59lxc_log_define(bdev, lxc);
60
9be53773
SH
/*
 * Copy @src to @dest by running "rsync -a <src>/ <dest>" in a forked child.
 *
 * Returns -1 if fork() fails, otherwise whatever wait_for_pid() reports for
 * the child. The child exits with status 1 if it cannot allocate memory or
 * if rsync cannot be executed.
 */
static int do_rsync(const char *src, const char *dest)
{
	pid_t pid;
	char *s;
	size_t l;

	pid = fork();
	if (pid < 0)
		return -1;
	if (pid > 0)
		return wait_for_pid(pid);

	/* child: build "<src>/" (room for the extra '/' and the NUL) */
	l = strlen(src) + 2;
	s = malloc(l);
	if (!s)
		exit(1);
	strcpy(s, src);
	s[l-2] = '/';
	s[l-1] = '\0';

	execlp("rsync", "rsync", "-a", s, dest, (char *)NULL);
	/* only reached when exec failed */
	exit(1);
}
84
eddaaafd
SH
85/*
86 * return block size of dev->src
87 */
88static int blk_getsize(struct bdev *bdev, unsigned long *size)
9be53773
SH
89{
90 int fd, ret;
eddaaafd
SH
91 char *path = bdev->src;
92
93 if (strcmp(bdev->type, "loop") == 0)
94 path = bdev->src + 5;
9be53773
SH
95
96 fd = open(path, O_RDONLY);
42fb4b15 97 if (fd < 0)
9be53773
SH
98 return -1;
99 ret = ioctl(fd, BLKGETSIZE64, size);
100 close(fd);
101 return ret;
102}
103
104/*
105 * These are copied from conf.c. However as conf.c will be moved to using
106 * the callback system, they can be pulled from there eventually, so we
107 * don't need to pollute utils.c with these low level functions
108 */
/*
 * Per-line callback used with lxc_file_for_each_line(): treat @buffer as a
 * filesystem type name and try to mount cbarg->rootfs on cbarg->target with
 * it.
 *
 * @data points to a struct with the same layout as the 'cbarg' built in
 * mount_unknow_fs().
 *
 * Returns 1 once a mount succeeded, 0 when the line was skipped or the
 * mount failed (presumably letting the line iterator continue - verify
 * against lxc_file_for_each_line()).
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		int mntopt;
	} *cbarg = data;

	char *fstype;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev"))
		return 0;

	/* trim the line in place using the lxc_char_{left,right}_gc helpers */
	fstype = buffer;
	fstype += lxc_char_left_gc(fstype, strlen(fstype));
	fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      cbarg->rootfs, cbarg->target, fstype);

	if (mount(cbarg->rootfs, cbarg->target, fstype, cbarg->mntopt, NULL)) {
		DEBUG("mount failed with error: %s", strerror(errno));
		return 0;
	}

	INFO("mounted '%s' on '%s', with fstype '%s'",
	     cbarg->rootfs, cbarg->target, fstype);

	return 1;
}
140
141static int mount_unknow_fs(const char *rootfs, const char *target, int mntopt)
142{
143 int i;
144
145 struct cbarg {
146 const char *rootfs;
147 const char *target;
148 int mntopt;
149 } cbarg = {
150 .rootfs = rootfs,
151 .target = target,
152 .mntopt = mntopt,
153 };
154
155 /*
156 * find the filesystem type with brute force:
157 * first we check with /etc/filesystems, in case the modules
158 * are auto-loaded and fall back to the supported kernel fs
159 */
160 char *fsfile[] = {
161 "/etc/filesystems",
162 "/proc/filesystems",
163 };
164
165 for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
166
167 int ret;
168
169 if (access(fsfile[i], F_OK))
170 continue;
171
172 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
173 if (ret < 0) {
174 ERROR("failed to parse '%s'", fsfile[i]);
175 return -1;
176 }
177
178 if (ret)
179 return 0;
180 }
181
182 ERROR("failed to determine fs type for '%s'", rootfs);
183 return -1;
184}
185
/*
 * Create a filesystem of type @fstype on @path by running
 * "mkfs -t <fstype> <path>" in a forked child.
 *
 * Returns -1 if fork() fails, otherwise the result of wait_for_pid() for
 * the child.
 */
static int do_mkfs(const char *path, const char *fstype)
{
	pid_t pid;

	if ((pid = fork()) < 0) {
		ERROR("error forking");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	// If the file is not a block device, we don't want mkfs to ask
	// us about whether to proceed.
	close(0);
	close(1);
	close(2);
	/*
	 * open() hands out the lowest free descriptor, so these three calls
	 * re-populate fds 0, 1 and 2 in that order.
	 */
	open("/dev/zero", O_RDONLY);
	open("/dev/null", O_RDWR);
	open("/dev/null", O_RDWR);
	execlp("mkfs", "mkfs", "-t", fstype, path, (char *)NULL);
	/* only reached when exec failed */
	exit(1);
}
208
/*
 * If @path is a symbolic link, read its target into @dest and return @dest;
 * otherwise return @path itself.
 *
 * @dest must provide room for at least MAXPATHLEN bytes.
 *
 * Returns NULL if @path cannot be stat'ed, the link cannot be read, or the
 * link target does not fit into MAXPATHLEN - 1 characters.
 *
 * NOTE(review): stat() follows symbolic links, so S_ISLNK() on its result
 * cannot be true and this function currently always returns @path when the
 * stat succeeds. lstat() may have been intended - confirm what detect_fs()
 * expects before changing it, since readlink() can return a relative target.
 */
static char *linkderef(char *path, char *dest)
{
	struct stat sbuf;
	ssize_t ret;

	ret = stat(path, &sbuf);
	if (ret < 0)
		return NULL;
	if (!S_ISLNK(sbuf.st_mode))
		return path;
	ret = readlink(path, dest, MAXPATHLEN);
	if (ret < 0) {
		SYSERROR("error reading link %s", path);
		return NULL;
	} else if (ret >= MAXPATHLEN) {
		ERROR("link in %s too long", path);
		return NULL;
	}
	/* readlink() does not NUL-terminate */
	dest[ret] = '\0';
	return dest;
}
230
231/*
232 * Given a bdev (presumably blockdev-based), detect the fstype
233 * by trying mounting (in a private mntns) it.
234 * @bdev: bdev to investigate
235 * @type: preallocated char* in which to write the fstype
236 * @len: length of passed in char*
237 * Returns length of fstype, of -1 on error
238 */
239static int detect_fs(struct bdev *bdev, char *type, int len)
240{
241 int p[2], ret;
242 size_t linelen;
243 pid_t pid;
244 FILE *f;
245 char *sp1, *sp2, *sp3, *line = NULL;
5d9598d7 246 char *srcdev;
9be53773
SH
247
248 if (!bdev || !bdev->src || !bdev->dest)
249 return -1;
250
5d9598d7 251 srcdev = bdev->src;
eddaaafd
SH
252 if (strcmp(bdev->type, "loop") == 0)
253 srcdev = bdev->src + 5;
254
9be53773
SH
255 if (pipe(p) < 0)
256 return -1;
257 if ((pid = fork()) < 0)
258 return -1;
259 if (pid > 0) {
260 int status;
261 close(p[1]);
262 memset(type, 0, len);
263 ret = read(p[0], type, len-1);
264 close(p[0]);
265 if (ret < 0) {
266 SYSERROR("error reading from pipe");
267 wait(&status);
268 return -1;
269 } else if (ret == 0) {
270 ERROR("child exited early - fstype not found");
271 wait(&status);
272 return -1;
273 }
274 wait(&status);
275 type[len-1] = '\0';
eddaaafd 276 INFO("detected fstype %s for %s", type, srcdev);
9be53773
SH
277 return ret;
278 }
279
280 if (unshare(CLONE_NEWNS) < 0)
281 exit(1);
282
eddaaafd 283 ret = mount_unknow_fs(srcdev, bdev->dest, 0);
9be53773 284 if (ret < 0) {
eddaaafd 285 ERROR("failed mounting %s onto %s to detect fstype", srcdev, bdev->dest);
9be53773
SH
286 exit(1);
287 }
288 // if symlink, get the real dev name
289 char devpath[MAXPATHLEN];
eddaaafd 290 char *l = linkderef(srcdev, devpath);
9be53773
SH
291 if (!l)
292 exit(1);
293 f = fopen("/proc/self/mounts", "r");
294 if (!f)
295 exit(1);
296 while (getline(&line, &linelen, f) != -1) {
297 sp1 = index(line, ' ');
298 if (!sp1)
299 exit(1);
300 *sp1 = '\0';
301 if (strcmp(line, l))
302 continue;
303 sp2 = index(sp1+1, ' ');
304 if (!sp2)
305 exit(1);
306 *sp2 = '\0';
307 sp3 = index(sp2+1, ' ');
308 if (!sp3)
309 exit(1);
310 *sp3 = '\0';
311 sp2++;
312 if (write(p[1], sp2, strlen(sp2)) != strlen(sp2))
313 exit(1);
314 exit(0);
315 }
316 exit(1);
317}
318
/* Associates a backing-store type name with its operations table. */
struct bdev_type {
	char *name;		/* type name */
	struct bdev_ops *ops;	/* operations for this type */
};
323
/*
 * Returns 1 if @path can be stat'ed and is a directory (symbolic links are
 * followed by stat()), 0 otherwise.
 */
static int is_dir(const char *path)
{
	struct stat statbuf;
	int ret = stat(path, &statbuf);

	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
332
/*
 * Decide whether @path names a plain-directory backing store.
 *
 * Returns 1 if @path starts with "dir:" (not verified any further) or is an
 * existing directory, 0 otherwise.
 */
static int dir_detect(const char *path)
{
	if (strncmp(path, "dir:", 4) == 0)
		return 1; // take their word for it
	if (is_dir(path))
		return 1;
	return 0;
}
341
342//
343// XXXXXXX plain directory bind mount ops
344//
60bf62d4 345static int dir_mount(struct bdev *bdev)
9be53773
SH
346{
347 if (strcmp(bdev->type, "dir"))
348 return -22;
349 if (!bdev->src || !bdev->dest)
350 return -22;
351 return mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC, NULL);
352}
353
60bf62d4 354static int dir_umount(struct bdev *bdev)
9be53773
SH
355{
356 if (strcmp(bdev->type, "dir"))
357 return -22;
358 if (!bdev->src || !bdev->dest)
359 return -22;
360 return umount(bdev->dest);
361}
362
/*
 * Build the path a clone should use, derived from @src.
 *
 * If @src starts with @oldpath, that prefix is replaced with @lxcpath. In
 * the remainder every occurrence of @oldname is replaced with @name. An
 * empty @oldname means "replace nothing" (it would otherwise match at every
 * position and never terminate).
 *
 * Returns a newly malloc()ed string which the caller must free, or NULL if
 * the allocation fails. @src itself is not modified.
 *
 * the bulk of this needs to become a common helper
 */
static char *dir_new_path(char *src, const char *oldname, const char *name,
		const char *oldpath, const char *lxcpath)
{
	char *ret, *p, *p2;
	int l1, l2, nlen;

	/* first pass: compute the length of the result */
	nlen = strlen(src) + 1;
	l1 = strlen(oldpath);
	p = src;
	/* if src starts with oldpath, look for oldname only after
	 * that path */
	if (strncmp(src, oldpath, l1) == 0) {
		p += l1;
		nlen += (strlen(lxcpath) - l1);
	}
	l2 = strlen(oldname);
	if (l2 > 0) {
		while ((p = strstr(p, oldname)) != NULL) {
			p += l2;
			nlen += strlen(name) - l2;
		}
	}

	ret = malloc(nlen);
	if (!ret)
		return NULL;

	/* second pass: write the result */
	p = ret;
	if (strncmp(src, oldpath, l1) == 0) {
		p += sprintf(p, "%s", lxcpath);
		src += l1;
	}

	if (l2 > 0) {
		while ((p2 = strstr(src, oldname)) != NULL) {
			memcpy(p, src, p2-src); // copy text up to oldname
			p += p2-src; // move target pointer (p)
			p += sprintf(p, "%s", name); // print new name in place of oldname
			src = p2 + l2; // move src to end of oldname
		}
	}
	sprintf(p, "%s", src);  // copy the rest of src
	return ret;
}
404
405/*
406 * for a simple directory bind mount, we substitute the old container
407 * name and paths for the new
408 */
409static int dir_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
410 const char *cname, const char *oldpath, const char *lxcpath, int snap,
411 unsigned long newsize)
412{
ca52dcb5
SH
413 int len, ret;
414
9be53773
SH
415 if (snap) {
416 ERROR("directories cannot be snapshotted. Try overlayfs.");
417 return -1;
418 }
419
9be53773
SH
420 if (!orig->dest || !orig->src)
421 return -1;
9be53773 422
ca52dcb5
SH
423 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
424 new->src = malloc(len);
9be53773
SH
425 if (!new->src)
426 return -1;
ca52dcb5
SH
427 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
428 if (ret < 0 || ret >= len)
429 return -1;
430 if ((new->dest = strdup(new->src)) == NULL)
431 return -1;
9be53773
SH
432
433 return 0;
434}
435
60bf62d4
SH
436static int dir_destroy(struct bdev *orig)
437{
438 if (!lxc_rmdir_onedev(orig->src))
439 return -1;
440 return 0;
441}
442
1897e3bc
SH
443static int dir_create(struct bdev *bdev, const char *dest, const char *n,
444 struct bdev_specs *specs)
445{
446 bdev->src = strdup(dest);
447 bdev->dest = strdup(dest);
448 if (!bdev->src || !bdev->dest) {
449 ERROR("Out of memory");
450 return -1;
451 }
452
453 if (mkdir_p(bdev->src, 0755) < 0) {
454 ERROR("Error creating %s\n", bdev->src);
455 return -1;
456 }
457 if (mkdir_p(bdev->dest, 0755) < 0) {
458 ERROR("Error creating %s\n", bdev->dest);
459 return -1;
460 }
461
462 return 0;
463}
464
9be53773
SH
465struct bdev_ops dir_ops = {
466 .detect = &dir_detect,
467 .mount = &dir_mount,
468 .umount = &dir_umount,
469 .clone_paths = &dir_clonepaths,
60bf62d4 470 .destroy = &dir_destroy,
1897e3bc 471 .create = &dir_create,
9be53773
SH
472};
473
3baa76fe
SH
474
475//
476// XXXXXXX zfs ops
477// There are two ways we could do this. We could always specify the
478// 'zfs device' (i.e. tank/lxc lxc/container) as rootfs. But instead
479// (at least right now) we have lxc-create specify $lxcpath/$lxcname/rootfs
480// as the mountpoint, so that it is always mounted.
481//
482// That means 'mount' is really never needed and could be noop, but for the
483// sake of flexibility let's always bind-mount.
484//
485
/*
 * Search the output of "zfs list" for a line containing @path.
 *
 * @output: caller-provided buffer of @inlen bytes; on a match it holds the
 *          matching line, otherwise the last line read (if any).
 *
 * Returns 1 if a matching line was found, 0 if not or if popen() failed.
 */
static int zfs_list_entry(const char *path, char *output, size_t inlen)
{
	FILE *f;
	int found=0;

	if ((f = popen("zfs list 2> /dev/null", "r")) == NULL) {
		SYSERROR("popen failed");
		return 0;
	}
	while (fgets(output, inlen, f)) {
		/* plain substring match anywhere in the line */
		if (strstr(output, path)) {
			found = 1;
			break;
		}
	}
	(void) pclose(f);

	return found;
}
505
506static int zfs_detect(const char *path)
507{
508 char *output = malloc(LXC_LOG_BUFFER_SIZE);
509 int found;
510
511 if (!output) {
512 ERROR("out of memory");
513 return 0;
514 }
60bf62d4 515 found = zfs_list_entry(path, output, LXC_LOG_BUFFER_SIZE);
3baa76fe
SH
516 free(output);
517 return found;
518}
519
60bf62d4 520static int zfs_mount(struct bdev *bdev)
3baa76fe
SH
521{
522 if (strcmp(bdev->type, "zfs"))
523 return -22;
524 if (!bdev->src || !bdev->dest)
525 return -22;
526 return mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC, NULL);
527}
528
60bf62d4 529static int zfs_umount(struct bdev *bdev)
3baa76fe
SH
530{
531 if (strcmp(bdev->type, "zfs"))
532 return -22;
533 if (!bdev->src || !bdev->dest)
534 return -22;
535 return umount(bdev->dest);
536}
537
/*
 * Create the zfs dataset for a cloned container.
 *
 * The zfs root is taken from the "zfs list" line matching @opath (first
 * column, with its last path component removed); if there is no such line,
 * default_zfs_root() is used.
 *
 * Without @snapshot:  zfs create -omountpoint=... <zfsroot>/<nname>
 * With @snapshot:     zfs destroy  <zfsroot>/<oname>@<nname>   (best effort)
 *                     zfs snapshot <zfsroot>/<oname>@<nname>
 *                     zfs clone -omountpoint=... <snapshot> <zfsroot>/<nname>
 * The mountpoint is always "<lxcpath>/<nname>/rootfs".
 *
 * @npath is currently unused.
 *
 * Returns the wait_for_pid() result of the last command, or -1 on an
 * earlier failure (fork, over-long path, malformed "zfs list" line, failed
 * snapshot).
 */
static int zfs_clone(const char *opath, const char *npath, const char *oname,
			const char *nname, const char *lxcpath, int snapshot)
{
	// use the 'zfs list | grep opath' entry to get the zfsroot
	char output[MAXPATHLEN], option[MAXPATHLEN], *p;
	const char *zfsroot = output;
	int ret;
	pid_t pid;

	if (zfs_list_entry(opath, output, MAXPATHLEN)) {
		// zfsroot is output up to ' '
		if ((p = index(output, ' ')) == NULL)
			return -1;
		*p = '\0';
		/* drop the last component, leaving the parent dataset */
		if ((p = strrchr(output, '/')) == NULL)
			return -1;
		*p = '\0';
	} else
		zfsroot = default_zfs_root();

	ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s/%s/rootfs",
		lxcpath, nname);
	if (ret < 0  || ret >= MAXPATHLEN)
		return -1;

	// zfs create -omountpoint=$lxcpath/$lxcname $zfsroot/$nname
	if (!snapshot) {
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			char dev[MAXPATHLEN];
			ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, nname);
			if (ret < 0  || ret >= MAXPATHLEN)
				exit(1);
			execlp("zfs", "zfs", "create", option, dev, (char *)NULL);
			exit(1);
		}
		return wait_for_pid(pid);
	} else {
		// if snapshot, do
		// 'zfs snapshot zfsroot/oname@nname
		// zfs clone zfsroot/oname@nname zfsroot/nname
		char path1[MAXPATHLEN], path2[MAXPATHLEN];

		ret = snprintf(path1, MAXPATHLEN, "%s/%s@%s", zfsroot,
				oname, nname);
		if (ret < 0 || ret >= MAXPATHLEN)
			return -1;
		/* refuse to clone into a truncated dataset name */
		ret = snprintf(path2, MAXPATHLEN, "%s/%s", zfsroot, nname);
		if (ret < 0 || ret >= MAXPATHLEN)
			return -1;

		// if the snapshot exists, delete it
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "destroy", path1, (char *)NULL);
			exit(1);
		}
		// it probably doesn't exist so destroy probably will fail.
		(void) wait_for_pid(pid);

		// run first (snapshot) command
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "snapshot", path1, (char *)NULL);
			exit(1);
		}
		if (wait_for_pid(pid) < 0)
			return -1;

		// run second (clone) command
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "clone", option, path1, path2, (char *)NULL);
			exit(1);
		}
		return wait_for_pid(pid);
	}
}
618
619static int zfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
620 const char *cname, const char *oldpath, const char *lxcpath, int snap,
621 unsigned long newsize)
622{
ca52dcb5
SH
623 int len, ret;
624
3baa76fe
SH
625 if (!orig->src || !orig->dest)
626 return -1;
627
ca52dcb5
SH
628 if (snap && strcmp(orig->type, "zfs")) {
629 ERROR("zfs snapshot from %s backing store is not supported",
3baa76fe
SH
630 orig->type);
631 return -1;
632 }
633
ca52dcb5
SH
634 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
635 new->src = malloc(len);
3baa76fe
SH
636 if (!new->src)
637 return -1;
ca52dcb5
SH
638 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
639 if (ret < 0 || ret >= len)
640 return -1;
641 if ((new->dest = strdup(new->src)) == NULL)
642 return -1;
3baa76fe
SH
643
644 return zfs_clone(orig->src, new->src, oldname, cname, lxcpath, snap);
645}
646
60bf62d4
SH
647/*
648 * TODO: detect whether this was a clone, and if so then also delete the
649 * snapshot it was based on, so that we don't hold the original
650 * container busy.
651 */
652static int zfs_destroy(struct bdev *orig)
653{
654 pid_t pid;
655 char output[MAXPATHLEN], *p;
656
657 if ((pid = fork()) < 0)
658 return -1;
659 if (pid)
660 return wait_for_pid(pid);
661
662 if (!zfs_list_entry(orig->src, output, MAXPATHLEN)) {
663 ERROR("Error: zfs entry for %s not found", orig->src);
664 return -1;
665 }
666
667 // zfs mount is output up to ' '
668 if ((p = index(output, ' ')) == NULL)
669 return -1;
670 *p = '\0';
671
672 execlp("zfs", "zfs", "destroy", output, NULL);
673 exit(1);
674}
675
1897e3bc
SH
676static int zfs_create(struct bdev *bdev, const char *dest, const char *n,
677 struct bdev_specs *specs)
678{
679 const char *zfsroot;
680 char option[MAXPATHLEN];
681 int ret;
682 pid_t pid;
683
684 if (!specs || !specs->u.zfs.zfsroot)
685 zfsroot = default_zfs_root();
686 else
687 zfsroot = specs->u.zfs.zfsroot;
688
689 if (!(bdev->dest = strdup(dest))) {
690 ERROR("No mount target specified or out of memory");
691 return -1;
692 }
693 if (!(bdev->src = strdup(bdev->dest))) {
694 ERROR("out of memory");
695 return -1;
696 }
697
698 ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s", bdev->dest);
699 if (ret < 0 || ret >= MAXPATHLEN)
700 return -1;
701 if ((pid = fork()) < 0)
702 return -1;
703 if (pid)
704 return wait_for_pid(pid);
705
706 char dev[MAXPATHLEN];
707 ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, n);
708 if (ret < 0 || ret >= MAXPATHLEN)
709 exit(1);
710 execlp("zfs", "zfs", "create", option, dev, NULL);
711 exit(1);
712}
713
3baa76fe
SH
714struct bdev_ops zfs_ops = {
715 .detect = &zfs_detect,
716 .mount = &zfs_mount,
717 .umount = &zfs_umount,
718 .clone_paths = &zfs_clonepaths,
60bf62d4 719 .destroy = &zfs_destroy,
1897e3bc 720 .create = &zfs_create,
3baa76fe
SH
721};
722
9be53773
SH
723//
724// LVM ops
725//
726
/*
 * Look at /sys/dev/block/maj:min/dm/uuid.  If it contains the hardcoded LVM
 * prefix "LVM-", then this is an lvm2 LV
 *
 * A @path starting with "lvm:" is accepted without any check.
 *
 * Returns 1 if @path is considered an LVM logical volume, 0 otherwise
 * (including when @path is not a block device or the uuid file cannot be
 * read).
 */
static int lvm_detect(const char *path)
{
	char devp[MAXPATHLEN], buf[4];
	FILE *fout;
	int ret;
	struct stat statbuf;

	if (strncmp(path, "lvm:", 4) == 0)
		return 1; // take their word for it

	ret = stat(path, &statbuf);
	if (ret != 0)
		return 0;
	if (!S_ISBLK(statbuf.st_mode))
		return 0;

	ret = snprintf(devp, MAXPATHLEN, "/sys/dev/block/%u:%u/dm/uuid",
			(unsigned int)major(statbuf.st_rdev),
			(unsigned int)minor(statbuf.st_rdev));
	if (ret < 0 || ret >= MAXPATHLEN) {
		ERROR("lvm uuid pathname too long");
		return 0;
	}
	fout = fopen(devp, "r");
	if (!fout)
		return 0;
	/* buf is not NUL-terminated; only ever compared with strncmp(.., 4) */
	ret = fread(buf, 1, 4, fout);
	fclose(fout);
	if (ret != 4 || strncmp(buf, "LVM-", 4) != 0)
		return 0;
	return 1;
}
762
763static int lvm_mount(struct bdev *bdev)
764{
765 if (strcmp(bdev->type, "lvm"))
766 return -22;
767 if (!bdev->src || !bdev->dest)
768 return -22;
769 /* if we might pass in data sometime, then we'll have to enrich
770 * mount_unknow_fs */
771 return mount_unknow_fs(bdev->src, bdev->dest, 0);
772}
773
774static int lvm_umount(struct bdev *bdev)
775{
776 if (strcmp(bdev->type, "lvm"))
777 return -22;
778 if (!bdev->src || !bdev->dest)
779 return -22;
780 return umount(bdev->dest);
781}
782
/*
 * path must be '/dev/$vg/$lv', $vg must be an existing VG, and $lv must
 * not yet exist.  This function will attempt to create /dev/$vg/$lv of
 * size $size.
 *
 * @size is given in bytes and passed to lvcreate as size/1000000.
 * Runs "lvcreate -L <size> <vg> -n <lv>" in a forked child.
 *
 * Returns -1 if fork() fails, otherwise the wait_for_pid() result for the
 * child; the child exits with status 1 on any failure.
 */
static int do_lvm_create(const char *path, unsigned long size)
{
	int ret;
	pid_t pid;
	char sz[24], *pathdup, *vg, *lv;

	if ((pid = fork()) < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	// lvcreate default size is in M, not bytes.
	ret = snprintf(sz, 24, "%lu", size/1000000);
	if (ret < 0 || ret >= 24)
		exit(1);

	pathdup = strdup(path);
	if (!pathdup)
		exit(1);
	/* split "/dev/<vg>/<lv>": cut at the last '/', lv follows it */
	lv = strrchr(pathdup, '/');
	if (!lv) {
		free(pathdup);
		exit(1);
	}
	*lv = '\0';
	lv++;
	/* what remains is "/dev/<vg>"; vg follows its last '/' */
	vg = strrchr(pathdup, '/');
	if (!vg) {
		free(pathdup);
		exit(1);
	}
	vg++;
	execlp("lvcreate", "lvcreate", "-L", sz, vg, "-n", lv, (char *)NULL);
	/* only reached when exec failed */
	free(pathdup);
	exit(1);
}
823
/*
 * Create an LVM snapshot of @orig whose logical volume name is the last
 * path component of @path, by running
 * "lvcreate -s -L <size> -n <lv> <orig>" in a forked child.
 *
 * @size is given in bytes and passed to lvcreate as size/1000000.
 *
 * Returns -1 if fork() fails, otherwise the wait_for_pid() result for the
 * child; the child exits with status 1 on any failure.
 */
static int lvm_snapshot(const char *orig, const char *path, unsigned long size)
{
	int ret;
	pid_t pid;
	char sz[24], *pathdup, *lv;

	if ((pid = fork()) < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);
	// lvcreate default size is in M, not bytes.
	ret = snprintf(sz, 24, "%lu", size/1000000);
	if (ret < 0 || ret >= 24)
		exit(1);

	pathdup = strdup(path);
	if (!pathdup)
		exit(1);
	/* the LV name is whatever follows the last '/' */
	lv = strrchr(pathdup, '/');
	if (!lv) {
		free(pathdup);
		exit(1);
	}
	*lv = '\0';
	lv++;

	execlp("lvcreate", "lvcreate", "-s", "-L", sz, "-n", lv, orig, (char *)NULL);
	/* only reached when exec failed */
	free(pathdup);
	exit(1);
}
855
856// this will return 1 for physical disks, qemu-nbd, loop, etc
857// right now only lvm is a block device
858static int is_blktype(struct bdev *b)
859{
860 if (strcmp(b->type, "lvm") == 0)
861 return 1;
862 return 0;
9be53773
SH
863}
864
865static int lvm_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
866 const char *cname, const char *oldpath, const char *lxcpath, int snap,
867 unsigned long newsize)
868{
869 char fstype[100];
870 unsigned long size = newsize;
ca52dcb5 871 int len, ret;
9be53773
SH
872
873 if (!orig->src || !orig->dest)
874 return -1;
875
876 if (strcmp(orig->type, "lvm")) {
31a95fec
SH
877 const char *vg;
878
ca52dcb5
SH
879 if (snap) {
880 ERROR("LVM snapshot from %s backing store is not supported",
881 orig->type);
882 return -1;
883 }
31a95fec
SH
884 vg = default_lvm_vg();
885 len = strlen("/dev/") + strlen(vg) + strlen(cname) + 2;
ca52dcb5
SH
886 if ((new->src = malloc(len)) == NULL)
887 return -1;
31a95fec 888 ret = snprintf(new->src, len, "/dev/%s/%s", vg, cname);
ca52dcb5
SH
889 if (ret < 0 || ret >= len)
890 return -1;
891 } else {
892 new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath);
893 if (!new->src)
894 return -1;
9be53773
SH
895 }
896
897 if (orig->data) {
898 new->data = strdup(orig->data);
899 if (!new->data)
900 return -1;
901 }
ca52dcb5
SH
902
903 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
904 new->dest = malloc(len);
9be53773
SH
905 if (!new->dest)
906 return -1;
ca52dcb5
SH
907 ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
908 if (ret < 0 || ret >= len)
9be53773 909 return -1;
ca52dcb5 910 if (mkdir_p(new->dest, 0755) < 0)
9be53773
SH
911 return -1;
912
ca52dcb5 913 if (is_blktype(orig)) {
eddaaafd 914 if (!newsize && blk_getsize(orig, &size) < 0) {
ca52dcb5
SH
915 ERROR("Error getting size of %s", orig->src);
916 return -1;
917 }
918 if (detect_fs(orig, fstype, 100) < 0) {
919 INFO("could not find fstype for %s, using ext3", orig->src);
920 return -1;
921 }
922 } else {
923 sprintf(fstype, "ext3");
924 if (!newsize)
925 size = 1000000000; // default to 1G
9be53773 926 }
ca52dcb5 927
9be53773
SH
928 if (snap) {
929 if (lvm_snapshot(orig->src, new->src, size) < 0) {
930 ERROR("could not create %s snapshot of %s", new->src, orig->src);
931 return -1;
932 }
933 } else {
1897e3bc 934 if (do_lvm_create(new->src, size) < 0) {
9be53773
SH
935 ERROR("Error creating new lvm blockdev");
936 return -1;
937 }
9be53773
SH
938 if (do_mkfs(new->src, fstype) < 0) {
939 ERROR("Error creating filesystem type %s on %s", fstype,
940 new->src);
941 return -1;
942 }
943 }
944
945 return 0;
946}
947
60bf62d4
SH
948static int lvm_destroy(struct bdev *orig)
949{
950 pid_t pid;
951
952 if ((pid = fork()) < 0)
953 return -1;
954 if (!pid) {
955 execlp("lvremove", "lvremove", "-f", orig->src, NULL);
956 exit(1);
957 }
958 return wait_for_pid(pid);
959}
960
eddaaafd
SH
961#define DEFAULT_FS_SIZE 1024000000
962#define DEFAULT_FSTYPE "ext3"
1897e3bc
SH
963static int lvm_create(struct bdev *bdev, const char *dest, const char *n,
964 struct bdev_specs *specs)
965{
966 const char *vg, *fstype, *lv = n;
967 unsigned long sz;
968 int ret, len;
969
970 if (!specs)
971 return -1;
972
973 vg = specs->u.lvm.vg;
974 if (!vg)
975 vg = default_lvm_vg();
976
977 /* /dev/$vg/$lv */
978 if (specs->u.lvm.lv)
979 lv = specs->u.lvm.lv;
980 len = strlen(vg) + strlen(lv) + 7;
981 bdev->src = malloc(len);
982 if (!bdev->src)
983 return -1;
984
985 ret = snprintf(bdev->src, len, "/dev/%s/%s", vg, lv);
986 if (ret < 0 || ret >= len)
987 return -1;
988
989 // lvm.fssize is in bytes.
990 sz = specs->u.lvm.fssize;
991 if (!sz)
eddaaafd 992 sz = DEFAULT_FS_SIZE;
1897e3bc
SH
993
994 INFO("Error creating new lvm blockdev %s size %lu", bdev->src, sz);
995 if (do_lvm_create(bdev->src, sz) < 0) {
996 ERROR("Error creating new lvm blockdev %s size %lu", bdev->src, sz);
997 return -1;
998 }
999
1000 fstype = specs->u.lvm.fstype;
1001 if (!fstype)
eddaaafd 1002 fstype = DEFAULT_FSTYPE;
1897e3bc
SH
1003 if (do_mkfs(bdev->src, fstype) < 0) {
1004 ERROR("Error creating filesystem type %s on %s", fstype,
1005 bdev->src);
1006 return -1;
1007 }
1008 if (!(bdev->dest = strdup(dest)))
1009 return -1;
1010
1011 if (mkdir_p(bdev->dest, 0755) < 0) {
1012 ERROR("Error creating %s\n", bdev->dest);
1013 return -1;
1014 }
1015
1016 return 0;
1017}
1018
9be53773
SH
1019struct bdev_ops lvm_ops = {
1020 .detect = &lvm_detect,
1021 .mount = &lvm_mount,
1022 .umount = &lvm_umount,
1023 .clone_paths = &lvm_clonepaths,
60bf62d4 1024 .destroy = &lvm_destroy,
1897e3bc 1025 .create = &lvm_create,
9be53773
SH
1026};
1027
1028//
1029// btrfs ops
1030//
1031
/*
 * Local copies of the btrfs space-info ioctl argument structures and
 * request numbers, used by is_btrfs_fs().
 */
struct btrfs_ioctl_space_info {
	unsigned long long flags;
	unsigned long long total_bytes;
	unsigned long long used_bytes;
};

struct btrfs_ioctl_space_args {
	unsigned long long space_slots;
	unsigned long long total_spaces;
	struct btrfs_ioctl_space_info spaces[0];
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, unsigned long long)
#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
                                    struct btrfs_ioctl_space_args)
1897e3bc 1049static bool is_btrfs_fs(const char *path)
9be53773 1050{
9be53773
SH
1051 int fd, ret;
1052 struct btrfs_ioctl_space_args sargs;
1053
1054 // make sure this is a btrfs filesystem
1055 fd = open(path, O_RDONLY);
1056 if (fd < 0)
1897e3bc 1057 return false;
9be53773
SH
1058 sargs.space_slots = 0;
1059 sargs.total_spaces = 0;
1060 ret = ioctl(fd, BTRFS_IOC_SPACE_INFO, &sargs);
1061 close(fd);
1062 if (ret < 0)
1897e3bc
SH
1063 return false;
1064
1065 return true;
1066}
1067
/*
 * Returns 1 if @path lives on btrfs (see is_btrfs_fs()) and is a directory
 * with inode number 256, which this code takes as the sign of a subvolume;
 * 0 otherwise.
 */
static int btrfs_detect(const char *path)
{
	struct stat st;
	int ret;

	if (!is_btrfs_fs(path))
		return 0;

	// and make sure it's a subvolume.
	ret = stat(path, &st);
	if (ret < 0)
		return 0;

	if (st.st_ino == 256 && S_ISDIR(st.st_mode))
		return 1;

	return 0;
}
1086
60bf62d4 1087static int btrfs_mount(struct bdev *bdev)
9be53773
SH
1088{
1089 if (strcmp(bdev->type, "btrfs"))
1090 return -22;
1091 if (!bdev->src || !bdev->dest)
1092 return -22;
1093 return mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC, NULL);
1094}
1095
60bf62d4 1096static int btrfs_umount(struct bdev *bdev)
9be53773
SH
1097{
1098 if (strcmp(bdev->type, "btrfs"))
1099 return -22;
1100 if (!bdev->src || !bdev->dest)
1101 return -22;
1102 return umount(bdev->dest);
1103}
1104
/*
 * Local copies of the btrfs subvolume/snapshot ioctl argument structures
 * and request numbers used by the btrfs backend below.
 */
#define BTRFS_SUBVOL_NAME_MAX 4039
#define BTRFS_PATH_NAME_MAX 4087

struct btrfs_ioctl_vol_args {
	signed long long fd;
	char name[BTRFS_PATH_NAME_MAX + 1];
};

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
                                   struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
                                   struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
                                   struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
                                   struct btrfs_ioctl_vol_args)

#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0)

struct btrfs_ioctl_vol_args_v2 {
	signed long long fd;
	unsigned long long transid;
	unsigned long long flags;
	union {
		struct {
			unsigned long long size;
			//struct btrfs_qgroup_inherit *qgroup_inherit;
			void *qgroup_inherit;
		};
		unsigned long long unused[4];
	};
	char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
1139
1140static int btrfs_subvolume_create(const char *path)
1141{
1142 int ret, fd = -1;
1143 struct btrfs_ioctl_vol_args args;
1144 char *p, *newfull = strdup(path);
1145
1146 if (!newfull) {
1147 ERROR("Error: out of memory");
1148 return -1;
1149 }
1150
c32981c3 1151 p = strrchr(newfull, '/');
9be53773
SH
1152 if (!p) {
1153 ERROR("bad path: %s", path);
1154 return -1;
1155 }
1156 *p = '\0';
1157
1158 if ((fd = open(newfull, O_RDONLY)) < 0) {
1159 ERROR("Error opening %s", newfull);
1160 free(newfull);
1161 return -1;
1162 }
1163
1164 memset(&args, 0, sizeof(args));
1165 strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1166 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1167 ret = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, &args);
1168 INFO("btrfs: snapshot create ioctl returned %d", ret);
9be53773
SH
1169
1170 free(newfull);
1171 close(fd);
1172 return ret;
1173}
1174
1175static int btrfs_snapshot(const char *orig, const char *new)
1176{
1177 int fd = -1, fddst = -1, ret = -1;
1178 struct btrfs_ioctl_vol_args_v2 args;
1179 char *newdir, *newname, *newfull = NULL;
1180
1181 newfull = strdup(new);
1182 if (!newfull) {
1183 ERROR("Error: out of memory");
1184 goto out;
1185 }
1186 // make sure the directory doesn't already exist
1187 if (rmdir(newfull) < 0 && errno != -ENOENT) {
1188 SYSERROR("Error removing empty new rootfs");
1189 goto out;
1190 }
1191 newname = basename(newfull);
1192 newdir = dirname(newfull);
1193 fd = open(orig, O_RDONLY);
1194 if (fd < 0) {
1195 SYSERROR("Error opening original rootfs %s", orig);
1196 goto out;
1197 }
1198 fddst = open(newdir, O_RDONLY);
1199 if (fddst < 0) {
1200 SYSERROR("Error opening new container dir %s", newdir);
1201 goto out;
1202 }
1203
1204 memset(&args, 0, sizeof(args));
1205 args.fd = fd;
1206 strncpy(args.name, newname, BTRFS_SUBVOL_NAME_MAX);
1207 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1208 ret = ioctl(fddst, BTRFS_IOC_SNAP_CREATE_V2, &args);
1209 INFO("btrfs: snapshot create ioctl returned %d", ret);
1210
1211out:
1212 if (fddst != -1)
1213 close(fddst);
1214 if (fd != -1)
1215 close(fd);
1216 if (newfull)
1217 free(newfull);
1218 return ret;
1219}
1220
1221static int btrfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1222 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1223 unsigned long newsize)
1224{
1225 if (!orig->dest || !orig->src)
1226 return -1;
1227
1228 if (strcmp(orig->type, "btrfs")) {
ca52dcb5
SH
1229 int len, ret;
1230 if (snap) {
1231 ERROR("btrfs snapshot from %s backing store is not supported",
1232 orig->type);
1233 return -1;
1234 }
1235 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1236 new->src = malloc(len);
1237 if (!new->src)
1238 return -1;
1239 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
1240 if (ret < 0 || ret >= len)
1241 return -1;
1242 } else {
1243 // in case rootfs is in custom path, reuse it
1244 if ((new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath)) == NULL)
1245 return -1;
9be53773 1246
ca52dcb5 1247 }
9be53773 1248
ca52dcb5 1249 if ((new->dest = strdup(new->src)) == NULL)
9be53773
SH
1250 return -1;
1251
1252 if (orig->data && (new->data = strdup(orig->data)) == NULL)
1253 return -1;
1254
1255 if (snap)
1256 return btrfs_snapshot(orig->dest, new->dest);
1257
1258 if (rmdir(new->dest) < 0 && errno != -ENOENT) {
1259 SYSERROR("removing %s\n", new->dest);
1260 return -1;
1261 }
1262
1263 return btrfs_subvolume_create(new->dest);
1264}
1265
60bf62d4
SH
1266static int btrfs_destroy(struct bdev *orig)
1267{
1268 int ret, fd = -1;
1269 struct btrfs_ioctl_vol_args args;
1270 char *path = orig->src;
1271 char *p, *newfull = strdup(path);
1272
1273 if (!newfull) {
1274 ERROR("Error: out of memory");
1275 return -1;
1276 }
1277
c32981c3 1278 p = strrchr(newfull, '/');
60bf62d4
SH
1279 if (!p) {
1280 ERROR("bad path: %s", path);
1281 return -1;
1282 }
1283 *p = '\0';
1284
1285 if ((fd = open(newfull, O_RDONLY)) < 0) {
1286 ERROR("Error opening %s", newfull);
1287 free(newfull);
1288 return -1;
1289 }
1290
1291 memset(&args, 0, sizeof(args));
1292 strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1293 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1294 ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args);
1295 INFO("btrfs: snapshot create ioctl returned %d", ret);
1296
1297 free(newfull);
1298 close(fd);
1299 return ret;
1300}
1301
1897e3bc
SH
1302static int btrfs_create(struct bdev *bdev, const char *dest, const char *n,
1303 struct bdev_specs *specs)
1304{
1305 bdev->src = strdup(dest);
1306 bdev->dest = strdup(dest);
1307 if (!bdev->src || !bdev->dest)
1308 return -1;
1309 return btrfs_subvolume_create(bdev->dest);
1310}
1311
9be53773
SH
1312struct bdev_ops btrfs_ops = {
1313 .detect = &btrfs_detect,
1314 .mount = &btrfs_mount,
1315 .umount = &btrfs_umount,
1316 .clone_paths = &btrfs_clonepaths,
60bf62d4 1317 .destroy = &btrfs_destroy,
1897e3bc 1318 .create = &btrfs_create,
9be53773
SH
1319};
1320
eddaaafd
SH
1321//
1322// loopback dev ops
1323//
/* Return 1 when @path carries the "loop:" prefix, 0 otherwise. */
static int loop_detect(const char *path)
{
	return strncmp(path, "loop:", 5) == 0 ? 1 : 0;
}
1330
/*
 * Scan /dev for an unused loop device.
 *
 * Every entry whose name starts with "loop" is opened read-write and
 * probed with LOOP_GET_STATUS64; a device counts as free only when that
 * ioctl fails with ENXIO.
 *
 * @retfd: receives the open file descriptor of the free device
 * @namep: receives "/dev/<name>"; at most 100 bytes are written, which
 *         matches the buffer loop_mount() passes in
 *
 * Returns 0 on success, -1 if /dev cannot be opened or no free device is
 * found.
 */
static int find_free_loopdev(int *retfd, char *namep)
{
	struct dirent *direntp;
	struct loop_info64 lo;
	DIR *dir;
	int fd = -1;

	if (!(dir = opendir("/dev"))) {
		SYSERROR("Error opening /dev");
		return -1;
	}
	/* readdir_r() is deprecated; readdir() on our own DIR stream is fine */
	while ((direntp = readdir(dir)) != NULL) {
		int ret;

		if (strncmp(direntp->d_name, "loop", 4) != 0)
			continue;
		if ((fd = openat(dirfd(dir), direntp->d_name, O_RDWR)) < 0)
			continue;
		if (ioctl(fd, LOOP_GET_STATUS64, &lo) == 0 || errno != ENXIO) {
			close(fd);
			fd = -1;
			continue;
		}
		// We can use this fd, provided its name fits the caller's buffer
		ret = snprintf(namep, 100, "/dev/%s", direntp->d_name);
		if (ret < 0 || ret >= 100) {
			close(fd);
			fd = -1;
			continue;
		}
		break;
	}
	closedir(dir);
	if (fd == -1) {
		ERROR("No loop device found");
		return -1;
	}

	*retfd = fd;
	return 0;
}
1368
1369static int loop_mount(struct bdev *bdev)
1370{
1371 int lfd, ffd = -1, ret = -1;
1372 struct loop_info64 lo;
1373 char loname[100];
1374
1375 if (strcmp(bdev->type, "loop"))
1376 return -22;
1377 if (!bdev->src || !bdev->dest)
1378 return -22;
1379 if (find_free_loopdev(&lfd, loname) < 0)
1380 return -22;
1381
1382 if ((ffd = open(bdev->src + 5, O_RDWR)) < 0) {
1383 SYSERROR("Error opening backing file %s\n", bdev->src);
1384 goto out;
1385 }
1386
1387 if (ioctl(lfd, LOOP_SET_FD, ffd) < 0) {
1388 SYSERROR("Error attaching backing file to loop dev");
1389 goto out;
1390 }
1391 memset(&lo, 0, sizeof(lo));
1392 lo.lo_flags = LO_FLAGS_AUTOCLEAR;
1393 if (ioctl(lfd, LOOP_SET_STATUS64, &lo) < 0) {
1394 SYSERROR("Error setting autoclear on loop dev\n");
1395 goto out;
1396 }
1397
1398 ret = mount_unknow_fs(loname, bdev->dest, 0);
1399 if (ret < 0)
1400 ERROR("Error mounting %s\n", bdev->src);
1401 else
1402 bdev->lofd = lfd;
1403
1404out:
1405 if (ffd > -1)
1406 close(ffd);
1407 if (ret < 0) {
1408 close(lfd);
1409 bdev->lofd = -1;
1410 }
1411 return ret;
1412}
1413
1414static int loop_umount(struct bdev *bdev)
1415{
1416 int ret;
1417
1418 if (strcmp(bdev->type, "loop"))
1419 return -22;
1420 if (!bdev->src || !bdev->dest)
1421 return -22;
1422 ret = umount(bdev->dest);
1423 if (bdev->lofd >= 0) {
1424 close(bdev->lofd);
1425 bdev->lofd = -1;
1426 }
1427 return ret;
1428}
1429
/*
 * Create the backing file for a loop device at @path and put a
 * filesystem of type @fstype on it.
 *
 * The file is extended by seeking to offset @size and writing one byte.
 * Returns 0 on success, -1 on any failure.
 */
static int do_loop_create(const char *path, unsigned long size, const char *fstype)
{
	// create the new loopback file.
	int loopfile = creat(path, S_IRUSR|S_IWUSR);

	if (loopfile < 0)
		return -1;

	if (lseek(loopfile, size, SEEK_SET) < 0) {
		SYSERROR("Error seeking to set new loop file size");
		goto fail_close;
	}
	if (write(loopfile, "1", 1) != 1) {
		SYSERROR("Error creating new loop file");
		goto fail_close;
	}
	if (close(loopfile) < 0) {
		SYSERROR("Error closing new loop file");
		return -1;
	}

	// create an fs in the loopback file
	if (do_mkfs(path, fstype) < 0) {
		ERROR("Error creating filesystem type %s on %s", fstype,
			path);
		return -1;
	}

	return 0;

fail_close:
	close(loopfile);
	return -1;
}
1461
1462/*
1463 * No idea what the original blockdev will be called, but the copy will be
1464 * called $lxcpath/$lxcname/rootdev
1465 */
1466static int loop_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1467 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1468 unsigned long newsize)
1469{
1470 char fstype[100];
1471 unsigned long size = newsize;
1472 int len, ret;
1473 char *srcdev;
1474
1475 if (snap) {
1476 ERROR("loop devices cannot be snapshotted.");
1477 return -1;
1478 }
1479
1480 if (!orig->dest || !orig->src)
1481 return -1;
1482
1483 len = strlen(lxcpath) + strlen(cname) + strlen("rootdev") + 3;
1484 srcdev = alloca(len);
1485 ret = snprintf(srcdev, len, "%s/%s/rootdev", lxcpath, cname);
1486 if (ret < 0 || ret >= len)
1487 return -1;
1488
1489 new->src = malloc(len + 5);
1490 if (!new->src)
1491 return -1;
1492 ret = snprintf(new->src, len + 5, "loop:%s", srcdev);
1493 if (ret < 0 || ret >= len + 5)
1494 return -1;
1495
1496 new->dest = malloc(len);
1497 if (!new->dest)
1498 return -1;
1499 ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
1500 if (ret < 0 || ret >= len)
1501 return -1;
1502
1503 // it's tempting to say: if orig->src == loopback and !newsize, then
1504 // copy the loopback file. However, we'd have to make sure to
1505 // correctly keep holes! So punt for now.
1506
1507 if (is_blktype(orig)) {
1508 if (!newsize && blk_getsize(orig, &size) < 0) {
1509 ERROR("Error getting size of %s", orig->src);
1510 return -1;
1511 }
1512 if (detect_fs(orig, fstype, 100) < 0) {
1513 INFO("could not find fstype for %s, using %s", orig->src,
1514 DEFAULT_FSTYPE);
1515 return -1;
1516 }
1517 } else {
1518 sprintf(fstype, "%s", DEFAULT_FSTYPE);
1519 if (!newsize)
1520 size = DEFAULT_FS_SIZE; // default to 1G
1521 }
1522 return do_loop_create(srcdev, size, fstype);
1523}
1524
1525static int loop_create(struct bdev *bdev, const char *dest, const char *n,
1526 struct bdev_specs *specs)
1527{
1528 const char *fstype;
1529 unsigned long sz;
1530 int ret, len;
1531 char *srcdev;
1532
1533 if (!specs)
1534 return -1;
1535
1536 // dest is passed in as $lxcpath / $lxcname / rootfs
1537 // srcdev will be: $lxcpath / $lxcname / rootdev
1538 // src will be 'loop:$srcdev'
1539 len = strlen(dest) + 2;
1540 srcdev = alloca(len);
1541
1542 ret = snprintf(srcdev, len, "%s", dest);
1543 if (ret < 0 || ret >= len)
1544 return -1;
1545 sprintf(srcdev + len - 4, "dev");
1546
1547 bdev->src = malloc(len + 5);
1548 if (!bdev->src)
1549 return -1;
1550 ret = snprintf(bdev->src, len + 5, "loop:%s", srcdev);
1551 if (ret < 0 || ret >= len + 5)
1552 return -1;
1553
1554 sz = specs->u.loop.fssize;
1555 if (!sz)
1556 sz = DEFAULT_FS_SIZE;
1557
1558 fstype = specs->u.loop.fstype;
1559 if (!fstype)
1560 fstype = DEFAULT_FSTYPE;
1561
1562 if (!(bdev->dest = strdup(dest)))
1563 return -1;
1564
1565 if (mkdir_p(bdev->dest, 0755) < 0) {
1566 ERROR("Error creating %s\n", bdev->dest);
1567 return -1;
1568 }
1569
1570 return do_loop_create(srcdev, sz, fstype);
1571}
1572
1573static int loop_destroy(struct bdev *orig)
1574{
1575 return unlink(orig->src + 5);
1576}
1577
1578struct bdev_ops loop_ops = {
1579 .detect = &loop_detect,
1580 .mount = &loop_mount,
1581 .umount = &loop_umount,
1582 .clone_paths = &loop_clonepaths,
1583 .destroy = &loop_destroy,
1584 .create = &loop_create,
1585};
1586
9be53773
SH
1587//
1588// overlayfs ops
1589//
1590
/*
 * Return 1 when @path carries the "overlayfs:" prefix, 0 otherwise.
 * Nothing beyond the prefix is validated.
 */
static int overlayfs_detect(const char *path)
{
	// take their word for it
	return strncmp(path, "overlayfs:", 10) == 0 ? 1 : 0;
}
1597
//
// overlayfs mount / umount / clone ops
//
60bf62d4 1601static int overlayfs_mount(struct bdev *bdev)
9be53773
SH
1602{
1603 char *options, *dup, *lower, *upper;
1604 int len;
1605 int ret;
1606
1607 if (strcmp(bdev->type, "overlayfs"))
1608 return -22;
1609 if (!bdev->src || !bdev->dest)
1610 return -22;
1611
1612 // separately mount it first
1613 // mount -t overlayfs -oupperdir=${upper},lowerdir=${lower} lower dest
d74325c4
SG
1614 dup = alloca(strlen(bdev->src)+1);
1615 strcpy(dup, bdev->src);
9be53773
SH
1616 if (!(lower = index(dup, ':')))
1617 return -22;
1618 if (!(upper = index(++lower, ':')))
1619 return -22;
1620 *upper = '\0';
1621 upper++;
1622
1623 // TODO We should check whether bdev->src is a blockdev, and if so
1624 // but for now, only support overlays of a basic directory
1625
1626 len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=") + 1;
1627 options = alloca(len);
1628 ret = snprintf(options, len, "upperdir=%s,lowerdir=%s", upper, lower);
1629 if (ret < 0 || ret >= len)
1630 return -1;
1631 ret = mount(lower, bdev->dest, "overlayfs", MS_MGC_VAL, options);
1632 if (ret < 0)
1633 SYSERROR("overlayfs: error mounting %s onto %s options %s",
1634 lower, bdev->dest, options);
1635 else
1636 INFO("overlayfs: mounted %s onto %s options %s",
1637 lower, bdev->dest, options);
1638 return ret;
1639}
1640
60bf62d4 1641static int overlayfs_umount(struct bdev *bdev)
9be53773
SH
1642{
1643 if (strcmp(bdev->type, "overlayfs"))
1644 return -22;
1645 if (!bdev->src || !bdev->dest)
1646 return -22;
1647 return umount(bdev->dest);
1648}
1649
1650static int overlayfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1651 const char *cname, const char *oldpath, const char *lxcpath, int snap,
1652 unsigned long newsize)
1653{
1654 if (!snap) {
1655 ERROR("overlayfs is only for snapshot clones");
1656 return -22;
1657 }
1658
1659 if (!orig->src || !orig->dest)
1660 return -1;
1661
1662 new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
1663 if (!new->dest)
1664 return -1;
1665 if (mkdir_p(new->dest, 0755) < 0)
1666 return -1;
1667
1668 if (strcmp(orig->type, "dir") == 0) {
1669 char *delta;
1670 int ret, len;
5ca6c34b 1671
9be53773
SH
1672 // if we have /var/lib/lxc/c2/rootfs, then delta will be
1673 // /var/lib/lxc/c2/delta0
1674 delta = strdup(new->dest);
1675 if (!delta) {
1676 return -1;
1677 }
1678 if (strlen(delta) < 6) {
1679 free(delta);
1680 return -22;
1681 }
1682 strcpy(&delta[strlen(delta)-6], "delta0");
1683 if ((ret = mkdir(delta, 0755)) < 0) {
1684 SYSERROR("error: mkdir %s", delta);
1685 free(delta);
1686 return -1;
1687 }
1688
1689 // the src will be 'overlayfs:lowerdir:upperdir'
1690 len = strlen(delta) + strlen(orig->src) + 12;
1691 new->src = malloc(len);
1692 if (!new->src) {
1693 free(delta);
1694 return -ENOMEM;
1695 }
1696 ret = snprintf(new->src, len, "overlayfs:%s:%s", orig->src, delta);
1697 free(delta);
1698 if (ret < 0 || ret >= len)
1699 return -ENOMEM;
9be53773
SH
1700 } else if (strcmp(orig->type, "overlayfs") == 0) {
1701 // What exactly do we want to do here?
1702 // I think we want to use the original lowerdir, with a
1703 // private delta which is originally rsynced from the
1704 // original delta
1705 char *osrc, *odelta, *nsrc, *ndelta;
1706 int len, ret;
1707 if (!(osrc = strdup(orig->src)))
1708 return -22;
1709 nsrc = index(osrc, ':') + 1;
1710 if (nsrc != osrc + 10 || (odelta = index(nsrc, ':')) == NULL) {
1711 free(osrc);
1712 return -22;
1713 }
1714 *odelta = '\0';
1715 odelta++;
1716 ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath);
1717 if (!ndelta) {
1718 free(osrc);
1719 return -ENOMEM;
1720 }
1721 if (do_rsync(odelta, ndelta) < 0) {
91c908ee
DE
1722 free(osrc);
1723 free(ndelta);
9be53773
SH
1724 ERROR("copying overlayfs delta");
1725 return -1;
1726 }
1727 len = strlen(nsrc) + strlen(ndelta) + 12;
1728 new->src = malloc(len);
1729 if (!new->src) {
1730 free(osrc);
1731 free(ndelta);
1732 return -ENOMEM;
1733 }
1734 ret = snprintf(new->src, len, "overlayfs:%s:%s", nsrc, ndelta);
1735 free(osrc);
1736 free(ndelta);
1737 if (ret < 0 || ret >= len)
1738 return -ENOMEM;
375c2258
SH
1739 } else {
1740 ERROR("overlayfs clone of %s container is not yet supported",
1741 orig->type);
1742 // Note, supporting this will require overlayfs_mount supporting
1743 // mounting of the underlay. No big deal, just needs to be done.
1744 return -1;
9be53773
SH
1745 }
1746
1747 return 0;
1748}
1749
60bf62d4
SH
1750int overlayfs_destroy(struct bdev *orig)
1751{
1752 char *upper;
1753
1754 if (strncmp(orig->src, "overlayfs:", 10) != 0)
1755 return -22;
1756 upper = index(orig->src + 10, ':');
1757 if (!upper)
1758 return -22;
1759 upper++;
1760 return lxc_rmdir_onedev(upper);
1761}
1762
1897e3bc
SH
1763/*
1764 * to say 'lxc-create -t ubuntu -n o1 -B overlayfs' means you want
1765 * $lxcpath/$lxcname/rootfs to have the created container, while all
1766 * changes after starting the container are written to
1767 * $lxcpath/$lxcname/delta0
1768 */
1769static int overlayfs_create(struct bdev *bdev, const char *dest, const char *n,
1770 struct bdev_specs *specs)
1771{
1772 char *delta;
1773 int ret, len = strlen(dest), newlen;
1774
1775 if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0)
1776 return -1;
1777
1778 if (!(bdev->dest = strdup(dest))) {
1779 ERROR("Out of memory");
1780 return -1;
1781 }
1782
d74325c4
SG
1783 delta = alloca(strlen(dest)+1);
1784 strcpy(delta, dest);
1897e3bc
SH
1785 strcpy(delta+len-6, "delta0");
1786
1787 if (mkdir_p(delta, 0755) < 0) {
1788 ERROR("Error creating %s\n", delta);
1789 return -1;
1790 }
1791
1792 /* overlayfs:lower:upper */
1793 newlen = (2 * len) + strlen("overlayfs:") + 2;
1794 bdev->src = malloc(newlen);
1795 if (!bdev->src) {
1796 ERROR("Out of memory");
1797 return -1;
1798 }
1799 ret = snprintf(bdev->src, newlen, "overlayfs:%s:%s", dest, delta);
1800 if (ret < 0 || ret >= newlen)
1801 return -1;
1802
1803 if (mkdir_p(bdev->dest, 0755) < 0) {
1804 ERROR("Error creating %s\n", bdev->dest);
1805 return -1;
1806 }
1807
1808 return 0;
1809}
1810
9be53773
SH
1811struct bdev_ops overlayfs_ops = {
1812 .detect = &overlayfs_detect,
1813 .mount = &overlayfs_mount,
1814 .umount = &overlayfs_umount,
1815 .clone_paths = &overlayfs_clonepaths,
60bf62d4 1816 .destroy = &overlayfs_destroy,
1897e3bc 1817 .create = &overlayfs_create,
9be53773
SH
1818};
1819
1820struct bdev_type bdevs[] = {
3baa76fe 1821 {.name = "zfs", .ops = &zfs_ops,},
9be53773
SH
1822 {.name = "lvm", .ops = &lvm_ops,},
1823 {.name = "btrfs", .ops = &btrfs_ops,},
1824 {.name = "dir", .ops = &dir_ops,},
1825 {.name = "overlayfs", .ops = &overlayfs_ops,},
eddaaafd 1826 {.name = "loop", .ops = &loop_ops,},
9be53773
SH
1827};
1828
1829static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type);
1830
1831void bdev_put(struct bdev *bdev)
1832{
1833 if (bdev->data)
1834 free(bdev->data);
1835 if (bdev->src)
1836 free(bdev->src);
1837 if (bdev->dest)
1838 free(bdev->dest);
1839 free(bdev);
1840}
1841
1842struct bdev *bdev_get(const char *type)
1843{
1844 int i;
1845 struct bdev *bdev;
1846
1847 for (i=0; i<numbdevs; i++) {
1848 if (strcmp(bdevs[i].name, type) == 0)
1849 break;
1850 }
1851 if (i == numbdevs)
1852 return NULL;
1853 bdev = malloc(sizeof(struct bdev));
1854 if (!bdev)
1855 return NULL;
1856 memset(bdev, 0, sizeof(struct bdev));
1857 bdev->ops = bdevs[i].ops;
1858 bdev->type = bdevs[i].name;
1859 return bdev;
1860}
1861
1862struct bdev *bdev_init(const char *src, const char *dst, const char *data)
1863{
1864 int i;
1865 struct bdev *bdev;
1866
1867 for (i=0; i<numbdevs; i++) {
1868 int r;
1869 r = bdevs[i].ops->detect(src);
1870 if (r)
1871 break;
1872 }
eddaaafd 1873
9be53773
SH
1874 if (i == numbdevs)
1875 return NULL;
1876 bdev = malloc(sizeof(struct bdev));
1877 if (!bdev)
1878 return NULL;
1879 memset(bdev, 0, sizeof(struct bdev));
1880 bdev->ops = bdevs[i].ops;
1881 bdev->type = bdevs[i].name;
1882 if (data)
1883 bdev->data = strdup(data);
1884 if (src)
1885 bdev->src = strdup(src);
1886 if (dst)
1887 bdev->dest = strdup(dst);
1888
1889 return bdev;
1890}
1891
1892/*
1893 * If we're not snaphotting, then bdev_copy becomes a simple case of mount
1894 * the original, mount the new, and rsync the contents.
1895 */
1896struct bdev *bdev_copy(const char *src, const char *oldname, const char *cname,
1897 const char *oldpath, const char *lxcpath, const char *bdevtype,
dfb31b25
SH
1898 int snap, const char *bdevdata, unsigned long newsize,
1899 int *needs_rdep)
9be53773
SH
1900{
1901 struct bdev *orig, *new;
1902 pid_t pid;
1903
1904 /* if the container name doesn't show up in the rootfs path, then
1905 * we don't know how to come up with a new name
1906 */
1907 if (strstr(src, oldname) == NULL) {
1908 ERROR("original rootfs path %s doesn't include container name %s",
1909 src, oldname);
1910 return NULL;
1911 }
1912
1913 orig = bdev_init(src, NULL, NULL);
1914 if (!orig) {
1915 ERROR("failed to detect blockdev type for %s\n", src);
1916 return NULL;
1917 }
1918
1919 if (!orig->dest) {
1920 int ret;
1921 orig->dest = malloc(MAXPATHLEN);
1922 if (!orig->dest) {
1923 ERROR("out of memory");
1924 bdev_put(orig);
1925 return NULL;
1926 }
1927 ret = snprintf(orig->dest, MAXPATHLEN, "%s/%s/rootfs", oldpath, oldname);
1928 if (ret < 0 || ret >= MAXPATHLEN) {
1929 ERROR("rootfs path too long");
1930 bdev_put(orig);
1931 return NULL;
1932 }
1933 }
1934
e3fdf5cc
SH
1935 /*
1936 * If newtype is NULL and snapshot is set, then use overlayfs
1937 */
1938 if (!bdevtype && snap && strcmp(orig->type , "dir") == 0)
1939 bdevtype = "overlayfs";
1940
dfb31b25
SH
1941 *needs_rdep = 0;
1942 if (strcmp(orig->type, "dir") == 0 &&
1943 strcmp(bdevtype, "overlayfs") == 0)
1944 *needs_rdep = 1;
1945
9be53773
SH
1946 new = bdev_get(bdevtype ? bdevtype : orig->type);
1947 if (!new) {
1948 ERROR("no such block device type: %s", bdevtype ? bdevtype : orig->type);
1949 bdev_put(orig);
1950 return NULL;
1951 }
1952
1953 if (new->ops->clone_paths(orig, new, oldname, cname, oldpath, lxcpath, snap, newsize) < 0) {
1954 ERROR("failed getting pathnames for cloned storage: %s\n", src);
1955 bdev_put(orig);
1956 bdev_put(new);
1957 return NULL;
1958 }
1959
1960 pid = fork();
1961 if (pid < 0) {
1962 SYSERROR("fork");
1963 bdev_put(orig);
1964 bdev_put(new);
1965 return NULL;
1966 }
1967
1968 if (pid > 0) {
1969 int ret = wait_for_pid(pid);
1970 bdev_put(orig);
1971 if (ret < 0) {
1972 bdev_put(new);
1973 return NULL;
1974 }
1975 return new;
1976 }
1977
1978 if (unshare(CLONE_NEWNS) < 0) {
1979 SYSERROR("unshare CLONE_NEWNS");
1980 exit(1);
1981 }
1982 if (snap)
1983 exit(0);
1984
1985 // If not a snapshot, copy the fs.
1986 if (orig->ops->mount(orig) < 0) {
1987 ERROR("failed mounting %s onto %s\n", src, orig->dest);
1988 exit(1);
1989 }
1990 if (new->ops->mount(new) < 0) {
1991 ERROR("failed mounting %s onto %s\n", new->src, new->dest);
1992 exit(1);
1993 }
1994 if (do_rsync(orig->dest, new->dest) < 0) {
1995 ERROR("rsyncing %s to %s\n", orig->src, new->src);
1996 exit(1);
1997 }
1998 // don't bother umounting, ns exit will do that
1999
2000 exit(0);
2001}
1897e3bc 2002
d44e88c2
SH
2003static struct bdev * do_bdev_create(const char *dest, const char *type,
2004 const char *cname, struct bdev_specs *specs)
2005{
2006 struct bdev *bdev = bdev_get(type);
2007 if (!bdev) {
2008 return NULL;
2009 }
2010
2011 if (bdev->ops->create(bdev, dest, cname, specs) < 0) {
2012 bdev_put(bdev);
2013 return NULL;
2014 }
2015
2016 return bdev;
2017}
2018
1897e3bc
SH
/*
 * bdev_create:
 * Create a backing store for a container.
 * If successful, return a struct bdev * for the new store; the caller
 * releases it with bdev_put(). NOTE(review): the earlier comment also
 * said the bdev comes back mounted and must be unmounted by the caller -
 * confirm against the create ops.
 * @dest: the mountpoint (i.e. /var/lib/lxc/$name/rootfs)
 * @type: the bdevtype (dir, btrfs, zfs, etc). NULL means "dir", "best"
 *        tries btrfs, zfs, lvm and dir in that order, and a
 *        comma-separated list (e.g. "lvm,dir") is tried left to right.
 * @cname: the container name
 * @specs: details about the backing store to create, like fstype
 */
struct bdev *bdev_create(const char *dest, const char *type,
			const char *cname, struct bdev_specs *specs)
{
	struct bdev *bdev;
	char *best_options[] = {"btrfs", "zfs", "lvm", "dir", NULL};

	if (type == NULL)
		return do_bdev_create(dest, "dir", cname, specs);

	if (strcmp(type, "best") == 0) {
		char **candidate;

		// try for the best backing store type, according to our
		// opinionated preferences
		for (candidate = best_options; *candidate != NULL; candidate++) {
			bdev = do_bdev_create(dest, *candidate, cname, specs);
			if (bdev != NULL)
				return bdev;
		}
		return NULL;  // 'dir' should never fail, so this shouldn't happen
	}

	// -B lvm,dir
	if (index(type, ',') != NULL) {
		char *saveptr, *token;
		char *list = alloca(strlen(type) + 1);

		strcpy(list, type);
		token = strtok_r(list, ",", &saveptr);
		while (token != NULL) {
			bdev = do_bdev_create(dest, token, cname, specs);
			if (bdev != NULL)
				return bdev;
			token = strtok_r(NULL, ",", &saveptr);
		}
	}

	return do_bdev_create(dest, type, cname, specs);
}
2063
/*
 * Cut the string @p at its first ':' (in place) and return @p.
 * A string without ':' is returned unchanged.
 */
char *overlayfs_getlower(char *p)
{
	char *sep = strchr(p, ':');

	if (sep != NULL)
		*sep = '\0';
	return p;
}