]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/bdev.c
ovl_rsync: make sure to umount
[mirror_lxc.git] / src / lxc / bdev.c
CommitLineData
9be53773
SH
1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9be53773
SH
22 */
23
24/*
25 * this is all just a first shot for experiment. If we go this route, much
ec64264d 26 * should change. bdev should be a directory with per-bdev file. Things which
9be53773
SH
27 * I'm doing by calling out to userspace should sometimes be done through
28 * libraries like liblvm2
29 */
30#define _GNU_SOURCE
31#include <stdio.h>
d659597e
SA
32#include <stdint.h>
33#include <inttypes.h>
c476bdce
SH
34#include <sys/types.h>
35#include <grp.h>
9be53773
SH
36#include <unistd.h>
37#include <errno.h>
38#include <sched.h>
39#include <sys/mount.h>
40#include <sys/wait.h>
41#include <libgen.h>
eddaaafd
SH
42#include <linux/loop.h>
43#include <dirent.h>
76a26f55 44#include <sys/prctl.h>
f2363e38 45
9be53773
SH
46#include "lxc.h"
47#include "config.h"
48#include "conf.h"
49#include "bdev.h"
50#include "log.h"
51#include "error.h"
52#include "utils.h"
53#include "namespace.h"
54#include "parse.h"
95ee490b 55#include "lxclock.h"
ff462013 56#include "lxc-btrfs.h"
9be53773 57
bff13ba2
SG
58#ifndef BLKGETSIZE64
59#define BLKGETSIZE64 _IOR(0x12,114,size_t)
60#endif
61
62#ifndef LO_FLAGS_AUTOCLEAR
63#define LO_FLAGS_AUTOCLEAR 4
64#endif
65
f5fd66f7
SG
66#ifndef LOOP_CTL_GET_FREE
67#define LOOP_CTL_GET_FREE 0x4C82
68#endif
69
d659597e
SA
70#define DEFAULT_FS_SIZE 1073741824
71#define DEFAULT_FSTYPE "ext3"
72
9be53773
SH
73lxc_log_define(bdev, lxc);
74
186bef00
SH
/*
 * Pair of backing stores. Judging by the name this is the argument bundle
 * for an overlayfs rsync step; its users are not visible here - TODO confirm.
 */
struct ovl_rsync_data {
	struct bdev *orig;	/* presumably the bdev being copied from */
	struct bdev *new;	/* presumably the bdev being copied to */
};
79
2659c7cb
SH
/*
 * Pair of strings named src/dest; by the struct name presumably the source
 * and destination paths of an rsync invocation - users are outside this view.
 */
struct rsync_data_char {
	char *src;
	char *dest;
};
84
9be53773
SH
/*
 * Run "rsync -aHX --delete <src>/ <dest>" in a forked child.
 *
 * Returns -1 if fork() fails, otherwise the result of wait_for_pid() for
 * the child. The child exits with status 1 if it cannot allocate the
 * source argument or if the exec fails.
 */
static int do_rsync(const char *src, const char *dest)
{
	pid_t child;
	size_t srclen;
	char *srcdir;

	child = fork();
	if (child < 0)
		return -1;
	if (child != 0)
		return wait_for_pid(child);

	/* child: build "<src>/" as the rsync source argument */
	srclen = strlen(src);
	srcdir = malloc(srclen + 2);
	if (!srcdir)
		exit(1);
	memcpy(srcdir, src, srclen);
	srcdir[srclen] = '/';
	srcdir[srclen + 1] = '\0';

	execlp("rsync", "rsync", "-aHX", "--delete", srcdir, dest, (char *)NULL);
	exit(1);
}
109
eddaaafd 110/*
d659597e 111 * return block size of dev->src in units of bytes
eddaaafd 112 */
d659597e 113static int blk_getsize(struct bdev *bdev, uint64_t *size)
9be53773
SH
114{
115 int fd, ret;
eddaaafd
SH
116 char *path = bdev->src;
117
118 if (strcmp(bdev->type, "loop") == 0)
119 path = bdev->src + 5;
9be53773
SH
120
121 fd = open(path, O_RDONLY);
42fb4b15 122 if (fd < 0)
9be53773 123 return -1;
d659597e
SA
124
125 ret = ioctl(fd, BLKGETSIZE64, size); // size of device in bytes
9be53773
SH
126 close(fd);
127 return ret;
128}
129
130/*
131 * These are copied from conf.c. However as conf.c will be moved to using
132 * the callback system, they can be pulled from there eventually, so we
133 * don't need to pollute utils.c with these low level functions
134 */
/*
 * Per-line callback used with lxc_file_for_each_line() by
 * mount_unknown_fs(): treat @buffer as one filesystem type name and try to
 * mount cbarg->rootfs on cbarg->target with it.
 *
 * @buffer: one line of the filesystems file; trimmed in place
 * @data:   pointer to the caller's struct cbarg (rootfs, target, options)
 *
 * Returns 1 once a mount succeeded, 0 to let the caller try the next line
 * (nodev entry, unparsable options, or mount failure).
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		const char *options;
	} *cbarg = data;

	unsigned long mntflags = 0;
	/* initialized so the free() calls below are safe on every path */
	char *mntdata = NULL;
	char *fstype;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev"))
		return 0;

	/* strip leading/trailing junk around the type name */
	fstype = buffer;
	fstype += lxc_char_left_gc(fstype, strlen(fstype));
	fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      cbarg->rootfs, cbarg->target, fstype);

	if (parse_mntopts(cbarg->options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return 0;
	}

	if (mount(cbarg->rootfs, cbarg->target, fstype, mntflags, mntdata)) {
		DEBUG("mount failed with error: %s", strerror(errno));
		free(mntdata);
		return 0;
	}

	free(mntdata);

	INFO("mounted '%s' on '%s', with fstype '%s'",
	     cbarg->rootfs, cbarg->target, fstype);

	return 1;
}
176
a17b1e65
SG
177static int mount_unknown_fs(const char *rootfs, const char *target,
178 const char *options)
9be53773
SH
179{
180 int i;
181
182 struct cbarg {
183 const char *rootfs;
184 const char *target;
a17b1e65 185 const char *options;
9be53773
SH
186 } cbarg = {
187 .rootfs = rootfs,
188 .target = target,
a17b1e65 189 .options = options,
9be53773
SH
190 };
191
192 /*
193 * find the filesystem type with brute force:
194 * first we check with /etc/filesystems, in case the modules
195 * are auto-loaded and fall back to the supported kernel fs
196 */
197 char *fsfile[] = {
198 "/etc/filesystems",
199 "/proc/filesystems",
200 };
201
202 for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
203
204 int ret;
205
206 if (access(fsfile[i], F_OK))
207 continue;
208
209 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
210 if (ret < 0) {
211 ERROR("failed to parse '%s'", fsfile[i]);
212 return -1;
213 }
214
215 if (ret)
216 return 0;
217 }
218
219 ERROR("failed to determine fs type for '%s'", rootfs);
220 return -1;
221}
222
/*
 * Run "mkfs -t <fstype> <path>" in a forked child.
 *
 * Returns -1 if fork() fails, otherwise the result of wait_for_pid() for
 * the child. The child exits with status 1 if null_stdfds() or the exec
 * fails.
 */
static int do_mkfs(const char *path, const char *fstype)
{
	pid_t pid;

	if ((pid = fork()) < 0) {
		ERROR("error forking");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	// If the file is not a block device, we don't want mkfs to ask
	// us about whether to proceed.
	if (null_stdfds() < 0)
		exit(1);
	/* the variadic sentinel must be a char pointer, not a bare NULL */
	execlp("mkfs", "mkfs", "-t", fstype, path, (char *)NULL);
	exit(1);
}
241
/*
 * Resolve one level of symlink for @path.
 *
 * @path: path to examine
 * @dest: caller buffer of at least MAXPATHLEN bytes for the link target
 *
 * Returns NULL if @path cannot be stat'ed or the link cannot be read,
 * @path itself when it is not reported as a symlink, or @dest holding the
 * NUL-terminated link target.
 *
 * NOTE(review): stat() follows symlinks, so S_ISLNK() on its result looks
 * like it can never be true and this would always return @path. Switching
 * to lstat() would change what detect_fs() compares against
 * /proc/self/mounts, so confirm the intent before touching it.
 */
static char *linkderef(char *path, char *dest)
{
	struct stat st;
	ssize_t n;

	if (stat(path, &st) < 0)
		return NULL;
	if (!S_ISLNK(st.st_mode))
		return path;

	n = readlink(path, dest, MAXPATHLEN);
	if (n < 0) {
		SYSERROR("error reading link %s", path);
		return NULL;
	}
	if (n >= MAXPATHLEN) {
		ERROR("link in %s too long", path);
		return NULL;
	}

	dest[n] = '\0';
	return dest;
}
263
264/*
265 * Given a bdev (presumably blockdev-based), detect the fstype
266 * by trying mounting (in a private mntns) it.
267 * @bdev: bdev to investigate
268 * @type: preallocated char* in which to write the fstype
269 * @len: length of passed in char*
270 * Returns length of fstype, of -1 on error
271 */
272static int detect_fs(struct bdev *bdev, char *type, int len)
273{
274 int p[2], ret;
275 size_t linelen;
276 pid_t pid;
277 FILE *f;
278 char *sp1, *sp2, *sp3, *line = NULL;
5d9598d7 279 char *srcdev;
9be53773
SH
280
281 if (!bdev || !bdev->src || !bdev->dest)
282 return -1;
283
5d9598d7 284 srcdev = bdev->src;
eddaaafd
SH
285 if (strcmp(bdev->type, "loop") == 0)
286 srcdev = bdev->src + 5;
287
025ed0f3 288 ret = pipe(p);
025ed0f3 289 if (ret < 0)
9be53773
SH
290 return -1;
291 if ((pid = fork()) < 0)
292 return -1;
293 if (pid > 0) {
294 int status;
295 close(p[1]);
296 memset(type, 0, len);
297 ret = read(p[0], type, len-1);
298 close(p[0]);
299 if (ret < 0) {
300 SYSERROR("error reading from pipe");
301 wait(&status);
302 return -1;
303 } else if (ret == 0) {
304 ERROR("child exited early - fstype not found");
305 wait(&status);
306 return -1;
307 }
308 wait(&status);
309 type[len-1] = '\0';
eddaaafd 310 INFO("detected fstype %s for %s", type, srcdev);
9be53773
SH
311 return ret;
312 }
313
314 if (unshare(CLONE_NEWNS) < 0)
315 exit(1);
316
2c6f3fc9
SH
317 if (detect_shared_rootfs()) {
318 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
319 SYSERROR("Failed to make / rslave");
320 ERROR("Continuing...");
321 }
322 }
323
a17b1e65 324 ret = mount_unknown_fs(srcdev, bdev->dest, bdev->mntopts);
9be53773 325 if (ret < 0) {
eddaaafd 326 ERROR("failed mounting %s onto %s to detect fstype", srcdev, bdev->dest);
9be53773
SH
327 exit(1);
328 }
329 // if symlink, get the real dev name
330 char devpath[MAXPATHLEN];
eddaaafd 331 char *l = linkderef(srcdev, devpath);
9be53773
SH
332 if (!l)
333 exit(1);
334 f = fopen("/proc/self/mounts", "r");
335 if (!f)
336 exit(1);
337 while (getline(&line, &linelen, f) != -1) {
46cd2845 338 sp1 = strchr(line, ' ');
9be53773
SH
339 if (!sp1)
340 exit(1);
341 *sp1 = '\0';
342 if (strcmp(line, l))
343 continue;
46cd2845 344 sp2 = strchr(sp1+1, ' ');
9be53773
SH
345 if (!sp2)
346 exit(1);
347 *sp2 = '\0';
46cd2845 348 sp3 = strchr(sp2+1, ' ');
9be53773
SH
349 if (!sp3)
350 exit(1);
351 *sp3 = '\0';
352 sp2++;
353 if (write(p[1], sp2, strlen(sp2)) != strlen(sp2))
354 exit(1);
355 exit(0);
356 }
357 exit(1);
358}
359
/* Associates a backing-store name with its operations table. */
struct bdev_type {
	const char *name;		/* presumably the type string, e.g. "dir" - TODO confirm */
	const struct bdev_ops *ops;	/* callbacks implementing that type */
};
364
9be53773
SH
/*
 * Return 1 if @path should be handled by the directory backend: either it
 * carries an explicit "dir:" prefix (trusted without further checks) or
 * is_dir() accepts it. Returns 0 otherwise.
 */
static int dir_detect(const char *path)
{
	int has_prefix = strncmp(path, "dir:", 4) == 0;

	return (has_prefix || is_dir(path)) ? 1 : 0;
}
373
374//
375// XXXXXXX plain directory bind mount ops
376//
60bf62d4 377static int dir_mount(struct bdev *bdev)
9be53773 378{
a17b1e65
SG
379 unsigned long mntflags;
380 char *mntdata;
381 int ret;
382
9be53773
SH
383 if (strcmp(bdev->type, "dir"))
384 return -22;
385 if (!bdev->src || !bdev->dest)
386 return -22;
a17b1e65
SG
387
388 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
389 free(mntdata);
390 return -22;
391 }
392
393 ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
394 free(mntdata);
395 return ret;
9be53773
SH
396}
397
60bf62d4 398static int dir_umount(struct bdev *bdev)
9be53773
SH
399{
400 if (strcmp(bdev->type, "dir"))
401 return -22;
402 if (!bdev->src || !bdev->dest)
403 return -22;
404 return umount(bdev->dest);
405}
406
/*
 * Build the path a clone should use, derived from @src.
 * (the bulk of this needs to become a common helper)
 *
 * If @src starts with @oldpath, that prefix is replaced by @lxcpath. In the
 * remainder, every non-overlapping occurrence of @oldname is replaced by
 * @name. An empty @oldname replaces nothing.
 *
 * Returns a newly malloc()ed string the caller must free, or NULL when out
 * of memory. @src itself is not modified.
 */
static char *dir_new_path(char *src, const char *oldname, const char *name,
		const char *oldpath, const char *lxcpath)
{
	const size_t oldpath_len = strlen(oldpath);
	const size_t oldname_len = strlen(oldname);
	const size_t name_len = strlen(name);
	const size_t lxcpath_len = strlen(lxcpath);
	const int swap_prefix = strncmp(src, oldpath, oldpath_len) == 0;
	/* only look for oldname after the oldpath prefix, if present */
	const char *tail = swap_prefix ? src + oldpath_len : src;
	const char *hit;
	size_t hits = 0, outlen;
	char *ret, *out;

	/* an empty oldname would match forever without advancing */
	if (oldname_len > 0) {
		for (hit = strstr(tail, oldname); hit;
		     hit = strstr(hit + oldname_len, oldname))
			hits++;
	}

	/* matches do not overlap, so hits * oldname_len <= strlen(tail) */
	outlen = strlen(tail) - hits * oldname_len + hits * name_len + 1;
	if (swap_prefix)
		outlen += lxcpath_len;

	ret = malloc(outlen);
	if (!ret)
		return NULL;

	out = ret;
	if (swap_prefix) {
		memcpy(out, lxcpath, lxcpath_len);
		out += lxcpath_len;
	}

	if (oldname_len > 0) {
		while ((hit = strstr(tail, oldname)) != NULL) {
			size_t chunk = (size_t)(hit - tail);

			memcpy(out, tail, chunk);	/* text up to oldname */
			out += chunk;
			memcpy(out, name, name_len);	/* new name in its place */
			out += name_len;
			tail = hit + oldname_len;	/* continue after oldname */
		}
	}
	strcpy(out, tail);	/* rest of src, including the NUL */
	return ret;
}
448
449/*
450 * for a simple directory bind mount, we substitute the old container
451 * name and paths for the new
452 */
453static int dir_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
454 const char *cname, const char *oldpath, const char *lxcpath, int snap,
25190e5b 455 uint64_t newsize, struct lxc_conf *conf)
9be53773 456{
ca52dcb5
SH
457 int len, ret;
458
9be53773 459 if (snap) {
1f92162d 460 ERROR("directories cannot be snapshotted. Try aufs or overlayfs.");
9be53773
SH
461 return -1;
462 }
463
9be53773
SH
464 if (!orig->dest || !orig->src)
465 return -1;
9be53773 466
ca52dcb5
SH
467 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
468 new->src = malloc(len);
9be53773
SH
469 if (!new->src)
470 return -1;
ca52dcb5
SH
471 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
472 if (ret < 0 || ret >= len)
473 return -1;
474 if ((new->dest = strdup(new->src)) == NULL)
475 return -1;
9be53773
SH
476
477 return 0;
478}
479
60bf62d4
SH
480static int dir_destroy(struct bdev *orig)
481{
18aa217b 482 if (lxc_rmdir_onedev(orig->src, NULL) < 0)
60bf62d4
SH
483 return -1;
484 return 0;
485}
486
1897e3bc
SH
487static int dir_create(struct bdev *bdev, const char *dest, const char *n,
488 struct bdev_specs *specs)
489{
5292adfd 490 if (specs && specs->dir)
7bb87886
SH
491 bdev->src = strdup(specs->dir);
492 else
493 bdev->src = strdup(dest);
1897e3bc
SH
494 bdev->dest = strdup(dest);
495 if (!bdev->src || !bdev->dest) {
496 ERROR("Out of memory");
497 return -1;
498 }
499
500 if (mkdir_p(bdev->src, 0755) < 0) {
959aee9c 501 ERROR("Error creating %s", bdev->src);
1897e3bc
SH
502 return -1;
503 }
504 if (mkdir_p(bdev->dest, 0755) < 0) {
959aee9c 505 ERROR("Error creating %s", bdev->dest);
1897e3bc
SH
506 return -1;
507 }
508
509 return 0;
510}
511
74a3920a 512static const struct bdev_ops dir_ops = {
9be53773
SH
513 .detect = &dir_detect,
514 .mount = &dir_mount,
515 .umount = &dir_umount,
516 .clone_paths = &dir_clonepaths,
60bf62d4 517 .destroy = &dir_destroy,
1897e3bc 518 .create = &dir_create,
0a83cbbb 519 .can_snapshot = false,
cdd01be2 520 .can_backup = true,
9be53773
SH
521};
522
3baa76fe
SH
523
524//
525// XXXXXXX zfs ops
526// There are two ways we could do this. We could always specify the
527// 'zfs device' (i.e. tank/lxc lxc/container) as rootfs. But instead
528// (at least right now) we have lxc-create specify $lxcpath/$lxcname/rootfs
529// as the mountpoint, so that it is always mounted.
530//
531// That means 'mount' is really never needed and could be noop, but for the
532// sake of flexibility let's always bind-mount.
533//
534
60bf62d4 535static int zfs_list_entry(const char *path, char *output, size_t inlen)
3baa76fe 536{
ebec9176 537 struct lxc_popen_FILE *f;
3baa76fe
SH
538 int found=0;
539
ebec9176 540 f = lxc_popen("zfs list 2> /dev/null");
025ed0f3 541 if (f == NULL) {
3baa76fe
SH
542 SYSERROR("popen failed");
543 return 0;
544 }
ebec9176 545 while (fgets(output, inlen, f->f)) {
3baa76fe
SH
546 if (strstr(output, path)) {
547 found = 1;
548 break;
549 }
550 }
ebec9176 551 (void) lxc_pclose(f);
3baa76fe
SH
552
553 return found;
554}
555
556static int zfs_detect(const char *path)
557{
558 char *output = malloc(LXC_LOG_BUFFER_SIZE);
559 int found;
560
561 if (!output) {
562 ERROR("out of memory");
563 return 0;
564 }
60bf62d4 565 found = zfs_list_entry(path, output, LXC_LOG_BUFFER_SIZE);
3baa76fe
SH
566 free(output);
567 return found;
568}
569
60bf62d4 570static int zfs_mount(struct bdev *bdev)
3baa76fe 571{
a17b1e65
SG
572 unsigned long mntflags;
573 char *mntdata;
574 int ret;
575
3baa76fe
SH
576 if (strcmp(bdev->type, "zfs"))
577 return -22;
578 if (!bdev->src || !bdev->dest)
579 return -22;
a17b1e65
SG
580
581 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
582 free(mntdata);
583 return -22;
584 }
585
586 ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
587 free(mntdata);
588 return ret;
3baa76fe
SH
589}
590
60bf62d4 591static int zfs_umount(struct bdev *bdev)
3baa76fe
SH
592{
593 if (strcmp(bdev->type, "zfs"))
594 return -22;
595 if (!bdev->src || !bdev->dest)
596 return -22;
597 return umount(bdev->dest);
598}
599
/*
 * Create the zfs dataset for a cloned container.
 *
 * The zfs root is taken from the "zfs list" line matching @opath (first
 * field with its last '/' component removed), or from the
 * "lxc.bdev.zfs.root" global setting when no line matches.
 *
 * Without @snapshot:  zfs create -omountpoint=... <zfsroot>/<nname>
 * With @snapshot:     zfs destroy  <zfsroot>/<oname>@<nname>   (best effort)
 *                     zfs snapshot <zfsroot>/<oname>@<nname>
 *                     zfs clone -omountpoint=... <snapshot> <zfsroot>/<nname>
 *
 * The mountpoint is "<lxcpath>/<nname>/rootfs". @npath is unused.
 * Returns the wait_for_pid() result of the last command, or -1 on error.
 */
static int zfs_clone(const char *opath, const char *npath, const char *oname,
			const char *nname, const char *lxcpath, int snapshot)
{
	// use the 'zfs list | grep opath' entry to get the zfsroot
	char output[MAXPATHLEN], option[MAXPATHLEN], *p;
	const char *zfsroot = output;
	int ret;
	pid_t pid;

	if (zfs_list_entry(opath, output, MAXPATHLEN)) {
		// zfsroot is output up to ' '
		if ((p = strchr(output, ' ')) == NULL)
			return -1;
		*p = '\0';
		if ((p = strrchr(output, '/')) == NULL)
			return -1;
		*p = '\0';
	} else
		zfsroot = lxc_global_config_value("lxc.bdev.zfs.root");

	/* never hand a NULL root to the "%s" conversions below */
	if (!zfsroot) {
		ERROR("could not determine the zfs root for %s", opath);
		return -1;
	}

	ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s/%s/rootfs",
			lxcpath, nname);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	// zfs create -omountpoint=$lxcpath/$lxcname $zfsroot/$nname
	if (!snapshot) {
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			char dev[MAXPATHLEN];

			ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, nname);
			if (ret < 0 || ret >= MAXPATHLEN)
				exit(1);
			execlp("zfs", "zfs", "create", option, dev, (char *)NULL);
			exit(1);
		}
		return wait_for_pid(pid);
	} else {
		// if snapshot, do
		// 'zfs snapshot zfsroot/oname@nname
		// zfs clone zfsroot/oname@nname zfsroot/nname
		char path1[MAXPATHLEN], path2[MAXPATHLEN];

		ret = snprintf(path1, MAXPATHLEN, "%s/%s@%s", zfsroot,
				oname, nname);
		if (ret < 0 || ret >= MAXPATHLEN)
			return -1;
		/* a truncated clone target would name the wrong dataset */
		ret = snprintf(path2, MAXPATHLEN, "%s/%s", zfsroot, nname);
		if (ret < 0 || ret >= MAXPATHLEN)
			return -1;

		// if the snapshot exists, delete it
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "destroy", path1, (char *)NULL);
			exit(1);
		}
		// it probably doesn't exist so destroy probably will fail.
		(void) wait_for_pid(pid);

		// run first (snapshot) command
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "snapshot", path1, (char *)NULL);
			exit(1);
		}
		if (wait_for_pid(pid) < 0)
			return -1;

		// run second (clone) command
		if ((pid = fork()) < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "clone", option, path1, path2, (char *)NULL);
			exit(1);
		}
		return wait_for_pid(pid);
	}
}
681
682static int zfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
683 const char *cname, const char *oldpath, const char *lxcpath, int snap,
25190e5b 684 uint64_t newsize, struct lxc_conf *conf)
3baa76fe 685{
ca52dcb5
SH
686 int len, ret;
687
3baa76fe
SH
688 if (!orig->src || !orig->dest)
689 return -1;
690
ca52dcb5
SH
691 if (snap && strcmp(orig->type, "zfs")) {
692 ERROR("zfs snapshot from %s backing store is not supported",
3baa76fe
SH
693 orig->type);
694 return -1;
695 }
696
ca52dcb5
SH
697 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
698 new->src = malloc(len);
3baa76fe
SH
699 if (!new->src)
700 return -1;
ca52dcb5
SH
701 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
702 if (ret < 0 || ret >= len)
703 return -1;
704 if ((new->dest = strdup(new->src)) == NULL)
705 return -1;
3baa76fe
SH
706
707 return zfs_clone(orig->src, new->src, oldname, cname, lxcpath, snap);
708}
709
60bf62d4
SH
710/*
711 * TODO: detect whether this was a clone, and if so then also delete the
712 * snapshot it was based on, so that we don't hold the original
713 * container busy.
714 */
715static int zfs_destroy(struct bdev *orig)
716{
717 pid_t pid;
718 char output[MAXPATHLEN], *p;
719
720 if ((pid = fork()) < 0)
721 return -1;
722 if (pid)
723 return wait_for_pid(pid);
724
725 if (!zfs_list_entry(orig->src, output, MAXPATHLEN)) {
726 ERROR("Error: zfs entry for %s not found", orig->src);
727 return -1;
728 }
729
730 // zfs mount is output up to ' '
46cd2845 731 if ((p = strchr(output, ' ')) == NULL)
60bf62d4
SH
732 return -1;
733 *p = '\0';
734
735 execlp("zfs", "zfs", "destroy", output, NULL);
736 exit(1);
737}
738
1897e3bc
SH
739static int zfs_create(struct bdev *bdev, const char *dest, const char *n,
740 struct bdev_specs *specs)
741{
742 const char *zfsroot;
743 char option[MAXPATHLEN];
744 int ret;
745 pid_t pid;
746
72e99249 747 if (!specs || !specs->zfs.zfsroot)
2e59ba02 748 zfsroot = lxc_global_config_value("lxc.bdev.zfs.root");
1897e3bc 749 else
72e99249 750 zfsroot = specs->zfs.zfsroot;
1897e3bc
SH
751
752 if (!(bdev->dest = strdup(dest))) {
753 ERROR("No mount target specified or out of memory");
754 return -1;
755 }
756 if (!(bdev->src = strdup(bdev->dest))) {
757 ERROR("out of memory");
758 return -1;
759 }
760
761 ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s", bdev->dest);
762 if (ret < 0 || ret >= MAXPATHLEN)
763 return -1;
764 if ((pid = fork()) < 0)
765 return -1;
766 if (pid)
767 return wait_for_pid(pid);
768
769 char dev[MAXPATHLEN];
770 ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, n);
771 if (ret < 0 || ret >= MAXPATHLEN)
772 exit(1);
773 execlp("zfs", "zfs", "create", option, dev, NULL);
774 exit(1);
775}
776
74a3920a 777static const struct bdev_ops zfs_ops = {
3baa76fe
SH
778 .detect = &zfs_detect,
779 .mount = &zfs_mount,
780 .umount = &zfs_umount,
781 .clone_paths = &zfs_clonepaths,
60bf62d4 782 .destroy = &zfs_destroy,
1897e3bc 783 .create = &zfs_create,
0a83cbbb 784 .can_snapshot = true,
cdd01be2 785 .can_backup = true,
3baa76fe
SH
786};
787
9be53773
SH
788//
789// LVM ops
790//
791
/*
 * Decide whether @path is an LVM logical volume.
 *
 * A "lvm:" prefix is taken at face value. Otherwise @path must be a block
 * device whose /sys/dev/block/<maj>:<min>/dm/uuid starts with the
 * hardcoded lvm2 prefix "LVM-".
 *
 * Returns 1 for an LV, 0 otherwise (including on any error).
 */
static int lvm_detect(const char *path)
{
	char uuidpath[MAXPATHLEN], prefix[4];
	struct stat st;
	FILE *uuidf;
	int n;

	if (strncmp(path, "lvm:", 4) == 0)
		return 1; // take their word for it

	if (stat(path, &st) != 0 || !S_ISBLK(st.st_mode))
		return 0;

	n = snprintf(uuidpath, MAXPATHLEN, "/sys/dev/block/%d:%d/dm/uuid",
			major(st.st_rdev), minor(st.st_rdev));
	if (n < 0 || n >= MAXPATHLEN) {
		ERROR("lvm uuid pathname too long");
		return 0;
	}

	uuidf = fopen(uuidpath, "r");
	if (uuidf == NULL)
		return 0;
	n = fread(prefix, 1, 4, uuidf);
	fclose(uuidf);

	/* prefix is not NUL-terminated, hence the bounded compare */
	if (n == 4 && strncmp(prefix, "LVM-", 4) == 0)
		return 1;
	return 0;
}
827
828static int lvm_mount(struct bdev *bdev)
829{
830 if (strcmp(bdev->type, "lvm"))
831 return -22;
832 if (!bdev->src || !bdev->dest)
833 return -22;
834 /* if we might pass in data sometime, then we'll have to enrich
8ddf877b 835 * mount_unknown_fs */
a17b1e65 836 return mount_unknown_fs(bdev->src, bdev->dest, bdev->mntopts);
9be53773
SH
837}
838
839static int lvm_umount(struct bdev *bdev)
840{
841 if (strcmp(bdev->type, "lvm"))
842 return -22;
843 if (!bdev->src || !bdev->dest)
844 return -22;
845 return umount(bdev->dest);
846}
847
055af165 848static int lvm_compare_lv_attr(const char *path, int pos, const char expected) {
ebec9176 849 struct lxc_popen_FILE *f;
8aba14bb 850 int ret, len, status, start=0;
f99c386b
SS
851 char *cmd, output[12];
852 const char *lvscmd = "lvs --unbuffered --noheadings -o lv_attr %s 2>/dev/null";
853
854 len = strlen(lvscmd) + strlen(path) - 1;
55a204f9 855 cmd = alloca(len);
f99c386b
SS
856
857 ret = snprintf(cmd, len, lvscmd, path);
858 if (ret < 0 || ret >= len)
859 return -1;
860
ebec9176 861 f = lxc_popen(cmd);
f99c386b
SS
862
863 if (f == NULL) {
864 SYSERROR("popen failed");
865 return -1;
866 }
867
ebec9176 868 ret = fgets(output, 12, f->f) == NULL;
f99c386b 869
ebec9176 870 status = lxc_pclose(f);
f99c386b 871
8aba14bb
SS
872 if (ret || WEXITSTATUS(status))
873 // Assume either vg or lvs do not exist, default
874 // comparison to false.
875 return 0;
f99c386b
SS
876
877 len = strlen(output);
878 while(start < len && output[start] == ' ') start++;
879
055af165 880 if (start + pos < len && output[start + pos] == expected)
f99c386b
SS
881 return 1;
882
883 return 0;
884}
885
055af165
SS
/*
 * Check whether the lv_attr string of @path has 't' at index 6 (presumably
 * the field that marks a thin volume per lvs(8) - confirm).
 * Returns the lvm_compare_lv_attr() result: 1, 0 or -1.
 */
static int lvm_is_thin_volume(const char *path)
{
	enum { ATTR_INDEX = 6 };

	return lvm_compare_lv_attr(path, ATTR_INDEX, 't');
}
890
/*
 * Check whether the lv_attr string of @path has 't' at index 0 (presumably
 * the volume-type field marking a thin pool per lvs(8) - confirm).
 * Returns the lvm_compare_lv_attr() result: 1, 0 or -1.
 */
static int lvm_is_thin_pool(const char *path)
{
	enum { ATTR_INDEX = 0 };

	return lvm_compare_lv_attr(path, ATTR_INDEX, 't');
}
895
/*
 * path must be '/dev/$vg/$lv', $vg must be an existing VG, and $lv must not
 * yet exist. This function will attempt to create /dev/$vg/$lv of size
 * $size (in bytes). If thinpool is specified, we'll check for its existence
 * and whether it is a valid thin pool, and if so, we'll create the requested
 * lv from that thin pool; otherwise a regular lv is created.
 *
 * Returns -1 if fork() fails, else the wait_for_pid() result for the child
 * that runs lvcreate.
 */
static int do_lvm_create(const char *path, uint64_t size, const char *thinpool)
{
	int rc, child, poollen;
	char sizearg[24], *devdir, *vgname, *lvname, *pool = NULL;

	child = fork();
	if (child < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (child > 0)
		return wait_for_pid(child);

	/* child from here on: exits, never returns */

	// specify bytes to lvcreate
	rc = snprintf(sizearg, 24, "%"PRIu64"b", size);
	if (rc < 0 || rc >= 24)
		exit(1);

	devdir = strdup(path);
	if (!devdir)
		exit(1);

	/* split "/dev/$vg/$lv": devdir becomes "/dev/$vg" */
	lvname = strrchr(devdir, '/');
	if (!lvname)
		exit(1);
	*lvname++ = '\0';

	vgname = strrchr(devdir, '/');
	if (!vgname)
		exit(1);
	vgname++;

	if (thinpool) {
		poollen = strlen(devdir) + strlen(thinpool) + 2;
		pool = alloca(poollen);

		rc = snprintf(pool, poollen, "%s/%s", devdir, thinpool);
		if (rc < 0 || rc >= poollen)
			exit(1);

		rc = lvm_is_thin_pool(pool);
		INFO("got %d for thin pool at path: %s", rc, pool);
		if (rc < 0)
			exit(1);

		/* not a thin pool: fall back to a regular lv */
		if (rc == 0)
			pool = NULL;
	}

	if (pool)
		execlp("lvcreate", "lvcreate", "--thinpool", pool, "-V", sizearg, vgname, "-n", lvname, (char *)NULL);
	else
		execlp("lvcreate", "lvcreate", "-L", sizearg, vgname, "-n", lvname, (char *)NULL);

	SYSERROR("execlp");
	exit(1);
}
961
/*
 * Create an LVM snapshot of @orig named after the last component of @path,
 * by running lvcreate -s in a forked child.
 *
 * @size (bytes) is passed with -L unless @orig is a thin volume, in which
 * case no size is given.
 *
 * Returns -1 if fork() fails, otherwise the wait_for_pid() result for the
 * child; the child exits with status 1 on any failure.
 */
static int lvm_snapshot(const char *orig, const char *path, uint64_t size)
{
	int ret, pid;
	char sz[24], *pathdup, *lv;

	if ((pid = fork()) < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	/*
	 * Child from here on: it must exit, never return, or it would keep
	 * running the caller's code as a second process.
	 */

	// specify bytes to lvcreate
	ret = snprintf(sz, 24, "%"PRIu64"b", size);
	if (ret < 0 || ret >= 24)
		exit(1);

	pathdup = strdup(path);
	if (!pathdup)
		exit(1);
	lv = strrchr(pathdup, '/');
	if (!lv) {
		free(pathdup);
		exit(1);
	}
	*lv = '\0';
	lv++;

	// check if the original lv is backed by a thin pool, in which case we
	// cannot specify a size that's different from the original size.
	ret = lvm_is_thin_volume(orig);
	if (ret == -1) {
		free(pathdup);
		exit(1);
	}

	if (!ret)
		execlp("lvcreate", "lvcreate", "-s", "-L", sz, "-n", lv, orig, (char *)NULL);
	else
		execlp("lvcreate", "lvcreate", "-s", "-n", lv, orig, (char *)NULL);

	/* only reached when the exec failed */
	free(pathdup);
	exit(1);
}
1007
1008// this will return 1 for physical disks, qemu-nbd, loop, etc
1009// right now only lvm is a block device
1010static int is_blktype(struct bdev *b)
1011{
1012 if (strcmp(b->type, "lvm") == 0)
1013 return 1;
1014 return 0;
9be53773
SH
1015}
1016
1017static int lvm_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1018 const char *cname, const char *oldpath, const char *lxcpath, int snap,
25190e5b 1019 uint64_t newsize, struct lxc_conf *conf)
9be53773
SH
1020{
1021 char fstype[100];
d659597e 1022 uint64_t size = newsize;
ca52dcb5 1023 int len, ret;
9be53773
SH
1024
1025 if (!orig->src || !orig->dest)
1026 return -1;
1027
1028 if (strcmp(orig->type, "lvm")) {
31a95fec
SH
1029 const char *vg;
1030
ca52dcb5
SH
1031 if (snap) {
1032 ERROR("LVM snapshot from %s backing store is not supported",
1033 orig->type);
1034 return -1;
1035 }
2e59ba02 1036 vg = lxc_global_config_value("lxc.bdev.lvm.vg");
31a95fec 1037 len = strlen("/dev/") + strlen(vg) + strlen(cname) + 2;
ca52dcb5
SH
1038 if ((new->src = malloc(len)) == NULL)
1039 return -1;
31a95fec 1040 ret = snprintf(new->src, len, "/dev/%s/%s", vg, cname);
ca52dcb5
SH
1041 if (ret < 0 || ret >= len)
1042 return -1;
1043 } else {
1044 new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath);
1045 if (!new->src)
1046 return -1;
9be53773
SH
1047 }
1048
a17b1e65
SG
1049 if (orig->mntopts) {
1050 new->mntopts = strdup(orig->mntopts);
1051 if (!new->mntopts)
9be53773
SH
1052 return -1;
1053 }
ca52dcb5
SH
1054
1055 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1056 new->dest = malloc(len);
9be53773
SH
1057 if (!new->dest)
1058 return -1;
ca52dcb5
SH
1059 ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
1060 if (ret < 0 || ret >= len)
9be53773 1061 return -1;
ca52dcb5 1062 if (mkdir_p(new->dest, 0755) < 0)
9be53773
SH
1063 return -1;
1064
ca52dcb5 1065 if (is_blktype(orig)) {
eddaaafd 1066 if (!newsize && blk_getsize(orig, &size) < 0) {
ca52dcb5
SH
1067 ERROR("Error getting size of %s", orig->src);
1068 return -1;
1069 }
1070 if (detect_fs(orig, fstype, 100) < 0) {
1071 INFO("could not find fstype for %s, using ext3", orig->src);
1072 return -1;
1073 }
1074 } else {
1075 sprintf(fstype, "ext3");
1076 if (!newsize)
d659597e 1077 size = DEFAULT_FS_SIZE;
9be53773 1078 }
ca52dcb5 1079
9be53773
SH
1080 if (snap) {
1081 if (lvm_snapshot(orig->src, new->src, size) < 0) {
1082 ERROR("could not create %s snapshot of %s", new->src, orig->src);
1083 return -1;
1084 }
1085 } else {
2e59ba02 1086 if (do_lvm_create(new->src, size, lxc_global_config_value("lxc.bdev.lvm.thin_pool")) < 0) {
9be53773
SH
1087 ERROR("Error creating new lvm blockdev");
1088 return -1;
1089 }
9be53773
SH
1090 if (do_mkfs(new->src, fstype) < 0) {
1091 ERROR("Error creating filesystem type %s on %s", fstype,
1092 new->src);
1093 return -1;
1094 }
1095 }
1096
1097 return 0;
1098}
1099
60bf62d4
SH
1100static int lvm_destroy(struct bdev *orig)
1101{
1102 pid_t pid;
1103
1104 if ((pid = fork()) < 0)
1105 return -1;
1106 if (!pid) {
1107 execlp("lvremove", "lvremove", "-f", orig->src, NULL);
1108 exit(1);
1109 }
1110 return wait_for_pid(pid);
1111}
1112
1897e3bc
SH
1113static int lvm_create(struct bdev *bdev, const char *dest, const char *n,
1114 struct bdev_specs *specs)
1115{
f99c386b 1116 const char *vg, *thinpool, *fstype, *lv = n;
d659597e 1117 uint64_t sz;
1897e3bc
SH
1118 int ret, len;
1119
1120 if (!specs)
1121 return -1;
1122
72e99249 1123 vg = specs->lvm.vg;
1897e3bc 1124 if (!vg)
2e59ba02 1125 vg = lxc_global_config_value("lxc.bdev.lvm.vg");
1897e3bc 1126
72e99249 1127 thinpool = specs->lvm.thinpool;
055af165 1128 if (!thinpool)
2e59ba02 1129 thinpool = lxc_global_config_value("lxc.bdev.lvm.thin_pool");
f99c386b 1130
1897e3bc 1131 /* /dev/$vg/$lv */
72e99249
SS
1132 if (specs->lvm.lv)
1133 lv = specs->lvm.lv;
1134
1897e3bc
SH
1135 len = strlen(vg) + strlen(lv) + 7;
1136 bdev->src = malloc(len);
1137 if (!bdev->src)
1138 return -1;
1139
1140 ret = snprintf(bdev->src, len, "/dev/%s/%s", vg, lv);
1141 if (ret < 0 || ret >= len)
1142 return -1;
1143
72e99249
SS
1144 // fssize is in bytes.
1145 sz = specs->fssize;
1897e3bc 1146 if (!sz)
eddaaafd 1147 sz = DEFAULT_FS_SIZE;
1897e3bc 1148
f99c386b 1149 if (do_lvm_create(bdev->src, sz, thinpool) < 0) {
d659597e 1150 ERROR("Error creating new lvm blockdev %s size %"PRIu64" bytes", bdev->src, sz);
1897e3bc
SH
1151 return -1;
1152 }
1153
72e99249 1154 fstype = specs->fstype;
1897e3bc 1155 if (!fstype)
eddaaafd 1156 fstype = DEFAULT_FSTYPE;
1897e3bc
SH
1157 if (do_mkfs(bdev->src, fstype) < 0) {
1158 ERROR("Error creating filesystem type %s on %s", fstype,
1159 bdev->src);
1160 return -1;
1161 }
1162 if (!(bdev->dest = strdup(dest)))
1163 return -1;
1164
1165 if (mkdir_p(bdev->dest, 0755) < 0) {
959aee9c 1166 ERROR("Error creating %s", bdev->dest);
1897e3bc
SH
1167 return -1;
1168 }
1169
1170 return 0;
1171}
1172
74a3920a 1173static const struct bdev_ops lvm_ops = {
9be53773
SH
1174 .detect = &lvm_detect,
1175 .mount = &lvm_mount,
1176 .umount = &lvm_umount,
1177 .clone_paths = &lvm_clonepaths,
60bf62d4 1178 .destroy = &lvm_destroy,
1897e3bc 1179 .create = &lvm_create,
0a83cbbb 1180 .can_snapshot = true,
cdd01be2 1181 .can_backup = false,
9be53773
SH
1182};
1183
ff462013
SH
1184/*
1185 * Return the full path of objid under dirid. Let's say dirid is
1186 * /lxc/c1/rootfs, and objid is /lxc/c1/rootfs/a/b/c. Then we will
1187 * return a/b/c. If instead objid is for /lxc/c1/rootfs/a, we will
1188 * simply return a.
1189 */
1190char *get_btrfs_subvol_path(int fd, u64 dir_id, u64 objid,
1191 char *name, int name_len)
1192{
1193 struct btrfs_ioctl_ino_lookup_args args;
1194 int ret, e;
1195 size_t len;
1196 char *retpath;
1197
1198 memset(&args, 0, sizeof(args));
1199 args.treeid = dir_id;
1200 args.objectid = objid;
1201
1202 ret = ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args);
1203 e = errno;
1204 if (ret) {
1205 ERROR("%s: ERROR: Failed to lookup path for %llu %llu %s - %s\n",
1206 __func__, (unsigned long long) dir_id,
1207 (unsigned long long) objid,
1208 name, strerror(e));
1209 return NULL;
1210 } else
1211 INFO("%s: got path for %llu %llu - %s\n", __func__,
1212 (unsigned long long) objid, (unsigned long long) dir_id,
1213 name);
1214
1215 if (args.name[0]) {
1216 /*
1217 * we're in a subdirectory of ref_tree, the kernel ioctl
1218 * puts a / in there for us
1219 */
1220 len = strlen(args.name) + name_len + 2;
1221 retpath = malloc(len);
1222 if (!retpath)
1223 return NULL;
1224 strcpy(retpath, args.name);
1225 strcat(retpath, "/");
1226 strncat(retpath, name, name_len);
1227 } else {
1228 /* we're at the root of ref_tree */
1229 len = name_len + 1;
1230 retpath = malloc(len);
1231 if (!retpath)
1232 return NULL;
1233 *retpath = '\0';
1234 strncat(retpath, name, name_len);
1235 }
1236 return retpath;
1237}
1238
9be53773
SH
1239//
1240// btrfs ops
1241//
1242
ff462013
SH
1243int btrfs_list_get_path_rootid(int fd, u64 *treeid)
1244{
1245 int ret;
1246 struct btrfs_ioctl_ino_lookup_args args;
9be53773 1247
ff462013
SH
1248 memset(&args, 0, sizeof(args));
1249 args.objectid = BTRFS_FIRST_FREE_OBJECTID;
9be53773 1250
ff462013
SH
1251 ret = ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args);
1252 if (ret < 0) {
1253 WARN("Warning: can't perform the search -%s\n",
1254 strerror(errno));
1255 return ret;
1256 }
1257 *treeid = args.treeid;
1258 return 0;
1259}
9be53773 1260
4295c5de 1261bool is_btrfs_fs(const char *path)
9be53773 1262{
9be53773
SH
1263 int fd, ret;
1264 struct btrfs_ioctl_space_args sargs;
1265
1266 // make sure this is a btrfs filesystem
1267 fd = open(path, O_RDONLY);
1268 if (fd < 0)
1897e3bc 1269 return false;
9be53773
SH
1270 sargs.space_slots = 0;
1271 sargs.total_spaces = 0;
1272 ret = ioctl(fd, BTRFS_IOC_SPACE_INFO, &sargs);
1273 close(fd);
1274 if (ret < 0)
1897e3bc
SH
1275 return false;
1276
1277 return true;
1278}
1279
/*
 * Detect whether @path is a btrfs subvolume: it must be on a btrfs
 * filesystem and be a directory with inode number 256 (the test the
 * original code uses for "is a subvolume").
 *
 * Returns 1 when it matches, 0 otherwise (including on stat failure).
 */
static int btrfs_detect(const char *path)
{
	struct stat sb;

	if (!is_btrfs_fs(path))
		return 0;

	if (stat(path, &sb) < 0)
		return 0;

	return (sb.st_ino == 256 && S_ISDIR(sb.st_mode)) ? 1 : 0;
}
1298
60bf62d4 1299static int btrfs_mount(struct bdev *bdev)
9be53773 1300{
a17b1e65
SG
1301 unsigned long mntflags;
1302 char *mntdata;
1303 int ret;
1304
9be53773
SH
1305 if (strcmp(bdev->type, "btrfs"))
1306 return -22;
1307 if (!bdev->src || !bdev->dest)
1308 return -22;
a17b1e65
SG
1309
1310 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
1311 free(mntdata);
1312 return -22;
1313 }
1314
1315 ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata);
1316 free(mntdata);
1317 return ret;
9be53773
SH
1318}
1319
60bf62d4 1320static int btrfs_umount(struct bdev *bdev)
9be53773
SH
1321{
1322 if (strcmp(bdev->type, "btrfs"))
1323 return -22;
1324 if (!bdev->src || !bdev->dest)
1325 return -22;
1326 return umount(bdev->dest);
1327}
1328
9be53773
SH
1329static int btrfs_subvolume_create(const char *path)
1330{
1331 int ret, fd = -1;
1332 struct btrfs_ioctl_vol_args args;
1333 char *p, *newfull = strdup(path);
1334
1335 if (!newfull) {
1336 ERROR("Error: out of memory");
1337 return -1;
1338 }
1339
c32981c3 1340 p = strrchr(newfull, '/');
9be53773
SH
1341 if (!p) {
1342 ERROR("bad path: %s", path);
9529609a 1343 free(newfull);
9be53773
SH
1344 return -1;
1345 }
1346 *p = '\0';
1347
025ed0f3 1348 fd = open(newfull, O_RDONLY);
025ed0f3 1349 if (fd < 0) {
9be53773
SH
1350 ERROR("Error opening %s", newfull);
1351 free(newfull);
1352 return -1;
1353 }
1354
1355 memset(&args, 0, sizeof(args));
1356 strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1357 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1358 ret = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, &args);
1359 INFO("btrfs: snapshot create ioctl returned %d", ret);
9be53773
SH
1360
1361 free(newfull);
1362 close(fd);
1363 return ret;
1364}
1365
65db0e5a
ÇO
1366static int btrfs_same_fs(const char *orig, const char *new) {
1367 int fd_orig = -1, fd_new = -1, ret = -1;
1368 struct btrfs_ioctl_fs_info_args orig_args, new_args;
1369
1370 fd_orig = open(orig, O_RDONLY);
1371 if (fd_orig < 0) {
1372 SYSERROR("Error opening original rootfs %s", orig);
1373 goto out;
1374 }
1375 ret = ioctl(fd_orig, BTRFS_IOC_FS_INFO, &orig_args);
1376 if (ret < 0) {
1377 SYSERROR("BTRFS_IOC_FS_INFO %s", orig);
1378 goto out;
1379 }
1380
1381 fd_new = open(new, O_RDONLY);
1382 if (fd_new < 0) {
1383 SYSERROR("Error opening new container dir %s", new);
e27141fa 1384 ret = -1;
65db0e5a
ÇO
1385 goto out;
1386 }
1387 ret = ioctl(fd_new, BTRFS_IOC_FS_INFO, &new_args);
1388 if (ret < 0) {
1389 SYSERROR("BTRFS_IOC_FS_INFO %s", new);
1390 goto out;
1391 }
1392
1393 if (strncmp(orig_args.fsid, new_args.fsid, BTRFS_FSID_SIZE) != 0) {
1394 ret = -1;
1395 goto out;
1396 }
1397 ret = 0;
1398out:
1399 if (fd_new != -1)
1400 close(fd_new);
1401 if (fd_orig != -1)
1402 close(fd_orig);
1403 return ret;
1404}
1405
9be53773
SH
1406static int btrfs_snapshot(const char *orig, const char *new)
1407{
1408 int fd = -1, fddst = -1, ret = -1;
1409 struct btrfs_ioctl_vol_args_v2 args;
1410 char *newdir, *newname, *newfull = NULL;
1411
1412 newfull = strdup(new);
1413 if (!newfull) {
1414 ERROR("Error: out of memory");
1415 goto out;
1416 }
1417 // make sure the directory doesn't already exist
8479c136 1418 if (rmdir(newfull) < 0 && errno != ENOENT) {
9be53773
SH
1419 SYSERROR("Error removing empty new rootfs");
1420 goto out;
1421 }
1422 newname = basename(newfull);
1423 newdir = dirname(newfull);
1424 fd = open(orig, O_RDONLY);
1425 if (fd < 0) {
1426 SYSERROR("Error opening original rootfs %s", orig);
1427 goto out;
1428 }
dd1d77f9 1429 fddst = open(newdir, O_RDONLY);
9be53773
SH
1430 if (fddst < 0) {
1431 SYSERROR("Error opening new container dir %s", newdir);
1432 goto out;
1433 }
1434
1435 memset(&args, 0, sizeof(args));
1436 args.fd = fd;
1437 strncpy(args.name, newname, BTRFS_SUBVOL_NAME_MAX);
1438 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1439 ret = ioctl(fddst, BTRFS_IOC_SNAP_CREATE_V2, &args);
1440 INFO("btrfs: snapshot create ioctl returned %d", ret);
1441
1442out:
1443 if (fddst != -1)
1444 close(fddst);
1445 if (fd != -1)
1446 close(fd);
f10fad2f 1447 free(newfull);
9be53773
SH
1448 return ret;
1449}
1450
2659c7cb
SH
1451static int btrfs_snapshot_wrapper(void *data)
1452{
1453 struct rsync_data_char *arg = data;
1454 if (setgid(0) < 0) {
1455 ERROR("Failed to setgid to 0");
1456 return -1;
1457 }
1458 if (setgroups(0, NULL) < 0)
1459 WARN("Failed to clear groups");
1460 if (setuid(0) < 0) {
1461 ERROR("Failed to setuid to 0");
1462 return -1;
1463 }
1464 return btrfs_snapshot(arg->src, arg->dest);
1465}
1466
9be53773
SH
1467static int btrfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
1468 const char *cname, const char *oldpath, const char *lxcpath, int snap,
25190e5b 1469 uint64_t newsize, struct lxc_conf *conf)
9be53773
SH
1470{
1471 if (!orig->dest || !orig->src)
1472 return -1;
1473
1474 if (strcmp(orig->type, "btrfs")) {
ca52dcb5
SH
1475 int len, ret;
1476 if (snap) {
1477 ERROR("btrfs snapshot from %s backing store is not supported",
1478 orig->type);
1479 return -1;
1480 }
1481 len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3;
1482 new->src = malloc(len);
1483 if (!new->src)
1484 return -1;
1485 ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname);
1486 if (ret < 0 || ret >= len)
1487 return -1;
1488 } else {
1489 // in case rootfs is in custom path, reuse it
1490 if ((new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath)) == NULL)
1491 return -1;
9be53773 1492
ca52dcb5 1493 }
9be53773 1494
ca52dcb5 1495 if ((new->dest = strdup(new->src)) == NULL)
9be53773
SH
1496 return -1;
1497
a17b1e65 1498 if (orig->mntopts && (new->mntopts = strdup(orig->mntopts)) == NULL)
9be53773
SH
1499 return -1;
1500
2659c7cb
SH
1501 if (snap) {
1502 struct rsync_data_char sdata;
1503 if (!am_unpriv())
1504 return btrfs_snapshot(orig->dest, new->dest);
1505 sdata.dest = new->dest;
1506 sdata.src = orig->dest;
1507 return userns_exec_1(conf, btrfs_snapshot_wrapper, &sdata);
1508 }
9be53773 1509
8479c136 1510 if (rmdir(new->dest) < 0 && errno != ENOENT) {
959aee9c 1511 SYSERROR("removing %s", new->dest);
9be53773
SH
1512 return -1;
1513 }
1514
1515 return btrfs_subvolume_create(new->dest);
1516}
1517
ff462013 1518static int btrfs_do_destroy_subvol(const char *path)
60bf62d4
SH
1519{
1520 int ret, fd = -1;
1521 struct btrfs_ioctl_vol_args args;
60bf62d4
SH
1522 char *p, *newfull = strdup(path);
1523
1524 if (!newfull) {
1525 ERROR("Error: out of memory");
1526 return -1;
1527 }
1528
c32981c3 1529 p = strrchr(newfull, '/');
60bf62d4
SH
1530 if (!p) {
1531 ERROR("bad path: %s", path);
9529609a 1532 free(newfull);
60bf62d4
SH
1533 return -1;
1534 }
1535 *p = '\0';
1536
025ed0f3 1537 fd = open(newfull, O_RDONLY);
025ed0f3 1538 if (fd < 0) {
4295c5de 1539 SYSERROR("Error opening %s", newfull);
60bf62d4
SH
1540 free(newfull);
1541 return -1;
1542 }
1543
1544 memset(&args, 0, sizeof(args));
1545 strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX);
1546 args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0;
1547 ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args);
ff462013 1548 INFO("btrfs: snapshot destroy ioctl returned %d for %s", ret, path);
2659c7cb 1549 if (ret < 0 && errno == EPERM)
cf03f973 1550 ERROR("Is the rootfs mounted with -o user_subvol_rm_allowed?");
60bf62d4
SH
1551
1552 free(newfull);
1553 close(fd);
1554 return ret;
1555}
1556
ff462013
SH
1557struct mytree_node {
1558 u64 objid;
1559 u64 parentid;
1560 char *name;
1561 char *dirname;
1562};
1563
/* Flat, growable list of the subvolumes found so far. */
struct my_btrfs_tree {
	struct mytree_node *nodes;	/* array of @num entries */
	int num;			/* number of valid entries in @nodes */
};
1568
1569static int get_btrfs_tree_idx(struct my_btrfs_tree *tree, u64 id)
1570{
1571 int i;
1572 if (!tree)
1573 return -1;
1574 for (i = 0; i < tree->num; i++) {
1575 if (tree->nodes[i].objid == id)
1576 return i;
1577 }
1578 return -1;
1579}
1580
1581static struct my_btrfs_tree *create_my_btrfs_tree(u64 id, const char *path, int name_len)
1582{
1583 struct my_btrfs_tree *tree;
1584
8873e65e 1585 tree = malloc(sizeof(struct my_btrfs_tree));
ff462013
SH
1586 if (!tree)
1587 return NULL;
1588 tree->nodes = malloc(sizeof(struct mytree_node));
1589 if (!tree->nodes) {
1590 free(tree);
1591 return NULL;
1592 }
1593 tree->num = 1;
1594 tree->nodes[0].dirname = NULL;
1595 tree->nodes[0].name = strdup(path);
1596 if (!tree->nodes[0].name) {
1597 free(tree->nodes);
1598 free(tree);
1599 return NULL;
1600 }
1601 tree->nodes[0].parentid = 0;
1602 tree->nodes[0].objid = id;
1603 return tree;
1604}
1605
1606static bool update_tree_node(struct mytree_node *n, u64 id, u64 parent, char *name,
1607 int name_len, char *dirname)
1608{
1609 if (id)
1610 n->objid = id;
1611 if (parent)
1612 n->parentid = parent;
1613 if (name) {
1614 n->name = malloc(name_len + 1);
1615 if (!n->name)
1616 return false;
1617 strncpy(n->name, name, name_len);
1618 n->name[name_len] = '\0';
1619 }
1620 if (dirname) {
1621 n->dirname = malloc(strlen(dirname) + 1);
1622 if (!n->dirname) {
1623 free(n->name);
1624 return false;
1625 }
1626 strcpy(n->dirname, dirname);
1627 }
1628 return true;
1629}
1630
1631static bool add_btrfs_tree_node(struct my_btrfs_tree *tree, u64 id, u64 parent,
1632 char *name, int name_len, char *dirname)
1633{
1634 struct mytree_node *tmp;
1635
1636 int i = get_btrfs_tree_idx(tree, id);
1637 if (i != -1)
1638 return update_tree_node(&tree->nodes[i], id, parent, name,
1639 name_len, dirname);
1640
1641 tmp = realloc(tree->nodes, (tree->num+1) * sizeof(struct mytree_node));
1642 if (!tmp)
1643 return false;
1644 tree->nodes = tmp;
1645 memset(&tree->nodes[tree->num], 0, sizeof(struct mytree_node));
1646 if (!update_tree_node(&tree->nodes[tree->num], id, parent, name,
1647 name_len, dirname))
1648 return false;
1649 tree->num++;
1650 return true;
1651}
1652
1653static void free_btrfs_tree(struct my_btrfs_tree *tree)
1654{
1655 int i;
1656 if (!tree)
1657 return;
1658 for (i = 0; i < tree->num; i++) {
1659 free(tree->nodes[i].name);
1660 free(tree->nodes[i].dirname);
1661 }
1662 free(tree->nodes);
1663 free(tree);
1664}
1665
1666/*
1667 * Given a @tree of subvolumes under @path, ask btrfs to remove each
1668 * subvolume
1669 */
1670static bool do_remove_btrfs_children(struct my_btrfs_tree *tree, u64 root_id,
1671 const char *path)
1672{
1673 int i;
1674 char *newpath;
1675 size_t len;
1676
1677 for (i = 0; i < tree->num; i++) {
1678 if (tree->nodes[i].parentid == root_id) {
1679 if (!tree->nodes[i].dirname) {
1680 WARN("Odd condition: child objid with no name under %s\n", path);
1681 continue;
1682 }
1683 len = strlen(path) + strlen(tree->nodes[i].dirname) + 2;
1684 newpath = malloc(len);
1685 if (!newpath) {
1686 ERROR("Out of memory");
1687 return false;
1688 }
1689 snprintf(newpath, len, "%s/%s", path, tree->nodes[i].dirname);
1690 if (!do_remove_btrfs_children(tree, tree->nodes[i].objid, newpath)) {
1691 ERROR("Failed to prune %s\n", tree->nodes[i].name);
1692 free(newpath);
1693 return false;
1694 }
1695 if (btrfs_do_destroy_subvol(newpath) != 0) {
1696 ERROR("Failed to remove %s\n", newpath);
1697 free(newpath);
1698 return false;
1699 }
1700 free(newpath);
1701 }
1702 }
1703 return true;
1704}
1705
1706static int btrfs_recursive_destroy(const char *path)
1707{
1708 u64 root_id;
1709 int fd;
1710 struct btrfs_ioctl_search_args args;
1711 struct btrfs_ioctl_search_key *sk = &args.key;
1712 struct btrfs_ioctl_search_header *sh;
1713 struct btrfs_root_ref *ref;
1714 struct my_btrfs_tree *tree;
1715 int ret, i;
1716 unsigned long off = 0;
1717 int name_len;
1718 char *name;
1719 char *tmppath;
1720
1721 fd = open(path, O_RDONLY);
1722 if (fd < 0) {
1723 ERROR("Failed to open %s\n", path);
1724 return -1;
1725 }
1726
1727 if (btrfs_list_get_path_rootid(fd, &root_id)) {
1728 close(fd);
1729 if (errno == EPERM || errno == EACCES) {
1730 WARN("Will simply try removing");
1731 goto ignore_search;
1732 }
1733
1734 return -1;
1735 }
1736
1737 tree = create_my_btrfs_tree(root_id, path, strlen(path));
1738 if (!tree) {
1739 ERROR("Out of memory\n");
1740 close(fd);
1741 return -1;
1742 }
1743 /* Walk all subvols looking for any under this id */
1744 memset(&args, 0, sizeof(args));
1745
1746 /* search in the tree of tree roots */
1747 sk->tree_id = 1;
1748
1749 sk->max_type = BTRFS_ROOT_REF_KEY;
1750 sk->min_type = BTRFS_ROOT_ITEM_KEY;
1751 sk->min_objectid = 0;
1752 sk->max_objectid = (u64)-1;
1753 sk->max_offset = (u64)-1;
1754 sk->min_offset = 0;
1755 sk->max_transid = (u64)-1;
1756 sk->nr_items = 4096;
1757
1758 while(1) {
1759 ret = ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args);
1760 if (ret < 0) {
1761 close(fd);
1762 ERROR("Error: can't perform the search under %s\n", path);
1763 free_btrfs_tree(tree);
1764 return -1;
1765 }
1766 if (sk->nr_items == 0)
1767 break;
1768
1769 off = 0;
1770 for (i = 0; i < sk->nr_items; i++) {
1771 sh = (struct btrfs_ioctl_search_header *)(args.buf + off);
1772 off += sizeof(*sh);
1773 /*
1774 * A backref key with the name and dirid of the parent
1775 * comes followed by the reoot ref key which has the
1776 * name of the child subvol in question.
1777 */
1778 if (sh->objectid != root_id && sh->type == BTRFS_ROOT_BACKREF_KEY) {
1779 ref = (struct btrfs_root_ref *)(args.buf + off);
1780 name_len = ref->name_len;
1781 name = (char *)(ref + 1);
1782 tmppath = get_btrfs_subvol_path(fd, sh->offset,
1783 ref->dirid, name, name_len);
1784 if (!add_btrfs_tree_node(tree, sh->objectid,
1785 sh->offset, name,
1786 name_len, tmppath)) {
1787 ERROR("Out of memory");
1788 free_btrfs_tree(tree);
1789 free(tmppath);
1790 close(fd);
1791 return -1;
1792 }
1793 free(tmppath);
1794 }
1795 off += sh->len;
1796
1797 /*
1798 * record the mins in sk so we can make sure the
1799 * next search doesn't repeat this root
1800 */
1801 sk->min_objectid = sh->objectid;
1802 sk->min_type = sh->type;
1803 sk->min_offset = sh->offset;
1804 }
1805 sk->nr_items = 4096;
1806 sk->min_offset++;
1807 if (!sk->min_offset)
1808 sk->min_type++;
1809 else
1810 continue;
1811
1812 if (sk->min_type > BTRFS_ROOT_BACKREF_KEY) {
1813 sk->min_type = BTRFS_ROOT_ITEM_KEY;
1814 sk->min_objectid++;
1815 } else
1816 continue;
1817
1818 if (sk->min_objectid >= sk->max_objectid)
1819 break;
1820 }
1821 close(fd);
1822
1823 /* now actually remove them */
1824
1825 if (!do_remove_btrfs_children(tree, root_id, path)) {
1826 free_btrfs_tree(tree);
1827 ERROR("failed pruning\n");
1828 return -1;
1829 }
1830
1831 free_btrfs_tree(tree);
1832 /* All child subvols have been removed, now remove this one */
1833ignore_search:
1834 return btrfs_do_destroy_subvol(path);
1835}
1836
4295c5de
SH
/*
 * Remove @path if it is a btrfs subvolume. Returns true only when it was
 * detected as one and the recursive destroy succeeded.
 */
bool btrfs_try_remove_subvol(const char *path)
{
	if (btrfs_detect(path))
		return btrfs_recursive_destroy(path) == 0;

	return false;
}
1843
ff462013
SH
1844static int btrfs_destroy(struct bdev *orig)
1845{
1846 return btrfs_recursive_destroy(orig->src);
1847}
1848
1897e3bc
SH
1849static int btrfs_create(struct bdev *bdev, const char *dest, const char *n,
1850 struct bdev_specs *specs)
1851{
1852 bdev->src = strdup(dest);
1853 bdev->dest = strdup(dest);
1854 if (!bdev->src || !bdev->dest)
1855 return -1;
1856 return btrfs_subvolume_create(bdev->dest);
1857}
1858
74a3920a 1859static const struct bdev_ops btrfs_ops = {
9be53773
SH
1860 .detect = &btrfs_detect,
1861 .mount = &btrfs_mount,
1862 .umount = &btrfs_umount,
1863 .clone_paths = &btrfs_clonepaths,
60bf62d4 1864 .destroy = &btrfs_destroy,
1897e3bc 1865 .create = &btrfs_create,
0a83cbbb 1866 .can_snapshot = true,
cdd01be2 1867 .can_backup = true,
9be53773
SH
1868};
1869
eddaaafd
SH
1870//
1871// loopback dev ops
1872//
/* A loop-backed rootfs is recognised purely by its "loop:" prefix. */
static int loop_detect(const char *path)
{
	return strncmp(path, "loop:", 5) == 0 ? 1 : 0;
}
1879
/*
 * Find an unused loop device by scanning /dev for entries whose name
 * starts with "loop" (the fallback for when /dev/loop-control cannot be
 * opened).
 *
 * A device is taken when LOOP_GET_STATUS64 fails on it with ENXIO. On
 * success the open descriptor is stored in *@retfd and "/dev/<name>" is
 * written to @namep, which must have room for 100 bytes.
 *
 * Returns 0 on success, -1 when /dev cannot be read or no device is free.
 */
static int find_free_loopdev_no_control(int *retfd, char *namep)
{
	struct dirent *direntp;
	struct loop_info64 lo;
	DIR *dir;
	int fd = -1;

	dir = opendir("/dev");
	if (!dir) {
		SYSERROR("Error opening /dev");
		return -1;
	}

	/*
	 * readdir() rather than the deprecated readdir_r(): the stream is
	 * private to this function, so the shared per-stream buffer is fine.
	 */
	while ((direntp = readdir(dir)) != NULL) {
		if (strncmp(direntp->d_name, "loop", 4) != 0)
			continue;
		fd = openat(dirfd(dir), direntp->d_name, O_RDWR);
		if (fd < 0)
			continue;
		/* anything but "fails with ENXIO" means we cannot use it */
		if (ioctl(fd, LOOP_GET_STATUS64, &lo) == 0 || errno != ENXIO) {
			close(fd);
			fd = -1;
			continue;
		}
		// We can use this fd
		snprintf(namep, 100, "/dev/%s", direntp->d_name);
		break;
	}
	closedir(dir);

	if (fd == -1) {
		ERROR("No loop device found");
		return -1;
	}

	*retfd = fd;
	return 0;
}
1919
edd7414a
WB
/*
 * Obtain a free loop device. /dev/loop-control is asked for a free index
 * first; when that node cannot be opened, /dev is scanned instead.
 *
 * On success the open descriptor is stored in *@retfd and the device path
 * in @namep (room for 100 bytes required). Returns 0 on success, -1 when
 * no device could be obtained.
 */
static int find_free_loopdev(int *retfd, char *namep)
{
	int ctl, idx, lfd = -1;

	ctl = open("/dev/loop-control", O_RDWR);
	if (ctl < 0)
		return find_free_loopdev_no_control(retfd, namep);

	idx = ioctl(ctl, LOOP_CTL_GET_FREE);
	if (idx >= 0) {
		snprintf(namep, 100, "/dev/loop%d", idx);
		lfd = open(namep, O_RDWR);
	}
	close(ctl);

	if (lfd == -1) {
		ERROR("No loop device found");
		return -1;
	}

	*retfd = lfd;
	return 0;
}
1939
eddaaafd
SH
1940static int loop_mount(struct bdev *bdev)
1941{
1942 int lfd, ffd = -1, ret = -1;
1943 struct loop_info64 lo;
1944 char loname[100];
1945
1946 if (strcmp(bdev->type, "loop"))
1947 return -22;
1948 if (!bdev->src || !bdev->dest)
1949 return -22;
1950 if (find_free_loopdev(&lfd, loname) < 0)
1951 return -22;
1952
025ed0f3 1953 ffd = open(bdev->src + 5, O_RDWR);
025ed0f3 1954 if (ffd < 0) {
959aee9c 1955 SYSERROR("Error opening backing file %s", bdev->src);
eddaaafd
SH
1956 goto out;
1957 }
1958
1959 if (ioctl(lfd, LOOP_SET_FD, ffd) < 0) {
1960 SYSERROR("Error attaching backing file to loop dev");
1961 goto out;
1962 }
1963 memset(&lo, 0, sizeof(lo));
1964 lo.lo_flags = LO_FLAGS_AUTOCLEAR;
1965 if (ioctl(lfd, LOOP_SET_STATUS64, &lo) < 0) {
959aee9c 1966 SYSERROR("Error setting autoclear on loop dev");
eddaaafd
SH
1967 goto out;
1968 }
1969
a17b1e65 1970 ret = mount_unknown_fs(loname, bdev->dest, bdev->mntopts);
eddaaafd 1971 if (ret < 0)
959aee9c 1972 ERROR("Error mounting %s", bdev->src);
eddaaafd
SH
1973 else
1974 bdev->lofd = lfd;
1975
1976out:
1977 if (ffd > -1)
1978 close(ffd);
1979 if (ret < 0) {
1980 close(lfd);
1981 bdev->lofd = -1;
1982 }
1983 return ret;
1984}
1985
1986static int loop_umount(struct bdev *bdev)
1987{
1988 int ret;
1989
1990 if (strcmp(bdev->type, "loop"))
1991 return -22;
1992 if (!bdev->src || !bdev->dest)
1993 return -22;
1994 ret = umount(bdev->dest);
1995 if (bdev->lofd >= 0) {
1996 close(bdev->lofd);
1997 bdev->lofd = -1;
1998 }
1999 return ret;
2000}
2001
/*
 * Create the backing file for a loop device at @path and put a
 * filesystem of type @fstype on it. The file is sized by seeking to
 * offset @size and writing a single byte there.
 *
 * Returns 0 on success, -1 on any failure.
 */
static int do_loop_create(const char *path, uint64_t size, const char *fstype)
{
	int imgfd;

	// create the new loopback file.
	imgfd = creat(path, S_IRUSR|S_IWUSR);
	if (imgfd < 0)
		return -1;

	if (lseek(imgfd, size, SEEK_SET) < 0) {
		SYSERROR("Error seeking to set new loop file size");
		goto fail_close;
	}

	if (write(imgfd, "1", 1) != 1) {
		SYSERROR("Error creating new loop file");
		goto fail_close;
	}

	if (close(imgfd) < 0) {
		SYSERROR("Error closing new loop file");
		return -1;
	}

	// create an fs in the loopback file
	if (do_mkfs(path, fstype) < 0) {
		ERROR("Error creating filesystem type %s on %s", fstype,
			path);
		return -1;
	}

	return 0;

fail_close:
	close(imgfd);
	return -1;
}
2034
2035/*
2036 * No idea what the original blockdev will be called, but the copy will be
2037 * called $lxcpath/$lxcname/rootdev
2038 */
2039static int loop_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
2040 const char *cname, const char *oldpath, const char *lxcpath, int snap,
25190e5b 2041 uint64_t newsize, struct lxc_conf *conf)
eddaaafd
SH
2042{
2043 char fstype[100];
d659597e 2044 uint64_t size = newsize;
eddaaafd
SH
2045 int len, ret;
2046 char *srcdev;
2047
2048 if (snap) {
2049 ERROR("loop devices cannot be snapshotted.");
2050 return -1;
2051 }
2052
2053 if (!orig->dest || !orig->src)
2054 return -1;
2055
2056 len = strlen(lxcpath) + strlen(cname) + strlen("rootdev") + 3;
2057 srcdev = alloca(len);
2058 ret = snprintf(srcdev, len, "%s/%s/rootdev", lxcpath, cname);
2059 if (ret < 0 || ret >= len)
2060 return -1;
2061
2062 new->src = malloc(len + 5);
2063 if (!new->src)
2064 return -1;
2065 ret = snprintf(new->src, len + 5, "loop:%s", srcdev);
2066 if (ret < 0 || ret >= len + 5)
2067 return -1;
2068
2069 new->dest = malloc(len);
2070 if (!new->dest)
2071 return -1;
2072 ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname);
2073 if (ret < 0 || ret >= len)
2074 return -1;
2075
2076 // it's tempting to say: if orig->src == loopback and !newsize, then
2077 // copy the loopback file. However, we'd have to make sure to
2078 // correctly keep holes! So punt for now.
2079
2080 if (is_blktype(orig)) {
2081 if (!newsize && blk_getsize(orig, &size) < 0) {
2082 ERROR("Error getting size of %s", orig->src);
2083 return -1;
2084 }
2085 if (detect_fs(orig, fstype, 100) < 0) {
2086 INFO("could not find fstype for %s, using %s", orig->src,
2087 DEFAULT_FSTYPE);
2088 return -1;
2089 }
2090 } else {
2091 sprintf(fstype, "%s", DEFAULT_FSTYPE);
2092 if (!newsize)
d659597e 2093 size = DEFAULT_FS_SIZE;
eddaaafd
SH
2094 }
2095 return do_loop_create(srcdev, size, fstype);
2096}
2097
2098static int loop_create(struct bdev *bdev, const char *dest, const char *n,
2099 struct bdev_specs *specs)
2100{
2101 const char *fstype;
d659597e 2102 uint64_t sz;
eddaaafd
SH
2103 int ret, len;
2104 char *srcdev;
2105
2106 if (!specs)
2107 return -1;
2108
2109 // dest is passed in as $lxcpath / $lxcname / rootfs
2110 // srcdev will be: $lxcpath / $lxcname / rootdev
2111 // src will be 'loop:$srcdev'
2112 len = strlen(dest) + 2;
2113 srcdev = alloca(len);
2114
2115 ret = snprintf(srcdev, len, "%s", dest);
2116 if (ret < 0 || ret >= len)
2117 return -1;
2118 sprintf(srcdev + len - 4, "dev");
2119
2120 bdev->src = malloc(len + 5);
2121 if (!bdev->src)
2122 return -1;
2123 ret = snprintf(bdev->src, len + 5, "loop:%s", srcdev);
2124 if (ret < 0 || ret >= len + 5)
2125 return -1;
2126
72e99249 2127 sz = specs->fssize;
eddaaafd
SH
2128 if (!sz)
2129 sz = DEFAULT_FS_SIZE;
2130
72e99249 2131 fstype = specs->fstype;
eddaaafd
SH
2132 if (!fstype)
2133 fstype = DEFAULT_FSTYPE;
2134
2135 if (!(bdev->dest = strdup(dest)))
2136 return -1;
2137
2138 if (mkdir_p(bdev->dest, 0755) < 0) {
959aee9c 2139 ERROR("Error creating %s", bdev->dest);
eddaaafd
SH
2140 return -1;
2141 }
2142
2143 return do_loop_create(srcdev, sz, fstype);
2144}
2145
2146static int loop_destroy(struct bdev *orig)
2147{
2148 return unlink(orig->src + 5);
2149}
2150
74a3920a 2151static const struct bdev_ops loop_ops = {
eddaaafd
SH
2152 .detect = &loop_detect,
2153 .mount = &loop_mount,
2154 .umount = &loop_umount,
2155 .clone_paths = &loop_clonepaths,
2156 .destroy = &loop_destroy,
2157 .create = &loop_create,
0a83cbbb 2158 .can_snapshot = false,
cdd01be2 2159 .can_backup = true,
eddaaafd
SH
2160};
2161
9be53773
SH
2162//
2163// overlayfs ops
2164//
2165
/*
 * An overlayfs rootfs is recognised purely by its "overlayfs:" prefix;
 * nothing on disk is checked.
 */
static int overlayfs_detect(const char *path)
{
	// take their word for it
	return strncmp(path, "overlayfs:", 10) == 0 ? 1 : 0;
}
2172
38b34913
SH
/* Filesystem type name to hand to mount(2); filled in on first use. */
static char *overlayfs_name;

/*
 * Work out the filesystem type name to use for overlay mounts: "overlay"
 * when /proc/filesystems has the line "nodev\toverlay", "overlayfs"
 * otherwise (also when /proc/filesystems cannot be read).
 *
 * The result is a string literal and must not be freed.
 */
static char *detect_overlayfs_name(void)
{
	char *fsname = "overlayfs";
	char *buf = NULL;
	size_t cap = 0;
	FILE *fp;

	fp = fopen("/proc/filesystems", "r");
	if (!fp)
		return fsname;

	while (getline(&buf, &cap, fp) != -1) {
		if (strcmp(buf, "nodev\toverlay\n") != 0)
			continue;
		fsname = "overlay";
		break;
	}

	free(buf);
	fclose(fp);
	return fsname;
}
2194
9be53773
SH
2195//
2196// XXXXXXX plain directory bind mount ops
2197//
60bf62d4 2198static int overlayfs_mount(struct bdev *bdev)
9be53773
SH
2199{
2200 char *options, *dup, *lower, *upper;
7fb1bef2
KY
2201 char *options_work, *work, *lastslash;
2202 int lastslashidx;
2203 int len, len2;
a17b1e65
SG
2204 unsigned long mntflags;
2205 char *mntdata;
7fb1bef2 2206 int ret, ret2;
9be53773
SH
2207
2208 if (strcmp(bdev->type, "overlayfs"))
2209 return -22;
2210 if (!bdev->src || !bdev->dest)
2211 return -22;
2212
38b34913
SH
2213 if (!overlayfs_name)
2214 overlayfs_name = detect_overlayfs_name();
2215
9be53773
SH
2216 // separately mount it first
2217 // mount -t overlayfs -oupperdir=${upper},lowerdir=${lower} lower dest
d74325c4
SG
2218 dup = alloca(strlen(bdev->src)+1);
2219 strcpy(dup, bdev->src);
46cd2845 2220 if (!(lower = strchr(dup, ':')))
9be53773 2221 return -22;
46cd2845 2222 if (!(upper = strchr(++lower, ':')))
9be53773
SH
2223 return -22;
2224 *upper = '\0';
2225 upper++;
2226
a93488df
SH
2227 // if delta doesn't yet exist, create it
2228 if (mkdir_p(upper, 0755) < 0 && errno != EEXIST)
2229 return -22;
2230
7fb1bef2
KY
2231 // overlayfs.v22 or higher needs workdir option
2232 // if upper is /var/lib/lxc/c2/delta0,
2233 // then workdir is /var/lib/lxc/c2/olwork
2234 lastslash = strrchr(upper, '/');
2235 if (!lastslash)
2236 return -22;
2237 lastslash++;
2238 lastslashidx = lastslash - upper;
2239
2240 work = alloca(lastslashidx + 7);
2241 strncpy(work, upper, lastslashidx+7);
2242 strcpy(work+lastslashidx, "olwork");
2243
a17b1e65
SG
2244 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
2245 free(mntdata);
2246 return -22;
2247 }
2248
44481bff
SH
2249 if (mkdir_p(work, 0755) < 0 && errno != EEXIST) {
2250 free(mntdata);
2251 return -22;
2252 }
2253
9be53773
SH
2254 // TODO We should check whether bdev->src is a blockdev, and if so
2255 // but for now, only support overlays of a basic directory
2256
a17b1e65
SG
2257 if (mntdata) {
2258 len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=,") + strlen(mntdata) + 1;
2259 options = alloca(len);
2260 ret = snprintf(options, len, "upperdir=%s,lowerdir=%s,%s", upper, lower, mntdata);
7fb1bef2
KY
2261
2262 len2 = strlen(lower) + strlen(upper) + strlen(work)
2263 + strlen("upperdir=,lowerdir=,workdir=") + strlen(mntdata) + 1;
2264 options_work = alloca(len2);
2265 ret2 = snprintf(options, len2, "upperdir=%s,lowerdir=%s,workdir=%s,%s",
2266 upper, lower, work, mntdata);
a17b1e65
SG
2267 }
2268 else {
2269 len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=") + 1;
2270 options = alloca(len);
2271 ret = snprintf(options, len, "upperdir=%s,lowerdir=%s", upper, lower);
7fb1bef2
KY
2272
2273 len2 = strlen(lower) + strlen(upper) + strlen(work)
2274 + strlen("upperdir=,lowerdir=,workdir=") + 1;
2275 options_work = alloca(len2);
2276 ret2 = snprintf(options_work, len2, "upperdir=%s,lowerdir=%s,workdir=%s",
2277 upper, lower, work);
a17b1e65 2278 }
7fb1bef2 2279 if (ret < 0 || ret >= len || ret2 < 0 || ret2 >= len2) {
a17b1e65 2280 free(mntdata);
9be53773 2281 return -1;
a17b1e65
SG
2282 }
2283
7fb1bef2 2284 // mount without workdir option for overlayfs before v21
38b34913 2285 ret = mount(lower, bdev->dest, overlayfs_name, MS_MGC_VAL | mntflags, options);
7fb1bef2
KY
2286 if (ret < 0) {
2287 INFO("overlayfs: error mounting %s onto %s options %s. retry with workdir",
9be53773 2288 lower, bdev->dest, options);
7fb1bef2
KY
2289
2290 // retry with workdir option for overlayfs v22 and higher
38b34913 2291 ret = mount(lower, bdev->dest, overlayfs_name, MS_MGC_VAL | mntflags, options_work);
7fb1bef2
KY
2292 if (ret < 0)
2293 SYSERROR("overlayfs: error mounting %s onto %s options %s",
2294 lower, bdev->dest, options_work);
2295 else
2296 INFO("overlayfs: mounted %s onto %s options %s",
2297 lower, bdev->dest, options_work);
2298 }
9be53773
SH
2299 else
2300 INFO("overlayfs: mounted %s onto %s options %s",
2301 lower, bdev->dest, options);
2302 return ret;
2303}
2304
60bf62d4 2305static int overlayfs_umount(struct bdev *bdev)
9be53773
SH
2306{
2307 if (strcmp(bdev->type, "overlayfs"))
2308 return -22;
2309 if (!bdev->src || !bdev->dest)
2310 return -22;
2311 return umount(bdev->dest);
2312}
2313
25190e5b
SH
2314static int rsync_delta(struct rsync_data_char *data)
2315{
2316 if (setgid(0) < 0) {
2317 ERROR("Failed to setgid to 0");
2318 return -1;
2319 }
2320 if (setgroups(0, NULL) < 0)
2321 WARN("Failed to clear groups");
2322 if (setuid(0) < 0) {
2323 ERROR("Failed to setuid to 0");
2324 return -1;
2325 }
2326 if (do_rsync(data->src, data->dest) < 0) {
2327 ERROR("rsyncing %s to %s", data->src, data->dest);
2328 return -1;
2329 }
2330
2331 return 0;
2332}
2333
/* void*-taking adapter so rsync_delta() can be used as a callback. */
static int rsync_delta_wrapper(void *data)
{
	return rsync_delta((struct rsync_data_char *)data);
}
2339
186bef00
SH
2340static int ovl_rsync(struct ovl_rsync_data *data)
2341{
270261b9
SH
2342 int ret;
2343
186bef00
SH
2344 if (setgid(0) < 0) {
2345 ERROR("Failed to setgid to 0");
2346 return -1;
2347 }
2348 if (setgroups(0, NULL) < 0)
2349 WARN("Failed to clear groups");
2350 if (setuid(0) < 0) {
2351 ERROR("Failed to setuid to 0");
2352 return -1;
2353 }
2354
2355 if (unshare(CLONE_NEWNS) < 0) {
2356 SYSERROR("Unable to unshare mounts ns");
2357 return -1;
2358 }
2359 if (detect_shared_rootfs()) {
2360 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
2361 SYSERROR("Failed to make / rslave");
2362 ERROR("Continuing...");
2363 }
2364 }
2365 if (overlayfs_mount(data->orig) < 0) {
2366 ERROR("Failed mounting original container fs");
2367 return -1;
2368 }
2369 if (overlayfs_mount(data->new) < 0) {
2370 ERROR("Failed mounting new container fs");
2371 return -1;
2372 }
270261b9
SH
2373 ret = do_rsync(data->orig->dest, data->new->dest);
2374
2375 overlayfs_umount(data->new);
2376 overlayfs_umount(data->orig);
2377
2378 if (ret < 0) {
186bef00
SH
2379 ERROR("rsyncing %s to %s", data->orig->dest, data->new->dest);
2380 return -1;
2381 }
2382
2383 return 0;
2384}
2385
/* void*-taking adapter so ovl_rsync() can be handed to userns_exec_1(). */
static int ovl_rsync_wrapper(void *data)
{
	return ovl_rsync((struct ovl_rsync_data *)data);
}
2391
2392static int ovl_do_rsync(struct bdev *orig, struct bdev *new, struct lxc_conf *conf)
2393{
2394 int ret = -1;
2395 struct ovl_rsync_data rdata;
2396
2397 rdata.orig = orig;
2398 rdata.new = new;
2399 if (am_unpriv())
2400 ret = userns_exec_1(conf, ovl_rsync_wrapper, &rdata);
2401 else
2402 ret = ovl_rsync(&rdata);
2403 if (ret)
2404 ERROR("copying overlayfs delta");
2405
2406 return ret;
2407}
2408
9be53773
SH
2409static int overlayfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
2410 const char *cname, const char *oldpath, const char *lxcpath, int snap,
25190e5b 2411 uint64_t newsize, struct lxc_conf *conf)
9be53773
SH
2412{
2413 if (!snap) {
2414 ERROR("overlayfs is only for snapshot clones");
2415 return -22;
2416 }
2417
2418 if (!orig->src || !orig->dest)
2419 return -1;
2420
2421 new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
2422 if (!new->dest)
2423 return -1;
2424 if (mkdir_p(new->dest, 0755) < 0)
2425 return -1;
2426
25190e5b
SH
2427 if (am_unpriv() && chown_mapped_root(new->dest, conf) < 0)
2428 WARN("Failed to update ownership of %s", new->dest);
2429
9be53773 2430 if (strcmp(orig->type, "dir") == 0) {
edf77341 2431 char *delta, *lastslash;
7fb1bef2 2432 char *work;
edf77341 2433 int ret, len, lastslashidx;
5ca6c34b 2434
9be53773
SH
2435 // if we have /var/lib/lxc/c2/rootfs, then delta will be
2436 // /var/lib/lxc/c2/delta0
edf77341
SH
2437 lastslash = strrchr(new->dest, '/');
2438 if (!lastslash)
9be53773 2439 return -22;
edf77341
SH
2440 if (strlen(lastslash) < 7)
2441 return -22;
2442 lastslash++;
2443 lastslashidx = lastslash - new->dest;
2444
2445 delta = malloc(lastslashidx + 7);
2446 if (!delta)
2447 return -1;
2448 strncpy(delta, new->dest, lastslashidx+1);
2449 strcpy(delta+lastslashidx, "delta0");
9be53773
SH
2450 if ((ret = mkdir(delta, 0755)) < 0) {
2451 SYSERROR("error: mkdir %s", delta);
2452 free(delta);
2453 return -1;
2454 }
25190e5b
SH
2455 if (am_unpriv() && chown_mapped_root(delta, conf) < 0)
2456 WARN("Failed to update ownership of %s", delta);
9be53773 2457
7fb1bef2
KY
2458 // make workdir for overlayfs.v22 or higher
2459 // workdir is /var/lib/lxc/c2/olwork
2460 // it is used to prepare files before atomically swithing with destination,
2461 // and needs to be on the same filesystem as upperdir,
2462 // so it's OK for it to be empty.
2463 work = malloc(lastslashidx + 7);
2464 if (!work)
2465 return -1;
2466 strncpy(work, new->dest, lastslashidx+1);
2467 strcpy(work+lastslashidx, "olwork");
2468 if (mkdir(work, 0755) < 0) {
2469 SYSERROR("error: mkdir %s", work);
2470 free(work);
2471 return -1;
2472 }
2473 if (am_unpriv() && chown_mapped_root(work, conf) < 0)
2474 WARN("Failed to update ownership of %s", work);
2475 free(work);
2476
9be53773
SH
2477 // the src will be 'overlayfs:lowerdir:upperdir'
2478 len = strlen(delta) + strlen(orig->src) + 12;
2479 new->src = malloc(len);
2480 if (!new->src) {
2481 free(delta);
2482 return -ENOMEM;
2483 }
2484 ret = snprintf(new->src, len, "overlayfs:%s:%s", orig->src, delta);
2485 free(delta);
2486 if (ret < 0 || ret >= len)
2487 return -ENOMEM;
9be53773
SH
2488 } else if (strcmp(orig->type, "overlayfs") == 0) {
2489 // What exactly do we want to do here?
2490 // I think we want to use the original lowerdir, with a
2491 // private delta which is originally rsynced from the
2492 // original delta
7fb1bef2
KY
2493 char *osrc, *odelta, *nsrc, *ndelta, *work;
2494 char *lastslash;
2495 int len, ret, lastslashidx;
9be53773
SH
2496 if (!(osrc = strdup(orig->src)))
2497 return -22;
46cd2845
PL
2498 nsrc = strchr(osrc, ':') + 1;
2499 if (nsrc != osrc + 10 || (odelta = strchr(nsrc, ':')) == NULL) {
9be53773
SH
2500 free(osrc);
2501 return -22;
2502 }
2503 *odelta = '\0';
2504 odelta++;
2505 ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath);
2506 if (!ndelta) {
2507 free(osrc);
2508 return -ENOMEM;
2509 }
edf77341 2510 if ((ret = mkdir(ndelta, 0755)) < 0 && errno != EEXIST) {
25190e5b
SH
2511 SYSERROR("error: mkdir %s", ndelta);
2512 free(osrc);
2513 free(ndelta);
2514 return -1;
2515 }
2516 if (am_unpriv() && chown_mapped_root(ndelta, conf) < 0)
2517 WARN("Failed to update ownership of %s", ndelta);
7fb1bef2
KY
2518
2519 // make workdir for overlayfs.v22 or higher
2520 // for details, see above.
2521 lastslash = strrchr(ndelta, '/');
2522 if (!lastslash)
2523 return -1;
2524 lastslash++;
2525 lastslashidx = lastslash - ndelta;
2526
2527 work = malloc(lastslashidx + 7);
2528 if (!work)
2529 return -1;
2530 strncpy(work, ndelta, lastslashidx+1);
2531 strcpy(work+lastslashidx, "olwork");
2532 if ((mkdir(work, 0755) < 0) && errno != EEXIST) {
2533 SYSERROR("error: mkdir %s", work);
2534 free(work);
2535 return -1;
2536 }
2537 if (am_unpriv() && chown_mapped_root(work, conf) < 0)
2538 WARN("Failed to update ownership of %s", work);
2539 free(work);
2540
9be53773
SH
2541 len = strlen(nsrc) + strlen(ndelta) + 12;
2542 new->src = malloc(len);
2543 if (!new->src) {
2544 free(osrc);
2545 free(ndelta);
2546 return -ENOMEM;
2547 }
2548 ret = snprintf(new->src, len, "overlayfs:%s:%s", nsrc, ndelta);
2549 free(osrc);
2550 free(ndelta);
2551 if (ret < 0 || ret >= len)
2552 return -ENOMEM;
186bef00
SH
2553
2554 return ovl_do_rsync(orig, new, conf);
375c2258
SH
2555 } else {
2556 ERROR("overlayfs clone of %s container is not yet supported",
2557 orig->type);
2558 // Note, supporting this will require overlayfs_mount supporting
2559 // mounting of the underlay. No big deal, just needs to be done.
2560 return -1;
9be53773
SH
2561 }
2562
2563 return 0;
2564}
2565
74a3920a 2566static int overlayfs_destroy(struct bdev *orig)
60bf62d4
SH
2567{
2568 char *upper;
2569
2570 if (strncmp(orig->src, "overlayfs:", 10) != 0)
2571 return -22;
46cd2845 2572 upper = strchr(orig->src + 10, ':');
60bf62d4
SH
2573 if (!upper)
2574 return -22;
2575 upper++;
18aa217b 2576 return lxc_rmdir_onedev(upper, NULL);
60bf62d4
SH
2577}
2578
1897e3bc
SH
2579/*
2580 * to say 'lxc-create -t ubuntu -n o1 -B overlayfs' means you want
2581 * $lxcpath/$lxcname/rootfs to have the created container, while all
2582 * changes after starting the container are written to
2583 * $lxcpath/$lxcname/delta0
2584 */
2585static int overlayfs_create(struct bdev *bdev, const char *dest, const char *n,
2586 struct bdev_specs *specs)
2587{
2588 char *delta;
2589 int ret, len = strlen(dest), newlen;
2590
2591 if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0)
2592 return -1;
2593
2594 if (!(bdev->dest = strdup(dest))) {
2595 ERROR("Out of memory");
2596 return -1;
2597 }
2598
d74325c4
SG
2599 delta = alloca(strlen(dest)+1);
2600 strcpy(delta, dest);
1897e3bc
SH
2601 strcpy(delta+len-6, "delta0");
2602
2603 if (mkdir_p(delta, 0755) < 0) {
959aee9c 2604 ERROR("Error creating %s", delta);
1897e3bc
SH
2605 return -1;
2606 }
2607
2608 /* overlayfs:lower:upper */
2609 newlen = (2 * len) + strlen("overlayfs:") + 2;
2610 bdev->src = malloc(newlen);
2611 if (!bdev->src) {
2612 ERROR("Out of memory");
2613 return -1;
2614 }
2615 ret = snprintf(bdev->src, newlen, "overlayfs:%s:%s", dest, delta);
2616 if (ret < 0 || ret >= newlen)
2617 return -1;
2618
2619 if (mkdir_p(bdev->dest, 0755) < 0) {
959aee9c 2620 ERROR("Error creating %s", bdev->dest);
1897e3bc
SH
2621 return -1;
2622 }
2623
2624 return 0;
2625}
2626
74a3920a 2627static const struct bdev_ops overlayfs_ops = {
9be53773
SH
2628 .detect = &overlayfs_detect,
2629 .mount = &overlayfs_mount,
2630 .umount = &overlayfs_umount,
2631 .clone_paths = &overlayfs_clonepaths,
60bf62d4 2632 .destroy = &overlayfs_destroy,
1897e3bc 2633 .create = &overlayfs_create,
0a83cbbb 2634 .can_snapshot = true,
cdd01be2 2635 .can_backup = true,
9be53773
SH
2636};
2637
1f92162d
SG
2638//
2639// aufs ops
2640//
2641
/*
 * Return 1 if path carries the "aufs:" prefix, 0 otherwise. Nothing beyond
 * the prefix is checked.
 */
static int aufs_detect(const char *path)
{
	return strncmp(path, "aufs:", 5) == 0;
}
2648
2649//
2650// XXXXXXX plain directory bind mount ops
2651//
2652static int aufs_mount(struct bdev *bdev)
2653{
31a882ef 2654 char *options, *dup, *lower, *upper;
1f92162d
SG
2655 int len;
2656 unsigned long mntflags;
2657 char *mntdata;
2658 int ret;
31a882ef 2659 const char *xinopath = "/dev/shm/aufs.xino";
1f92162d
SG
2660
2661 if (strcmp(bdev->type, "aufs"))
2662 return -22;
2663 if (!bdev->src || !bdev->dest)
2664 return -22;
2665
2666 // separately mount it first
2667 // mount -t aufs -obr=${upper}=rw:${lower}=ro lower dest
2668 dup = alloca(strlen(bdev->src)+1);
2669 strcpy(dup, bdev->src);
46cd2845 2670 if (!(lower = strchr(dup, ':')))
1f92162d 2671 return -22;
46cd2845 2672 if (!(upper = strchr(++lower, ':')))
1f92162d
SG
2673 return -22;
2674 *upper = '\0';
2675 upper++;
2676
2677 if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) {
2678 free(mntdata);
2679 return -22;
2680 }
2681
2682 // TODO We should check whether bdev->src is a blockdev, and if so
2683 // but for now, only support aufs of a basic directory
2684
9009a728 2685 // AUFS does not work on top of certain filesystems like (XFS or Btrfs)
31a882ef
KY
2686 // so add xino=/dev/shm/aufs.xino parameter to mount options.
2687 // The same xino option can be specified to multiple aufs mounts, and
2688 // a xino file is not shared among multiple aufs mounts.
9009a728
ÇO
2689 //
2690 // see http://www.mail-archive.com/aufs-users@lists.sourceforge.net/msg02587.html
31a882ef 2691 // http://www.mail-archive.com/aufs-users@lists.sourceforge.net/msg05126.html
1f92162d 2692 if (mntdata) {
31a882ef 2693 len = strlen(lower) + strlen(upper) + strlen(xinopath) + strlen("br==rw:=ro,,xino=") + strlen(mntdata) + 1;
1f92162d 2694 options = alloca(len);
31a882ef 2695 ret = snprintf(options, len, "br=%s=rw:%s=ro,%s,xino=%s", upper, lower, mntdata, xinopath);
1f92162d
SG
2696 }
2697 else {
31a882ef 2698 len = strlen(lower) + strlen(upper) + strlen(xinopath) + strlen("br==rw:=ro,xino=") + 1;
1f92162d 2699 options = alloca(len);
31a882ef 2700 ret = snprintf(options, len, "br=%s=rw:%s=ro,xino=%s", upper, lower, xinopath);
1f92162d 2701 }
9009a728 2702
1f92162d
SG
2703 if (ret < 0 || ret >= len) {
2704 free(mntdata);
2705 return -1;
2706 }
2707
2708 ret = mount(lower, bdev->dest, "aufs", MS_MGC_VAL | mntflags, options);
2709 if (ret < 0)
2710 SYSERROR("aufs: error mounting %s onto %s options %s",
2711 lower, bdev->dest, options);
2712 else
2713 INFO("aufs: mounted %s onto %s options %s",
2714 lower, bdev->dest, options);
2715 return ret;
2716}
2717
2718static int aufs_umount(struct bdev *bdev)
2719{
2720 if (strcmp(bdev->type, "aufs"))
2721 return -22;
2722 if (!bdev->src || !bdev->dest)
2723 return -22;
2724 return umount(bdev->dest);
2725}
2726
2727static int aufs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
2728 const char *cname, const char *oldpath, const char *lxcpath, int snap,
25190e5b 2729 uint64_t newsize, struct lxc_conf *conf)
1f92162d
SG
2730{
2731 if (!snap) {
2732 ERROR("aufs is only for snapshot clones");
2733 return -22;
2734 }
2735
2736 if (!orig->src || !orig->dest)
2737 return -1;
2738
2739 new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath);
2740 if (!new->dest)
2741 return -1;
2742 if (mkdir_p(new->dest, 0755) < 0)
2743 return -1;
2744
31a882ef
KY
2745 if (am_unpriv() && chown_mapped_root(new->dest, conf) < 0)
2746 WARN("Failed to update ownership of %s", new->dest);
2747
1f92162d 2748 if (strcmp(orig->type, "dir") == 0) {
edf77341
SH
2749 char *delta, *lastslash;
2750 int ret, len, lastslashidx;
1f92162d
SG
2751
2752 // if we have /var/lib/lxc/c2/rootfs, then delta will be
2753 // /var/lib/lxc/c2/delta0
edf77341
SH
2754 lastslash = strrchr(new->dest, '/');
2755 if (!lastslash)
1f92162d 2756 return -22;
edf77341
SH
2757 if (strlen(lastslash) < 7)
2758 return -22;
2759 lastslash++;
2760 lastslashidx = lastslash - new->dest;
2761
2762 delta = malloc(lastslashidx + 7);
2763 if (!delta)
2764 return -1;
2765 strncpy(delta, new->dest, lastslashidx+1);
2766 strcpy(delta+lastslashidx, "delta0");
1f92162d
SG
2767 if ((ret = mkdir(delta, 0755)) < 0) {
2768 SYSERROR("error: mkdir %s", delta);
2769 free(delta);
2770 return -1;
2771 }
31a882ef
KY
2772 if (am_unpriv() && chown_mapped_root(delta, conf) < 0)
2773 WARN("Failed to update ownership of %s", delta);
1f92162d
SG
2774
2775 // the src will be 'aufs:lowerdir:upperdir'
2776 len = strlen(delta) + strlen(orig->src) + 12;
2777 new->src = malloc(len);
2778 if (!new->src) {
2779 free(delta);
2780 return -ENOMEM;
2781 }
2782 ret = snprintf(new->src, len, "aufs:%s:%s", orig->src, delta);
2783 free(delta);
2784 if (ret < 0 || ret >= len)
2785 return -ENOMEM;
2786 } else if (strcmp(orig->type, "aufs") == 0) {
2787 // What exactly do we want to do here?
2788 // I think we want to use the original lowerdir, with a
2789 // private delta which is originally rsynced from the
2790 // original delta
2791 char *osrc, *odelta, *nsrc, *ndelta;
2792 int len, ret;
2793 if (!(osrc = strdup(orig->src)))
2794 return -22;
46cd2845
PL
2795 nsrc = strchr(osrc, ':') + 1;
2796 if (nsrc != osrc + 5 || (odelta = strchr(nsrc, ':')) == NULL) {
1f92162d
SG
2797 free(osrc);
2798 return -22;
2799 }
2800 *odelta = '\0';
2801 odelta++;
2802 ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath);
2803 if (!ndelta) {
2804 free(osrc);
2805 return -ENOMEM;
2806 }
31a882ef
KY
2807 if ((ret = mkdir(ndelta, 0755)) < 0 && errno != EEXIST) {
2808 SYSERROR("error: mkdir %s", ndelta);
2809 free(osrc);
2810 free(ndelta);
2811 return -1;
2812 }
2813 if (am_unpriv() && chown_mapped_root(ndelta, conf) < 0)
2814 WARN("Failed to update ownership of %s", ndelta);
2815
2816 struct rsync_data_char rdata;
2817 rdata.src = odelta;
2818 rdata.dest = ndelta;
2819 if (am_unpriv())
2820 ret = userns_exec_1(conf, rsync_delta_wrapper, &rdata);
2821 else
2822 ret = rsync_delta(&rdata);
2823 if (ret) {
1f92162d
SG
2824 free(osrc);
2825 free(ndelta);
2826 ERROR("copying aufs delta");
2827 return -1;
2828 }
2829 len = strlen(nsrc) + strlen(ndelta) + 12;
2830 new->src = malloc(len);
2831 if (!new->src) {
2832 free(osrc);
2833 free(ndelta);
2834 return -ENOMEM;
2835 }
2836 ret = snprintf(new->src, len, "aufs:%s:%s", nsrc, ndelta);
2837 free(osrc);
2838 free(ndelta);
2839 if (ret < 0 || ret >= len)
2840 return -ENOMEM;
2841 } else {
2842 ERROR("aufs clone of %s container is not yet supported",
2843 orig->type);
2844 // Note, supporting this will require aufs_mount supporting
2845 // mounting of the underlay. No big deal, just needs to be done.
2846 return -1;
2847 }
2848
2849 return 0;
2850}
2851
2852static int aufs_destroy(struct bdev *orig)
2853{
2854 char *upper;
2855
2856 if (strncmp(orig->src, "aufs:", 5) != 0)
2857 return -22;
46cd2845 2858 upper = strchr(orig->src + 5, ':');
1f92162d
SG
2859 if (!upper)
2860 return -22;
2861 upper++;
18aa217b 2862 return lxc_rmdir_onedev(upper, NULL);
1f92162d
SG
2863}
2864
2865/*
2866 * to say 'lxc-create -t ubuntu -n o1 -B aufs' means you want
2867 * $lxcpath/$lxcname/rootfs to have the created container, while all
2868 * changes after starting the container are written to
2869 * $lxcpath/$lxcname/delta0
2870 */
2871static int aufs_create(struct bdev *bdev, const char *dest, const char *n,
2872 struct bdev_specs *specs)
2873{
2874 char *delta;
2875 int ret, len = strlen(dest), newlen;
2876
2877 if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0)
2878 return -1;
2879
2880 if (!(bdev->dest = strdup(dest))) {
2881 ERROR("Out of memory");
2882 return -1;
2883 }
2884
2885 delta = alloca(strlen(dest)+1);
2886 strcpy(delta, dest);
2887 strcpy(delta+len-6, "delta0");
2888
2889 if (mkdir_p(delta, 0755) < 0) {
2890 ERROR("Error creating %s", delta);
2891 return -1;
2892 }
2893
2894 /* aufs:lower:upper */
2895 newlen = (2 * len) + strlen("aufs:") + 2;
2896 bdev->src = malloc(newlen);
2897 if (!bdev->src) {
2898 ERROR("Out of memory");
2899 return -1;
2900 }
2901 ret = snprintf(bdev->src, newlen, "aufs:%s:%s", dest, delta);
2902 if (ret < 0 || ret >= newlen)
2903 return -1;
2904
2905 if (mkdir_p(bdev->dest, 0755) < 0) {
2906 ERROR("Error creating %s", bdev->dest);
2907 return -1;
2908 }
2909
2910 return 0;
2911}
2912
2913static const struct bdev_ops aufs_ops = {
2914 .detect = &aufs_detect,
2915 .mount = &aufs_mount,
2916 .umount = &aufs_umount,
2917 .clone_paths = &aufs_clonepaths,
2918 .destroy = &aufs_destroy,
2919 .create = &aufs_create,
2920 .can_snapshot = true,
cdd01be2 2921 .can_backup = true,
1f92162d
SG
2922};
2923
76a26f55
SH
2924//
2925// nbd dev ops
2926//
2927
/* Return 1 if path carries the "nbd:" prefix, 0 otherwise. */
static int nbd_detect(const char *path)
{
	return strncmp(path, "nbd:", 4) == 0;
}
2934
/* Arguments handed to do_attach_nbd() through its void* parameter. */
struct nbd_attach_data {
	const char *nbd;	/* nbd device node passed to qemu-nbd -c / -d */
	const char *path;	/* image path passed to qemu-nbd -c */
};
2939
/*
 * Run "qemu-nbd -d <path>" in a forked child and wait for it.
 * Failures are logged; nothing is reported back to the caller.
 */
static void nbd_detach(const char *path)
{
	pid_t child = fork();

	if (child < 0) {
		SYSERROR("Error forking to detach nbd");
		return;
	}

	if (child == 0) {
		/* Child: only returns from execlp() on failure. */
		execlp("qemu-nbd", "qemu-nbd", "-d", path, NULL);
		SYSERROR("Error executing qemu-nbd");
		exit(1);
	}

	/* Parent: reap the child and report a failed disconnect. */
	if (wait_for_pid(child) < 0)
		ERROR("nbd disconnect returned an error");
}
2959
2960static int do_attach_nbd(void *d)
2961{
2962 struct nbd_attach_data *data = d;
2963 const char *nbd, *path;
2964 pid_t pid;
2965 sigset_t mask;
2966 int sfd;
2967 ssize_t s;
2968 struct signalfd_siginfo fdsi;
2969
2970 sigemptyset(&mask);
2971 sigaddset(&mask, SIGHUP);
2972 sigaddset(&mask, SIGCHLD);
2973
2974 nbd = data->nbd;
2975 path = data->path;
2976
2977 if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) {
2978 SYSERROR("Error blocking signals for nbd watcher");
2979 exit(1);
2980 }
2981
2982 sfd = signalfd(-1, &mask, 0);
2983 if (sfd == -1) {
2984 SYSERROR("Error opening signalfd for nbd task");
2985 exit(1);
2986 }
2987
2988 if (prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0) < 0)
2989 SYSERROR("Error setting parent death signal for nbd watcher");
2990
2991 pid = fork();
2992 if (pid) {
2993 for (;;) {
2994 s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo));
2995 if (s != sizeof(struct signalfd_siginfo))
2996 SYSERROR("Error reading from signalfd");
2997
2998 if (fdsi.ssi_signo == SIGHUP) {
2999 /* container has exited */
3000 nbd_detach(nbd);
3001 exit(0);
3002 } else if (fdsi.ssi_signo == SIGCHLD) {
3003 int status;
3abd3e54
SH
3004 /* If qemu-nbd fails, or is killed by a signal,
3005 * then exit */
3006 while (waitpid(-1, &status, WNOHANG) > 0) {
3007 if ((WIFEXITED(status) && WEXITSTATUS(status) != 0) ||
3008 WIFSIGNALED(status)) {
3009 nbd_detach(nbd);
3010 exit(1);
3011 }
3012 }
76a26f55
SH
3013 }
3014 }
3015 }
3016
3017 close(sfd);
3018 if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1)
3019 WARN("Warning: unblocking signals for nbd watcher");
3020
3021 execlp("qemu-nbd", "qemu-nbd", "-c", nbd, path, NULL);
3022 SYSERROR("Error executing qemu-nbd");
3023 exit(1);
3024}
3025
3026static bool clone_attach_nbd(const char *nbd, const char *path)
3027{
3028 pid_t pid;
3029 struct nbd_attach_data data;
3030
3031 data.nbd = nbd;
3032 data.path = path;
3033
3034 pid = lxc_clone(do_attach_nbd, &data, CLONE_NEWPID);
3035 if (pid < 0)
3036 return false;
3037 return true;
3038}
3039
/*
 * Report whether /sys/block/nbd<idx>/pid exists. If the path cannot be
 * formatted the device is treated as busy.
 */
static bool nbd_busy(int idx)
{
	char pidfile[100];
	int n;

	n = snprintf(pidfile, sizeof(pidfile), "/sys/block/nbd%d/pid", idx);
	if (n < 0 || n >= (int)sizeof(pidfile))
		return true;

	return file_exists(pidfile);
}
3050
3051static bool attach_nbd(char *src, struct lxc_conf *conf)
3052{
3053 char *orig = alloca(strlen(src)+1), *p, path[50];
3054 int i = 0;
3055
3056 strcpy(orig, src);
3057 /* if path is followed by a partition, drop that for now */
3058 p = strchr(orig, ':');
3059 if (p)
3060 *p = '\0';
3061 while (1) {
3062 sprintf(path, "/dev/nbd%d", i);
3063 if (!file_exists(path))
3064 return false;
3065 if (nbd_busy(i)) {
3066 i++;
3067 continue;
3068 }
3069 if (!clone_attach_nbd(path, orig))
3070 return false;
3071 conf->nbd_idx = i;
3072 return true;
3073 }
3074}
3075
/* True when path carries the "nbd:" prefix. */
static bool requires_nbd(const char *path)
{
	return strncmp(path, "nbd:", 4) == 0;
}
3082
3083/*
3084 * attach_block_device returns true if all went well,
3085 * meaning either a block device was attached or was not
3086 * needed. It returns false if something went wrong and
ec64264d 3087 * container startup should be stopped.
76a26f55
SH
3088 */
3089bool attach_block_device(struct lxc_conf *conf)
3090{
3091 char *path;
3092
3093 if (!conf->rootfs.path)
3094 return true;
3095 path = conf->rootfs.path;
3096 if (!requires_nbd(path))
3097 return true;
3098 path = strchr(path, ':');
3099 if (!path)
3100 return false;
3101 path++;
3102 if (!attach_nbd(path, conf))
3103 return false;
3104 return true;
3105}
3106
/* Detach /dev/nbd<idx>; silently does nothing if the path cannot be built. */
void detach_nbd_idx(int idx)
{
	char dev[50];
	int n = snprintf(dev, sizeof(dev), "/dev/nbd%d", idx);

	if (n >= 0 && n < (int)sizeof(dev))
		nbd_detach(dev);
}
3118
3119void detach_block_device(struct lxc_conf *conf)
3120{
3121 if (conf->nbd_idx != -1)
3122 detach_nbd_idx(conf->nbd_idx);
3123}
3124
/*
 * Pick the partition # off the end of a nbd:file:p
 * description. Return 1-9 for the partition id, or 0
 * for no partition.
 *
 * Only the single character right after the second ':' is examined.
 */
static int nbd_get_partition(const char *src)
{
	const char *first, *second;

	first = strchr(src, ':');
	if (first == NULL)
		return 0;

	second = strchr(first + 1, ':');
	if (second == NULL)
		return 0;

	if (second[1] >= '1' && second[1] <= '9')
		return second[1] - '0';

	return 0;
}
3143
bfd0b144
SH
/*
 * Poll once per second, up to five times, for path to exist.
 * Returns true as soon as it does, false (after logging) otherwise.
 */
static bool wait_for_partition(const char *path)
{
	int attempt;

	for (attempt = 0; attempt < 5; attempt++) {
		if (file_exists(path))
			return true;
		sleep(1);
	}

	ERROR("Device %s did not show up after 5 seconds", path);
	return false;
}
3156
76a26f55
SH
3157static int nbd_mount(struct bdev *bdev)
3158{
3159 int ret = -1, partition;
3160 char path[50];
3161
3162 if (strcmp(bdev->type, "nbd"))
3163 return -22;
3164 if (!bdev->src || !bdev->dest)
3165 return -22;
3166
3167 /* nbd_idx should have been copied by bdev_init from the lxc_conf */
3168 if (bdev->nbd_idx < 0)
3169 return -22;
3170 partition = nbd_get_partition(bdev->src);
3171 if (partition)
3172 ret = snprintf(path, 50, "/dev/nbd%dp%d", bdev->nbd_idx,
3173 partition);
3174 else
3175 ret = snprintf(path, 50, "/dev/nbd%d", bdev->nbd_idx);
3176 if (ret < 0 || ret >= 50) {
3177 ERROR("Error setting up nbd device path");
3178 return ret;
3179 }
bfd0b144
SH
3180
3181 /* It might take awhile for the partition files to show up */
3182 if (partition) {
3183 if (!wait_for_partition(path))
3184 return -2;
3185 }
76a26f55
SH
3186 ret = mount_unknown_fs(path, bdev->dest, bdev->mntopts);
3187 if (ret < 0)
3188 ERROR("Error mounting %s", bdev->src);
3189
3190 return ret;
3191}
3192
/* Creating nbd-backed containers is not implemented: always -ENOSYS. */
static int nbd_create(struct bdev *bdev, const char *dest, const char *n,
			struct bdev_specs *specs)
{
	(void)bdev;
	(void)dest;
	(void)n;
	(void)specs;

	return -ENOSYS;
}
3198
/* Cloning nbd-backed containers is not implemented: always -ENOSYS. */
static int nbd_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
		const char *cname, const char *oldpath, const char *lxcpath, int snap,
		uint64_t newsize, struct lxc_conf *conf)
{
	(void)orig;
	(void)new;
	(void)oldname;
	(void)cname;
	(void)oldpath;
	(void)lxcpath;
	(void)snap;
	(void)newsize;
	(void)conf;

	return -ENOSYS;
}
3205
/* Destroying nbd backing stores is not implemented: always -ENOSYS. */
static int nbd_destroy(struct bdev *orig)
{
	(void)orig;

	return -ENOSYS;
}
3210
3211static int nbd_umount(struct bdev *bdev)
3212{
3213 int ret;
3214
3215 if (strcmp(bdev->type, "nbd"))
3216 return -22;
3217 if (!bdev->src || !bdev->dest)
3218 return -22;
3219 ret = umount(bdev->dest);
3220 return ret;
3221}
3222
3223static const struct bdev_ops nbd_ops = {
3224 .detect = &nbd_detect,
3225 .mount = &nbd_mount,
3226 .umount = &nbd_umount,
3227 .clone_paths = &nbd_clonepaths,
3228 .destroy = &nbd_destroy,
3229 .create = &nbd_create,
3230 .can_snapshot = true,
cdd01be2 3231 .can_backup = false,
76a26f55 3232};
1f92162d 3233
74a3920a 3234static const struct bdev_type bdevs[] = {
3baa76fe 3235 {.name = "zfs", .ops = &zfs_ops,},
9be53773
SH
3236 {.name = "lvm", .ops = &lvm_ops,},
3237 {.name = "btrfs", .ops = &btrfs_ops,},
3238 {.name = "dir", .ops = &dir_ops,},
1f92162d 3239 {.name = "aufs", .ops = &aufs_ops,},
9be53773 3240 {.name = "overlayfs", .ops = &overlayfs_ops,},
eddaaafd 3241 {.name = "loop", .ops = &loop_ops,},
76a26f55 3242 {.name = "nbd", .ops = &nbd_ops,},
9be53773
SH
3243};
3244
3245static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type);
3246
3247void bdev_put(struct bdev *bdev)
3248{
f10fad2f
ME
3249 free(bdev->mntopts);
3250 free(bdev->src);
3251 free(bdev->dest);
9be53773
SH
3252 free(bdev);
3253}
3254
3255struct bdev *bdev_get(const char *type)
3256{
3257 int i;
3258 struct bdev *bdev;
3259
3260 for (i=0; i<numbdevs; i++) {
3261 if (strcmp(bdevs[i].name, type) == 0)
3262 break;
3263 }
3264 if (i == numbdevs)
3265 return NULL;
3266 bdev = malloc(sizeof(struct bdev));
3267 if (!bdev)
3268 return NULL;
3269 memset(bdev, 0, sizeof(struct bdev));
3270 bdev->ops = bdevs[i].ops;
3271 bdev->type = bdevs[i].name;
3272 return bdev;
3273}
3274
35120d9c 3275static const struct bdev_type *bdev_query(const char *src)
9be53773
SH
3276{
3277 int i;
9be53773
SH
3278 for (i=0; i<numbdevs; i++) {
3279 int r;
3280 r = bdevs[i].ops->detect(src);
3281 if (r)
3282 break;
3283 }
eddaaafd 3284
9be53773
SH
3285 if (i == numbdevs)
3286 return NULL;
35120d9c
SH
3287 return &bdevs[i];
3288}
3289
3290struct bdev *bdev_init(struct lxc_conf *conf, const char *src, const char *dst, const char *mntopts)
3291{
3292 struct bdev *bdev;
3293 const struct bdev_type *q;
3294
cdd01be2
SH
3295 if (!src)
3296 src = conf->rootfs.path;
3297
3298 if (!src)
3299 return NULL;
3300
35120d9c
SH
3301 q = bdev_query(src);
3302 if (!q)
3303 return NULL;
3304
9be53773
SH
3305 bdev = malloc(sizeof(struct bdev));
3306 if (!bdev)
3307 return NULL;
3308 memset(bdev, 0, sizeof(struct bdev));
35120d9c
SH
3309 bdev->ops = q->ops;
3310 bdev->type = q->name;
a17b1e65
SG
3311 if (mntopts)
3312 bdev->mntopts = strdup(mntopts);
9be53773
SH
3313 if (src)
3314 bdev->src = strdup(src);
3315 if (dst)
3316 bdev->dest = strdup(dst);
76a26f55
SH
3317 if (strcmp(bdev->type, "nbd") == 0)
3318 bdev->nbd_idx = conf->nbd_idx;
9be53773
SH
3319
3320 return bdev;
3321}
3322
1354955b
SH
/* Source and target backing stores for rsync_rootfs(). */
struct rsync_data {
	struct bdev *orig;	/* store to copy from */
	struct bdev *new;	/* store to copy into */
};
3327
3328static int rsync_rootfs(struct rsync_data *data)
3329{
3330 struct bdev *orig = data->orig,
3331 *new = data->new;
3332
3333 if (unshare(CLONE_NEWNS) < 0) {
3334 SYSERROR("unshare CLONE_NEWNS");
3335 return -1;
3336 }
c597baa8
DE
3337 if (detect_shared_rootfs()) {
3338 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
2c6f3fc9 3339 SYSERROR("Failed to make / rslave");
c597baa8
DE
3340 ERROR("Continuing...");
3341 }
3342 }
1354955b
SH
3343
3344 // If not a snapshot, copy the fs.
3345 if (orig->ops->mount(orig) < 0) {
959aee9c 3346 ERROR("failed mounting %s onto %s", orig->src, orig->dest);
1354955b
SH
3347 return -1;
3348 }
3349 if (new->ops->mount(new) < 0) {
959aee9c 3350 ERROR("failed mounting %s onto %s", new->src, new->dest);
1354955b
SH
3351 return -1;
3352 }
3353 if (setgid(0) < 0) {
3354 ERROR("Failed to setgid to 0");
3355 return -1;
3356 }
c476bdce
SH
3357 if (setgroups(0, NULL) < 0)
3358 WARN("Failed to clear groups");
1354955b
SH
3359 if (setuid(0) < 0) {
3360 ERROR("Failed to setuid to 0");
3361 return -1;
3362 }
3363 if (do_rsync(orig->dest, new->dest) < 0) {
959aee9c 3364 ERROR("rsyncing %s to %s", orig->src, new->src);
1354955b
SH
3365 return -1;
3366 }
3367
3368 return 0;
3369}
3370
/* void*-taking adapter so rsync_rootfs() can be used as a callback. */
static int rsync_rootfs_wrapper(void *data)
{
	return rsync_rootfs((struct rsync_data *)data);
}
8c39f7a4 3376
76a26f55 3377bool bdev_is_dir(struct lxc_conf *conf, const char *path)
8c39f7a4 3378{
76a26f55 3379 struct bdev *orig = bdev_init(conf, path, NULL, NULL);
8c39f7a4
SH
3380 bool ret = false;
3381 if (!orig)
3382 return ret;
3383 if (strcmp(orig->type, "dir") == 0)
3384 ret = true;
3385 bdev_put(orig);
3386 return ret;
3387}
3388
cdd01be2
SH
3389bool bdev_can_backup(struct lxc_conf *conf)
3390{
3391 struct bdev *bdev = bdev_init(conf, NULL, NULL, NULL);
3392 bool ret;
3393
3394 if (!bdev)
3395 return false;
3396 ret = bdev->ops->can_backup;
3397 bdev_put(bdev);
3398 return ret;
3399}
3400
a7ef8753
SH
3401/*
3402 * is an unprivileged user allowed to make this kind of snapshot
3403 */
3404static bool unpriv_snap_allowed(struct bdev *b, const char *t, bool snap,
3405 bool maybesnap)
3406{
3407 if (!t) {
3408 // new type will be same as original
3409 // (unless snap && b->type == dir, in which case it will be
3410 // overlayfs -- which is also allowed)
3411 if (strcmp(b->type, "dir") == 0 ||
31a882ef 3412 strcmp(b->type, "aufs") == 0 ||
a7ef8753 3413 strcmp(b->type, "overlayfs") == 0 ||
2659c7cb 3414 strcmp(b->type, "btrfs") == 0 ||
a7ef8753
SH
3415 strcmp(b->type, "loop") == 0)
3416 return true;
3417 return false;
3418 }
3419
3420 // unprivileged users can copy and snapshot dir, overlayfs,
3421 // and loop. In particular, not zfs, btrfs, or lvm.
31a882ef
KY
3422 if (strcmp(t, "dir") == 0 ||
3423 strcmp(t, "aufs") == 0 ||
3424 strcmp(t, "overlayfs") == 0 ||
3425 strcmp(t, "btrfs") == 0 ||
3426 strcmp(t, "loop") == 0)
a7ef8753
SH
3427 return true;
3428 return false;
3429}
3430
9be53773
SH
3431/*
3432 * If we're not snapshotting, then bdev_copy becomes a simple case of mount
3433 * the original, mount the new, and rsync the contents.
3434 */
1354955b
SH
3435struct bdev *bdev_copy(struct lxc_container *c0, const char *cname,
3436 const char *lxcpath, const char *bdevtype,
d659597e 3437 int flags, const char *bdevdata, uint64_t newsize,
dfb31b25 3438 int *needs_rdep)
9be53773
SH
3439{
3440 struct bdev *orig, *new;
3441 pid_t pid;
1354955b 3442 int ret;
0a83cbbb
SH
3443 bool snap = flags & LXC_CLONE_SNAPSHOT;
3444 bool maybe_snap = flags & LXC_CLONE_MAYBE_SNAPSHOT;
3445 bool keepbdevtype = flags & LXC_CLONE_KEEPBDEVTYPE;
1354955b
SH
3446 const char *src = c0->lxc_conf->rootfs.path;
3447 const char *oldname = c0->name;
3448 const char *oldpath = c0->config_path;
3449 struct rsync_data data;
9be53773
SH
3450
3451 /* if the container name doesn't show up in the rootfs path, then
3452 * we don't know how to come up with a new name
3453 */
3454 if (strstr(src, oldname) == NULL) {
3455 ERROR("original rootfs path %s doesn't include container name %s",
3456 src, oldname);
3457 return NULL;
3458 }
3459
ac00e8f2 3460 orig = bdev_init(c0->lxc_conf, src, NULL, NULL);
9be53773 3461 if (!orig) {
959aee9c 3462 ERROR("failed to detect blockdev type for %s", src);
9be53773
SH
3463 return NULL;
3464 }
3465
ac00e8f2
KY
3466 if (!orig->dest) {
3467 int ret;
730e3f9e
SH
3468 size_t len;
3469 struct stat sb;
3470
3471 len = strlen(oldpath) + strlen(oldname) + strlen("/rootfs") + 2;
3472 orig->dest = malloc(len);
ac00e8f2
KY
3473 if (!orig->dest) {
3474 ERROR("out of memory");
3475 bdev_put(orig);
3476 return NULL;
3477 }
730e3f9e
SH
3478 ret = snprintf(orig->dest, len, "%s/%s/rootfs", oldpath, oldname);
3479 if (ret < 0 || ret >= len) {
ac00e8f2
KY
3480 ERROR("rootfs path too long");
3481 bdev_put(orig);
3482 return NULL;
3483 }
730e3f9e
SH
3484 ret = stat(orig->dest, &sb);
3485 if (ret < 0 && errno == ENOENT)
3486 if (mkdir_p(orig->dest, 0755) < 0)
3487 WARN("Error creating '%s', continuing.", orig->dest);
ac00e8f2
KY
3488 }
3489
0a83cbbb
SH
3490 /*
3491 * special case for snapshot - if caller requested maybe_snapshot and
3492 * keepbdevtype and backing store is directory, then proceed with a copy
3493 * clone rather than returning error
3494 */
3495 if (maybe_snap && keepbdevtype && !bdevtype && !orig->ops->can_snapshot)
3496 snap = false;
3497
e3fdf5cc
SH
3498 /*
3499 * If newtype is NULL and snapshot is set, then use overlayfs
3500 */
0a83cbbb 3501 if (!bdevtype && !keepbdevtype && snap && strcmp(orig->type , "dir") == 0)
e3fdf5cc
SH
3502 bdevtype = "overlayfs";
3503
a7ef8753
SH
3504 if (am_unpriv() && !unpriv_snap_allowed(orig, bdevtype, snap, maybe_snap)) {
3505 ERROR("Unsupported snapshot type for unprivileged users");
3506 bdev_put(orig);
3507 return NULL;
3508 }
3509
dfb31b25 3510 *needs_rdep = 0;
e34b5d2e 3511 if (bdevtype && strcmp(orig->type, "dir") == 0 &&
1f92162d 3512 (strcmp(bdevtype, "aufs") == 0 ||
d8c4c595 3513 strcmp(bdevtype, "overlayfs") == 0)) {
dfb31b25 3514 *needs_rdep = 1;
d8c4c595
KY
3515 } else if (snap && strcmp(orig->type, "lvm") == 0 &&
3516 !lvm_is_thin_volume(orig->src)) {
3517 *needs_rdep = 1;
3518 }
dfb31b25 3519
9be53773
SH
3520 new = bdev_get(bdevtype ? bdevtype : orig->type);
3521 if (!new) {
3522 ERROR("no such block device type: %s", bdevtype ? bdevtype : orig->type);
3523 bdev_put(orig);
3524 return NULL;
3525 }
3526
25190e5b
SH
3527 if (new->ops->clone_paths(orig, new, oldname, cname, oldpath, lxcpath,
3528 snap, newsize, c0->lxc_conf) < 0) {
959aee9c 3529 ERROR("failed getting pathnames for cloned storage: %s", src);
65db0e5a 3530 goto err;
9be53773 3531 }
a7ef8753
SH
3532
3533 if (am_unpriv() && chown_mapped_root(new->src, c0->lxc_conf) < 0)
3534 WARN("Failed to update ownership of %s", new->dest);
3535
1354955b
SH
3536 if (snap)
3537 return new;
9be53773 3538
65db0e5a
ÇO
3539 /*
3540 * https://github.com/lxc/lxc/issues/131
3541 * Use btrfs snapshot feature instead of rsync to restore if both orig and new are btrfs
3542 */
3543 if (bdevtype &&
3544 strcmp(orig->type, "btrfs") == 0 && strcmp(new->type, "btrfs") == 0 &&
3545 btrfs_same_fs(orig->dest, new->dest) == 0) {
3546 if (btrfs_destroy(new) < 0) {
3547 ERROR("Error destroying %s subvolume", new->dest);
3548 goto err;
3549 }
3550 if (mkdir_p(new->dest, 0755) < 0) {
3551 ERROR("Error creating %s directory", new->dest);
3552 goto err;
3553 }
3554 if (btrfs_snapshot(orig->dest, new->dest) < 0) {
3555 ERROR("Error restoring %s to %s", orig->dest, new->dest);
3556 goto err;
3557 }
3558 bdev_put(orig);
3559 return new;
3560 }
3561
9be53773
SH
3562 pid = fork();
3563 if (pid < 0) {
3564 SYSERROR("fork");
65db0e5a 3565 goto err;
9be53773
SH
3566 }
3567
3568 if (pid > 0) {
3569 int ret = wait_for_pid(pid);
3570 bdev_put(orig);
3571 if (ret < 0) {
3572 bdev_put(new);
3573 return NULL;
3574 }
3575 return new;
3576 }
3577
1354955b
SH
3578 data.orig = orig;
3579 data.new = new;
3580 if (am_unpriv())
3581 ret = userns_exec_1(c0->lxc_conf, rsync_rootfs_wrapper, &data);
3582 else
3583 ret = rsync_rootfs(&data);
9be53773 3584
1354955b 3585 exit(ret == 0 ? 0 : 1);
65db0e5a
ÇO
3586
3587err:
3588 bdev_put(orig);
3589 bdev_put(new);
3590 return NULL;
9be53773 3591}
1897e3bc 3592
d44e88c2
SH
3593static struct bdev * do_bdev_create(const char *dest, const char *type,
3594 const char *cname, struct bdev_specs *specs)
3595{
3596 struct bdev *bdev = bdev_get(type);
3597 if (!bdev) {
3598 return NULL;
3599 }
3600
3601 if (bdev->ops->create(bdev, dest, cname, specs) < 0) {
3602 bdev_put(bdev);
3603 return NULL;
3604 }
3605
3606 return bdev;
3607}
3608
1897e3bc
SH
/*
 * bdev_create:
 * Create a backing store for a container.
 * If successful, return a struct bdev *, with the bdev mounted and ready
 * for use.  Before completing, the caller will need to call the
 * umount operation and bdev_put().
 *
 * @dest:  the mountpoint (i.e. /var/lib/lxc/$name/rootfs)
 * @type:  the bdevtype (dir, btrfs, zfs, etc).  Special values handled here:
 *           NULL    - same as "dir"
 *           "best"  - try btrfs, zfs, lvm, dir in that order
 *           "a,b,c" - try each comma-separated type in order
 * @cname: the container name
 * @specs: details about the backing store to create, like fstype
 *
 * Returns NULL when no attempted type could be created.
 */
struct bdev *bdev_create(const char *dest, const char *type,
			 const char *cname, struct bdev_specs *specs)
{
	struct bdev *bdev;
	char *best_options[] = {"btrfs", "zfs", "lvm", "dir", NULL};

	if (!type)
		return do_bdev_create(dest, "dir", cname, specs);

	if (strcmp(type, "best") == 0) {
		char **candidate;

		// try for the best backing store type, according to our
		// opinionated preferences
		for (candidate = best_options; *candidate; candidate++) {
			bdev = do_bdev_create(dest, *candidate, cname, specs);
			if (bdev)
				return bdev;
		}
		return NULL; // 'dir' should never fail, so this shouldn't happen
	}

	// -B lvm,dir
	if (strchr(type, ',') != NULL) {
		size_t sz = strlen(type) + 1;
		char *list = alloca(sz);
		char *state = NULL;
		char *entry;

		// strtok_r() writes into its input, so work on a private copy
		memcpy(list, type, sz);
		entry = strtok_r(list, ",", &state);
		while (entry) {
			bdev = do_bdev_create(dest, entry, cname, specs);
			if (bdev)
				return bdev;
			entry = strtok_r(NULL, ",", &state);
		}
	}

	// single type, or every entry of the list failed: try the string as is
	return do_bdev_create(dest, type, cname, specs);
}
3653
/*
 * Cut the string @p at its first ':' (if any) by overwriting that ':' with
 * a NUL byte, and return @p.  The buffer is modified in place; a string
 * without ':' is left untouched.
 */
char *overlay_getlower(char *p)
{
	char *colon;

	colon = strchr(p, ':');
	if (colon != NULL)
		colon[0] = '\0';

	return p;
}
35120d9c
SH
3661
3662bool rootfs_is_blockdev(struct lxc_conf *conf)
3663{
3664 const struct bdev_type *q;
3665 struct stat st;
3666 int ret;
3667
acf9f89e
SH
3668 if (!conf->rootfs.path || strcmp(conf->rootfs.path, "/") == 0 ||
3669 strlen(conf->rootfs.path) == 0)
3670 return false;
3671
35120d9c
SH
3672 ret = stat(conf->rootfs.path, &st);
3673 if (ret == 0 && S_ISBLK(st.st_mode))
3674 return true;
3675 q = bdev_query(conf->rootfs.path);
3676 if (!q)
3677 return false;
3678 if (strcmp(q->name, "lvm") == 0 ||
3679 strcmp(q->name, "loop") == 0 ||
3680 strcmp(q->name, "nbd") == 0)
3681 return true;
3682 return false;
3683}
339c6f1f
CB
3684
3685bool bdev_destroy(struct lxc_conf *conf)
3686{
3687 struct bdev *r;
3688 bool ret = false;
3689
3690 r = bdev_init(conf, conf->rootfs.path, conf->rootfs.mount, NULL);
3691 if (!r)
3692 return ret;
3693
3694 if (r->ops->destroy(r) == 0)
3695 ret = true;
3696 bdev_put(r);
3697
3698 return ret;
3699}
3700
/*
 * Callback-style wrapper around bdev_destroy().
 *
 * @data: a struct lxc_conf * passed as void *
 *
 * Switches gid and uid to 0 before destroying the backing store.  Failure
 * of setgid() or setuid() aborts with -1; failure to clear the
 * supplementary groups is only warned about.
 *
 * Returns 0 if bdev_destroy() succeeded, -1 otherwise.
 */
int bdev_destroy_wrapper(void *data)
{
	struct lxc_conf *conf = data;

	if (setgid(0) < 0) {
		ERROR("Failed to setgid to 0");
		return -1;
	}

	/* best effort: carry on even if the group list cannot be cleared */
	if (setgroups(0, NULL) < 0)
		WARN("Failed to clear groups");

	if (setuid(0) < 0) {
		ERROR("Failed to setuid to 0");
		return -1;
	}

	return bdev_destroy(conf) ? 0 : -1;
}
3720