]>
Commit | Line | Data |
---|---|---|
9be53773 SH |
1 | /* |
2 | * lxc: linux Container library | |
3 | * | |
4 | * (C) Copyright IBM Corp. 2007, 2008 | |
5 | * | |
6 | * Authors: | |
7 | * Daniel Lezcano <daniel.lezcano at free.fr> | |
8 | * | |
9 | * This library is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License as published by the Free Software Foundation; either | |
12 | * version 2.1 of the License, or (at your option) any later version. | |
13 | * | |
14 | * This library is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | * Lesser General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU Lesser General Public | |
20 | * License along with this library; if not, write to the Free Software | |
250b1eec | 21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
9be53773 SH |
22 | */ |
23 | ||
24 | /* | |
25 | * this is all just a first shot for experiment. If we go this route, much | |
ec64264d | 26 | * should change. bdev should be a directory with per-bdev file. Things which |
9be53773 SH |
27 | * I'm doing by calling out to userspace should sometimes be done through |
28 | * libraries like liblvm2 | |
29 | */ | |
30 | #define _GNU_SOURCE | |
31 | #include <stdio.h> | |
d659597e SA |
32 | #include <stdint.h> |
33 | #include <inttypes.h> | |
c476bdce SH |
34 | #include <sys/types.h> |
35 | #include <grp.h> | |
9be53773 SH |
36 | #include <unistd.h> |
37 | #include <errno.h> | |
38 | #include <sched.h> | |
39 | #include <sys/mount.h> | |
40 | #include <sys/wait.h> | |
41 | #include <libgen.h> | |
eddaaafd SH |
42 | #include <linux/loop.h> |
43 | #include <dirent.h> | |
76a26f55 | 44 | #include <sys/prctl.h> |
f2363e38 | 45 | |
9be53773 SH |
46 | #include "lxc.h" |
47 | #include "config.h" | |
48 | #include "conf.h" | |
49 | #include "bdev.h" | |
50 | #include "log.h" | |
51 | #include "error.h" | |
52 | #include "utils.h" | |
53 | #include "namespace.h" | |
54 | #include "parse.h" | |
95ee490b | 55 | #include "lxclock.h" |
ff462013 | 56 | #include "lxc-btrfs.h" |
9be53773 | 57 | |
bff13ba2 SG |
58 | #ifndef BLKGETSIZE64 |
59 | #define BLKGETSIZE64 _IOR(0x12,114,size_t) | |
60 | #endif | |
61 | ||
62 | #ifndef LO_FLAGS_AUTOCLEAR | |
63 | #define LO_FLAGS_AUTOCLEAR 4 | |
64 | #endif | |
65 | ||
f5fd66f7 SG |
66 | #ifndef LOOP_CTL_GET_FREE |
67 | #define LOOP_CTL_GET_FREE 0x4C82 | |
68 | #endif | |
69 | ||
d659597e SA |
70 | #define DEFAULT_FS_SIZE 1073741824 |
71 | #define DEFAULT_FSTYPE "ext3" | |
72 | ||
9be53773 SH |
73 | lxc_log_define(bdev, lxc); |
74 | ||
186bef00 SH |
/*
 * Pair of backing stores: the original one and the new one.
 * NOTE(review): presumably the argument block for an overlay rsync helper
 * defined further down the file -- confirm against its user.
 */
struct ovl_rsync_data {
	struct bdev *orig;	/* backing store to copy from */
	struct bdev *new;	/* backing store to copy to */
};
79 | ||
2659c7cb SH |
/*
 * Source and destination of a copy, both given as strings.
 * NOTE(review): presumably paths handed to an rsync wrapper -- confirm
 * against its user.
 */
struct rsync_data_char {
	char *src;	/* where to copy from */
	char *dest;	/* where to copy to */
};
84 | ||
9be53773 SH |
/*
 * Copy @src to @dest by running "rsync -aHX --delete <src>/ <dest>" in a
 * forked child.
 *
 * Returns -1 if fork() fails, otherwise whatever wait_for_pid() reports for
 * the child.
 */
static int do_rsync(const char *src, const char *dest)
{
	size_t srclen;
	char *srcdir;
	pid_t child;

	child = fork();
	if (child < 0)
		return -1;
	if (child != 0)
		return wait_for_pid(child);

	/* child: hand rsync "<src>/" (source path plus a trailing slash) */
	srclen = strlen(src);
	srcdir = malloc(srclen + 2);
	if (srcdir == NULL)
		exit(1);
	memcpy(srcdir, src, srclen);
	srcdir[srclen] = '/';
	srcdir[srclen + 1] = '\0';

	execlp("rsync", "rsync", "-aHX", "--delete", srcdir, dest, (char *)NULL);
	/* only reached when the exec itself failed */
	exit(1);
}
109 | ||
eddaaafd | 110 | /* |
d659597e | 111 | * return block size of dev->src in units of bytes |
eddaaafd | 112 | */ |
d659597e | 113 | static int blk_getsize(struct bdev *bdev, uint64_t *size) |
9be53773 SH |
114 | { |
115 | int fd, ret; | |
eddaaafd SH |
116 | char *path = bdev->src; |
117 | ||
118 | if (strcmp(bdev->type, "loop") == 0) | |
119 | path = bdev->src + 5; | |
9be53773 SH |
120 | |
121 | fd = open(path, O_RDONLY); | |
42fb4b15 | 122 | if (fd < 0) |
9be53773 | 123 | return -1; |
d659597e SA |
124 | |
125 | ret = ioctl(fd, BLKGETSIZE64, size); // size of device in bytes | |
9be53773 SH |
126 | close(fd); |
127 | return ret; | |
128 | } | |
129 | ||
/*
 * find_fstype_cb() and mount_unknown_fs() are copied from conf.c. However, as
 * conf.c will be moved to using the callback system, they can be pulled from
 * there eventually, so we don't need to pollute utils.c with these low-level
 * functions.
 */

/*
 * Line callback used with lxc_file_for_each_line(): treat @buffer as one
 * line of a filesystem list and try to mount the rootfs with that type.
 *
 * @buffer: one line of the list; trimmed in place to obtain the fstype
 * @data:   pointer to a struct holding rootfs, target and mount options
 *
 * Returns 1 once a mount succeeded, 0 when this line was skipped or the
 * mount attempt failed.
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		const char *options;
	} *cbarg = data;

	/*
	 * Start from known values so the cleanup below never frees an
	 * indeterminate pointer if parse_mntopts() bails out early.
	 */
	unsigned long mntflags = 0;
	char *mntdata = NULL;
	char *fstype;
	int mounted = 0;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev"))
		return 0;

	/* strip the surrounding characters the *_gc helpers report */
	fstype = buffer;
	fstype += lxc_char_left_gc(fstype, strlen(fstype));
	fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      cbarg->rootfs, cbarg->target, fstype);

	if (parse_mntopts(cbarg->options, &mntflags, &mntdata) < 0)
		goto out;

	if (mount(cbarg->rootfs, cbarg->target, fstype, mntflags, mntdata)) {
		DEBUG("mount failed with error: %s", strerror(errno));
		goto out;
	}

	INFO("mounted '%s' on '%s', with fstype '%s'",
	     cbarg->rootfs, cbarg->target, fstype);
	mounted = 1;

out:
	free(mntdata);
	return mounted;
}
176 | ||
a17b1e65 SG |
177 | static int mount_unknown_fs(const char *rootfs, const char *target, |
178 | const char *options) | |
9be53773 SH |
179 | { |
180 | int i; | |
181 | ||
182 | struct cbarg { | |
183 | const char *rootfs; | |
184 | const char *target; | |
a17b1e65 | 185 | const char *options; |
9be53773 SH |
186 | } cbarg = { |
187 | .rootfs = rootfs, | |
188 | .target = target, | |
a17b1e65 | 189 | .options = options, |
9be53773 SH |
190 | }; |
191 | ||
192 | /* | |
193 | * find the filesystem type with brute force: | |
194 | * first we check with /etc/filesystems, in case the modules | |
195 | * are auto-loaded and fall back to the supported kernel fs | |
196 | */ | |
197 | char *fsfile[] = { | |
198 | "/etc/filesystems", | |
199 | "/proc/filesystems", | |
200 | }; | |
201 | ||
202 | for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) { | |
203 | ||
204 | int ret; | |
205 | ||
206 | if (access(fsfile[i], F_OK)) | |
207 | continue; | |
208 | ||
209 | ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg); | |
210 | if (ret < 0) { | |
211 | ERROR("failed to parse '%s'", fsfile[i]); | |
212 | return -1; | |
213 | } | |
214 | ||
215 | if (ret) | |
216 | return 0; | |
217 | } | |
218 | ||
219 | ERROR("failed to determine fs type for '%s'", rootfs); | |
220 | return -1; | |
221 | } | |
222 | ||
/*
 * Create a filesystem of type @fstype on @path by running
 * "mkfs -t <fstype> <path>" in a forked child.
 *
 * Returns -1 if fork() fails, otherwise whatever wait_for_pid() reports for
 * the child.
 */
static int do_mkfs(const char *path, const char *fstype)
{
	pid_t pid;

	pid = fork();
	if (pid < 0) {
		ERROR("error forking");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	// If the file is not a block device, we don't want mkfs to ask
	// us about whether to proceed.
	if (null_stdfds() < 0)
		exit(1);

	/* the variadic argument list must end with a null *pointer* */
	execlp("mkfs", "mkfs", "-t", fstype, path, (char *)NULL);
	exit(1);
}
241 | ||
/*
 * Return the name @path refers to: @path itself when it is not reported as
 * a symbolic link, otherwise the link target, written to @dest.
 *
 * @dest must have room for MAXPATHLEN bytes.
 * Returns NULL if @path cannot be stat'ed or the link cannot be read.
 *
 * NOTE(review): stat() follows symbolic links, so S_ISLNK() on its result is
 * not expected to be true and the readlink() branch looks unreachable;
 * lstat() may have been intended. detect_fs() compares the returned string
 * with the device column of /proc/self/mounts, so verify that caller before
 * changing this.
 */
static char *linkderef(char *path, char *dest)
{
	struct stat st;
	ssize_t n;

	n = stat(path, &st);
	if (n < 0)
		return NULL;

	if (!S_ISLNK(st.st_mode))
		return path;

	n = readlink(path, dest, MAXPATHLEN);
	if (n < 0) {
		SYSERROR("error reading link %s", path);
		return NULL;
	}
	if (n >= MAXPATHLEN) {
		ERROR("link in %s too long", path);
		return NULL;
	}

	dest[n] = '\0';
	return dest;
}
263 | ||
264 | /* | |
265 | * Given a bdev (presumably blockdev-based), detect the fstype | |
266 | * by trying mounting (in a private mntns) it. | |
267 | * @bdev: bdev to investigate | |
268 | * @type: preallocated char* in which to write the fstype | |
269 | * @len: length of passed in char* | |
270 | * Returns length of fstype, of -1 on error | |
271 | */ | |
272 | static int detect_fs(struct bdev *bdev, char *type, int len) | |
273 | { | |
274 | int p[2], ret; | |
275 | size_t linelen; | |
276 | pid_t pid; | |
277 | FILE *f; | |
278 | char *sp1, *sp2, *sp3, *line = NULL; | |
5d9598d7 | 279 | char *srcdev; |
9be53773 SH |
280 | |
281 | if (!bdev || !bdev->src || !bdev->dest) | |
282 | return -1; | |
283 | ||
5d9598d7 | 284 | srcdev = bdev->src; |
eddaaafd SH |
285 | if (strcmp(bdev->type, "loop") == 0) |
286 | srcdev = bdev->src + 5; | |
287 | ||
025ed0f3 | 288 | ret = pipe(p); |
025ed0f3 | 289 | if (ret < 0) |
9be53773 SH |
290 | return -1; |
291 | if ((pid = fork()) < 0) | |
292 | return -1; | |
293 | if (pid > 0) { | |
294 | int status; | |
295 | close(p[1]); | |
296 | memset(type, 0, len); | |
297 | ret = read(p[0], type, len-1); | |
298 | close(p[0]); | |
299 | if (ret < 0) { | |
300 | SYSERROR("error reading from pipe"); | |
301 | wait(&status); | |
302 | return -1; | |
303 | } else if (ret == 0) { | |
304 | ERROR("child exited early - fstype not found"); | |
305 | wait(&status); | |
306 | return -1; | |
307 | } | |
308 | wait(&status); | |
309 | type[len-1] = '\0'; | |
eddaaafd | 310 | INFO("detected fstype %s for %s", type, srcdev); |
9be53773 SH |
311 | return ret; |
312 | } | |
313 | ||
314 | if (unshare(CLONE_NEWNS) < 0) | |
315 | exit(1); | |
316 | ||
2c6f3fc9 SH |
317 | if (detect_shared_rootfs()) { |
318 | if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) { | |
319 | SYSERROR("Failed to make / rslave"); | |
320 | ERROR("Continuing..."); | |
321 | } | |
322 | } | |
323 | ||
a17b1e65 | 324 | ret = mount_unknown_fs(srcdev, bdev->dest, bdev->mntopts); |
9be53773 | 325 | if (ret < 0) { |
eddaaafd | 326 | ERROR("failed mounting %s onto %s to detect fstype", srcdev, bdev->dest); |
9be53773 SH |
327 | exit(1); |
328 | } | |
329 | // if symlink, get the real dev name | |
330 | char devpath[MAXPATHLEN]; | |
eddaaafd | 331 | char *l = linkderef(srcdev, devpath); |
9be53773 SH |
332 | if (!l) |
333 | exit(1); | |
334 | f = fopen("/proc/self/mounts", "r"); | |
335 | if (!f) | |
336 | exit(1); | |
337 | while (getline(&line, &linelen, f) != -1) { | |
46cd2845 | 338 | sp1 = strchr(line, ' '); |
9be53773 SH |
339 | if (!sp1) |
340 | exit(1); | |
341 | *sp1 = '\0'; | |
342 | if (strcmp(line, l)) | |
343 | continue; | |
46cd2845 | 344 | sp2 = strchr(sp1+1, ' '); |
9be53773 SH |
345 | if (!sp2) |
346 | exit(1); | |
347 | *sp2 = '\0'; | |
46cd2845 | 348 | sp3 = strchr(sp2+1, ' '); |
9be53773 SH |
349 | if (!sp3) |
350 | exit(1); | |
351 | *sp3 = '\0'; | |
352 | sp2++; | |
353 | if (write(p[1], sp2, strlen(sp2)) != strlen(sp2)) | |
354 | exit(1); | |
355 | exit(0); | |
356 | } | |
357 | exit(1); | |
358 | } | |
359 | ||
/* Associates a backing store type name with its operations table. */
struct bdev_type {
	const char *name;		/* type name */
	const struct bdev_ops *ops;	/* operations for that type */
};
364 | ||
9be53773 SH |
/*
 * Decide whether @path names a plain-directory backing store.
 *
 * Returns 1 if @path starts with "dir:" or is_dir() accepts it, else 0.
 */
static int dir_detect(const char *path)
{
	/* an explicit "dir:" prefix is trusted without looking at the path */
	if (!strncmp(path, "dir:", 4))
		return 1;

	return is_dir(path) ? 1 : 0;
}
373 | ||
374 | // | |
375 | // XXXXXXX plain directory bind mount ops | |
376 | // | |
60bf62d4 | 377 | static int dir_mount(struct bdev *bdev) |
9be53773 | 378 | { |
a17b1e65 SG |
379 | unsigned long mntflags; |
380 | char *mntdata; | |
381 | int ret; | |
382 | ||
9be53773 SH |
383 | if (strcmp(bdev->type, "dir")) |
384 | return -22; | |
385 | if (!bdev->src || !bdev->dest) | |
386 | return -22; | |
a17b1e65 SG |
387 | |
388 | if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) { | |
389 | free(mntdata); | |
390 | return -22; | |
391 | } | |
392 | ||
393 | ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata); | |
394 | free(mntdata); | |
395 | return ret; | |
9be53773 SH |
396 | } |
397 | ||
60bf62d4 | 398 | static int dir_umount(struct bdev *bdev) |
9be53773 SH |
399 | { |
400 | if (strcmp(bdev->type, "dir")) | |
401 | return -22; | |
402 | if (!bdev->src || !bdev->dest) | |
403 | return -22; | |
404 | return umount(bdev->dest); | |
405 | } | |
406 | ||
/*
 * Build the path a clone should use in place of @src.
 *
 * If @src starts with @oldpath, that prefix is replaced by @lxcpath. In the
 * remainder (or in all of @src when the prefix does not match) every
 * occurrence of @oldname is replaced by @name. An empty @oldname leaves the
 * remainder untouched instead of matching at every position.
 *
 * Returns a newly malloc()ed string the caller must free, or NULL if the
 * allocation fails.
 *
 * (the bulk of this needs to become a common helper)
 */
static char *dir_new_path(char *src, const char *oldname, const char *name,
		const char *oldpath, const char *lxcpath)
{
	const size_t oldpathlen = strlen(oldpath);
	const size_t oldnamelen = strlen(oldname);
	const size_t namelen = strlen(name);
	const size_t lxcpathlen = strlen(lxcpath);
	const int has_prefix = strncmp(src, oldpath, oldpathlen) == 0;
	/* part of src in which oldname is searched */
	const char *rest = has_prefix ? src + oldpathlen : src;
	const char *scan, *hit;
	size_t hits = 0, outlen;
	char *ret, *out;

	/* first pass: count the replacements to size the result exactly */
	if (oldnamelen > 0) {
		for (scan = rest; (hit = strstr(scan, oldname)) != NULL;
		     scan = hit + oldnamelen)
			hits++;
	}

	outlen = strlen(rest) - hits * oldnamelen + hits * namelen;
	if (has_prefix)
		outlen += lxcpathlen;

	ret = malloc(outlen + 1);
	if (!ret)
		return NULL;

	out = ret;
	if (has_prefix) {
		memcpy(out, lxcpath, lxcpathlen);
		out += lxcpathlen;
	}

	/* second pass: copy, swapping each oldname for name */
	scan = rest;
	if (oldnamelen > 0) {
		while ((hit = strstr(scan, oldname)) != NULL) {
			size_t gap = (size_t)(hit - scan);

			memcpy(out, scan, gap);		/* text up to oldname */
			out += gap;
			memcpy(out, name, namelen);	/* new name in its place */
			out += namelen;
			scan = hit + oldnamelen;	/* continue after oldname */
		}
	}
	strcpy(out, scan);	/* the rest of src, including the NUL */

	return ret;
}
448 | ||
449 | /* | |
450 | * for a simple directory bind mount, we substitute the old container | |
451 | * name and paths for the new | |
452 | */ | |
453 | static int dir_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname, | |
454 | const char *cname, const char *oldpath, const char *lxcpath, int snap, | |
25190e5b | 455 | uint64_t newsize, struct lxc_conf *conf) |
9be53773 | 456 | { |
ca52dcb5 SH |
457 | int len, ret; |
458 | ||
9be53773 | 459 | if (snap) { |
1f92162d | 460 | ERROR("directories cannot be snapshotted. Try aufs or overlayfs."); |
9be53773 SH |
461 | return -1; |
462 | } | |
463 | ||
9be53773 SH |
464 | if (!orig->dest || !orig->src) |
465 | return -1; | |
9be53773 | 466 | |
ca52dcb5 SH |
467 | len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3; |
468 | new->src = malloc(len); | |
9be53773 SH |
469 | if (!new->src) |
470 | return -1; | |
ca52dcb5 SH |
471 | ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname); |
472 | if (ret < 0 || ret >= len) | |
473 | return -1; | |
474 | if ((new->dest = strdup(new->src)) == NULL) | |
475 | return -1; | |
9be53773 SH |
476 | |
477 | return 0; | |
478 | } | |
479 | ||
60bf62d4 SH |
480 | static int dir_destroy(struct bdev *orig) |
481 | { | |
18aa217b | 482 | if (lxc_rmdir_onedev(orig->src, NULL) < 0) |
60bf62d4 SH |
483 | return -1; |
484 | return 0; | |
485 | } | |
486 | ||
1897e3bc SH |
487 | static int dir_create(struct bdev *bdev, const char *dest, const char *n, |
488 | struct bdev_specs *specs) | |
489 | { | |
5292adfd | 490 | if (specs && specs->dir) |
7bb87886 SH |
491 | bdev->src = strdup(specs->dir); |
492 | else | |
493 | bdev->src = strdup(dest); | |
1897e3bc SH |
494 | bdev->dest = strdup(dest); |
495 | if (!bdev->src || !bdev->dest) { | |
496 | ERROR("Out of memory"); | |
497 | return -1; | |
498 | } | |
499 | ||
500 | if (mkdir_p(bdev->src, 0755) < 0) { | |
959aee9c | 501 | ERROR("Error creating %s", bdev->src); |
1897e3bc SH |
502 | return -1; |
503 | } | |
504 | if (mkdir_p(bdev->dest, 0755) < 0) { | |
959aee9c | 505 | ERROR("Error creating %s", bdev->dest); |
1897e3bc SH |
506 | return -1; |
507 | } | |
508 | ||
509 | return 0; | |
510 | } | |
511 | ||
74a3920a | 512 | static const struct bdev_ops dir_ops = { |
9be53773 SH |
513 | .detect = &dir_detect, |
514 | .mount = &dir_mount, | |
515 | .umount = &dir_umount, | |
516 | .clone_paths = &dir_clonepaths, | |
60bf62d4 | 517 | .destroy = &dir_destroy, |
1897e3bc | 518 | .create = &dir_create, |
0a83cbbb | 519 | .can_snapshot = false, |
cdd01be2 | 520 | .can_backup = true, |
9be53773 SH |
521 | }; |
522 | ||
3baa76fe SH |
523 | |
524 | // | |
525 | // XXXXXXX zfs ops | |
526 | // There are two ways we could do this. We could always specify the | |
527 | // 'zfs device' (i.e. tank/lxc lxc/container) as rootfs. But instead | |
528 | // (at least right now) we have lxc-create specify $lxcpath/$lxcname/rootfs | |
529 | // as the mountpoint, so that it is always mounted. | |
530 | // | |
531 | // That means 'mount' is really never needed and could be noop, but for the | |
532 | // sake of flexibility let's always bind-mount. | |
533 | // | |
534 | ||
60bf62d4 | 535 | static int zfs_list_entry(const char *path, char *output, size_t inlen) |
3baa76fe | 536 | { |
ebec9176 | 537 | struct lxc_popen_FILE *f; |
3baa76fe SH |
538 | int found=0; |
539 | ||
ebec9176 | 540 | f = lxc_popen("zfs list 2> /dev/null"); |
025ed0f3 | 541 | if (f == NULL) { |
3baa76fe SH |
542 | SYSERROR("popen failed"); |
543 | return 0; | |
544 | } | |
ebec9176 | 545 | while (fgets(output, inlen, f->f)) { |
3baa76fe SH |
546 | if (strstr(output, path)) { |
547 | found = 1; | |
548 | break; | |
549 | } | |
550 | } | |
ebec9176 | 551 | (void) lxc_pclose(f); |
3baa76fe SH |
552 | |
553 | return found; | |
554 | } | |
555 | ||
556 | static int zfs_detect(const char *path) | |
557 | { | |
558 | char *output = malloc(LXC_LOG_BUFFER_SIZE); | |
559 | int found; | |
560 | ||
561 | if (!output) { | |
562 | ERROR("out of memory"); | |
563 | return 0; | |
564 | } | |
60bf62d4 | 565 | found = zfs_list_entry(path, output, LXC_LOG_BUFFER_SIZE); |
3baa76fe SH |
566 | free(output); |
567 | return found; | |
568 | } | |
569 | ||
60bf62d4 | 570 | static int zfs_mount(struct bdev *bdev) |
3baa76fe | 571 | { |
a17b1e65 SG |
572 | unsigned long mntflags; |
573 | char *mntdata; | |
574 | int ret; | |
575 | ||
3baa76fe SH |
576 | if (strcmp(bdev->type, "zfs")) |
577 | return -22; | |
578 | if (!bdev->src || !bdev->dest) | |
579 | return -22; | |
a17b1e65 SG |
580 | |
581 | if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) { | |
582 | free(mntdata); | |
583 | return -22; | |
584 | } | |
585 | ||
586 | ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata); | |
587 | free(mntdata); | |
588 | return ret; | |
3baa76fe SH |
589 | } |
590 | ||
60bf62d4 | 591 | static int zfs_umount(struct bdev *bdev) |
3baa76fe SH |
592 | { |
593 | if (strcmp(bdev->type, "zfs")) | |
594 | return -22; | |
595 | if (!bdev->src || !bdev->dest) | |
596 | return -22; | |
597 | return umount(bdev->dest); | |
598 | } | |
599 | ||
/*
 * Create the zfs dataset for a cloned container.
 *
 * The zfs root is taken from the "zfs list" line matching @opath (its first
 * column with the last '/' component removed); if no line matches, the
 * global "lxc.bdev.zfs.root" setting is used.
 *
 * Without @snapshot:  zfs create -omountpoint=<lxcpath>/<nname>/rootfs
 *                                <zfsroot>/<nname>
 * With @snapshot:     zfs destroy  <zfsroot>/<oname>@<nname>  (may fail)
 *                     zfs snapshot <zfsroot>/<oname>@<nname>
 *                     zfs clone -omountpoint=... <zfsroot>/<oname>@<nname>
 *                               <zfsroot>/<nname>
 *
 * @npath is not used here.
 *
 * Returns -1 on a local error, otherwise the wait_for_pid() result of the
 * last command run.
 */
static int zfs_clone(const char *opath, const char *npath, const char *oname,
			const char *nname, const char *lxcpath, int snapshot)
{
	// use the 'zfs list | grep opath' entry to get the zfsroot
	char output[MAXPATHLEN], option[MAXPATHLEN], *p;
	char path1[MAXPATHLEN], path2[MAXPATHLEN];
	const char *zfsroot = output;
	int ret;
	pid_t pid;

	if (zfs_list_entry(opath, output, MAXPATHLEN)) {
		// zfsroot is output up to ' '
		p = strchr(output, ' ');
		if (p == NULL)
			return -1;
		*p = '\0';
		// ... minus the last path component
		p = strrchr(output, '/');
		if (p == NULL)
			return -1;
		*p = '\0';
	} else {
		zfsroot = lxc_global_config_value("lxc.bdev.zfs.root");
		if (!zfsroot) {
			ERROR("no zfs root available for %s", opath);
			return -1;
		}
	}

	ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s/%s/rootfs",
		lxcpath, nname);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	/* dataset of the new container; needed by both variants */
	ret = snprintf(path2, MAXPATHLEN, "%s/%s", zfsroot, nname);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	// zfs create -omountpoint=$lxcpath/$lxcname $zfsroot/$nname
	if (!snapshot) {
		pid = fork();
		if (pid < 0)
			return -1;
		if (!pid) {
			execlp("zfs", "zfs", "create", option, path2, (char *)NULL);
			exit(1);
		}
		return wait_for_pid(pid);
	}

	// if snapshot, do
	// 'zfs snapshot zfsroot/oname@nname
	// zfs clone zfsroot/oname@nname zfsroot/nname
	ret = snprintf(path1, MAXPATHLEN, "%s/%s@%s", zfsroot, oname, nname);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	// if the snapshot exists, delete it
	pid = fork();
	if (pid < 0)
		return -1;
	if (!pid) {
		execlp("zfs", "zfs", "destroy", path1, (char *)NULL);
		exit(1);
	}
	// it probably doesn't exist so destroy probably will fail.
	(void) wait_for_pid(pid);

	// run first (snapshot) command
	pid = fork();
	if (pid < 0)
		return -1;
	if (!pid) {
		execlp("zfs", "zfs", "snapshot", path1, (char *)NULL);
		exit(1);
	}
	if (wait_for_pid(pid) < 0)
		return -1;

	// run second (clone) command
	pid = fork();
	if (pid < 0)
		return -1;
	if (!pid) {
		execlp("zfs", "zfs", "clone", option, path1, path2, (char *)NULL);
		exit(1);
	}
	return wait_for_pid(pid);
}
681 | ||
682 | static int zfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname, | |
683 | const char *cname, const char *oldpath, const char *lxcpath, int snap, | |
25190e5b | 684 | uint64_t newsize, struct lxc_conf *conf) |
3baa76fe | 685 | { |
ca52dcb5 SH |
686 | int len, ret; |
687 | ||
3baa76fe SH |
688 | if (!orig->src || !orig->dest) |
689 | return -1; | |
690 | ||
ca52dcb5 SH |
691 | if (snap && strcmp(orig->type, "zfs")) { |
692 | ERROR("zfs snapshot from %s backing store is not supported", | |
3baa76fe SH |
693 | orig->type); |
694 | return -1; | |
695 | } | |
696 | ||
ca52dcb5 SH |
697 | len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3; |
698 | new->src = malloc(len); | |
3baa76fe SH |
699 | if (!new->src) |
700 | return -1; | |
ca52dcb5 SH |
701 | ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname); |
702 | if (ret < 0 || ret >= len) | |
703 | return -1; | |
704 | if ((new->dest = strdup(new->src)) == NULL) | |
705 | return -1; | |
3baa76fe SH |
706 | |
707 | return zfs_clone(orig->src, new->src, oldname, cname, lxcpath, snap); | |
708 | } | |
709 | ||
60bf62d4 SH |
710 | /* |
711 | * TODO: detect whether this was a clone, and if so then also delete the | |
712 | * snapshot it was based on, so that we don't hold the original | |
713 | * container busy. | |
714 | */ | |
715 | static int zfs_destroy(struct bdev *orig) | |
716 | { | |
717 | pid_t pid; | |
718 | char output[MAXPATHLEN], *p; | |
719 | ||
720 | if ((pid = fork()) < 0) | |
721 | return -1; | |
722 | if (pid) | |
723 | return wait_for_pid(pid); | |
724 | ||
725 | if (!zfs_list_entry(orig->src, output, MAXPATHLEN)) { | |
726 | ERROR("Error: zfs entry for %s not found", orig->src); | |
727 | return -1; | |
728 | } | |
729 | ||
730 | // zfs mount is output up to ' ' | |
46cd2845 | 731 | if ((p = strchr(output, ' ')) == NULL) |
60bf62d4 SH |
732 | return -1; |
733 | *p = '\0'; | |
734 | ||
735 | execlp("zfs", "zfs", "destroy", output, NULL); | |
736 | exit(1); | |
737 | } | |
738 | ||
1897e3bc SH |
739 | static int zfs_create(struct bdev *bdev, const char *dest, const char *n, |
740 | struct bdev_specs *specs) | |
741 | { | |
742 | const char *zfsroot; | |
743 | char option[MAXPATHLEN]; | |
744 | int ret; | |
745 | pid_t pid; | |
746 | ||
72e99249 | 747 | if (!specs || !specs->zfs.zfsroot) |
2e59ba02 | 748 | zfsroot = lxc_global_config_value("lxc.bdev.zfs.root"); |
1897e3bc | 749 | else |
72e99249 | 750 | zfsroot = specs->zfs.zfsroot; |
1897e3bc SH |
751 | |
752 | if (!(bdev->dest = strdup(dest))) { | |
753 | ERROR("No mount target specified or out of memory"); | |
754 | return -1; | |
755 | } | |
756 | if (!(bdev->src = strdup(bdev->dest))) { | |
757 | ERROR("out of memory"); | |
758 | return -1; | |
759 | } | |
760 | ||
761 | ret = snprintf(option, MAXPATHLEN, "-omountpoint=%s", bdev->dest); | |
762 | if (ret < 0 || ret >= MAXPATHLEN) | |
763 | return -1; | |
764 | if ((pid = fork()) < 0) | |
765 | return -1; | |
766 | if (pid) | |
767 | return wait_for_pid(pid); | |
768 | ||
769 | char dev[MAXPATHLEN]; | |
770 | ret = snprintf(dev, MAXPATHLEN, "%s/%s", zfsroot, n); | |
771 | if (ret < 0 || ret >= MAXPATHLEN) | |
772 | exit(1); | |
773 | execlp("zfs", "zfs", "create", option, dev, NULL); | |
774 | exit(1); | |
775 | } | |
776 | ||
74a3920a | 777 | static const struct bdev_ops zfs_ops = { |
3baa76fe SH |
778 | .detect = &zfs_detect, |
779 | .mount = &zfs_mount, | |
780 | .umount = &zfs_umount, | |
781 | .clone_paths = &zfs_clonepaths, | |
60bf62d4 | 782 | .destroy = &zfs_destroy, |
1897e3bc | 783 | .create = &zfs_create, |
0a83cbbb | 784 | .can_snapshot = true, |
cdd01be2 | 785 | .can_backup = true, |
3baa76fe SH |
786 | }; |
787 | ||
9be53773 SH |
788 | // |
789 | // LVM ops | |
790 | // | |
791 | ||
/*
 * Decide whether @path is an lvm2 logical volume.
 *
 * A path starting with "lvm:" is accepted as is. Otherwise @path must be a
 * block device whose /sys/dev/block/<maj>:<min>/dm/uuid starts with the
 * hardcoded LVM prefix "LVM-".
 *
 * Returns 1 if so, 0 otherwise (including on any lookup error).
 */
static int lvm_detect(const char *path)
{
	char uuidpath[MAXPATHLEN], prefix[4];
	struct stat st;
	FILE *uuidfile;
	int n;

	if (!strncmp(path, "lvm:", 4))
		return 1; // take their word for it

	if (stat(path, &st) != 0)
		return 0;
	if (!S_ISBLK(st.st_mode))
		return 0;

	n = snprintf(uuidpath, MAXPATHLEN, "/sys/dev/block/%d:%d/dm/uuid",
			major(st.st_rdev), minor(st.st_rdev));
	if (n < 0 || n >= MAXPATHLEN) {
		ERROR("lvm uuid pathname too long");
		return 0;
	}

	uuidfile = fopen(uuidpath, "r");
	if (uuidfile == NULL)
		return 0;

	/* only the first four bytes of the uuid matter */
	n = fread(prefix, 1, 4, uuidfile);
	fclose(uuidfile);

	if (n == 4 && strncmp(prefix, "LVM-", 4) == 0)
		return 1;

	return 0;
}
827 | ||
828 | static int lvm_mount(struct bdev *bdev) | |
829 | { | |
830 | if (strcmp(bdev->type, "lvm")) | |
831 | return -22; | |
832 | if (!bdev->src || !bdev->dest) | |
833 | return -22; | |
834 | /* if we might pass in data sometime, then we'll have to enrich | |
8ddf877b | 835 | * mount_unknown_fs */ |
a17b1e65 | 836 | return mount_unknown_fs(bdev->src, bdev->dest, bdev->mntopts); |
9be53773 SH |
837 | } |
838 | ||
839 | static int lvm_umount(struct bdev *bdev) | |
840 | { | |
841 | if (strcmp(bdev->type, "lvm")) | |
842 | return -22; | |
843 | if (!bdev->src || !bdev->dest) | |
844 | return -22; | |
845 | return umount(bdev->dest); | |
846 | } | |
847 | ||
055af165 | 848 | static int lvm_compare_lv_attr(const char *path, int pos, const char expected) { |
ebec9176 | 849 | struct lxc_popen_FILE *f; |
8aba14bb | 850 | int ret, len, status, start=0; |
f99c386b SS |
851 | char *cmd, output[12]; |
852 | const char *lvscmd = "lvs --unbuffered --noheadings -o lv_attr %s 2>/dev/null"; | |
853 | ||
854 | len = strlen(lvscmd) + strlen(path) - 1; | |
55a204f9 | 855 | cmd = alloca(len); |
f99c386b SS |
856 | |
857 | ret = snprintf(cmd, len, lvscmd, path); | |
858 | if (ret < 0 || ret >= len) | |
859 | return -1; | |
860 | ||
ebec9176 | 861 | f = lxc_popen(cmd); |
f99c386b SS |
862 | |
863 | if (f == NULL) { | |
864 | SYSERROR("popen failed"); | |
865 | return -1; | |
866 | } | |
867 | ||
ebec9176 | 868 | ret = fgets(output, 12, f->f) == NULL; |
f99c386b | 869 | |
ebec9176 | 870 | status = lxc_pclose(f); |
f99c386b | 871 | |
8aba14bb SS |
872 | if (ret || WEXITSTATUS(status)) |
873 | // Assume either vg or lvs do not exist, default | |
874 | // comparison to false. | |
875 | return 0; | |
f99c386b SS |
876 | |
877 | len = strlen(output); | |
878 | while(start < len && output[start] == ' ') start++; | |
879 | ||
055af165 | 880 | if (start + pos < len && output[start + pos] == expected) |
f99c386b SS |
881 | return 1; |
882 | ||
883 | return 0; | |
884 | } | |
885 | ||
055af165 SS |
/*
 * Report whether @path is a thin volume: character 6 of its lv_attr string
 * must be 't'. Returns the result of lvm_compare_lv_attr() (1, 0 or -1).
 */
static int lvm_is_thin_volume(const char *path)
{
	const int attr_index = 6;

	return lvm_compare_lv_attr(path, attr_index, 't');
}
890 | ||
/*
 * Report whether @path is a thin pool: character 0 of its lv_attr string
 * must be 't'. Returns the result of lvm_compare_lv_attr() (1, 0 or -1).
 */
static int lvm_is_thin_pool(const char *path)
{
	const int attr_index = 0;

	return lvm_compare_lv_attr(path, attr_index, 't');
}
895 | ||
/*
 * path must be '/dev/$vg/$lv', $vg must be an existing VG, and $lv must not
 * yet exist. This function will attempt to create /dev/$vg/$lv of size
 * $size. If thinpool is specified, we'll check for its existence and whether
 * it is a valid thin pool, and if so, we'll create the requested lv from
 * that thin pool; otherwise a regular lv is created.
 *
 * The work is done by "lvcreate" in a forked child. Returns -1 if fork()
 * fails, otherwise whatever wait_for_pid() reports for the child.
 */
static int do_lvm_create(const char *path, uint64_t size, const char *thinpool)
{
	char sizearg[24], *devdir, *vgname, *lvname, *poolpath = NULL;
	int n, poollen, child;

	child = fork();
	if (child < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (child > 0)
		return wait_for_pid(child);

	/* child from here on */

	// specify bytes to lvcreate
	n = snprintf(sizearg, 24, "%"PRIu64"b", size);
	if (n < 0 || n >= 24)
		exit(1);

	devdir = strdup(path);
	if (devdir == NULL)
		exit(1);

	/* cut "/dev/$vg/$lv" at the last '/': devdir is now "/dev/$vg" */
	lvname = strrchr(devdir, '/');
	if (lvname == NULL)
		exit(1);
	*lvname++ = '\0';

	/* $vg is the last component of what is left */
	vgname = strrchr(devdir, '/');
	if (vgname == NULL)
		exit(1);
	vgname++;

	if (thinpool) {
		/* +2: the '/' separator and the terminating NUL */
		poollen = strlen(devdir) + strlen(thinpool) + 2;
		poolpath = alloca(poollen);

		n = snprintf(poolpath, poollen, "%s/%s", devdir, thinpool);
		if (n < 0 || n >= poollen)
			exit(1);

		n = lvm_is_thin_pool(poolpath);
		INFO("got %d for thin pool at path: %s", n, poolpath);
		if (n < 0)
			exit(1);

		/* not a thin pool: fall back to a regular lv */
		if (n == 0)
			poolpath = NULL;
	}

	if (poolpath)
		execlp("lvcreate", "lvcreate", "--thinpool", poolpath, "-V", sizearg, vgname, "-n", lvname, (char *)NULL);
	else
		execlp("lvcreate", "lvcreate", "-L", sizearg, vgname, "-n", lvname, (char *)NULL);

	SYSERROR("execlp");
	exit(1);
}
961 | ||
/*
 * Create the logical volume @path as a snapshot of @orig, by running
 * "lvcreate -s" in a forked child.
 *
 * @size is passed to lvcreate (in bytes) unless @orig is a thin volume, in
 * which case no size is given because it cannot differ from the original.
 *
 * Returns -1 if fork() fails, otherwise whatever wait_for_pid() reports for
 * the child.
 */
static int lvm_snapshot(const char *orig, const char *path, uint64_t size)
{
	int ret;
	pid_t pid;
	char sz[24], *pathdup, *lv;

	pid = fork();
	if (pid < 0) {
		SYSERROR("failed fork");
		return -1;
	}
	if (pid > 0)
		return wait_for_pid(pid);

	/*
	 * Child from here on: every failure must exit. Returning would let
	 * the child carry on running the caller's code as a second copy of
	 * the parent.
	 */

	// specify bytes to lvcreate
	ret = snprintf(sz, 24, "%"PRIu64"b", size);
	if (ret < 0 || ret >= 24)
		exit(1);

	pathdup = strdup(path);
	if (!pathdup)
		exit(1);

	/* the new lv's name is the last component of path */
	lv = strrchr(pathdup, '/');
	if (!lv) {
		free(pathdup);
		exit(1);
	}
	*lv = '\0';
	lv++;

	// check if the original lv is backed by a thin pool, in which case we
	// cannot specify a size that's different from the original size.
	ret = lvm_is_thin_volume(orig);
	if (ret == -1) {
		free(pathdup);
		exit(1);
	}

	if (!ret)
		execlp("lvcreate", "lvcreate", "-s", "-L", sz, "-n", lv, orig, (char *)NULL);
	else
		execlp("lvcreate", "lvcreate", "-s", "-n", lv, orig, (char *)NULL);

	/* only reached when the exec itself failed */
	free(pathdup);
	exit(1);
}
1007 | ||
1008 | // this will return 1 for physical disks, qemu-nbd, loop, etc | |
1009 | // right now only lvm is a block device | |
1010 | static int is_blktype(struct bdev *b) | |
1011 | { | |
1012 | if (strcmp(b->type, "lvm") == 0) | |
1013 | return 1; | |
1014 | return 0; | |
9be53773 SH |
1015 | } |
1016 | ||
1017 | static int lvm_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname, | |
1018 | const char *cname, const char *oldpath, const char *lxcpath, int snap, | |
25190e5b | 1019 | uint64_t newsize, struct lxc_conf *conf) |
9be53773 SH |
1020 | { |
1021 | char fstype[100]; | |
d659597e | 1022 | uint64_t size = newsize; |
ca52dcb5 | 1023 | int len, ret; |
9be53773 SH |
1024 | |
1025 | if (!orig->src || !orig->dest) | |
1026 | return -1; | |
1027 | ||
1028 | if (strcmp(orig->type, "lvm")) { | |
31a95fec SH |
1029 | const char *vg; |
1030 | ||
ca52dcb5 SH |
1031 | if (snap) { |
1032 | ERROR("LVM snapshot from %s backing store is not supported", | |
1033 | orig->type); | |
1034 | return -1; | |
1035 | } | |
2e59ba02 | 1036 | vg = lxc_global_config_value("lxc.bdev.lvm.vg"); |
31a95fec | 1037 | len = strlen("/dev/") + strlen(vg) + strlen(cname) + 2; |
ca52dcb5 SH |
1038 | if ((new->src = malloc(len)) == NULL) |
1039 | return -1; | |
31a95fec | 1040 | ret = snprintf(new->src, len, "/dev/%s/%s", vg, cname); |
ca52dcb5 SH |
1041 | if (ret < 0 || ret >= len) |
1042 | return -1; | |
1043 | } else { | |
1044 | new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath); | |
1045 | if (!new->src) | |
1046 | return -1; | |
9be53773 SH |
1047 | } |
1048 | ||
a17b1e65 SG |
1049 | if (orig->mntopts) { |
1050 | new->mntopts = strdup(orig->mntopts); | |
1051 | if (!new->mntopts) | |
9be53773 SH |
1052 | return -1; |
1053 | } | |
ca52dcb5 SH |
1054 | |
1055 | len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3; | |
1056 | new->dest = malloc(len); | |
9be53773 SH |
1057 | if (!new->dest) |
1058 | return -1; | |
ca52dcb5 SH |
1059 | ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname); |
1060 | if (ret < 0 || ret >= len) | |
9be53773 | 1061 | return -1; |
ca52dcb5 | 1062 | if (mkdir_p(new->dest, 0755) < 0) |
9be53773 SH |
1063 | return -1; |
1064 | ||
ca52dcb5 | 1065 | if (is_blktype(orig)) { |
eddaaafd | 1066 | if (!newsize && blk_getsize(orig, &size) < 0) { |
ca52dcb5 SH |
1067 | ERROR("Error getting size of %s", orig->src); |
1068 | return -1; | |
1069 | } | |
1070 | if (detect_fs(orig, fstype, 100) < 0) { | |
1071 | INFO("could not find fstype for %s, using ext3", orig->src); | |
1072 | return -1; | |
1073 | } | |
1074 | } else { | |
1075 | sprintf(fstype, "ext3"); | |
1076 | if (!newsize) | |
d659597e | 1077 | size = DEFAULT_FS_SIZE; |
9be53773 | 1078 | } |
ca52dcb5 | 1079 | |
9be53773 SH |
1080 | if (snap) { |
1081 | if (lvm_snapshot(orig->src, new->src, size) < 0) { | |
1082 | ERROR("could not create %s snapshot of %s", new->src, orig->src); | |
1083 | return -1; | |
1084 | } | |
1085 | } else { | |
2e59ba02 | 1086 | if (do_lvm_create(new->src, size, lxc_global_config_value("lxc.bdev.lvm.thin_pool")) < 0) { |
9be53773 SH |
1087 | ERROR("Error creating new lvm blockdev"); |
1088 | return -1; | |
1089 | } | |
9be53773 SH |
1090 | if (do_mkfs(new->src, fstype) < 0) { |
1091 | ERROR("Error creating filesystem type %s on %s", fstype, | |
1092 | new->src); | |
1093 | return -1; | |
1094 | } | |
1095 | } | |
1096 | ||
1097 | return 0; | |
1098 | } | |
1099 | ||
60bf62d4 SH |
1100 | static int lvm_destroy(struct bdev *orig) |
1101 | { | |
1102 | pid_t pid; | |
1103 | ||
1104 | if ((pid = fork()) < 0) | |
1105 | return -1; | |
1106 | if (!pid) { | |
1107 | execlp("lvremove", "lvremove", "-f", orig->src, NULL); | |
1108 | exit(1); | |
1109 | } | |
1110 | return wait_for_pid(pid); | |
1111 | } | |
1112 | ||
1897e3bc SH |
1113 | static int lvm_create(struct bdev *bdev, const char *dest, const char *n, |
1114 | struct bdev_specs *specs) | |
1115 | { | |
f99c386b | 1116 | const char *vg, *thinpool, *fstype, *lv = n; |
d659597e | 1117 | uint64_t sz; |
1897e3bc SH |
1118 | int ret, len; |
1119 | ||
1120 | if (!specs) | |
1121 | return -1; | |
1122 | ||
72e99249 | 1123 | vg = specs->lvm.vg; |
1897e3bc | 1124 | if (!vg) |
2e59ba02 | 1125 | vg = lxc_global_config_value("lxc.bdev.lvm.vg"); |
1897e3bc | 1126 | |
72e99249 | 1127 | thinpool = specs->lvm.thinpool; |
055af165 | 1128 | if (!thinpool) |
2e59ba02 | 1129 | thinpool = lxc_global_config_value("lxc.bdev.lvm.thin_pool"); |
f99c386b | 1130 | |
1897e3bc | 1131 | /* /dev/$vg/$lv */ |
72e99249 SS |
1132 | if (specs->lvm.lv) |
1133 | lv = specs->lvm.lv; | |
1134 | ||
1897e3bc SH |
1135 | len = strlen(vg) + strlen(lv) + 7; |
1136 | bdev->src = malloc(len); | |
1137 | if (!bdev->src) | |
1138 | return -1; | |
1139 | ||
1140 | ret = snprintf(bdev->src, len, "/dev/%s/%s", vg, lv); | |
1141 | if (ret < 0 || ret >= len) | |
1142 | return -1; | |
1143 | ||
72e99249 SS |
1144 | // fssize is in bytes. |
1145 | sz = specs->fssize; | |
1897e3bc | 1146 | if (!sz) |
eddaaafd | 1147 | sz = DEFAULT_FS_SIZE; |
1897e3bc | 1148 | |
f99c386b | 1149 | if (do_lvm_create(bdev->src, sz, thinpool) < 0) { |
d659597e | 1150 | ERROR("Error creating new lvm blockdev %s size %"PRIu64" bytes", bdev->src, sz); |
1897e3bc SH |
1151 | return -1; |
1152 | } | |
1153 | ||
72e99249 | 1154 | fstype = specs->fstype; |
1897e3bc | 1155 | if (!fstype) |
eddaaafd | 1156 | fstype = DEFAULT_FSTYPE; |
1897e3bc SH |
1157 | if (do_mkfs(bdev->src, fstype) < 0) { |
1158 | ERROR("Error creating filesystem type %s on %s", fstype, | |
1159 | bdev->src); | |
1160 | return -1; | |
1161 | } | |
1162 | if (!(bdev->dest = strdup(dest))) | |
1163 | return -1; | |
1164 | ||
1165 | if (mkdir_p(bdev->dest, 0755) < 0) { | |
959aee9c | 1166 | ERROR("Error creating %s", bdev->dest); |
1897e3bc SH |
1167 | return -1; |
1168 | } | |
1169 | ||
1170 | return 0; | |
1171 | } | |
1172 | ||
74a3920a | 1173 | static const struct bdev_ops lvm_ops = { |
9be53773 SH |
1174 | .detect = &lvm_detect, |
1175 | .mount = &lvm_mount, | |
1176 | .umount = &lvm_umount, | |
1177 | .clone_paths = &lvm_clonepaths, | |
60bf62d4 | 1178 | .destroy = &lvm_destroy, |
1897e3bc | 1179 | .create = &lvm_create, |
0a83cbbb | 1180 | .can_snapshot = true, |
cdd01be2 | 1181 | .can_backup = false, |
9be53773 SH |
1182 | }; |
1183 | ||
ff462013 SH |
1184 | /* |
1185 | * Return the full path of objid under dirid. Let's say dirid is | |
1186 | * /lxc/c1/rootfs, and objid is /lxc/c1/rootfs/a/b/c. Then we will | |
1187 | * return a/b/c. If instead objid is for /lxc/c1/rootfs/a, we will | |
1188 | * simply return a. | |
1189 | */ | |
1190 | char *get_btrfs_subvol_path(int fd, u64 dir_id, u64 objid, | |
1191 | char *name, int name_len) | |
1192 | { | |
1193 | struct btrfs_ioctl_ino_lookup_args args; | |
1194 | int ret, e; | |
1195 | size_t len; | |
1196 | char *retpath; | |
1197 | ||
1198 | memset(&args, 0, sizeof(args)); | |
1199 | args.treeid = dir_id; | |
1200 | args.objectid = objid; | |
1201 | ||
1202 | ret = ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args); | |
1203 | e = errno; | |
1204 | if (ret) { | |
1205 | ERROR("%s: ERROR: Failed to lookup path for %llu %llu %s - %s\n", | |
1206 | __func__, (unsigned long long) dir_id, | |
1207 | (unsigned long long) objid, | |
1208 | name, strerror(e)); | |
1209 | return NULL; | |
1210 | } else | |
1211 | INFO("%s: got path for %llu %llu - %s\n", __func__, | |
1212 | (unsigned long long) objid, (unsigned long long) dir_id, | |
1213 | name); | |
1214 | ||
1215 | if (args.name[0]) { | |
1216 | /* | |
1217 | * we're in a subdirectory of ref_tree, the kernel ioctl | |
1218 | * puts a / in there for us | |
1219 | */ | |
1220 | len = strlen(args.name) + name_len + 2; | |
1221 | retpath = malloc(len); | |
1222 | if (!retpath) | |
1223 | return NULL; | |
1224 | strcpy(retpath, args.name); | |
1225 | strcat(retpath, "/"); | |
1226 | strncat(retpath, name, name_len); | |
1227 | } else { | |
1228 | /* we're at the root of ref_tree */ | |
1229 | len = name_len + 1; | |
1230 | retpath = malloc(len); | |
1231 | if (!retpath) | |
1232 | return NULL; | |
1233 | *retpath = '\0'; | |
1234 | strncat(retpath, name, name_len); | |
1235 | } | |
1236 | return retpath; | |
1237 | } | |
1238 | ||
9be53773 SH |
1239 | // |
1240 | // btrfs ops | |
1241 | // | |
1242 | ||
ff462013 SH |
1243 | int btrfs_list_get_path_rootid(int fd, u64 *treeid) |
1244 | { | |
1245 | int ret; | |
1246 | struct btrfs_ioctl_ino_lookup_args args; | |
9be53773 | 1247 | |
ff462013 SH |
1248 | memset(&args, 0, sizeof(args)); |
1249 | args.objectid = BTRFS_FIRST_FREE_OBJECTID; | |
9be53773 | 1250 | |
ff462013 SH |
1251 | ret = ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args); |
1252 | if (ret < 0) { | |
1253 | WARN("Warning: can't perform the search -%s\n", | |
1254 | strerror(errno)); | |
1255 | return ret; | |
1256 | } | |
1257 | *treeid = args.treeid; | |
1258 | return 0; | |
1259 | } | |
9be53773 | 1260 | |
4295c5de | 1261 | bool is_btrfs_fs(const char *path) |
9be53773 | 1262 | { |
9be53773 SH |
1263 | int fd, ret; |
1264 | struct btrfs_ioctl_space_args sargs; | |
1265 | ||
1266 | // make sure this is a btrfs filesystem | |
1267 | fd = open(path, O_RDONLY); | |
1268 | if (fd < 0) | |
1897e3bc | 1269 | return false; |
9be53773 SH |
1270 | sargs.space_slots = 0; |
1271 | sargs.total_spaces = 0; | |
1272 | ret = ioctl(fd, BTRFS_IOC_SPACE_INFO, &sargs); | |
1273 | close(fd); | |
1274 | if (ret < 0) | |
1897e3bc SH |
1275 | return false; |
1276 | ||
1277 | return true; | |
1278 | } | |
1279 | ||
/*
 * Detect whether @path is a btrfs subvolume: 1 if so, 0 otherwise.
 *
 * @path must be on btrfs and must be a directory whose inode number is
 * 256, which is how a subvolume is recognised here.
 */
static int btrfs_detect(const char *path)
{
	struct stat st;

	if (!is_btrfs_fs(path))
		return 0;

	if (stat(path, &st) < 0)
		return 0;

	return (st.st_ino == 256 && S_ISDIR(st.st_mode)) ? 1 : 0;
}
1298 | ||
60bf62d4 | 1299 | static int btrfs_mount(struct bdev *bdev) |
9be53773 | 1300 | { |
a17b1e65 SG |
1301 | unsigned long mntflags; |
1302 | char *mntdata; | |
1303 | int ret; | |
1304 | ||
9be53773 SH |
1305 | if (strcmp(bdev->type, "btrfs")) |
1306 | return -22; | |
1307 | if (!bdev->src || !bdev->dest) | |
1308 | return -22; | |
a17b1e65 SG |
1309 | |
1310 | if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) { | |
1311 | free(mntdata); | |
1312 | return -22; | |
1313 | } | |
1314 | ||
1315 | ret = mount(bdev->src, bdev->dest, "bind", MS_BIND | MS_REC | mntflags, mntdata); | |
1316 | free(mntdata); | |
1317 | return ret; | |
9be53773 SH |
1318 | } |
1319 | ||
60bf62d4 | 1320 | static int btrfs_umount(struct bdev *bdev) |
9be53773 SH |
1321 | { |
1322 | if (strcmp(bdev->type, "btrfs")) | |
1323 | return -22; | |
1324 | if (!bdev->src || !bdev->dest) | |
1325 | return -22; | |
1326 | return umount(bdev->dest); | |
1327 | } | |
1328 | ||
9be53773 SH |
1329 | static int btrfs_subvolume_create(const char *path) |
1330 | { | |
1331 | int ret, fd = -1; | |
1332 | struct btrfs_ioctl_vol_args args; | |
1333 | char *p, *newfull = strdup(path); | |
1334 | ||
1335 | if (!newfull) { | |
1336 | ERROR("Error: out of memory"); | |
1337 | return -1; | |
1338 | } | |
1339 | ||
c32981c3 | 1340 | p = strrchr(newfull, '/'); |
9be53773 SH |
1341 | if (!p) { |
1342 | ERROR("bad path: %s", path); | |
9529609a | 1343 | free(newfull); |
9be53773 SH |
1344 | return -1; |
1345 | } | |
1346 | *p = '\0'; | |
1347 | ||
025ed0f3 | 1348 | fd = open(newfull, O_RDONLY); |
025ed0f3 | 1349 | if (fd < 0) { |
9be53773 SH |
1350 | ERROR("Error opening %s", newfull); |
1351 | free(newfull); | |
1352 | return -1; | |
1353 | } | |
1354 | ||
1355 | memset(&args, 0, sizeof(args)); | |
1356 | strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX); | |
1357 | args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0; | |
1358 | ret = ioctl(fd, BTRFS_IOC_SUBVOL_CREATE, &args); | |
1359 | INFO("btrfs: snapshot create ioctl returned %d", ret); | |
9be53773 SH |
1360 | |
1361 | free(newfull); | |
1362 | close(fd); | |
1363 | return ret; | |
1364 | } | |
1365 | ||
65db0e5a ÇO |
1366 | static int btrfs_same_fs(const char *orig, const char *new) { |
1367 | int fd_orig = -1, fd_new = -1, ret = -1; | |
1368 | struct btrfs_ioctl_fs_info_args orig_args, new_args; | |
1369 | ||
1370 | fd_orig = open(orig, O_RDONLY); | |
1371 | if (fd_orig < 0) { | |
1372 | SYSERROR("Error opening original rootfs %s", orig); | |
1373 | goto out; | |
1374 | } | |
1375 | ret = ioctl(fd_orig, BTRFS_IOC_FS_INFO, &orig_args); | |
1376 | if (ret < 0) { | |
1377 | SYSERROR("BTRFS_IOC_FS_INFO %s", orig); | |
1378 | goto out; | |
1379 | } | |
1380 | ||
1381 | fd_new = open(new, O_RDONLY); | |
1382 | if (fd_new < 0) { | |
1383 | SYSERROR("Error opening new container dir %s", new); | |
e27141fa | 1384 | ret = -1; |
65db0e5a ÇO |
1385 | goto out; |
1386 | } | |
1387 | ret = ioctl(fd_new, BTRFS_IOC_FS_INFO, &new_args); | |
1388 | if (ret < 0) { | |
1389 | SYSERROR("BTRFS_IOC_FS_INFO %s", new); | |
1390 | goto out; | |
1391 | } | |
1392 | ||
1393 | if (strncmp(orig_args.fsid, new_args.fsid, BTRFS_FSID_SIZE) != 0) { | |
1394 | ret = -1; | |
1395 | goto out; | |
1396 | } | |
1397 | ret = 0; | |
1398 | out: | |
1399 | if (fd_new != -1) | |
1400 | close(fd_new); | |
1401 | if (fd_orig != -1) | |
1402 | close(fd_orig); | |
1403 | return ret; | |
1404 | } | |
1405 | ||
9be53773 SH |
1406 | static int btrfs_snapshot(const char *orig, const char *new) |
1407 | { | |
1408 | int fd = -1, fddst = -1, ret = -1; | |
1409 | struct btrfs_ioctl_vol_args_v2 args; | |
1410 | char *newdir, *newname, *newfull = NULL; | |
1411 | ||
1412 | newfull = strdup(new); | |
1413 | if (!newfull) { | |
1414 | ERROR("Error: out of memory"); | |
1415 | goto out; | |
1416 | } | |
1417 | // make sure the directory doesn't already exist | |
8479c136 | 1418 | if (rmdir(newfull) < 0 && errno != ENOENT) { |
9be53773 SH |
1419 | SYSERROR("Error removing empty new rootfs"); |
1420 | goto out; | |
1421 | } | |
1422 | newname = basename(newfull); | |
1423 | newdir = dirname(newfull); | |
1424 | fd = open(orig, O_RDONLY); | |
1425 | if (fd < 0) { | |
1426 | SYSERROR("Error opening original rootfs %s", orig); | |
1427 | goto out; | |
1428 | } | |
dd1d77f9 | 1429 | fddst = open(newdir, O_RDONLY); |
9be53773 SH |
1430 | if (fddst < 0) { |
1431 | SYSERROR("Error opening new container dir %s", newdir); | |
1432 | goto out; | |
1433 | } | |
1434 | ||
1435 | memset(&args, 0, sizeof(args)); | |
1436 | args.fd = fd; | |
1437 | strncpy(args.name, newname, BTRFS_SUBVOL_NAME_MAX); | |
1438 | args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0; | |
1439 | ret = ioctl(fddst, BTRFS_IOC_SNAP_CREATE_V2, &args); | |
1440 | INFO("btrfs: snapshot create ioctl returned %d", ret); | |
1441 | ||
1442 | out: | |
1443 | if (fddst != -1) | |
1444 | close(fddst); | |
1445 | if (fd != -1) | |
1446 | close(fd); | |
f10fad2f | 1447 | free(newfull); |
9be53773 SH |
1448 | return ret; |
1449 | } | |
1450 | ||
2659c7cb SH |
1451 | static int btrfs_snapshot_wrapper(void *data) |
1452 | { | |
1453 | struct rsync_data_char *arg = data; | |
1454 | if (setgid(0) < 0) { | |
1455 | ERROR("Failed to setgid to 0"); | |
1456 | return -1; | |
1457 | } | |
1458 | if (setgroups(0, NULL) < 0) | |
1459 | WARN("Failed to clear groups"); | |
1460 | if (setuid(0) < 0) { | |
1461 | ERROR("Failed to setuid to 0"); | |
1462 | return -1; | |
1463 | } | |
1464 | return btrfs_snapshot(arg->src, arg->dest); | |
1465 | } | |
1466 | ||
9be53773 SH |
1467 | static int btrfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname, |
1468 | const char *cname, const char *oldpath, const char *lxcpath, int snap, | |
25190e5b | 1469 | uint64_t newsize, struct lxc_conf *conf) |
9be53773 SH |
1470 | { |
1471 | if (!orig->dest || !orig->src) | |
1472 | return -1; | |
1473 | ||
1474 | if (strcmp(orig->type, "btrfs")) { | |
ca52dcb5 SH |
1475 | int len, ret; |
1476 | if (snap) { | |
1477 | ERROR("btrfs snapshot from %s backing store is not supported", | |
1478 | orig->type); | |
1479 | return -1; | |
1480 | } | |
1481 | len = strlen(lxcpath) + strlen(cname) + strlen("rootfs") + 3; | |
1482 | new->src = malloc(len); | |
1483 | if (!new->src) | |
1484 | return -1; | |
1485 | ret = snprintf(new->src, len, "%s/%s/rootfs", lxcpath, cname); | |
1486 | if (ret < 0 || ret >= len) | |
1487 | return -1; | |
1488 | } else { | |
1489 | // in case rootfs is in custom path, reuse it | |
1490 | if ((new->src = dir_new_path(orig->src, oldname, cname, oldpath, lxcpath)) == NULL) | |
1491 | return -1; | |
9be53773 | 1492 | |
ca52dcb5 | 1493 | } |
9be53773 | 1494 | |
ca52dcb5 | 1495 | if ((new->dest = strdup(new->src)) == NULL) |
9be53773 SH |
1496 | return -1; |
1497 | ||
a17b1e65 | 1498 | if (orig->mntopts && (new->mntopts = strdup(orig->mntopts)) == NULL) |
9be53773 SH |
1499 | return -1; |
1500 | ||
2659c7cb SH |
1501 | if (snap) { |
1502 | struct rsync_data_char sdata; | |
1503 | if (!am_unpriv()) | |
1504 | return btrfs_snapshot(orig->dest, new->dest); | |
1505 | sdata.dest = new->dest; | |
1506 | sdata.src = orig->dest; | |
1507 | return userns_exec_1(conf, btrfs_snapshot_wrapper, &sdata); | |
1508 | } | |
9be53773 | 1509 | |
8479c136 | 1510 | if (rmdir(new->dest) < 0 && errno != ENOENT) { |
959aee9c | 1511 | SYSERROR("removing %s", new->dest); |
9be53773 SH |
1512 | return -1; |
1513 | } | |
1514 | ||
1515 | return btrfs_subvolume_create(new->dest); | |
1516 | } | |
1517 | ||
ff462013 | 1518 | static int btrfs_do_destroy_subvol(const char *path) |
60bf62d4 SH |
1519 | { |
1520 | int ret, fd = -1; | |
1521 | struct btrfs_ioctl_vol_args args; | |
60bf62d4 SH |
1522 | char *p, *newfull = strdup(path); |
1523 | ||
1524 | if (!newfull) { | |
1525 | ERROR("Error: out of memory"); | |
1526 | return -1; | |
1527 | } | |
1528 | ||
c32981c3 | 1529 | p = strrchr(newfull, '/'); |
60bf62d4 SH |
1530 | if (!p) { |
1531 | ERROR("bad path: %s", path); | |
9529609a | 1532 | free(newfull); |
60bf62d4 SH |
1533 | return -1; |
1534 | } | |
1535 | *p = '\0'; | |
1536 | ||
025ed0f3 | 1537 | fd = open(newfull, O_RDONLY); |
025ed0f3 | 1538 | if (fd < 0) { |
4295c5de | 1539 | SYSERROR("Error opening %s", newfull); |
60bf62d4 SH |
1540 | free(newfull); |
1541 | return -1; | |
1542 | } | |
1543 | ||
1544 | memset(&args, 0, sizeof(args)); | |
1545 | strncpy(args.name, p+1, BTRFS_SUBVOL_NAME_MAX); | |
1546 | args.name[BTRFS_SUBVOL_NAME_MAX-1] = 0; | |
1547 | ret = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &args); | |
ff462013 | 1548 | INFO("btrfs: snapshot destroy ioctl returned %d for %s", ret, path); |
2659c7cb | 1549 | if (ret < 0 && errno == EPERM) |
cf03f973 | 1550 | ERROR("Is the rootfs mounted with -o user_subvol_rm_allowed?"); |
60bf62d4 SH |
1551 | |
1552 | free(newfull); | |
1553 | close(fd); | |
1554 | return ret; | |
1555 | } | |
1556 | ||
ff462013 SH |
1557 | struct mytree_node { |
1558 | u64 objid; | |
1559 | u64 parentid; | |
1560 | char *name; | |
1561 | char *dirname; | |
1562 | }; | |
1563 | ||
/* Flat, growable list of the subvolumes found so far. */
struct my_btrfs_tree {
	struct mytree_node *nodes;	/* heap-allocated array of entries */
	int num;			/* number of valid entries in nodes */
};
1568 | ||
1569 | static int get_btrfs_tree_idx(struct my_btrfs_tree *tree, u64 id) | |
1570 | { | |
1571 | int i; | |
1572 | if (!tree) | |
1573 | return -1; | |
1574 | for (i = 0; i < tree->num; i++) { | |
1575 | if (tree->nodes[i].objid == id) | |
1576 | return i; | |
1577 | } | |
1578 | return -1; | |
1579 | } | |
1580 | ||
1581 | static struct my_btrfs_tree *create_my_btrfs_tree(u64 id, const char *path, int name_len) | |
1582 | { | |
1583 | struct my_btrfs_tree *tree; | |
1584 | ||
8873e65e | 1585 | tree = malloc(sizeof(struct my_btrfs_tree)); |
ff462013 SH |
1586 | if (!tree) |
1587 | return NULL; | |
1588 | tree->nodes = malloc(sizeof(struct mytree_node)); | |
1589 | if (!tree->nodes) { | |
1590 | free(tree); | |
1591 | return NULL; | |
1592 | } | |
1593 | tree->num = 1; | |
1594 | tree->nodes[0].dirname = NULL; | |
1595 | tree->nodes[0].name = strdup(path); | |
1596 | if (!tree->nodes[0].name) { | |
1597 | free(tree->nodes); | |
1598 | free(tree); | |
1599 | return NULL; | |
1600 | } | |
1601 | tree->nodes[0].parentid = 0; | |
1602 | tree->nodes[0].objid = id; | |
1603 | return tree; | |
1604 | } | |
1605 | ||
1606 | static bool update_tree_node(struct mytree_node *n, u64 id, u64 parent, char *name, | |
1607 | int name_len, char *dirname) | |
1608 | { | |
1609 | if (id) | |
1610 | n->objid = id; | |
1611 | if (parent) | |
1612 | n->parentid = parent; | |
1613 | if (name) { | |
1614 | n->name = malloc(name_len + 1); | |
1615 | if (!n->name) | |
1616 | return false; | |
1617 | strncpy(n->name, name, name_len); | |
1618 | n->name[name_len] = '\0'; | |
1619 | } | |
1620 | if (dirname) { | |
1621 | n->dirname = malloc(strlen(dirname) + 1); | |
1622 | if (!n->dirname) { | |
1623 | free(n->name); | |
1624 | return false; | |
1625 | } | |
1626 | strcpy(n->dirname, dirname); | |
1627 | } | |
1628 | return true; | |
1629 | } | |
1630 | ||
1631 | static bool add_btrfs_tree_node(struct my_btrfs_tree *tree, u64 id, u64 parent, | |
1632 | char *name, int name_len, char *dirname) | |
1633 | { | |
1634 | struct mytree_node *tmp; | |
1635 | ||
1636 | int i = get_btrfs_tree_idx(tree, id); | |
1637 | if (i != -1) | |
1638 | return update_tree_node(&tree->nodes[i], id, parent, name, | |
1639 | name_len, dirname); | |
1640 | ||
1641 | tmp = realloc(tree->nodes, (tree->num+1) * sizeof(struct mytree_node)); | |
1642 | if (!tmp) | |
1643 | return false; | |
1644 | tree->nodes = tmp; | |
1645 | memset(&tree->nodes[tree->num], 0, sizeof(struct mytree_node)); | |
1646 | if (!update_tree_node(&tree->nodes[tree->num], id, parent, name, | |
1647 | name_len, dirname)) | |
1648 | return false; | |
1649 | tree->num++; | |
1650 | return true; | |
1651 | } | |
1652 | ||
1653 | static void free_btrfs_tree(struct my_btrfs_tree *tree) | |
1654 | { | |
1655 | int i; | |
1656 | if (!tree) | |
1657 | return; | |
1658 | for (i = 0; i < tree->num; i++) { | |
1659 | free(tree->nodes[i].name); | |
1660 | free(tree->nodes[i].dirname); | |
1661 | } | |
1662 | free(tree->nodes); | |
1663 | free(tree); | |
1664 | } | |
1665 | ||
1666 | /* | |
1667 | * Given a @tree of subvolumes under @path, ask btrfs to remove each | |
1668 | * subvolume | |
1669 | */ | |
1670 | static bool do_remove_btrfs_children(struct my_btrfs_tree *tree, u64 root_id, | |
1671 | const char *path) | |
1672 | { | |
1673 | int i; | |
1674 | char *newpath; | |
1675 | size_t len; | |
1676 | ||
1677 | for (i = 0; i < tree->num; i++) { | |
1678 | if (tree->nodes[i].parentid == root_id) { | |
1679 | if (!tree->nodes[i].dirname) { | |
1680 | WARN("Odd condition: child objid with no name under %s\n", path); | |
1681 | continue; | |
1682 | } | |
1683 | len = strlen(path) + strlen(tree->nodes[i].dirname) + 2; | |
1684 | newpath = malloc(len); | |
1685 | if (!newpath) { | |
1686 | ERROR("Out of memory"); | |
1687 | return false; | |
1688 | } | |
1689 | snprintf(newpath, len, "%s/%s", path, tree->nodes[i].dirname); | |
1690 | if (!do_remove_btrfs_children(tree, tree->nodes[i].objid, newpath)) { | |
1691 | ERROR("Failed to prune %s\n", tree->nodes[i].name); | |
1692 | free(newpath); | |
1693 | return false; | |
1694 | } | |
1695 | if (btrfs_do_destroy_subvol(newpath) != 0) { | |
1696 | ERROR("Failed to remove %s\n", newpath); | |
1697 | free(newpath); | |
1698 | return false; | |
1699 | } | |
1700 | free(newpath); | |
1701 | } | |
1702 | } | |
1703 | return true; | |
1704 | } | |
1705 | ||
1706 | static int btrfs_recursive_destroy(const char *path) | |
1707 | { | |
1708 | u64 root_id; | |
1709 | int fd; | |
1710 | struct btrfs_ioctl_search_args args; | |
1711 | struct btrfs_ioctl_search_key *sk = &args.key; | |
1712 | struct btrfs_ioctl_search_header *sh; | |
1713 | struct btrfs_root_ref *ref; | |
1714 | struct my_btrfs_tree *tree; | |
1715 | int ret, i; | |
1716 | unsigned long off = 0; | |
1717 | int name_len; | |
1718 | char *name; | |
1719 | char *tmppath; | |
1720 | ||
1721 | fd = open(path, O_RDONLY); | |
1722 | if (fd < 0) { | |
1723 | ERROR("Failed to open %s\n", path); | |
1724 | return -1; | |
1725 | } | |
1726 | ||
1727 | if (btrfs_list_get_path_rootid(fd, &root_id)) { | |
1728 | close(fd); | |
1729 | if (errno == EPERM || errno == EACCES) { | |
1730 | WARN("Will simply try removing"); | |
1731 | goto ignore_search; | |
1732 | } | |
1733 | ||
1734 | return -1; | |
1735 | } | |
1736 | ||
1737 | tree = create_my_btrfs_tree(root_id, path, strlen(path)); | |
1738 | if (!tree) { | |
1739 | ERROR("Out of memory\n"); | |
1740 | close(fd); | |
1741 | return -1; | |
1742 | } | |
1743 | /* Walk all subvols looking for any under this id */ | |
1744 | memset(&args, 0, sizeof(args)); | |
1745 | ||
1746 | /* search in the tree of tree roots */ | |
1747 | sk->tree_id = 1; | |
1748 | ||
1749 | sk->max_type = BTRFS_ROOT_REF_KEY; | |
1750 | sk->min_type = BTRFS_ROOT_ITEM_KEY; | |
1751 | sk->min_objectid = 0; | |
1752 | sk->max_objectid = (u64)-1; | |
1753 | sk->max_offset = (u64)-1; | |
1754 | sk->min_offset = 0; | |
1755 | sk->max_transid = (u64)-1; | |
1756 | sk->nr_items = 4096; | |
1757 | ||
1758 | while(1) { | |
1759 | ret = ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args); | |
1760 | if (ret < 0) { | |
1761 | close(fd); | |
1762 | ERROR("Error: can't perform the search under %s\n", path); | |
1763 | free_btrfs_tree(tree); | |
1764 | return -1; | |
1765 | } | |
1766 | if (sk->nr_items == 0) | |
1767 | break; | |
1768 | ||
1769 | off = 0; | |
1770 | for (i = 0; i < sk->nr_items; i++) { | |
1771 | sh = (struct btrfs_ioctl_search_header *)(args.buf + off); | |
1772 | off += sizeof(*sh); | |
1773 | /* | |
1774 | * A backref key with the name and dirid of the parent | |
1775 | * comes followed by the reoot ref key which has the | |
1776 | * name of the child subvol in question. | |
1777 | */ | |
1778 | if (sh->objectid != root_id && sh->type == BTRFS_ROOT_BACKREF_KEY) { | |
1779 | ref = (struct btrfs_root_ref *)(args.buf + off); | |
1780 | name_len = ref->name_len; | |
1781 | name = (char *)(ref + 1); | |
1782 | tmppath = get_btrfs_subvol_path(fd, sh->offset, | |
1783 | ref->dirid, name, name_len); | |
1784 | if (!add_btrfs_tree_node(tree, sh->objectid, | |
1785 | sh->offset, name, | |
1786 | name_len, tmppath)) { | |
1787 | ERROR("Out of memory"); | |
1788 | free_btrfs_tree(tree); | |
1789 | free(tmppath); | |
1790 | close(fd); | |
1791 | return -1; | |
1792 | } | |
1793 | free(tmppath); | |
1794 | } | |
1795 | off += sh->len; | |
1796 | ||
1797 | /* | |
1798 | * record the mins in sk so we can make sure the | |
1799 | * next search doesn't repeat this root | |
1800 | */ | |
1801 | sk->min_objectid = sh->objectid; | |
1802 | sk->min_type = sh->type; | |
1803 | sk->min_offset = sh->offset; | |
1804 | } | |
1805 | sk->nr_items = 4096; | |
1806 | sk->min_offset++; | |
1807 | if (!sk->min_offset) | |
1808 | sk->min_type++; | |
1809 | else | |
1810 | continue; | |
1811 | ||
1812 | if (sk->min_type > BTRFS_ROOT_BACKREF_KEY) { | |
1813 | sk->min_type = BTRFS_ROOT_ITEM_KEY; | |
1814 | sk->min_objectid++; | |
1815 | } else | |
1816 | continue; | |
1817 | ||
1818 | if (sk->min_objectid >= sk->max_objectid) | |
1819 | break; | |
1820 | } | |
1821 | close(fd); | |
1822 | ||
1823 | /* now actually remove them */ | |
1824 | ||
1825 | if (!do_remove_btrfs_children(tree, root_id, path)) { | |
1826 | free_btrfs_tree(tree); | |
1827 | ERROR("failed pruning\n"); | |
1828 | return -1; | |
1829 | } | |
1830 | ||
1831 | free_btrfs_tree(tree); | |
1832 | /* All child subvols have been removed, now remove this one */ | |
1833 | ignore_search: | |
1834 | return btrfs_do_destroy_subvol(path); | |
1835 | } | |
1836 | ||
4295c5de SH |
/*
 * Remove @path, including nested subvolumes, if btrfs_detect() accepts it
 * as a btrfs subvolume.
 *
 * Returns true only if @path was detected and its removal succeeded.
 */
bool btrfs_try_remove_subvol(const char *path)
{
	if (btrfs_detect(path))
		return btrfs_recursive_destroy(path) == 0;

	return false;
}
1843 | ||
ff462013 SH |
1844 | static int btrfs_destroy(struct bdev *orig) |
1845 | { | |
1846 | return btrfs_recursive_destroy(orig->src); | |
1847 | } | |
1848 | ||
1897e3bc SH |
1849 | static int btrfs_create(struct bdev *bdev, const char *dest, const char *n, |
1850 | struct bdev_specs *specs) | |
1851 | { | |
1852 | bdev->src = strdup(dest); | |
1853 | bdev->dest = strdup(dest); | |
1854 | if (!bdev->src || !bdev->dest) | |
1855 | return -1; | |
1856 | return btrfs_subvolume_create(bdev->dest); | |
1857 | } | |
1858 | ||
74a3920a | 1859 | static const struct bdev_ops btrfs_ops = { |
9be53773 SH |
1860 | .detect = &btrfs_detect, |
1861 | .mount = &btrfs_mount, | |
1862 | .umount = &btrfs_umount, | |
1863 | .clone_paths = &btrfs_clonepaths, | |
60bf62d4 | 1864 | .destroy = &btrfs_destroy, |
1897e3bc | 1865 | .create = &btrfs_create, |
0a83cbbb | 1866 | .can_snapshot = true, |
cdd01be2 | 1867 | .can_backup = true, |
9be53773 SH |
1868 | }; |
1869 | ||
eddaaafd SH |
1870 | // |
1871 | // loopback dev ops | |
1872 | // | |
/*
 * A loop backing store is recognised purely by the "loop:" prefix of
 * @path.  Returns 1 if the prefix is present, 0 otherwise.
 */
static int loop_detect(const char *path)
{
	return strncmp(path, "loop:", 5) == 0;
}
1879 | ||
/*
 * Find an unused loop device by scanning /dev, for systems where
 * /dev/loop-control cannot be opened.
 *
 * Every entry whose name starts with "loop" is opened read-write; the
 * first one for which LOOP_GET_STATUS64 fails with ENXIO is taken.
 *
 * @retfd:  receives the open descriptor of the device
 * @namep:  receives "/dev/<name>"; at most 100 bytes are written
 *
 * Returns 0 on success, -1 if /dev cannot be opened or no device is found.
 */
static int find_free_loopdev_no_control(int *retfd, char *namep)
{
	struct dirent entry, *cur;
	struct loop_info64 info;
	DIR *devdir;
	int devfd = -1;

	devdir = opendir("/dev");
	if (devdir == NULL) {
		SYSERROR("Error opening /dev");
		return -1;
	}

	while (readdir_r(devdir, &entry, &cur) == 0 && cur != NULL) {
		if (strncmp(cur->d_name, "loop", 4) != 0)
			continue;

		devfd = openat(dirfd(devdir), cur->d_name, O_RDWR);
		if (devfd < 0)
			continue;

		/* anything but a failure with ENXIO rules this device out */
		if (ioctl(devfd, LOOP_GET_STATUS64, &info) == 0 || errno != ENXIO) {
			close(devfd);
			devfd = -1;
			continue;
		}

		/* We can use this fd */
		snprintf(namep, 100, "/dev/%s", cur->d_name);
		break;
	}
	closedir(devdir);

	if (devfd == -1) {
		ERROR("No loop device found");
		return -1;
	}

	*retfd = devfd;
	return 0;
}
1919 | ||
edd7414a WB |
/*
 * Find an unused loop device and open it read-write.
 *
 * The device number comes from LOOP_CTL_GET_FREE on /dev/loop-control;
 * when that control node cannot be opened, /dev is scanned instead.
 *
 * @retfd:  receives the open descriptor of the device
 * @namep:  receives the device path; at most 100 bytes are written
 *
 * Returns 0 on success, -1 if no device could be obtained.
 */
static int find_free_loopdev(int *retfd, char *namep)
{
	int ctlfd, devnum, devfd = -1;

	ctlfd = open("/dev/loop-control", O_RDWR);
	if (ctlfd < 0)
		return find_free_loopdev_no_control(retfd, namep);

	devnum = ioctl(ctlfd, LOOP_CTL_GET_FREE);
	if (devnum >= 0) {
		snprintf(namep, 100, "/dev/loop%d", devnum);
		devfd = open(namep, O_RDWR);
	}
	close(ctlfd);

	if (devfd == -1) {
		ERROR("No loop device found");
		return -1;
	}

	*retfd = devfd;
	return 0;
}
1939 | ||
eddaaafd SH |
1940 | static int loop_mount(struct bdev *bdev) |
1941 | { | |
1942 | int lfd, ffd = -1, ret = -1; | |
1943 | struct loop_info64 lo; | |
1944 | char loname[100]; | |
1945 | ||
1946 | if (strcmp(bdev->type, "loop")) | |
1947 | return -22; | |
1948 | if (!bdev->src || !bdev->dest) | |
1949 | return -22; | |
1950 | if (find_free_loopdev(&lfd, loname) < 0) | |
1951 | return -22; | |
1952 | ||
025ed0f3 | 1953 | ffd = open(bdev->src + 5, O_RDWR); |
025ed0f3 | 1954 | if (ffd < 0) { |
959aee9c | 1955 | SYSERROR("Error opening backing file %s", bdev->src); |
eddaaafd SH |
1956 | goto out; |
1957 | } | |
1958 | ||
1959 | if (ioctl(lfd, LOOP_SET_FD, ffd) < 0) { | |
1960 | SYSERROR("Error attaching backing file to loop dev"); | |
1961 | goto out; | |
1962 | } | |
1963 | memset(&lo, 0, sizeof(lo)); | |
1964 | lo.lo_flags = LO_FLAGS_AUTOCLEAR; | |
1965 | if (ioctl(lfd, LOOP_SET_STATUS64, &lo) < 0) { | |
959aee9c | 1966 | SYSERROR("Error setting autoclear on loop dev"); |
eddaaafd SH |
1967 | goto out; |
1968 | } | |
1969 | ||
a17b1e65 | 1970 | ret = mount_unknown_fs(loname, bdev->dest, bdev->mntopts); |
eddaaafd | 1971 | if (ret < 0) |
959aee9c | 1972 | ERROR("Error mounting %s", bdev->src); |
eddaaafd SH |
1973 | else |
1974 | bdev->lofd = lfd; | |
1975 | ||
1976 | out: | |
1977 | if (ffd > -1) | |
1978 | close(ffd); | |
1979 | if (ret < 0) { | |
1980 | close(lfd); | |
1981 | bdev->lofd = -1; | |
1982 | } | |
1983 | return ret; | |
1984 | } | |
1985 | ||
1986 | static int loop_umount(struct bdev *bdev) | |
1987 | { | |
1988 | int ret; | |
1989 | ||
1990 | if (strcmp(bdev->type, "loop")) | |
1991 | return -22; | |
1992 | if (!bdev->src || !bdev->dest) | |
1993 | return -22; | |
1994 | ret = umount(bdev->dest); | |
1995 | if (bdev->lofd >= 0) { | |
1996 | close(bdev->lofd); | |
1997 | bdev->lofd = -1; | |
1998 | } | |
1999 | return ret; | |
2000 | } | |
2001 | ||
/*
 * Create the backing file @path for a loop device and put a filesystem of
 * type @fstype on it.
 *
 * The file is created with mode 0600, then a single byte is written at
 * offset @size (so no data is written before that offset), and finally
 * do_mkfs() is run on the path.
 *
 * Returns 0 on success, -1 on any failure.
 */
static int do_loop_create(const char *path, uint64_t size, const char *fstype)
{
	int fd = creat(path, S_IRUSR|S_IWUSR);

	if (fd < 0)
		return -1;

	if (lseek(fd, size, SEEK_SET) < 0) {
		SYSERROR("Error seeking to set new loop file size");
		goto err_close;
	}

	if (write(fd, "1", 1) != 1) {
		SYSERROR("Error creating new loop file");
		goto err_close;
	}

	if (close(fd) < 0) {
		SYSERROR("Error closing new loop file");
		return -1;
	}

	/* create an fs in the loopback file */
	if (do_mkfs(path, fstype) < 0) {
		ERROR("Error creating filesystem type %s on %s", fstype,
			path);
		return -1;
	}

	return 0;

err_close:
	close(fd);
	return -1;
}
2034 | ||
2035 | /* | |
2036 | * No idea what the original blockdev will be called, but the copy will be | |
2037 | * called $lxcpath/$lxcname/rootdev | |
2038 | */ | |
2039 | static int loop_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname, | |
2040 | const char *cname, const char *oldpath, const char *lxcpath, int snap, | |
25190e5b | 2041 | uint64_t newsize, struct lxc_conf *conf) |
eddaaafd SH |
2042 | { |
2043 | char fstype[100]; | |
d659597e | 2044 | uint64_t size = newsize; |
eddaaafd SH |
2045 | int len, ret; |
2046 | char *srcdev; | |
2047 | ||
2048 | if (snap) { | |
2049 | ERROR("loop devices cannot be snapshotted."); | |
2050 | return -1; | |
2051 | } | |
2052 | ||
2053 | if (!orig->dest || !orig->src) | |
2054 | return -1; | |
2055 | ||
2056 | len = strlen(lxcpath) + strlen(cname) + strlen("rootdev") + 3; | |
2057 | srcdev = alloca(len); | |
2058 | ret = snprintf(srcdev, len, "%s/%s/rootdev", lxcpath, cname); | |
2059 | if (ret < 0 || ret >= len) | |
2060 | return -1; | |
2061 | ||
2062 | new->src = malloc(len + 5); | |
2063 | if (!new->src) | |
2064 | return -1; | |
2065 | ret = snprintf(new->src, len + 5, "loop:%s", srcdev); | |
2066 | if (ret < 0 || ret >= len + 5) | |
2067 | return -1; | |
2068 | ||
2069 | new->dest = malloc(len); | |
2070 | if (!new->dest) | |
2071 | return -1; | |
2072 | ret = snprintf(new->dest, len, "%s/%s/rootfs", lxcpath, cname); | |
2073 | if (ret < 0 || ret >= len) | |
2074 | return -1; | |
2075 | ||
2076 | // it's tempting to say: if orig->src == loopback and !newsize, then | |
2077 | // copy the loopback file. However, we'd have to make sure to | |
2078 | // correctly keep holes! So punt for now. | |
2079 | ||
2080 | if (is_blktype(orig)) { | |
2081 | if (!newsize && blk_getsize(orig, &size) < 0) { | |
2082 | ERROR("Error getting size of %s", orig->src); | |
2083 | return -1; | |
2084 | } | |
2085 | if (detect_fs(orig, fstype, 100) < 0) { | |
2086 | INFO("could not find fstype for %s, using %s", orig->src, | |
2087 | DEFAULT_FSTYPE); | |
2088 | return -1; | |
2089 | } | |
2090 | } else { | |
2091 | sprintf(fstype, "%s", DEFAULT_FSTYPE); | |
2092 | if (!newsize) | |
d659597e | 2093 | size = DEFAULT_FS_SIZE; |
eddaaafd SH |
2094 | } |
2095 | return do_loop_create(srcdev, size, fstype); | |
2096 | } | |
2097 | ||
2098 | static int loop_create(struct bdev *bdev, const char *dest, const char *n, | |
2099 | struct bdev_specs *specs) | |
2100 | { | |
2101 | const char *fstype; | |
d659597e | 2102 | uint64_t sz; |
eddaaafd SH |
2103 | int ret, len; |
2104 | char *srcdev; | |
2105 | ||
2106 | if (!specs) | |
2107 | return -1; | |
2108 | ||
2109 | // dest is passed in as $lxcpath / $lxcname / rootfs | |
2110 | // srcdev will be: $lxcpath / $lxcname / rootdev | |
2111 | // src will be 'loop:$srcdev' | |
2112 | len = strlen(dest) + 2; | |
2113 | srcdev = alloca(len); | |
2114 | ||
2115 | ret = snprintf(srcdev, len, "%s", dest); | |
2116 | if (ret < 0 || ret >= len) | |
2117 | return -1; | |
2118 | sprintf(srcdev + len - 4, "dev"); | |
2119 | ||
2120 | bdev->src = malloc(len + 5); | |
2121 | if (!bdev->src) | |
2122 | return -1; | |
2123 | ret = snprintf(bdev->src, len + 5, "loop:%s", srcdev); | |
2124 | if (ret < 0 || ret >= len + 5) | |
2125 | return -1; | |
2126 | ||
72e99249 | 2127 | sz = specs->fssize; |
eddaaafd SH |
2128 | if (!sz) |
2129 | sz = DEFAULT_FS_SIZE; | |
2130 | ||
72e99249 | 2131 | fstype = specs->fstype; |
eddaaafd SH |
2132 | if (!fstype) |
2133 | fstype = DEFAULT_FSTYPE; | |
2134 | ||
2135 | if (!(bdev->dest = strdup(dest))) | |
2136 | return -1; | |
2137 | ||
2138 | if (mkdir_p(bdev->dest, 0755) < 0) { | |
959aee9c | 2139 | ERROR("Error creating %s", bdev->dest); |
eddaaafd SH |
2140 | return -1; |
2141 | } | |
2142 | ||
2143 | return do_loop_create(srcdev, sz, fstype); | |
2144 | } | |
2145 | ||
2146 | static int loop_destroy(struct bdev *orig) | |
2147 | { | |
2148 | return unlink(orig->src + 5); | |
2149 | } | |
2150 | ||
74a3920a | 2151 | static const struct bdev_ops loop_ops = { |
eddaaafd SH |
2152 | .detect = &loop_detect, |
2153 | .mount = &loop_mount, | |
2154 | .umount = &loop_umount, | |
2155 | .clone_paths = &loop_clonepaths, | |
2156 | .destroy = &loop_destroy, | |
2157 | .create = &loop_create, | |
0a83cbbb | 2158 | .can_snapshot = false, |
cdd01be2 | 2159 | .can_backup = true, |
eddaaafd SH |
2160 | }; |
2161 | ||
9be53773 SH |
2162 | // |
2163 | // overlayfs ops | |
2164 | // | |
2165 | ||
/*
 * Tell whether @path designates an overlayfs rootfs.
 *
 * Only the "overlayfs:" prefix is checked (we take the caller's word for
 * it). Returns 1 on a match, 0 otherwise.
 */
static int overlayfs_detect(const char *path)
{
	return strncmp(path, "overlayfs:", 10) == 0 ? 1 : 0;
}
2172 | ||
38b34913 SH |
/* Filesystem type name handed to mount(); filled in lazily by overlayfs_mount(). */
static char *overlayfs_name;

/*
 * Pick the filesystem type name to use for overlay mounts.
 *
 * Returns "overlay" when /proc/filesystems contains the exact line
 * "nodev\toverlay\n", and "overlayfs" otherwise (including when
 * /proc/filesystems cannot be opened). The returned string is a literal
 * and must not be freed.
 */
static char *detect_overlayfs_name(void)
{
	char *fsname = "overlayfs";
	char *buf = NULL;
	size_t cap = 0;
	FILE *fp;

	fp = fopen("/proc/filesystems", "r");
	if (!fp)
		return fsname;

	for (;;) {
		if (getline(&buf, &cap, fp) == -1)
			break;
		if (strcmp(buf, "nodev\toverlay\n") != 0)
			continue;
		fsname = "overlay";
		break;
	}

	fclose(fp);
	free(buf);
	return fsname;
}
2194 | ||
9be53773 SH |
2195 | // |
2196 | // overlayfs mount and clone ops | |
2197 | // | |
60bf62d4 | 2198 | static int overlayfs_mount(struct bdev *bdev) |
9be53773 SH |
2199 | { |
2200 | char *options, *dup, *lower, *upper; | |
7fb1bef2 KY |
2201 | char *options_work, *work, *lastslash; |
2202 | int lastslashidx; | |
2203 | int len, len2; | |
a17b1e65 SG |
2204 | unsigned long mntflags; |
2205 | char *mntdata; | |
7fb1bef2 | 2206 | int ret, ret2; |
9be53773 SH |
2207 | |
2208 | if (strcmp(bdev->type, "overlayfs")) | |
2209 | return -22; | |
2210 | if (!bdev->src || !bdev->dest) | |
2211 | return -22; | |
2212 | ||
38b34913 SH |
2213 | if (!overlayfs_name) |
2214 | overlayfs_name = detect_overlayfs_name(); | |
2215 | ||
9be53773 SH |
2216 | // separately mount it first |
2217 | // mount -t overlayfs -oupperdir=${upper},lowerdir=${lower} lower dest | |
d74325c4 SG |
2218 | dup = alloca(strlen(bdev->src)+1); |
2219 | strcpy(dup, bdev->src); | |
46cd2845 | 2220 | if (!(lower = strchr(dup, ':'))) |
9be53773 | 2221 | return -22; |
46cd2845 | 2222 | if (!(upper = strchr(++lower, ':'))) |
9be53773 SH |
2223 | return -22; |
2224 | *upper = '\0'; | |
2225 | upper++; | |
2226 | ||
a93488df SH |
2227 | // if delta doesn't yet exist, create it |
2228 | if (mkdir_p(upper, 0755) < 0 && errno != EEXIST) | |
2229 | return -22; | |
2230 | ||
7fb1bef2 KY |
2231 | // overlayfs.v22 or higher needs workdir option |
2232 | // if upper is /var/lib/lxc/c2/delta0, | |
2233 | // then workdir is /var/lib/lxc/c2/olwork | |
2234 | lastslash = strrchr(upper, '/'); | |
2235 | if (!lastslash) | |
2236 | return -22; | |
2237 | lastslash++; | |
2238 | lastslashidx = lastslash - upper; | |
2239 | ||
2240 | work = alloca(lastslashidx + 7); | |
2241 | strncpy(work, upper, lastslashidx+7); | |
2242 | strcpy(work+lastslashidx, "olwork"); | |
2243 | ||
a17b1e65 SG |
2244 | if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) { |
2245 | free(mntdata); | |
2246 | return -22; | |
2247 | } | |
2248 | ||
44481bff SH |
2249 | if (mkdir_p(work, 0755) < 0 && errno != EEXIST) { |
2250 | free(mntdata); | |
2251 | return -22; | |
2252 | } | |
2253 | ||
9be53773 SH |
2254 | // TODO We should check whether bdev->src is a blockdev, and if so |
2255 | // but for now, only support overlays of a basic directory | |
2256 | ||
a17b1e65 SG |
2257 | if (mntdata) { |
2258 | len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=,") + strlen(mntdata) + 1; | |
2259 | options = alloca(len); | |
2260 | ret = snprintf(options, len, "upperdir=%s,lowerdir=%s,%s", upper, lower, mntdata); | |
7fb1bef2 KY |
2261 | |
2262 | len2 = strlen(lower) + strlen(upper) + strlen(work) | |
2263 | + strlen("upperdir=,lowerdir=,workdir=") + strlen(mntdata) + 1; | |
2264 | options_work = alloca(len2); | |
2265 | ret2 = snprintf(options, len2, "upperdir=%s,lowerdir=%s,workdir=%s,%s", | |
2266 | upper, lower, work, mntdata); | |
a17b1e65 SG |
2267 | } |
2268 | else { | |
2269 | len = strlen(lower) + strlen(upper) + strlen("upperdir=,lowerdir=") + 1; | |
2270 | options = alloca(len); | |
2271 | ret = snprintf(options, len, "upperdir=%s,lowerdir=%s", upper, lower); | |
7fb1bef2 KY |
2272 | |
2273 | len2 = strlen(lower) + strlen(upper) + strlen(work) | |
2274 | + strlen("upperdir=,lowerdir=,workdir=") + 1; | |
2275 | options_work = alloca(len2); | |
2276 | ret2 = snprintf(options_work, len2, "upperdir=%s,lowerdir=%s,workdir=%s", | |
2277 | upper, lower, work); | |
a17b1e65 | 2278 | } |
7fb1bef2 | 2279 | if (ret < 0 || ret >= len || ret2 < 0 || ret2 >= len2) { |
a17b1e65 | 2280 | free(mntdata); |
9be53773 | 2281 | return -1; |
a17b1e65 SG |
2282 | } |
2283 | ||
7fb1bef2 | 2284 | // mount without workdir option for overlayfs before v21 |
38b34913 | 2285 | ret = mount(lower, bdev->dest, overlayfs_name, MS_MGC_VAL | mntflags, options); |
7fb1bef2 KY |
2286 | if (ret < 0) { |
2287 | INFO("overlayfs: error mounting %s onto %s options %s. retry with workdir", | |
9be53773 | 2288 | lower, bdev->dest, options); |
7fb1bef2 KY |
2289 | |
2290 | // retry with workdir option for overlayfs v22 and higher | |
38b34913 | 2291 | ret = mount(lower, bdev->dest, overlayfs_name, MS_MGC_VAL | mntflags, options_work); |
7fb1bef2 KY |
2292 | if (ret < 0) |
2293 | SYSERROR("overlayfs: error mounting %s onto %s options %s", | |
2294 | lower, bdev->dest, options_work); | |
2295 | else | |
2296 | INFO("overlayfs: mounted %s onto %s options %s", | |
2297 | lower, bdev->dest, options_work); | |
2298 | } | |
9be53773 SH |
2299 | else |
2300 | INFO("overlayfs: mounted %s onto %s options %s", | |
2301 | lower, bdev->dest, options); | |
2302 | return ret; | |
2303 | } | |
2304 | ||
60bf62d4 | 2305 | static int overlayfs_umount(struct bdev *bdev) |
9be53773 SH |
2306 | { |
2307 | if (strcmp(bdev->type, "overlayfs")) | |
2308 | return -22; | |
2309 | if (!bdev->src || !bdev->dest) | |
2310 | return -22; | |
2311 | return umount(bdev->dest); | |
2312 | } | |
2313 | ||
25190e5b SH |
2314 | static int rsync_delta(struct rsync_data_char *data) |
2315 | { | |
2316 | if (setgid(0) < 0) { | |
2317 | ERROR("Failed to setgid to 0"); | |
2318 | return -1; | |
2319 | } | |
2320 | if (setgroups(0, NULL) < 0) | |
2321 | WARN("Failed to clear groups"); | |
2322 | if (setuid(0) < 0) { | |
2323 | ERROR("Failed to setuid to 0"); | |
2324 | return -1; | |
2325 | } | |
2326 | if (do_rsync(data->src, data->dest) < 0) { | |
2327 | ERROR("rsyncing %s to %s", data->src, data->dest); | |
2328 | return -1; | |
2329 | } | |
2330 | ||
2331 | return 0; | |
2332 | } | |
2333 | ||
/*
 * void*-taking adapter around rsync_delta(), so that it can be handed to
 * userns_exec_1() as a callback.
 */
static int rsync_delta_wrapper(void *data)
{
	return rsync_delta((struct rsync_data_char *)data);
}
2339 | ||
186bef00 SH |
2340 | static int ovl_rsync(struct ovl_rsync_data *data) |
2341 | { | |
270261b9 SH |
2342 | int ret; |
2343 | ||
186bef00 SH |
2344 | if (setgid(0) < 0) { |
2345 | ERROR("Failed to setgid to 0"); | |
2346 | return -1; | |
2347 | } | |
2348 | if (setgroups(0, NULL) < 0) | |
2349 | WARN("Failed to clear groups"); | |
2350 | if (setuid(0) < 0) { | |
2351 | ERROR("Failed to setuid to 0"); | |
2352 | return -1; | |
2353 | } | |
2354 | ||
2355 | if (unshare(CLONE_NEWNS) < 0) { | |
2356 | SYSERROR("Unable to unshare mounts ns"); | |
2357 | return -1; | |
2358 | } | |
2359 | if (detect_shared_rootfs()) { | |
2360 | if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) { | |
2361 | SYSERROR("Failed to make / rslave"); | |
2362 | ERROR("Continuing..."); | |
2363 | } | |
2364 | } | |
2365 | if (overlayfs_mount(data->orig) < 0) { | |
2366 | ERROR("Failed mounting original container fs"); | |
2367 | return -1; | |
2368 | } | |
2369 | if (overlayfs_mount(data->new) < 0) { | |
2370 | ERROR("Failed mounting new container fs"); | |
2371 | return -1; | |
2372 | } | |
270261b9 SH |
2373 | ret = do_rsync(data->orig->dest, data->new->dest); |
2374 | ||
2375 | overlayfs_umount(data->new); | |
2376 | overlayfs_umount(data->orig); | |
2377 | ||
2378 | if (ret < 0) { | |
186bef00 SH |
2379 | ERROR("rsyncing %s to %s", data->orig->dest, data->new->dest); |
2380 | return -1; | |
2381 | } | |
2382 | ||
2383 | return 0; | |
2384 | } | |
2385 | ||
/*
 * void*-taking adapter around ovl_rsync(), so that it can be handed to
 * userns_exec_1() as a callback.
 */
static int ovl_rsync_wrapper(void *data)
{
	return ovl_rsync((struct ovl_rsync_data *)data);
}
2391 | ||
2392 | static int ovl_do_rsync(struct bdev *orig, struct bdev *new, struct lxc_conf *conf) | |
2393 | { | |
2394 | int ret = -1; | |
2395 | struct ovl_rsync_data rdata; | |
2396 | ||
2397 | rdata.orig = orig; | |
2398 | rdata.new = new; | |
2399 | if (am_unpriv()) | |
2400 | ret = userns_exec_1(conf, ovl_rsync_wrapper, &rdata); | |
2401 | else | |
2402 | ret = ovl_rsync(&rdata); | |
2403 | if (ret) | |
2404 | ERROR("copying overlayfs delta"); | |
2405 | ||
2406 | return ret; | |
2407 | } | |
2408 | ||
9be53773 SH |
2409 | static int overlayfs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname, |
2410 | const char *cname, const char *oldpath, const char *lxcpath, int snap, | |
25190e5b | 2411 | uint64_t newsize, struct lxc_conf *conf) |
9be53773 SH |
2412 | { |
2413 | if (!snap) { | |
2414 | ERROR("overlayfs is only for snapshot clones"); | |
2415 | return -22; | |
2416 | } | |
2417 | ||
2418 | if (!orig->src || !orig->dest) | |
2419 | return -1; | |
2420 | ||
2421 | new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath); | |
2422 | if (!new->dest) | |
2423 | return -1; | |
2424 | if (mkdir_p(new->dest, 0755) < 0) | |
2425 | return -1; | |
2426 | ||
25190e5b SH |
2427 | if (am_unpriv() && chown_mapped_root(new->dest, conf) < 0) |
2428 | WARN("Failed to update ownership of %s", new->dest); | |
2429 | ||
9be53773 | 2430 | if (strcmp(orig->type, "dir") == 0) { |
edf77341 | 2431 | char *delta, *lastslash; |
7fb1bef2 | 2432 | char *work; |
edf77341 | 2433 | int ret, len, lastslashidx; |
5ca6c34b | 2434 | |
9be53773 SH |
2435 | // if we have /var/lib/lxc/c2/rootfs, then delta will be |
2436 | // /var/lib/lxc/c2/delta0 | |
edf77341 SH |
2437 | lastslash = strrchr(new->dest, '/'); |
2438 | if (!lastslash) | |
9be53773 | 2439 | return -22; |
edf77341 SH |
2440 | if (strlen(lastslash) < 7) |
2441 | return -22; | |
2442 | lastslash++; | |
2443 | lastslashidx = lastslash - new->dest; | |
2444 | ||
2445 | delta = malloc(lastslashidx + 7); | |
2446 | if (!delta) | |
2447 | return -1; | |
2448 | strncpy(delta, new->dest, lastslashidx+1); | |
2449 | strcpy(delta+lastslashidx, "delta0"); | |
9be53773 SH |
2450 | if ((ret = mkdir(delta, 0755)) < 0) { |
2451 | SYSERROR("error: mkdir %s", delta); | |
2452 | free(delta); | |
2453 | return -1; | |
2454 | } | |
25190e5b SH |
2455 | if (am_unpriv() && chown_mapped_root(delta, conf) < 0) |
2456 | WARN("Failed to update ownership of %s", delta); | |
9be53773 | 2457 | |
7fb1bef2 KY |
2458 | // make workdir for overlayfs.v22 or higher |
2459 | // workdir is /var/lib/lxc/c2/olwork | |
2460 | // it is used to prepare files before atomically swithing with destination, | |
2461 | // and needs to be on the same filesystem as upperdir, | |
2462 | // so it's OK for it to be empty. | |
2463 | work = malloc(lastslashidx + 7); | |
2464 | if (!work) | |
2465 | return -1; | |
2466 | strncpy(work, new->dest, lastslashidx+1); | |
2467 | strcpy(work+lastslashidx, "olwork"); | |
2468 | if (mkdir(work, 0755) < 0) { | |
2469 | SYSERROR("error: mkdir %s", work); | |
2470 | free(work); | |
2471 | return -1; | |
2472 | } | |
2473 | if (am_unpriv() && chown_mapped_root(work, conf) < 0) | |
2474 | WARN("Failed to update ownership of %s", work); | |
2475 | free(work); | |
2476 | ||
9be53773 SH |
2477 | // the src will be 'overlayfs:lowerdir:upperdir' |
2478 | len = strlen(delta) + strlen(orig->src) + 12; | |
2479 | new->src = malloc(len); | |
2480 | if (!new->src) { | |
2481 | free(delta); | |
2482 | return -ENOMEM; | |
2483 | } | |
2484 | ret = snprintf(new->src, len, "overlayfs:%s:%s", orig->src, delta); | |
2485 | free(delta); | |
2486 | if (ret < 0 || ret >= len) | |
2487 | return -ENOMEM; | |
9be53773 SH |
2488 | } else if (strcmp(orig->type, "overlayfs") == 0) { |
2489 | // What exactly do we want to do here? | |
2490 | // I think we want to use the original lowerdir, with a | |
2491 | // private delta which is originally rsynced from the | |
2492 | // original delta | |
7fb1bef2 KY |
2493 | char *osrc, *odelta, *nsrc, *ndelta, *work; |
2494 | char *lastslash; | |
2495 | int len, ret, lastslashidx; | |
9be53773 SH |
2496 | if (!(osrc = strdup(orig->src))) |
2497 | return -22; | |
46cd2845 PL |
2498 | nsrc = strchr(osrc, ':') + 1; |
2499 | if (nsrc != osrc + 10 || (odelta = strchr(nsrc, ':')) == NULL) { | |
9be53773 SH |
2500 | free(osrc); |
2501 | return -22; | |
2502 | } | |
2503 | *odelta = '\0'; | |
2504 | odelta++; | |
2505 | ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath); | |
2506 | if (!ndelta) { | |
2507 | free(osrc); | |
2508 | return -ENOMEM; | |
2509 | } | |
edf77341 | 2510 | if ((ret = mkdir(ndelta, 0755)) < 0 && errno != EEXIST) { |
25190e5b SH |
2511 | SYSERROR("error: mkdir %s", ndelta); |
2512 | free(osrc); | |
2513 | free(ndelta); | |
2514 | return -1; | |
2515 | } | |
2516 | if (am_unpriv() && chown_mapped_root(ndelta, conf) < 0) | |
2517 | WARN("Failed to update ownership of %s", ndelta); | |
7fb1bef2 KY |
2518 | |
2519 | // make workdir for overlayfs.v22 or higher | |
2520 | // for details, see above. | |
2521 | lastslash = strrchr(ndelta, '/'); | |
2522 | if (!lastslash) | |
2523 | return -1; | |
2524 | lastslash++; | |
2525 | lastslashidx = lastslash - ndelta; | |
2526 | ||
2527 | work = malloc(lastslashidx + 7); | |
2528 | if (!work) | |
2529 | return -1; | |
2530 | strncpy(work, ndelta, lastslashidx+1); | |
2531 | strcpy(work+lastslashidx, "olwork"); | |
2532 | if ((mkdir(work, 0755) < 0) && errno != EEXIST) { | |
2533 | SYSERROR("error: mkdir %s", work); | |
2534 | free(work); | |
2535 | return -1; | |
2536 | } | |
2537 | if (am_unpriv() && chown_mapped_root(work, conf) < 0) | |
2538 | WARN("Failed to update ownership of %s", work); | |
2539 | free(work); | |
2540 | ||
9be53773 SH |
2541 | len = strlen(nsrc) + strlen(ndelta) + 12; |
2542 | new->src = malloc(len); | |
2543 | if (!new->src) { | |
2544 | free(osrc); | |
2545 | free(ndelta); | |
2546 | return -ENOMEM; | |
2547 | } | |
2548 | ret = snprintf(new->src, len, "overlayfs:%s:%s", nsrc, ndelta); | |
2549 | free(osrc); | |
2550 | free(ndelta); | |
2551 | if (ret < 0 || ret >= len) | |
2552 | return -ENOMEM; | |
186bef00 SH |
2553 | |
2554 | return ovl_do_rsync(orig, new, conf); | |
375c2258 SH |
2555 | } else { |
2556 | ERROR("overlayfs clone of %s container is not yet supported", | |
2557 | orig->type); | |
2558 | // Note, supporting this will require overlayfs_mount supporting | |
2559 | // mounting of the underlay. No big deal, just needs to be done. | |
2560 | return -1; | |
9be53773 SH |
2561 | } |
2562 | ||
2563 | return 0; | |
2564 | } | |
2565 | ||
74a3920a | 2566 | static int overlayfs_destroy(struct bdev *orig) |
60bf62d4 SH |
2567 | { |
2568 | char *upper; | |
2569 | ||
2570 | if (strncmp(orig->src, "overlayfs:", 10) != 0) | |
2571 | return -22; | |
46cd2845 | 2572 | upper = strchr(orig->src + 10, ':'); |
60bf62d4 SH |
2573 | if (!upper) |
2574 | return -22; | |
2575 | upper++; | |
18aa217b | 2576 | return lxc_rmdir_onedev(upper, NULL); |
60bf62d4 SH |
2577 | } |
2578 | ||
1897e3bc SH |
2579 | /* |
2580 | * to say 'lxc-create -t ubuntu -n o1 -B overlayfs' means you want | |
2581 | * $lxcpath/$lxcname/rootfs to have the created container, while all | |
2582 | * changes after starting the container are written to | |
2583 | * $lxcpath/$lxcname/delta0 | |
2584 | */ | |
2585 | static int overlayfs_create(struct bdev *bdev, const char *dest, const char *n, | |
2586 | struct bdev_specs *specs) | |
2587 | { | |
2588 | char *delta; | |
2589 | int ret, len = strlen(dest), newlen; | |
2590 | ||
2591 | if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0) | |
2592 | return -1; | |
2593 | ||
2594 | if (!(bdev->dest = strdup(dest))) { | |
2595 | ERROR("Out of memory"); | |
2596 | return -1; | |
2597 | } | |
2598 | ||
d74325c4 SG |
2599 | delta = alloca(strlen(dest)+1); |
2600 | strcpy(delta, dest); | |
1897e3bc SH |
2601 | strcpy(delta+len-6, "delta0"); |
2602 | ||
2603 | if (mkdir_p(delta, 0755) < 0) { | |
959aee9c | 2604 | ERROR("Error creating %s", delta); |
1897e3bc SH |
2605 | return -1; |
2606 | } | |
2607 | ||
2608 | /* overlayfs:lower:upper */ | |
2609 | newlen = (2 * len) + strlen("overlayfs:") + 2; | |
2610 | bdev->src = malloc(newlen); | |
2611 | if (!bdev->src) { | |
2612 | ERROR("Out of memory"); | |
2613 | return -1; | |
2614 | } | |
2615 | ret = snprintf(bdev->src, newlen, "overlayfs:%s:%s", dest, delta); | |
2616 | if (ret < 0 || ret >= newlen) | |
2617 | return -1; | |
2618 | ||
2619 | if (mkdir_p(bdev->dest, 0755) < 0) { | |
959aee9c | 2620 | ERROR("Error creating %s", bdev->dest); |
1897e3bc SH |
2621 | return -1; |
2622 | } | |
2623 | ||
2624 | return 0; | |
2625 | } | |
2626 | ||
74a3920a | 2627 | static const struct bdev_ops overlayfs_ops = { |
9be53773 SH |
2628 | .detect = &overlayfs_detect, |
2629 | .mount = &overlayfs_mount, | |
2630 | .umount = &overlayfs_umount, | |
2631 | .clone_paths = &overlayfs_clonepaths, | |
60bf62d4 | 2632 | .destroy = &overlayfs_destroy, |
1897e3bc | 2633 | .create = &overlayfs_create, |
0a83cbbb | 2634 | .can_snapshot = true, |
cdd01be2 | 2635 | .can_backup = true, |
9be53773 SH |
2636 | }; |
2637 | ||
1f92162d SG |
2638 | // |
2639 | // aufs ops | |
2640 | // | |
2641 | ||
/*
 * Tell whether @path designates an aufs rootfs.
 *
 * Only the "aufs:" prefix is checked (we take the caller's word for it).
 * Returns 1 on a match, 0 otherwise.
 */
static int aufs_detect(const char *path)
{
	return strncmp(path, "aufs:", 5) == 0 ? 1 : 0;
}
2648 | ||
2649 | // | |
2650 | // aufs mount and clone ops | |
2651 | // | |
2652 | static int aufs_mount(struct bdev *bdev) | |
2653 | { | |
31a882ef | 2654 | char *options, *dup, *lower, *upper; |
1f92162d SG |
2655 | int len; |
2656 | unsigned long mntflags; | |
2657 | char *mntdata; | |
2658 | int ret; | |
31a882ef | 2659 | const char *xinopath = "/dev/shm/aufs.xino"; |
1f92162d SG |
2660 | |
2661 | if (strcmp(bdev->type, "aufs")) | |
2662 | return -22; | |
2663 | if (!bdev->src || !bdev->dest) | |
2664 | return -22; | |
2665 | ||
2666 | // separately mount it first | |
2667 | // mount -t aufs -obr=${upper}=rw:${lower}=ro lower dest | |
2668 | dup = alloca(strlen(bdev->src)+1); | |
2669 | strcpy(dup, bdev->src); | |
46cd2845 | 2670 | if (!(lower = strchr(dup, ':'))) |
1f92162d | 2671 | return -22; |
46cd2845 | 2672 | if (!(upper = strchr(++lower, ':'))) |
1f92162d SG |
2673 | return -22; |
2674 | *upper = '\0'; | |
2675 | upper++; | |
2676 | ||
2677 | if (parse_mntopts(bdev->mntopts, &mntflags, &mntdata) < 0) { | |
2678 | free(mntdata); | |
2679 | return -22; | |
2680 | } | |
2681 | ||
2682 | // TODO We should check whether bdev->src is a blockdev, and if so | |
2683 | // but for now, only support aufs of a basic directory | |
2684 | ||
9009a728 | 2685 | // AUFS does not work on top of certain filesystems like (XFS or Btrfs) |
31a882ef KY |
2686 | // so add xino=/dev/shm/aufs.xino parameter to mount options. |
2687 | // The same xino option can be specified to multiple aufs mounts, and | |
2688 | // a xino file is not shared among multiple aufs mounts. | |
9009a728 ÇO |
2689 | // |
2690 | // see http://www.mail-archive.com/aufs-users@lists.sourceforge.net/msg02587.html | |
31a882ef | 2691 | // http://www.mail-archive.com/aufs-users@lists.sourceforge.net/msg05126.html |
1f92162d | 2692 | if (mntdata) { |
31a882ef | 2693 | len = strlen(lower) + strlen(upper) + strlen(xinopath) + strlen("br==rw:=ro,,xino=") + strlen(mntdata) + 1; |
1f92162d | 2694 | options = alloca(len); |
31a882ef | 2695 | ret = snprintf(options, len, "br=%s=rw:%s=ro,%s,xino=%s", upper, lower, mntdata, xinopath); |
1f92162d SG |
2696 | } |
2697 | else { | |
31a882ef | 2698 | len = strlen(lower) + strlen(upper) + strlen(xinopath) + strlen("br==rw:=ro,xino=") + 1; |
1f92162d | 2699 | options = alloca(len); |
31a882ef | 2700 | ret = snprintf(options, len, "br=%s=rw:%s=ro,xino=%s", upper, lower, xinopath); |
1f92162d | 2701 | } |
9009a728 | 2702 | |
1f92162d SG |
2703 | if (ret < 0 || ret >= len) { |
2704 | free(mntdata); | |
2705 | return -1; | |
2706 | } | |
2707 | ||
2708 | ret = mount(lower, bdev->dest, "aufs", MS_MGC_VAL | mntflags, options); | |
2709 | if (ret < 0) | |
2710 | SYSERROR("aufs: error mounting %s onto %s options %s", | |
2711 | lower, bdev->dest, options); | |
2712 | else | |
2713 | INFO("aufs: mounted %s onto %s options %s", | |
2714 | lower, bdev->dest, options); | |
2715 | return ret; | |
2716 | } | |
2717 | ||
2718 | static int aufs_umount(struct bdev *bdev) | |
2719 | { | |
2720 | if (strcmp(bdev->type, "aufs")) | |
2721 | return -22; | |
2722 | if (!bdev->src || !bdev->dest) | |
2723 | return -22; | |
2724 | return umount(bdev->dest); | |
2725 | } | |
2726 | ||
2727 | static int aufs_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname, | |
2728 | const char *cname, const char *oldpath, const char *lxcpath, int snap, | |
25190e5b | 2729 | uint64_t newsize, struct lxc_conf *conf) |
1f92162d SG |
2730 | { |
2731 | if (!snap) { | |
2732 | ERROR("aufs is only for snapshot clones"); | |
2733 | return -22; | |
2734 | } | |
2735 | ||
2736 | if (!orig->src || !orig->dest) | |
2737 | return -1; | |
2738 | ||
2739 | new->dest = dir_new_path(orig->dest, oldname, cname, oldpath, lxcpath); | |
2740 | if (!new->dest) | |
2741 | return -1; | |
2742 | if (mkdir_p(new->dest, 0755) < 0) | |
2743 | return -1; | |
2744 | ||
31a882ef KY |
2745 | if (am_unpriv() && chown_mapped_root(new->dest, conf) < 0) |
2746 | WARN("Failed to update ownership of %s", new->dest); | |
2747 | ||
1f92162d | 2748 | if (strcmp(orig->type, "dir") == 0) { |
edf77341 SH |
2749 | char *delta, *lastslash; |
2750 | int ret, len, lastslashidx; | |
1f92162d SG |
2751 | |
2752 | // if we have /var/lib/lxc/c2/rootfs, then delta will be | |
2753 | // /var/lib/lxc/c2/delta0 | |
edf77341 SH |
2754 | lastslash = strrchr(new->dest, '/'); |
2755 | if (!lastslash) | |
1f92162d | 2756 | return -22; |
edf77341 SH |
2757 | if (strlen(lastslash) < 7) |
2758 | return -22; | |
2759 | lastslash++; | |
2760 | lastslashidx = lastslash - new->dest; | |
2761 | ||
2762 | delta = malloc(lastslashidx + 7); | |
2763 | if (!delta) | |
2764 | return -1; | |
2765 | strncpy(delta, new->dest, lastslashidx+1); | |
2766 | strcpy(delta+lastslashidx, "delta0"); | |
1f92162d SG |
2767 | if ((ret = mkdir(delta, 0755)) < 0) { |
2768 | SYSERROR("error: mkdir %s", delta); | |
2769 | free(delta); | |
2770 | return -1; | |
2771 | } | |
31a882ef KY |
2772 | if (am_unpriv() && chown_mapped_root(delta, conf) < 0) |
2773 | WARN("Failed to update ownership of %s", delta); | |
1f92162d SG |
2774 | |
2775 | // the src will be 'aufs:lowerdir:upperdir' | |
2776 | len = strlen(delta) + strlen(orig->src) + 12; | |
2777 | new->src = malloc(len); | |
2778 | if (!new->src) { | |
2779 | free(delta); | |
2780 | return -ENOMEM; | |
2781 | } | |
2782 | ret = snprintf(new->src, len, "aufs:%s:%s", orig->src, delta); | |
2783 | free(delta); | |
2784 | if (ret < 0 || ret >= len) | |
2785 | return -ENOMEM; | |
2786 | } else if (strcmp(orig->type, "aufs") == 0) { | |
2787 | // What exactly do we want to do here? | |
2788 | // I think we want to use the original lowerdir, with a | |
2789 | // private delta which is originally rsynced from the | |
2790 | // original delta | |
2791 | char *osrc, *odelta, *nsrc, *ndelta; | |
2792 | int len, ret; | |
2793 | if (!(osrc = strdup(orig->src))) | |
2794 | return -22; | |
46cd2845 PL |
2795 | nsrc = strchr(osrc, ':') + 1; |
2796 | if (nsrc != osrc + 5 || (odelta = strchr(nsrc, ':')) == NULL) { | |
1f92162d SG |
2797 | free(osrc); |
2798 | return -22; | |
2799 | } | |
2800 | *odelta = '\0'; | |
2801 | odelta++; | |
2802 | ndelta = dir_new_path(odelta, oldname, cname, oldpath, lxcpath); | |
2803 | if (!ndelta) { | |
2804 | free(osrc); | |
2805 | return -ENOMEM; | |
2806 | } | |
31a882ef KY |
2807 | if ((ret = mkdir(ndelta, 0755)) < 0 && errno != EEXIST) { |
2808 | SYSERROR("error: mkdir %s", ndelta); | |
2809 | free(osrc); | |
2810 | free(ndelta); | |
2811 | return -1; | |
2812 | } | |
2813 | if (am_unpriv() && chown_mapped_root(ndelta, conf) < 0) | |
2814 | WARN("Failed to update ownership of %s", ndelta); | |
2815 | ||
2816 | struct rsync_data_char rdata; | |
2817 | rdata.src = odelta; | |
2818 | rdata.dest = ndelta; | |
2819 | if (am_unpriv()) | |
2820 | ret = userns_exec_1(conf, rsync_delta_wrapper, &rdata); | |
2821 | else | |
2822 | ret = rsync_delta(&rdata); | |
2823 | if (ret) { | |
1f92162d SG |
2824 | free(osrc); |
2825 | free(ndelta); | |
2826 | ERROR("copying aufs delta"); | |
2827 | return -1; | |
2828 | } | |
2829 | len = strlen(nsrc) + strlen(ndelta) + 12; | |
2830 | new->src = malloc(len); | |
2831 | if (!new->src) { | |
2832 | free(osrc); | |
2833 | free(ndelta); | |
2834 | return -ENOMEM; | |
2835 | } | |
2836 | ret = snprintf(new->src, len, "aufs:%s:%s", nsrc, ndelta); | |
2837 | free(osrc); | |
2838 | free(ndelta); | |
2839 | if (ret < 0 || ret >= len) | |
2840 | return -ENOMEM; | |
2841 | } else { | |
2842 | ERROR("aufs clone of %s container is not yet supported", | |
2843 | orig->type); | |
2844 | // Note, supporting this will require aufs_mount supporting | |
2845 | // mounting of the underlay. No big deal, just needs to be done. | |
2846 | return -1; | |
2847 | } | |
2848 | ||
2849 | return 0; | |
2850 | } | |
2851 | ||
2852 | static int aufs_destroy(struct bdev *orig) | |
2853 | { | |
2854 | char *upper; | |
2855 | ||
2856 | if (strncmp(orig->src, "aufs:", 5) != 0) | |
2857 | return -22; | |
46cd2845 | 2858 | upper = strchr(orig->src + 5, ':'); |
1f92162d SG |
2859 | if (!upper) |
2860 | return -22; | |
2861 | upper++; | |
18aa217b | 2862 | return lxc_rmdir_onedev(upper, NULL); |
1f92162d SG |
2863 | } |
2864 | ||
2865 | /* | |
2866 | * to say 'lxc-create -t ubuntu -n o1 -B aufs' means you want | |
2867 | * $lxcpath/$lxcname/rootfs to have the created container, while all | |
2868 | * changes after starting the container are written to | |
2869 | * $lxcpath/$lxcname/delta0 | |
2870 | */ | |
2871 | static int aufs_create(struct bdev *bdev, const char *dest, const char *n, | |
2872 | struct bdev_specs *specs) | |
2873 | { | |
2874 | char *delta; | |
2875 | int ret, len = strlen(dest), newlen; | |
2876 | ||
2877 | if (len < 8 || strcmp(dest+len-7, "/rootfs") != 0) | |
2878 | return -1; | |
2879 | ||
2880 | if (!(bdev->dest = strdup(dest))) { | |
2881 | ERROR("Out of memory"); | |
2882 | return -1; | |
2883 | } | |
2884 | ||
2885 | delta = alloca(strlen(dest)+1); | |
2886 | strcpy(delta, dest); | |
2887 | strcpy(delta+len-6, "delta0"); | |
2888 | ||
2889 | if (mkdir_p(delta, 0755) < 0) { | |
2890 | ERROR("Error creating %s", delta); | |
2891 | return -1; | |
2892 | } | |
2893 | ||
2894 | /* aufs:lower:upper */ | |
2895 | newlen = (2 * len) + strlen("aufs:") + 2; | |
2896 | bdev->src = malloc(newlen); | |
2897 | if (!bdev->src) { | |
2898 | ERROR("Out of memory"); | |
2899 | return -1; | |
2900 | } | |
2901 | ret = snprintf(bdev->src, newlen, "aufs:%s:%s", dest, delta); | |
2902 | if (ret < 0 || ret >= newlen) | |
2903 | return -1; | |
2904 | ||
2905 | if (mkdir_p(bdev->dest, 0755) < 0) { | |
2906 | ERROR("Error creating %s", bdev->dest); | |
2907 | return -1; | |
2908 | } | |
2909 | ||
2910 | return 0; | |
2911 | } | |
2912 | ||
2913 | static const struct bdev_ops aufs_ops = { | |
2914 | .detect = &aufs_detect, | |
2915 | .mount = &aufs_mount, | |
2916 | .umount = &aufs_umount, | |
2917 | .clone_paths = &aufs_clonepaths, | |
2918 | .destroy = &aufs_destroy, | |
2919 | .create = &aufs_create, | |
2920 | .can_snapshot = true, | |
cdd01be2 | 2921 | .can_backup = true, |
1f92162d SG |
2922 | }; |
2923 | ||
76a26f55 SH |
2924 | // |
2925 | // nbd dev ops | |
2926 | // | |
2927 | ||
/*
 * Detect op for the nbd backend: returns 1 when @path starts with "nbd:",
 * 0 otherwise.
 */
static int nbd_detect(const char *path)
{
	return strncmp(path, "nbd:", 4) == 0 ? 1 : 0;
}
2934 | ||
/* Arguments handed to do_attach_nbd() through its void pointer parameter. */
struct nbd_attach_data {
	const char *nbd;	/* nbd device node given to "qemu-nbd -c" */
	const char *path;	/* image path given to qemu-nbd as the file to export */
};
2939 | ||
/*
 * Disconnect the nbd device @path by running "qemu-nbd -d <path>" in a
 * forked child and waiting for it to finish.
 *
 * Nothing is returned; fork, exec and child failures are only logged.
 */
static void nbd_detach(const char *path)
{
	int ret;
	pid_t pid = fork();

	if (pid < 0) {
		SYSERROR("Error forking to detach nbd");
		return;
	}
	if (pid) {
		ret = wait_for_pid(pid);
		if (ret < 0)
			ERROR("nbd disconnect returned an error");
		return;
	}

	/* child: the variadic sentinel must be a null *pointer*, hence the cast */
	execlp("qemu-nbd", "qemu-nbd", "-d", path, (char *)NULL);
	SYSERROR("Error executing qemu-nbd");
	/*
	 * Use _exit() so the forked child does not run the parent's atexit
	 * handlers or flush stdio buffers it inherited.
	 */
	_exit(1);
}
2959 | ||
2960 | static int do_attach_nbd(void *d) | |
2961 | { | |
2962 | struct nbd_attach_data *data = d; | |
2963 | const char *nbd, *path; | |
2964 | pid_t pid; | |
2965 | sigset_t mask; | |
2966 | int sfd; | |
2967 | ssize_t s; | |
2968 | struct signalfd_siginfo fdsi; | |
2969 | ||
2970 | sigemptyset(&mask); | |
2971 | sigaddset(&mask, SIGHUP); | |
2972 | sigaddset(&mask, SIGCHLD); | |
2973 | ||
2974 | nbd = data->nbd; | |
2975 | path = data->path; | |
2976 | ||
2977 | if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) { | |
2978 | SYSERROR("Error blocking signals for nbd watcher"); | |
2979 | exit(1); | |
2980 | } | |
2981 | ||
2982 | sfd = signalfd(-1, &mask, 0); | |
2983 | if (sfd == -1) { | |
2984 | SYSERROR("Error opening signalfd for nbd task"); | |
2985 | exit(1); | |
2986 | } | |
2987 | ||
2988 | if (prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0) < 0) | |
2989 | SYSERROR("Error setting parent death signal for nbd watcher"); | |
2990 | ||
2991 | pid = fork(); | |
2992 | if (pid) { | |
2993 | for (;;) { | |
2994 | s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo)); | |
2995 | if (s != sizeof(struct signalfd_siginfo)) | |
2996 | SYSERROR("Error reading from signalfd"); | |
2997 | ||
2998 | if (fdsi.ssi_signo == SIGHUP) { | |
2999 | /* container has exited */ | |
3000 | nbd_detach(nbd); | |
3001 | exit(0); | |
3002 | } else if (fdsi.ssi_signo == SIGCHLD) { | |
3003 | int status; | |
3abd3e54 SH |
3004 | /* If qemu-nbd fails, or is killed by a signal, |
3005 | * then exit */ | |
3006 | while (waitpid(-1, &status, WNOHANG) > 0) { | |
3007 | if ((WIFEXITED(status) && WEXITSTATUS(status) != 0) || | |
3008 | WIFSIGNALED(status)) { | |
3009 | nbd_detach(nbd); | |
3010 | exit(1); | |
3011 | } | |
3012 | } | |
76a26f55 SH |
3013 | } |
3014 | } | |
3015 | } | |
3016 | ||
3017 | close(sfd); | |
3018 | if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1) | |
3019 | WARN("Warning: unblocking signals for nbd watcher"); | |
3020 | ||
3021 | execlp("qemu-nbd", "qemu-nbd", "-c", nbd, path, NULL); | |
3022 | SYSERROR("Error executing qemu-nbd"); | |
3023 | exit(1); | |
3024 | } | |
3025 | ||
3026 | static bool clone_attach_nbd(const char *nbd, const char *path) | |
3027 | { | |
3028 | pid_t pid; | |
3029 | struct nbd_attach_data data; | |
3030 | ||
3031 | data.nbd = nbd; | |
3032 | data.path = path; | |
3033 | ||
3034 | pid = lxc_clone(do_attach_nbd, &data, CLONE_NEWPID); | |
3035 | if (pid < 0) | |
3036 | return false; | |
3037 | return true; | |
3038 | } | |
3039 | ||
/*
 * Report whether nbd device number @idx is in use, judged by the existence
 * of /sys/block/nbd<idx>/pid.  If the path cannot be formatted the device
 * is reported as busy.
 */
static bool nbd_busy(int idx)
{
	char pidfile[100];
	int n;

	n = snprintf(pidfile, sizeof(pidfile), "/sys/block/nbd%d/pid", idx);
	if (n < 0 || n >= (int)sizeof(pidfile))
		return true;

	return file_exists(pidfile);
}
3050 | ||
3051 | static bool attach_nbd(char *src, struct lxc_conf *conf) | |
3052 | { | |
3053 | char *orig = alloca(strlen(src)+1), *p, path[50]; | |
3054 | int i = 0; | |
3055 | ||
3056 | strcpy(orig, src); | |
3057 | /* if path is followed by a partition, drop that for now */ | |
3058 | p = strchr(orig, ':'); | |
3059 | if (p) | |
3060 | *p = '\0'; | |
3061 | while (1) { | |
3062 | sprintf(path, "/dev/nbd%d", i); | |
3063 | if (!file_exists(path)) | |
3064 | return false; | |
3065 | if (nbd_busy(i)) { | |
3066 | i++; | |
3067 | continue; | |
3068 | } | |
3069 | if (!clone_attach_nbd(path, orig)) | |
3070 | return false; | |
3071 | conf->nbd_idx = i; | |
3072 | return true; | |
3073 | } | |
3074 | } | |
3075 | ||
/* Returns true when @path carries the "nbd:" prefix, false otherwise. */
static bool requires_nbd(const char *path)
{
	return strncmp(path, "nbd:", 4) == 0;
}
3082 | ||
3083 | /* | |
3084 | * attach_block_device returns true if all went well, | |
3085 | * meaning either a block device was attached or was not | |
3086 | * needed. It returns false if something went wrong and | |
ec64264d | 3087 | * container startup should be stopped. |
76a26f55 SH |
3088 | */ |
3089 | bool attach_block_device(struct lxc_conf *conf) | |
3090 | { | |
3091 | char *path; | |
3092 | ||
3093 | if (!conf->rootfs.path) | |
3094 | return true; | |
3095 | path = conf->rootfs.path; | |
3096 | if (!requires_nbd(path)) | |
3097 | return true; | |
3098 | path = strchr(path, ':'); | |
3099 | if (!path) | |
3100 | return false; | |
3101 | path++; | |
3102 | if (!attach_nbd(path, conf)) | |
3103 | return false; | |
3104 | return true; | |
3105 | } | |
3106 | ||
/*
 * Detach /dev/nbd<idx>.  Silently does nothing if the device path cannot
 * be formatted.
 */
void detach_nbd_idx(int idx)
{
	char devpath[50];
	int n = snprintf(devpath, sizeof(devpath), "/dev/nbd%d", idx);

	if (n >= 0 && n < (int)sizeof(devpath))
		nbd_detach(devpath);
}
3118 | ||
3119 | void detach_block_device(struct lxc_conf *conf) | |
3120 | { | |
3121 | if (conf->nbd_idx != -1) | |
3122 | detach_nbd_idx(conf->nbd_idx); | |
3123 | } | |
3124 | ||
/*
 * Pick the partition number off the end of an "nbd:file:p" description.
 * Only the single character after the second ':' is examined.  Returns
 * 1-9 for the partition id, or 0 when there is no second ':' or the
 * character following it is not in '1'..'9'.
 */
static int nbd_get_partition(const char *src)
{
	const char *sep;
	char digit;

	sep = strchr(src, ':');
	if (sep == NULL)
		return 0;

	sep = strchr(sep + 1, ':');
	if (sep == NULL)
		return 0;

	digit = sep[1];
	return (digit >= '1' && digit <= '9') ? digit - '0' : 0;
}
3143 | ||
/*
 * Poll for @path to exist, checking up to five times with a one second
 * sleep after each miss.  Returns true as soon as it is found; logs an
 * error and returns false otherwise.
 */
static bool wait_for_partition(const char *path)
{
	int tries;

	for (tries = 0; tries < 5; tries++) {
		if (file_exists(path))
			return true;
		sleep(1);
	}

	ERROR("Device %s did not show up after 5 seconds", path);
	return false;
}
3156 | ||
76a26f55 SH |
3157 | static int nbd_mount(struct bdev *bdev) |
3158 | { | |
3159 | int ret = -1, partition; | |
3160 | char path[50]; | |
3161 | ||
3162 | if (strcmp(bdev->type, "nbd")) | |
3163 | return -22; | |
3164 | if (!bdev->src || !bdev->dest) | |
3165 | return -22; | |
3166 | ||
3167 | /* nbd_idx should have been copied by bdev_init from the lxc_conf */ | |
3168 | if (bdev->nbd_idx < 0) | |
3169 | return -22; | |
3170 | partition = nbd_get_partition(bdev->src); | |
3171 | if (partition) | |
3172 | ret = snprintf(path, 50, "/dev/nbd%dp%d", bdev->nbd_idx, | |
3173 | partition); | |
3174 | else | |
3175 | ret = snprintf(path, 50, "/dev/nbd%d", bdev->nbd_idx); | |
3176 | if (ret < 0 || ret >= 50) { | |
3177 | ERROR("Error setting up nbd device path"); | |
3178 | return ret; | |
3179 | } | |
bfd0b144 SH |
3180 | |
3181 | /* It might take awhile for the partition files to show up */ | |
3182 | if (partition) { | |
3183 | if (!wait_for_partition(path)) | |
3184 | return -2; | |
3185 | } | |
76a26f55 SH |
3186 | ret = mount_unknown_fs(path, bdev->dest, bdev->mntopts); |
3187 | if (ret < 0) | |
3188 | ERROR("Error mounting %s", bdev->src); | |
3189 | ||
3190 | return ret; | |
3191 | } | |
3192 | ||
/* Creating an nbd backing store is not implemented; always -ENOSYS. */
static int nbd_create(struct bdev *bdev, const char *dest, const char *n,
		      struct bdev_specs *specs)
{
	(void)bdev;
	(void)dest;
	(void)n;
	(void)specs;

	return -ENOSYS;
}
3198 | ||
/* Cloning an nbd backing store is not implemented; always -ENOSYS. */
static int nbd_clonepaths(struct bdev *orig, struct bdev *new, const char *oldname,
		const char *cname, const char *oldpath, const char *lxcpath, int snap,
		uint64_t newsize, struct lxc_conf *conf)
{
	(void)orig;
	(void)new;
	(void)oldname;
	(void)cname;
	(void)oldpath;
	(void)lxcpath;
	(void)snap;
	(void)newsize;
	(void)conf;

	return -ENOSYS;
}
3205 | ||
/* Destroying an nbd backing store is not implemented; always -ENOSYS. */
static int nbd_destroy(struct bdev *orig)
{
	(void)orig;

	return -ENOSYS;
}
3210 | ||
3211 | static int nbd_umount(struct bdev *bdev) | |
3212 | { | |
3213 | int ret; | |
3214 | ||
3215 | if (strcmp(bdev->type, "nbd")) | |
3216 | return -22; | |
3217 | if (!bdev->src || !bdev->dest) | |
3218 | return -22; | |
3219 | ret = umount(bdev->dest); | |
3220 | return ret; | |
3221 | } | |
3222 | ||
3223 | static const struct bdev_ops nbd_ops = { | |
3224 | .detect = &nbd_detect, | |
3225 | .mount = &nbd_mount, | |
3226 | .umount = &nbd_umount, | |
3227 | .clone_paths = &nbd_clonepaths, | |
3228 | .destroy = &nbd_destroy, | |
3229 | .create = &nbd_create, | |
3230 | .can_snapshot = true, | |
cdd01be2 | 3231 | .can_backup = false, |
76a26f55 | 3232 | }; |
1f92162d | 3233 | |
74a3920a | 3234 | static const struct bdev_type bdevs[] = { |
3baa76fe | 3235 | {.name = "zfs", .ops = &zfs_ops,}, |
9be53773 SH |
3236 | {.name = "lvm", .ops = &lvm_ops,}, |
3237 | {.name = "btrfs", .ops = &btrfs_ops,}, | |
3238 | {.name = "dir", .ops = &dir_ops,}, | |
1f92162d | 3239 | {.name = "aufs", .ops = &aufs_ops,}, |
9be53773 | 3240 | {.name = "overlayfs", .ops = &overlayfs_ops,}, |
eddaaafd | 3241 | {.name = "loop", .ops = &loop_ops,}, |
76a26f55 | 3242 | {.name = "nbd", .ops = &nbd_ops,}, |
9be53773 SH |
3243 | }; |
3244 | ||
3245 | static const size_t numbdevs = sizeof(bdevs) / sizeof(struct bdev_type); | |
3246 | ||
3247 | void bdev_put(struct bdev *bdev) | |
3248 | { | |
f10fad2f ME |
3249 | free(bdev->mntopts); |
3250 | free(bdev->src); | |
3251 | free(bdev->dest); | |
9be53773 SH |
3252 | free(bdev); |
3253 | } | |
3254 | ||
3255 | struct bdev *bdev_get(const char *type) | |
3256 | { | |
3257 | int i; | |
3258 | struct bdev *bdev; | |
3259 | ||
3260 | for (i=0; i<numbdevs; i++) { | |
3261 | if (strcmp(bdevs[i].name, type) == 0) | |
3262 | break; | |
3263 | } | |
3264 | if (i == numbdevs) | |
3265 | return NULL; | |
3266 | bdev = malloc(sizeof(struct bdev)); | |
3267 | if (!bdev) | |
3268 | return NULL; | |
3269 | memset(bdev, 0, sizeof(struct bdev)); | |
3270 | bdev->ops = bdevs[i].ops; | |
3271 | bdev->type = bdevs[i].name; | |
3272 | return bdev; | |
3273 | } | |
3274 | ||
35120d9c | 3275 | static const struct bdev_type *bdev_query(const char *src) |
9be53773 SH |
3276 | { |
3277 | int i; | |
9be53773 SH |
3278 | for (i=0; i<numbdevs; i++) { |
3279 | int r; | |
3280 | r = bdevs[i].ops->detect(src); | |
3281 | if (r) | |
3282 | break; | |
3283 | } | |
eddaaafd | 3284 | |
9be53773 SH |
3285 | if (i == numbdevs) |
3286 | return NULL; | |
35120d9c SH |
3287 | return &bdevs[i]; |
3288 | } | |
3289 | ||
3290 | struct bdev *bdev_init(struct lxc_conf *conf, const char *src, const char *dst, const char *mntopts) | |
3291 | { | |
3292 | struct bdev *bdev; | |
3293 | const struct bdev_type *q; | |
3294 | ||
cdd01be2 SH |
3295 | if (!src) |
3296 | src = conf->rootfs.path; | |
3297 | ||
3298 | if (!src) | |
3299 | return NULL; | |
3300 | ||
35120d9c SH |
3301 | q = bdev_query(src); |
3302 | if (!q) | |
3303 | return NULL; | |
3304 | ||
9be53773 SH |
3305 | bdev = malloc(sizeof(struct bdev)); |
3306 | if (!bdev) | |
3307 | return NULL; | |
3308 | memset(bdev, 0, sizeof(struct bdev)); | |
35120d9c SH |
3309 | bdev->ops = q->ops; |
3310 | bdev->type = q->name; | |
a17b1e65 SG |
3311 | if (mntopts) |
3312 | bdev->mntopts = strdup(mntopts); | |
9be53773 SH |
3313 | if (src) |
3314 | bdev->src = strdup(src); | |
3315 | if (dst) | |
3316 | bdev->dest = strdup(dst); | |
76a26f55 SH |
3317 | if (strcmp(bdev->type, "nbd") == 0) |
3318 | bdev->nbd_idx = conf->nbd_idx; | |
9be53773 SH |
3319 | |
3320 | return bdev; | |
3321 | } | |
3322 | ||
/* Argument bundle for rsync_rootfs(): the two backing stores involved. */
struct rsync_data {
	struct bdev *orig;	/* store that is mounted and copied from */
	struct bdev *new;	/* store that is mounted and copied into */
};
3327 | ||
3328 | static int rsync_rootfs(struct rsync_data *data) | |
3329 | { | |
3330 | struct bdev *orig = data->orig, | |
3331 | *new = data->new; | |
3332 | ||
3333 | if (unshare(CLONE_NEWNS) < 0) { | |
3334 | SYSERROR("unshare CLONE_NEWNS"); | |
3335 | return -1; | |
3336 | } | |
c597baa8 DE |
3337 | if (detect_shared_rootfs()) { |
3338 | if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) { | |
2c6f3fc9 | 3339 | SYSERROR("Failed to make / rslave"); |
c597baa8 DE |
3340 | ERROR("Continuing..."); |
3341 | } | |
3342 | } | |
1354955b SH |
3343 | |
3344 | // If not a snapshot, copy the fs. | |
3345 | if (orig->ops->mount(orig) < 0) { | |
959aee9c | 3346 | ERROR("failed mounting %s onto %s", orig->src, orig->dest); |
1354955b SH |
3347 | return -1; |
3348 | } | |
3349 | if (new->ops->mount(new) < 0) { | |
959aee9c | 3350 | ERROR("failed mounting %s onto %s", new->src, new->dest); |
1354955b SH |
3351 | return -1; |
3352 | } | |
3353 | if (setgid(0) < 0) { | |
3354 | ERROR("Failed to setgid to 0"); | |
3355 | return -1; | |
3356 | } | |
c476bdce SH |
3357 | if (setgroups(0, NULL) < 0) |
3358 | WARN("Failed to clear groups"); | |
1354955b SH |
3359 | if (setuid(0) < 0) { |
3360 | ERROR("Failed to setuid to 0"); | |
3361 | return -1; | |
3362 | } | |
3363 | if (do_rsync(orig->dest, new->dest) < 0) { | |
959aee9c | 3364 | ERROR("rsyncing %s to %s", orig->src, new->src); |
1354955b SH |
3365 | return -1; |
3366 | } | |
3367 | ||
3368 | return 0; | |
3369 | } | |
3370 | ||
/* Adapter giving rsync_rootfs() the int (*)(void *) callback signature. */
static int rsync_rootfs_wrapper(void *data)
{
	return rsync_rootfs((struct rsync_data *)data);
}
8c39f7a4 | 3376 | |
76a26f55 | 3377 | bool bdev_is_dir(struct lxc_conf *conf, const char *path) |
8c39f7a4 | 3378 | { |
76a26f55 | 3379 | struct bdev *orig = bdev_init(conf, path, NULL, NULL); |
8c39f7a4 SH |
3380 | bool ret = false; |
3381 | if (!orig) | |
3382 | return ret; | |
3383 | if (strcmp(orig->type, "dir") == 0) | |
3384 | ret = true; | |
3385 | bdev_put(orig); | |
3386 | return ret; | |
3387 | } | |
3388 | ||
cdd01be2 SH |
3389 | bool bdev_can_backup(struct lxc_conf *conf) |
3390 | { | |
3391 | struct bdev *bdev = bdev_init(conf, NULL, NULL, NULL); | |
3392 | bool ret; | |
3393 | ||
3394 | if (!bdev) | |
3395 | return false; | |
3396 | ret = bdev->ops->can_backup; | |
3397 | bdev_put(bdev); | |
3398 | return ret; | |
3399 | } | |
3400 | ||
a7ef8753 SH |
3401 | /* |
3402 | * is an unprivileged user allowed to make this kind of snapshot | |
3403 | */ | |
3404 | static bool unpriv_snap_allowed(struct bdev *b, const char *t, bool snap, | |
3405 | bool maybesnap) | |
3406 | { | |
3407 | if (!t) { | |
3408 | // new type will be same as original | |
3409 | // (unless snap && b->type == dir, in which case it will be | |
3410 | // overlayfs -- which is also allowed) | |
3411 | if (strcmp(b->type, "dir") == 0 || | |
31a882ef | 3412 | strcmp(b->type, "aufs") == 0 || |
a7ef8753 | 3413 | strcmp(b->type, "overlayfs") == 0 || |
2659c7cb | 3414 | strcmp(b->type, "btrfs") == 0 || |
a7ef8753 SH |
3415 | strcmp(b->type, "loop") == 0) |
3416 | return true; | |
3417 | return false; | |
3418 | } | |
3419 | ||
3420 | // unprivileged users can copy and snapshot dir, overlayfs, | |
3421 | // and loop. In particular, not zfs, btrfs, or lvm. | |
31a882ef KY |
3422 | if (strcmp(t, "dir") == 0 || |
3423 | strcmp(t, "aufs") == 0 || | |
3424 | strcmp(t, "overlayfs") == 0 || | |
3425 | strcmp(t, "btrfs") == 0 || | |
3426 | strcmp(t, "loop") == 0) | |
a7ef8753 SH |
3427 | return true; |
3428 | return false; | |
3429 | } | |
3430 | ||
9be53773 SH |
3431 | /* |
3432 | * If we're not snaphotting, then bdev_copy becomes a simple case of mount | |
3433 | * the original, mount the new, and rsync the contents. | |
3434 | */ | |
1354955b SH |
3435 | struct bdev *bdev_copy(struct lxc_container *c0, const char *cname, |
3436 | const char *lxcpath, const char *bdevtype, | |
d659597e | 3437 | int flags, const char *bdevdata, uint64_t newsize, |
dfb31b25 | 3438 | int *needs_rdep) |
9be53773 SH |
3439 | { |
3440 | struct bdev *orig, *new; | |
3441 | pid_t pid; | |
1354955b | 3442 | int ret; |
0a83cbbb SH |
3443 | bool snap = flags & LXC_CLONE_SNAPSHOT; |
3444 | bool maybe_snap = flags & LXC_CLONE_MAYBE_SNAPSHOT; | |
3445 | bool keepbdevtype = flags & LXC_CLONE_KEEPBDEVTYPE; | |
1354955b SH |
3446 | const char *src = c0->lxc_conf->rootfs.path; |
3447 | const char *oldname = c0->name; | |
3448 | const char *oldpath = c0->config_path; | |
3449 | struct rsync_data data; | |
9be53773 SH |
3450 | |
3451 | /* if the container name doesn't show up in the rootfs path, then | |
3452 | * we don't know how to come up with a new name | |
3453 | */ | |
3454 | if (strstr(src, oldname) == NULL) { | |
3455 | ERROR("original rootfs path %s doesn't include container name %s", | |
3456 | src, oldname); | |
3457 | return NULL; | |
3458 | } | |
3459 | ||
ac00e8f2 | 3460 | orig = bdev_init(c0->lxc_conf, src, NULL, NULL); |
9be53773 | 3461 | if (!orig) { |
959aee9c | 3462 | ERROR("failed to detect blockdev type for %s", src); |
9be53773 SH |
3463 | return NULL; |
3464 | } | |
3465 | ||
ac00e8f2 KY |
3466 | if (!orig->dest) { |
3467 | int ret; | |
730e3f9e SH |
3468 | size_t len; |
3469 | struct stat sb; | |
3470 | ||
3471 | len = strlen(oldpath) + strlen(oldname) + strlen("/rootfs") + 2; | |
3472 | orig->dest = malloc(len); | |
ac00e8f2 KY |
3473 | if (!orig->dest) { |
3474 | ERROR("out of memory"); | |
3475 | bdev_put(orig); | |
3476 | return NULL; | |
3477 | } | |
730e3f9e SH |
3478 | ret = snprintf(orig->dest, len, "%s/%s/rootfs", oldpath, oldname); |
3479 | if (ret < 0 || ret >= len) { | |
ac00e8f2 KY |
3480 | ERROR("rootfs path too long"); |
3481 | bdev_put(orig); | |
3482 | return NULL; | |
3483 | } | |
730e3f9e SH |
3484 | ret = stat(orig->dest, &sb); |
3485 | if (ret < 0 && errno == ENOENT) | |
3486 | if (mkdir_p(orig->dest, 0755) < 0) | |
3487 | WARN("Error creating '%s', continuing.", orig->dest); | |
ac00e8f2 KY |
3488 | } |
3489 | ||
0a83cbbb SH |
3490 | /* |
3491 | * special case for snapshot - if caller requested maybe_snapshot and | |
3492 | * keepbdevtype and backing store is directory, then proceed with a copy | |
3493 | * clone rather than returning error | |
3494 | */ | |
3495 | if (maybe_snap && keepbdevtype && !bdevtype && !orig->ops->can_snapshot) | |
3496 | snap = false; | |
3497 | ||
e3fdf5cc SH |
3498 | /* |
3499 | * If newtype is NULL and snapshot is set, then use overlayfs | |
3500 | */ | |
0a83cbbb | 3501 | if (!bdevtype && !keepbdevtype && snap && strcmp(orig->type , "dir") == 0) |
e3fdf5cc SH |
3502 | bdevtype = "overlayfs"; |
3503 | ||
a7ef8753 SH |
3504 | if (am_unpriv() && !unpriv_snap_allowed(orig, bdevtype, snap, maybe_snap)) { |
3505 | ERROR("Unsupported snapshot type for unprivileged users"); | |
3506 | bdev_put(orig); | |
3507 | return NULL; | |
3508 | } | |
3509 | ||
dfb31b25 | 3510 | *needs_rdep = 0; |
e34b5d2e | 3511 | if (bdevtype && strcmp(orig->type, "dir") == 0 && |
1f92162d | 3512 | (strcmp(bdevtype, "aufs") == 0 || |
d8c4c595 | 3513 | strcmp(bdevtype, "overlayfs") == 0)) { |
dfb31b25 | 3514 | *needs_rdep = 1; |
d8c4c595 KY |
3515 | } else if (snap && strcmp(orig->type, "lvm") == 0 && |
3516 | !lvm_is_thin_volume(orig->src)) { | |
3517 | *needs_rdep = 1; | |
3518 | } | |
dfb31b25 | 3519 | |
9be53773 SH |
3520 | new = bdev_get(bdevtype ? bdevtype : orig->type); |
3521 | if (!new) { | |
3522 | ERROR("no such block device type: %s", bdevtype ? bdevtype : orig->type); | |
3523 | bdev_put(orig); | |
3524 | return NULL; | |
3525 | } | |
3526 | ||
25190e5b SH |
3527 | if (new->ops->clone_paths(orig, new, oldname, cname, oldpath, lxcpath, |
3528 | snap, newsize, c0->lxc_conf) < 0) { | |
959aee9c | 3529 | ERROR("failed getting pathnames for cloned storage: %s", src); |
65db0e5a | 3530 | goto err; |
9be53773 | 3531 | } |
a7ef8753 SH |
3532 | |
3533 | if (am_unpriv() && chown_mapped_root(new->src, c0->lxc_conf) < 0) | |
3534 | WARN("Failed to update ownership of %s", new->dest); | |
3535 | ||
1354955b SH |
3536 | if (snap) |
3537 | return new; | |
9be53773 | 3538 | |
65db0e5a ÇO |
3539 | /* |
3540 | * https://github.com/lxc/lxc/issues/131 | |
3541 | * Use btrfs snapshot feature instead of rsync to restore if both orig and new are btrfs | |
3542 | */ | |
3543 | if (bdevtype && | |
3544 | strcmp(orig->type, "btrfs") == 0 && strcmp(new->type, "btrfs") == 0 && | |
3545 | btrfs_same_fs(orig->dest, new->dest) == 0) { | |
3546 | if (btrfs_destroy(new) < 0) { | |
3547 | ERROR("Error destroying %s subvolume", new->dest); | |
3548 | goto err; | |
3549 | } | |
3550 | if (mkdir_p(new->dest, 0755) < 0) { | |
3551 | ERROR("Error creating %s directory", new->dest); | |
3552 | goto err; | |
3553 | } | |
3554 | if (btrfs_snapshot(orig->dest, new->dest) < 0) { | |
3555 | ERROR("Error restoring %s to %s", orig->dest, new->dest); | |
3556 | goto err; | |
3557 | } | |
3558 | bdev_put(orig); | |
3559 | return new; | |
3560 | } | |
3561 | ||
9be53773 SH |
3562 | pid = fork(); |
3563 | if (pid < 0) { | |
3564 | SYSERROR("fork"); | |
65db0e5a | 3565 | goto err; |
9be53773 SH |
3566 | } |
3567 | ||
3568 | if (pid > 0) { | |
3569 | int ret = wait_for_pid(pid); | |
3570 | bdev_put(orig); | |
3571 | if (ret < 0) { | |
3572 | bdev_put(new); | |
3573 | return NULL; | |
3574 | } | |
3575 | return new; | |
3576 | } | |
3577 | ||
1354955b SH |
3578 | data.orig = orig; |
3579 | data.new = new; | |
3580 | if (am_unpriv()) | |
3581 | ret = userns_exec_1(c0->lxc_conf, rsync_rootfs_wrapper, &data); | |
3582 | else | |
3583 | ret = rsync_rootfs(&data); | |
9be53773 | 3584 | |
1354955b | 3585 | exit(ret == 0 ? 0 : 1); |
65db0e5a ÇO |
3586 | |
3587 | err: | |
3588 | bdev_put(orig); | |
3589 | bdev_put(new); | |
3590 | return NULL; | |
9be53773 | 3591 | } |
1897e3bc | 3592 | |
d44e88c2 SH |
3593 | static struct bdev * do_bdev_create(const char *dest, const char *type, |
3594 | const char *cname, struct bdev_specs *specs) | |
3595 | { | |
3596 | struct bdev *bdev = bdev_get(type); | |
3597 | if (!bdev) { | |
3598 | return NULL; | |
3599 | } | |
3600 | ||
3601 | if (bdev->ops->create(bdev, dest, cname, specs) < 0) { | |
3602 | bdev_put(bdev); | |
3603 | return NULL; | |
3604 | } | |
3605 | ||
3606 | return bdev; | |
3607 | } | |
3608 | ||
/*
 * bdev_create:
 * Create a backing store for a container.
 * If successful, return a struct bdev *, with the bdev mounted and ready
 * for use.  Before completing, the caller will need to call the
 * umount operation and bdev_put().
 * @dest: the mountpoint (i.e. /var/lib/lxc/$name/rootfs)
 * @type: the bdevtype (dir, btrfs, zfs, etc).  NULL means "dir", "best"
 *        tries a built-in preference list, and a comma separated list is
 *        tried entry by entry.
 * @cname: the container name
 * @specs: details about the backing store to create, like fstype
 */
struct bdev *bdev_create(const char *dest, const char *type,
			const char *cname, struct bdev_specs *specs)
{
	static const char *const preferred[] = {
		"btrfs", "zfs", "lvm", "dir", NULL,
	};
	struct bdev *bdev;

	if (!type)
		return do_bdev_create(dest, "dir", cname, specs);

	if (strcmp(type, "best") == 0) {
		const char *const *candidate;

		// try for the best backing store type, according to our
		// opinionated preferences
		for (candidate = preferred; *candidate; candidate++) {
			bdev = do_bdev_create(dest, *candidate, cname, specs);
			if (bdev)
				return bdev;
		}
		return NULL; // 'dir' should never fail, so this shouldn't happen
	}

	// -B lvm,dir
	if (strchr(type, ',') != NULL) {
		char *state = NULL, *tok;
		char *list = alloca(strlen(type) + 1);

		strcpy(list, type);
		tok = strtok_r(list, ",", &state);
		while (tok) {
			bdev = do_bdev_create(dest, tok, cname, specs);
			if (bdev)
				return bdev;
			tok = strtok_r(NULL, ",", &state);
		}
	}

	return do_bdev_create(dest, type, cname, specs);
}
3653 | ||
/*
 * Cut the string @p at its first ':' (in place) so that only the part
 * before it remains.  @p is left untouched when it has no ':'.
 * Returns @p.
 */
char *overlay_getlower(char *p)
{
	char *colon = strchr(p, ':');

	if (colon != NULL)
		*colon = '\0';

	return p;
}
35120d9c SH |
3661 | |
3662 | bool rootfs_is_blockdev(struct lxc_conf *conf) | |
3663 | { | |
3664 | const struct bdev_type *q; | |
3665 | struct stat st; | |
3666 | int ret; | |
3667 | ||
acf9f89e SH |
3668 | if (!conf->rootfs.path || strcmp(conf->rootfs.path, "/") == 0 || |
3669 | strlen(conf->rootfs.path) == 0) | |
3670 | return false; | |
3671 | ||
35120d9c SH |
3672 | ret = stat(conf->rootfs.path, &st); |
3673 | if (ret == 0 && S_ISBLK(st.st_mode)) | |
3674 | return true; | |
3675 | q = bdev_query(conf->rootfs.path); | |
3676 | if (!q) | |
3677 | return false; | |
3678 | if (strcmp(q->name, "lvm") == 0 || | |
3679 | strcmp(q->name, "loop") == 0 || | |
3680 | strcmp(q->name, "nbd") == 0) | |
3681 | return true; | |
3682 | return false; | |
3683 | } | |
339c6f1f CB |
3684 | |
3685 | bool bdev_destroy(struct lxc_conf *conf) | |
3686 | { | |
3687 | struct bdev *r; | |
3688 | bool ret = false; | |
3689 | ||
3690 | r = bdev_init(conf, conf->rootfs.path, conf->rootfs.mount, NULL); | |
3691 | if (!r) | |
3692 | return ret; | |
3693 | ||
3694 | if (r->ops->destroy(r) == 0) | |
3695 | ret = true; | |
3696 | bdev_put(r); | |
3697 | ||
3698 | return ret; | |
3699 | } | |
3700 | ||
/*
 * Callback-style wrapper around bdev_destroy(); @data is a struct
 * lxc_conf *.  Switches to gid 0, clears supplementary groups (failure is
 * only a warning) and switches to uid 0 before destroying the store.
 * Returns 0 on success, -1 on failure.
 */
int bdev_destroy_wrapper(void *data)
{
	struct lxc_conf *conf = data;

	if (setgid(0) < 0) {
		ERROR("Failed to setgid to 0");
		return -1;
	}

	if (setgroups(0, NULL) < 0)
		WARN("Failed to clear groups");

	if (setuid(0) < 0) {
		ERROR("Failed to setuid to 0");
		return -1;
	}

	return bdev_destroy(conf) ? 0 : -1;
}
3720 |