// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
15 #include "include/int_types.h"
16 #include "include/types.h"
22 #include <sys/types.h>
24 #include <sys/ioctl.h>
25 #include "include/compat.h"
26 #include "include/linux_fiemap.h"
27 #include "include/color.h"
28 #include "include/buffer.h"
29 #include "include/ceph_assert.h"
32 #include "os/fs/btrfs_ioctl.h"
39 #include "BtrfsFileStoreBackend.h"
41 #include "common/errno.h"
42 #include "common/config.h"
44 #if defined(__linux__)
46 #define dout_context cct()
47 #define dout_subsys ceph_subsys_filestore
49 #define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
55 #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
56 #define ALIGNED(x, by) (!((x) % (by)))
57 #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
59 BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore
*fs
):
60 GenericFileStoreBackend(fs
), has_clone_range(false),
61 has_snap_create(false), has_snap_destroy(false),
62 has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
63 m_filestore_btrfs_clone_range(cct()->_conf
->filestore_btrfs_clone_range
),
64 m_filestore_btrfs_snap (cct()->_conf
->filestore_btrfs_snap
) { }
66 int BtrfsFileStoreBackend::detect_features()
70 r
= GenericFileStoreBackend::detect_features();
75 if (m_filestore_btrfs_clone_range
) {
76 int fd
= ::openat(get_basedir_fd(), "clone_range_test", O_CREAT
|O_WRONLY
|O_CLOEXEC
, 0600);
78 if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
80 dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
81 << cpp_strerror(r
) << dendl
;
83 btrfs_ioctl_clone_range_args clone_args
;
84 memset(&clone_args
, 0, sizeof(clone_args
));
85 clone_args
.src_fd
= -1;
86 r
= ::ioctl(fd
, BTRFS_IOC_CLONE_RANGE
, &clone_args
);
87 if (r
< 0 && errno
== EBADF
) {
88 dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl
;
89 has_clone_range
= true;
92 dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r
) << dendl
;
94 TEMP_FAILURE_RETRY(::close(fd
));
97 dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
98 << cpp_strerror(r
) << dendl
;
101 dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl
;
104 struct btrfs_ioctl_vol_args vol_args
;
105 memset(&vol_args
, 0, sizeof(vol_args
));
107 // create test source volume
109 strcpy(vol_args
.name
, "test_subvol");
110 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE
, &vol_args
);
113 dout(0) << "detect_feature: failed to create simple subvolume " << vol_args
.name
<< ": " << cpp_strerror(r
) << dendl
;
115 int srcfd
= ::openat(get_basedir_fd(), vol_args
.name
, O_RDONLY
|O_CLOEXEC
);
118 dout(0) << "detect_feature: failed to open " << vol_args
.name
<< ": " << cpp_strerror(r
) << dendl
;
121 // snap_create and snap_destroy?
123 strcpy(vol_args
.name
, "sync_snap_test");
124 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE
, &vol_args
);
126 if (r
== 0 || errno
== EEXIST
) {
127 dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl
;
128 has_snap_create
= true;
130 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY
, &vol_args
);
132 dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl
;
133 has_snap_destroy
= true;
136 dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err
) << dendl
;
138 if (err
== -EPERM
&& getuid() != 0) {
139 dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl
;
141 << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
142 << TEXT_NORMAL
<< std::endl
;
143 } else if (err
== -EOPNOTSUPP
) {
144 derr
<< "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl
;
148 dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err
) << dendl
;
151 if (m_filestore_btrfs_snap
) {
152 if (has_snap_destroy
)
153 stable_commits
= true;
155 dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl
;
160 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC
, &transid
);
163 dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err
) << dendl
;
165 if (r
== 0 && transid
> 0) {
166 dout(0) << "detect_feature: START_SYNC is supported (transid " << transid
<< ")" << dendl
;
168 // do we have wait_sync too?
169 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC
, &transid
);
170 if (r
== 0 || errno
== ERANGE
) {
171 dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl
;
172 has_wait_sync
= true;
175 dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err
) << dendl
;
179 dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err
) << dendl
;
183 // async snap creation?
184 struct btrfs_ioctl_vol_args_v2 async_args
;
185 memset(&async_args
, 0, sizeof(async_args
));
186 async_args
.fd
= srcfd
;
187 async_args
.flags
= BTRFS_SUBVOL_CREATE_ASYNC
;
188 strcpy(async_args
.name
, "async_snap_test");
190 // remove old one, first
192 strcpy(vol_args
.name
, async_args
.name
);
193 if (::fstatat(get_basedir_fd(), vol_args
.name
, &st
, 0) == 0) {
194 dout(0) << "detect_feature: removing old async_snap_test" << dendl
;
195 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY
, &vol_args
);
198 dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err
) << dendl
;
202 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2
, &async_args
);
203 if (r
== 0 || errno
== EEXIST
) {
204 dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl
;
205 has_snap_create_v2
= true;
208 strcpy(vol_args
.name
, "async_snap_test");
209 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY
, &vol_args
);
212 dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err
) << dendl
;
216 dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err
) << dendl
;
220 // clean up test subvol
222 TEMP_FAILURE_RETRY(::close(srcfd
));
224 strcpy(vol_args
.name
, "test_subvol");
225 r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY
, &vol_args
);
228 dout(0) << "detect_feature: failed to remove " << vol_args
.name
<< ": " << cpp_strerror(r
) << dendl
;
231 if (m_filestore_btrfs_snap
&& !has_snap_create_v2
) {
232 dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl
;
234 << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
235 << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
236 << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
244 bool BtrfsFileStoreBackend::can_checkpoint()
246 return stable_commits
;
249 int BtrfsFileStoreBackend::create_current()
252 int ret
= ::stat(get_current_path().c_str(), &st
);
255 if (!S_ISDIR(st
.st_mode
)) {
256 dout(0) << "create_current: current/ exists but is not a directory" << dendl
;
261 struct statfs currentfs
;
262 ret
= ::fstat(get_basedir_fd(), &basest
);
265 dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret
) << dendl
;
268 ret
= ::statfs(get_current_path().c_str(), ¤tfs
);
271 dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret
) << dendl
;
274 if (currentfs
.f_type
== BTRFS_SUPER_MAGIC
&& basest
.st_dev
!= st
.st_dev
) {
275 dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl
;
276 stable_commits
= true;
281 struct btrfs_ioctl_vol_args volargs
;
282 memset(&volargs
, 0, sizeof(volargs
));
285 strcpy(volargs
.name
, "current");
286 if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE
, (unsigned long int)&volargs
) < 0) {
288 dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
289 << cpp_strerror(ret
) << dendl
;
293 dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl
;
294 if (::chmod(get_current_path().c_str(), 0755) < 0) {
296 dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
297 << cpp_strerror(ret
) << dendl
;
301 stable_commits
= true;
305 int BtrfsFileStoreBackend::list_checkpoints(list
<string
>& ls
)
310 ret
= ::fstat(get_basedir_fd(), &basest
);
313 dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret
) << dendl
;
318 DIR *dir
= ::opendir(get_basedir_path().c_str());
321 dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
322 << cpp_strerror(ret
) << dendl
;
335 dout(0) << "list_checkpoints: readdir '" << get_basedir_path() << "' failed: "
336 << cpp_strerror(err
) << dendl
;
340 snprintf(path
, sizeof(path
), "%s/%s", get_basedir_path().c_str(), de
->d_name
);
343 ret
= ::stat(path
, &st
);
346 dout(0) << "list_checkpoints: stat '" << path
<< "' failed: "
347 << cpp_strerror(err
) << dendl
;
351 if (!S_ISDIR(st
.st_mode
))
355 ret
= ::statfs(path
, &fs
);
358 dout(0) << "list_checkpoints: statfs '" << path
<< "' failed: "
359 << cpp_strerror(err
) << dendl
;
363 if (fs
.f_type
== BTRFS_SUPER_MAGIC
&& basest
.st_dev
!= st
.st_dev
)
364 snaps
.push_back(string(de
->d_name
));
367 if (::closedir(dir
) < 0) {
369 dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret
) << dendl
;
381 int BtrfsFileStoreBackend::create_checkpoint(const string
& name
, uint64_t *transid
)
383 dout(10) << "create_checkpoint: '" << name
<< "'" << dendl
;
384 if (has_snap_create_v2
&& transid
) {
385 struct btrfs_ioctl_vol_args_v2 async_args
;
386 memset(&async_args
, 0, sizeof(async_args
));
387 async_args
.fd
= get_current_fd();
388 async_args
.flags
= BTRFS_SUBVOL_CREATE_ASYNC
;
390 size_t name_size
= sizeof(async_args
.name
);
391 strncpy(async_args
.name
, name
.c_str(), name_size
);
392 async_args
.name
[name_size
-1] = '\0';
394 int r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2
, &async_args
);
397 dout(0) << "create_checkpoint: async snap create '" << name
<< "' got " << cpp_strerror(r
) << dendl
;
400 dout(20) << "create_checkpoint: async snap create '" << name
<< "' transid " << async_args
.transid
<< dendl
;
401 *transid
= async_args
.transid
;
403 struct btrfs_ioctl_vol_args vol_args
;
404 memset(&vol_args
, 0, sizeof(vol_args
));
405 vol_args
.fd
= get_current_fd();
407 size_t name_size
= sizeof(vol_args
.name
);
408 strncpy(vol_args
.name
, name
.c_str(), name_size
);
409 vol_args
.name
[name_size
-1] = '\0';
411 int r
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE
, &vol_args
);
414 dout(0) << "create_checkpoint: snap create '" << name
<< "' got " << cpp_strerror(r
) << dendl
;
423 int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid
)
426 dout(10) << "sync_checkpoint: transid " << transid
<< " to complete" << dendl
;
427 int ret
= ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC
, &transid
);
430 dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret
) << dendl
;
433 dout(20) << "sync_checkpoint: done waiting for transid " << transid
<< dendl
;
437 int BtrfsFileStoreBackend::rollback_to(const string
& name
)
439 dout(10) << "rollback_to: to '" << name
<< "'" << dendl
;
441 btrfs_ioctl_vol_args vol_args
;
443 memset(&vol_args
, 0, sizeof(vol_args
));
445 strcpy(vol_args
.name
, "current");
447 int ret
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY
, &vol_args
);
448 if (ret
&& errno
!= ENOENT
) {
449 dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret
) << dendl
;
450 snprintf(s
, sizeof(s
), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
451 if (::rename(get_current_path().c_str(), s
)) {
453 dout(0) << "rollback_to: error renaming old current subvol: "
454 << cpp_strerror(ret
) << dendl
;
459 snprintf(s
, sizeof(s
), "%s/%s", get_basedir_path().c_str(), name
.c_str());
462 vol_args
.fd
= ::open(s
, O_RDONLY
|O_CLOEXEC
);
463 if (vol_args
.fd
< 0) {
465 dout(0) << "rollback_to: error opening '" << s
<< "': " << cpp_strerror(ret
) << dendl
;
468 ret
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE
, &vol_args
);
471 dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret
) << dendl
;
473 TEMP_FAILURE_RETRY(::close(vol_args
.fd
));
477 int BtrfsFileStoreBackend::destroy_checkpoint(const string
& name
)
479 dout(10) << "destroy_checkpoint: '" << name
<< "'" << dendl
;
480 btrfs_ioctl_vol_args vol_args
;
481 memset(&vol_args
, 0, sizeof(vol_args
));
483 strncpy(vol_args
.name
, name
.c_str(), sizeof(vol_args
.name
) - 1);
484 vol_args
.name
[sizeof(vol_args
.name
) - 1] = '\0';
486 int ret
= ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY
, &vol_args
);
489 dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret
) << dendl
;
495 int BtrfsFileStoreBackend::syncfs()
497 dout(15) << "syncfs" << dendl
;
498 // do a full btrfs commit
499 int ret
= ::ioctl(get_op_fd(), BTRFS_IOC_SYNC
);
502 dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret
) << dendl
;
507 int BtrfsFileStoreBackend::clone_range(int from
, int to
, uint64_t srcoff
, uint64_t len
, uint64_t dstoff
)
509 dout(20) << "clone_range: " << srcoff
<< "~" << len
<< " to " << dstoff
<< dendl
;
510 size_t blk_size
= get_blksize();
511 if (!has_clone_range
||
512 srcoff
% blk_size
!= dstoff
% blk_size
) {
513 dout(20) << "clone_range: using copy" << dendl
;
514 return _copy_range(from
, to
, srcoff
, len
, dstoff
);
520 uint64_t srcoffclone
= ALIGN_UP(srcoff
, blk_size
);
521 uint64_t dstoffclone
= ALIGN_UP(dstoff
, blk_size
);
522 if (srcoffclone
>= srcoff
+ len
) {
523 dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl
;
524 return _copy_range(from
, to
, srcoff
, len
, dstoff
);
527 uint64_t lenclone
= len
- (srcoffclone
- srcoff
);
528 if (!ALIGNED(lenclone
, blk_size
)) {
529 struct stat from_stat
, to_stat
;
530 err
= ::fstat(from
, &from_stat
);
531 if (err
) return -errno
;
532 err
= ::fstat(to
, &to_stat
);
533 if (err
) return -errno
;
535 if (srcoff
+ len
!= (uint64_t)from_stat
.st_size
||
536 dstoff
+ len
< (uint64_t)to_stat
.st_size
) {
537 // Not to the end of the file, need to align length as well
538 lenclone
= ALIGN_DOWN(lenclone
, blk_size
);
543 return _copy_range(from
, to
, srcoff
, len
, dstoff
);
546 dout(20) << "clone_range: cloning " << srcoffclone
<< "~" << lenclone
547 << " to " << dstoffclone
<< " = " << r
<< dendl
;
548 btrfs_ioctl_clone_range_args a
;
550 a
.src_offset
= srcoffclone
;
551 a
.src_length
= lenclone
;
552 a
.dest_offset
= dstoffclone
;
553 err
= ::ioctl(to
, BTRFS_IOC_CLONE_RANGE
, &a
);
556 } else if (errno
== EINVAL
) {
557 // Still failed, might be compressed
558 dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl
;
559 return _copy_range(from
, to
, srcoff
, len
, dstoff
);
564 // Take care any trimmed from front
565 if (srcoffclone
!= srcoff
) {
566 err
= _copy_range(from
, to
, srcoff
, srcoffclone
- srcoff
, dstoff
);
575 if (srcoffclone
+ lenclone
!= srcoff
+ len
) {
576 err
= _copy_range(from
, to
,
577 srcoffclone
+ lenclone
,
578 (srcoff
+ len
) - (srcoffclone
+ lenclone
),
579 dstoffclone
+ lenclone
);
586 dout(20) << "clone_range: finished " << srcoff
<< "~" << len
587 << " to " << dstoff
<< " = " << r
<< dendl
;