1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/int_types.h"
16 #include "include/types.h"
22 #include <sys/types.h>
24 #include <sys/ioctl.h>
26 #if defined(__linux__)
30 #include "include/compat.h"
31 #include "include/linux_fiemap.h"
37 #include "GenericFileStoreBackend.h"
39 #include "common/errno.h"
40 #include "common/config.h"
41 #include "common/sync_filesystem.h"
42 #include "common/blkdev.h"
44 #include "common/SloppyCRCMap.h"
45 #include "os/filestore/chain_xattr.h"
// Name of the extended attribute that the _crc_* helpers in this file pass to
// chain_fgetxattr()/chain_fsetxattr().
47 #define SLOPPY_CRC_XATTR "user.cephos.scrc"
// Settings for the dout/derr logging macros used throughout this file
// (presumably consumed by the common debug header -- confirm).
50 #define dout_context cct()
51 #define dout_subsys ceph_subsys_filestore
// Every log line is prefixed with the backend name and the base directory path.
53 #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
// Alignment helpers built on the modulo operator:
//   ALIGN_DOWN(x, by) - x rounded down to a multiple of `by`.
//   ALIGNED(x, by)    - true when x is already a multiple of `by`.
//   ALIGN_UP(x, by)   - x itself if aligned, otherwise the next multiple of `by`.
// Note that the arguments are evaluated more than once.
55 #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
56 #define ALIGNED(x, by) (!((x) % (by)))
57 #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
60 using std::ostringstream
;
63 using ceph::bufferptr
;
64 using ceph::bufferlist
;
// Constructor.
// Starts with seek_data_hole off and caches four configuration options
// (filestore_fiemap, filestore_seek_data_hole,
// filestore_fsync_flushes_journal_data, filestore_splice) in members.
// The body then opens the base directory and the journal path read-only and
// records whether each one sits on a rotational device.
// NOTE(review): several original lines are not visible in this listing
// (e.g. how `blkdev` is constructed, the checks on `fd`, and where the
// descriptors are closed) -- consult the full file before relying on details.
66 GenericFileStoreBackend::GenericFileStoreBackend(FileStore
*fs
):
// Feature flag defaults to off; detect_features() may turn it on later.
69 seek_data_hole(false),
71 m_filestore_fiemap(cct()->_conf
->filestore_fiemap
),
72 m_filestore_seek_data_hole(cct()->_conf
->filestore_seek_data_hole
),
73 m_filestore_fsync_flushes_journal_data(cct()->_conf
->filestore_fsync_flushes_journal_data
),
74 m_filestore_splice(cct()->_conf
->filestore_splice
)
// Data store: query the device backing the base directory.
78 // NOTE: the below won't work on btrfs; we'll assume rotational.
79 string fn
= get_basedir_path();
80 int fd
= ::open(fn
.c_str(), O_RDONLY
|O_CLOEXEC
);
// Result is stored in m_rotational and logged at debug level 20.
85 m_rotational
= blkdev
.is_rotational();
86 dout(20) << __func__
<< " basedir " << fn
87 << " rotational " << (int)m_rotational
<< dendl
;
// Journal: same probe, this time against the journal path.
90 // journal rotational?
92 // NOTE: the below won't work on btrfs; we'll assume rotational.
93 string fn
= get_journal_path();
94 int fd
= ::open(fn
.c_str(), O_RDONLY
|O_CLOEXEC
);
// Result is stored in m_journal_rotational and logged at debug level 20.
99 m_journal_rotational
= blkdev
.is_rotational();
100 dout(20) << __func__
<< " journal filename " << fn
.c_str()
101 << " journal rotational " << (int)m_journal_rotational
<< dendl
;
// Probe the file system under the base directory for optional features and
// record the results in member flags (ioctl_fiemap, seek_data_hole, ...).
// Visible steps, in order:
//   1. create a scratch file "<basedir>/fiemap_test";
//   2. lay out a specific sparse pattern, then write one large extent;
//   3. test the FIEMAP ioctl via do_fiemap();
//   4. test lseek(SEEK_HOLE);
//   5. test splice() (only when CEPH_HAVE_SPLICE is defined);
//   6. test syncfs(2) through whichever entry point is available at build time.
// Each outcome is logged at level 0.
// NOTE(review): return statements, local declarations and several branches
// are not visible in this listing; the return contract cannot be stated from
// here.
106 int GenericFileStoreBackend::detect_features()
// Build the scratch file path and create/truncate it (mode 0644).
109 snprintf(fn
, sizeof(fn
), "%s/fiemap_test", get_basedir_path().c_str());
111 int fd
= ::open(fn
, O_CREAT
|O_RDWR
|O_TRUNC
|O_CLOEXEC
, 0644);
114 derr
<< "detect_features: unable to create " << fn
<< ": " << cpp_strerror(fd
) << dendl
;
// The table below is walked until a zero entry is reached; the values look
// like (offset, length) pairs -- TODO confirm against the loop body, which is
// not visible here.
118 // ext4 has a bug in older kernels where fiemap will return an empty
119 // result in some cases. this is a file layout that triggers the bug
122 0x0000000000016000, 0x0000000000007000,
123 0x000000000004a000, 0x0000000000007000,
124 0x0000000000060000, 0x0000000000001000,
125 0x0000000000061000, 0x0000000000008000,
126 0x0000000000069000, 0x0000000000007000,
127 0x00000000000a3000, 0x000000000000c000,
128 0x000000000024e000, 0x000000000000c000,
129 0x000000000028b000, 0x0000000000009000,
130 0x00000000002b1000, 0x0000000000003000,
133 for (int i
=0; v
[i
]; i
++) {
// Fill a buffer with 0x01 bytes, seek to `off` and write the whole buffer.
// On lseek or write failure the error is logged and the scratch fd is closed.
137 // write a large extent
139 memset(buf
, 1, sizeof(buf
));
140 int r
= ::lseek(fd
, off
, SEEK_SET
);
143 derr
<< "detect_features: failed to lseek " << fn
<< ": " << cpp_strerror(r
) << dendl
;
144 VOID_TEMP_FAILURE_RETRY(::close(fd
));
147 r
= write(fd
, buf
, sizeof(buf
));
149 derr
<< "detect_features: failed to write to " << fn
<< ": " << cpp_strerror(r
) << dendl
;
150 VOID_TEMP_FAILURE_RETRY(::close(fd
));
// FIEMAP: skipped entirely when disabled by configuration. Otherwise a range
// inside the written extent is mapped; the ioctl counts as unsupported when
// do_fiemap() fails, as buggy when it reports zero mapped extents, and as
// working otherwise. ioctl_fiemap is cleared in the first three cases.
155 // fiemap an extent inside that
156 if (!m_filestore_fiemap
) {
157 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl
;
158 ioctl_fiemap
= false;
160 struct fiemap
*fiemap
;
161 int r
= do_fiemap(fd
, 2430421, 59284, &fiemap
);
163 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl
;
164 ioctl_fiemap
= false;
166 if (fiemap
->fm_mapped_extents
== 0) {
167 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl
;
168 ioctl_fiemap
= false;
170 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl
;
// SEEK_DATA/SEEK_HOLE: skipped when disabled by configuration. The probe is
// only compiled on Linux builds that define both constants. EINVAL from
// lseek(SEEK_HOLE) means "not supported"; any other failure is logged as an
// error and the scratch fd is closed.
177 // SEEK_DATA/SEEK_HOLE detection
178 if (!m_filestore_seek_data_hole
) {
179 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl
;
180 seek_data_hole
= false;
182 #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
183 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
184 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
185 // Fall back to use fiemap.
188 hole_pos
= lseek(fd
, 0, SEEK_HOLE
);
190 if (errno
== EINVAL
) {
191 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl
;
192 seek_data_hole
= false;
194 derr
<< "detect_features: failed to lseek " << fn
<< ": " << cpp_strerror(-errno
) << dendl
;
195 VOID_TEMP_FAILURE_RETRY(::close(fd
));
199 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl
;
200 seek_data_hole
= true;
// splice(): skipped when disabled by configuration. A close-on-exec pipe is
// created, the scratch file is rewound, and 10 bytes are spliced from the
// file into the pipe's write end. Any result other than failure with EINVAL
// is treated as "supported".
206 #ifdef CEPH_HAVE_SPLICE
207 if (!m_filestore_splice
) {
208 dout(0) << __func__
<< ": splice() is disabled via 'filestore splice' config option" << dendl
;
214 if (pipe_cloexec(pipefd
, 0) < 0) {
216 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e
) << dendl
;
218 lseek(fd
, 0, SEEK_SET
);
219 r
= splice(fd
, &off_in
, pipefd
[1], NULL
, 10, 0);
220 if (!(r
< 0 && errno
== EINVAL
)) {
222 dout(0) << "detect_features: splice is supported" << dendl
;
224 dout(0) << "detect_features: splice is NOT supported" << dendl
;
// Done with the scratch file descriptor.
231 VOID_TEMP_FAILURE_RETRY(::close(fd
));
// syncfs(2): exactly one of three compile-time variants is tried against the
// base directory fd -- the glibc wrapper, syscall(SYS_syncfs) or
// syscall(__NR_syncfs). A zero return means the kernel supports it.
234 bool have_syncfs
= false;
235 #ifdef HAVE_SYS_SYNCFS
236 if (::syncfs(get_basedir_fd()) == 0) {
237 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl
;
240 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl
;
242 #elif defined(SYS_syncfs)
243 if (syscall(SYS_syncfs
, get_basedir_fd()) == 0) {
244 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl
;
247 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl
;
249 #elif defined(__NR_syncfs)
250 if (syscall(__NR_syncfs
, get_basedir_fd()) == 0) {
251 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl
;
254 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl
;
258 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl
;
// Without syncfs the logged fallback depends on the
// 'filestore fsync flushes journal data' option: fsync if it is set,
// otherwise a full sync(2), which the log warns is slow with several OSDs
// on one host.
259 if (m_filestore_fsync_flushes_journal_data
) {
260 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl
;
262 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl
;
263 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl
;
// Make sure the "current" path (get_current_path()) exists as a directory.
// The path is stat()ed first; if it exists but is not a directory that is
// logged, otherwise the directory is created with mode 0755 and a mkdir
// failure is logged.
// NOTE(review): the branch conditions and the return values are not visible
// in this listing -- confirm the error codes against the full file.
270 int GenericFileStoreBackend::create_current()
273 int ret
= ::stat(get_current_path().c_str(), &st
);
// Something already lives at the path: it must be a directory.
276 if (!S_ISDIR(st
.st_mode
)) {
277 dout(0) << "_create_current: current/ exists but is not a directory" << dendl
;
// Create the directory.
281 ret
= ::mkdir(get_current_path().c_str(), 0755);
284 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret
) << dendl
;
// Flush the store to stable storage using one of two strategies:
//   - when 'filestore fsync flushes journal data' is set, fsync() the op fd;
//   - otherwise call sync_filesystem() on the "current" directory fd.
// NOTE(review): the declaration of `ret`, any errno translation and the
// return statement are not visible in this listing.
290 int GenericFileStoreBackend::syncfs()
293 if (m_filestore_fsync_flushes_journal_data
) {
294 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl
;
295 // make the file system's journal commit.
296 // this works with ext3, but NOT ext4
297 ret
= ::fsync(get_op_fd());
// Fallback: sync the whole file system.
301 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl
;
302 ret
= sync_filesystem(get_current_fd());
// Query the extent map of `fd` for the byte range [start, start + len) using
// the FS_IOC_FIEMAP ioctl.
//
//   fd       open file descriptor to query
//   start    first byte of interest; rounded down to CEPH_PAGE_SIZE below
//   len      number of bytes of interest
//   pfiemap  out-parameter for the resulting `struct fiemap`
//            (presumably heap-allocated and owned by the caller on success
//            -- the assignment and the free() paths are not visible here,
//            TODO confirm)
//
// The ioctl is issued twice: once on a zero-initialised header to learn
// fm_mapped_extents, then again after the buffer has been grown to hold that
// many extents.
// NOTE(review): error handling, return values and the __APPLE__/__FreeBSD__
// branches are not visible in this listing.
307 int GenericFileStoreBackend::do_fiemap(int fd
, off_t start
, size_t len
, struct fiemap
**pfiemap
)
309 struct fiemap
*fiemap
= NULL
;
310 struct fiemap
*_realloc_fiemap
= NULL
;
// Header only, all fields zero (so fm_extent_count starts at 0).
314 fiemap
= (struct fiemap
*)calloc(sizeof(struct fiemap
), 1);
318 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
319 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
320 * Commit:"xfs: fix rounding error of fiemap length parameter
321 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
322 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
// Round the start down to a page boundary and extend the length by the same
// amount so the requested range is still fully covered.
324 fiemap
->fm_start
= start
- start
% CEPH_PAGE_SIZE
;
325 fiemap
->fm_length
= len
+ start
% CEPH_PAGE_SIZE
;
326 fiemap
->fm_flags
= FIEMAP_FLAG_SYNC
; /* flush extents to disk if needed */
// First ioctl: the result is used below only for fm_mapped_extents.
328 #if defined(__APPLE__) || defined(__FreeBSD__)
332 if (ioctl(fd
, FS_IOC_FIEMAP
, fiemap
) < 0) {
// Space needed for the extent array reported by the first call.
337 size
= sizeof(struct fiemap_extent
) * (fiemap
->fm_mapped_extents
);
// Grow via a temporary so the original pointer is not lost if realloc fails.
339 _realloc_fiemap
= (struct fiemap
*)realloc(fiemap
, sizeof(struct fiemap
) + size
);
340 if (!_realloc_fiemap
) {
344 fiemap
= _realloc_fiemap
;
// realloc does not zero the newly added tail.
347 memset(fiemap
->fm_extents
, 0, size
);
// Tell the kernel how many extent slots are available and reset the count
// it will fill in.
349 fiemap
->fm_extent_count
= fiemap
->fm_mapped_extents
;
350 fiemap
->fm_mapped_extents
= 0;
// Second ioctl: fetch the extent records themselves.
352 #if defined(__APPLE__) || defined(__FreeBSD__)
356 if (ioctl(fd
, FS_IOC_FIEMAP
, fiemap
) < 0) {
// Load the SloppyCRCMap stored in the SLOPPY_CRC_XATTR extended attribute of
// `fd` into `*cm`.
// The attribute is first read into a local buffer; if that buffer is too
// small (-ERANGE) the attribute size is queried and the read is retried with
// an exactly sized buffer. The bytes are then wrapped in a bufferlist and
// read through an iterator (presumably decoded into `*cm` -- the decode call
// is not visible here, TODO confirm).
// NOTE(review): the "init" path suggested by the name (attribute missing),
// the remaining error branches and the return values are not visible in this
// listing.
371 int GenericFileStoreBackend::_crc_load_or_init(int fd
, SloppyCRCMap
*cm
)
// First attempt: read into the local buffer.
376 int l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, buf
, sizeof(buf
));
// It fit: copy exactly `l` bytes into a freshly created buffer.
381 bp
= ceph::buffer::create(l
);
382 memcpy(bp
.c_str(), buf
, l
);
// Too large for the local buffer: query the size (null buffer, length 0),
// allocate that much and read again.
383 } else if (l
== -ERANGE
) {
384 l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, 0, 0);
386 bp
= ceph::buffer::create(l
);
387 l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, bp
.c_str(), l
);
// Hand the bytes to a bufferlist and start reading from the beginning.
391 bl
.append(std::move(bp
));
392 auto p
= bl
.cbegin();
// A malformed buffer raises ceph::buffer::error, which is caught here.
396 catch (ceph::buffer::error
&e
) {
400 derr
<< __func__
<< " got " << cpp_strerror(r
) << dendl
;
// Store a buffer in the SLOPPY_CRC_XATTR extended attribute of `fd`
// (presumably the encoded form of `*cm` -- the encode step is not visible in
// this listing, TODO confirm). A failure is logged through derr.
// NOTE(review): the return statement is not visible here.
404 int GenericFileStoreBackend::_crc_save(int fd
, SloppyCRCMap
*cm
)
408 int r
= chain_fsetxattr(fd
, SLOPPY_CRC_XATTR
, bl
.c_str(), bl
.length());
410 derr
<< __func__
<< " got " << cpp_strerror(r
) << dendl
;
// Update the CRC map stored on `fd` for a write of `len` bytes of `bl` at
// offset `off`: load the map, apply SloppyCRCMap::write(), log whatever it
// wrote to the string stream at debug level 30, then save the map back.
// NOTE(review): the checks on `r` after the load and the return statement
// are not visible in this listing.
414 int GenericFileStoreBackend::_crc_update_write(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
)
// Map sized with this backend's CRC block size.
416 SloppyCRCMap
scm(get_crc_block_size());
417 int r
= _crc_load_or_init(fd
, &scm
);
421 scm
.write(off
, len
, bl
, &ss
);
422 dout(30) << __func__
<< "\n" << ss
.str() << dendl
;
// Persist the updated map.
423 r
= _crc_save(fd
, &scm
);
// Update the CRC map stored on `fd` for a truncate at offset `off`: load the
// map, then save it back.
// NOTE(review): the statement that applies the truncate to `scm`, the checks
// on `r` and the return statement are not visible in this listing.
427 int GenericFileStoreBackend::_crc_update_truncate(int fd
, loff_t off
)
// Map sized with this backend's CRC block size.
429 SloppyCRCMap
scm(get_crc_block_size());
430 int r
= _crc_load_or_init(fd
, &scm
);
// Persist the updated map.
434 r
= _crc_save(fd
, &scm
);
// Update the CRC map stored on `fd` for zeroing `len` bytes at offset `off`:
// load the map, then save it back.
// NOTE(review): the statement that applies the zero range to `scm`, the
// checks on `r` and the return statement are not visible in this listing.
438 int GenericFileStoreBackend::_crc_update_zero(int fd
, loff_t off
, size_t len
)
// Map sized with this backend's CRC block size.
440 SloppyCRCMap
scm(get_crc_block_size());
441 int r
= _crc_load_or_init(fd
, &scm
);
// Persist the updated map.
445 r
= _crc_save(fd
, &scm
);
// Mirror a range clone in the CRC maps: load the maps of both `srcfd` and
// `destfd`, apply SloppyCRCMap::clone_range() to the destination map
// (source offset `srcoff`, length `len`, destination offset `dstoff`), log
// its output at debug level 30 and save only the destination map.
// NOTE(review): the checks on `r` after each load and the return statement
// are not visible in this listing.
449 int GenericFileStoreBackend::_crc_update_clone_range(int srcfd
, int destfd
,
450 loff_t srcoff
, size_t len
, loff_t dstoff
)
// Both maps are sized with this backend's CRC block size.
452 SloppyCRCMap
scm_src(get_crc_block_size());
453 SloppyCRCMap
scm_dst(get_crc_block_size());
454 int r
= _crc_load_or_init(srcfd
, &scm_src
);
457 r
= _crc_load_or_init(destfd
, &scm_dst
);
461 scm_dst
.clone_range(srcoff
, len
, dstoff
, scm_src
, &ss
);
462 dout(30) << __func__
<< "\n" << ss
.str() << dendl
;
// The source map is unchanged, so only the destination is written back.
463 r
= _crc_save(destfd
, &scm_dst
);
// Check data read from `fd` against its stored CRC map: load the map and
// return the result of SloppyCRCMap::read() for `len` bytes of `bl` at
// offset `off`. The map is not saved back.
// NOTE(review): the parameter list is truncated in this listing (the `out`
// argument passed to read() is declared on a line that is not visible), and
// so is the check on `r` after the load.
467 int GenericFileStoreBackend::_crc_verify_read(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
,
// Map sized with this backend's CRC block size.
470 SloppyCRCMap
scm(get_crc_block_size());
471 int r
= _crc_load_or_init(fd
, &scm
);
474 return scm
.read(off
, len
, bl
, out
);