1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/int_types.h"
16 #include "include/types.h"
22 #include <sys/types.h>
24 #include <sys/ioctl.h>
26 #if defined(__linux__)
30 #include "include/compat.h"
31 #include "include/linux_fiemap.h"
37 #include "GenericFileStoreBackend.h"
39 #include "common/errno.h"
40 #include "common/config.h"
41 #include "common/sync_filesystem.h"
42 #include "common/blkdev.h"
44 #include "common/SloppyCRCMap.h"
45 #include "os/filestore/chain_xattr.h"
// Name of the extended attribute that the _crc_* helpers in this file pass to
// chain_fgetxattr()/chain_fsetxattr() to load and store the sloppy CRC map.
47 #define SLOPPY_CRC_XATTR "user.cephos.scrc"
// Logging configuration macros (presumably consumed by dout()/derr -- confirm
// against the debug headers). The prefix tags every message with the backend
// name and the basedir path.
50 #define dout_context cct()
51 #define dout_subsys ceph_subsys_filestore
53 #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
// Alignment helpers:
//   ALIGN_DOWN(x, by) - round x down to a multiple of `by`
//   ALIGNED(x, by)    - true when x is already a multiple of `by`
//   ALIGN_UP(x, by)   - round x up to a multiple of `by` (x itself if aligned)
// These are plain macros: each argument is expanded more than once, so do not
// pass expressions with side effects.
55 #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
56 #define ALIGNED(x, by) (!((x) % (by)))
57 #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
// Constructor.
//
// Snapshots the relevant filestore_* configuration values (fiemap,
// seek_data_hole, fsync_flushes_journal_data, splice) into members, then
// probes whether the base directory and the journal live on rotational
// devices, storing the answers in m_rotational / m_journal_rotational.
//
// NOTE(review): in this excerpt the declaration of `blkdev`, the handling of
// a failed ::open(), and the matching ::close(fd) are not visible -- confirm
// against the full file before changing the probing logic.
59 GenericFileStoreBackend::GenericFileStoreBackend(FileStore
*fs
):
// Feature flag starts out disabled (presumably enabled later by
// detect_features(), which assigns seek_data_hole -- confirm).
62 seek_data_hole(false),
// Cached copies of the configuration options read from cct()->_conf.
64 m_filestore_fiemap(cct()->_conf
->filestore_fiemap
),
65 m_filestore_seek_data_hole(cct()->_conf
->filestore_seek_data_hole
),
66 m_filestore_fsync_flushes_journal_data(cct()->_conf
->filestore_fsync_flushes_journal_data
),
67 m_filestore_splice(cct()->_conf
->filestore_splice
)
// Data device: open the base directory read-only and ask blkdev whether the
// underlying device is rotational.
71 // NOTE: the below won't work on btrfs; we'll assume rotational.
72 string fn
= get_basedir_path();
73 int fd
= ::open(fn
.c_str(), O_RDONLY
|O_CLOEXEC
);
78 m_rotational
= blkdev
.is_rotational();
79 dout(20) << __func__
<< " basedir " << fn
80 << " rotational " << (int)m_rotational
<< dendl
;
// Journal device: same probe, but against the journal path.
83 // journal rotational?
85 // NOTE: the below won't work on btrfs; we'll assume rotational.
86 string fn
= get_journal_path();
87 int fd
= ::open(fn
.c_str(), O_RDONLY
|O_CLOEXEC
);
92 m_journal_rotational
= blkdev
.is_rotational();
93 dout(20) << __func__
<< " journal filename " << fn
.c_str()
94 << " journal rotational " << (int)m_journal_rotational
<< dendl
;
// Probe the underlying filesystem/kernel for optional features and record
// the results in the backend's feature flags.
//
// Visible steps, in order:
//   1. create a scratch file "<basedir>/fiemap_test";
//   2. lay out a set of extents in it and write one large extent;
//   3. test the FIEMAP ioctl via do_fiemap() (sets ioctl_fiemap);
//   4. test lseek(SEEK_HOLE) (sets seek_data_hole);
//   5. test splice() through a pipe, when CEPH_HAVE_SPLICE is defined;
//   6. test which flavour of syncfs is usable and log the consequence.
// Each probe can be turned off through its 'filestore ...' config option.
//
// NOTE(review): return statements, several local declarations and most
// braces/else branches are not visible in this excerpt; the descriptions
// below cover only the statements that are.
99 int GenericFileStoreBackend::detect_features()
// Build the scratch file path inside the base directory.
102 snprintf(fn
, sizeof(fn
), "%s/fiemap_test", get_basedir_path().c_str());
// Create (or truncate) the scratch file.
104 int fd
= ::open(fn
, O_CREAT
|O_RDWR
|O_TRUNC
|O_CLOEXEC
, 0644);
107 derr
<< "detect_features: unable to create " << fn
<< ": " << cpp_strerror(fd
) << dendl
;
111 // ext4 has a bug in older kernels where fiemap will return an empty
112 // result in some cases. this is a file layout that triggers the bug
// The values below read as (offset, length) pairs -- presumably the
// initializer of `v`, whose declaration is not visible here.
115 0x0000000000016000, 0x0000000000007000,
116 0x000000000004a000, 0x0000000000007000,
117 0x0000000000060000, 0x0000000000001000,
118 0x0000000000061000, 0x0000000000008000,
119 0x0000000000069000, 0x0000000000007000,
120 0x00000000000a3000, 0x000000000000c000,
121 0x000000000024e000, 0x000000000000c000,
122 0x000000000028b000, 0x0000000000009000,
123 0x00000000002b1000, 0x0000000000003000,
// Walk the table until a zero entry terminates it.
126 for (int i
=0; v
[i
]; i
++) {
130 // write a large extent
// Fill the buffer with a non-zero byte so the written range is real data.
132 memset(buf
, 1, sizeof(buf
));
133 int r
= ::lseek(fd
, off
, SEEK_SET
);
136 derr
<< "detect_features: failed to lseek " << fn
<< ": " << cpp_strerror(r
) << dendl
;
// Error path: the scratch fd is closed before bailing out.
137 VOID_TEMP_FAILURE_RETRY(::close(fd
));
140 r
= write(fd
, buf
, sizeof(buf
));
142 derr
<< "detect_features: failed to write to " << fn
<< ": " << cpp_strerror(r
) << dendl
;
143 VOID_TEMP_FAILURE_RETRY(::close(fd
));
148 // fiemap an extent inside that
// FIEMAP probe: honour the config switch first, otherwise query a range of
// the scratch file and inspect the number of mapped extents.
149 if (!m_filestore_fiemap
) {
150 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl
;
151 ioctl_fiemap
= false;
153 struct fiemap
*fiemap
;
154 int r
= do_fiemap(fd
, 2430421, 59284, &fiemap
);
156 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl
;
157 ioctl_fiemap
= false;
// Zero extents for a range that was just written means the kernel's FIEMAP
// implementation is unreliable (see the ext4 note above).
159 if (fiemap
->fm_mapped_extents
== 0) {
160 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl
;
161 ioctl_fiemap
= false;
163 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl
;
170 // SEEK_DATA/SEEK_HOLE detection
171 if (!m_filestore_seek_data_hole
) {
172 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl
;
173 seek_data_hole
= false;
175 #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
176 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
177 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
178 // Fall back to use fiemap.
181 hole_pos
= lseek(fd
, 0, SEEK_HOLE
);
// EINVAL means "not supported at run time"; any other errno is reported as
// a real failure and the scratch fd is closed.
183 if (errno
== EINVAL
) {
184 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl
;
185 seek_data_hole
= false;
187 derr
<< "detect_features: failed to lseek " << fn
<< ": " << cpp_strerror(-errno
) << dendl
;
188 VOID_TEMP_FAILURE_RETRY(::close(fd
));
192 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl
;
193 seek_data_hole
= true;
// splice() probe, only compiled in when the build detected splice support.
199 #ifdef CEPH_HAVE_SPLICE
200 if (!m_filestore_splice
) {
201 dout(0) << __func__
<< ": splice() is disabled via 'filestore splice' config option" << dendl
;
207 if (pipe_cloexec(pipefd
, 0) < 0) {
209 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e
) << dendl
;
// Rewind the scratch file and try to splice a few bytes into the pipe.
211 lseek(fd
, 0, SEEK_SET
);
212 r
= splice(fd
, &off_in
, pipefd
[1], NULL
, 10, 0);
// Only a failure with EINVAL is treated as "splice unsupported"; success or
// any other error counts as supported.
213 if (!(r
< 0 && errno
== EINVAL
)) {
215 dout(0) << "detect_features: splice is supported" << dendl
;
217 dout(0) << "detect_features: splice is NOT supported" << dendl
;
// Done with the scratch file descriptor.
224 VOID_TEMP_FAILURE_RETRY(::close(fd
));
// syncfs probe: try the glibc wrapper, then the raw syscall by SYS_ number,
// then by __NR_ number, depending on what the build environment defines.
227 bool have_syncfs
= false;
228 #ifdef HAVE_SYS_SYNCFS
229 if (::syncfs(get_basedir_fd()) == 0) {
230 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl
;
233 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl
;
235 #elif defined(SYS_syncfs)
236 if (syscall(SYS_syncfs
, get_basedir_fd()) == 0) {
237 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl
;
240 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl
;
242 #elif defined(__NR_syncfs)
243 if (syscall(__NR_syncfs
, get_basedir_fd()) == 0) {
244 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl
;
247 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl
;
251 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl
;
// Without syncfs the backend either relies on fsync (if the config says the
// filesystem's fsync flushes journal data) or falls back to sync(2).
252 if (m_filestore_fsync_flushes_journal_data
) {
253 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl
;
255 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl
;
256 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl
;
// Ensure the "current" directory (get_current_path()) exists.
//
// Visible behaviour: stat() the path; if something exists there that is not
// a directory, log it; otherwise mkdir() it with mode 0755 and log a failure.
// NOTE(review): the return values and the branch conditions around the
// stat()/mkdir() results are not visible in this excerpt.
263 int GenericFileStoreBackend::create_current()
266 int ret
= ::stat(get_current_path().c_str(), &st
);
// An existing non-directory at this path is reported as an error.
269 if (!S_ISDIR(st
.st_mode
)) {
270 dout(0) << "_create_current: current/ exists but is not a directory" << dendl
;
274 ret
= ::mkdir(get_current_path().c_str(), 0755);
277 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret
) << dendl
;
// Flush filesystem state to stable storage.
//
// When 'filestore fsync flushes journal data' is set, a single fsync() on
// the op fd is used; otherwise the whole filesystem holding the current
// directory is synced through sync_filesystem().
// NOTE(review): the declaration of `ret` and the return statement are not
// visible in this excerpt.
283 int GenericFileStoreBackend::syncfs()
286 if (m_filestore_fsync_flushes_journal_data
) {
287 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl
;
288 // make the file system's journal commit.
289 // this works with ext3, but NOT ext4
290 ret
= ::fsync(get_op_fd());
294 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl
;
295 ret
= sync_filesystem(get_current_fd());
// Query the extent map of [start, start+len) of `fd` with the FIEMAP ioctl.
//
// Parameters:
//   fd      - open file descriptor to query
//   start   - byte offset of the range (rounded down to CEPH_PAGE_SIZE below)
//   len     - length of the range in bytes
//   pfiemap - out parameter for the resulting struct fiemap (presumably
//             heap-allocated and owned by the caller on success -- the
//             assignment and the free() paths are not visible here; confirm)
//
// The ioctl is issued twice: first with no extent slots, to learn how many
// extents are mapped, then again after growing the buffer to hold them.
// NOTE(review): error returns and the bodies of the __APPLE__/__FreeBSD__
// branches are not visible in this excerpt.
300 int GenericFileStoreBackend::do_fiemap(int fd
, off_t start
, size_t len
, struct fiemap
**pfiemap
)
302 struct fiemap
*fiemap
= NULL
;
303 struct fiemap
*_realloc_fiemap
= NULL
;
// Zero-initialised header only; extent slots are added by realloc() later.
307 fiemap
= (struct fiemap
*)calloc(sizeof(struct fiemap
), 1);
311 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
312 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
313 * Commit:"xfs: fix rounding error of fiemap length parameter
314 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
315 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
// Move the start down to a page boundary and grow the length by the same
// amount so the end of the requested range is unchanged.
317 fiemap
->fm_start
= start
- start
% CEPH_PAGE_SIZE
;
318 fiemap
->fm_length
= len
+ start
% CEPH_PAGE_SIZE
;
319 fiemap
->fm_flags
= FIEMAP_FLAG_SYNC
; /* flush extents to disk if needed */
321 #if defined(__APPLE__) || defined(__FreeBSD__)
// First pass: no extent slots yet, so this only reports fm_mapped_extents.
325 if (ioctl(fd
, FS_IOC_FIEMAP
, fiemap
) < 0) {
// Bytes needed for the extent array reported by the first pass.
330 size
= sizeof(struct fiemap_extent
) * (fiemap
->fm_mapped_extents
);
// realloc() result goes into a temporary so the original pointer is not
// lost if the allocation fails.
332 _realloc_fiemap
= (struct fiemap
*)realloc(fiemap
, sizeof(struct fiemap
) + size
);
333 if (!_realloc_fiemap
) {
337 fiemap
= _realloc_fiemap
;
// Clear the newly added extent slots before handing them to the kernel.
340 memset(fiemap
->fm_extents
, 0, size
);
// Tell the kernel how many slots are available and reset the result count.
342 fiemap
->fm_extent_count
= fiemap
->fm_mapped_extents
;
343 fiemap
->fm_mapped_extents
= 0;
345 #if defined(__APPLE__) || defined(__FreeBSD__)
// Second pass: fills fm_extents with the actual extent records.
349 if (ioctl(fd
, FS_IOC_FIEMAP
, fiemap
) < 0) {
// Load the sloppy CRC map stored in the SLOPPY_CRC_XATTR extended attribute
// of `fd` into *cm.
//
// The attribute is first read into a fixed local buffer; if that reports
// -ERANGE the required size is queried and the read is retried into a
// buffer of that size. The bytes are then handed to a bufferlist iterator
// (the decode call itself is not visible in this excerpt) and a
// buffer::error is caught.
// NOTE(review): the "init" case (attribute absent), the declaration of `r`
// and the return statements are not visible here -- confirm in the full file.
364 int GenericFileStoreBackend::_crc_load_or_init(int fd
, SloppyCRCMap
*cm
)
// Optimistic read into the stack buffer.
369 int l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, buf
, sizeof(buf
));
// Value fitted: copy the l bytes into an owned buffer.
374 bp
= buffer::create(l
);
375 memcpy(bp
.c_str(), buf
, l
);
376 } else if (l
== -ERANGE
) {
// Value too large for the stack buffer: ask for its size (null buffer,
// zero length), then read it into an exactly sized allocation.
377 l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, 0, 0);
379 bp
= buffer::create(l
);
380 l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, bp
.c_str(), l
);
384 bl
.append(std::move(bp
));
385 auto p
= bl
.cbegin();
389 catch (buffer::error
&e
) {
393 derr
<< __func__
<< " got " << cpp_strerror(r
) << dendl
;
// Store the sloppy CRC map into the SLOPPY_CRC_XATTR extended attribute of
// `fd` and log the error code if chain_fsetxattr() reports one.
// NOTE(review): the step that serialises *cm into `bl` and the return
// statement are not visible in this excerpt.
397 int GenericFileStoreBackend::_crc_save(int fd
, SloppyCRCMap
*cm
)
401 int r
= chain_fsetxattr(fd
, SLOPPY_CRC_XATTR
, bl
.c_str(), bl
.length());
403 derr
<< __func__
<< " got " << cpp_strerror(r
) << dendl
;
// Update the stored CRC map of `fd` to reflect a write of `len` bytes of
// `bl` at offset `off`: load the map, apply the write, log the map's
// diagnostic output at debug level 30, then save the map back.
// NOTE(review): the error checks after the load and the final return are not
// visible in this excerpt.
407 int GenericFileStoreBackend::_crc_update_write(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
)
409 SloppyCRCMap
scm(get_crc_block_size());
410 int r
= _crc_load_or_init(fd
, &scm
);
// `ss` collects whatever diagnostic text the map emits for this write.
414 scm
.write(off
, len
, bl
, &ss
);
415 dout(30) << __func__
<< "\n" << ss
.str() << dendl
;
416 r
= _crc_save(fd
, &scm
);
// Update the stored CRC map of `fd` for a truncate to offset `off`: load the
// map, then save it back.
// NOTE(review): the call that applies the truncate to the map (between the
// load and the save) and the error checks/return are not visible in this
// excerpt.
420 int GenericFileStoreBackend::_crc_update_truncate(int fd
, loff_t off
)
422 SloppyCRCMap
scm(get_crc_block_size());
423 int r
= _crc_load_or_init(fd
, &scm
);
427 r
= _crc_save(fd
, &scm
);
// Update the stored CRC map of `fd` for zeroing `len` bytes at offset `off`:
// load the map, then save it back.
// NOTE(review): the call that applies the zero range to the map (between the
// load and the save) and the error checks/return are not visible in this
// excerpt.
431 int GenericFileStoreBackend::_crc_update_zero(int fd
, loff_t off
, size_t len
)
433 SloppyCRCMap
scm(get_crc_block_size());
434 int r
= _crc_load_or_init(fd
, &scm
);
438 r
= _crc_save(fd
, &scm
);
// Update the CRC map of `destfd` after cloning `len` bytes from offset
// `srcoff` of `srcfd` to offset `dstoff` of `destfd`.
//
// Both maps are loaded, the range is cloned from the source map into the
// destination map, the map's diagnostic output is logged at debug level 30,
// and only the destination map is saved back.
// NOTE(review): the error checks after each load and the final return are
// not visible in this excerpt.
442 int GenericFileStoreBackend::_crc_update_clone_range(int srcfd
, int destfd
,
443 loff_t srcoff
, size_t len
, loff_t dstoff
)
445 SloppyCRCMap
scm_src(get_crc_block_size());
446 SloppyCRCMap
scm_dst(get_crc_block_size());
447 int r
= _crc_load_or_init(srcfd
, &scm_src
);
450 r
= _crc_load_or_init(destfd
, &scm_dst
);
454 scm_dst
.clone_range(srcoff
, len
, dstoff
, scm_src
, &ss
);
455 dout(30) << __func__
<< "\n" << ss
.str() << dendl
;
// The source map is read-only here; only the destination is persisted.
456 r
= _crc_save(destfd
, &scm_dst
);
// Check data that was read from `fd` (`len` bytes of `bl` at offset `off`)
// against the stored CRC map: load the map and return the result of
// SloppyCRCMap::read().
// NOTE(review): the declaration of the trailing `out` parameter and the
// error check after the load are not visible in this excerpt.
460 int GenericFileStoreBackend::_crc_verify_read(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
,
463 SloppyCRCMap
scm(get_crc_block_size());
464 int r
= _crc_load_or_init(fd
, &scm
);
467 return scm
.read(off
, len
, bl
, out
);