1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/int_types.h"
16 #include "include/types.h"
22 #include <sys/types.h>
24 #include <sys/ioctl.h>
26 #if defined(__linux__)
30 #include "include/compat.h"
31 #include "include/linux_fiemap.h"
37 #include "GenericFileStoreBackend.h"
39 #include "common/errno.h"
40 #include "common/config.h"
41 #include "common/sync_filesystem.h"
42 #include "common/blkdev.h"
44 #include "common/SloppyCRCMap.h"
45 #include "os/filestore/chain_xattr.h"
// Extended attribute name used by the _crc_* helpers in this file to load and
// store the serialized sloppy CRC map of an object file.
47 #define SLOPPY_CRC_XATTR "user.cephos.scrc"
// Logging setup for dout/derr in this file: context, subsystem, and a prefix
// that embeds the backend base directory path.
50 #define dout_context cct()
51 #define dout_subsys ceph_subsys_filestore
53 #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
// Alignment helpers. Each macro evaluates its arguments more than once, so
// only pass side-effect-free expressions.
//   ALIGN_DOWN(x, by): x minus its remainder modulo by.
//   ALIGNED(x, by):    true when x is an exact multiple of by.
//   ALIGN_UP(x, by):   x when already aligned, otherwise ALIGN_DOWN plus by.
55 #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
56 #define ALIGNED(x, by) (!((x) % (by)))
57 #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
// Constructor: snapshots filestore configuration and probes the backing device.
//
// Visible behaviour:
//  - seek_data_hole starts out false.
//  - the filestore_fiemap, filestore_seek_data_hole,
//    filestore_fsync_flushes_journal_data and filestore_splice options are
//    copied from cct()->_conf into the matching m_filestore_* members.
//  - the base directory is opened read-only and get_device_by_fd() is asked
//    for the partition and device names; a failure is logged at level 1.
//  - m_rotational is set from block_device_is_rotational(devname) and the
//    result is logged at level 20.
//
// NOTE(review): this listing omits some original lines (other initializers,
// the checks on fd and r, the braces, and any close of fd), so the exact
// error handling cannot be confirmed from here.
59 GenericFileStoreBackend::GenericFileStoreBackend(FileStore
*fs
):
62 seek_data_hole(false),
64 m_filestore_fiemap(cct()->_conf
->filestore_fiemap
),
65 m_filestore_seek_data_hole(cct()->_conf
->filestore_seek_data_hole
),
66 m_filestore_fsync_flushes_journal_data(cct()->_conf
->filestore_fsync_flushes_journal_data
),
67 m_filestore_splice(cct()->_conf
->filestore_splice
)
71 // NOTE: the below won't work on btrfs; we'll assume rotational.
72 string fn
= get_basedir_path();
73 int fd
= ::open(fn
.c_str(), O_RDONLY
);
77 char partition
[PATH_MAX
], devname
[PATH_MAX
];
78 int r
= get_device_by_fd(fd
, partition
, devname
, sizeof(devname
));
80 dout(1) << "unable to get device name for " << get_basedir_path() << ": "
81 << cpp_strerror(r
) << dendl
;
84 m_rotational
= block_device_is_rotational(devname
);
85 dout(20) << __func__
<< " devname " << devname
86 << " rotational " << (int)m_rotational
<< dendl
;
// Probe the file system under the base directory for optional features and
// record the results in member flags.
//
// Visible steps:
//  1. Create (or truncate) a scratch file named fiemap_test in the base
//     directory and write test data into it; lseek or write failures are
//     logged and the descriptor is closed.
//  2. FIEMAP: when the config option is off, ioctl_fiemap is cleared.
//     Otherwise do_fiemap() is run on a range inside the written data, and
//     ioctl_fiemap is cleared when the ioctl is unsupported or returns zero
//     mapped extents (the buggy ext4 case described below).
//  3. SEEK_DATA/SEEK_HOLE: when the config option is off, seek_data_hole is
//     cleared. Otherwise, on Linux builds that define both constants,
//     lseek(fd, 0, SEEK_HOLE) is tried; an EINVAL errno means unsupported.
//  4. splice(): only when CEPH_HAVE_SPLICE is defined; a pipe is created and
//     a 10 byte splice from the scratch file is attempted. Any outcome other
//     than failure with EINVAL is treated as supported.
//  5. syncfs: tries the glibc wrapper or the raw syscall number, depending on
//     which macro is available at build time, and logs which sync strategy
//     will be used.
//
// NOTE(review): many original lines are missing from this listing (variable
// declarations, several conditions, braces, returns, and whatever cleanup of
// the scratch file exists), so return values and the flags set on the
// success paths cannot all be confirmed from here.
92 int GenericFileStoreBackend::detect_features()
95 snprintf(fn
, sizeof(fn
), "%s/fiemap_test", get_basedir_path().c_str());
97 int fd
= ::open(fn
, O_CREAT
|O_RDWR
|O_TRUNC
, 0644);
100 derr
<< "detect_features: unable to create " << fn
<< ": " << cpp_strerror(fd
) << dendl
;
104 // ext4 has a bug in older kernels where fiemap will return an empty
105 // result in some cases. this is a file layout that triggers the bug
108 0x0000000000016000, 0x0000000000007000,
109 0x000000000004a000, 0x0000000000007000,
110 0x0000000000060000, 0x0000000000001000,
111 0x0000000000061000, 0x0000000000008000,
112 0x0000000000069000, 0x0000000000007000,
113 0x00000000000a3000, 0x000000000000c000,
114 0x000000000024e000, 0x000000000000c000,
115 0x000000000028b000, 0x0000000000009000,
116 0x00000000002b1000, 0x0000000000003000,
119 for (int i
=0; v
[i
]; i
++) {
123 // write a large extent
125 memset(buf
, 1, sizeof(buf
));
126 int r
= ::lseek(fd
, off
, SEEK_SET
);
129 derr
<< "detect_features: failed to lseek " << fn
<< ": " << cpp_strerror(r
) << dendl
;
130 VOID_TEMP_FAILURE_RETRY(::close(fd
));
133 r
= write(fd
, buf
, sizeof(buf
));
135 derr
<< "detect_features: failed to write to " << fn
<< ": " << cpp_strerror(r
) << dendl
;
136 VOID_TEMP_FAILURE_RETRY(::close(fd
));
141 // fiemap an extent inside that
142 if (!m_filestore_fiemap
) {
143 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl
;
144 ioctl_fiemap
= false;
146 struct fiemap
*fiemap
;
147 int r
= do_fiemap(fd
, 2430421, 59284, &fiemap
);
149 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl
;
150 ioctl_fiemap
= false;
152 if (fiemap
->fm_mapped_extents
== 0) {
153 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl
;
154 ioctl_fiemap
= false;
156 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl
;
163 // SEEK_DATA/SEEK_HOLE detection
164 if (!m_filestore_seek_data_hole
) {
165 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl
;
166 seek_data_hole
= false;
168 #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
169 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
170 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
171 // Fall back to use fiemap.
174 hole_pos
= lseek(fd
, 0, SEEK_HOLE
);
176 if (errno
== EINVAL
) {
177 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl
;
178 seek_data_hole
= false;
180 derr
<< "detect_features: failed to lseek " << fn
<< ": " << cpp_strerror(-errno
) << dendl
;
181 VOID_TEMP_FAILURE_RETRY(::close(fd
));
185 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl
;
186 seek_data_hole
= true;
// splice() detection, compiled only when CEPH_HAVE_SPLICE is defined.
192 #ifdef CEPH_HAVE_SPLICE
193 if (!m_filestore_splice
) {
194 dout(0) << __func__
<< ": splice() is disabled via 'filestore splice' config option" << dendl
;
200 if ((r
= pipe(pipefd
)) < 0)
201 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno
) << dendl
;
203 lseek(fd
, 0, SEEK_SET
);
204 r
= splice(fd
, &off_in
, pipefd
[1], NULL
, 10, 0);
205 if (!(r
< 0 && errno
== EINVAL
)) {
207 dout(0) << "detect_features: splice is supported" << dendl
;
209 dout(0) << "detect_features: splice is NOT supported" << dendl
;
216 VOID_TEMP_FAILURE_RETRY(::close(fd
));
// syncfs detection: which call is attempted depends on the macros available
// at build time (HAVE_SYS_SYNCFS, SYS_syncfs, __NR_syncfs).
219 bool have_syncfs
= false;
220 #ifdef HAVE_SYS_SYNCFS
221 if (::syncfs(get_basedir_fd()) == 0) {
222 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl
;
225 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl
;
227 #elif defined(SYS_syncfs)
228 if (syscall(SYS_syncfs
, get_basedir_fd()) == 0) {
229 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl
;
232 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl
;
234 #elif defined(__NR_syncfs)
235 if (syscall(__NR_syncfs
, get_basedir_fd()) == 0) {
236 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl
;
239 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl
;
243 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl
;
244 if (m_filestore_fsync_flushes_journal_data
) {
245 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl
;
247 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl
;
248 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl
;
// Ensure the directory returned by get_current_path() exists.
//
// Visible behaviour: stat() the path; when something exists there that is
// not a directory, log it; otherwise mkdir() the path with mode 0755 and log
// when that fails.
//
// NOTE(review): the conditions on ret, the braces and the return statements
// are missing from this listing, so the returned error codes cannot be
// confirmed from here.
255 int GenericFileStoreBackend::create_current()
258 int ret
= ::stat(get_current_path().c_str(), &st
);
261 if (!S_ISDIR(st
.st_mode
)) {
262 dout(0) << "_create_current: current/ exists but is not a directory" << dendl
;
266 ret
= ::mkdir(get_current_path().c_str(), 0755);
269 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret
) << dendl
;
// Flush file system state to disk using one of two strategies.
//
// When m_filestore_fsync_flushes_journal_data is set, a single fsync() on
// get_op_fd() is issued; otherwise sync_filesystem() is called on
// get_current_fd().
//
// NOTE(review): the declaration of ret, any errno translation and the return
// statement are missing from this listing.
275 int GenericFileStoreBackend::syncfs()
278 if (m_filestore_fsync_flushes_journal_data
) {
279 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl
;
280 // make the file system's journal commit.
281 // this works with ext3, but NOT ext4
282 ret
= ::fsync(get_op_fd());
286 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl
;
287 ret
= sync_filesystem(get_current_fd());
// Query the extent map of fd for the byte range starting at start with
// length len, using the FS_IOC_FIEMAP ioctl.
//
// Visible flow:
//  - allocate a zeroed struct fiemap with calloc();
//  - round the start down to a CEPH_PAGE_SIZE boundary and grow the length
//    by the same amount (see the xfs note below);
//  - request FIEMAP_FLAG_SYNC;
//  - issue the ioctl once with no extent slots to learn fm_mapped_extents;
//  - realloc() the buffer with room for that many fiemap_extent entries,
//    zero them, set fm_extent_count, reset fm_mapped_extents, and issue the
//    ioctl a second time to fill the extents.
//
// NOTE(review): the error paths, the DARWIN/FreeBSD branches, the handoff of
// the result through pfiemap and the return values are missing from this
// listing. Presumably the caller owns and frees the returned buffer; confirm
// against the full source.
292 int GenericFileStoreBackend::do_fiemap(int fd
, off_t start
, size_t len
, struct fiemap
**pfiemap
)
294 struct fiemap
*fiemap
= NULL
;
295 struct fiemap
*_realloc_fiemap
= NULL
;
299 fiemap
= (struct fiemap
*)calloc(sizeof(struct fiemap
), 1);
303 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
304 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
305 * Commit:"xfs: fix rounding error of fiemap length parameter
306 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
307 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
309 fiemap
->fm_start
= start
- start
% CEPH_PAGE_SIZE
;
310 fiemap
->fm_length
= len
+ start
% CEPH_PAGE_SIZE
;
311 fiemap
->fm_flags
= FIEMAP_FLAG_SYNC
; /* flush extents to disk if needed */
313 #if defined(DARWIN) || defined(__FreeBSD__)
317 if (ioctl(fd
, FS_IOC_FIEMAP
, fiemap
) < 0) {
322 size
= sizeof(struct fiemap_extent
) * (fiemap
->fm_mapped_extents
);
324 _realloc_fiemap
= (struct fiemap
*)realloc(fiemap
, sizeof(struct fiemap
) + size
);
325 if (!_realloc_fiemap
) {
329 fiemap
= _realloc_fiemap
;
332 memset(fiemap
->fm_extents
, 0, size
);
334 fiemap
->fm_extent_count
= fiemap
->fm_mapped_extents
;
335 fiemap
->fm_mapped_extents
= 0;
337 #if defined(DARWIN) || defined(__FreeBSD__)
341 if (ioctl(fd
, FS_IOC_FIEMAP
, fiemap
) < 0) {
// Load the sloppy CRC map stored in the SLOPPY_CRC_XATTR attribute of fd.
//
// Visible flow:
//  - read the attribute into a local buffer with chain_fgetxattr();
//  - on a normal read, copy the bytes into a freshly created buffer ptr;
//  - on -ERANGE (value larger than the local buffer), query the size with a
//    null buffer and read again into a buffer ptr of that size;
//  - wrap the bytes in a bufferlist and take an iterator from its start;
//  - buffer::error exceptions are caught, and errors are logged via derr.
//
// NOTE(review): the decode call on cm, the handling of a missing attribute
// (the init half of the name) and the return values are missing from this
// listing.
356 int GenericFileStoreBackend::_crc_load_or_init(int fd
, SloppyCRCMap
*cm
)
361 int l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, buf
, sizeof(buf
));
366 bp
= buffer::create(l
);
367 memcpy(bp
.c_str(), buf
, l
);
368 } else if (l
== -ERANGE
) {
369 l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, 0, 0);
371 bp
= buffer::create(l
);
372 l
= chain_fgetxattr(fd
, SLOPPY_CRC_XATTR
, bp
.c_str(), l
);
376 bl
.append(std::move(bp
));
377 bufferlist::iterator p
= bl
.begin();
381 catch (buffer::error
&e
) {
385 derr
<< __func__
<< " got " << cpp_strerror(r
) << dendl
;
// Store a serialized CRC map in the SLOPPY_CRC_XATTR attribute of fd using
// chain_fsetxattr(), logging via derr when an error is reported.
//
// NOTE(review): the declaration and encoding of bl from cm, the error check
// and the return statement are missing from this listing.
389 int GenericFileStoreBackend::_crc_save(int fd
, SloppyCRCMap
*cm
)
393 int r
= chain_fsetxattr(fd
, SLOPPY_CRC_XATTR
, bl
.c_str(), bl
.length());
395 derr
<< __func__
<< " got " << cpp_strerror(r
) << dendl
;
// Update the stored CRC map of fd to reflect a write of bl covering len
// bytes at offset off.
//
// Loads the map via _crc_load_or_init(), applies SloppyCRCMap::write() with
// a string stream that is logged at level 30, then persists the map with
// _crc_save().
//
// NOTE(review): the error checks after each step and the return statement
// are missing from this listing.
399 int GenericFileStoreBackend::_crc_update_write(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
)
401 SloppyCRCMap
scm(get_crc_block_size());
402 int r
= _crc_load_or_init(fd
, &scm
);
406 scm
.write(off
, len
, bl
, &ss
);
407 dout(30) << __func__
<< "\n" << ss
.str() << dendl
;
408 r
= _crc_save(fd
, &scm
);
// Update the stored CRC map of fd for a truncate at offset off: load the map
// with _crc_load_or_init(), then save it back with _crc_save().
//
// NOTE(review): the statement that applies the truncate to the map, the
// error checks and the return are missing from this listing.
412 int GenericFileStoreBackend::_crc_update_truncate(int fd
, loff_t off
)
414 SloppyCRCMap
scm(get_crc_block_size());
415 int r
= _crc_load_or_init(fd
, &scm
);
419 r
= _crc_save(fd
, &scm
);
// Update the stored CRC map of fd for zeroing len bytes at offset off: load
// the map with _crc_load_or_init(), then save it back with _crc_save().
//
// NOTE(review): the statement that applies the zero range to the map, the
// error checks and the return are missing from this listing.
423 int GenericFileStoreBackend::_crc_update_zero(int fd
, loff_t off
, size_t len
)
425 SloppyCRCMap
scm(get_crc_block_size());
426 int r
= _crc_load_or_init(fd
, &scm
);
430 r
= _crc_save(fd
, &scm
);
// Update the CRC map of destfd after len bytes were cloned from srcfd at
// srcoff to destfd at dstoff.
//
// Loads the maps of both files, applies SloppyCRCMap::clone_range() on the
// destination map using the source map (its string stream output is logged
// at level 30), and persists only the destination map with _crc_save().
//
// NOTE(review): the error checks after each load and the return statement
// are missing from this listing.
434 int GenericFileStoreBackend::_crc_update_clone_range(int srcfd
, int destfd
,
435 loff_t srcoff
, size_t len
, loff_t dstoff
)
437 SloppyCRCMap
scm_src(get_crc_block_size());
438 SloppyCRCMap
scm_dst(get_crc_block_size());
439 int r
= _crc_load_or_init(srcfd
, &scm_src
);
442 r
= _crc_load_or_init(destfd
, &scm_dst
);
446 scm_dst
.clone_range(srcoff
, len
, dstoff
, scm_src
, &ss
);
447 dout(30) << __func__
<< "\n" << ss
.str() << dendl
;
448 r
= _crc_save(destfd
, &scm_dst
);
// Check data read from fd (len bytes at offset off, held in bl) against the
// stored CRC map: load the map with _crc_load_or_init() and return the
// result of SloppyCRCMap::read().
//
// NOTE(review): the final parameter (the out argument passed to read), the
// error check after the load and the closing brace are missing from this
// listing.
452 int GenericFileStoreBackend::_crc_verify_read(int fd
, loff_t off
, size_t len
, const bufferlist
& bl
,
455 SloppyCRCMap
scm(get_crc_block_size());
456 int r
= _crc_load_or_init(fd
, &scm
);
459 return scm
.read(off
, len
, bl
, out
);