]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/filestore/GenericFileStoreBackend.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / os / filestore / GenericFileStoreBackend.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/int_types.h"
16#include "include/types.h"
17
18#include <unistd.h>
19#include <fcntl.h>
20#include <errno.h>
21#include <stdlib.h>
22#include <sys/types.h>
23#include <sys/stat.h>
24#include <sys/ioctl.h>
25
26#if defined(__linux__)
27#include <linux/fs.h>
28#endif
29
30#include "include/compat.h"
31#include "include/linux_fiemap.h"
32
33#include <iostream>
34#include <fstream>
35#include <sstream>
36
37#include "GenericFileStoreBackend.h"
38
39#include "common/errno.h"
40#include "common/config.h"
41#include "common/sync_filesystem.h"
42
43#include "common/SloppyCRCMap.h"
44#include "os/filestore/chain_xattr.h"
45
46#define SLOPPY_CRC_XATTR "user.cephos.scrc"
47
48
49#define dout_context cct()
50#define dout_subsys ceph_subsys_filestore
51#undef dout_prefix
52#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
53
// Integer alignment helpers. `by` must be non-zero; arguments may be
// evaluated more than once, so do not pass expressions with side effects.

// Round x down to the nearest multiple of by.
#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
// True when x is an exact multiple of by.
#define ALIGNED(x, by) (((x) % (by)) == 0)
// Round x up to the nearest multiple of by (x itself if already aligned).
#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
57
58GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
59 FileStoreBackend(fs),
60 ioctl_fiemap(false),
61 seek_data_hole(false),
62 use_splice(false),
63 m_filestore_fiemap(cct()->_conf->filestore_fiemap),
64 m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
65 m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
66 m_filestore_splice(cct()->_conf->filestore_splice) {}
67
68int GenericFileStoreBackend::detect_features()
69{
70 char fn[PATH_MAX];
71 snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
72
73 int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644);
74 if (fd < 0) {
75 fd = -errno;
76 derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
77 return fd;
78 }
79
80 // ext4 has a bug in older kernels where fiemap will return an empty
81 // result in some cases. this is a file layout that triggers the bug
82 // on 2.6.34-rc5.
83 int v[] = {
84 0x0000000000016000, 0x0000000000007000,
85 0x000000000004a000, 0x0000000000007000,
86 0x0000000000060000, 0x0000000000001000,
87 0x0000000000061000, 0x0000000000008000,
88 0x0000000000069000, 0x0000000000007000,
89 0x00000000000a3000, 0x000000000000c000,
90 0x000000000024e000, 0x000000000000c000,
91 0x000000000028b000, 0x0000000000009000,
92 0x00000000002b1000, 0x0000000000003000,
93 0, 0
94 };
95 for (int i=0; v[i]; i++) {
96 int off = v[i++];
97 int len = v[i];
98
99 // write a large extent
100 char buf[len];
101 memset(buf, 1, sizeof(buf));
102 int r = ::lseek(fd, off, SEEK_SET);
103 if (r < 0) {
104 r = -errno;
105 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
106 VOID_TEMP_FAILURE_RETRY(::close(fd));
107 return r;
108 }
109 r = write(fd, buf, sizeof(buf));
110 if (r < 0) {
111 derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
112 VOID_TEMP_FAILURE_RETRY(::close(fd));
113 return r;
114 }
115 }
116
117 // fiemap an extent inside that
118 if (!m_filestore_fiemap) {
119 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
120 ioctl_fiemap = false;
121 } else {
122 struct fiemap *fiemap;
123 int r = do_fiemap(fd, 2430421, 59284, &fiemap);
124 if (r < 0) {
125 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
126 ioctl_fiemap = false;
127 } else {
128 if (fiemap->fm_mapped_extents == 0) {
129 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
130 ioctl_fiemap = false;
131 } else {
132 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
133 ioctl_fiemap = true;
134 }
135 free(fiemap);
136 }
137 }
138
139 // SEEK_DATA/SEEK_HOLE detection
140 if (!m_filestore_seek_data_hole) {
141 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
142 seek_data_hole = false;
143 } else {
144#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
145 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
146 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
147 // Fall back to use fiemap.
148 off_t hole_pos;
149
150 hole_pos = lseek(fd, 0, SEEK_HOLE);
151 if (hole_pos < 0) {
152 if (errno == EINVAL) {
153 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
154 seek_data_hole = false;
155 } else {
156 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
157 VOID_TEMP_FAILURE_RETRY(::close(fd));
158 return -errno;
159 }
160 } else {
161 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
162 seek_data_hole = true;
163 }
164#endif
165 }
166
167 //splice detection
168#ifdef CEPH_HAVE_SPLICE
169 if (!m_filestore_splice) {
170 dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
171 use_splice = false;
172 } else {
173 int pipefd[2];
174 loff_t off_in = 0;
175 int r;
176 if ((r = pipe(pipefd)) < 0)
177 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl;
178 else {
179 lseek(fd, 0, SEEK_SET);
180 r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
181 if (!(r < 0 && errno == EINVAL)) {
182 use_splice = true;
183 dout(0) << "detect_features: splice is supported" << dendl;
184 } else
185 dout(0) << "detect_features: splice is NOT supported" << dendl;
186 close(pipefd[0]);
187 close(pipefd[1]);
188 }
189 }
190#endif
191 ::unlink(fn);
192 VOID_TEMP_FAILURE_RETRY(::close(fd));
193
194
195 bool have_syncfs = false;
196#ifdef HAVE_SYS_SYNCFS
197 if (::syncfs(get_basedir_fd()) == 0) {
198 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
199 have_syncfs = true;
200 } else {
201 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
202 }
203#elif defined(SYS_syncfs)
204 if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
205 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
206 have_syncfs = true;
207 } else {
208 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
209 }
210#elif defined(__NR_syncfs)
211 if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
212 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
213 have_syncfs = true;
214 } else {
215 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
216 }
217#endif
218 if (!have_syncfs) {
219 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
220 if (m_filestore_fsync_flushes_journal_data) {
221 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
222 } else {
223 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
224 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
225 }
226 }
227
228 return 0;
229}
230
231int GenericFileStoreBackend::create_current()
232{
233 struct stat st;
234 int ret = ::stat(get_current_path().c_str(), &st);
235 if (ret == 0) {
236 // current/ exists
237 if (!S_ISDIR(st.st_mode)) {
238 dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
239 ret = -EINVAL;
240 }
241 } else {
242 ret = ::mkdir(get_current_path().c_str(), 0755);
243 if (ret < 0) {
244 ret = -errno;
245 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
246 }
247 }
248 return ret;
249}
250
251int GenericFileStoreBackend::syncfs()
252{
253 int ret;
254 if (m_filestore_fsync_flushes_journal_data) {
255 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
256 // make the file system's journal commit.
257 // this works with ext3, but NOT ext4
258 ret = ::fsync(get_op_fd());
259 if (ret < 0)
260 ret = -errno;
261 } else {
262 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
263 ret = sync_filesystem(get_current_fd());
264 }
265 return ret;
266}
267
268int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
269{
270 struct fiemap *fiemap = NULL;
271 struct fiemap *_realloc_fiemap = NULL;
272 int size;
273 int ret;
274
275 fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
276 if (!fiemap)
277 return -ENOMEM;
278 /*
279 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
280 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
281 * Commit:"xfs: fix rounding error of fiemap length parameter
282 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
283 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
284 */
285 fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
286 fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
287 fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
288
289#if defined(DARWIN) || defined(__FreeBSD__)
290 ret = -ENOTSUP;
291 goto done_err;
292#else
293 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
294 ret = -errno;
295 goto done_err;
296 }
297#endif
298 size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
299
300 _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
301 if (!_realloc_fiemap) {
302 ret = -ENOMEM;
303 goto done_err;
304 } else {
305 fiemap = _realloc_fiemap;
306 }
307
308 memset(fiemap->fm_extents, 0, size);
309
310 fiemap->fm_extent_count = fiemap->fm_mapped_extents;
311 fiemap->fm_mapped_extents = 0;
312
313#if defined(DARWIN) || defined(__FreeBSD__)
314 ret = -ENOTSUP;
315 goto done_err;
316#else
317 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
318 ret = -errno;
319 goto done_err;
320 }
321 *pfiemap = fiemap;
322#endif
323 return 0;
324
325done_err:
326 *pfiemap = NULL;
327 free(fiemap);
328 return ret;
329}
330
331
332int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
333{
334 char buf[100];
335 bufferptr bp;
336 int r = 0;
337 int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
338 if (l == -ENODATA) {
339 return 0;
340 }
341 if (l >= 0) {
342 bp = buffer::create(l);
343 memcpy(bp.c_str(), buf, l);
344 } else if (l == -ERANGE) {
345 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
346 if (l > 0) {
347 bp = buffer::create(l);
348 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
349 }
350 }
351 bufferlist bl;
352 bl.append(std::move(bp));
353 bufferlist::iterator p = bl.begin();
354 try {
355 ::decode(*cm, p);
356 }
357 catch (buffer::error &e) {
358 r = -EIO;
359 }
360 if (r < 0)
361 derr << __func__ << " got " << cpp_strerror(r) << dendl;
362 return r;
363}
364
365int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
366{
367 bufferlist bl;
368 ::encode(*cm, bl);
369 int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
370 if (r < 0)
371 derr << __func__ << " got " << cpp_strerror(r) << dendl;
372 return r;
373}
374
375int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
376{
377 SloppyCRCMap scm(get_crc_block_size());
378 int r = _crc_load_or_init(fd, &scm);
379 if (r < 0)
380 return r;
381 ostringstream ss;
382 scm.write(off, len, bl, &ss);
383 dout(30) << __func__ << "\n" << ss.str() << dendl;
384 r = _crc_save(fd, &scm);
385 return r;
386}
387
388int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
389{
390 SloppyCRCMap scm(get_crc_block_size());
391 int r = _crc_load_or_init(fd, &scm);
392 if (r < 0)
393 return r;
394 scm.truncate(off);
395 r = _crc_save(fd, &scm);
396 return r;
397}
398
399int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
400{
401 SloppyCRCMap scm(get_crc_block_size());
402 int r = _crc_load_or_init(fd, &scm);
403 if (r < 0)
404 return r;
405 scm.zero(off, len);
406 r = _crc_save(fd, &scm);
407 return r;
408}
409
410int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
411 loff_t srcoff, size_t len, loff_t dstoff)
412{
413 SloppyCRCMap scm_src(get_crc_block_size());
414 SloppyCRCMap scm_dst(get_crc_block_size());
415 int r = _crc_load_or_init(srcfd, &scm_src);
416 if (r < 0)
417 return r;
418 r = _crc_load_or_init(destfd, &scm_dst);
419 if (r < 0)
420 return r;
421 ostringstream ss;
422 scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
423 dout(30) << __func__ << "\n" << ss.str() << dendl;
424 r = _crc_save(destfd, &scm_dst);
425 return r;
426}
427
428int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
429 ostream *out)
430{
431 SloppyCRCMap scm(get_crc_block_size());
432 int r = _crc_load_or_init(fd, &scm);
433 if (r < 0)
434 return r;
435 return scm.read(off, len, bl, out);
436}