]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/filestore/GenericFileStoreBackend.cc
33680a4ca6b79ad5ac36fc97dbbfbcb12722ee4f
[ceph.git] / ceph / src / os / filestore / GenericFileStoreBackend.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/int_types.h"
16 #include "include/types.h"
17
18 #include <unistd.h>
19 #include <fcntl.h>
20 #include <errno.h>
21 #include <stdlib.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <sys/ioctl.h>
25
26 #if defined(__linux__)
27 #include <linux/fs.h>
28 #endif
29
30 #include "include/compat.h"
31 #include "include/linux_fiemap.h"
32
33 #include <iostream>
34 #include <fstream>
35 #include <sstream>
36
37 #include "GenericFileStoreBackend.h"
38
39 #include "common/errno.h"
40 #include "common/config.h"
41 #include "common/sync_filesystem.h"
42 #include "common/blkdev.h"
43
44 #include "common/SloppyCRCMap.h"
45 #include "os/filestore/chain_xattr.h"
46
// Name of the extended attribute that holds an object's encoded SloppyCRCMap.
#define SLOPPY_CRC_XATTR "user.cephos.scrc"


// Logging setup for this file: context, subsystem, and a per-line prefix
// that includes the backend's base directory.
#define dout_context cct()
#define dout_subsys ceph_subsys_filestore
#undef dout_prefix
#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "

// Alignment helpers.  'by' must be non-zero; note that every macro
// evaluates its arguments more than once, so pass side-effect-free values.
//   ALIGN_DOWN: largest multiple of 'by' that is <= x
//   ALIGNED:    true when x is an exact multiple of 'by'
//   ALIGN_UP:   smallest multiple of 'by' that is >= x
#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
#define ALIGNED(x, by) (((x) % (by)) == 0)
#define ALIGN_UP(x, by) ((((x) % (by)) != 0) ? (((x) - ((x) % (by))) + (by)) : (x))
58
59 GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
60 FileStoreBackend(fs),
61 ioctl_fiemap(false),
62 seek_data_hole(false),
63 use_splice(false),
64 m_filestore_fiemap(cct()->_conf->filestore_fiemap),
65 m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
66 m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
67 m_filestore_splice(cct()->_conf->filestore_splice)
68 {
69 // rotational?
70 {
71 // NOTE: the below won't work on btrfs; we'll assume rotational.
72 string fn = get_basedir_path();
73 int fd = ::open(fn.c_str(), O_RDONLY);
74 if (fd < 0) {
75 return;
76 }
77 char partition[PATH_MAX], devname[PATH_MAX];
78 int r = get_device_by_fd(fd, partition, devname, sizeof(devname));
79 if (r < 0) {
80 dout(1) << "unable to get device name for " << get_basedir_path() << ": "
81 << cpp_strerror(r) << dendl;
82 m_rotational = true;
83 } else {
84 m_rotational = block_device_is_rotational(devname);
85 dout(20) << __func__ << " devname " << devname
86 << " rotational " << (int)m_rotational << dendl;
87 }
88 ::close(fd);
89 }
90 }
91
92 int GenericFileStoreBackend::detect_features()
93 {
94 char fn[PATH_MAX];
95 snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
96
97 int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644);
98 if (fd < 0) {
99 fd = -errno;
100 derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
101 return fd;
102 }
103
104 // ext4 has a bug in older kernels where fiemap will return an empty
105 // result in some cases. this is a file layout that triggers the bug
106 // on 2.6.34-rc5.
107 int v[] = {
108 0x0000000000016000, 0x0000000000007000,
109 0x000000000004a000, 0x0000000000007000,
110 0x0000000000060000, 0x0000000000001000,
111 0x0000000000061000, 0x0000000000008000,
112 0x0000000000069000, 0x0000000000007000,
113 0x00000000000a3000, 0x000000000000c000,
114 0x000000000024e000, 0x000000000000c000,
115 0x000000000028b000, 0x0000000000009000,
116 0x00000000002b1000, 0x0000000000003000,
117 0, 0
118 };
119 for (int i=0; v[i]; i++) {
120 int off = v[i++];
121 int len = v[i];
122
123 // write a large extent
124 char buf[len];
125 memset(buf, 1, sizeof(buf));
126 int r = ::lseek(fd, off, SEEK_SET);
127 if (r < 0) {
128 r = -errno;
129 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
130 VOID_TEMP_FAILURE_RETRY(::close(fd));
131 return r;
132 }
133 r = write(fd, buf, sizeof(buf));
134 if (r < 0) {
135 derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
136 VOID_TEMP_FAILURE_RETRY(::close(fd));
137 return r;
138 }
139 }
140
141 // fiemap an extent inside that
142 if (!m_filestore_fiemap) {
143 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
144 ioctl_fiemap = false;
145 } else {
146 struct fiemap *fiemap;
147 int r = do_fiemap(fd, 2430421, 59284, &fiemap);
148 if (r < 0) {
149 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
150 ioctl_fiemap = false;
151 } else {
152 if (fiemap->fm_mapped_extents == 0) {
153 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
154 ioctl_fiemap = false;
155 } else {
156 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
157 ioctl_fiemap = true;
158 }
159 free(fiemap);
160 }
161 }
162
163 // SEEK_DATA/SEEK_HOLE detection
164 if (!m_filestore_seek_data_hole) {
165 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
166 seek_data_hole = false;
167 } else {
168 #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
169 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
170 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
171 // Fall back to use fiemap.
172 off_t hole_pos;
173
174 hole_pos = lseek(fd, 0, SEEK_HOLE);
175 if (hole_pos < 0) {
176 if (errno == EINVAL) {
177 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
178 seek_data_hole = false;
179 } else {
180 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
181 VOID_TEMP_FAILURE_RETRY(::close(fd));
182 return -errno;
183 }
184 } else {
185 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
186 seek_data_hole = true;
187 }
188 #endif
189 }
190
191 //splice detection
192 #ifdef CEPH_HAVE_SPLICE
193 if (!m_filestore_splice) {
194 dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
195 use_splice = false;
196 } else {
197 int pipefd[2];
198 loff_t off_in = 0;
199 int r;
200 if ((r = pipe(pipefd)) < 0)
201 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl;
202 else {
203 lseek(fd, 0, SEEK_SET);
204 r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
205 if (!(r < 0 && errno == EINVAL)) {
206 use_splice = true;
207 dout(0) << "detect_features: splice is supported" << dendl;
208 } else
209 dout(0) << "detect_features: splice is NOT supported" << dendl;
210 close(pipefd[0]);
211 close(pipefd[1]);
212 }
213 }
214 #endif
215 ::unlink(fn);
216 VOID_TEMP_FAILURE_RETRY(::close(fd));
217
218
219 bool have_syncfs = false;
220 #ifdef HAVE_SYS_SYNCFS
221 if (::syncfs(get_basedir_fd()) == 0) {
222 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
223 have_syncfs = true;
224 } else {
225 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
226 }
227 #elif defined(SYS_syncfs)
228 if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
229 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
230 have_syncfs = true;
231 } else {
232 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
233 }
234 #elif defined(__NR_syncfs)
235 if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
236 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
237 have_syncfs = true;
238 } else {
239 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
240 }
241 #endif
242 if (!have_syncfs) {
243 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
244 if (m_filestore_fsync_flushes_journal_data) {
245 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
246 } else {
247 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
248 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
249 }
250 }
251
252 return 0;
253 }
254
255 int GenericFileStoreBackend::create_current()
256 {
257 struct stat st;
258 int ret = ::stat(get_current_path().c_str(), &st);
259 if (ret == 0) {
260 // current/ exists
261 if (!S_ISDIR(st.st_mode)) {
262 dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
263 ret = -EINVAL;
264 }
265 } else {
266 ret = ::mkdir(get_current_path().c_str(), 0755);
267 if (ret < 0) {
268 ret = -errno;
269 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
270 }
271 }
272 return ret;
273 }
274
275 int GenericFileStoreBackend::syncfs()
276 {
277 int ret;
278 if (m_filestore_fsync_flushes_journal_data) {
279 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
280 // make the file system's journal commit.
281 // this works with ext3, but NOT ext4
282 ret = ::fsync(get_op_fd());
283 if (ret < 0)
284 ret = -errno;
285 } else {
286 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
287 ret = sync_filesystem(get_current_fd());
288 }
289 return ret;
290 }
291
292 int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
293 {
294 struct fiemap *fiemap = NULL;
295 struct fiemap *_realloc_fiemap = NULL;
296 int size;
297 int ret;
298
299 fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
300 if (!fiemap)
301 return -ENOMEM;
302 /*
303 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
304 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
305 * Commit:"xfs: fix rounding error of fiemap length parameter
306 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
307 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
308 */
309 fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
310 fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
311 fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
312
313 #if defined(DARWIN) || defined(__FreeBSD__)
314 ret = -ENOTSUP;
315 goto done_err;
316 #else
317 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
318 ret = -errno;
319 goto done_err;
320 }
321 #endif
322 size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
323
324 _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
325 if (!_realloc_fiemap) {
326 ret = -ENOMEM;
327 goto done_err;
328 } else {
329 fiemap = _realloc_fiemap;
330 }
331
332 memset(fiemap->fm_extents, 0, size);
333
334 fiemap->fm_extent_count = fiemap->fm_mapped_extents;
335 fiemap->fm_mapped_extents = 0;
336
337 #if defined(DARWIN) || defined(__FreeBSD__)
338 ret = -ENOTSUP;
339 goto done_err;
340 #else
341 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
342 ret = -errno;
343 goto done_err;
344 }
345 *pfiemap = fiemap;
346 #endif
347 return 0;
348
349 done_err:
350 *pfiemap = NULL;
351 free(fiemap);
352 return ret;
353 }
354
355
356 int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
357 {
358 char buf[100];
359 bufferptr bp;
360 int r = 0;
361 int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
362 if (l == -ENODATA) {
363 return 0;
364 }
365 if (l >= 0) {
366 bp = buffer::create(l);
367 memcpy(bp.c_str(), buf, l);
368 } else if (l == -ERANGE) {
369 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
370 if (l > 0) {
371 bp = buffer::create(l);
372 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
373 }
374 }
375 bufferlist bl;
376 bl.append(std::move(bp));
377 bufferlist::iterator p = bl.begin();
378 try {
379 ::decode(*cm, p);
380 }
381 catch (buffer::error &e) {
382 r = -EIO;
383 }
384 if (r < 0)
385 derr << __func__ << " got " << cpp_strerror(r) << dendl;
386 return r;
387 }
388
389 int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
390 {
391 bufferlist bl;
392 ::encode(*cm, bl);
393 int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
394 if (r < 0)
395 derr << __func__ << " got " << cpp_strerror(r) << dendl;
396 return r;
397 }
398
399 int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
400 {
401 SloppyCRCMap scm(get_crc_block_size());
402 int r = _crc_load_or_init(fd, &scm);
403 if (r < 0)
404 return r;
405 ostringstream ss;
406 scm.write(off, len, bl, &ss);
407 dout(30) << __func__ << "\n" << ss.str() << dendl;
408 r = _crc_save(fd, &scm);
409 return r;
410 }
411
412 int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
413 {
414 SloppyCRCMap scm(get_crc_block_size());
415 int r = _crc_load_or_init(fd, &scm);
416 if (r < 0)
417 return r;
418 scm.truncate(off);
419 r = _crc_save(fd, &scm);
420 return r;
421 }
422
423 int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
424 {
425 SloppyCRCMap scm(get_crc_block_size());
426 int r = _crc_load_or_init(fd, &scm);
427 if (r < 0)
428 return r;
429 scm.zero(off, len);
430 r = _crc_save(fd, &scm);
431 return r;
432 }
433
434 int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
435 loff_t srcoff, size_t len, loff_t dstoff)
436 {
437 SloppyCRCMap scm_src(get_crc_block_size());
438 SloppyCRCMap scm_dst(get_crc_block_size());
439 int r = _crc_load_or_init(srcfd, &scm_src);
440 if (r < 0)
441 return r;
442 r = _crc_load_or_init(destfd, &scm_dst);
443 if (r < 0)
444 return r;
445 ostringstream ss;
446 scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
447 dout(30) << __func__ << "\n" << ss.str() << dendl;
448 r = _crc_save(destfd, &scm_dst);
449 return r;
450 }
451
452 int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
453 ostream *out)
454 {
455 SloppyCRCMap scm(get_crc_block_size());
456 int r = _crc_load_or_init(fd, &scm);
457 if (r < 0)
458 return r;
459 return scm.read(off, len, bl, out);
460 }