]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/filestore/GenericFileStoreBackend.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / os / filestore / GenericFileStoreBackend.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/int_types.h"
16 #include "include/types.h"
17
18 #include <unistd.h>
19 #include <fcntl.h>
20 #include <errno.h>
21 #include <stdlib.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <sys/ioctl.h>
25
26 #if defined(__linux__)
27 #include <linux/fs.h>
28 #endif
29
30 #include "include/compat.h"
31 #include "include/linux_fiemap.h"
32
33 #include <iostream>
34 #include <fstream>
35 #include <sstream>
36
37 #include "GenericFileStoreBackend.h"
38
39 #include "common/errno.h"
40 #include "common/config.h"
41 #include "common/sync_filesystem.h"
42 #include "common/blkdev.h"
43
44 #include "common/SloppyCRCMap.h"
45 #include "os/filestore/chain_xattr.h"
46
47 #define SLOPPY_CRC_XATTR "user.cephos.scrc"
48
49
50 #define dout_context cct()
51 #define dout_subsys ceph_subsys_filestore
52 #undef dout_prefix
53 #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
54
55 #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
56 #define ALIGNED(x, by) (!((x) % (by)))
57 #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
58
59 using std::ostream;
60 using std::ostringstream;
61 using std::string;
62
63 using ceph::bufferptr;
64 using ceph::bufferlist;
65
66 GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
67 FileStoreBackend(fs),
68 ioctl_fiemap(false),
69 seek_data_hole(false),
70 use_splice(false),
71 m_filestore_fiemap(cct()->_conf->filestore_fiemap),
72 m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
73 m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
74 m_filestore_splice(cct()->_conf->filestore_splice)
75 {
76 // rotational?
77 {
78 // NOTE: the below won't work on btrfs; we'll assume rotational.
79 string fn = get_basedir_path();
80 int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
81 if (fd < 0) {
82 return;
83 }
84 BlkDev blkdev(fd);
85 m_rotational = blkdev.is_rotational();
86 dout(20) << __func__ << " basedir " << fn
87 << " rotational " << (int)m_rotational << dendl;
88 ::close(fd);
89 }
90 // journal rotational?
91 {
92 // NOTE: the below won't work on btrfs; we'll assume rotational.
93 string fn = get_journal_path();
94 int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
95 if (fd < 0) {
96 return;
97 }
98 BlkDev blkdev(fd);
99 m_journal_rotational = blkdev.is_rotational();
100 dout(20) << __func__ << " journal filename " << fn.c_str()
101 << " journal rotational " << (int)m_journal_rotational << dendl;
102 ::close(fd);
103 }
104 }
105
106 int GenericFileStoreBackend::detect_features()
107 {
108 char fn[PATH_MAX];
109 snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
110
111 int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, 0644);
112 if (fd < 0) {
113 fd = -errno;
114 derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
115 return fd;
116 }
117
118 // ext4 has a bug in older kernels where fiemap will return an empty
119 // result in some cases. this is a file layout that triggers the bug
120 // on 2.6.34-rc5.
121 int v[] = {
122 0x0000000000016000, 0x0000000000007000,
123 0x000000000004a000, 0x0000000000007000,
124 0x0000000000060000, 0x0000000000001000,
125 0x0000000000061000, 0x0000000000008000,
126 0x0000000000069000, 0x0000000000007000,
127 0x00000000000a3000, 0x000000000000c000,
128 0x000000000024e000, 0x000000000000c000,
129 0x000000000028b000, 0x0000000000009000,
130 0x00000000002b1000, 0x0000000000003000,
131 0, 0
132 };
133 for (int i=0; v[i]; i++) {
134 int off = v[i++];
135 int len = v[i];
136
137 // write a large extent
138 char buf[len];
139 memset(buf, 1, sizeof(buf));
140 int r = ::lseek(fd, off, SEEK_SET);
141 if (r < 0) {
142 r = -errno;
143 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
144 VOID_TEMP_FAILURE_RETRY(::close(fd));
145 return r;
146 }
147 r = write(fd, buf, sizeof(buf));
148 if (r < 0) {
149 derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
150 VOID_TEMP_FAILURE_RETRY(::close(fd));
151 return r;
152 }
153 }
154
155 // fiemap an extent inside that
156 if (!m_filestore_fiemap) {
157 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
158 ioctl_fiemap = false;
159 } else {
160 struct fiemap *fiemap;
161 int r = do_fiemap(fd, 2430421, 59284, &fiemap);
162 if (r < 0) {
163 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
164 ioctl_fiemap = false;
165 } else {
166 if (fiemap->fm_mapped_extents == 0) {
167 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
168 ioctl_fiemap = false;
169 } else {
170 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
171 ioctl_fiemap = true;
172 }
173 free(fiemap);
174 }
175 }
176
177 // SEEK_DATA/SEEK_HOLE detection
178 if (!m_filestore_seek_data_hole) {
179 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
180 seek_data_hole = false;
181 } else {
182 #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
183 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
184 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
185 // Fall back to use fiemap.
186 off_t hole_pos;
187
188 hole_pos = lseek(fd, 0, SEEK_HOLE);
189 if (hole_pos < 0) {
190 if (errno == EINVAL) {
191 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
192 seek_data_hole = false;
193 } else {
194 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
195 VOID_TEMP_FAILURE_RETRY(::close(fd));
196 return -errno;
197 }
198 } else {
199 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
200 seek_data_hole = true;
201 }
202 #endif
203 }
204
205 //splice detection
206 #ifdef CEPH_HAVE_SPLICE
207 if (!m_filestore_splice) {
208 dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
209 use_splice = false;
210 } else {
211 int pipefd[2];
212 loff_t off_in = 0;
213 int r;
214 if (pipe_cloexec(pipefd, 0) < 0) {
215 int e = errno;
216 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e) << dendl;
217 } else {
218 lseek(fd, 0, SEEK_SET);
219 r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
220 if (!(r < 0 && errno == EINVAL)) {
221 use_splice = true;
222 dout(0) << "detect_features: splice is supported" << dendl;
223 } else
224 dout(0) << "detect_features: splice is NOT supported" << dendl;
225 close(pipefd[0]);
226 close(pipefd[1]);
227 }
228 }
229 #endif
230 ::unlink(fn);
231 VOID_TEMP_FAILURE_RETRY(::close(fd));
232
233
234 bool have_syncfs = false;
235 #ifdef HAVE_SYS_SYNCFS
236 if (::syncfs(get_basedir_fd()) == 0) {
237 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
238 have_syncfs = true;
239 } else {
240 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
241 }
242 #elif defined(SYS_syncfs)
243 if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
244 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
245 have_syncfs = true;
246 } else {
247 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
248 }
249 #elif defined(__NR_syncfs)
250 if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
251 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
252 have_syncfs = true;
253 } else {
254 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
255 }
256 #endif
257 if (!have_syncfs) {
258 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
259 if (m_filestore_fsync_flushes_journal_data) {
260 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
261 } else {
262 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
263 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
264 }
265 }
266
267 return 0;
268 }
269
270 int GenericFileStoreBackend::create_current()
271 {
272 struct stat st;
273 int ret = ::stat(get_current_path().c_str(), &st);
274 if (ret == 0) {
275 // current/ exists
276 if (!S_ISDIR(st.st_mode)) {
277 dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
278 ret = -EINVAL;
279 }
280 } else {
281 ret = ::mkdir(get_current_path().c_str(), 0755);
282 if (ret < 0) {
283 ret = -errno;
284 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
285 }
286 }
287 return ret;
288 }
289
290 int GenericFileStoreBackend::syncfs()
291 {
292 int ret;
293 if (m_filestore_fsync_flushes_journal_data) {
294 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
295 // make the file system's journal commit.
296 // this works with ext3, but NOT ext4
297 ret = ::fsync(get_op_fd());
298 if (ret < 0)
299 ret = -errno;
300 } else {
301 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
302 ret = sync_filesystem(get_current_fd());
303 }
304 return ret;
305 }
306
307 int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
308 {
309 struct fiemap *fiemap = NULL;
310 struct fiemap *_realloc_fiemap = NULL;
311 int size;
312 int ret;
313
314 fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
315 if (!fiemap)
316 return -ENOMEM;
317 /*
318 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
319 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
320 * Commit:"xfs: fix rounding error of fiemap length parameter
321 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
322 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
323 */
324 fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
325 fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
326 fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
327
328 #if defined(__APPLE__) || defined(__FreeBSD__)
329 ret = -ENOTSUP;
330 goto done_err;
331 #else
332 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
333 ret = -errno;
334 goto done_err;
335 }
336 #endif
337 size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
338
339 _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
340 if (!_realloc_fiemap) {
341 ret = -ENOMEM;
342 goto done_err;
343 } else {
344 fiemap = _realloc_fiemap;
345 }
346
347 memset(fiemap->fm_extents, 0, size);
348
349 fiemap->fm_extent_count = fiemap->fm_mapped_extents;
350 fiemap->fm_mapped_extents = 0;
351
352 #if defined(__APPLE__) || defined(__FreeBSD__)
353 ret = -ENOTSUP;
354 goto done_err;
355 #else
356 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
357 ret = -errno;
358 goto done_err;
359 }
360 *pfiemap = fiemap;
361 #endif
362 return 0;
363
364 done_err:
365 *pfiemap = NULL;
366 free(fiemap);
367 return ret;
368 }
369
370
371 int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
372 {
373 char buf[100];
374 bufferptr bp;
375 int r = 0;
376 int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
377 if (l == -ENODATA) {
378 return 0;
379 }
380 if (l >= 0) {
381 bp = ceph::buffer::create(l);
382 memcpy(bp.c_str(), buf, l);
383 } else if (l == -ERANGE) {
384 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
385 if (l > 0) {
386 bp = ceph::buffer::create(l);
387 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
388 }
389 }
390 bufferlist bl;
391 bl.append(std::move(bp));
392 auto p = bl.cbegin();
393 try {
394 decode(*cm, p);
395 }
396 catch (ceph::buffer::error &e) {
397 r = -EIO;
398 }
399 if (r < 0)
400 derr << __func__ << " got " << cpp_strerror(r) << dendl;
401 return r;
402 }
403
404 int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
405 {
406 bufferlist bl;
407 encode(*cm, bl);
408 int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
409 if (r < 0)
410 derr << __func__ << " got " << cpp_strerror(r) << dendl;
411 return r;
412 }
413
414 int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
415 {
416 SloppyCRCMap scm(get_crc_block_size());
417 int r = _crc_load_or_init(fd, &scm);
418 if (r < 0)
419 return r;
420 ostringstream ss;
421 scm.write(off, len, bl, &ss);
422 dout(30) << __func__ << "\n" << ss.str() << dendl;
423 r = _crc_save(fd, &scm);
424 return r;
425 }
426
427 int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
428 {
429 SloppyCRCMap scm(get_crc_block_size());
430 int r = _crc_load_or_init(fd, &scm);
431 if (r < 0)
432 return r;
433 scm.truncate(off);
434 r = _crc_save(fd, &scm);
435 return r;
436 }
437
438 int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
439 {
440 SloppyCRCMap scm(get_crc_block_size());
441 int r = _crc_load_or_init(fd, &scm);
442 if (r < 0)
443 return r;
444 scm.zero(off, len);
445 r = _crc_save(fd, &scm);
446 return r;
447 }
448
449 int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
450 loff_t srcoff, size_t len, loff_t dstoff)
451 {
452 SloppyCRCMap scm_src(get_crc_block_size());
453 SloppyCRCMap scm_dst(get_crc_block_size());
454 int r = _crc_load_or_init(srcfd, &scm_src);
455 if (r < 0)
456 return r;
457 r = _crc_load_or_init(destfd, &scm_dst);
458 if (r < 0)
459 return r;
460 ostringstream ss;
461 scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
462 dout(30) << __func__ << "\n" << ss.str() << dendl;
463 r = _crc_save(destfd, &scm_dst);
464 return r;
465 }
466
467 int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
468 ostream *out)
469 {
470 SloppyCRCMap scm(get_crc_block_size());
471 int r = _crc_load_or_init(fd, &scm);
472 if (r < 0)
473 return r;
474 return scm.read(off, len, bl, out);
475 }