]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/filestore/GenericFileStoreBackend.cc
update sources to v12.1.3
[ceph.git] / ceph / src / os / filestore / GenericFileStoreBackend.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/int_types.h"
16#include "include/types.h"
17
18#include <unistd.h>
19#include <fcntl.h>
20#include <errno.h>
21#include <stdlib.h>
22#include <sys/types.h>
23#include <sys/stat.h>
24#include <sys/ioctl.h>
25
26#if defined(__linux__)
27#include <linux/fs.h>
28#endif
29
30#include "include/compat.h"
31#include "include/linux_fiemap.h"
32
33#include <iostream>
34#include <fstream>
35#include <sstream>
36
37#include "GenericFileStoreBackend.h"
38
39#include "common/errno.h"
40#include "common/config.h"
41#include "common/sync_filesystem.h"
31f18b77 42#include "common/blkdev.h"
7c673cae
FG
43
44#include "common/SloppyCRCMap.h"
45#include "os/filestore/chain_xattr.h"
46
47#define SLOPPY_CRC_XATTR "user.cephos.scrc"
48
49
50#define dout_context cct()
51#define dout_subsys ceph_subsys_filestore
52#undef dout_prefix
53#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
54
// Integer alignment helpers.  Each macro evaluates its arguments more than
// once, so pass only side-effect-free expressions; 'by' must be non-zero
// because it is used as the right-hand operand of '%'.
//
// ALIGN_DOWN(x, by): x minus its remainder modulo 'by'.
55#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
// ALIGNED(x, by): true when x is an exact multiple of 'by'.
56#define ALIGNED(x, by) (!((x) % (by)))
// ALIGN_UP(x, by): x itself when already aligned, otherwise ALIGN_DOWN(x, by)
// plus one further step of 'by'.
57#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
58
59GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
60 FileStoreBackend(fs),
61 ioctl_fiemap(false),
62 seek_data_hole(false),
63 use_splice(false),
64 m_filestore_fiemap(cct()->_conf->filestore_fiemap),
65 m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
66 m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
31f18b77
FG
67 m_filestore_splice(cct()->_conf->filestore_splice)
68{
69 // rotational?
70 {
71 // NOTE: the below won't work on btrfs; we'll assume rotational.
72 string fn = get_basedir_path();
73 int fd = ::open(fn.c_str(), O_RDONLY);
74 if (fd < 0) {
75 return;
76 }
77 char partition[PATH_MAX], devname[PATH_MAX];
78 int r = get_device_by_fd(fd, partition, devname, sizeof(devname));
79 if (r < 0) {
80 dout(1) << "unable to get device name for " << get_basedir_path() << ": "
81 << cpp_strerror(r) << dendl;
82 m_rotational = true;
83 } else {
84 m_rotational = block_device_is_rotational(devname);
85 dout(20) << __func__ << " devname " << devname
86 << " rotational " << (int)m_rotational << dendl;
87 }
88 ::close(fd);
89 }
d2e6a577
FG
90 // journal rotational?
91 {
92 // NOTE: the below won't work on btrfs; we'll assume rotational.
93 string fn = get_journal_path();
94 int fd = ::open(fn.c_str(), O_RDONLY);
95 if (fd < 0) {
96 return;
97 }
98 char partition[PATH_MAX], devname[PATH_MAX];
99 int r = get_device_by_fd(fd, partition, devname, sizeof(devname));
100 if (r < 0) {
101 dout(1) << "unable to get journal device name for "
102 << get_journal_path() << ": " << cpp_strerror(r) << dendl;
103 m_journal_rotational = true;
104 } else {
105 m_journal_rotational = block_device_is_rotational(devname);
106 dout(20) << __func__ << " journal devname " << devname
107 << " journal rotational " << (int)m_journal_rotational << dendl;
108 }
109 ::close(fd);
110 }
31f18b77 111}
7c673cae
FG
112
113int GenericFileStoreBackend::detect_features()
114{
115 char fn[PATH_MAX];
116 snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
117
118 int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644);
119 if (fd < 0) {
120 fd = -errno;
121 derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
122 return fd;
123 }
124
125 // ext4 has a bug in older kernels where fiemap will return an empty
126 // result in some cases. this is a file layout that triggers the bug
127 // on 2.6.34-rc5.
128 int v[] = {
129 0x0000000000016000, 0x0000000000007000,
130 0x000000000004a000, 0x0000000000007000,
131 0x0000000000060000, 0x0000000000001000,
132 0x0000000000061000, 0x0000000000008000,
133 0x0000000000069000, 0x0000000000007000,
134 0x00000000000a3000, 0x000000000000c000,
135 0x000000000024e000, 0x000000000000c000,
136 0x000000000028b000, 0x0000000000009000,
137 0x00000000002b1000, 0x0000000000003000,
138 0, 0
139 };
140 for (int i=0; v[i]; i++) {
141 int off = v[i++];
142 int len = v[i];
143
144 // write a large extent
145 char buf[len];
146 memset(buf, 1, sizeof(buf));
147 int r = ::lseek(fd, off, SEEK_SET);
148 if (r < 0) {
149 r = -errno;
150 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
151 VOID_TEMP_FAILURE_RETRY(::close(fd));
152 return r;
153 }
154 r = write(fd, buf, sizeof(buf));
155 if (r < 0) {
156 derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
157 VOID_TEMP_FAILURE_RETRY(::close(fd));
158 return r;
159 }
160 }
161
162 // fiemap an extent inside that
163 if (!m_filestore_fiemap) {
164 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
165 ioctl_fiemap = false;
166 } else {
167 struct fiemap *fiemap;
168 int r = do_fiemap(fd, 2430421, 59284, &fiemap);
169 if (r < 0) {
170 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
171 ioctl_fiemap = false;
172 } else {
173 if (fiemap->fm_mapped_extents == 0) {
174 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
175 ioctl_fiemap = false;
176 } else {
177 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
178 ioctl_fiemap = true;
179 }
180 free(fiemap);
181 }
182 }
183
184 // SEEK_DATA/SEEK_HOLE detection
185 if (!m_filestore_seek_data_hole) {
186 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
187 seek_data_hole = false;
188 } else {
189#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
190 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
191 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
192 // Fall back to use fiemap.
193 off_t hole_pos;
194
195 hole_pos = lseek(fd, 0, SEEK_HOLE);
196 if (hole_pos < 0) {
197 if (errno == EINVAL) {
198 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
199 seek_data_hole = false;
200 } else {
201 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
202 VOID_TEMP_FAILURE_RETRY(::close(fd));
203 return -errno;
204 }
205 } else {
206 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
207 seek_data_hole = true;
208 }
209#endif
210 }
211
212 //splice detection
213#ifdef CEPH_HAVE_SPLICE
214 if (!m_filestore_splice) {
215 dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
216 use_splice = false;
217 } else {
218 int pipefd[2];
219 loff_t off_in = 0;
220 int r;
221 if ((r = pipe(pipefd)) < 0)
222 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl;
223 else {
224 lseek(fd, 0, SEEK_SET);
225 r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
226 if (!(r < 0 && errno == EINVAL)) {
227 use_splice = true;
228 dout(0) << "detect_features: splice is supported" << dendl;
229 } else
230 dout(0) << "detect_features: splice is NOT supported" << dendl;
231 close(pipefd[0]);
232 close(pipefd[1]);
233 }
234 }
235#endif
236 ::unlink(fn);
237 VOID_TEMP_FAILURE_RETRY(::close(fd));
238
239
240 bool have_syncfs = false;
241#ifdef HAVE_SYS_SYNCFS
242 if (::syncfs(get_basedir_fd()) == 0) {
243 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
244 have_syncfs = true;
245 } else {
246 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
247 }
248#elif defined(SYS_syncfs)
249 if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
250 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
251 have_syncfs = true;
252 } else {
253 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
254 }
255#elif defined(__NR_syncfs)
256 if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
257 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
258 have_syncfs = true;
259 } else {
260 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
261 }
262#endif
263 if (!have_syncfs) {
264 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
265 if (m_filestore_fsync_flushes_journal_data) {
266 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
267 } else {
268 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
269 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
270 }
271 }
272
273 return 0;
274}
275
276int GenericFileStoreBackend::create_current()
277{
278 struct stat st;
279 int ret = ::stat(get_current_path().c_str(), &st);
280 if (ret == 0) {
281 // current/ exists
282 if (!S_ISDIR(st.st_mode)) {
283 dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
284 ret = -EINVAL;
285 }
286 } else {
287 ret = ::mkdir(get_current_path().c_str(), 0755);
288 if (ret < 0) {
289 ret = -errno;
290 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
291 }
292 }
293 return ret;
294}
295
296int GenericFileStoreBackend::syncfs()
297{
298 int ret;
299 if (m_filestore_fsync_flushes_journal_data) {
300 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
301 // make the file system's journal commit.
302 // this works with ext3, but NOT ext4
303 ret = ::fsync(get_op_fd());
304 if (ret < 0)
305 ret = -errno;
306 } else {
307 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
308 ret = sync_filesystem(get_current_fd());
309 }
310 return ret;
311}
312
313int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
314{
315 struct fiemap *fiemap = NULL;
316 struct fiemap *_realloc_fiemap = NULL;
317 int size;
318 int ret;
319
320 fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
321 if (!fiemap)
322 return -ENOMEM;
323 /*
324 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
325 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
326 * Commit:"xfs: fix rounding error of fiemap length parameter
327 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
328 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
329 */
330 fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
331 fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
332 fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
333
334#if defined(DARWIN) || defined(__FreeBSD__)
335 ret = -ENOTSUP;
336 goto done_err;
337#else
338 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
339 ret = -errno;
340 goto done_err;
341 }
342#endif
343 size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
344
345 _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
346 if (!_realloc_fiemap) {
347 ret = -ENOMEM;
348 goto done_err;
349 } else {
350 fiemap = _realloc_fiemap;
351 }
352
353 memset(fiemap->fm_extents, 0, size);
354
355 fiemap->fm_extent_count = fiemap->fm_mapped_extents;
356 fiemap->fm_mapped_extents = 0;
357
358#if defined(DARWIN) || defined(__FreeBSD__)
359 ret = -ENOTSUP;
360 goto done_err;
361#else
362 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
363 ret = -errno;
364 goto done_err;
365 }
366 *pfiemap = fiemap;
367#endif
368 return 0;
369
370done_err:
371 *pfiemap = NULL;
372 free(fiemap);
373 return ret;
374}
375
376
377int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
378{
379 char buf[100];
380 bufferptr bp;
381 int r = 0;
382 int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
383 if (l == -ENODATA) {
384 return 0;
385 }
386 if (l >= 0) {
387 bp = buffer::create(l);
388 memcpy(bp.c_str(), buf, l);
389 } else if (l == -ERANGE) {
390 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
391 if (l > 0) {
392 bp = buffer::create(l);
393 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
394 }
395 }
396 bufferlist bl;
397 bl.append(std::move(bp));
398 bufferlist::iterator p = bl.begin();
399 try {
400 ::decode(*cm, p);
401 }
402 catch (buffer::error &e) {
403 r = -EIO;
404 }
405 if (r < 0)
406 derr << __func__ << " got " << cpp_strerror(r) << dendl;
407 return r;
408}
409
410int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
411{
412 bufferlist bl;
413 ::encode(*cm, bl);
414 int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
415 if (r < 0)
416 derr << __func__ << " got " << cpp_strerror(r) << dendl;
417 return r;
418}
419
420int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
421{
422 SloppyCRCMap scm(get_crc_block_size());
423 int r = _crc_load_or_init(fd, &scm);
424 if (r < 0)
425 return r;
426 ostringstream ss;
427 scm.write(off, len, bl, &ss);
428 dout(30) << __func__ << "\n" << ss.str() << dendl;
429 r = _crc_save(fd, &scm);
430 return r;
431}
432
433int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
434{
435 SloppyCRCMap scm(get_crc_block_size());
436 int r = _crc_load_or_init(fd, &scm);
437 if (r < 0)
438 return r;
439 scm.truncate(off);
440 r = _crc_save(fd, &scm);
441 return r;
442}
443
444int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
445{
446 SloppyCRCMap scm(get_crc_block_size());
447 int r = _crc_load_or_init(fd, &scm);
448 if (r < 0)
449 return r;
450 scm.zero(off, len);
451 r = _crc_save(fd, &scm);
452 return r;
453}
454
455int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
456 loff_t srcoff, size_t len, loff_t dstoff)
457{
458 SloppyCRCMap scm_src(get_crc_block_size());
459 SloppyCRCMap scm_dst(get_crc_block_size());
460 int r = _crc_load_or_init(srcfd, &scm_src);
461 if (r < 0)
462 return r;
463 r = _crc_load_or_init(destfd, &scm_dst);
464 if (r < 0)
465 return r;
466 ostringstream ss;
467 scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
468 dout(30) << __func__ << "\n" << ss.str() << dendl;
469 r = _crc_save(destfd, &scm_dst);
470 return r;
471}
472
473int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
474 ostream *out)
475{
476 SloppyCRCMap scm(get_crc_block_size());
477 int r = _crc_load_or_init(fd, &scm);
478 if (r < 0)
479 return r;
480 return scm.read(off, len, bl, out);
481}