]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "include/int_types.h" | |
16 | #include "include/types.h" | |
17 | ||
18 | #include <unistd.h> | |
19 | #include <fcntl.h> | |
20 | #include <errno.h> | |
21 | #include <stdlib.h> | |
22 | #include <sys/types.h> | |
23 | #include <sys/stat.h> | |
24 | #include <sys/ioctl.h> | |
25 | ||
26 | #if defined(__linux__) | |
27 | #include <linux/fs.h> | |
28 | #endif | |
29 | ||
30 | #include "include/compat.h" | |
31 | #include "include/linux_fiemap.h" | |
32 | ||
33 | #include <iostream> | |
34 | #include <fstream> | |
35 | #include <sstream> | |
36 | ||
37 | #include "GenericFileStoreBackend.h" | |
38 | ||
39 | #include "common/errno.h" | |
40 | #include "common/config.h" | |
41 | #include "common/sync_filesystem.h" | |
31f18b77 | 42 | #include "common/blkdev.h" |
7c673cae FG |
43 | |
44 | #include "common/SloppyCRCMap.h" | |
45 | #include "os/filestore/chain_xattr.h" | |
46 | ||
47 | #define SLOPPY_CRC_XATTR "user.cephos.scrc" | |
48 | ||
49 | ||
50 | #define dout_context cct() | |
51 | #define dout_subsys ceph_subsys_filestore | |
52 | #undef dout_prefix | |
53 | #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") " | |
54 | ||
55 | #define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) | |
56 | #define ALIGNED(x, by) (!((x) % (by))) | |
57 | #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by))) | |
58 | ||
59 | GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs): | |
60 | FileStoreBackend(fs), | |
61 | ioctl_fiemap(false), | |
62 | seek_data_hole(false), | |
63 | use_splice(false), | |
64 | m_filestore_fiemap(cct()->_conf->filestore_fiemap), | |
65 | m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole), | |
66 | m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data), | |
31f18b77 FG |
67 | m_filestore_splice(cct()->_conf->filestore_splice) |
68 | { | |
69 | // rotational? | |
70 | { | |
71 | // NOTE: the below won't work on btrfs; we'll assume rotational. | |
72 | string fn = get_basedir_path(); | |
73 | int fd = ::open(fn.c_str(), O_RDONLY); | |
74 | if (fd < 0) { | |
75 | return; | |
76 | } | |
77 | char partition[PATH_MAX], devname[PATH_MAX]; | |
78 | int r = get_device_by_fd(fd, partition, devname, sizeof(devname)); | |
79 | if (r < 0) { | |
80 | dout(1) << "unable to get device name for " << get_basedir_path() << ": " | |
81 | << cpp_strerror(r) << dendl; | |
82 | m_rotational = true; | |
83 | } else { | |
84 | m_rotational = block_device_is_rotational(devname); | |
85 | dout(20) << __func__ << " devname " << devname | |
86 | << " rotational " << (int)m_rotational << dendl; | |
87 | } | |
88 | ::close(fd); | |
89 | } | |
d2e6a577 FG |
90 | // journal rotational? |
91 | { | |
92 | // NOTE: the below won't work on btrfs; we'll assume rotational. | |
93 | string fn = get_journal_path(); | |
94 | int fd = ::open(fn.c_str(), O_RDONLY); | |
95 | if (fd < 0) { | |
96 | return; | |
97 | } | |
98 | char partition[PATH_MAX], devname[PATH_MAX]; | |
99 | int r = get_device_by_fd(fd, partition, devname, sizeof(devname)); | |
100 | if (r < 0) { | |
101 | dout(1) << "unable to get journal device name for " | |
102 | << get_journal_path() << ": " << cpp_strerror(r) << dendl; | |
103 | m_journal_rotational = true; | |
104 | } else { | |
105 | m_journal_rotational = block_device_is_rotational(devname); | |
106 | dout(20) << __func__ << " journal devname " << devname | |
107 | << " journal rotational " << (int)m_journal_rotational << dendl; | |
108 | } | |
109 | ::close(fd); | |
110 | } | |
31f18b77 | 111 | } |
7c673cae FG |
112 | |
113 | int GenericFileStoreBackend::detect_features() | |
114 | { | |
115 | char fn[PATH_MAX]; | |
116 | snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str()); | |
117 | ||
118 | int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644); | |
119 | if (fd < 0) { | |
120 | fd = -errno; | |
121 | derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl; | |
122 | return fd; | |
123 | } | |
124 | ||
125 | // ext4 has a bug in older kernels where fiemap will return an empty | |
126 | // result in some cases. this is a file layout that triggers the bug | |
127 | // on 2.6.34-rc5. | |
128 | int v[] = { | |
129 | 0x0000000000016000, 0x0000000000007000, | |
130 | 0x000000000004a000, 0x0000000000007000, | |
131 | 0x0000000000060000, 0x0000000000001000, | |
132 | 0x0000000000061000, 0x0000000000008000, | |
133 | 0x0000000000069000, 0x0000000000007000, | |
134 | 0x00000000000a3000, 0x000000000000c000, | |
135 | 0x000000000024e000, 0x000000000000c000, | |
136 | 0x000000000028b000, 0x0000000000009000, | |
137 | 0x00000000002b1000, 0x0000000000003000, | |
138 | 0, 0 | |
139 | }; | |
140 | for (int i=0; v[i]; i++) { | |
141 | int off = v[i++]; | |
142 | int len = v[i]; | |
143 | ||
144 | // write a large extent | |
145 | char buf[len]; | |
146 | memset(buf, 1, sizeof(buf)); | |
147 | int r = ::lseek(fd, off, SEEK_SET); | |
148 | if (r < 0) { | |
149 | r = -errno; | |
150 | derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl; | |
151 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
152 | return r; | |
153 | } | |
154 | r = write(fd, buf, sizeof(buf)); | |
155 | if (r < 0) { | |
156 | derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl; | |
157 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
158 | return r; | |
159 | } | |
160 | } | |
161 | ||
162 | // fiemap an extent inside that | |
163 | if (!m_filestore_fiemap) { | |
164 | dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl; | |
165 | ioctl_fiemap = false; | |
166 | } else { | |
167 | struct fiemap *fiemap; | |
168 | int r = do_fiemap(fd, 2430421, 59284, &fiemap); | |
169 | if (r < 0) { | |
170 | dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl; | |
171 | ioctl_fiemap = false; | |
172 | } else { | |
173 | if (fiemap->fm_mapped_extents == 0) { | |
174 | dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl; | |
175 | ioctl_fiemap = false; | |
176 | } else { | |
177 | dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; | |
178 | ioctl_fiemap = true; | |
179 | } | |
180 | free(fiemap); | |
181 | } | |
182 | } | |
183 | ||
184 | // SEEK_DATA/SEEK_HOLE detection | |
185 | if (!m_filestore_seek_data_hole) { | |
186 | dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl; | |
187 | seek_data_hole = false; | |
188 | } else { | |
189 | #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) | |
190 | // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running | |
191 | // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned. | |
192 | // Fall back to use fiemap. | |
193 | off_t hole_pos; | |
194 | ||
195 | hole_pos = lseek(fd, 0, SEEK_HOLE); | |
196 | if (hole_pos < 0) { | |
197 | if (errno == EINVAL) { | |
198 | dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl; | |
199 | seek_data_hole = false; | |
200 | } else { | |
201 | derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl; | |
202 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
203 | return -errno; | |
204 | } | |
205 | } else { | |
206 | dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl; | |
207 | seek_data_hole = true; | |
208 | } | |
209 | #endif | |
210 | } | |
211 | ||
212 | //splice detection | |
213 | #ifdef CEPH_HAVE_SPLICE | |
214 | if (!m_filestore_splice) { | |
215 | dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl; | |
216 | use_splice = false; | |
217 | } else { | |
218 | int pipefd[2]; | |
219 | loff_t off_in = 0; | |
220 | int r; | |
221 | if ((r = pipe(pipefd)) < 0) | |
222 | dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl; | |
223 | else { | |
224 | lseek(fd, 0, SEEK_SET); | |
225 | r = splice(fd, &off_in, pipefd[1], NULL, 10, 0); | |
226 | if (!(r < 0 && errno == EINVAL)) { | |
227 | use_splice = true; | |
228 | dout(0) << "detect_features: splice is supported" << dendl; | |
229 | } else | |
230 | dout(0) << "detect_features: splice is NOT supported" << dendl; | |
231 | close(pipefd[0]); | |
232 | close(pipefd[1]); | |
233 | } | |
234 | } | |
235 | #endif | |
236 | ::unlink(fn); | |
237 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
238 | ||
239 | ||
240 | bool have_syncfs = false; | |
241 | #ifdef HAVE_SYS_SYNCFS | |
242 | if (::syncfs(get_basedir_fd()) == 0) { | |
243 | dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl; | |
244 | have_syncfs = true; | |
245 | } else { | |
246 | dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; | |
247 | } | |
248 | #elif defined(SYS_syncfs) | |
249 | if (syscall(SYS_syncfs, get_basedir_fd()) == 0) { | |
250 | dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl; | |
251 | have_syncfs = true; | |
252 | } else { | |
253 | dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; | |
254 | } | |
255 | #elif defined(__NR_syncfs) | |
256 | if (syscall(__NR_syncfs, get_basedir_fd()) == 0) { | |
257 | dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl; | |
258 | have_syncfs = true; | |
259 | } else { | |
260 | dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; | |
261 | } | |
262 | #endif | |
263 | if (!have_syncfs) { | |
264 | dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl; | |
265 | if (m_filestore_fsync_flushes_journal_data) { | |
266 | dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl; | |
267 | } else { | |
268 | dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl; | |
269 | dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl; | |
270 | } | |
271 | } | |
272 | ||
273 | return 0; | |
274 | } | |
275 | ||
276 | int GenericFileStoreBackend::create_current() | |
277 | { | |
278 | struct stat st; | |
279 | int ret = ::stat(get_current_path().c_str(), &st); | |
280 | if (ret == 0) { | |
281 | // current/ exists | |
282 | if (!S_ISDIR(st.st_mode)) { | |
283 | dout(0) << "_create_current: current/ exists but is not a directory" << dendl; | |
284 | ret = -EINVAL; | |
285 | } | |
286 | } else { | |
287 | ret = ::mkdir(get_current_path().c_str(), 0755); | |
288 | if (ret < 0) { | |
289 | ret = -errno; | |
290 | dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl; | |
291 | } | |
292 | } | |
293 | return ret; | |
294 | } | |
295 | ||
296 | int GenericFileStoreBackend::syncfs() | |
297 | { | |
298 | int ret; | |
299 | if (m_filestore_fsync_flushes_journal_data) { | |
300 | dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl; | |
301 | // make the file system's journal commit. | |
302 | // this works with ext3, but NOT ext4 | |
303 | ret = ::fsync(get_op_fd()); | |
304 | if (ret < 0) | |
305 | ret = -errno; | |
306 | } else { | |
307 | dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl; | |
308 | ret = sync_filesystem(get_current_fd()); | |
309 | } | |
310 | return ret; | |
311 | } | |
312 | ||
313 | int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) | |
314 | { | |
315 | struct fiemap *fiemap = NULL; | |
316 | struct fiemap *_realloc_fiemap = NULL; | |
317 | int size; | |
318 | int ret; | |
319 | ||
320 | fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); | |
321 | if (!fiemap) | |
322 | return -ENOMEM; | |
323 | /* | |
324 | * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096), | |
325 | * the result is (logical=4096, len=4096). It leak the [3990, 4096). | |
326 | * Commit:"xfs: fix rounding error of fiemap length parameter | |
327 | * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug. | |
328 | * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug. | |
329 | */ | |
330 | fiemap->fm_start = start - start % CEPH_PAGE_SIZE; | |
331 | fiemap->fm_length = len + start % CEPH_PAGE_SIZE; | |
332 | fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ | |
333 | ||
334 | #if defined(DARWIN) || defined(__FreeBSD__) | |
335 | ret = -ENOTSUP; | |
336 | goto done_err; | |
337 | #else | |
338 | if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { | |
339 | ret = -errno; | |
340 | goto done_err; | |
341 | } | |
342 | #endif | |
343 | size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); | |
344 | ||
345 | _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); | |
346 | if (!_realloc_fiemap) { | |
347 | ret = -ENOMEM; | |
348 | goto done_err; | |
349 | } else { | |
350 | fiemap = _realloc_fiemap; | |
351 | } | |
352 | ||
353 | memset(fiemap->fm_extents, 0, size); | |
354 | ||
355 | fiemap->fm_extent_count = fiemap->fm_mapped_extents; | |
356 | fiemap->fm_mapped_extents = 0; | |
357 | ||
358 | #if defined(DARWIN) || defined(__FreeBSD__) | |
359 | ret = -ENOTSUP; | |
360 | goto done_err; | |
361 | #else | |
362 | if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { | |
363 | ret = -errno; | |
364 | goto done_err; | |
365 | } | |
366 | *pfiemap = fiemap; | |
367 | #endif | |
368 | return 0; | |
369 | ||
370 | done_err: | |
371 | *pfiemap = NULL; | |
372 | free(fiemap); | |
373 | return ret; | |
374 | } | |
375 | ||
376 | ||
377 | int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) | |
378 | { | |
379 | char buf[100]; | |
380 | bufferptr bp; | |
381 | int r = 0; | |
382 | int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); | |
383 | if (l == -ENODATA) { | |
384 | return 0; | |
385 | } | |
386 | if (l >= 0) { | |
387 | bp = buffer::create(l); | |
388 | memcpy(bp.c_str(), buf, l); | |
389 | } else if (l == -ERANGE) { | |
390 | l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0); | |
391 | if (l > 0) { | |
392 | bp = buffer::create(l); | |
393 | l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l); | |
394 | } | |
395 | } | |
396 | bufferlist bl; | |
397 | bl.append(std::move(bp)); | |
398 | bufferlist::iterator p = bl.begin(); | |
399 | try { | |
400 | ::decode(*cm, p); | |
401 | } | |
402 | catch (buffer::error &e) { | |
403 | r = -EIO; | |
404 | } | |
405 | if (r < 0) | |
406 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
407 | return r; | |
408 | } | |
409 | ||
410 | int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) | |
411 | { | |
412 | bufferlist bl; | |
413 | ::encode(*cm, bl); | |
414 | int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); | |
415 | if (r < 0) | |
416 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
417 | return r; | |
418 | } | |
419 | ||
420 | int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) | |
421 | { | |
422 | SloppyCRCMap scm(get_crc_block_size()); | |
423 | int r = _crc_load_or_init(fd, &scm); | |
424 | if (r < 0) | |
425 | return r; | |
426 | ostringstream ss; | |
427 | scm.write(off, len, bl, &ss); | |
428 | dout(30) << __func__ << "\n" << ss.str() << dendl; | |
429 | r = _crc_save(fd, &scm); | |
430 | return r; | |
431 | } | |
432 | ||
433 | int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off) | |
434 | { | |
435 | SloppyCRCMap scm(get_crc_block_size()); | |
436 | int r = _crc_load_or_init(fd, &scm); | |
437 | if (r < 0) | |
438 | return r; | |
439 | scm.truncate(off); | |
440 | r = _crc_save(fd, &scm); | |
441 | return r; | |
442 | } | |
443 | ||
444 | int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len) | |
445 | { | |
446 | SloppyCRCMap scm(get_crc_block_size()); | |
447 | int r = _crc_load_or_init(fd, &scm); | |
448 | if (r < 0) | |
449 | return r; | |
450 | scm.zero(off, len); | |
451 | r = _crc_save(fd, &scm); | |
452 | return r; | |
453 | } | |
454 | ||
455 | int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd, | |
456 | loff_t srcoff, size_t len, loff_t dstoff) | |
457 | { | |
458 | SloppyCRCMap scm_src(get_crc_block_size()); | |
459 | SloppyCRCMap scm_dst(get_crc_block_size()); | |
460 | int r = _crc_load_or_init(srcfd, &scm_src); | |
461 | if (r < 0) | |
462 | return r; | |
463 | r = _crc_load_or_init(destfd, &scm_dst); | |
464 | if (r < 0) | |
465 | return r; | |
466 | ostringstream ss; | |
467 | scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss); | |
468 | dout(30) << __func__ << "\n" << ss.str() << dendl; | |
469 | r = _crc_save(destfd, &scm_dst); | |
470 | return r; | |
471 | } | |
472 | ||
473 | int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, | |
474 | ostream *out) | |
475 | { | |
476 | SloppyCRCMap scm(get_crc_block_size()); | |
477 | int r = _crc_load_or_init(fd, &scm); | |
478 | if (r < 0) | |
479 | return r; | |
480 | return scm.read(off, len, bl, out); | |
481 | } |