]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "include/int_types.h" | |
16 | #include "include/types.h" | |
17 | ||
18 | #include <unistd.h> | |
19 | #include <fcntl.h> | |
20 | #include <errno.h> | |
21 | #include <stdlib.h> | |
22 | #include <sys/types.h> | |
23 | #include <sys/stat.h> | |
24 | #include <sys/ioctl.h> | |
25 | ||
26 | #if defined(__linux__) | |
27 | #include <linux/fs.h> | |
28 | #endif | |
29 | ||
30 | #include "include/compat.h" | |
31 | #include "include/linux_fiemap.h" | |
32 | ||
33 | #include <iostream> | |
34 | #include <fstream> | |
35 | #include <sstream> | |
36 | ||
37 | #include "GenericFileStoreBackend.h" | |
38 | ||
39 | #include "common/errno.h" | |
40 | #include "common/config.h" | |
41 | #include "common/sync_filesystem.h" | |
31f18b77 | 42 | #include "common/blkdev.h" |
7c673cae FG |
43 | |
44 | #include "common/SloppyCRCMap.h" | |
45 | #include "os/filestore/chain_xattr.h" | |
46 | ||
// Name of the extended attribute that holds the encoded SloppyCRCMap for a
// file; read by _crc_load_or_init() and written by _crc_save().
47 | #define SLOPPY_CRC_XATTR "user.cephos.scrc" | |
48 | ||
49 | ||
// Logging configuration for this translation unit: context, subsystem and
// the text placed in front of each message. The prefix expands to a stream
// expression that writes "genericfilestorebackend(<basedir path>) " to
// *_dout, so it is only usable inside members that can call
// get_basedir_path().
// NOTE(review): presumably consumed by the dout()/derr macros defined
// elsewhere -- confirm.
50 | #define dout_context cct() | |
51 | #define dout_subsys ceph_subsys_filestore | |
52 | #undef dout_prefix | |
53 | #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") " | |
54 | ||
// Alignment helpers.
//   ALIGN_DOWN(x, by): x minus its remainder modulo by.
//   ALIGNED(x, by):    true when x has no remainder modulo by.
//   ALIGN_UP(x, by):   x when already aligned, otherwise ALIGN_DOWN(x, by) + by.
// ALIGN_DOWN and ALIGN_UP expand their arguments more than once, so do not
// pass expressions with side effects.
55 | #define ALIGN_DOWN(x, by) ((x) - ((x) % (by))) | |
56 | #define ALIGNED(x, by) (!((x) % (by))) | |
57 | #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by))) | |
58 | ||
59 | GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs): | |
60 | FileStoreBackend(fs), | |
61 | ioctl_fiemap(false), | |
62 | seek_data_hole(false), | |
63 | use_splice(false), | |
64 | m_filestore_fiemap(cct()->_conf->filestore_fiemap), | |
65 | m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole), | |
66 | m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data), | |
31f18b77 FG |
67 | m_filestore_splice(cct()->_conf->filestore_splice) |
68 | { | |
69 | // rotational? | |
70 | { | |
71 | // NOTE: the below won't work on btrfs; we'll assume rotational. | |
72 | string fn = get_basedir_path(); | |
73 | int fd = ::open(fn.c_str(), O_RDONLY); | |
74 | if (fd < 0) { | |
75 | return; | |
76 | } | |
77 | char partition[PATH_MAX], devname[PATH_MAX]; | |
78 | int r = get_device_by_fd(fd, partition, devname, sizeof(devname)); | |
79 | if (r < 0) { | |
80 | dout(1) << "unable to get device name for " << get_basedir_path() << ": " | |
81 | << cpp_strerror(r) << dendl; | |
82 | m_rotational = true; | |
83 | } else { | |
84 | m_rotational = block_device_is_rotational(devname); | |
85 | dout(20) << __func__ << " devname " << devname | |
86 | << " rotational " << (int)m_rotational << dendl; | |
87 | } | |
88 | ::close(fd); | |
89 | } | |
90 | } | |
7c673cae FG |
91 | |
92 | int GenericFileStoreBackend::detect_features() | |
93 | { | |
94 | char fn[PATH_MAX]; | |
95 | snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str()); | |
96 | ||
97 | int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644); | |
98 | if (fd < 0) { | |
99 | fd = -errno; | |
100 | derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl; | |
101 | return fd; | |
102 | } | |
103 | ||
104 | // ext4 has a bug in older kernels where fiemap will return an empty | |
105 | // result in some cases. this is a file layout that triggers the bug | |
106 | // on 2.6.34-rc5. | |
107 | int v[] = { | |
108 | 0x0000000000016000, 0x0000000000007000, | |
109 | 0x000000000004a000, 0x0000000000007000, | |
110 | 0x0000000000060000, 0x0000000000001000, | |
111 | 0x0000000000061000, 0x0000000000008000, | |
112 | 0x0000000000069000, 0x0000000000007000, | |
113 | 0x00000000000a3000, 0x000000000000c000, | |
114 | 0x000000000024e000, 0x000000000000c000, | |
115 | 0x000000000028b000, 0x0000000000009000, | |
116 | 0x00000000002b1000, 0x0000000000003000, | |
117 | 0, 0 | |
118 | }; | |
119 | for (int i=0; v[i]; i++) { | |
120 | int off = v[i++]; | |
121 | int len = v[i]; | |
122 | ||
123 | // write a large extent | |
124 | char buf[len]; | |
125 | memset(buf, 1, sizeof(buf)); | |
126 | int r = ::lseek(fd, off, SEEK_SET); | |
127 | if (r < 0) { | |
128 | r = -errno; | |
129 | derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl; | |
130 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
131 | return r; | |
132 | } | |
133 | r = write(fd, buf, sizeof(buf)); | |
134 | if (r < 0) { | |
135 | derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl; | |
136 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
137 | return r; | |
138 | } | |
139 | } | |
140 | ||
141 | // fiemap an extent inside that | |
142 | if (!m_filestore_fiemap) { | |
143 | dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl; | |
144 | ioctl_fiemap = false; | |
145 | } else { | |
146 | struct fiemap *fiemap; | |
147 | int r = do_fiemap(fd, 2430421, 59284, &fiemap); | |
148 | if (r < 0) { | |
149 | dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl; | |
150 | ioctl_fiemap = false; | |
151 | } else { | |
152 | if (fiemap->fm_mapped_extents == 0) { | |
153 | dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl; | |
154 | ioctl_fiemap = false; | |
155 | } else { | |
156 | dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; | |
157 | ioctl_fiemap = true; | |
158 | } | |
159 | free(fiemap); | |
160 | } | |
161 | } | |
162 | ||
163 | // SEEK_DATA/SEEK_HOLE detection | |
164 | if (!m_filestore_seek_data_hole) { | |
165 | dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl; | |
166 | seek_data_hole = false; | |
167 | } else { | |
168 | #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) | |
169 | // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running | |
170 | // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned. | |
171 | // Fall back to use fiemap. | |
172 | off_t hole_pos; | |
173 | ||
174 | hole_pos = lseek(fd, 0, SEEK_HOLE); | |
175 | if (hole_pos < 0) { | |
176 | if (errno == EINVAL) { | |
177 | dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl; | |
178 | seek_data_hole = false; | |
179 | } else { | |
180 | derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl; | |
181 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
182 | return -errno; | |
183 | } | |
184 | } else { | |
185 | dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl; | |
186 | seek_data_hole = true; | |
187 | } | |
188 | #endif | |
189 | } | |
190 | ||
191 | //splice detection | |
192 | #ifdef CEPH_HAVE_SPLICE | |
193 | if (!m_filestore_splice) { | |
194 | dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl; | |
195 | use_splice = false; | |
196 | } else { | |
197 | int pipefd[2]; | |
198 | loff_t off_in = 0; | |
199 | int r; | |
200 | if ((r = pipe(pipefd)) < 0) | |
201 | dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl; | |
202 | else { | |
203 | lseek(fd, 0, SEEK_SET); | |
204 | r = splice(fd, &off_in, pipefd[1], NULL, 10, 0); | |
205 | if (!(r < 0 && errno == EINVAL)) { | |
206 | use_splice = true; | |
207 | dout(0) << "detect_features: splice is supported" << dendl; | |
208 | } else | |
209 | dout(0) << "detect_features: splice is NOT supported" << dendl; | |
210 | close(pipefd[0]); | |
211 | close(pipefd[1]); | |
212 | } | |
213 | } | |
214 | #endif | |
215 | ::unlink(fn); | |
216 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
217 | ||
218 | ||
219 | bool have_syncfs = false; | |
220 | #ifdef HAVE_SYS_SYNCFS | |
221 | if (::syncfs(get_basedir_fd()) == 0) { | |
222 | dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl; | |
223 | have_syncfs = true; | |
224 | } else { | |
225 | dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; | |
226 | } | |
227 | #elif defined(SYS_syncfs) | |
228 | if (syscall(SYS_syncfs, get_basedir_fd()) == 0) { | |
229 | dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl; | |
230 | have_syncfs = true; | |
231 | } else { | |
232 | dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; | |
233 | } | |
234 | #elif defined(__NR_syncfs) | |
235 | if (syscall(__NR_syncfs, get_basedir_fd()) == 0) { | |
236 | dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl; | |
237 | have_syncfs = true; | |
238 | } else { | |
239 | dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; | |
240 | } | |
241 | #endif | |
242 | if (!have_syncfs) { | |
243 | dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl; | |
244 | if (m_filestore_fsync_flushes_journal_data) { | |
245 | dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl; | |
246 | } else { | |
247 | dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl; | |
248 | dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl; | |
249 | } | |
250 | } | |
251 | ||
252 | return 0; | |
253 | } | |
254 | ||
255 | int GenericFileStoreBackend::create_current() | |
256 | { | |
257 | struct stat st; | |
258 | int ret = ::stat(get_current_path().c_str(), &st); | |
259 | if (ret == 0) { | |
260 | // current/ exists | |
261 | if (!S_ISDIR(st.st_mode)) { | |
262 | dout(0) << "_create_current: current/ exists but is not a directory" << dendl; | |
263 | ret = -EINVAL; | |
264 | } | |
265 | } else { | |
266 | ret = ::mkdir(get_current_path().c_str(), 0755); | |
267 | if (ret < 0) { | |
268 | ret = -errno; | |
269 | dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl; | |
270 | } | |
271 | } | |
272 | return ret; | |
273 | } | |
274 | ||
275 | int GenericFileStoreBackend::syncfs() | |
276 | { | |
277 | int ret; | |
278 | if (m_filestore_fsync_flushes_journal_data) { | |
279 | dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl; | |
280 | // make the file system's journal commit. | |
281 | // this works with ext3, but NOT ext4 | |
282 | ret = ::fsync(get_op_fd()); | |
283 | if (ret < 0) | |
284 | ret = -errno; | |
285 | } else { | |
286 | dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl; | |
287 | ret = sync_filesystem(get_current_fd()); | |
288 | } | |
289 | return ret; | |
290 | } | |
291 | ||
292 | int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) | |
293 | { | |
294 | struct fiemap *fiemap = NULL; | |
295 | struct fiemap *_realloc_fiemap = NULL; | |
296 | int size; | |
297 | int ret; | |
298 | ||
299 | fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); | |
300 | if (!fiemap) | |
301 | return -ENOMEM; | |
302 | /* | |
303 | * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096), | |
304 | * the result is (logical=4096, len=4096). It leak the [3990, 4096). | |
305 | * Commit:"xfs: fix rounding error of fiemap length parameter | |
306 | * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug. | |
307 | * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug. | |
308 | */ | |
309 | fiemap->fm_start = start - start % CEPH_PAGE_SIZE; | |
310 | fiemap->fm_length = len + start % CEPH_PAGE_SIZE; | |
311 | fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ | |
312 | ||
313 | #if defined(DARWIN) || defined(__FreeBSD__) | |
314 | ret = -ENOTSUP; | |
315 | goto done_err; | |
316 | #else | |
317 | if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { | |
318 | ret = -errno; | |
319 | goto done_err; | |
320 | } | |
321 | #endif | |
322 | size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); | |
323 | ||
324 | _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); | |
325 | if (!_realloc_fiemap) { | |
326 | ret = -ENOMEM; | |
327 | goto done_err; | |
328 | } else { | |
329 | fiemap = _realloc_fiemap; | |
330 | } | |
331 | ||
332 | memset(fiemap->fm_extents, 0, size); | |
333 | ||
334 | fiemap->fm_extent_count = fiemap->fm_mapped_extents; | |
335 | fiemap->fm_mapped_extents = 0; | |
336 | ||
337 | #if defined(DARWIN) || defined(__FreeBSD__) | |
338 | ret = -ENOTSUP; | |
339 | goto done_err; | |
340 | #else | |
341 | if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { | |
342 | ret = -errno; | |
343 | goto done_err; | |
344 | } | |
345 | *pfiemap = fiemap; | |
346 | #endif | |
347 | return 0; | |
348 | ||
349 | done_err: | |
350 | *pfiemap = NULL; | |
351 | free(fiemap); | |
352 | return ret; | |
353 | } | |
354 | ||
355 | ||
356 | int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) | |
357 | { | |
358 | char buf[100]; | |
359 | bufferptr bp; | |
360 | int r = 0; | |
361 | int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); | |
362 | if (l == -ENODATA) { | |
363 | return 0; | |
364 | } | |
365 | if (l >= 0) { | |
366 | bp = buffer::create(l); | |
367 | memcpy(bp.c_str(), buf, l); | |
368 | } else if (l == -ERANGE) { | |
369 | l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0); | |
370 | if (l > 0) { | |
371 | bp = buffer::create(l); | |
372 | l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l); | |
373 | } | |
374 | } | |
375 | bufferlist bl; | |
376 | bl.append(std::move(bp)); | |
377 | bufferlist::iterator p = bl.begin(); | |
378 | try { | |
379 | ::decode(*cm, p); | |
380 | } | |
381 | catch (buffer::error &e) { | |
382 | r = -EIO; | |
383 | } | |
384 | if (r < 0) | |
385 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
386 | return r; | |
387 | } | |
388 | ||
389 | int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) | |
390 | { | |
391 | bufferlist bl; | |
392 | ::encode(*cm, bl); | |
393 | int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); | |
394 | if (r < 0) | |
395 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
396 | return r; | |
397 | } | |
398 | ||
399 | int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) | |
400 | { | |
401 | SloppyCRCMap scm(get_crc_block_size()); | |
402 | int r = _crc_load_or_init(fd, &scm); | |
403 | if (r < 0) | |
404 | return r; | |
405 | ostringstream ss; | |
406 | scm.write(off, len, bl, &ss); | |
407 | dout(30) << __func__ << "\n" << ss.str() << dendl; | |
408 | r = _crc_save(fd, &scm); | |
409 | return r; | |
410 | } | |
411 | ||
412 | int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off) | |
413 | { | |
414 | SloppyCRCMap scm(get_crc_block_size()); | |
415 | int r = _crc_load_or_init(fd, &scm); | |
416 | if (r < 0) | |
417 | return r; | |
418 | scm.truncate(off); | |
419 | r = _crc_save(fd, &scm); | |
420 | return r; | |
421 | } | |
422 | ||
423 | int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len) | |
424 | { | |
425 | SloppyCRCMap scm(get_crc_block_size()); | |
426 | int r = _crc_load_or_init(fd, &scm); | |
427 | if (r < 0) | |
428 | return r; | |
429 | scm.zero(off, len); | |
430 | r = _crc_save(fd, &scm); | |
431 | return r; | |
432 | } | |
433 | ||
434 | int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd, | |
435 | loff_t srcoff, size_t len, loff_t dstoff) | |
436 | { | |
437 | SloppyCRCMap scm_src(get_crc_block_size()); | |
438 | SloppyCRCMap scm_dst(get_crc_block_size()); | |
439 | int r = _crc_load_or_init(srcfd, &scm_src); | |
440 | if (r < 0) | |
441 | return r; | |
442 | r = _crc_load_or_init(destfd, &scm_dst); | |
443 | if (r < 0) | |
444 | return r; | |
445 | ostringstream ss; | |
446 | scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss); | |
447 | dout(30) << __func__ << "\n" << ss.str() << dendl; | |
448 | r = _crc_save(destfd, &scm_dst); | |
449 | return r; | |
450 | } | |
451 | ||
452 | int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, | |
453 | ostream *out) | |
454 | { | |
455 | SloppyCRCMap scm(get_crc_block_size()); | |
456 | int r = _crc_load_or_init(fd, &scm); | |
457 | if (r < 0) | |
458 | return r; | |
459 | return scm.read(off, len, bl, out); | |
460 | } |