]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "include/int_types.h" | |
16 | #include "include/types.h" | |
17 | ||
18 | #include <unistd.h> | |
19 | #include <fcntl.h> | |
20 | #include <errno.h> | |
21 | #include <stdlib.h> | |
22 | #include <sys/types.h> | |
23 | #include <sys/stat.h> | |
24 | #include <sys/ioctl.h> | |
25 | ||
26 | #if defined(__linux__) | |
27 | #include <linux/fs.h> | |
28 | #endif | |
29 | ||
30 | #include "include/compat.h" | |
31 | #include "include/linux_fiemap.h" | |
32 | ||
33 | #include <iostream> | |
34 | #include <fstream> | |
35 | #include <sstream> | |
36 | ||
37 | #include "GenericFileStoreBackend.h" | |
38 | ||
39 | #include "common/errno.h" | |
40 | #include "common/config.h" | |
41 | #include "common/sync_filesystem.h" | |
42 | ||
43 | #include "common/SloppyCRCMap.h" | |
44 | #include "os/filestore/chain_xattr.h" | |
45 | ||
46 | #define SLOPPY_CRC_XATTR "user.cephos.scrc" | |
47 | ||
48 | ||
49 | #define dout_context cct() | |
50 | #define dout_subsys ceph_subsys_filestore | |
51 | #undef dout_prefix | |
52 | #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") " | |
53 | ||
// Alignment helpers built on the % operator.  Each macro argument may be
// expanded more than once, so do not pass expressions with side effects.
//
// ALIGN_DOWN: x minus its remainder modulo `by`, i.e. (for non-negative x)
// the largest multiple of `by` that is <= x.
#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
// ALIGNED: true when x is an exact multiple of `by`.
#define ALIGNED(x, by) (!((x) % (by)))
// ALIGN_UP: x itself when already aligned, otherwise ALIGN_DOWN(x) + by.
#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
57 | ||
58 | GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs): | |
59 | FileStoreBackend(fs), | |
60 | ioctl_fiemap(false), | |
61 | seek_data_hole(false), | |
62 | use_splice(false), | |
63 | m_filestore_fiemap(cct()->_conf->filestore_fiemap), | |
64 | m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole), | |
65 | m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data), | |
66 | m_filestore_splice(cct()->_conf->filestore_splice) {} | |
67 | ||
68 | int GenericFileStoreBackend::detect_features() | |
69 | { | |
70 | char fn[PATH_MAX]; | |
71 | snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str()); | |
72 | ||
73 | int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644); | |
74 | if (fd < 0) { | |
75 | fd = -errno; | |
76 | derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl; | |
77 | return fd; | |
78 | } | |
79 | ||
80 | // ext4 has a bug in older kernels where fiemap will return an empty | |
81 | // result in some cases. this is a file layout that triggers the bug | |
82 | // on 2.6.34-rc5. | |
83 | int v[] = { | |
84 | 0x0000000000016000, 0x0000000000007000, | |
85 | 0x000000000004a000, 0x0000000000007000, | |
86 | 0x0000000000060000, 0x0000000000001000, | |
87 | 0x0000000000061000, 0x0000000000008000, | |
88 | 0x0000000000069000, 0x0000000000007000, | |
89 | 0x00000000000a3000, 0x000000000000c000, | |
90 | 0x000000000024e000, 0x000000000000c000, | |
91 | 0x000000000028b000, 0x0000000000009000, | |
92 | 0x00000000002b1000, 0x0000000000003000, | |
93 | 0, 0 | |
94 | }; | |
95 | for (int i=0; v[i]; i++) { | |
96 | int off = v[i++]; | |
97 | int len = v[i]; | |
98 | ||
99 | // write a large extent | |
100 | char buf[len]; | |
101 | memset(buf, 1, sizeof(buf)); | |
102 | int r = ::lseek(fd, off, SEEK_SET); | |
103 | if (r < 0) { | |
104 | r = -errno; | |
105 | derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl; | |
106 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
107 | return r; | |
108 | } | |
109 | r = write(fd, buf, sizeof(buf)); | |
110 | if (r < 0) { | |
111 | derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl; | |
112 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
113 | return r; | |
114 | } | |
115 | } | |
116 | ||
117 | // fiemap an extent inside that | |
118 | if (!m_filestore_fiemap) { | |
119 | dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl; | |
120 | ioctl_fiemap = false; | |
121 | } else { | |
122 | struct fiemap *fiemap; | |
123 | int r = do_fiemap(fd, 2430421, 59284, &fiemap); | |
124 | if (r < 0) { | |
125 | dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl; | |
126 | ioctl_fiemap = false; | |
127 | } else { | |
128 | if (fiemap->fm_mapped_extents == 0) { | |
129 | dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl; | |
130 | ioctl_fiemap = false; | |
131 | } else { | |
132 | dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl; | |
133 | ioctl_fiemap = true; | |
134 | } | |
135 | free(fiemap); | |
136 | } | |
137 | } | |
138 | ||
139 | // SEEK_DATA/SEEK_HOLE detection | |
140 | if (!m_filestore_seek_data_hole) { | |
141 | dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl; | |
142 | seek_data_hole = false; | |
143 | } else { | |
144 | #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) | |
145 | // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running | |
146 | // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned. | |
147 | // Fall back to use fiemap. | |
148 | off_t hole_pos; | |
149 | ||
150 | hole_pos = lseek(fd, 0, SEEK_HOLE); | |
151 | if (hole_pos < 0) { | |
152 | if (errno == EINVAL) { | |
153 | dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl; | |
154 | seek_data_hole = false; | |
155 | } else { | |
156 | derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl; | |
157 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
158 | return -errno; | |
159 | } | |
160 | } else { | |
161 | dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl; | |
162 | seek_data_hole = true; | |
163 | } | |
164 | #endif | |
165 | } | |
166 | ||
167 | //splice detection | |
168 | #ifdef CEPH_HAVE_SPLICE | |
169 | if (!m_filestore_splice) { | |
170 | dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl; | |
171 | use_splice = false; | |
172 | } else { | |
173 | int pipefd[2]; | |
174 | loff_t off_in = 0; | |
175 | int r; | |
176 | if ((r = pipe(pipefd)) < 0) | |
177 | dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl; | |
178 | else { | |
179 | lseek(fd, 0, SEEK_SET); | |
180 | r = splice(fd, &off_in, pipefd[1], NULL, 10, 0); | |
181 | if (!(r < 0 && errno == EINVAL)) { | |
182 | use_splice = true; | |
183 | dout(0) << "detect_features: splice is supported" << dendl; | |
184 | } else | |
185 | dout(0) << "detect_features: splice is NOT supported" << dendl; | |
186 | close(pipefd[0]); | |
187 | close(pipefd[1]); | |
188 | } | |
189 | } | |
190 | #endif | |
191 | ::unlink(fn); | |
192 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
193 | ||
194 | ||
195 | bool have_syncfs = false; | |
196 | #ifdef HAVE_SYS_SYNCFS | |
197 | if (::syncfs(get_basedir_fd()) == 0) { | |
198 | dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl; | |
199 | have_syncfs = true; | |
200 | } else { | |
201 | dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl; | |
202 | } | |
203 | #elif defined(SYS_syncfs) | |
204 | if (syscall(SYS_syncfs, get_basedir_fd()) == 0) { | |
205 | dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl; | |
206 | have_syncfs = true; | |
207 | } else { | |
208 | dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; | |
209 | } | |
210 | #elif defined(__NR_syncfs) | |
211 | if (syscall(__NR_syncfs, get_basedir_fd()) == 0) { | |
212 | dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl; | |
213 | have_syncfs = true; | |
214 | } else { | |
215 | dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl; | |
216 | } | |
217 | #endif | |
218 | if (!have_syncfs) { | |
219 | dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl; | |
220 | if (m_filestore_fsync_flushes_journal_data) { | |
221 | dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl; | |
222 | } else { | |
223 | dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl; | |
224 | dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl; | |
225 | } | |
226 | } | |
227 | ||
228 | return 0; | |
229 | } | |
230 | ||
231 | int GenericFileStoreBackend::create_current() | |
232 | { | |
233 | struct stat st; | |
234 | int ret = ::stat(get_current_path().c_str(), &st); | |
235 | if (ret == 0) { | |
236 | // current/ exists | |
237 | if (!S_ISDIR(st.st_mode)) { | |
238 | dout(0) << "_create_current: current/ exists but is not a directory" << dendl; | |
239 | ret = -EINVAL; | |
240 | } | |
241 | } else { | |
242 | ret = ::mkdir(get_current_path().c_str(), 0755); | |
243 | if (ret < 0) { | |
244 | ret = -errno; | |
245 | dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl; | |
246 | } | |
247 | } | |
248 | return ret; | |
249 | } | |
250 | ||
251 | int GenericFileStoreBackend::syncfs() | |
252 | { | |
253 | int ret; | |
254 | if (m_filestore_fsync_flushes_journal_data) { | |
255 | dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl; | |
256 | // make the file system's journal commit. | |
257 | // this works with ext3, but NOT ext4 | |
258 | ret = ::fsync(get_op_fd()); | |
259 | if (ret < 0) | |
260 | ret = -errno; | |
261 | } else { | |
262 | dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl; | |
263 | ret = sync_filesystem(get_current_fd()); | |
264 | } | |
265 | return ret; | |
266 | } | |
267 | ||
268 | int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) | |
269 | { | |
270 | struct fiemap *fiemap = NULL; | |
271 | struct fiemap *_realloc_fiemap = NULL; | |
272 | int size; | |
273 | int ret; | |
274 | ||
275 | fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1); | |
276 | if (!fiemap) | |
277 | return -ENOMEM; | |
278 | /* | |
279 | * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096), | |
280 | * the result is (logical=4096, len=4096). It leak the [3990, 4096). | |
281 | * Commit:"xfs: fix rounding error of fiemap length parameter | |
282 | * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug. | |
283 | * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug. | |
284 | */ | |
285 | fiemap->fm_start = start - start % CEPH_PAGE_SIZE; | |
286 | fiemap->fm_length = len + start % CEPH_PAGE_SIZE; | |
287 | fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */ | |
288 | ||
289 | #if defined(DARWIN) || defined(__FreeBSD__) | |
290 | ret = -ENOTSUP; | |
291 | goto done_err; | |
292 | #else | |
293 | if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { | |
294 | ret = -errno; | |
295 | goto done_err; | |
296 | } | |
297 | #endif | |
298 | size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents); | |
299 | ||
300 | _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size); | |
301 | if (!_realloc_fiemap) { | |
302 | ret = -ENOMEM; | |
303 | goto done_err; | |
304 | } else { | |
305 | fiemap = _realloc_fiemap; | |
306 | } | |
307 | ||
308 | memset(fiemap->fm_extents, 0, size); | |
309 | ||
310 | fiemap->fm_extent_count = fiemap->fm_mapped_extents; | |
311 | fiemap->fm_mapped_extents = 0; | |
312 | ||
313 | #if defined(DARWIN) || defined(__FreeBSD__) | |
314 | ret = -ENOTSUP; | |
315 | goto done_err; | |
316 | #else | |
317 | if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { | |
318 | ret = -errno; | |
319 | goto done_err; | |
320 | } | |
321 | *pfiemap = fiemap; | |
322 | #endif | |
323 | return 0; | |
324 | ||
325 | done_err: | |
326 | *pfiemap = NULL; | |
327 | free(fiemap); | |
328 | return ret; | |
329 | } | |
330 | ||
331 | ||
332 | int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) | |
333 | { | |
334 | char buf[100]; | |
335 | bufferptr bp; | |
336 | int r = 0; | |
337 | int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); | |
338 | if (l == -ENODATA) { | |
339 | return 0; | |
340 | } | |
341 | if (l >= 0) { | |
342 | bp = buffer::create(l); | |
343 | memcpy(bp.c_str(), buf, l); | |
344 | } else if (l == -ERANGE) { | |
345 | l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0); | |
346 | if (l > 0) { | |
347 | bp = buffer::create(l); | |
348 | l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l); | |
349 | } | |
350 | } | |
351 | bufferlist bl; | |
352 | bl.append(std::move(bp)); | |
353 | bufferlist::iterator p = bl.begin(); | |
354 | try { | |
355 | ::decode(*cm, p); | |
356 | } | |
357 | catch (buffer::error &e) { | |
358 | r = -EIO; | |
359 | } | |
360 | if (r < 0) | |
361 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
362 | return r; | |
363 | } | |
364 | ||
365 | int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) | |
366 | { | |
367 | bufferlist bl; | |
368 | ::encode(*cm, bl); | |
369 | int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); | |
370 | if (r < 0) | |
371 | derr << __func__ << " got " << cpp_strerror(r) << dendl; | |
372 | return r; | |
373 | } | |
374 | ||
375 | int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) | |
376 | { | |
377 | SloppyCRCMap scm(get_crc_block_size()); | |
378 | int r = _crc_load_or_init(fd, &scm); | |
379 | if (r < 0) | |
380 | return r; | |
381 | ostringstream ss; | |
382 | scm.write(off, len, bl, &ss); | |
383 | dout(30) << __func__ << "\n" << ss.str() << dendl; | |
384 | r = _crc_save(fd, &scm); | |
385 | return r; | |
386 | } | |
387 | ||
388 | int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off) | |
389 | { | |
390 | SloppyCRCMap scm(get_crc_block_size()); | |
391 | int r = _crc_load_or_init(fd, &scm); | |
392 | if (r < 0) | |
393 | return r; | |
394 | scm.truncate(off); | |
395 | r = _crc_save(fd, &scm); | |
396 | return r; | |
397 | } | |
398 | ||
399 | int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len) | |
400 | { | |
401 | SloppyCRCMap scm(get_crc_block_size()); | |
402 | int r = _crc_load_or_init(fd, &scm); | |
403 | if (r < 0) | |
404 | return r; | |
405 | scm.zero(off, len); | |
406 | r = _crc_save(fd, &scm); | |
407 | return r; | |
408 | } | |
409 | ||
410 | int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd, | |
411 | loff_t srcoff, size_t len, loff_t dstoff) | |
412 | { | |
413 | SloppyCRCMap scm_src(get_crc_block_size()); | |
414 | SloppyCRCMap scm_dst(get_crc_block_size()); | |
415 | int r = _crc_load_or_init(srcfd, &scm_src); | |
416 | if (r < 0) | |
417 | return r; | |
418 | r = _crc_load_or_init(destfd, &scm_dst); | |
419 | if (r < 0) | |
420 | return r; | |
421 | ostringstream ss; | |
422 | scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss); | |
423 | dout(30) << __func__ << "\n" << ss.str() << dendl; | |
424 | r = _crc_save(destfd, &scm_dst); | |
425 | return r; | |
426 | } | |
427 | ||
428 | int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl, | |
429 | ostream *out) | |
430 | { | |
431 | SloppyCRCMap scm(get_crc_block_size()); | |
432 | int r = _crc_load_or_init(fd, &scm); | |
433 | if (r < 0) | |
434 | return r; | |
435 | return scm.read(off, len, bl, out); | |
436 | } |