]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/filestore/BtrfsFileStoreBackend.cc
import ceph pacific 16.2.5
[ceph.git] / ceph / src / os / filestore / BtrfsFileStoreBackend.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/int_types.h"
16#include "include/types.h"
17
18#include <unistd.h>
19#include <fcntl.h>
20#include <errno.h>
21#include <stdlib.h>
22#include <sys/types.h>
23#include <sys/stat.h>
24#include <sys/ioctl.h>
25#include "include/compat.h"
26#include "include/linux_fiemap.h"
27#include "include/color.h"
28#include "include/buffer.h"
11fdf7f2 29#include "include/ceph_assert.h"
7c673cae
FG
30
31#ifndef __CYGWIN__
32#include "os/fs/btrfs_ioctl.h"
33#endif
34
35#include <iostream>
36#include <fstream>
37#include <sstream>
38
39#include "BtrfsFileStoreBackend.h"
40
41#include "common/errno.h"
42#include "common/config.h"
43
44#if defined(__linux__)
45
46#define dout_context cct()
47#define dout_subsys ceph_subsys_filestore
48#undef dout_prefix
49#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
50
f67539c2
TL
51using std::cerr;
52using std::list;
53using std::string;
54
7c673cae
FG
// Block-alignment helpers. Each macro expands its arguments more than once,
// so callers must pass expressions without side effects.

// True when x is an exact multiple of by.
#define ALIGNED(x, by) (((x) % (by)) == 0)
// Largest multiple of by that does not exceed x.
#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
// Smallest multiple of by that is not below x.
#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
58
59BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
60 GenericFileStoreBackend(fs), has_clone_range(false),
61 has_snap_create(false), has_snap_destroy(false),
62 has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
63 m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range),
64 m_filestore_btrfs_snap (cct()->_conf->filestore_btrfs_snap) { }
65
66int BtrfsFileStoreBackend::detect_features()
67{
68 int r;
69
70 r = GenericFileStoreBackend::detect_features();
71 if (r < 0)
72 return r;
73
74 // clone_range?
75 if (m_filestore_btrfs_clone_range) {
91327a77 76 int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY|O_CLOEXEC, 0600);
7c673cae
FG
77 if (fd >= 0) {
78 if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
79 r = -errno;
80 dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
81 << cpp_strerror(r) << dendl;
82 }
83 btrfs_ioctl_clone_range_args clone_args;
84 memset(&clone_args, 0, sizeof(clone_args));
85 clone_args.src_fd = -1;
86 r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
87 if (r < 0 && errno == EBADF) {
88 dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
89 has_clone_range = true;
90 } else {
91 r = -errno;
92 dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
93 }
94 TEMP_FAILURE_RETRY(::close(fd));
95 } else {
96 r = -errno;
97 dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
98 << cpp_strerror(r) << dendl;
99 }
100 } else {
101 dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
102 }
103
104 struct btrfs_ioctl_vol_args vol_args;
105 memset(&vol_args, 0, sizeof(vol_args));
106
107 // create test source volume
108 vol_args.fd = 0;
109 strcpy(vol_args.name, "test_subvol");
110 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
111 if (r != 0) {
112 r = -errno;
113 dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
114 }
91327a77 115 int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY|O_CLOEXEC);
7c673cae
FG
116 if (srcfd < 0) {
117 r = -errno;
118 dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
119 }
120
121 // snap_create and snap_destroy?
122 vol_args.fd = srcfd;
123 strcpy(vol_args.name, "sync_snap_test");
124 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
125 int err = errno;
126 if (r == 0 || errno == EEXIST) {
127 dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
128 has_snap_create = true;
129
130 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
131 if (r == 0) {
132 dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
133 has_snap_destroy = true;
134 } else {
135 err = -errno;
136 dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
137
138 if (err == -EPERM && getuid() != 0) {
139 dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
140 cerr << TEXT_YELLOW
141 << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
142 << TEXT_NORMAL << std::endl;
143 } else if (err == -EOPNOTSUPP) {
144 derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
145 }
146 }
147 } else {
148 dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
149 }
150
151 if (m_filestore_btrfs_snap) {
152 if (has_snap_destroy)
153 stable_commits = true;
154 else
155 dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
156 }
157
158 // start_sync?
159 __u64 transid = 0;
160 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
161 if (r < 0) {
162 int err = errno;
163 dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
164 }
165 if (r == 0 && transid > 0) {
166 dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
167
168 // do we have wait_sync too?
169 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
170 if (r == 0 || errno == ERANGE) {
171 dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
172 has_wait_sync = true;
173 } else {
174 int err = errno;
175 dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
176 }
177 } else {
178 int err = errno;
179 dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
180 }
181
182 if (has_wait_sync) {
183 // async snap creation?
184 struct btrfs_ioctl_vol_args_v2 async_args;
185 memset(&async_args, 0, sizeof(async_args));
186 async_args.fd = srcfd;
187 async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
188 strcpy(async_args.name, "async_snap_test");
189
190 // remove old one, first
191 struct stat st;
192 strcpy(vol_args.name, async_args.name);
193 if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
194 dout(0) << "detect_feature: removing old async_snap_test" << dendl;
195 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
196 if (r != 0) {
197 int err = errno;
198 dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
199 }
200 }
201
202 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
203 if (r == 0 || errno == EEXIST) {
204 dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
205 has_snap_create_v2 = true;
206
207 // clean up
208 strcpy(vol_args.name, "async_snap_test");
209 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
210 if (r != 0) {
211 int err = errno;
212 dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
213 }
214 } else {
215 int err = errno;
216 dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
217 }
218 }
219
220 // clean up test subvol
221 if (srcfd >= 0)
222 TEMP_FAILURE_RETRY(::close(srcfd));
223
224 strcpy(vol_args.name, "test_subvol");
225 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
226 if (r < 0) {
227 r = -errno;
228 dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
229 }
230
231 if (m_filestore_btrfs_snap && !has_snap_create_v2) {
232 dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
233 cerr << TEXT_YELLOW
234 << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
235 << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
236 << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
237 << " performance.\n"
238 << TEXT_NORMAL;
239 }
240
241 return 0;
242}
243
244bool BtrfsFileStoreBackend::can_checkpoint()
245{
246 return stable_commits;
247}
248
249int BtrfsFileStoreBackend::create_current()
250{
251 struct stat st;
252 int ret = ::stat(get_current_path().c_str(), &st);
253 if (ret == 0) {
254 // current/ exists
255 if (!S_ISDIR(st.st_mode)) {
256 dout(0) << "create_current: current/ exists but is not a directory" << dendl;
257 return -EINVAL;
258 }
259
260 struct stat basest;
261 struct statfs currentfs;
262 ret = ::fstat(get_basedir_fd(), &basest);
263 if (ret < 0) {
264 ret = -errno;
265 dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
266 return ret;
267 }
268 ret = ::statfs(get_current_path().c_str(), &currentfs);
269 if (ret < 0) {
270 ret = -errno;
271 dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl;
272 return ret;
273 }
274 if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
275 dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
276 stable_commits = true;
277 }
278 return 0;
279 }
280
281 struct btrfs_ioctl_vol_args volargs;
282 memset(&volargs, 0, sizeof(volargs));
283
284 volargs.fd = 0;
285 strcpy(volargs.name, "current");
286 if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
287 ret = -errno;
288 dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
289 << cpp_strerror(ret) << dendl;
290 return ret;
291 }
292
293 dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
294 if (::chmod(get_current_path().c_str(), 0755) < 0) {
295 ret = -errno;
296 dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
297 << cpp_strerror(ret) << dendl;
298 return ret;
299 }
300
301 stable_commits = true;
302 return 0;
303}
304
305int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
306{
307 int ret, err = 0;
308
309 struct stat basest;
310 ret = ::fstat(get_basedir_fd(), &basest);
311 if (ret < 0) {
312 ret = -errno;
313 dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
314 return ret;
315 }
316
317 // get snap list
318 DIR *dir = ::opendir(get_basedir_path().c_str());
319 if (!dir) {
320 ret = -errno;
321 dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
322 << cpp_strerror(ret) << dendl;
323 return ret;
324 }
325
326 list<string> snaps;
327 char path[PATH_MAX];
328 struct dirent *de;
b3b6e05e
TL
329 while (true) {
330 errno = 0;
331 de = ::readdir(dir);
332 if (de == nullptr) {
333 if (errno != 0) {
334 err = -errno;
335 dout(0) << "list_checkpoints: readdir '" << get_basedir_path() << "' failed: "
336 << cpp_strerror(err) << dendl;
337 }
338 break;
339 }
7c673cae
FG
340 snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
341
342 struct stat st;
343 ret = ::stat(path, &st);
344 if (ret < 0) {
345 err = -errno;
346 dout(0) << "list_checkpoints: stat '" << path << "' failed: "
347 << cpp_strerror(err) << dendl;
348 break;
349 }
350
351 if (!S_ISDIR(st.st_mode))
352 continue;
353
354 struct statfs fs;
355 ret = ::statfs(path, &fs);
356 if (ret < 0) {
357 err = -errno;
358 dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
359 << cpp_strerror(err) << dendl;
360 break;
361 }
362
363 if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
364 snaps.push_back(string(de->d_name));
365 }
366
367 if (::closedir(dir) < 0) {
368 ret = -errno;
369 dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
370 if (!err)
371 err = ret;
372 }
373
374 if (err)
375 return err;
376
377 ls.swap(snaps);
378 return 0;
379}
380
381int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
382{
383 dout(10) << "create_checkpoint: '" << name << "'" << dendl;
384 if (has_snap_create_v2 && transid) {
385 struct btrfs_ioctl_vol_args_v2 async_args;
386 memset(&async_args, 0, sizeof(async_args));
387 async_args.fd = get_current_fd();
388 async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
389
390 size_t name_size = sizeof(async_args.name);
391 strncpy(async_args.name, name.c_str(), name_size);
392 async_args.name[name_size-1] = '\0';
393
394 int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
395 if (r < 0) {
396 r = -errno;
397 dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
398 return r;
399 }
400 dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
401 *transid = async_args.transid;
402 } else {
403 struct btrfs_ioctl_vol_args vol_args;
404 memset(&vol_args, 0, sizeof(vol_args));
405 vol_args.fd = get_current_fd();
406
407 size_t name_size = sizeof(vol_args.name);
408 strncpy(vol_args.name, name.c_str(), name_size);
409 vol_args.name[name_size-1] = '\0';
410
411 int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
412 if (r < 0) {
413 r = -errno;
414 dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
415 return r;
416 }
417 if (transid)
418 *transid = 0;
419 }
420 return 0;
421}
422
423int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
424{
425 // wait for commit
426 dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
427 int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
428 if (ret < 0) {
429 ret = -errno;
430 dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
431 return -errno;
432 }
433 dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
434 return 0;
435}
436
437int BtrfsFileStoreBackend::rollback_to(const string& name)
438{
439 dout(10) << "rollback_to: to '" << name << "'" << dendl;
440 char s[PATH_MAX];
441 btrfs_ioctl_vol_args vol_args;
442
443 memset(&vol_args, 0, sizeof(vol_args));
444 vol_args.fd = 0;
445 strcpy(vol_args.name, "current");
446
447 int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
448 if (ret && errno != ENOENT) {
449 dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
450 snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
451 if (::rename(get_current_path().c_str(), s)) {
452 ret = -errno;
453 dout(0) << "rollback_to: error renaming old current subvol: "
454 << cpp_strerror(ret) << dendl;
455 return ret;
456 }
457 }
458
459 snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
460
461 // roll back
91327a77 462 vol_args.fd = ::open(s, O_RDONLY|O_CLOEXEC);
7c673cae
FG
463 if (vol_args.fd < 0) {
464 ret = -errno;
465 dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
466 return ret;
467 }
468 ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
469 if (ret < 0 ) {
470 ret = -errno;
471 dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
472 }
473 TEMP_FAILURE_RETRY(::close(vol_args.fd));
474 return ret;
475}
476
477int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
478{
479 dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
480 btrfs_ioctl_vol_args vol_args;
481 memset(&vol_args, 0, sizeof(vol_args));
482 vol_args.fd = 0;
9f95a23c
TL
483 strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name) - 1);
484 vol_args.name[sizeof(vol_args.name) - 1] = '\0';
7c673cae
FG
485
486 int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
487 if (ret) {
488 ret = -errno;
489 dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
490 return ret;
491 }
492 return 0;
493}
494
495int BtrfsFileStoreBackend::syncfs()
496{
497 dout(15) << "syncfs" << dendl;
498 // do a full btrfs commit
499 int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
500 if (ret < 0) {
501 ret = -errno;
502 dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
503 }
504 return ret;
505}
506
507int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
508{
509 dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
510 size_t blk_size = get_blksize();
511 if (!has_clone_range ||
512 srcoff % blk_size != dstoff % blk_size) {
513 dout(20) << "clone_range: using copy" << dendl;
514 return _copy_range(from, to, srcoff, len, dstoff);
515 }
516
517 int err = 0;
518 int r = 0;
519
520 uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
521 uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
522 if (srcoffclone >= srcoff + len) {
523 dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
524 return _copy_range(from, to, srcoff, len, dstoff);
525 }
526
527 uint64_t lenclone = len - (srcoffclone - srcoff);
528 if (!ALIGNED(lenclone, blk_size)) {
529 struct stat from_stat, to_stat;
530 err = ::fstat(from, &from_stat);
531 if (err) return -errno;
532 err = ::fstat(to , &to_stat);
533 if (err) return -errno;
534
535 if (srcoff + len != (uint64_t)from_stat.st_size ||
536 dstoff + len < (uint64_t)to_stat.st_size) {
537 // Not to the end of the file, need to align length as well
538 lenclone = ALIGN_DOWN(lenclone, blk_size);
539 }
540 }
541 if (lenclone == 0) {
542 // too short
543 return _copy_range(from, to, srcoff, len, dstoff);
544 }
545
546 dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
547 << " to " << dstoffclone << " = " << r << dendl;
548 btrfs_ioctl_clone_range_args a;
549 a.src_fd = from;
550 a.src_offset = srcoffclone;
551 a.src_length = lenclone;
552 a.dest_offset = dstoffclone;
553 err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
554 if (err >= 0) {
555 r += err;
556 } else if (errno == EINVAL) {
557 // Still failed, might be compressed
558 dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
559 return _copy_range(from, to, srcoff, len, dstoff);
560 } else {
561 return -errno;
562 }
563
564 // Take care any trimmed from front
565 if (srcoffclone != srcoff) {
566 err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
567 if (err >= 0) {
568 r += err;
569 } else {
570 return err;
571 }
572 }
573
574 // Copy end
575 if (srcoffclone + lenclone != srcoff + len) {
576 err = _copy_range(from, to,
577 srcoffclone + lenclone,
578 (srcoff + len) - (srcoffclone + lenclone),
579 dstoffclone + lenclone);
580 if (err >= 0) {
581 r += err;
582 } else {
583 return err;
584 }
585 }
586 dout(20) << "clone_range: finished " << srcoff << "~" << len
587 << " to " << dstoff << " = " << r << dendl;
588 return r;
589}
590#endif