1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2015 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/compat.h"
16 #include "common/errno.h"
17 #include "common/ceph_argparse.h"
19 #include "include/util.h"
20 #include "include/ceph_fs.h"
22 #include "mds/CDentry.h"
23 #include "mds/CInode.h"
24 #include "mds/CDentry.h"
25 #include "mds/InoTable.h"
26 #include "mds/SnapServer.h"
27 #include "cls/cephfs/cls_cephfs_client.h"
31 #include "include/compat.h"
33 #define dout_context g_ceph_context
34 #define dout_subsys ceph_subsys_mds
36 #define dout_prefix *_dout << "datascan." << __func__ << ": "
// Print the cephfs-data-scan command synopsis and the per-flag descriptions
// to std::cout, then call generic_client_usage().
40 void DataScan::usage()
42 std::cout
<< "Usage: \n"
// One line per subcommand, with the flags each one accepts.
43 << " cephfs-data-scan init [--force-init]\n"
44 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
45 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
46 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
47 << " cephfs-data-scan scan_links\n"
// Flag descriptions.
49 << " --force-corrupt: overrite apparently corrupt structures\n"
50 << " --force-init: write root inodes even if they exist\n"
51 << " --force-pool: use data pool even if it is not in FSMap\n"
52 << " --worker_m: Maximum number of workers\n"
53 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
// Remaining subcommands.
55 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
56 << " cephfs-data-scan cleanup <data pool name>\n"
59 generic_client_usage();
// Try to interpret the argument at *i, together with the argument after it,
// as a "--key value" pair.
//
// Keys handled in the text visible here: --output-dir, --worker_n,
// --worker_m, --filter-tag, --filesystem and --alternate-pool.
//
// NOTE(review): the third parameter declaration and every return statement
// are missing from the visible text. main() calls this as
// parse_kwarg(args, i, &r) and treats a true result as "pair consumed";
// the body writes through `*r` in the --filesystem branch.
62 bool DataScan::parse_kwarg(
63 const std::vector
<const char*> &args
,
64 std::vector
<const char *>::const_iterator
&i
,
// A key needs a following value: this tests whether *i is the last argument.
67 if (i
+ 1 == args
.end()) {
71 const std::string
arg(*i
);
72 const std::string
val(*(i
+ 1));
74 if (arg
== std::string("--output-dir")) {
76 derr
<< "Unexpected --output-dir: output already selected!" << dendl
;
// Direct output to a LocalFileDriver built from the given value and data_io.
80 dout(4) << "Using local file output to '" << val
<< "'" << dendl
;
81 driver
= new LocalFileDriver(val
, data_io
);
83 } else if (arg
== std::string("--worker_n")) {
// Worker index, parsed as a base-10 integer; `err` receives any parse error.
85 n
= strict_strtoll(val
.c_str(), 10, &err
);
87 std::cerr
<< "Invalid worker number '" << val
<< "'" << std::endl
;
92 } else if (arg
== std::string("--worker_m")) {
// Worker count, parsed the same way as --worker_n.
94 m
= strict_strtoll(val
.c_str(), 10, &err
);
96 std::cerr
<< "Invalid worker count '" << val
<< "'" << std::endl
;
101 } else if (arg
== std::string("--filter-tag")) {
// NOTE(review): the assignment to filter_tag is not visible; only the log is.
103 dout(10) << "Applying tag filter: '" << filter_tag
<< "'" << dendl
;
105 } else if (arg
== std::string("--filesystem")) {
// Resolve the filesystem named by the value via the FSMap; the result code
// is stored through the caller-supplied `r`.
106 std::shared_ptr
<const Filesystem
> fs
;
107 *r
= fsmap
->parse_filesystem(val
, &fs
);
109 std::cerr
<< "Invalid filesystem '" << val
<< "'" << std::endl
;
114 } else if (arg
== std::string("--alternate-pool")) {
// Use the given value as the metadata pool name.
115 metadata_pool_name
= val
;
// Try to interpret the argument at *i as a stand-alone flag.
// Flags compared against here: --force-pool, --force-corrupt, --force-init.
//
// NOTE(review): only the --force-corrupt branch body (force_corrupt = true)
// is visible; the other branch bodies and the return statements are missing
// from the visible text. main() treats a true result as "flag consumed".
122 bool DataScan::parse_arg(
123 const std::vector
<const char*> &args
,
124 std::vector
<const char *>::const_iterator
&i
)
126 const std::string
arg(*i
);
127 if (arg
== "--force-pool") {
130 } else if (arg
== "--force-corrupt") {
131 force_corrupt
= true;
133 } else if (arg
== "--force-init") {
// Tool entry point. args[0] is the subcommand; the remaining entries are
// flags, "--key value" pairs and positional arguments.
//
// Visible flow: validate args, initialise librados, parse arguments, choose a
// filesystem, create/initialise the output driver, open the IoCtx objects
// the chosen subcommand needs, then dispatch to the subcommand.
//
// NOTE(review): most error checks and early returns are missing from the
// visible text; the comments below describe only the statements shown.
141 int DataScan::main(const std::vector
<const char*> &args
)
// At least the subcommand name is required.
145 if (args
.size() < 1) {
146 cerr
<< "missing position argument" << std::endl
;
150 // Common RADOS init: open metadata pool
151 // =====================================
152 librados::Rados rados
;
153 int r
= rados
.init_with_context(g_ceph_context
);
155 derr
<< "RADOS unavailable" << dendl
;
159 std::string
const &command
= args
[0];
160 std::string data_pool_name
;
// Inputs collected for the pg_files subcommand.
162 std::string pg_files_path
;
163 std::set
<pg_t
> pg_files_pgs
;
165 // Consume any known --key val or --flag arguments
166 for (std::vector
<const char *>::const_iterator i
= args
.begin() + 1;
167 i
!= args
.end(); ++i
) {
168 if (parse_kwarg(args
, i
, &r
)) {
169 // Skip the kwarg value field
176 if (parse_arg(args
, i
)) {
180 // Trailing positional argument
// Only scan_inodes, scan_extents and cleanup accept a final positional
// argument (per usage(), the data pool name).
181 if (i
+ 1 == args
.end() &&
182 (command
== "scan_inodes"
183 || command
== "scan_extents"
184 || command
== "cleanup")) {
// pg_files: the first argument after the command is distinguished from the
// later ones, which are parsed as PG ids and collected in pg_files_pgs.
189 if (command
== "pg_files") {
190 if (i
== args
.begin() + 1) {
195 bool parsed
= pg
.parse(*i
);
197 std::cerr
<< "Invalid PG '" << *i
<< "'" << std::endl
;
200 pg_files_pgs
.insert(pg
);
207 // Fall through: unhandled
208 std::cerr
<< "Unknown argument '" << *i
<< "'" << std::endl
;
212 // If caller didn't specify a namespace, try to pick
213 // one if only one exists
214 if (fscid
== FS_CLUSTER_ID_NONE
) {
215 if (fsmap
->filesystem_count() == 1) {
216 fscid
= fsmap
->get_filesystem()->fscid
;
218 std::cerr
<< "Specify a filesystem with --filesystem" << std::endl
;
// From here on `fs` is asserted to be a valid filesystem handle.
222 auto fs
= fsmap
->get_filesystem(fscid
);
223 ceph_assert(fs
!= nullptr);
225 // Default to output to metadata pool
226 if (driver
== NULL
) {
227 driver
= new MetadataDriver();
228 driver
->set_force_corrupt(force_corrupt
);
229 driver
->set_force_init(force_init
);
230 dout(4) << "Using metadata pool output" << dendl
;
233 dout(4) << "connecting to RADOS..." << dendl
;
236 std::cerr
<< "couldn't connect to cluster: " << cpp_strerror(r
)
// Hand the RADOS handle, metadata pool name, FSMap and filesystem id to the
// selected output driver.
241 r
= driver
->init(rados
, metadata_pool_name
, fsmap
, fscid
);
// pg_files is dispatched here, before any data/metadata IoCtx is opened.
246 if (command
== "pg_files") {
247 auto pge
= PgFiles(objecter
, pg_files_pgs
);
249 return pge
.scan_path(pg_files_path
);
252 // Initialize data_io for those commands that need it
253 if (command
== "scan_inodes" ||
254 command
== "scan_extents" ||
255 command
== "cleanup") {
256 if (data_pool_name
.empty()) {
257 std::cerr
<< "Data pool not specified" << std::endl
;
// Resolve the pool name to an id; a negative result means lookup failed.
261 data_pool_id
= rados
.pool_lookup(data_pool_name
.c_str());
262 if (data_pool_id
< 0) {
263 std::cerr
<< "Data pool '" << data_pool_name
<< "' not found!" << std::endl
;
266 dout(4) << "data pool '" << data_pool_name
267 << "' has ID " << data_pool_id
<< dendl
;
// Warn when the pool is not one of this filesystem's data pools.
// NOTE(review): the --force-pool test guarding the second message is not
// visible here.
270 if (!fs
->mds_map
.is_data_pool(data_pool_id
)) {
271 std::cerr
<< "Warning: pool '" << data_pool_name
<< "' is not a "
272 "CephFS data pool!" << std::endl
;
274 std::cerr
<< "Use --force-pool to continue" << std::endl
;
279 dout(4) << "opening data pool '" << data_pool_name
<< "'" << dendl
;
280 r
= rados
.ioctx_create(data_pool_name
.c_str(), data_io
);
286 // Initialize metadata_io from MDSMap for scan_frags and scan_links
287 if (command
== "scan_frags" || command
== "scan_links") {
// NOTE(review): this `fs` shadows the outer `fs` that was already asserted
// non-null above, so the "does not exist" message below looks unreachable
// - confirm against the missing condition line.
288 const auto fs
= fsmap
->get_filesystem(fscid
);
290 std::cerr
<< "Filesystem id " << fscid
<< " does not exist" << std::endl
;
293 int64_t const metadata_pool_id
= fs
->mds_map
.get_metadata_pool();
295 dout(4) << "resolving metadata pool " << metadata_pool_id
<< dendl
;
// NOTE(review): this `int r` shadows the function-level `r`.
296 int r
= rados
.pool_reverse_lookup(metadata_pool_id
, &metadata_pool_name
);
298 std::cerr
<< "Pool " << metadata_pool_id
299 << " identified in MDS map not found in RADOS!" << std::endl
;
303 r
= rados
.ioctx_create(metadata_pool_name
.c_str(), metadata_io
);
// Remember the filesystem's data pools (read later by scan_links).
308 data_pools
= fs
->mds_map
.get_data_pools();
311 // Finally, dispatch command
312 if (command
== "scan_inodes") {
313 return scan_inodes();
314 } else if (command
== "scan_extents") {
315 return scan_extents();
316 } else if (command
== "scan_frags") {
318 } else if (command
== "scan_links") {
320 } else if (command
== "cleanup") {
322 } else if (command
== "init") {
// init writes the root inodes, using the filesystem's first data pool.
323 return driver
->init_roots(fs
->mds_map
.get_first_data_pool());
325 std::cerr
<< "Unknown command '" << command
<< "'" << std::endl
;
// Build a fresh inode for `inono` and write it, prefixed with the on-disk
// magic string, to the "<ino>.inode" object in metadata_io.
//
//   inono        inode number to create
//   mode         mode bits OR'd with 0500
//   data_pool_id pool id stored in the inode's layout
//
// An existing object is left alone unless force_init is set.
// NOTE(review): return statements and error checks are missing from the
// visible text.
330 int MetadataDriver::inject_unlinked_inode(
331 inodeno_t inono
, int mode
, int64_t data_pool_id
)
333 const object_t oid
= InodeStore::get_object_name(inono
, frag_t(), ".inode");
// Find out whether the inode object is already present.
336 bool already_exists
= false;
337 int r
= root_exists(inono
, &already_exists
);
341 if (already_exists
&& !force_init
) {
342 std::cerr
<< "Inode 0x" << std::hex
<< inono
<< std::dec
<< " already"
343 " exists, skipping create. Use --force-init to overwrite"
344 " the existing object." << std::endl
;
// Populate the new inode's fields.
349 InodeStore inode_data
;
350 auto inode
= inode_data
.get_inode();
353 inode
->xattr_version
= 1;
354 inode
->mode
= 0500 | mode
;
355 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
356 // (we won't actually give the *correct* dirstat here though)
357 inode
->dirstat
.nfiles
= 1;
359 inode
->ctime
= inode
->mtime
= ceph_clock_now();
361 inode
->truncate_size
= -1ull;
362 inode
->truncate_seq
= 1;
// Ownership comes from the mds_root_ino_uid / mds_root_ino_gid settings.
363 inode
->uid
= g_conf()->mds_root_ino_uid
;
364 inode
->gid
= g_conf()->mds_root_ino_gid
;
366 // Force layout to default: should we let users override this so that
367 // they don't have to mount the filesystem to correct it?
368 inode
->layout
= file_layout_t::get_default();
369 inode
->layout
.pool_id
= data_pool_id
;
370 inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
372 // Assume that we will get our stats wrong, and that we may
373 // be ignoring dirfrags that exist
374 inode_data
.damage_flags
|= (DAMAGE_STATS
| DAMAGE_RSTATS
| DAMAGE_FRAGTREE
);
// The root inode and MDS directories additionally get an encoded `srnode`
// stored in snap_blob (its construction is not visible here).
376 if (inono
== CEPH_INO_ROOT
|| MDS_INO_IS_MDSDIR(inono
)) {
379 encode(srnode
, inode_data
.snap_blob
);
// Serialise: magic string first, then the inode itself.
384 encode(std::string(CEPH_FS_ONDISK_MAGIC
), inode_bl
);
385 inode_data
.encode(inode_bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
// Replace the whole object content with the encoded inode.
388 r
= metadata_io
.write_full(oid
.name
, inode_bl
);
390 derr
<< "Error writing '" << oid
.name
<< "': " << cpp_strerror(r
) << dendl
;
// Probe for the "<ino>.inode" object of `ino` by stat()ing it in
// metadata_io.
// NOTE(review): how the stat result is mapped onto *result and onto the
// return value is missing from the visible text.
397 int MetadataDriver::root_exists(inodeno_t ino
, bool *result
)
399 object_t oid
= InodeStore::get_object_name(ino
, frag_t(), ".inode");
402 int r
= metadata_io
.stat(oid
.name
, &size
, &mtime
);
// Create the system inodes: the filesystem root (directory, mode 0755) and
// MDS directory 0, both with `data_pool_id` in their layout, then find or
// create the root dirfrag of MDS directory 0.
// NOTE(review): the error checks between the calls and the return value are
// missing from the visible text.
414 int MetadataDriver::init_roots(int64_t data_pool_id
)
417 r
= inject_unlinked_inode(CEPH_INO_ROOT
, S_IFDIR
|0755, data_pool_id
);
421 r
= inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR
, data_pool_id
);
425 bool created
= false;
426 r
= find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created
);
// Check for the root inode and for MDS directory 0 via root_exists(), both
// writing into the same *result.
// NOTE(review): the logic between the two calls (how the two answers are
// combined) is missing from the visible text.
434 int MetadataDriver::check_roots(bool *result
)
437 r
= root_exists(CEPH_INO_ROOT
, result
);
445 r
= root_exists(MDS_INO_MDSDIR(0), result
);
460 * 0. Create root inodes if don't exist
461 * PARALLEL scan_extents
462 * 1. Size and mtime recovery: scan ALL objects, and update 0th
463 * objects with max size and max mtime seen.
464 * PARALLEL scan_inodes
465 * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
466 * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
467 * or rstats at this stage. Inodes without backtraces go into
469 * TODO: SERIAL "recover stats"
470 * 3. Dirfrag statistics: depth first traverse into metadata tree,
471 * rebuilding dir sizes.
472 * TODO PARALLEL "clean up"
473 * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
474 * anything onto them) and remove any of the xattrs that we
475 * used for accumulating.
// Split an object name of the form "<hex>.<hex>" at its first '.', storing
// the part before it in *inode_no and the part after it in *obj_id. Both
// parts are parsed as base-16 integers with strict_strtoll.
//
// NOTE(review): the return statements are missing from the visible text; a
// comment in scan_links() states this can only yield 0 or -EINVAL.
479 int parse_oid(const std::string
&oid
, uint64_t *inode_no
, uint64_t *obj_id
)
// Guard: names with no '.' at all, or with the '.' as the last character.
481 if (oid
.find(".") == std::string::npos
|| oid
.find(".") == oid
.size() - 1) {
// Text before the first '.'.
486 std::string inode_str
= oid
.substr(0, oid
.find("."));
487 *inode_no
= strict_strtoll(inode_str
.c_str(), 16, &err
);
// Text after the first '.'.
492 std::string pos_string
= oid
.substr(oid
.find(".") + 1);
493 *obj_id
= strict_strtoll(pos_string
.c_str(), 16, &err
);
// Walk the objects of data_io via forall_objects() (second argument false)
// and, for each one, stat it and pass the result to
// ClsCephFSClient::accumulate_inode_metadata().
// NOTE(review): the arguments of the accumulate call and the lambda's return
// statements are missing from the visible text.
502 int DataScan::scan_extents()
504 return forall_objects(data_io
, false, [this](
505 std::string
const &oid
,
506 uint64_t obj_name_ino
,
507 uint64_t obj_name_offset
) -> int
// Fetch the object's size and mtime.
512 int r
= data_io
.stat(oid
, &size
, &mtime
);
513 dout(10) << "handling object " << obj_name_ino
514 << "." << obj_name_offset
<< dendl
;
516 dout(4) << "Cannot stat '" << oid
<< "': skipping" << dendl
;
520 // I need to keep track of
521 // * The highest object ID seen
522 // * The size of the highest object ID seen
523 // * The largest object seen
525 // Given those things, I can later infer the object chunking
526 // size, the offset of the last object (chunk size * highest ID seen)
527 // and the actual size (offset of last object + size of highest ID seen)
529 // This logic doesn't take account of striping.
530 r
= ClsCephFSClient::accumulate_inode_metadata(
537 derr
<< "Failed to accumulate metadata data from '"
538 << oid
<< "': " << cpp_strerror(r
) << dendl
;
// Issue a single object_list() call over the whole pool, asking for at most
// one result, with a tag filter built for the literal tag "test".
// forall_objects() uses the result to decide whether filtered listing can be
// used (it sets legacy_filtering = !probe_filter(ioctx)).
// NOTE(review): the mapping from `r` to the return value is missing from the
// visible text. range_i and range_end are declared but not referenced in the
// lines shown - confirm whether they are dead.
546 int DataScan::probe_filter(librados::IoCtx
&ioctx
)
548 bufferlist filter_bl
;
549 ClsCephFSClient::build_tag_filter("test", &filter_bl
);
550 librados::ObjectCursor range_i
;
551 librados::ObjectCursor range_end
;
// Scratch outputs; only the call's return code matters in the lines shown.
553 std::vector
<librados::ObjectItem
> tmp_result
;
554 librados::ObjectCursor tmp_next
;
555 int r
= ioctx
.object_list(ioctx
.object_list_begin(), ioctx
.object_list_end(),
556 1, filter_bl
, &tmp_result
, &tmp_next
);
// Enumerate the objects of `ioctx` that fall in this worker's slice of the
// pool and call `handler(oid, ino, offset)` for each one whose name parses
// with parse_oid().
//
// NOTE(review): the middle parameter is missing from the visible text; the
// body reads a flag named `untagged_only`, and callers pass true/false in
// that position. The slice arguments (presumably the worker n/m values),
// several conditions and all return statements are also missing.
561 int DataScan::forall_objects(
562 librados::IoCtx
&ioctx
,
564 std::function
<int(std::string
, uint64_t, uint64_t)> handler
// Compute the [range_i, range_end) cursor pair for this worker.
567 librados::ObjectCursor range_i
;
568 librados::ObjectCursor range_end
;
569 ioctx
.object_list_slice(
570 ioctx
.object_list_begin(),
571 ioctx
.object_list_end(),
578 bufferlist filter_bl
;
580 bool legacy_filtering
= false;
582 // probe to deal with older OSDs that don't support
583 // the cephfs pgls filtering mode
584 legacy_filtering
= !probe_filter(ioctx
);
585 if (!legacy_filtering
) {
586 ClsCephFSClient::build_tag_filter(filter_tag
, &filter_bl
);
// List one object per call; object_list() advances range_i itself.
591 while(range_i
< range_end
) {
592 std::vector
<librados::ObjectItem
> result
;
593 int r
= ioctx
.object_list(range_i
, range_end
, 1,
594 filter_bl
, &result
, &range_i
);
596 derr
<< "Unexpected error listing objects: " << cpp_strerror(r
) << dendl
;
600 for (const auto &i
: result
) {
601 const std::string
&oid
= i
.oid
;
602 uint64_t obj_name_ino
= 0;
603 uint64_t obj_name_offset
= 0;
604 r
= parse_oid(oid
, &obj_name_ino
, &obj_name_offset
);
606 dout(4) << "Bad object name '" << oid
<< "', skipping" << dendl
;
// Legacy path: the filter is applied here by reading the object's
// "scrub_tag" xattr and comparing it with filter_tag.
610 if (untagged_only
&& legacy_filtering
) {
611 dout(20) << "Applying filter to " << oid
<< dendl
;
613 // We are only interested in 0th objects during this phase: we touched
614 // the other objects during scan_extents
615 if (obj_name_offset
!= 0) {
616 dout(20) << "Non-zeroth object" << dendl
;
620 bufferlist scrub_tag_bl
;
621 int r
= ioctx
.getxattr(oid
, "scrub_tag", scrub_tag_bl
);
623 std::string read_tag
;
624 auto q
= scrub_tag_bl
.cbegin();
627 if (read_tag
== filter_tag
) {
628 dout(20) << "skipping " << oid
<< " because it has the filter_tag"
632 } catch (const buffer::error
&err
) {
634 dout(20) << "read non-matching tag '" << read_tag
<< "'" << dendl
;
636 dout(20) << "no tag read (" << r
<< ")" << dendl
;
// Non-legacy path: the listing was already filtered, so only zeroth objects
// are expected here (asserted).
639 } else if (untagged_only
) {
640 ceph_assert(obj_name_offset
== 0);
641 dout(20) << "OSD matched oid " << oid
<< dendl
;
// Invoke the per-object callback.
// NOTE(review): the body of the following `if` is not visible; the condition
// tests for a handler failure while r is still 0.
644 int this_oid_r
= handler(oid
, obj_name_ino
, obj_name_offset
);
645 if (r
== 0 && this_oid_r
< 0) {
// Inode recovery pass over data_io. After confirming the root inodes exist
// (driver->check_roots), every object delivered by
// forall_objects(data_io, true, ...) is processed as follows:
//   1. load the accumulated size/mtime result, backtrace, layout and symlink
//      target via ClsCephFSClient::fetch_inode_accumulate_result();
//   2. choose a layout (stored one, or a guess) and compute the file size;
//   3. build a dentry with build_file_dentry() and inject it through the
//      driver, either by backtrace or into lost+found.
//
// NOTE(review): error checks, several `else` lines, closing braces and the
// return statements are missing from the visible text.
654 int DataScan::scan_inodes()
657 int r
= driver
->check_roots(&roots_present
);
659 derr
<< "Unexpected error checking roots: '"
660 << cpp_strerror(r
) << "'" << dendl
;
664 if (!roots_present
) {
665 std::cerr
<< "Some or all system inodes are absent. Run 'init' from "
666 "one node before running 'scan_inodes'" << std::endl
;
670 return forall_objects(data_io
, true, [this](
671 std::string
const &oid
,
672 uint64_t obj_name_ino
,
673 uint64_t obj_name_offset
) -> int
677 dout(10) << "handling object "
678 << std::hex
<< obj_name_ino
<< "." << obj_name_offset
<< std::dec
// Outputs of the fetch below; the layout starts out as the default layout.
681 AccumulateResult accum_res
;
682 inode_backtrace_t backtrace
;
683 file_layout_t loaded_layout
= file_layout_t::get_default();
685 r
= ClsCephFSClient::fetch_inode_accumulate_result(
686 data_io
, oid
, &backtrace
, &loaded_layout
, &symlink
, &accum_res
);
689 dout(4) << "Accumulated metadata missing from '"
690 << oid
<< ", did you run scan_extents?" << dendl
;
693 dout(4) << "Unexpected error loading accumulated metadata from '"
694 << oid
<< "': " << cpp_strerror(r
) << dendl
;
695 // FIXME: this creates situation where if a client has a corrupt
696 // backtrace/layout, we will fail to inject it. We should (optionally)
697 // proceed if the backtrace/layout is corrupt but we have valid
698 // accumulated metadata.
702 const time_t file_mtime
= accum_res
.max_mtime
;
703 uint64_t file_size
= 0;
// A backtrace with no ancestors is treated as "no backtrace".
704 bool have_backtrace
= !(backtrace
.ancestors
.empty());
706 // This is the layout we will use for injection, populated either
707 // from loaded_layout or from best guesses
708 file_layout_t guessed_layout
;
709 guessed_layout
.pool_id
= data_pool_id
;
711 // Calculate file_size, guess the layout
// ceiling_obj_index > 0: more than just the zeroth object was seen.
712 if (accum_res
.ceiling_obj_index
> 0) {
713 uint32_t chunk_size
= file_layout_t::get_default().object_size
;
714 // When there are multiple objects, the largest object probably
715 // indicates the chunk size. But not necessarily, because files
716 // can be sparse. Only make this assumption if size seen
717 // is a power of two, as chunk sizes typically are.
718 if ((accum_res
.max_obj_size
& (accum_res
.max_obj_size
- 1)) == 0) {
719 chunk_size
= accum_res
.max_obj_size
;
722 if (loaded_layout
.pool_id
== -1) {
723 // If no stashed layout was found, guess it
724 guessed_layout
.object_size
= chunk_size
;
725 guessed_layout
.stripe_unit
= chunk_size
;
726 guessed_layout
.stripe_count
= 1;
727 } else if (!loaded_layout
.is_valid() ||
728 loaded_layout
.object_size
< accum_res
.max_obj_size
) {
729 // If the max size seen exceeds what the stashed layout claims, then
730 // disbelieve it. Guess instead. Same for invalid layouts on disk.
731 dout(4) << "bogus xattr layout on 0x" << std::hex
<< obj_name_ino
732 << std::dec
<< ", ignoring in favour of best guess" << dendl
;
733 guessed_layout
.object_size
= chunk_size
;
734 guessed_layout
.stripe_unit
= chunk_size
;
735 guessed_layout
.stripe_count
= 1;
737 // We have a stashed layout that we can't disprove, so apply it
738 guessed_layout
= loaded_layout
;
739 dout(20) << "loaded layout from xattr:"
740 << " os: " << guessed_layout
.object_size
741 << " sc: " << guessed_layout
.stripe_count
742 << " su: " << guessed_layout
.stripe_unit
744 // User might have transplanted files from a pool with a different
745 // ID, so whatever the loaded_layout says, we'll force the injected
746 // layout to point to the pool we really read from
747 guessed_layout
.pool_id
= data_pool_id
;
750 if (guessed_layout
.stripe_count
== 1) {
751 // Unstriped file: simple chunking
752 file_size
= guessed_layout
.object_size
* accum_res
.ceiling_obj_index
753 + accum_res
.ceiling_obj_size
;
755 // Striped file: need to examine the last stripe_count objects
756 // in the file to determine the size.
758 // How many complete (i.e. not last stripe) objects?
759 uint64_t complete_objs
= 0;
// Round the highest index down to a multiple of stripe_count.
760 if (accum_res
.ceiling_obj_index
> guessed_layout
.stripe_count
- 1) {
761 complete_objs
= (accum_res
.ceiling_obj_index
/ guessed_layout
.stripe_count
) * guessed_layout
.stripe_count
;
766 // How many potentially-short objects (i.e. last stripe set) objects?
767 uint64_t partial_objs
= accum_res
.ceiling_obj_index
+ 1 - complete_objs
;
769 dout(10) << "calculating striped size from complete objs: "
770 << complete_objs
<< ", partial objs: " << partial_objs
773 // Maximum amount of data that may be in the incomplete objects
774 uint64_t incomplete_size
= 0;
776 // For each short object, calculate the max file size within it
777 // and accumulate the maximum
778 for (uint64_t i
= complete_objs
; i
< complete_objs
+ partial_objs
; ++i
) {
// Object names are "<ino hex>.<index as 8 hex digits>".
780 snprintf(buf
, sizeof(buf
), "%llx.%08llx",
781 (long long unsigned)obj_name_ino
, (long long unsigned)i
);
785 r
= data_io
.stat(std::string(buf
), &osize
, &omtime
);
788 // Upper bound within this object
// (whole stripe rows covered by the object's last byte) * row width
//   + offset of this object's column within the row
//   + position of the last byte within its stripe unit, plus one.
789 uint64_t upper_size
= (osize
- 1) / guessed_layout
.stripe_unit
790 * (guessed_layout
.stripe_unit
* guessed_layout
.stripe_count
)
791 + (i
% guessed_layout
.stripe_count
)
792 * guessed_layout
.stripe_unit
+ (osize
- 1)
793 % guessed_layout
.stripe_unit
+ 1;
794 incomplete_size
= std::max(incomplete_size
, upper_size
);
796 } else if (r
== -ENOENT
) {
797 // Absent object, treat as size 0 and ignore.
799 // Unexpected error, carry r to outer scope for handling.
803 if (r
!= 0 && r
!= -ENOENT
) {
804 derr
<< "Unexpected error checking size of ino 0x" << std::hex
805 << obj_name_ino
<< std::dec
<< ": " << cpp_strerror(r
) << dendl
;
// NOTE(review): the continuation of this expression is not visible.
808 file_size
= complete_objs
* guessed_layout
.object_size
// Case where only the zeroth object was seen: its size is the file size.
812 file_size
= accum_res
.ceiling_obj_size
;
813 if (loaded_layout
.pool_id
< 0
814 || loaded_layout
.object_size
< accum_res
.max_obj_size
) {
815 // No layout loaded, or inconsistent layout, use default
816 guessed_layout
= file_layout_t::get_default();
817 guessed_layout
.pool_id
= data_pool_id
;
819 guessed_layout
= loaded_layout
;
823 // Sanity checking backtrace ino against object name
824 if (have_backtrace
&& backtrace
.ino
!= obj_name_ino
) {
825 dout(4) << "Backtrace ino 0x" << std::hex
<< backtrace
.ino
826 << " doesn't match object name ino 0x" << obj_name_ino
827 << std::dec
<< dendl
;
828 have_backtrace
= false;
// Assemble the dentry to inject from the recovered size, mtime, layout and
// symlink target.
832 build_file_dentry(obj_name_ino
, file_size
, file_mtime
, guessed_layout
, &dentry
, symlink
);
834 // Inject inode to the metadata pool
835 if (have_backtrace
) {
// rbegin() is the last entry of backtrace.ancestors; its dirino decides
// whether the file lived under an MDS directory.
836 inode_backpointer_t root_bp
= *(backtrace
.ancestors
.rbegin());
837 if (MDS_INO_IS_MDSDIR(root_bp
.dirino
)) {
838 /* Special case for strays: even if we have a good backtrace,
839 * don't put it in the stray dir, because while that would technically
840 * give it linkage it would still be invisible to the user */
841 r
= driver
->inject_lost_and_found(obj_name_ino
, dentry
);
843 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
844 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
846 dout(4) << "Use --force-corrupt to overwrite structures that "
847 "appear to be corrupt" << dendl
;
851 /* Happy case: we will inject a named dentry for this inode */
852 r
= driver
->inject_with_backtrace(backtrace
, dentry
);
854 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
855 << std::dec
<< " with backtrace: " << cpp_strerror(r
) << dendl
;
857 dout(4) << "Use --force-corrupt to overwrite structures that "
858 "appear to be corrupt" << dendl
;
863 /* Backtrace-less case: we will inject a lost+found dentry */
864 r
= driver
->inject_lost_and_found(
865 obj_name_ino
, dentry
);
867 dout(4) << "Error injecting 0x" << std::hex
<< obj_name_ino
868 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
870 dout(4) << "Use --force-corrupt to overwrite structures that "
871 "appear to be corrupt" << dendl
;
// Call ClsCephFSClient::delete_inode_accumulate_result() on every object
// that forall_objects(data_io, true, ...) delivers.
// Failures are logged at debug level 4.
// NOTE(review): the lambda's return statements are missing from the visible
// text.
880 int DataScan::cleanup()
882 // We are looking for only zeroth object
884 return forall_objects(data_io
, true, [this](
885 std::string
const &oid
,
886 uint64_t obj_name_ino
,
887 uint64_t obj_name_offset
) -> int
890 r
= ClsCephFSClient::delete_inode_accumulate_result(data_io
, oid
);
892 dout(4) << "Error deleting accumulated metadata from '"
893 << oid
<< "': " << cpp_strerror(r
) << dendl
;
// Return true when `ino` is one this tool accepts: any value >= 2^40, a
// stray directory, an MDS directory, the root inode (CEPH_INO_ROOT) or
// CEPH_INO_CEPH.
899 bool DataScan::valid_ino(inodeno_t ino
) const
901 return (ino
>= inodeno_t((1ull << 40)))
902 || (MDS_INO_IS_STRAY(ino
))
903 || (MDS_INO_IS_MDSDIR(ino
))
904 || ino
== CEPH_INO_ROOT
905 || ino
== CEPH_INO_CEPH
;
// Link-consistency pass over the metadata pool. Requires the output driver
// to be a MetadataDriver.
//
// The omap entries of every dirfrag object in metadata_io are decoded in two
// passes (SCAN_INOS, then CHECK_LINK). The visible statements then:
//   - derive the highest inode number in use per MDS rank (max_ino_map);
//   - pick one dentry among duplicated primaries and queue the others for
//     removal;
//   - raise last_snap to cover data-pool snap_seq values and collected snaps;
//   - remove queued dentries, rewrite dentries with a wrong nlink, re-inject
//     dentries recorded in injected_inos with first = last_snap + 1;
//   - update each rank's InoTable and the rank-0 SnapServer table.
//
// NOTE(review): many lines (local declarations, conditions, `continue` /
// `return` statements, closing braces) are missing from the visible text;
// comments below describe only what is shown.
908 int DataScan::scan_links()
910 MetadataDriver
*metadata_driver
= dynamic_cast<MetadataDriver
*>(driver
);
911 if (!metadata_driver
) {
912 derr
<< "Unexpected --output-dir option for scan_links" << dendl
;
// Accumulators filled while walking the dirfrags.
916 interval_set
<uint64_t> used_inos
;
917 map
<inodeno_t
, int> remote_links
;
918 map
<snapid_t
, SnapInfo
> snaps
;
919 snapid_t last_snap
= 1;
920 snapid_t snaprealm_v2_since
= 2;
// Members of the local record type link_info_t (its opening lines are not
// visible): where a primary dentry was found plus its version, nlink and
// whether it is a directory.
929 map
<snapid_t
, SnapInfo
> snaps
;
930 link_info_t() : version(0), nlink(0), is_dir(false) {}
931 link_info_t(inodeno_t di
, frag_t df
, const string
& n
, const CInode::inode_const_ptr
& i
) :
932 dirino(di
), frag(df
), name(n
),
933 version(i
->version
), nlink(i
->nlink
), is_dir(S_IFDIR
& i
->mode
) {}
934 dirfrag_t
dirfrag() const {
935 return dirfrag_t(dirino
, frag
);
// Inodes with more than one primary dentry, inodes whose stored nlink
// disagrees with the counted one, and entries recorded for re-injection.
938 map
<inodeno_t
, list
<link_info_t
> > dup_primaries
;
939 map
<inodeno_t
, link_info_t
> bad_nlink_inos
;
940 map
<inodeno_t
, link_info_t
> injected_inos
;
// omap keys to delete, grouped by dirfrag.
942 map
<dirfrag_t
, set
<string
> > to_remove
;
// Two full passes over the metadata pool.
949 for (int step
= SCAN_INOS
; step
<= CHECK_LINK
; step
++) {
950 const librados::NObjectIterator it_end
= metadata_io
.nobjects_end();
951 for (auto it
= metadata_io
.nobjects_begin(); it
!= it_end
; ++it
) {
952 const std::string oid
= it
->get_oid();
954 uint64_t dir_ino
= 0;
955 uint64_t frag_id
= 0;
956 int r
= parse_oid(oid
, &dir_ino
, &frag_id
);
958 dout(10) << "Not a dirfrag: '" << oid
<< "'" << dendl
;
961 // parse_oid can only do 0 or -EINVAL
965 if (!valid_ino(dir_ino
)) {
966 dout(10) << "Not a dirfrag (invalid ino): '" << oid
<< "'" << dendl
;
// Read the dirfrag's omap entries, starting from the empty key.
970 std::map
<std::string
, bufferlist
> items
;
971 r
= metadata_io
.omap_get_vals(oid
, "", (uint64_t)-1, &items
);
973 derr
<< "Error getting omap from '" << oid
<< "': " << cpp_strerror(r
) << dendl
;
977 for (auto& p
: items
) {
978 auto q
= p
.second
.cbegin();
// Split the omap key into dentry name and `last` snapid.
981 dentry_key_t::decode_helper(p
.first
, dname
, last
);
983 if (last
!= CEPH_NOSNAP
) {
984 if (last
> last_snap
)
// Track the highest snapid implied by the dentry's `first` value.
992 if (dnfirst
<= CEPH_MAXSNAP
) {
993 if (dnfirst
- 1 > last_snap
)
994 last_snap
= dnfirst
- 1;
997 decode(dentry_type
, q
);
998 mempool::mds_co::string alternate_name
;
// 'I' / 'i': primary dentry carrying an embedded inode.
999 if (dentry_type
== 'I' || dentry_type
== 'i') {
1001 if (dentry_type
== 'i') {
1004 decode(alternate_name
, q
);
1008 inode
.decode_bare(q
);
1011 inodeno_t ino
= inode
.inode
->ino
;
1013 if (step
== SCAN_INOS
) {
// An ino already in used_inos has a second primary: touching
// dup_primaries[ino] creates its (empty) entry.
1014 if (used_inos
.contains(ino
, 1)) {
1015 dup_primaries
[ino
].size();
1017 used_inos
.insert(ino
);
1019 } else if (step
== CHECK_LINK
) {
// Filter the inode's snaprealm: entries whose ino or snapid do not match
// are erased.
1021 if (inode
.snap_blob
.length()) {
1022 auto p
= inode
.snap_blob
.cbegin();
1024 for (auto it
= srnode
.snaps
.begin();
1025 it
!= srnode
.snaps
.end(); ) {
1026 if (it
->second
.ino
!= ino
||
1027 it
->second
.snapid
!= it
->first
) {
1028 srnode
.snaps
.erase(it
++);
1033 if (!srnode
.past_parents
.empty()) {
1034 snapid_t last
= srnode
.past_parents
.rbegin()->first
;
1035 if (last
+ 1 > snaprealm_v2_since
)
1036 snaprealm_v2_since
= last
+ 1;
1039 if (inode
.old_inodes
&& !inode
.old_inodes
->empty()) {
1040 auto _last_snap
= inode
.old_inodes
->rbegin()->first
;
1041 if (_last_snap
> last_snap
)
1042 last_snap
= _last_snap
;
// Duplicated primaries are only collected here; they are resolved after
// both passes.
1044 auto q
= dup_primaries
.find(ino
);
1045 if (q
!= dup_primaries
.end()) {
1046 q
->second
.push_back(link_info_t(dir_ino
, frag_id
, dname
, inode
.inode
));
1047 q
->second
.back().snaps
.swap(srnode
.snaps
);
// Compare the stored nlink with the counted `nlink` (its computation is on
// lines not visible here).
1050 auto r
= remote_links
.find(ino
);
1051 if (r
!= remote_links
.end())
1053 if (!MDS_INO_IS_STRAY(dir_ino
))
1055 if (inode
.inode
->nlink
!= nlink
) {
1056 derr
<< "Bad nlink on " << ino
<< " expected " << nlink
1057 << " has " << inode
.inode
->nlink
<< dendl
;
1058 bad_nlink_inos
[ino
] = link_info_t(dir_ino
, frag_id
, dname
, inode
.inode
);
1059 bad_nlink_inos
[ino
].nlink
= nlink
;
1061 snaps
.insert(make_move_iterator(begin(srnode
.snaps
)),
1062 make_move_iterator(end(srnode
.snaps
)));
1064 if (dnfirst
== CEPH_NOSNAP
)
1065 injected_inos
[ino
] = link_info_t(dir_ino
, frag_id
, dname
, inode
.inode
);
// 'L' / 'l': remote dentry referring to an inode number.
1067 } else if (dentry_type
== 'L' || dentry_type
== 'l') {
1069 unsigned char d_type
;
1070 CDentry::decode_remote(dentry_type
, ino
, d_type
, alternate_name
, q
);
1072 if (step
== SCAN_INOS
) {
1073 remote_links
[ino
]++;
1074 } else if (step
== CHECK_LINK
) {
// A remote link whose target was never seen as a primary is queued for
// removal.
1075 if (!used_inos
.contains(ino
, 1)) {
1076 derr
<< "Bad remote link dentry 0x" << std::hex
<< dir_ino
1077 << std::dec
<< "/" << dname
1078 << ", ino " << ino
<< " not found" << dendl
;
1080 dentry_key_t
dn_key(CEPH_NOSNAP
, dname
.c_str());
1082 to_remove
[dirfrag_t(dir_ino
, frag_id
)].insert(key
);
// NOTE(review): this message prints "0x" but no std::hex precedes dir_ino
// in the statement, unlike the neighbouring messages - confirm.
1086 derr
<< "Invalid tag char '" << dentry_type
<< "' dentry 0x" << dir_ino
1087 << std::dec
<< "/" << dname
<< dendl
;
1090 } catch (const buffer::error
&err
) {
1091 derr
<< "Error decoding dentry 0x" << std::hex
<< dir_ino
1092 << std::dec
<< "/" << dname
<< dendl
;
// Highest inode number in use, keyed by rank. The rank is derived from the
// bits above bit 40 of the inode number, minus one.
1099 map
<unsigned, uint64_t> max_ino_map
;
1101 auto prev_max_ino
= (uint64_t)1 << 40;
1102 for (auto p
= used_inos
.begin(); p
!= used_inos
.end(); ++p
) {
1103 auto cur_max
= p
.get_start() + p
.get_len() - 1;
1104 if (cur_max
< prev_max_ino
)
1105 continue; // system inodes
1107 if ((prev_max_ino
>> 40) != (cur_max
>> 40)) {
1108 unsigned rank
= (prev_max_ino
>> 40) - 1;
1109 max_ino_map
[rank
] = prev_max_ino
;
1110 } else if ((p
.get_start() >> 40) != (cur_max
>> 40)) {
1111 unsigned rank
= (p
.get_start() >> 40) - 1;
1112 max_ino_map
[rank
] = ((uint64_t)(rank
+ 2) << 40) - 1;
1114 prev_max_ino
= cur_max
;
1116 unsigned rank
= (prev_max_ino
>> 40) - 1;
1117 max_ino_map
[rank
] = prev_max_ino
;
// Resolve duplicated primaries: prefer the highest version and, on a tie, a
// non-stray location over a stray one.
1122 for (auto& p
: dup_primaries
) {
1124 for (auto& q
: p
.second
) {
1125 if (q
.version
> newest
.version
) {
1127 } else if (q
.version
== newest
.version
&&
1128 !MDS_INO_IS_STRAY(q
.dirino
) &&
1129 MDS_INO_IS_STRAY(newest
.dirino
)) {
1134 for (auto& q
: p
.second
) {
1135 // in the middle of dir fragmentation?
1136 if (newest
.dirino
== q
.dirino
&& newest
.name
== q
.name
) {
1137 snaps
.insert(make_move_iterator(begin(q
.snaps
)),
1138 make_move_iterator(end(q
.snaps
)));
// Any other location of the same ino is queued for removal.
1143 dentry_key_t
dn_key(CEPH_NOSNAP
, q
.name
.c_str());
1145 to_remove
[q
.dirfrag()].insert(key
);
1146 derr
<< "Remove duplicated ino 0x" << p
.first
<< " from "
1147 << q
.dirfrag() << "/" << q
.name
<< dendl
;
// Re-check nlink for the surviving dentry.
1151 auto q
= remote_links
.find(p
.first
);
1152 if (q
!= remote_links
.end())
1154 if (!MDS_INO_IS_STRAY(newest
.dirino
))
1157 if (nlink
!= newest
.nlink
) {
1158 derr
<< "Bad nlink on " << p
.first
<< " expected " << nlink
1159 << " has " << newest
.nlink
<< dendl
;
1160 bad_nlink_inos
[p
.first
] = newest
;
1161 bad_nlink_inos
[p
.first
].nlink
= nlink
;
1164 dup_primaries
.clear();
1165 remote_links
.clear();
// last_snap must be at least the snap_seq of every data pool in the OSDMap.
1168 objecter
->with_osdmap([&](const OSDMap
& o
) {
1169 for (auto p
: data_pools
) {
1170 const pg_pool_t
*pi
= o
.get_pg_pool(p
);
1173 if (pi
->snap_seq
> last_snap
)
1174 last_snap
= pi
->snap_seq
;
// ...and at least the highest snapid collected from the snaprealms.
1178 if (!snaps
.empty()) {
1179 if (snaps
.rbegin()->first
> last_snap
)
1180 last_snap
= snaps
.rbegin()->first
;
// Delete the queued omap keys, one dirfrag object at a time.
1184 for (auto& p
: to_remove
) {
1185 object_t frag_oid
= InodeStore::get_object_name(p
.first
.ino
, p
.first
.frag
, "");
1187 int r
= metadata_io
.omap_rm_keys(frag_oid
.name
, p
.second
);
1189 derr
<< "Error removing duplicated dentries from " << p
.first
<< dendl
;
// Rewrite dentries whose nlink was wrong. The dentry is re-read first and
// its ino/version compared with what was recorded during the scan.
1195 for (auto &p
: bad_nlink_inos
) {
1198 int r
= read_dentry(p
.second
.dirino
, p
.second
.frag
, p
.second
.name
, &inode
, &first
);
1200 derr
<< "Unexpected error reading dentry "
1201 << p
.second
.dirfrag() << "/" << p
.second
.name
1202 << ": " << cpp_strerror(r
) << dendl
;
1206 if (inode
.inode
->ino
!= p
.first
|| inode
.inode
->version
!= p
.second
.version
)
1209 inode
.get_inode()->nlink
= p
.second
.nlink
;
1210 r
= metadata_driver
->inject_linkage(p
.second
.dirino
, p
.second
.name
, p
.second
.frag
, inode
, first
);
// Re-inject the dentries recorded in injected_inos with first set to
// last_snap + 1.
1215 for (auto &p
: injected_inos
) {
1218 int r
= read_dentry(p
.second
.dirino
, p
.second
.frag
, p
.second
.name
, &inode
, &first
);
1220 derr
<< "Unexpected error reading dentry "
1221 << p
.second
.dirfrag() << "/" << p
.second
.name
1222 << ": " << cpp_strerror(r
) << dendl
;
1226 if (first
!= CEPH_NOSNAP
)
1229 first
= last_snap
+ 1;
1230 r
= metadata_driver
->inject_linkage(p
.second
.dirino
, p
.second
.name
, p
.second
.frag
, inode
, first
);
// For each rank seen, load its InoTable, force-consume up to the highest
// ino in use, and save it.
1235 for (auto& p
: max_ino_map
) {
1236 InoTable
inotable(nullptr);
1237 inotable
.set_rank(p
.first
);
1239 int r
= metadata_driver
->load_table(&inotable
);
1241 inotable
.reset_state();
1244 if (inotable
.force_consume_to(p
.second
))
1247 r
= metadata_driver
->save_table(&inotable
);
// Same for the snap table, which is loaded for rank 0.
1254 SnapServer snaptable
;
1255 snaptable
.set_rank(0);
1257 int r
= metadata_driver
->load_table(&snaptable
);
1259 snaptable
.reset_state();
1262 if (snaptable
.force_update(last_snap
, snaprealm_v2_since
, snaps
))
1265 r
= metadata_driver
->save_table(&snaptable
);
// Rebuild directory linkage from the dirfrag objects in the metadata pool.
//
// After asking the driver whether the root inodes are present (check_roots),
// every object is visited through forall_objects(metadata_io, ...). For each
// object the callback:
//   - parses the object name into an inode number and offset (parse_oid),
//   - skips inode numbers below 1<<40 (logged as system inodes),
//   - reads the "parent" (backtrace) and "layout" xattrs in one operation,
//   - reads the fnode of fragment frag_t() for that inode,
//   - builds a directory dentry (build_dir_dentry) and injects it either at
//     the location named by the backtrace or into lost+found.
// When force_corrupt is set, corrupt backtraces/fnodes are replaced by
// defaults instead of aborting the object.
1273 int DataScan::scan_frags()
1276 int r
= driver
->check_roots(&roots_present
);
1278 derr
<< "Unexpected error checking roots: '"
1279 << cpp_strerror(r
) << "'" << dendl
;
1283 if (!roots_present
) {
// NOTE(review): the hint below names 'scan_inodes' although it is printed
// from scan_frags -- confirm whether the wording is intentional.
1284 std::cerr
<< "Some or all system inodes are absent. Run 'init' from "
1285 "one node before running 'scan_inodes'" << std::endl
;
// Per-object callback; the result of forall_objects is returned directly.
1289 return forall_objects(metadata_io
, true, [this](
1290 std::string
const &oid
,
1291 uint64_t obj_name_ino
,
1292 uint64_t obj_name_offset
) -> int
// The ino/offset arguments are overwritten with values parsed from the name.
1295 r
= parse_oid(oid
, &obj_name_ino
, &obj_name_offset
);
1297 dout(4) << "Bad object name '" << oid
<< "', skipping" << dendl
;
1301 if (obj_name_ino
< (1ULL << 40)) {
1302 // FIXME: we're skipping stray dirs here: if they're
1303 // orphaned then we should be resetting them some other
1305 dout(10) << "Skipping system ino " << obj_name_ino
<< dendl
;
1309 AccumulateResult accum_res
;
1310 inode_backtrace_t backtrace
;
1312 // Default to inherit layout (i.e. no explicit layout on dir) which is
1313 // expressed as a zeroed layout struct (see inode_t::has_layout)
1314 file_layout_t loaded_layout
;
1317 bufferlist parent_bl
;
1319 bufferlist layout_bl
;
// Both xattrs are fetched with a single read operation; each getxattr
// reports its own status through parent_r / layout_r.
1322 librados::ObjectReadOperation op
;
1323 op
.getxattr("parent", &parent_bl
, &parent_r
);
1324 op
.getxattr("layout", &layout_bl
, &layout_r
);
1325 r
= metadata_io
.operate(oid
, &op
, &op_bl
);
1326 if (r
!= 0 && r
!= -ENODATA
) {
// NOTE(review): the condition tests r but the message prints parent_r;
// presumably r was meant -- confirm.
1327 derr
<< "Unexpected error reading backtrace: " << cpp_strerror(parent_r
) << dendl
;
// A missing "parent" xattr (-ENODATA) simply leaves the backtrace empty.
1331 if (parent_r
!= -ENODATA
) {
1333 auto q
= parent_bl
.cbegin();
1334 backtrace
.decode(q
);
1335 } catch (buffer::error
&e
) {
1336 dout(4) << "Corrupt backtrace on '" << oid
<< "': " << e
.what() << dendl
;
1337 if (!force_corrupt
) {
1340 // Treat backtrace as absent: we'll inject into lost+found
1341 backtrace
= inode_backtrace_t();
// Same treatment for the optional "layout" xattr.
1346 if (layout_r
!= -ENODATA
) {
1348 auto q
= layout_bl
.cbegin();
1349 decode(loaded_layout
, q
);
1350 } catch (buffer::error
&e
) {
1351 dout(4) << "Corrupt layout on '" << oid
<< "': " << e
.what() << dendl
;
1352 if (!force_corrupt
) {
// A backtrace without ancestors cannot place the inode anywhere.
1358 bool have_backtrace
= !(backtrace
.ancestors
.empty());
1360 // Sanity checking backtrace ino against object name
1361 if (have_backtrace
&& backtrace
.ino
!= obj_name_ino
) {
1362 dout(4) << "Backtrace ino 0x" << std::hex
<< backtrace
.ino
1363 << " doesn't match object name ino 0x" << obj_name_ino
1364 << std::dec
<< dendl
;
1365 have_backtrace
= false;
1368 uint64_t fnode_version
= 0;
// Directory statistics come from the fnode of the root fragment.
1370 r
= read_fnode(obj_name_ino
, frag_t(), &fnode
, &fnode_version
);
1372 derr
<< "Corrupt fnode on " << oid
<< dendl
;
// With force_corrupt, substitute placeholder stats (one file, no subdirs)
// so the directory is not treated as empty.
1373 if (force_corrupt
) {
1374 fnode
.fragstat
.mtime
= 0;
1375 fnode
.fragstat
.nfiles
= 1;
1376 fnode
.fragstat
.nsubdirs
= 0;
1377 fnode
.accounted_fragstat
= fnode
.fragstat
;
1384 build_dir_dentry(obj_name_ino
, fnode
.accounted_fragstat
,
1385 loaded_layout
, &dentry
);
1387 // Inject inode to the metadata pool
1388 if (have_backtrace
) {
// The last ancestor is the outermost one; an MDS directory there marks
// a stray.
1389 inode_backpointer_t root_bp
= *(backtrace
.ancestors
.rbegin());
1390 if (MDS_INO_IS_MDSDIR(root_bp
.dirino
)) {
1391 /* Special case for strays: even if we have a good backtrace,
1392 * don't put it in the stray dir, because while that would technically
1393 * give it linkage it would still be invisible to the user */
1394 r
= driver
->inject_lost_and_found(obj_name_ino
, dentry
);
1396 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
1397 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
1399 dout(4) << "Use --force-corrupt to overwrite structures that "
1400 "appear to be corrupt" << dendl
;
1404 /* Happy case: we will inject a named dentry for this inode */
1405 r
= driver
->inject_with_backtrace(backtrace
, dentry
);
1407 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
1408 << std::dec
<< " with backtrace: " << cpp_strerror(r
) << dendl
;
1410 dout(4) << "Use --force-corrupt to overwrite structures that "
1411 "appear to be corrupt" << dendl
;
1416 /* Backtrace-less case: we will inject a lost+found dentry */
1417 r
= driver
->inject_lost_and_found(
1418 obj_name_ino
, dentry
);
1420 dout(4) << "Error injecting 0x" << std::hex
<< obj_name_ino
1421 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
1423 dout(4) << "Use --force-corrupt to overwrite structures that "
1424 "appear to be corrupt" << dendl
;
// Load the fnode kept in the omap header of the dirfrag object that
// InodeStore::get_object_name(ino, frag, "") names.
//
// ino / frag    identify the dirfrag object to read
// fnode         out: receives the decoded fnode; must not be NULL (asserted)
// last_version  out: assigned from metadata_io.get_last_version() directly
//               after the header read
//
// Decode failures (buffer::error) are caught here. Callers in this file
// treat -ENOENT as "object missing" and -EINVAL as "undecodable" --
// TODO confirm these are the values produced on those paths.
1433 int MetadataTool::read_fnode(
1434 inodeno_t ino
, frag_t frag
, fnode_t
*fnode
,
1435 uint64_t *last_version
)
1437 ceph_assert(fnode
!= NULL
);
1439 object_t frag_oid
= InodeStore::get_object_name(ino
, frag
, "");
1440 bufferlist fnode_bl
;
1441 int r
= metadata_io
.omap_get_header(frag_oid
.name
, &fnode_bl
);
// Recorded before r is examined, so callers get a version even when the
// read did not succeed.
1442 *last_version
= metadata_io
.get_last_version();
1447 auto old_fnode_iter
= fnode_bl
.cbegin();
1449 (*fnode
).decode(old_fnode_iter
);
1450 } catch (const buffer::error
&err
) {
// Read a single dentry out of the dirfrag object for (parent_ino, frag) and
// decode the inode embedded in it into *inode.
//
// The dentry is fetched with omap_get_vals_by_keys; if the key is absent from
// the returned values that is logged as "not found in result". Only dentry
// types 'I' and 'i' are decoded as inodes; for type 'i' an alternate_name
// field may be decoded as well. Any buffer::error raised while decoding is
// caught and logged.
//
// inode must not be NULL (asserted). Several callers in this file omit the
// dnfirst argument, so the declaration presumably gives it a default --
// TODO confirm in the header.
1457 int MetadataTool::read_dentry(inodeno_t parent_ino
, frag_t frag
,
1458 const std::string
&dname
, InodeStore
*inode
, snapid_t
*dnfirst
)
1460 ceph_assert(inode
!= NULL
);
// The lookup key is built for the head (CEPH_NOSNAP) version of the dentry.
1463 dentry_key_t
dn_key(CEPH_NOSNAP
, dname
.c_str());
1466 std::set
<std::string
> keys
;
1468 std::map
<std::string
, bufferlist
> vals
;
1469 object_t frag_oid
= InodeStore::get_object_name(parent_ino
, frag
, "");
1470 int r
= metadata_io
.omap_get_vals_by_keys(frag_oid
.name
, keys
, &vals
);
1471 dout(20) << "oid=" << frag_oid
.name
1472 << " dname=" << dname
1474 << ", r=" << r
<< dendl
;
// The omap call can succeed without returning the requested key.
1479 if (vals
.find(key
) == vals
.end()) {
1480 dout(20) << key
<< " not found in result" << dendl
;
1485 auto q
= vals
[key
].cbegin();
1489 decode(dentry_type
, q
);
1490 if (dentry_type
== 'I' || dentry_type
== 'i') {
1491 if (dentry_type
== 'i') {
1492 mempool::mds_co::string alternate_name
;
1496 decode(alternate_name
, q
);
1500 inode
->decode_bare(q
);
// NOTE(review): the two adjacent literals below concatenate to
// "...cannotread an inode..." (no space between them) -- confirm and fix
// the message if unintended.
1503 dout(20) << "dentry type '" << dentry_type
<< "': cannot"
1504 "read an inode out of that" << dendl
;
1509 } catch (const buffer::error
&err
) {
1510 dout(20) << "encoding error in dentry 0x" << std::hex
<< parent_ino
1511 << std::dec
<< "/" << dname
<< dendl
;
// Populate an MDSTable from its backing object in the metadata pool.
//
// The object named by table->get_object_name() is read, a version_t is
// decoded from the front of the buffer, the rest is passed to
// table->decode_state(), and the decoded version is then applied through
// table->force_replay_version(). Read failures and decode failures
// (buffer::error) are logged with derr.
1518 int MetadataDriver::load_table(MDSTable
*table
)
1520 object_t table_oid
= table
->get_object_name();
1522 bufferlist table_bl
;
// Length 0 at offset 0: presumably "read the entire object" -- confirm
// against the librados read() contract.
1523 int r
= metadata_io
.read(table_oid
.name
, table_bl
, 0, 0);
1525 derr
<< "unable to read mds table '" << table_oid
.name
<< "': "
1526 << cpp_strerror(r
) << dendl
;
1531 version_t table_ver
;
1532 auto p
= table_bl
.cbegin();
// Stored layout: version first, then the table state.
1533 decode(table_ver
, p
);
1534 table
->decode_state(p
);
1535 table
->force_replay_version(table_ver
);
1536 } catch (const buffer::error
&err
) {
1537 derr
<< "unable to decode mds table '" << table_oid
.name
<< "': "
1538 << err
.what() << dendl
;
// Persist an MDSTable to the metadata pool.
//
// table->get_version() is encoded first, followed by table->encode_state(),
// matching the order in which load_table decodes them. The buffer replaces
// the table object's contents via write_full; a failure is logged with derr.
1544 int MetadataDriver::save_table(MDSTable
*table
)
1546 object_t table_oid
= table
->get_object_name();
1548 bufferlist table_bl
;
1549 encode(table
->get_version(), table_bl
);
1550 table
->encode_state(table_bl
);
1551 int r
= metadata_io
.write_full(table_oid
.name
, table_bl
);
1553 derr
<< "error updating mds table " << table_oid
.name
1554 << ": " << cpp_strerror(r
) << dendl
;
// Link an inode into the filesystem's lost+found directory, creating that
// directory first when needed.
//
// Steps:
//   1. make sure the root dirfrag (CEPH_INO_ROOT, frag_t()) exists;
//   2. read the "lost+found" dentry in the root; if it is missing (-ENOENT)
//      or unreadable (-EINVAL, only tolerated with force_corrupt), build a
//      fresh directory inode for CEPH_INO_LOST_AND_FOUND and link it in;
//   3. make sure the lost+found dirfrag exists;
//   4. write `dentry` into it under the name lost_found_dname(ino).
//
// ino     inode number being recovered; only used to derive the dentry name
// dentry  inode contents to store
// Returns the result of the final inject_linkage call on the success path.
1560 int MetadataDriver::inject_lost_and_found(
1561 inodeno_t ino
, const InodeStore
&dentry
)
1563 // Create lost+found if doesn't exist
1564 bool created
= false;
1565 int r
= find_or_create_dirfrag(CEPH_INO_ROOT
, frag_t(), &created
);
1570 r
= read_dentry(CEPH_INO_ROOT
, frag_t(), "lost+found", &lf_ino
);
1571 if (r
== -ENOENT
|| r
== -EINVAL
) {
// A corrupt dentry is only overwritten when force_corrupt is set.
1572 if (r
== -EINVAL
&& !force_corrupt
) {
1576 // To have a directory not specify a layout, give it zeros (see
1577 // inode_t::has_layout)
1578 file_layout_t inherit_layout
;
1580 // Construct LF inode
1581 frag_info_t fragstat
;
// NOTE(review): the statement below ends in a comma rather than a
// semicolon, so it is joined to the build_dir_dentry call by the comma
// operator. Evaluation order is unchanged, but it looks like a typo.
1582 fragstat
.nfiles
= 1,
1583 build_dir_dentry(CEPH_INO_LOST_AND_FOUND
, fragstat
, inherit_layout
, &lf_ino
);
1585 // Inject link to LF inode in the root dir
1586 r
= inject_linkage(CEPH_INO_ROOT
, "lost+found", frag_t(), lf_ino
);
// NOTE(review): this is a bit test against S_IFDIR, not a file-type
// comparison (S_ISDIR); other file types sharing that bit would pass --
// confirm whether that matters here.
1591 if (!(lf_ino
.inode
->mode
& S_IFDIR
)) {
1592 derr
<< "lost+found exists but is not a directory!" << dendl
;
1593 // In this case we error out, and the user should do something about
1599 r
= find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND
, frag_t(), &created
);
1604 const std::string dname
= lost_found_dname(ino
);
1606 // Write dentry into lost+found dirfrag
1607 return inject_linkage(lf_ino
.inode
->ino
, dname
, frag_t(), dentry
);
// Work out which fragment of directory `dirino` should hold the dentry named
// target_dname, writing the answer to *result_ft.
//
// The fragtree of a directory lives in its own dentry in the parent, so this
// function:
//   1. reads the "parent" xattr (backtrace) from dirino's root dirfrag object
//      to learn the parent inode and the name dirino has there;
//   2. tries to read that dentry from fragment frag_t() of the parent (fast
//      path) and, on success, picks the fragment with pick_dirfrag();
//   3. otherwise recurses to resolve the parent's own fragment and retries
//      the dentry read there.
// A missing or corrupt dentry on the retry is reported as -ENOENT.
1611 int MetadataDriver::get_frag_of(
1613 const std::string
&target_dname
,
1616 object_t root_frag_oid
= InodeStore::get_object_name(dirino
, frag_t(), "");
1618 dout(20) << "dirino=" << dirino
<< " target_dname=" << target_dname
<< dendl
;
1620 // Find and load fragtree if existing dirfrag
1621 // ==========================================
1622 bool have_backtrace
= false;
1623 bufferlist parent_bl
;
1624 int r
= metadata_io
.getxattr(root_frag_oid
.name
, "parent", parent_bl
);
1625 if (r
== -ENODATA
) {
1626 dout(10) << "No backtrace on '" << root_frag_oid
<< "'" << dendl
;
1628 dout(4) << "Unexpected error on '" << root_frag_oid
<< "': "
1629 << cpp_strerror(r
) << dendl
;
1633 // Deserialize backtrace
1634 inode_backtrace_t backtrace
;
1635 if (parent_bl
.length()) {
1637 auto q
= parent_bl
.cbegin();
1638 backtrace
.decode(q
);
1639 have_backtrace
= true;
1640 } catch (buffer::error
&e
) {
1641 dout(4) << "Corrupt backtrace on '" << root_frag_oid
<< "': "
1642 << e
.what() << dendl
;
1646 if (!(have_backtrace
&& backtrace
.ancestors
.size())) {
1647 // Can't work out fragtree without a backtrace
1648 dout(4) << "No backtrace on '" << root_frag_oid
1649 << "': cannot determine fragtree" << dendl
;
1653 // The parentage of dirino
1654 const inode_backpointer_t
&bp
= *(backtrace
.ancestors
.begin());
1656 // The inode of dirino's parent
1657 const inodeno_t parent_ino
= bp
.dirino
;
1659 // The dname of dirino in its parent.
1660 const std::string
&parent_dname
= bp
.dname
;
1662 dout(20) << "got backtrace parent " << parent_ino
<< "/"
1663 << parent_dname
<< dendl
;
1665 // The primary dentry for dirino
1666 InodeStore existing_dentry
;
1668 // See if we can find ourselves in dirfrag zero of the parent: this
1669 // is a fast path that avoids needing to go further up the tree
1670 // if the parent isn't fragmented (worst case we would have to
1671 // go all the way to the root)
1672 r
= read_dentry(parent_ino
, frag_t(), parent_dname
, &existing_dentry
);
1674 // Great, fast path: return the fragtree from here
// The dentry found must actually describe dirino.
1675 if (existing_dentry
.inode
->ino
!= dirino
) {
1676 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
1677 << existing_dentry
.inode
->ino
1678 << " vs expected 0x" << dirino
<< std::dec
<< dendl
;
1681 dout(20) << "fast path, fragtree is "
1682 << existing_dentry
.dirfragtree
<< dendl
;
1683 *result_ft
= existing_dentry
.pick_dirfrag(target_dname
);
1684 dout(20) << "frag is " << *result_ft
<< dendl
;
// NOTE(review): the comment inside this branch describes the "dentry not
// present" case, which suggests r == -ENOENT was intended; as written the
// branch is taken for every error except -ENOENT, and the later
// `r == -EINVAL` arm can never be reached. Confirm the intended condition
// before changing it.
1686 } else if (r
!= -ENOENT
) {
1687 // Dentry not present in 0th frag, must read parent's fragtree
1689 r
= get_frag_of(parent_ino
, parent_dname
, &parent_frag
);
1691 // We have the parent fragtree, so try again to load our dentry
1692 r
= read_dentry(parent_ino
, parent_frag
, parent_dname
, &existing_dentry
);
1695 *result_ft
= existing_dentry
.pick_dirfrag(target_dname
);
1696 dout(20) << "resolved via parent, frag is " << *result_ft
<< dendl
;
1699 if (r
== -EINVAL
|| r
== -ENOENT
) {
1700 return -ENOENT
; // dentry missing or corrupt, so frag is missing
1706 // Couldn't resolve parent fragtree, so can't find ours.
1709 } else if (r
== -EINVAL
) {
1710 // Unreadable dentry, can't know the fragtree.
1713 // Unexpected error, raise it
// Link an inode into the metadata pool at the location its backtrace names,
// creating ancestor dirfrags and ancestor dentries along the way.
//
// The ancestors are walked starting from the inode's immediate parent. For
// each one the target fragment is resolved with get_frag_of (falling back to
// the root fragment when unknown), the dirfrag is found or created, and the
// existing dentry is examined:
//   - missing, or corrupt with force_corrupt set: a dentry is written. For
//     the first ancestor this is `dentry` itself; for the others it is a
//     synthetic directory inode;
//   - present and pointing at the expected inode: nothing is written;
//   - present but pointing elsewhere: the inode is diverted to lost+found.
// Traversal stops at the first ancestor whose dirfrag already existed.
//
// backtrace  ancestry of the inode; backtrace.ino is the inode number
// dentry     inode contents to store at the first ancestor
1719 int MetadataDriver::inject_with_backtrace(
1720 const inode_backtrace_t
&backtrace
, const InodeStore
&dentry
)
1726 // In order to insert something into a directory, we first (ideally)
1727 // need to know the fragtree for the directory. Sometimes we can't
1728 // get that, in which case we just go ahead and insert it into
1729 // fragment zero for a good chance of that being the right thing
1730 // anyway (most moderate-sized dirs aren't fragmented!)
1734 // My immediate ancestry should be correct, so if we can find that
1735 // directory's dirfrag then go inject it there. This works well
1736 // in the case that this inode's dentry was somehow lost and we
1737 // are recreating it, because the rest of the hierarchy
1738 // will probably still exist.
1740 // It's more of a "better than nothing" approach when rebuilding
1741 // a whole tree, as backtraces will in general not be up to date
1742 // beyond the first parent, if anything in the trace was ever
1743 // moved after the file was created.
1747 // The backtrace tells us inodes for each of the parents. If we are
1748 // creating those parent dirfrags, then there is a risk that somehow
1749 // the inode indicated here was also used for data (not a dirfrag) at
1750 // some stage. That would be a zany situation, and we don't check
1751 // for it here, because to do so would require extra IOs for everything
1752 // we inject, and anyway wouldn't guarantee that the inode number
1753 // wasn't in use in some dentry elsewhere in the metadata tree that
1754 // just happened not to have any data objects.
1756 // On multiple workers touching the same traces
1757 // ============================================
1758 // When creating linkage for a directory, *only* create it if we are
1759 // also creating the object. That way, we might not manage to get the
1760 // *right* linkage for a directory, but at least we won't multiply link
1761 // it. We assume that if a root dirfrag exists for a directory, then
1762 // it is linked somewhere (i.e. that the metadata pool is not already
1765 // Making sure *that* is true is someone else's job! Probably someone
1766 // who is not going to run in parallel, so that they can self-consistently
1767 // look at versions and move things around as they go.
1768 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1769 // object, next run will fail to create linkage for the dirfrag object
1770 // and leave it orphaned.
1772 inodeno_t ino
= backtrace
.ino
;
1773 dout(10) << " inode: 0x" << std::hex
<< ino
<< std::dec
<< dendl
;
1774 for (std::vector
<inode_backpointer_t
>::const_iterator i
= backtrace
.ancestors
.begin();
1775 i
!= backtrace
.ancestors
.end(); ++i
) {
1776 const inode_backpointer_t
&backptr
= *i
;
1777 dout(10) << " backptr: 0x" << std::hex
<< backptr
.dirino
<< std::dec
1778 << "/" << backptr
.dname
<< dendl
;
1780 // Examine root dirfrag for parent
1781 const inodeno_t parent_ino
= backptr
.dirino
;
1782 const std::string dname
= backptr
.dname
;
1785 int r
= get_frag_of(parent_ino
, dname
, &fragment
);
1787 // Don't know fragment, fall back to assuming root
1788 dout(20) << "don't know fragment for 0x" << std::hex
<<
1789 parent_ino
<< std::dec
<< "/" << dname
<< ", will insert to root"
1793 // Find or create dirfrag
1794 // ======================
1795 bool created_dirfrag
;
1796 r
= find_or_create_dirfrag(parent_ino
, fragment
, &created_dirfrag
);
1801 // Check if dentry already exists
1802 // ==============================
1803 InodeStore existing_dentry
;
1804 r
= read_dentry(parent_ino
, fragment
, dname
, &existing_dentry
);
1805 bool write_dentry
= false;
1806 if (r
== -ENOENT
|| r
== -EINVAL
) {
// A corrupt dentry is only replaced when force_corrupt is set.
1807 if (r
== -EINVAL
&& !force_corrupt
) {
1810 // Missing or corrupt dentry
1811 write_dentry
= true;
1813 derr
<< "Unexpected error reading dentry 0x" << std::hex
1814 << parent_ino
<< std::dec
<< "/"
1815 << dname
<< ": " << cpp_strerror(r
) << dendl
;
1818 // Dentry already present, does it link to me?
1819 if (existing_dentry
.inode
->ino
== ino
) {
1820 dout(20) << "Dentry 0x" << std::hex
1821 << parent_ino
<< std::dec
<< "/"
1822 << dname
<< " already exists and points to me" << dendl
;
1824 derr
<< "Dentry 0x" << std::hex
1825 << parent_ino
<< std::dec
<< "/"
1826 << dname
<< " already exists but points to 0x"
1827 << std::hex
<< existing_dentry
.inode
->ino
<< std::dec
<< dendl
;
1828 // Fall back to lost+found!
1829 return inject_lost_and_found(backtrace
.ino
, dentry
);
// The first ancestor is the inode's own parent: that is where the
// caller-supplied dentry goes. Later ancestors get a synthetic directory.
1837 if (i
== backtrace
.ancestors
.begin()) {
1838 // This is the linkage for the file of interest
1839 dout(10) << "Linking inode 0x" << std::hex
<< ino
1840 << " at 0x" << parent_ino
<< "/" << dname
<< std::dec
1841 << " with size=" << dentry
.inode
->size
<< " bytes" << dendl
;
1843 r
= inject_linkage(parent_ino
, dname
, fragment
, dentry
);
1845 // This is the linkage for an ancestor directory
1846 InodeStore ancestor_dentry
;
1847 auto inode
= ancestor_dentry
.get_inode();
1848 inode
->mode
= 0755 | S_IFDIR
;
1850 // Set nfiles to something non-zero, to fool any other code
1851 // that tries to ignore 'empty' directories. This won't be
1852 // accurate, but it should avoid functional issues.
1854 inode
->dirstat
.nfiles
= 1;
1855 inode
->dir_layout
.dl_dir_hash
=
1856 g_conf()->mds_default_dir_hash
;
1860 inode
->uid
= g_conf()->mds_root_ino_uid
;
1861 inode
->gid
= g_conf()->mds_root_ino_gid
;
1863 inode
->backtrace_version
= 1;
1864 r
= inject_linkage(parent_ino
, dname
, fragment
, ancestor_dentry
);
1872 if (!created_dirfrag
) {
1873 // If the parent dirfrag already existed, then stop traversing the
1874 // backtrace: assume that the other ancestors already exist too. This
1875 // is an assumption rather than a truth, but it's a convenient way
1876 // to avoid the risk of creating multiply-linked directories while
1877 // injecting data. If there are in fact missing ancestors, this
1878 // should be fixed up using a separate tool scanning the metadata
1882 // Proceed up the backtrace, creating parents
// Ensure the dirfrag object for (ino, fragment) exists with a decodable
// fnode header, creating it if necessary.
//
// The fnode is read first. If the object is missing (-ENOENT) or its header
// cannot be decoded (-EINVAL, only tolerated with force_corrupt), a blank
// fnode is written: version 1, nfiles = 1 so the directory is not treated as
// empty, and the DAMAGE_STATS | DAMAGE_RSTATS flags set. The write is guarded
// against concurrent writers (see cases A and B below); losing such a race
// (-EOVERFLOW / -EEXIST) is logged as a creation race rather than as a
// failure.
//
// created  out, must not be NULL (asserted). The comments below describe it
//          as reporting whether this call created the object.
1890 int MetadataDriver::find_or_create_dirfrag(
1895 ceph_assert(created
!= NULL
);
1897 fnode_t existing_fnode
;
1900 uint64_t read_version
= 0;
1901 int r
= read_fnode(ino
, fragment
, &existing_fnode
, &read_version
);
1902 dout(10) << "read_version = " << read_version
<< dendl
;
1904 if (r
== -ENOENT
|| r
== -EINVAL
) {
1905 if (r
== -EINVAL
&& !force_corrupt
) {
1909 // Missing or corrupt fnode, create afresh
1910 bufferlist fnode_bl
;
1911 fnode_t blank_fnode
;
1912 blank_fnode
.version
= 1;
1913 // mark it as non-empty
1914 blank_fnode
.fragstat
.nfiles
= 1;
1915 blank_fnode
.accounted_fragstat
= blank_fnode
.fragstat
;
// Flag the placeholder statistics as damaged.
1916 blank_fnode
.damage_flags
|= (DAMAGE_STATS
| DAMAGE_RSTATS
);
1917 blank_fnode
.encode(fnode_bl
);
1920 librados::ObjectWriteOperation op
;
1923 ceph_assert(r
== -EINVAL
);
1924 // Case A: We must assert that the version isn't changed since we saw the object
1925 // was unreadable, to avoid the possibility of two data-scan processes
1926 // both creating the frag.
1927 op
.assert_version(read_version
);
1929 ceph_assert(r
== -ENOENT
);
1930 // Case B: The object didn't exist in read_fnode, so while creating it we must
1931 // use an exclusive create to correctly populate *created with
1932 // whether we created it ourselves or someone beat us to it.
1936 object_t frag_oid
= InodeStore::get_object_name(ino
, fragment
, "");
1937 op
.omap_set_header(fnode_bl
);
1938 r
= metadata_io
.operate(frag_oid
.name
, &op
);
// Presumably -EOVERFLOW comes from the version assertion and -EEXIST from
// the exclusive create -- confirm against the librados documentation.
1939 if (r
== -EOVERFLOW
|| r
== -EEXIST
) {
1940 // Someone else wrote it (see case A above)
1941 dout(10) << "Dirfrag creation race: 0x" << std::hex
1942 << ino
<< " " << fragment
<< std::dec
<< dendl
;
1946 // We were unable to create or write it, error out
1947 derr
<< "Failed to create dirfrag 0x" << std::hex
1948 << ino
<< std::dec
<< ": " << cpp_strerror(r
) << dendl
;
1951 // Success: the dirfrag object now exists with a value header
1952 dout(10) << "Created dirfrag: 0x" << std::hex
1953 << ino
<< std::dec
<< dendl
;
1957 derr
<< "Unexpected error reading dirfrag 0x" << std::hex
1958 << ino
<< std::dec
<< " : " << cpp_strerror(r
) << dendl
;
1961 dout(20) << "Dirfrag already exists: 0x" << std::hex
1962 << ino
<< " " << fragment
<< std::dec
<< dendl
;
// Write (or overwrite) one primary dentry in a dirfrag object.
//
// The omap value is encoded as: dnfirst, the type character 'I', then the
// bare-encoded inode. It is stored with omap_set on the object for
// (dir_ino, fragment).
//
// dir_ino   directory that receives the dentry
// dname     dentry name; the key is built for the CEPH_NOSNAP version
// fragment  fragment of dir_ino to write into
// inode     inode contents to embed
// dnfirst   first snapid recorded for the dentry
// A write failure is logged with derr; success is logged at debug level 20.
1968 int MetadataDriver::inject_linkage(
1969 inodeno_t dir_ino
, const std::string
&dname
,
1970 const frag_t fragment
, const InodeStore
&inode
, const snapid_t dnfirst
)
1972 object_t frag_oid
= InodeStore::get_object_name(dir_ino
, fragment
, "");
1975 dentry_key_t
dn_key(CEPH_NOSNAP
, dname
.c_str());
1978 bufferlist dentry_bl
;
1979 encode(dnfirst
, dentry_bl
);
// 'I' is one of the two dentry types read_dentry decodes as an inode.
1980 encode('I', dentry_bl
);
1981 inode
.encode_bare(dentry_bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1984 std::map
<std::string
, bufferlist
> vals
;
1985 vals
[key
] = dentry_bl
;
1986 int r
= metadata_io
.omap_set(frag_oid
.name
, vals
);
1988 derr
<< "Error writing dentry 0x" << std::hex
1989 << dir_ino
<< std::dec
<< "/"
1990 << dname
<< ": " << cpp_strerror(r
) << dendl
;
1993 dout(20) << "Injected dentry 0x" << std::hex
1994 << dir_ino
<< "/" << dname
<< " pointing to 0x"
1995 << inode
.inode
->ino
<< std::dec
<< dendl
;
// Open an IoCtx (metadata_io) on the filesystem's metadata pool.
//
// rados               cluster handle used for the pool lookup and ioctx
// metadata_pool_name  in/out: if empty, it is filled in by resolving the
//                     metadata pool id found in the MDS map of `fscid`;
//                     otherwise the supplied name is used as-is ("forcing")
// fsmap / fscid       identify the filesystem; the filesystem must exist
//                     (asserted) when the name has to be resolved
// Returns the result of rados.ioctx_create() on the success path.
2001 int MetadataDriver::init(
2002 librados::Rados
&rados
, std::string
&metadata_pool_name
, const FSMap
*fsmap
,
2003 fs_cluster_id_t fscid
)
2005 if (metadata_pool_name
.empty()) {
2006 auto fs
= fsmap
->get_filesystem(fscid
);
2007 ceph_assert(fs
!= nullptr);
2008 int64_t const metadata_pool_id
= fs
->mds_map
.get_metadata_pool();
2010 dout(4) << "resolving metadata pool " << metadata_pool_id
<< dendl
;
// Translate the pool id from the MDS map into a pool name.
2011 int r
= rados
.pool_reverse_lookup(metadata_pool_id
, &metadata_pool_name
);
2013 derr
<< "Pool " << metadata_pool_id
2014 << " identified in MDS map not found in RADOS!" << dendl
;
2017 dout(4) << "found metadata pool '" << metadata_pool_name
<< "'" << dendl
;
2019 dout(4) << "forcing metadata pool '" << metadata_pool_name
<< "'" << dendl
;
2021 return rados
.ioctx_create(metadata_pool_name
.c_str(), metadata_io
);
// Driver initialisation hook for the local-filesystem driver. It takes the
// same arguments as MetadataDriver::init.
// NOTE(review): how (or whether) the arguments are used should be confirmed
// against the function body.
2024 int LocalFileDriver::init(
2025 librados::Rados
&rados
, std::string
&metadata_pool_name
, const FSMap
*fsmap
,
2026 fs_cluster_id_t fscid
)
// Copy a file's contents out of the data pool into a local file.
//
// The output file at file_path is opened for binary writing. The byte range
// [0, size) is then walked in steps of chunk_size; for each step an object
// name is formatted from `ino` and the chunk index (offset / chunk_size) and
// up to chunk_size bytes are read from that object at offset 0.
//
// NOTE(review): if chunk_size is 0 while size > 0 the loop offset never
// advances -- confirm callers never pass a zero object size.
2031 int LocalFileDriver::inject_data(
2032 const std::string
&file_path
,
2034 uint32_t chunk_size
,
2037 // Scrape the file contents out of the data pool and into the
2040 f
.open(file_path
.c_str(), std::fstream::out
| std::fstream::binary
);
2042 for (uint64_t offset
= 0; offset
< size
; offset
+= chunk_size
) {
2046 snprintf(buf
, sizeof(buf
),
2048 (unsigned long long)ino
,
2049 (unsigned long long)(offset
/ chunk_size
));
2050 std::string
oid(buf
);
2052 int r
= data_io
.read(oid
, bl
, chunk_size
, 0);
// A missing object (-ENOENT) is tolerated.
// NOTE(review): `r <= 0` also routes a zero-byte read (r == 0) into the
// error branch -- confirm that is intended.
2054 if (r
<= 0 && r
!= -ENOENT
) {
2055 derr
<< "error reading data object '" << oid
<< "': "
2056 << cpp_strerror(r
) << dendl
;
2071 int LocalFileDriver::inject_with_backtrace(
2072 const inode_backtrace_t
&bt
,
2073 const InodeStore
&dentry
)
2075 std::string path_builder
= path
;
2077 // Iterate through backtrace creating directory parents
2078 std::vector
<inode_backpointer_t
>::const_reverse_iterator i
;
2079 for (i
= bt
.ancestors
.rbegin();
2080 i
!= bt
.ancestors
.rend(); ++i
) {
2082 const inode_backpointer_t
&backptr
= *i
;
2083 path_builder
+= "/";
2084 path_builder
+= backptr
.dname
;
2086 // Last entry is the filename itself
2087 bool is_file
= (i
+ 1 == bt
.ancestors
.rend());
2089 // FIXME: inject_data won't cope with interesting (i.e. striped)
2090 // layouts (need a librados-compatible Filer to read these)
2091 inject_data(path_builder
, dentry
.inode
->size
,
2092 dentry
.inode
->layout
.object_size
, bt
.ino
);
2094 int r
= mkdir(path_builder
.c_str(), 0755);
2095 if (r
!= 0 && r
!= -EPERM
) {
2096 derr
<< "error creating directory: '" << path_builder
<< "': "
2097 << cpp_strerror(r
) << dendl
;
2106 int LocalFileDriver::inject_lost_and_found(
2108 const InodeStore
&dentry
)
2110 std::string lf_path
= path
+ "/lost+found";
2111 int r
= mkdir(lf_path
.c_str(), 0755);
2112 if (r
!= 0 && r
!= -EPERM
) {
2113 derr
<< "error creating directory: '" << lf_path
<< "': "
2114 << cpp_strerror(r
) << dendl
;
2118 std::string file_path
= lf_path
+ "/" + lost_found_dname(ino
);
2119 return inject_data(file_path
, dentry
.inode
->size
,
2120 dentry
.inode
->layout
.object_size
, ino
);
// Make sure the local output directory `path` exists. check_roots() is
// consulted first; the visible creation path is a ::mkdir of `path` with
// mode 0755, whose result is returned directly.
// data_pool_id is not referenced in the statements shown here.
2123 int LocalFileDriver::init_roots(int64_t data_pool_id
)
2125 // Ensure that the path exists and is a directory
2127 int r
= check_roots(&exists
);
// NOTE(review): ::mkdir returns -1 with the reason in errno, whereas other
// drivers in this file return negative errno values -- confirm what callers
// expect.
2135 return ::mkdir(path
.c_str(), 0755);
// Report through *result whether the local output directory `path` exists
// and is a directory. The probe is an opendir() on `path` followed by a
// closedir().
2139 int LocalFileDriver::check_roots(bool *result
)
2141 // Check if the path exists and is a directory
2142 DIR *d
= ::opendir(path
.c_str());
2146 int r
= closedir(d
);
2148 // Weird, but maybe possible with e.g. stale FD on NFS mount?
// Fill in *out with a synthetic inode for a recovered file or symlink.
//
// ino         inode number of the recovered file
// file_size   used for both size and max_size_ever
// file_mtime  seconds value applied to mtime, atime and ctime alike
// layout      file layout to record
// out         receives the inode; must not be NULL (asserted)
// symlink     when non-empty the inode becomes a symlink (mode 0777|S_IFLNK)
//             with this target; otherwise a regular file (mode 0500|S_IFREG)
//
// Ownership comes from the mds_root_ino_uid / mds_root_ino_gid config values.
2158 void MetadataTool::build_file_dentry(
2159 inodeno_t ino
, uint64_t file_size
, time_t file_mtime
,
2160 const file_layout_t
&layout
, InodeStore
*out
, std::string symlink
)
2162 ceph_assert(out
!= NULL
);
2164 auto inode
= out
->get_inode();
2165 if(!symlink
.empty()) {
2166 inode
->mode
= 0777 | S_IFLNK
;
2167 out
->symlink
= symlink
;
2170 inode
->mode
= 0500 | S_IFREG
;
2173 inode
->size
= file_size
;
2174 inode
->max_size_ever
= file_size
;
// Only one recovered timestamp is available, so it is used for all three.
2175 inode
->mtime
.tv
.tv_sec
= file_mtime
;
2176 inode
->atime
.tv
.tv_sec
= file_mtime
;
2177 inode
->ctime
.tv
.tv_sec
= file_mtime
;
2179 inode
->layout
= layout
;
2181 inode
->truncate_seq
= 1;
2182 inode
->truncate_size
= -1ull;
2184 inode
->inline_data
.version
= CEPH_INLINE_NONE
;
2189 inode
->backtrace_version
= 1;
2190 inode
->uid
= g_conf()->mds_root_ino_uid
;
2191 inode
->gid
= g_conf()->mds_root_ino_gid
;
// Fill in *out with a synthetic inode for a recovered directory.
//
// ino       inode number of the recovered directory
// fragstat  copied into dirstat; its mtime is applied to mtime, atime and
//           ctime alike
// layout    layout to record (callers in this file pass a zeroed layout to
//           mean "inherit")
// out       receives the inode; must not be NULL (asserted)
//
// The mode is 0755|S_IFDIR, the directory hash comes from
// mds_default_dir_hash, and ownership from mds_root_ino_uid /
// mds_root_ino_gid.
2194 void MetadataTool::build_dir_dentry(
2195 inodeno_t ino
, const frag_info_t
&fragstat
,
2196 const file_layout_t
&layout
, InodeStore
*out
)
2198 ceph_assert(out
!= NULL
);
2200 auto inode
= out
->get_inode();
2201 inode
->mode
= 0755 | S_IFDIR
;
2202 inode
->dirstat
= fragstat
;
2203 inode
->mtime
.tv
.tv_sec
= fragstat
.mtime
;
2204 inode
->atime
.tv
.tv_sec
= fragstat
.mtime
;
2205 inode
->ctime
.tv
.tv_sec
= fragstat
.mtime
;
2207 inode
->layout
= layout
;
2208 inode
->dir_layout
.dl_dir_hash
= g_conf()->mds_default_dir_hash
;
2210 inode
->truncate_seq
= 1;
2211 inode
->truncate_size
= -1ull;
2213 inode
->inline_data
.version
= CEPH_INLINE_NONE
;
2218 inode
->backtrace_version
= 1;
2219 inode
->uid
= g_conf()->mds_root_ino_uid
;
2220 inode
->gid
= g_conf()->mds_root_ino_gid
;