1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2015 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/compat.h"
16 #include "common/errno.h"
17 #include "common/ceph_argparse.h"
19 #include "include/util.h"
21 #include "mds/CInode.h"
22 #include "cls/cephfs/cls_cephfs_client.h"
26 #include "include/compat.h"
28 #define dout_context g_ceph_context
29 #define dout_subsys ceph_subsys_mds
31 #define dout_prefix *_dout << "datascan." << __func__ << ": "
// Print the cephfs-data-scan command synopsis and option summary to
// std::cout, then call generic_client_usage() for the common client options.
// NOTE(review): some lines of this definition are absent from this view;
// only the visible help-text lines are reproduced here.
33 void DataScan::usage()
35 std::cout
<< "Usage: \n"
36 << " cephfs-data-scan init [--force-init]\n"
37 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
38 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
39 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
40 << " cephfs-data-scan scan_links\n"
42 << " --force-corrupt: overwrite apparently corrupt structures\n"
43 << " --force-init: write root inodes even if they exist\n"
44 << " --force-pool: use data pool even if it is not in FSMap\n"
45 << " --worker_m: Maximum number of workers\n"
46 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
48 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
49 << " cephfs-data-scan cleanup <data pool name>\n"
// Append the option help shared by generic Ceph client tools.
52 generic_client_usage();
// Try to interpret *i as a "--key value" option, taking the value from the
// element that follows it in args (*(i + 1)).
// Keys handled in the visible code: --output-dir, --worker_n, --worker_m,
// --filter-tag, --filesystem, --alternate-pool.
// NOTE(review): part of the parameter list and every return statement of this
// definition are absent from this view, so the exact return contract is not
// documented here. DataScan::main() treats a true result as "a key/value pair
// was consumed" and passes &r as a third argument.
55 bool DataScan::parse_kwarg(
56 const std::vector
<const char*> &args
,
57 std::vector
<const char *>::const_iterator
&i
,
// Tests whether *i is the last argument, i.e. no value follows it.
60 if (i
+ 1 == args
.end()) {
64 const std::string
arg(*i
);
65 const std::string
val(*(i
+ 1));
// --output-dir <path>: select a LocalFileDriver writing to <path>.
67 if (arg
== std::string("--output-dir")) {
69 derr
<< "Unexpected --output-dir: output already selected!" << dendl
;
73 dout(4) << "Using local file output to '" << val
<< "'" << dendl
;
74 driver
= new LocalFileDriver(val
, data_io
);
// --worker_n <N>: parsed as a base-10 integer into n.
76 } else if (arg
== std::string("--worker_n")) {
78 n
= strict_strtoll(val
.c_str(), 10, &err
);
80 std::cerr
<< "Invalid worker number '" << val
<< "'" << std::endl
;
// --worker_m <M>: parsed as a base-10 integer into m.
85 } else if (arg
== std::string("--worker_m")) {
87 m
= strict_strtoll(val
.c_str(), 10, &err
);
89 std::cerr
<< "Invalid worker count '" << val
<< "'" << std::endl
;
// --filter-tag <tag>
94 } else if (arg
== std::string("--filter-tag")) {
96 dout(10) << "Applying tag filter: '" << filter_tag
<< "'" << dendl
;
// --filesystem <name or id>: resolved through fsmap->parse_filesystem(); the
// result code is stored in *r.
98 } else if (arg
== std::string("--filesystem")) {
99 std::shared_ptr
<const Filesystem
> fs
;
100 *r
= fsmap
->parse_filesystem(val
, &fs
);
102 std::cerr
<< "Invalid filesystem '" << val
<< "'" << std::endl
;
// --alternate-pool <pool>: overrides metadata_pool_name.
107 } else if (arg
== std::string("--alternate-pool")) {
108 metadata_pool_name
= val
;
// Try to interpret *i as a stand-alone flag. The flags compared against are
// --force-pool, --force-corrupt and --force-init.
// NOTE(review): only the --force-corrupt assignment is visible in this view;
// the other assignments and all return statements are absent, so the return
// contract is not documented here.
115 bool DataScan::parse_arg(
116 const std::vector
<const char*> &args
,
117 std::vector
<const char *>::const_iterator
&i
)
119 const std::string
arg(*i
);
120 if (arg
== "--force-pool") {
123 } else if (arg
== "--force-corrupt") {
124 force_corrupt
= true;
126 } else if (arg
== "--force-init") {
// Entry point of the tool: args[0] is the command name and the remaining
// elements are options and positional arguments.
// Visible flow:
//   1. initialise librados from g_ceph_context;
//   2. walk the arguments through parse_kwarg() / parse_arg(), then handle the
//      trailing positional argument and the pg_files arguments;
//   3. pick a filesystem (fscid) when none was given and exactly one exists;
//   4. default the output driver to MetadataDriver and initialise it;
//   5. open the data pool and/or metadata pool as the command requires;
//   6. dispatch on the command name.
// NOTE(review): many lines of this definition (error returns, closing braces,
// some calls) are absent from this view; comments cover only what is shown.
134 int DataScan::main(const std::vector
<const char*> &args
)
// The command itself is args[0], so an empty list cannot be dispatched.
138 if (args
.size() < 1) {
143 // Common RADOS init: open metadata pool
144 // =====================================
145 librados::Rados rados
;
146 int r
= rados
.init_with_context(g_ceph_context
);
148 derr
<< "RADOS unavailable" << dendl
;
152 std::string
const &command
= args
[0];
153 std::string data_pool_name
;
155 std::string pg_files_path
;
156 std::set
<pg_t
> pg_files_pgs
;
158 // Consume any known --key val or --flag arguments
159 for (std::vector
<const char *>::const_iterator i
= args
.begin() + 1;
160 i
!= args
.end(); ++i
) {
161 if (parse_kwarg(args
, i
, &r
)) {
162 // Skip the kwarg value field
169 if (parse_arg(args
, i
)) {
173 // Trailing positional argument
// Only scan_inodes, scan_extents and cleanup accept a final positional
// argument (the data pool name, per usage()).
174 if (i
+ 1 == args
.end() &&
175 (command
== "scan_inodes"
176 || command
== "scan_extents"
177 || command
== "cleanup")) {
// pg_files: the first argument after the command is tested for separately
// (usage() lists it as <path>); the following ones are parsed as PG ids.
182 if (command
== "pg_files") {
183 if (i
== args
.begin() + 1) {
188 bool parsed
= pg
.parse(*i
);
190 std::cerr
<< "Invalid PG '" << *i
<< "'" << std::endl
;
193 pg_files_pgs
.insert(pg
);
200 // Fall through: unhandled
201 std::cerr
<< "Unknown argument '" << *i
<< "'" << std::endl
;
205 // If caller didn't specify a namespace, try to pick
206 // one if only one exists
207 if (fscid
== FS_CLUSTER_ID_NONE
) {
208 if (fsmap
->filesystem_count() == 1) {
209 fscid
= fsmap
->get_filesystem()->fscid
;
211 std::cerr
<< "Specify a filesystem with --filesystem" << std::endl
;
215 auto fs
= fsmap
->get_filesystem(fscid
);
216 assert(fs
!= nullptr);
218 // Default to output to metadata pool
219 if (driver
== NULL
) {
220 driver
= new MetadataDriver();
221 driver
->set_force_corrupt(force_corrupt
);
222 driver
->set_force_init(force_init
);
223 dout(4) << "Using metadata pool output" << dendl
;
226 dout(4) << "connecting to RADOS..." << dendl
;
229 std::cerr
<< "couldn't connect to cluster: " << cpp_strerror(r
)
234 r
= driver
->init(rados
, metadata_pool_name
, fsmap
, fscid
);
// pg_files is handled by a PgFiles helper and returns directly.
239 if (command
== "pg_files") {
240 auto pge
= PgFiles(objecter
, pg_files_pgs
);
242 return pge
.scan_path(pg_files_path
);
245 // Initialize data_io for those commands that need it
246 if (command
== "scan_inodes" ||
247 command
== "scan_extents" ||
248 command
== "cleanup") {
249 if (data_pool_name
.empty()) {
250 std::cerr
<< "Data pool not specified" << std::endl
;
// pool_lookup() yields a negative value when the name cannot be resolved.
255 data_pool_id
= rados
.pool_lookup(data_pool_name
.c_str());
256 if (data_pool_id
< 0) {
257 std::cerr
<< "Data pool '" << data_pool_name
<< "' not found!" << std::endl
;
260 dout(4) << "data pool '" << data_pool_name
261 << "' has ID " << data_pool_id
<< dendl
;
// Warn when the pool is not registered as a data pool of the filesystem.
264 if (!fs
->mds_map
.is_data_pool(data_pool_id
)) {
265 std::cerr
<< "Warning: pool '" << data_pool_name
<< "' is not a "
266 "CephFS data pool!" << std::endl
;
268 std::cerr
<< "Use --force-pool to continue" << std::endl
;
273 dout(4) << "opening data pool '" << data_pool_name
<< "'" << dendl
;
274 r
= rados
.ioctx_create(data_pool_name
.c_str(), data_io
);
280 // Initialize metadata_io from MDSMap for scan_frags
281 if (command
== "scan_frags" || command
== "scan_links") {
282 const auto fs
= fsmap
->get_filesystem(fscid
);
284 std::cerr
<< "Filesystem id " << fscid
<< " does not exist" << std::endl
;
287 int64_t const metadata_pool_id
= fs
->mds_map
.get_metadata_pool();
289 dout(4) << "resolving metadata pool " << metadata_pool_id
<< dendl
;
290 int r
= rados
.pool_reverse_lookup(metadata_pool_id
, &metadata_pool_name
);
292 std::cerr
<< "Pool " << metadata_pool_id
293 << " identified in MDS map not found in RADOS!" << std::endl
;
297 r
= rados
.ioctx_create(metadata_pool_name
.c_str(), metadata_io
);
303 // Finally, dispatch command
304 if (command
== "scan_inodes") {
305 return scan_inodes();
306 } else if (command
== "scan_extents") {
307 return scan_extents();
308 } else if (command
== "scan_frags") {
310 } else if (command
== "scan_links") {
312 } else if (command
== "cleanup") {
314 } else if (command
== "init") {
// init: create the root inodes, using the filesystem's first data pool.
315 return driver
->init_roots(fs
->mds_map
.get_first_data_pool());
317 std::cerr
<< "Unknown command '" << command
<< "'" << std::endl
;
// Write a bare inode object for `inono` through metadata_io.
// The object name comes from
// InodeStore::get_object_name(inono, frag_t(), ".inode"). When root_exists()
// reports the object as present and force_init is not set, a notice is
// printed to std::cerr. Otherwise an inode is populated with the fixed values
// below, encoded after the CEPH_FS_ONDISK_MAGIC string, and written with
// write_full().
//   inono        - inode number to create
//   mode         - OR-ed with 0500 to form the inode mode
//   data_pool_id - stored as the inode's layout.pool_id
// NOTE(review): return statements and some declarations are absent from this
// view, so return values are not documented here.
322 int MetadataDriver::inject_unlinked_inode(
323 inodeno_t inono
, int mode
, int64_t data_pool_id
)
325 const object_t oid
= InodeStore::get_object_name(inono
, frag_t(), ".inode");
328 bool already_exists
= false;
329 int r
= root_exists(inono
, &already_exists
);
333 if (already_exists
&& !force_init
) {
334 std::cerr
<< "Inode 0x" << std::hex
<< inono
<< std::dec
<< " already"
335 " exists, skipping create. Use --force-init to overwrite"
336 " the existing object." << std::endl
;
342 inode
.inode
.ino
= inono
;
343 inode
.inode
.version
= 1;
344 inode
.inode
.xattr_version
= 1;
345 inode
.inode
.mode
= 0500 | mode
;
346 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
347 // (we won't actually give the *correct* dirstat here though)
348 inode
.inode
.dirstat
.nfiles
= 1;
351 inode
.inode
.mtime
= ceph_clock_now();
352 inode
.inode
.nlink
= 1;
353 inode
.inode
.truncate_size
= -1ull;
354 inode
.inode
.truncate_seq
= 1;
355 inode
.inode
.uid
= g_conf
->mds_root_ino_uid
;
356 inode
.inode
.gid
= g_conf
->mds_root_ino_gid
;
358 // Force layout to default: should we let users override this so that
359 // they don't have to mount the filesystem to correct it?
360 inode
.inode
.layout
= file_layout_t::get_default();
361 inode
.inode
.layout
.pool_id
= data_pool_id
;
362 inode
.inode
.dir_layout
.dl_dir_hash
= g_conf
->mds_default_dir_hash
;
364 // Assume that we will get our stats wrong, and that we may
365 // be ignoring dirfrags that exist
366 inode
.damage_flags
|= (DAMAGE_STATS
| DAMAGE_RSTATS
| DAMAGE_FRAGTREE
);
// On-disk form: the magic string first, then the encoded inode.
370 ::encode(std::string(CEPH_FS_ONDISK_MAGIC
), inode_bl
);
371 inode
.encode(inode_bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
374 r
= metadata_io
.write_full(oid
.name
, inode_bl
);
376 derr
<< "Error writing '" << oid
.name
<< "': " << cpp_strerror(r
) << dendl
;
// Probe for the ".inode" object of `ino` by calling metadata_io.stat() on the
// name returned by InodeStore::get_object_name(ino, frag_t(), ".inode").
// NOTE(review): the lines that translate the stat() result into *result and
// the return value are absent from this view. Callers in this file read
// *result as "object exists" — confirm against the full source.
383 int MetadataDriver::root_exists(inodeno_t ino
, bool *result
)
385 object_t oid
= InodeStore::get_object_name(ino
, frag_t(), ".inode");
388 int r
= metadata_io
.stat(oid
.name
, &size
, &mtime
);
// Create the system inodes needed before recovery. In order:
//   - MDS_INO_ROOT as a directory with mode bits 0755;
//   - MDS_INO_MDSDIR(0) as a directory (no extra mode bits passed);
//   - the dirfrag frag_t() of MDS_INO_MDSDIR(0), via find_or_create_dirfrag().
// data_pool_id is forwarded to inject_unlinked_inode() for both inodes.
// NOTE(review): error checks and return statements are absent from this view.
400 int MetadataDriver::init_roots(int64_t data_pool_id
)
403 r
= inject_unlinked_inode(MDS_INO_ROOT
, S_IFDIR
|0755, data_pool_id
);
407 r
= inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR
, data_pool_id
);
411 bool created
= false;
412 r
= find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created
);
// Check for the system inodes by calling root_exists() first for MDS_INO_ROOT
// and then for MDS_INO_MDSDIR(0), both writing into *result.
// NOTE(review): the logic between the two calls and the return statements are
// absent from this view, so how the two answers are combined is not shown.
420 int MetadataDriver::check_roots(bool *result
)
423 r
= root_exists(MDS_INO_ROOT
, result
);
431 r
= root_exists(MDS_INO_MDSDIR(0), result
);
446 * 0. Create root inodes if they don't exist
447 * PARALLEL scan_extents
448 * 1. Size and mtime recovery: scan ALL objects, and update 0th
449 * objects with max size and max mtime seen.
450 * PARALLEL scan_inodes
451 * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
452 * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
453 * or rstats at this stage. Inodes without backtraces go into lost+found.
455 * TODO: SERIAL "recover stats"
456 * 3. Dirfrag statistics: depth first traverse into metadata tree,
457 * rebuilding dir sizes.
458 * TODO PARALLEL "clean up"
459 * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
460 * anything onto them) and remove any of the xattrs that we
461 * used for accumulating.
// Split an object name of the form "<hex>.<hex>" at its first '.'.
// The part before the dot is parsed base-16 into *inode_no and the part after
// it base-16 into *obj_id, both with strict_strtoll().
// NOTE(review): return statements are absent from this view. A comment in
// DataScan::scan_links() states that parse_oid only yields 0 or -EINVAL.
465 int parse_oid(const std::string
&oid
, uint64_t *inode_no
, uint64_t *obj_id
)
// Names with no '.' at all, or with nothing after the first '.', are tested
// for here.
467 if (oid
.find(".") == std::string::npos
|| oid
.find(".") == oid
.size() - 1) {
472 std::string inode_str
= oid
.substr(0, oid
.find("."));
473 *inode_no
= strict_strtoll(inode_str
.c_str(), 16, &err
);
478 std::string pos_string
= oid
.substr(oid
.find(".") + 1);
479 *obj_id
= strict_strtoll(pos_string
.c_str(), 16, &err
);
// scan_extents command: run forall_objects() over data_io with its boolean
// argument set to false. For each object the callback stats the object and
// passes what it learned to ClsCephFSClient::accumulate_inode_metadata().
// Returns whatever forall_objects() returns.
// NOTE(review): the arguments of accumulate_inode_metadata() and the
// callback's return statements are absent from this view.
488 int DataScan::scan_extents()
490 return forall_objects(data_io
, false, [this](
491 std::string
const &oid
,
492 uint64_t obj_name_ino
,
493 uint64_t obj_name_offset
) -> int
498 int r
= data_io
.stat(oid
, &size
, &mtime
);
499 dout(10) << "handling object " << obj_name_ino
500 << "." << obj_name_offset
<< dendl
;
502 dout(4) << "Cannot stat '" << oid
<< "': skipping" << dendl
;
506 // I need to keep track of
507 // * The highest object ID seen
508 // * The size of the highest object ID seen
509 // * The largest object seen
511 // Given those things, I can later infer the object chunking
512 // size, the offset of the last object (chunk size * highest ID seen)
513 // and the actual size (offset of last object + size of highest ID seen)
515 // This logic doesn't take account of striping.
516 r
= ClsCephFSClient::accumulate_inode_metadata(
523 derr
<< "Failed to accumulate metadata data from '"
524 << oid
<< "': " << cpp_strerror(r
) << dendl
;
// Issue a single object_list() call over the whole pool (begin to end, at
// most one result) using a tag filter built for the literal tag "test".
// forall_objects() negates the result to decide whether it must fall back to
// client-side ("legacy") filtering.
// NOTE(review): the return statement is absent from this view.
// NOTE(review): range_i and range_end are declared but not used in the
// visible lines.
532 int DataScan::probe_filter(librados::IoCtx
&ioctx
)
534 bufferlist filter_bl
;
535 ClsCephFSClient::build_tag_filter("test", &filter_bl
);
536 librados::ObjectCursor range_i
;
537 librados::ObjectCursor range_end
;
539 std::vector
<librados::ObjectItem
> tmp_result
;
540 librados::ObjectCursor tmp_next
;
541 int r
= ioctx
.object_list(ioctx
.object_list_begin(), ioctx
.object_list_end(),
542 1, filter_bl
, &tmp_result
, &tmp_next
);
// Iterate over a slice of the objects in `ioctx` and invoke `handler` for
// each object whose name parse_oid() accepts.
//   ioctx   - pool to list
//   handler - called as handler(oid, inode number, object index)
// A flag named untagged_only is also consulted below; its declaration is not
// visible here (callers in this file pass a bool as the second argument).
// Visible flow:
//   - object_list_slice() narrows [begin, end) to [range_i, range_end);
//   - probe_filter() decides between OSD-side filtering with a filter built
//     from filter_tag and client-side "legacy" filtering;
//   - objects are listed one at a time, advancing range_i;
//   - with untagged_only and legacy filtering, objects with a non-zero index
//     are reported as "Non-zeroth object" and the "scrub_tag" xattr is read
//     and compared with filter_tag;
//   - with untagged_only and OSD-side filtering, a zero index is asserted.
// NOTE(review): several lines (slice arguments, continue/return statements,
// the body of the final error check) are absent from this view.
547 int DataScan::forall_objects(
548 librados::IoCtx
&ioctx
,
550 std::function
<int(std::string
, uint64_t, uint64_t)> handler
553 librados::ObjectCursor range_i
;
554 librados::ObjectCursor range_end
;
555 ioctx
.object_list_slice(
556 ioctx
.object_list_begin(),
557 ioctx
.object_list_end(),
564 bufferlist filter_bl
;
566 bool legacy_filtering
= false;
568 // probe to deal with older OSDs that don't support
569 // the cephfs pgls filtering mode
570 legacy_filtering
= !probe_filter(ioctx
);
571 if (!legacy_filtering
) {
572 ClsCephFSClient::build_tag_filter(filter_tag
, &filter_bl
);
577 while(range_i
< range_end
) {
578 std::vector
<librados::ObjectItem
> result
;
// Fetch at most one object per call; range_i is advanced by the call.
579 int r
= ioctx
.object_list(range_i
, range_end
, 1,
580 filter_bl
, &result
, &range_i
);
582 derr
<< "Unexpected error listing objects: " << cpp_strerror(r
) << dendl
;
586 for (const auto &i
: result
) {
587 const std::string
&oid
= i
.oid
;
588 uint64_t obj_name_ino
= 0;
589 uint64_t obj_name_offset
= 0;
590 r
= parse_oid(oid
, &obj_name_ino
, &obj_name_offset
);
592 dout(4) << "Bad object name '" << oid
<< "', skipping" << dendl
;
// Client-side filtering path, used when the OSD-side filter is unavailable.
596 if (untagged_only
&& legacy_filtering
) {
597 dout(20) << "Applying filter to " << oid
<< dendl
;
599 // We are only interested in 0th objects during this phase: we touched
600 // the other objects during scan_extents
601 if (obj_name_offset
!= 0) {
602 dout(20) << "Non-zeroth object" << dendl
;
606 bufferlist scrub_tag_bl
;
607 int r
= ioctx
.getxattr(oid
, "scrub_tag", scrub_tag_bl
);
609 std::string read_tag
;
610 bufferlist::iterator q
= scrub_tag_bl
.begin();
612 ::decode(read_tag
, q
);
613 if (read_tag
== filter_tag
) {
614 dout(20) << "skipping " << oid
<< " because it has the filter_tag"
618 } catch (const buffer::error
&err
) {
620 dout(20) << "read non-matching tag '" << read_tag
<< "'" << dendl
;
622 dout(20) << "no tag read (" << r
<< ")" << dendl
;
// OSD-side filtering path: matched objects are expected to be zeroth objects.
625 } else if (untagged_only
) {
626 assert(obj_name_offset
== 0);
627 dout(20) << "OSD matched oid " << oid
<< dendl
;
630 int this_oid_r
= handler(oid
, obj_name_ino
, obj_name_offset
);
// Tested: no error recorded so far in r, and this handler call failed.
631 if (r
== 0 && this_oid_r
< 0) {
// scan_inodes command. After checking through driver->check_roots() that the
// system inodes are present, run forall_objects() over data_io with its
// boolean argument set to true. For each object the callback:
//   - loads the backtrace, stashed layout and accumulated size/mtime data via
//     ClsCephFSClient::fetch_inode_accumulate_result();
//   - derives file_size and a layout ("guessed_layout"), trusting the stashed
//     layout only when it is valid and not contradicted by the largest object
//     size seen;
//   - builds a dentry with build_file_dentry();
//   - injects it through the driver: by backtrace when one is present, matches
//     the object name's inode number and does not lead to an MDS directory,
//     otherwise into lost+found.
// NOTE(review): many lines (declarations, else branches, return statements)
// are absent from this view; comments cover only what is shown.
640 int DataScan::scan_inodes()
643 int r
= driver
->check_roots(&roots_present
);
645 derr
<< "Unexpected error checking roots: '"
646 << cpp_strerror(r
) << "'" << dendl
;
650 if (!roots_present
) {
651 std::cerr
<< "Some or all system inodes are absent. Run 'init' from "
652 "one node before running 'scan_inodes'" << std::endl
;
656 return forall_objects(data_io
, true, [this](
657 std::string
const &oid
,
658 uint64_t obj_name_ino
,
659 uint64_t obj_name_offset
) -> int
663 dout(10) << "handling object "
664 << std::hex
<< obj_name_ino
<< "." << obj_name_offset
<< std::dec
667 AccumulateResult accum_res
;
668 inode_backtrace_t backtrace
;
669 file_layout_t loaded_layout
= file_layout_t::get_default();
670 r
= ClsCephFSClient::fetch_inode_accumulate_result(
671 data_io
, oid
, &backtrace
, &loaded_layout
, &accum_res
);
674 dout(4) << "Accumulated metadata missing from '"
675 << oid
<< ", did you run scan_extents?" << dendl
;
678 dout(4) << "Unexpected error loading accumulated metadata from '"
679 << oid
<< "': " << cpp_strerror(r
) << dendl
;
680 // FIXME: this creates situation where if a client has a corrupt
681 // backtrace/layout, we will fail to inject it. We should (optionally)
682 // proceed if the backtrace/layout is corrupt but we have valid
683 // accumulated metadata.
687 const time_t file_mtime
= accum_res
.max_mtime
;
688 uint64_t file_size
= 0;
689 bool have_backtrace
= !(backtrace
.ancestors
.empty());
691 // This is the layout we will use for injection, populated either
692 // from loaded_layout or from best guesses
693 file_layout_t guessed_layout
;
694 guessed_layout
.pool_id
= data_pool_id
;
696 // Calculate file_size, guess the layout
// ceiling_obj_index > 0 means more than the zeroth object was seen.
697 if (accum_res
.ceiling_obj_index
> 0) {
698 uint32_t chunk_size
= file_layout_t::get_default().object_size
;
699 // When there are multiple objects, the largest object probably
700 // indicates the chunk size. But not necessarily, because files
701 // can be sparse. Only make this assumption if size seen
702 // is a power of two, as chunk sizes typically are.
703 if ((accum_res
.max_obj_size
& (accum_res
.max_obj_size
- 1)) == 0) {
704 chunk_size
= accum_res
.max_obj_size
;
707 if (loaded_layout
.pool_id
== -1) {
708 // If no stashed layout was found, guess it
709 guessed_layout
.object_size
= chunk_size
;
710 guessed_layout
.stripe_unit
= chunk_size
;
711 guessed_layout
.stripe_count
= 1;
712 } else if (!loaded_layout
.is_valid() ||
713 loaded_layout
.object_size
< accum_res
.max_obj_size
) {
714 // If the max size seen exceeds what the stashed layout claims, then
715 // disbelieve it. Guess instead. Same for invalid layouts on disk.
716 dout(4) << "bogus xattr layout on 0x" << std::hex
<< obj_name_ino
717 << std::dec
<< ", ignoring in favour of best guess" << dendl
;
718 guessed_layout
.object_size
= chunk_size
;
719 guessed_layout
.stripe_unit
= chunk_size
;
720 guessed_layout
.stripe_count
= 1;
722 // We have a stashed layout that we can't disprove, so apply it
723 guessed_layout
= loaded_layout
;
724 dout(20) << "loaded layout from xattr:"
725 << " os: " << guessed_layout
.object_size
726 << " sc: " << guessed_layout
.stripe_count
727 << " su: " << guessed_layout
.stripe_unit
729 // User might have transplanted files from a pool with a different
730 // ID, so whatever the loaded_layout says, we'll force the injected
731 // layout to point to the pool we really read from
732 guessed_layout
.pool_id
= data_pool_id
;
735 if (guessed_layout
.stripe_count
== 1) {
736 // Unstriped file: simple chunking
737 file_size
= guessed_layout
.object_size
* accum_res
.ceiling_obj_index
738 + accum_res
.ceiling_obj_size
;
740 // Striped file: need to examine the last stripe_count objects
741 // in the file to determine the size.
743 // How many complete (i.e. not last stripe) objects?
744 uint64_t complete_objs
= 0;
745 if (accum_res
.ceiling_obj_index
> guessed_layout
.stripe_count
- 1) {
// Round the highest object index down to a multiple of stripe_count.
746 complete_objs
= (accum_res
.ceiling_obj_index
/ guessed_layout
.stripe_count
) * guessed_layout
.stripe_count
;
751 // How many potentially-short objects (i.e. last stripe set) objects?
752 uint64_t partial_objs
= accum_res
.ceiling_obj_index
+ 1 - complete_objs
;
754 dout(10) << "calculating striped size from complete objs: "
755 << complete_objs
<< ", partial objs: " << partial_objs
758 // Maximum amount of data that may be in the incomplete objects
759 uint64_t incomplete_size
= 0;
761 // For each short object, calculate the max file size within it
762 // and accumulate the maximum
763 for (uint64_t i
= complete_objs
; i
< complete_objs
+ partial_objs
; ++i
) {
// Object names are "<ino hex>.<index as 8 hex digits>".
765 snprintf(buf
, sizeof(buf
), "%llx.%08llx",
766 (long long unsigned)obj_name_ino
, (long long unsigned)i
);
770 r
= data_io
.stat(std::string(buf
), &osize
, &omtime
);
773 // Upper bound within this object
774 uint64_t upper_size
= (osize
- 1) / guessed_layout
.stripe_unit
775 * (guessed_layout
.stripe_unit
* guessed_layout
.stripe_count
)
776 + (i
% guessed_layout
.stripe_count
)
777 * guessed_layout
.stripe_unit
+ (osize
- 1)
778 % guessed_layout
.stripe_unit
+ 1;
779 incomplete_size
= MAX(incomplete_size
, upper_size
);
781 } else if (r
== -ENOENT
) {
782 // Absent object, treat as size 0 and ignore.
784 // Unexpected error, carry r to outer scope for handling.
788 if (r
!= 0 && r
!= -ENOENT
) {
789 derr
<< "Unexpected error checking size of ino 0x" << std::hex
790 << obj_name_ino
<< std::dec
<< ": " << cpp_strerror(r
) << dendl
;
793 file_size
= complete_objs
* guessed_layout
.object_size
// Size taken directly from the highest-index object's size.
797 file_size
= accum_res
.ceiling_obj_size
;
798 if (loaded_layout
.pool_id
< 0
799 || loaded_layout
.object_size
< accum_res
.max_obj_size
) {
800 // No layout loaded, or inconsistent layout, use default
801 guessed_layout
= file_layout_t::get_default();
802 guessed_layout
.pool_id
= data_pool_id
;
804 guessed_layout
= loaded_layout
;
808 // Santity checking backtrace ino against object name
809 if (have_backtrace
&& backtrace
.ino
!= obj_name_ino
) {
810 dout(4) << "Backtrace ino 0x" << std::hex
<< backtrace
.ino
811 << " doesn't match object name ino 0x" << obj_name_ino
812 << std::dec
<< dendl
;
813 have_backtrace
= false;
817 build_file_dentry(obj_name_ino
, file_size
, file_mtime
, guessed_layout
, &dentry
);
819 // Inject inode to the metadata pool
820 if (have_backtrace
) {
// The last ancestor (rbegin) is the one examined for the MDS-directory case.
821 inode_backpointer_t root_bp
= *(backtrace
.ancestors
.rbegin());
822 if (MDS_INO_IS_MDSDIR(root_bp
.dirino
)) {
823 /* Special case for strays: even if we have a good backtrace,
824 * don't put it in the stray dir, because while that would technically
825 * give it linkage it would still be invisible to the user */
826 r
= driver
->inject_lost_and_found(obj_name_ino
, dentry
);
828 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
829 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
831 dout(4) << "Use --force-corrupt to overwrite structures that "
832 "appear to be corrupt" << dendl
;
836 /* Happy case: we will inject a named dentry for this inode */
837 r
= driver
->inject_with_backtrace(backtrace
, dentry
);
839 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
840 << std::dec
<< " with backtrace: " << cpp_strerror(r
) << dendl
;
842 dout(4) << "Use --force-corrupt to overwrite structures that "
843 "appear to be corrupt" << dendl
;
848 /* Backtrace-less case: we will inject a lost+found dentry */
849 r
= driver
->inject_lost_and_found(
850 obj_name_ino
, dentry
);
852 dout(4) << "Error injecting 0x" << std::hex
<< obj_name_ino
853 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
855 dout(4) << "Use --force-corrupt to overwrite structures that "
856 "appear to be corrupt" << dendl
;
// cleanup command: run forall_objects() over data_io with its boolean
// argument set to true, calling
// ClsCephFSClient::delete_inode_accumulate_result() on each object handed to
// the callback. Returns whatever forall_objects() returns.
// NOTE(review): the callback's return statements are absent from this view.
865 int DataScan::cleanup()
867 // We are looking for only zeroth object
869 return forall_objects(data_io
, true, [this](
870 std::string
const &oid
,
871 uint64_t obj_name_ino
,
872 uint64_t obj_name_offset
) -> int
875 r
= ClsCephFSClient::delete_inode_accumulate_result(data_io
, oid
);
877 dout(4) << "Error deleting accumulated metadata from '"
878 << oid
<< "': " << cpp_strerror(r
) << dendl
;
884 bool DataScan::valid_ino(inodeno_t ino
) const
886 return (ino
>= inodeno_t((1ull << 40)))
887 || (MDS_INO_IS_STRAY(ino
))
888 || (MDS_INO_IS_MDSDIR(ino
))
889 || ino
== MDS_INO_ROOT
890 || ino
== MDS_INO_CEPH
;
// scan_links command: check primary and remote dentry linkage across the
// dirfrag objects reachable through metadata_io. Requires the driver to be a
// MetadataDriver.
// Visible flow:
//   - two passes over every object (step runs from SCAN_INOS to CHECK_LINK);
//     objects whose name parse_oid() rejects, or whose inode number fails
//     valid_ino(), are logged as "Not a dirfrag";
//   - SCAN_INOS records primary inode numbers in used_inos (a repeat creates
//     an entry in dup_primaries) and counts remote links per inode;
//   - CHECK_LINK collects the locations of duplicated primaries, compares
//     each inode's nlink with an expected count, and flags remote dentries
//     whose inode was never seen as a primary;
//   - for each duplicated inode a "newest" entry is chosen by version, the
//     other dentries are queued in to_remove, and nlink is re-checked;
//   - queued keys are removed with omap_rm_keys(), and bad nlink values are
//     rewritten through read_dentry() + inject_linkage().
// NOTE(review): many lines (declarations, continue/return statements, some
// assignments) are absent from this view; comments cover only what is shown.
893 int DataScan::scan_links()
895 MetadataDriver
*metadata_driver
= dynamic_cast<MetadataDriver
*>(driver
);
896 if (!metadata_driver
) {
897 derr
<< "Unexpected --output-dir option for scan_links" << dendl
;
901 interval_set
<inodeno_t
> used_inos
;
902 map
<inodeno_t
, int> remote_links
;
// Per-dentry record: where a primary dentry lives plus the inode fields used
// for comparison (version, nlink, directory bit).
911 link_info_t() : version(0), nlink(0), is_dir(false) {}
912 link_info_t(inodeno_t di
, frag_t df
, const string
& n
, const inode_t i
) :
913 dirino(di
), frag(df
), name(n
),
914 version(i
.version
), nlink(i
.nlink
), is_dir(S_IFDIR
& i
.mode
) {}
915 dirfrag_t
dirfrag() const {
916 return dirfrag_t(dirino
, frag
);
919 map
<inodeno_t
, list
<link_info_t
> > dup_primaries
;
920 map
<inodeno_t
, link_info_t
> bad_nlink_inos
;
922 map
<dirfrag_t
, set
<string
> > to_remove
;
929 for (int step
= SCAN_INOS
; step
<= CHECK_LINK
; step
++) {
930 const librados::NObjectIterator it_end
= metadata_io
.nobjects_end();
931 for (auto it
= metadata_io
.nobjects_begin(); it
!= it_end
; ++it
) {
932 const std::string oid
= it
->get_oid();
934 uint64_t dir_ino
= 0;
935 uint64_t frag_id
= 0;
936 int r
= parse_oid(oid
, &dir_ino
, &frag_id
);
938 dout(10) << "Not a dirfrag: '" << oid
<< "'" << dendl
;
941 // parse_oid can only do 0 or -EINVAL
945 if (!valid_ino(dir_ino
)) {
946 dout(10) << "Not a dirfrag (invalid ino): '" << oid
<< "'" << dendl
;
// Read the dirfrag's omap entries in one call (empty start key, maximum
// count).
950 std::map
<std::string
, bufferlist
> items
;
951 r
= metadata_io
.omap_get_vals(oid
, "", (uint64_t)-1, &items
);
953 derr
<< "Error getting omap from '" << oid
<< "': " << cpp_strerror(r
) << dendl
;
957 for (auto& p
: items
) {
958 bufferlist::iterator q
= p
.second
.begin();
961 dentry_key_t::decode_helper(p
.first
, dname
, last
);
963 if (last
!= CEPH_NOSNAP
)
968 ::decode(dnfirst
, q
);
970 ::decode(dentry_type
, q
);
// 'I': the dentry value embeds the inode itself.
971 if (dentry_type
== 'I') {
973 inode
.decode_bare(q
);
974 inodeno_t ino
= inode
.inode
.ino
;
976 if (step
== SCAN_INOS
) {
977 if (used_inos
.contains(ino
, 1)) {
// Calling size() on operator[] creates the (empty) map entry for ino.
978 dup_primaries
[ino
].size();
980 used_inos
.insert(ino
);
982 } else if (step
== CHECK_LINK
) {
983 auto q
= dup_primaries
.find(ino
);
984 if (q
!= dup_primaries
.end()) {
985 q
->second
.push_back(link_info_t(dir_ino
, frag_id
, dname
, inode
.inode
));
988 auto r
= remote_links
.find(ino
);
989 if (r
!= remote_links
.end())
991 if (!MDS_INO_IS_STRAY(dir_ino
))
993 if (inode
.inode
.nlink
!= nlink
) {
994 derr
<< "Bad nlink on " << ino
<< " expected " << nlink
995 << " has " << inode
.inode
.nlink
<< dendl
;
996 bad_nlink_inos
[ino
] = link_info_t(dir_ino
, frag_id
, dname
, inode
.inode
);
997 bad_nlink_inos
[ino
].nlink
= nlink
;
// 'L': the dentry value refers to an inode by number (a remote link).
1001 } else if (dentry_type
== 'L') {
1003 unsigned char d_type
;
1005 ::decode(d_type
, q
);
1007 if (step
== SCAN_INOS
) {
1008 remote_links
[ino
]++;
1009 } else if (step
== CHECK_LINK
) {
1010 if (!used_inos
.contains(ino
, 1)) {
1011 derr
<< "Bad remote link dentry 0x" << std::hex
<< dir_ino
1012 << std::dec
<< "/" << dname
1013 << ", ino " << ino
<< " not found" << dendl
;
1015 dentry_key_t
dn_key(CEPH_NOSNAP
, dname
.c_str());
1017 to_remove
[dirfrag_t(dir_ino
, frag_id
)].insert(key
);
1021 derr
<< "Invalid tag char '" << dentry_type
<< "' dentry 0x" << dir_ino
1022 << std::dec
<< "/" << dname
<< dendl
;
1025 } catch (const buffer::error
&err
) {
1026 derr
<< "Error decoding dentry 0x" << std::hex
<< dir_ino
1027 << std::dec
<< "/" << dname
<< dendl
;
// Resolve duplicated primaries: pick one entry as "newest" and queue the
// others for removal.
1035 for (auto& p
: dup_primaries
) {
1037 for (auto& q
: p
.second
) {
1038 if (q
.version
> newest
.version
) {
1040 } else if (q
.version
== newest
.version
&&
1041 !MDS_INO_IS_STRAY(q
.dirino
) &&
1042 MDS_INO_IS_STRAY(newest
.dirino
)) {
1047 for (auto& q
: p
.second
) {
1048 // in the middle of dir fragmentation?
1049 if (newest
.dirino
== q
.dirino
&& newest
.name
== q
.name
)
1053 dentry_key_t
dn_key(CEPH_NOSNAP
, q
.name
.c_str());
1055 to_remove
[q
.dirfrag()].insert(key
);
1056 derr
<< "Remove duplicated ino 0x" << p
.first
<< " from "
1057 << q
.dirfrag() << "/" << q
.name
<< dendl
;
1061 auto q
= remote_links
.find(p
.first
);
1062 if (q
!= remote_links
.end())
1064 if (!MDS_INO_IS_STRAY(newest
.dirino
))
1067 if (nlink
!= newest
.nlink
) {
1068 derr
<< "Bad nlink on " << p
.first
<< " expected " << nlink
1069 << " has " << newest
.nlink
<< dendl
;
1070 bad_nlink_inos
[p
.first
] = newest
;
1071 bad_nlink_inos
[p
.first
].nlink
= nlink
;
1074 dup_primaries
.clear();
1075 remote_links
.clear();
// Apply the queued dentry removals, one omap_rm_keys() call per dirfrag.
1077 for (auto& p
: to_remove
) {
1078 object_t frag_oid
= InodeStore::get_object_name(p
.first
.ino
, p
.first
.frag
, "");
1080 int r
= metadata_io
.omap_rm_keys(frag_oid
.name
, p
.second
);
1082 derr
<< "Error removing duplicated dentries from " << p
.first
<< dendl
;
// Rewrite nlink for the inodes flagged above, after re-reading each dentry
// and comparing its inode number and version with the recorded ones.
1088 for (auto &p
: bad_nlink_inos
) {
1090 int r
= read_dentry(p
.second
.dirino
, p
.second
.frag
, p
.second
.name
, &inode
);
1092 derr
<< "Unexpected error reading dentry "
1093 << p
.second
.dirfrag() << "/" << p
.second
.name
1094 << ": " << cpp_strerror(r
) << dendl
;
1098 if (inode
.inode
.ino
!= p
.first
|| inode
.inode
.version
!= p
.second
.version
)
1101 inode
.inode
.nlink
= p
.second
.nlink
;
1102 r
= metadata_driver
->inject_linkage(p
.second
.dirino
, p
.second
.name
, p
.second
.frag
, inode
);
1110 int DataScan::scan_frags()
1113 int r
= driver
->check_roots(&roots_present
);
1115 derr
<< "Unexpected error checking roots: '"
1116 << cpp_strerror(r
) << "'" << dendl
;
1120 if (!roots_present
) {
1121 std::cerr
<< "Some or all system inodes are absent. Run 'init' from "
1122 "one node before running 'scan_inodes'" << std::endl
;
1126 return forall_objects(metadata_io
, true, [this](
1127 std::string
const &oid
,
1128 uint64_t obj_name_ino
,
1129 uint64_t obj_name_offset
) -> int
1132 r
= parse_oid(oid
, &obj_name_ino
, &obj_name_offset
);
1134 dout(4) << "Bad object name '" << oid
<< "', skipping" << dendl
;
1138 if (obj_name_ino
< (1ULL << 40)) {
1139 // FIXME: we're skipping stray dirs here: if they're
1140 // orphaned then we should be resetting them some other
1142 dout(10) << "Skipping system ino " << obj_name_ino
<< dendl
;
1146 AccumulateResult accum_res
;
1147 inode_backtrace_t backtrace
;
1149 // Default to inherit layout (i.e. no explicit layout on dir) which is
1150 // expressed as a zeroed layout struct (see inode_t::has_layout)
1151 file_layout_t loaded_layout
;
1154 bufferlist parent_bl
;
1156 bufferlist layout_bl
;
1159 librados::ObjectReadOperation op
;
1160 op
.getxattr("parent", &parent_bl
, &parent_r
);
1161 op
.getxattr("layout", &layout_bl
, &layout_r
);
1162 r
= metadata_io
.operate(oid
, &op
, &op_bl
);
1163 if (r
!= 0 && r
!= -ENODATA
) {
1164 derr
<< "Unexpected error reading backtrace: " << cpp_strerror(parent_r
) << dendl
;
1168 if (parent_r
!= -ENODATA
) {
1170 bufferlist::iterator q
= parent_bl
.begin();
1171 backtrace
.decode(q
);
1172 } catch (buffer::error
&e
) {
1173 dout(4) << "Corrupt backtrace on '" << oid
<< "': " << e
<< dendl
;
1174 if (!force_corrupt
) {
1177 // Treat backtrace as absent: we'll inject into lost+found
1178 backtrace
= inode_backtrace_t();
1183 if (layout_r
!= -ENODATA
) {
1185 bufferlist::iterator q
= layout_bl
.begin();
1186 ::decode(loaded_layout
, q
);
1187 } catch (buffer::error
&e
) {
1188 dout(4) << "Corrupt layout on '" << oid
<< "': " << e
<< dendl
;
1189 if (!force_corrupt
) {
1195 bool have_backtrace
= !(backtrace
.ancestors
.empty());
1197 // Santity checking backtrace ino against object name
1198 if (have_backtrace
&& backtrace
.ino
!= obj_name_ino
) {
1199 dout(4) << "Backtrace ino 0x" << std::hex
<< backtrace
.ino
1200 << " doesn't match object name ino 0x" << obj_name_ino
1201 << std::dec
<< dendl
;
1202 have_backtrace
= false;
1205 uint64_t fnode_version
= 0;
1207 r
= read_fnode(obj_name_ino
, frag_t(), &fnode
, &fnode_version
);
1209 derr
<< "Corrupt fnode on " << oid
<< dendl
;
1210 if (force_corrupt
) {
1211 fnode
.fragstat
.mtime
= 0;
1212 fnode
.fragstat
.nfiles
= 1;
1213 fnode
.fragstat
.nsubdirs
= 0;
1214 fnode
.accounted_fragstat
= fnode
.fragstat
;
1221 build_dir_dentry(obj_name_ino
, fnode
.accounted_fragstat
,
1222 loaded_layout
, &dentry
);
1224 // Inject inode to the metadata pool
1225 if (have_backtrace
) {
1226 inode_backpointer_t root_bp
= *(backtrace
.ancestors
.rbegin());
1227 if (MDS_INO_IS_MDSDIR(root_bp
.dirino
)) {
1228 /* Special case for strays: even if we have a good backtrace,
1229 * don't put it in the stray dir, because while that would technically
1230 * give it linkage it would still be invisible to the user */
1231 r
= driver
->inject_lost_and_found(obj_name_ino
, dentry
);
1233 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
1234 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
1236 dout(4) << "Use --force-corrupt to overwrite structures that "
1237 "appear to be corrupt" << dendl
;
1241 /* Happy case: we will inject a named dentry for this inode */
1242 r
= driver
->inject_with_backtrace(backtrace
, dentry
);
1244 dout(4) << "Error injecting 0x" << std::hex
<< backtrace
.ino
1245 << std::dec
<< " with backtrace: " << cpp_strerror(r
) << dendl
;
1247 dout(4) << "Use --force-corrupt to overwrite structures that "
1248 "appear to be corrupt" << dendl
;
1253 /* Backtrace-less case: we will inject a lost+found dentry */
1254 r
= driver
->inject_lost_and_found(
1255 obj_name_ino
, dentry
);
1257 dout(4) << "Error injecting 0x" << std::hex
<< obj_name_ino
1258 << std::dec
<< " into lost+found: " << cpp_strerror(r
) << dendl
;
1260 dout(4) << "Use --force-corrupt to overwrite structures that "
1261 "appear to be corrupt" << dendl
;
1270 int MetadataTool::read_fnode(
1271 inodeno_t ino
, frag_t frag
, fnode_t
*fnode
,
1272 uint64_t *last_version
)
1274 assert(fnode
!= NULL
);
1276 object_t frag_oid
= InodeStore::get_object_name(ino
, frag
, "");
1277 bufferlist fnode_bl
;
1278 int r
= metadata_io
.omap_get_header(frag_oid
.name
, &fnode_bl
);
1279 *last_version
= metadata_io
.get_last_version();
1284 bufferlist::iterator old_fnode_iter
= fnode_bl
.begin();
1286 (*fnode
).decode(old_fnode_iter
);
1287 } catch (const buffer::error
&err
) {
1294 int MetadataTool::read_dentry(inodeno_t parent_ino
, frag_t frag
,
1295 const std::string
&dname
, InodeStore
*inode
)
1297 assert(inode
!= NULL
);
1301 dentry_key_t
dn_key(CEPH_NOSNAP
, dname
.c_str());
1304 std::set
<std::string
> keys
;
1306 std::map
<std::string
, bufferlist
> vals
;
1307 object_t frag_oid
= InodeStore::get_object_name(parent_ino
, frag
, "");
1308 int r
= metadata_io
.omap_get_vals_by_keys(frag_oid
.name
, keys
, &vals
);
1309 dout(20) << "oid=" << frag_oid
.name
1310 << " dname=" << dname
1312 << ", r=" << r
<< dendl
;
1317 if (vals
.find(key
) == vals
.end()) {
1318 dout(20) << key
<< " not found in result" << dendl
;
1323 bufferlist::iterator q
= vals
[key
].begin();
1325 ::decode(dnfirst
, q
);
1327 ::decode(dentry_type
, q
);
1328 if (dentry_type
== 'I') {
1329 inode
->decode_bare(q
);
1332 dout(20) << "dentry type '" << dentry_type
<< "': cannot"
1333 "read an inode out of that" << dendl
;
1336 } catch (const buffer::error
&err
) {
1337 dout(20) << "encoding error in dentry 0x" << std::hex
<< parent_ino
1338 << std::dec
<< "/" << dname
<< dendl
;
1345 int MetadataDriver::inject_lost_and_found(
1346 inodeno_t ino
, const InodeStore
&dentry
)
1348 // Create lost+found if doesn't exist
1349 bool created
= false;
1350 int r
= find_or_create_dirfrag(CEPH_INO_ROOT
, frag_t(), &created
);
1355 r
= read_dentry(CEPH_INO_ROOT
, frag_t(), "lost+found", &lf_ino
);
1356 if (r
== -ENOENT
|| r
== -EINVAL
) {
1357 if (r
== -EINVAL
&& !force_corrupt
) {
1361 // To have a directory not specify a layout, give it zeros (see
1362 // inode_t::has_layout)
1363 file_layout_t inherit_layout
;
1365 // Construct LF inode
1366 frag_info_t fragstat
;
1367 fragstat
.nfiles
= 1,
1368 build_dir_dentry(CEPH_INO_LOST_AND_FOUND
, fragstat
, inherit_layout
, &lf_ino
);
1370 // Inject link to LF inode in the root dir
1371 r
= inject_linkage(CEPH_INO_ROOT
, "lost+found", frag_t(), lf_ino
);
1376 if (!(lf_ino
.inode
.mode
& S_IFDIR
)) {
1377 derr
<< "lost+found exists but is not a directory!" << dendl
;
1378 // In this case we error out, and the user should do something about
1384 r
= find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND
, frag_t(), &created
);
1389 InodeStore recovered_ino
;
1392 const std::string dname
= lost_found_dname(ino
);
1394 // Write dentry into lost+found dirfrag
1395 return inject_linkage(lf_ino
.inode
.ino
, dname
, frag_t(), dentry
);
1399 int MetadataDriver::get_frag_of(
1401 const std::string
&target_dname
,
1404 object_t root_frag_oid
= InodeStore::get_object_name(dirino
, frag_t(), "");
1406 dout(20) << "dirino=" << dirino
<< " target_dname=" << target_dname
<< dendl
;
1408 // Find and load fragtree if existing dirfrag
1409 // ==========================================
1410 bool have_backtrace
= false;
1411 bufferlist parent_bl
;
1412 int r
= metadata_io
.getxattr(root_frag_oid
.name
, "parent", parent_bl
);
1413 if (r
== -ENODATA
) {
1414 dout(10) << "No backtrace on '" << root_frag_oid
<< "'" << dendl
;
1416 dout(4) << "Unexpected error on '" << root_frag_oid
<< "': "
1417 << cpp_strerror(r
) << dendl
;
1421 // Deserialize backtrace
1422 inode_backtrace_t backtrace
;
1423 if (parent_bl
.length()) {
1425 bufferlist::iterator q
= parent_bl
.begin();
1426 backtrace
.decode(q
);
1427 have_backtrace
= true;
1428 } catch (buffer::error
&e
) {
1429 dout(4) << "Corrupt backtrace on '" << root_frag_oid
<< "': " << e
<< dendl
;
1433 if (!(have_backtrace
&& backtrace
.ancestors
.size())) {
1434 // Can't work out fragtree without a backtrace
1435 dout(4) << "No backtrace on '" << root_frag_oid
1436 << "': cannot determine fragtree" << dendl
;
1440 // The parentage of dirino
1441 const inode_backpointer_t
&bp
= *(backtrace
.ancestors
.begin());
1443 // The inode of dirino's parent
1444 const inodeno_t parent_ino
= bp
.dirino
;
1446 // The dname of dirino in its parent.
1447 const std::string
&parent_dname
= bp
.dname
;
1449 dout(20) << "got backtrace parent " << parent_ino
<< "/"
1450 << parent_dname
<< dendl
;
1452 // The primary dentry for dirino
1453 InodeStore existing_dentry
;
1455 // See if we can find ourselves in dirfrag zero of the parent: this
1456 // is a fast path that avoids needing to go further up the tree
1457 // if the parent isn't fragmented (worst case we would have to
1458 // go all the way to the root)
1459 r
= read_dentry(parent_ino
, frag_t(), parent_dname
, &existing_dentry
);
1461 // Great, fast path: return the fragtree from here
1462 if (existing_dentry
.inode
.ino
!= dirino
) {
1463 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
1464 << existing_dentry
.inode
.ino
1465 << " vs expected 0x" << dirino
<< std::dec
<< dendl
;
1468 dout(20) << "fast path, fragtree is "
1469 << existing_dentry
.dirfragtree
<< dendl
;
1470 *result_ft
= existing_dentry
.pick_dirfrag(target_dname
);
1471 dout(20) << "frag is " << *result_ft
<< dendl
;
1473 } else if (r
!= -ENOENT
) {
1474 // Dentry not present in 0th frag, must read parent's fragtree
1476 r
= get_frag_of(parent_ino
, parent_dname
, &parent_frag
);
1478 // We have the parent fragtree, so try again to load our dentry
1479 r
= read_dentry(parent_ino
, parent_frag
, parent_dname
, &existing_dentry
);
1482 *result_ft
= existing_dentry
.pick_dirfrag(target_dname
);
1483 dout(20) << "resolved via parent, frag is " << *result_ft
<< dendl
;
1486 if (r
== -EINVAL
|| r
== -ENOENT
) {
1487 return -ENOENT
; // dentry missing or corrupt, so frag is missing
1493 // Couldn't resolve parent fragtree, so can't find ours.
1496 } else if (r
== -EINVAL
) {
1497 // Unreadable dentry, can't know the fragtree.
1500 // Unexpected error, raise it
1506 int MetadataDriver::inject_with_backtrace(
1507 const inode_backtrace_t
&backtrace
, const InodeStore
&dentry
)
1513 // In order to insert something into a directory, we first (ideally)
1514 // need to know the fragtree for the directory. Sometimes we can't
1515 // get that, in which case we just go ahead and insert it into
1516 // fragment zero for a good chance of that being the right thing
1517 // anyway (most moderate-sized dirs aren't fragmented!)
1521 // My immediate ancestry should be correct, so if we can find that
1522 // directory's dirfrag then go inject it there. This works well
1523 // in the case that this inode's dentry was somehow lost and we
1524 // are recreating it, because the rest of the hierarchy
1525 // will probably still exist.
1527 // It's more of a "better than nothing" approach when rebuilding
1528 // a whole tree, as backtraces will in general not be up to date
1529 // beyond the first parent, if anything in the trace was ever
1530 // moved after the file was created.
1534 // The backtrace tells us inodes for each of the parents. If we are
1535 // creating those parent dirfrags, then there is a risk that somehow
1536 // the inode indicated here was also used for data (not a dirfrag) at
1537 // some stage. That would be a zany situation, and we don't check
1538 // for it here, because to do so would require extra IOs for everything
1539 // we inject, and anyway wouldn't guarantee that the inode number
1540 // wasn't in use in some dentry elsewhere in the metadata tree that
1541 // just happened not to have any data objects.
1543 // On multiple workers touching the same traces
1544 // ============================================
1545 // When creating linkage for a directory, *only* create it if we are
1546 // also creating the object. That way, we might not manage to get the
1547 // *right* linkage for a directory, but at least we won't multiply link
1548 // it. We assume that if a root dirfrag exists for a directory, then
1549 // it is linked somewhere (i.e. that the metadata pool is not already
1552 // Making sure *that* is true is someone else's job! Probably someone
1553 // who is not going to run in parallel, so that they can self-consistently
1554 // look at versions and move things around as they go.
1555 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1556 // object, next run will fail to create linkage for the dirfrag object
1557 // and leave it orphaned.
1559 inodeno_t ino
= backtrace
.ino
;
1560 dout(10) << " inode: 0x" << std::hex
<< ino
<< std::dec
<< dendl
;
1561 for (std::vector
<inode_backpointer_t
>::const_iterator i
= backtrace
.ancestors
.begin();
1562 i
!= backtrace
.ancestors
.end(); ++i
) {
1563 const inode_backpointer_t
&backptr
= *i
;
1564 dout(10) << " backptr: 0x" << std::hex
<< backptr
.dirino
<< std::dec
1565 << "/" << backptr
.dname
<< dendl
;
1567 // Examine root dirfrag for parent
1568 const inodeno_t parent_ino
= backptr
.dirino
;
1569 const std::string dname
= backptr
.dname
;
1572 int r
= get_frag_of(parent_ino
, dname
, &fragment
);
1574 // Don't know fragment, fall back to assuming root
1575 dout(20) << "don't know fragment for 0x" << std::hex
<<
1576 parent_ino
<< std::dec
<< "/" << dname
<< ", will insert to root"
1580 // Find or create dirfrag
1581 // ======================
1582 bool created_dirfrag
;
1583 r
= find_or_create_dirfrag(parent_ino
, fragment
, &created_dirfrag
);
1588 // Check if dentry already exists
1589 // ==============================
1590 InodeStore existing_dentry
;
1591 r
= read_dentry(parent_ino
, fragment
, dname
, &existing_dentry
);
1592 bool write_dentry
= false;
1593 if (r
== -ENOENT
|| r
== -EINVAL
) {
1594 if (r
== -EINVAL
&& !force_corrupt
) {
1597 // Missing or corrupt dentry
1598 write_dentry
= true;
1600 derr
<< "Unexpected error reading dentry 0x" << std::hex
1601 << parent_ino
<< std::dec
<< "/"
1602 << dname
<< ": " << cpp_strerror(r
) << dendl
;
1605 // Dentry already present, does it link to me?
1606 if (existing_dentry
.inode
.ino
== ino
) {
1607 dout(20) << "Dentry 0x" << std::hex
1608 << parent_ino
<< std::dec
<< "/"
1609 << dname
<< " already exists and points to me" << dendl
;
1611 derr
<< "Dentry 0x" << std::hex
1612 << parent_ino
<< std::dec
<< "/"
1613 << dname
<< " already exists but points to 0x"
1614 << std::hex
<< existing_dentry
.inode
.ino
<< std::dec
<< dendl
;
1615 // Fall back to lost+found!
1616 return inject_lost_and_found(backtrace
.ino
, dentry
);
1624 if (i
== backtrace
.ancestors
.begin()) {
1625 // This is the linkage for the file of interest
1626 dout(10) << "Linking inode 0x" << std::hex
<< ino
1627 << " at 0x" << parent_ino
<< "/" << dname
<< std::dec
1628 << " with size=" << dentry
.inode
.size
<< " bytes" << dendl
;
1630 r
= inject_linkage(parent_ino
, dname
, fragment
, dentry
);
1632 // This is the linkage for an ancestor directory
1633 InodeStore ancestor_dentry
;
1634 ancestor_dentry
.inode
.mode
= 0755 | S_IFDIR
;
1636 // Set nfiles to something non-zero, to fool any other code
1637 // that tries to ignore 'empty' directories. This won't be
1638 // accurate, but it should avoid functional issues.
1640 ancestor_dentry
.inode
.dirstat
.nfiles
= 1;
1641 ancestor_dentry
.inode
.dir_layout
.dl_dir_hash
=
1642 g_conf
->mds_default_dir_hash
;
1644 ancestor_dentry
.inode
.nlink
= 1;
1645 ancestor_dentry
.inode
.ino
= ino
;
1646 ancestor_dentry
.inode
.uid
= g_conf
->mds_root_ino_uid
;
1647 ancestor_dentry
.inode
.gid
= g_conf
->mds_root_ino_gid
;
1648 ancestor_dentry
.inode
.version
= 1;
1649 ancestor_dentry
.inode
.backtrace_version
= 1;
1650 r
= inject_linkage(parent_ino
, dname
, fragment
, ancestor_dentry
);
1658 if (!created_dirfrag
) {
1659 // If the parent dirfrag already existed, then stop traversing the
1660 // backtrace: assume that the other ancestors already exist too. This
1661 // is an assumption rather than a truth, but it's a convenient way
1662 // to avoid the risk of creating multiply-linked directories while
1663 // injecting data. If there are in fact missing ancestors, this
1664 // should be fixed up using a separate tool scanning the metadata
1668 // Proceed up the backtrace, creating parents
1676 int MetadataDriver::find_or_create_dirfrag(
1681 assert(created
!= NULL
);
1683 fnode_t existing_fnode
;
1686 uint64_t read_version
= 0;
1687 int r
= read_fnode(ino
, fragment
, &existing_fnode
, &read_version
);
1688 dout(10) << "read_version = " << read_version
<< dendl
;
1690 if (r
== -ENOENT
|| r
== -EINVAL
) {
1691 if (r
== -EINVAL
&& !force_corrupt
) {
1695 // Missing or corrupt fnode, create afresh
1696 bufferlist fnode_bl
;
1697 fnode_t blank_fnode
;
1698 blank_fnode
.version
= 1;
1699 // mark it as non-empty
1700 blank_fnode
.fragstat
.nfiles
= 1;
1701 blank_fnode
.accounted_fragstat
= blank_fnode
.fragstat
;
1702 blank_fnode
.damage_flags
|= (DAMAGE_STATS
| DAMAGE_RSTATS
);
1703 blank_fnode
.encode(fnode_bl
);
1706 librados::ObjectWriteOperation op
;
1709 assert(r
== -EINVAL
);
1710 // Case A: We must assert that the version isn't changed since we saw the object
1711 // was unreadable, to avoid the possibility of two data-scan processes
1712 // both creating the frag.
1713 op
.assert_version(read_version
);
1715 assert(r
== -ENOENT
);
1716 // Case B: The object didn't exist in read_fnode, so while creating it we must
1717 // use an exclusive create to correctly populate *creating with
1718 // whether we created it ourselves or someone beat us to it.
1722 object_t frag_oid
= InodeStore::get_object_name(ino
, fragment
, "");
1723 op
.omap_set_header(fnode_bl
);
1724 r
= metadata_io
.operate(frag_oid
.name
, &op
);
1725 if (r
== -EOVERFLOW
|| r
== -EEXIST
) {
1726 // Someone else wrote it (see case A above)
1727 dout(10) << "Dirfrag creation race: 0x" << std::hex
1728 << ino
<< " " << fragment
<< std::dec
<< dendl
;
1732 // We were unable to create or write it, error out
1733 derr
<< "Failed to create dirfrag 0x" << std::hex
1734 << ino
<< std::dec
<< ": " << cpp_strerror(r
) << dendl
;
1737 // Success: the dirfrag object now exists with a value header
1738 dout(10) << "Created dirfrag: 0x" << std::hex
1739 << ino
<< std::dec
<< dendl
;
1743 derr
<< "Unexpected error reading dirfrag 0x" << std::hex
1744 << ino
<< std::dec
<< " : " << cpp_strerror(r
) << dendl
;
1747 dout(20) << "Dirfrag already exists: 0x" << std::hex
1748 << ino
<< " " << fragment
<< std::dec
<< dendl
;
1754 int MetadataDriver::inject_linkage(
1755 inodeno_t dir_ino
, const std::string
&dname
,
1756 const frag_t fragment
, const InodeStore
&inode
)
1758 // We have no information about snapshots, so everything goes
1759 // in as CEPH_NOSNAP
1760 snapid_t snap
= CEPH_NOSNAP
;
1762 object_t frag_oid
= InodeStore::get_object_name(dir_ino
, fragment
, "");
1765 dentry_key_t
dn_key(snap
, dname
.c_str());
1768 bufferlist dentry_bl
;
1769 ::encode(snap
, dentry_bl
);
1770 ::encode('I', dentry_bl
);
1771 inode
.encode_bare(dentry_bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1774 std::map
<std::string
, bufferlist
> vals
;
1775 vals
[key
] = dentry_bl
;
1776 int r
= metadata_io
.omap_set(frag_oid
.name
, vals
);
1778 derr
<< "Error writing dentry 0x" << std::hex
1779 << dir_ino
<< std::dec
<< "/"
1780 << dname
<< ": " << cpp_strerror(r
) << dendl
;
1783 dout(20) << "Injected dentry 0x" << std::hex
1784 << dir_ino
<< "/" << dname
<< " pointing to 0x"
1785 << inode
.inode
.ino
<< std::dec
<< dendl
;
1791 int MetadataDriver::init(
1792 librados::Rados
&rados
, std::string
&metadata_pool_name
, const FSMap
*fsmap
,
1793 fs_cluster_id_t fscid
)
1795 if (metadata_pool_name
.empty()) {
1796 auto fs
= fsmap
->get_filesystem(fscid
);
1797 assert(fs
!= nullptr);
1798 int64_t const metadata_pool_id
= fs
->mds_map
.get_metadata_pool();
1800 dout(4) << "resolving metadata pool " << metadata_pool_id
<< dendl
;
1801 int r
= rados
.pool_reverse_lookup(metadata_pool_id
, &metadata_pool_name
);
1803 derr
<< "Pool " << metadata_pool_id
1804 << " identified in MDS map not found in RADOS!" << dendl
;
1807 dout(4) << "found metadata pool '" << metadata_pool_name
<< "'" << dendl
;
1809 dout(4) << "forcing metadata pool '" << metadata_pool_name
<< "'" << dendl
;
1811 return rados
.ioctx_create(metadata_pool_name
.c_str(), metadata_io
);
// Initialise the local-file output driver. Takes the same parameter list
// as MetadataDriver::init (RADOS handle, metadata pool name, FSMap and
// filesystem id).
// NOTE(review): the body is not visible in this listing -- presumably there
// is nothing to set up for local output; confirm against the original.
1814 int LocalFileDriver::init(
1815 librados::Rados
&rados
, std::string
&metadata_pool_name
, const FSMap
*fsmap
,
1816 fs_cluster_id_t fscid
)
1821 int LocalFileDriver::inject_data(
1822 const std::string
&file_path
,
1824 uint32_t chunk_size
,
1827 // Scrape the file contents out of the data pool and into the
1830 f
.open(file_path
.c_str(), std::fstream::out
| std::fstream::binary
);
1832 for (uint64_t offset
= 0; offset
< size
; offset
+= chunk_size
) {
1836 snprintf(buf
, sizeof(buf
),
1838 (unsigned long long)ino
,
1839 (unsigned long long)(offset
/ chunk_size
));
1840 std::string
oid(buf
);
1842 int r
= data_io
.read(oid
, bl
, chunk_size
, 0);
1844 if (r
<= 0 && r
!= -ENOENT
) {
1845 derr
<< "error reading data object '" << oid
<< "': "
1846 << cpp_strerror(r
) << dendl
;
1861 int LocalFileDriver::inject_with_backtrace(
1862 const inode_backtrace_t
&bt
,
1863 const InodeStore
&dentry
)
1865 std::string path_builder
= path
;
1867 // Iterate through backtrace creating directory parents
1868 std::vector
<inode_backpointer_t
>::const_reverse_iterator i
;
1869 for (i
= bt
.ancestors
.rbegin();
1870 i
!= bt
.ancestors
.rend(); ++i
) {
1872 const inode_backpointer_t
&backptr
= *i
;
1873 path_builder
+= "/";
1874 path_builder
+= backptr
.dname
;
1876 // Last entry is the filename itself
1877 bool is_file
= (i
+ 1 == bt
.ancestors
.rend());
1879 // FIXME: inject_data won't cope with interesting (i.e. striped)
1880 // layouts (need a librados-compatible Filer to read these)
1881 inject_data(path_builder
, dentry
.inode
.size
,
1882 dentry
.inode
.layout
.object_size
, bt
.ino
);
1884 int r
= mkdir(path_builder
.c_str(), 0755);
1885 if (r
!= 0 && r
!= -EPERM
) {
1886 derr
<< "error creating directory: '" << path_builder
<< "': "
1887 << cpp_strerror(r
) << dendl
;
1896 int LocalFileDriver::inject_lost_and_found(
1898 const InodeStore
&dentry
)
1900 std::string lf_path
= path
+ "/lost+found";
1901 int r
= mkdir(lf_path
.c_str(), 0755);
1902 if (r
!= 0 && r
!= -EPERM
) {
1903 derr
<< "error creating directory: '" << lf_path
<< "': "
1904 << cpp_strerror(r
) << dendl
;
1908 std::string file_path
= lf_path
+ "/" + lost_found_dname(ino
);
1909 return inject_data(file_path
, dentry
.inode
.size
,
1910 dentry
.inode
.layout
.object_size
, ino
);
1913 int LocalFileDriver::init_roots(int64_t data_pool_id
)
1915 // Ensure that the path exists and is a directory
1917 int r
= check_roots(&exists
);
1925 return ::mkdir(path
.c_str(), 0755);
1929 int LocalFileDriver::check_roots(bool *result
)
1931 // Check if the path exists and is a directory
1932 DIR *d
= ::opendir(path
.c_str());
1936 int r
= closedir(d
);
1938 // Weird, but maybe possible with e.g. stale FD on NFS mount?
1948 void MetadataTool::build_file_dentry(
1949 inodeno_t ino
, uint64_t file_size
, time_t file_mtime
,
1950 const file_layout_t
&layout
, InodeStore
*out
)
1952 assert(out
!= NULL
);
1954 out
->inode
.mode
= 0500 | S_IFREG
;
1955 out
->inode
.size
= file_size
;
1956 out
->inode
.max_size_ever
= file_size
;
1957 out
->inode
.mtime
.tv
.tv_sec
= file_mtime
;
1958 out
->inode
.atime
.tv
.tv_sec
= file_mtime
;
1959 out
->inode
.ctime
.tv
.tv_sec
= file_mtime
;
1961 out
->inode
.layout
= layout
;
1963 out
->inode
.truncate_seq
= 1;
1964 out
->inode
.truncate_size
= -1ull;
1966 out
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
1968 out
->inode
.nlink
= 1;
1969 out
->inode
.ino
= ino
;
1970 out
->inode
.version
= 1;
1971 out
->inode
.backtrace_version
= 1;
1972 out
->inode
.uid
= g_conf
->mds_root_ino_uid
;
1973 out
->inode
.gid
= g_conf
->mds_root_ino_gid
;
1976 void MetadataTool::build_dir_dentry(
1977 inodeno_t ino
, const frag_info_t
&fragstat
,
1978 const file_layout_t
&layout
, InodeStore
*out
)
1980 assert(out
!= NULL
);
1982 out
->inode
.mode
= 0755 | S_IFDIR
;
1983 out
->inode
.dirstat
= fragstat
;
1984 out
->inode
.mtime
.tv
.tv_sec
= fragstat
.mtime
;
1985 out
->inode
.atime
.tv
.tv_sec
= fragstat
.mtime
;
1986 out
->inode
.ctime
.tv
.tv_sec
= fragstat
.mtime
;
1988 out
->inode
.layout
= layout
;
1989 out
->inode
.dir_layout
.dl_dir_hash
= g_conf
->mds_default_dir_hash
;
1991 out
->inode
.truncate_seq
= 1;
1992 out
->inode
.truncate_size
= -1ull;
1994 out
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
1996 out
->inode
.nlink
= 1;
1997 out
->inode
.ino
= ino
;
1998 out
->inode
.version
= 1;
1999 out
->inode
.backtrace_version
= 1;
2000 out
->inode
.uid
= g_conf
->mds_root_ino_uid
;
2001 out
->inode
.gid
= g_conf
->mds_root_ino_gid
;