1 // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * ceph - scalable distributed file system
6 * copyright (c) 2014 john spray <john.spray@inktank.com>
8 * this is free software; you can redistribute it and/or
9 * modify it under the terms of the gnu lesser general public
10 * license version 2.1, as published by the free software
11 * foundation. see file copying.
17 #include "common/ceph_argparse.h"
18 #include "common/errno.h"
19 #include "osdc/Journaler.h"
20 #include "mds/mdstypes.h"
21 #include "mds/LogEvent.h"
22 #include "mds/InoTable.h"
24 #include "mds/events/ENoOp.h"
25 #include "mds/events/EUpdate.h"
27 #include "JournalScanner.h"
28 #include "EventOutput.h"
32 #include "JournalTool.h"
35 #define dout_context g_ceph_context
36 #define dout_subsys ceph_subsys_mds
38 #define dout_prefix *_dout << __func__ << ": "
42 void JournalTool::usage()
44 std::cout
<< "Usage: \n"
45 << " cephfs-journal-tool [options] journal <command>\n"
48 << " import <path> [--force]\n"
50 << " reset [--force]\n"
51 << " cephfs-journal-tool [options] header <get|set> <field> <value>\n"
52 << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n"
53 << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
55 << " --range=<start>..<end>\n"
56 << " --path=<substring>\n"
57 << " --inode=<integer>\n"
58 << " --type=<UPDATE|OPEN|SESSION...><\n"
59 << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
60 << " --client=<session id integer>\n"
61 << " <effect>: [get|recover_dentries|splice]\n"
62 << " <output>: [summary|list|binary|json] [--path <path>]\n"
64 << "General options:\n"
65 << " --rank=filesystem:{mds-rank|all} journal rank or \"all\" ranks (mandatory)\n"
66 << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n"
67 << " this journal is used to queue for purge operation,\n"
68 << " default is mdlog, and only mdlog support event mode)\n"
70 << "Special options\n"
71 << " --alternate-pool <name> Alternative metadata pool to target\n"
72 << " when using recover_dentries.\n";
74 generic_client_usage();
79 * Handle arguments and hand off to journal/header/event mode
81 int JournalTool::main(std::vector
<const char*> &argv
)
85 dout(10) << "JournalTool::main " << dendl
;
89 cerr
<< "missing positional argument" << std::endl
;
93 std::vector
<const char*>::iterator arg
= argv
.begin();
96 if (!ceph_argparse_witharg(argv
, arg
, &rank_str
, "--rank", (char*)NULL
)) {
97 derr
<< "missing mandatory \"--rank\" argument" << dendl
;
101 if (!ceph_argparse_witharg(argv
, arg
, &type
, "--journal", (char*)NULL
)) {
106 r
= validate_type(type
);
108 derr
<< "journal type is not correct." << dendl
;
112 r
= role_selector
.parse(*fsmap
, rank_str
, false);
114 derr
<< "Couldn't determine MDS rank." << dendl
;
119 if (arg
== argv
.end()) {
120 derr
<< "Missing mode [journal|header|event]" << dendl
;
123 mode
= std::string(*arg
);
124 arg
= argv
.erase(arg
);
128 r
= rados
.init_with_context(g_ceph_context
);
130 derr
<< "RADOS unavailable, cannot scan filesystem journal" << dendl
;
134 dout(4) << "JournalTool: connecting to RADOS..." << dendl
;
137 derr
<< "couldn't connect to cluster: " << cpp_strerror(r
) << dendl
;
141 auto fs
= fsmap
->get_filesystem(role_selector
.get_ns());
142 ceph_assert(fs
!= nullptr);
143 int64_t const pool_id
= fs
->mds_map
.get_metadata_pool();
144 dout(4) << "JournalTool: resolving pool " << pool_id
<< dendl
;
145 std::string pool_name
;
146 r
= rados
.pool_reverse_lookup(pool_id
, &pool_name
);
148 derr
<< "Pool " << pool_id
<< " named in MDS map not found in RADOS!" << dendl
;
152 dout(4) << "JournalTool: creating IoCtx.." << dendl
;
153 r
= rados
.ioctx_create(pool_name
.c_str(), input
);
159 // journal and header are general journal mode
160 // event mode is only specific for mdlog
161 auto roles
= role_selector
.get_roles();
162 if (roles
.size() > 1) {
163 const std::string
&command
= argv
[0];
164 bool allowed
= can_execute_for_all_ranks(mode
, command
);
166 derr
<< "operation not allowed for all ranks" << dendl
;
172 for (auto role
: roles
) {
174 std::vector
<const char *> rank_argv(argv
);
175 dout(4) << "Executing for rank " << rank
<< dendl
;
176 if (mode
== std::string("journal")) {
177 r
= main_journal(rank_argv
);
178 } else if (mode
== std::string("header")) {
179 r
= main_header(rank_argv
);
180 } else if (mode
== std::string("event")) {
181 r
= main_event(rank_argv
);
183 cerr
<< "Bad command '" << mode
<< "'" << std::endl
;
195 int JournalTool::validate_type(const std::string
&type
)
197 if (type
== "mdlog" || type
== "purge_queue") {
203 std::string
JournalTool::gen_dump_file_path(const std::string
&prefix
) {
208 return prefix
+ "." + std::to_string(rank
);
211 bool JournalTool::can_execute_for_all_ranks(const std::string
&mode
,
212 const std::string
&command
) {
213 if (mode
== "journal" && command
== "import") {
221 * Handle arguments for 'journal' mode
223 * This is for operations that act on the journal as a whole.
225 int JournalTool::main_journal(std::vector
<const char*> &argv
)
228 derr
<< "Missing journal command, please see help" << dendl
;
232 std::string command
= argv
[0];
233 if (command
== "inspect") {
234 return journal_inspect();
235 } else if (command
== "export" || command
== "import") {
237 if (argv
.size() >= 2) {
238 std::string
const path
= argv
[1];
239 if (argv
.size() == 3) {
240 if (std::string(argv
[2]) == "--force") {
243 std::cerr
<< "Unknown argument " << argv
[1] << std::endl
;
247 return journal_export(path
, command
== "import", force
);
249 derr
<< "Missing path" << dendl
;
252 } else if (command
== "reset") {
254 if (argv
.size() == 2) {
255 if (std::string(argv
[1]) == "--force") {
258 std::cerr
<< "Unknown argument " << argv
[1] << std::endl
;
261 } else if (argv
.size() > 2) {
262 std::cerr
<< "Too many arguments!" << std::endl
;
265 return journal_reset(force
);
267 derr
<< "Bad journal command '" << command
<< "'" << dendl
;
274 * Parse arguments and execute for 'header' mode
276 * This is for operations that act on the header only.
278 int JournalTool::main_header(std::vector
<const char*> &argv
)
280 JournalFilter
filter(type
);
281 JournalScanner
js(input
, rank
, type
, filter
);
282 int r
= js
.scan(false);
284 std::cerr
<< "Unable to scan journal" << std::endl
;
288 if (!js
.header_present
) {
289 std::cerr
<< "Header object not found!" << std::endl
;
291 } else if (!js
.header_valid
&& js
.header
== NULL
) {
292 // Can't do a read or a single-field write without a copy of the original
293 derr
<< "Header could not be read!" << dendl
;
296 ceph_assert(js
.header
!= NULL
);
300 derr
<< "Missing header command, must be [get|set]" << dendl
;
303 std::vector
<const char *>::iterator arg
= argv
.begin();
304 std::string
const command
= *arg
;
305 arg
= argv
.erase(arg
);
307 if (command
== std::string("get")) {
308 // Write JSON journal dump to stdout
309 JSONFormatter
jf(true);
310 js
.header
->dump(&jf
);
312 std::cout
<< std::endl
;
313 } else if (command
== std::string("set")) {
314 // Need two more args <key> <val>
315 if (argv
.size() != 2) {
316 derr
<< "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl
;
320 std::string
const field_name
= *arg
;
321 arg
= argv
.erase(arg
);
323 std::string
const value_str
= *arg
;
324 arg
= argv
.erase(arg
);
325 ceph_assert(argv
.empty());
327 std::string parse_err
;
328 uint64_t new_val
= strict_strtoll(value_str
.c_str(), 0, &parse_err
);
329 if (!parse_err
.empty()) {
330 derr
<< "Invalid value '" << value_str
<< "': " << parse_err
<< dendl
;
334 uint64_t *field
= NULL
;
335 if (field_name
== "trimmed_pos") {
336 field
= &(js
.header
->trimmed_pos
);
337 } else if (field_name
== "expire_pos") {
338 field
= &(js
.header
->expire_pos
);
339 } else if (field_name
== "write_pos") {
340 field
= &(js
.header
->write_pos
);
341 } else if (field_name
== "pool_id") {
342 field
= (uint64_t*)(&(js
.header
->layout
.pool_id
));
344 derr
<< "Invalid field '" << field_name
<< "'" << dendl
;
348 std::cout
<< "Updating " << field_name
<< std::hex
<< " 0x" << *field
<< " -> 0x" << new_val
<< std::dec
<< std::endl
;
351 dout(4) << "Writing object..." << dendl
;
352 bufferlist header_bl
;
353 encode(*(js
.header
), header_bl
);
354 output
.write_full(js
.obj_name(0), header_bl
);
355 dout(4) << "Write complete." << dendl
;
356 std::cout
<< "Successfully updated header." << std::endl
;
358 derr
<< "Bad header command '" << command
<< "'" << dendl
;
367 * Parse arguments and execute for 'event' mode
369 * This is for operations that act on LogEvents within the log
371 int JournalTool::main_event(std::vector
<const char*> &argv
)
376 derr
<< "Missing event command, please see help" << dendl
;
380 std::vector
<const char*>::iterator arg
= argv
.begin();
381 bool dry_run
= false;
383 std::string command
= *(arg
++);
384 if (command
!= "get" && command
!= "splice" && command
!= "recover_dentries") {
385 derr
<< "Unknown argument '" << command
<< "'" << dendl
;
389 if (command
== "recover_dentries") {
390 if (type
!= "mdlog") {
391 derr
<< "journaler for " << type
<< " can't do \"recover_dentries\"." << dendl
;
394 if (arg
!= argv
.end() && ceph_argparse_flag(argv
, arg
, "--dry_run", (char*)NULL
)) {
400 if (arg
== argv
.end()) {
401 derr
<< "Incomplete command line" << dendl
;
405 // Parse filter options
406 // ====================
407 JournalFilter
filter(type
);
408 r
= filter
.parse_args(argv
, arg
);
413 // Parse output options
414 // ====================
415 if (arg
== argv
.end()) {
416 cerr
<< "Missing output command" << std::endl
;
419 std::string output_style
= *(arg
++);
420 if (output_style
!= "binary" && output_style
!= "json" &&
421 output_style
!= "summary" && output_style
!= "list") {
422 cerr
<< "Unknown argument: '" << output_style
<< "'" << std::endl
;
426 std::string output_path
= "dump";
427 while(arg
!= argv
.end()) {
429 if (ceph_argparse_witharg(argv
, arg
, &arg_str
, "--path", (char*)NULL
)) {
430 output_path
= arg_str
;
431 } else if (ceph_argparse_witharg(argv
, arg
, &arg_str
, "--alternate-pool",
433 dout(1) << "Using alternate pool " << arg_str
<< dendl
;
434 int r
= rados
.ioctx_create(arg_str
.c_str(), output
);
438 cerr
<< "Unknown argument: '" << *arg
<< "'" << std::endl
;
443 const std::string dump_path
= gen_dump_file_path(output_path
);
447 JournalScanner
js(input
, rank
, type
, filter
);
448 if (command
== "get") {
451 derr
<< "Failed to scan journal (" << cpp_strerror(r
) << ")" << dendl
;
454 } else if (command
== "recover_dentries") {
457 derr
<< "Failed to scan journal (" << cpp_strerror(r
) << ")" << dendl
;
462 * Iterate over log entries, attempting to scavenge from each one
464 std::set
<inodeno_t
> consumed_inos
;
465 for (JournalScanner::EventMap::iterator i
= js
.events
.begin();
466 i
!= js
.events
.end(); ++i
) {
467 auto& le
= i
->second
.log_event
;
468 EMetaBlob
const *mb
= le
->get_metablob();
470 int scav_r
= recover_dentries(*mb
, dry_run
, &consumed_inos
);
472 dout(1) << "Error processing event 0x" << std::hex
<< i
->first
<< std::dec
473 << ": " << cpp_strerror(scav_r
) << ", continuing..." << dendl
;
477 // Our goal is to read all we can, so don't stop on errors, but
478 // do record them for possible later output
479 js
.errors
.insert(std::make_pair(i
->first
,
480 JournalScanner::EventError(scav_r
, cpp_strerror(r
))));
486 * Update InoTable to reflect any inode numbers consumed during scavenge
488 dout(4) << "consumed " << consumed_inos
.size() << " inodes" << dendl
;
489 if (consumed_inos
.size() && !dry_run
) {
490 int consume_r
= consume_inos(consumed_inos
);
492 dout(1) << "Error updating InoTable for " << consumed_inos
.size()
493 << " consume inos: " << cpp_strerror(consume_r
) << dendl
;
500 // Remove consumed dentries from lost+found.
501 if (other_pool
&& !dry_run
) {
502 std::set
<std::string
> found
;
504 for (auto i
: consumed_inos
) {
507 snprintf(s
, sizeof(s
), "%llx_head", (unsigned long long) i
);
508 dout(20) << "removing " << s
<< dendl
;
509 found
.insert(std::string(s
));
513 frag_oid
= InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND
,
515 output
.omap_rm_keys(frag_oid
.name
, found
);
517 } else if (command
== "splice") {
520 derr
<< "Failed to scan journal (" << cpp_strerror(r
) << ")" << dendl
;
525 if (filter
.get_range(start
, end
)) {
526 // Special case for range filter: erase a numeric range in the log
527 uint64_t range
= end
- start
;
528 int r
= erase_region(js
, start
, range
);
530 derr
<< "Failed to erase region 0x" << std::hex
<< start
<< "~0x" << range
<< std::dec
531 << ": " << cpp_strerror(r
) << dendl
;
535 // General case: erase a collection of individual entries in the log
536 for (JournalScanner::EventMap::iterator i
= js
.events
.begin(); i
!= js
.events
.end(); ++i
) {
537 dout(4) << "Erasing offset 0x" << std::hex
<< i
->first
<< std::dec
<< dendl
;
539 int r
= erase_region(js
, i
->first
, i
->second
.raw_size
);
541 derr
<< "Failed to erase event 0x" << std::hex
<< i
->first
<< std::dec
542 << ": " << cpp_strerror(r
) << dendl
;
550 cerr
<< "Unknown argument '" << command
<< "'" << std::endl
;
556 EventOutput
output(js
, dump_path
);
557 int output_result
= 0;
558 if (output_style
== "binary") {
559 output_result
= output
.binary();
560 } else if (output_style
== "json") {
561 output_result
= output
.json();
562 } else if (output_style
== "summary") {
564 } else if (output_style
== "list") {
567 std::cerr
<< "Bad output command '" << output_style
<< "'" << std::endl
;
571 if (output_result
!= 0) {
572 std::cerr
<< "Error writing output: " << cpp_strerror(output_result
) << std::endl
;
575 return output_result
;
579 * Provide the user with information about the condition of the journal,
580 * especially indicating what range of log events is available and where
581 * any gaps or corruptions in the journal are.
583 int JournalTool::journal_inspect()
587 JournalFilter
filter(type
);
588 JournalScanner
js(input
, rank
, type
, filter
);
591 std::cerr
<< "Failed to scan journal (" << cpp_strerror(r
) << ")" << std::endl
;
595 js
.report(std::cout
);
602 * Attempt to export a binary dump of the journal.
604 * This is allowed to fail if the header is malformed or there are
605 * objects inaccessible, in which case the user would have to fall
606 * back to manually listing RADOS objects and extracting them, which
607 * they can do with the ``rados`` CLI.
609 int JournalTool::journal_export(std::string
const &path
, bool import
, bool force
)
612 JournalScanner
js(input
, rank
, type
);
616 * If doing an export, first check that the header is valid and
617 * no objects are missing before trying to dump
621 derr
<< "Unable to scan journal, assuming badly damaged" << dendl
;
624 if (!js
.is_readable()) {
625 derr
<< "Journal not readable, attempt object-by-object dump with `rados`" << dendl
;
631 * Assuming we can cleanly read the journal data, dump it out to a file
635 r
= dumper
.init(mds_role_t(role_selector
.get_ns(), rank
), type
);
637 derr
<< "dumper::init failed: " << cpp_strerror(r
) << dendl
;
641 r
= dumper
.undump(path
.c_str(), force
);
643 const std::string ex_path
= gen_dump_file_path(path
);
644 r
= dumper
.dump(ex_path
.c_str());
653 * Truncate journal and insert EResetJournal
655 int JournalTool::journal_reset(bool hard
)
659 r
= resetter
.init(mds_role_t(role_selector
.get_ns(), rank
), type
, hard
);
661 derr
<< "resetter::init failed: " << cpp_strerror(r
) << dendl
;
666 r
= resetter
.reset_hard();
668 r
= resetter
.reset();
676 * Selective offline replay which only reads out dentries and writes
677 * them to the backing store iff their version is > what is currently
678 * in the backing store.
680 * In order to write dentries to the backing store, we may create the
681 * required enclosing dirfrag objects.
683 * Test this by running scavenge on an unflushed journal, then nuking
684 * it offline, then starting an MDS and seeing that the dentries are
687 * @param metablob an EMetaBlob retrieved from the journal
688 * @param dry_run if true, do no writes to RADOS
689 * @param consumed_inos output, populated with any inos inserted
690 * @returns 0 on success, else negative error code
692 int JournalTool::recover_dentries(
693 EMetaBlob
const &metablob
,
695 std::set
<inodeno_t
> *consumed_inos
)
697 ceph_assert(consumed_inos
!= NULL
);
701 // Replay fullbits (dentry+inode)
702 for (const auto& frag
: metablob
.lump_order
) {
703 EMetaBlob::dirlump
const &lump
= metablob
.lump_map
.find(frag
)->second
;
705 object_t frag_oid
= InodeStore::get_object_name(frag
.ino
, frag
.frag
, "");
707 dout(4) << "inspecting lump " << frag_oid
.name
<< dendl
;
710 // We will record old fnode version for use in hard link handling
711 // If we don't read an old fnode, take version as zero and write in
712 // all hardlinks we find.
713 version_t old_fnode_version
= 0;
715 // Update fnode in omap header of dirfrag object
716 bool write_fnode
= false;
717 bufferlist old_fnode_bl
;
718 r
= input
.omap_get_header(frag_oid
.name
, &old_fnode_bl
);
720 // Creating dirfrag from scratch
721 dout(4) << "failed to read OMAP header from directory fragment "
722 << frag_oid
.name
<< " " << cpp_strerror(r
) << dendl
;
724 // Note: creating the dirfrag *without* a backtrace, relying on
725 // MDS to regenerate backtraces on read or in FSCK
727 // Conditionally update existing omap header
729 auto old_fnode_iter
= old_fnode_bl
.cbegin();
731 old_fnode
.decode(old_fnode_iter
);
732 dout(4) << "frag " << frag_oid
.name
<< " fnode old v" <<
733 old_fnode
.version
<< " vs new v" << lump
.fnode
->version
<< dendl
;
734 old_fnode_version
= old_fnode
.version
;
735 write_fnode
= old_fnode_version
< lump
.fnode
->version
;
736 } catch (const buffer::error
&err
) {
737 dout(1) << "frag " << frag_oid
.name
738 << " is corrupt, overwriting" << dendl
;
743 dout(4) << "failed to read OMAP header from directory fragment "
744 << frag_oid
.name
<< " " << cpp_strerror(r
) << dendl
;
748 if ((other_pool
|| write_fnode
) && !dry_run
) {
749 dout(4) << "writing fnode to omap header" << dendl
;
751 lump
.fnode
->encode(fnode_bl
);
752 if (!other_pool
|| frag
.ino
>= MDS_INO_SYSTEM_BASE
) {
753 r
= output
.omap_set_header(frag_oid
.name
, fnode_bl
);
756 derr
<< "Failed to write fnode for frag object "
757 << frag_oid
.name
<< dendl
;
762 std::set
<std::string
> read_keys
;
764 // Compose list of potentially-existing dentries we would like to fetch
765 for (const auto& fb
: lump
.get_dfull()) {
766 // Get a key like "foobar_head"
768 dentry_key_t
dn_key(fb
.dnlast
, fb
.dn
.c_str());
770 read_keys
.insert(key
);
773 for(const auto& rb
: lump
.get_dremote()) {
774 // Get a key like "foobar_head"
776 dentry_key_t
dn_key(rb
.dnlast
, rb
.dn
.c_str());
778 read_keys
.insert(key
);
781 for (const auto& nb
: lump
.get_dnull()) {
782 // Get a key like "foobar_head"
784 dentry_key_t
dn_key(nb
.dnlast
, nb
.dn
.c_str());
786 read_keys
.insert(key
);
789 // Perform bulk read of existing dentries
790 std::map
<std::string
, bufferlist
> read_vals
;
791 r
= input
.omap_get_vals_by_keys(frag_oid
.name
, read_keys
, &read_vals
);
792 if (r
== -ENOENT
&& other_pool
) {
793 r
= output
.omap_get_vals_by_keys(frag_oid
.name
, read_keys
, &read_vals
);
796 derr
<< "unexpected error reading fragment object "
797 << frag_oid
.name
<< ": " << cpp_strerror(r
) << dendl
;
801 // Compose list of dentries we will write back
802 std::map
<std::string
, bufferlist
> write_vals
;
803 for (const auto& fb
: lump
.get_dfull()) {
804 // Get a key like "foobar_head"
806 dentry_key_t
dn_key(fb
.dnlast
, fb
.dn
.c_str());
809 dout(4) << "inspecting fullbit " << frag_oid
.name
<< "/" << fb
.dn
811 bool write_dentry
= false;
812 if (read_vals
.find(key
) == read_vals
.end()) {
813 dout(4) << "dentry did not already exist, will create" << dendl
;
816 dout(4) << "dentry " << key
<< " existed already" << dendl
;
817 dout(4) << "dentry exists, checking versions..." << dendl
;
818 bufferlist
&old_dentry
= read_vals
[key
];
819 // Decode dentry+inode
820 auto q
= old_dentry
.cbegin();
825 decode(dentry_type
, q
);
827 if (dentry_type
== 'L' || dentry_type
== 'l') {
828 // leave write_dentry false, we have no version to
829 // compare with in a hardlink, so it's not safe to
830 // squash over it with what's in this fullbit
831 dout(10) << "Existing remote inode in slot to be (maybe) written "
832 << "by a full inode from the journal dn '" << fb
.dn
.c_str()
833 << "' with lump fnode version " << lump
.fnode
->version
834 << "vs existing fnode version " << old_fnode_version
<< dendl
;
835 write_dentry
= old_fnode_version
< lump
.fnode
->version
;
836 } else if (dentry_type
== 'I' || dentry_type
== 'i') {
837 // Read out inode version to compare with backing store
839 if (dentry_type
== 'i') {
840 mempool::mds_co::string alternate_name
;
844 decode(alternate_name
, q
);
848 inode
.decode_bare(q
);
850 dout(4) << "decoded embedded inode version "
851 << inode
.inode
->version
<< " vs fullbit version "
852 << fb
.inode
->version
<< dendl
;
853 if (inode
.inode
->version
< fb
.inode
->version
) {
857 dout(4) << "corrupt dentry in backing store, overwriting from "
863 if ((other_pool
|| write_dentry
) && !dry_run
) {
864 dout(4) << "writing I dentry " << key
<< " into frag "
865 << frag_oid
.name
<< dendl
;
867 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
868 bufferlist dentry_bl
;
869 encode(fb
.dnfirst
, dentry_bl
);
870 encode('I', dentry_bl
);
871 encode_fullbit_as_inode(fb
, true, &dentry_bl
);
873 // Record for writing to RADOS
874 write_vals
[key
] = dentry_bl
;
875 consumed_inos
->insert(fb
.inode
->ino
);
879 for(const auto& rb
: lump
.get_dremote()) {
880 // Get a key like "foobar_head"
882 dentry_key_t
dn_key(rb
.dnlast
, rb
.dn
.c_str());
885 dout(4) << "inspecting remotebit " << frag_oid
.name
<< "/" << rb
.dn
887 bool write_dentry
= false;
888 if (read_vals
.find(key
) == read_vals
.end()) {
889 dout(4) << "dentry did not already exist, will create" << dendl
;
892 dout(4) << "dentry " << key
<< " existed already" << dendl
;
893 dout(4) << "dentry exists, checking versions..." << dendl
;
894 bufferlist
&old_dentry
= read_vals
[key
];
895 // Decode dentry+inode
896 auto q
= old_dentry
.cbegin();
901 decode(dentry_type
, q
);
903 if (dentry_type
== 'L' || dentry_type
== 'l') {
904 dout(10) << "Existing hardlink inode in slot to be (maybe) written "
905 << "by a remote inode from the journal dn '" << rb
.dn
.c_str()
906 << "' with lump fnode version " << lump
.fnode
->version
907 << "vs existing fnode version " << old_fnode_version
<< dendl
;
908 write_dentry
= old_fnode_version
< lump
.fnode
->version
;
909 } else if (dentry_type
== 'I' || dentry_type
== 'i') {
910 dout(10) << "Existing full inode in slot to be (maybe) written "
911 << "by a remote inode from the journal dn '" << rb
.dn
.c_str()
912 << "' with lump fnode version " << lump
.fnode
->version
913 << "vs existing fnode version " << old_fnode_version
<< dendl
;
914 write_dentry
= old_fnode_version
< lump
.fnode
->version
;
916 dout(4) << "corrupt dentry in backing store, overwriting from "
922 if ((other_pool
|| write_dentry
) && !dry_run
) {
923 dout(4) << "writing L dentry " << key
<< " into frag "
924 << frag_oid
.name
<< dendl
;
926 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
927 bufferlist dentry_bl
;
928 encode(rb
.dnfirst
, dentry_bl
);
929 encode('L', dentry_bl
);
930 encode(rb
.ino
, dentry_bl
);
931 encode(rb
.d_type
, dentry_bl
);
933 // Record for writing to RADOS
934 write_vals
[key
] = dentry_bl
;
935 consumed_inos
->insert(rb
.ino
);
939 std::set
<std::string
> null_vals
;
940 for (const auto& nb
: lump
.get_dnull()) {
942 dentry_key_t
dn_key(nb
.dnlast
, nb
.dn
.c_str());
945 dout(4) << "inspecting nullbit " << frag_oid
.name
<< "/" << nb
.dn
948 auto it
= read_vals
.find(key
);
949 if (it
!= read_vals
.end()) {
950 dout(4) << "dentry exists, will remove" << dendl
;
952 auto q
= it
->second
.cbegin();
956 decode(dentry_type
, q
);
958 bool remove_dentry
= false;
959 if (dentry_type
== 'L' || dentry_type
== 'l') {
960 dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
961 << "by null journal dn '" << nb
.dn
.c_str()
962 << "' with lump fnode version " << lump
.fnode
->version
963 << "vs existing fnode version " << old_fnode_version
<< dendl
;
964 remove_dentry
= old_fnode_version
< lump
.fnode
->version
;
965 } else if (dentry_type
== 'I' || dentry_type
== 'i') {
966 dout(10) << "Existing full inode in slot to be (maybe) removed "
967 << "by null journal dn '" << nb
.dn
.c_str()
968 << "' with lump fnode version " << lump
.fnode
->version
969 << "vs existing fnode version " << old_fnode_version
<< dendl
;
970 remove_dentry
= old_fnode_version
< lump
.fnode
->version
;
972 dout(4) << "corrupt dentry in backing store, will remove" << dendl
;
973 remove_dentry
= true;
977 null_vals
.insert(key
);
981 // Write back any new/changed dentries
982 if (!write_vals
.empty()) {
983 r
= output
.omap_set(frag_oid
.name
, write_vals
);
985 derr
<< "error writing dentries to " << frag_oid
.name
986 << ": " << cpp_strerror(r
) << dendl
;
991 // remove any null dentries
992 if (!null_vals
.empty()) {
993 r
= output
.omap_rm_keys(frag_oid
.name
, null_vals
);
995 derr
<< "error removing dentries from " << frag_oid
.name
996 << ": " << cpp_strerror(r
) << dendl
;
1002 /* Now that we've looked at the dirlumps, we finally pay attention to
1003 * the roots (i.e. inodes without ancestry). This is necessary in order
1004 * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
1005 * important because clients use them to infer completeness
1008 for (const auto& fb
: metablob
.roots
) {
1009 inodeno_t ino
= fb
.inode
->ino
;
1010 dout(4) << "updating root 0x" << std::hex
<< ino
<< std::dec
<< dendl
;
1012 object_t root_oid
= InodeStore::get_object_name(ino
, frag_t(), ".inode");
1013 dout(4) << "object id " << root_oid
.name
<< dendl
;
1015 bool write_root_ino
= false;
1016 bufferlist old_root_ino_bl
;
1017 r
= input
.read(root_oid
.name
, old_root_ino_bl
, (1<<22), 0);
1019 dout(4) << "root does not exist, will create" << dendl
;
1020 write_root_ino
= true;
1021 } else if (r
>= 0) {
1023 InodeStore old_inode
;
1024 dout(4) << "root exists, will modify (" << old_root_ino_bl
.length()
1026 auto inode_bl_iter
= old_root_ino_bl
.cbegin();
1028 decode(magic
, inode_bl_iter
);
1029 if (magic
== CEPH_FS_ONDISK_MAGIC
) {
1030 dout(4) << "magic ok" << dendl
;
1031 old_inode
.decode(inode_bl_iter
);
1033 if (old_inode
.inode
->version
< fb
.inode
->version
) {
1034 write_root_ino
= true;
1037 dout(4) << "magic bad: '" << magic
<< "'" << dendl
;
1038 write_root_ino
= true;
1041 derr
<< "error reading root inode object " << root_oid
.name
1042 << ": " << cpp_strerror(r
) << dendl
;
1046 if (write_root_ino
&& !dry_run
) {
1047 dout(4) << "writing root ino " << root_oid
.name
1048 << " version " << fb
.inode
->version
<< dendl
;
1050 // Compose: root ino format is magic,InodeStore(bare=false)
1051 bufferlist new_root_ino_bl
;
1052 encode(std::string(CEPH_FS_ONDISK_MAGIC
), new_root_ino_bl
);
1053 encode_fullbit_as_inode(fb
, false, &new_root_ino_bl
);
1056 r
= output
.write_full(root_oid
.name
, new_root_ino_bl
);
1058 derr
<< "error writing inode object " << root_oid
.name
1059 << ": " << cpp_strerror(r
) << dendl
;
1070 * Erase a region of the log by overwriting it with ENoOp
1073 int JournalTool::erase_region(JournalScanner
const &js
, uint64_t const pos
, uint64_t const length
)
1075 // To erase this region, we use our preamble, the encoding overhead
1076 // of an ENoOp, and our trailing start ptr. Calculate how much padding
1077 // is needed inside the ENoOp to make up the difference.
1079 if (type
== "mdlog") {
1081 enoop
.encode_with_header(tmp
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1082 } else if (type
== "purge_queue") {
1087 dout(4) << "erase_region " << pos
<< " len=" << length
<< dendl
;
1089 // FIXME: get the preamble/postamble length via JournalStream
1090 int32_t padding
= length
- tmp
.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
1091 dout(4) << "erase_region padding=0x" << std::hex
<< padding
<< std::dec
<< dendl
;
1094 derr
<< "Erase region " << length
<< " too short" << dendl
;
1099 if (type
== "mdlog") {
1100 // Serialize an ENoOp with the correct amount of padding
1101 ENoOp
enoop(padding
);
1102 enoop
.encode_with_header(entry
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1103 } else if (type
== "purge_queue") {
1105 pi
.pad_size
= padding
;
1108 JournalStream
stream(JOURNAL_FORMAT_RESILIENT
);
1109 // Serialize region of log stream
1110 bufferlist log_data
;
1111 stream
.write(entry
, &log_data
, pos
);
1113 dout(4) << "erase_region data length " << log_data
.length() << dendl
;
1114 ceph_assert(log_data
.length() == length
);
1116 // Write log stream region to RADOS
1117 // FIXME: get object size somewhere common to scan_events
1118 uint32_t object_size
= g_conf()->mds_log_segment_size
;
1119 if (object_size
== 0) {
1120 // Default layout object size
1121 object_size
= file_layout_t::get_default().object_size
;
1124 uint64_t write_offset
= pos
;
1125 uint64_t obj_offset
= (pos
/ object_size
);
1127 while(log_data
.length()) {
1128 std::string
const oid
= js
.obj_name(obj_offset
);
1129 uint32_t offset_in_obj
= write_offset
% object_size
;
1130 uint32_t write_len
= min(log_data
.length(), object_size
- offset_in_obj
);
1132 r
= output
.write(oid
, log_data
, write_len
, offset_in_obj
);
1136 dout(4) << "Wrote " << write_len
<< " bytes to " << oid
<< dendl
;
1140 log_data
.splice(0, write_len
);
1141 write_offset
+= write_len
;
1149 * Given an EMetaBlob::fullbit containing an inode, write out
1150 * the encoded inode in the format used by InodeStore (i.e. the
1151 * backing store format)
1153 * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
1154 * on an offline InodeStore instance. It's way simpler, because we are just
1155 * uncritically hauling the data between structs.
1157 * @param fb a fullbit extracted from a journal entry
1158 * @param bare if true, leave out [EN|DE]CODE_START decoration
1159 * @param out_bl output, write serialized inode to this bufferlist
1161 void JournalTool::encode_fullbit_as_inode(
1162 const EMetaBlob::fullbit
&fb
,
1166 ceph_assert(out_bl
!= NULL
);
1168 // Compose InodeStore
1169 InodeStore new_inode
;
1170 new_inode
.inode
= fb
.inode
;
1171 new_inode
.xattrs
= fb
.xattrs
;
1172 new_inode
.dirfragtree
= fb
.dirfragtree
;
1173 new_inode
.snap_blob
= fb
.snapbl
;
1174 new_inode
.symlink
= fb
.symlink
;
1175 new_inode
.old_inodes
= fb
.old_inodes
;
1177 // Serialize InodeStore
1179 new_inode
.encode_bare(*out_bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1181 new_inode
.encode(*out_bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1186 * Given a list of inode numbers known to be in use by
1187 * inodes in the backing store, ensure that none of these
1188 * numbers are listed as free in the InoTables in the
1191 * Used after injecting inodes into the backing store, to
1192 * ensure that the same inode numbers are not subsequently
1193 * used for new files during ordinary operation.
1195 * @param inos list of inode numbers to be removed from
1196 * free lists in InoTables
1197 * @returns 0 on success, else negative error code
1199 int JournalTool::consume_inos(const std::set
<inodeno_t
> &inos
)
1203 // InoTable is a per-MDS structure, so iterate over assigned ranks
1204 auto fs
= fsmap
->get_filesystem(role_selector
.get_ns());
1205 std::set
<mds_rank_t
> in_ranks
;
1206 fs
->mds_map
.get_mds_set(in_ranks
);
1208 for (std::set
<mds_rank_t
>::iterator rank_i
= in_ranks
.begin();
1209 rank_i
!= in_ranks
.end(); ++rank_i
)
1211 // Compose object name
1212 std::ostringstream oss
;
1213 oss
<< "mds" << *rank_i
<< "_inotable";
1214 object_t inotable_oid
= object_t(oss
.str());
1217 bufferlist inotable_bl
;
1218 int read_r
= input
.read(inotable_oid
.name
, inotable_bl
, (1<<22), 0);
1220 // Things are really bad if we can't read inotable. Beyond our powers.
1221 derr
<< "unable to read inotable '" << inotable_oid
.name
<< "': "
1222 << cpp_strerror(read_r
) << dendl
;
1227 // Deserialize InoTable
1228 version_t inotable_ver
;
1229 auto q
= inotable_bl
.cbegin();
1230 decode(inotable_ver
, q
);
1231 InoTable
ino_table(NULL
);
1232 ino_table
.decode(q
);
1234 // Update InoTable in memory
1235 bool inotable_modified
= false;
1236 for (std::set
<inodeno_t
>::iterator i
= inos
.begin();
1237 i
!= inos
.end(); ++i
)
1239 const inodeno_t ino
= *i
;
1240 if (ino_table
.force_consume(ino
)) {
1241 dout(4) << "Used ino 0x" << std::hex
<< ino
<< std::dec
1242 << " requires inotable update" << dendl
;
1243 inotable_modified
= true;
1247 // Serialize and write InoTable
1248 if (inotable_modified
) {
1250 dout(4) << "writing modified inotable version " << inotable_ver
<< dendl
;
1251 bufferlist inotable_new_bl
;
1252 encode(inotable_ver
, inotable_new_bl
);
1253 ino_table
.encode_state(inotable_new_bl
);
1254 int write_r
= output
.write_full(inotable_oid
.name
, inotable_new_bl
);
1256 derr
<< "error writing modified inotable " << inotable_oid
.name
1257 << ": " << cpp_strerror(write_r
) << dendl
;