]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/cephfs/JournalTool.cc
update sources to v12.2.5
[ceph.git] / ceph / src / tools / cephfs / JournalTool.cc
CommitLineData
7c673cae
FG
1// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * ceph - scalable distributed file system
5 *
6 * copyright (c) 2014 john spray <john.spray@inktank.com>
7 *
8 * this is free software; you can redistribute it and/or
9 * modify it under the terms of the gnu lesser general public
10 * license version 2.1, as published by the free software
11 * foundation. see file copying.
12 */
13
14
15#include <sstream>
16
17#include "common/ceph_argparse.h"
18#include "common/errno.h"
19#include "osdc/Journaler.h"
20#include "mds/mdstypes.h"
21#include "mds/LogEvent.h"
22#include "mds/InoTable.h"
23
24#include "mds/events/ENoOp.h"
25#include "mds/events/EUpdate.h"
26
27#include "JournalScanner.h"
28#include "EventOutput.h"
29#include "Dumper.h"
30#include "Resetter.h"
31
32#include "JournalTool.h"
33
34
35#define dout_context g_ceph_context
36#define dout_subsys ceph_subsys_mds
37#undef dout_prefix
38#define dout_prefix *_dout << __func__ << ": "
39
40
41
42void JournalTool::usage()
43{
44 std::cout << "Usage: \n"
45 << " cephfs-journal-tool [options] journal <command>\n"
46 << " <command>:\n"
47 << " inspect\n"
48 << " import <path>\n"
49 << " export <path>\n"
50 << " reset [--force]\n"
51 << " cephfs-journal-tool [options] header <get|set <field> <value>\n"
b32b8144 52 << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]"
31f18b77 53 << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
7c673cae
FG
54 << " <selector>:\n"
55 << " --range=<start>..<end>\n"
56 << " --path=<substring>\n"
57 << " --inode=<integer>\n"
58 << " --type=<UPDATE|OPEN|SESSION...><\n"
59 << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
7c673cae 60 << " --client=<session id integer>\n"
31f18b77 61 << " <effect>: [get|recover_dentries|splice]\n"
7c673cae
FG
62 << " <output>: [summary|list|binary|json] [--path <path>]\n"
63 << "\n"
31f18b77 64 << "General options:\n"
7c673cae
FG
65 << " --rank=filesystem:mds-rank Journal rank (required if multiple\n"
66 << " file systems, default is rank 0 on\n"
31f18b77
FG
67 << " the only filesystem otherwise.\n"
68 << "\n"
69 << "Special options\n"
70 << " --alternate-pool <name> Alternative metadata pool to target\n"
71 << " when using recover_dentries.\n";
7c673cae
FG
72
73 generic_client_usage();
74}
75
76
77/**
78 * Handle arguments and hand off to journal/header/event mode
79 */
80int JournalTool::main(std::vector<const char*> &argv)
81{
82 int r;
83
84 dout(10) << "JournalTool::main " << dendl;
85 // Common arg parsing
86 // ==================
87 if (argv.empty()) {
88 usage();
89 return -EINVAL;
90 }
91
92 std::vector<const char*>::iterator arg = argv.begin();
93
94 std::string rank_str;
95 if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
96 // Default: act on rank 0. Will give the user an error if they
97 // try invoking this way when they have more than one filesystem.
98 rank_str = "0";
99 }
100
101 r = role_selector.parse(*fsmap, rank_str);
102 if (r != 0) {
103 derr << "Couldn't determine MDS rank." << dendl;
104 return r;
105 }
106
107 std::string mode;
108 if (arg == argv.end()) {
109 derr << "Missing mode [journal|header|event]" << dendl;
110 return -EINVAL;
111 }
112 mode = std::string(*arg);
113 arg = argv.erase(arg);
114
115 // RADOS init
116 // ==========
117 r = rados.init_with_context(g_ceph_context);
118 if (r < 0) {
119 derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
120 return r;
121 }
122
123 dout(4) << "JournalTool: connecting to RADOS..." << dendl;
124 r = rados.connect();
125 if (r < 0) {
126 derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
127 return r;
128 }
129
130 auto fs = fsmap->get_filesystem(role_selector.get_ns());
131 assert(fs != nullptr);
132 int64_t const pool_id = fs->mds_map.get_metadata_pool();
133 dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
134 std::string pool_name;
135 r = rados.pool_reverse_lookup(pool_id, &pool_name);
136 if (r < 0) {
137 derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
138 return r;
139 }
140
141 dout(4) << "JournalTool: creating IoCtx.." << dendl;
142 r = rados.ioctx_create(pool_name.c_str(), input);
143 assert(r == 0);
144 output.dup(input);
145
146 // Execution
147 // =========
148 for (auto role : role_selector.get_roles()) {
149 rank = role.rank;
150 dout(4) << "Executing for rank " << rank << dendl;
151 if (mode == std::string("journal")) {
152 r = main_journal(argv);
153 } else if (mode == std::string("header")) {
154 r = main_header(argv);
155 } else if (mode == std::string("event")) {
156 r = main_event(argv);
157 } else {
158 derr << "Bad command '" << mode << "'" << dendl;
159 usage();
160 return -EINVAL;
161 }
162
163 if (r != 0) {
164 return r;
165 }
166 }
167
168 return r;
169}
170
171
172/**
173 * Handle arguments for 'journal' mode
174 *
175 * This is for operations that act on the journal as a whole.
176 */
177int JournalTool::main_journal(std::vector<const char*> &argv)
178{
179 std::string command = argv[0];
180 if (command == "inspect") {
181 return journal_inspect();
182 } else if (command == "export" || command == "import") {
183 if (argv.size() >= 2) {
184 std::string const path = argv[1];
185 return journal_export(path, command == "import");
186 } else {
187 derr << "Missing path" << dendl;
188 return -EINVAL;
189 }
190 } else if (command == "reset") {
191 bool force = false;
192 if (argv.size() == 2) {
193 if (std::string(argv[1]) == "--force") {
194 force = true;
195 } else {
196 std::cerr << "Unknown argument " << argv[1] << std::endl;
197 usage();
198 return -EINVAL;
199 }
200 } else if (argv.size() > 2) {
201 std::cerr << "Too many arguments!" << std::endl;
202 usage();
203 return -EINVAL;
204 }
205 return journal_reset(force);
206 } else {
207 derr << "Bad journal command '" << command << "'" << dendl;
208 return -EINVAL;
209 }
210}
211
212
213/**
214 * Parse arguments and execute for 'header' mode
215 *
216 * This is for operations that act on the header only.
217 */
218int JournalTool::main_header(std::vector<const char*> &argv)
219{
220 JournalFilter filter;
221 JournalScanner js(input, rank, filter);
222 int r = js.scan(false);
223 if (r < 0) {
224 std::cerr << "Unable to scan journal" << std::endl;
225 return r;
226 }
227
228 if (!js.header_present) {
229 std::cerr << "Header object not found!" << std::endl;
230 return -ENOENT;
231 } else if (!js.header_valid && js.header == NULL) {
232 // Can't do a read or a single-field write without a copy of the original
233 derr << "Header could not be read!" << dendl;
234 return -ENOENT;
235 } else {
236 assert(js.header != NULL);
237 }
238
239 if (argv.size() == 0) {
240 derr << "Invalid header command, must be [get|set]" << dendl;
241 return -EINVAL;
242 }
243 std::vector<const char *>::iterator arg = argv.begin();
244 std::string const command = *arg;
245 arg = argv.erase(arg);
246
247 if (command == std::string("get")) {
248 // Write JSON journal dump to stdout
249 JSONFormatter jf(true);
250 js.header->dump(&jf);
251 jf.flush(std::cout);
252 std::cout << std::endl;
253 } else if (command == std::string("set")) {
254 // Need two more args <key> <val>
255 if (argv.size() != 2) {
256 derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
257 return -EINVAL;
258 }
259
260 std::string const field_name = *arg;
261 arg = argv.erase(arg);
262
263 std::string const value_str = *arg;
264 arg = argv.erase(arg);
265 assert(argv.empty());
266
267 std::string parse_err;
268 uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
269 if (!parse_err.empty()) {
270 derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
271 return -EINVAL;
272 }
273
274 uint64_t *field = NULL;
275 if (field_name == "trimmed_pos") {
276 field = &(js.header->trimmed_pos);
277 } else if (field_name == "expire_pos") {
278 field = &(js.header->expire_pos);
279 } else if (field_name == "write_pos") {
280 field = &(js.header->write_pos);
b32b8144
FG
281 } else if (field_name == "pool_id") {
282 field = (uint64_t*)(&(js.header->layout.pool_id));
7c673cae
FG
283 } else {
284 derr << "Invalid field '" << field_name << "'" << dendl;
285 return -EINVAL;
286 }
287
288 std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
289 *field = new_val;
290
291 dout(4) << "Writing object..." << dendl;
292 bufferlist header_bl;
293 ::encode(*(js.header), header_bl);
294 output.write_full(js.obj_name(0), header_bl);
295 dout(4) << "Write complete." << dendl;
296 std::cout << "Successfully updated header." << std::endl;
297 } else {
298 derr << "Bad header command '" << command << "'" << dendl;
299 return -EINVAL;
300 }
301
302 return 0;
303}
304
305
306/**
307 * Parse arguments and execute for 'event' mode
308 *
309 * This is for operations that act on LogEvents within the log
310 */
311int JournalTool::main_event(std::vector<const char*> &argv)
312{
313 int r;
314
315 std::vector<const char*>::iterator arg = argv.begin();
316
317 std::string command = *(arg++);
31f18b77 318 if (command != "get" && command != "splice" && command != "recover_dentries") {
7c673cae
FG
319 derr << "Unknown argument '" << command << "'" << dendl;
320 usage();
321 return -EINVAL;
322 }
323
324 if (arg == argv.end()) {
325 derr << "Incomplete command line" << dendl;
326 usage();
327 return -EINVAL;
328 }
329
330 // Parse filter options
331 // ====================
332 JournalFilter filter;
333 r = filter.parse_args(argv, arg);
334 if (r) {
335 return r;
336 }
337
338 // Parse output options
339 // ====================
340 if (arg == argv.end()) {
341 derr << "Missing output command" << dendl;
342 usage();
343 }
344 std::string output_style = *(arg++);
345 if (output_style != "binary" && output_style != "json" &&
346 output_style != "summary" && output_style != "list") {
347 derr << "Unknown argument: '" << output_style << "'" << dendl;
348 usage();
349 return -EINVAL;
350 }
351
352 std::string output_path = "dump";
353 while(arg != argv.end()) {
354 std::string arg_str;
355 if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
356 output_path = arg_str;
357 } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
358 nullptr)) {
359 dout(1) << "Using alternate pool " << arg_str << dendl;
360 int r = rados.ioctx_create(arg_str.c_str(), output);
361 assert(r == 0);
362 other_pool = true;
363 } else {
364 derr << "Unknown argument: '" << *arg << "'" << dendl;
365 usage();
366 return -EINVAL;
367 }
368 }
369
370 // Execute command
371 // ===============
372 JournalScanner js(input, rank, filter);
373 if (command == "get") {
374 r = js.scan();
375 if (r) {
376 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
377 return r;
378 }
7c673cae
FG
379 } else if (command == "recover_dentries") {
380 r = js.scan();
381 if (r) {
382 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
383 return r;
384 }
385
386 bool dry_run = false;
387 if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
388 dry_run = true;
389 }
390
391 /**
392 * Iterate over log entries, attempting to scavenge from each one
393 */
394 std::set<inodeno_t> consumed_inos;
395 for (JournalScanner::EventMap::iterator i = js.events.begin();
396 i != js.events.end(); ++i) {
397 LogEvent *le = i->second.log_event;
398 EMetaBlob const *mb = le->get_metablob();
399 if (mb) {
31f18b77 400 int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
7c673cae
FG
401 if (scav_r) {
402 dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
403 << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
404 if (r == 0) {
405 r = scav_r;
406 }
407 // Our goal is to read all we can, so don't stop on errors, but
408 // do record them for possible later output
409 js.errors.insert(std::make_pair(i->first,
410 JournalScanner::EventError(scav_r, cpp_strerror(r))));
411 }
412 }
413 }
414
415 /**
416 * Update InoTable to reflect any inode numbers consumed during scavenge
417 */
418 dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
419 if (consumed_inos.size() && !dry_run) {
420 int consume_r = consume_inos(consumed_inos);
421 if (consume_r) {
422 dout(1) << "Error updating InoTable for " << consumed_inos.size()
423 << " consume inos: " << cpp_strerror(consume_r) << dendl;
424 if (r == 0) {
425 r = consume_r;
426 }
427 }
428 }
429
430 // Remove consumed dentries from lost+found.
431 if (other_pool && !dry_run) {
432 std::set<std::string> found;
433
434 for (auto i : consumed_inos) {
435 char s[20];
436
437 snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
438 dout(20) << "removing " << s << dendl;
439 found.insert(std::string(s));
440 }
441
442 object_t frag_oid;
443 frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
444 frag_t(), "");
445 output.omap_rm_keys(frag_oid.name, found);
446 }
447 } else if (command == "splice") {
448 r = js.scan();
449 if (r) {
450 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
451 return r;
452 }
453
454 uint64_t start, end;
455 if (filter.get_range(start, end)) {
456 // Special case for range filter: erase a numeric range in the log
457 uint64_t range = end - start;
458 int r = erase_region(js, start, range);
459 if (r) {
460 derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
461 << ": " << cpp_strerror(r) << dendl;
462 return r;
463 }
464 } else {
465 // General case: erase a collection of individual entries in the log
466 for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
467 dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
468
469 int r = erase_region(js, i->first, i->second.raw_size);
470 if (r) {
471 derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
472 << ": " << cpp_strerror(r) << dendl;
473 return r;
474 }
475 }
476 }
477
478
479 } else {
480 derr << "Unknown argument '" << command << "'" << dendl;
481 usage();
482 return -EINVAL;
483 }
484
485 // Generate output
486 // ===============
487 EventOutput output(js, output_path);
488 int output_result = 0;
489 if (output_style == "binary") {
490 output_result = output.binary();
491 } else if (output_style == "json") {
492 output_result = output.json();
493 } else if (output_style == "summary") {
494 output.summary();
495 } else if (output_style == "list") {
496 output.list();
497 } else {
498 std::cerr << "Bad output command '" << output_style << "'" << std::endl;
499 return -EINVAL;
500 }
501
502 if (output_result != 0) {
503 std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
504 }
505
506 return output_result;
507}
508
509/**
510 * Provide the user with information about the condition of the journal,
511 * especially indicating what range of log events is available and where
512 * any gaps or corruptions in the journal are.
513 */
514int JournalTool::journal_inspect()
515{
516 int r;
517
518 JournalFilter filter;
519 JournalScanner js(input, rank, filter);
520 r = js.scan();
521 if (r) {
522 std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
523 return r;
524 }
525
526 js.report(std::cout);
527
528 return 0;
529}
530
531
532/**
533 * Attempt to export a binary dump of the journal.
534 *
535 * This is allowed to fail if the header is malformed or there are
536 * objects inaccessible, in which case the user would have to fall
537 * back to manually listing RADOS objects and extracting them, which
538 * they can do with the ``rados`` CLI.
539 */
540int JournalTool::journal_export(std::string const &path, bool import)
541{
542 int r = 0;
543 JournalScanner js(input, rank);
544
545 if (!import) {
546 /*
547 * If doing an export, first check that the header is valid and
548 * no objects are missing before trying to dump
549 */
550 r = js.scan();
551 if (r < 0) {
552 derr << "Unable to scan journal, assuming badly damaged" << dendl;
553 return r;
554 }
555 if (!js.is_readable()) {
556 derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
557 return -EIO;
558 }
559 }
560
561 /*
562 * Assuming we can cleanly read the journal data, dump it out to a file
563 */
564 {
565 Dumper dumper;
566 r = dumper.init(mds_role_t(role_selector.get_ns(), rank));
567 if (r < 0) {
568 derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
569 return r;
570 }
571 if (import) {
572 r = dumper.undump(path.c_str());
573 } else {
574 r = dumper.dump(path.c_str());
575 }
7c673cae
FG
576 }
577
578 return r;
579}
580
581
582/**
583 * Truncate journal and insert EResetJournal
584 */
585int JournalTool::journal_reset(bool hard)
586{
587 int r = 0;
588 Resetter resetter;
589 r = resetter.init();
590 if (r < 0) {
591 derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
592 return r;
593 }
594
595 if (hard) {
596 r = resetter.reset_hard(mds_role_t(role_selector.get_ns(), rank));
597 } else {
598 r = resetter.reset(mds_role_t(role_selector.get_ns(), rank));
599 }
7c673cae
FG
600
601 return r;
602}
603
604
605/**
606 * Selective offline replay which only reads out dentries and writes
607 * them to the backing store iff their version is > what is currently
608 * in the backing store.
609 *
610 * In order to write dentries to the backing store, we may create the
611 * required enclosing dirfrag objects.
612 *
613 * Test this by running scavenge on an unflushed journal, then nuking
614 * it offline, then starting an MDS and seeing that the dentries are
615 * visible.
616 *
617 * @param metablob an EMetaBlob retrieved from the journal
618 * @param dry_run if true, do no writes to RADOS
619 * @param consumed_inos output, populated with any inos inserted
620 * @returns 0 on success, else negative error code
621 */
31f18b77 622int JournalTool::recover_dentries(
7c673cae
FG
623 EMetaBlob const &metablob,
624 bool const dry_run,
625 std::set<inodeno_t> *consumed_inos)
626{
627 assert(consumed_inos != NULL);
628
629 int r = 0;
630
631 // Replay fullbits (dentry+inode)
632 for (list<dirfrag_t>::const_iterator lp = metablob.lump_order.begin();
633 lp != metablob.lump_order.end(); ++lp)
634 {
635 dirfrag_t const &frag = *lp;
636 EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
637 lump._decode_bits();
638 object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
639
640 dout(4) << "inspecting lump " << frag_oid.name << dendl;
641
642
643 // We will record old fnode version for use in hard link handling
644 // If we don't read an old fnode, take version as zero and write in
645 // all hardlinks we find.
646 version_t old_fnode_version = 0;
647
648 // Update fnode in omap header of dirfrag object
649 bool write_fnode = false;
650 bufferlist old_fnode_bl;
651 r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
652 if (r == -ENOENT) {
653 // Creating dirfrag from scratch
654 dout(4) << "failed to read OMAP header from directory fragment "
655 << frag_oid.name << " " << cpp_strerror(r) << dendl;
656 write_fnode = true;
657 // Note: creating the dirfrag *without* a backtrace, relying on
658 // MDS to regenerate backtraces on read or in FSCK
659 } else if (r == 0) {
660 // Conditionally update existing omap header
661 fnode_t old_fnode;
662 bufferlist::iterator old_fnode_iter = old_fnode_bl.begin();
663 try {
664 old_fnode.decode(old_fnode_iter);
665 dout(4) << "frag " << frag_oid.name << " fnode old v" <<
666 old_fnode.version << " vs new v" << lump.fnode.version << dendl;
667 old_fnode_version = old_fnode.version;
668 write_fnode = old_fnode_version < lump.fnode.version;
669 } catch (const buffer::error &err) {
670 dout(1) << "frag " << frag_oid.name
671 << " is corrupt, overwriting" << dendl;
672 write_fnode = true;
673 }
674 } else {
675 // Unexpected error
676 dout(4) << "failed to read OMAP header from directory fragment "
677 << frag_oid.name << " " << cpp_strerror(r) << dendl;
678 return r;
679 }
680
681 if ((other_pool || write_fnode) && !dry_run) {
682 dout(4) << "writing fnode to omap header" << dendl;
683 bufferlist fnode_bl;
684 lump.fnode.encode(fnode_bl);
685 if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
686 r = output.omap_set_header(frag_oid.name, fnode_bl);
687 }
688 if (r != 0) {
689 derr << "Failed to write fnode for frag object "
690 << frag_oid.name << dendl;
691 return r;
692 }
693 }
694
695 std::set<std::string> read_keys;
696
697 // Compose list of potentially-existing dentries we would like to fetch
698 list<ceph::shared_ptr<EMetaBlob::fullbit> > const &fb_list =
699 lump.get_dfull();
700 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
701 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
702 EMetaBlob::fullbit const &fb = *(*fbi);
703
704 // Get a key like "foobar_head"
705 std::string key;
706 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
707 dn_key.encode(key);
708 read_keys.insert(key);
709 }
710
711 list<EMetaBlob::remotebit> const &rb_list =
712 lump.get_dremote();
713 for (list<EMetaBlob::remotebit>::const_iterator rbi =
714 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
715 EMetaBlob::remotebit const &rb = *rbi;
716
717 // Get a key like "foobar_head"
718 std::string key;
719 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
720 dn_key.encode(key);
721 read_keys.insert(key);
722 }
723
c07f9fc5
FG
724 list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull();
725 for (auto& nb : nb_list) {
726 // Get a key like "foobar_head"
727 std::string key;
728 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
729 dn_key.encode(key);
730 read_keys.insert(key);
731 }
732
7c673cae
FG
733 // Perform bulk read of existing dentries
734 std::map<std::string, bufferlist> read_vals;
735 r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
736 if (r == -ENOENT && other_pool) {
737 r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
738 }
739 if (r != 0) {
740 derr << "unexpected error reading fragment object "
741 << frag_oid.name << ": " << cpp_strerror(r) << dendl;
742 return r;
743 }
744
745 // Compose list of dentries we will write back
746 std::map<std::string, bufferlist> write_vals;
747 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
748 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
749 EMetaBlob::fullbit const &fb = *(*fbi);
750
751 // Get a key like "foobar_head"
752 std::string key;
753 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
754 dn_key.encode(key);
755
756 dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
757 << dendl;
758 bool write_dentry = false;
759 if (read_vals.find(key) == read_vals.end()) {
760 dout(4) << "dentry did not already exist, will create" << dendl;
761 write_dentry = true;
762 } else {
763 dout(4) << "dentry " << key << " existed already" << dendl;
764 dout(4) << "dentry exists, checking versions..." << dendl;
765 bufferlist &old_dentry = read_vals[key];
766 // Decode dentry+inode
767 bufferlist::iterator q = old_dentry.begin();
768
769 snapid_t dnfirst;
770 ::decode(dnfirst, q);
771 char dentry_type;
772 ::decode(dentry_type, q);
773
774 if (dentry_type == 'L') {
775 // leave write_dentry false, we have no version to
776 // compare with in a hardlink, so it's not safe to
777 // squash over it with what's in this fullbit
778 dout(10) << "Existing remote inode in slot to be (maybe) written "
779 << "by a full inode from the journal dn '" << fb.dn.c_str()
780 << "' with lump fnode version " << lump.fnode.version
781 << "vs existing fnode version " << old_fnode_version << dendl;
782 write_dentry = old_fnode_version < lump.fnode.version;
783 } else if (dentry_type == 'I') {
784 // Read out inode version to compare with backing store
785 InodeStore inode;
786 inode.decode_bare(q);
787 dout(4) << "decoded embedded inode version "
788 << inode.inode.version << " vs fullbit version "
789 << fb.inode.version << dendl;
790 if (inode.inode.version < fb.inode.version) {
791 write_dentry = true;
792 }
793 } else {
794 dout(4) << "corrupt dentry in backing store, overwriting from "
795 "journal" << dendl;
796 write_dentry = true;
797 }
798 }
799
800 if ((other_pool || write_dentry) && !dry_run) {
801 dout(4) << "writing I dentry " << key << " into frag "
802 << frag_oid.name << dendl;
803
804 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
805 bufferlist dentry_bl;
806 ::encode(fb.dnfirst, dentry_bl);
807 ::encode('I', dentry_bl);
808 encode_fullbit_as_inode(fb, true, &dentry_bl);
809
810 // Record for writing to RADOS
811 write_vals[key] = dentry_bl;
812 consumed_inos->insert(fb.inode.ino);
813 }
814 }
815
816 for (list<EMetaBlob::remotebit>::const_iterator rbi =
817 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
818 EMetaBlob::remotebit const &rb = *rbi;
819
820 // Get a key like "foobar_head"
821 std::string key;
822 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
823 dn_key.encode(key);
824
825 dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
826 << dendl;
827 bool write_dentry = false;
828 if (read_vals.find(key) == read_vals.end()) {
829 dout(4) << "dentry did not already exist, will create" << dendl;
830 write_dentry = true;
831 } else {
832 dout(4) << "dentry " << key << " existed already" << dendl;
833 dout(4) << "dentry exists, checking versions..." << dendl;
834 bufferlist &old_dentry = read_vals[key];
835 // Decode dentry+inode
836 bufferlist::iterator q = old_dentry.begin();
837
838 snapid_t dnfirst;
839 ::decode(dnfirst, q);
840 char dentry_type;
841 ::decode(dentry_type, q);
842
843 if (dentry_type == 'L') {
844 dout(10) << "Existing hardlink inode in slot to be (maybe) written "
845 << "by a remote inode from the journal dn '" << rb.dn.c_str()
846 << "' with lump fnode version " << lump.fnode.version
847 << "vs existing fnode version " << old_fnode_version << dendl;
848 write_dentry = old_fnode_version < lump.fnode.version;
849 } else if (dentry_type == 'I') {
850 dout(10) << "Existing full inode in slot to be (maybe) written "
851 << "by a remote inode from the journal dn '" << rb.dn.c_str()
852 << "' with lump fnode version " << lump.fnode.version
853 << "vs existing fnode version " << old_fnode_version << dendl;
854 write_dentry = old_fnode_version < lump.fnode.version;
855 } else {
856 dout(4) << "corrupt dentry in backing store, overwriting from "
857 "journal" << dendl;
858 write_dentry = true;
859 }
860 }
861
862 if ((other_pool || write_dentry) && !dry_run) {
863 dout(4) << "writing L dentry " << key << " into frag "
864 << frag_oid.name << dendl;
865
866 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
867 bufferlist dentry_bl;
868 ::encode(rb.dnfirst, dentry_bl);
869 ::encode('L', dentry_bl);
870 ::encode(rb.ino, dentry_bl);
871 ::encode(rb.d_type, dentry_bl);
872
873 // Record for writing to RADOS
874 write_vals[key] = dentry_bl;
875 consumed_inos->insert(rb.ino);
876 }
877 }
878
c07f9fc5
FG
879 std::set<std::string> null_vals;
880 for (auto& nb : nb_list) {
881 std::string key;
882 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
883 dn_key.encode(key);
884
885 dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
886 << dendl;
887
888 auto it = read_vals.find(key);
889 if (it != read_vals.end()) {
890 dout(4) << "dentry exists, will remove" << dendl;
891
892 bufferlist::iterator q = it->second.begin();
893 snapid_t dnfirst;
894 ::decode(dnfirst, q);
895 char dentry_type;
896 ::decode(dentry_type, q);
897
898 bool remove_dentry = false;
899 if (dentry_type == 'L') {
900 dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
901 << "by null journal dn '" << nb.dn.c_str()
902 << "' with lump fnode version " << lump.fnode.version
903 << "vs existing fnode version " << old_fnode_version << dendl;
904 remove_dentry = old_fnode_version < lump.fnode.version;
905 } else if (dentry_type == 'I') {
906 dout(10) << "Existing full inode in slot to be (maybe) removed "
907 << "by null journal dn '" << nb.dn.c_str()
908 << "' with lump fnode version " << lump.fnode.version
909 << "vs existing fnode version " << old_fnode_version << dendl;
910 remove_dentry = old_fnode_version < lump.fnode.version;
911 } else {
912 dout(4) << "corrupt dentry in backing store, will remove" << dendl;
913 remove_dentry = true;
914 }
915
916 if (remove_dentry)
917 null_vals.insert(key);
918 }
919 }
920
7c673cae
FG
921 // Write back any new/changed dentries
922 if (!write_vals.empty()) {
923 r = output.omap_set(frag_oid.name, write_vals);
924 if (r != 0) {
925 derr << "error writing dentries to " << frag_oid.name
926 << ": " << cpp_strerror(r) << dendl;
927 return r;
928 }
929 }
c07f9fc5
FG
930
931 // remove any null dentries
932 if (!null_vals.empty()) {
933 r = output.omap_rm_keys(frag_oid.name, null_vals);
934 if (r != 0) {
935 derr << "error removing dentries from " << frag_oid.name
936 << ": " << cpp_strerror(r) << dendl;
937 return r;
938 }
939 }
7c673cae
FG
940 }
941
942 /* Now that we've looked at the dirlumps, we finally pay attention to
943 * the roots (i.e. inodes without ancestry). This is necessary in order
944 * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
945 * important because clients use them to infer completeness
946 * of directories
947 */
948 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator p =
949 metablob.roots.begin(); p != metablob.roots.end(); ++p) {
950 EMetaBlob::fullbit const &fb = *(*p);
951 inodeno_t ino = fb.inode.ino;
952 dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
953
954 object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
955 dout(4) << "object id " << root_oid.name << dendl;
956
957 bool write_root_ino = false;
958 bufferlist old_root_ino_bl;
959 r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
960 if (r == -ENOENT) {
961 dout(4) << "root does not exist, will create" << dendl;
962 write_root_ino = true;
963 } else if (r >= 0) {
964 r = 0;
965 InodeStore old_inode;
966 dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
967 << ")" << dendl;
968 bufferlist::iterator inode_bl_iter = old_root_ino_bl.begin();
969 std::string magic;
970 ::decode(magic, inode_bl_iter);
971 if (magic == CEPH_FS_ONDISK_MAGIC) {
972 dout(4) << "magic ok" << dendl;
973 old_inode.decode(inode_bl_iter);
974
975 if (old_inode.inode.version < fb.inode.version) {
976 write_root_ino = true;
977 }
978 } else {
979 dout(4) << "magic bad: '" << magic << "'" << dendl;
980 write_root_ino = true;
981 }
982 } else {
983 derr << "error reading root inode object " << root_oid.name
984 << ": " << cpp_strerror(r) << dendl;
985 return r;
986 }
987
988 if (write_root_ino && !dry_run) {
989 dout(4) << "writing root ino " << root_oid.name
990 << " version " << fb.inode.version << dendl;
991
992 // Compose: root ino format is magic,InodeStore(bare=false)
993 bufferlist new_root_ino_bl;
994 ::encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
995 encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
996
997 // Write to RADOS
998 r = output.write_full(root_oid.name, new_root_ino_bl);
999 if (r != 0) {
1000 derr << "error writing inode object " << root_oid.name
1001 << ": " << cpp_strerror(r) << dendl;
1002 return r;
1003 }
1004 }
1005 }
1006
1007 return r;
1008}
1009
1010
7c673cae
FG
1011/**
1012 * Erase a region of the log by overwriting it with ENoOp
1013 *
1014 */
1015int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
1016{
1017 // To erase this region, we use our preamble, the encoding overhead
1018 // of an ENoOp, and our trailing start ptr. Calculate how much padding
1019 // is needed inside the ENoOp to make up the difference.
1020 bufferlist tmp;
1021 ENoOp enoop(0);
1022 enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
1023
1024 dout(4) << "erase_region " << pos << " len=" << length << dendl;
1025
1026 // FIXME: get the preamble/postamble length via JournalStream
1027 int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
1028 dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
1029
1030 if (padding < 0) {
1031 derr << "Erase region " << length << " too short" << dendl;
1032 return -EINVAL;
1033 }
1034
1035 // Serialize an ENoOp with the correct amount of padding
1036 enoop = ENoOp(padding);
1037 bufferlist entry;
1038 enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
1039 JournalStream stream(JOURNAL_FORMAT_RESILIENT);
1040
1041 // Serialize region of log stream
1042 bufferlist log_data;
1043 stream.write(entry, &log_data, pos);
1044
1045 dout(4) << "erase_region data length " << log_data.length() << dendl;
1046 assert(log_data.length() == length);
1047
1048 // Write log stream region to RADOS
1049 // FIXME: get object size somewhere common to scan_events
1050 uint32_t object_size = g_conf->mds_log_segment_size;
1051 if (object_size == 0) {
1052 // Default layout object size
1053 object_size = file_layout_t::get_default().object_size;
1054 }
1055
1056 uint64_t write_offset = pos;
1057 uint64_t obj_offset = (pos / object_size);
1058 int r = 0;
1059 while(log_data.length()) {
1060 std::string const oid = js.obj_name(obj_offset);
1061 uint32_t offset_in_obj = write_offset % object_size;
1062 uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
1063
1064 r = output.write(oid, log_data, write_len, offset_in_obj);
1065 if (r < 0) {
1066 return r;
1067 } else {
1068 dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
1069 r = 0;
1070 }
1071
1072 log_data.splice(0, write_len);
1073 write_offset += write_len;
1074 obj_offset++;
1075 }
1076
1077 return r;
1078}
1079
1080/**
1081 * Given an EMetaBlob::fullbit containing an inode, write out
1082 * the encoded inode in the format used by InodeStore (i.e. the
1083 * backing store format)
1084 *
1085 * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
1086 * on an offline InodeStore instance. It's way simpler, because we are just
1087 * uncritically hauling the data between structs.
1088 *
1089 * @param fb a fullbit extracted from a journal entry
1090 * @param bare if true, leave out [EN|DE]CODE_START decoration
1091 * @param out_bl output, write serialized inode to this bufferlist
1092 */
1093void JournalTool::encode_fullbit_as_inode(
1094 const EMetaBlob::fullbit &fb,
1095 const bool bare,
1096 bufferlist *out_bl)
1097{
1098 assert(out_bl != NULL);
1099
1100 // Compose InodeStore
1101 InodeStore new_inode;
1102 new_inode.inode = fb.inode;
1103 new_inode.xattrs = fb.xattrs;
1104 new_inode.dirfragtree = fb.dirfragtree;
1105 new_inode.snap_blob = fb.snapbl;
94b18763 1106 new_inode.symlink = mempool::mds_co::string(boost::string_view(fb.symlink));
7c673cae
FG
1107 new_inode.old_inodes = fb.old_inodes;
1108
1109 // Serialize InodeStore
1110 if (bare) {
1111 new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1112 } else {
1113 new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1114 }
1115}
1116
1117/**
1118 * Given a list of inode numbers known to be in use by
1119 * inodes in the backing store, ensure that none of these
1120 * numbers are listed as free in the InoTables in the
1121 * backing store.
1122 *
1123 * Used after injecting inodes into the backing store, to
1124 * ensure that the same inode numbers are not subsequently
1125 * used for new files during ordinary operation.
1126 *
1127 * @param inos list of inode numbers to be removed from
1128 * free lists in InoTables
1129 * @returns 0 on success, else negative error code
1130 */
1131int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
1132{
1133 int r = 0;
1134
1135 // InoTable is a per-MDS structure, so iterate over assigned ranks
1136 auto fs = fsmap->get_filesystem(role_selector.get_ns());
1137 std::set<mds_rank_t> in_ranks;
1138 fs->mds_map.get_mds_set(in_ranks);
1139
1140 for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
1141 rank_i != in_ranks.end(); ++rank_i)
1142 {
1143 // Compose object name
1144 std::ostringstream oss;
1145 oss << "mds" << *rank_i << "_inotable";
1146 object_t inotable_oid = object_t(oss.str());
1147
1148 // Read object
1149 bufferlist inotable_bl;
1150 int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
1151 if (read_r < 0) {
1152 // Things are really bad if we can't read inotable. Beyond our powers.
1153 derr << "unable to read inotable '" << inotable_oid.name << "': "
1154 << cpp_strerror(read_r) << dendl;
1155 r = r ? r : read_r;
1156 continue;
1157 }
1158
1159 // Deserialize InoTable
1160 version_t inotable_ver;
1161 bufferlist::iterator q = inotable_bl.begin();
1162 ::decode(inotable_ver, q);
1163 InoTable ino_table(NULL);
1164 ino_table.decode(q);
1165
1166 // Update InoTable in memory
1167 bool inotable_modified = false;
1168 for (std::set<inodeno_t>::iterator i = inos.begin();
1169 i != inos.end(); ++i)
1170 {
1171 const inodeno_t ino = *i;
1172 if (ino_table.force_consume(ino)) {
1173 dout(4) << "Used ino 0x" << std::hex << ino << std::dec
1174 << " requires inotable update" << dendl;
1175 inotable_modified = true;
1176 }
1177 }
1178
1179 // Serialize and write InoTable
1180 if (inotable_modified) {
1181 inotable_ver += 1;
1182 dout(4) << "writing modified inotable version " << inotable_ver << dendl;
1183 bufferlist inotable_new_bl;
1184 ::encode(inotable_ver, inotable_new_bl);
1185 ino_table.encode_state(inotable_new_bl);
1186 int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
1187 if (write_r != 0) {
1188 derr << "error writing modified inotable " << inotable_oid.name
1189 << ": " << cpp_strerror(write_r) << dendl;
1190 r = r ? r : read_r;
1191 continue;
1192 }
1193 }
1194 }
1195
1196 return r;
1197}
1198