]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/cephfs/JournalTool.cc
import ceph quincy 17.2.4
[ceph.git] / ceph / src / tools / cephfs / JournalTool.cc
CommitLineData
7c673cae
FG
1// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * ceph - scalable distributed file system
5 *
6 * copyright (c) 2014 john spray <john.spray@inktank.com>
7 *
8 * this is free software; you can redistribute it and/or
9 * modify it under the terms of the gnu lesser general public
10 * license version 2.1, as published by the free software
11 * foundation. see file copying.
12 */
13
14
15#include <sstream>
16
17#include "common/ceph_argparse.h"
18#include "common/errno.h"
19#include "osdc/Journaler.h"
20#include "mds/mdstypes.h"
21#include "mds/LogEvent.h"
22#include "mds/InoTable.h"
23
24#include "mds/events/ENoOp.h"
25#include "mds/events/EUpdate.h"
26
27#include "JournalScanner.h"
28#include "EventOutput.h"
29#include "Dumper.h"
30#include "Resetter.h"
31
32#include "JournalTool.h"
33
34
35#define dout_context g_ceph_context
36#define dout_subsys ceph_subsys_mds
37#undef dout_prefix
38#define dout_prefix *_dout << __func__ << ": "
39
20effc67 40using namespace std;
7c673cae
FG
41
42void JournalTool::usage()
43{
44 std::cout << "Usage: \n"
45 << " cephfs-journal-tool [options] journal <command>\n"
46 << " <command>:\n"
47 << " inspect\n"
91327a77 48 << " import <path> [--force]\n"
7c673cae
FG
49 << " export <path>\n"
50 << " reset [--force]\n"
92f5a8d4
TL
51 << " cephfs-journal-tool [options] header <get|set> <field> <value>\n"
52 << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n"
31f18b77 53 << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
7c673cae
FG
54 << " <selector>:\n"
55 << " --range=<start>..<end>\n"
56 << " --path=<substring>\n"
57 << " --inode=<integer>\n"
58 << " --type=<UPDATE|OPEN|SESSION...><\n"
59 << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
7c673cae 60 << " --client=<session id integer>\n"
31f18b77 61 << " <effect>: [get|recover_dentries|splice]\n"
7c673cae
FG
62 << " <output>: [summary|list|binary|json] [--path <path>]\n"
63 << "\n"
31f18b77 64 << "General options:\n"
f64942e4 65 << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n"
11fdf7f2
TL
66 << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n"
67 << " this journal is used to queue for purge operation,\n"
92f5a8d4 68 << " default is mdlog, and only mdlog support event mode)\n"
31f18b77
FG
69 << "\n"
70 << "Special options\n"
71 << " --alternate-pool <name> Alternative metadata pool to target\n"
72 << " when using recover_dentries.\n";
7c673cae
FG
73
74 generic_client_usage();
75}
76
77
78/**
79 * Handle arguments and hand off to journal/header/event mode
80 */
81int JournalTool::main(std::vector<const char*> &argv)
82{
83 int r;
84
85 dout(10) << "JournalTool::main " << dendl;
86 // Common arg parsing
87 // ==================
88 if (argv.empty()) {
11fdf7f2 89 cerr << "missing positional argument" << std::endl;
7c673cae
FG
90 return -EINVAL;
91 }
92
93 std::vector<const char*>::iterator arg = argv.begin();
94
95 std::string rank_str;
f64942e4
AA
96 if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
97 derr << "missing mandatory \"--rank\" argument" << dendl;
98 return -EINVAL;
7c673cae
FG
99 }
100
11fdf7f2
TL
101 if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) {
102 // Default is mdlog
103 type = "mdlog";
104 }
105
106 r = validate_type(type);
107 if (r != 0) {
108 derr << "journal type is not correct." << dendl;
109 return r;
110 }
111
f64942e4 112 r = role_selector.parse(*fsmap, rank_str, false);
7c673cae
FG
113 if (r != 0) {
114 derr << "Couldn't determine MDS rank." << dendl;
115 return r;
116 }
117
118 std::string mode;
119 if (arg == argv.end()) {
120 derr << "Missing mode [journal|header|event]" << dendl;
121 return -EINVAL;
122 }
123 mode = std::string(*arg);
124 arg = argv.erase(arg);
125
126 // RADOS init
127 // ==========
128 r = rados.init_with_context(g_ceph_context);
129 if (r < 0) {
130 derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
131 return r;
132 }
133
134 dout(4) << "JournalTool: connecting to RADOS..." << dendl;
135 r = rados.connect();
136 if (r < 0) {
137 derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
138 return r;
139 }
140
141 auto fs = fsmap->get_filesystem(role_selector.get_ns());
11fdf7f2 142 ceph_assert(fs != nullptr);
7c673cae
FG
143 int64_t const pool_id = fs->mds_map.get_metadata_pool();
144 dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
145 std::string pool_name;
146 r = rados.pool_reverse_lookup(pool_id, &pool_name);
147 if (r < 0) {
148 derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
149 return r;
150 }
151
152 dout(4) << "JournalTool: creating IoCtx.." << dendl;
153 r = rados.ioctx_create(pool_name.c_str(), input);
11fdf7f2 154 ceph_assert(r == 0);
7c673cae
FG
155 output.dup(input);
156
157 // Execution
158 // =========
11fdf7f2
TL
159 // journal and header are general journal mode
160 // event mode is only specific for mdlog
f64942e4
AA
161 auto roles = role_selector.get_roles();
162 if (roles.size() > 1) {
163 const std::string &command = argv[0];
164 bool allowed = can_execute_for_all_ranks(mode, command);
165 if (!allowed) {
166 derr << "operation not allowed for all ranks" << dendl;
167 return -EINVAL;
168 }
169
170 all_ranks = true;
171 }
172 for (auto role : roles) {
7c673cae 173 rank = role.rank;
f64942e4 174 std::vector<const char *> rank_argv(argv);
7c673cae
FG
175 dout(4) << "Executing for rank " << rank << dendl;
176 if (mode == std::string("journal")) {
f64942e4 177 r = main_journal(rank_argv);
7c673cae 178 } else if (mode == std::string("header")) {
f64942e4 179 r = main_header(rank_argv);
7c673cae 180 } else if (mode == std::string("event")) {
f64942e4 181 r = main_event(rank_argv);
7c673cae 182 } else {
11fdf7f2 183 cerr << "Bad command '" << mode << "'" << std::endl;
7c673cae
FG
184 return -EINVAL;
185 }
186
187 if (r != 0) {
188 return r;
189 }
190 }
191
192 return r;
193}
194
11fdf7f2
TL
195int JournalTool::validate_type(const std::string &type)
196{
197 if (type == "mdlog" || type == "purge_queue") {
198 return 0;
199 }
200 return -1;
201}
7c673cae 202
f64942e4
AA
203std::string JournalTool::gen_dump_file_path(const std::string &prefix) {
204 if (!all_ranks) {
205 return prefix;
206 }
207
208 return prefix + "." + std::to_string(rank);
209}
210
211bool JournalTool::can_execute_for_all_ranks(const std::string &mode,
212 const std::string &command) {
213 if (mode == "journal" && command == "import") {
214 return false;
215 }
216
217 return true;
218}
219
7c673cae
FG
220/**
221 * Handle arguments for 'journal' mode
222 *
223 * This is for operations that act on the journal as a whole.
224 */
225int JournalTool::main_journal(std::vector<const char*> &argv)
226{
92f5a8d4
TL
227 if (argv.empty()) {
228 derr << "Missing journal command, please see help" << dendl;
229 return -EINVAL;
230 }
231
7c673cae
FG
232 std::string command = argv[0];
233 if (command == "inspect") {
234 return journal_inspect();
235 } else if (command == "export" || command == "import") {
91327a77 236 bool force = false;
7c673cae
FG
237 if (argv.size() >= 2) {
238 std::string const path = argv[1];
91327a77
AA
239 if (argv.size() == 3) {
240 if (std::string(argv[2]) == "--force") {
241 force = true;
242 } else {
243 std::cerr << "Unknown argument " << argv[1] << std::endl;
244 return -EINVAL;
245 }
246 }
247 return journal_export(path, command == "import", force);
7c673cae
FG
248 } else {
249 derr << "Missing path" << dendl;
250 return -EINVAL;
251 }
252 } else if (command == "reset") {
253 bool force = false;
254 if (argv.size() == 2) {
255 if (std::string(argv[1]) == "--force") {
256 force = true;
257 } else {
258 std::cerr << "Unknown argument " << argv[1] << std::endl;
7c673cae
FG
259 return -EINVAL;
260 }
261 } else if (argv.size() > 2) {
262 std::cerr << "Too many arguments!" << std::endl;
7c673cae
FG
263 return -EINVAL;
264 }
265 return journal_reset(force);
266 } else {
267 derr << "Bad journal command '" << command << "'" << dendl;
268 return -EINVAL;
269 }
270}
271
272
273/**
274 * Parse arguments and execute for 'header' mode
275 *
276 * This is for operations that act on the header only.
277 */
278int JournalTool::main_header(std::vector<const char*> &argv)
279{
11fdf7f2
TL
280 JournalFilter filter(type);
281 JournalScanner js(input, rank, type, filter);
7c673cae
FG
282 int r = js.scan(false);
283 if (r < 0) {
284 std::cerr << "Unable to scan journal" << std::endl;
285 return r;
286 }
287
288 if (!js.header_present) {
289 std::cerr << "Header object not found!" << std::endl;
290 return -ENOENT;
291 } else if (!js.header_valid && js.header == NULL) {
292 // Can't do a read or a single-field write without a copy of the original
293 derr << "Header could not be read!" << dendl;
294 return -ENOENT;
295 } else {
11fdf7f2 296 ceph_assert(js.header != NULL);
7c673cae
FG
297 }
298
92f5a8d4
TL
299 if (argv.empty()) {
300 derr << "Missing header command, must be [get|set]" << dendl;
7c673cae
FG
301 return -EINVAL;
302 }
303 std::vector<const char *>::iterator arg = argv.begin();
304 std::string const command = *arg;
305 arg = argv.erase(arg);
306
307 if (command == std::string("get")) {
308 // Write JSON journal dump to stdout
309 JSONFormatter jf(true);
310 js.header->dump(&jf);
311 jf.flush(std::cout);
312 std::cout << std::endl;
313 } else if (command == std::string("set")) {
314 // Need two more args <key> <val>
315 if (argv.size() != 2) {
316 derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
317 return -EINVAL;
318 }
319
320 std::string const field_name = *arg;
321 arg = argv.erase(arg);
322
323 std::string const value_str = *arg;
324 arg = argv.erase(arg);
11fdf7f2 325 ceph_assert(argv.empty());
7c673cae
FG
326
327 std::string parse_err;
328 uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
329 if (!parse_err.empty()) {
330 derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
331 return -EINVAL;
332 }
333
334 uint64_t *field = NULL;
335 if (field_name == "trimmed_pos") {
336 field = &(js.header->trimmed_pos);
337 } else if (field_name == "expire_pos") {
338 field = &(js.header->expire_pos);
339 } else if (field_name == "write_pos") {
340 field = &(js.header->write_pos);
b32b8144
FG
341 } else if (field_name == "pool_id") {
342 field = (uint64_t*)(&(js.header->layout.pool_id));
7c673cae
FG
343 } else {
344 derr << "Invalid field '" << field_name << "'" << dendl;
345 return -EINVAL;
346 }
347
348 std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
349 *field = new_val;
350
351 dout(4) << "Writing object..." << dendl;
352 bufferlist header_bl;
11fdf7f2 353 encode(*(js.header), header_bl);
7c673cae
FG
354 output.write_full(js.obj_name(0), header_bl);
355 dout(4) << "Write complete." << dendl;
356 std::cout << "Successfully updated header." << std::endl;
357 } else {
358 derr << "Bad header command '" << command << "'" << dendl;
359 return -EINVAL;
360 }
361
362 return 0;
363}
364
365
366/**
367 * Parse arguments and execute for 'event' mode
368 *
369 * This is for operations that act on LogEvents within the log
370 */
371int JournalTool::main_event(std::vector<const char*> &argv)
372{
373 int r;
374
92f5a8d4
TL
375 if (argv.empty()) {
376 derr << "Missing event command, please see help" << dendl;
377 return -EINVAL;
378 }
7c673cae 379
92f5a8d4 380 std::vector<const char*>::iterator arg = argv.begin();
1911f103
TL
381 bool dry_run = false;
382
7c673cae 383 std::string command = *(arg++);
31f18b77 384 if (command != "get" && command != "splice" && command != "recover_dentries") {
7c673cae 385 derr << "Unknown argument '" << command << "'" << dendl;
11fdf7f2
TL
386 return -EINVAL;
387 }
388
1911f103
TL
389 if (command == "recover_dentries") {
390 if (type != "mdlog") {
391 derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl;
392 return -EINVAL;
393 } else {
394 if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
395 dry_run = true;
396 }
397 }
7c673cae
FG
398 }
399
400 if (arg == argv.end()) {
401 derr << "Incomplete command line" << dendl;
7c673cae
FG
402 return -EINVAL;
403 }
404
405 // Parse filter options
406 // ====================
11fdf7f2 407 JournalFilter filter(type);
7c673cae
FG
408 r = filter.parse_args(argv, arg);
409 if (r) {
410 return r;
411 }
412
413 // Parse output options
414 // ====================
415 if (arg == argv.end()) {
11fdf7f2
TL
416 cerr << "Missing output command" << std::endl;
417 return -EINVAL;
7c673cae
FG
418 }
419 std::string output_style = *(arg++);
420 if (output_style != "binary" && output_style != "json" &&
421 output_style != "summary" && output_style != "list") {
11fdf7f2
TL
422 cerr << "Unknown argument: '" << output_style << "'" << std::endl;
423 return -EINVAL;
7c673cae
FG
424 }
425
426 std::string output_path = "dump";
427 while(arg != argv.end()) {
428 std::string arg_str;
429 if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
430 output_path = arg_str;
431 } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
432 nullptr)) {
433 dout(1) << "Using alternate pool " << arg_str << dendl;
434 int r = rados.ioctx_create(arg_str.c_str(), output);
11fdf7f2 435 ceph_assert(r == 0);
7c673cae
FG
436 other_pool = true;
437 } else {
11fdf7f2 438 cerr << "Unknown argument: '" << *arg << "'" << std::endl;
7c673cae
FG
439 return -EINVAL;
440 }
441 }
442
f64942e4
AA
443 const std::string dump_path = gen_dump_file_path(output_path);
444
7c673cae
FG
445 // Execute command
446 // ===============
11fdf7f2 447 JournalScanner js(input, rank, type, filter);
7c673cae
FG
448 if (command == "get") {
449 r = js.scan();
450 if (r) {
451 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
452 return r;
453 }
7c673cae
FG
454 } else if (command == "recover_dentries") {
455 r = js.scan();
456 if (r) {
457 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
458 return r;
459 }
460
7c673cae
FG
461 /**
462 * Iterate over log entries, attempting to scavenge from each one
463 */
464 std::set<inodeno_t> consumed_inos;
465 for (JournalScanner::EventMap::iterator i = js.events.begin();
466 i != js.events.end(); ++i) {
11fdf7f2 467 auto& le = i->second.log_event;
7c673cae
FG
468 EMetaBlob const *mb = le->get_metablob();
469 if (mb) {
31f18b77 470 int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
7c673cae
FG
471 if (scav_r) {
472 dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
473 << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
474 if (r == 0) {
475 r = scav_r;
476 }
477 // Our goal is to read all we can, so don't stop on errors, but
478 // do record them for possible later output
479 js.errors.insert(std::make_pair(i->first,
480 JournalScanner::EventError(scav_r, cpp_strerror(r))));
481 }
482 }
483 }
484
485 /**
486 * Update InoTable to reflect any inode numbers consumed during scavenge
487 */
488 dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
489 if (consumed_inos.size() && !dry_run) {
490 int consume_r = consume_inos(consumed_inos);
491 if (consume_r) {
492 dout(1) << "Error updating InoTable for " << consumed_inos.size()
493 << " consume inos: " << cpp_strerror(consume_r) << dendl;
494 if (r == 0) {
495 r = consume_r;
496 }
497 }
498 }
499
500 // Remove consumed dentries from lost+found.
501 if (other_pool && !dry_run) {
502 std::set<std::string> found;
503
504 for (auto i : consumed_inos) {
505 char s[20];
506
507 snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
508 dout(20) << "removing " << s << dendl;
509 found.insert(std::string(s));
510 }
511
512 object_t frag_oid;
513 frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
514 frag_t(), "");
515 output.omap_rm_keys(frag_oid.name, found);
516 }
517 } else if (command == "splice") {
518 r = js.scan();
519 if (r) {
520 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
521 return r;
522 }
523
524 uint64_t start, end;
525 if (filter.get_range(start, end)) {
526 // Special case for range filter: erase a numeric range in the log
527 uint64_t range = end - start;
528 int r = erase_region(js, start, range);
529 if (r) {
530 derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
531 << ": " << cpp_strerror(r) << dendl;
532 return r;
533 }
534 } else {
535 // General case: erase a collection of individual entries in the log
536 for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
537 dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
538
539 int r = erase_region(js, i->first, i->second.raw_size);
540 if (r) {
541 derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
542 << ": " << cpp_strerror(r) << dendl;
543 return r;
544 }
545 }
546 }
547
548
549 } else {
11fdf7f2 550 cerr << "Unknown argument '" << command << "'" << std::endl;
7c673cae
FG
551 return -EINVAL;
552 }
553
554 // Generate output
555 // ===============
f64942e4 556 EventOutput output(js, dump_path);
7c673cae
FG
557 int output_result = 0;
558 if (output_style == "binary") {
559 output_result = output.binary();
560 } else if (output_style == "json") {
561 output_result = output.json();
562 } else if (output_style == "summary") {
563 output.summary();
564 } else if (output_style == "list") {
565 output.list();
566 } else {
567 std::cerr << "Bad output command '" << output_style << "'" << std::endl;
568 return -EINVAL;
569 }
570
571 if (output_result != 0) {
572 std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
573 }
574
575 return output_result;
576}
577
578/**
579 * Provide the user with information about the condition of the journal,
580 * especially indicating what range of log events is available and where
581 * any gaps or corruptions in the journal are.
582 */
583int JournalTool::journal_inspect()
584{
585 int r;
586
11fdf7f2
TL
587 JournalFilter filter(type);
588 JournalScanner js(input, rank, type, filter);
7c673cae
FG
589 r = js.scan();
590 if (r) {
591 std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
592 return r;
593 }
594
595 js.report(std::cout);
596
597 return 0;
598}
599
600
601/**
602 * Attempt to export a binary dump of the journal.
603 *
604 * This is allowed to fail if the header is malformed or there are
605 * objects inaccessible, in which case the user would have to fall
606 * back to manually listing RADOS objects and extracting them, which
607 * they can do with the ``rados`` CLI.
608 */
91327a77 609int JournalTool::journal_export(std::string const &path, bool import, bool force)
7c673cae
FG
610{
611 int r = 0;
11fdf7f2 612 JournalScanner js(input, rank, type);
7c673cae
FG
613
614 if (!import) {
615 /*
616 * If doing an export, first check that the header is valid and
617 * no objects are missing before trying to dump
618 */
619 r = js.scan();
620 if (r < 0) {
621 derr << "Unable to scan journal, assuming badly damaged" << dendl;
622 return r;
623 }
624 if (!js.is_readable()) {
625 derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
626 return -EIO;
627 }
628 }
629
630 /*
631 * Assuming we can cleanly read the journal data, dump it out to a file
632 */
633 {
634 Dumper dumper;
11fdf7f2 635 r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type);
7c673cae
FG
636 if (r < 0) {
637 derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
638 return r;
639 }
640 if (import) {
91327a77 641 r = dumper.undump(path.c_str(), force);
7c673cae 642 } else {
f64942e4
AA
643 const std::string ex_path = gen_dump_file_path(path);
644 r = dumper.dump(ex_path.c_str());
7c673cae 645 }
7c673cae
FG
646 }
647
648 return r;
649}
650
651
652/**
653 * Truncate journal and insert EResetJournal
654 */
655int JournalTool::journal_reset(bool hard)
656{
657 int r = 0;
658 Resetter resetter;
11fdf7f2 659 r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard);
7c673cae
FG
660 if (r < 0) {
661 derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
662 return r;
663 }
664
665 if (hard) {
11fdf7f2 666 r = resetter.reset_hard();
7c673cae 667 } else {
11fdf7f2 668 r = resetter.reset();
7c673cae 669 }
7c673cae
FG
670
671 return r;
672}
673
674
675/**
676 * Selective offline replay which only reads out dentries and writes
677 * them to the backing store iff their version is > what is currently
678 * in the backing store.
679 *
680 * In order to write dentries to the backing store, we may create the
681 * required enclosing dirfrag objects.
682 *
683 * Test this by running scavenge on an unflushed journal, then nuking
684 * it offline, then starting an MDS and seeing that the dentries are
685 * visible.
686 *
687 * @param metablob an EMetaBlob retrieved from the journal
688 * @param dry_run if true, do no writes to RADOS
689 * @param consumed_inos output, populated with any inos inserted
690 * @returns 0 on success, else negative error code
691 */
31f18b77 692int JournalTool::recover_dentries(
7c673cae
FG
693 EMetaBlob const &metablob,
694 bool const dry_run,
695 std::set<inodeno_t> *consumed_inos)
696{
11fdf7f2 697 ceph_assert(consumed_inos != NULL);
7c673cae
FG
698
699 int r = 0;
700
701 // Replay fullbits (dentry+inode)
11fdf7f2 702 for (const auto& frag : metablob.lump_order) {
7c673cae
FG
703 EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
704 lump._decode_bits();
705 object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
706
707 dout(4) << "inspecting lump " << frag_oid.name << dendl;
708
709
710 // We will record old fnode version for use in hard link handling
711 // If we don't read an old fnode, take version as zero and write in
712 // all hardlinks we find.
713 version_t old_fnode_version = 0;
714
715 // Update fnode in omap header of dirfrag object
716 bool write_fnode = false;
717 bufferlist old_fnode_bl;
718 r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
719 if (r == -ENOENT) {
720 // Creating dirfrag from scratch
721 dout(4) << "failed to read OMAP header from directory fragment "
722 << frag_oid.name << " " << cpp_strerror(r) << dendl;
723 write_fnode = true;
724 // Note: creating the dirfrag *without* a backtrace, relying on
725 // MDS to regenerate backtraces on read or in FSCK
726 } else if (r == 0) {
727 // Conditionally update existing omap header
728 fnode_t old_fnode;
11fdf7f2 729 auto old_fnode_iter = old_fnode_bl.cbegin();
7c673cae
FG
730 try {
731 old_fnode.decode(old_fnode_iter);
732 dout(4) << "frag " << frag_oid.name << " fnode old v" <<
f67539c2 733 old_fnode.version << " vs new v" << lump.fnode->version << dendl;
7c673cae 734 old_fnode_version = old_fnode.version;
f67539c2 735 write_fnode = old_fnode_version < lump.fnode->version;
7c673cae
FG
736 } catch (const buffer::error &err) {
737 dout(1) << "frag " << frag_oid.name
738 << " is corrupt, overwriting" << dendl;
739 write_fnode = true;
740 }
741 } else {
742 // Unexpected error
743 dout(4) << "failed to read OMAP header from directory fragment "
744 << frag_oid.name << " " << cpp_strerror(r) << dendl;
745 return r;
746 }
747
748 if ((other_pool || write_fnode) && !dry_run) {
749 dout(4) << "writing fnode to omap header" << dendl;
750 bufferlist fnode_bl;
f67539c2 751 lump.fnode->encode(fnode_bl);
7c673cae
FG
752 if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
753 r = output.omap_set_header(frag_oid.name, fnode_bl);
754 }
755 if (r != 0) {
756 derr << "Failed to write fnode for frag object "
757 << frag_oid.name << dendl;
758 return r;
759 }
760 }
761
762 std::set<std::string> read_keys;
763
764 // Compose list of potentially-existing dentries we would like to fetch
11fdf7f2 765 for (const auto& fb : lump.get_dfull()) {
7c673cae
FG
766 // Get a key like "foobar_head"
767 std::string key;
768 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
769 dn_key.encode(key);
770 read_keys.insert(key);
771 }
772
11fdf7f2 773 for(const auto& rb : lump.get_dremote()) {
7c673cae
FG
774 // Get a key like "foobar_head"
775 std::string key;
776 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
777 dn_key.encode(key);
778 read_keys.insert(key);
779 }
780
11fdf7f2 781 for (const auto& nb : lump.get_dnull()) {
c07f9fc5
FG
782 // Get a key like "foobar_head"
783 std::string key;
784 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
785 dn_key.encode(key);
786 read_keys.insert(key);
787 }
788
7c673cae
FG
789 // Perform bulk read of existing dentries
790 std::map<std::string, bufferlist> read_vals;
791 r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
792 if (r == -ENOENT && other_pool) {
793 r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
794 }
795 if (r != 0) {
796 derr << "unexpected error reading fragment object "
797 << frag_oid.name << ": " << cpp_strerror(r) << dendl;
798 return r;
799 }
800
801 // Compose list of dentries we will write back
802 std::map<std::string, bufferlist> write_vals;
11fdf7f2 803 for (const auto& fb : lump.get_dfull()) {
7c673cae
FG
804 // Get a key like "foobar_head"
805 std::string key;
806 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
807 dn_key.encode(key);
808
809 dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
810 << dendl;
811 bool write_dentry = false;
812 if (read_vals.find(key) == read_vals.end()) {
813 dout(4) << "dentry did not already exist, will create" << dendl;
814 write_dentry = true;
815 } else {
816 dout(4) << "dentry " << key << " existed already" << dendl;
817 dout(4) << "dentry exists, checking versions..." << dendl;
818 bufferlist &old_dentry = read_vals[key];
819 // Decode dentry+inode
11fdf7f2 820 auto q = old_dentry.cbegin();
7c673cae
FG
821
822 snapid_t dnfirst;
11fdf7f2 823 decode(dnfirst, q);
7c673cae 824 char dentry_type;
11fdf7f2 825 decode(dentry_type, q);
7c673cae 826
f67539c2 827 if (dentry_type == 'L' || dentry_type == 'l') {
7c673cae
FG
828 // leave write_dentry false, we have no version to
829 // compare with in a hardlink, so it's not safe to
830 // squash over it with what's in this fullbit
831 dout(10) << "Existing remote inode in slot to be (maybe) written "
832 << "by a full inode from the journal dn '" << fb.dn.c_str()
f67539c2 833 << "' with lump fnode version " << lump.fnode->version
7c673cae 834 << "vs existing fnode version " << old_fnode_version << dendl;
f67539c2
TL
835 write_dentry = old_fnode_version < lump.fnode->version;
836 } else if (dentry_type == 'I' || dentry_type == 'i') {
7c673cae
FG
837 // Read out inode version to compare with backing store
838 InodeStore inode;
f67539c2
TL
839 if (dentry_type == 'i') {
840 mempool::mds_co::string alternate_name;
841
842 DECODE_START(2, q);
843 if (struct_v >= 2)
844 decode(alternate_name, q);
845 inode.decode(q);
846 DECODE_FINISH(q);
847 } else {
848 inode.decode_bare(q);
849 }
7c673cae 850 dout(4) << "decoded embedded inode version "
f67539c2
TL
851 << inode.inode->version << " vs fullbit version "
852 << fb.inode->version << dendl;
853 if (inode.inode->version < fb.inode->version) {
7c673cae
FG
854 write_dentry = true;
855 }
856 } else {
857 dout(4) << "corrupt dentry in backing store, overwriting from "
858 "journal" << dendl;
859 write_dentry = true;
860 }
861 }
862
863 if ((other_pool || write_dentry) && !dry_run) {
864 dout(4) << "writing I dentry " << key << " into frag "
865 << frag_oid.name << dendl;
866
867 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
868 bufferlist dentry_bl;
11fdf7f2
TL
869 encode(fb.dnfirst, dentry_bl);
870 encode('I', dentry_bl);
7c673cae
FG
871 encode_fullbit_as_inode(fb, true, &dentry_bl);
872
873 // Record for writing to RADOS
874 write_vals[key] = dentry_bl;
f67539c2 875 consumed_inos->insert(fb.inode->ino);
7c673cae
FG
876 }
877 }
878
11fdf7f2 879 for(const auto& rb : lump.get_dremote()) {
7c673cae
FG
880 // Get a key like "foobar_head"
881 std::string key;
882 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
883 dn_key.encode(key);
884
885 dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
886 << dendl;
887 bool write_dentry = false;
888 if (read_vals.find(key) == read_vals.end()) {
889 dout(4) << "dentry did not already exist, will create" << dendl;
890 write_dentry = true;
891 } else {
892 dout(4) << "dentry " << key << " existed already" << dendl;
893 dout(4) << "dentry exists, checking versions..." << dendl;
894 bufferlist &old_dentry = read_vals[key];
895 // Decode dentry+inode
11fdf7f2 896 auto q = old_dentry.cbegin();
7c673cae
FG
897
898 snapid_t dnfirst;
11fdf7f2 899 decode(dnfirst, q);
7c673cae 900 char dentry_type;
11fdf7f2 901 decode(dentry_type, q);
7c673cae 902
f67539c2 903 if (dentry_type == 'L' || dentry_type == 'l') {
7c673cae
FG
904 dout(10) << "Existing hardlink inode in slot to be (maybe) written "
905 << "by a remote inode from the journal dn '" << rb.dn.c_str()
f67539c2 906 << "' with lump fnode version " << lump.fnode->version
7c673cae 907 << "vs existing fnode version " << old_fnode_version << dendl;
f67539c2
TL
908 write_dentry = old_fnode_version < lump.fnode->version;
909 } else if (dentry_type == 'I' || dentry_type == 'i') {
7c673cae
FG
910 dout(10) << "Existing full inode in slot to be (maybe) written "
911 << "by a remote inode from the journal dn '" << rb.dn.c_str()
f67539c2 912 << "' with lump fnode version " << lump.fnode->version
7c673cae 913 << "vs existing fnode version " << old_fnode_version << dendl;
f67539c2 914 write_dentry = old_fnode_version < lump.fnode->version;
7c673cae
FG
915 } else {
916 dout(4) << "corrupt dentry in backing store, overwriting from "
917 "journal" << dendl;
918 write_dentry = true;
919 }
920 }
921
922 if ((other_pool || write_dentry) && !dry_run) {
923 dout(4) << "writing L dentry " << key << " into frag "
924 << frag_oid.name << dendl;
925
926 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
927 bufferlist dentry_bl;
11fdf7f2
TL
928 encode(rb.dnfirst, dentry_bl);
929 encode('L', dentry_bl);
930 encode(rb.ino, dentry_bl);
931 encode(rb.d_type, dentry_bl);
7c673cae
FG
932
933 // Record for writing to RADOS
934 write_vals[key] = dentry_bl;
935 consumed_inos->insert(rb.ino);
936 }
937 }
938
c07f9fc5 939 std::set<std::string> null_vals;
11fdf7f2 940 for (const auto& nb : lump.get_dnull()) {
c07f9fc5
FG
941 std::string key;
942 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
943 dn_key.encode(key);
944
945 dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
946 << dendl;
947
948 auto it = read_vals.find(key);
949 if (it != read_vals.end()) {
950 dout(4) << "dentry exists, will remove" << dendl;
951
11fdf7f2 952 auto q = it->second.cbegin();
c07f9fc5 953 snapid_t dnfirst;
11fdf7f2 954 decode(dnfirst, q);
c07f9fc5 955 char dentry_type;
11fdf7f2 956 decode(dentry_type, q);
c07f9fc5
FG
957
958 bool remove_dentry = false;
f67539c2 959 if (dentry_type == 'L' || dentry_type == 'l') {
c07f9fc5
FG
960 dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
961 << "by null journal dn '" << nb.dn.c_str()
f67539c2 962 << "' with lump fnode version " << lump.fnode->version
c07f9fc5 963 << "vs existing fnode version " << old_fnode_version << dendl;
f67539c2
TL
964 remove_dentry = old_fnode_version < lump.fnode->version;
965 } else if (dentry_type == 'I' || dentry_type == 'i') {
c07f9fc5
FG
966 dout(10) << "Existing full inode in slot to be (maybe) removed "
967 << "by null journal dn '" << nb.dn.c_str()
f67539c2 968 << "' with lump fnode version " << lump.fnode->version
c07f9fc5 969 << "vs existing fnode version " << old_fnode_version << dendl;
f67539c2 970 remove_dentry = old_fnode_version < lump.fnode->version;
c07f9fc5
FG
971 } else {
972 dout(4) << "corrupt dentry in backing store, will remove" << dendl;
973 remove_dentry = true;
974 }
975
976 if (remove_dentry)
977 null_vals.insert(key);
978 }
979 }
980
7c673cae
FG
981 // Write back any new/changed dentries
982 if (!write_vals.empty()) {
983 r = output.omap_set(frag_oid.name, write_vals);
984 if (r != 0) {
985 derr << "error writing dentries to " << frag_oid.name
986 << ": " << cpp_strerror(r) << dendl;
987 return r;
988 }
989 }
c07f9fc5
FG
990
991 // remove any null dentries
992 if (!null_vals.empty()) {
993 r = output.omap_rm_keys(frag_oid.name, null_vals);
994 if (r != 0) {
995 derr << "error removing dentries from " << frag_oid.name
996 << ": " << cpp_strerror(r) << dendl;
997 return r;
998 }
999 }
7c673cae
FG
1000 }
1001
1002 /* Now that we've looked at the dirlumps, we finally pay attention to
1003 * the roots (i.e. inodes without ancestry). This is necessary in order
1004 * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
1005 * important because clients use them to infer completeness
1006 * of directories
1007 */
11fdf7f2 1008 for (const auto& fb : metablob.roots) {
f67539c2 1009 inodeno_t ino = fb.inode->ino;
7c673cae
FG
1010 dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
1011
1012 object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
1013 dout(4) << "object id " << root_oid.name << dendl;
1014
1015 bool write_root_ino = false;
1016 bufferlist old_root_ino_bl;
1017 r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
1018 if (r == -ENOENT) {
1019 dout(4) << "root does not exist, will create" << dendl;
1020 write_root_ino = true;
1021 } else if (r >= 0) {
1022 r = 0;
1023 InodeStore old_inode;
1024 dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
1025 << ")" << dendl;
11fdf7f2 1026 auto inode_bl_iter = old_root_ino_bl.cbegin();
7c673cae 1027 std::string magic;
11fdf7f2 1028 decode(magic, inode_bl_iter);
7c673cae
FG
1029 if (magic == CEPH_FS_ONDISK_MAGIC) {
1030 dout(4) << "magic ok" << dendl;
1031 old_inode.decode(inode_bl_iter);
1032
f67539c2 1033 if (old_inode.inode->version < fb.inode->version) {
7c673cae
FG
1034 write_root_ino = true;
1035 }
1036 } else {
1037 dout(4) << "magic bad: '" << magic << "'" << dendl;
1038 write_root_ino = true;
1039 }
1040 } else {
1041 derr << "error reading root inode object " << root_oid.name
1042 << ": " << cpp_strerror(r) << dendl;
1043 return r;
1044 }
1045
1046 if (write_root_ino && !dry_run) {
1047 dout(4) << "writing root ino " << root_oid.name
f67539c2 1048 << " version " << fb.inode->version << dendl;
7c673cae
FG
1049
1050 // Compose: root ino format is magic,InodeStore(bare=false)
1051 bufferlist new_root_ino_bl;
11fdf7f2 1052 encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
7c673cae
FG
1053 encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
1054
1055 // Write to RADOS
1056 r = output.write_full(root_oid.name, new_root_ino_bl);
1057 if (r != 0) {
1058 derr << "error writing inode object " << root_oid.name
1059 << ": " << cpp_strerror(r) << dendl;
1060 return r;
1061 }
1062 }
1063 }
1064
1065 return r;
1066}
1067
1068
7c673cae
FG
1069/**
1070 * Erase a region of the log by overwriting it with ENoOp
1071 *
1072 */
1073int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
1074{
1075 // To erase this region, we use our preamble, the encoding overhead
1076 // of an ENoOp, and our trailing start ptr. Calculate how much padding
1077 // is needed inside the ENoOp to make up the difference.
1078 bufferlist tmp;
11fdf7f2
TL
1079 if (type == "mdlog") {
1080 ENoOp enoop(0);
1081 enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
1082 } else if (type == "purge_queue") {
1083 PurgeItem pi;
1084 pi.encode(tmp);
1085 }
7c673cae
FG
1086
1087 dout(4) << "erase_region " << pos << " len=" << length << dendl;
1088
1089 // FIXME: get the preamble/postamble length via JournalStream
1090 int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
1091 dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
1092
1093 if (padding < 0) {
1094 derr << "Erase region " << length << " too short" << dendl;
1095 return -EINVAL;
1096 }
1097
7c673cae 1098 bufferlist entry;
11fdf7f2
TL
1099 if (type == "mdlog") {
1100 // Serialize an ENoOp with the correct amount of padding
1101 ENoOp enoop(padding);
1102 enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
1103 } else if (type == "purge_queue") {
1104 PurgeItem pi;
1105 pi.pad_size = padding;
1106 pi.encode(entry);
1107 }
7c673cae 1108 JournalStream stream(JOURNAL_FORMAT_RESILIENT);
7c673cae
FG
1109 // Serialize region of log stream
1110 bufferlist log_data;
1111 stream.write(entry, &log_data, pos);
1112
1113 dout(4) << "erase_region data length " << log_data.length() << dendl;
11fdf7f2 1114 ceph_assert(log_data.length() == length);
7c673cae
FG
1115
1116 // Write log stream region to RADOS
1117 // FIXME: get object size somewhere common to scan_events
11fdf7f2 1118 uint32_t object_size = g_conf()->mds_log_segment_size;
7c673cae
FG
1119 if (object_size == 0) {
1120 // Default layout object size
1121 object_size = file_layout_t::get_default().object_size;
1122 }
1123
1124 uint64_t write_offset = pos;
1125 uint64_t obj_offset = (pos / object_size);
1126 int r = 0;
1127 while(log_data.length()) {
1128 std::string const oid = js.obj_name(obj_offset);
1129 uint32_t offset_in_obj = write_offset % object_size;
1130 uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
1131
1132 r = output.write(oid, log_data, write_len, offset_in_obj);
1133 if (r < 0) {
1134 return r;
1135 } else {
1136 dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
1137 r = 0;
1138 }
1139
1140 log_data.splice(0, write_len);
1141 write_offset += write_len;
1142 obj_offset++;
1143 }
1144
1145 return r;
1146}
1147
1148/**
1149 * Given an EMetaBlob::fullbit containing an inode, write out
1150 * the encoded inode in the format used by InodeStore (i.e. the
1151 * backing store format)
1152 *
1153 * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
1154 * on an offline InodeStore instance. It's way simpler, because we are just
1155 * uncritically hauling the data between structs.
1156 *
1157 * @param fb a fullbit extracted from a journal entry
1158 * @param bare if true, leave out [EN|DE]CODE_START decoration
1159 * @param out_bl output, write serialized inode to this bufferlist
1160 */
1161void JournalTool::encode_fullbit_as_inode(
1162 const EMetaBlob::fullbit &fb,
1163 const bool bare,
1164 bufferlist *out_bl)
1165{
11fdf7f2 1166 ceph_assert(out_bl != NULL);
7c673cae
FG
1167
1168 // Compose InodeStore
1169 InodeStore new_inode;
1170 new_inode.inode = fb.inode;
1171 new_inode.xattrs = fb.xattrs;
1172 new_inode.dirfragtree = fb.dirfragtree;
1173 new_inode.snap_blob = fb.snapbl;
11fdf7f2 1174 new_inode.symlink = fb.symlink;
7c673cae
FG
1175 new_inode.old_inodes = fb.old_inodes;
1176
1177 // Serialize InodeStore
1178 if (bare) {
1179 new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1180 } else {
1181 new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1182 }
1183}
1184
1185/**
1186 * Given a list of inode numbers known to be in use by
1187 * inodes in the backing store, ensure that none of these
1188 * numbers are listed as free in the InoTables in the
1189 * backing store.
1190 *
1191 * Used after injecting inodes into the backing store, to
1192 * ensure that the same inode numbers are not subsequently
1193 * used for new files during ordinary operation.
1194 *
1195 * @param inos list of inode numbers to be removed from
1196 * free lists in InoTables
1197 * @returns 0 on success, else negative error code
1198 */
1199int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
1200{
1201 int r = 0;
1202
1203 // InoTable is a per-MDS structure, so iterate over assigned ranks
1204 auto fs = fsmap->get_filesystem(role_selector.get_ns());
1205 std::set<mds_rank_t> in_ranks;
1206 fs->mds_map.get_mds_set(in_ranks);
1207
1208 for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
1209 rank_i != in_ranks.end(); ++rank_i)
1210 {
1211 // Compose object name
1212 std::ostringstream oss;
1213 oss << "mds" << *rank_i << "_inotable";
1214 object_t inotable_oid = object_t(oss.str());
1215
1216 // Read object
1217 bufferlist inotable_bl;
1218 int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
1219 if (read_r < 0) {
1220 // Things are really bad if we can't read inotable. Beyond our powers.
1221 derr << "unable to read inotable '" << inotable_oid.name << "': "
1222 << cpp_strerror(read_r) << dendl;
1223 r = r ? r : read_r;
1224 continue;
1225 }
1226
1227 // Deserialize InoTable
1228 version_t inotable_ver;
11fdf7f2
TL
1229 auto q = inotable_bl.cbegin();
1230 decode(inotable_ver, q);
7c673cae
FG
1231 InoTable ino_table(NULL);
1232 ino_table.decode(q);
1233
1234 // Update InoTable in memory
1235 bool inotable_modified = false;
1236 for (std::set<inodeno_t>::iterator i = inos.begin();
1237 i != inos.end(); ++i)
1238 {
1239 const inodeno_t ino = *i;
1240 if (ino_table.force_consume(ino)) {
1241 dout(4) << "Used ino 0x" << std::hex << ino << std::dec
1242 << " requires inotable update" << dendl;
1243 inotable_modified = true;
1244 }
1245 }
1246
1247 // Serialize and write InoTable
1248 if (inotable_modified) {
1249 inotable_ver += 1;
1250 dout(4) << "writing modified inotable version " << inotable_ver << dendl;
1251 bufferlist inotable_new_bl;
11fdf7f2 1252 encode(inotable_ver, inotable_new_bl);
7c673cae
FG
1253 ino_table.encode_state(inotable_new_bl);
1254 int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
1255 if (write_r != 0) {
1256 derr << "error writing modified inotable " << inotable_oid.name
1257 << ": " << cpp_strerror(write_r) << dendl;
1258 r = r ? r : read_r;
1259 continue;
1260 }
1261 }
1262 }
1263
1264 return r;
1265}
1266