]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/cephfs/JournalTool.cc
update sources to v12.1.2
[ceph.git] / ceph / src / tools / cephfs / JournalTool.cc
CommitLineData
7c673cae
FG
1// -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * ceph - scalable distributed file system
5 *
6 * copyright (c) 2014 john spray <john.spray@inktank.com>
7 *
8 * this is free software; you can redistribute it and/or
9 * modify it under the terms of the gnu lesser general public
10 * license version 2.1, as published by the free software
11 * foundation. see file copying.
12 */
13
14
15#include <sstream>
16
17#include "common/ceph_argparse.h"
18#include "common/errno.h"
19#include "osdc/Journaler.h"
20#include "mds/mdstypes.h"
21#include "mds/LogEvent.h"
22#include "mds/InoTable.h"
23
24#include "mds/events/ENoOp.h"
25#include "mds/events/EUpdate.h"
26
27#include "JournalScanner.h"
28#include "EventOutput.h"
29#include "Dumper.h"
30#include "Resetter.h"
31
32#include "JournalTool.h"
33
34
// Logging configuration for this translation unit: dout/derr use the global
// Ceph context and the MDS debug subsystem, and every message is prefixed
// with the name of the function that emitted it.
35#define dout_context g_ceph_context
36#define dout_subsys ceph_subsys_mds
37#undef dout_prefix
38#define dout_prefix *_dout << __func__ << ": "
39
40
41
42void JournalTool::usage()
43{
44 std::cout << "Usage: \n"
45 << " cephfs-journal-tool [options] journal <command>\n"
46 << " <command>:\n"
47 << " inspect\n"
48 << " import <path>\n"
49 << " export <path>\n"
50 << " reset [--force]\n"
51 << " cephfs-journal-tool [options] header <get|set <field> <value>\n"
31f18b77 52 << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
7c673cae
FG
53 << " <selector>:\n"
54 << " --range=<start>..<end>\n"
55 << " --path=<substring>\n"
56 << " --inode=<integer>\n"
57 << " --type=<UPDATE|OPEN|SESSION...><\n"
58 << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
7c673cae 59 << " --client=<session id integer>\n"
31f18b77 60 << " <effect>: [get|recover_dentries|splice]\n"
7c673cae
FG
61 << " <output>: [summary|list|binary|json] [--path <path>]\n"
62 << "\n"
31f18b77 63 << "General options:\n"
7c673cae
FG
64 << " --rank=filesystem:mds-rank Journal rank (required if multiple\n"
65 << " file systems, default is rank 0 on\n"
31f18b77
FG
66 << " the only filesystem otherwise.\n"
67 << "\n"
68 << "Special options\n"
69 << " --alternate-pool <name> Alternative metadata pool to target\n"
70 << " when using recover_dentries.\n";
7c673cae
FG
71
72 generic_client_usage();
73}
74
75
76/**
77 * Handle arguments and hand off to journal/header/event mode
78 */
79int JournalTool::main(std::vector<const char*> &argv)
80{
81 int r;
82
83 dout(10) << "JournalTool::main " << dendl;
84 // Common arg parsing
85 // ==================
86 if (argv.empty()) {
87 usage();
88 return -EINVAL;
89 }
90
91 std::vector<const char*>::iterator arg = argv.begin();
92
93 std::string rank_str;
94 if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
95 // Default: act on rank 0. Will give the user an error if they
96 // try invoking this way when they have more than one filesystem.
97 rank_str = "0";
98 }
99
100 r = role_selector.parse(*fsmap, rank_str);
101 if (r != 0) {
102 derr << "Couldn't determine MDS rank." << dendl;
103 return r;
104 }
105
106 std::string mode;
107 if (arg == argv.end()) {
108 derr << "Missing mode [journal|header|event]" << dendl;
109 return -EINVAL;
110 }
111 mode = std::string(*arg);
112 arg = argv.erase(arg);
113
114 // RADOS init
115 // ==========
116 r = rados.init_with_context(g_ceph_context);
117 if (r < 0) {
118 derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
119 return r;
120 }
121
122 dout(4) << "JournalTool: connecting to RADOS..." << dendl;
123 r = rados.connect();
124 if (r < 0) {
125 derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
126 return r;
127 }
128
129 auto fs = fsmap->get_filesystem(role_selector.get_ns());
130 assert(fs != nullptr);
131 int64_t const pool_id = fs->mds_map.get_metadata_pool();
132 dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
133 std::string pool_name;
134 r = rados.pool_reverse_lookup(pool_id, &pool_name);
135 if (r < 0) {
136 derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
137 return r;
138 }
139
140 dout(4) << "JournalTool: creating IoCtx.." << dendl;
141 r = rados.ioctx_create(pool_name.c_str(), input);
142 assert(r == 0);
143 output.dup(input);
144
145 // Execution
146 // =========
147 for (auto role : role_selector.get_roles()) {
148 rank = role.rank;
149 dout(4) << "Executing for rank " << rank << dendl;
150 if (mode == std::string("journal")) {
151 r = main_journal(argv);
152 } else if (mode == std::string("header")) {
153 r = main_header(argv);
154 } else if (mode == std::string("event")) {
155 r = main_event(argv);
156 } else {
157 derr << "Bad command '" << mode << "'" << dendl;
158 usage();
159 return -EINVAL;
160 }
161
162 if (r != 0) {
163 return r;
164 }
165 }
166
167 return r;
168}
169
170
171/**
172 * Handle arguments for 'journal' mode
173 *
174 * This is for operations that act on the journal as a whole.
175 */
176int JournalTool::main_journal(std::vector<const char*> &argv)
177{
178 std::string command = argv[0];
179 if (command == "inspect") {
180 return journal_inspect();
181 } else if (command == "export" || command == "import") {
182 if (argv.size() >= 2) {
183 std::string const path = argv[1];
184 return journal_export(path, command == "import");
185 } else {
186 derr << "Missing path" << dendl;
187 return -EINVAL;
188 }
189 } else if (command == "reset") {
190 bool force = false;
191 if (argv.size() == 2) {
192 if (std::string(argv[1]) == "--force") {
193 force = true;
194 } else {
195 std::cerr << "Unknown argument " << argv[1] << std::endl;
196 usage();
197 return -EINVAL;
198 }
199 } else if (argv.size() > 2) {
200 std::cerr << "Too many arguments!" << std::endl;
201 usage();
202 return -EINVAL;
203 }
204 return journal_reset(force);
205 } else {
206 derr << "Bad journal command '" << command << "'" << dendl;
207 return -EINVAL;
208 }
209}
210
211
212/**
213 * Parse arguments and execute for 'header' mode
214 *
215 * This is for operations that act on the header only.
216 */
217int JournalTool::main_header(std::vector<const char*> &argv)
218{
219 JournalFilter filter;
220 JournalScanner js(input, rank, filter);
221 int r = js.scan(false);
222 if (r < 0) {
223 std::cerr << "Unable to scan journal" << std::endl;
224 return r;
225 }
226
227 if (!js.header_present) {
228 std::cerr << "Header object not found!" << std::endl;
229 return -ENOENT;
230 } else if (!js.header_valid && js.header == NULL) {
231 // Can't do a read or a single-field write without a copy of the original
232 derr << "Header could not be read!" << dendl;
233 return -ENOENT;
234 } else {
235 assert(js.header != NULL);
236 }
237
238 if (argv.size() == 0) {
239 derr << "Invalid header command, must be [get|set]" << dendl;
240 return -EINVAL;
241 }
242 std::vector<const char *>::iterator arg = argv.begin();
243 std::string const command = *arg;
244 arg = argv.erase(arg);
245
246 if (command == std::string("get")) {
247 // Write JSON journal dump to stdout
248 JSONFormatter jf(true);
249 js.header->dump(&jf);
250 jf.flush(std::cout);
251 std::cout << std::endl;
252 } else if (command == std::string("set")) {
253 // Need two more args <key> <val>
254 if (argv.size() != 2) {
255 derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
256 return -EINVAL;
257 }
258
259 std::string const field_name = *arg;
260 arg = argv.erase(arg);
261
262 std::string const value_str = *arg;
263 arg = argv.erase(arg);
264 assert(argv.empty());
265
266 std::string parse_err;
267 uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
268 if (!parse_err.empty()) {
269 derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
270 return -EINVAL;
271 }
272
273 uint64_t *field = NULL;
274 if (field_name == "trimmed_pos") {
275 field = &(js.header->trimmed_pos);
276 } else if (field_name == "expire_pos") {
277 field = &(js.header->expire_pos);
278 } else if (field_name == "write_pos") {
279 field = &(js.header->write_pos);
280 } else {
281 derr << "Invalid field '" << field_name << "'" << dendl;
282 return -EINVAL;
283 }
284
285 std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
286 *field = new_val;
287
288 dout(4) << "Writing object..." << dendl;
289 bufferlist header_bl;
290 ::encode(*(js.header), header_bl);
291 output.write_full(js.obj_name(0), header_bl);
292 dout(4) << "Write complete." << dendl;
293 std::cout << "Successfully updated header." << std::endl;
294 } else {
295 derr << "Bad header command '" << command << "'" << dendl;
296 return -EINVAL;
297 }
298
299 return 0;
300}
301
302
303/**
304 * Parse arguments and execute for 'event' mode
305 *
306 * This is for operations that act on LogEvents within the log
307 */
308int JournalTool::main_event(std::vector<const char*> &argv)
309{
310 int r;
311
312 std::vector<const char*>::iterator arg = argv.begin();
313
314 std::string command = *(arg++);
31f18b77 315 if (command != "get" && command != "splice" && command != "recover_dentries") {
7c673cae
FG
316 derr << "Unknown argument '" << command << "'" << dendl;
317 usage();
318 return -EINVAL;
319 }
320
321 if (arg == argv.end()) {
322 derr << "Incomplete command line" << dendl;
323 usage();
324 return -EINVAL;
325 }
326
327 // Parse filter options
328 // ====================
329 JournalFilter filter;
330 r = filter.parse_args(argv, arg);
331 if (r) {
332 return r;
333 }
334
335 // Parse output options
336 // ====================
337 if (arg == argv.end()) {
338 derr << "Missing output command" << dendl;
339 usage();
340 }
341 std::string output_style = *(arg++);
342 if (output_style != "binary" && output_style != "json" &&
343 output_style != "summary" && output_style != "list") {
344 derr << "Unknown argument: '" << output_style << "'" << dendl;
345 usage();
346 return -EINVAL;
347 }
348
349 std::string output_path = "dump";
350 while(arg != argv.end()) {
351 std::string arg_str;
352 if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
353 output_path = arg_str;
354 } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
355 nullptr)) {
356 dout(1) << "Using alternate pool " << arg_str << dendl;
357 int r = rados.ioctx_create(arg_str.c_str(), output);
358 assert(r == 0);
359 other_pool = true;
360 } else {
361 derr << "Unknown argument: '" << *arg << "'" << dendl;
362 usage();
363 return -EINVAL;
364 }
365 }
366
367 // Execute command
368 // ===============
369 JournalScanner js(input, rank, filter);
370 if (command == "get") {
371 r = js.scan();
372 if (r) {
373 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
374 return r;
375 }
7c673cae
FG
376 } else if (command == "recover_dentries") {
377 r = js.scan();
378 if (r) {
379 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
380 return r;
381 }
382
383 bool dry_run = false;
384 if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
385 dry_run = true;
386 }
387
388 /**
389 * Iterate over log entries, attempting to scavenge from each one
390 */
391 std::set<inodeno_t> consumed_inos;
392 for (JournalScanner::EventMap::iterator i = js.events.begin();
393 i != js.events.end(); ++i) {
394 LogEvent *le = i->second.log_event;
395 EMetaBlob const *mb = le->get_metablob();
396 if (mb) {
31f18b77 397 int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
7c673cae
FG
398 if (scav_r) {
399 dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
400 << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
401 if (r == 0) {
402 r = scav_r;
403 }
404 // Our goal is to read all we can, so don't stop on errors, but
405 // do record them for possible later output
406 js.errors.insert(std::make_pair(i->first,
407 JournalScanner::EventError(scav_r, cpp_strerror(r))));
408 }
409 }
410 }
411
412 /**
413 * Update InoTable to reflect any inode numbers consumed during scavenge
414 */
415 dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
416 if (consumed_inos.size() && !dry_run) {
417 int consume_r = consume_inos(consumed_inos);
418 if (consume_r) {
419 dout(1) << "Error updating InoTable for " << consumed_inos.size()
420 << " consume inos: " << cpp_strerror(consume_r) << dendl;
421 if (r == 0) {
422 r = consume_r;
423 }
424 }
425 }
426
427 // Remove consumed dentries from lost+found.
428 if (other_pool && !dry_run) {
429 std::set<std::string> found;
430
431 for (auto i : consumed_inos) {
432 char s[20];
433
434 snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
435 dout(20) << "removing " << s << dendl;
436 found.insert(std::string(s));
437 }
438
439 object_t frag_oid;
440 frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
441 frag_t(), "");
442 output.omap_rm_keys(frag_oid.name, found);
443 }
444 } else if (command == "splice") {
445 r = js.scan();
446 if (r) {
447 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
448 return r;
449 }
450
451 uint64_t start, end;
452 if (filter.get_range(start, end)) {
453 // Special case for range filter: erase a numeric range in the log
454 uint64_t range = end - start;
455 int r = erase_region(js, start, range);
456 if (r) {
457 derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
458 << ": " << cpp_strerror(r) << dendl;
459 return r;
460 }
461 } else {
462 // General case: erase a collection of individual entries in the log
463 for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
464 dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
465
466 int r = erase_region(js, i->first, i->second.raw_size);
467 if (r) {
468 derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
469 << ": " << cpp_strerror(r) << dendl;
470 return r;
471 }
472 }
473 }
474
475
476 } else {
477 derr << "Unknown argument '" << command << "'" << dendl;
478 usage();
479 return -EINVAL;
480 }
481
482 // Generate output
483 // ===============
484 EventOutput output(js, output_path);
485 int output_result = 0;
486 if (output_style == "binary") {
487 output_result = output.binary();
488 } else if (output_style == "json") {
489 output_result = output.json();
490 } else if (output_style == "summary") {
491 output.summary();
492 } else if (output_style == "list") {
493 output.list();
494 } else {
495 std::cerr << "Bad output command '" << output_style << "'" << std::endl;
496 return -EINVAL;
497 }
498
499 if (output_result != 0) {
500 std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
501 }
502
503 return output_result;
504}
505
506/**
507 * Provide the user with information about the condition of the journal,
508 * especially indicating what range of log events is available and where
509 * any gaps or corruptions in the journal are.
510 */
511int JournalTool::journal_inspect()
512{
513 int r;
514
515 JournalFilter filter;
516 JournalScanner js(input, rank, filter);
517 r = js.scan();
518 if (r) {
519 std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
520 return r;
521 }
522
523 js.report(std::cout);
524
525 return 0;
526}
527
528
529/**
530 * Attempt to export a binary dump of the journal.
531 *
532 * This is allowed to fail if the header is malformed or there are
533 * objects inaccessible, in which case the user would have to fall
534 * back to manually listing RADOS objects and extracting them, which
535 * they can do with the ``rados`` CLI.
536 */
537int JournalTool::journal_export(std::string const &path, bool import)
538{
539 int r = 0;
540 JournalScanner js(input, rank);
541
542 if (!import) {
543 /*
544 * If doing an export, first check that the header is valid and
545 * no objects are missing before trying to dump
546 */
547 r = js.scan();
548 if (r < 0) {
549 derr << "Unable to scan journal, assuming badly damaged" << dendl;
550 return r;
551 }
552 if (!js.is_readable()) {
553 derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
554 return -EIO;
555 }
556 }
557
558 /*
559 * Assuming we can cleanly read the journal data, dump it out to a file
560 */
561 {
562 Dumper dumper;
563 r = dumper.init(mds_role_t(role_selector.get_ns(), rank));
564 if (r < 0) {
565 derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
566 return r;
567 }
568 if (import) {
569 r = dumper.undump(path.c_str());
570 } else {
571 r = dumper.dump(path.c_str());
572 }
573 dumper.shutdown();
574 }
575
576 return r;
577}
578
579
580/**
581 * Truncate journal and insert EResetJournal
582 */
583int JournalTool::journal_reset(bool hard)
584{
585 int r = 0;
586 Resetter resetter;
587 r = resetter.init();
588 if (r < 0) {
589 derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
590 return r;
591 }
592
593 if (hard) {
594 r = resetter.reset_hard(mds_role_t(role_selector.get_ns(), rank));
595 } else {
596 r = resetter.reset(mds_role_t(role_selector.get_ns(), rank));
597 }
598 resetter.shutdown();
599
600 return r;
601}
602
603
604/**
605 * Selective offline replay which only reads out dentries and writes
606 * them to the backing store iff their version is > what is currently
607 * in the backing store.
608 *
609 * In order to write dentries to the backing store, we may create the
610 * required enclosing dirfrag objects.
611 *
612 * Test this by running scavenge on an unflushed journal, then nuking
613 * it offline, then starting an MDS and seeing that the dentries are
614 * visible.
615 *
616 * @param metablob an EMetaBlob retrieved from the journal
617 * @param dry_run if true, do no writes to RADOS
618 * @param consumed_inos output, populated with any inos inserted
619 * @returns 0 on success, else negative error code
620 */
31f18b77 621int JournalTool::recover_dentries(
7c673cae
FG
622 EMetaBlob const &metablob,
623 bool const dry_run,
624 std::set<inodeno_t> *consumed_inos)
625{
626 assert(consumed_inos != NULL);
627
628 int r = 0;
629
630 // Replay fullbits (dentry+inode)
631 for (list<dirfrag_t>::const_iterator lp = metablob.lump_order.begin();
632 lp != metablob.lump_order.end(); ++lp)
633 {
634 dirfrag_t const &frag = *lp;
635 EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
636 lump._decode_bits();
637 object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
638
639 dout(4) << "inspecting lump " << frag_oid.name << dendl;
640
641
642 // We will record old fnode version for use in hard link handling
643 // If we don't read an old fnode, take version as zero and write in
644 // all hardlinks we find.
645 version_t old_fnode_version = 0;
646
647 // Update fnode in omap header of dirfrag object
648 bool write_fnode = false;
649 bufferlist old_fnode_bl;
650 r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
651 if (r == -ENOENT) {
652 // Creating dirfrag from scratch
653 dout(4) << "failed to read OMAP header from directory fragment "
654 << frag_oid.name << " " << cpp_strerror(r) << dendl;
655 write_fnode = true;
656 // Note: creating the dirfrag *without* a backtrace, relying on
657 // MDS to regenerate backtraces on read or in FSCK
658 } else if (r == 0) {
659 // Conditionally update existing omap header
660 fnode_t old_fnode;
661 bufferlist::iterator old_fnode_iter = old_fnode_bl.begin();
662 try {
663 old_fnode.decode(old_fnode_iter);
664 dout(4) << "frag " << frag_oid.name << " fnode old v" <<
665 old_fnode.version << " vs new v" << lump.fnode.version << dendl;
666 old_fnode_version = old_fnode.version;
667 write_fnode = old_fnode_version < lump.fnode.version;
668 } catch (const buffer::error &err) {
669 dout(1) << "frag " << frag_oid.name
670 << " is corrupt, overwriting" << dendl;
671 write_fnode = true;
672 }
673 } else {
674 // Unexpected error
675 dout(4) << "failed to read OMAP header from directory fragment "
676 << frag_oid.name << " " << cpp_strerror(r) << dendl;
677 return r;
678 }
679
680 if ((other_pool || write_fnode) && !dry_run) {
681 dout(4) << "writing fnode to omap header" << dendl;
682 bufferlist fnode_bl;
683 lump.fnode.encode(fnode_bl);
684 if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
685 r = output.omap_set_header(frag_oid.name, fnode_bl);
686 }
687 if (r != 0) {
688 derr << "Failed to write fnode for frag object "
689 << frag_oid.name << dendl;
690 return r;
691 }
692 }
693
694 std::set<std::string> read_keys;
695
696 // Compose list of potentially-existing dentries we would like to fetch
697 list<ceph::shared_ptr<EMetaBlob::fullbit> > const &fb_list =
698 lump.get_dfull();
699 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
700 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
701 EMetaBlob::fullbit const &fb = *(*fbi);
702
703 // Get a key like "foobar_head"
704 std::string key;
705 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
706 dn_key.encode(key);
707 read_keys.insert(key);
708 }
709
710 list<EMetaBlob::remotebit> const &rb_list =
711 lump.get_dremote();
712 for (list<EMetaBlob::remotebit>::const_iterator rbi =
713 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
714 EMetaBlob::remotebit const &rb = *rbi;
715
716 // Get a key like "foobar_head"
717 std::string key;
718 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
719 dn_key.encode(key);
720 read_keys.insert(key);
721 }
722
c07f9fc5
FG
723 list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull();
724 for (auto& nb : nb_list) {
725 // Get a key like "foobar_head"
726 std::string key;
727 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
728 dn_key.encode(key);
729 read_keys.insert(key);
730 }
731
7c673cae
FG
732 // Perform bulk read of existing dentries
733 std::map<std::string, bufferlist> read_vals;
734 r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
735 if (r == -ENOENT && other_pool) {
736 r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
737 }
738 if (r != 0) {
739 derr << "unexpected error reading fragment object "
740 << frag_oid.name << ": " << cpp_strerror(r) << dendl;
741 return r;
742 }
743
744 // Compose list of dentries we will write back
745 std::map<std::string, bufferlist> write_vals;
746 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
747 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
748 EMetaBlob::fullbit const &fb = *(*fbi);
749
750 // Get a key like "foobar_head"
751 std::string key;
752 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
753 dn_key.encode(key);
754
755 dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
756 << dendl;
757 bool write_dentry = false;
758 if (read_vals.find(key) == read_vals.end()) {
759 dout(4) << "dentry did not already exist, will create" << dendl;
760 write_dentry = true;
761 } else {
762 dout(4) << "dentry " << key << " existed already" << dendl;
763 dout(4) << "dentry exists, checking versions..." << dendl;
764 bufferlist &old_dentry = read_vals[key];
765 // Decode dentry+inode
766 bufferlist::iterator q = old_dentry.begin();
767
768 snapid_t dnfirst;
769 ::decode(dnfirst, q);
770 char dentry_type;
771 ::decode(dentry_type, q);
772
773 if (dentry_type == 'L') {
774 // leave write_dentry false, we have no version to
775 // compare with in a hardlink, so it's not safe to
776 // squash over it with what's in this fullbit
777 dout(10) << "Existing remote inode in slot to be (maybe) written "
778 << "by a full inode from the journal dn '" << fb.dn.c_str()
779 << "' with lump fnode version " << lump.fnode.version
780 << "vs existing fnode version " << old_fnode_version << dendl;
781 write_dentry = old_fnode_version < lump.fnode.version;
782 } else if (dentry_type == 'I') {
783 // Read out inode version to compare with backing store
784 InodeStore inode;
785 inode.decode_bare(q);
786 dout(4) << "decoded embedded inode version "
787 << inode.inode.version << " vs fullbit version "
788 << fb.inode.version << dendl;
789 if (inode.inode.version < fb.inode.version) {
790 write_dentry = true;
791 }
792 } else {
793 dout(4) << "corrupt dentry in backing store, overwriting from "
794 "journal" << dendl;
795 write_dentry = true;
796 }
797 }
798
799 if ((other_pool || write_dentry) && !dry_run) {
800 dout(4) << "writing I dentry " << key << " into frag "
801 << frag_oid.name << dendl;
802
803 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
804 bufferlist dentry_bl;
805 ::encode(fb.dnfirst, dentry_bl);
806 ::encode('I', dentry_bl);
807 encode_fullbit_as_inode(fb, true, &dentry_bl);
808
809 // Record for writing to RADOS
810 write_vals[key] = dentry_bl;
811 consumed_inos->insert(fb.inode.ino);
812 }
813 }
814
815 for (list<EMetaBlob::remotebit>::const_iterator rbi =
816 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
817 EMetaBlob::remotebit const &rb = *rbi;
818
819 // Get a key like "foobar_head"
820 std::string key;
821 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
822 dn_key.encode(key);
823
824 dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
825 << dendl;
826 bool write_dentry = false;
827 if (read_vals.find(key) == read_vals.end()) {
828 dout(4) << "dentry did not already exist, will create" << dendl;
829 write_dentry = true;
830 } else {
831 dout(4) << "dentry " << key << " existed already" << dendl;
832 dout(4) << "dentry exists, checking versions..." << dendl;
833 bufferlist &old_dentry = read_vals[key];
834 // Decode dentry+inode
835 bufferlist::iterator q = old_dentry.begin();
836
837 snapid_t dnfirst;
838 ::decode(dnfirst, q);
839 char dentry_type;
840 ::decode(dentry_type, q);
841
842 if (dentry_type == 'L') {
843 dout(10) << "Existing hardlink inode in slot to be (maybe) written "
844 << "by a remote inode from the journal dn '" << rb.dn.c_str()
845 << "' with lump fnode version " << lump.fnode.version
846 << "vs existing fnode version " << old_fnode_version << dendl;
847 write_dentry = old_fnode_version < lump.fnode.version;
848 } else if (dentry_type == 'I') {
849 dout(10) << "Existing full inode in slot to be (maybe) written "
850 << "by a remote inode from the journal dn '" << rb.dn.c_str()
851 << "' with lump fnode version " << lump.fnode.version
852 << "vs existing fnode version " << old_fnode_version << dendl;
853 write_dentry = old_fnode_version < lump.fnode.version;
854 } else {
855 dout(4) << "corrupt dentry in backing store, overwriting from "
856 "journal" << dendl;
857 write_dentry = true;
858 }
859 }
860
861 if ((other_pool || write_dentry) && !dry_run) {
862 dout(4) << "writing L dentry " << key << " into frag "
863 << frag_oid.name << dendl;
864
865 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
866 bufferlist dentry_bl;
867 ::encode(rb.dnfirst, dentry_bl);
868 ::encode('L', dentry_bl);
869 ::encode(rb.ino, dentry_bl);
870 ::encode(rb.d_type, dentry_bl);
871
872 // Record for writing to RADOS
873 write_vals[key] = dentry_bl;
874 consumed_inos->insert(rb.ino);
875 }
876 }
877
c07f9fc5
FG
878 std::set<std::string> null_vals;
879 for (auto& nb : nb_list) {
880 std::string key;
881 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
882 dn_key.encode(key);
883
884 dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
885 << dendl;
886
887 auto it = read_vals.find(key);
888 if (it != read_vals.end()) {
889 dout(4) << "dentry exists, will remove" << dendl;
890
891 bufferlist::iterator q = it->second.begin();
892 snapid_t dnfirst;
893 ::decode(dnfirst, q);
894 char dentry_type;
895 ::decode(dentry_type, q);
896
897 bool remove_dentry = false;
898 if (dentry_type == 'L') {
899 dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
900 << "by null journal dn '" << nb.dn.c_str()
901 << "' with lump fnode version " << lump.fnode.version
902 << "vs existing fnode version " << old_fnode_version << dendl;
903 remove_dentry = old_fnode_version < lump.fnode.version;
904 } else if (dentry_type == 'I') {
905 dout(10) << "Existing full inode in slot to be (maybe) removed "
906 << "by null journal dn '" << nb.dn.c_str()
907 << "' with lump fnode version " << lump.fnode.version
908 << "vs existing fnode version " << old_fnode_version << dendl;
909 remove_dentry = old_fnode_version < lump.fnode.version;
910 } else {
911 dout(4) << "corrupt dentry in backing store, will remove" << dendl;
912 remove_dentry = true;
913 }
914
915 if (remove_dentry)
916 null_vals.insert(key);
917 }
918 }
919
7c673cae
FG
920 // Write back any new/changed dentries
921 if (!write_vals.empty()) {
922 r = output.omap_set(frag_oid.name, write_vals);
923 if (r != 0) {
924 derr << "error writing dentries to " << frag_oid.name
925 << ": " << cpp_strerror(r) << dendl;
926 return r;
927 }
928 }
c07f9fc5
FG
929
930 // remove any null dentries
931 if (!null_vals.empty()) {
932 r = output.omap_rm_keys(frag_oid.name, null_vals);
933 if (r != 0) {
934 derr << "error removing dentries from " << frag_oid.name
935 << ": " << cpp_strerror(r) << dendl;
936 return r;
937 }
938 }
7c673cae
FG
939 }
940
941 /* Now that we've looked at the dirlumps, we finally pay attention to
942 * the roots (i.e. inodes without ancestry). This is necessary in order
943 * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
944 * important because clients use them to infer completeness
945 * of directories
946 */
947 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator p =
948 metablob.roots.begin(); p != metablob.roots.end(); ++p) {
949 EMetaBlob::fullbit const &fb = *(*p);
950 inodeno_t ino = fb.inode.ino;
951 dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
952
953 object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
954 dout(4) << "object id " << root_oid.name << dendl;
955
956 bool write_root_ino = false;
957 bufferlist old_root_ino_bl;
958 r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
959 if (r == -ENOENT) {
960 dout(4) << "root does not exist, will create" << dendl;
961 write_root_ino = true;
962 } else if (r >= 0) {
963 r = 0;
964 InodeStore old_inode;
965 dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
966 << ")" << dendl;
967 bufferlist::iterator inode_bl_iter = old_root_ino_bl.begin();
968 std::string magic;
969 ::decode(magic, inode_bl_iter);
970 if (magic == CEPH_FS_ONDISK_MAGIC) {
971 dout(4) << "magic ok" << dendl;
972 old_inode.decode(inode_bl_iter);
973
974 if (old_inode.inode.version < fb.inode.version) {
975 write_root_ino = true;
976 }
977 } else {
978 dout(4) << "magic bad: '" << magic << "'" << dendl;
979 write_root_ino = true;
980 }
981 } else {
982 derr << "error reading root inode object " << root_oid.name
983 << ": " << cpp_strerror(r) << dendl;
984 return r;
985 }
986
987 if (write_root_ino && !dry_run) {
988 dout(4) << "writing root ino " << root_oid.name
989 << " version " << fb.inode.version << dendl;
990
991 // Compose: root ino format is magic,InodeStore(bare=false)
992 bufferlist new_root_ino_bl;
993 ::encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
994 encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
995
996 // Write to RADOS
997 r = output.write_full(root_oid.name, new_root_ino_bl);
998 if (r != 0) {
999 derr << "error writing inode object " << root_oid.name
1000 << ": " << cpp_strerror(r) << dendl;
1001 return r;
1002 }
1003 }
1004 }
1005
1006 return r;
1007}
1008
1009
7c673cae
FG
1010/**
1011 * Erase a region of the log by overwriting it with ENoOp
1012 *
1013 */
1014int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
1015{
1016 // To erase this region, we use our preamble, the encoding overhead
1017 // of an ENoOp, and our trailing start ptr. Calculate how much padding
1018 // is needed inside the ENoOp to make up the difference.
1019 bufferlist tmp;
1020 ENoOp enoop(0);
1021 enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
1022
1023 dout(4) << "erase_region " << pos << " len=" << length << dendl;
1024
1025 // FIXME: get the preamble/postamble length via JournalStream
1026 int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
1027 dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
1028
1029 if (padding < 0) {
1030 derr << "Erase region " << length << " too short" << dendl;
1031 return -EINVAL;
1032 }
1033
1034 // Serialize an ENoOp with the correct amount of padding
1035 enoop = ENoOp(padding);
1036 bufferlist entry;
1037 enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
1038 JournalStream stream(JOURNAL_FORMAT_RESILIENT);
1039
1040 // Serialize region of log stream
1041 bufferlist log_data;
1042 stream.write(entry, &log_data, pos);
1043
1044 dout(4) << "erase_region data length " << log_data.length() << dendl;
1045 assert(log_data.length() == length);
1046
1047 // Write log stream region to RADOS
1048 // FIXME: get object size somewhere common to scan_events
1049 uint32_t object_size = g_conf->mds_log_segment_size;
1050 if (object_size == 0) {
1051 // Default layout object size
1052 object_size = file_layout_t::get_default().object_size;
1053 }
1054
1055 uint64_t write_offset = pos;
1056 uint64_t obj_offset = (pos / object_size);
1057 int r = 0;
1058 while(log_data.length()) {
1059 std::string const oid = js.obj_name(obj_offset);
1060 uint32_t offset_in_obj = write_offset % object_size;
1061 uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
1062
1063 r = output.write(oid, log_data, write_len, offset_in_obj);
1064 if (r < 0) {
1065 return r;
1066 } else {
1067 dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
1068 r = 0;
1069 }
1070
1071 log_data.splice(0, write_len);
1072 write_offset += write_len;
1073 obj_offset++;
1074 }
1075
1076 return r;
1077}
1078
1079/**
1080 * Given an EMetaBlob::fullbit containing an inode, write out
1081 * the encoded inode in the format used by InodeStore (i.e. the
1082 * backing store format)
1083 *
1084 * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
1085 * on an offline InodeStore instance. It's way simpler, because we are just
1086 * uncritically hauling the data between structs.
1087 *
1088 * @param fb a fullbit extracted from a journal entry
1089 * @param bare if true, leave out [EN|DE]CODE_START decoration
1090 * @param out_bl output, write serialized inode to this bufferlist
1091 */
1092void JournalTool::encode_fullbit_as_inode(
1093 const EMetaBlob::fullbit &fb,
1094 const bool bare,
1095 bufferlist *out_bl)
1096{
1097 assert(out_bl != NULL);
1098
1099 // Compose InodeStore
1100 InodeStore new_inode;
1101 new_inode.inode = fb.inode;
1102 new_inode.xattrs = fb.xattrs;
1103 new_inode.dirfragtree = fb.dirfragtree;
1104 new_inode.snap_blob = fb.snapbl;
1105 new_inode.symlink = fb.symlink;
1106 new_inode.old_inodes = fb.old_inodes;
1107
1108 // Serialize InodeStore
1109 if (bare) {
1110 new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1111 } else {
1112 new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1113 }
1114}
1115
1116/**
1117 * Given a list of inode numbers known to be in use by
1118 * inodes in the backing store, ensure that none of these
1119 * numbers are listed as free in the InoTables in the
1120 * backing store.
1121 *
1122 * Used after injecting inodes into the backing store, to
1123 * ensure that the same inode numbers are not subsequently
1124 * used for new files during ordinary operation.
1125 *
1126 * @param inos list of inode numbers to be removed from
1127 * free lists in InoTables
1128 * @returns 0 on success, else negative error code
1129 */
1130int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
1131{
1132 int r = 0;
1133
1134 // InoTable is a per-MDS structure, so iterate over assigned ranks
1135 auto fs = fsmap->get_filesystem(role_selector.get_ns());
1136 std::set<mds_rank_t> in_ranks;
1137 fs->mds_map.get_mds_set(in_ranks);
1138
1139 for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
1140 rank_i != in_ranks.end(); ++rank_i)
1141 {
1142 // Compose object name
1143 std::ostringstream oss;
1144 oss << "mds" << *rank_i << "_inotable";
1145 object_t inotable_oid = object_t(oss.str());
1146
1147 // Read object
1148 bufferlist inotable_bl;
1149 int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
1150 if (read_r < 0) {
1151 // Things are really bad if we can't read inotable. Beyond our powers.
1152 derr << "unable to read inotable '" << inotable_oid.name << "': "
1153 << cpp_strerror(read_r) << dendl;
1154 r = r ? r : read_r;
1155 continue;
1156 }
1157
1158 // Deserialize InoTable
1159 version_t inotable_ver;
1160 bufferlist::iterator q = inotable_bl.begin();
1161 ::decode(inotable_ver, q);
1162 InoTable ino_table(NULL);
1163 ino_table.decode(q);
1164
1165 // Update InoTable in memory
1166 bool inotable_modified = false;
1167 for (std::set<inodeno_t>::iterator i = inos.begin();
1168 i != inos.end(); ++i)
1169 {
1170 const inodeno_t ino = *i;
1171 if (ino_table.force_consume(ino)) {
1172 dout(4) << "Used ino 0x" << std::hex << ino << std::dec
1173 << " requires inotable update" << dendl;
1174 inotable_modified = true;
1175 }
1176 }
1177
1178 // Serialize and write InoTable
1179 if (inotable_modified) {
1180 inotable_ver += 1;
1181 dout(4) << "writing modified inotable version " << inotable_ver << dendl;
1182 bufferlist inotable_new_bl;
1183 ::encode(inotable_ver, inotable_new_bl);
1184 ino_table.encode_state(inotable_new_bl);
1185 int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
1186 if (write_r != 0) {
1187 derr << "error writing modified inotable " << inotable_oid.name
1188 << ": " << cpp_strerror(write_r) << dendl;
1189 r = r ? r : read_r;
1190 continue;
1191 }
1192 }
1193 }
1194
1195 return r;
1196}
1197