]> git.proxmox.com Git - ceph.git/blob - ceph/src/tools/cephfs/JournalTool.cc
update sources to v12.2.3
[ceph.git] / ceph / src / tools / cephfs / JournalTool.cc
1 // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 John Spray <john.spray@inktank.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
13
14
15 #include <sstream>
16
17 #include "common/ceph_argparse.h"
18 #include "common/errno.h"
19 #include "osdc/Journaler.h"
20 #include "mds/mdstypes.h"
21 #include "mds/LogEvent.h"
22 #include "mds/InoTable.h"
23
24 #include "mds/events/ENoOp.h"
25 #include "mds/events/EUpdate.h"
26
27 #include "JournalScanner.h"
28 #include "EventOutput.h"
29 #include "Dumper.h"
30 #include "Resetter.h"
31
32 #include "JournalTool.h"
33
34
35 #define dout_context g_ceph_context
36 #define dout_subsys ceph_subsys_mds
37 #undef dout_prefix
38 #define dout_prefix *_dout << __func__ << ": "
39
40
41
42 void JournalTool::usage()
43 {
44 std::cout << "Usage: \n"
45 << " cephfs-journal-tool [options] journal <command>\n"
46 << " <command>:\n"
47 << " inspect\n"
48 << " import <path>\n"
49 << " export <path>\n"
50 << " reset [--force]\n"
51 << " cephfs-journal-tool [options] header <get|set <field> <value>\n"
52 << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]"
53 << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
54 << " <selector>:\n"
55 << " --range=<start>..<end>\n"
56 << " --path=<substring>\n"
57 << " --inode=<integer>\n"
58 << " --type=<UPDATE|OPEN|SESSION...><\n"
59 << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
60 << " --client=<session id integer>\n"
61 << " <effect>: [get|recover_dentries|splice]\n"
62 << " <output>: [summary|list|binary|json] [--path <path>]\n"
63 << "\n"
64 << "General options:\n"
65 << " --rank=filesystem:mds-rank Journal rank (required if multiple\n"
66 << " file systems, default is rank 0 on\n"
67 << " the only filesystem otherwise.\n"
68 << "\n"
69 << "Special options\n"
70 << " --alternate-pool <name> Alternative metadata pool to target\n"
71 << " when using recover_dentries.\n";
72
73 generic_client_usage();
74 }
75
76
77 /**
78 * Handle arguments and hand off to journal/header/event mode
79 */
80 int JournalTool::main(std::vector<const char*> &argv)
81 {
82 int r;
83
84 dout(10) << "JournalTool::main " << dendl;
85 // Common arg parsing
86 // ==================
87 if (argv.empty()) {
88 usage();
89 return -EINVAL;
90 }
91
92 std::vector<const char*>::iterator arg = argv.begin();
93
94 std::string rank_str;
95 if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
96 // Default: act on rank 0. Will give the user an error if they
97 // try invoking this way when they have more than one filesystem.
98 rank_str = "0";
99 }
100
101 r = role_selector.parse(*fsmap, rank_str);
102 if (r != 0) {
103 derr << "Couldn't determine MDS rank." << dendl;
104 return r;
105 }
106
107 std::string mode;
108 if (arg == argv.end()) {
109 derr << "Missing mode [journal|header|event]" << dendl;
110 return -EINVAL;
111 }
112 mode = std::string(*arg);
113 arg = argv.erase(arg);
114
115 // RADOS init
116 // ==========
117 r = rados.init_with_context(g_ceph_context);
118 if (r < 0) {
119 derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
120 return r;
121 }
122
123 dout(4) << "JournalTool: connecting to RADOS..." << dendl;
124 r = rados.connect();
125 if (r < 0) {
126 derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
127 return r;
128 }
129
130 auto fs = fsmap->get_filesystem(role_selector.get_ns());
131 assert(fs != nullptr);
132 int64_t const pool_id = fs->mds_map.get_metadata_pool();
133 dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
134 std::string pool_name;
135 r = rados.pool_reverse_lookup(pool_id, &pool_name);
136 if (r < 0) {
137 derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
138 return r;
139 }
140
141 dout(4) << "JournalTool: creating IoCtx.." << dendl;
142 r = rados.ioctx_create(pool_name.c_str(), input);
143 assert(r == 0);
144 output.dup(input);
145
146 // Execution
147 // =========
148 for (auto role : role_selector.get_roles()) {
149 rank = role.rank;
150 dout(4) << "Executing for rank " << rank << dendl;
151 if (mode == std::string("journal")) {
152 r = main_journal(argv);
153 } else if (mode == std::string("header")) {
154 r = main_header(argv);
155 } else if (mode == std::string("event")) {
156 r = main_event(argv);
157 } else {
158 derr << "Bad command '" << mode << "'" << dendl;
159 usage();
160 return -EINVAL;
161 }
162
163 if (r != 0) {
164 return r;
165 }
166 }
167
168 return r;
169 }
170
171
172 /**
173 * Handle arguments for 'journal' mode
174 *
175 * This is for operations that act on the journal as a whole.
176 */
177 int JournalTool::main_journal(std::vector<const char*> &argv)
178 {
179 std::string command = argv[0];
180 if (command == "inspect") {
181 return journal_inspect();
182 } else if (command == "export" || command == "import") {
183 if (argv.size() >= 2) {
184 std::string const path = argv[1];
185 return journal_export(path, command == "import");
186 } else {
187 derr << "Missing path" << dendl;
188 return -EINVAL;
189 }
190 } else if (command == "reset") {
191 bool force = false;
192 if (argv.size() == 2) {
193 if (std::string(argv[1]) == "--force") {
194 force = true;
195 } else {
196 std::cerr << "Unknown argument " << argv[1] << std::endl;
197 usage();
198 return -EINVAL;
199 }
200 } else if (argv.size() > 2) {
201 std::cerr << "Too many arguments!" << std::endl;
202 usage();
203 return -EINVAL;
204 }
205 return journal_reset(force);
206 } else {
207 derr << "Bad journal command '" << command << "'" << dendl;
208 return -EINVAL;
209 }
210 }
211
212
213 /**
214 * Parse arguments and execute for 'header' mode
215 *
216 * This is for operations that act on the header only.
217 */
218 int JournalTool::main_header(std::vector<const char*> &argv)
219 {
220 JournalFilter filter;
221 JournalScanner js(input, rank, filter);
222 int r = js.scan(false);
223 if (r < 0) {
224 std::cerr << "Unable to scan journal" << std::endl;
225 return r;
226 }
227
228 if (!js.header_present) {
229 std::cerr << "Header object not found!" << std::endl;
230 return -ENOENT;
231 } else if (!js.header_valid && js.header == NULL) {
232 // Can't do a read or a single-field write without a copy of the original
233 derr << "Header could not be read!" << dendl;
234 return -ENOENT;
235 } else {
236 assert(js.header != NULL);
237 }
238
239 if (argv.size() == 0) {
240 derr << "Invalid header command, must be [get|set]" << dendl;
241 return -EINVAL;
242 }
243 std::vector<const char *>::iterator arg = argv.begin();
244 std::string const command = *arg;
245 arg = argv.erase(arg);
246
247 if (command == std::string("get")) {
248 // Write JSON journal dump to stdout
249 JSONFormatter jf(true);
250 js.header->dump(&jf);
251 jf.flush(std::cout);
252 std::cout << std::endl;
253 } else if (command == std::string("set")) {
254 // Need two more args <key> <val>
255 if (argv.size() != 2) {
256 derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
257 return -EINVAL;
258 }
259
260 std::string const field_name = *arg;
261 arg = argv.erase(arg);
262
263 std::string const value_str = *arg;
264 arg = argv.erase(arg);
265 assert(argv.empty());
266
267 std::string parse_err;
268 uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
269 if (!parse_err.empty()) {
270 derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
271 return -EINVAL;
272 }
273
274 uint64_t *field = NULL;
275 if (field_name == "trimmed_pos") {
276 field = &(js.header->trimmed_pos);
277 } else if (field_name == "expire_pos") {
278 field = &(js.header->expire_pos);
279 } else if (field_name == "write_pos") {
280 field = &(js.header->write_pos);
281 } else if (field_name == "pool_id") {
282 field = (uint64_t*)(&(js.header->layout.pool_id));
283 } else {
284 derr << "Invalid field '" << field_name << "'" << dendl;
285 return -EINVAL;
286 }
287
288 std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
289 *field = new_val;
290
291 dout(4) << "Writing object..." << dendl;
292 bufferlist header_bl;
293 ::encode(*(js.header), header_bl);
294 output.write_full(js.obj_name(0), header_bl);
295 dout(4) << "Write complete." << dendl;
296 std::cout << "Successfully updated header." << std::endl;
297 } else {
298 derr << "Bad header command '" << command << "'" << dendl;
299 return -EINVAL;
300 }
301
302 return 0;
303 }
304
305
306 /**
307 * Parse arguments and execute for 'event' mode
308 *
309 * This is for operations that act on LogEvents within the log
310 */
311 int JournalTool::main_event(std::vector<const char*> &argv)
312 {
313 int r;
314
315 std::vector<const char*>::iterator arg = argv.begin();
316
317 std::string command = *(arg++);
318 if (command != "get" && command != "splice" && command != "recover_dentries") {
319 derr << "Unknown argument '" << command << "'" << dendl;
320 usage();
321 return -EINVAL;
322 }
323
324 if (arg == argv.end()) {
325 derr << "Incomplete command line" << dendl;
326 usage();
327 return -EINVAL;
328 }
329
330 // Parse filter options
331 // ====================
332 JournalFilter filter;
333 r = filter.parse_args(argv, arg);
334 if (r) {
335 return r;
336 }
337
338 // Parse output options
339 // ====================
340 if (arg == argv.end()) {
341 derr << "Missing output command" << dendl;
342 usage();
343 }
344 std::string output_style = *(arg++);
345 if (output_style != "binary" && output_style != "json" &&
346 output_style != "summary" && output_style != "list") {
347 derr << "Unknown argument: '" << output_style << "'" << dendl;
348 usage();
349 return -EINVAL;
350 }
351
352 std::string output_path = "dump";
353 while(arg != argv.end()) {
354 std::string arg_str;
355 if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
356 output_path = arg_str;
357 } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
358 nullptr)) {
359 dout(1) << "Using alternate pool " << arg_str << dendl;
360 int r = rados.ioctx_create(arg_str.c_str(), output);
361 assert(r == 0);
362 other_pool = true;
363 } else {
364 derr << "Unknown argument: '" << *arg << "'" << dendl;
365 usage();
366 return -EINVAL;
367 }
368 }
369
370 // Execute command
371 // ===============
372 JournalScanner js(input, rank, filter);
373 if (command == "get") {
374 r = js.scan();
375 if (r) {
376 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
377 return r;
378 }
379 } else if (command == "recover_dentries") {
380 r = js.scan();
381 if (r) {
382 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
383 return r;
384 }
385
386 bool dry_run = false;
387 if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
388 dry_run = true;
389 }
390
391 /**
392 * Iterate over log entries, attempting to scavenge from each one
393 */
394 std::set<inodeno_t> consumed_inos;
395 for (JournalScanner::EventMap::iterator i = js.events.begin();
396 i != js.events.end(); ++i) {
397 LogEvent *le = i->second.log_event;
398 EMetaBlob const *mb = le->get_metablob();
399 if (mb) {
400 int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
401 if (scav_r) {
402 dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
403 << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
404 if (r == 0) {
405 r = scav_r;
406 }
407 // Our goal is to read all we can, so don't stop on errors, but
408 // do record them for possible later output
409 js.errors.insert(std::make_pair(i->first,
410 JournalScanner::EventError(scav_r, cpp_strerror(r))));
411 }
412 }
413 }
414
415 /**
416 * Update InoTable to reflect any inode numbers consumed during scavenge
417 */
418 dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
419 if (consumed_inos.size() && !dry_run) {
420 int consume_r = consume_inos(consumed_inos);
421 if (consume_r) {
422 dout(1) << "Error updating InoTable for " << consumed_inos.size()
423 << " consume inos: " << cpp_strerror(consume_r) << dendl;
424 if (r == 0) {
425 r = consume_r;
426 }
427 }
428 }
429
430 // Remove consumed dentries from lost+found.
431 if (other_pool && !dry_run) {
432 std::set<std::string> found;
433
434 for (auto i : consumed_inos) {
435 char s[20];
436
437 snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
438 dout(20) << "removing " << s << dendl;
439 found.insert(std::string(s));
440 }
441
442 object_t frag_oid;
443 frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
444 frag_t(), "");
445 output.omap_rm_keys(frag_oid.name, found);
446 }
447 } else if (command == "splice") {
448 r = js.scan();
449 if (r) {
450 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
451 return r;
452 }
453
454 uint64_t start, end;
455 if (filter.get_range(start, end)) {
456 // Special case for range filter: erase a numeric range in the log
457 uint64_t range = end - start;
458 int r = erase_region(js, start, range);
459 if (r) {
460 derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
461 << ": " << cpp_strerror(r) << dendl;
462 return r;
463 }
464 } else {
465 // General case: erase a collection of individual entries in the log
466 for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
467 dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
468
469 int r = erase_region(js, i->first, i->second.raw_size);
470 if (r) {
471 derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
472 << ": " << cpp_strerror(r) << dendl;
473 return r;
474 }
475 }
476 }
477
478
479 } else {
480 derr << "Unknown argument '" << command << "'" << dendl;
481 usage();
482 return -EINVAL;
483 }
484
485 // Generate output
486 // ===============
487 EventOutput output(js, output_path);
488 int output_result = 0;
489 if (output_style == "binary") {
490 output_result = output.binary();
491 } else if (output_style == "json") {
492 output_result = output.json();
493 } else if (output_style == "summary") {
494 output.summary();
495 } else if (output_style == "list") {
496 output.list();
497 } else {
498 std::cerr << "Bad output command '" << output_style << "'" << std::endl;
499 return -EINVAL;
500 }
501
502 if (output_result != 0) {
503 std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
504 }
505
506 return output_result;
507 }
508
509 /**
510 * Provide the user with information about the condition of the journal,
511 * especially indicating what range of log events is available and where
512 * any gaps or corruptions in the journal are.
513 */
514 int JournalTool::journal_inspect()
515 {
516 int r;
517
518 JournalFilter filter;
519 JournalScanner js(input, rank, filter);
520 r = js.scan();
521 if (r) {
522 std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
523 return r;
524 }
525
526 js.report(std::cout);
527
528 return 0;
529 }
530
531
532 /**
533 * Attempt to export a binary dump of the journal.
534 *
535 * This is allowed to fail if the header is malformed or there are
536 * objects inaccessible, in which case the user would have to fall
537 * back to manually listing RADOS objects and extracting them, which
538 * they can do with the ``rados`` CLI.
539 */
540 int JournalTool::journal_export(std::string const &path, bool import)
541 {
542 int r = 0;
543 JournalScanner js(input, rank);
544
545 if (!import) {
546 /*
547 * If doing an export, first check that the header is valid and
548 * no objects are missing before trying to dump
549 */
550 r = js.scan();
551 if (r < 0) {
552 derr << "Unable to scan journal, assuming badly damaged" << dendl;
553 return r;
554 }
555 if (!js.is_readable()) {
556 derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
557 return -EIO;
558 }
559 }
560
561 /*
562 * Assuming we can cleanly read the journal data, dump it out to a file
563 */
564 {
565 Dumper dumper;
566 r = dumper.init(mds_role_t(role_selector.get_ns(), rank));
567 if (r < 0) {
568 derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
569 return r;
570 }
571 if (import) {
572 r = dumper.undump(path.c_str());
573 } else {
574 r = dumper.dump(path.c_str());
575 }
576 dumper.shutdown();
577 }
578
579 return r;
580 }
581
582
583 /**
584 * Truncate journal and insert EResetJournal
585 */
586 int JournalTool::journal_reset(bool hard)
587 {
588 int r = 0;
589 Resetter resetter;
590 r = resetter.init();
591 if (r < 0) {
592 derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
593 return r;
594 }
595
596 if (hard) {
597 r = resetter.reset_hard(mds_role_t(role_selector.get_ns(), rank));
598 } else {
599 r = resetter.reset(mds_role_t(role_selector.get_ns(), rank));
600 }
601 resetter.shutdown();
602
603 return r;
604 }
605
606
607 /**
608 * Selective offline replay which only reads out dentries and writes
609 * them to the backing store iff their version is > what is currently
610 * in the backing store.
611 *
612 * In order to write dentries to the backing store, we may create the
613 * required enclosing dirfrag objects.
614 *
615 * Test this by running scavenge on an unflushed journal, then nuking
616 * it offline, then starting an MDS and seeing that the dentries are
617 * visible.
618 *
619 * @param metablob an EMetaBlob retrieved from the journal
620 * @param dry_run if true, do no writes to RADOS
621 * @param consumed_inos output, populated with any inos inserted
622 * @returns 0 on success, else negative error code
623 */
624 int JournalTool::recover_dentries(
625 EMetaBlob const &metablob,
626 bool const dry_run,
627 std::set<inodeno_t> *consumed_inos)
628 {
629 assert(consumed_inos != NULL);
630
631 int r = 0;
632
633 // Replay fullbits (dentry+inode)
634 for (list<dirfrag_t>::const_iterator lp = metablob.lump_order.begin();
635 lp != metablob.lump_order.end(); ++lp)
636 {
637 dirfrag_t const &frag = *lp;
638 EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
639 lump._decode_bits();
640 object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
641
642 dout(4) << "inspecting lump " << frag_oid.name << dendl;
643
644
645 // We will record old fnode version for use in hard link handling
646 // If we don't read an old fnode, take version as zero and write in
647 // all hardlinks we find.
648 version_t old_fnode_version = 0;
649
650 // Update fnode in omap header of dirfrag object
651 bool write_fnode = false;
652 bufferlist old_fnode_bl;
653 r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
654 if (r == -ENOENT) {
655 // Creating dirfrag from scratch
656 dout(4) << "failed to read OMAP header from directory fragment "
657 << frag_oid.name << " " << cpp_strerror(r) << dendl;
658 write_fnode = true;
659 // Note: creating the dirfrag *without* a backtrace, relying on
660 // MDS to regenerate backtraces on read or in FSCK
661 } else if (r == 0) {
662 // Conditionally update existing omap header
663 fnode_t old_fnode;
664 bufferlist::iterator old_fnode_iter = old_fnode_bl.begin();
665 try {
666 old_fnode.decode(old_fnode_iter);
667 dout(4) << "frag " << frag_oid.name << " fnode old v" <<
668 old_fnode.version << " vs new v" << lump.fnode.version << dendl;
669 old_fnode_version = old_fnode.version;
670 write_fnode = old_fnode_version < lump.fnode.version;
671 } catch (const buffer::error &err) {
672 dout(1) << "frag " << frag_oid.name
673 << " is corrupt, overwriting" << dendl;
674 write_fnode = true;
675 }
676 } else {
677 // Unexpected error
678 dout(4) << "failed to read OMAP header from directory fragment "
679 << frag_oid.name << " " << cpp_strerror(r) << dendl;
680 return r;
681 }
682
683 if ((other_pool || write_fnode) && !dry_run) {
684 dout(4) << "writing fnode to omap header" << dendl;
685 bufferlist fnode_bl;
686 lump.fnode.encode(fnode_bl);
687 if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
688 r = output.omap_set_header(frag_oid.name, fnode_bl);
689 }
690 if (r != 0) {
691 derr << "Failed to write fnode for frag object "
692 << frag_oid.name << dendl;
693 return r;
694 }
695 }
696
697 std::set<std::string> read_keys;
698
699 // Compose list of potentially-existing dentries we would like to fetch
700 list<ceph::shared_ptr<EMetaBlob::fullbit> > const &fb_list =
701 lump.get_dfull();
702 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
703 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
704 EMetaBlob::fullbit const &fb = *(*fbi);
705
706 // Get a key like "foobar_head"
707 std::string key;
708 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
709 dn_key.encode(key);
710 read_keys.insert(key);
711 }
712
713 list<EMetaBlob::remotebit> const &rb_list =
714 lump.get_dremote();
715 for (list<EMetaBlob::remotebit>::const_iterator rbi =
716 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
717 EMetaBlob::remotebit const &rb = *rbi;
718
719 // Get a key like "foobar_head"
720 std::string key;
721 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
722 dn_key.encode(key);
723 read_keys.insert(key);
724 }
725
726 list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull();
727 for (auto& nb : nb_list) {
728 // Get a key like "foobar_head"
729 std::string key;
730 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
731 dn_key.encode(key);
732 read_keys.insert(key);
733 }
734
735 // Perform bulk read of existing dentries
736 std::map<std::string, bufferlist> read_vals;
737 r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
738 if (r == -ENOENT && other_pool) {
739 r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
740 }
741 if (r != 0) {
742 derr << "unexpected error reading fragment object "
743 << frag_oid.name << ": " << cpp_strerror(r) << dendl;
744 return r;
745 }
746
747 // Compose list of dentries we will write back
748 std::map<std::string, bufferlist> write_vals;
749 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
750 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
751 EMetaBlob::fullbit const &fb = *(*fbi);
752
753 // Get a key like "foobar_head"
754 std::string key;
755 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
756 dn_key.encode(key);
757
758 dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
759 << dendl;
760 bool write_dentry = false;
761 if (read_vals.find(key) == read_vals.end()) {
762 dout(4) << "dentry did not already exist, will create" << dendl;
763 write_dentry = true;
764 } else {
765 dout(4) << "dentry " << key << " existed already" << dendl;
766 dout(4) << "dentry exists, checking versions..." << dendl;
767 bufferlist &old_dentry = read_vals[key];
768 // Decode dentry+inode
769 bufferlist::iterator q = old_dentry.begin();
770
771 snapid_t dnfirst;
772 ::decode(dnfirst, q);
773 char dentry_type;
774 ::decode(dentry_type, q);
775
776 if (dentry_type == 'L') {
777 // leave write_dentry false, we have no version to
778 // compare with in a hardlink, so it's not safe to
779 // squash over it with what's in this fullbit
780 dout(10) << "Existing remote inode in slot to be (maybe) written "
781 << "by a full inode from the journal dn '" << fb.dn.c_str()
782 << "' with lump fnode version " << lump.fnode.version
783 << "vs existing fnode version " << old_fnode_version << dendl;
784 write_dentry = old_fnode_version < lump.fnode.version;
785 } else if (dentry_type == 'I') {
786 // Read out inode version to compare with backing store
787 InodeStore inode;
788 inode.decode_bare(q);
789 dout(4) << "decoded embedded inode version "
790 << inode.inode.version << " vs fullbit version "
791 << fb.inode.version << dendl;
792 if (inode.inode.version < fb.inode.version) {
793 write_dentry = true;
794 }
795 } else {
796 dout(4) << "corrupt dentry in backing store, overwriting from "
797 "journal" << dendl;
798 write_dentry = true;
799 }
800 }
801
802 if ((other_pool || write_dentry) && !dry_run) {
803 dout(4) << "writing I dentry " << key << " into frag "
804 << frag_oid.name << dendl;
805
806 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
807 bufferlist dentry_bl;
808 ::encode(fb.dnfirst, dentry_bl);
809 ::encode('I', dentry_bl);
810 encode_fullbit_as_inode(fb, true, &dentry_bl);
811
812 // Record for writing to RADOS
813 write_vals[key] = dentry_bl;
814 consumed_inos->insert(fb.inode.ino);
815 }
816 }
817
818 for (list<EMetaBlob::remotebit>::const_iterator rbi =
819 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
820 EMetaBlob::remotebit const &rb = *rbi;
821
822 // Get a key like "foobar_head"
823 std::string key;
824 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
825 dn_key.encode(key);
826
827 dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
828 << dendl;
829 bool write_dentry = false;
830 if (read_vals.find(key) == read_vals.end()) {
831 dout(4) << "dentry did not already exist, will create" << dendl;
832 write_dentry = true;
833 } else {
834 dout(4) << "dentry " << key << " existed already" << dendl;
835 dout(4) << "dentry exists, checking versions..." << dendl;
836 bufferlist &old_dentry = read_vals[key];
837 // Decode dentry+inode
838 bufferlist::iterator q = old_dentry.begin();
839
840 snapid_t dnfirst;
841 ::decode(dnfirst, q);
842 char dentry_type;
843 ::decode(dentry_type, q);
844
845 if (dentry_type == 'L') {
846 dout(10) << "Existing hardlink inode in slot to be (maybe) written "
847 << "by a remote inode from the journal dn '" << rb.dn.c_str()
848 << "' with lump fnode version " << lump.fnode.version
849 << "vs existing fnode version " << old_fnode_version << dendl;
850 write_dentry = old_fnode_version < lump.fnode.version;
851 } else if (dentry_type == 'I') {
852 dout(10) << "Existing full inode in slot to be (maybe) written "
853 << "by a remote inode from the journal dn '" << rb.dn.c_str()
854 << "' with lump fnode version " << lump.fnode.version
855 << "vs existing fnode version " << old_fnode_version << dendl;
856 write_dentry = old_fnode_version < lump.fnode.version;
857 } else {
858 dout(4) << "corrupt dentry in backing store, overwriting from "
859 "journal" << dendl;
860 write_dentry = true;
861 }
862 }
863
864 if ((other_pool || write_dentry) && !dry_run) {
865 dout(4) << "writing L dentry " << key << " into frag "
866 << frag_oid.name << dendl;
867
868 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
869 bufferlist dentry_bl;
870 ::encode(rb.dnfirst, dentry_bl);
871 ::encode('L', dentry_bl);
872 ::encode(rb.ino, dentry_bl);
873 ::encode(rb.d_type, dentry_bl);
874
875 // Record for writing to RADOS
876 write_vals[key] = dentry_bl;
877 consumed_inos->insert(rb.ino);
878 }
879 }
880
881 std::set<std::string> null_vals;
882 for (auto& nb : nb_list) {
883 std::string key;
884 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
885 dn_key.encode(key);
886
887 dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
888 << dendl;
889
890 auto it = read_vals.find(key);
891 if (it != read_vals.end()) {
892 dout(4) << "dentry exists, will remove" << dendl;
893
894 bufferlist::iterator q = it->second.begin();
895 snapid_t dnfirst;
896 ::decode(dnfirst, q);
897 char dentry_type;
898 ::decode(dentry_type, q);
899
900 bool remove_dentry = false;
901 if (dentry_type == 'L') {
902 dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
903 << "by null journal dn '" << nb.dn.c_str()
904 << "' with lump fnode version " << lump.fnode.version
905 << "vs existing fnode version " << old_fnode_version << dendl;
906 remove_dentry = old_fnode_version < lump.fnode.version;
907 } else if (dentry_type == 'I') {
908 dout(10) << "Existing full inode in slot to be (maybe) removed "
909 << "by null journal dn '" << nb.dn.c_str()
910 << "' with lump fnode version " << lump.fnode.version
911 << "vs existing fnode version " << old_fnode_version << dendl;
912 remove_dentry = old_fnode_version < lump.fnode.version;
913 } else {
914 dout(4) << "corrupt dentry in backing store, will remove" << dendl;
915 remove_dentry = true;
916 }
917
918 if (remove_dentry)
919 null_vals.insert(key);
920 }
921 }
922
923 // Write back any new/changed dentries
924 if (!write_vals.empty()) {
925 r = output.omap_set(frag_oid.name, write_vals);
926 if (r != 0) {
927 derr << "error writing dentries to " << frag_oid.name
928 << ": " << cpp_strerror(r) << dendl;
929 return r;
930 }
931 }
932
933 // remove any null dentries
934 if (!null_vals.empty()) {
935 r = output.omap_rm_keys(frag_oid.name, null_vals);
936 if (r != 0) {
937 derr << "error removing dentries from " << frag_oid.name
938 << ": " << cpp_strerror(r) << dendl;
939 return r;
940 }
941 }
942 }
943
944 /* Now that we've looked at the dirlumps, we finally pay attention to
945 * the roots (i.e. inodes without ancestry). This is necessary in order
946 * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
947 * important because clients use them to infer completeness
948 * of directories
949 */
950 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator p =
951 metablob.roots.begin(); p != metablob.roots.end(); ++p) {
952 EMetaBlob::fullbit const &fb = *(*p);
953 inodeno_t ino = fb.inode.ino;
954 dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
955
956 object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
957 dout(4) << "object id " << root_oid.name << dendl;
958
959 bool write_root_ino = false;
960 bufferlist old_root_ino_bl;
961 r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
962 if (r == -ENOENT) {
963 dout(4) << "root does not exist, will create" << dendl;
964 write_root_ino = true;
965 } else if (r >= 0) {
966 r = 0;
967 InodeStore old_inode;
968 dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
969 << ")" << dendl;
970 bufferlist::iterator inode_bl_iter = old_root_ino_bl.begin();
971 std::string magic;
972 ::decode(magic, inode_bl_iter);
973 if (magic == CEPH_FS_ONDISK_MAGIC) {
974 dout(4) << "magic ok" << dendl;
975 old_inode.decode(inode_bl_iter);
976
977 if (old_inode.inode.version < fb.inode.version) {
978 write_root_ino = true;
979 }
980 } else {
981 dout(4) << "magic bad: '" << magic << "'" << dendl;
982 write_root_ino = true;
983 }
984 } else {
985 derr << "error reading root inode object " << root_oid.name
986 << ": " << cpp_strerror(r) << dendl;
987 return r;
988 }
989
990 if (write_root_ino && !dry_run) {
991 dout(4) << "writing root ino " << root_oid.name
992 << " version " << fb.inode.version << dendl;
993
994 // Compose: root ino format is magic,InodeStore(bare=false)
995 bufferlist new_root_ino_bl;
996 ::encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
997 encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
998
999 // Write to RADOS
1000 r = output.write_full(root_oid.name, new_root_ino_bl);
1001 if (r != 0) {
1002 derr << "error writing inode object " << root_oid.name
1003 << ": " << cpp_strerror(r) << dendl;
1004 return r;
1005 }
1006 }
1007 }
1008
1009 return r;
1010 }
1011
1012
1013 /**
1014 * Erase a region of the log by overwriting it with ENoOp
1015 *
1016 */
1017 int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
1018 {
1019 // To erase this region, we use our preamble, the encoding overhead
1020 // of an ENoOp, and our trailing start ptr. Calculate how much padding
1021 // is needed inside the ENoOp to make up the difference.
1022 bufferlist tmp;
1023 ENoOp enoop(0);
1024 enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
1025
1026 dout(4) << "erase_region " << pos << " len=" << length << dendl;
1027
1028 // FIXME: get the preamble/postamble length via JournalStream
1029 int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
1030 dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
1031
1032 if (padding < 0) {
1033 derr << "Erase region " << length << " too short" << dendl;
1034 return -EINVAL;
1035 }
1036
1037 // Serialize an ENoOp with the correct amount of padding
1038 enoop = ENoOp(padding);
1039 bufferlist entry;
1040 enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
1041 JournalStream stream(JOURNAL_FORMAT_RESILIENT);
1042
1043 // Serialize region of log stream
1044 bufferlist log_data;
1045 stream.write(entry, &log_data, pos);
1046
1047 dout(4) << "erase_region data length " << log_data.length() << dendl;
1048 assert(log_data.length() == length);
1049
1050 // Write log stream region to RADOS
1051 // FIXME: get object size somewhere common to scan_events
1052 uint32_t object_size = g_conf->mds_log_segment_size;
1053 if (object_size == 0) {
1054 // Default layout object size
1055 object_size = file_layout_t::get_default().object_size;
1056 }
1057
1058 uint64_t write_offset = pos;
1059 uint64_t obj_offset = (pos / object_size);
1060 int r = 0;
1061 while(log_data.length()) {
1062 std::string const oid = js.obj_name(obj_offset);
1063 uint32_t offset_in_obj = write_offset % object_size;
1064 uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
1065
1066 r = output.write(oid, log_data, write_len, offset_in_obj);
1067 if (r < 0) {
1068 return r;
1069 } else {
1070 dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
1071 r = 0;
1072 }
1073
1074 log_data.splice(0, write_len);
1075 write_offset += write_len;
1076 obj_offset++;
1077 }
1078
1079 return r;
1080 }
1081
1082 /**
1083 * Given an EMetaBlob::fullbit containing an inode, write out
1084 * the encoded inode in the format used by InodeStore (i.e. the
1085 * backing store format)
1086 *
1087 * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
1088 * on an offline InodeStore instance. It's way simpler, because we are just
1089 * uncritically hauling the data between structs.
1090 *
1091 * @param fb a fullbit extracted from a journal entry
1092 * @param bare if true, leave out [EN|DE]CODE_START decoration
1093 * @param out_bl output, write serialized inode to this bufferlist
1094 */
1095 void JournalTool::encode_fullbit_as_inode(
1096 const EMetaBlob::fullbit &fb,
1097 const bool bare,
1098 bufferlist *out_bl)
1099 {
1100 assert(out_bl != NULL);
1101
1102 // Compose InodeStore
1103 InodeStore new_inode;
1104 new_inode.inode = fb.inode;
1105 new_inode.xattrs = fb.xattrs;
1106 new_inode.dirfragtree = fb.dirfragtree;
1107 new_inode.snap_blob = fb.snapbl;
1108 new_inode.symlink = fb.symlink;
1109 new_inode.old_inodes = fb.old_inodes;
1110
1111 // Serialize InodeStore
1112 if (bare) {
1113 new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1114 } else {
1115 new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1116 }
1117 }
1118
1119 /**
1120 * Given a list of inode numbers known to be in use by
1121 * inodes in the backing store, ensure that none of these
1122 * numbers are listed as free in the InoTables in the
1123 * backing store.
1124 *
1125 * Used after injecting inodes into the backing store, to
1126 * ensure that the same inode numbers are not subsequently
1127 * used for new files during ordinary operation.
1128 *
1129 * @param inos list of inode numbers to be removed from
1130 * free lists in InoTables
1131 * @returns 0 on success, else negative error code
1132 */
1133 int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
1134 {
1135 int r = 0;
1136
1137 // InoTable is a per-MDS structure, so iterate over assigned ranks
1138 auto fs = fsmap->get_filesystem(role_selector.get_ns());
1139 std::set<mds_rank_t> in_ranks;
1140 fs->mds_map.get_mds_set(in_ranks);
1141
1142 for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
1143 rank_i != in_ranks.end(); ++rank_i)
1144 {
1145 // Compose object name
1146 std::ostringstream oss;
1147 oss << "mds" << *rank_i << "_inotable";
1148 object_t inotable_oid = object_t(oss.str());
1149
1150 // Read object
1151 bufferlist inotable_bl;
1152 int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
1153 if (read_r < 0) {
1154 // Things are really bad if we can't read inotable. Beyond our powers.
1155 derr << "unable to read inotable '" << inotable_oid.name << "': "
1156 << cpp_strerror(read_r) << dendl;
1157 r = r ? r : read_r;
1158 continue;
1159 }
1160
1161 // Deserialize InoTable
1162 version_t inotable_ver;
1163 bufferlist::iterator q = inotable_bl.begin();
1164 ::decode(inotable_ver, q);
1165 InoTable ino_table(NULL);
1166 ino_table.decode(q);
1167
1168 // Update InoTable in memory
1169 bool inotable_modified = false;
1170 for (std::set<inodeno_t>::iterator i = inos.begin();
1171 i != inos.end(); ++i)
1172 {
1173 const inodeno_t ino = *i;
1174 if (ino_table.force_consume(ino)) {
1175 dout(4) << "Used ino 0x" << std::hex << ino << std::dec
1176 << " requires inotable update" << dendl;
1177 inotable_modified = true;
1178 }
1179 }
1180
1181 // Serialize and write InoTable
1182 if (inotable_modified) {
1183 inotable_ver += 1;
1184 dout(4) << "writing modified inotable version " << inotable_ver << dendl;
1185 bufferlist inotable_new_bl;
1186 ::encode(inotable_ver, inotable_new_bl);
1187 ino_table.encode_state(inotable_new_bl);
1188 int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
1189 if (write_r != 0) {
1190 derr << "error writing modified inotable " << inotable_oid.name
1191 << ": " << cpp_strerror(write_r) << dendl;
1192 r = r ? r : read_r;
1193 continue;
1194 }
1195 }
1196 }
1197
1198 return r;
1199 }
1200