]> git.proxmox.com Git - ceph.git/blob - ceph/src/tools/cephfs/JournalTool.cc
a66cc2d70ba565f31b8853f2eb3898cfe466b597
[ceph.git] / ceph / src / tools / cephfs / JournalTool.cc
1 // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * ceph - scalable distributed file system
5 *
6 * copyright (c) 2014 john spray <john.spray@inktank.com>
7 *
8 * this is free software; you can redistribute it and/or
9 * modify it under the terms of the gnu lesser general public
10 * license version 2.1, as published by the free software
11 * foundation. see file copying.
12 */
13
14
15 #include <sstream>
16
17 #include "common/ceph_argparse.h"
18 #include "common/errno.h"
19 #include "osdc/Journaler.h"
20 #include "mds/mdstypes.h"
21 #include "mds/LogEvent.h"
22 #include "mds/InoTable.h"
23
24 #include "mds/events/ENoOp.h"
25 #include "mds/events/EUpdate.h"
26
27 #include "JournalScanner.h"
28 #include "EventOutput.h"
29 #include "Dumper.h"
30 #include "Resetter.h"
31
32 #include "JournalTool.h"
33
34
35 #define dout_context g_ceph_context
36 #define dout_subsys ceph_subsys_mds
37 #undef dout_prefix
38 #define dout_prefix *_dout << __func__ << ": "
39
40
41
42 void JournalTool::usage()
43 {
44 std::cout << "Usage: \n"
45 << " cephfs-journal-tool [options] journal <command>\n"
46 << " <command>:\n"
47 << " inspect\n"
48 << " import <path> [--force]\n"
49 << " export <path>\n"
50 << " reset [--force]\n"
51 << " cephfs-journal-tool [options] header <get|set <field> <value>\n"
52 << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]"
53 << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n"
54 << " <selector>:\n"
55 << " --range=<start>..<end>\n"
56 << " --path=<substring>\n"
57 << " --inode=<integer>\n"
58 << " --type=<UPDATE|OPEN|SESSION...><\n"
59 << " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
60 << " --client=<session id integer>\n"
61 << " <effect>: [get|recover_dentries|splice]\n"
62 << " <output>: [summary|list|binary|json] [--path <path>]\n"
63 << "\n"
64 << "General options:\n"
65 << " --rank=filesystem:mds-rank Journal rank (required if multiple\n"
66 << " file systems, default is rank 0 on\n"
67 << " the only filesystem otherwise.\n"
68 << "\n"
69 << "Special options\n"
70 << " --alternate-pool <name> Alternative metadata pool to target\n"
71 << " when using recover_dentries.\n";
72
73 generic_client_usage();
74 }
75
76
77 /**
78 * Handle arguments and hand off to journal/header/event mode
79 */
80 int JournalTool::main(std::vector<const char*> &argv)
81 {
82 int r;
83
84 dout(10) << "JournalTool::main " << dendl;
85 // Common arg parsing
86 // ==================
87 if (argv.empty()) {
88 usage();
89 return -EINVAL;
90 }
91
92 std::vector<const char*>::iterator arg = argv.begin();
93
94 std::string rank_str;
95 if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
96 // Default: act on rank 0. Will give the user an error if they
97 // try invoking this way when they have more than one filesystem.
98 rank_str = "0";
99 }
100
101 r = role_selector.parse(*fsmap, rank_str);
102 if (r != 0) {
103 derr << "Couldn't determine MDS rank." << dendl;
104 return r;
105 }
106
107 std::string mode;
108 if (arg == argv.end()) {
109 derr << "Missing mode [journal|header|event]" << dendl;
110 return -EINVAL;
111 }
112 mode = std::string(*arg);
113 arg = argv.erase(arg);
114
115 // RADOS init
116 // ==========
117 r = rados.init_with_context(g_ceph_context);
118 if (r < 0) {
119 derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
120 return r;
121 }
122
123 dout(4) << "JournalTool: connecting to RADOS..." << dendl;
124 r = rados.connect();
125 if (r < 0) {
126 derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
127 return r;
128 }
129
130 auto fs = fsmap->get_filesystem(role_selector.get_ns());
131 assert(fs != nullptr);
132 int64_t const pool_id = fs->mds_map.get_metadata_pool();
133 dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
134 std::string pool_name;
135 r = rados.pool_reverse_lookup(pool_id, &pool_name);
136 if (r < 0) {
137 derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
138 return r;
139 }
140
141 dout(4) << "JournalTool: creating IoCtx.." << dendl;
142 r = rados.ioctx_create(pool_name.c_str(), input);
143 assert(r == 0);
144 output.dup(input);
145
146 // Execution
147 // =========
148 for (auto role : role_selector.get_roles()) {
149 rank = role.rank;
150 dout(4) << "Executing for rank " << rank << dendl;
151 if (mode == std::string("journal")) {
152 r = main_journal(argv);
153 } else if (mode == std::string("header")) {
154 r = main_header(argv);
155 } else if (mode == std::string("event")) {
156 r = main_event(argv);
157 } else {
158 derr << "Bad command '" << mode << "'" << dendl;
159 usage();
160 return -EINVAL;
161 }
162
163 if (r != 0) {
164 return r;
165 }
166 }
167
168 return r;
169 }
170
171
172 /**
173 * Handle arguments for 'journal' mode
174 *
175 * This is for operations that act on the journal as a whole.
176 */
177 int JournalTool::main_journal(std::vector<const char*> &argv)
178 {
179 std::string command = argv[0];
180 if (command == "inspect") {
181 return journal_inspect();
182 } else if (command == "export" || command == "import") {
183 bool force = false;
184 if (argv.size() >= 2) {
185 std::string const path = argv[1];
186 if (argv.size() == 3) {
187 if (std::string(argv[2]) == "--force") {
188 force = true;
189 } else {
190 std::cerr << "Unknown argument " << argv[1] << std::endl;
191 return -EINVAL;
192 }
193 }
194 return journal_export(path, command == "import", force);
195 } else {
196 derr << "Missing path" << dendl;
197 return -EINVAL;
198 }
199 } else if (command == "reset") {
200 bool force = false;
201 if (argv.size() == 2) {
202 if (std::string(argv[1]) == "--force") {
203 force = true;
204 } else {
205 std::cerr << "Unknown argument " << argv[1] << std::endl;
206 usage();
207 return -EINVAL;
208 }
209 } else if (argv.size() > 2) {
210 std::cerr << "Too many arguments!" << std::endl;
211 usage();
212 return -EINVAL;
213 }
214 return journal_reset(force);
215 } else {
216 derr << "Bad journal command '" << command << "'" << dendl;
217 return -EINVAL;
218 }
219 }
220
221
222 /**
223 * Parse arguments and execute for 'header' mode
224 *
225 * This is for operations that act on the header only.
226 */
227 int JournalTool::main_header(std::vector<const char*> &argv)
228 {
229 JournalFilter filter;
230 JournalScanner js(input, rank, filter);
231 int r = js.scan(false);
232 if (r < 0) {
233 std::cerr << "Unable to scan journal" << std::endl;
234 return r;
235 }
236
237 if (!js.header_present) {
238 std::cerr << "Header object not found!" << std::endl;
239 return -ENOENT;
240 } else if (!js.header_valid && js.header == NULL) {
241 // Can't do a read or a single-field write without a copy of the original
242 derr << "Header could not be read!" << dendl;
243 return -ENOENT;
244 } else {
245 assert(js.header != NULL);
246 }
247
248 if (argv.size() == 0) {
249 derr << "Invalid header command, must be [get|set]" << dendl;
250 return -EINVAL;
251 }
252 std::vector<const char *>::iterator arg = argv.begin();
253 std::string const command = *arg;
254 arg = argv.erase(arg);
255
256 if (command == std::string("get")) {
257 // Write JSON journal dump to stdout
258 JSONFormatter jf(true);
259 js.header->dump(&jf);
260 jf.flush(std::cout);
261 std::cout << std::endl;
262 } else if (command == std::string("set")) {
263 // Need two more args <key> <val>
264 if (argv.size() != 2) {
265 derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl;
266 return -EINVAL;
267 }
268
269 std::string const field_name = *arg;
270 arg = argv.erase(arg);
271
272 std::string const value_str = *arg;
273 arg = argv.erase(arg);
274 assert(argv.empty());
275
276 std::string parse_err;
277 uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err);
278 if (!parse_err.empty()) {
279 derr << "Invalid value '" << value_str << "': " << parse_err << dendl;
280 return -EINVAL;
281 }
282
283 uint64_t *field = NULL;
284 if (field_name == "trimmed_pos") {
285 field = &(js.header->trimmed_pos);
286 } else if (field_name == "expire_pos") {
287 field = &(js.header->expire_pos);
288 } else if (field_name == "write_pos") {
289 field = &(js.header->write_pos);
290 } else if (field_name == "pool_id") {
291 field = (uint64_t*)(&(js.header->layout.pool_id));
292 } else {
293 derr << "Invalid field '" << field_name << "'" << dendl;
294 return -EINVAL;
295 }
296
297 std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl;
298 *field = new_val;
299
300 dout(4) << "Writing object..." << dendl;
301 bufferlist header_bl;
302 ::encode(*(js.header), header_bl);
303 output.write_full(js.obj_name(0), header_bl);
304 dout(4) << "Write complete." << dendl;
305 std::cout << "Successfully updated header." << std::endl;
306 } else {
307 derr << "Bad header command '" << command << "'" << dendl;
308 return -EINVAL;
309 }
310
311 return 0;
312 }
313
314
315 /**
316 * Parse arguments and execute for 'event' mode
317 *
318 * This is for operations that act on LogEvents within the log
319 */
320 int JournalTool::main_event(std::vector<const char*> &argv)
321 {
322 int r;
323
324 std::vector<const char*>::iterator arg = argv.begin();
325
326 std::string command = *(arg++);
327 if (command != "get" && command != "splice" && command != "recover_dentries") {
328 derr << "Unknown argument '" << command << "'" << dendl;
329 usage();
330 return -EINVAL;
331 }
332
333 if (arg == argv.end()) {
334 derr << "Incomplete command line" << dendl;
335 usage();
336 return -EINVAL;
337 }
338
339 // Parse filter options
340 // ====================
341 JournalFilter filter;
342 r = filter.parse_args(argv, arg);
343 if (r) {
344 return r;
345 }
346
347 // Parse output options
348 // ====================
349 if (arg == argv.end()) {
350 derr << "Missing output command" << dendl;
351 usage();
352 }
353 std::string output_style = *(arg++);
354 if (output_style != "binary" && output_style != "json" &&
355 output_style != "summary" && output_style != "list") {
356 derr << "Unknown argument: '" << output_style << "'" << dendl;
357 usage();
358 return -EINVAL;
359 }
360
361 std::string output_path = "dump";
362 while(arg != argv.end()) {
363 std::string arg_str;
364 if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
365 output_path = arg_str;
366 } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
367 nullptr)) {
368 dout(1) << "Using alternate pool " << arg_str << dendl;
369 int r = rados.ioctx_create(arg_str.c_str(), output);
370 assert(r == 0);
371 other_pool = true;
372 } else {
373 derr << "Unknown argument: '" << *arg << "'" << dendl;
374 usage();
375 return -EINVAL;
376 }
377 }
378
379 // Execute command
380 // ===============
381 JournalScanner js(input, rank, filter);
382 if (command == "get") {
383 r = js.scan();
384 if (r) {
385 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
386 return r;
387 }
388 } else if (command == "recover_dentries") {
389 r = js.scan();
390 if (r) {
391 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
392 return r;
393 }
394
395 bool dry_run = false;
396 if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) {
397 dry_run = true;
398 }
399
400 /**
401 * Iterate over log entries, attempting to scavenge from each one
402 */
403 std::set<inodeno_t> consumed_inos;
404 for (JournalScanner::EventMap::iterator i = js.events.begin();
405 i != js.events.end(); ++i) {
406 LogEvent *le = i->second.log_event;
407 EMetaBlob const *mb = le->get_metablob();
408 if (mb) {
409 int scav_r = recover_dentries(*mb, dry_run, &consumed_inos);
410 if (scav_r) {
411 dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec
412 << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl;
413 if (r == 0) {
414 r = scav_r;
415 }
416 // Our goal is to read all we can, so don't stop on errors, but
417 // do record them for possible later output
418 js.errors.insert(std::make_pair(i->first,
419 JournalScanner::EventError(scav_r, cpp_strerror(r))));
420 }
421 }
422 }
423
424 /**
425 * Update InoTable to reflect any inode numbers consumed during scavenge
426 */
427 dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl;
428 if (consumed_inos.size() && !dry_run) {
429 int consume_r = consume_inos(consumed_inos);
430 if (consume_r) {
431 dout(1) << "Error updating InoTable for " << consumed_inos.size()
432 << " consume inos: " << cpp_strerror(consume_r) << dendl;
433 if (r == 0) {
434 r = consume_r;
435 }
436 }
437 }
438
439 // Remove consumed dentries from lost+found.
440 if (other_pool && !dry_run) {
441 std::set<std::string> found;
442
443 for (auto i : consumed_inos) {
444 char s[20];
445
446 snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
447 dout(20) << "removing " << s << dendl;
448 found.insert(std::string(s));
449 }
450
451 object_t frag_oid;
452 frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
453 frag_t(), "");
454 output.omap_rm_keys(frag_oid.name, found);
455 }
456 } else if (command == "splice") {
457 r = js.scan();
458 if (r) {
459 derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl;
460 return r;
461 }
462
463 uint64_t start, end;
464 if (filter.get_range(start, end)) {
465 // Special case for range filter: erase a numeric range in the log
466 uint64_t range = end - start;
467 int r = erase_region(js, start, range);
468 if (r) {
469 derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec
470 << ": " << cpp_strerror(r) << dendl;
471 return r;
472 }
473 } else {
474 // General case: erase a collection of individual entries in the log
475 for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) {
476 dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl;
477
478 int r = erase_region(js, i->first, i->second.raw_size);
479 if (r) {
480 derr << "Failed to erase event 0x" << std::hex << i->first << std::dec
481 << ": " << cpp_strerror(r) << dendl;
482 return r;
483 }
484 }
485 }
486
487
488 } else {
489 derr << "Unknown argument '" << command << "'" << dendl;
490 usage();
491 return -EINVAL;
492 }
493
494 // Generate output
495 // ===============
496 EventOutput output(js, output_path);
497 int output_result = 0;
498 if (output_style == "binary") {
499 output_result = output.binary();
500 } else if (output_style == "json") {
501 output_result = output.json();
502 } else if (output_style == "summary") {
503 output.summary();
504 } else if (output_style == "list") {
505 output.list();
506 } else {
507 std::cerr << "Bad output command '" << output_style << "'" << std::endl;
508 return -EINVAL;
509 }
510
511 if (output_result != 0) {
512 std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl;
513 }
514
515 return output_result;
516 }
517
518 /**
519 * Provide the user with information about the condition of the journal,
520 * especially indicating what range of log events is available and where
521 * any gaps or corruptions in the journal are.
522 */
523 int JournalTool::journal_inspect()
524 {
525 int r;
526
527 JournalFilter filter;
528 JournalScanner js(input, rank, filter);
529 r = js.scan();
530 if (r) {
531 std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
532 return r;
533 }
534
535 js.report(std::cout);
536
537 return 0;
538 }
539
540
541 /**
542 * Attempt to export a binary dump of the journal.
543 *
544 * This is allowed to fail if the header is malformed or there are
545 * objects inaccessible, in which case the user would have to fall
546 * back to manually listing RADOS objects and extracting them, which
547 * they can do with the ``rados`` CLI.
548 */
549 int JournalTool::journal_export(std::string const &path, bool import, bool force)
550 {
551 int r = 0;
552 JournalScanner js(input, rank);
553
554 if (!import) {
555 /*
556 * If doing an export, first check that the header is valid and
557 * no objects are missing before trying to dump
558 */
559 r = js.scan();
560 if (r < 0) {
561 derr << "Unable to scan journal, assuming badly damaged" << dendl;
562 return r;
563 }
564 if (!js.is_readable()) {
565 derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl;
566 return -EIO;
567 }
568 }
569
570 /*
571 * Assuming we can cleanly read the journal data, dump it out to a file
572 */
573 {
574 Dumper dumper;
575 r = dumper.init(mds_role_t(role_selector.get_ns(), rank));
576 if (r < 0) {
577 derr << "dumper::init failed: " << cpp_strerror(r) << dendl;
578 return r;
579 }
580 if (import) {
581 r = dumper.undump(path.c_str(), force);
582 } else {
583 r = dumper.dump(path.c_str());
584 }
585 }
586
587 return r;
588 }
589
590
591 /**
592 * Truncate journal and insert EResetJournal
593 */
594 int JournalTool::journal_reset(bool hard)
595 {
596 int r = 0;
597 Resetter resetter;
598 r = resetter.init();
599 if (r < 0) {
600 derr << "resetter::init failed: " << cpp_strerror(r) << dendl;
601 return r;
602 }
603
604 if (hard) {
605 r = resetter.reset_hard(mds_role_t(role_selector.get_ns(), rank));
606 } else {
607 r = resetter.reset(mds_role_t(role_selector.get_ns(), rank));
608 }
609
610 return r;
611 }
612
613
614 /**
615 * Selective offline replay which only reads out dentries and writes
616 * them to the backing store iff their version is > what is currently
617 * in the backing store.
618 *
619 * In order to write dentries to the backing store, we may create the
620 * required enclosing dirfrag objects.
621 *
622 * Test this by running scavenge on an unflushed journal, then nuking
623 * it offline, then starting an MDS and seeing that the dentries are
624 * visible.
625 *
626 * @param metablob an EMetaBlob retrieved from the journal
627 * @param dry_run if true, do no writes to RADOS
628 * @param consumed_inos output, populated with any inos inserted
629 * @returns 0 on success, else negative error code
630 */
631 int JournalTool::recover_dentries(
632 EMetaBlob const &metablob,
633 bool const dry_run,
634 std::set<inodeno_t> *consumed_inos)
635 {
636 assert(consumed_inos != NULL);
637
638 int r = 0;
639
640 // Replay fullbits (dentry+inode)
641 for (list<dirfrag_t>::const_iterator lp = metablob.lump_order.begin();
642 lp != metablob.lump_order.end(); ++lp)
643 {
644 dirfrag_t const &frag = *lp;
645 EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second;
646 lump._decode_bits();
647 object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, "");
648
649 dout(4) << "inspecting lump " << frag_oid.name << dendl;
650
651
652 // We will record old fnode version for use in hard link handling
653 // If we don't read an old fnode, take version as zero and write in
654 // all hardlinks we find.
655 version_t old_fnode_version = 0;
656
657 // Update fnode in omap header of dirfrag object
658 bool write_fnode = false;
659 bufferlist old_fnode_bl;
660 r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
661 if (r == -ENOENT) {
662 // Creating dirfrag from scratch
663 dout(4) << "failed to read OMAP header from directory fragment "
664 << frag_oid.name << " " << cpp_strerror(r) << dendl;
665 write_fnode = true;
666 // Note: creating the dirfrag *without* a backtrace, relying on
667 // MDS to regenerate backtraces on read or in FSCK
668 } else if (r == 0) {
669 // Conditionally update existing omap header
670 fnode_t old_fnode;
671 bufferlist::iterator old_fnode_iter = old_fnode_bl.begin();
672 try {
673 old_fnode.decode(old_fnode_iter);
674 dout(4) << "frag " << frag_oid.name << " fnode old v" <<
675 old_fnode.version << " vs new v" << lump.fnode.version << dendl;
676 old_fnode_version = old_fnode.version;
677 write_fnode = old_fnode_version < lump.fnode.version;
678 } catch (const buffer::error &err) {
679 dout(1) << "frag " << frag_oid.name
680 << " is corrupt, overwriting" << dendl;
681 write_fnode = true;
682 }
683 } else {
684 // Unexpected error
685 dout(4) << "failed to read OMAP header from directory fragment "
686 << frag_oid.name << " " << cpp_strerror(r) << dendl;
687 return r;
688 }
689
690 if ((other_pool || write_fnode) && !dry_run) {
691 dout(4) << "writing fnode to omap header" << dendl;
692 bufferlist fnode_bl;
693 lump.fnode.encode(fnode_bl);
694 if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
695 r = output.omap_set_header(frag_oid.name, fnode_bl);
696 }
697 if (r != 0) {
698 derr << "Failed to write fnode for frag object "
699 << frag_oid.name << dendl;
700 return r;
701 }
702 }
703
704 std::set<std::string> read_keys;
705
706 // Compose list of potentially-existing dentries we would like to fetch
707 list<ceph::shared_ptr<EMetaBlob::fullbit> > const &fb_list =
708 lump.get_dfull();
709 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
710 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
711 EMetaBlob::fullbit const &fb = *(*fbi);
712
713 // Get a key like "foobar_head"
714 std::string key;
715 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
716 dn_key.encode(key);
717 read_keys.insert(key);
718 }
719
720 list<EMetaBlob::remotebit> const &rb_list =
721 lump.get_dremote();
722 for (list<EMetaBlob::remotebit>::const_iterator rbi =
723 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
724 EMetaBlob::remotebit const &rb = *rbi;
725
726 // Get a key like "foobar_head"
727 std::string key;
728 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
729 dn_key.encode(key);
730 read_keys.insert(key);
731 }
732
733 list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull();
734 for (auto& nb : nb_list) {
735 // Get a key like "foobar_head"
736 std::string key;
737 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
738 dn_key.encode(key);
739 read_keys.insert(key);
740 }
741
742 // Perform bulk read of existing dentries
743 std::map<std::string, bufferlist> read_vals;
744 r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
745 if (r == -ENOENT && other_pool) {
746 r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
747 }
748 if (r != 0) {
749 derr << "unexpected error reading fragment object "
750 << frag_oid.name << ": " << cpp_strerror(r) << dendl;
751 return r;
752 }
753
754 // Compose list of dentries we will write back
755 std::map<std::string, bufferlist> write_vals;
756 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi =
757 fb_list.begin(); fbi != fb_list.end(); ++fbi) {
758 EMetaBlob::fullbit const &fb = *(*fbi);
759
760 // Get a key like "foobar_head"
761 std::string key;
762 dentry_key_t dn_key(fb.dnlast, fb.dn.c_str());
763 dn_key.encode(key);
764
765 dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn
766 << dendl;
767 bool write_dentry = false;
768 if (read_vals.find(key) == read_vals.end()) {
769 dout(4) << "dentry did not already exist, will create" << dendl;
770 write_dentry = true;
771 } else {
772 dout(4) << "dentry " << key << " existed already" << dendl;
773 dout(4) << "dentry exists, checking versions..." << dendl;
774 bufferlist &old_dentry = read_vals[key];
775 // Decode dentry+inode
776 bufferlist::iterator q = old_dentry.begin();
777
778 snapid_t dnfirst;
779 ::decode(dnfirst, q);
780 char dentry_type;
781 ::decode(dentry_type, q);
782
783 if (dentry_type == 'L') {
784 // leave write_dentry false, we have no version to
785 // compare with in a hardlink, so it's not safe to
786 // squash over it with what's in this fullbit
787 dout(10) << "Existing remote inode in slot to be (maybe) written "
788 << "by a full inode from the journal dn '" << fb.dn.c_str()
789 << "' with lump fnode version " << lump.fnode.version
790 << "vs existing fnode version " << old_fnode_version << dendl;
791 write_dentry = old_fnode_version < lump.fnode.version;
792 } else if (dentry_type == 'I') {
793 // Read out inode version to compare with backing store
794 InodeStore inode;
795 inode.decode_bare(q);
796 dout(4) << "decoded embedded inode version "
797 << inode.inode.version << " vs fullbit version "
798 << fb.inode.version << dendl;
799 if (inode.inode.version < fb.inode.version) {
800 write_dentry = true;
801 }
802 } else {
803 dout(4) << "corrupt dentry in backing store, overwriting from "
804 "journal" << dendl;
805 write_dentry = true;
806 }
807 }
808
809 if ((other_pool || write_dentry) && !dry_run) {
810 dout(4) << "writing I dentry " << key << " into frag "
811 << frag_oid.name << dendl;
812
813 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
814 bufferlist dentry_bl;
815 ::encode(fb.dnfirst, dentry_bl);
816 ::encode('I', dentry_bl);
817 encode_fullbit_as_inode(fb, true, &dentry_bl);
818
819 // Record for writing to RADOS
820 write_vals[key] = dentry_bl;
821 consumed_inos->insert(fb.inode.ino);
822 }
823 }
824
825 for (list<EMetaBlob::remotebit>::const_iterator rbi =
826 rb_list.begin(); rbi != rb_list.end(); ++rbi) {
827 EMetaBlob::remotebit const &rb = *rbi;
828
829 // Get a key like "foobar_head"
830 std::string key;
831 dentry_key_t dn_key(rb.dnlast, rb.dn.c_str());
832 dn_key.encode(key);
833
834 dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn
835 << dendl;
836 bool write_dentry = false;
837 if (read_vals.find(key) == read_vals.end()) {
838 dout(4) << "dentry did not already exist, will create" << dendl;
839 write_dentry = true;
840 } else {
841 dout(4) << "dentry " << key << " existed already" << dendl;
842 dout(4) << "dentry exists, checking versions..." << dendl;
843 bufferlist &old_dentry = read_vals[key];
844 // Decode dentry+inode
845 bufferlist::iterator q = old_dentry.begin();
846
847 snapid_t dnfirst;
848 ::decode(dnfirst, q);
849 char dentry_type;
850 ::decode(dentry_type, q);
851
852 if (dentry_type == 'L') {
853 dout(10) << "Existing hardlink inode in slot to be (maybe) written "
854 << "by a remote inode from the journal dn '" << rb.dn.c_str()
855 << "' with lump fnode version " << lump.fnode.version
856 << "vs existing fnode version " << old_fnode_version << dendl;
857 write_dentry = old_fnode_version < lump.fnode.version;
858 } else if (dentry_type == 'I') {
859 dout(10) << "Existing full inode in slot to be (maybe) written "
860 << "by a remote inode from the journal dn '" << rb.dn.c_str()
861 << "' with lump fnode version " << lump.fnode.version
862 << "vs existing fnode version " << old_fnode_version << dendl;
863 write_dentry = old_fnode_version < lump.fnode.version;
864 } else {
865 dout(4) << "corrupt dentry in backing store, overwriting from "
866 "journal" << dendl;
867 write_dentry = true;
868 }
869 }
870
871 if ((other_pool || write_dentry) && !dry_run) {
872 dout(4) << "writing L dentry " << key << " into frag "
873 << frag_oid.name << dendl;
874
875 // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true)
876 bufferlist dentry_bl;
877 ::encode(rb.dnfirst, dentry_bl);
878 ::encode('L', dentry_bl);
879 ::encode(rb.ino, dentry_bl);
880 ::encode(rb.d_type, dentry_bl);
881
882 // Record for writing to RADOS
883 write_vals[key] = dentry_bl;
884 consumed_inos->insert(rb.ino);
885 }
886 }
887
888 std::set<std::string> null_vals;
889 for (auto& nb : nb_list) {
890 std::string key;
891 dentry_key_t dn_key(nb.dnlast, nb.dn.c_str());
892 dn_key.encode(key);
893
894 dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn
895 << dendl;
896
897 auto it = read_vals.find(key);
898 if (it != read_vals.end()) {
899 dout(4) << "dentry exists, will remove" << dendl;
900
901 bufferlist::iterator q = it->second.begin();
902 snapid_t dnfirst;
903 ::decode(dnfirst, q);
904 char dentry_type;
905 ::decode(dentry_type, q);
906
907 bool remove_dentry = false;
908 if (dentry_type == 'L') {
909 dout(10) << "Existing hardlink inode in slot to be (maybe) removed "
910 << "by null journal dn '" << nb.dn.c_str()
911 << "' with lump fnode version " << lump.fnode.version
912 << "vs existing fnode version " << old_fnode_version << dendl;
913 remove_dentry = old_fnode_version < lump.fnode.version;
914 } else if (dentry_type == 'I') {
915 dout(10) << "Existing full inode in slot to be (maybe) removed "
916 << "by null journal dn '" << nb.dn.c_str()
917 << "' with lump fnode version " << lump.fnode.version
918 << "vs existing fnode version " << old_fnode_version << dendl;
919 remove_dentry = old_fnode_version < lump.fnode.version;
920 } else {
921 dout(4) << "corrupt dentry in backing store, will remove" << dendl;
922 remove_dentry = true;
923 }
924
925 if (remove_dentry)
926 null_vals.insert(key);
927 }
928 }
929
930 // Write back any new/changed dentries
931 if (!write_vals.empty()) {
932 r = output.omap_set(frag_oid.name, write_vals);
933 if (r != 0) {
934 derr << "error writing dentries to " << frag_oid.name
935 << ": " << cpp_strerror(r) << dendl;
936 return r;
937 }
938 }
939
940 // remove any null dentries
941 if (!null_vals.empty()) {
942 r = output.omap_rm_keys(frag_oid.name, null_vals);
943 if (r != 0) {
944 derr << "error removing dentries from " << frag_oid.name
945 << ": " << cpp_strerror(r) << dendl;
946 return r;
947 }
948 }
949 }
950
951 /* Now that we've looked at the dirlumps, we finally pay attention to
952 * the roots (i.e. inodes without ancestry). This is necessary in order
953 * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally
954 * important because clients use them to infer completeness
955 * of directories
956 */
957 for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator p =
958 metablob.roots.begin(); p != metablob.roots.end(); ++p) {
959 EMetaBlob::fullbit const &fb = *(*p);
960 inodeno_t ino = fb.inode.ino;
961 dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl;
962
963 object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
964 dout(4) << "object id " << root_oid.name << dendl;
965
966 bool write_root_ino = false;
967 bufferlist old_root_ino_bl;
968 r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
969 if (r == -ENOENT) {
970 dout(4) << "root does not exist, will create" << dendl;
971 write_root_ino = true;
972 } else if (r >= 0) {
973 r = 0;
974 InodeStore old_inode;
975 dout(4) << "root exists, will modify (" << old_root_ino_bl.length()
976 << ")" << dendl;
977 bufferlist::iterator inode_bl_iter = old_root_ino_bl.begin();
978 std::string magic;
979 ::decode(magic, inode_bl_iter);
980 if (magic == CEPH_FS_ONDISK_MAGIC) {
981 dout(4) << "magic ok" << dendl;
982 old_inode.decode(inode_bl_iter);
983
984 if (old_inode.inode.version < fb.inode.version) {
985 write_root_ino = true;
986 }
987 } else {
988 dout(4) << "magic bad: '" << magic << "'" << dendl;
989 write_root_ino = true;
990 }
991 } else {
992 derr << "error reading root inode object " << root_oid.name
993 << ": " << cpp_strerror(r) << dendl;
994 return r;
995 }
996
997 if (write_root_ino && !dry_run) {
998 dout(4) << "writing root ino " << root_oid.name
999 << " version " << fb.inode.version << dendl;
1000
1001 // Compose: root ino format is magic,InodeStore(bare=false)
1002 bufferlist new_root_ino_bl;
1003 ::encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl);
1004 encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
1005
1006 // Write to RADOS
1007 r = output.write_full(root_oid.name, new_root_ino_bl);
1008 if (r != 0) {
1009 derr << "error writing inode object " << root_oid.name
1010 << ": " << cpp_strerror(r) << dendl;
1011 return r;
1012 }
1013 }
1014 }
1015
1016 return r;
1017 }
1018
1019
1020 /**
1021 * Erase a region of the log by overwriting it with ENoOp
1022 *
1023 */
1024 int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length)
1025 {
1026 // To erase this region, we use our preamble, the encoding overhead
1027 // of an ENoOp, and our trailing start ptr. Calculate how much padding
1028 // is needed inside the ENoOp to make up the difference.
1029 bufferlist tmp;
1030 ENoOp enoop(0);
1031 enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT);
1032
1033 dout(4) << "erase_region " << pos << " len=" << length << dendl;
1034
1035 // FIXME: get the preamble/postamble length via JournalStream
1036 int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t);
1037 dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl;
1038
1039 if (padding < 0) {
1040 derr << "Erase region " << length << " too short" << dendl;
1041 return -EINVAL;
1042 }
1043
1044 // Serialize an ENoOp with the correct amount of padding
1045 enoop = ENoOp(padding);
1046 bufferlist entry;
1047 enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT);
1048 JournalStream stream(JOURNAL_FORMAT_RESILIENT);
1049
1050 // Serialize region of log stream
1051 bufferlist log_data;
1052 stream.write(entry, &log_data, pos);
1053
1054 dout(4) << "erase_region data length " << log_data.length() << dendl;
1055 assert(log_data.length() == length);
1056
1057 // Write log stream region to RADOS
1058 // FIXME: get object size somewhere common to scan_events
1059 uint32_t object_size = g_conf->mds_log_segment_size;
1060 if (object_size == 0) {
1061 // Default layout object size
1062 object_size = file_layout_t::get_default().object_size;
1063 }
1064
1065 uint64_t write_offset = pos;
1066 uint64_t obj_offset = (pos / object_size);
1067 int r = 0;
1068 while(log_data.length()) {
1069 std::string const oid = js.obj_name(obj_offset);
1070 uint32_t offset_in_obj = write_offset % object_size;
1071 uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
1072
1073 r = output.write(oid, log_data, write_len, offset_in_obj);
1074 if (r < 0) {
1075 return r;
1076 } else {
1077 dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl;
1078 r = 0;
1079 }
1080
1081 log_data.splice(0, write_len);
1082 write_offset += write_len;
1083 obj_offset++;
1084 }
1085
1086 return r;
1087 }
1088
1089 /**
1090 * Given an EMetaBlob::fullbit containing an inode, write out
1091 * the encoded inode in the format used by InodeStore (i.e. the
1092 * backing store format)
1093 *
1094 * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use
1095 * on an offline InodeStore instance. It's way simpler, because we are just
1096 * uncritically hauling the data between structs.
1097 *
1098 * @param fb a fullbit extracted from a journal entry
1099 * @param bare if true, leave out [EN|DE]CODE_START decoration
1100 * @param out_bl output, write serialized inode to this bufferlist
1101 */
1102 void JournalTool::encode_fullbit_as_inode(
1103 const EMetaBlob::fullbit &fb,
1104 const bool bare,
1105 bufferlist *out_bl)
1106 {
1107 assert(out_bl != NULL);
1108
1109 // Compose InodeStore
1110 InodeStore new_inode;
1111 new_inode.inode = fb.inode;
1112 new_inode.xattrs = fb.xattrs;
1113 new_inode.dirfragtree = fb.dirfragtree;
1114 new_inode.snap_blob = fb.snapbl;
1115 new_inode.symlink = mempool::mds_co::string(boost::string_view(fb.symlink));
1116 new_inode.old_inodes = fb.old_inodes;
1117
1118 // Serialize InodeStore
1119 if (bare) {
1120 new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1121 } else {
1122 new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1123 }
1124 }
1125
1126 /**
1127 * Given a list of inode numbers known to be in use by
1128 * inodes in the backing store, ensure that none of these
1129 * numbers are listed as free in the InoTables in the
1130 * backing store.
1131 *
1132 * Used after injecting inodes into the backing store, to
1133 * ensure that the same inode numbers are not subsequently
1134 * used for new files during ordinary operation.
1135 *
1136 * @param inos list of inode numbers to be removed from
1137 * free lists in InoTables
1138 * @returns 0 on success, else negative error code
1139 */
1140 int JournalTool::consume_inos(const std::set<inodeno_t> &inos)
1141 {
1142 int r = 0;
1143
1144 // InoTable is a per-MDS structure, so iterate over assigned ranks
1145 auto fs = fsmap->get_filesystem(role_selector.get_ns());
1146 std::set<mds_rank_t> in_ranks;
1147 fs->mds_map.get_mds_set(in_ranks);
1148
1149 for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin();
1150 rank_i != in_ranks.end(); ++rank_i)
1151 {
1152 // Compose object name
1153 std::ostringstream oss;
1154 oss << "mds" << *rank_i << "_inotable";
1155 object_t inotable_oid = object_t(oss.str());
1156
1157 // Read object
1158 bufferlist inotable_bl;
1159 int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
1160 if (read_r < 0) {
1161 // Things are really bad if we can't read inotable. Beyond our powers.
1162 derr << "unable to read inotable '" << inotable_oid.name << "': "
1163 << cpp_strerror(read_r) << dendl;
1164 r = r ? r : read_r;
1165 continue;
1166 }
1167
1168 // Deserialize InoTable
1169 version_t inotable_ver;
1170 bufferlist::iterator q = inotable_bl.begin();
1171 ::decode(inotable_ver, q);
1172 InoTable ino_table(NULL);
1173 ino_table.decode(q);
1174
1175 // Update InoTable in memory
1176 bool inotable_modified = false;
1177 for (std::set<inodeno_t>::iterator i = inos.begin();
1178 i != inos.end(); ++i)
1179 {
1180 const inodeno_t ino = *i;
1181 if (ino_table.force_consume(ino)) {
1182 dout(4) << "Used ino 0x" << std::hex << ino << std::dec
1183 << " requires inotable update" << dendl;
1184 inotable_modified = true;
1185 }
1186 }
1187
1188 // Serialize and write InoTable
1189 if (inotable_modified) {
1190 inotable_ver += 1;
1191 dout(4) << "writing modified inotable version " << inotable_ver << dendl;
1192 bufferlist inotable_new_bl;
1193 ::encode(inotable_ver, inotable_new_bl);
1194 ino_table.encode_state(inotable_new_bl);
1195 int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
1196 if (write_r != 0) {
1197 derr << "error writing modified inotable " << inotable_oid.name
1198 << ": " << cpp_strerror(write_r) << dendl;
1199 r = r ? r : read_r;
1200 continue;
1201 }
1202 }
1203 }
1204
1205 return r;
1206 }
1207