]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 John Spray <john.spray@inktank.com>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
13 | ||
14 | ||
15 | #include <sstream> | |
16 | ||
17 | #include "common/ceph_argparse.h" | |
18 | #include "common/errno.h" | |
19 | #include "osdc/Journaler.h" | |
20 | #include "mds/mdstypes.h" | |
21 | #include "mds/LogEvent.h" | |
22 | #include "mds/InoTable.h" | |
23 | ||
24 | #include "mds/events/ENoOp.h" | |
25 | #include "mds/events/EUpdate.h" | |
26 | ||
27 | #include "JournalScanner.h" | |
28 | #include "EventOutput.h" | |
29 | #include "Dumper.h" | |
30 | #include "Resetter.h" | |
31 | ||
32 | #include "JournalTool.h" | |
33 | ||
34 | ||
35 | #define dout_context g_ceph_context | |
36 | #define dout_subsys ceph_subsys_mds | |
37 | #undef dout_prefix | |
38 | #define dout_prefix *_dout << __func__ << ": " | |
39 | ||
40 | ||
41 | ||
42 | void JournalTool::usage() | |
43 | { | |
44 | std::cout << "Usage: \n" | |
45 | << " cephfs-journal-tool [options] journal <command>\n" | |
46 | << " <command>:\n" | |
47 | << " inspect\n" | |
48 | << " import <path>\n" | |
49 | << " export <path>\n" | |
50 | << " reset [--force]\n" | |
51 | << " cephfs-journal-tool [options] header <get|set <field> <value>\n" | |
31f18b77 | 52 | << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n" |
7c673cae FG |
53 | << " <selector>:\n" |
54 | << " --range=<start>..<end>\n" | |
55 | << " --path=<substring>\n" | |
56 | << " --inode=<integer>\n" | |
57 | << " --type=<UPDATE|OPEN|SESSION...><\n" | |
58 | << " --frag=<ino>.<frag> [--dname=<dentry string>]\n" | |
7c673cae | 59 | << " --client=<session id integer>\n" |
31f18b77 | 60 | << " <effect>: [get|recover_dentries|splice]\n" |
7c673cae FG |
61 | << " <output>: [summary|list|binary|json] [--path <path>]\n" |
62 | << "\n" | |
31f18b77 | 63 | << "General options:\n" |
7c673cae FG |
64 | << " --rank=filesystem:mds-rank Journal rank (required if multiple\n" |
65 | << " file systems, default is rank 0 on\n" | |
31f18b77 FG |
66 | << " the only filesystem otherwise.\n" |
67 | << "\n" | |
68 | << "Special options\n" | |
69 | << " --alternate-pool <name> Alternative metadata pool to target\n" | |
70 | << " when using recover_dentries.\n"; | |
7c673cae FG |
71 | |
72 | generic_client_usage(); | |
73 | } | |
74 | ||
75 | ||
76 | /** | |
77 | * Handle arguments and hand off to journal/header/event mode | |
78 | */ | |
/**
 * Handle arguments and hand off to journal/header/event mode
 *
 * Sequencing matters here: the rank must be parsed before connecting,
 * and the metadata-pool IoCtx must exist before any mode handler runs.
 *
 * @param argv tool arguments after global option stripping; consumed
 *             options are erased as they are parsed.
 * @returns 0 on success, else negative error code.
 */
int JournalTool::main(std::vector<const char*> &argv)
{
  int r;

  dout(10) << "JournalTool::main " << dendl;
  // Common arg parsing
  // ==================
  if (argv.empty()) {
    usage();
    return -EINVAL;
  }

  std::vector<const char*>::iterator arg = argv.begin();

  std::string rank_str;
  if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) {
    // Default: act on rank 0.  Will give the user an error if they
    // try invoking this way when they have more than one filesystem.
    rank_str = "0";
  }

  r = role_selector.parse(*fsmap, rank_str);
  if (r != 0) {
    derr << "Couldn't determine MDS rank." << dendl;
    return r;
  }

  // The mode word must be consumed before the mode handlers see argv.
  std::string mode;
  if (arg == argv.end()) {
    derr << "Missing mode [journal|header|event]" << dendl;
    return -EINVAL;
  }
  mode = std::string(*arg);
  arg = argv.erase(arg);

  // RADOS init
  // ==========
  r = rados.init_with_context(g_ceph_context);
  if (r < 0) {
    derr << "RADOS unavailable, cannot scan filesystem journal" << dendl;
    return r;
  }

  dout(4) << "JournalTool: connecting to RADOS..." << dendl;
  r = rados.connect();
  if (r < 0) {
    derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl;
    return r;
  }

  // Resolve the metadata pool of the selected filesystem; both input
  // and output IoCtxs start out pointing at it (event mode may later
  // redirect output to an --alternate-pool).
  auto fs = fsmap->get_filesystem(role_selector.get_ns());
  assert(fs != nullptr);
  int64_t const pool_id = fs->mds_map.get_metadata_pool();
  dout(4) << "JournalTool: resolving pool " << pool_id << dendl;
  std::string pool_name;
  r = rados.pool_reverse_lookup(pool_id, &pool_name);
  if (r < 0) {
    derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl;
    return r;
  }

  dout(4) << "JournalTool: creating IoCtx.." << dendl;
  r = rados.ioctx_create(pool_name.c_str(), input);
  // NOTE(review): IoCtx creation failure hard-asserts rather than
  // returning an error to the caller.
  assert(r == 0);
  output.dup(input);

  // Execution
  // =========
  // Run the requested mode once per selected rank; stop at the first
  // failing rank and propagate its error.
  for (auto role : role_selector.get_roles()) {
    rank = role.rank;
    dout(4) << "Executing for rank " << rank << dendl;
    if (mode == std::string("journal")) {
      r = main_journal(argv);
    } else if (mode == std::string("header")) {
      r = main_header(argv);
    } else if (mode == std::string("event")) {
      r = main_event(argv);
    } else {
      derr << "Bad command '" << mode << "'" << dendl;
      usage();
      return -EINVAL;
    }

    if (r != 0) {
      return r;
    }
  }

  return r;
}
169 | ||
170 | ||
171 | /** | |
172 | * Handle arguments for 'journal' mode | |
173 | * | |
174 | * This is for operations that act on the journal as a whole. | |
175 | */ | |
176 | int JournalTool::main_journal(std::vector<const char*> &argv) | |
177 | { | |
178 | std::string command = argv[0]; | |
179 | if (command == "inspect") { | |
180 | return journal_inspect(); | |
181 | } else if (command == "export" || command == "import") { | |
182 | if (argv.size() >= 2) { | |
183 | std::string const path = argv[1]; | |
184 | return journal_export(path, command == "import"); | |
185 | } else { | |
186 | derr << "Missing path" << dendl; | |
187 | return -EINVAL; | |
188 | } | |
189 | } else if (command == "reset") { | |
190 | bool force = false; | |
191 | if (argv.size() == 2) { | |
192 | if (std::string(argv[1]) == "--force") { | |
193 | force = true; | |
194 | } else { | |
195 | std::cerr << "Unknown argument " << argv[1] << std::endl; | |
196 | usage(); | |
197 | return -EINVAL; | |
198 | } | |
199 | } else if (argv.size() > 2) { | |
200 | std::cerr << "Too many arguments!" << std::endl; | |
201 | usage(); | |
202 | return -EINVAL; | |
203 | } | |
204 | return journal_reset(force); | |
205 | } else { | |
206 | derr << "Bad journal command '" << command << "'" << dendl; | |
207 | return -EINVAL; | |
208 | } | |
209 | } | |
210 | ||
211 | ||
212 | /** | |
213 | * Parse arguments and execute for 'header' mode | |
214 | * | |
215 | * This is for operations that act on the header only. | |
216 | */ | |
217 | int JournalTool::main_header(std::vector<const char*> &argv) | |
218 | { | |
219 | JournalFilter filter; | |
220 | JournalScanner js(input, rank, filter); | |
221 | int r = js.scan(false); | |
222 | if (r < 0) { | |
223 | std::cerr << "Unable to scan journal" << std::endl; | |
224 | return r; | |
225 | } | |
226 | ||
227 | if (!js.header_present) { | |
228 | std::cerr << "Header object not found!" << std::endl; | |
229 | return -ENOENT; | |
230 | } else if (!js.header_valid && js.header == NULL) { | |
231 | // Can't do a read or a single-field write without a copy of the original | |
232 | derr << "Header could not be read!" << dendl; | |
233 | return -ENOENT; | |
234 | } else { | |
235 | assert(js.header != NULL); | |
236 | } | |
237 | ||
238 | if (argv.size() == 0) { | |
239 | derr << "Invalid header command, must be [get|set]" << dendl; | |
240 | return -EINVAL; | |
241 | } | |
242 | std::vector<const char *>::iterator arg = argv.begin(); | |
243 | std::string const command = *arg; | |
244 | arg = argv.erase(arg); | |
245 | ||
246 | if (command == std::string("get")) { | |
247 | // Write JSON journal dump to stdout | |
248 | JSONFormatter jf(true); | |
249 | js.header->dump(&jf); | |
250 | jf.flush(std::cout); | |
251 | std::cout << std::endl; | |
252 | } else if (command == std::string("set")) { | |
253 | // Need two more args <key> <val> | |
254 | if (argv.size() != 2) { | |
255 | derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl; | |
256 | return -EINVAL; | |
257 | } | |
258 | ||
259 | std::string const field_name = *arg; | |
260 | arg = argv.erase(arg); | |
261 | ||
262 | std::string const value_str = *arg; | |
263 | arg = argv.erase(arg); | |
264 | assert(argv.empty()); | |
265 | ||
266 | std::string parse_err; | |
267 | uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err); | |
268 | if (!parse_err.empty()) { | |
269 | derr << "Invalid value '" << value_str << "': " << parse_err << dendl; | |
270 | return -EINVAL; | |
271 | } | |
272 | ||
273 | uint64_t *field = NULL; | |
274 | if (field_name == "trimmed_pos") { | |
275 | field = &(js.header->trimmed_pos); | |
276 | } else if (field_name == "expire_pos") { | |
277 | field = &(js.header->expire_pos); | |
278 | } else if (field_name == "write_pos") { | |
279 | field = &(js.header->write_pos); | |
280 | } else { | |
281 | derr << "Invalid field '" << field_name << "'" << dendl; | |
282 | return -EINVAL; | |
283 | } | |
284 | ||
285 | std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl; | |
286 | *field = new_val; | |
287 | ||
288 | dout(4) << "Writing object..." << dendl; | |
289 | bufferlist header_bl; | |
290 | ::encode(*(js.header), header_bl); | |
291 | output.write_full(js.obj_name(0), header_bl); | |
292 | dout(4) << "Write complete." << dendl; | |
293 | std::cout << "Successfully updated header." << std::endl; | |
294 | } else { | |
295 | derr << "Bad header command '" << command << "'" << dendl; | |
296 | return -EINVAL; | |
297 | } | |
298 | ||
299 | return 0; | |
300 | } | |
301 | ||
302 | ||
303 | /** | |
304 | * Parse arguments and execute for 'event' mode | |
305 | * | |
306 | * This is for operations that act on LogEvents within the log | |
307 | */ | |
308 | int JournalTool::main_event(std::vector<const char*> &argv) | |
309 | { | |
310 | int r; | |
311 | ||
312 | std::vector<const char*>::iterator arg = argv.begin(); | |
313 | ||
314 | std::string command = *(arg++); | |
31f18b77 | 315 | if (command != "get" && command != "splice" && command != "recover_dentries") { |
7c673cae FG |
316 | derr << "Unknown argument '" << command << "'" << dendl; |
317 | usage(); | |
318 | return -EINVAL; | |
319 | } | |
320 | ||
321 | if (arg == argv.end()) { | |
322 | derr << "Incomplete command line" << dendl; | |
323 | usage(); | |
324 | return -EINVAL; | |
325 | } | |
326 | ||
327 | // Parse filter options | |
328 | // ==================== | |
329 | JournalFilter filter; | |
330 | r = filter.parse_args(argv, arg); | |
331 | if (r) { | |
332 | return r; | |
333 | } | |
334 | ||
335 | // Parse output options | |
336 | // ==================== | |
337 | if (arg == argv.end()) { | |
338 | derr << "Missing output command" << dendl; | |
339 | usage(); | |
340 | } | |
341 | std::string output_style = *(arg++); | |
342 | if (output_style != "binary" && output_style != "json" && | |
343 | output_style != "summary" && output_style != "list") { | |
344 | derr << "Unknown argument: '" << output_style << "'" << dendl; | |
345 | usage(); | |
346 | return -EINVAL; | |
347 | } | |
348 | ||
349 | std::string output_path = "dump"; | |
350 | while(arg != argv.end()) { | |
351 | std::string arg_str; | |
352 | if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { | |
353 | output_path = arg_str; | |
354 | } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", | |
355 | nullptr)) { | |
356 | dout(1) << "Using alternate pool " << arg_str << dendl; | |
357 | int r = rados.ioctx_create(arg_str.c_str(), output); | |
358 | assert(r == 0); | |
359 | other_pool = true; | |
360 | } else { | |
361 | derr << "Unknown argument: '" << *arg << "'" << dendl; | |
362 | usage(); | |
363 | return -EINVAL; | |
364 | } | |
365 | } | |
366 | ||
367 | // Execute command | |
368 | // =============== | |
369 | JournalScanner js(input, rank, filter); | |
370 | if (command == "get") { | |
371 | r = js.scan(); | |
372 | if (r) { | |
373 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
374 | return r; | |
375 | } | |
7c673cae FG |
376 | } else if (command == "recover_dentries") { |
377 | r = js.scan(); | |
378 | if (r) { | |
379 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
380 | return r; | |
381 | } | |
382 | ||
383 | bool dry_run = false; | |
384 | if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) { | |
385 | dry_run = true; | |
386 | } | |
387 | ||
388 | /** | |
389 | * Iterate over log entries, attempting to scavenge from each one | |
390 | */ | |
391 | std::set<inodeno_t> consumed_inos; | |
392 | for (JournalScanner::EventMap::iterator i = js.events.begin(); | |
393 | i != js.events.end(); ++i) { | |
394 | LogEvent *le = i->second.log_event; | |
395 | EMetaBlob const *mb = le->get_metablob(); | |
396 | if (mb) { | |
31f18b77 | 397 | int scav_r = recover_dentries(*mb, dry_run, &consumed_inos); |
7c673cae FG |
398 | if (scav_r) { |
399 | dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec | |
400 | << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl; | |
401 | if (r == 0) { | |
402 | r = scav_r; | |
403 | } | |
404 | // Our goal is to read all we can, so don't stop on errors, but | |
405 | // do record them for possible later output | |
406 | js.errors.insert(std::make_pair(i->first, | |
407 | JournalScanner::EventError(scav_r, cpp_strerror(r)))); | |
408 | } | |
409 | } | |
410 | } | |
411 | ||
412 | /** | |
413 | * Update InoTable to reflect any inode numbers consumed during scavenge | |
414 | */ | |
415 | dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl; | |
416 | if (consumed_inos.size() && !dry_run) { | |
417 | int consume_r = consume_inos(consumed_inos); | |
418 | if (consume_r) { | |
419 | dout(1) << "Error updating InoTable for " << consumed_inos.size() | |
420 | << " consume inos: " << cpp_strerror(consume_r) << dendl; | |
421 | if (r == 0) { | |
422 | r = consume_r; | |
423 | } | |
424 | } | |
425 | } | |
426 | ||
427 | // Remove consumed dentries from lost+found. | |
428 | if (other_pool && !dry_run) { | |
429 | std::set<std::string> found; | |
430 | ||
431 | for (auto i : consumed_inos) { | |
432 | char s[20]; | |
433 | ||
434 | snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); | |
435 | dout(20) << "removing " << s << dendl; | |
436 | found.insert(std::string(s)); | |
437 | } | |
438 | ||
439 | object_t frag_oid; | |
440 | frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, | |
441 | frag_t(), ""); | |
442 | output.omap_rm_keys(frag_oid.name, found); | |
443 | } | |
444 | } else if (command == "splice") { | |
445 | r = js.scan(); | |
446 | if (r) { | |
447 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
448 | return r; | |
449 | } | |
450 | ||
451 | uint64_t start, end; | |
452 | if (filter.get_range(start, end)) { | |
453 | // Special case for range filter: erase a numeric range in the log | |
454 | uint64_t range = end - start; | |
455 | int r = erase_region(js, start, range); | |
456 | if (r) { | |
457 | derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec | |
458 | << ": " << cpp_strerror(r) << dendl; | |
459 | return r; | |
460 | } | |
461 | } else { | |
462 | // General case: erase a collection of individual entries in the log | |
463 | for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) { | |
464 | dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl; | |
465 | ||
466 | int r = erase_region(js, i->first, i->second.raw_size); | |
467 | if (r) { | |
468 | derr << "Failed to erase event 0x" << std::hex << i->first << std::dec | |
469 | << ": " << cpp_strerror(r) << dendl; | |
470 | return r; | |
471 | } | |
472 | } | |
473 | } | |
474 | ||
475 | ||
476 | } else { | |
477 | derr << "Unknown argument '" << command << "'" << dendl; | |
478 | usage(); | |
479 | return -EINVAL; | |
480 | } | |
481 | ||
482 | // Generate output | |
483 | // =============== | |
484 | EventOutput output(js, output_path); | |
485 | int output_result = 0; | |
486 | if (output_style == "binary") { | |
487 | output_result = output.binary(); | |
488 | } else if (output_style == "json") { | |
489 | output_result = output.json(); | |
490 | } else if (output_style == "summary") { | |
491 | output.summary(); | |
492 | } else if (output_style == "list") { | |
493 | output.list(); | |
494 | } else { | |
495 | std::cerr << "Bad output command '" << output_style << "'" << std::endl; | |
496 | return -EINVAL; | |
497 | } | |
498 | ||
499 | if (output_result != 0) { | |
500 | std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl; | |
501 | } | |
502 | ||
503 | return output_result; | |
504 | } | |
505 | ||
506 | /** | |
507 | * Provide the user with information about the condition of the journal, | |
508 | * especially indicating what range of log events is available and where | |
509 | * any gaps or corruptions in the journal are. | |
510 | */ | |
511 | int JournalTool::journal_inspect() | |
512 | { | |
513 | int r; | |
514 | ||
515 | JournalFilter filter; | |
516 | JournalScanner js(input, rank, filter); | |
517 | r = js.scan(); | |
518 | if (r) { | |
519 | std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; | |
520 | return r; | |
521 | } | |
522 | ||
523 | js.report(std::cout); | |
524 | ||
525 | return 0; | |
526 | } | |
527 | ||
528 | ||
529 | /** | |
530 | * Attempt to export a binary dump of the journal. | |
531 | * | |
532 | * This is allowed to fail if the header is malformed or there are | |
533 | * objects inaccessible, in which case the user would have to fall | |
534 | * back to manually listing RADOS objects and extracting them, which | |
535 | * they can do with the ``rados`` CLI. | |
536 | */ | |
537 | int JournalTool::journal_export(std::string const &path, bool import) | |
538 | { | |
539 | int r = 0; | |
540 | JournalScanner js(input, rank); | |
541 | ||
542 | if (!import) { | |
543 | /* | |
544 | * If doing an export, first check that the header is valid and | |
545 | * no objects are missing before trying to dump | |
546 | */ | |
547 | r = js.scan(); | |
548 | if (r < 0) { | |
549 | derr << "Unable to scan journal, assuming badly damaged" << dendl; | |
550 | return r; | |
551 | } | |
552 | if (!js.is_readable()) { | |
553 | derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl; | |
554 | return -EIO; | |
555 | } | |
556 | } | |
557 | ||
558 | /* | |
559 | * Assuming we can cleanly read the journal data, dump it out to a file | |
560 | */ | |
561 | { | |
562 | Dumper dumper; | |
563 | r = dumper.init(mds_role_t(role_selector.get_ns(), rank)); | |
564 | if (r < 0) { | |
565 | derr << "dumper::init failed: " << cpp_strerror(r) << dendl; | |
566 | return r; | |
567 | } | |
568 | if (import) { | |
569 | r = dumper.undump(path.c_str()); | |
570 | } else { | |
571 | r = dumper.dump(path.c_str()); | |
572 | } | |
573 | dumper.shutdown(); | |
574 | } | |
575 | ||
576 | return r; | |
577 | } | |
578 | ||
579 | ||
580 | /** | |
581 | * Truncate journal and insert EResetJournal | |
582 | */ | |
583 | int JournalTool::journal_reset(bool hard) | |
584 | { | |
585 | int r = 0; | |
586 | Resetter resetter; | |
587 | r = resetter.init(); | |
588 | if (r < 0) { | |
589 | derr << "resetter::init failed: " << cpp_strerror(r) << dendl; | |
590 | return r; | |
591 | } | |
592 | ||
593 | if (hard) { | |
594 | r = resetter.reset_hard(mds_role_t(role_selector.get_ns(), rank)); | |
595 | } else { | |
596 | r = resetter.reset(mds_role_t(role_selector.get_ns(), rank)); | |
597 | } | |
598 | resetter.shutdown(); | |
599 | ||
600 | return r; | |
601 | } | |
602 | ||
603 | ||
604 | /** | |
605 | * Selective offline replay which only reads out dentries and writes | |
606 | * them to the backing store iff their version is > what is currently | |
607 | * in the backing store. | |
608 | * | |
609 | * In order to write dentries to the backing store, we may create the | |
610 | * required enclosing dirfrag objects. | |
611 | * | |
612 | * Test this by running scavenge on an unflushed journal, then nuking | |
613 | * it offline, then starting an MDS and seeing that the dentries are | |
614 | * visible. | |
615 | * | |
616 | * @param metablob an EMetaBlob retrieved from the journal | |
617 | * @param dry_run if true, do no writes to RADOS | |
618 | * @param consumed_inos output, populated with any inos inserted | |
619 | * @returns 0 on success, else negative error code | |
620 | */ | |
31f18b77 | 621 | int JournalTool::recover_dentries( |
7c673cae FG |
622 | EMetaBlob const &metablob, |
623 | bool const dry_run, | |
624 | std::set<inodeno_t> *consumed_inos) | |
625 | { | |
626 | assert(consumed_inos != NULL); | |
627 | ||
628 | int r = 0; | |
629 | ||
630 | // Replay fullbits (dentry+inode) | |
631 | for (list<dirfrag_t>::const_iterator lp = metablob.lump_order.begin(); | |
632 | lp != metablob.lump_order.end(); ++lp) | |
633 | { | |
634 | dirfrag_t const &frag = *lp; | |
635 | EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second; | |
636 | lump._decode_bits(); | |
637 | object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, ""); | |
638 | ||
639 | dout(4) << "inspecting lump " << frag_oid.name << dendl; | |
640 | ||
641 | ||
642 | // We will record old fnode version for use in hard link handling | |
643 | // If we don't read an old fnode, take version as zero and write in | |
644 | // all hardlinks we find. | |
645 | version_t old_fnode_version = 0; | |
646 | ||
647 | // Update fnode in omap header of dirfrag object | |
648 | bool write_fnode = false; | |
649 | bufferlist old_fnode_bl; | |
650 | r = input.omap_get_header(frag_oid.name, &old_fnode_bl); | |
651 | if (r == -ENOENT) { | |
652 | // Creating dirfrag from scratch | |
653 | dout(4) << "failed to read OMAP header from directory fragment " | |
654 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
655 | write_fnode = true; | |
656 | // Note: creating the dirfrag *without* a backtrace, relying on | |
657 | // MDS to regenerate backtraces on read or in FSCK | |
658 | } else if (r == 0) { | |
659 | // Conditionally update existing omap header | |
660 | fnode_t old_fnode; | |
661 | bufferlist::iterator old_fnode_iter = old_fnode_bl.begin(); | |
662 | try { | |
663 | old_fnode.decode(old_fnode_iter); | |
664 | dout(4) << "frag " << frag_oid.name << " fnode old v" << | |
665 | old_fnode.version << " vs new v" << lump.fnode.version << dendl; | |
666 | old_fnode_version = old_fnode.version; | |
667 | write_fnode = old_fnode_version < lump.fnode.version; | |
668 | } catch (const buffer::error &err) { | |
669 | dout(1) << "frag " << frag_oid.name | |
670 | << " is corrupt, overwriting" << dendl; | |
671 | write_fnode = true; | |
672 | } | |
673 | } else { | |
674 | // Unexpected error | |
675 | dout(4) << "failed to read OMAP header from directory fragment " | |
676 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
677 | return r; | |
678 | } | |
679 | ||
680 | if ((other_pool || write_fnode) && !dry_run) { | |
681 | dout(4) << "writing fnode to omap header" << dendl; | |
682 | bufferlist fnode_bl; | |
683 | lump.fnode.encode(fnode_bl); | |
684 | if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { | |
685 | r = output.omap_set_header(frag_oid.name, fnode_bl); | |
686 | } | |
687 | if (r != 0) { | |
688 | derr << "Failed to write fnode for frag object " | |
689 | << frag_oid.name << dendl; | |
690 | return r; | |
691 | } | |
692 | } | |
693 | ||
694 | std::set<std::string> read_keys; | |
695 | ||
696 | // Compose list of potentially-existing dentries we would like to fetch | |
697 | list<ceph::shared_ptr<EMetaBlob::fullbit> > const &fb_list = | |
698 | lump.get_dfull(); | |
699 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi = | |
700 | fb_list.begin(); fbi != fb_list.end(); ++fbi) { | |
701 | EMetaBlob::fullbit const &fb = *(*fbi); | |
702 | ||
703 | // Get a key like "foobar_head" | |
704 | std::string key; | |
705 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
706 | dn_key.encode(key); | |
707 | read_keys.insert(key); | |
708 | } | |
709 | ||
710 | list<EMetaBlob::remotebit> const &rb_list = | |
711 | lump.get_dremote(); | |
712 | for (list<EMetaBlob::remotebit>::const_iterator rbi = | |
713 | rb_list.begin(); rbi != rb_list.end(); ++rbi) { | |
714 | EMetaBlob::remotebit const &rb = *rbi; | |
715 | ||
716 | // Get a key like "foobar_head" | |
717 | std::string key; | |
718 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
719 | dn_key.encode(key); | |
720 | read_keys.insert(key); | |
721 | } | |
722 | ||
723 | // Perform bulk read of existing dentries | |
724 | std::map<std::string, bufferlist> read_vals; | |
725 | r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
726 | if (r == -ENOENT && other_pool) { | |
727 | r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
728 | } | |
729 | if (r != 0) { | |
730 | derr << "unexpected error reading fragment object " | |
731 | << frag_oid.name << ": " << cpp_strerror(r) << dendl; | |
732 | return r; | |
733 | } | |
734 | ||
735 | // Compose list of dentries we will write back | |
736 | std::map<std::string, bufferlist> write_vals; | |
737 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi = | |
738 | fb_list.begin(); fbi != fb_list.end(); ++fbi) { | |
739 | EMetaBlob::fullbit const &fb = *(*fbi); | |
740 | ||
741 | // Get a key like "foobar_head" | |
742 | std::string key; | |
743 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
744 | dn_key.encode(key); | |
745 | ||
746 | dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn | |
747 | << dendl; | |
748 | bool write_dentry = false; | |
749 | if (read_vals.find(key) == read_vals.end()) { | |
750 | dout(4) << "dentry did not already exist, will create" << dendl; | |
751 | write_dentry = true; | |
752 | } else { | |
753 | dout(4) << "dentry " << key << " existed already" << dendl; | |
754 | dout(4) << "dentry exists, checking versions..." << dendl; | |
755 | bufferlist &old_dentry = read_vals[key]; | |
756 | // Decode dentry+inode | |
757 | bufferlist::iterator q = old_dentry.begin(); | |
758 | ||
759 | snapid_t dnfirst; | |
760 | ::decode(dnfirst, q); | |
761 | char dentry_type; | |
762 | ::decode(dentry_type, q); | |
763 | ||
764 | if (dentry_type == 'L') { | |
765 | // leave write_dentry false, we have no version to | |
766 | // compare with in a hardlink, so it's not safe to | |
767 | // squash over it with what's in this fullbit | |
768 | dout(10) << "Existing remote inode in slot to be (maybe) written " | |
769 | << "by a full inode from the journal dn '" << fb.dn.c_str() | |
770 | << "' with lump fnode version " << lump.fnode.version | |
771 | << "vs existing fnode version " << old_fnode_version << dendl; | |
772 | write_dentry = old_fnode_version < lump.fnode.version; | |
773 | } else if (dentry_type == 'I') { | |
774 | // Read out inode version to compare with backing store | |
775 | InodeStore inode; | |
776 | inode.decode_bare(q); | |
777 | dout(4) << "decoded embedded inode version " | |
778 | << inode.inode.version << " vs fullbit version " | |
779 | << fb.inode.version << dendl; | |
780 | if (inode.inode.version < fb.inode.version) { | |
781 | write_dentry = true; | |
782 | } | |
783 | } else { | |
784 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
785 | "journal" << dendl; | |
786 | write_dentry = true; | |
787 | } | |
788 | } | |
789 | ||
790 | if ((other_pool || write_dentry) && !dry_run) { | |
791 | dout(4) << "writing I dentry " << key << " into frag " | |
792 | << frag_oid.name << dendl; | |
793 | ||
794 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
795 | bufferlist dentry_bl; | |
796 | ::encode(fb.dnfirst, dentry_bl); | |
797 | ::encode('I', dentry_bl); | |
798 | encode_fullbit_as_inode(fb, true, &dentry_bl); | |
799 | ||
800 | // Record for writing to RADOS | |
801 | write_vals[key] = dentry_bl; | |
802 | consumed_inos->insert(fb.inode.ino); | |
803 | } | |
804 | } | |
805 | ||
806 | for (list<EMetaBlob::remotebit>::const_iterator rbi = | |
807 | rb_list.begin(); rbi != rb_list.end(); ++rbi) { | |
808 | EMetaBlob::remotebit const &rb = *rbi; | |
809 | ||
810 | // Get a key like "foobar_head" | |
811 | std::string key; | |
812 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
813 | dn_key.encode(key); | |
814 | ||
815 | dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn | |
816 | << dendl; | |
817 | bool write_dentry = false; | |
818 | if (read_vals.find(key) == read_vals.end()) { | |
819 | dout(4) << "dentry did not already exist, will create" << dendl; | |
820 | write_dentry = true; | |
821 | } else { | |
822 | dout(4) << "dentry " << key << " existed already" << dendl; | |
823 | dout(4) << "dentry exists, checking versions..." << dendl; | |
824 | bufferlist &old_dentry = read_vals[key]; | |
825 | // Decode dentry+inode | |
826 | bufferlist::iterator q = old_dentry.begin(); | |
827 | ||
828 | snapid_t dnfirst; | |
829 | ::decode(dnfirst, q); | |
830 | char dentry_type; | |
831 | ::decode(dentry_type, q); | |
832 | ||
833 | if (dentry_type == 'L') { | |
834 | dout(10) << "Existing hardlink inode in slot to be (maybe) written " | |
835 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
836 | << "' with lump fnode version " << lump.fnode.version | |
837 | << "vs existing fnode version " << old_fnode_version << dendl; | |
838 | write_dentry = old_fnode_version < lump.fnode.version; | |
839 | } else if (dentry_type == 'I') { | |
840 | dout(10) << "Existing full inode in slot to be (maybe) written " | |
841 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
842 | << "' with lump fnode version " << lump.fnode.version | |
843 | << "vs existing fnode version " << old_fnode_version << dendl; | |
844 | write_dentry = old_fnode_version < lump.fnode.version; | |
845 | } else { | |
846 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
847 | "journal" << dendl; | |
848 | write_dentry = true; | |
849 | } | |
850 | } | |
851 | ||
852 | if ((other_pool || write_dentry) && !dry_run) { | |
853 | dout(4) << "writing L dentry " << key << " into frag " | |
854 | << frag_oid.name << dendl; | |
855 | ||
856 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
857 | bufferlist dentry_bl; | |
858 | ::encode(rb.dnfirst, dentry_bl); | |
859 | ::encode('L', dentry_bl); | |
860 | ::encode(rb.ino, dentry_bl); | |
861 | ::encode(rb.d_type, dentry_bl); | |
862 | ||
863 | // Record for writing to RADOS | |
864 | write_vals[key] = dentry_bl; | |
865 | consumed_inos->insert(rb.ino); | |
866 | } | |
867 | } | |
868 | ||
869 | // Write back any new/changed dentries | |
870 | if (!write_vals.empty()) { | |
871 | r = output.omap_set(frag_oid.name, write_vals); | |
872 | if (r != 0) { | |
873 | derr << "error writing dentries to " << frag_oid.name | |
874 | << ": " << cpp_strerror(r) << dendl; | |
875 | return r; | |
876 | } | |
877 | } | |
878 | } | |
879 | ||
880 | /* Now that we've looked at the dirlumps, we finally pay attention to | |
881 | * the roots (i.e. inodes without ancestry). This is necessary in order | |
882 | * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally | |
883 | * important because clients use them to infer completeness | |
884 | * of directories | |
885 | */ | |
886 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator p = | |
887 | metablob.roots.begin(); p != metablob.roots.end(); ++p) { | |
888 | EMetaBlob::fullbit const &fb = *(*p); | |
889 | inodeno_t ino = fb.inode.ino; | |
890 | dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl; | |
891 | ||
892 | object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); | |
893 | dout(4) << "object id " << root_oid.name << dendl; | |
894 | ||
895 | bool write_root_ino = false; | |
896 | bufferlist old_root_ino_bl; | |
897 | r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); | |
898 | if (r == -ENOENT) { | |
899 | dout(4) << "root does not exist, will create" << dendl; | |
900 | write_root_ino = true; | |
901 | } else if (r >= 0) { | |
902 | r = 0; | |
903 | InodeStore old_inode; | |
904 | dout(4) << "root exists, will modify (" << old_root_ino_bl.length() | |
905 | << ")" << dendl; | |
906 | bufferlist::iterator inode_bl_iter = old_root_ino_bl.begin(); | |
907 | std::string magic; | |
908 | ::decode(magic, inode_bl_iter); | |
909 | if (magic == CEPH_FS_ONDISK_MAGIC) { | |
910 | dout(4) << "magic ok" << dendl; | |
911 | old_inode.decode(inode_bl_iter); | |
912 | ||
913 | if (old_inode.inode.version < fb.inode.version) { | |
914 | write_root_ino = true; | |
915 | } | |
916 | } else { | |
917 | dout(4) << "magic bad: '" << magic << "'" << dendl; | |
918 | write_root_ino = true; | |
919 | } | |
920 | } else { | |
921 | derr << "error reading root inode object " << root_oid.name | |
922 | << ": " << cpp_strerror(r) << dendl; | |
923 | return r; | |
924 | } | |
925 | ||
926 | if (write_root_ino && !dry_run) { | |
927 | dout(4) << "writing root ino " << root_oid.name | |
928 | << " version " << fb.inode.version << dendl; | |
929 | ||
930 | // Compose: root ino format is magic,InodeStore(bare=false) | |
931 | bufferlist new_root_ino_bl; | |
932 | ::encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl); | |
933 | encode_fullbit_as_inode(fb, false, &new_root_ino_bl); | |
934 | ||
935 | // Write to RADOS | |
936 | r = output.write_full(root_oid.name, new_root_ino_bl); | |
937 | if (r != 0) { | |
938 | derr << "error writing inode object " << root_oid.name | |
939 | << ": " << cpp_strerror(r) << dendl; | |
940 | return r; | |
941 | } | |
942 | } | |
943 | } | |
944 | ||
945 | return r; | |
946 | } | |
947 | ||
948 | ||
7c673cae FG |
949 | /** |
950 | * Erase a region of the log by overwriting it with ENoOp | |
951 | * | |
952 | */ | |
953 | int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length) | |
954 | { | |
955 | // To erase this region, we use our preamble, the encoding overhead | |
956 | // of an ENoOp, and our trailing start ptr. Calculate how much padding | |
957 | // is needed inside the ENoOp to make up the difference. | |
958 | bufferlist tmp; | |
959 | ENoOp enoop(0); | |
960 | enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
961 | ||
962 | dout(4) << "erase_region " << pos << " len=" << length << dendl; | |
963 | ||
964 | // FIXME: get the preamble/postamble length via JournalStream | |
965 | int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t); | |
966 | dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl; | |
967 | ||
968 | if (padding < 0) { | |
969 | derr << "Erase region " << length << " too short" << dendl; | |
970 | return -EINVAL; | |
971 | } | |
972 | ||
973 | // Serialize an ENoOp with the correct amount of padding | |
974 | enoop = ENoOp(padding); | |
975 | bufferlist entry; | |
976 | enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
977 | JournalStream stream(JOURNAL_FORMAT_RESILIENT); | |
978 | ||
979 | // Serialize region of log stream | |
980 | bufferlist log_data; | |
981 | stream.write(entry, &log_data, pos); | |
982 | ||
983 | dout(4) << "erase_region data length " << log_data.length() << dendl; | |
984 | assert(log_data.length() == length); | |
985 | ||
986 | // Write log stream region to RADOS | |
987 | // FIXME: get object size somewhere common to scan_events | |
988 | uint32_t object_size = g_conf->mds_log_segment_size; | |
989 | if (object_size == 0) { | |
990 | // Default layout object size | |
991 | object_size = file_layout_t::get_default().object_size; | |
992 | } | |
993 | ||
994 | uint64_t write_offset = pos; | |
995 | uint64_t obj_offset = (pos / object_size); | |
996 | int r = 0; | |
997 | while(log_data.length()) { | |
998 | std::string const oid = js.obj_name(obj_offset); | |
999 | uint32_t offset_in_obj = write_offset % object_size; | |
1000 | uint32_t write_len = min(log_data.length(), object_size - offset_in_obj); | |
1001 | ||
1002 | r = output.write(oid, log_data, write_len, offset_in_obj); | |
1003 | if (r < 0) { | |
1004 | return r; | |
1005 | } else { | |
1006 | dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl; | |
1007 | r = 0; | |
1008 | } | |
1009 | ||
1010 | log_data.splice(0, write_len); | |
1011 | write_offset += write_len; | |
1012 | obj_offset++; | |
1013 | } | |
1014 | ||
1015 | return r; | |
1016 | } | |
1017 | ||
1018 | /** | |
1019 | * Given an EMetaBlob::fullbit containing an inode, write out | |
1020 | * the encoded inode in the format used by InodeStore (i.e. the | |
1021 | * backing store format) | |
1022 | * | |
1023 | * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use | |
1024 | * on an offline InodeStore instance. It's way simpler, because we are just | |
1025 | * uncritically hauling the data between structs. | |
1026 | * | |
1027 | * @param fb a fullbit extracted from a journal entry | |
1028 | * @param bare if true, leave out [EN|DE]CODE_START decoration | |
1029 | * @param out_bl output, write serialized inode to this bufferlist | |
1030 | */ | |
1031 | void JournalTool::encode_fullbit_as_inode( | |
1032 | const EMetaBlob::fullbit &fb, | |
1033 | const bool bare, | |
1034 | bufferlist *out_bl) | |
1035 | { | |
1036 | assert(out_bl != NULL); | |
1037 | ||
1038 | // Compose InodeStore | |
1039 | InodeStore new_inode; | |
1040 | new_inode.inode = fb.inode; | |
1041 | new_inode.xattrs = fb.xattrs; | |
1042 | new_inode.dirfragtree = fb.dirfragtree; | |
1043 | new_inode.snap_blob = fb.snapbl; | |
1044 | new_inode.symlink = fb.symlink; | |
1045 | new_inode.old_inodes = fb.old_inodes; | |
1046 | ||
1047 | // Serialize InodeStore | |
1048 | if (bare) { | |
1049 | new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1050 | } else { | |
1051 | new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1052 | } | |
1053 | } | |
1054 | ||
1055 | /** | |
1056 | * Given a list of inode numbers known to be in use by | |
1057 | * inodes in the backing store, ensure that none of these | |
1058 | * numbers are listed as free in the InoTables in the | |
1059 | * backing store. | |
1060 | * | |
1061 | * Used after injecting inodes into the backing store, to | |
1062 | * ensure that the same inode numbers are not subsequently | |
1063 | * used for new files during ordinary operation. | |
1064 | * | |
1065 | * @param inos list of inode numbers to be removed from | |
1066 | * free lists in InoTables | |
1067 | * @returns 0 on success, else negative error code | |
1068 | */ | |
1069 | int JournalTool::consume_inos(const std::set<inodeno_t> &inos) | |
1070 | { | |
1071 | int r = 0; | |
1072 | ||
1073 | // InoTable is a per-MDS structure, so iterate over assigned ranks | |
1074 | auto fs = fsmap->get_filesystem(role_selector.get_ns()); | |
1075 | std::set<mds_rank_t> in_ranks; | |
1076 | fs->mds_map.get_mds_set(in_ranks); | |
1077 | ||
1078 | for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin(); | |
1079 | rank_i != in_ranks.end(); ++rank_i) | |
1080 | { | |
1081 | // Compose object name | |
1082 | std::ostringstream oss; | |
1083 | oss << "mds" << *rank_i << "_inotable"; | |
1084 | object_t inotable_oid = object_t(oss.str()); | |
1085 | ||
1086 | // Read object | |
1087 | bufferlist inotable_bl; | |
1088 | int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); | |
1089 | if (read_r < 0) { | |
1090 | // Things are really bad if we can't read inotable. Beyond our powers. | |
1091 | derr << "unable to read inotable '" << inotable_oid.name << "': " | |
1092 | << cpp_strerror(read_r) << dendl; | |
1093 | r = r ? r : read_r; | |
1094 | continue; | |
1095 | } | |
1096 | ||
1097 | // Deserialize InoTable | |
1098 | version_t inotable_ver; | |
1099 | bufferlist::iterator q = inotable_bl.begin(); | |
1100 | ::decode(inotable_ver, q); | |
1101 | InoTable ino_table(NULL); | |
1102 | ino_table.decode(q); | |
1103 | ||
1104 | // Update InoTable in memory | |
1105 | bool inotable_modified = false; | |
1106 | for (std::set<inodeno_t>::iterator i = inos.begin(); | |
1107 | i != inos.end(); ++i) | |
1108 | { | |
1109 | const inodeno_t ino = *i; | |
1110 | if (ino_table.force_consume(ino)) { | |
1111 | dout(4) << "Used ino 0x" << std::hex << ino << std::dec | |
1112 | << " requires inotable update" << dendl; | |
1113 | inotable_modified = true; | |
1114 | } | |
1115 | } | |
1116 | ||
1117 | // Serialize and write InoTable | |
1118 | if (inotable_modified) { | |
1119 | inotable_ver += 1; | |
1120 | dout(4) << "writing modified inotable version " << inotable_ver << dendl; | |
1121 | bufferlist inotable_new_bl; | |
1122 | ::encode(inotable_ver, inotable_new_bl); | |
1123 | ino_table.encode_state(inotable_new_bl); | |
1124 | int write_r = output.write_full(inotable_oid.name, inotable_new_bl); | |
1125 | if (write_r != 0) { | |
1126 | derr << "error writing modified inotable " << inotable_oid.name | |
1127 | << ": " << cpp_strerror(write_r) << dendl; | |
1128 | r = r ? r : read_r; | |
1129 | continue; | |
1130 | } | |
1131 | } | |
1132 | } | |
1133 | ||
1134 | return r; | |
1135 | } | |
1136 |