]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * ceph - scalable distributed file system | |
5 | * | |
6 | * copyright (c) 2014 john spray <john.spray@inktank.com> | |
7 | * | |
8 | * this is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the gnu lesser general public | |
10 | * license version 2.1, as published by the free software | |
11 | * foundation. see file copying. | |
12 | */ | |
13 | ||
14 | ||
15 | #include <sstream> | |
16 | ||
17 | #include "common/ceph_argparse.h" | |
18 | #include "common/errno.h" | |
19 | #include "osdc/Journaler.h" | |
20 | #include "mds/mdstypes.h" | |
21 | #include "mds/LogEvent.h" | |
22 | #include "mds/InoTable.h" | |
23 | ||
24 | #include "mds/events/ENoOp.h" | |
25 | #include "mds/events/EUpdate.h" | |
26 | ||
27 | #include "JournalScanner.h" | |
28 | #include "EventOutput.h" | |
29 | #include "Dumper.h" | |
30 | #include "Resetter.h" | |
31 | ||
32 | #include "JournalTool.h" | |
33 | ||
34 | ||
35 | #define dout_context g_ceph_context | |
36 | #define dout_subsys ceph_subsys_mds | |
37 | #undef dout_prefix | |
38 | #define dout_prefix *_dout << __func__ << ": " | |
39 | ||
20effc67 | 40 | using namespace std; |
7c673cae FG |
41 | |
42 | void JournalTool::usage() | |
43 | { | |
44 | std::cout << "Usage: \n" | |
45 | << " cephfs-journal-tool [options] journal <command>\n" | |
46 | << " <command>:\n" | |
47 | << " inspect\n" | |
91327a77 | 48 | << " import <path> [--force]\n" |
7c673cae FG |
49 | << " export <path>\n" |
50 | << " reset [--force]\n" | |
92f5a8d4 TL |
51 | << " cephfs-journal-tool [options] header <get|set> <field> <value>\n" |
52 | << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]\n" | |
31f18b77 | 53 | << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n" |
7c673cae FG |
54 | << " <selector>:\n" |
55 | << " --range=<start>..<end>\n" | |
56 | << " --path=<substring>\n" | |
57 | << " --inode=<integer>\n" | |
58 | << " --type=<UPDATE|OPEN|SESSION...><\n" | |
59 | << " --frag=<ino>.<frag> [--dname=<dentry string>]\n" | |
7c673cae | 60 | << " --client=<session id integer>\n" |
31f18b77 | 61 | << " <effect>: [get|recover_dentries|splice]\n" |
7c673cae FG |
62 | << " <output>: [summary|list|binary|json] [--path <path>]\n" |
63 | << "\n" | |
31f18b77 | 64 | << "General options:\n" |
f64942e4 | 65 | << " --rank=filesystem:mds-rank|all Journal rank (mandatory)\n" |
11fdf7f2 TL |
66 | << " --journal=<mdlog|purge_queue> Journal type (purge_queue means\n" |
67 | << " this journal is used to queue for purge operation,\n" | |
92f5a8d4 | 68 | << " default is mdlog, and only mdlog support event mode)\n" |
31f18b77 FG |
69 | << "\n" |
70 | << "Special options\n" | |
71 | << " --alternate-pool <name> Alternative metadata pool to target\n" | |
72 | << " when using recover_dentries.\n"; | |
7c673cae FG |
73 | |
74 | generic_client_usage(); | |
75 | } | |
76 | ||
77 | ||
78 | /** | |
79 | * Handle arguments and hand off to journal/header/event mode | |
80 | */ | |
81 | int JournalTool::main(std::vector<const char*> &argv) | |
82 | { | |
83 | int r; | |
84 | ||
85 | dout(10) << "JournalTool::main " << dendl; | |
86 | // Common arg parsing | |
87 | // ================== | |
88 | if (argv.empty()) { | |
11fdf7f2 | 89 | cerr << "missing positional argument" << std::endl; |
7c673cae FG |
90 | return -EINVAL; |
91 | } | |
92 | ||
93 | std::vector<const char*>::iterator arg = argv.begin(); | |
94 | ||
95 | std::string rank_str; | |
f64942e4 AA |
96 | if (!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) { |
97 | derr << "missing mandatory \"--rank\" argument" << dendl; | |
98 | return -EINVAL; | |
7c673cae FG |
99 | } |
100 | ||
11fdf7f2 TL |
101 | if (!ceph_argparse_witharg(argv, arg, &type, "--journal", (char*)NULL)) { |
102 | // Default is mdlog | |
103 | type = "mdlog"; | |
104 | } | |
105 | ||
106 | r = validate_type(type); | |
107 | if (r != 0) { | |
108 | derr << "journal type is not correct." << dendl; | |
109 | return r; | |
110 | } | |
111 | ||
f64942e4 | 112 | r = role_selector.parse(*fsmap, rank_str, false); |
7c673cae FG |
113 | if (r != 0) { |
114 | derr << "Couldn't determine MDS rank." << dendl; | |
115 | return r; | |
116 | } | |
117 | ||
118 | std::string mode; | |
119 | if (arg == argv.end()) { | |
120 | derr << "Missing mode [journal|header|event]" << dendl; | |
121 | return -EINVAL; | |
122 | } | |
123 | mode = std::string(*arg); | |
124 | arg = argv.erase(arg); | |
125 | ||
126 | // RADOS init | |
127 | // ========== | |
128 | r = rados.init_with_context(g_ceph_context); | |
129 | if (r < 0) { | |
130 | derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; | |
131 | return r; | |
132 | } | |
133 | ||
134 | dout(4) << "JournalTool: connecting to RADOS..." << dendl; | |
135 | r = rados.connect(); | |
136 | if (r < 0) { | |
137 | derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; | |
138 | return r; | |
139 | } | |
140 | ||
141 | auto fs = fsmap->get_filesystem(role_selector.get_ns()); | |
11fdf7f2 | 142 | ceph_assert(fs != nullptr); |
7c673cae FG |
143 | int64_t const pool_id = fs->mds_map.get_metadata_pool(); |
144 | dout(4) << "JournalTool: resolving pool " << pool_id << dendl; | |
145 | std::string pool_name; | |
146 | r = rados.pool_reverse_lookup(pool_id, &pool_name); | |
147 | if (r < 0) { | |
148 | derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl; | |
149 | return r; | |
150 | } | |
151 | ||
152 | dout(4) << "JournalTool: creating IoCtx.." << dendl; | |
153 | r = rados.ioctx_create(pool_name.c_str(), input); | |
11fdf7f2 | 154 | ceph_assert(r == 0); |
7c673cae FG |
155 | output.dup(input); |
156 | ||
157 | // Execution | |
158 | // ========= | |
11fdf7f2 TL |
159 | // journal and header are general journal mode |
160 | // event mode is only specific for mdlog | |
f64942e4 AA |
161 | auto roles = role_selector.get_roles(); |
162 | if (roles.size() > 1) { | |
163 | const std::string &command = argv[0]; | |
164 | bool allowed = can_execute_for_all_ranks(mode, command); | |
165 | if (!allowed) { | |
166 | derr << "operation not allowed for all ranks" << dendl; | |
167 | return -EINVAL; | |
168 | } | |
169 | ||
170 | all_ranks = true; | |
171 | } | |
172 | for (auto role : roles) { | |
7c673cae | 173 | rank = role.rank; |
f64942e4 | 174 | std::vector<const char *> rank_argv(argv); |
7c673cae FG |
175 | dout(4) << "Executing for rank " << rank << dendl; |
176 | if (mode == std::string("journal")) { | |
f64942e4 | 177 | r = main_journal(rank_argv); |
7c673cae | 178 | } else if (mode == std::string("header")) { |
f64942e4 | 179 | r = main_header(rank_argv); |
7c673cae | 180 | } else if (mode == std::string("event")) { |
f64942e4 | 181 | r = main_event(rank_argv); |
7c673cae | 182 | } else { |
11fdf7f2 | 183 | cerr << "Bad command '" << mode << "'" << std::endl; |
7c673cae FG |
184 | return -EINVAL; |
185 | } | |
186 | ||
187 | if (r != 0) { | |
188 | return r; | |
189 | } | |
190 | } | |
191 | ||
192 | return r; | |
193 | } | |
194 | ||
11fdf7f2 TL |
195 | int JournalTool::validate_type(const std::string &type) |
196 | { | |
197 | if (type == "mdlog" || type == "purge_queue") { | |
198 | return 0; | |
199 | } | |
200 | return -1; | |
201 | } | |
7c673cae | 202 | |
f64942e4 AA |
203 | std::string JournalTool::gen_dump_file_path(const std::string &prefix) { |
204 | if (!all_ranks) { | |
205 | return prefix; | |
206 | } | |
207 | ||
208 | return prefix + "." + std::to_string(rank); | |
209 | } | |
210 | ||
211 | bool JournalTool::can_execute_for_all_ranks(const std::string &mode, | |
212 | const std::string &command) { | |
213 | if (mode == "journal" && command == "import") { | |
214 | return false; | |
215 | } | |
216 | ||
217 | return true; | |
218 | } | |
219 | ||
7c673cae FG |
220 | /** |
221 | * Handle arguments for 'journal' mode | |
222 | * | |
223 | * This is for operations that act on the journal as a whole. | |
224 | */ | |
225 | int JournalTool::main_journal(std::vector<const char*> &argv) | |
226 | { | |
92f5a8d4 TL |
227 | if (argv.empty()) { |
228 | derr << "Missing journal command, please see help" << dendl; | |
229 | return -EINVAL; | |
230 | } | |
231 | ||
7c673cae FG |
232 | std::string command = argv[0]; |
233 | if (command == "inspect") { | |
234 | return journal_inspect(); | |
235 | } else if (command == "export" || command == "import") { | |
91327a77 | 236 | bool force = false; |
7c673cae FG |
237 | if (argv.size() >= 2) { |
238 | std::string const path = argv[1]; | |
91327a77 AA |
239 | if (argv.size() == 3) { |
240 | if (std::string(argv[2]) == "--force") { | |
241 | force = true; | |
242 | } else { | |
243 | std::cerr << "Unknown argument " << argv[1] << std::endl; | |
244 | return -EINVAL; | |
245 | } | |
246 | } | |
247 | return journal_export(path, command == "import", force); | |
7c673cae FG |
248 | } else { |
249 | derr << "Missing path" << dendl; | |
250 | return -EINVAL; | |
251 | } | |
252 | } else if (command == "reset") { | |
253 | bool force = false; | |
254 | if (argv.size() == 2) { | |
255 | if (std::string(argv[1]) == "--force") { | |
256 | force = true; | |
257 | } else { | |
258 | std::cerr << "Unknown argument " << argv[1] << std::endl; | |
7c673cae FG |
259 | return -EINVAL; |
260 | } | |
261 | } else if (argv.size() > 2) { | |
262 | std::cerr << "Too many arguments!" << std::endl; | |
7c673cae FG |
263 | return -EINVAL; |
264 | } | |
265 | return journal_reset(force); | |
266 | } else { | |
267 | derr << "Bad journal command '" << command << "'" << dendl; | |
268 | return -EINVAL; | |
269 | } | |
270 | } | |
271 | ||
272 | ||
273 | /** | |
274 | * Parse arguments and execute for 'header' mode | |
275 | * | |
276 | * This is for operations that act on the header only. | |
277 | */ | |
278 | int JournalTool::main_header(std::vector<const char*> &argv) | |
279 | { | |
11fdf7f2 TL |
280 | JournalFilter filter(type); |
281 | JournalScanner js(input, rank, type, filter); | |
7c673cae FG |
282 | int r = js.scan(false); |
283 | if (r < 0) { | |
284 | std::cerr << "Unable to scan journal" << std::endl; | |
285 | return r; | |
286 | } | |
287 | ||
288 | if (!js.header_present) { | |
289 | std::cerr << "Header object not found!" << std::endl; | |
290 | return -ENOENT; | |
291 | } else if (!js.header_valid && js.header == NULL) { | |
292 | // Can't do a read or a single-field write without a copy of the original | |
293 | derr << "Header could not be read!" << dendl; | |
294 | return -ENOENT; | |
295 | } else { | |
11fdf7f2 | 296 | ceph_assert(js.header != NULL); |
7c673cae FG |
297 | } |
298 | ||
92f5a8d4 TL |
299 | if (argv.empty()) { |
300 | derr << "Missing header command, must be [get|set]" << dendl; | |
7c673cae FG |
301 | return -EINVAL; |
302 | } | |
303 | std::vector<const char *>::iterator arg = argv.begin(); | |
304 | std::string const command = *arg; | |
305 | arg = argv.erase(arg); | |
306 | ||
307 | if (command == std::string("get")) { | |
308 | // Write JSON journal dump to stdout | |
309 | JSONFormatter jf(true); | |
310 | js.header->dump(&jf); | |
311 | jf.flush(std::cout); | |
312 | std::cout << std::endl; | |
313 | } else if (command == std::string("set")) { | |
314 | // Need two more args <key> <val> | |
315 | if (argv.size() != 2) { | |
316 | derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl; | |
317 | return -EINVAL; | |
318 | } | |
319 | ||
320 | std::string const field_name = *arg; | |
321 | arg = argv.erase(arg); | |
322 | ||
323 | std::string const value_str = *arg; | |
324 | arg = argv.erase(arg); | |
11fdf7f2 | 325 | ceph_assert(argv.empty()); |
7c673cae FG |
326 | |
327 | std::string parse_err; | |
328 | uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err); | |
329 | if (!parse_err.empty()) { | |
330 | derr << "Invalid value '" << value_str << "': " << parse_err << dendl; | |
331 | return -EINVAL; | |
332 | } | |
333 | ||
334 | uint64_t *field = NULL; | |
335 | if (field_name == "trimmed_pos") { | |
336 | field = &(js.header->trimmed_pos); | |
337 | } else if (field_name == "expire_pos") { | |
338 | field = &(js.header->expire_pos); | |
339 | } else if (field_name == "write_pos") { | |
340 | field = &(js.header->write_pos); | |
b32b8144 FG |
341 | } else if (field_name == "pool_id") { |
342 | field = (uint64_t*)(&(js.header->layout.pool_id)); | |
7c673cae FG |
343 | } else { |
344 | derr << "Invalid field '" << field_name << "'" << dendl; | |
345 | return -EINVAL; | |
346 | } | |
347 | ||
348 | std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl; | |
349 | *field = new_val; | |
350 | ||
351 | dout(4) << "Writing object..." << dendl; | |
352 | bufferlist header_bl; | |
11fdf7f2 | 353 | encode(*(js.header), header_bl); |
7c673cae FG |
354 | output.write_full(js.obj_name(0), header_bl); |
355 | dout(4) << "Write complete." << dendl; | |
356 | std::cout << "Successfully updated header." << std::endl; | |
357 | } else { | |
358 | derr << "Bad header command '" << command << "'" << dendl; | |
359 | return -EINVAL; | |
360 | } | |
361 | ||
362 | return 0; | |
363 | } | |
364 | ||
365 | ||
366 | /** | |
367 | * Parse arguments and execute for 'event' mode | |
368 | * | |
369 | * This is for operations that act on LogEvents within the log | |
370 | */ | |
371 | int JournalTool::main_event(std::vector<const char*> &argv) | |
372 | { | |
373 | int r; | |
374 | ||
92f5a8d4 TL |
375 | if (argv.empty()) { |
376 | derr << "Missing event command, please see help" << dendl; | |
377 | return -EINVAL; | |
378 | } | |
7c673cae | 379 | |
92f5a8d4 | 380 | std::vector<const char*>::iterator arg = argv.begin(); |
1911f103 TL |
381 | bool dry_run = false; |
382 | ||
7c673cae | 383 | std::string command = *(arg++); |
31f18b77 | 384 | if (command != "get" && command != "splice" && command != "recover_dentries") { |
7c673cae | 385 | derr << "Unknown argument '" << command << "'" << dendl; |
11fdf7f2 TL |
386 | return -EINVAL; |
387 | } | |
388 | ||
1911f103 TL |
389 | if (command == "recover_dentries") { |
390 | if (type != "mdlog") { | |
391 | derr << "journaler for " << type << " can't do \"recover_dentries\"." << dendl; | |
392 | return -EINVAL; | |
393 | } else { | |
394 | if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) { | |
395 | dry_run = true; | |
396 | } | |
397 | } | |
7c673cae FG |
398 | } |
399 | ||
400 | if (arg == argv.end()) { | |
401 | derr << "Incomplete command line" << dendl; | |
7c673cae FG |
402 | return -EINVAL; |
403 | } | |
404 | ||
405 | // Parse filter options | |
406 | // ==================== | |
11fdf7f2 | 407 | JournalFilter filter(type); |
7c673cae FG |
408 | r = filter.parse_args(argv, arg); |
409 | if (r) { | |
410 | return r; | |
411 | } | |
412 | ||
413 | // Parse output options | |
414 | // ==================== | |
415 | if (arg == argv.end()) { | |
11fdf7f2 TL |
416 | cerr << "Missing output command" << std::endl; |
417 | return -EINVAL; | |
7c673cae FG |
418 | } |
419 | std::string output_style = *(arg++); | |
420 | if (output_style != "binary" && output_style != "json" && | |
421 | output_style != "summary" && output_style != "list") { | |
11fdf7f2 TL |
422 | cerr << "Unknown argument: '" << output_style << "'" << std::endl; |
423 | return -EINVAL; | |
7c673cae FG |
424 | } |
425 | ||
426 | std::string output_path = "dump"; | |
427 | while(arg != argv.end()) { | |
428 | std::string arg_str; | |
429 | if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { | |
430 | output_path = arg_str; | |
431 | } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", | |
432 | nullptr)) { | |
433 | dout(1) << "Using alternate pool " << arg_str << dendl; | |
434 | int r = rados.ioctx_create(arg_str.c_str(), output); | |
11fdf7f2 | 435 | ceph_assert(r == 0); |
7c673cae FG |
436 | other_pool = true; |
437 | } else { | |
11fdf7f2 | 438 | cerr << "Unknown argument: '" << *arg << "'" << std::endl; |
7c673cae FG |
439 | return -EINVAL; |
440 | } | |
441 | } | |
442 | ||
f64942e4 AA |
443 | const std::string dump_path = gen_dump_file_path(output_path); |
444 | ||
7c673cae FG |
445 | // Execute command |
446 | // =============== | |
11fdf7f2 | 447 | JournalScanner js(input, rank, type, filter); |
7c673cae FG |
448 | if (command == "get") { |
449 | r = js.scan(); | |
450 | if (r) { | |
451 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
452 | return r; | |
453 | } | |
7c673cae FG |
454 | } else if (command == "recover_dentries") { |
455 | r = js.scan(); | |
456 | if (r) { | |
457 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
458 | return r; | |
459 | } | |
460 | ||
7c673cae FG |
461 | /** |
462 | * Iterate over log entries, attempting to scavenge from each one | |
463 | */ | |
464 | std::set<inodeno_t> consumed_inos; | |
465 | for (JournalScanner::EventMap::iterator i = js.events.begin(); | |
466 | i != js.events.end(); ++i) { | |
11fdf7f2 | 467 | auto& le = i->second.log_event; |
7c673cae FG |
468 | EMetaBlob const *mb = le->get_metablob(); |
469 | if (mb) { | |
31f18b77 | 470 | int scav_r = recover_dentries(*mb, dry_run, &consumed_inos); |
7c673cae FG |
471 | if (scav_r) { |
472 | dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec | |
473 | << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl; | |
474 | if (r == 0) { | |
475 | r = scav_r; | |
476 | } | |
477 | // Our goal is to read all we can, so don't stop on errors, but | |
478 | // do record them for possible later output | |
479 | js.errors.insert(std::make_pair(i->first, | |
480 | JournalScanner::EventError(scav_r, cpp_strerror(r)))); | |
481 | } | |
482 | } | |
483 | } | |
484 | ||
485 | /** | |
486 | * Update InoTable to reflect any inode numbers consumed during scavenge | |
487 | */ | |
488 | dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl; | |
489 | if (consumed_inos.size() && !dry_run) { | |
490 | int consume_r = consume_inos(consumed_inos); | |
491 | if (consume_r) { | |
492 | dout(1) << "Error updating InoTable for " << consumed_inos.size() | |
493 | << " consume inos: " << cpp_strerror(consume_r) << dendl; | |
494 | if (r == 0) { | |
495 | r = consume_r; | |
496 | } | |
497 | } | |
498 | } | |
499 | ||
500 | // Remove consumed dentries from lost+found. | |
501 | if (other_pool && !dry_run) { | |
502 | std::set<std::string> found; | |
503 | ||
504 | for (auto i : consumed_inos) { | |
505 | char s[20]; | |
506 | ||
507 | snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); | |
508 | dout(20) << "removing " << s << dendl; | |
509 | found.insert(std::string(s)); | |
510 | } | |
511 | ||
512 | object_t frag_oid; | |
513 | frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, | |
514 | frag_t(), ""); | |
515 | output.omap_rm_keys(frag_oid.name, found); | |
516 | } | |
517 | } else if (command == "splice") { | |
518 | r = js.scan(); | |
519 | if (r) { | |
520 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
521 | return r; | |
522 | } | |
523 | ||
524 | uint64_t start, end; | |
525 | if (filter.get_range(start, end)) { | |
526 | // Special case for range filter: erase a numeric range in the log | |
527 | uint64_t range = end - start; | |
528 | int r = erase_region(js, start, range); | |
529 | if (r) { | |
530 | derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec | |
531 | << ": " << cpp_strerror(r) << dendl; | |
532 | return r; | |
533 | } | |
534 | } else { | |
535 | // General case: erase a collection of individual entries in the log | |
536 | for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) { | |
537 | dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl; | |
538 | ||
539 | int r = erase_region(js, i->first, i->second.raw_size); | |
540 | if (r) { | |
541 | derr << "Failed to erase event 0x" << std::hex << i->first << std::dec | |
542 | << ": " << cpp_strerror(r) << dendl; | |
543 | return r; | |
544 | } | |
545 | } | |
546 | } | |
547 | ||
548 | ||
549 | } else { | |
11fdf7f2 | 550 | cerr << "Unknown argument '" << command << "'" << std::endl; |
7c673cae FG |
551 | return -EINVAL; |
552 | } | |
553 | ||
554 | // Generate output | |
555 | // =============== | |
f64942e4 | 556 | EventOutput output(js, dump_path); |
7c673cae FG |
557 | int output_result = 0; |
558 | if (output_style == "binary") { | |
559 | output_result = output.binary(); | |
560 | } else if (output_style == "json") { | |
561 | output_result = output.json(); | |
562 | } else if (output_style == "summary") { | |
563 | output.summary(); | |
564 | } else if (output_style == "list") { | |
565 | output.list(); | |
566 | } else { | |
567 | std::cerr << "Bad output command '" << output_style << "'" << std::endl; | |
568 | return -EINVAL; | |
569 | } | |
570 | ||
571 | if (output_result != 0) { | |
572 | std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl; | |
573 | } | |
574 | ||
575 | return output_result; | |
576 | } | |
577 | ||
578 | /** | |
579 | * Provide the user with information about the condition of the journal, | |
580 | * especially indicating what range of log events is available and where | |
581 | * any gaps or corruptions in the journal are. | |
582 | */ | |
583 | int JournalTool::journal_inspect() | |
584 | { | |
585 | int r; | |
586 | ||
11fdf7f2 TL |
587 | JournalFilter filter(type); |
588 | JournalScanner js(input, rank, type, filter); | |
7c673cae FG |
589 | r = js.scan(); |
590 | if (r) { | |
591 | std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; | |
592 | return r; | |
593 | } | |
594 | ||
595 | js.report(std::cout); | |
596 | ||
597 | return 0; | |
598 | } | |
599 | ||
600 | ||
601 | /** | |
602 | * Attempt to export a binary dump of the journal. | |
603 | * | |
604 | * This is allowed to fail if the header is malformed or there are | |
605 | * objects inaccessible, in which case the user would have to fall | |
606 | * back to manually listing RADOS objects and extracting them, which | |
607 | * they can do with the ``rados`` CLI. | |
608 | */ | |
91327a77 | 609 | int JournalTool::journal_export(std::string const &path, bool import, bool force) |
7c673cae FG |
610 | { |
611 | int r = 0; | |
11fdf7f2 | 612 | JournalScanner js(input, rank, type); |
7c673cae FG |
613 | |
614 | if (!import) { | |
615 | /* | |
616 | * If doing an export, first check that the header is valid and | |
617 | * no objects are missing before trying to dump | |
618 | */ | |
619 | r = js.scan(); | |
620 | if (r < 0) { | |
621 | derr << "Unable to scan journal, assuming badly damaged" << dendl; | |
622 | return r; | |
623 | } | |
624 | if (!js.is_readable()) { | |
625 | derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl; | |
626 | return -EIO; | |
627 | } | |
628 | } | |
629 | ||
630 | /* | |
631 | * Assuming we can cleanly read the journal data, dump it out to a file | |
632 | */ | |
633 | { | |
634 | Dumper dumper; | |
11fdf7f2 | 635 | r = dumper.init(mds_role_t(role_selector.get_ns(), rank), type); |
7c673cae FG |
636 | if (r < 0) { |
637 | derr << "dumper::init failed: " << cpp_strerror(r) << dendl; | |
638 | return r; | |
639 | } | |
640 | if (import) { | |
91327a77 | 641 | r = dumper.undump(path.c_str(), force); |
7c673cae | 642 | } else { |
f64942e4 AA |
643 | const std::string ex_path = gen_dump_file_path(path); |
644 | r = dumper.dump(ex_path.c_str()); | |
7c673cae | 645 | } |
7c673cae FG |
646 | } |
647 | ||
648 | return r; | |
649 | } | |
650 | ||
651 | ||
652 | /** | |
653 | * Truncate journal and insert EResetJournal | |
654 | */ | |
655 | int JournalTool::journal_reset(bool hard) | |
656 | { | |
657 | int r = 0; | |
658 | Resetter resetter; | |
11fdf7f2 | 659 | r = resetter.init(mds_role_t(role_selector.get_ns(), rank), type, hard); |
7c673cae FG |
660 | if (r < 0) { |
661 | derr << "resetter::init failed: " << cpp_strerror(r) << dendl; | |
662 | return r; | |
663 | } | |
664 | ||
665 | if (hard) { | |
11fdf7f2 | 666 | r = resetter.reset_hard(); |
7c673cae | 667 | } else { |
11fdf7f2 | 668 | r = resetter.reset(); |
7c673cae | 669 | } |
7c673cae FG |
670 | |
671 | return r; | |
672 | } | |
673 | ||
674 | ||
675 | /** | |
676 | * Selective offline replay which only reads out dentries and writes | |
677 | * them to the backing store iff their version is > what is currently | |
678 | * in the backing store. | |
679 | * | |
680 | * In order to write dentries to the backing store, we may create the | |
681 | * required enclosing dirfrag objects. | |
682 | * | |
683 | * Test this by running scavenge on an unflushed journal, then nuking | |
684 | * it offline, then starting an MDS and seeing that the dentries are | |
685 | * visible. | |
686 | * | |
687 | * @param metablob an EMetaBlob retrieved from the journal | |
688 | * @param dry_run if true, do no writes to RADOS | |
689 | * @param consumed_inos output, populated with any inos inserted | |
690 | * @returns 0 on success, else negative error code | |
691 | */ | |
31f18b77 | 692 | int JournalTool::recover_dentries( |
7c673cae FG |
693 | EMetaBlob const &metablob, |
694 | bool const dry_run, | |
695 | std::set<inodeno_t> *consumed_inos) | |
696 | { | |
11fdf7f2 | 697 | ceph_assert(consumed_inos != NULL); |
7c673cae FG |
698 | |
699 | int r = 0; | |
700 | ||
701 | // Replay fullbits (dentry+inode) | |
11fdf7f2 | 702 | for (const auto& frag : metablob.lump_order) { |
7c673cae FG |
703 | EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second; |
704 | lump._decode_bits(); | |
705 | object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, ""); | |
706 | ||
707 | dout(4) << "inspecting lump " << frag_oid.name << dendl; | |
708 | ||
709 | ||
710 | // We will record old fnode version for use in hard link handling | |
711 | // If we don't read an old fnode, take version as zero and write in | |
712 | // all hardlinks we find. | |
713 | version_t old_fnode_version = 0; | |
714 | ||
715 | // Update fnode in omap header of dirfrag object | |
716 | bool write_fnode = false; | |
717 | bufferlist old_fnode_bl; | |
718 | r = input.omap_get_header(frag_oid.name, &old_fnode_bl); | |
719 | if (r == -ENOENT) { | |
720 | // Creating dirfrag from scratch | |
721 | dout(4) << "failed to read OMAP header from directory fragment " | |
722 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
723 | write_fnode = true; | |
724 | // Note: creating the dirfrag *without* a backtrace, relying on | |
725 | // MDS to regenerate backtraces on read or in FSCK | |
726 | } else if (r == 0) { | |
727 | // Conditionally update existing omap header | |
728 | fnode_t old_fnode; | |
11fdf7f2 | 729 | auto old_fnode_iter = old_fnode_bl.cbegin(); |
7c673cae FG |
730 | try { |
731 | old_fnode.decode(old_fnode_iter); | |
732 | dout(4) << "frag " << frag_oid.name << " fnode old v" << | |
f67539c2 | 733 | old_fnode.version << " vs new v" << lump.fnode->version << dendl; |
7c673cae | 734 | old_fnode_version = old_fnode.version; |
f67539c2 | 735 | write_fnode = old_fnode_version < lump.fnode->version; |
7c673cae FG |
736 | } catch (const buffer::error &err) { |
737 | dout(1) << "frag " << frag_oid.name | |
738 | << " is corrupt, overwriting" << dendl; | |
739 | write_fnode = true; | |
740 | } | |
741 | } else { | |
742 | // Unexpected error | |
743 | dout(4) << "failed to read OMAP header from directory fragment " | |
744 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
745 | return r; | |
746 | } | |
747 | ||
748 | if ((other_pool || write_fnode) && !dry_run) { | |
749 | dout(4) << "writing fnode to omap header" << dendl; | |
750 | bufferlist fnode_bl; | |
f67539c2 | 751 | lump.fnode->encode(fnode_bl); |
7c673cae FG |
752 | if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { |
753 | r = output.omap_set_header(frag_oid.name, fnode_bl); | |
754 | } | |
755 | if (r != 0) { | |
756 | derr << "Failed to write fnode for frag object " | |
757 | << frag_oid.name << dendl; | |
758 | return r; | |
759 | } | |
760 | } | |
761 | ||
762 | std::set<std::string> read_keys; | |
763 | ||
764 | // Compose list of potentially-existing dentries we would like to fetch | |
11fdf7f2 | 765 | for (const auto& fb : lump.get_dfull()) { |
7c673cae FG |
766 | // Get a key like "foobar_head" |
767 | std::string key; | |
768 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
769 | dn_key.encode(key); | |
770 | read_keys.insert(key); | |
771 | } | |
772 | ||
11fdf7f2 | 773 | for(const auto& rb : lump.get_dremote()) { |
7c673cae FG |
774 | // Get a key like "foobar_head" |
775 | std::string key; | |
776 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
777 | dn_key.encode(key); | |
778 | read_keys.insert(key); | |
779 | } | |
780 | ||
11fdf7f2 | 781 | for (const auto& nb : lump.get_dnull()) { |
c07f9fc5 FG |
782 | // Get a key like "foobar_head" |
783 | std::string key; | |
784 | dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); | |
785 | dn_key.encode(key); | |
786 | read_keys.insert(key); | |
787 | } | |
788 | ||
7c673cae FG |
789 | // Perform bulk read of existing dentries |
790 | std::map<std::string, bufferlist> read_vals; | |
791 | r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
792 | if (r == -ENOENT && other_pool) { | |
793 | r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
794 | } | |
795 | if (r != 0) { | |
796 | derr << "unexpected error reading fragment object " | |
797 | << frag_oid.name << ": " << cpp_strerror(r) << dendl; | |
798 | return r; | |
799 | } | |
800 | ||
801 | // Compose list of dentries we will write back | |
802 | std::map<std::string, bufferlist> write_vals; | |
11fdf7f2 | 803 | for (const auto& fb : lump.get_dfull()) { |
7c673cae FG |
804 | // Get a key like "foobar_head" |
805 | std::string key; | |
806 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
807 | dn_key.encode(key); | |
808 | ||
809 | dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn | |
810 | << dendl; | |
811 | bool write_dentry = false; | |
812 | if (read_vals.find(key) == read_vals.end()) { | |
813 | dout(4) << "dentry did not already exist, will create" << dendl; | |
814 | write_dentry = true; | |
815 | } else { | |
816 | dout(4) << "dentry " << key << " existed already" << dendl; | |
817 | dout(4) << "dentry exists, checking versions..." << dendl; | |
818 | bufferlist &old_dentry = read_vals[key]; | |
819 | // Decode dentry+inode | |
11fdf7f2 | 820 | auto q = old_dentry.cbegin(); |
7c673cae FG |
821 | |
822 | snapid_t dnfirst; | |
11fdf7f2 | 823 | decode(dnfirst, q); |
7c673cae | 824 | char dentry_type; |
11fdf7f2 | 825 | decode(dentry_type, q); |
7c673cae | 826 | |
f67539c2 | 827 | if (dentry_type == 'L' || dentry_type == 'l') { |
7c673cae FG |
828 | // leave write_dentry false, we have no version to |
829 | // compare with in a hardlink, so it's not safe to | |
830 | // squash over it with what's in this fullbit | |
831 | dout(10) << "Existing remote inode in slot to be (maybe) written " | |
832 | << "by a full inode from the journal dn '" << fb.dn.c_str() | |
f67539c2 | 833 | << "' with lump fnode version " << lump.fnode->version |
7c673cae | 834 | << "vs existing fnode version " << old_fnode_version << dendl; |
f67539c2 TL |
835 | write_dentry = old_fnode_version < lump.fnode->version; |
836 | } else if (dentry_type == 'I' || dentry_type == 'i') { | |
7c673cae FG |
837 | // Read out inode version to compare with backing store |
838 | InodeStore inode; | |
f67539c2 TL |
839 | if (dentry_type == 'i') { |
840 | mempool::mds_co::string alternate_name; | |
841 | ||
842 | DECODE_START(2, q); | |
843 | if (struct_v >= 2) | |
844 | decode(alternate_name, q); | |
845 | inode.decode(q); | |
846 | DECODE_FINISH(q); | |
847 | } else { | |
848 | inode.decode_bare(q); | |
849 | } | |
7c673cae | 850 | dout(4) << "decoded embedded inode version " |
f67539c2 TL |
851 | << inode.inode->version << " vs fullbit version " |
852 | << fb.inode->version << dendl; | |
853 | if (inode.inode->version < fb.inode->version) { | |
7c673cae FG |
854 | write_dentry = true; |
855 | } | |
856 | } else { | |
857 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
858 | "journal" << dendl; | |
859 | write_dentry = true; | |
860 | } | |
861 | } | |
862 | ||
863 | if ((other_pool || write_dentry) && !dry_run) { | |
864 | dout(4) << "writing I dentry " << key << " into frag " | |
865 | << frag_oid.name << dendl; | |
866 | ||
867 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
868 | bufferlist dentry_bl; | |
11fdf7f2 TL |
869 | encode(fb.dnfirst, dentry_bl); |
870 | encode('I', dentry_bl); | |
7c673cae FG |
871 | encode_fullbit_as_inode(fb, true, &dentry_bl); |
872 | ||
873 | // Record for writing to RADOS | |
874 | write_vals[key] = dentry_bl; | |
f67539c2 | 875 | consumed_inos->insert(fb.inode->ino); |
7c673cae FG |
876 | } |
877 | } | |
878 | ||
11fdf7f2 | 879 | for(const auto& rb : lump.get_dremote()) { |
7c673cae FG |
880 | // Get a key like "foobar_head" |
881 | std::string key; | |
882 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
883 | dn_key.encode(key); | |
884 | ||
885 | dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn | |
886 | << dendl; | |
887 | bool write_dentry = false; | |
888 | if (read_vals.find(key) == read_vals.end()) { | |
889 | dout(4) << "dentry did not already exist, will create" << dendl; | |
890 | write_dentry = true; | |
891 | } else { | |
892 | dout(4) << "dentry " << key << " existed already" << dendl; | |
893 | dout(4) << "dentry exists, checking versions..." << dendl; | |
894 | bufferlist &old_dentry = read_vals[key]; | |
895 | // Decode dentry+inode | |
11fdf7f2 | 896 | auto q = old_dentry.cbegin(); |
7c673cae FG |
897 | |
898 | snapid_t dnfirst; | |
11fdf7f2 | 899 | decode(dnfirst, q); |
7c673cae | 900 | char dentry_type; |
11fdf7f2 | 901 | decode(dentry_type, q); |
7c673cae | 902 | |
f67539c2 | 903 | if (dentry_type == 'L' || dentry_type == 'l') { |
7c673cae FG |
904 | dout(10) << "Existing hardlink inode in slot to be (maybe) written " |
905 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
f67539c2 | 906 | << "' with lump fnode version " << lump.fnode->version |
7c673cae | 907 | << "vs existing fnode version " << old_fnode_version << dendl; |
f67539c2 TL |
908 | write_dentry = old_fnode_version < lump.fnode->version; |
909 | } else if (dentry_type == 'I' || dentry_type == 'i') { | |
7c673cae FG |
910 | dout(10) << "Existing full inode in slot to be (maybe) written " |
911 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
f67539c2 | 912 | << "' with lump fnode version " << lump.fnode->version |
7c673cae | 913 | << "vs existing fnode version " << old_fnode_version << dendl; |
f67539c2 | 914 | write_dentry = old_fnode_version < lump.fnode->version; |
7c673cae FG |
915 | } else { |
916 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
917 | "journal" << dendl; | |
918 | write_dentry = true; | |
919 | } | |
920 | } | |
921 | ||
922 | if ((other_pool || write_dentry) && !dry_run) { | |
923 | dout(4) << "writing L dentry " << key << " into frag " | |
924 | << frag_oid.name << dendl; | |
925 | ||
926 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
927 | bufferlist dentry_bl; | |
11fdf7f2 TL |
928 | encode(rb.dnfirst, dentry_bl); |
929 | encode('L', dentry_bl); | |
930 | encode(rb.ino, dentry_bl); | |
931 | encode(rb.d_type, dentry_bl); | |
7c673cae FG |
932 | |
933 | // Record for writing to RADOS | |
934 | write_vals[key] = dentry_bl; | |
935 | consumed_inos->insert(rb.ino); | |
936 | } | |
937 | } | |
938 | ||
c07f9fc5 | 939 | std::set<std::string> null_vals; |
11fdf7f2 | 940 | for (const auto& nb : lump.get_dnull()) { |
c07f9fc5 FG |
941 | std::string key; |
942 | dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); | |
943 | dn_key.encode(key); | |
944 | ||
945 | dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn | |
946 | << dendl; | |
947 | ||
948 | auto it = read_vals.find(key); | |
949 | if (it != read_vals.end()) { | |
950 | dout(4) << "dentry exists, will remove" << dendl; | |
951 | ||
11fdf7f2 | 952 | auto q = it->second.cbegin(); |
c07f9fc5 | 953 | snapid_t dnfirst; |
11fdf7f2 | 954 | decode(dnfirst, q); |
c07f9fc5 | 955 | char dentry_type; |
11fdf7f2 | 956 | decode(dentry_type, q); |
c07f9fc5 FG |
957 | |
958 | bool remove_dentry = false; | |
f67539c2 | 959 | if (dentry_type == 'L' || dentry_type == 'l') { |
c07f9fc5 FG |
960 | dout(10) << "Existing hardlink inode in slot to be (maybe) removed " |
961 | << "by null journal dn '" << nb.dn.c_str() | |
f67539c2 | 962 | << "' with lump fnode version " << lump.fnode->version |
c07f9fc5 | 963 | << "vs existing fnode version " << old_fnode_version << dendl; |
f67539c2 TL |
964 | remove_dentry = old_fnode_version < lump.fnode->version; |
965 | } else if (dentry_type == 'I' || dentry_type == 'i') { | |
c07f9fc5 FG |
966 | dout(10) << "Existing full inode in slot to be (maybe) removed " |
967 | << "by null journal dn '" << nb.dn.c_str() | |
f67539c2 | 968 | << "' with lump fnode version " << lump.fnode->version |
c07f9fc5 | 969 | << "vs existing fnode version " << old_fnode_version << dendl; |
f67539c2 | 970 | remove_dentry = old_fnode_version < lump.fnode->version; |
c07f9fc5 FG |
971 | } else { |
972 | dout(4) << "corrupt dentry in backing store, will remove" << dendl; | |
973 | remove_dentry = true; | |
974 | } | |
975 | ||
976 | if (remove_dentry) | |
977 | null_vals.insert(key); | |
978 | } | |
979 | } | |
980 | ||
7c673cae FG |
981 | // Write back any new/changed dentries |
982 | if (!write_vals.empty()) { | |
983 | r = output.omap_set(frag_oid.name, write_vals); | |
984 | if (r != 0) { | |
985 | derr << "error writing dentries to " << frag_oid.name | |
986 | << ": " << cpp_strerror(r) << dendl; | |
987 | return r; | |
988 | } | |
989 | } | |
c07f9fc5 FG |
990 | |
991 | // remove any null dentries | |
992 | if (!null_vals.empty()) { | |
993 | r = output.omap_rm_keys(frag_oid.name, null_vals); | |
994 | if (r != 0) { | |
995 | derr << "error removing dentries from " << frag_oid.name | |
996 | << ": " << cpp_strerror(r) << dendl; | |
997 | return r; | |
998 | } | |
999 | } | |
7c673cae FG |
1000 | } |
1001 | ||
1002 | /* Now that we've looked at the dirlumps, we finally pay attention to | |
1003 | * the roots (i.e. inodes without ancestry). This is necessary in order | |
1004 | * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally | |
1005 | * important because clients use them to infer completeness | |
1006 | * of directories | |
1007 | */ | |
11fdf7f2 | 1008 | for (const auto& fb : metablob.roots) { |
f67539c2 | 1009 | inodeno_t ino = fb.inode->ino; |
7c673cae FG |
1010 | dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl; |
1011 | ||
1012 | object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); | |
1013 | dout(4) << "object id " << root_oid.name << dendl; | |
1014 | ||
1015 | bool write_root_ino = false; | |
1016 | bufferlist old_root_ino_bl; | |
1017 | r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); | |
1018 | if (r == -ENOENT) { | |
1019 | dout(4) << "root does not exist, will create" << dendl; | |
1020 | write_root_ino = true; | |
1021 | } else if (r >= 0) { | |
1022 | r = 0; | |
1023 | InodeStore old_inode; | |
1024 | dout(4) << "root exists, will modify (" << old_root_ino_bl.length() | |
1025 | << ")" << dendl; | |
11fdf7f2 | 1026 | auto inode_bl_iter = old_root_ino_bl.cbegin(); |
7c673cae | 1027 | std::string magic; |
11fdf7f2 | 1028 | decode(magic, inode_bl_iter); |
7c673cae FG |
1029 | if (magic == CEPH_FS_ONDISK_MAGIC) { |
1030 | dout(4) << "magic ok" << dendl; | |
1031 | old_inode.decode(inode_bl_iter); | |
1032 | ||
f67539c2 | 1033 | if (old_inode.inode->version < fb.inode->version) { |
7c673cae FG |
1034 | write_root_ino = true; |
1035 | } | |
1036 | } else { | |
1037 | dout(4) << "magic bad: '" << magic << "'" << dendl; | |
1038 | write_root_ino = true; | |
1039 | } | |
1040 | } else { | |
1041 | derr << "error reading root inode object " << root_oid.name | |
1042 | << ": " << cpp_strerror(r) << dendl; | |
1043 | return r; | |
1044 | } | |
1045 | ||
1046 | if (write_root_ino && !dry_run) { | |
1047 | dout(4) << "writing root ino " << root_oid.name | |
f67539c2 | 1048 | << " version " << fb.inode->version << dendl; |
7c673cae FG |
1049 | |
1050 | // Compose: root ino format is magic,InodeStore(bare=false) | |
1051 | bufferlist new_root_ino_bl; | |
11fdf7f2 | 1052 | encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl); |
7c673cae FG |
1053 | encode_fullbit_as_inode(fb, false, &new_root_ino_bl); |
1054 | ||
1055 | // Write to RADOS | |
1056 | r = output.write_full(root_oid.name, new_root_ino_bl); | |
1057 | if (r != 0) { | |
1058 | derr << "error writing inode object " << root_oid.name | |
1059 | << ": " << cpp_strerror(r) << dendl; | |
1060 | return r; | |
1061 | } | |
1062 | } | |
1063 | } | |
1064 | ||
1065 | return r; | |
1066 | } | |
1067 | ||
1068 | ||
7c673cae FG |
1069 | /** |
1070 | * Erase a region of the log by overwriting it with ENoOp | |
1071 | * | |
1072 | */ | |
1073 | int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length) | |
1074 | { | |
1075 | // To erase this region, we use our preamble, the encoding overhead | |
1076 | // of an ENoOp, and our trailing start ptr. Calculate how much padding | |
1077 | // is needed inside the ENoOp to make up the difference. | |
1078 | bufferlist tmp; | |
11fdf7f2 TL |
1079 | if (type == "mdlog") { |
1080 | ENoOp enoop(0); | |
1081 | enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1082 | } else if (type == "purge_queue") { | |
1083 | PurgeItem pi; | |
1084 | pi.encode(tmp); | |
1085 | } | |
7c673cae FG |
1086 | |
1087 | dout(4) << "erase_region " << pos << " len=" << length << dendl; | |
1088 | ||
1089 | // FIXME: get the preamble/postamble length via JournalStream | |
1090 | int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t); | |
1091 | dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl; | |
1092 | ||
1093 | if (padding < 0) { | |
1094 | derr << "Erase region " << length << " too short" << dendl; | |
1095 | return -EINVAL; | |
1096 | } | |
1097 | ||
7c673cae | 1098 | bufferlist entry; |
11fdf7f2 TL |
1099 | if (type == "mdlog") { |
1100 | // Serialize an ENoOp with the correct amount of padding | |
1101 | ENoOp enoop(padding); | |
1102 | enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1103 | } else if (type == "purge_queue") { | |
1104 | PurgeItem pi; | |
1105 | pi.pad_size = padding; | |
1106 | pi.encode(entry); | |
1107 | } | |
7c673cae | 1108 | JournalStream stream(JOURNAL_FORMAT_RESILIENT); |
7c673cae FG |
1109 | // Serialize region of log stream |
1110 | bufferlist log_data; | |
1111 | stream.write(entry, &log_data, pos); | |
1112 | ||
1113 | dout(4) << "erase_region data length " << log_data.length() << dendl; | |
11fdf7f2 | 1114 | ceph_assert(log_data.length() == length); |
7c673cae FG |
1115 | |
1116 | // Write log stream region to RADOS | |
1117 | // FIXME: get object size somewhere common to scan_events | |
11fdf7f2 | 1118 | uint32_t object_size = g_conf()->mds_log_segment_size; |
7c673cae FG |
1119 | if (object_size == 0) { |
1120 | // Default layout object size | |
1121 | object_size = file_layout_t::get_default().object_size; | |
1122 | } | |
1123 | ||
1124 | uint64_t write_offset = pos; | |
1125 | uint64_t obj_offset = (pos / object_size); | |
1126 | int r = 0; | |
1127 | while(log_data.length()) { | |
1128 | std::string const oid = js.obj_name(obj_offset); | |
1129 | uint32_t offset_in_obj = write_offset % object_size; | |
1130 | uint32_t write_len = min(log_data.length(), object_size - offset_in_obj); | |
1131 | ||
1132 | r = output.write(oid, log_data, write_len, offset_in_obj); | |
1133 | if (r < 0) { | |
1134 | return r; | |
1135 | } else { | |
1136 | dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl; | |
1137 | r = 0; | |
1138 | } | |
1139 | ||
1140 | log_data.splice(0, write_len); | |
1141 | write_offset += write_len; | |
1142 | obj_offset++; | |
1143 | } | |
1144 | ||
1145 | return r; | |
1146 | } | |
1147 | ||
1148 | /** | |
1149 | * Given an EMetaBlob::fullbit containing an inode, write out | |
1150 | * the encoded inode in the format used by InodeStore (i.e. the | |
1151 | * backing store format) | |
1152 | * | |
1153 | * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use | |
1154 | * on an offline InodeStore instance. It's way simpler, because we are just | |
1155 | * uncritically hauling the data between structs. | |
1156 | * | |
1157 | * @param fb a fullbit extracted from a journal entry | |
1158 | * @param bare if true, leave out [EN|DE]CODE_START decoration | |
1159 | * @param out_bl output, write serialized inode to this bufferlist | |
1160 | */ | |
1161 | void JournalTool::encode_fullbit_as_inode( | |
1162 | const EMetaBlob::fullbit &fb, | |
1163 | const bool bare, | |
1164 | bufferlist *out_bl) | |
1165 | { | |
11fdf7f2 | 1166 | ceph_assert(out_bl != NULL); |
7c673cae FG |
1167 | |
1168 | // Compose InodeStore | |
1169 | InodeStore new_inode; | |
1170 | new_inode.inode = fb.inode; | |
1171 | new_inode.xattrs = fb.xattrs; | |
1172 | new_inode.dirfragtree = fb.dirfragtree; | |
1173 | new_inode.snap_blob = fb.snapbl; | |
11fdf7f2 | 1174 | new_inode.symlink = fb.symlink; |
7c673cae FG |
1175 | new_inode.old_inodes = fb.old_inodes; |
1176 | ||
1177 | // Serialize InodeStore | |
1178 | if (bare) { | |
1179 | new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1180 | } else { | |
1181 | new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1182 | } | |
1183 | } | |
1184 | ||
1185 | /** | |
1186 | * Given a list of inode numbers known to be in use by | |
1187 | * inodes in the backing store, ensure that none of these | |
1188 | * numbers are listed as free in the InoTables in the | |
1189 | * backing store. | |
1190 | * | |
1191 | * Used after injecting inodes into the backing store, to | |
1192 | * ensure that the same inode numbers are not subsequently | |
1193 | * used for new files during ordinary operation. | |
1194 | * | |
1195 | * @param inos list of inode numbers to be removed from | |
1196 | * free lists in InoTables | |
1197 | * @returns 0 on success, else negative error code | |
1198 | */ | |
1199 | int JournalTool::consume_inos(const std::set<inodeno_t> &inos) | |
1200 | { | |
1201 | int r = 0; | |
1202 | ||
1203 | // InoTable is a per-MDS structure, so iterate over assigned ranks | |
1204 | auto fs = fsmap->get_filesystem(role_selector.get_ns()); | |
1205 | std::set<mds_rank_t> in_ranks; | |
1206 | fs->mds_map.get_mds_set(in_ranks); | |
1207 | ||
1208 | for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin(); | |
1209 | rank_i != in_ranks.end(); ++rank_i) | |
1210 | { | |
1211 | // Compose object name | |
1212 | std::ostringstream oss; | |
1213 | oss << "mds" << *rank_i << "_inotable"; | |
1214 | object_t inotable_oid = object_t(oss.str()); | |
1215 | ||
1216 | // Read object | |
1217 | bufferlist inotable_bl; | |
1218 | int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); | |
1219 | if (read_r < 0) { | |
1220 | // Things are really bad if we can't read inotable. Beyond our powers. | |
1221 | derr << "unable to read inotable '" << inotable_oid.name << "': " | |
1222 | << cpp_strerror(read_r) << dendl; | |
1223 | r = r ? r : read_r; | |
1224 | continue; | |
1225 | } | |
1226 | ||
1227 | // Deserialize InoTable | |
1228 | version_t inotable_ver; | |
11fdf7f2 TL |
1229 | auto q = inotable_bl.cbegin(); |
1230 | decode(inotable_ver, q); | |
7c673cae FG |
1231 | InoTable ino_table(NULL); |
1232 | ino_table.decode(q); | |
1233 | ||
1234 | // Update InoTable in memory | |
1235 | bool inotable_modified = false; | |
1236 | for (std::set<inodeno_t>::iterator i = inos.begin(); | |
1237 | i != inos.end(); ++i) | |
1238 | { | |
1239 | const inodeno_t ino = *i; | |
1240 | if (ino_table.force_consume(ino)) { | |
1241 | dout(4) << "Used ino 0x" << std::hex << ino << std::dec | |
1242 | << " requires inotable update" << dendl; | |
1243 | inotable_modified = true; | |
1244 | } | |
1245 | } | |
1246 | ||
1247 | // Serialize and write InoTable | |
1248 | if (inotable_modified) { | |
1249 | inotable_ver += 1; | |
1250 | dout(4) << "writing modified inotable version " << inotable_ver << dendl; | |
1251 | bufferlist inotable_new_bl; | |
11fdf7f2 | 1252 | encode(inotable_ver, inotable_new_bl); |
7c673cae FG |
1253 | ino_table.encode_state(inotable_new_bl); |
1254 | int write_r = output.write_full(inotable_oid.name, inotable_new_bl); | |
1255 | if (write_r != 0) { | |
1256 | derr << "error writing modified inotable " << inotable_oid.name | |
1257 | << ": " << cpp_strerror(write_r) << dendl; | |
1258 | r = r ? r : read_r; | |
1259 | continue; | |
1260 | } | |
1261 | } | |
1262 | } | |
1263 | ||
1264 | return r; | |
1265 | } | |
1266 |