]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * ceph - scalable distributed file system | |
5 | * | |
6 | * copyright (c) 2014 john spray <john.spray@inktank.com> | |
7 | * | |
8 | * this is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the gnu lesser general public | |
10 | * license version 2.1, as published by the free software | |
11 | * foundation. see file copying. | |
12 | */ | |
13 | ||
14 | ||
15 | #include <sstream> | |
16 | ||
17 | #include "common/ceph_argparse.h" | |
18 | #include "common/errno.h" | |
19 | #include "osdc/Journaler.h" | |
20 | #include "mds/mdstypes.h" | |
21 | #include "mds/LogEvent.h" | |
22 | #include "mds/InoTable.h" | |
23 | ||
24 | #include "mds/events/ENoOp.h" | |
25 | #include "mds/events/EUpdate.h" | |
26 | ||
27 | #include "JournalScanner.h" | |
28 | #include "EventOutput.h" | |
29 | #include "Dumper.h" | |
30 | #include "Resetter.h" | |
31 | ||
32 | #include "JournalTool.h" | |
33 | ||
34 | ||
35 | #define dout_context g_ceph_context | |
36 | #define dout_subsys ceph_subsys_mds | |
37 | #undef dout_prefix | |
38 | #define dout_prefix *_dout << __func__ << ": " | |
39 | ||
40 | ||
41 | ||
42 | void JournalTool::usage() | |
43 | { | |
44 | std::cout << "Usage: \n" | |
45 | << " cephfs-journal-tool [options] journal <command>\n" | |
46 | << " <command>:\n" | |
47 | << " inspect\n" | |
91327a77 | 48 | << " import <path> [--force]\n" |
7c673cae FG |
49 | << " export <path>\n" |
50 | << " reset [--force]\n" | |
51 | << " cephfs-journal-tool [options] header <get|set <field> <value>\n" | |
b32b8144 | 52 | << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]" |
31f18b77 | 53 | << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n" |
7c673cae FG |
54 | << " <selector>:\n" |
55 | << " --range=<start>..<end>\n" | |
56 | << " --path=<substring>\n" | |
57 | << " --inode=<integer>\n" | |
58 | << " --type=<UPDATE|OPEN|SESSION...><\n" | |
59 | << " --frag=<ino>.<frag> [--dname=<dentry string>]\n" | |
7c673cae | 60 | << " --client=<session id integer>\n" |
31f18b77 | 61 | << " <effect>: [get|recover_dentries|splice]\n" |
7c673cae FG |
62 | << " <output>: [summary|list|binary|json] [--path <path>]\n" |
63 | << "\n" | |
31f18b77 | 64 | << "General options:\n" |
7c673cae FG |
65 | << " --rank=filesystem:mds-rank Journal rank (required if multiple\n" |
66 | << " file systems, default is rank 0 on\n" | |
31f18b77 FG |
67 | << " the only filesystem otherwise.\n" |
68 | << "\n" | |
69 | << "Special options\n" | |
70 | << " --alternate-pool <name> Alternative metadata pool to target\n" | |
71 | << " when using recover_dentries.\n"; | |
7c673cae FG |
72 | |
73 | generic_client_usage(); | |
74 | } | |
75 | ||
76 | ||
77 | /** | |
78 | * Handle arguments and hand off to journal/header/event mode | |
79 | */ | |
80 | int JournalTool::main(std::vector<const char*> &argv) | |
81 | { | |
82 | int r; | |
83 | ||
84 | dout(10) << "JournalTool::main " << dendl; | |
85 | // Common arg parsing | |
86 | // ================== | |
87 | if (argv.empty()) { | |
88 | usage(); | |
89 | return -EINVAL; | |
90 | } | |
91 | ||
92 | std::vector<const char*>::iterator arg = argv.begin(); | |
93 | ||
94 | std::string rank_str; | |
95 | if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) { | |
96 | // Default: act on rank 0. Will give the user an error if they | |
97 | // try invoking this way when they have more than one filesystem. | |
98 | rank_str = "0"; | |
99 | } | |
100 | ||
101 | r = role_selector.parse(*fsmap, rank_str); | |
102 | if (r != 0) { | |
103 | derr << "Couldn't determine MDS rank." << dendl; | |
104 | return r; | |
105 | } | |
106 | ||
107 | std::string mode; | |
108 | if (arg == argv.end()) { | |
109 | derr << "Missing mode [journal|header|event]" << dendl; | |
110 | return -EINVAL; | |
111 | } | |
112 | mode = std::string(*arg); | |
113 | arg = argv.erase(arg); | |
114 | ||
115 | // RADOS init | |
116 | // ========== | |
117 | r = rados.init_with_context(g_ceph_context); | |
118 | if (r < 0) { | |
119 | derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; | |
120 | return r; | |
121 | } | |
122 | ||
123 | dout(4) << "JournalTool: connecting to RADOS..." << dendl; | |
124 | r = rados.connect(); | |
125 | if (r < 0) { | |
126 | derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; | |
127 | return r; | |
128 | } | |
129 | ||
130 | auto fs = fsmap->get_filesystem(role_selector.get_ns()); | |
131 | assert(fs != nullptr); | |
132 | int64_t const pool_id = fs->mds_map.get_metadata_pool(); | |
133 | dout(4) << "JournalTool: resolving pool " << pool_id << dendl; | |
134 | std::string pool_name; | |
135 | r = rados.pool_reverse_lookup(pool_id, &pool_name); | |
136 | if (r < 0) { | |
137 | derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl; | |
138 | return r; | |
139 | } | |
140 | ||
141 | dout(4) << "JournalTool: creating IoCtx.." << dendl; | |
142 | r = rados.ioctx_create(pool_name.c_str(), input); | |
143 | assert(r == 0); | |
144 | output.dup(input); | |
145 | ||
146 | // Execution | |
147 | // ========= | |
148 | for (auto role : role_selector.get_roles()) { | |
149 | rank = role.rank; | |
150 | dout(4) << "Executing for rank " << rank << dendl; | |
151 | if (mode == std::string("journal")) { | |
152 | r = main_journal(argv); | |
153 | } else if (mode == std::string("header")) { | |
154 | r = main_header(argv); | |
155 | } else if (mode == std::string("event")) { | |
156 | r = main_event(argv); | |
157 | } else { | |
158 | derr << "Bad command '" << mode << "'" << dendl; | |
159 | usage(); | |
160 | return -EINVAL; | |
161 | } | |
162 | ||
163 | if (r != 0) { | |
164 | return r; | |
165 | } | |
166 | } | |
167 | ||
168 | return r; | |
169 | } | |
170 | ||
171 | ||
172 | /** | |
173 | * Handle arguments for 'journal' mode | |
174 | * | |
175 | * This is for operations that act on the journal as a whole. | |
176 | */ | |
177 | int JournalTool::main_journal(std::vector<const char*> &argv) | |
178 | { | |
179 | std::string command = argv[0]; | |
180 | if (command == "inspect") { | |
181 | return journal_inspect(); | |
182 | } else if (command == "export" || command == "import") { | |
91327a77 | 183 | bool force = false; |
7c673cae FG |
184 | if (argv.size() >= 2) { |
185 | std::string const path = argv[1]; | |
91327a77 AA |
186 | if (argv.size() == 3) { |
187 | if (std::string(argv[2]) == "--force") { | |
188 | force = true; | |
189 | } else { | |
190 | std::cerr << "Unknown argument " << argv[1] << std::endl; | |
191 | return -EINVAL; | |
192 | } | |
193 | } | |
194 | return journal_export(path, command == "import", force); | |
7c673cae FG |
195 | } else { |
196 | derr << "Missing path" << dendl; | |
197 | return -EINVAL; | |
198 | } | |
199 | } else if (command == "reset") { | |
200 | bool force = false; | |
201 | if (argv.size() == 2) { | |
202 | if (std::string(argv[1]) == "--force") { | |
203 | force = true; | |
204 | } else { | |
205 | std::cerr << "Unknown argument " << argv[1] << std::endl; | |
206 | usage(); | |
207 | return -EINVAL; | |
208 | } | |
209 | } else if (argv.size() > 2) { | |
210 | std::cerr << "Too many arguments!" << std::endl; | |
211 | usage(); | |
212 | return -EINVAL; | |
213 | } | |
214 | return journal_reset(force); | |
215 | } else { | |
216 | derr << "Bad journal command '" << command << "'" << dendl; | |
217 | return -EINVAL; | |
218 | } | |
219 | } | |
220 | ||
221 | ||
222 | /** | |
223 | * Parse arguments and execute for 'header' mode | |
224 | * | |
225 | * This is for operations that act on the header only. | |
226 | */ | |
227 | int JournalTool::main_header(std::vector<const char*> &argv) | |
228 | { | |
229 | JournalFilter filter; | |
230 | JournalScanner js(input, rank, filter); | |
231 | int r = js.scan(false); | |
232 | if (r < 0) { | |
233 | std::cerr << "Unable to scan journal" << std::endl; | |
234 | return r; | |
235 | } | |
236 | ||
237 | if (!js.header_present) { | |
238 | std::cerr << "Header object not found!" << std::endl; | |
239 | return -ENOENT; | |
240 | } else if (!js.header_valid && js.header == NULL) { | |
241 | // Can't do a read or a single-field write without a copy of the original | |
242 | derr << "Header could not be read!" << dendl; | |
243 | return -ENOENT; | |
244 | } else { | |
245 | assert(js.header != NULL); | |
246 | } | |
247 | ||
248 | if (argv.size() == 0) { | |
249 | derr << "Invalid header command, must be [get|set]" << dendl; | |
250 | return -EINVAL; | |
251 | } | |
252 | std::vector<const char *>::iterator arg = argv.begin(); | |
253 | std::string const command = *arg; | |
254 | arg = argv.erase(arg); | |
255 | ||
256 | if (command == std::string("get")) { | |
257 | // Write JSON journal dump to stdout | |
258 | JSONFormatter jf(true); | |
259 | js.header->dump(&jf); | |
260 | jf.flush(std::cout); | |
261 | std::cout << std::endl; | |
262 | } else if (command == std::string("set")) { | |
263 | // Need two more args <key> <val> | |
264 | if (argv.size() != 2) { | |
265 | derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl; | |
266 | return -EINVAL; | |
267 | } | |
268 | ||
269 | std::string const field_name = *arg; | |
270 | arg = argv.erase(arg); | |
271 | ||
272 | std::string const value_str = *arg; | |
273 | arg = argv.erase(arg); | |
274 | assert(argv.empty()); | |
275 | ||
276 | std::string parse_err; | |
277 | uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err); | |
278 | if (!parse_err.empty()) { | |
279 | derr << "Invalid value '" << value_str << "': " << parse_err << dendl; | |
280 | return -EINVAL; | |
281 | } | |
282 | ||
283 | uint64_t *field = NULL; | |
284 | if (field_name == "trimmed_pos") { | |
285 | field = &(js.header->trimmed_pos); | |
286 | } else if (field_name == "expire_pos") { | |
287 | field = &(js.header->expire_pos); | |
288 | } else if (field_name == "write_pos") { | |
289 | field = &(js.header->write_pos); | |
b32b8144 FG |
290 | } else if (field_name == "pool_id") { |
291 | field = (uint64_t*)(&(js.header->layout.pool_id)); | |
7c673cae FG |
292 | } else { |
293 | derr << "Invalid field '" << field_name << "'" << dendl; | |
294 | return -EINVAL; | |
295 | } | |
296 | ||
297 | std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl; | |
298 | *field = new_val; | |
299 | ||
300 | dout(4) << "Writing object..." << dendl; | |
301 | bufferlist header_bl; | |
302 | ::encode(*(js.header), header_bl); | |
303 | output.write_full(js.obj_name(0), header_bl); | |
304 | dout(4) << "Write complete." << dendl; | |
305 | std::cout << "Successfully updated header." << std::endl; | |
306 | } else { | |
307 | derr << "Bad header command '" << command << "'" << dendl; | |
308 | return -EINVAL; | |
309 | } | |
310 | ||
311 | return 0; | |
312 | } | |
313 | ||
314 | ||
315 | /** | |
316 | * Parse arguments and execute for 'event' mode | |
317 | * | |
318 | * This is for operations that act on LogEvents within the log | |
319 | */ | |
320 | int JournalTool::main_event(std::vector<const char*> &argv) | |
321 | { | |
322 | int r; | |
323 | ||
324 | std::vector<const char*>::iterator arg = argv.begin(); | |
325 | ||
326 | std::string command = *(arg++); | |
31f18b77 | 327 | if (command != "get" && command != "splice" && command != "recover_dentries") { |
7c673cae FG |
328 | derr << "Unknown argument '" << command << "'" << dendl; |
329 | usage(); | |
330 | return -EINVAL; | |
331 | } | |
332 | ||
333 | if (arg == argv.end()) { | |
334 | derr << "Incomplete command line" << dendl; | |
335 | usage(); | |
336 | return -EINVAL; | |
337 | } | |
338 | ||
339 | // Parse filter options | |
340 | // ==================== | |
341 | JournalFilter filter; | |
342 | r = filter.parse_args(argv, arg); | |
343 | if (r) { | |
344 | return r; | |
345 | } | |
346 | ||
347 | // Parse output options | |
348 | // ==================== | |
349 | if (arg == argv.end()) { | |
350 | derr << "Missing output command" << dendl; | |
351 | usage(); | |
352 | } | |
353 | std::string output_style = *(arg++); | |
354 | if (output_style != "binary" && output_style != "json" && | |
355 | output_style != "summary" && output_style != "list") { | |
356 | derr << "Unknown argument: '" << output_style << "'" << dendl; | |
357 | usage(); | |
358 | return -EINVAL; | |
359 | } | |
360 | ||
361 | std::string output_path = "dump"; | |
362 | while(arg != argv.end()) { | |
363 | std::string arg_str; | |
364 | if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { | |
365 | output_path = arg_str; | |
366 | } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", | |
367 | nullptr)) { | |
368 | dout(1) << "Using alternate pool " << arg_str << dendl; | |
369 | int r = rados.ioctx_create(arg_str.c_str(), output); | |
370 | assert(r == 0); | |
371 | other_pool = true; | |
372 | } else { | |
373 | derr << "Unknown argument: '" << *arg << "'" << dendl; | |
374 | usage(); | |
375 | return -EINVAL; | |
376 | } | |
377 | } | |
378 | ||
379 | // Execute command | |
380 | // =============== | |
381 | JournalScanner js(input, rank, filter); | |
382 | if (command == "get") { | |
383 | r = js.scan(); | |
384 | if (r) { | |
385 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
386 | return r; | |
387 | } | |
7c673cae FG |
388 | } else if (command == "recover_dentries") { |
389 | r = js.scan(); | |
390 | if (r) { | |
391 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
392 | return r; | |
393 | } | |
394 | ||
395 | bool dry_run = false; | |
396 | if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) { | |
397 | dry_run = true; | |
398 | } | |
399 | ||
400 | /** | |
401 | * Iterate over log entries, attempting to scavenge from each one | |
402 | */ | |
403 | std::set<inodeno_t> consumed_inos; | |
404 | for (JournalScanner::EventMap::iterator i = js.events.begin(); | |
405 | i != js.events.end(); ++i) { | |
406 | LogEvent *le = i->second.log_event; | |
407 | EMetaBlob const *mb = le->get_metablob(); | |
408 | if (mb) { | |
31f18b77 | 409 | int scav_r = recover_dentries(*mb, dry_run, &consumed_inos); |
7c673cae FG |
410 | if (scav_r) { |
411 | dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec | |
412 | << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl; | |
413 | if (r == 0) { | |
414 | r = scav_r; | |
415 | } | |
416 | // Our goal is to read all we can, so don't stop on errors, but | |
417 | // do record them for possible later output | |
418 | js.errors.insert(std::make_pair(i->first, | |
419 | JournalScanner::EventError(scav_r, cpp_strerror(r)))); | |
420 | } | |
421 | } | |
422 | } | |
423 | ||
424 | /** | |
425 | * Update InoTable to reflect any inode numbers consumed during scavenge | |
426 | */ | |
427 | dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl; | |
428 | if (consumed_inos.size() && !dry_run) { | |
429 | int consume_r = consume_inos(consumed_inos); | |
430 | if (consume_r) { | |
431 | dout(1) << "Error updating InoTable for " << consumed_inos.size() | |
432 | << " consume inos: " << cpp_strerror(consume_r) << dendl; | |
433 | if (r == 0) { | |
434 | r = consume_r; | |
435 | } | |
436 | } | |
437 | } | |
438 | ||
439 | // Remove consumed dentries from lost+found. | |
440 | if (other_pool && !dry_run) { | |
441 | std::set<std::string> found; | |
442 | ||
443 | for (auto i : consumed_inos) { | |
444 | char s[20]; | |
445 | ||
446 | snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); | |
447 | dout(20) << "removing " << s << dendl; | |
448 | found.insert(std::string(s)); | |
449 | } | |
450 | ||
451 | object_t frag_oid; | |
452 | frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, | |
453 | frag_t(), ""); | |
454 | output.omap_rm_keys(frag_oid.name, found); | |
455 | } | |
456 | } else if (command == "splice") { | |
457 | r = js.scan(); | |
458 | if (r) { | |
459 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
460 | return r; | |
461 | } | |
462 | ||
463 | uint64_t start, end; | |
464 | if (filter.get_range(start, end)) { | |
465 | // Special case for range filter: erase a numeric range in the log | |
466 | uint64_t range = end - start; | |
467 | int r = erase_region(js, start, range); | |
468 | if (r) { | |
469 | derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec | |
470 | << ": " << cpp_strerror(r) << dendl; | |
471 | return r; | |
472 | } | |
473 | } else { | |
474 | // General case: erase a collection of individual entries in the log | |
475 | for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) { | |
476 | dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl; | |
477 | ||
478 | int r = erase_region(js, i->first, i->second.raw_size); | |
479 | if (r) { | |
480 | derr << "Failed to erase event 0x" << std::hex << i->first << std::dec | |
481 | << ": " << cpp_strerror(r) << dendl; | |
482 | return r; | |
483 | } | |
484 | } | |
485 | } | |
486 | ||
487 | ||
488 | } else { | |
489 | derr << "Unknown argument '" << command << "'" << dendl; | |
490 | usage(); | |
491 | return -EINVAL; | |
492 | } | |
493 | ||
494 | // Generate output | |
495 | // =============== | |
496 | EventOutput output(js, output_path); | |
497 | int output_result = 0; | |
498 | if (output_style == "binary") { | |
499 | output_result = output.binary(); | |
500 | } else if (output_style == "json") { | |
501 | output_result = output.json(); | |
502 | } else if (output_style == "summary") { | |
503 | output.summary(); | |
504 | } else if (output_style == "list") { | |
505 | output.list(); | |
506 | } else { | |
507 | std::cerr << "Bad output command '" << output_style << "'" << std::endl; | |
508 | return -EINVAL; | |
509 | } | |
510 | ||
511 | if (output_result != 0) { | |
512 | std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl; | |
513 | } | |
514 | ||
515 | return output_result; | |
516 | } | |
517 | ||
518 | /** | |
519 | * Provide the user with information about the condition of the journal, | |
520 | * especially indicating what range of log events is available and where | |
521 | * any gaps or corruptions in the journal are. | |
522 | */ | |
523 | int JournalTool::journal_inspect() | |
524 | { | |
525 | int r; | |
526 | ||
527 | JournalFilter filter; | |
528 | JournalScanner js(input, rank, filter); | |
529 | r = js.scan(); | |
530 | if (r) { | |
531 | std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; | |
532 | return r; | |
533 | } | |
534 | ||
535 | js.report(std::cout); | |
536 | ||
537 | return 0; | |
538 | } | |
539 | ||
540 | ||
541 | /** | |
542 | * Attempt to export a binary dump of the journal. | |
543 | * | |
544 | * This is allowed to fail if the header is malformed or there are | |
545 | * objects inaccessible, in which case the user would have to fall | |
546 | * back to manually listing RADOS objects and extracting them, which | |
547 | * they can do with the ``rados`` CLI. | |
548 | */ | |
91327a77 | 549 | int JournalTool::journal_export(std::string const &path, bool import, bool force) |
7c673cae FG |
550 | { |
551 | int r = 0; | |
552 | JournalScanner js(input, rank); | |
553 | ||
554 | if (!import) { | |
555 | /* | |
556 | * If doing an export, first check that the header is valid and | |
557 | * no objects are missing before trying to dump | |
558 | */ | |
559 | r = js.scan(); | |
560 | if (r < 0) { | |
561 | derr << "Unable to scan journal, assuming badly damaged" << dendl; | |
562 | return r; | |
563 | } | |
564 | if (!js.is_readable()) { | |
565 | derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl; | |
566 | return -EIO; | |
567 | } | |
568 | } | |
569 | ||
570 | /* | |
571 | * Assuming we can cleanly read the journal data, dump it out to a file | |
572 | */ | |
573 | { | |
574 | Dumper dumper; | |
575 | r = dumper.init(mds_role_t(role_selector.get_ns(), rank)); | |
576 | if (r < 0) { | |
577 | derr << "dumper::init failed: " << cpp_strerror(r) << dendl; | |
578 | return r; | |
579 | } | |
580 | if (import) { | |
91327a77 | 581 | r = dumper.undump(path.c_str(), force); |
7c673cae FG |
582 | } else { |
583 | r = dumper.dump(path.c_str()); | |
584 | } | |
7c673cae FG |
585 | } |
586 | ||
587 | return r; | |
588 | } | |
589 | ||
590 | ||
591 | /** | |
592 | * Truncate journal and insert EResetJournal | |
593 | */ | |
594 | int JournalTool::journal_reset(bool hard) | |
595 | { | |
596 | int r = 0; | |
597 | Resetter resetter; | |
598 | r = resetter.init(); | |
599 | if (r < 0) { | |
600 | derr << "resetter::init failed: " << cpp_strerror(r) << dendl; | |
601 | return r; | |
602 | } | |
603 | ||
604 | if (hard) { | |
605 | r = resetter.reset_hard(mds_role_t(role_selector.get_ns(), rank)); | |
606 | } else { | |
607 | r = resetter.reset(mds_role_t(role_selector.get_ns(), rank)); | |
608 | } | |
7c673cae FG |
609 | |
610 | return r; | |
611 | } | |
612 | ||
613 | ||
614 | /** | |
615 | * Selective offline replay which only reads out dentries and writes | |
616 | * them to the backing store iff their version is > what is currently | |
617 | * in the backing store. | |
618 | * | |
619 | * In order to write dentries to the backing store, we may create the | |
620 | * required enclosing dirfrag objects. | |
621 | * | |
622 | * Test this by running scavenge on an unflushed journal, then nuking | |
623 | * it offline, then starting an MDS and seeing that the dentries are | |
624 | * visible. | |
625 | * | |
626 | * @param metablob an EMetaBlob retrieved from the journal | |
627 | * @param dry_run if true, do no writes to RADOS | |
628 | * @param consumed_inos output, populated with any inos inserted | |
629 | * @returns 0 on success, else negative error code | |
630 | */ | |
31f18b77 | 631 | int JournalTool::recover_dentries( |
7c673cae FG |
632 | EMetaBlob const &metablob, |
633 | bool const dry_run, | |
634 | std::set<inodeno_t> *consumed_inos) | |
635 | { | |
636 | assert(consumed_inos != NULL); | |
637 | ||
638 | int r = 0; | |
639 | ||
640 | // Replay fullbits (dentry+inode) | |
641 | for (list<dirfrag_t>::const_iterator lp = metablob.lump_order.begin(); | |
642 | lp != metablob.lump_order.end(); ++lp) | |
643 | { | |
644 | dirfrag_t const &frag = *lp; | |
645 | EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second; | |
646 | lump._decode_bits(); | |
647 | object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, ""); | |
648 | ||
649 | dout(4) << "inspecting lump " << frag_oid.name << dendl; | |
650 | ||
651 | ||
652 | // We will record old fnode version for use in hard link handling | |
653 | // If we don't read an old fnode, take version as zero and write in | |
654 | // all hardlinks we find. | |
655 | version_t old_fnode_version = 0; | |
656 | ||
657 | // Update fnode in omap header of dirfrag object | |
658 | bool write_fnode = false; | |
659 | bufferlist old_fnode_bl; | |
660 | r = input.omap_get_header(frag_oid.name, &old_fnode_bl); | |
661 | if (r == -ENOENT) { | |
662 | // Creating dirfrag from scratch | |
663 | dout(4) << "failed to read OMAP header from directory fragment " | |
664 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
665 | write_fnode = true; | |
666 | // Note: creating the dirfrag *without* a backtrace, relying on | |
667 | // MDS to regenerate backtraces on read or in FSCK | |
668 | } else if (r == 0) { | |
669 | // Conditionally update existing omap header | |
670 | fnode_t old_fnode; | |
671 | bufferlist::iterator old_fnode_iter = old_fnode_bl.begin(); | |
672 | try { | |
673 | old_fnode.decode(old_fnode_iter); | |
674 | dout(4) << "frag " << frag_oid.name << " fnode old v" << | |
675 | old_fnode.version << " vs new v" << lump.fnode.version << dendl; | |
676 | old_fnode_version = old_fnode.version; | |
677 | write_fnode = old_fnode_version < lump.fnode.version; | |
678 | } catch (const buffer::error &err) { | |
679 | dout(1) << "frag " << frag_oid.name | |
680 | << " is corrupt, overwriting" << dendl; | |
681 | write_fnode = true; | |
682 | } | |
683 | } else { | |
684 | // Unexpected error | |
685 | dout(4) << "failed to read OMAP header from directory fragment " | |
686 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
687 | return r; | |
688 | } | |
689 | ||
690 | if ((other_pool || write_fnode) && !dry_run) { | |
691 | dout(4) << "writing fnode to omap header" << dendl; | |
692 | bufferlist fnode_bl; | |
693 | lump.fnode.encode(fnode_bl); | |
694 | if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { | |
695 | r = output.omap_set_header(frag_oid.name, fnode_bl); | |
696 | } | |
697 | if (r != 0) { | |
698 | derr << "Failed to write fnode for frag object " | |
699 | << frag_oid.name << dendl; | |
700 | return r; | |
701 | } | |
702 | } | |
703 | ||
704 | std::set<std::string> read_keys; | |
705 | ||
706 | // Compose list of potentially-existing dentries we would like to fetch | |
707 | list<ceph::shared_ptr<EMetaBlob::fullbit> > const &fb_list = | |
708 | lump.get_dfull(); | |
709 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi = | |
710 | fb_list.begin(); fbi != fb_list.end(); ++fbi) { | |
711 | EMetaBlob::fullbit const &fb = *(*fbi); | |
712 | ||
713 | // Get a key like "foobar_head" | |
714 | std::string key; | |
715 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
716 | dn_key.encode(key); | |
717 | read_keys.insert(key); | |
718 | } | |
719 | ||
720 | list<EMetaBlob::remotebit> const &rb_list = | |
721 | lump.get_dremote(); | |
722 | for (list<EMetaBlob::remotebit>::const_iterator rbi = | |
723 | rb_list.begin(); rbi != rb_list.end(); ++rbi) { | |
724 | EMetaBlob::remotebit const &rb = *rbi; | |
725 | ||
726 | // Get a key like "foobar_head" | |
727 | std::string key; | |
728 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
729 | dn_key.encode(key); | |
730 | read_keys.insert(key); | |
731 | } | |
732 | ||
c07f9fc5 FG |
733 | list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull(); |
734 | for (auto& nb : nb_list) { | |
735 | // Get a key like "foobar_head" | |
736 | std::string key; | |
737 | dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); | |
738 | dn_key.encode(key); | |
739 | read_keys.insert(key); | |
740 | } | |
741 | ||
7c673cae FG |
742 | // Perform bulk read of existing dentries |
743 | std::map<std::string, bufferlist> read_vals; | |
744 | r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
745 | if (r == -ENOENT && other_pool) { | |
746 | r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
747 | } | |
748 | if (r != 0) { | |
749 | derr << "unexpected error reading fragment object " | |
750 | << frag_oid.name << ": " << cpp_strerror(r) << dendl; | |
751 | return r; | |
752 | } | |
753 | ||
754 | // Compose list of dentries we will write back | |
755 | std::map<std::string, bufferlist> write_vals; | |
756 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi = | |
757 | fb_list.begin(); fbi != fb_list.end(); ++fbi) { | |
758 | EMetaBlob::fullbit const &fb = *(*fbi); | |
759 | ||
760 | // Get a key like "foobar_head" | |
761 | std::string key; | |
762 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
763 | dn_key.encode(key); | |
764 | ||
765 | dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn | |
766 | << dendl; | |
767 | bool write_dentry = false; | |
768 | if (read_vals.find(key) == read_vals.end()) { | |
769 | dout(4) << "dentry did not already exist, will create" << dendl; | |
770 | write_dentry = true; | |
771 | } else { | |
772 | dout(4) << "dentry " << key << " existed already" << dendl; | |
773 | dout(4) << "dentry exists, checking versions..." << dendl; | |
774 | bufferlist &old_dentry = read_vals[key]; | |
775 | // Decode dentry+inode | |
776 | bufferlist::iterator q = old_dentry.begin(); | |
777 | ||
778 | snapid_t dnfirst; | |
779 | ::decode(dnfirst, q); | |
780 | char dentry_type; | |
781 | ::decode(dentry_type, q); | |
782 | ||
783 | if (dentry_type == 'L') { | |
784 | // leave write_dentry false, we have no version to | |
785 | // compare with in a hardlink, so it's not safe to | |
786 | // squash over it with what's in this fullbit | |
787 | dout(10) << "Existing remote inode in slot to be (maybe) written " | |
788 | << "by a full inode from the journal dn '" << fb.dn.c_str() | |
789 | << "' with lump fnode version " << lump.fnode.version | |
790 | << "vs existing fnode version " << old_fnode_version << dendl; | |
791 | write_dentry = old_fnode_version < lump.fnode.version; | |
792 | } else if (dentry_type == 'I') { | |
793 | // Read out inode version to compare with backing store | |
794 | InodeStore inode; | |
795 | inode.decode_bare(q); | |
796 | dout(4) << "decoded embedded inode version " | |
797 | << inode.inode.version << " vs fullbit version " | |
798 | << fb.inode.version << dendl; | |
799 | if (inode.inode.version < fb.inode.version) { | |
800 | write_dentry = true; | |
801 | } | |
802 | } else { | |
803 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
804 | "journal" << dendl; | |
805 | write_dentry = true; | |
806 | } | |
807 | } | |
808 | ||
809 | if ((other_pool || write_dentry) && !dry_run) { | |
810 | dout(4) << "writing I dentry " << key << " into frag " | |
811 | << frag_oid.name << dendl; | |
812 | ||
813 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
814 | bufferlist dentry_bl; | |
815 | ::encode(fb.dnfirst, dentry_bl); | |
816 | ::encode('I', dentry_bl); | |
817 | encode_fullbit_as_inode(fb, true, &dentry_bl); | |
818 | ||
819 | // Record for writing to RADOS | |
820 | write_vals[key] = dentry_bl; | |
821 | consumed_inos->insert(fb.inode.ino); | |
822 | } | |
823 | } | |
824 | ||
825 | for (list<EMetaBlob::remotebit>::const_iterator rbi = | |
826 | rb_list.begin(); rbi != rb_list.end(); ++rbi) { | |
827 | EMetaBlob::remotebit const &rb = *rbi; | |
828 | ||
829 | // Get a key like "foobar_head" | |
830 | std::string key; | |
831 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
832 | dn_key.encode(key); | |
833 | ||
834 | dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn | |
835 | << dendl; | |
836 | bool write_dentry = false; | |
837 | if (read_vals.find(key) == read_vals.end()) { | |
838 | dout(4) << "dentry did not already exist, will create" << dendl; | |
839 | write_dentry = true; | |
840 | } else { | |
841 | dout(4) << "dentry " << key << " existed already" << dendl; | |
842 | dout(4) << "dentry exists, checking versions..." << dendl; | |
843 | bufferlist &old_dentry = read_vals[key]; | |
844 | // Decode dentry+inode | |
845 | bufferlist::iterator q = old_dentry.begin(); | |
846 | ||
847 | snapid_t dnfirst; | |
848 | ::decode(dnfirst, q); | |
849 | char dentry_type; | |
850 | ::decode(dentry_type, q); | |
851 | ||
852 | if (dentry_type == 'L') { | |
853 | dout(10) << "Existing hardlink inode in slot to be (maybe) written " | |
854 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
855 | << "' with lump fnode version " << lump.fnode.version | |
856 | << "vs existing fnode version " << old_fnode_version << dendl; | |
857 | write_dentry = old_fnode_version < lump.fnode.version; | |
858 | } else if (dentry_type == 'I') { | |
859 | dout(10) << "Existing full inode in slot to be (maybe) written " | |
860 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
861 | << "' with lump fnode version " << lump.fnode.version | |
862 | << "vs existing fnode version " << old_fnode_version << dendl; | |
863 | write_dentry = old_fnode_version < lump.fnode.version; | |
864 | } else { | |
865 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
866 | "journal" << dendl; | |
867 | write_dentry = true; | |
868 | } | |
869 | } | |
870 | ||
871 | if ((other_pool || write_dentry) && !dry_run) { | |
872 | dout(4) << "writing L dentry " << key << " into frag " | |
873 | << frag_oid.name << dendl; | |
874 | ||
875 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
876 | bufferlist dentry_bl; | |
877 | ::encode(rb.dnfirst, dentry_bl); | |
878 | ::encode('L', dentry_bl); | |
879 | ::encode(rb.ino, dentry_bl); | |
880 | ::encode(rb.d_type, dentry_bl); | |
881 | ||
882 | // Record for writing to RADOS | |
883 | write_vals[key] = dentry_bl; | |
884 | consumed_inos->insert(rb.ino); | |
885 | } | |
886 | } | |
887 | ||
c07f9fc5 FG |
888 | std::set<std::string> null_vals; |
889 | for (auto& nb : nb_list) { | |
890 | std::string key; | |
891 | dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); | |
892 | dn_key.encode(key); | |
893 | ||
894 | dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn | |
895 | << dendl; | |
896 | ||
897 | auto it = read_vals.find(key); | |
898 | if (it != read_vals.end()) { | |
899 | dout(4) << "dentry exists, will remove" << dendl; | |
900 | ||
901 | bufferlist::iterator q = it->second.begin(); | |
902 | snapid_t dnfirst; | |
903 | ::decode(dnfirst, q); | |
904 | char dentry_type; | |
905 | ::decode(dentry_type, q); | |
906 | ||
907 | bool remove_dentry = false; | |
908 | if (dentry_type == 'L') { | |
909 | dout(10) << "Existing hardlink inode in slot to be (maybe) removed " | |
910 | << "by null journal dn '" << nb.dn.c_str() | |
911 | << "' with lump fnode version " << lump.fnode.version | |
912 | << "vs existing fnode version " << old_fnode_version << dendl; | |
913 | remove_dentry = old_fnode_version < lump.fnode.version; | |
914 | } else if (dentry_type == 'I') { | |
915 | dout(10) << "Existing full inode in slot to be (maybe) removed " | |
916 | << "by null journal dn '" << nb.dn.c_str() | |
917 | << "' with lump fnode version " << lump.fnode.version | |
918 | << "vs existing fnode version " << old_fnode_version << dendl; | |
919 | remove_dentry = old_fnode_version < lump.fnode.version; | |
920 | } else { | |
921 | dout(4) << "corrupt dentry in backing store, will remove" << dendl; | |
922 | remove_dentry = true; | |
923 | } | |
924 | ||
925 | if (remove_dentry) | |
926 | null_vals.insert(key); | |
927 | } | |
928 | } | |
929 | ||
7c673cae FG |
930 | // Write back any new/changed dentries |
931 | if (!write_vals.empty()) { | |
932 | r = output.omap_set(frag_oid.name, write_vals); | |
933 | if (r != 0) { | |
934 | derr << "error writing dentries to " << frag_oid.name | |
935 | << ": " << cpp_strerror(r) << dendl; | |
936 | return r; | |
937 | } | |
938 | } | |
c07f9fc5 FG |
939 | |
940 | // remove any null dentries | |
941 | if (!null_vals.empty()) { | |
942 | r = output.omap_rm_keys(frag_oid.name, null_vals); | |
943 | if (r != 0) { | |
944 | derr << "error removing dentries from " << frag_oid.name | |
945 | << ": " << cpp_strerror(r) << dendl; | |
946 | return r; | |
947 | } | |
948 | } | |
7c673cae FG |
949 | } |
950 | ||
951 | /* Now that we've looked at the dirlumps, we finally pay attention to | |
952 | * the roots (i.e. inodes without ancestry). This is necessary in order | |
953 | * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally | |
954 | * important because clients use them to infer completeness | |
955 | * of directories | |
956 | */ | |
957 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator p = | |
958 | metablob.roots.begin(); p != metablob.roots.end(); ++p) { | |
959 | EMetaBlob::fullbit const &fb = *(*p); | |
960 | inodeno_t ino = fb.inode.ino; | |
961 | dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl; | |
962 | ||
963 | object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); | |
964 | dout(4) << "object id " << root_oid.name << dendl; | |
965 | ||
966 | bool write_root_ino = false; | |
967 | bufferlist old_root_ino_bl; | |
968 | r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); | |
969 | if (r == -ENOENT) { | |
970 | dout(4) << "root does not exist, will create" << dendl; | |
971 | write_root_ino = true; | |
972 | } else if (r >= 0) { | |
973 | r = 0; | |
974 | InodeStore old_inode; | |
975 | dout(4) << "root exists, will modify (" << old_root_ino_bl.length() | |
976 | << ")" << dendl; | |
977 | bufferlist::iterator inode_bl_iter = old_root_ino_bl.begin(); | |
978 | std::string magic; | |
979 | ::decode(magic, inode_bl_iter); | |
980 | if (magic == CEPH_FS_ONDISK_MAGIC) { | |
981 | dout(4) << "magic ok" << dendl; | |
982 | old_inode.decode(inode_bl_iter); | |
983 | ||
984 | if (old_inode.inode.version < fb.inode.version) { | |
985 | write_root_ino = true; | |
986 | } | |
987 | } else { | |
988 | dout(4) << "magic bad: '" << magic << "'" << dendl; | |
989 | write_root_ino = true; | |
990 | } | |
991 | } else { | |
992 | derr << "error reading root inode object " << root_oid.name | |
993 | << ": " << cpp_strerror(r) << dendl; | |
994 | return r; | |
995 | } | |
996 | ||
997 | if (write_root_ino && !dry_run) { | |
998 | dout(4) << "writing root ino " << root_oid.name | |
999 | << " version " << fb.inode.version << dendl; | |
1000 | ||
1001 | // Compose: root ino format is magic,InodeStore(bare=false) | |
1002 | bufferlist new_root_ino_bl; | |
1003 | ::encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl); | |
1004 | encode_fullbit_as_inode(fb, false, &new_root_ino_bl); | |
1005 | ||
1006 | // Write to RADOS | |
1007 | r = output.write_full(root_oid.name, new_root_ino_bl); | |
1008 | if (r != 0) { | |
1009 | derr << "error writing inode object " << root_oid.name | |
1010 | << ": " << cpp_strerror(r) << dendl; | |
1011 | return r; | |
1012 | } | |
1013 | } | |
1014 | } | |
1015 | ||
1016 | return r; | |
1017 | } | |
1018 | ||
1019 | ||
7c673cae FG |
1020 | /** |
1021 | * Erase a region of the log by overwriting it with ENoOp | |
1022 | * | |
1023 | */ | |
1024 | int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length) | |
1025 | { | |
1026 | // To erase this region, we use our preamble, the encoding overhead | |
1027 | // of an ENoOp, and our trailing start ptr. Calculate how much padding | |
1028 | // is needed inside the ENoOp to make up the difference. | |
1029 | bufferlist tmp; | |
1030 | ENoOp enoop(0); | |
1031 | enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1032 | ||
1033 | dout(4) << "erase_region " << pos << " len=" << length << dendl; | |
1034 | ||
1035 | // FIXME: get the preamble/postamble length via JournalStream | |
1036 | int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t); | |
1037 | dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl; | |
1038 | ||
1039 | if (padding < 0) { | |
1040 | derr << "Erase region " << length << " too short" << dendl; | |
1041 | return -EINVAL; | |
1042 | } | |
1043 | ||
1044 | // Serialize an ENoOp with the correct amount of padding | |
1045 | enoop = ENoOp(padding); | |
1046 | bufferlist entry; | |
1047 | enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1048 | JournalStream stream(JOURNAL_FORMAT_RESILIENT); | |
1049 | ||
1050 | // Serialize region of log stream | |
1051 | bufferlist log_data; | |
1052 | stream.write(entry, &log_data, pos); | |
1053 | ||
1054 | dout(4) << "erase_region data length " << log_data.length() << dendl; | |
1055 | assert(log_data.length() == length); | |
1056 | ||
1057 | // Write log stream region to RADOS | |
1058 | // FIXME: get object size somewhere common to scan_events | |
1059 | uint32_t object_size = g_conf->mds_log_segment_size; | |
1060 | if (object_size == 0) { | |
1061 | // Default layout object size | |
1062 | object_size = file_layout_t::get_default().object_size; | |
1063 | } | |
1064 | ||
1065 | uint64_t write_offset = pos; | |
1066 | uint64_t obj_offset = (pos / object_size); | |
1067 | int r = 0; | |
1068 | while(log_data.length()) { | |
1069 | std::string const oid = js.obj_name(obj_offset); | |
1070 | uint32_t offset_in_obj = write_offset % object_size; | |
1071 | uint32_t write_len = min(log_data.length(), object_size - offset_in_obj); | |
1072 | ||
1073 | r = output.write(oid, log_data, write_len, offset_in_obj); | |
1074 | if (r < 0) { | |
1075 | return r; | |
1076 | } else { | |
1077 | dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl; | |
1078 | r = 0; | |
1079 | } | |
1080 | ||
1081 | log_data.splice(0, write_len); | |
1082 | write_offset += write_len; | |
1083 | obj_offset++; | |
1084 | } | |
1085 | ||
1086 | return r; | |
1087 | } | |
1088 | ||
1089 | /** | |
1090 | * Given an EMetaBlob::fullbit containing an inode, write out | |
1091 | * the encoded inode in the format used by InodeStore (i.e. the | |
1092 | * backing store format) | |
1093 | * | |
1094 | * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use | |
1095 | * on an offline InodeStore instance. It's way simpler, because we are just | |
1096 | * uncritically hauling the data between structs. | |
1097 | * | |
1098 | * @param fb a fullbit extracted from a journal entry | |
1099 | * @param bare if true, leave out [EN|DE]CODE_START decoration | |
1100 | * @param out_bl output, write serialized inode to this bufferlist | |
1101 | */ | |
1102 | void JournalTool::encode_fullbit_as_inode( | |
1103 | const EMetaBlob::fullbit &fb, | |
1104 | const bool bare, | |
1105 | bufferlist *out_bl) | |
1106 | { | |
1107 | assert(out_bl != NULL); | |
1108 | ||
1109 | // Compose InodeStore | |
1110 | InodeStore new_inode; | |
1111 | new_inode.inode = fb.inode; | |
1112 | new_inode.xattrs = fb.xattrs; | |
1113 | new_inode.dirfragtree = fb.dirfragtree; | |
1114 | new_inode.snap_blob = fb.snapbl; | |
94b18763 | 1115 | new_inode.symlink = mempool::mds_co::string(boost::string_view(fb.symlink)); |
7c673cae FG |
1116 | new_inode.old_inodes = fb.old_inodes; |
1117 | ||
1118 | // Serialize InodeStore | |
1119 | if (bare) { | |
1120 | new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1121 | } else { | |
1122 | new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1123 | } | |
1124 | } | |
1125 | ||
1126 | /** | |
1127 | * Given a list of inode numbers known to be in use by | |
1128 | * inodes in the backing store, ensure that none of these | |
1129 | * numbers are listed as free in the InoTables in the | |
1130 | * backing store. | |
1131 | * | |
1132 | * Used after injecting inodes into the backing store, to | |
1133 | * ensure that the same inode numbers are not subsequently | |
1134 | * used for new files during ordinary operation. | |
1135 | * | |
1136 | * @param inos list of inode numbers to be removed from | |
1137 | * free lists in InoTables | |
1138 | * @returns 0 on success, else negative error code | |
1139 | */ | |
1140 | int JournalTool::consume_inos(const std::set<inodeno_t> &inos) | |
1141 | { | |
1142 | int r = 0; | |
1143 | ||
1144 | // InoTable is a per-MDS structure, so iterate over assigned ranks | |
1145 | auto fs = fsmap->get_filesystem(role_selector.get_ns()); | |
1146 | std::set<mds_rank_t> in_ranks; | |
1147 | fs->mds_map.get_mds_set(in_ranks); | |
1148 | ||
1149 | for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin(); | |
1150 | rank_i != in_ranks.end(); ++rank_i) | |
1151 | { | |
1152 | // Compose object name | |
1153 | std::ostringstream oss; | |
1154 | oss << "mds" << *rank_i << "_inotable"; | |
1155 | object_t inotable_oid = object_t(oss.str()); | |
1156 | ||
1157 | // Read object | |
1158 | bufferlist inotable_bl; | |
1159 | int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); | |
1160 | if (read_r < 0) { | |
1161 | // Things are really bad if we can't read inotable. Beyond our powers. | |
1162 | derr << "unable to read inotable '" << inotable_oid.name << "': " | |
1163 | << cpp_strerror(read_r) << dendl; | |
1164 | r = r ? r : read_r; | |
1165 | continue; | |
1166 | } | |
1167 | ||
1168 | // Deserialize InoTable | |
1169 | version_t inotable_ver; | |
1170 | bufferlist::iterator q = inotable_bl.begin(); | |
1171 | ::decode(inotable_ver, q); | |
1172 | InoTable ino_table(NULL); | |
1173 | ino_table.decode(q); | |
1174 | ||
1175 | // Update InoTable in memory | |
1176 | bool inotable_modified = false; | |
1177 | for (std::set<inodeno_t>::iterator i = inos.begin(); | |
1178 | i != inos.end(); ++i) | |
1179 | { | |
1180 | const inodeno_t ino = *i; | |
1181 | if (ino_table.force_consume(ino)) { | |
1182 | dout(4) << "Used ino 0x" << std::hex << ino << std::dec | |
1183 | << " requires inotable update" << dendl; | |
1184 | inotable_modified = true; | |
1185 | } | |
1186 | } | |
1187 | ||
1188 | // Serialize and write InoTable | |
1189 | if (inotable_modified) { | |
1190 | inotable_ver += 1; | |
1191 | dout(4) << "writing modified inotable version " << inotable_ver << dendl; | |
1192 | bufferlist inotable_new_bl; | |
1193 | ::encode(inotable_ver, inotable_new_bl); | |
1194 | ino_table.encode_state(inotable_new_bl); | |
1195 | int write_r = output.write_full(inotable_oid.name, inotable_new_bl); | |
1196 | if (write_r != 0) { | |
1197 | derr << "error writing modified inotable " << inotable_oid.name | |
1198 | << ": " << cpp_strerror(write_r) << dendl; | |
1199 | r = r ? r : read_r; | |
1200 | continue; | |
1201 | } | |
1202 | } | |
1203 | } | |
1204 | ||
1205 | return r; | |
1206 | } | |
1207 |