]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * ceph - scalable distributed file system | |
5 | * | |
6 | * copyright (c) 2014 john spray <john.spray@inktank.com> | |
7 | * | |
8 | * this is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the gnu lesser general public | |
10 | * license version 2.1, as published by the free software | |
11 | * foundation. see file copying. | |
12 | */ | |
13 | ||
14 | ||
15 | #include <sstream> | |
16 | ||
17 | #include "common/ceph_argparse.h" | |
18 | #include "common/errno.h" | |
19 | #include "osdc/Journaler.h" | |
20 | #include "mds/mdstypes.h" | |
21 | #include "mds/LogEvent.h" | |
22 | #include "mds/InoTable.h" | |
23 | ||
24 | #include "mds/events/ENoOp.h" | |
25 | #include "mds/events/EUpdate.h" | |
26 | ||
27 | #include "JournalScanner.h" | |
28 | #include "EventOutput.h" | |
29 | #include "Dumper.h" | |
30 | #include "Resetter.h" | |
31 | ||
32 | #include "JournalTool.h" | |
33 | ||
34 | ||
35 | #define dout_context g_ceph_context | |
36 | #define dout_subsys ceph_subsys_mds | |
37 | #undef dout_prefix | |
38 | #define dout_prefix *_dout << __func__ << ": " | |
39 | ||
40 | ||
41 | ||
42 | void JournalTool::usage() | |
43 | { | |
44 | std::cout << "Usage: \n" | |
45 | << " cephfs-journal-tool [options] journal <command>\n" | |
46 | << " <command>:\n" | |
47 | << " inspect\n" | |
48 | << " import <path>\n" | |
49 | << " export <path>\n" | |
50 | << " reset [--force]\n" | |
51 | << " cephfs-journal-tool [options] header <get|set <field> <value>\n" | |
b32b8144 | 52 | << " <field>: [trimmed_pos|expire_pos|write_pos|pool_id]" |
31f18b77 | 53 | << " cephfs-journal-tool [options] event <effect> <selector> <output> [special options]\n" |
7c673cae FG |
54 | << " <selector>:\n" |
55 | << " --range=<start>..<end>\n" | |
56 | << " --path=<substring>\n" | |
57 | << " --inode=<integer>\n" | |
58 | << " --type=<UPDATE|OPEN|SESSION...><\n" | |
59 | << " --frag=<ino>.<frag> [--dname=<dentry string>]\n" | |
7c673cae | 60 | << " --client=<session id integer>\n" |
31f18b77 | 61 | << " <effect>: [get|recover_dentries|splice]\n" |
7c673cae FG |
62 | << " <output>: [summary|list|binary|json] [--path <path>]\n" |
63 | << "\n" | |
31f18b77 | 64 | << "General options:\n" |
7c673cae FG |
65 | << " --rank=filesystem:mds-rank Journal rank (required if multiple\n" |
66 | << " file systems, default is rank 0 on\n" | |
31f18b77 FG |
67 | << " the only filesystem otherwise.\n" |
68 | << "\n" | |
69 | << "Special options\n" | |
70 | << " --alternate-pool <name> Alternative metadata pool to target\n" | |
71 | << " when using recover_dentries.\n"; | |
7c673cae FG |
72 | |
73 | generic_client_usage(); | |
74 | } | |
75 | ||
76 | ||
77 | /** | |
78 | * Handle arguments and hand off to journal/header/event mode | |
79 | */ | |
80 | int JournalTool::main(std::vector<const char*> &argv) | |
81 | { | |
82 | int r; | |
83 | ||
84 | dout(10) << "JournalTool::main " << dendl; | |
85 | // Common arg parsing | |
86 | // ================== | |
87 | if (argv.empty()) { | |
88 | usage(); | |
89 | return -EINVAL; | |
90 | } | |
91 | ||
92 | std::vector<const char*>::iterator arg = argv.begin(); | |
93 | ||
94 | std::string rank_str; | |
95 | if(!ceph_argparse_witharg(argv, arg, &rank_str, "--rank", (char*)NULL)) { | |
96 | // Default: act on rank 0. Will give the user an error if they | |
97 | // try invoking this way when they have more than one filesystem. | |
98 | rank_str = "0"; | |
99 | } | |
100 | ||
101 | r = role_selector.parse(*fsmap, rank_str); | |
102 | if (r != 0) { | |
103 | derr << "Couldn't determine MDS rank." << dendl; | |
104 | return r; | |
105 | } | |
106 | ||
107 | std::string mode; | |
108 | if (arg == argv.end()) { | |
109 | derr << "Missing mode [journal|header|event]" << dendl; | |
110 | return -EINVAL; | |
111 | } | |
112 | mode = std::string(*arg); | |
113 | arg = argv.erase(arg); | |
114 | ||
115 | // RADOS init | |
116 | // ========== | |
117 | r = rados.init_with_context(g_ceph_context); | |
118 | if (r < 0) { | |
119 | derr << "RADOS unavailable, cannot scan filesystem journal" << dendl; | |
120 | return r; | |
121 | } | |
122 | ||
123 | dout(4) << "JournalTool: connecting to RADOS..." << dendl; | |
124 | r = rados.connect(); | |
125 | if (r < 0) { | |
126 | derr << "couldn't connect to cluster: " << cpp_strerror(r) << dendl; | |
127 | return r; | |
128 | } | |
129 | ||
130 | auto fs = fsmap->get_filesystem(role_selector.get_ns()); | |
131 | assert(fs != nullptr); | |
132 | int64_t const pool_id = fs->mds_map.get_metadata_pool(); | |
133 | dout(4) << "JournalTool: resolving pool " << pool_id << dendl; | |
134 | std::string pool_name; | |
135 | r = rados.pool_reverse_lookup(pool_id, &pool_name); | |
136 | if (r < 0) { | |
137 | derr << "Pool " << pool_id << " named in MDS map not found in RADOS!" << dendl; | |
138 | return r; | |
139 | } | |
140 | ||
141 | dout(4) << "JournalTool: creating IoCtx.." << dendl; | |
142 | r = rados.ioctx_create(pool_name.c_str(), input); | |
143 | assert(r == 0); | |
144 | output.dup(input); | |
145 | ||
146 | // Execution | |
147 | // ========= | |
148 | for (auto role : role_selector.get_roles()) { | |
149 | rank = role.rank; | |
150 | dout(4) << "Executing for rank " << rank << dendl; | |
151 | if (mode == std::string("journal")) { | |
152 | r = main_journal(argv); | |
153 | } else if (mode == std::string("header")) { | |
154 | r = main_header(argv); | |
155 | } else if (mode == std::string("event")) { | |
156 | r = main_event(argv); | |
157 | } else { | |
158 | derr << "Bad command '" << mode << "'" << dendl; | |
159 | usage(); | |
160 | return -EINVAL; | |
161 | } | |
162 | ||
163 | if (r != 0) { | |
164 | return r; | |
165 | } | |
166 | } | |
167 | ||
168 | return r; | |
169 | } | |
170 | ||
171 | ||
172 | /** | |
173 | * Handle arguments for 'journal' mode | |
174 | * | |
175 | * This is for operations that act on the journal as a whole. | |
176 | */ | |
177 | int JournalTool::main_journal(std::vector<const char*> &argv) | |
178 | { | |
179 | std::string command = argv[0]; | |
180 | if (command == "inspect") { | |
181 | return journal_inspect(); | |
182 | } else if (command == "export" || command == "import") { | |
183 | if (argv.size() >= 2) { | |
184 | std::string const path = argv[1]; | |
185 | return journal_export(path, command == "import"); | |
186 | } else { | |
187 | derr << "Missing path" << dendl; | |
188 | return -EINVAL; | |
189 | } | |
190 | } else if (command == "reset") { | |
191 | bool force = false; | |
192 | if (argv.size() == 2) { | |
193 | if (std::string(argv[1]) == "--force") { | |
194 | force = true; | |
195 | } else { | |
196 | std::cerr << "Unknown argument " << argv[1] << std::endl; | |
197 | usage(); | |
198 | return -EINVAL; | |
199 | } | |
200 | } else if (argv.size() > 2) { | |
201 | std::cerr << "Too many arguments!" << std::endl; | |
202 | usage(); | |
203 | return -EINVAL; | |
204 | } | |
205 | return journal_reset(force); | |
206 | } else { | |
207 | derr << "Bad journal command '" << command << "'" << dendl; | |
208 | return -EINVAL; | |
209 | } | |
210 | } | |
211 | ||
212 | ||
213 | /** | |
214 | * Parse arguments and execute for 'header' mode | |
215 | * | |
216 | * This is for operations that act on the header only. | |
217 | */ | |
218 | int JournalTool::main_header(std::vector<const char*> &argv) | |
219 | { | |
220 | JournalFilter filter; | |
221 | JournalScanner js(input, rank, filter); | |
222 | int r = js.scan(false); | |
223 | if (r < 0) { | |
224 | std::cerr << "Unable to scan journal" << std::endl; | |
225 | return r; | |
226 | } | |
227 | ||
228 | if (!js.header_present) { | |
229 | std::cerr << "Header object not found!" << std::endl; | |
230 | return -ENOENT; | |
231 | } else if (!js.header_valid && js.header == NULL) { | |
232 | // Can't do a read or a single-field write without a copy of the original | |
233 | derr << "Header could not be read!" << dendl; | |
234 | return -ENOENT; | |
235 | } else { | |
236 | assert(js.header != NULL); | |
237 | } | |
238 | ||
239 | if (argv.size() == 0) { | |
240 | derr << "Invalid header command, must be [get|set]" << dendl; | |
241 | return -EINVAL; | |
242 | } | |
243 | std::vector<const char *>::iterator arg = argv.begin(); | |
244 | std::string const command = *arg; | |
245 | arg = argv.erase(arg); | |
246 | ||
247 | if (command == std::string("get")) { | |
248 | // Write JSON journal dump to stdout | |
249 | JSONFormatter jf(true); | |
250 | js.header->dump(&jf); | |
251 | jf.flush(std::cout); | |
252 | std::cout << std::endl; | |
253 | } else if (command == std::string("set")) { | |
254 | // Need two more args <key> <val> | |
255 | if (argv.size() != 2) { | |
256 | derr << "'set' requires two arguments <trimmed_pos|expire_pos|write_pos> <value>" << dendl; | |
257 | return -EINVAL; | |
258 | } | |
259 | ||
260 | std::string const field_name = *arg; | |
261 | arg = argv.erase(arg); | |
262 | ||
263 | std::string const value_str = *arg; | |
264 | arg = argv.erase(arg); | |
265 | assert(argv.empty()); | |
266 | ||
267 | std::string parse_err; | |
268 | uint64_t new_val = strict_strtoll(value_str.c_str(), 0, &parse_err); | |
269 | if (!parse_err.empty()) { | |
270 | derr << "Invalid value '" << value_str << "': " << parse_err << dendl; | |
271 | return -EINVAL; | |
272 | } | |
273 | ||
274 | uint64_t *field = NULL; | |
275 | if (field_name == "trimmed_pos") { | |
276 | field = &(js.header->trimmed_pos); | |
277 | } else if (field_name == "expire_pos") { | |
278 | field = &(js.header->expire_pos); | |
279 | } else if (field_name == "write_pos") { | |
280 | field = &(js.header->write_pos); | |
b32b8144 FG |
281 | } else if (field_name == "pool_id") { |
282 | field = (uint64_t*)(&(js.header->layout.pool_id)); | |
7c673cae FG |
283 | } else { |
284 | derr << "Invalid field '" << field_name << "'" << dendl; | |
285 | return -EINVAL; | |
286 | } | |
287 | ||
288 | std::cout << "Updating " << field_name << std::hex << " 0x" << *field << " -> 0x" << new_val << std::dec << std::endl; | |
289 | *field = new_val; | |
290 | ||
291 | dout(4) << "Writing object..." << dendl; | |
292 | bufferlist header_bl; | |
293 | ::encode(*(js.header), header_bl); | |
294 | output.write_full(js.obj_name(0), header_bl); | |
295 | dout(4) << "Write complete." << dendl; | |
296 | std::cout << "Successfully updated header." << std::endl; | |
297 | } else { | |
298 | derr << "Bad header command '" << command << "'" << dendl; | |
299 | return -EINVAL; | |
300 | } | |
301 | ||
302 | return 0; | |
303 | } | |
304 | ||
305 | ||
306 | /** | |
307 | * Parse arguments and execute for 'event' mode | |
308 | * | |
309 | * This is for operations that act on LogEvents within the log | |
310 | */ | |
311 | int JournalTool::main_event(std::vector<const char*> &argv) | |
312 | { | |
313 | int r; | |
314 | ||
315 | std::vector<const char*>::iterator arg = argv.begin(); | |
316 | ||
317 | std::string command = *(arg++); | |
31f18b77 | 318 | if (command != "get" && command != "splice" && command != "recover_dentries") { |
7c673cae FG |
319 | derr << "Unknown argument '" << command << "'" << dendl; |
320 | usage(); | |
321 | return -EINVAL; | |
322 | } | |
323 | ||
324 | if (arg == argv.end()) { | |
325 | derr << "Incomplete command line" << dendl; | |
326 | usage(); | |
327 | return -EINVAL; | |
328 | } | |
329 | ||
330 | // Parse filter options | |
331 | // ==================== | |
332 | JournalFilter filter; | |
333 | r = filter.parse_args(argv, arg); | |
334 | if (r) { | |
335 | return r; | |
336 | } | |
337 | ||
338 | // Parse output options | |
339 | // ==================== | |
340 | if (arg == argv.end()) { | |
341 | derr << "Missing output command" << dendl; | |
342 | usage(); | |
343 | } | |
344 | std::string output_style = *(arg++); | |
345 | if (output_style != "binary" && output_style != "json" && | |
346 | output_style != "summary" && output_style != "list") { | |
347 | derr << "Unknown argument: '" << output_style << "'" << dendl; | |
348 | usage(); | |
349 | return -EINVAL; | |
350 | } | |
351 | ||
352 | std::string output_path = "dump"; | |
353 | while(arg != argv.end()) { | |
354 | std::string arg_str; | |
355 | if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { | |
356 | output_path = arg_str; | |
357 | } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", | |
358 | nullptr)) { | |
359 | dout(1) << "Using alternate pool " << arg_str << dendl; | |
360 | int r = rados.ioctx_create(arg_str.c_str(), output); | |
361 | assert(r == 0); | |
362 | other_pool = true; | |
363 | } else { | |
364 | derr << "Unknown argument: '" << *arg << "'" << dendl; | |
365 | usage(); | |
366 | return -EINVAL; | |
367 | } | |
368 | } | |
369 | ||
370 | // Execute command | |
371 | // =============== | |
372 | JournalScanner js(input, rank, filter); | |
373 | if (command == "get") { | |
374 | r = js.scan(); | |
375 | if (r) { | |
376 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
377 | return r; | |
378 | } | |
7c673cae FG |
379 | } else if (command == "recover_dentries") { |
380 | r = js.scan(); | |
381 | if (r) { | |
382 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
383 | return r; | |
384 | } | |
385 | ||
386 | bool dry_run = false; | |
387 | if (arg != argv.end() && ceph_argparse_flag(argv, arg, "--dry_run", (char*)NULL)) { | |
388 | dry_run = true; | |
389 | } | |
390 | ||
391 | /** | |
392 | * Iterate over log entries, attempting to scavenge from each one | |
393 | */ | |
394 | std::set<inodeno_t> consumed_inos; | |
395 | for (JournalScanner::EventMap::iterator i = js.events.begin(); | |
396 | i != js.events.end(); ++i) { | |
397 | LogEvent *le = i->second.log_event; | |
398 | EMetaBlob const *mb = le->get_metablob(); | |
399 | if (mb) { | |
31f18b77 | 400 | int scav_r = recover_dentries(*mb, dry_run, &consumed_inos); |
7c673cae FG |
401 | if (scav_r) { |
402 | dout(1) << "Error processing event 0x" << std::hex << i->first << std::dec | |
403 | << ": " << cpp_strerror(scav_r) << ", continuing..." << dendl; | |
404 | if (r == 0) { | |
405 | r = scav_r; | |
406 | } | |
407 | // Our goal is to read all we can, so don't stop on errors, but | |
408 | // do record them for possible later output | |
409 | js.errors.insert(std::make_pair(i->first, | |
410 | JournalScanner::EventError(scav_r, cpp_strerror(r)))); | |
411 | } | |
412 | } | |
413 | } | |
414 | ||
415 | /** | |
416 | * Update InoTable to reflect any inode numbers consumed during scavenge | |
417 | */ | |
418 | dout(4) << "consumed " << consumed_inos.size() << " inodes" << dendl; | |
419 | if (consumed_inos.size() && !dry_run) { | |
420 | int consume_r = consume_inos(consumed_inos); | |
421 | if (consume_r) { | |
422 | dout(1) << "Error updating InoTable for " << consumed_inos.size() | |
423 | << " consume inos: " << cpp_strerror(consume_r) << dendl; | |
424 | if (r == 0) { | |
425 | r = consume_r; | |
426 | } | |
427 | } | |
428 | } | |
429 | ||
430 | // Remove consumed dentries from lost+found. | |
431 | if (other_pool && !dry_run) { | |
432 | std::set<std::string> found; | |
433 | ||
434 | for (auto i : consumed_inos) { | |
435 | char s[20]; | |
436 | ||
437 | snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); | |
438 | dout(20) << "removing " << s << dendl; | |
439 | found.insert(std::string(s)); | |
440 | } | |
441 | ||
442 | object_t frag_oid; | |
443 | frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, | |
444 | frag_t(), ""); | |
445 | output.omap_rm_keys(frag_oid.name, found); | |
446 | } | |
447 | } else if (command == "splice") { | |
448 | r = js.scan(); | |
449 | if (r) { | |
450 | derr << "Failed to scan journal (" << cpp_strerror(r) << ")" << dendl; | |
451 | return r; | |
452 | } | |
453 | ||
454 | uint64_t start, end; | |
455 | if (filter.get_range(start, end)) { | |
456 | // Special case for range filter: erase a numeric range in the log | |
457 | uint64_t range = end - start; | |
458 | int r = erase_region(js, start, range); | |
459 | if (r) { | |
460 | derr << "Failed to erase region 0x" << std::hex << start << "~0x" << range << std::dec | |
461 | << ": " << cpp_strerror(r) << dendl; | |
462 | return r; | |
463 | } | |
464 | } else { | |
465 | // General case: erase a collection of individual entries in the log | |
466 | for (JournalScanner::EventMap::iterator i = js.events.begin(); i != js.events.end(); ++i) { | |
467 | dout(4) << "Erasing offset 0x" << std::hex << i->first << std::dec << dendl; | |
468 | ||
469 | int r = erase_region(js, i->first, i->second.raw_size); | |
470 | if (r) { | |
471 | derr << "Failed to erase event 0x" << std::hex << i->first << std::dec | |
472 | << ": " << cpp_strerror(r) << dendl; | |
473 | return r; | |
474 | } | |
475 | } | |
476 | } | |
477 | ||
478 | ||
479 | } else { | |
480 | derr << "Unknown argument '" << command << "'" << dendl; | |
481 | usage(); | |
482 | return -EINVAL; | |
483 | } | |
484 | ||
485 | // Generate output | |
486 | // =============== | |
487 | EventOutput output(js, output_path); | |
488 | int output_result = 0; | |
489 | if (output_style == "binary") { | |
490 | output_result = output.binary(); | |
491 | } else if (output_style == "json") { | |
492 | output_result = output.json(); | |
493 | } else if (output_style == "summary") { | |
494 | output.summary(); | |
495 | } else if (output_style == "list") { | |
496 | output.list(); | |
497 | } else { | |
498 | std::cerr << "Bad output command '" << output_style << "'" << std::endl; | |
499 | return -EINVAL; | |
500 | } | |
501 | ||
502 | if (output_result != 0) { | |
503 | std::cerr << "Error writing output: " << cpp_strerror(output_result) << std::endl; | |
504 | } | |
505 | ||
506 | return output_result; | |
507 | } | |
508 | ||
509 | /** | |
510 | * Provide the user with information about the condition of the journal, | |
511 | * especially indicating what range of log events is available and where | |
512 | * any gaps or corruptions in the journal are. | |
513 | */ | |
514 | int JournalTool::journal_inspect() | |
515 | { | |
516 | int r; | |
517 | ||
518 | JournalFilter filter; | |
519 | JournalScanner js(input, rank, filter); | |
520 | r = js.scan(); | |
521 | if (r) { | |
522 | std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; | |
523 | return r; | |
524 | } | |
525 | ||
526 | js.report(std::cout); | |
527 | ||
528 | return 0; | |
529 | } | |
530 | ||
531 | ||
532 | /** | |
533 | * Attempt to export a binary dump of the journal. | |
534 | * | |
535 | * This is allowed to fail if the header is malformed or there are | |
536 | * objects inaccessible, in which case the user would have to fall | |
537 | * back to manually listing RADOS objects and extracting them, which | |
538 | * they can do with the ``rados`` CLI. | |
539 | */ | |
540 | int JournalTool::journal_export(std::string const &path, bool import) | |
541 | { | |
542 | int r = 0; | |
543 | JournalScanner js(input, rank); | |
544 | ||
545 | if (!import) { | |
546 | /* | |
547 | * If doing an export, first check that the header is valid and | |
548 | * no objects are missing before trying to dump | |
549 | */ | |
550 | r = js.scan(); | |
551 | if (r < 0) { | |
552 | derr << "Unable to scan journal, assuming badly damaged" << dendl; | |
553 | return r; | |
554 | } | |
555 | if (!js.is_readable()) { | |
556 | derr << "Journal not readable, attempt object-by-object dump with `rados`" << dendl; | |
557 | return -EIO; | |
558 | } | |
559 | } | |
560 | ||
561 | /* | |
562 | * Assuming we can cleanly read the journal data, dump it out to a file | |
563 | */ | |
564 | { | |
565 | Dumper dumper; | |
566 | r = dumper.init(mds_role_t(role_selector.get_ns(), rank)); | |
567 | if (r < 0) { | |
568 | derr << "dumper::init failed: " << cpp_strerror(r) << dendl; | |
569 | return r; | |
570 | } | |
571 | if (import) { | |
572 | r = dumper.undump(path.c_str()); | |
573 | } else { | |
574 | r = dumper.dump(path.c_str()); | |
575 | } | |
7c673cae FG |
576 | } |
577 | ||
578 | return r; | |
579 | } | |
580 | ||
581 | ||
582 | /** | |
583 | * Truncate journal and insert EResetJournal | |
584 | */ | |
585 | int JournalTool::journal_reset(bool hard) | |
586 | { | |
587 | int r = 0; | |
588 | Resetter resetter; | |
589 | r = resetter.init(); | |
590 | if (r < 0) { | |
591 | derr << "resetter::init failed: " << cpp_strerror(r) << dendl; | |
592 | return r; | |
593 | } | |
594 | ||
595 | if (hard) { | |
596 | r = resetter.reset_hard(mds_role_t(role_selector.get_ns(), rank)); | |
597 | } else { | |
598 | r = resetter.reset(mds_role_t(role_selector.get_ns(), rank)); | |
599 | } | |
7c673cae FG |
600 | |
601 | return r; | |
602 | } | |
603 | ||
604 | ||
605 | /** | |
606 | * Selective offline replay which only reads out dentries and writes | |
607 | * them to the backing store iff their version is > what is currently | |
608 | * in the backing store. | |
609 | * | |
610 | * In order to write dentries to the backing store, we may create the | |
611 | * required enclosing dirfrag objects. | |
612 | * | |
613 | * Test this by running scavenge on an unflushed journal, then nuking | |
614 | * it offline, then starting an MDS and seeing that the dentries are | |
615 | * visible. | |
616 | * | |
617 | * @param metablob an EMetaBlob retrieved from the journal | |
618 | * @param dry_run if true, do no writes to RADOS | |
619 | * @param consumed_inos output, populated with any inos inserted | |
620 | * @returns 0 on success, else negative error code | |
621 | */ | |
31f18b77 | 622 | int JournalTool::recover_dentries( |
7c673cae FG |
623 | EMetaBlob const &metablob, |
624 | bool const dry_run, | |
625 | std::set<inodeno_t> *consumed_inos) | |
626 | { | |
627 | assert(consumed_inos != NULL); | |
628 | ||
629 | int r = 0; | |
630 | ||
631 | // Replay fullbits (dentry+inode) | |
632 | for (list<dirfrag_t>::const_iterator lp = metablob.lump_order.begin(); | |
633 | lp != metablob.lump_order.end(); ++lp) | |
634 | { | |
635 | dirfrag_t const &frag = *lp; | |
636 | EMetaBlob::dirlump const &lump = metablob.lump_map.find(frag)->second; | |
637 | lump._decode_bits(); | |
638 | object_t frag_oid = InodeStore::get_object_name(frag.ino, frag.frag, ""); | |
639 | ||
640 | dout(4) << "inspecting lump " << frag_oid.name << dendl; | |
641 | ||
642 | ||
643 | // We will record old fnode version for use in hard link handling | |
644 | // If we don't read an old fnode, take version as zero and write in | |
645 | // all hardlinks we find. | |
646 | version_t old_fnode_version = 0; | |
647 | ||
648 | // Update fnode in omap header of dirfrag object | |
649 | bool write_fnode = false; | |
650 | bufferlist old_fnode_bl; | |
651 | r = input.omap_get_header(frag_oid.name, &old_fnode_bl); | |
652 | if (r == -ENOENT) { | |
653 | // Creating dirfrag from scratch | |
654 | dout(4) << "failed to read OMAP header from directory fragment " | |
655 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
656 | write_fnode = true; | |
657 | // Note: creating the dirfrag *without* a backtrace, relying on | |
658 | // MDS to regenerate backtraces on read or in FSCK | |
659 | } else if (r == 0) { | |
660 | // Conditionally update existing omap header | |
661 | fnode_t old_fnode; | |
662 | bufferlist::iterator old_fnode_iter = old_fnode_bl.begin(); | |
663 | try { | |
664 | old_fnode.decode(old_fnode_iter); | |
665 | dout(4) << "frag " << frag_oid.name << " fnode old v" << | |
666 | old_fnode.version << " vs new v" << lump.fnode.version << dendl; | |
667 | old_fnode_version = old_fnode.version; | |
668 | write_fnode = old_fnode_version < lump.fnode.version; | |
669 | } catch (const buffer::error &err) { | |
670 | dout(1) << "frag " << frag_oid.name | |
671 | << " is corrupt, overwriting" << dendl; | |
672 | write_fnode = true; | |
673 | } | |
674 | } else { | |
675 | // Unexpected error | |
676 | dout(4) << "failed to read OMAP header from directory fragment " | |
677 | << frag_oid.name << " " << cpp_strerror(r) << dendl; | |
678 | return r; | |
679 | } | |
680 | ||
681 | if ((other_pool || write_fnode) && !dry_run) { | |
682 | dout(4) << "writing fnode to omap header" << dendl; | |
683 | bufferlist fnode_bl; | |
684 | lump.fnode.encode(fnode_bl); | |
685 | if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { | |
686 | r = output.omap_set_header(frag_oid.name, fnode_bl); | |
687 | } | |
688 | if (r != 0) { | |
689 | derr << "Failed to write fnode for frag object " | |
690 | << frag_oid.name << dendl; | |
691 | return r; | |
692 | } | |
693 | } | |
694 | ||
695 | std::set<std::string> read_keys; | |
696 | ||
697 | // Compose list of potentially-existing dentries we would like to fetch | |
698 | list<ceph::shared_ptr<EMetaBlob::fullbit> > const &fb_list = | |
699 | lump.get_dfull(); | |
700 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi = | |
701 | fb_list.begin(); fbi != fb_list.end(); ++fbi) { | |
702 | EMetaBlob::fullbit const &fb = *(*fbi); | |
703 | ||
704 | // Get a key like "foobar_head" | |
705 | std::string key; | |
706 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
707 | dn_key.encode(key); | |
708 | read_keys.insert(key); | |
709 | } | |
710 | ||
711 | list<EMetaBlob::remotebit> const &rb_list = | |
712 | lump.get_dremote(); | |
713 | for (list<EMetaBlob::remotebit>::const_iterator rbi = | |
714 | rb_list.begin(); rbi != rb_list.end(); ++rbi) { | |
715 | EMetaBlob::remotebit const &rb = *rbi; | |
716 | ||
717 | // Get a key like "foobar_head" | |
718 | std::string key; | |
719 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
720 | dn_key.encode(key); | |
721 | read_keys.insert(key); | |
722 | } | |
723 | ||
c07f9fc5 FG |
724 | list<EMetaBlob::nullbit> const &nb_list = lump.get_dnull(); |
725 | for (auto& nb : nb_list) { | |
726 | // Get a key like "foobar_head" | |
727 | std::string key; | |
728 | dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); | |
729 | dn_key.encode(key); | |
730 | read_keys.insert(key); | |
731 | } | |
732 | ||
7c673cae FG |
733 | // Perform bulk read of existing dentries |
734 | std::map<std::string, bufferlist> read_vals; | |
735 | r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
736 | if (r == -ENOENT && other_pool) { | |
737 | r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); | |
738 | } | |
739 | if (r != 0) { | |
740 | derr << "unexpected error reading fragment object " | |
741 | << frag_oid.name << ": " << cpp_strerror(r) << dendl; | |
742 | return r; | |
743 | } | |
744 | ||
745 | // Compose list of dentries we will write back | |
746 | std::map<std::string, bufferlist> write_vals; | |
747 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator fbi = | |
748 | fb_list.begin(); fbi != fb_list.end(); ++fbi) { | |
749 | EMetaBlob::fullbit const &fb = *(*fbi); | |
750 | ||
751 | // Get a key like "foobar_head" | |
752 | std::string key; | |
753 | dentry_key_t dn_key(fb.dnlast, fb.dn.c_str()); | |
754 | dn_key.encode(key); | |
755 | ||
756 | dout(4) << "inspecting fullbit " << frag_oid.name << "/" << fb.dn | |
757 | << dendl; | |
758 | bool write_dentry = false; | |
759 | if (read_vals.find(key) == read_vals.end()) { | |
760 | dout(4) << "dentry did not already exist, will create" << dendl; | |
761 | write_dentry = true; | |
762 | } else { | |
763 | dout(4) << "dentry " << key << " existed already" << dendl; | |
764 | dout(4) << "dentry exists, checking versions..." << dendl; | |
765 | bufferlist &old_dentry = read_vals[key]; | |
766 | // Decode dentry+inode | |
767 | bufferlist::iterator q = old_dentry.begin(); | |
768 | ||
769 | snapid_t dnfirst; | |
770 | ::decode(dnfirst, q); | |
771 | char dentry_type; | |
772 | ::decode(dentry_type, q); | |
773 | ||
774 | if (dentry_type == 'L') { | |
775 | // leave write_dentry false, we have no version to | |
776 | // compare with in a hardlink, so it's not safe to | |
777 | // squash over it with what's in this fullbit | |
778 | dout(10) << "Existing remote inode in slot to be (maybe) written " | |
779 | << "by a full inode from the journal dn '" << fb.dn.c_str() | |
780 | << "' with lump fnode version " << lump.fnode.version | |
781 | << "vs existing fnode version " << old_fnode_version << dendl; | |
782 | write_dentry = old_fnode_version < lump.fnode.version; | |
783 | } else if (dentry_type == 'I') { | |
784 | // Read out inode version to compare with backing store | |
785 | InodeStore inode; | |
786 | inode.decode_bare(q); | |
787 | dout(4) << "decoded embedded inode version " | |
788 | << inode.inode.version << " vs fullbit version " | |
789 | << fb.inode.version << dendl; | |
790 | if (inode.inode.version < fb.inode.version) { | |
791 | write_dentry = true; | |
792 | } | |
793 | } else { | |
794 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
795 | "journal" << dendl; | |
796 | write_dentry = true; | |
797 | } | |
798 | } | |
799 | ||
800 | if ((other_pool || write_dentry) && !dry_run) { | |
801 | dout(4) << "writing I dentry " << key << " into frag " | |
802 | << frag_oid.name << dendl; | |
803 | ||
804 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
805 | bufferlist dentry_bl; | |
806 | ::encode(fb.dnfirst, dentry_bl); | |
807 | ::encode('I', dentry_bl); | |
808 | encode_fullbit_as_inode(fb, true, &dentry_bl); | |
809 | ||
810 | // Record for writing to RADOS | |
811 | write_vals[key] = dentry_bl; | |
812 | consumed_inos->insert(fb.inode.ino); | |
813 | } | |
814 | } | |
815 | ||
816 | for (list<EMetaBlob::remotebit>::const_iterator rbi = | |
817 | rb_list.begin(); rbi != rb_list.end(); ++rbi) { | |
818 | EMetaBlob::remotebit const &rb = *rbi; | |
819 | ||
820 | // Get a key like "foobar_head" | |
821 | std::string key; | |
822 | dentry_key_t dn_key(rb.dnlast, rb.dn.c_str()); | |
823 | dn_key.encode(key); | |
824 | ||
825 | dout(4) << "inspecting remotebit " << frag_oid.name << "/" << rb.dn | |
826 | << dendl; | |
827 | bool write_dentry = false; | |
828 | if (read_vals.find(key) == read_vals.end()) { | |
829 | dout(4) << "dentry did not already exist, will create" << dendl; | |
830 | write_dentry = true; | |
831 | } else { | |
832 | dout(4) << "dentry " << key << " existed already" << dendl; | |
833 | dout(4) << "dentry exists, checking versions..." << dendl; | |
834 | bufferlist &old_dentry = read_vals[key]; | |
835 | // Decode dentry+inode | |
836 | bufferlist::iterator q = old_dentry.begin(); | |
837 | ||
838 | snapid_t dnfirst; | |
839 | ::decode(dnfirst, q); | |
840 | char dentry_type; | |
841 | ::decode(dentry_type, q); | |
842 | ||
843 | if (dentry_type == 'L') { | |
844 | dout(10) << "Existing hardlink inode in slot to be (maybe) written " | |
845 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
846 | << "' with lump fnode version " << lump.fnode.version | |
847 | << "vs existing fnode version " << old_fnode_version << dendl; | |
848 | write_dentry = old_fnode_version < lump.fnode.version; | |
849 | } else if (dentry_type == 'I') { | |
850 | dout(10) << "Existing full inode in slot to be (maybe) written " | |
851 | << "by a remote inode from the journal dn '" << rb.dn.c_str() | |
852 | << "' with lump fnode version " << lump.fnode.version | |
853 | << "vs existing fnode version " << old_fnode_version << dendl; | |
854 | write_dentry = old_fnode_version < lump.fnode.version; | |
855 | } else { | |
856 | dout(4) << "corrupt dentry in backing store, overwriting from " | |
857 | "journal" << dendl; | |
858 | write_dentry = true; | |
859 | } | |
860 | } | |
861 | ||
862 | if ((other_pool || write_dentry) && !dry_run) { | |
863 | dout(4) << "writing L dentry " << key << " into frag " | |
864 | << frag_oid.name << dendl; | |
865 | ||
866 | // Compose: Dentry format is dnfirst, [I|L], InodeStore(bare=true) | |
867 | bufferlist dentry_bl; | |
868 | ::encode(rb.dnfirst, dentry_bl); | |
869 | ::encode('L', dentry_bl); | |
870 | ::encode(rb.ino, dentry_bl); | |
871 | ::encode(rb.d_type, dentry_bl); | |
872 | ||
873 | // Record for writing to RADOS | |
874 | write_vals[key] = dentry_bl; | |
875 | consumed_inos->insert(rb.ino); | |
876 | } | |
877 | } | |
878 | ||
c07f9fc5 FG |
879 | std::set<std::string> null_vals; |
880 | for (auto& nb : nb_list) { | |
881 | std::string key; | |
882 | dentry_key_t dn_key(nb.dnlast, nb.dn.c_str()); | |
883 | dn_key.encode(key); | |
884 | ||
885 | dout(4) << "inspecting nullbit " << frag_oid.name << "/" << nb.dn | |
886 | << dendl; | |
887 | ||
888 | auto it = read_vals.find(key); | |
889 | if (it != read_vals.end()) { | |
890 | dout(4) << "dentry exists, will remove" << dendl; | |
891 | ||
892 | bufferlist::iterator q = it->second.begin(); | |
893 | snapid_t dnfirst; | |
894 | ::decode(dnfirst, q); | |
895 | char dentry_type; | |
896 | ::decode(dentry_type, q); | |
897 | ||
898 | bool remove_dentry = false; | |
899 | if (dentry_type == 'L') { | |
900 | dout(10) << "Existing hardlink inode in slot to be (maybe) removed " | |
901 | << "by null journal dn '" << nb.dn.c_str() | |
902 | << "' with lump fnode version " << lump.fnode.version | |
903 | << "vs existing fnode version " << old_fnode_version << dendl; | |
904 | remove_dentry = old_fnode_version < lump.fnode.version; | |
905 | } else if (dentry_type == 'I') { | |
906 | dout(10) << "Existing full inode in slot to be (maybe) removed " | |
907 | << "by null journal dn '" << nb.dn.c_str() | |
908 | << "' with lump fnode version " << lump.fnode.version | |
909 | << "vs existing fnode version " << old_fnode_version << dendl; | |
910 | remove_dentry = old_fnode_version < lump.fnode.version; | |
911 | } else { | |
912 | dout(4) << "corrupt dentry in backing store, will remove" << dendl; | |
913 | remove_dentry = true; | |
914 | } | |
915 | ||
916 | if (remove_dentry) | |
917 | null_vals.insert(key); | |
918 | } | |
919 | } | |
920 | ||
7c673cae FG |
921 | // Write back any new/changed dentries |
922 | if (!write_vals.empty()) { | |
923 | r = output.omap_set(frag_oid.name, write_vals); | |
924 | if (r != 0) { | |
925 | derr << "error writing dentries to " << frag_oid.name | |
926 | << ": " << cpp_strerror(r) << dendl; | |
927 | return r; | |
928 | } | |
929 | } | |
c07f9fc5 FG |
930 | |
931 | // remove any null dentries | |
932 | if (!null_vals.empty()) { | |
933 | r = output.omap_rm_keys(frag_oid.name, null_vals); | |
934 | if (r != 0) { | |
935 | derr << "error removing dentries from " << frag_oid.name | |
936 | << ": " << cpp_strerror(r) << dendl; | |
937 | return r; | |
938 | } | |
939 | } | |
7c673cae FG |
940 | } |
941 | ||
942 | /* Now that we've looked at the dirlumps, we finally pay attention to | |
943 | * the roots (i.e. inodes without ancestry). This is necessary in order | |
944 | * to pick up dirstat updates on ROOT_INO. dirstat updates are functionally | |
945 | * important because clients use them to infer completeness | |
946 | * of directories | |
947 | */ | |
948 | for (list<ceph::shared_ptr<EMetaBlob::fullbit> >::const_iterator p = | |
949 | metablob.roots.begin(); p != metablob.roots.end(); ++p) { | |
950 | EMetaBlob::fullbit const &fb = *(*p); | |
951 | inodeno_t ino = fb.inode.ino; | |
952 | dout(4) << "updating root 0x" << std::hex << ino << std::dec << dendl; | |
953 | ||
954 | object_t root_oid = InodeStore::get_object_name(ino, frag_t(), ".inode"); | |
955 | dout(4) << "object id " << root_oid.name << dendl; | |
956 | ||
957 | bool write_root_ino = false; | |
958 | bufferlist old_root_ino_bl; | |
959 | r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); | |
960 | if (r == -ENOENT) { | |
961 | dout(4) << "root does not exist, will create" << dendl; | |
962 | write_root_ino = true; | |
963 | } else if (r >= 0) { | |
964 | r = 0; | |
965 | InodeStore old_inode; | |
966 | dout(4) << "root exists, will modify (" << old_root_ino_bl.length() | |
967 | << ")" << dendl; | |
968 | bufferlist::iterator inode_bl_iter = old_root_ino_bl.begin(); | |
969 | std::string magic; | |
970 | ::decode(magic, inode_bl_iter); | |
971 | if (magic == CEPH_FS_ONDISK_MAGIC) { | |
972 | dout(4) << "magic ok" << dendl; | |
973 | old_inode.decode(inode_bl_iter); | |
974 | ||
975 | if (old_inode.inode.version < fb.inode.version) { | |
976 | write_root_ino = true; | |
977 | } | |
978 | } else { | |
979 | dout(4) << "magic bad: '" << magic << "'" << dendl; | |
980 | write_root_ino = true; | |
981 | } | |
982 | } else { | |
983 | derr << "error reading root inode object " << root_oid.name | |
984 | << ": " << cpp_strerror(r) << dendl; | |
985 | return r; | |
986 | } | |
987 | ||
988 | if (write_root_ino && !dry_run) { | |
989 | dout(4) << "writing root ino " << root_oid.name | |
990 | << " version " << fb.inode.version << dendl; | |
991 | ||
992 | // Compose: root ino format is magic,InodeStore(bare=false) | |
993 | bufferlist new_root_ino_bl; | |
994 | ::encode(std::string(CEPH_FS_ONDISK_MAGIC), new_root_ino_bl); | |
995 | encode_fullbit_as_inode(fb, false, &new_root_ino_bl); | |
996 | ||
997 | // Write to RADOS | |
998 | r = output.write_full(root_oid.name, new_root_ino_bl); | |
999 | if (r != 0) { | |
1000 | derr << "error writing inode object " << root_oid.name | |
1001 | << ": " << cpp_strerror(r) << dendl; | |
1002 | return r; | |
1003 | } | |
1004 | } | |
1005 | } | |
1006 | ||
1007 | return r; | |
1008 | } | |
1009 | ||
1010 | ||
7c673cae FG |
1011 | /** |
1012 | * Erase a region of the log by overwriting it with ENoOp | |
1013 | * | |
1014 | */ | |
1015 | int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint64_t const length) | |
1016 | { | |
1017 | // To erase this region, we use our preamble, the encoding overhead | |
1018 | // of an ENoOp, and our trailing start ptr. Calculate how much padding | |
1019 | // is needed inside the ENoOp to make up the difference. | |
1020 | bufferlist tmp; | |
1021 | ENoOp enoop(0); | |
1022 | enoop.encode_with_header(tmp, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1023 | ||
1024 | dout(4) << "erase_region " << pos << " len=" << length << dendl; | |
1025 | ||
1026 | // FIXME: get the preamble/postamble length via JournalStream | |
1027 | int32_t padding = length - tmp.length() - sizeof(uint32_t) - sizeof(uint64_t) - sizeof(uint64_t); | |
1028 | dout(4) << "erase_region padding=0x" << std::hex << padding << std::dec << dendl; | |
1029 | ||
1030 | if (padding < 0) { | |
1031 | derr << "Erase region " << length << " too short" << dendl; | |
1032 | return -EINVAL; | |
1033 | } | |
1034 | ||
1035 | // Serialize an ENoOp with the correct amount of padding | |
1036 | enoop = ENoOp(padding); | |
1037 | bufferlist entry; | |
1038 | enoop.encode_with_header(entry, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1039 | JournalStream stream(JOURNAL_FORMAT_RESILIENT); | |
1040 | ||
1041 | // Serialize region of log stream | |
1042 | bufferlist log_data; | |
1043 | stream.write(entry, &log_data, pos); | |
1044 | ||
1045 | dout(4) << "erase_region data length " << log_data.length() << dendl; | |
1046 | assert(log_data.length() == length); | |
1047 | ||
1048 | // Write log stream region to RADOS | |
1049 | // FIXME: get object size somewhere common to scan_events | |
1050 | uint32_t object_size = g_conf->mds_log_segment_size; | |
1051 | if (object_size == 0) { | |
1052 | // Default layout object size | |
1053 | object_size = file_layout_t::get_default().object_size; | |
1054 | } | |
1055 | ||
1056 | uint64_t write_offset = pos; | |
1057 | uint64_t obj_offset = (pos / object_size); | |
1058 | int r = 0; | |
1059 | while(log_data.length()) { | |
1060 | std::string const oid = js.obj_name(obj_offset); | |
1061 | uint32_t offset_in_obj = write_offset % object_size; | |
1062 | uint32_t write_len = min(log_data.length(), object_size - offset_in_obj); | |
1063 | ||
1064 | r = output.write(oid, log_data, write_len, offset_in_obj); | |
1065 | if (r < 0) { | |
1066 | return r; | |
1067 | } else { | |
1068 | dout(4) << "Wrote " << write_len << " bytes to " << oid << dendl; | |
1069 | r = 0; | |
1070 | } | |
1071 | ||
1072 | log_data.splice(0, write_len); | |
1073 | write_offset += write_len; | |
1074 | obj_offset++; | |
1075 | } | |
1076 | ||
1077 | return r; | |
1078 | } | |
1079 | ||
1080 | /** | |
1081 | * Given an EMetaBlob::fullbit containing an inode, write out | |
1082 | * the encoded inode in the format used by InodeStore (i.e. the | |
1083 | * backing store format) | |
1084 | * | |
1085 | * This is a distant cousin of EMetaBlob::fullbit::update_inode, but for use | |
1086 | * on an offline InodeStore instance. It's way simpler, because we are just | |
1087 | * uncritically hauling the data between structs. | |
1088 | * | |
1089 | * @param fb a fullbit extracted from a journal entry | |
1090 | * @param bare if true, leave out [EN|DE]CODE_START decoration | |
1091 | * @param out_bl output, write serialized inode to this bufferlist | |
1092 | */ | |
1093 | void JournalTool::encode_fullbit_as_inode( | |
1094 | const EMetaBlob::fullbit &fb, | |
1095 | const bool bare, | |
1096 | bufferlist *out_bl) | |
1097 | { | |
1098 | assert(out_bl != NULL); | |
1099 | ||
1100 | // Compose InodeStore | |
1101 | InodeStore new_inode; | |
1102 | new_inode.inode = fb.inode; | |
1103 | new_inode.xattrs = fb.xattrs; | |
1104 | new_inode.dirfragtree = fb.dirfragtree; | |
1105 | new_inode.snap_blob = fb.snapbl; | |
94b18763 | 1106 | new_inode.symlink = mempool::mds_co::string(boost::string_view(fb.symlink)); |
7c673cae FG |
1107 | new_inode.old_inodes = fb.old_inodes; |
1108 | ||
1109 | // Serialize InodeStore | |
1110 | if (bare) { | |
1111 | new_inode.encode_bare(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1112 | } else { | |
1113 | new_inode.encode(*out_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); | |
1114 | } | |
1115 | } | |
1116 | ||
1117 | /** | |
1118 | * Given a list of inode numbers known to be in use by | |
1119 | * inodes in the backing store, ensure that none of these | |
1120 | * numbers are listed as free in the InoTables in the | |
1121 | * backing store. | |
1122 | * | |
1123 | * Used after injecting inodes into the backing store, to | |
1124 | * ensure that the same inode numbers are not subsequently | |
1125 | * used for new files during ordinary operation. | |
1126 | * | |
1127 | * @param inos list of inode numbers to be removed from | |
1128 | * free lists in InoTables | |
1129 | * @returns 0 on success, else negative error code | |
1130 | */ | |
1131 | int JournalTool::consume_inos(const std::set<inodeno_t> &inos) | |
1132 | { | |
1133 | int r = 0; | |
1134 | ||
1135 | // InoTable is a per-MDS structure, so iterate over assigned ranks | |
1136 | auto fs = fsmap->get_filesystem(role_selector.get_ns()); | |
1137 | std::set<mds_rank_t> in_ranks; | |
1138 | fs->mds_map.get_mds_set(in_ranks); | |
1139 | ||
1140 | for (std::set<mds_rank_t>::iterator rank_i = in_ranks.begin(); | |
1141 | rank_i != in_ranks.end(); ++rank_i) | |
1142 | { | |
1143 | // Compose object name | |
1144 | std::ostringstream oss; | |
1145 | oss << "mds" << *rank_i << "_inotable"; | |
1146 | object_t inotable_oid = object_t(oss.str()); | |
1147 | ||
1148 | // Read object | |
1149 | bufferlist inotable_bl; | |
1150 | int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); | |
1151 | if (read_r < 0) { | |
1152 | // Things are really bad if we can't read inotable. Beyond our powers. | |
1153 | derr << "unable to read inotable '" << inotable_oid.name << "': " | |
1154 | << cpp_strerror(read_r) << dendl; | |
1155 | r = r ? r : read_r; | |
1156 | continue; | |
1157 | } | |
1158 | ||
1159 | // Deserialize InoTable | |
1160 | version_t inotable_ver; | |
1161 | bufferlist::iterator q = inotable_bl.begin(); | |
1162 | ::decode(inotable_ver, q); | |
1163 | InoTable ino_table(NULL); | |
1164 | ino_table.decode(q); | |
1165 | ||
1166 | // Update InoTable in memory | |
1167 | bool inotable_modified = false; | |
1168 | for (std::set<inodeno_t>::iterator i = inos.begin(); | |
1169 | i != inos.end(); ++i) | |
1170 | { | |
1171 | const inodeno_t ino = *i; | |
1172 | if (ino_table.force_consume(ino)) { | |
1173 | dout(4) << "Used ino 0x" << std::hex << ino << std::dec | |
1174 | << " requires inotable update" << dendl; | |
1175 | inotable_modified = true; | |
1176 | } | |
1177 | } | |
1178 | ||
1179 | // Serialize and write InoTable | |
1180 | if (inotable_modified) { | |
1181 | inotable_ver += 1; | |
1182 | dout(4) << "writing modified inotable version " << inotable_ver << dendl; | |
1183 | bufferlist inotable_new_bl; | |
1184 | ::encode(inotable_ver, inotable_new_bl); | |
1185 | ino_table.encode_state(inotable_new_bl); | |
1186 | int write_r = output.write_full(inotable_oid.name, inotable_new_bl); | |
1187 | if (write_r != 0) { | |
1188 | derr << "error writing modified inotable " << inotable_oid.name | |
1189 | << ": " << cpp_strerror(write_r) << dendl; | |
1190 | r = r ? r : read_r; | |
1191 | continue; | |
1192 | } | |
1193 | } | |
1194 | } | |
1195 | ||
1196 | return r; | |
1197 | } | |
1198 |