]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/cephfs/DataScan.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / tools / cephfs / DataScan.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "common/errno.h"
16#include "common/ceph_argparse.h"
17#include <fstream>
18#include "include/util.h"
19
20#include "mds/CInode.h"
21#include "cls/cephfs/cls_cephfs_client.h"
22
23#include "PgFiles.h"
24#include "DataScan.h"
25#include "include/compat.h"
26
27#define dout_context g_ceph_context
28#define dout_subsys ceph_subsys_mds
29#undef dout_prefix
30#define dout_prefix *_dout << "datascan." << __func__ << ": "
31
32void DataScan::usage()
33{
34 std::cout << "Usage: \n"
35 << " cephfs-data-scan init [--force-init]\n"
36 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
37 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
38 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
39 << " cephfs-data-scan scan_links\n"
40 << "\n"
41 << " --force-corrupt: overrite apparently corrupt structures\n"
42 << " --force-init: write root inodes even if they exist\n"
43 << " --force-pool: use data pool even if it is not in FSMap\n"
44 << " --worker_m: Maximum number of workers\n"
45 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
46 << "\n"
47 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
48 << " cephfs-data-scan cleanup <data pool name>\n"
49 << std::endl;
50
51 generic_client_usage();
52}
53
54bool DataScan::parse_kwarg(
55 const std::vector<const char*> &args,
56 std::vector<const char *>::const_iterator &i,
57 int *r)
58{
59 if (i + 1 == args.end()) {
60 return false;
61 }
62
63 const std::string arg(*i);
64 const std::string val(*(i + 1));
65
66 if (arg == std::string("--output-dir")) {
67 if (driver != NULL) {
68 derr << "Unexpected --output-dir: output already selected!" << dendl;
69 *r = -EINVAL;
70 return false;
71 }
72 dout(4) << "Using local file output to '" << val << "'" << dendl;
73 driver = new LocalFileDriver(val, data_io);
74 return true;
75 } else if (arg == std::string("--worker_n")) {
76 std::string err;
77 n = strict_strtoll(val.c_str(), 10, &err);
78 if (!err.empty()) {
79 std::cerr << "Invalid worker number '" << val << "'" << std::endl;
80 *r = -EINVAL;
81 return false;
82 }
83 return true;
84 } else if (arg == std::string("--worker_m")) {
85 std::string err;
86 m = strict_strtoll(val.c_str(), 10, &err);
87 if (!err.empty()) {
88 std::cerr << "Invalid worker count '" << val << "'" << std::endl;
89 *r = -EINVAL;
90 return false;
91 }
92 return true;
93 } else if (arg == std::string("--filter-tag")) {
94 filter_tag = val;
95 dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
96 return true;
97 } else if (arg == std::string("--filesystem")) {
98 std::shared_ptr<const Filesystem> fs;
99 *r = fsmap->parse_filesystem(val, &fs);
100 if (*r != 0) {
101 std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
102 return false;
103 }
104 fscid = fs->fscid;
105 return true;
106 } else if (arg == std::string("--alternate-pool")) {
107 metadata_pool_name = val;
108 return true;
109 } else {
110 return false;
111 }
112}
113
114bool DataScan::parse_arg(
115 const std::vector<const char*> &args,
116 std::vector<const char *>::const_iterator &i)
117{
118 const std::string arg(*i);
119 if (arg == "--force-pool") {
120 force_pool = true;
121 return true;
122 } else if (arg == "--force-corrupt") {
123 force_corrupt = true;
124 return true;
125 } else if (arg == "--force-init") {
126 force_init = true;
127 return true;
128 } else {
129 return false;
130 }
131}
132
133int DataScan::main(const std::vector<const char*> &args)
134{
135 // Parse args
136 // ==========
137 if (args.size() < 1) {
138 usage();
139 return -EINVAL;
140 }
141
142 // Common RADOS init: open metadata pool
143 // =====================================
144 librados::Rados rados;
145 int r = rados.init_with_context(g_ceph_context);
146 if (r < 0) {
147 derr << "RADOS unavailable" << dendl;
148 return r;
149 }
150
151 std::string const &command = args[0];
152 std::string data_pool_name;
153
154 std::string pg_files_path;
155 std::set<pg_t> pg_files_pgs;
156
157 // Consume any known --key val or --flag arguments
158 for (std::vector<const char *>::const_iterator i = args.begin() + 1;
159 i != args.end(); ++i) {
160 if (parse_kwarg(args, i, &r)) {
161 // Skip the kwarg value field
162 ++i;
163 continue;
164 } else if (r) {
165 return r;
166 }
167
168 if (parse_arg(args, i)) {
169 continue;
170 }
171
172 // Trailing positional argument
173 if (i + 1 == args.end() &&
174 (command == "scan_inodes"
175 || command == "scan_extents"
176 || command == "cleanup")) {
177 data_pool_name = *i;
178 continue;
179 }
180
181 if (command == "pg_files") {
182 if (i == args.begin() + 1) {
183 pg_files_path = *i;
184 continue;
185 } else {
186 pg_t pg;
187 bool parsed = pg.parse(*i);
188 if (!parsed) {
189 std::cerr << "Invalid PG '" << *i << "'" << std::endl;
190 return -EINVAL;
191 } else {
192 pg_files_pgs.insert(pg);
193 continue;
194 }
195 }
196
197 }
198
199 // Fall through: unhandled
200 std::cerr << "Unknown argument '" << *i << "'" << std::endl;
201 return -EINVAL;
202 }
203
204 // If caller didn't specify a namespace, try to pick
205 // one if only one exists
206 if (fscid == FS_CLUSTER_ID_NONE) {
207 if (fsmap->filesystem_count() == 1) {
208 fscid = fsmap->get_filesystem()->fscid;
209 } else {
210 std::cerr << "Specify a filesystem with --filesystem" << std::endl;
211 return -EINVAL;
212 }
213 }
214 auto fs = fsmap->get_filesystem(fscid);
215 assert(fs != nullptr);
216
217 // Default to output to metadata pool
218 if (driver == NULL) {
219 driver = new MetadataDriver();
220 driver->set_force_corrupt(force_corrupt);
221 driver->set_force_init(force_init);
222 dout(4) << "Using metadata pool output" << dendl;
223 }
224
225 dout(4) << "connecting to RADOS..." << dendl;
226 r = rados.connect();
227 if (r < 0) {
228 std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
229 << std::endl;
230 return r;
231 }
232
233 r = driver->init(rados, metadata_pool_name, fsmap, fscid);
234 if (r < 0) {
235 return r;
236 }
237
238 if (command == "pg_files") {
239 auto pge = PgFiles(objecter, pg_files_pgs);
240 pge.init();
241 return pge.scan_path(pg_files_path);
242 }
243
244 // Initialize data_io for those commands that need it
245 if (command == "scan_inodes" ||
246 command == "scan_extents" ||
247 command == "cleanup") {
248 if (data_pool_name.empty()) {
249 std::cerr << "Data pool not specified" << std::endl;
250 usage();
251 return -EINVAL;
252 }
253
254 data_pool_id = rados.pool_lookup(data_pool_name.c_str());
255 if (data_pool_id < 0) {
256 std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
257 return -ENOENT;
258 } else {
259 dout(4) << "data pool '" << data_pool_name
260 << "' has ID " << data_pool_id << dendl;
261 }
262
263 if (!fs->mds_map.is_data_pool(data_pool_id)) {
264 std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
265 "CephFS data pool!" << std::endl;
266 if (!force_pool) {
267 std::cerr << "Use --force-pool to continue" << std::endl;
268 return -EINVAL;
269 }
270 }
271
272 dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
273 r = rados.ioctx_create(data_pool_name.c_str(), data_io);
274 if (r != 0) {
275 return r;
276 }
277 }
278
279 // Initialize metadata_io from MDSMap for scan_frags
280 if (command == "scan_frags" || command == "scan_links") {
281 const auto fs = fsmap->get_filesystem(fscid);
282 if (fs == nullptr) {
283 std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
284 return -ENOENT;
285 }
286 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
287
288 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
289 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
290 if (r < 0) {
291 std::cerr << "Pool " << metadata_pool_id
292 << " identified in MDS map not found in RADOS!" << std::endl;
293 return r;
294 }
295
296 r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
297 if (r != 0) {
298 return r;
299 }
300 }
301
302 // Finally, dispatch command
303 if (command == "scan_inodes") {
304 return scan_inodes();
305 } else if (command == "scan_extents") {
306 return scan_extents();
307 } else if (command == "scan_frags") {
308 return scan_frags();
309 } else if (command == "scan_links") {
310 return scan_links();
311 } else if (command == "cleanup") {
312 return cleanup();
313 } else if (command == "init") {
314 return driver->init_roots(fs->mds_map.get_first_data_pool());
315 } else {
316 std::cerr << "Unknown command '" << command << "'" << std::endl;
317 return -EINVAL;
318 }
319}
320
321int MetadataDriver::inject_unlinked_inode(
322 inodeno_t inono, int mode, int64_t data_pool_id)
323{
324 const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
325
326 // Skip if exists
327 bool already_exists = false;
328 int r = root_exists(inono, &already_exists);
329 if (r) {
330 return r;
331 }
332 if (already_exists && !force_init) {
333 std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
334 " exists, skipping create. Use --force-init to overwrite"
335 " the existing object." << std::endl;
336 return 0;
337 }
338
339 // Compose
340 InodeStore inode;
341 inode.inode.ino = inono;
342 inode.inode.version = 1;
343 inode.inode.xattr_version = 1;
344 inode.inode.mode = 0500 | mode;
345 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
346 // (we won't actually give the *correct* dirstat here though)
347 inode.inode.dirstat.nfiles = 1;
348
349 inode.inode.ctime =
350 inode.inode.mtime = ceph_clock_now();
351 inode.inode.nlink = 1;
352 inode.inode.truncate_size = -1ull;
353 inode.inode.truncate_seq = 1;
354 inode.inode.uid = g_conf->mds_root_ino_uid;
355 inode.inode.gid = g_conf->mds_root_ino_gid;
356
357 // Force layout to default: should we let users override this so that
358 // they don't have to mount the filesystem to correct it?
359 inode.inode.layout = file_layout_t::get_default();
360 inode.inode.layout.pool_id = data_pool_id;
361 inode.inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
362
363 // Assume that we will get our stats wrong, and that we may
364 // be ignoring dirfrags that exist
365 inode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
366
367 // Serialize
368 bufferlist inode_bl;
369 ::encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
370 inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
371
372 // Write
373 r = metadata_io.write_full(oid.name, inode_bl);
374 if (r != 0) {
375 derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
376 return r;
377 }
378
379 return r;
380}
381
382int MetadataDriver::root_exists(inodeno_t ino, bool *result)
383{
384 object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
385 uint64_t size;
386 time_t mtime;
387 int r = metadata_io.stat(oid.name, &size, &mtime);
388 if (r == -ENOENT) {
389 *result = false;
390 return 0;
391 } else if (r < 0) {
392 return r;
393 }
394
395 *result = true;
396 return 0;
397}
398
399int MetadataDriver::init_roots(int64_t data_pool_id)
400{
401 int r = 0;
402 r = inject_unlinked_inode(MDS_INO_ROOT, S_IFDIR|0755, data_pool_id);
403 if (r != 0) {
404 return r;
405 }
406 r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
407 if (r != 0) {
408 return r;
409 }
410 bool created = false;
411 r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
412 if (r != 0) {
413 return r;
414 }
415
416 return 0;
417}
418
419int MetadataDriver::check_roots(bool *result)
420{
421 int r;
422 r = root_exists(MDS_INO_ROOT, result);
423 if (r != 0) {
424 return r;
425 }
426 if (!*result) {
427 return 0;
428 }
429
430 r = root_exists(MDS_INO_MDSDIR(0), result);
431 if (r != 0) {
432 return r;
433 }
434 if (!*result) {
435 return 0;
436 }
437
438 return 0;
439}
440
441/**
442 * Stages:
443 *
444 * SERIAL init
445 * 0. Create root inodes if they don't exist
446 * PARALLEL scan_extents
447 * 1. Size and mtime recovery: scan ALL objects, and update 0th
448 * objects with max size and max mtime seen.
449 * PARALLEL scan_inodes
450 * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
451 * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
452 * or rstats at this stage. Inodes without backtraces go into
453 * lost+found
454 * TODO: SERIAL "recover stats"
455 * 3. Dirfrag statistics: depth first traverse into metadata tree,
456 * rebuilding dir sizes.
457 * TODO PARALLEL "clean up"
458 * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
459 * anything onto them) and remove any of the xattrs that we
460 * used for accumulating.
461 */
462
463
464int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
465{
466 if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
467 return -EINVAL;
468 }
469
470 std::string err;
471 std::string inode_str = oid.substr(0, oid.find("."));
472 *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
473 if (!err.empty()) {
474 return -EINVAL;
475 }
476
477 std::string pos_string = oid.substr(oid.find(".") + 1);
478 *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
479 if (!err.empty()) {
480 return -EINVAL;
481 }
482
483 return 0;
484}
485
486
487int DataScan::scan_extents()
488{
489 return forall_objects(data_io, false, [this](
490 std::string const &oid,
491 uint64_t obj_name_ino,
492 uint64_t obj_name_offset) -> int
493 {
494 // Read size
495 uint64_t size;
496 time_t mtime;
497 int r = data_io.stat(oid, &size, &mtime);
498 dout(10) << "handling object " << obj_name_ino
499 << "." << obj_name_offset << dendl;
500 if (r != 0) {
501 dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
502 return r;
503 }
504
505 // I need to keep track of
506 // * The highest object ID seen
507 // * The size of the highest object ID seen
508 // * The largest object seen
509 //
510 // Given those things, I can later infer the object chunking
511 // size, the offset of the last object (chunk size * highest ID seen)
512 // and the actual size (offset of last object + size of highest ID seen)
513 //
514 // This logic doesn't take account of striping.
515 r = ClsCephFSClient::accumulate_inode_metadata(
516 data_io,
517 obj_name_ino,
518 obj_name_offset,
519 size,
520 mtime);
521 if (r < 0) {
522 derr << "Failed to accumulate metadata data from '"
523 << oid << "': " << cpp_strerror(r) << dendl;
524 return r;
525 }
526
527 return r;
528 });
529}
530
531int DataScan::probe_filter(librados::IoCtx &ioctx)
532{
533 bufferlist filter_bl;
534 ClsCephFSClient::build_tag_filter("test", &filter_bl);
535 librados::ObjectCursor range_i;
536 librados::ObjectCursor range_end;
537
538 std::vector<librados::ObjectItem> tmp_result;
539 librados::ObjectCursor tmp_next;
540 int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
541 1, filter_bl, &tmp_result, &tmp_next);
542
543 return r >= 0;
544}
545
546int DataScan::forall_objects(
547 librados::IoCtx &ioctx,
548 bool untagged_only,
549 std::function<int(std::string, uint64_t, uint64_t)> handler
550 )
551{
552 librados::ObjectCursor range_i;
553 librados::ObjectCursor range_end;
554 ioctx.object_list_slice(
555 ioctx.object_list_begin(),
556 ioctx.object_list_end(),
557 n,
558 m,
559 &range_i,
560 &range_end);
561
562
563 bufferlist filter_bl;
564
565 bool legacy_filtering = false;
566 if (untagged_only) {
567 // probe to deal with older OSDs that don't support
568 // the cephfs pgls filtering mode
569 legacy_filtering = !probe_filter(ioctx);
570 if (!legacy_filtering) {
571 ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
572 }
573 }
574
575 int r = 0;
576 while(range_i < range_end) {
577 std::vector<librados::ObjectItem> result;
578 int r = ioctx.object_list(range_i, range_end, 1,
579 filter_bl, &result, &range_i);
580 if (r < 0) {
581 derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
582 return r;
583 }
584
585 for (const auto &i : result) {
586 const std::string &oid = i.oid;
587 uint64_t obj_name_ino = 0;
588 uint64_t obj_name_offset = 0;
589 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
590 if (r != 0) {
591 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
592 continue;
593 }
594
595 if (untagged_only && legacy_filtering) {
596 dout(20) << "Applying filter to " << oid << dendl;
597
598 // We are only interested in 0th objects during this phase: we touched
599 // the other objects during scan_extents
600 if (obj_name_offset != 0) {
601 dout(20) << "Non-zeroth object" << dendl;
602 continue;
603 }
604
605 bufferlist scrub_tag_bl;
606 int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
607 if (r >= 0) {
608 std::string read_tag;
609 bufferlist::iterator q = scrub_tag_bl.begin();
610 try {
611 ::decode(read_tag, q);
612 if (read_tag == filter_tag) {
613 dout(20) << "skipping " << oid << " because it has the filter_tag"
614 << dendl;
615 continue;
616 }
617 } catch (const buffer::error &err) {
618 }
619 dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
620 } else {
621 dout(20) << "no tag read (" << r << ")" << dendl;
622 }
623
624 } else if (untagged_only) {
625 assert(obj_name_offset == 0);
626 dout(20) << "OSD matched oid " << oid << dendl;
627 }
628
629 int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
630 if (r == 0 && this_oid_r < 0) {
631 r = this_oid_r;
632 }
633 }
634 }
635
636 return r;
637}
638
639int DataScan::scan_inodes()
640{
641 bool roots_present;
642 int r = driver->check_roots(&roots_present);
643 if (r != 0) {
644 derr << "Unexpected error checking roots: '"
645 << cpp_strerror(r) << "'" << dendl;
646 return r;
647 }
648
649 if (!roots_present) {
650 std::cerr << "Some or all system inodes are absent. Run 'init' from "
651 "one node before running 'scan_inodes'" << std::endl;
652 return -EIO;
653 }
654
655 return forall_objects(data_io, true, [this](
656 std::string const &oid,
657 uint64_t obj_name_ino,
658 uint64_t obj_name_offset) -> int
659 {
660 int r = 0;
661
662 dout(10) << "handling object "
663 << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
664 << dendl;
665
666 AccumulateResult accum_res;
667 inode_backtrace_t backtrace;
668 file_layout_t loaded_layout = file_layout_t::get_default();
669 r = ClsCephFSClient::fetch_inode_accumulate_result(
670 data_io, oid, &backtrace, &loaded_layout, &accum_res);
671
672 if (r == -EINVAL) {
673 dout(4) << "Accumulated metadata missing from '"
674 << oid << ", did you run scan_extents?" << dendl;
675 return r;
676 } else if (r < 0) {
677 dout(4) << "Unexpected error loading accumulated metadata from '"
678 << oid << "': " << cpp_strerror(r) << dendl;
679 // FIXME: this creates situation where if a client has a corrupt
680 // backtrace/layout, we will fail to inject it. We should (optionally)
681 // proceed if the backtrace/layout is corrupt but we have valid
682 // accumulated metadata.
683 return r;
684 }
685
686 const time_t file_mtime = accum_res.max_mtime;
687 uint64_t file_size = 0;
688 bool have_backtrace = !(backtrace.ancestors.empty());
689
690 // This is the layout we will use for injection, populated either
691 // from loaded_layout or from best guesses
692 file_layout_t guessed_layout;
693 guessed_layout.pool_id = data_pool_id;
694
695 // Calculate file_size, guess the layout
696 if (accum_res.ceiling_obj_index > 0) {
697 uint32_t chunk_size = file_layout_t::get_default().object_size;
698 // When there are multiple objects, the largest object probably
699 // indicates the chunk size. But not necessarily, because files
700 // can be sparse. Only make this assumption if size seen
701 // is a power of two, as chunk sizes typically are.
702 if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
703 chunk_size = accum_res.max_obj_size;
704 }
705
706 if (loaded_layout.pool_id == -1) {
707 // If no stashed layout was found, guess it
708 guessed_layout.object_size = chunk_size;
709 guessed_layout.stripe_unit = chunk_size;
710 guessed_layout.stripe_count = 1;
711 } else if (!loaded_layout.is_valid() ||
712 loaded_layout.object_size < accum_res.max_obj_size) {
713 // If the max size seen exceeds what the stashed layout claims, then
714 // disbelieve it. Guess instead. Same for invalid layouts on disk.
715 dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
716 << std::dec << ", ignoring in favour of best guess" << dendl;
717 guessed_layout.object_size = chunk_size;
718 guessed_layout.stripe_unit = chunk_size;
719 guessed_layout.stripe_count = 1;
720 } else {
721 // We have a stashed layout that we can't disprove, so apply it
722 guessed_layout = loaded_layout;
723 dout(20) << "loaded layout from xattr:"
724 << " os: " << guessed_layout.object_size
725 << " sc: " << guessed_layout.stripe_count
726 << " su: " << guessed_layout.stripe_unit
727 << dendl;
728 // User might have transplanted files from a pool with a different
729 // ID, so whatever the loaded_layout says, we'll force the injected
730 // layout to point to the pool we really read from
731 guessed_layout.pool_id = data_pool_id;
732 }
733
734 if (guessed_layout.stripe_count == 1) {
735 // Unstriped file: simple chunking
736 file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
737 + accum_res.ceiling_obj_size;
738 } else {
739 // Striped file: need to examine the last stripe_count objects
740 // in the file to determine the size.
741
742 // How many complete (i.e. not last stripe) objects?
743 uint64_t complete_objs = 0;
744 if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
745 complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
746 } else {
747 complete_objs = 0;
748 }
749
750 // How many potentially-short objects (i.e. last stripe set) objects?
751 uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
752
753 dout(10) << "calculating striped size from complete objs: "
754 << complete_objs << ", partial objs: " << partial_objs
755 << dendl;
756
757 // Maximum amount of data that may be in the incomplete objects
758 uint64_t incomplete_size = 0;
759
760 // For each short object, calculate the max file size within it
761 // and accumulate the maximum
762 for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
763 char buf[60];
764 snprintf(buf, sizeof(buf), "%llx.%08llx",
765 (long long unsigned)obj_name_ino, (long long unsigned)i);
766
767 uint64_t osize(0);
768 time_t omtime(0);
769 r = data_io.stat(std::string(buf), &osize, &omtime);
770 if (r == 0) {
771 if (osize > 0) {
772 // Upper bound within this object
773 uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
774 * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
775 + (i % guessed_layout.stripe_count)
776 * guessed_layout.stripe_unit + (osize - 1)
777 % guessed_layout.stripe_unit + 1;
778 incomplete_size = MAX(incomplete_size, upper_size);
779 }
780 } else if (r == -ENOENT) {
781 // Absent object, treat as size 0 and ignore.
782 } else {
783 // Unexpected error, carry r to outer scope for handling.
784 break;
785 }
786 }
787 if (r != 0 && r != -ENOENT) {
788 derr << "Unexpected error checking size of ino 0x" << std::hex
789 << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
790 return r;
791 }
792 file_size = complete_objs * guessed_layout.object_size
793 + incomplete_size;
794 }
795 } else {
796 file_size = accum_res.ceiling_obj_size;
797 if (loaded_layout.pool_id < 0
798 || loaded_layout.object_size < accum_res.max_obj_size) {
799 // No layout loaded, or inconsistent layout, use default
800 guessed_layout = file_layout_t::get_default();
801 guessed_layout.pool_id = data_pool_id;
802 } else {
803 guessed_layout = loaded_layout;
804 }
805 }
806
807 // Santity checking backtrace ino against object name
808 if (have_backtrace && backtrace.ino != obj_name_ino) {
809 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
810 << " doesn't match object name ino 0x" << obj_name_ino
811 << std::dec << dendl;
812 have_backtrace = false;
813 }
814
815 InodeStore dentry;
816 build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
817
818 // Inject inode to the metadata pool
819 if (have_backtrace) {
820 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
821 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
822 /* Special case for strays: even if we have a good backtrace,
823 * don't put it in the stray dir, because while that would technically
824 * give it linkage it would still be invisible to the user */
825 r = driver->inject_lost_and_found(obj_name_ino, dentry);
826 if (r < 0) {
827 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
828 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
829 if (r == -EINVAL) {
830 dout(4) << "Use --force-corrupt to overwrite structures that "
831 "appear to be corrupt" << dendl;
832 }
833 }
834 } else {
835 /* Happy case: we will inject a named dentry for this inode */
836 r = driver->inject_with_backtrace(backtrace, dentry);
837 if (r < 0) {
838 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
839 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
840 if (r == -EINVAL) {
841 dout(4) << "Use --force-corrupt to overwrite structures that "
842 "appear to be corrupt" << dendl;
843 }
844 }
845 }
846 } else {
847 /* Backtrace-less case: we will inject a lost+found dentry */
848 r = driver->inject_lost_and_found(
849 obj_name_ino, dentry);
850 if (r < 0) {
851 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
852 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
853 if (r == -EINVAL) {
854 dout(4) << "Use --force-corrupt to overwrite structures that "
855 "appear to be corrupt" << dendl;
856 }
857 }
858 }
859
860 return r;
861 });
862}
863
864int DataScan::cleanup()
865{
866 // We are looking for only zeroth object
867 //
868 return forall_objects(data_io, true, [this](
869 std::string const &oid,
870 uint64_t obj_name_ino,
871 uint64_t obj_name_offset) -> int
872 {
873 int r = 0;
874 r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
875 if (r < 0) {
876 dout(4) << "Error deleting accumulated metadata from '"
877 << oid << "': " << cpp_strerror(r) << dendl;
878 }
879 return r;
880 });
881}
882
883bool DataScan::valid_ino(inodeno_t ino) const
884{
885 return (ino >= inodeno_t((1ull << 40)))
886 || (MDS_INO_IS_STRAY(ino))
887 || (MDS_INO_IS_MDSDIR(ino))
888 || ino == MDS_INO_ROOT
889 || ino == MDS_INO_CEPH;
890}
891
892int DataScan::scan_links()
893{
894 MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
895 if (!metadata_driver) {
896 derr << "Unexpected --output-dir option for scan_links" << dendl;
897 return -EINVAL;
898 }
899
900 interval_set<inodeno_t> used_inos;
901 map<inodeno_t, int> remote_links;
902
903 struct link_info_t {
904 inodeno_t dirino;
905 frag_t frag;
906 string name;
907 version_t version;
908 int nlink;
909 bool is_dir;
910 link_info_t() : version(0), nlink(0), is_dir(false) {}
911 link_info_t(inodeno_t di, frag_t df, const string& n, const inode_t i) :
912 dirino(di), frag(df), name(n),
913 version(i.version), nlink(i.nlink), is_dir(S_IFDIR & i.mode) {}
914 dirfrag_t dirfrag() const {
915 return dirfrag_t(dirino, frag);
916 }
917 };
918 map<inodeno_t, list<link_info_t> > dup_primaries;
919 map<inodeno_t, link_info_t> bad_nlink_inos;
920
921 map<dirfrag_t, set<string> > to_remove;
922
923 enum {
924 SCAN_INOS = 1,
925 CHECK_LINK,
926 };
927
928 for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
929 const librados::NObjectIterator it_end = metadata_io.nobjects_end();
930 for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
931 const std::string oid = it->get_oid();
932
933 uint64_t dir_ino = 0;
934 uint64_t frag_id = 0;
935 int r = parse_oid(oid, &dir_ino, &frag_id);
936 if (r == -EINVAL) {
937 dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
938 continue;
939 } else {
940 // parse_oid can only do 0 or -EINVAL
941 assert(r == 0);
942 }
943
944 if (!valid_ino(dir_ino)) {
945 dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
946 continue;
947 }
948
949 std::map<std::string, bufferlist> items;
950 r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
951 if (r < 0) {
952 derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
953 return r;
954 }
955
956 for (auto& p : items) {
957 bufferlist::iterator q = p.second.begin();
958 string dname;
959 snapid_t last;
960 dentry_key_t::decode_helper(p.first, dname, last);
961
962 if (last != CEPH_NOSNAP)
963 continue;
964
965 try {
966 snapid_t dnfirst;
967 ::decode(dnfirst, q);
968 char dentry_type;
969 ::decode(dentry_type, q);
970 if (dentry_type == 'I') {
971 InodeStore inode;
972 inode.decode_bare(q);
973 inodeno_t ino = inode.inode.ino;
974
975 if (step == SCAN_INOS) {
976 if (used_inos.contains(ino, 1)) {
977 dup_primaries[ino].size();
978 } else {
979 used_inos.insert(ino);
980 }
981 } else if (step == CHECK_LINK) {
982 auto q = dup_primaries.find(ino);
983 if (q != dup_primaries.end()) {
984 q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
985 } else {
986 int nlink = 0;
987 auto r = remote_links.find(ino);
988 if (r != remote_links.end())
989 nlink = r->second;
990 if (!MDS_INO_IS_STRAY(dir_ino))
991 nlink++;
992 if (inode.inode.nlink != nlink) {
993 derr << "Bad nlink on " << ino << " expected " << nlink
994 << " has " << inode.inode.nlink << dendl;
995 bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
996 bad_nlink_inos[ino].nlink = nlink;
997 }
998 }
999 }
1000 } else if (dentry_type == 'L') {
1001 inodeno_t ino;
1002 unsigned char d_type;
1003 ::decode(ino, q);
1004 ::decode(d_type, q);
1005
1006 if (step == SCAN_INOS) {
1007 remote_links[ino]++;
1008 } else if (step == CHECK_LINK) {
1009 if (!used_inos.contains(ino, 1)) {
1010 derr << "Bad remote link dentry 0x" << std::hex << dir_ino
1011 << std::dec << "/" << dname
1012 << ", ino " << ino << " not found" << dendl;
1013 std::string key;
1014 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1015 dn_key.encode(key);
1016 to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
1017 }
1018 }
1019 } else {
1020 derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
1021 << std::dec << "/" << dname << dendl;
1022 return -EINVAL;
1023 }
1024 } catch (const buffer::error &err) {
1025 derr << "Error decoding dentry 0x" << std::hex << dir_ino
1026 << std::dec << "/" << dname << dendl;
1027 return -EINVAL;
1028 }
1029 }
1030 }
1031 }
1032 used_inos.clear();
1033
1034 for (auto& p : dup_primaries) {
1035 link_info_t newest;
1036 for (auto& q : p.second) {
1037 if (q.version > newest.version) {
1038 newest = q;
1039 } else if (q.version == newest.version &&
1040 !MDS_INO_IS_STRAY(q.dirino) &&
1041 MDS_INO_IS_STRAY(newest.dirino)) {
1042 newest = q;
1043 }
1044 }
1045
1046 for (auto& q : p.second) {
1047 // in the middle of dir fragmentation?
1048 if (newest.dirino == q.dirino && newest.name == q.name)
1049 continue;
1050
1051 std::string key;
1052 dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
1053 dn_key.encode(key);
1054 to_remove[q.dirfrag()].insert(key);
1055 derr << "Remove duplicated ino 0x" << p.first << " from "
1056 << q.dirfrag() << "/" << q.name << dendl;
1057 }
1058
1059 int nlink = 0;
1060 auto q = remote_links.find(p.first);
1061 if (q != remote_links.end())
1062 nlink = q->second;
1063 if (!MDS_INO_IS_STRAY(newest.dirino))
1064 nlink++;
1065
1066 if (nlink != newest.nlink) {
1067 derr << "Bad nlink on " << p.first << " expected " << nlink
1068 << " has " << newest.nlink << dendl;
1069 bad_nlink_inos[p.first] = newest;
1070 bad_nlink_inos[p.first].nlink = nlink;
1071 }
1072 }
1073 dup_primaries.clear();
1074 remote_links.clear();
1075
1076 for (auto& p : to_remove) {
1077 object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
1078
1079 int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
1080 if (r != 0) {
1081 derr << "Error removing duplicated dentries from " << p.first << dendl;
1082 return r;
1083 }
1084 }
1085 to_remove.clear();
1086
1087 for (auto &p : bad_nlink_inos) {
1088 InodeStore inode;
1089 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode);
1090 if (r < 0) {
1091 derr << "Unexpected error reading dentry "
1092 << p.second.dirfrag() << "/" << p.second.name
1093 << ": " << cpp_strerror(r) << dendl;
1094 return r;
1095 }
1096
1097 if (inode.inode.ino != p.first || inode.inode.version != p.second.version)
1098 continue;
1099
1100 inode.inode.nlink = p.second.nlink;
1101 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode);
1102 if (r < 0)
1103 return r;
1104 }
1105
1106 return 0;
1107}
1108
1109int DataScan::scan_frags()
1110{
1111 bool roots_present;
1112 int r = driver->check_roots(&roots_present);
1113 if (r != 0) {
1114 derr << "Unexpected error checking roots: '"
1115 << cpp_strerror(r) << "'" << dendl;
1116 return r;
1117 }
1118
1119 if (!roots_present) {
1120 std::cerr << "Some or all system inodes are absent. Run 'init' from "
1121 "one node before running 'scan_inodes'" << std::endl;
1122 return -EIO;
1123 }
1124
1125 return forall_objects(metadata_io, true, [this](
1126 std::string const &oid,
1127 uint64_t obj_name_ino,
1128 uint64_t obj_name_offset) -> int
1129 {
1130 int r = 0;
1131 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
1132 if (r != 0) {
1133 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
1134 return r;
1135 }
1136
1137 if (obj_name_ino < (1ULL << 40)) {
1138 // FIXME: we're skipping stray dirs here: if they're
1139 // orphaned then we should be resetting them some other
1140 // way
1141 dout(10) << "Skipping system ino " << obj_name_ino << dendl;
1142 return 0;
1143 }
1144
1145 AccumulateResult accum_res;
1146 inode_backtrace_t backtrace;
1147
1148 // Default to inherit layout (i.e. no explicit layout on dir) which is
1149 // expressed as a zeroed layout struct (see inode_t::has_layout)
1150 file_layout_t loaded_layout;
1151
1152 int parent_r = 0;
1153 bufferlist parent_bl;
1154 int layout_r = 0;
1155 bufferlist layout_bl;
1156 bufferlist op_bl;
1157
1158 librados::ObjectReadOperation op;
1159 op.getxattr("parent", &parent_bl, &parent_r);
1160 op.getxattr("layout", &layout_bl, &layout_r);
1161 r = metadata_io.operate(oid, &op, &op_bl);
1162 if (r != 0 && r != -ENODATA) {
1163 derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
1164 return r;
1165 }
1166
1167 if (parent_r != -ENODATA) {
1168 try {
1169 bufferlist::iterator q = parent_bl.begin();
1170 backtrace.decode(q);
1171 } catch (buffer::error &e) {
1172 dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl;
1173 if (!force_corrupt) {
1174 return -EINVAL;
1175 } else {
1176 // Treat backtrace as absent: we'll inject into lost+found
1177 backtrace = inode_backtrace_t();
1178 }
1179 }
1180 }
1181
1182 if (layout_r != -ENODATA) {
1183 try {
1184 bufferlist::iterator q = layout_bl.begin();
1185 ::decode(loaded_layout, q);
1186 } catch (buffer::error &e) {
1187 dout(4) << "Corrupt layout on '" << oid << "': " << e << dendl;
1188 if (!force_corrupt) {
1189 return -EINVAL;
1190 }
1191 }
1192 }
1193
1194 bool have_backtrace = !(backtrace.ancestors.empty());
1195
1196 // Santity checking backtrace ino against object name
1197 if (have_backtrace && backtrace.ino != obj_name_ino) {
1198 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
1199 << " doesn't match object name ino 0x" << obj_name_ino
1200 << std::dec << dendl;
1201 have_backtrace = false;
1202 }
1203
1204 uint64_t fnode_version = 0;
1205 fnode_t fnode;
1206 r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
1207 if (r == -EINVAL) {
1208 derr << "Corrupt fnode on " << oid << dendl;
1209 if (force_corrupt) {
1210 fnode.fragstat.mtime = 0;
1211 fnode.fragstat.nfiles = 1;
1212 fnode.fragstat.nsubdirs = 0;
1213 fnode.accounted_fragstat = fnode.fragstat;
1214 } else {
1215 return r;
1216 }
1217 }
1218
1219 InodeStore dentry;
1220 build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
1221 loaded_layout, &dentry);
1222
1223 // Inject inode to the metadata pool
1224 if (have_backtrace) {
1225 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
1226 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
1227 /* Special case for strays: even if we have a good backtrace,
1228 * don't put it in the stray dir, because while that would technically
1229 * give it linkage it would still be invisible to the user */
1230 r = driver->inject_lost_and_found(obj_name_ino, dentry);
1231 if (r < 0) {
1232 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1233 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1234 if (r == -EINVAL) {
1235 dout(4) << "Use --force-corrupt to overwrite structures that "
1236 "appear to be corrupt" << dendl;
1237 }
1238 }
1239 } else {
1240 /* Happy case: we will inject a named dentry for this inode */
1241 r = driver->inject_with_backtrace(backtrace, dentry);
1242 if (r < 0) {
1243 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1244 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
1245 if (r == -EINVAL) {
1246 dout(4) << "Use --force-corrupt to overwrite structures that "
1247 "appear to be corrupt" << dendl;
1248 }
1249 }
1250 }
1251 } else {
1252 /* Backtrace-less case: we will inject a lost+found dentry */
1253 r = driver->inject_lost_and_found(
1254 obj_name_ino, dentry);
1255 if (r < 0) {
1256 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
1257 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1258 if (r == -EINVAL) {
1259 dout(4) << "Use --force-corrupt to overwrite structures that "
1260 "appear to be corrupt" << dendl;
1261 }
1262 }
1263 }
1264
1265 return r;
1266 });
1267}
1268
1269int MetadataTool::read_fnode(
1270 inodeno_t ino, frag_t frag, fnode_t *fnode,
1271 uint64_t *last_version)
1272{
1273 assert(fnode != NULL);
1274
1275 object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
1276 bufferlist fnode_bl;
1277 int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
1278 *last_version = metadata_io.get_last_version();
1279 if (r < 0) {
1280 return r;
1281 }
1282
1283 bufferlist::iterator old_fnode_iter = fnode_bl.begin();
1284 try {
1285 (*fnode).decode(old_fnode_iter);
1286 } catch (const buffer::error &err) {
1287 return -EINVAL;
1288 }
1289
1290 return 0;
1291}
1292
1293int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
1294 const std::string &dname, InodeStore *inode)
1295{
1296 assert(inode != NULL);
1297
1298
1299 std::string key;
1300 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1301 dn_key.encode(key);
1302
1303 std::set<std::string> keys;
1304 keys.insert(key);
1305 std::map<std::string, bufferlist> vals;
1306 object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
1307 int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
1308 dout(20) << "oid=" << frag_oid.name
1309 << " dname=" << dname
1310 << " frag=" << frag
1311 << ", r=" << r << dendl;
1312 if (r < 0) {
1313 return r;
1314 }
1315
1316 if (vals.find(key) == vals.end()) {
1317 dout(20) << key << " not found in result" << dendl;
1318 return -ENOENT;
1319 }
1320
1321 try {
1322 bufferlist::iterator q = vals[key].begin();
1323 snapid_t dnfirst;
1324 ::decode(dnfirst, q);
1325 char dentry_type;
1326 ::decode(dentry_type, q);
1327 if (dentry_type == 'I') {
1328 inode->decode_bare(q);
1329 return 0;
1330 } else {
1331 dout(20) << "dentry type '" << dentry_type << "': cannot"
1332 "read an inode out of that" << dendl;
1333 return -EINVAL;
1334 }
1335 } catch (const buffer::error &err) {
1336 dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
1337 << std::dec << "/" << dname << dendl;
1338 return -EINVAL;
1339 }
1340
1341 return 0;
1342}
1343
1344int MetadataDriver::inject_lost_and_found(
1345 inodeno_t ino, const InodeStore &dentry)
1346{
1347 // Create lost+found if doesn't exist
1348 bool created = false;
1349 int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
1350 if (r < 0) {
1351 return r;
1352 }
1353 InodeStore lf_ino;
1354 r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
1355 if (r == -ENOENT || r == -EINVAL) {
1356 if (r == -EINVAL && !force_corrupt) {
1357 return r;
1358 }
1359
1360 // To have a directory not specify a layout, give it zeros (see
1361 // inode_t::has_layout)
1362 file_layout_t inherit_layout;
1363
1364 // Construct LF inode
1365 frag_info_t fragstat;
1366 fragstat.nfiles = 1,
1367 build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
1368
1369 // Inject link to LF inode in the root dir
1370 r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
1371 if (r < 0) {
1372 return r;
1373 }
1374 } else {
1375 if (!(lf_ino.inode.mode & S_IFDIR)) {
1376 derr << "lost+found exists but is not a directory!" << dendl;
1377 // In this case we error out, and the user should do something about
1378 // this problem.
1379 return -EINVAL;
1380 }
1381 }
1382
1383 r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
1384 if (r < 0) {
1385 return r;
1386 }
1387
1388 InodeStore recovered_ino;
1389
1390
1391 const std::string dname = lost_found_dname(ino);
1392
1393 // Write dentry into lost+found dirfrag
1394 return inject_linkage(lf_ino.inode.ino, dname, frag_t(), dentry);
1395}
1396
1397
1398int MetadataDriver::get_frag_of(
1399 inodeno_t dirino,
1400 const std::string &target_dname,
1401 frag_t *result_ft)
1402{
1403 object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
1404
1405 dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
1406
1407 // Find and load fragtree if existing dirfrag
1408 // ==========================================
1409 bool have_backtrace = false;
1410 bufferlist parent_bl;
1411 int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
1412 if (r == -ENODATA) {
1413 dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
1414 } else if (r < 0) {
1415 dout(4) << "Unexpected error on '" << root_frag_oid << "': "
1416 << cpp_strerror(r) << dendl;
1417 return r;
1418 }
1419
1420 // Deserialize backtrace
1421 inode_backtrace_t backtrace;
1422 if (parent_bl.length()) {
1423 try {
1424 bufferlist::iterator q = parent_bl.begin();
1425 backtrace.decode(q);
1426 have_backtrace = true;
1427 } catch (buffer::error &e) {
1428 dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " << e << dendl;
1429 }
1430 }
1431
1432 if (!(have_backtrace && backtrace.ancestors.size())) {
1433 // Can't work out fragtree without a backtrace
1434 dout(4) << "No backtrace on '" << root_frag_oid
1435 << "': cannot determine fragtree" << dendl;
1436 return -ENOENT;
1437 }
1438
1439 // The parentage of dirino
1440 const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
1441
1442 // The inode of dirino's parent
1443 const inodeno_t parent_ino = bp.dirino;
1444
1445 // The dname of dirino in its parent.
1446 const std::string &parent_dname = bp.dname;
1447
1448 dout(20) << "got backtrace parent " << parent_ino << "/"
1449 << parent_dname << dendl;
1450
1451 // The primary dentry for dirino
1452 InodeStore existing_dentry;
1453
1454 // See if we can find ourselves in dirfrag zero of the parent: this
1455 // is a fast path that avoids needing to go further up the tree
1456 // if the parent isn't fragmented (worst case we would have to
1457 // go all the way to the root)
1458 r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
1459 if (r >= 0) {
1460 // Great, fast path: return the fragtree from here
1461 if (existing_dentry.inode.ino != dirino) {
1462 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
1463 << existing_dentry.inode.ino
1464 << " vs expected 0x" << dirino << std::dec << dendl;
1465 return -ENOENT;
1466 }
1467 dout(20) << "fast path, fragtree is "
1468 << existing_dentry.dirfragtree << dendl;
1469 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1470 dout(20) << "frag is " << *result_ft << dendl;
1471 return 0;
1472 } else if (r != -ENOENT) {
1473 // Dentry not present in 0th frag, must read parent's fragtree
1474 frag_t parent_frag;
1475 r = get_frag_of(parent_ino, parent_dname, &parent_frag);
1476 if (r == 0) {
1477 // We have the parent fragtree, so try again to load our dentry
1478 r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
1479 if (r >= 0) {
1480 // Got it!
1481 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1482 dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
1483 return 0;
1484 } else {
1485 if (r == -EINVAL || r == -ENOENT) {
1486 return -ENOENT; // dentry missing or corrupt, so frag is missing
1487 } else {
1488 return r;
1489 }
1490 }
1491 } else {
1492 // Couldn't resolve parent fragtree, so can't find ours.
1493 return r;
1494 }
1495 } else if (r == -EINVAL) {
1496 // Unreadable dentry, can't know the fragtree.
1497 return -ENOENT;
1498 } else {
1499 // Unexpected error, raise it
1500 return r;
1501 }
1502}
1503
1504
1505int MetadataDriver::inject_with_backtrace(
1506 const inode_backtrace_t &backtrace, const InodeStore &dentry)
1507
1508{
1509
1510 // On dirfrags
1511 // ===========
1512 // In order to insert something into a directory, we first (ideally)
1513 // need to know the fragtree for the directory. Sometimes we can't
1514 // get that, in which case we just go ahead and insert it into
1515 // fragment zero for a good chance of that being the right thing
1516 // anyway (most moderate-sized dirs aren't fragmented!)
1517
1518 // On ancestry
1519 // ===========
1520 // My immediate ancestry should be correct, so if we can find that
1521 // directory's dirfrag then go inject it there. This works well
1522 // in the case that this inode's dentry was somehow lost and we
1523 // are recreating it, because the rest of the hierarchy
1524 // will probably still exist.
1525 //
1526 // It's more of a "better than nothing" approach when rebuilding
1527 // a whole tree, as backtraces will in general not be up to date
1528 // beyond the first parent, if anything in the trace was ever
1529 // moved after the file was created.
1530
1531 // On inode numbers
1532 // ================
1533 // The backtrace tells us inodes for each of the parents. If we are
1534 // creating those parent dirfrags, then there is a risk that somehow
1535 // the inode indicated here was also used for data (not a dirfrag) at
1536 // some stage. That would be a zany situation, and we don't check
1537 // for it here, because to do so would require extra IOs for everything
1538 // we inject, and anyway wouldn't guarantee that the inode number
1539 // wasn't in use in some dentry elsewhere in the metadata tree that
1540 // just happened not to have any data objects.
1541
1542 // On multiple workers touching the same traces
1543 // ============================================
1544 // When creating linkage for a directory, *only* create it if we are
1545 // also creating the object. That way, we might not manage to get the
1546 // *right* linkage for a directory, but at least we won't multiply link
1547 // it. We assume that if a root dirfrag exists for a directory, then
1548 // it is linked somewhere (i.e. that the metadata pool is not already
1549 // inconsistent).
1550 //
1551 // Making sure *that* is true is someone else's job! Probably someone
1552 // who is not going to run in parallel, so that they can self-consistently
1553 // look at versions and move things around as they go.
1554 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1555 // object, next run will fail to create linkage for the dirfrag object
1556 // and leave it orphaned.
1557
1558 inodeno_t ino = backtrace.ino;
1559 dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
1560 for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
1561 i != backtrace.ancestors.end(); ++i) {
1562 const inode_backpointer_t &backptr = *i;
1563 dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
1564 << "/" << backptr.dname << dendl;
1565
1566 // Examine root dirfrag for parent
1567 const inodeno_t parent_ino = backptr.dirino;
1568 const std::string dname = backptr.dname;
1569
1570 frag_t fragment;
1571 int r = get_frag_of(parent_ino, dname, &fragment);
1572 if (r == -ENOENT) {
1573 // Don't know fragment, fall back to assuming root
1574 dout(20) << "don't know fragment for 0x" << std::hex <<
1575 parent_ino << std::dec << "/" << dname << ", will insert to root"
1576 << dendl;
1577 }
1578
1579 // Find or create dirfrag
1580 // ======================
1581 bool created_dirfrag;
1582 r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
1583 if (r < 0) {
1584 return r;
1585 }
1586
1587 // Check if dentry already exists
1588 // ==============================
1589 InodeStore existing_dentry;
1590 r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
1591 bool write_dentry = false;
1592 if (r == -ENOENT || r == -EINVAL) {
1593 if (r == -EINVAL && !force_corrupt) {
1594 return r;
1595 }
1596 // Missing or corrupt dentry
1597 write_dentry = true;
1598 } else if (r < 0) {
1599 derr << "Unexpected error reading dentry 0x" << std::hex
1600 << parent_ino << std::dec << "/"
1601 << dname << ": " << cpp_strerror(r) << dendl;
1602 break;
1603 } else {
1604 // Dentry already present, does it link to me?
1605 if (existing_dentry.inode.ino == ino) {
1606 dout(20) << "Dentry 0x" << std::hex
1607 << parent_ino << std::dec << "/"
1608 << dname << " already exists and points to me" << dendl;
1609 } else {
1610 derr << "Dentry 0x" << std::hex
1611 << parent_ino << std::dec << "/"
1612 << dname << " already exists but points to 0x"
1613 << std::hex << existing_dentry.inode.ino << std::dec << dendl;
1614 // Fall back to lost+found!
1615 return inject_lost_and_found(backtrace.ino, dentry);
1616 }
1617 }
1618
1619 // Inject linkage
1620 // ==============
1621
1622 if (write_dentry) {
1623 if (i == backtrace.ancestors.begin()) {
1624 // This is the linkage for the file of interest
1625 dout(10) << "Linking inode 0x" << std::hex << ino
1626 << " at 0x" << parent_ino << "/" << dname << std::dec
1627 << " with size=" << dentry.inode.size << " bytes" << dendl;
1628
1629 r = inject_linkage(parent_ino, dname, fragment, dentry);
1630 } else {
1631 // This is the linkage for an ancestor directory
1632 InodeStore ancestor_dentry;
1633 ancestor_dentry.inode.mode = 0755 | S_IFDIR;
1634
1635 // Set nfiles to something non-zero, to fool any other code
1636 // that tries to ignore 'empty' directories. This won't be
1637 // accurate, but it should avoid functional issues.
1638
1639 ancestor_dentry.inode.dirstat.nfiles = 1;
1640 ancestor_dentry.inode.dir_layout.dl_dir_hash =
1641 g_conf->mds_default_dir_hash;
1642
1643 ancestor_dentry.inode.nlink = 1;
1644 ancestor_dentry.inode.ino = ino;
1645 ancestor_dentry.inode.uid = g_conf->mds_root_ino_uid;
1646 ancestor_dentry.inode.gid = g_conf->mds_root_ino_gid;
1647 ancestor_dentry.inode.version = 1;
1648 ancestor_dentry.inode.backtrace_version = 1;
1649 r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
1650 }
1651
1652 if (r < 0) {
1653 return r;
1654 }
1655 }
1656
1657 if (!created_dirfrag) {
1658 // If the parent dirfrag already existed, then stop traversing the
1659 // backtrace: assume that the other ancestors already exist too. This
1660 // is an assumption rather than a truth, but it's a convenient way
1661 // to avoid the risk of creating multiply-linked directories while
1662 // injecting data. If there are in fact missing ancestors, this
1663 // should be fixed up using a separate tool scanning the metadata
1664 // pool.
1665 break;
1666 } else {
1667 // Proceed up the backtrace, creating parents
1668 ino = parent_ino;
1669 }
1670 }
1671
1672 return 0;
1673}
1674
1675int MetadataDriver::find_or_create_dirfrag(
1676 inodeno_t ino,
1677 frag_t fragment,
1678 bool *created)
1679{
1680 assert(created != NULL);
1681
1682 fnode_t existing_fnode;
1683 *created = false;
1684
1685 uint64_t read_version = 0;
1686 int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
1687 dout(10) << "read_version = " << read_version << dendl;
1688
1689 if (r == -ENOENT || r == -EINVAL) {
1690 if (r == -EINVAL && !force_corrupt) {
1691 return r;
1692 }
1693
1694 // Missing or corrupt fnode, create afresh
1695 bufferlist fnode_bl;
1696 fnode_t blank_fnode;
1697 blank_fnode.version = 1;
1698 // mark it as non-empty
1699 blank_fnode.fragstat.nfiles = 1;
1700 blank_fnode.accounted_fragstat = blank_fnode.fragstat;
1701 blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
1702 blank_fnode.encode(fnode_bl);
1703
1704
1705 librados::ObjectWriteOperation op;
1706
1707 if (read_version) {
1708 assert(r == -EINVAL);
1709 // Case A: We must assert that the version isn't changed since we saw the object
1710 // was unreadable, to avoid the possibility of two data-scan processes
1711 // both creating the frag.
1712 op.assert_version(read_version);
1713 } else {
1714 assert(r == -ENOENT);
1715 // Case B: The object didn't exist in read_fnode, so while creating it we must
1716 // use an exclusive create to correctly populate *creating with
1717 // whether we created it ourselves or someone beat us to it.
1718 op.create(true);
1719 }
1720
1721 object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
1722 op.omap_set_header(fnode_bl);
1723 r = metadata_io.operate(frag_oid.name, &op);
1724 if (r == -EOVERFLOW || r == -EEXIST) {
1725 // Someone else wrote it (see case A above)
1726 dout(10) << "Dirfrag creation race: 0x" << std::hex
1727 << ino << " " << fragment << std::dec << dendl;
1728 *created = false;
1729 return 0;
1730 } else if (r < 0) {
1731 // We were unable to create or write it, error out
1732 derr << "Failed to create dirfrag 0x" << std::hex
1733 << ino << std::dec << ": " << cpp_strerror(r) << dendl;
1734 return r;
1735 } else {
1736 // Success: the dirfrag object now exists with a value header
1737 dout(10) << "Created dirfrag: 0x" << std::hex
1738 << ino << std::dec << dendl;
1739 *created = true;
1740 }
1741 } else if (r < 0) {
1742 derr << "Unexpected error reading dirfrag 0x" << std::hex
1743 << ino << std::dec << " : " << cpp_strerror(r) << dendl;
1744 return r;
1745 } else {
1746 dout(20) << "Dirfrag already exists: 0x" << std::hex
1747 << ino << " " << fragment << std::dec << dendl;
1748 }
1749
1750 return 0;
1751}
1752
1753int MetadataDriver::inject_linkage(
1754 inodeno_t dir_ino, const std::string &dname,
1755 const frag_t fragment, const InodeStore &inode)
1756{
1757 // We have no information about snapshots, so everything goes
1758 // in as CEPH_NOSNAP
1759 snapid_t snap = CEPH_NOSNAP;
1760
1761 object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
1762
1763 std::string key;
1764 dentry_key_t dn_key(snap, dname.c_str());
1765 dn_key.encode(key);
1766
1767 bufferlist dentry_bl;
1768 ::encode(snap, dentry_bl);
1769 ::encode('I', dentry_bl);
1770 inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1771
1772 // Write out
1773 std::map<std::string, bufferlist> vals;
1774 vals[key] = dentry_bl;
1775 int r = metadata_io.omap_set(frag_oid.name, vals);
1776 if (r != 0) {
1777 derr << "Error writing dentry 0x" << std::hex
1778 << dir_ino << std::dec << "/"
1779 << dname << ": " << cpp_strerror(r) << dendl;
1780 return r;
1781 } else {
1782 dout(20) << "Injected dentry 0x" << std::hex
1783 << dir_ino << "/" << dname << " pointing to 0x"
1784 << inode.inode.ino << std::dec << dendl;
1785 return 0;
1786 }
1787}
1788
1789
1790int MetadataDriver::init(
1791 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
1792 fs_cluster_id_t fscid)
1793{
1794 if (metadata_pool_name.empty()) {
1795 auto fs = fsmap->get_filesystem(fscid);
1796 assert(fs != nullptr);
1797 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
1798
1799 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
1800 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
1801 if (r < 0) {
1802 derr << "Pool " << metadata_pool_id
1803 << " identified in MDS map not found in RADOS!" << dendl;
1804 return r;
1805 }
1806 dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
1807 } else {
1808 dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
1809 }
1810 return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
1811}
1812
1813int LocalFileDriver::init(
1814 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
1815 fs_cluster_id_t fscid)
1816{
1817 return 0;
1818}
1819
1820int LocalFileDriver::inject_data(
1821 const std::string &file_path,
1822 uint64_t size,
1823 uint32_t chunk_size,
1824 inodeno_t ino)
1825{
1826 // Scrape the file contents out of the data pool and into the
1827 // local filesystem
1828 std::fstream f;
1829 f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
1830
1831 for (uint64_t offset = 0; offset < size; offset += chunk_size) {
1832 bufferlist bl;
1833
1834 char buf[32];
1835 snprintf(buf, sizeof(buf),
1836 "%llx.%08llx",
1837 (unsigned long long)ino,
1838 (unsigned long long)(offset / chunk_size));
1839 std::string oid(buf);
1840
1841 int r = data_io.read(oid, bl, chunk_size, 0);
1842
1843 if (r <= 0 && r != -ENOENT) {
1844 derr << "error reading data object '" << oid << "': "
1845 << cpp_strerror(r) << dendl;
1846 f.close();
1847 return r;
1848 } else if (r >=0) {
1849
1850 f.seekp(offset);
1851 bl.write_stream(f);
1852 }
1853 }
1854 f.close();
1855
1856 return 0;
1857}
1858
1859
1860int LocalFileDriver::inject_with_backtrace(
1861 const inode_backtrace_t &bt,
1862 const InodeStore &dentry)
1863{
1864 std::string path_builder = path;
1865
1866 // Iterate through backtrace creating directory parents
1867 std::vector<inode_backpointer_t>::const_reverse_iterator i;
1868 for (i = bt.ancestors.rbegin();
1869 i != bt.ancestors.rend(); ++i) {
1870
1871 const inode_backpointer_t &backptr = *i;
1872 path_builder += "/";
1873 path_builder += backptr.dname;
1874
1875 // Last entry is the filename itself
1876 bool is_file = (i + 1 == bt.ancestors.rend());
1877 if (is_file) {
1878 // FIXME: inject_data won't cope with interesting (i.e. striped)
1879 // layouts (need a librados-compatible Filer to read these)
1880 inject_data(path_builder, dentry.inode.size,
1881 dentry.inode.layout.object_size, bt.ino);
1882 } else {
1883 int r = mkdir(path_builder.c_str(), 0755);
1884 if (r != 0 && r != -EPERM) {
1885 derr << "error creating directory: '" << path_builder << "': "
1886 << cpp_strerror(r) << dendl;
1887 return r;
1888 }
1889 }
1890 }
1891
1892 return 0;
1893}
1894
1895int LocalFileDriver::inject_lost_and_found(
1896 inodeno_t ino,
1897 const InodeStore &dentry)
1898{
1899 std::string lf_path = path + "/lost+found";
1900 int r = mkdir(lf_path.c_str(), 0755);
1901 if (r != 0 && r != -EPERM) {
1902 derr << "error creating directory: '" << lf_path << "': "
1903 << cpp_strerror(r) << dendl;
1904 return r;
1905 }
1906
1907 std::string file_path = lf_path + "/" + lost_found_dname(ino);
1908 return inject_data(file_path, dentry.inode.size,
1909 dentry.inode.layout.object_size, ino);
1910}
1911
1912int LocalFileDriver::init_roots(int64_t data_pool_id)
1913{
1914 // Ensure that the path exists and is a directory
1915 bool exists;
1916 int r = check_roots(&exists);
1917 if (r != 0) {
1918 return r;
1919 }
1920
1921 if (exists) {
1922 return 0;
1923 } else {
1924 return ::mkdir(path.c_str(), 0755);
1925 }
1926}
1927
1928int LocalFileDriver::check_roots(bool *result)
1929{
1930 // Check if the path exists and is a directory
1931 DIR *d = ::opendir(path.c_str());
1932 if (d == NULL) {
1933 *result = false;
1934 } else {
1935 int r = closedir(d);
1936 if (r != 0) {
1937 // Weird, but maybe possible with e.g. stale FD on NFS mount?
1938 *result = false;
1939 } else {
1940 *result = true;
1941 }
1942 }
1943
1944 return 0;
1945}
1946
1947void MetadataTool::build_file_dentry(
1948 inodeno_t ino, uint64_t file_size, time_t file_mtime,
1949 const file_layout_t &layout, InodeStore *out)
1950{
1951 assert(out != NULL);
1952
1953 out->inode.mode = 0500 | S_IFREG;
1954 out->inode.size = file_size;
1955 out->inode.max_size_ever = file_size;
1956 out->inode.mtime.tv.tv_sec = file_mtime;
1957 out->inode.atime.tv.tv_sec = file_mtime;
1958 out->inode.ctime.tv.tv_sec = file_mtime;
1959
1960 out->inode.layout = layout;
1961
1962 out->inode.truncate_seq = 1;
1963 out->inode.truncate_size = -1ull;
1964
1965 out->inode.inline_data.version = CEPH_INLINE_NONE;
1966
1967 out->inode.nlink = 1;
1968 out->inode.ino = ino;
1969 out->inode.version = 1;
1970 out->inode.backtrace_version = 1;
1971 out->inode.uid = g_conf->mds_root_ino_uid;
1972 out->inode.gid = g_conf->mds_root_ino_gid;
1973}
1974
1975void MetadataTool::build_dir_dentry(
1976 inodeno_t ino, const frag_info_t &fragstat,
1977 const file_layout_t &layout, InodeStore *out)
1978{
1979 assert(out != NULL);
1980
1981 out->inode.mode = 0755 | S_IFDIR;
1982 out->inode.dirstat = fragstat;
1983 out->inode.mtime.tv.tv_sec = fragstat.mtime;
1984 out->inode.atime.tv.tv_sec = fragstat.mtime;
1985 out->inode.ctime.tv.tv_sec = fragstat.mtime;
1986
1987 out->inode.layout = layout;
1988 out->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
1989
1990 out->inode.truncate_seq = 1;
1991 out->inode.truncate_size = -1ull;
1992
1993 out->inode.inline_data.version = CEPH_INLINE_NONE;
1994
1995 out->inode.nlink = 1;
1996 out->inode.ino = ino;
1997 out->inode.version = 1;
1998 out->inode.backtrace_version = 1;
1999 out->inode.uid = g_conf->mds_root_ino_uid;
2000 out->inode.gid = g_conf->mds_root_ino_gid;
2001}
2002