]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/cephfs/DataScan.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / tools / cephfs / DataScan.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
31f18b77 15#include "include/compat.h"
7c673cae
FG
16#include "common/errno.h"
17#include "common/ceph_argparse.h"
18#include <fstream>
19#include "include/util.h"
b3b6e05e 20#include "include/ceph_fs.h"
7c673cae 21
f67539c2 22#include "mds/CDentry.h"
7c673cae 23#include "mds/CInode.h"
f67539c2 24#include "mds/CDentry.h"
11fdf7f2
TL
25#include "mds/InoTable.h"
26#include "mds/SnapServer.h"
7c673cae
FG
27#include "cls/cephfs/cls_cephfs_client.h"
28
29#include "PgFiles.h"
30#include "DataScan.h"
31#include "include/compat.h"
32
33#define dout_context g_ceph_context
34#define dout_subsys ceph_subsys_mds
35#undef dout_prefix
36#define dout_prefix *_dout << "datascan." << __func__ << ": "
37
20effc67
TL
38using namespace std;
39
7c673cae
FG
40void DataScan::usage()
41{
42 std::cout << "Usage: \n"
43 << " cephfs-data-scan init [--force-init]\n"
1e59de90
TL
44 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] [<data pool name> [<extra data pool name> ...]]\n"
45 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] [<data pool name>]\n"
7c673cae
FG
46 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
47 << " cephfs-data-scan scan_links\n"
48 << "\n"
49 << " --force-corrupt: overrite apparently corrupt structures\n"
50 << " --force-init: write root inodes even if they exist\n"
51 << " --force-pool: use data pool even if it is not in FSMap\n"
52 << " --worker_m: Maximum number of workers\n"
53 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
54 << "\n"
55 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
1e59de90 56 << " cephfs-data-scan cleanup [<data pool name>]\n"
7c673cae
FG
57 << std::endl;
58
59 generic_client_usage();
60}
61
62bool DataScan::parse_kwarg(
63 const std::vector<const char*> &args,
64 std::vector<const char *>::const_iterator &i,
65 int *r)
66{
67 if (i + 1 == args.end()) {
68 return false;
69 }
70
71 const std::string arg(*i);
72 const std::string val(*(i + 1));
73
74 if (arg == std::string("--output-dir")) {
75 if (driver != NULL) {
76 derr << "Unexpected --output-dir: output already selected!" << dendl;
77 *r = -EINVAL;
78 return false;
79 }
80 dout(4) << "Using local file output to '" << val << "'" << dendl;
81 driver = new LocalFileDriver(val, data_io);
82 return true;
83 } else if (arg == std::string("--worker_n")) {
84 std::string err;
85 n = strict_strtoll(val.c_str(), 10, &err);
86 if (!err.empty()) {
87 std::cerr << "Invalid worker number '" << val << "'" << std::endl;
88 *r = -EINVAL;
89 return false;
90 }
91 return true;
92 } else if (arg == std::string("--worker_m")) {
93 std::string err;
94 m = strict_strtoll(val.c_str(), 10, &err);
95 if (!err.empty()) {
96 std::cerr << "Invalid worker count '" << val << "'" << std::endl;
97 *r = -EINVAL;
98 return false;
99 }
100 return true;
101 } else if (arg == std::string("--filter-tag")) {
102 filter_tag = val;
103 dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
104 return true;
105 } else if (arg == std::string("--filesystem")) {
106 std::shared_ptr<const Filesystem> fs;
107 *r = fsmap->parse_filesystem(val, &fs);
108 if (*r != 0) {
109 std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
110 return false;
111 }
112 fscid = fs->fscid;
113 return true;
114 } else if (arg == std::string("--alternate-pool")) {
115 metadata_pool_name = val;
116 return true;
117 } else {
118 return false;
119 }
120}
121
122bool DataScan::parse_arg(
123 const std::vector<const char*> &args,
124 std::vector<const char *>::const_iterator &i)
125{
126 const std::string arg(*i);
127 if (arg == "--force-pool") {
128 force_pool = true;
129 return true;
130 } else if (arg == "--force-corrupt") {
131 force_corrupt = true;
132 return true;
133 } else if (arg == "--force-init") {
134 force_init = true;
135 return true;
136 } else {
137 return false;
138 }
139}
140
141int DataScan::main(const std::vector<const char*> &args)
142{
143 // Parse args
144 // ==========
145 if (args.size() < 1) {
11fdf7f2 146 cerr << "missing position argument" << std::endl;
7c673cae
FG
147 return -EINVAL;
148 }
149
150 // Common RADOS init: open metadata pool
151 // =====================================
152 librados::Rados rados;
153 int r = rados.init_with_context(g_ceph_context);
154 if (r < 0) {
155 derr << "RADOS unavailable" << dendl;
156 return r;
157 }
158
159 std::string const &command = args[0];
160 std::string data_pool_name;
1e59de90 161 std::set<std::string> extra_data_pool_names;
7c673cae
FG
162
163 std::string pg_files_path;
164 std::set<pg_t> pg_files_pgs;
165
166 // Consume any known --key val or --flag arguments
167 for (std::vector<const char *>::const_iterator i = args.begin() + 1;
168 i != args.end(); ++i) {
169 if (parse_kwarg(args, i, &r)) {
170 // Skip the kwarg value field
171 ++i;
172 continue;
173 } else if (r) {
174 return r;
175 }
176
177 if (parse_arg(args, i)) {
178 continue;
179 }
180
1e59de90
TL
181 // Trailing positional arguments
182 if (command == "scan_extents") {
183 if (data_pool_name.empty()) {
184 data_pool_name = *i;
185 } else if (*i != data_pool_name) {
186 extra_data_pool_names.insert(*i);
187 }
188 continue;
189 }
190
7c673cae
FG
191 // Trailing positional argument
192 if (i + 1 == args.end() &&
193 (command == "scan_inodes"
7c673cae
FG
194 || command == "cleanup")) {
195 data_pool_name = *i;
196 continue;
197 }
198
199 if (command == "pg_files") {
200 if (i == args.begin() + 1) {
201 pg_files_path = *i;
202 continue;
203 } else {
204 pg_t pg;
205 bool parsed = pg.parse(*i);
206 if (!parsed) {
207 std::cerr << "Invalid PG '" << *i << "'" << std::endl;
208 return -EINVAL;
209 } else {
210 pg_files_pgs.insert(pg);
211 continue;
212 }
213 }
214
215 }
216
217 // Fall through: unhandled
218 std::cerr << "Unknown argument '" << *i << "'" << std::endl;
219 return -EINVAL;
220 }
221
222 // If caller didn't specify a namespace, try to pick
223 // one if only one exists
224 if (fscid == FS_CLUSTER_ID_NONE) {
225 if (fsmap->filesystem_count() == 1) {
226 fscid = fsmap->get_filesystem()->fscid;
227 } else {
228 std::cerr << "Specify a filesystem with --filesystem" << std::endl;
229 return -EINVAL;
230 }
231 }
232 auto fs = fsmap->get_filesystem(fscid);
11fdf7f2 233 ceph_assert(fs != nullptr);
7c673cae
FG
234
235 // Default to output to metadata pool
236 if (driver == NULL) {
237 driver = new MetadataDriver();
238 driver->set_force_corrupt(force_corrupt);
239 driver->set_force_init(force_init);
240 dout(4) << "Using metadata pool output" << dendl;
241 }
242
243 dout(4) << "connecting to RADOS..." << dendl;
244 r = rados.connect();
245 if (r < 0) {
246 std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
247 << std::endl;
248 return r;
249 }
250
251 r = driver->init(rados, metadata_pool_name, fsmap, fscid);
252 if (r < 0) {
253 return r;
254 }
255
256 if (command == "pg_files") {
257 auto pge = PgFiles(objecter, pg_files_pgs);
258 pge.init();
259 return pge.scan_path(pg_files_path);
260 }
261
1e59de90
TL
262 bool autodetect_data_pools = false;
263
7c673cae
FG
264 // Initialize data_io for those commands that need it
265 if (command == "scan_inodes" ||
266 command == "scan_extents" ||
267 command == "cleanup") {
1e59de90 268 data_pool_id = fs->mds_map.get_first_data_pool();
7c673cae 269
1e59de90
TL
270 std::string pool_name;
271 r = rados.pool_reverse_lookup(data_pool_id, &pool_name);
272 if (r < 0) {
273 std::cerr << "Failed to resolve data pool: " << cpp_strerror(r)
274 << std::endl;
275 return r;
7c673cae
FG
276 }
277
1e59de90
TL
278 if (data_pool_name.empty()) {
279 autodetect_data_pools = true;
280 data_pool_name = pool_name;
281 } else if (data_pool_name != pool_name) {
282 std::cerr << "Warning: pool '" << data_pool_name << "' is not the "
283 "main CephFS data pool!" << std::endl;
7c673cae
FG
284 if (!force_pool) {
285 std::cerr << "Use --force-pool to continue" << std::endl;
286 return -EINVAL;
287 }
1e59de90
TL
288
289 data_pool_id = rados.pool_lookup(data_pool_name.c_str());
290 if (data_pool_id < 0) {
291 std::cerr << "Data pool '" << data_pool_name << "' not found!"
292 << std::endl;
293 return -ENOENT;
294 }
7c673cae
FG
295 }
296
1e59de90
TL
297 dout(4) << "data pool '" << data_pool_name << "' has ID " << data_pool_id
298 << dendl;
299
7c673cae
FG
300 dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
301 r = rados.ioctx_create(data_pool_name.c_str(), data_io);
302 if (r != 0) {
303 return r;
304 }
305 }
306
1e59de90
TL
307 // Initialize extra data_ios for those commands that need it
308 if (command == "scan_extents") {
309 if (autodetect_data_pools) {
310 ceph_assert(extra_data_pool_names.empty());
311
312 for (auto &pool_id : fs->mds_map.get_data_pools()) {
313 if (pool_id == data_pool_id) {
314 continue;
315 }
316
317 std::string pool_name;
318 r = rados.pool_reverse_lookup(pool_id, &pool_name);
319 if (r < 0) {
320 std::cerr << "Failed to resolve data pool: " << cpp_strerror(r)
321 << std::endl;
322 return r;
323 }
324 extra_data_pool_names.insert(pool_name);
325 }
326 }
327
328 for (auto &data_pool_name: extra_data_pool_names) {
329 int64_t pool_id = rados.pool_lookup(data_pool_name.c_str());
330 if (data_pool_id < 0) {
331 std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
332 return -ENOENT;
333 } else {
334 dout(4) << "data pool '" << data_pool_name << "' has ID " << pool_id
335 << dendl;
336 }
337
338 if (!fs->mds_map.is_data_pool(pool_id)) {
339 std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
340 "CephFS data pool!" << std::endl;
341 if (!force_pool) {
342 std::cerr << "Use --force-pool to continue" << std::endl;
343 return -EINVAL;
344 }
345 }
346
347 dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
348 extra_data_ios.push_back({});
349 r = rados.ioctx_create(data_pool_name.c_str(), extra_data_ios.back());
350 if (r != 0) {
351 return r;
352 }
353 }
354 }
355
7c673cae
FG
356 // Initialize metadata_io from MDSMap for scan_frags
357 if (command == "scan_frags" || command == "scan_links") {
358 const auto fs = fsmap->get_filesystem(fscid);
359 if (fs == nullptr) {
360 std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
361 return -ENOENT;
362 }
363 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
364
365 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
366 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
367 if (r < 0) {
368 std::cerr << "Pool " << metadata_pool_id
369 << " identified in MDS map not found in RADOS!" << std::endl;
370 return r;
371 }
372
373 r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
374 if (r != 0) {
375 return r;
376 }
11fdf7f2
TL
377
378 data_pools = fs->mds_map.get_data_pools();
7c673cae
FG
379 }
380
381 // Finally, dispatch command
382 if (command == "scan_inodes") {
383 return scan_inodes();
384 } else if (command == "scan_extents") {
385 return scan_extents();
386 } else if (command == "scan_frags") {
387 return scan_frags();
388 } else if (command == "scan_links") {
389 return scan_links();
390 } else if (command == "cleanup") {
391 return cleanup();
392 } else if (command == "init") {
393 return driver->init_roots(fs->mds_map.get_first_data_pool());
394 } else {
395 std::cerr << "Unknown command '" << command << "'" << std::endl;
396 return -EINVAL;
397 }
398}
399
400int MetadataDriver::inject_unlinked_inode(
401 inodeno_t inono, int mode, int64_t data_pool_id)
402{
403 const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
404
405 // Skip if exists
406 bool already_exists = false;
407 int r = root_exists(inono, &already_exists);
408 if (r) {
409 return r;
410 }
411 if (already_exists && !force_init) {
412 std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
413 " exists, skipping create. Use --force-init to overwrite"
414 " the existing object." << std::endl;
415 return 0;
416 }
417
418 // Compose
f67539c2
TL
419 InodeStore inode_data;
420 auto inode = inode_data.get_inode();
421 inode->ino = inono;
422 inode->version = 1;
423 inode->xattr_version = 1;
424 inode->mode = 0500 | mode;
7c673cae
FG
425 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
426 // (we won't actually give the *correct* dirstat here though)
f67539c2 427 inode->dirstat.nfiles = 1;
7c673cae 428
f67539c2
TL
429 inode->ctime = inode->mtime = ceph_clock_now();
430 inode->nlink = 1;
431 inode->truncate_size = -1ull;
432 inode->truncate_seq = 1;
433 inode->uid = g_conf()->mds_root_ino_uid;
434 inode->gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
435
436 // Force layout to default: should we let users override this so that
437 // they don't have to mount the filesystem to correct it?
f67539c2
TL
438 inode->layout = file_layout_t::get_default();
439 inode->layout.pool_id = data_pool_id;
440 inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae
FG
441
442 // Assume that we will get our stats wrong, and that we may
443 // be ignoring dirfrags that exist
f67539c2 444 inode_data.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
7c673cae 445
b3b6e05e 446 if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
11fdf7f2
TL
447 sr_t srnode;
448 srnode.seq = 1;
f67539c2 449 encode(srnode, inode_data.snap_blob);
11fdf7f2
TL
450 }
451
7c673cae
FG
452 // Serialize
453 bufferlist inode_bl;
11fdf7f2 454 encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
f67539c2 455 inode_data.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
7c673cae
FG
456
457 // Write
458 r = metadata_io.write_full(oid.name, inode_bl);
459 if (r != 0) {
460 derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
461 return r;
462 }
463
464 return r;
465}
466
467int MetadataDriver::root_exists(inodeno_t ino, bool *result)
468{
469 object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
470 uint64_t size;
471 time_t mtime;
472 int r = metadata_io.stat(oid.name, &size, &mtime);
473 if (r == -ENOENT) {
474 *result = false;
475 return 0;
476 } else if (r < 0) {
477 return r;
478 }
479
480 *result = true;
481 return 0;
482}
483
484int MetadataDriver::init_roots(int64_t data_pool_id)
485{
486 int r = 0;
b3b6e05e 487 r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id);
7c673cae
FG
488 if (r != 0) {
489 return r;
490 }
491 r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
492 if (r != 0) {
493 return r;
494 }
495 bool created = false;
496 r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
497 if (r != 0) {
498 return r;
499 }
500
501 return 0;
502}
503
504int MetadataDriver::check_roots(bool *result)
505{
506 int r;
b3b6e05e 507 r = root_exists(CEPH_INO_ROOT, result);
7c673cae
FG
508 if (r != 0) {
509 return r;
510 }
511 if (!*result) {
512 return 0;
513 }
514
515 r = root_exists(MDS_INO_MDSDIR(0), result);
516 if (r != 0) {
517 return r;
518 }
519 if (!*result) {
520 return 0;
521 }
522
523 return 0;
524}
525
/**
 * Stages:
 *
 * SERIAL init
 *  0. Create root inodes if they don't exist
 * PARALLEL scan_extents
 *  1. Size and mtime recovery: scan ALL objects, and update 0th
 *     objects with max size and max mtime seen.
 * PARALLEL scan_inodes
 *  2. Inode recovery: scan ONLY 0th objects, and inject metadata
 *     into dirfrag OMAPs, creating blank dirfrags as needed. No stats
 *     or rstats at this stage. Inodes without backtraces go into
 *     lost+found.
 * TODO: SERIAL "recover stats"
 *  3. Dirfrag statistics: depth-first traversal of the metadata tree,
 *     rebuilding dir sizes.
 * TODO: PARALLEL "clean up"
 *  4. Cleanup: go over all 0th objects (and dirfrags if we tagged
 *     anything onto them) and remove any of the xattrs that we
 *     used for accumulating.
 */
547
548
549int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
550{
551 if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
552 return -EINVAL;
553 }
554
555 std::string err;
556 std::string inode_str = oid.substr(0, oid.find("."));
557 *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
558 if (!err.empty()) {
559 return -EINVAL;
560 }
561
562 std::string pos_string = oid.substr(oid.find(".") + 1);
563 *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
564 if (!err.empty()) {
565 return -EINVAL;
566 }
567
568 return 0;
569}
570
571
572int DataScan::scan_extents()
573{
1e59de90
TL
574 std::vector<librados::IoCtx *> data_ios;
575 data_ios.push_back(&data_io);
576 for (auto &extra_data_io : extra_data_ios) {
577 data_ios.push_back(&extra_data_io);
578 }
579
580 for (auto ioctx : data_ios) {
581 int r = forall_objects(*ioctx, false, [this, ioctx](
7c673cae
FG
582 std::string const &oid,
583 uint64_t obj_name_ino,
584 uint64_t obj_name_offset) -> int
1e59de90
TL
585 {
586 // Read size
587 uint64_t size;
588 time_t mtime;
589 int r = ioctx->stat(oid, &size, &mtime);
590 dout(10) << "handling object " << obj_name_ino
591 << "." << obj_name_offset << dendl;
592 if (r != 0) {
593 dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
594 return r;
595 }
596 int64_t obj_pool_id = data_io.get_id() != ioctx->get_id() ?
597 ioctx->get_id() : -1;
598
599 // I need to keep track of
600 // * The highest object ID seen
601 // * The size of the highest object ID seen
602 // * The largest object seen
603 // * The pool of the objects seen (if it is not the main data pool)
604 //
605 // Given those things, I can later infer the object chunking
606 // size, the offset of the last object (chunk size * highest ID seen),
607 // the actual size (offset of last object + size of highest ID seen),
608 // and the layout pool id.
609 //
610 // This logic doesn't take account of striping.
611 r = ClsCephFSClient::accumulate_inode_metadata(
612 data_io,
613 obj_name_ino,
614 obj_name_offset,
615 size,
616 obj_pool_id,
617 mtime);
618 if (r < 0) {
619 derr << "Failed to accumulate metadata data from '"
620 << oid << "': " << cpp_strerror(r) << dendl;
621 return r;
622 }
7c673cae 623
1e59de90
TL
624 return r;
625 });
7c673cae 626 if (r < 0) {
7c673cae
FG
627 return r;
628 }
1e59de90 629 }
7c673cae 630
1e59de90 631 return 0;
7c673cae
FG
632}
633
634int DataScan::probe_filter(librados::IoCtx &ioctx)
635{
636 bufferlist filter_bl;
637 ClsCephFSClient::build_tag_filter("test", &filter_bl);
638 librados::ObjectCursor range_i;
639 librados::ObjectCursor range_end;
640
641 std::vector<librados::ObjectItem> tmp_result;
642 librados::ObjectCursor tmp_next;
643 int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
644 1, filter_bl, &tmp_result, &tmp_next);
645
646 return r >= 0;
647}
648
649int DataScan::forall_objects(
650 librados::IoCtx &ioctx,
651 bool untagged_only,
652 std::function<int(std::string, uint64_t, uint64_t)> handler
653 )
654{
655 librados::ObjectCursor range_i;
656 librados::ObjectCursor range_end;
657 ioctx.object_list_slice(
658 ioctx.object_list_begin(),
659 ioctx.object_list_end(),
660 n,
661 m,
662 &range_i,
663 &range_end);
664
665
666 bufferlist filter_bl;
667
668 bool legacy_filtering = false;
669 if (untagged_only) {
670 // probe to deal with older OSDs that don't support
671 // the cephfs pgls filtering mode
672 legacy_filtering = !probe_filter(ioctx);
673 if (!legacy_filtering) {
674 ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
675 }
676 }
677
678 int r = 0;
679 while(range_i < range_end) {
680 std::vector<librados::ObjectItem> result;
681 int r = ioctx.object_list(range_i, range_end, 1,
682 filter_bl, &result, &range_i);
683 if (r < 0) {
684 derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
685 return r;
686 }
687
688 for (const auto &i : result) {
689 const std::string &oid = i.oid;
690 uint64_t obj_name_ino = 0;
691 uint64_t obj_name_offset = 0;
692 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
693 if (r != 0) {
694 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
695 continue;
696 }
697
698 if (untagged_only && legacy_filtering) {
699 dout(20) << "Applying filter to " << oid << dendl;
700
701 // We are only interested in 0th objects during this phase: we touched
702 // the other objects during scan_extents
703 if (obj_name_offset != 0) {
704 dout(20) << "Non-zeroth object" << dendl;
705 continue;
706 }
707
708 bufferlist scrub_tag_bl;
709 int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
710 if (r >= 0) {
711 std::string read_tag;
11fdf7f2 712 auto q = scrub_tag_bl.cbegin();
7c673cae 713 try {
11fdf7f2 714 decode(read_tag, q);
7c673cae
FG
715 if (read_tag == filter_tag) {
716 dout(20) << "skipping " << oid << " because it has the filter_tag"
717 << dendl;
718 continue;
719 }
720 } catch (const buffer::error &err) {
721 }
722 dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
723 } else {
724 dout(20) << "no tag read (" << r << ")" << dendl;
725 }
726
727 } else if (untagged_only) {
11fdf7f2 728 ceph_assert(obj_name_offset == 0);
7c673cae
FG
729 dout(20) << "OSD matched oid " << oid << dendl;
730 }
731
732 int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
733 if (r == 0 && this_oid_r < 0) {
734 r = this_oid_r;
735 }
736 }
737 }
738
739 return r;
740}
741
742int DataScan::scan_inodes()
743{
744 bool roots_present;
745 int r = driver->check_roots(&roots_present);
746 if (r != 0) {
747 derr << "Unexpected error checking roots: '"
748 << cpp_strerror(r) << "'" << dendl;
749 return r;
750 }
751
752 if (!roots_present) {
753 std::cerr << "Some or all system inodes are absent. Run 'init' from "
754 "one node before running 'scan_inodes'" << std::endl;
755 return -EIO;
756 }
757
758 return forall_objects(data_io, true, [this](
759 std::string const &oid,
760 uint64_t obj_name_ino,
761 uint64_t obj_name_offset) -> int
762 {
763 int r = 0;
764
765 dout(10) << "handling object "
766 << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
767 << dendl;
768
769 AccumulateResult accum_res;
770 inode_backtrace_t backtrace;
771 file_layout_t loaded_layout = file_layout_t::get_default();
20effc67 772 std::string symlink;
7c673cae 773 r = ClsCephFSClient::fetch_inode_accumulate_result(
20effc67 774 data_io, oid, &backtrace, &loaded_layout, &symlink, &accum_res);
7c673cae
FG
775
776 if (r == -EINVAL) {
777 dout(4) << "Accumulated metadata missing from '"
778 << oid << ", did you run scan_extents?" << dendl;
779 return r;
780 } else if (r < 0) {
781 dout(4) << "Unexpected error loading accumulated metadata from '"
782 << oid << "': " << cpp_strerror(r) << dendl;
783 // FIXME: this creates situation where if a client has a corrupt
784 // backtrace/layout, we will fail to inject it. We should (optionally)
785 // proceed if the backtrace/layout is corrupt but we have valid
786 // accumulated metadata.
787 return r;
788 }
789
790 const time_t file_mtime = accum_res.max_mtime;
791 uint64_t file_size = 0;
792 bool have_backtrace = !(backtrace.ancestors.empty());
793
794 // This is the layout we will use for injection, populated either
795 // from loaded_layout or from best guesses
796 file_layout_t guessed_layout;
1e59de90
TL
797 if (accum_res.obj_pool_id == -1) {
798 guessed_layout.pool_id = data_pool_id;
799 } else {
800 guessed_layout.pool_id = accum_res.obj_pool_id;
801
802 librados::IoCtx ioctx;
803 r = librados::Rados(data_io).ioctx_create2(guessed_layout.pool_id, ioctx);
804 if (r != 0) {
805 derr << "Unexpected error opening file data pool id="
806 << guessed_layout.pool_id << ": " << cpp_strerror(r) << dendl;
807 return r;
808 }
809
810 bufferlist bl;
811 int r = ioctx.getxattr(oid, "layout", bl);
812 if (r < 0) {
813 if (r != -ENODATA) {
814 derr << "Unexpected error reading layout for " << oid << ": "
815 << cpp_strerror(r) << dendl;
816 return r;
817 }
818 } else {
819 try {
820 auto q = bl.cbegin();
821 decode(loaded_layout, q);
822 } catch (ceph::buffer::error &e) {
823 derr << "Unexpected error decoding layout for " << oid << dendl;
824 return -EINVAL;
825 }
826 }
827 }
7c673cae
FG
828
829 // Calculate file_size, guess the layout
830 if (accum_res.ceiling_obj_index > 0) {
831 uint32_t chunk_size = file_layout_t::get_default().object_size;
832 // When there are multiple objects, the largest object probably
833 // indicates the chunk size. But not necessarily, because files
834 // can be sparse. Only make this assumption if size seen
835 // is a power of two, as chunk sizes typically are.
836 if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
837 chunk_size = accum_res.max_obj_size;
838 }
839
840 if (loaded_layout.pool_id == -1) {
841 // If no stashed layout was found, guess it
842 guessed_layout.object_size = chunk_size;
843 guessed_layout.stripe_unit = chunk_size;
844 guessed_layout.stripe_count = 1;
845 } else if (!loaded_layout.is_valid() ||
846 loaded_layout.object_size < accum_res.max_obj_size) {
847 // If the max size seen exceeds what the stashed layout claims, then
848 // disbelieve it. Guess instead. Same for invalid layouts on disk.
849 dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
850 << std::dec << ", ignoring in favour of best guess" << dendl;
851 guessed_layout.object_size = chunk_size;
852 guessed_layout.stripe_unit = chunk_size;
853 guessed_layout.stripe_count = 1;
854 } else {
855 // We have a stashed layout that we can't disprove, so apply it
856 guessed_layout = loaded_layout;
857 dout(20) << "loaded layout from xattr:"
1e59de90 858 << " pi: " << guessed_layout.pool_id
7c673cae
FG
859 << " os: " << guessed_layout.object_size
860 << " sc: " << guessed_layout.stripe_count
861 << " su: " << guessed_layout.stripe_unit
862 << dendl;
863 // User might have transplanted files from a pool with a different
1e59de90
TL
864 // ID, so if the pool from loaded_layout is not found in the list of
865 // the data pools, we'll force the injected layout to point to the
866 // pool we read from.
867 if (!fsmap->get_filesystem(fscid)->mds_map.is_data_pool(
868 guessed_layout.pool_id)) {
869 dout(20) << "overwriting layout pool_id " << data_pool_id << dendl;
870 guessed_layout.pool_id = data_pool_id;
871 }
7c673cae
FG
872 }
873
874 if (guessed_layout.stripe_count == 1) {
875 // Unstriped file: simple chunking
876 file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
877 + accum_res.ceiling_obj_size;
878 } else {
879 // Striped file: need to examine the last stripe_count objects
880 // in the file to determine the size.
881
1e59de90
TL
882 librados::IoCtx ioctx;
883 if (guessed_layout.pool_id == data_io.get_id()) {
884 ioctx.dup(data_io);
885 } else {
886 r = librados::Rados(data_io).ioctx_create2(guessed_layout.pool_id,
887 ioctx);
888 if (r != 0) {
889 derr << "Unexpected error opening file data pool id="
890 << guessed_layout.pool_id << ": " << cpp_strerror(r) << dendl;
891 return r;
892 }
893 }
894
7c673cae
FG
895 // How many complete (i.e. not last stripe) objects?
896 uint64_t complete_objs = 0;
897 if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
898 complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
899 } else {
900 complete_objs = 0;
901 }
902
903 // How many potentially-short objects (i.e. last stripe set) objects?
904 uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
905
906 dout(10) << "calculating striped size from complete objs: "
907 << complete_objs << ", partial objs: " << partial_objs
908 << dendl;
909
910 // Maximum amount of data that may be in the incomplete objects
911 uint64_t incomplete_size = 0;
912
913 // For each short object, calculate the max file size within it
914 // and accumulate the maximum
915 for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
916 char buf[60];
917 snprintf(buf, sizeof(buf), "%llx.%08llx",
918 (long long unsigned)obj_name_ino, (long long unsigned)i);
919
920 uint64_t osize(0);
921 time_t omtime(0);
1e59de90 922 r = ioctx.stat(std::string(buf), &osize, &omtime);
7c673cae
FG
923 if (r == 0) {
924 if (osize > 0) {
925 // Upper bound within this object
926 uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
927 * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
928 + (i % guessed_layout.stripe_count)
929 * guessed_layout.stripe_unit + (osize - 1)
930 % guessed_layout.stripe_unit + 1;
11fdf7f2 931 incomplete_size = std::max(incomplete_size, upper_size);
7c673cae
FG
932 }
933 } else if (r == -ENOENT) {
934 // Absent object, treat as size 0 and ignore.
935 } else {
936 // Unexpected error, carry r to outer scope for handling.
937 break;
938 }
939 }
940 if (r != 0 && r != -ENOENT) {
941 derr << "Unexpected error checking size of ino 0x" << std::hex
942 << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
943 return r;
944 }
945 file_size = complete_objs * guessed_layout.object_size
946 + incomplete_size;
947 }
948 } else {
949 file_size = accum_res.ceiling_obj_size;
950 if (loaded_layout.pool_id < 0
951 || loaded_layout.object_size < accum_res.max_obj_size) {
952 // No layout loaded, or inconsistent layout, use default
953 guessed_layout = file_layout_t::get_default();
1e59de90
TL
954 guessed_layout.pool_id = accum_res.obj_pool_id != -1 ?
955 accum_res.obj_pool_id : data_pool_id;
7c673cae
FG
956 } else {
957 guessed_layout = loaded_layout;
958 }
959 }
960
961 // Santity checking backtrace ino against object name
962 if (have_backtrace && backtrace.ino != obj_name_ino) {
963 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
964 << " doesn't match object name ino 0x" << obj_name_ino
965 << std::dec << dendl;
966 have_backtrace = false;
967 }
968
969 InodeStore dentry;
20effc67 970 build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry, symlink);
7c673cae
FG
971
972 // Inject inode to the metadata pool
973 if (have_backtrace) {
974 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
975 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
976 /* Special case for strays: even if we have a good backtrace,
977 * don't put it in the stray dir, because while that would technically
978 * give it linkage it would still be invisible to the user */
979 r = driver->inject_lost_and_found(obj_name_ino, dentry);
980 if (r < 0) {
981 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
982 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
983 if (r == -EINVAL) {
984 dout(4) << "Use --force-corrupt to overwrite structures that "
985 "appear to be corrupt" << dendl;
986 }
987 }
988 } else {
989 /* Happy case: we will inject a named dentry for this inode */
990 r = driver->inject_with_backtrace(backtrace, dentry);
991 if (r < 0) {
992 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
993 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
994 if (r == -EINVAL) {
995 dout(4) << "Use --force-corrupt to overwrite structures that "
996 "appear to be corrupt" << dendl;
997 }
998 }
999 }
1000 } else {
1001 /* Backtrace-less case: we will inject a lost+found dentry */
1002 r = driver->inject_lost_and_found(
1003 obj_name_ino, dentry);
1004 if (r < 0) {
1005 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
1006 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1007 if (r == -EINVAL) {
1008 dout(4) << "Use --force-corrupt to overwrite structures that "
1009 "appear to be corrupt" << dendl;
1010 }
1011 }
1012 }
1013
1014 return r;
1015 });
1016}
1017
1018int DataScan::cleanup()
1019{
1020 // We are looking for only zeroth object
1021 //
1022 return forall_objects(data_io, true, [this](
1023 std::string const &oid,
1024 uint64_t obj_name_ino,
1025 uint64_t obj_name_offset) -> int
1026 {
1027 int r = 0;
1028 r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
1029 if (r < 0) {
1030 dout(4) << "Error deleting accumulated metadata from '"
1031 << oid << "': " << cpp_strerror(r) << dendl;
1032 }
1033 return r;
1034 });
1035}
1036
1037bool DataScan::valid_ino(inodeno_t ino) const
1038{
1039 return (ino >= inodeno_t((1ull << 40)))
1040 || (MDS_INO_IS_STRAY(ino))
1041 || (MDS_INO_IS_MDSDIR(ino))
b3b6e05e 1042 || ino == CEPH_INO_ROOT
1e59de90
TL
1043 || ino == CEPH_INO_CEPH
1044 || ino == CEPH_INO_LOST_AND_FOUND;
7c673cae
FG
1045}
1046
1047int DataScan::scan_links()
1048{
1049 MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
1050 if (!metadata_driver) {
1051 derr << "Unexpected --output-dir option for scan_links" << dendl;
1052 return -EINVAL;
1053 }
1054
91327a77 1055 interval_set<uint64_t> used_inos;
7c673cae 1056 map<inodeno_t, int> remote_links;
11fdf7f2
TL
1057 map<snapid_t, SnapInfo> snaps;
1058 snapid_t last_snap = 1;
1059 snapid_t snaprealm_v2_since = 2;
7c673cae
FG
1060
1061 struct link_info_t {
1062 inodeno_t dirino;
1063 frag_t frag;
1064 string name;
1065 version_t version;
1066 int nlink;
1067 bool is_dir;
11fdf7f2 1068 map<snapid_t, SnapInfo> snaps;
7c673cae 1069 link_info_t() : version(0), nlink(0), is_dir(false) {}
f67539c2 1070 link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::inode_const_ptr& i) :
7c673cae 1071 dirino(di), frag(df), name(n),
f67539c2 1072 version(i->version), nlink(i->nlink), is_dir(S_IFDIR & i->mode) {}
7c673cae
FG
1073 dirfrag_t dirfrag() const {
1074 return dirfrag_t(dirino, frag);
1075 }
1076 };
1077 map<inodeno_t, list<link_info_t> > dup_primaries;
1078 map<inodeno_t, link_info_t> bad_nlink_inos;
92f5a8d4 1079 map<inodeno_t, link_info_t> injected_inos;
7c673cae
FG
1080
1081 map<dirfrag_t, set<string> > to_remove;
1082
1083 enum {
1084 SCAN_INOS = 1,
1085 CHECK_LINK,
1086 };
1087
1088 for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
1089 const librados::NObjectIterator it_end = metadata_io.nobjects_end();
1090 for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
1091 const std::string oid = it->get_oid();
1092
39ae355f
TL
1093 dout(10) << "step " << step << ": handling object " << oid << dendl;
1094
7c673cae
FG
1095 uint64_t dir_ino = 0;
1096 uint64_t frag_id = 0;
1097 int r = parse_oid(oid, &dir_ino, &frag_id);
1098 if (r == -EINVAL) {
1099 dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
1100 continue;
1101 } else {
1102 // parse_oid can only do 0 or -EINVAL
11fdf7f2 1103 ceph_assert(r == 0);
7c673cae
FG
1104 }
1105
1106 if (!valid_ino(dir_ino)) {
1107 dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
1108 continue;
1109 }
1110
1111 std::map<std::string, bufferlist> items;
1112 r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
1113 if (r < 0) {
1114 derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
1115 return r;
1116 }
1117
1118 for (auto& p : items) {
11fdf7f2 1119 auto q = p.second.cbegin();
7c673cae
FG
1120 string dname;
1121 snapid_t last;
1122 dentry_key_t::decode_helper(p.first, dname, last);
1123
92f5a8d4
TL
1124 if (last != CEPH_NOSNAP) {
1125 if (last > last_snap)
1126 last_snap = last;
7c673cae 1127 continue;
92f5a8d4 1128 }
7c673cae
FG
1129
1130 try {
1131 snapid_t dnfirst;
11fdf7f2 1132 decode(dnfirst, q);
1e59de90
TL
1133 if (dnfirst == CEPH_NOSNAP) {
1134 dout(20) << "injected ino detected" << dendl;
1135 } else if (dnfirst <= CEPH_MAXSNAP) {
11fdf7f2
TL
1136 if (dnfirst - 1 > last_snap)
1137 last_snap = dnfirst - 1;
1138 }
7c673cae 1139 char dentry_type;
11fdf7f2 1140 decode(dentry_type, q);
f67539c2
TL
1141 mempool::mds_co::string alternate_name;
1142 if (dentry_type == 'I' || dentry_type == 'i') {
7c673cae 1143 InodeStore inode;
f67539c2
TL
1144 if (dentry_type == 'i') {
1145 DECODE_START(2, q);
1146 if (struct_v >= 2)
1147 decode(alternate_name, q);
1148 inode.decode(q);
1149 DECODE_FINISH(q);
1150 } else {
1151 inode.decode_bare(q);
1152 }
1153
1154 inodeno_t ino = inode.inode->ino;
7c673cae
FG
1155
1156 if (step == SCAN_INOS) {
1157 if (used_inos.contains(ino, 1)) {
1e59de90
TL
1158 dup_primaries.emplace(std::piecewise_construct,
1159 std::forward_as_tuple(ino),
1160 std::forward_as_tuple());
7c673cae
FG
1161 } else {
1162 used_inos.insert(ino);
1163 }
1164 } else if (step == CHECK_LINK) {
11fdf7f2
TL
1165 sr_t srnode;
1166 if (inode.snap_blob.length()) {
1167 auto p = inode.snap_blob.cbegin();
1168 decode(srnode, p);
1169 for (auto it = srnode.snaps.begin();
1170 it != srnode.snaps.end(); ) {
1171 if (it->second.ino != ino ||
1172 it->second.snapid != it->first) {
1173 srnode.snaps.erase(it++);
1174 } else {
1175 ++it;
1176 }
1177 }
1178 if (!srnode.past_parents.empty()) {
1179 snapid_t last = srnode.past_parents.rbegin()->first;
1180 if (last + 1 > snaprealm_v2_since)
1181 snaprealm_v2_since = last + 1;
1182 }
1183 }
f67539c2
TL
1184 if (inode.old_inodes && !inode.old_inodes->empty()) {
1185 auto _last_snap = inode.old_inodes->rbegin()->first;
1186 if (_last_snap > last_snap)
1187 last_snap = _last_snap;
11fdf7f2 1188 }
7c673cae
FG
1189 auto q = dup_primaries.find(ino);
1190 if (q != dup_primaries.end()) {
1191 q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
11fdf7f2 1192 q->second.back().snaps.swap(srnode.snaps);
7c673cae
FG
1193 } else {
1194 int nlink = 0;
1195 auto r = remote_links.find(ino);
1196 if (r != remote_links.end())
1197 nlink = r->second;
1198 if (!MDS_INO_IS_STRAY(dir_ino))
1199 nlink++;
f67539c2 1200 if (inode.inode->nlink != nlink) {
7c673cae 1201 derr << "Bad nlink on " << ino << " expected " << nlink
f67539c2 1202 << " has " << inode.inode->nlink << dendl;
7c673cae
FG
1203 bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1204 bad_nlink_inos[ino].nlink = nlink;
1205 }
11fdf7f2
TL
1206 snaps.insert(make_move_iterator(begin(srnode.snaps)),
1207 make_move_iterator(end(srnode.snaps)));
7c673cae 1208 }
1e59de90
TL
1209 if (dnfirst == CEPH_NOSNAP) {
1210 injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1211 dout(20) << "adding " << ino << " for future processing to fix dnfirst" << dendl;
1212 }
7c673cae 1213 }
f67539c2 1214 } else if (dentry_type == 'L' || dentry_type == 'l') {
7c673cae
FG
1215 inodeno_t ino;
1216 unsigned char d_type;
f67539c2 1217 CDentry::decode_remote(dentry_type, ino, d_type, alternate_name, q);
7c673cae
FG
1218
1219 if (step == SCAN_INOS) {
1220 remote_links[ino]++;
1221 } else if (step == CHECK_LINK) {
1222 if (!used_inos.contains(ino, 1)) {
1223 derr << "Bad remote link dentry 0x" << std::hex << dir_ino
1224 << std::dec << "/" << dname
1225 << ", ino " << ino << " not found" << dendl;
1226 std::string key;
1227 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1228 dn_key.encode(key);
1229 to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
1230 }
1231 }
1232 } else {
1233 derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
1234 << std::dec << "/" << dname << dendl;
1235 return -EINVAL;
1236 }
1237 } catch (const buffer::error &err) {
1238 derr << "Error decoding dentry 0x" << std::hex << dir_ino
1239 << std::dec << "/" << dname << dendl;
1240 return -EINVAL;
1241 }
1242 }
1243 }
1244 }
91327a77
AA
1245
1246 map<unsigned, uint64_t> max_ino_map;
1247 {
1248 auto prev_max_ino = (uint64_t)1 << 40;
1249 for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
1250 auto cur_max = p.get_start() + p.get_len() - 1;
1251 if (cur_max < prev_max_ino)
1252 continue; // system inodes
1253
1254 if ((prev_max_ino >> 40) != (cur_max >> 40)) {
1255 unsigned rank = (prev_max_ino >> 40) - 1;
1256 max_ino_map[rank] = prev_max_ino;
1257 } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
1258 unsigned rank = (p.get_start() >> 40) - 1;
1259 max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
1260 }
1261 prev_max_ino = cur_max;
1262 }
1263 unsigned rank = (prev_max_ino >> 40) - 1;
1264 max_ino_map[rank] = prev_max_ino;
1265 }
1266
7c673cae
FG
1267 used_inos.clear();
1268
39ae355f
TL
1269 dout(10) << "processing " << dup_primaries.size() << " dup_primaries, "
1270 << remote_links.size() << " remote_links" << dendl;
1271
7c673cae 1272 for (auto& p : dup_primaries) {
39ae355f
TL
1273
1274 dout(10) << "handling dup " << p.first << dendl;
1275
7c673cae
FG
1276 link_info_t newest;
1277 for (auto& q : p.second) {
1278 if (q.version > newest.version) {
1279 newest = q;
1280 } else if (q.version == newest.version &&
1281 !MDS_INO_IS_STRAY(q.dirino) &&
1282 MDS_INO_IS_STRAY(newest.dirino)) {
1283 newest = q;
1284 }
1285 }
1286
1287 for (auto& q : p.second) {
1288 // in the middle of dir fragmentation?
11fdf7f2
TL
1289 if (newest.dirino == q.dirino && newest.name == q.name) {
1290 snaps.insert(make_move_iterator(begin(q.snaps)),
1291 make_move_iterator(end(q.snaps)));
7c673cae 1292 continue;
11fdf7f2 1293 }
7c673cae
FG
1294
1295 std::string key;
1296 dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
1297 dn_key.encode(key);
1298 to_remove[q.dirfrag()].insert(key);
1299 derr << "Remove duplicated ino 0x" << p.first << " from "
1300 << q.dirfrag() << "/" << q.name << dendl;
1301 }
1302
1303 int nlink = 0;
1304 auto q = remote_links.find(p.first);
1305 if (q != remote_links.end())
1306 nlink = q->second;
1307 if (!MDS_INO_IS_STRAY(newest.dirino))
1308 nlink++;
1309
1310 if (nlink != newest.nlink) {
1311 derr << "Bad nlink on " << p.first << " expected " << nlink
1312 << " has " << newest.nlink << dendl;
1313 bad_nlink_inos[p.first] = newest;
1314 bad_nlink_inos[p.first].nlink = nlink;
1315 }
1316 }
1317 dup_primaries.clear();
1318 remote_links.clear();
1319
92f5a8d4
TL
1320 {
1321 objecter->with_osdmap([&](const OSDMap& o) {
1322 for (auto p : data_pools) {
1323 const pg_pool_t *pi = o.get_pg_pool(p);
1324 if (!pi)
1325 continue;
1326 if (pi->snap_seq > last_snap)
1327 last_snap = pi->snap_seq;
1328 }
1329 });
1330
1331 if (!snaps.empty()) {
1332 if (snaps.rbegin()->first > last_snap)
1333 last_snap = snaps.rbegin()->first;
1334 }
1335 }
1336
39ae355f
TL
1337 dout(10) << "removing dup dentries from " << to_remove.size() << " objects"
1338 << dendl;
1339
7c673cae
FG
1340 for (auto& p : to_remove) {
1341 object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
1342
39ae355f
TL
1343 dout(10) << "removing dup dentries from " << p.first << dendl;
1344
7c673cae
FG
1345 int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
1346 if (r != 0) {
1347 derr << "Error removing duplicated dentries from " << p.first << dendl;
1348 return r;
1349 }
1350 }
1351 to_remove.clear();
1352
39ae355f
TL
1353 dout(10) << "processing " << bad_nlink_inos.size() << " bad_nlink_inos"
1354 << dendl;
1355
7c673cae 1356 for (auto &p : bad_nlink_inos) {
39ae355f
TL
1357 dout(10) << "handling bad_nlink_ino " << p.first << dendl;
1358
7c673cae 1359 InodeStore inode;
92f5a8d4
TL
1360 snapid_t first;
1361 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
7c673cae
FG
1362 if (r < 0) {
1363 derr << "Unexpected error reading dentry "
1364 << p.second.dirfrag() << "/" << p.second.name
1365 << ": " << cpp_strerror(r) << dendl;
1366 return r;
1367 }
1368
f67539c2 1369 if (inode.inode->ino != p.first || inode.inode->version != p.second.version)
7c673cae
FG
1370 continue;
1371
f67539c2 1372 inode.get_inode()->nlink = p.second.nlink;
92f5a8d4
TL
1373 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
1374 if (r < 0)
1375 return r;
1376 }
1377
39ae355f
TL
1378 dout(10) << "processing " << injected_inos.size() << " injected_inos"
1379 << dendl;
1380
92f5a8d4 1381 for (auto &p : injected_inos) {
39ae355f
TL
1382 dout(10) << "handling injected_ino " << p.first << dendl;
1383
92f5a8d4
TL
1384 InodeStore inode;
1385 snapid_t first;
1e59de90 1386 dout(20) << " fixing linkage (dnfirst) of " << p.second.dirino << ":" << p.second.name << dendl;
92f5a8d4
TL
1387 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
1388 if (r < 0) {
1389 derr << "Unexpected error reading dentry "
1390 << p.second.dirfrag() << "/" << p.second.name
1391 << ": " << cpp_strerror(r) << dendl;
1392 return r;
1393 }
1394
1e59de90
TL
1395 if (first != CEPH_NOSNAP) {
1396 dout(20) << " ????" << dendl;
92f5a8d4 1397 continue;
1e59de90 1398 }
92f5a8d4
TL
1399
1400 first = last_snap + 1;
1e59de90 1401 dout(20) << " first is now " << first << dendl;
92f5a8d4 1402 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
7c673cae
FG
1403 if (r < 0)
1404 return r;
1405 }
1406
39ae355f
TL
1407 dout(10) << "updating inotable" << dendl;
1408
91327a77 1409 for (auto& p : max_ino_map) {
11fdf7f2
TL
1410 InoTable inotable(nullptr);
1411 inotable.set_rank(p.first);
1412 bool dirty = false;
1413 int r = metadata_driver->load_table(&inotable);
1414 if (r < 0) {
1415 inotable.reset_state();
1416 dirty = true;
1417 }
1418 if (inotable.force_consume_to(p.second))
1419 dirty = true;
1420 if (dirty) {
1421 r = metadata_driver->save_table(&inotable);
1422 if (r < 0)
1423 return r;
1424 }
91327a77
AA
1425 }
1426
39ae355f
TL
1427 dout(10) << "updating snaptable" << dendl;
1428
11fdf7f2 1429 {
11fdf7f2
TL
1430 SnapServer snaptable;
1431 snaptable.set_rank(0);
1432 bool dirty = false;
1433 int r = metadata_driver->load_table(&snaptable);
1434 if (r < 0) {
1435 snaptable.reset_state();
1436 dirty = true;
1437 }
1438 if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
1439 dirty = true;
1440 if (dirty) {
1441 r = metadata_driver->save_table(&snaptable);
1442 if (r < 0)
1443 return r;
1444 }
1445 }
7c673cae
FG
1446 return 0;
1447}
1448
1449int DataScan::scan_frags()
1450{
1451 bool roots_present;
1452 int r = driver->check_roots(&roots_present);
1453 if (r != 0) {
1454 derr << "Unexpected error checking roots: '"
1455 << cpp_strerror(r) << "'" << dendl;
1456 return r;
1457 }
1458
1459 if (!roots_present) {
1460 std::cerr << "Some or all system inodes are absent. Run 'init' from "
1461 "one node before running 'scan_inodes'" << std::endl;
1462 return -EIO;
1463 }
1464
1465 return forall_objects(metadata_io, true, [this](
1466 std::string const &oid,
1467 uint64_t obj_name_ino,
1468 uint64_t obj_name_offset) -> int
1469 {
1470 int r = 0;
1471 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
1472 if (r != 0) {
1473 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
1474 return r;
1475 }
1476
1477 if (obj_name_ino < (1ULL << 40)) {
1478 // FIXME: we're skipping stray dirs here: if they're
1479 // orphaned then we should be resetting them some other
1480 // way
1481 dout(10) << "Skipping system ino " << obj_name_ino << dendl;
1482 return 0;
1483 }
1484
1485 AccumulateResult accum_res;
1486 inode_backtrace_t backtrace;
1487
1488 // Default to inherit layout (i.e. no explicit layout on dir) which is
1489 // expressed as a zeroed layout struct (see inode_t::has_layout)
1490 file_layout_t loaded_layout;
1491
1492 int parent_r = 0;
1493 bufferlist parent_bl;
1494 int layout_r = 0;
1495 bufferlist layout_bl;
1496 bufferlist op_bl;
1497
1498 librados::ObjectReadOperation op;
1499 op.getxattr("parent", &parent_bl, &parent_r);
1500 op.getxattr("layout", &layout_bl, &layout_r);
1501 r = metadata_io.operate(oid, &op, &op_bl);
1502 if (r != 0 && r != -ENODATA) {
1503 derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
1504 return r;
1505 }
1506
1507 if (parent_r != -ENODATA) {
1508 try {
11fdf7f2 1509 auto q = parent_bl.cbegin();
7c673cae
FG
1510 backtrace.decode(q);
1511 } catch (buffer::error &e) {
f67539c2 1512 dout(4) << "Corrupt backtrace on '" << oid << "': " << e.what() << dendl;
7c673cae
FG
1513 if (!force_corrupt) {
1514 return -EINVAL;
1515 } else {
1516 // Treat backtrace as absent: we'll inject into lost+found
1517 backtrace = inode_backtrace_t();
1518 }
1519 }
1520 }
1521
1522 if (layout_r != -ENODATA) {
1523 try {
11fdf7f2
TL
1524 auto q = layout_bl.cbegin();
1525 decode(loaded_layout, q);
7c673cae 1526 } catch (buffer::error &e) {
f67539c2 1527 dout(4) << "Corrupt layout on '" << oid << "': " << e.what() << dendl;
7c673cae
FG
1528 if (!force_corrupt) {
1529 return -EINVAL;
1530 }
1531 }
1532 }
1533
1534 bool have_backtrace = !(backtrace.ancestors.empty());
1535
1536 // Santity checking backtrace ino against object name
1537 if (have_backtrace && backtrace.ino != obj_name_ino) {
1538 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
1539 << " doesn't match object name ino 0x" << obj_name_ino
1540 << std::dec << dendl;
1541 have_backtrace = false;
1542 }
1543
1544 uint64_t fnode_version = 0;
1545 fnode_t fnode;
1546 r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
1547 if (r == -EINVAL) {
1548 derr << "Corrupt fnode on " << oid << dendl;
1549 if (force_corrupt) {
1550 fnode.fragstat.mtime = 0;
1551 fnode.fragstat.nfiles = 1;
1552 fnode.fragstat.nsubdirs = 0;
1553 fnode.accounted_fragstat = fnode.fragstat;
1554 } else {
1555 return r;
1556 }
1557 }
1558
1559 InodeStore dentry;
1560 build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
1561 loaded_layout, &dentry);
1562
1563 // Inject inode to the metadata pool
1564 if (have_backtrace) {
1565 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
1566 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
1567 /* Special case for strays: even if we have a good backtrace,
1568 * don't put it in the stray dir, because while that would technically
1569 * give it linkage it would still be invisible to the user */
1570 r = driver->inject_lost_and_found(obj_name_ino, dentry);
1571 if (r < 0) {
1572 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1573 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1574 if (r == -EINVAL) {
1575 dout(4) << "Use --force-corrupt to overwrite structures that "
1576 "appear to be corrupt" << dendl;
1577 }
1578 }
1579 } else {
1580 /* Happy case: we will inject a named dentry for this inode */
1581 r = driver->inject_with_backtrace(backtrace, dentry);
1582 if (r < 0) {
1583 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1584 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
1585 if (r == -EINVAL) {
1586 dout(4) << "Use --force-corrupt to overwrite structures that "
1587 "appear to be corrupt" << dendl;
1588 }
1589 }
1590 }
1591 } else {
1592 /* Backtrace-less case: we will inject a lost+found dentry */
1593 r = driver->inject_lost_and_found(
1594 obj_name_ino, dentry);
1595 if (r < 0) {
1596 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
1597 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1598 if (r == -EINVAL) {
1599 dout(4) << "Use --force-corrupt to overwrite structures that "
1600 "appear to be corrupt" << dendl;
1601 }
1602 }
1603 }
1604
1605 return r;
1606 });
1607}
1608
1609int MetadataTool::read_fnode(
1610 inodeno_t ino, frag_t frag, fnode_t *fnode,
1611 uint64_t *last_version)
1612{
11fdf7f2 1613 ceph_assert(fnode != NULL);
7c673cae
FG
1614
1615 object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
1616 bufferlist fnode_bl;
1617 int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
1618 *last_version = metadata_io.get_last_version();
1619 if (r < 0) {
1620 return r;
1621 }
1622
11fdf7f2 1623 auto old_fnode_iter = fnode_bl.cbegin();
7c673cae
FG
1624 try {
1625 (*fnode).decode(old_fnode_iter);
1626 } catch (const buffer::error &err) {
1627 return -EINVAL;
1628 }
1629
1630 return 0;
1631}
1632
1633int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
92f5a8d4 1634 const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
7c673cae 1635{
11fdf7f2 1636 ceph_assert(inode != NULL);
7c673cae 1637
7c673cae
FG
1638 std::string key;
1639 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1640 dn_key.encode(key);
1641
1642 std::set<std::string> keys;
1643 keys.insert(key);
1644 std::map<std::string, bufferlist> vals;
1645 object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
1646 int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
1647 dout(20) << "oid=" << frag_oid.name
1648 << " dname=" << dname
1649 << " frag=" << frag
1650 << ", r=" << r << dendl;
1651 if (r < 0) {
1652 return r;
1653 }
1654
1655 if (vals.find(key) == vals.end()) {
1656 dout(20) << key << " not found in result" << dendl;
1657 return -ENOENT;
1658 }
1659
1660 try {
11fdf7f2 1661 auto q = vals[key].cbegin();
92f5a8d4
TL
1662 snapid_t first;
1663 decode(first, q);
7c673cae 1664 char dentry_type;
11fdf7f2 1665 decode(dentry_type, q);
f67539c2
TL
1666 if (dentry_type == 'I' || dentry_type == 'i') {
1667 if (dentry_type == 'i') {
1668 mempool::mds_co::string alternate_name;
1669
1670 DECODE_START(2, q);
1671 if (struct_v >= 2)
1672 decode(alternate_name, q);
1673 inode->decode(q);
1674 DECODE_FINISH(q);
1675 } else {
1676 inode->decode_bare(q);
1677 }
7c673cae
FG
1678 } else {
1679 dout(20) << "dentry type '" << dentry_type << "': cannot"
1680 "read an inode out of that" << dendl;
1681 return -EINVAL;
1682 }
92f5a8d4
TL
1683 if (dnfirst)
1684 *dnfirst = first;
7c673cae
FG
1685 } catch (const buffer::error &err) {
1686 dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
1687 << std::dec << "/" << dname << dendl;
1688 return -EINVAL;
1689 }
1690
1691 return 0;
1692}
1693
11fdf7f2
TL
1694int MetadataDriver::load_table(MDSTable *table)
1695{
1696 object_t table_oid = table->get_object_name();
1697
1698 bufferlist table_bl;
1699 int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
1700 if (r < 0) {
1701 derr << "unable to read mds table '" << table_oid.name << "': "
1702 << cpp_strerror(r) << dendl;
1703 return r;
1704 }
1705
1706 try {
1707 version_t table_ver;
1708 auto p = table_bl.cbegin();
1709 decode(table_ver, p);
1710 table->decode_state(p);
1711 table->force_replay_version(table_ver);
1712 } catch (const buffer::error &err) {
1713 derr << "unable to decode mds table '" << table_oid.name << "': "
1714 << err.what() << dendl;
1715 return -EIO;
1716 }
1717 return 0;
1718}
1719
1720int MetadataDriver::save_table(MDSTable *table)
1721{
1722 object_t table_oid = table->get_object_name();
1723
1724 bufferlist table_bl;
1725 encode(table->get_version(), table_bl);
1726 table->encode_state(table_bl);
1727 int r = metadata_io.write_full(table_oid.name, table_bl);
1728 if (r != 0) {
1729 derr << "error updating mds table " << table_oid.name
1730 << ": " << cpp_strerror(r) << dendl;
1731 return r;
1732 }
1733 return 0;
1734}
1735
7c673cae
FG
1736int MetadataDriver::inject_lost_and_found(
1737 inodeno_t ino, const InodeStore &dentry)
1738{
1739 // Create lost+found if doesn't exist
1740 bool created = false;
1741 int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
1742 if (r < 0) {
1743 return r;
1744 }
1745 InodeStore lf_ino;
1746 r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
1747 if (r == -ENOENT || r == -EINVAL) {
1748 if (r == -EINVAL && !force_corrupt) {
1749 return r;
1750 }
1751
1752 // To have a directory not specify a layout, give it zeros (see
1753 // inode_t::has_layout)
1754 file_layout_t inherit_layout;
1755
1756 // Construct LF inode
1757 frag_info_t fragstat;
1758 fragstat.nfiles = 1,
1759 build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
1760
1761 // Inject link to LF inode in the root dir
1762 r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
1763 if (r < 0) {
1764 return r;
1765 }
1766 } else {
f67539c2 1767 if (!(lf_ino.inode->mode & S_IFDIR)) {
7c673cae
FG
1768 derr << "lost+found exists but is not a directory!" << dendl;
1769 // In this case we error out, and the user should do something about
1770 // this problem.
1771 return -EINVAL;
1772 }
1773 }
1774
1775 r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
1776 if (r < 0) {
1777 return r;
1778 }
1779
7c673cae
FG
1780 const std::string dname = lost_found_dname(ino);
1781
1782 // Write dentry into lost+found dirfrag
f67539c2 1783 return inject_linkage(lf_ino.inode->ino, dname, frag_t(), dentry);
7c673cae
FG
1784}
1785
1786
1787int MetadataDriver::get_frag_of(
1788 inodeno_t dirino,
1789 const std::string &target_dname,
1790 frag_t *result_ft)
1791{
1792 object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
1793
1794 dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
1795
1796 // Find and load fragtree if existing dirfrag
1797 // ==========================================
1798 bool have_backtrace = false;
1799 bufferlist parent_bl;
1800 int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
1801 if (r == -ENODATA) {
1802 dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
1803 } else if (r < 0) {
1804 dout(4) << "Unexpected error on '" << root_frag_oid << "': "
1805 << cpp_strerror(r) << dendl;
1806 return r;
1807 }
1808
1809 // Deserialize backtrace
1810 inode_backtrace_t backtrace;
1811 if (parent_bl.length()) {
1812 try {
11fdf7f2 1813 auto q = parent_bl.cbegin();
7c673cae
FG
1814 backtrace.decode(q);
1815 have_backtrace = true;
1816 } catch (buffer::error &e) {
f67539c2
TL
1817 dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': "
1818 << e.what() << dendl;
7c673cae
FG
1819 }
1820 }
1821
1822 if (!(have_backtrace && backtrace.ancestors.size())) {
1823 // Can't work out fragtree without a backtrace
1824 dout(4) << "No backtrace on '" << root_frag_oid
1825 << "': cannot determine fragtree" << dendl;
1826 return -ENOENT;
1827 }
1828
1829 // The parentage of dirino
1830 const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
1831
1832 // The inode of dirino's parent
1833 const inodeno_t parent_ino = bp.dirino;
1834
1835 // The dname of dirino in its parent.
1836 const std::string &parent_dname = bp.dname;
1837
1838 dout(20) << "got backtrace parent " << parent_ino << "/"
1839 << parent_dname << dendl;
1840
1841 // The primary dentry for dirino
1842 InodeStore existing_dentry;
1843
1844 // See if we can find ourselves in dirfrag zero of the parent: this
1845 // is a fast path that avoids needing to go further up the tree
1846 // if the parent isn't fragmented (worst case we would have to
1847 // go all the way to the root)
1848 r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
1849 if (r >= 0) {
1850 // Great, fast path: return the fragtree from here
f67539c2 1851 if (existing_dentry.inode->ino != dirino) {
7c673cae 1852 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
f67539c2 1853 << existing_dentry.inode->ino
7c673cae
FG
1854 << " vs expected 0x" << dirino << std::dec << dendl;
1855 return -ENOENT;
1856 }
1857 dout(20) << "fast path, fragtree is "
1858 << existing_dentry.dirfragtree << dendl;
1859 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1860 dout(20) << "frag is " << *result_ft << dendl;
1861 return 0;
1862 } else if (r != -ENOENT) {
1863 // Dentry not present in 0th frag, must read parent's fragtree
1864 frag_t parent_frag;
1865 r = get_frag_of(parent_ino, parent_dname, &parent_frag);
1866 if (r == 0) {
1867 // We have the parent fragtree, so try again to load our dentry
1868 r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
1869 if (r >= 0) {
1870 // Got it!
1871 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1872 dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
1873 return 0;
1874 } else {
1875 if (r == -EINVAL || r == -ENOENT) {
1876 return -ENOENT; // dentry missing or corrupt, so frag is missing
1877 } else {
1878 return r;
1879 }
1880 }
1881 } else {
1882 // Couldn't resolve parent fragtree, so can't find ours.
1883 return r;
1884 }
1885 } else if (r == -EINVAL) {
1886 // Unreadable dentry, can't know the fragtree.
1887 return -ENOENT;
1888 } else {
1889 // Unexpected error, raise it
1890 return r;
1891 }
1892}
1893
1894
1895int MetadataDriver::inject_with_backtrace(
1896 const inode_backtrace_t &backtrace, const InodeStore &dentry)
1897
1898{
1899
1900 // On dirfrags
1901 // ===========
1902 // In order to insert something into a directory, we first (ideally)
1903 // need to know the fragtree for the directory. Sometimes we can't
1904 // get that, in which case we just go ahead and insert it into
1905 // fragment zero for a good chance of that being the right thing
1906 // anyway (most moderate-sized dirs aren't fragmented!)
1907
1908 // On ancestry
1909 // ===========
1910 // My immediate ancestry should be correct, so if we can find that
1911 // directory's dirfrag then go inject it there. This works well
1912 // in the case that this inode's dentry was somehow lost and we
1913 // are recreating it, because the rest of the hierarchy
1914 // will probably still exist.
1915 //
1916 // It's more of a "better than nothing" approach when rebuilding
1917 // a whole tree, as backtraces will in general not be up to date
1918 // beyond the first parent, if anything in the trace was ever
1919 // moved after the file was created.
1920
1921 // On inode numbers
1922 // ================
1923 // The backtrace tells us inodes for each of the parents. If we are
1924 // creating those parent dirfrags, then there is a risk that somehow
1925 // the inode indicated here was also used for data (not a dirfrag) at
1926 // some stage. That would be a zany situation, and we don't check
1927 // for it here, because to do so would require extra IOs for everything
1928 // we inject, and anyway wouldn't guarantee that the inode number
1929 // wasn't in use in some dentry elsewhere in the metadata tree that
1930 // just happened not to have any data objects.
1931
1932 // On multiple workers touching the same traces
1933 // ============================================
1934 // When creating linkage for a directory, *only* create it if we are
1935 // also creating the object. That way, we might not manage to get the
1936 // *right* linkage for a directory, but at least we won't multiply link
1937 // it. We assume that if a root dirfrag exists for a directory, then
1938 // it is linked somewhere (i.e. that the metadata pool is not already
1939 // inconsistent).
1940 //
1941 // Making sure *that* is true is someone else's job! Probably someone
1942 // who is not going to run in parallel, so that they can self-consistently
1943 // look at versions and move things around as they go.
1944 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1945 // object, next run will fail to create linkage for the dirfrag object
1946 // and leave it orphaned.
1947
1948 inodeno_t ino = backtrace.ino;
1949 dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
1950 for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
1951 i != backtrace.ancestors.end(); ++i) {
1952 const inode_backpointer_t &backptr = *i;
1953 dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
1954 << "/" << backptr.dname << dendl;
1955
1956 // Examine root dirfrag for parent
1957 const inodeno_t parent_ino = backptr.dirino;
1958 const std::string dname = backptr.dname;
1959
1960 frag_t fragment;
1961 int r = get_frag_of(parent_ino, dname, &fragment);
1962 if (r == -ENOENT) {
1963 // Don't know fragment, fall back to assuming root
1964 dout(20) << "don't know fragment for 0x" << std::hex <<
1965 parent_ino << std::dec << "/" << dname << ", will insert to root"
1966 << dendl;
1967 }
1968
1969 // Find or create dirfrag
1970 // ======================
1971 bool created_dirfrag;
1972 r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
1973 if (r < 0) {
1974 return r;
1975 }
1976
1977 // Check if dentry already exists
1978 // ==============================
1979 InodeStore existing_dentry;
1980 r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
1981 bool write_dentry = false;
1982 if (r == -ENOENT || r == -EINVAL) {
1983 if (r == -EINVAL && !force_corrupt) {
1984 return r;
1985 }
1986 // Missing or corrupt dentry
1987 write_dentry = true;
1988 } else if (r < 0) {
1989 derr << "Unexpected error reading dentry 0x" << std::hex
1990 << parent_ino << std::dec << "/"
1991 << dname << ": " << cpp_strerror(r) << dendl;
1992 break;
1993 } else {
1994 // Dentry already present, does it link to me?
f67539c2 1995 if (existing_dentry.inode->ino == ino) {
7c673cae
FG
1996 dout(20) << "Dentry 0x" << std::hex
1997 << parent_ino << std::dec << "/"
1998 << dname << " already exists and points to me" << dendl;
1999 } else {
2000 derr << "Dentry 0x" << std::hex
2001 << parent_ino << std::dec << "/"
2002 << dname << " already exists but points to 0x"
f67539c2 2003 << std::hex << existing_dentry.inode->ino << std::dec << dendl;
7c673cae
FG
2004 // Fall back to lost+found!
2005 return inject_lost_and_found(backtrace.ino, dentry);
2006 }
2007 }
2008
2009 // Inject linkage
2010 // ==============
2011
2012 if (write_dentry) {
2013 if (i == backtrace.ancestors.begin()) {
2014 // This is the linkage for the file of interest
2015 dout(10) << "Linking inode 0x" << std::hex << ino
2016 << " at 0x" << parent_ino << "/" << dname << std::dec
f67539c2 2017 << " with size=" << dentry.inode->size << " bytes" << dendl;
7c673cae 2018
1e59de90 2019 /* NOTE: dnfirst fixed in scan_links */
7c673cae
FG
2020 r = inject_linkage(parent_ino, dname, fragment, dentry);
2021 } else {
2022 // This is the linkage for an ancestor directory
1e59de90
TL
2023 dout(10) << "Linking ancestor directory of inode 0x" << std::hex << ino
2024 << " at 0x" << std::hex << parent_ino
2025 << ":" << dname << dendl;
2026
7c673cae 2027 InodeStore ancestor_dentry;
f67539c2
TL
2028 auto inode = ancestor_dentry.get_inode();
2029 inode->mode = 0755 | S_IFDIR;
7c673cae
FG
2030
2031 // Set nfiles to something non-zero, to fool any other code
2032 // that tries to ignore 'empty' directories. This won't be
2033 // accurate, but it should avoid functional issues.
2034
f67539c2
TL
2035 inode->dirstat.nfiles = 1;
2036 inode->dir_layout.dl_dir_hash =
2037 g_conf()->mds_default_dir_hash;
7c673cae 2038
f67539c2
TL
2039 inode->nlink = 1;
2040 inode->ino = ino;
2041 inode->uid = g_conf()->mds_root_ino_uid;
2042 inode->gid = g_conf()->mds_root_ino_gid;
2043 inode->version = 1;
2044 inode->backtrace_version = 1;
1e59de90 2045 /* NOTE: dnfirst fixed in scan_links */
7c673cae
FG
2046 r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
2047 }
2048
2049 if (r < 0) {
2050 return r;
2051 }
2052 }
2053
2054 if (!created_dirfrag) {
2055 // If the parent dirfrag already existed, then stop traversing the
2056 // backtrace: assume that the other ancestors already exist too. This
2057 // is an assumption rather than a truth, but it's a convenient way
2058 // to avoid the risk of creating multiply-linked directories while
2059 // injecting data. If there are in fact missing ancestors, this
2060 // should be fixed up using a separate tool scanning the metadata
2061 // pool.
2062 break;
2063 } else {
2064 // Proceed up the backtrace, creating parents
2065 ino = parent_ino;
2066 }
2067 }
2068
2069 return 0;
2070}
2071
2072int MetadataDriver::find_or_create_dirfrag(
2073 inodeno_t ino,
2074 frag_t fragment,
2075 bool *created)
2076{
11fdf7f2 2077 ceph_assert(created != NULL);
7c673cae
FG
2078
2079 fnode_t existing_fnode;
2080 *created = false;
2081
2082 uint64_t read_version = 0;
2083 int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
2084 dout(10) << "read_version = " << read_version << dendl;
2085
2086 if (r == -ENOENT || r == -EINVAL) {
2087 if (r == -EINVAL && !force_corrupt) {
2088 return r;
2089 }
2090
2091 // Missing or corrupt fnode, create afresh
2092 bufferlist fnode_bl;
2093 fnode_t blank_fnode;
2094 blank_fnode.version = 1;
2095 // mark it as non-empty
2096 blank_fnode.fragstat.nfiles = 1;
2097 blank_fnode.accounted_fragstat = blank_fnode.fragstat;
2098 blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
2099 blank_fnode.encode(fnode_bl);
2100
2101
2102 librados::ObjectWriteOperation op;
2103
2104 if (read_version) {
11fdf7f2 2105 ceph_assert(r == -EINVAL);
7c673cae
FG
2106 // Case A: We must assert that the version isn't changed since we saw the object
2107 // was unreadable, to avoid the possibility of two data-scan processes
2108 // both creating the frag.
2109 op.assert_version(read_version);
2110 } else {
11fdf7f2 2111 ceph_assert(r == -ENOENT);
7c673cae
FG
2112 // Case B: The object didn't exist in read_fnode, so while creating it we must
2113 // use an exclusive create to correctly populate *creating with
2114 // whether we created it ourselves or someone beat us to it.
2115 op.create(true);
2116 }
2117
2118 object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
2119 op.omap_set_header(fnode_bl);
2120 r = metadata_io.operate(frag_oid.name, &op);
2121 if (r == -EOVERFLOW || r == -EEXIST) {
2122 // Someone else wrote it (see case A above)
2123 dout(10) << "Dirfrag creation race: 0x" << std::hex
2124 << ino << " " << fragment << std::dec << dendl;
2125 *created = false;
2126 return 0;
2127 } else if (r < 0) {
2128 // We were unable to create or write it, error out
2129 derr << "Failed to create dirfrag 0x" << std::hex
2130 << ino << std::dec << ": " << cpp_strerror(r) << dendl;
2131 return r;
2132 } else {
2133 // Success: the dirfrag object now exists with a value header
2134 dout(10) << "Created dirfrag: 0x" << std::hex
2135 << ino << std::dec << dendl;
2136 *created = true;
2137 }
2138 } else if (r < 0) {
2139 derr << "Unexpected error reading dirfrag 0x" << std::hex
2140 << ino << std::dec << " : " << cpp_strerror(r) << dendl;
2141 return r;
2142 } else {
2143 dout(20) << "Dirfrag already exists: 0x" << std::hex
2144 << ino << " " << fragment << std::dec << dendl;
2145 }
2146
2147 return 0;
2148}
2149
2150int MetadataDriver::inject_linkage(
2151 inodeno_t dir_ino, const std::string &dname,
92f5a8d4 2152 const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
7c673cae 2153{
7c673cae
FG
2154 object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
2155
2156 std::string key;
92f5a8d4 2157 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
7c673cae
FG
2158 dn_key.encode(key);
2159
2160 bufferlist dentry_bl;
92f5a8d4 2161 encode(dnfirst, dentry_bl);
11fdf7f2 2162 encode('I', dentry_bl);
7c673cae
FG
2163 inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
2164
2165 // Write out
2166 std::map<std::string, bufferlist> vals;
2167 vals[key] = dentry_bl;
2168 int r = metadata_io.omap_set(frag_oid.name, vals);
2169 if (r != 0) {
2170 derr << "Error writing dentry 0x" << std::hex
2171 << dir_ino << std::dec << "/"
2172 << dname << ": " << cpp_strerror(r) << dendl;
2173 return r;
2174 } else {
2175 dout(20) << "Injected dentry 0x" << std::hex
2176 << dir_ino << "/" << dname << " pointing to 0x"
f67539c2 2177 << inode.inode->ino << std::dec << dendl;
7c673cae
FG
2178 return 0;
2179 }
2180}
2181
2182
2183int MetadataDriver::init(
2184 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
2185 fs_cluster_id_t fscid)
2186{
2187 if (metadata_pool_name.empty()) {
2188 auto fs = fsmap->get_filesystem(fscid);
11fdf7f2 2189 ceph_assert(fs != nullptr);
7c673cae
FG
2190 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
2191
2192 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
2193 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
2194 if (r < 0) {
2195 derr << "Pool " << metadata_pool_id
2196 << " identified in MDS map not found in RADOS!" << dendl;
2197 return r;
2198 }
2199 dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
2200 } else {
2201 dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
2202 }
2203 return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
2204}
2205
2206int LocalFileDriver::init(
2207 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
2208 fs_cluster_id_t fscid)
2209{
2210 return 0;
2211}
2212
2213int LocalFileDriver::inject_data(
2214 const std::string &file_path,
2215 uint64_t size,
2216 uint32_t chunk_size,
2217 inodeno_t ino)
2218{
2219 // Scrape the file contents out of the data pool and into the
2220 // local filesystem
2221 std::fstream f;
2222 f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
2223
2224 for (uint64_t offset = 0; offset < size; offset += chunk_size) {
2225 bufferlist bl;
2226
2227 char buf[32];
2228 snprintf(buf, sizeof(buf),
2229 "%llx.%08llx",
2230 (unsigned long long)ino,
2231 (unsigned long long)(offset / chunk_size));
2232 std::string oid(buf);
2233
2234 int r = data_io.read(oid, bl, chunk_size, 0);
2235
2236 if (r <= 0 && r != -ENOENT) {
2237 derr << "error reading data object '" << oid << "': "
2238 << cpp_strerror(r) << dendl;
2239 f.close();
2240 return r;
2241 } else if (r >=0) {
2242
2243 f.seekp(offset);
2244 bl.write_stream(f);
2245 }
2246 }
2247 f.close();
2248
2249 return 0;
2250}
2251
2252
2253int LocalFileDriver::inject_with_backtrace(
2254 const inode_backtrace_t &bt,
2255 const InodeStore &dentry)
2256{
2257 std::string path_builder = path;
2258
2259 // Iterate through backtrace creating directory parents
2260 std::vector<inode_backpointer_t>::const_reverse_iterator i;
2261 for (i = bt.ancestors.rbegin();
2262 i != bt.ancestors.rend(); ++i) {
2263
2264 const inode_backpointer_t &backptr = *i;
2265 path_builder += "/";
2266 path_builder += backptr.dname;
2267
2268 // Last entry is the filename itself
2269 bool is_file = (i + 1 == bt.ancestors.rend());
2270 if (is_file) {
2271 // FIXME: inject_data won't cope with interesting (i.e. striped)
2272 // layouts (need a librados-compatible Filer to read these)
f67539c2
TL
2273 inject_data(path_builder, dentry.inode->size,
2274 dentry.inode->layout.object_size, bt.ino);
7c673cae
FG
2275 } else {
2276 int r = mkdir(path_builder.c_str(), 0755);
2277 if (r != 0 && r != -EPERM) {
2278 derr << "error creating directory: '" << path_builder << "': "
2279 << cpp_strerror(r) << dendl;
2280 return r;
2281 }
2282 }
2283 }
2284
2285 return 0;
2286}
2287
2288int LocalFileDriver::inject_lost_and_found(
2289 inodeno_t ino,
2290 const InodeStore &dentry)
2291{
2292 std::string lf_path = path + "/lost+found";
2293 int r = mkdir(lf_path.c_str(), 0755);
2294 if (r != 0 && r != -EPERM) {
2295 derr << "error creating directory: '" << lf_path << "': "
2296 << cpp_strerror(r) << dendl;
2297 return r;
2298 }
2299
2300 std::string file_path = lf_path + "/" + lost_found_dname(ino);
f67539c2
TL
2301 return inject_data(file_path, dentry.inode->size,
2302 dentry.inode->layout.object_size, ino);
7c673cae
FG
2303}
2304
2305int LocalFileDriver::init_roots(int64_t data_pool_id)
2306{
2307 // Ensure that the path exists and is a directory
2308 bool exists;
2309 int r = check_roots(&exists);
2310 if (r != 0) {
2311 return r;
2312 }
2313
2314 if (exists) {
2315 return 0;
2316 } else {
2317 return ::mkdir(path.c_str(), 0755);
2318 }
2319}
2320
2321int LocalFileDriver::check_roots(bool *result)
2322{
2323 // Check if the path exists and is a directory
2324 DIR *d = ::opendir(path.c_str());
2325 if (d == NULL) {
2326 *result = false;
2327 } else {
2328 int r = closedir(d);
2329 if (r != 0) {
2330 // Weird, but maybe possible with e.g. stale FD on NFS mount?
2331 *result = false;
2332 } else {
2333 *result = true;
2334 }
2335 }
2336
2337 return 0;
2338}
2339
2340void MetadataTool::build_file_dentry(
2341 inodeno_t ino, uint64_t file_size, time_t file_mtime,
20effc67 2342 const file_layout_t &layout, InodeStore *out, std::string symlink)
7c673cae 2343{
11fdf7f2 2344 ceph_assert(out != NULL);
7c673cae 2345
f67539c2 2346 auto inode = out->get_inode();
20effc67
TL
2347 if(!symlink.empty()) {
2348 inode->mode = 0777 | S_IFLNK;
2349 out->symlink = symlink;
2350 }
2351 else {
2352 inode->mode = 0500 | S_IFREG;
2353 }
2354
f67539c2
TL
2355 inode->size = file_size;
2356 inode->max_size_ever = file_size;
2357 inode->mtime.tv.tv_sec = file_mtime;
2358 inode->atime.tv.tv_sec = file_mtime;
2359 inode->ctime.tv.tv_sec = file_mtime;
7c673cae 2360
f67539c2 2361 inode->layout = layout;
7c673cae 2362
f67539c2
TL
2363 inode->truncate_seq = 1;
2364 inode->truncate_size = -1ull;
7c673cae 2365
f67539c2 2366 inode->inline_data.version = CEPH_INLINE_NONE;
7c673cae 2367
f67539c2
TL
2368 inode->nlink = 1;
2369 inode->ino = ino;
2370 inode->version = 1;
2371 inode->backtrace_version = 1;
2372 inode->uid = g_conf()->mds_root_ino_uid;
2373 inode->gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
2374}
2375
2376void MetadataTool::build_dir_dentry(
2377 inodeno_t ino, const frag_info_t &fragstat,
2378 const file_layout_t &layout, InodeStore *out)
2379{
11fdf7f2 2380 ceph_assert(out != NULL);
7c673cae 2381
f67539c2
TL
2382 auto inode = out->get_inode();
2383 inode->mode = 0755 | S_IFDIR;
2384 inode->dirstat = fragstat;
2385 inode->mtime.tv.tv_sec = fragstat.mtime;
2386 inode->atime.tv.tv_sec = fragstat.mtime;
2387 inode->ctime.tv.tv_sec = fragstat.mtime;
7c673cae 2388
f67539c2
TL
2389 inode->layout = layout;
2390 inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae 2391
f67539c2
TL
2392 inode->truncate_seq = 1;
2393 inode->truncate_size = -1ull;
7c673cae 2394
f67539c2 2395 inode->inline_data.version = CEPH_INLINE_NONE;
7c673cae 2396
f67539c2
TL
2397 inode->nlink = 1;
2398 inode->ino = ino;
2399 inode->version = 1;
2400 inode->backtrace_version = 1;
2401 inode->uid = g_conf()->mds_root_ino_uid;
2402 inode->gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
2403}
2404