]> git.proxmox.com Git - ceph.git/blob - ceph/src/tools/cephfs/DataScan.cc
363cc026403c52f7d7021bef49abf866abca18d3
[ceph.git] / ceph / src / tools / cephfs / DataScan.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/compat.h"
16 #include "common/errno.h"
17 #include "common/ceph_argparse.h"
18 #include <fstream>
19 #include "include/util.h"
20 #include "include/ceph_fs.h"
21
22 #include "mds/CDentry.h"
23 #include "mds/CInode.h"
24 #include "mds/CDentry.h"
25 #include "mds/InoTable.h"
26 #include "mds/SnapServer.h"
27 #include "cls/cephfs/cls_cephfs_client.h"
28
29 #include "PgFiles.h"
30 #include "DataScan.h"
31 #include "include/compat.h"
32
33 #define dout_context g_ceph_context
34 #define dout_subsys ceph_subsys_mds
35 #undef dout_prefix
36 #define dout_prefix *_dout << "datascan." << __func__ << ": "
37
38 using namespace std;
39
40 void DataScan::usage()
41 {
42 std::cout << "Usage: \n"
43 << " cephfs-data-scan init [--force-init]\n"
44 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
45 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
46 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
47 << " cephfs-data-scan scan_links\n"
48 << "\n"
49 << " --force-corrupt: overrite apparently corrupt structures\n"
50 << " --force-init: write root inodes even if they exist\n"
51 << " --force-pool: use data pool even if it is not in FSMap\n"
52 << " --worker_m: Maximum number of workers\n"
53 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
54 << "\n"
55 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
56 << " cephfs-data-scan cleanup <data pool name>\n"
57 << std::endl;
58
59 generic_client_usage();
60 }
61
62 bool DataScan::parse_kwarg(
63 const std::vector<const char*> &args,
64 std::vector<const char *>::const_iterator &i,
65 int *r)
66 {
67 if (i + 1 == args.end()) {
68 return false;
69 }
70
71 const std::string arg(*i);
72 const std::string val(*(i + 1));
73
74 if (arg == std::string("--output-dir")) {
75 if (driver != NULL) {
76 derr << "Unexpected --output-dir: output already selected!" << dendl;
77 *r = -EINVAL;
78 return false;
79 }
80 dout(4) << "Using local file output to '" << val << "'" << dendl;
81 driver = new LocalFileDriver(val, data_io);
82 return true;
83 } else if (arg == std::string("--worker_n")) {
84 std::string err;
85 n = strict_strtoll(val.c_str(), 10, &err);
86 if (!err.empty()) {
87 std::cerr << "Invalid worker number '" << val << "'" << std::endl;
88 *r = -EINVAL;
89 return false;
90 }
91 return true;
92 } else if (arg == std::string("--worker_m")) {
93 std::string err;
94 m = strict_strtoll(val.c_str(), 10, &err);
95 if (!err.empty()) {
96 std::cerr << "Invalid worker count '" << val << "'" << std::endl;
97 *r = -EINVAL;
98 return false;
99 }
100 return true;
101 } else if (arg == std::string("--filter-tag")) {
102 filter_tag = val;
103 dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
104 return true;
105 } else if (arg == std::string("--filesystem")) {
106 std::shared_ptr<const Filesystem> fs;
107 *r = fsmap->parse_filesystem(val, &fs);
108 if (*r != 0) {
109 std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
110 return false;
111 }
112 fscid = fs->fscid;
113 return true;
114 } else if (arg == std::string("--alternate-pool")) {
115 metadata_pool_name = val;
116 return true;
117 } else {
118 return false;
119 }
120 }
121
122 bool DataScan::parse_arg(
123 const std::vector<const char*> &args,
124 std::vector<const char *>::const_iterator &i)
125 {
126 const std::string arg(*i);
127 if (arg == "--force-pool") {
128 force_pool = true;
129 return true;
130 } else if (arg == "--force-corrupt") {
131 force_corrupt = true;
132 return true;
133 } else if (arg == "--force-init") {
134 force_init = true;
135 return true;
136 } else {
137 return false;
138 }
139 }
140
141 int DataScan::main(const std::vector<const char*> &args)
142 {
143 // Parse args
144 // ==========
145 if (args.size() < 1) {
146 cerr << "missing position argument" << std::endl;
147 return -EINVAL;
148 }
149
150 // Common RADOS init: open metadata pool
151 // =====================================
152 librados::Rados rados;
153 int r = rados.init_with_context(g_ceph_context);
154 if (r < 0) {
155 derr << "RADOS unavailable" << dendl;
156 return r;
157 }
158
159 std::string const &command = args[0];
160 std::string data_pool_name;
161
162 std::string pg_files_path;
163 std::set<pg_t> pg_files_pgs;
164
165 // Consume any known --key val or --flag arguments
166 for (std::vector<const char *>::const_iterator i = args.begin() + 1;
167 i != args.end(); ++i) {
168 if (parse_kwarg(args, i, &r)) {
169 // Skip the kwarg value field
170 ++i;
171 continue;
172 } else if (r) {
173 return r;
174 }
175
176 if (parse_arg(args, i)) {
177 continue;
178 }
179
180 // Trailing positional argument
181 if (i + 1 == args.end() &&
182 (command == "scan_inodes"
183 || command == "scan_extents"
184 || command == "cleanup")) {
185 data_pool_name = *i;
186 continue;
187 }
188
189 if (command == "pg_files") {
190 if (i == args.begin() + 1) {
191 pg_files_path = *i;
192 continue;
193 } else {
194 pg_t pg;
195 bool parsed = pg.parse(*i);
196 if (!parsed) {
197 std::cerr << "Invalid PG '" << *i << "'" << std::endl;
198 return -EINVAL;
199 } else {
200 pg_files_pgs.insert(pg);
201 continue;
202 }
203 }
204
205 }
206
207 // Fall through: unhandled
208 std::cerr << "Unknown argument '" << *i << "'" << std::endl;
209 return -EINVAL;
210 }
211
212 // If caller didn't specify a namespace, try to pick
213 // one if only one exists
214 if (fscid == FS_CLUSTER_ID_NONE) {
215 if (fsmap->filesystem_count() == 1) {
216 fscid = fsmap->get_filesystem()->fscid;
217 } else {
218 std::cerr << "Specify a filesystem with --filesystem" << std::endl;
219 return -EINVAL;
220 }
221 }
222 auto fs = fsmap->get_filesystem(fscid);
223 ceph_assert(fs != nullptr);
224
225 // Default to output to metadata pool
226 if (driver == NULL) {
227 driver = new MetadataDriver();
228 driver->set_force_corrupt(force_corrupt);
229 driver->set_force_init(force_init);
230 dout(4) << "Using metadata pool output" << dendl;
231 }
232
233 dout(4) << "connecting to RADOS..." << dendl;
234 r = rados.connect();
235 if (r < 0) {
236 std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
237 << std::endl;
238 return r;
239 }
240
241 r = driver->init(rados, metadata_pool_name, fsmap, fscid);
242 if (r < 0) {
243 return r;
244 }
245
246 if (command == "pg_files") {
247 auto pge = PgFiles(objecter, pg_files_pgs);
248 pge.init();
249 return pge.scan_path(pg_files_path);
250 }
251
252 // Initialize data_io for those commands that need it
253 if (command == "scan_inodes" ||
254 command == "scan_extents" ||
255 command == "cleanup") {
256 if (data_pool_name.empty()) {
257 std::cerr << "Data pool not specified" << std::endl;
258 return -EINVAL;
259 }
260
261 data_pool_id = rados.pool_lookup(data_pool_name.c_str());
262 if (data_pool_id < 0) {
263 std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
264 return -ENOENT;
265 } else {
266 dout(4) << "data pool '" << data_pool_name
267 << "' has ID " << data_pool_id << dendl;
268 }
269
270 if (!fs->mds_map.is_data_pool(data_pool_id)) {
271 std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
272 "CephFS data pool!" << std::endl;
273 if (!force_pool) {
274 std::cerr << "Use --force-pool to continue" << std::endl;
275 return -EINVAL;
276 }
277 }
278
279 dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
280 r = rados.ioctx_create(data_pool_name.c_str(), data_io);
281 if (r != 0) {
282 return r;
283 }
284 }
285
286 // Initialize metadata_io from MDSMap for scan_frags
287 if (command == "scan_frags" || command == "scan_links") {
288 const auto fs = fsmap->get_filesystem(fscid);
289 if (fs == nullptr) {
290 std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
291 return -ENOENT;
292 }
293 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
294
295 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
296 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
297 if (r < 0) {
298 std::cerr << "Pool " << metadata_pool_id
299 << " identified in MDS map not found in RADOS!" << std::endl;
300 return r;
301 }
302
303 r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
304 if (r != 0) {
305 return r;
306 }
307
308 data_pools = fs->mds_map.get_data_pools();
309 }
310
311 // Finally, dispatch command
312 if (command == "scan_inodes") {
313 return scan_inodes();
314 } else if (command == "scan_extents") {
315 return scan_extents();
316 } else if (command == "scan_frags") {
317 return scan_frags();
318 } else if (command == "scan_links") {
319 return scan_links();
320 } else if (command == "cleanup") {
321 return cleanup();
322 } else if (command == "init") {
323 return driver->init_roots(fs->mds_map.get_first_data_pool());
324 } else {
325 std::cerr << "Unknown command '" << command << "'" << std::endl;
326 return -EINVAL;
327 }
328 }
329
330 int MetadataDriver::inject_unlinked_inode(
331 inodeno_t inono, int mode, int64_t data_pool_id)
332 {
333 const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
334
335 // Skip if exists
336 bool already_exists = false;
337 int r = root_exists(inono, &already_exists);
338 if (r) {
339 return r;
340 }
341 if (already_exists && !force_init) {
342 std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
343 " exists, skipping create. Use --force-init to overwrite"
344 " the existing object." << std::endl;
345 return 0;
346 }
347
348 // Compose
349 InodeStore inode_data;
350 auto inode = inode_data.get_inode();
351 inode->ino = inono;
352 inode->version = 1;
353 inode->xattr_version = 1;
354 inode->mode = 0500 | mode;
355 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
356 // (we won't actually give the *correct* dirstat here though)
357 inode->dirstat.nfiles = 1;
358
359 inode->ctime = inode->mtime = ceph_clock_now();
360 inode->nlink = 1;
361 inode->truncate_size = -1ull;
362 inode->truncate_seq = 1;
363 inode->uid = g_conf()->mds_root_ino_uid;
364 inode->gid = g_conf()->mds_root_ino_gid;
365
366 // Force layout to default: should we let users override this so that
367 // they don't have to mount the filesystem to correct it?
368 inode->layout = file_layout_t::get_default();
369 inode->layout.pool_id = data_pool_id;
370 inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
371
372 // Assume that we will get our stats wrong, and that we may
373 // be ignoring dirfrags that exist
374 inode_data.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
375
376 if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
377 sr_t srnode;
378 srnode.seq = 1;
379 encode(srnode, inode_data.snap_blob);
380 }
381
382 // Serialize
383 bufferlist inode_bl;
384 encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
385 inode_data.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
386
387 // Write
388 r = metadata_io.write_full(oid.name, inode_bl);
389 if (r != 0) {
390 derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
391 return r;
392 }
393
394 return r;
395 }
396
397 int MetadataDriver::root_exists(inodeno_t ino, bool *result)
398 {
399 object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
400 uint64_t size;
401 time_t mtime;
402 int r = metadata_io.stat(oid.name, &size, &mtime);
403 if (r == -ENOENT) {
404 *result = false;
405 return 0;
406 } else if (r < 0) {
407 return r;
408 }
409
410 *result = true;
411 return 0;
412 }
413
414 int MetadataDriver::init_roots(int64_t data_pool_id)
415 {
416 int r = 0;
417 r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id);
418 if (r != 0) {
419 return r;
420 }
421 r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
422 if (r != 0) {
423 return r;
424 }
425 bool created = false;
426 r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
427 if (r != 0) {
428 return r;
429 }
430
431 return 0;
432 }
433
434 int MetadataDriver::check_roots(bool *result)
435 {
436 int r;
437 r = root_exists(CEPH_INO_ROOT, result);
438 if (r != 0) {
439 return r;
440 }
441 if (!*result) {
442 return 0;
443 }
444
445 r = root_exists(MDS_INO_MDSDIR(0), result);
446 if (r != 0) {
447 return r;
448 }
449 if (!*result) {
450 return 0;
451 }
452
453 return 0;
454 }
455
/**
 * Stages:
 *
 * SERIAL init
 *  0. Create root inodes if they don't exist
 * PARALLEL scan_extents
 *  1. Size and mtime recovery: scan ALL objects, and update 0th
 *     objects with max size and max mtime seen.
 * PARALLEL scan_inodes
 *  2. Inode recovery: scan ONLY 0th objects, and inject metadata
 *     into dirfrag OMAPs, creating blank dirfrags as needed. No stats
 *     or rstats at this stage. Inodes without backtraces go into
 *     lost+found
 * TODO: SERIAL "recover stats"
 *  3. Dirfrag statistics: depth first traverse into metadata tree,
 *     rebuilding dir sizes.
 * TODO PARALLEL "clean up"
 *  4. Cleanup; go over all 0th objects (and dirfrags if we tagged
 *     anything onto them) and remove any of the xattrs that we
 *     used for accumulating.
 */
477
478
479 int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
480 {
481 if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
482 return -EINVAL;
483 }
484
485 std::string err;
486 std::string inode_str = oid.substr(0, oid.find("."));
487 *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
488 if (!err.empty()) {
489 return -EINVAL;
490 }
491
492 std::string pos_string = oid.substr(oid.find(".") + 1);
493 *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
494 if (!err.empty()) {
495 return -EINVAL;
496 }
497
498 return 0;
499 }
500
501
502 int DataScan::scan_extents()
503 {
504 return forall_objects(data_io, false, [this](
505 std::string const &oid,
506 uint64_t obj_name_ino,
507 uint64_t obj_name_offset) -> int
508 {
509 // Read size
510 uint64_t size;
511 time_t mtime;
512 int r = data_io.stat(oid, &size, &mtime);
513 dout(10) << "handling object " << obj_name_ino
514 << "." << obj_name_offset << dendl;
515 if (r != 0) {
516 dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
517 return r;
518 }
519
520 // I need to keep track of
521 // * The highest object ID seen
522 // * The size of the highest object ID seen
523 // * The largest object seen
524 //
525 // Given those things, I can later infer the object chunking
526 // size, the offset of the last object (chunk size * highest ID seen)
527 // and the actual size (offset of last object + size of highest ID seen)
528 //
529 // This logic doesn't take account of striping.
530 r = ClsCephFSClient::accumulate_inode_metadata(
531 data_io,
532 obj_name_ino,
533 obj_name_offset,
534 size,
535 mtime);
536 if (r < 0) {
537 derr << "Failed to accumulate metadata data from '"
538 << oid << "': " << cpp_strerror(r) << dendl;
539 return r;
540 }
541
542 return r;
543 });
544 }
545
546 int DataScan::probe_filter(librados::IoCtx &ioctx)
547 {
548 bufferlist filter_bl;
549 ClsCephFSClient::build_tag_filter("test", &filter_bl);
550 librados::ObjectCursor range_i;
551 librados::ObjectCursor range_end;
552
553 std::vector<librados::ObjectItem> tmp_result;
554 librados::ObjectCursor tmp_next;
555 int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
556 1, filter_bl, &tmp_result, &tmp_next);
557
558 return r >= 0;
559 }
560
561 int DataScan::forall_objects(
562 librados::IoCtx &ioctx,
563 bool untagged_only,
564 std::function<int(std::string, uint64_t, uint64_t)> handler
565 )
566 {
567 librados::ObjectCursor range_i;
568 librados::ObjectCursor range_end;
569 ioctx.object_list_slice(
570 ioctx.object_list_begin(),
571 ioctx.object_list_end(),
572 n,
573 m,
574 &range_i,
575 &range_end);
576
577
578 bufferlist filter_bl;
579
580 bool legacy_filtering = false;
581 if (untagged_only) {
582 // probe to deal with older OSDs that don't support
583 // the cephfs pgls filtering mode
584 legacy_filtering = !probe_filter(ioctx);
585 if (!legacy_filtering) {
586 ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
587 }
588 }
589
590 int r = 0;
591 while(range_i < range_end) {
592 std::vector<librados::ObjectItem> result;
593 int r = ioctx.object_list(range_i, range_end, 1,
594 filter_bl, &result, &range_i);
595 if (r < 0) {
596 derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
597 return r;
598 }
599
600 for (const auto &i : result) {
601 const std::string &oid = i.oid;
602 uint64_t obj_name_ino = 0;
603 uint64_t obj_name_offset = 0;
604 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
605 if (r != 0) {
606 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
607 continue;
608 }
609
610 if (untagged_only && legacy_filtering) {
611 dout(20) << "Applying filter to " << oid << dendl;
612
613 // We are only interested in 0th objects during this phase: we touched
614 // the other objects during scan_extents
615 if (obj_name_offset != 0) {
616 dout(20) << "Non-zeroth object" << dendl;
617 continue;
618 }
619
620 bufferlist scrub_tag_bl;
621 int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
622 if (r >= 0) {
623 std::string read_tag;
624 auto q = scrub_tag_bl.cbegin();
625 try {
626 decode(read_tag, q);
627 if (read_tag == filter_tag) {
628 dout(20) << "skipping " << oid << " because it has the filter_tag"
629 << dendl;
630 continue;
631 }
632 } catch (const buffer::error &err) {
633 }
634 dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
635 } else {
636 dout(20) << "no tag read (" << r << ")" << dendl;
637 }
638
639 } else if (untagged_only) {
640 ceph_assert(obj_name_offset == 0);
641 dout(20) << "OSD matched oid " << oid << dendl;
642 }
643
644 int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
645 if (r == 0 && this_oid_r < 0) {
646 r = this_oid_r;
647 }
648 }
649 }
650
651 return r;
652 }
653
654 int DataScan::scan_inodes()
655 {
656 bool roots_present;
657 int r = driver->check_roots(&roots_present);
658 if (r != 0) {
659 derr << "Unexpected error checking roots: '"
660 << cpp_strerror(r) << "'" << dendl;
661 return r;
662 }
663
664 if (!roots_present) {
665 std::cerr << "Some or all system inodes are absent. Run 'init' from "
666 "one node before running 'scan_inodes'" << std::endl;
667 return -EIO;
668 }
669
670 return forall_objects(data_io, true, [this](
671 std::string const &oid,
672 uint64_t obj_name_ino,
673 uint64_t obj_name_offset) -> int
674 {
675 int r = 0;
676
677 dout(10) << "handling object "
678 << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
679 << dendl;
680
681 AccumulateResult accum_res;
682 inode_backtrace_t backtrace;
683 file_layout_t loaded_layout = file_layout_t::get_default();
684 std::string symlink;
685 r = ClsCephFSClient::fetch_inode_accumulate_result(
686 data_io, oid, &backtrace, &loaded_layout, &symlink, &accum_res);
687
688 if (r == -EINVAL) {
689 dout(4) << "Accumulated metadata missing from '"
690 << oid << ", did you run scan_extents?" << dendl;
691 return r;
692 } else if (r < 0) {
693 dout(4) << "Unexpected error loading accumulated metadata from '"
694 << oid << "': " << cpp_strerror(r) << dendl;
695 // FIXME: this creates situation where if a client has a corrupt
696 // backtrace/layout, we will fail to inject it. We should (optionally)
697 // proceed if the backtrace/layout is corrupt but we have valid
698 // accumulated metadata.
699 return r;
700 }
701
702 const time_t file_mtime = accum_res.max_mtime;
703 uint64_t file_size = 0;
704 bool have_backtrace = !(backtrace.ancestors.empty());
705
706 // This is the layout we will use for injection, populated either
707 // from loaded_layout or from best guesses
708 file_layout_t guessed_layout;
709 guessed_layout.pool_id = data_pool_id;
710
711 // Calculate file_size, guess the layout
712 if (accum_res.ceiling_obj_index > 0) {
713 uint32_t chunk_size = file_layout_t::get_default().object_size;
714 // When there are multiple objects, the largest object probably
715 // indicates the chunk size. But not necessarily, because files
716 // can be sparse. Only make this assumption if size seen
717 // is a power of two, as chunk sizes typically are.
718 if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
719 chunk_size = accum_res.max_obj_size;
720 }
721
722 if (loaded_layout.pool_id == -1) {
723 // If no stashed layout was found, guess it
724 guessed_layout.object_size = chunk_size;
725 guessed_layout.stripe_unit = chunk_size;
726 guessed_layout.stripe_count = 1;
727 } else if (!loaded_layout.is_valid() ||
728 loaded_layout.object_size < accum_res.max_obj_size) {
729 // If the max size seen exceeds what the stashed layout claims, then
730 // disbelieve it. Guess instead. Same for invalid layouts on disk.
731 dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
732 << std::dec << ", ignoring in favour of best guess" << dendl;
733 guessed_layout.object_size = chunk_size;
734 guessed_layout.stripe_unit = chunk_size;
735 guessed_layout.stripe_count = 1;
736 } else {
737 // We have a stashed layout that we can't disprove, so apply it
738 guessed_layout = loaded_layout;
739 dout(20) << "loaded layout from xattr:"
740 << " os: " << guessed_layout.object_size
741 << " sc: " << guessed_layout.stripe_count
742 << " su: " << guessed_layout.stripe_unit
743 << dendl;
744 // User might have transplanted files from a pool with a different
745 // ID, so whatever the loaded_layout says, we'll force the injected
746 // layout to point to the pool we really read from
747 guessed_layout.pool_id = data_pool_id;
748 }
749
750 if (guessed_layout.stripe_count == 1) {
751 // Unstriped file: simple chunking
752 file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
753 + accum_res.ceiling_obj_size;
754 } else {
755 // Striped file: need to examine the last stripe_count objects
756 // in the file to determine the size.
757
758 // How many complete (i.e. not last stripe) objects?
759 uint64_t complete_objs = 0;
760 if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
761 complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
762 } else {
763 complete_objs = 0;
764 }
765
766 // How many potentially-short objects (i.e. last stripe set) objects?
767 uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
768
769 dout(10) << "calculating striped size from complete objs: "
770 << complete_objs << ", partial objs: " << partial_objs
771 << dendl;
772
773 // Maximum amount of data that may be in the incomplete objects
774 uint64_t incomplete_size = 0;
775
776 // For each short object, calculate the max file size within it
777 // and accumulate the maximum
778 for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
779 char buf[60];
780 snprintf(buf, sizeof(buf), "%llx.%08llx",
781 (long long unsigned)obj_name_ino, (long long unsigned)i);
782
783 uint64_t osize(0);
784 time_t omtime(0);
785 r = data_io.stat(std::string(buf), &osize, &omtime);
786 if (r == 0) {
787 if (osize > 0) {
788 // Upper bound within this object
789 uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
790 * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
791 + (i % guessed_layout.stripe_count)
792 * guessed_layout.stripe_unit + (osize - 1)
793 % guessed_layout.stripe_unit + 1;
794 incomplete_size = std::max(incomplete_size, upper_size);
795 }
796 } else if (r == -ENOENT) {
797 // Absent object, treat as size 0 and ignore.
798 } else {
799 // Unexpected error, carry r to outer scope for handling.
800 break;
801 }
802 }
803 if (r != 0 && r != -ENOENT) {
804 derr << "Unexpected error checking size of ino 0x" << std::hex
805 << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
806 return r;
807 }
808 file_size = complete_objs * guessed_layout.object_size
809 + incomplete_size;
810 }
811 } else {
812 file_size = accum_res.ceiling_obj_size;
813 if (loaded_layout.pool_id < 0
814 || loaded_layout.object_size < accum_res.max_obj_size) {
815 // No layout loaded, or inconsistent layout, use default
816 guessed_layout = file_layout_t::get_default();
817 guessed_layout.pool_id = data_pool_id;
818 } else {
819 guessed_layout = loaded_layout;
820 }
821 }
822
823 // Santity checking backtrace ino against object name
824 if (have_backtrace && backtrace.ino != obj_name_ino) {
825 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
826 << " doesn't match object name ino 0x" << obj_name_ino
827 << std::dec << dendl;
828 have_backtrace = false;
829 }
830
831 InodeStore dentry;
832 build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry, symlink);
833
834 // Inject inode to the metadata pool
835 if (have_backtrace) {
836 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
837 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
838 /* Special case for strays: even if we have a good backtrace,
839 * don't put it in the stray dir, because while that would technically
840 * give it linkage it would still be invisible to the user */
841 r = driver->inject_lost_and_found(obj_name_ino, dentry);
842 if (r < 0) {
843 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
844 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
845 if (r == -EINVAL) {
846 dout(4) << "Use --force-corrupt to overwrite structures that "
847 "appear to be corrupt" << dendl;
848 }
849 }
850 } else {
851 /* Happy case: we will inject a named dentry for this inode */
852 r = driver->inject_with_backtrace(backtrace, dentry);
853 if (r < 0) {
854 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
855 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
856 if (r == -EINVAL) {
857 dout(4) << "Use --force-corrupt to overwrite structures that "
858 "appear to be corrupt" << dendl;
859 }
860 }
861 }
862 } else {
863 /* Backtrace-less case: we will inject a lost+found dentry */
864 r = driver->inject_lost_and_found(
865 obj_name_ino, dentry);
866 if (r < 0) {
867 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
868 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
869 if (r == -EINVAL) {
870 dout(4) << "Use --force-corrupt to overwrite structures that "
871 "appear to be corrupt" << dendl;
872 }
873 }
874 }
875
876 return r;
877 });
878 }
879
880 int DataScan::cleanup()
881 {
882 // We are looking for only zeroth object
883 //
884 return forall_objects(data_io, true, [this](
885 std::string const &oid,
886 uint64_t obj_name_ino,
887 uint64_t obj_name_offset) -> int
888 {
889 int r = 0;
890 r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
891 if (r < 0) {
892 dout(4) << "Error deleting accumulated metadata from '"
893 << oid << "': " << cpp_strerror(r) << dendl;
894 }
895 return r;
896 });
897 }
898
899 bool DataScan::valid_ino(inodeno_t ino) const
900 {
901 return (ino >= inodeno_t((1ull << 40)))
902 || (MDS_INO_IS_STRAY(ino))
903 || (MDS_INO_IS_MDSDIR(ino))
904 || ino == CEPH_INO_ROOT
905 || ino == CEPH_INO_CEPH;
906 }
907
908 int DataScan::scan_links()
909 {
910 MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
911 if (!metadata_driver) {
912 derr << "Unexpected --output-dir option for scan_links" << dendl;
913 return -EINVAL;
914 }
915
916 interval_set<uint64_t> used_inos;
917 map<inodeno_t, int> remote_links;
918 map<snapid_t, SnapInfo> snaps;
919 snapid_t last_snap = 1;
920 snapid_t snaprealm_v2_since = 2;
921
922 struct link_info_t {
923 inodeno_t dirino;
924 frag_t frag;
925 string name;
926 version_t version;
927 int nlink;
928 bool is_dir;
929 map<snapid_t, SnapInfo> snaps;
930 link_info_t() : version(0), nlink(0), is_dir(false) {}
931 link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::inode_const_ptr& i) :
932 dirino(di), frag(df), name(n),
933 version(i->version), nlink(i->nlink), is_dir(S_IFDIR & i->mode) {}
934 dirfrag_t dirfrag() const {
935 return dirfrag_t(dirino, frag);
936 }
937 };
938 map<inodeno_t, list<link_info_t> > dup_primaries;
939 map<inodeno_t, link_info_t> bad_nlink_inos;
940 map<inodeno_t, link_info_t> injected_inos;
941
942 map<dirfrag_t, set<string> > to_remove;
943
944 enum {
945 SCAN_INOS = 1,
946 CHECK_LINK,
947 };
948
949 for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
950 const librados::NObjectIterator it_end = metadata_io.nobjects_end();
951 for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
952 const std::string oid = it->get_oid();
953
954 uint64_t dir_ino = 0;
955 uint64_t frag_id = 0;
956 int r = parse_oid(oid, &dir_ino, &frag_id);
957 if (r == -EINVAL) {
958 dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
959 continue;
960 } else {
961 // parse_oid can only do 0 or -EINVAL
962 ceph_assert(r == 0);
963 }
964
965 if (!valid_ino(dir_ino)) {
966 dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
967 continue;
968 }
969
970 std::map<std::string, bufferlist> items;
971 r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
972 if (r < 0) {
973 derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
974 return r;
975 }
976
977 for (auto& p : items) {
978 auto q = p.second.cbegin();
979 string dname;
980 snapid_t last;
981 dentry_key_t::decode_helper(p.first, dname, last);
982
983 if (last != CEPH_NOSNAP) {
984 if (last > last_snap)
985 last_snap = last;
986 continue;
987 }
988
989 try {
990 snapid_t dnfirst;
991 decode(dnfirst, q);
992 if (dnfirst <= CEPH_MAXSNAP) {
993 if (dnfirst - 1 > last_snap)
994 last_snap = dnfirst - 1;
995 }
996 char dentry_type;
997 decode(dentry_type, q);
998 mempool::mds_co::string alternate_name;
999 if (dentry_type == 'I' || dentry_type == 'i') {
1000 InodeStore inode;
1001 if (dentry_type == 'i') {
1002 DECODE_START(2, q);
1003 if (struct_v >= 2)
1004 decode(alternate_name, q);
1005 inode.decode(q);
1006 DECODE_FINISH(q);
1007 } else {
1008 inode.decode_bare(q);
1009 }
1010
1011 inodeno_t ino = inode.inode->ino;
1012
1013 if (step == SCAN_INOS) {
1014 if (used_inos.contains(ino, 1)) {
1015 dup_primaries[ino].size();
1016 } else {
1017 used_inos.insert(ino);
1018 }
1019 } else if (step == CHECK_LINK) {
1020 sr_t srnode;
1021 if (inode.snap_blob.length()) {
1022 auto p = inode.snap_blob.cbegin();
1023 decode(srnode, p);
1024 for (auto it = srnode.snaps.begin();
1025 it != srnode.snaps.end(); ) {
1026 if (it->second.ino != ino ||
1027 it->second.snapid != it->first) {
1028 srnode.snaps.erase(it++);
1029 } else {
1030 ++it;
1031 }
1032 }
1033 if (!srnode.past_parents.empty()) {
1034 snapid_t last = srnode.past_parents.rbegin()->first;
1035 if (last + 1 > snaprealm_v2_since)
1036 snaprealm_v2_since = last + 1;
1037 }
1038 }
1039 if (inode.old_inodes && !inode.old_inodes->empty()) {
1040 auto _last_snap = inode.old_inodes->rbegin()->first;
1041 if (_last_snap > last_snap)
1042 last_snap = _last_snap;
1043 }
1044 auto q = dup_primaries.find(ino);
1045 if (q != dup_primaries.end()) {
1046 q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
1047 q->second.back().snaps.swap(srnode.snaps);
1048 } else {
1049 int nlink = 0;
1050 auto r = remote_links.find(ino);
1051 if (r != remote_links.end())
1052 nlink = r->second;
1053 if (!MDS_INO_IS_STRAY(dir_ino))
1054 nlink++;
1055 if (inode.inode->nlink != nlink) {
1056 derr << "Bad nlink on " << ino << " expected " << nlink
1057 << " has " << inode.inode->nlink << dendl;
1058 bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1059 bad_nlink_inos[ino].nlink = nlink;
1060 }
1061 snaps.insert(make_move_iterator(begin(srnode.snaps)),
1062 make_move_iterator(end(srnode.snaps)));
1063 }
1064 if (dnfirst == CEPH_NOSNAP)
1065 injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1066 }
1067 } else if (dentry_type == 'L' || dentry_type == 'l') {
1068 inodeno_t ino;
1069 unsigned char d_type;
1070 CDentry::decode_remote(dentry_type, ino, d_type, alternate_name, q);
1071
1072 if (step == SCAN_INOS) {
1073 remote_links[ino]++;
1074 } else if (step == CHECK_LINK) {
1075 if (!used_inos.contains(ino, 1)) {
1076 derr << "Bad remote link dentry 0x" << std::hex << dir_ino
1077 << std::dec << "/" << dname
1078 << ", ino " << ino << " not found" << dendl;
1079 std::string key;
1080 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1081 dn_key.encode(key);
1082 to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
1083 }
1084 }
1085 } else {
1086 derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
1087 << std::dec << "/" << dname << dendl;
1088 return -EINVAL;
1089 }
1090 } catch (const buffer::error &err) {
1091 derr << "Error decoding dentry 0x" << std::hex << dir_ino
1092 << std::dec << "/" << dname << dendl;
1093 return -EINVAL;
1094 }
1095 }
1096 }
1097 }
1098
1099 map<unsigned, uint64_t> max_ino_map;
1100 {
1101 auto prev_max_ino = (uint64_t)1 << 40;
1102 for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
1103 auto cur_max = p.get_start() + p.get_len() - 1;
1104 if (cur_max < prev_max_ino)
1105 continue; // system inodes
1106
1107 if ((prev_max_ino >> 40) != (cur_max >> 40)) {
1108 unsigned rank = (prev_max_ino >> 40) - 1;
1109 max_ino_map[rank] = prev_max_ino;
1110 } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
1111 unsigned rank = (p.get_start() >> 40) - 1;
1112 max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
1113 }
1114 prev_max_ino = cur_max;
1115 }
1116 unsigned rank = (prev_max_ino >> 40) - 1;
1117 max_ino_map[rank] = prev_max_ino;
1118 }
1119
1120 used_inos.clear();
1121
1122 for (auto& p : dup_primaries) {
1123 link_info_t newest;
1124 for (auto& q : p.second) {
1125 if (q.version > newest.version) {
1126 newest = q;
1127 } else if (q.version == newest.version &&
1128 !MDS_INO_IS_STRAY(q.dirino) &&
1129 MDS_INO_IS_STRAY(newest.dirino)) {
1130 newest = q;
1131 }
1132 }
1133
1134 for (auto& q : p.second) {
1135 // in the middle of dir fragmentation?
1136 if (newest.dirino == q.dirino && newest.name == q.name) {
1137 snaps.insert(make_move_iterator(begin(q.snaps)),
1138 make_move_iterator(end(q.snaps)));
1139 continue;
1140 }
1141
1142 std::string key;
1143 dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
1144 dn_key.encode(key);
1145 to_remove[q.dirfrag()].insert(key);
1146 derr << "Remove duplicated ino 0x" << p.first << " from "
1147 << q.dirfrag() << "/" << q.name << dendl;
1148 }
1149
1150 int nlink = 0;
1151 auto q = remote_links.find(p.first);
1152 if (q != remote_links.end())
1153 nlink = q->second;
1154 if (!MDS_INO_IS_STRAY(newest.dirino))
1155 nlink++;
1156
1157 if (nlink != newest.nlink) {
1158 derr << "Bad nlink on " << p.first << " expected " << nlink
1159 << " has " << newest.nlink << dendl;
1160 bad_nlink_inos[p.first] = newest;
1161 bad_nlink_inos[p.first].nlink = nlink;
1162 }
1163 }
1164 dup_primaries.clear();
1165 remote_links.clear();
1166
1167 {
1168 objecter->with_osdmap([&](const OSDMap& o) {
1169 for (auto p : data_pools) {
1170 const pg_pool_t *pi = o.get_pg_pool(p);
1171 if (!pi)
1172 continue;
1173 if (pi->snap_seq > last_snap)
1174 last_snap = pi->snap_seq;
1175 }
1176 });
1177
1178 if (!snaps.empty()) {
1179 if (snaps.rbegin()->first > last_snap)
1180 last_snap = snaps.rbegin()->first;
1181 }
1182 }
1183
1184 for (auto& p : to_remove) {
1185 object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
1186
1187 int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
1188 if (r != 0) {
1189 derr << "Error removing duplicated dentries from " << p.first << dendl;
1190 return r;
1191 }
1192 }
1193 to_remove.clear();
1194
1195 for (auto &p : bad_nlink_inos) {
1196 InodeStore inode;
1197 snapid_t first;
1198 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
1199 if (r < 0) {
1200 derr << "Unexpected error reading dentry "
1201 << p.second.dirfrag() << "/" << p.second.name
1202 << ": " << cpp_strerror(r) << dendl;
1203 return r;
1204 }
1205
1206 if (inode.inode->ino != p.first || inode.inode->version != p.second.version)
1207 continue;
1208
1209 inode.get_inode()->nlink = p.second.nlink;
1210 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
1211 if (r < 0)
1212 return r;
1213 }
1214
1215 for (auto &p : injected_inos) {
1216 InodeStore inode;
1217 snapid_t first;
1218 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
1219 if (r < 0) {
1220 derr << "Unexpected error reading dentry "
1221 << p.second.dirfrag() << "/" << p.second.name
1222 << ": " << cpp_strerror(r) << dendl;
1223 return r;
1224 }
1225
1226 if (first != CEPH_NOSNAP)
1227 continue;
1228
1229 first = last_snap + 1;
1230 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
1231 if (r < 0)
1232 return r;
1233 }
1234
1235 for (auto& p : max_ino_map) {
1236 InoTable inotable(nullptr);
1237 inotable.set_rank(p.first);
1238 bool dirty = false;
1239 int r = metadata_driver->load_table(&inotable);
1240 if (r < 0) {
1241 inotable.reset_state();
1242 dirty = true;
1243 }
1244 if (inotable.force_consume_to(p.second))
1245 dirty = true;
1246 if (dirty) {
1247 r = metadata_driver->save_table(&inotable);
1248 if (r < 0)
1249 return r;
1250 }
1251 }
1252
1253 {
1254 SnapServer snaptable;
1255 snaptable.set_rank(0);
1256 bool dirty = false;
1257 int r = metadata_driver->load_table(&snaptable);
1258 if (r < 0) {
1259 snaptable.reset_state();
1260 dirty = true;
1261 }
1262 if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
1263 dirty = true;
1264 if (dirty) {
1265 r = metadata_driver->save_table(&snaptable);
1266 if (r < 0)
1267 return r;
1268 }
1269 }
1270 return 0;
1271 }
1272
1273 int DataScan::scan_frags()
1274 {
1275 bool roots_present;
1276 int r = driver->check_roots(&roots_present);
1277 if (r != 0) {
1278 derr << "Unexpected error checking roots: '"
1279 << cpp_strerror(r) << "'" << dendl;
1280 return r;
1281 }
1282
1283 if (!roots_present) {
1284 std::cerr << "Some or all system inodes are absent. Run 'init' from "
1285 "one node before running 'scan_inodes'" << std::endl;
1286 return -EIO;
1287 }
1288
1289 return forall_objects(metadata_io, true, [this](
1290 std::string const &oid,
1291 uint64_t obj_name_ino,
1292 uint64_t obj_name_offset) -> int
1293 {
1294 int r = 0;
1295 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
1296 if (r != 0) {
1297 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
1298 return r;
1299 }
1300
1301 if (obj_name_ino < (1ULL << 40)) {
1302 // FIXME: we're skipping stray dirs here: if they're
1303 // orphaned then we should be resetting them some other
1304 // way
1305 dout(10) << "Skipping system ino " << obj_name_ino << dendl;
1306 return 0;
1307 }
1308
1309 AccumulateResult accum_res;
1310 inode_backtrace_t backtrace;
1311
1312 // Default to inherit layout (i.e. no explicit layout on dir) which is
1313 // expressed as a zeroed layout struct (see inode_t::has_layout)
1314 file_layout_t loaded_layout;
1315
1316 int parent_r = 0;
1317 bufferlist parent_bl;
1318 int layout_r = 0;
1319 bufferlist layout_bl;
1320 bufferlist op_bl;
1321
1322 librados::ObjectReadOperation op;
1323 op.getxattr("parent", &parent_bl, &parent_r);
1324 op.getxattr("layout", &layout_bl, &layout_r);
1325 r = metadata_io.operate(oid, &op, &op_bl);
1326 if (r != 0 && r != -ENODATA) {
1327 derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
1328 return r;
1329 }
1330
1331 if (parent_r != -ENODATA) {
1332 try {
1333 auto q = parent_bl.cbegin();
1334 backtrace.decode(q);
1335 } catch (buffer::error &e) {
1336 dout(4) << "Corrupt backtrace on '" << oid << "': " << e.what() << dendl;
1337 if (!force_corrupt) {
1338 return -EINVAL;
1339 } else {
1340 // Treat backtrace as absent: we'll inject into lost+found
1341 backtrace = inode_backtrace_t();
1342 }
1343 }
1344 }
1345
1346 if (layout_r != -ENODATA) {
1347 try {
1348 auto q = layout_bl.cbegin();
1349 decode(loaded_layout, q);
1350 } catch (buffer::error &e) {
1351 dout(4) << "Corrupt layout on '" << oid << "': " << e.what() << dendl;
1352 if (!force_corrupt) {
1353 return -EINVAL;
1354 }
1355 }
1356 }
1357
1358 bool have_backtrace = !(backtrace.ancestors.empty());
1359
1360 // Santity checking backtrace ino against object name
1361 if (have_backtrace && backtrace.ino != obj_name_ino) {
1362 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
1363 << " doesn't match object name ino 0x" << obj_name_ino
1364 << std::dec << dendl;
1365 have_backtrace = false;
1366 }
1367
1368 uint64_t fnode_version = 0;
1369 fnode_t fnode;
1370 r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
1371 if (r == -EINVAL) {
1372 derr << "Corrupt fnode on " << oid << dendl;
1373 if (force_corrupt) {
1374 fnode.fragstat.mtime = 0;
1375 fnode.fragstat.nfiles = 1;
1376 fnode.fragstat.nsubdirs = 0;
1377 fnode.accounted_fragstat = fnode.fragstat;
1378 } else {
1379 return r;
1380 }
1381 }
1382
1383 InodeStore dentry;
1384 build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
1385 loaded_layout, &dentry);
1386
1387 // Inject inode to the metadata pool
1388 if (have_backtrace) {
1389 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
1390 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
1391 /* Special case for strays: even if we have a good backtrace,
1392 * don't put it in the stray dir, because while that would technically
1393 * give it linkage it would still be invisible to the user */
1394 r = driver->inject_lost_and_found(obj_name_ino, dentry);
1395 if (r < 0) {
1396 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1397 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1398 if (r == -EINVAL) {
1399 dout(4) << "Use --force-corrupt to overwrite structures that "
1400 "appear to be corrupt" << dendl;
1401 }
1402 }
1403 } else {
1404 /* Happy case: we will inject a named dentry for this inode */
1405 r = driver->inject_with_backtrace(backtrace, dentry);
1406 if (r < 0) {
1407 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1408 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
1409 if (r == -EINVAL) {
1410 dout(4) << "Use --force-corrupt to overwrite structures that "
1411 "appear to be corrupt" << dendl;
1412 }
1413 }
1414 }
1415 } else {
1416 /* Backtrace-less case: we will inject a lost+found dentry */
1417 r = driver->inject_lost_and_found(
1418 obj_name_ino, dentry);
1419 if (r < 0) {
1420 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
1421 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1422 if (r == -EINVAL) {
1423 dout(4) << "Use --force-corrupt to overwrite structures that "
1424 "appear to be corrupt" << dendl;
1425 }
1426 }
1427 }
1428
1429 return r;
1430 });
1431 }
1432
1433 int MetadataTool::read_fnode(
1434 inodeno_t ino, frag_t frag, fnode_t *fnode,
1435 uint64_t *last_version)
1436 {
1437 ceph_assert(fnode != NULL);
1438
1439 object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
1440 bufferlist fnode_bl;
1441 int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
1442 *last_version = metadata_io.get_last_version();
1443 if (r < 0) {
1444 return r;
1445 }
1446
1447 auto old_fnode_iter = fnode_bl.cbegin();
1448 try {
1449 (*fnode).decode(old_fnode_iter);
1450 } catch (const buffer::error &err) {
1451 return -EINVAL;
1452 }
1453
1454 return 0;
1455 }
1456
1457 int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
1458 const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
1459 {
1460 ceph_assert(inode != NULL);
1461
1462 std::string key;
1463 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1464 dn_key.encode(key);
1465
1466 std::set<std::string> keys;
1467 keys.insert(key);
1468 std::map<std::string, bufferlist> vals;
1469 object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
1470 int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
1471 dout(20) << "oid=" << frag_oid.name
1472 << " dname=" << dname
1473 << " frag=" << frag
1474 << ", r=" << r << dendl;
1475 if (r < 0) {
1476 return r;
1477 }
1478
1479 if (vals.find(key) == vals.end()) {
1480 dout(20) << key << " not found in result" << dendl;
1481 return -ENOENT;
1482 }
1483
1484 try {
1485 auto q = vals[key].cbegin();
1486 snapid_t first;
1487 decode(first, q);
1488 char dentry_type;
1489 decode(dentry_type, q);
1490 if (dentry_type == 'I' || dentry_type == 'i') {
1491 if (dentry_type == 'i') {
1492 mempool::mds_co::string alternate_name;
1493
1494 DECODE_START(2, q);
1495 if (struct_v >= 2)
1496 decode(alternate_name, q);
1497 inode->decode(q);
1498 DECODE_FINISH(q);
1499 } else {
1500 inode->decode_bare(q);
1501 }
1502 } else {
1503 dout(20) << "dentry type '" << dentry_type << "': cannot"
1504 "read an inode out of that" << dendl;
1505 return -EINVAL;
1506 }
1507 if (dnfirst)
1508 *dnfirst = first;
1509 } catch (const buffer::error &err) {
1510 dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
1511 << std::dec << "/" << dname << dendl;
1512 return -EINVAL;
1513 }
1514
1515 return 0;
1516 }
1517
1518 int MetadataDriver::load_table(MDSTable *table)
1519 {
1520 object_t table_oid = table->get_object_name();
1521
1522 bufferlist table_bl;
1523 int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
1524 if (r < 0) {
1525 derr << "unable to read mds table '" << table_oid.name << "': "
1526 << cpp_strerror(r) << dendl;
1527 return r;
1528 }
1529
1530 try {
1531 version_t table_ver;
1532 auto p = table_bl.cbegin();
1533 decode(table_ver, p);
1534 table->decode_state(p);
1535 table->force_replay_version(table_ver);
1536 } catch (const buffer::error &err) {
1537 derr << "unable to decode mds table '" << table_oid.name << "': "
1538 << err.what() << dendl;
1539 return -EIO;
1540 }
1541 return 0;
1542 }
1543
1544 int MetadataDriver::save_table(MDSTable *table)
1545 {
1546 object_t table_oid = table->get_object_name();
1547
1548 bufferlist table_bl;
1549 encode(table->get_version(), table_bl);
1550 table->encode_state(table_bl);
1551 int r = metadata_io.write_full(table_oid.name, table_bl);
1552 if (r != 0) {
1553 derr << "error updating mds table " << table_oid.name
1554 << ": " << cpp_strerror(r) << dendl;
1555 return r;
1556 }
1557 return 0;
1558 }
1559
1560 int MetadataDriver::inject_lost_and_found(
1561 inodeno_t ino, const InodeStore &dentry)
1562 {
1563 // Create lost+found if doesn't exist
1564 bool created = false;
1565 int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
1566 if (r < 0) {
1567 return r;
1568 }
1569 InodeStore lf_ino;
1570 r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
1571 if (r == -ENOENT || r == -EINVAL) {
1572 if (r == -EINVAL && !force_corrupt) {
1573 return r;
1574 }
1575
1576 // To have a directory not specify a layout, give it zeros (see
1577 // inode_t::has_layout)
1578 file_layout_t inherit_layout;
1579
1580 // Construct LF inode
1581 frag_info_t fragstat;
1582 fragstat.nfiles = 1,
1583 build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
1584
1585 // Inject link to LF inode in the root dir
1586 r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
1587 if (r < 0) {
1588 return r;
1589 }
1590 } else {
1591 if (!(lf_ino.inode->mode & S_IFDIR)) {
1592 derr << "lost+found exists but is not a directory!" << dendl;
1593 // In this case we error out, and the user should do something about
1594 // this problem.
1595 return -EINVAL;
1596 }
1597 }
1598
1599 r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
1600 if (r < 0) {
1601 return r;
1602 }
1603
1604 const std::string dname = lost_found_dname(ino);
1605
1606 // Write dentry into lost+found dirfrag
1607 return inject_linkage(lf_ino.inode->ino, dname, frag_t(), dentry);
1608 }
1609
1610
1611 int MetadataDriver::get_frag_of(
1612 inodeno_t dirino,
1613 const std::string &target_dname,
1614 frag_t *result_ft)
1615 {
1616 object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
1617
1618 dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
1619
1620 // Find and load fragtree if existing dirfrag
1621 // ==========================================
1622 bool have_backtrace = false;
1623 bufferlist parent_bl;
1624 int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
1625 if (r == -ENODATA) {
1626 dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
1627 } else if (r < 0) {
1628 dout(4) << "Unexpected error on '" << root_frag_oid << "': "
1629 << cpp_strerror(r) << dendl;
1630 return r;
1631 }
1632
1633 // Deserialize backtrace
1634 inode_backtrace_t backtrace;
1635 if (parent_bl.length()) {
1636 try {
1637 auto q = parent_bl.cbegin();
1638 backtrace.decode(q);
1639 have_backtrace = true;
1640 } catch (buffer::error &e) {
1641 dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': "
1642 << e.what() << dendl;
1643 }
1644 }
1645
1646 if (!(have_backtrace && backtrace.ancestors.size())) {
1647 // Can't work out fragtree without a backtrace
1648 dout(4) << "No backtrace on '" << root_frag_oid
1649 << "': cannot determine fragtree" << dendl;
1650 return -ENOENT;
1651 }
1652
1653 // The parentage of dirino
1654 const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
1655
1656 // The inode of dirino's parent
1657 const inodeno_t parent_ino = bp.dirino;
1658
1659 // The dname of dirino in its parent.
1660 const std::string &parent_dname = bp.dname;
1661
1662 dout(20) << "got backtrace parent " << parent_ino << "/"
1663 << parent_dname << dendl;
1664
1665 // The primary dentry for dirino
1666 InodeStore existing_dentry;
1667
1668 // See if we can find ourselves in dirfrag zero of the parent: this
1669 // is a fast path that avoids needing to go further up the tree
1670 // if the parent isn't fragmented (worst case we would have to
1671 // go all the way to the root)
1672 r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
1673 if (r >= 0) {
1674 // Great, fast path: return the fragtree from here
1675 if (existing_dentry.inode->ino != dirino) {
1676 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
1677 << existing_dentry.inode->ino
1678 << " vs expected 0x" << dirino << std::dec << dendl;
1679 return -ENOENT;
1680 }
1681 dout(20) << "fast path, fragtree is "
1682 << existing_dentry.dirfragtree << dendl;
1683 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1684 dout(20) << "frag is " << *result_ft << dendl;
1685 return 0;
1686 } else if (r != -ENOENT) {
1687 // Dentry not present in 0th frag, must read parent's fragtree
1688 frag_t parent_frag;
1689 r = get_frag_of(parent_ino, parent_dname, &parent_frag);
1690 if (r == 0) {
1691 // We have the parent fragtree, so try again to load our dentry
1692 r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
1693 if (r >= 0) {
1694 // Got it!
1695 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1696 dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
1697 return 0;
1698 } else {
1699 if (r == -EINVAL || r == -ENOENT) {
1700 return -ENOENT; // dentry missing or corrupt, so frag is missing
1701 } else {
1702 return r;
1703 }
1704 }
1705 } else {
1706 // Couldn't resolve parent fragtree, so can't find ours.
1707 return r;
1708 }
1709 } else if (r == -EINVAL) {
1710 // Unreadable dentry, can't know the fragtree.
1711 return -ENOENT;
1712 } else {
1713 // Unexpected error, raise it
1714 return r;
1715 }
1716 }
1717
1718
1719 int MetadataDriver::inject_with_backtrace(
1720 const inode_backtrace_t &backtrace, const InodeStore &dentry)
1721
1722 {
1723
1724 // On dirfrags
1725 // ===========
1726 // In order to insert something into a directory, we first (ideally)
1727 // need to know the fragtree for the directory. Sometimes we can't
1728 // get that, in which case we just go ahead and insert it into
1729 // fragment zero for a good chance of that being the right thing
1730 // anyway (most moderate-sized dirs aren't fragmented!)
1731
1732 // On ancestry
1733 // ===========
1734 // My immediate ancestry should be correct, so if we can find that
1735 // directory's dirfrag then go inject it there. This works well
1736 // in the case that this inode's dentry was somehow lost and we
1737 // are recreating it, because the rest of the hierarchy
1738 // will probably still exist.
1739 //
1740 // It's more of a "better than nothing" approach when rebuilding
1741 // a whole tree, as backtraces will in general not be up to date
1742 // beyond the first parent, if anything in the trace was ever
1743 // moved after the file was created.
1744
1745 // On inode numbers
1746 // ================
1747 // The backtrace tells us inodes for each of the parents. If we are
1748 // creating those parent dirfrags, then there is a risk that somehow
1749 // the inode indicated here was also used for data (not a dirfrag) at
1750 // some stage. That would be a zany situation, and we don't check
1751 // for it here, because to do so would require extra IOs for everything
1752 // we inject, and anyway wouldn't guarantee that the inode number
1753 // wasn't in use in some dentry elsewhere in the metadata tree that
1754 // just happened not to have any data objects.
1755
1756 // On multiple workers touching the same traces
1757 // ============================================
1758 // When creating linkage for a directory, *only* create it if we are
1759 // also creating the object. That way, we might not manage to get the
1760 // *right* linkage for a directory, but at least we won't multiply link
1761 // it. We assume that if a root dirfrag exists for a directory, then
1762 // it is linked somewhere (i.e. that the metadata pool is not already
1763 // inconsistent).
1764 //
1765 // Making sure *that* is true is someone else's job! Probably someone
1766 // who is not going to run in parallel, so that they can self-consistently
1767 // look at versions and move things around as they go.
1768 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1769 // object, next run will fail to create linkage for the dirfrag object
1770 // and leave it orphaned.
1771
1772 inodeno_t ino = backtrace.ino;
1773 dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
1774 for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
1775 i != backtrace.ancestors.end(); ++i) {
1776 const inode_backpointer_t &backptr = *i;
1777 dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
1778 << "/" << backptr.dname << dendl;
1779
1780 // Examine root dirfrag for parent
1781 const inodeno_t parent_ino = backptr.dirino;
1782 const std::string dname = backptr.dname;
1783
1784 frag_t fragment;
1785 int r = get_frag_of(parent_ino, dname, &fragment);
1786 if (r == -ENOENT) {
1787 // Don't know fragment, fall back to assuming root
1788 dout(20) << "don't know fragment for 0x" << std::hex <<
1789 parent_ino << std::dec << "/" << dname << ", will insert to root"
1790 << dendl;
1791 }
1792
1793 // Find or create dirfrag
1794 // ======================
1795 bool created_dirfrag;
1796 r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
1797 if (r < 0) {
1798 return r;
1799 }
1800
1801 // Check if dentry already exists
1802 // ==============================
1803 InodeStore existing_dentry;
1804 r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
1805 bool write_dentry = false;
1806 if (r == -ENOENT || r == -EINVAL) {
1807 if (r == -EINVAL && !force_corrupt) {
1808 return r;
1809 }
1810 // Missing or corrupt dentry
1811 write_dentry = true;
1812 } else if (r < 0) {
1813 derr << "Unexpected error reading dentry 0x" << std::hex
1814 << parent_ino << std::dec << "/"
1815 << dname << ": " << cpp_strerror(r) << dendl;
1816 break;
1817 } else {
1818 // Dentry already present, does it link to me?
1819 if (existing_dentry.inode->ino == ino) {
1820 dout(20) << "Dentry 0x" << std::hex
1821 << parent_ino << std::dec << "/"
1822 << dname << " already exists and points to me" << dendl;
1823 } else {
1824 derr << "Dentry 0x" << std::hex
1825 << parent_ino << std::dec << "/"
1826 << dname << " already exists but points to 0x"
1827 << std::hex << existing_dentry.inode->ino << std::dec << dendl;
1828 // Fall back to lost+found!
1829 return inject_lost_and_found(backtrace.ino, dentry);
1830 }
1831 }
1832
1833 // Inject linkage
1834 // ==============
1835
1836 if (write_dentry) {
1837 if (i == backtrace.ancestors.begin()) {
1838 // This is the linkage for the file of interest
1839 dout(10) << "Linking inode 0x" << std::hex << ino
1840 << " at 0x" << parent_ino << "/" << dname << std::dec
1841 << " with size=" << dentry.inode->size << " bytes" << dendl;
1842
1843 r = inject_linkage(parent_ino, dname, fragment, dentry);
1844 } else {
1845 // This is the linkage for an ancestor directory
1846 InodeStore ancestor_dentry;
1847 auto inode = ancestor_dentry.get_inode();
1848 inode->mode = 0755 | S_IFDIR;
1849
1850 // Set nfiles to something non-zero, to fool any other code
1851 // that tries to ignore 'empty' directories. This won't be
1852 // accurate, but it should avoid functional issues.
1853
1854 inode->dirstat.nfiles = 1;
1855 inode->dir_layout.dl_dir_hash =
1856 g_conf()->mds_default_dir_hash;
1857
1858 inode->nlink = 1;
1859 inode->ino = ino;
1860 inode->uid = g_conf()->mds_root_ino_uid;
1861 inode->gid = g_conf()->mds_root_ino_gid;
1862 inode->version = 1;
1863 inode->backtrace_version = 1;
1864 r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
1865 }
1866
1867 if (r < 0) {
1868 return r;
1869 }
1870 }
1871
1872 if (!created_dirfrag) {
1873 // If the parent dirfrag already existed, then stop traversing the
1874 // backtrace: assume that the other ancestors already exist too. This
1875 // is an assumption rather than a truth, but it's a convenient way
1876 // to avoid the risk of creating multiply-linked directories while
1877 // injecting data. If there are in fact missing ancestors, this
1878 // should be fixed up using a separate tool scanning the metadata
1879 // pool.
1880 break;
1881 } else {
1882 // Proceed up the backtrace, creating parents
1883 ino = parent_ino;
1884 }
1885 }
1886
1887 return 0;
1888 }
1889
1890 int MetadataDriver::find_or_create_dirfrag(
1891 inodeno_t ino,
1892 frag_t fragment,
1893 bool *created)
1894 {
1895 ceph_assert(created != NULL);
1896
1897 fnode_t existing_fnode;
1898 *created = false;
1899
1900 uint64_t read_version = 0;
1901 int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
1902 dout(10) << "read_version = " << read_version << dendl;
1903
1904 if (r == -ENOENT || r == -EINVAL) {
1905 if (r == -EINVAL && !force_corrupt) {
1906 return r;
1907 }
1908
1909 // Missing or corrupt fnode, create afresh
1910 bufferlist fnode_bl;
1911 fnode_t blank_fnode;
1912 blank_fnode.version = 1;
1913 // mark it as non-empty
1914 blank_fnode.fragstat.nfiles = 1;
1915 blank_fnode.accounted_fragstat = blank_fnode.fragstat;
1916 blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
1917 blank_fnode.encode(fnode_bl);
1918
1919
1920 librados::ObjectWriteOperation op;
1921
1922 if (read_version) {
1923 ceph_assert(r == -EINVAL);
1924 // Case A: We must assert that the version isn't changed since we saw the object
1925 // was unreadable, to avoid the possibility of two data-scan processes
1926 // both creating the frag.
1927 op.assert_version(read_version);
1928 } else {
1929 ceph_assert(r == -ENOENT);
1930 // Case B: The object didn't exist in read_fnode, so while creating it we must
1931 // use an exclusive create to correctly populate *creating with
1932 // whether we created it ourselves or someone beat us to it.
1933 op.create(true);
1934 }
1935
1936 object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
1937 op.omap_set_header(fnode_bl);
1938 r = metadata_io.operate(frag_oid.name, &op);
1939 if (r == -EOVERFLOW || r == -EEXIST) {
1940 // Someone else wrote it (see case A above)
1941 dout(10) << "Dirfrag creation race: 0x" << std::hex
1942 << ino << " " << fragment << std::dec << dendl;
1943 *created = false;
1944 return 0;
1945 } else if (r < 0) {
1946 // We were unable to create or write it, error out
1947 derr << "Failed to create dirfrag 0x" << std::hex
1948 << ino << std::dec << ": " << cpp_strerror(r) << dendl;
1949 return r;
1950 } else {
1951 // Success: the dirfrag object now exists with a value header
1952 dout(10) << "Created dirfrag: 0x" << std::hex
1953 << ino << std::dec << dendl;
1954 *created = true;
1955 }
1956 } else if (r < 0) {
1957 derr << "Unexpected error reading dirfrag 0x" << std::hex
1958 << ino << std::dec << " : " << cpp_strerror(r) << dendl;
1959 return r;
1960 } else {
1961 dout(20) << "Dirfrag already exists: 0x" << std::hex
1962 << ino << " " << fragment << std::dec << dendl;
1963 }
1964
1965 return 0;
1966 }
1967
1968 int MetadataDriver::inject_linkage(
1969 inodeno_t dir_ino, const std::string &dname,
1970 const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
1971 {
1972 object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
1973
1974 std::string key;
1975 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1976 dn_key.encode(key);
1977
1978 bufferlist dentry_bl;
1979 encode(dnfirst, dentry_bl);
1980 encode('I', dentry_bl);
1981 inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1982
1983 // Write out
1984 std::map<std::string, bufferlist> vals;
1985 vals[key] = dentry_bl;
1986 int r = metadata_io.omap_set(frag_oid.name, vals);
1987 if (r != 0) {
1988 derr << "Error writing dentry 0x" << std::hex
1989 << dir_ino << std::dec << "/"
1990 << dname << ": " << cpp_strerror(r) << dendl;
1991 return r;
1992 } else {
1993 dout(20) << "Injected dentry 0x" << std::hex
1994 << dir_ino << "/" << dname << " pointing to 0x"
1995 << inode.inode->ino << std::dec << dendl;
1996 return 0;
1997 }
1998 }
1999
2000
2001 int MetadataDriver::init(
2002 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
2003 fs_cluster_id_t fscid)
2004 {
2005 if (metadata_pool_name.empty()) {
2006 auto fs = fsmap->get_filesystem(fscid);
2007 ceph_assert(fs != nullptr);
2008 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
2009
2010 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
2011 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
2012 if (r < 0) {
2013 derr << "Pool " << metadata_pool_id
2014 << " identified in MDS map not found in RADOS!" << dendl;
2015 return r;
2016 }
2017 dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
2018 } else {
2019 dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
2020 }
2021 return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
2022 }
2023
2024 int LocalFileDriver::init(
2025 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
2026 fs_cluster_id_t fscid)
2027 {
2028 return 0;
2029 }
2030
2031 int LocalFileDriver::inject_data(
2032 const std::string &file_path,
2033 uint64_t size,
2034 uint32_t chunk_size,
2035 inodeno_t ino)
2036 {
2037 // Scrape the file contents out of the data pool and into the
2038 // local filesystem
2039 std::fstream f;
2040 f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
2041
2042 for (uint64_t offset = 0; offset < size; offset += chunk_size) {
2043 bufferlist bl;
2044
2045 char buf[32];
2046 snprintf(buf, sizeof(buf),
2047 "%llx.%08llx",
2048 (unsigned long long)ino,
2049 (unsigned long long)(offset / chunk_size));
2050 std::string oid(buf);
2051
2052 int r = data_io.read(oid, bl, chunk_size, 0);
2053
2054 if (r <= 0 && r != -ENOENT) {
2055 derr << "error reading data object '" << oid << "': "
2056 << cpp_strerror(r) << dendl;
2057 f.close();
2058 return r;
2059 } else if (r >=0) {
2060
2061 f.seekp(offset);
2062 bl.write_stream(f);
2063 }
2064 }
2065 f.close();
2066
2067 return 0;
2068 }
2069
2070
2071 int LocalFileDriver::inject_with_backtrace(
2072 const inode_backtrace_t &bt,
2073 const InodeStore &dentry)
2074 {
2075 std::string path_builder = path;
2076
2077 // Iterate through backtrace creating directory parents
2078 std::vector<inode_backpointer_t>::const_reverse_iterator i;
2079 for (i = bt.ancestors.rbegin();
2080 i != bt.ancestors.rend(); ++i) {
2081
2082 const inode_backpointer_t &backptr = *i;
2083 path_builder += "/";
2084 path_builder += backptr.dname;
2085
2086 // Last entry is the filename itself
2087 bool is_file = (i + 1 == bt.ancestors.rend());
2088 if (is_file) {
2089 // FIXME: inject_data won't cope with interesting (i.e. striped)
2090 // layouts (need a librados-compatible Filer to read these)
2091 inject_data(path_builder, dentry.inode->size,
2092 dentry.inode->layout.object_size, bt.ino);
2093 } else {
2094 int r = mkdir(path_builder.c_str(), 0755);
2095 if (r != 0 && r != -EPERM) {
2096 derr << "error creating directory: '" << path_builder << "': "
2097 << cpp_strerror(r) << dendl;
2098 return r;
2099 }
2100 }
2101 }
2102
2103 return 0;
2104 }
2105
2106 int LocalFileDriver::inject_lost_and_found(
2107 inodeno_t ino,
2108 const InodeStore &dentry)
2109 {
2110 std::string lf_path = path + "/lost+found";
2111 int r = mkdir(lf_path.c_str(), 0755);
2112 if (r != 0 && r != -EPERM) {
2113 derr << "error creating directory: '" << lf_path << "': "
2114 << cpp_strerror(r) << dendl;
2115 return r;
2116 }
2117
2118 std::string file_path = lf_path + "/" + lost_found_dname(ino);
2119 return inject_data(file_path, dentry.inode->size,
2120 dentry.inode->layout.object_size, ino);
2121 }
2122
2123 int LocalFileDriver::init_roots(int64_t data_pool_id)
2124 {
2125 // Ensure that the path exists and is a directory
2126 bool exists;
2127 int r = check_roots(&exists);
2128 if (r != 0) {
2129 return r;
2130 }
2131
2132 if (exists) {
2133 return 0;
2134 } else {
2135 return ::mkdir(path.c_str(), 0755);
2136 }
2137 }
2138
2139 int LocalFileDriver::check_roots(bool *result)
2140 {
2141 // Check if the path exists and is a directory
2142 DIR *d = ::opendir(path.c_str());
2143 if (d == NULL) {
2144 *result = false;
2145 } else {
2146 int r = closedir(d);
2147 if (r != 0) {
2148 // Weird, but maybe possible with e.g. stale FD on NFS mount?
2149 *result = false;
2150 } else {
2151 *result = true;
2152 }
2153 }
2154
2155 return 0;
2156 }
2157
2158 void MetadataTool::build_file_dentry(
2159 inodeno_t ino, uint64_t file_size, time_t file_mtime,
2160 const file_layout_t &layout, InodeStore *out, std::string symlink)
2161 {
2162 ceph_assert(out != NULL);
2163
2164 auto inode = out->get_inode();
2165 if(!symlink.empty()) {
2166 inode->mode = 0777 | S_IFLNK;
2167 out->symlink = symlink;
2168 }
2169 else {
2170 inode->mode = 0500 | S_IFREG;
2171 }
2172
2173 inode->size = file_size;
2174 inode->max_size_ever = file_size;
2175 inode->mtime.tv.tv_sec = file_mtime;
2176 inode->atime.tv.tv_sec = file_mtime;
2177 inode->ctime.tv.tv_sec = file_mtime;
2178
2179 inode->layout = layout;
2180
2181 inode->truncate_seq = 1;
2182 inode->truncate_size = -1ull;
2183
2184 inode->inline_data.version = CEPH_INLINE_NONE;
2185
2186 inode->nlink = 1;
2187 inode->ino = ino;
2188 inode->version = 1;
2189 inode->backtrace_version = 1;
2190 inode->uid = g_conf()->mds_root_ino_uid;
2191 inode->gid = g_conf()->mds_root_ino_gid;
2192 }
2193
2194 void MetadataTool::build_dir_dentry(
2195 inodeno_t ino, const frag_info_t &fragstat,
2196 const file_layout_t &layout, InodeStore *out)
2197 {
2198 ceph_assert(out != NULL);
2199
2200 auto inode = out->get_inode();
2201 inode->mode = 0755 | S_IFDIR;
2202 inode->dirstat = fragstat;
2203 inode->mtime.tv.tv_sec = fragstat.mtime;
2204 inode->atime.tv.tv_sec = fragstat.mtime;
2205 inode->ctime.tv.tv_sec = fragstat.mtime;
2206
2207 inode->layout = layout;
2208 inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
2209
2210 inode->truncate_seq = 1;
2211 inode->truncate_size = -1ull;
2212
2213 inode->inline_data.version = CEPH_INLINE_NONE;
2214
2215 inode->nlink = 1;
2216 inode->ino = ino;
2217 inode->version = 1;
2218 inode->backtrace_version = 1;
2219 inode->uid = g_conf()->mds_root_ino_uid;
2220 inode->gid = g_conf()->mds_root_ino_gid;
2221 }
2222