]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/cephfs/DataScan.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / tools / cephfs / DataScan.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
31f18b77 15#include "include/compat.h"
7c673cae
FG
16#include "common/errno.h"
17#include "common/ceph_argparse.h"
18#include <fstream>
19#include "include/util.h"
20
21#include "mds/CInode.h"
11fdf7f2
TL
22#include "mds/InoTable.h"
23#include "mds/SnapServer.h"
7c673cae
FG
24#include "cls/cephfs/cls_cephfs_client.h"
25
26#include "PgFiles.h"
27#include "DataScan.h"
28#include "include/compat.h"
29
30#define dout_context g_ceph_context
31#define dout_subsys ceph_subsys_mds
32#undef dout_prefix
33#define dout_prefix *_dout << "datascan." << __func__ << ": "
34
35void DataScan::usage()
36{
37 std::cout << "Usage: \n"
38 << " cephfs-data-scan init [--force-init]\n"
39 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
40 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
41 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
42 << " cephfs-data-scan scan_links\n"
43 << "\n"
44 << " --force-corrupt: overrite apparently corrupt structures\n"
45 << " --force-init: write root inodes even if they exist\n"
46 << " --force-pool: use data pool even if it is not in FSMap\n"
47 << " --worker_m: Maximum number of workers\n"
48 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
49 << "\n"
50 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
51 << " cephfs-data-scan cleanup <data pool name>\n"
52 << std::endl;
53
54 generic_client_usage();
55}
56
57bool DataScan::parse_kwarg(
58 const std::vector<const char*> &args,
59 std::vector<const char *>::const_iterator &i,
60 int *r)
61{
62 if (i + 1 == args.end()) {
63 return false;
64 }
65
66 const std::string arg(*i);
67 const std::string val(*(i + 1));
68
69 if (arg == std::string("--output-dir")) {
70 if (driver != NULL) {
71 derr << "Unexpected --output-dir: output already selected!" << dendl;
72 *r = -EINVAL;
73 return false;
74 }
75 dout(4) << "Using local file output to '" << val << "'" << dendl;
76 driver = new LocalFileDriver(val, data_io);
77 return true;
78 } else if (arg == std::string("--worker_n")) {
79 std::string err;
80 n = strict_strtoll(val.c_str(), 10, &err);
81 if (!err.empty()) {
82 std::cerr << "Invalid worker number '" << val << "'" << std::endl;
83 *r = -EINVAL;
84 return false;
85 }
86 return true;
87 } else if (arg == std::string("--worker_m")) {
88 std::string err;
89 m = strict_strtoll(val.c_str(), 10, &err);
90 if (!err.empty()) {
91 std::cerr << "Invalid worker count '" << val << "'" << std::endl;
92 *r = -EINVAL;
93 return false;
94 }
95 return true;
96 } else if (arg == std::string("--filter-tag")) {
97 filter_tag = val;
98 dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
99 return true;
100 } else if (arg == std::string("--filesystem")) {
101 std::shared_ptr<const Filesystem> fs;
102 *r = fsmap->parse_filesystem(val, &fs);
103 if (*r != 0) {
104 std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
105 return false;
106 }
107 fscid = fs->fscid;
108 return true;
109 } else if (arg == std::string("--alternate-pool")) {
110 metadata_pool_name = val;
111 return true;
112 } else {
113 return false;
114 }
115}
116
117bool DataScan::parse_arg(
118 const std::vector<const char*> &args,
119 std::vector<const char *>::const_iterator &i)
120{
121 const std::string arg(*i);
122 if (arg == "--force-pool") {
123 force_pool = true;
124 return true;
125 } else if (arg == "--force-corrupt") {
126 force_corrupt = true;
127 return true;
128 } else if (arg == "--force-init") {
129 force_init = true;
130 return true;
131 } else {
132 return false;
133 }
134}
135
136int DataScan::main(const std::vector<const char*> &args)
137{
138 // Parse args
139 // ==========
140 if (args.size() < 1) {
11fdf7f2 141 cerr << "missing position argument" << std::endl;
7c673cae
FG
142 return -EINVAL;
143 }
144
145 // Common RADOS init: open metadata pool
146 // =====================================
147 librados::Rados rados;
148 int r = rados.init_with_context(g_ceph_context);
149 if (r < 0) {
150 derr << "RADOS unavailable" << dendl;
151 return r;
152 }
153
154 std::string const &command = args[0];
155 std::string data_pool_name;
156
157 std::string pg_files_path;
158 std::set<pg_t> pg_files_pgs;
159
160 // Consume any known --key val or --flag arguments
161 for (std::vector<const char *>::const_iterator i = args.begin() + 1;
162 i != args.end(); ++i) {
163 if (parse_kwarg(args, i, &r)) {
164 // Skip the kwarg value field
165 ++i;
166 continue;
167 } else if (r) {
168 return r;
169 }
170
171 if (parse_arg(args, i)) {
172 continue;
173 }
174
175 // Trailing positional argument
176 if (i + 1 == args.end() &&
177 (command == "scan_inodes"
178 || command == "scan_extents"
179 || command == "cleanup")) {
180 data_pool_name = *i;
181 continue;
182 }
183
184 if (command == "pg_files") {
185 if (i == args.begin() + 1) {
186 pg_files_path = *i;
187 continue;
188 } else {
189 pg_t pg;
190 bool parsed = pg.parse(*i);
191 if (!parsed) {
192 std::cerr << "Invalid PG '" << *i << "'" << std::endl;
193 return -EINVAL;
194 } else {
195 pg_files_pgs.insert(pg);
196 continue;
197 }
198 }
199
200 }
201
202 // Fall through: unhandled
203 std::cerr << "Unknown argument '" << *i << "'" << std::endl;
204 return -EINVAL;
205 }
206
207 // If caller didn't specify a namespace, try to pick
208 // one if only one exists
209 if (fscid == FS_CLUSTER_ID_NONE) {
210 if (fsmap->filesystem_count() == 1) {
211 fscid = fsmap->get_filesystem()->fscid;
212 } else {
213 std::cerr << "Specify a filesystem with --filesystem" << std::endl;
214 return -EINVAL;
215 }
216 }
217 auto fs = fsmap->get_filesystem(fscid);
11fdf7f2 218 ceph_assert(fs != nullptr);
7c673cae
FG
219
220 // Default to output to metadata pool
221 if (driver == NULL) {
222 driver = new MetadataDriver();
223 driver->set_force_corrupt(force_corrupt);
224 driver->set_force_init(force_init);
225 dout(4) << "Using metadata pool output" << dendl;
226 }
227
228 dout(4) << "connecting to RADOS..." << dendl;
229 r = rados.connect();
230 if (r < 0) {
231 std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
232 << std::endl;
233 return r;
234 }
235
236 r = driver->init(rados, metadata_pool_name, fsmap, fscid);
237 if (r < 0) {
238 return r;
239 }
240
241 if (command == "pg_files") {
242 auto pge = PgFiles(objecter, pg_files_pgs);
243 pge.init();
244 return pge.scan_path(pg_files_path);
245 }
246
247 // Initialize data_io for those commands that need it
248 if (command == "scan_inodes" ||
249 command == "scan_extents" ||
250 command == "cleanup") {
251 if (data_pool_name.empty()) {
252 std::cerr << "Data pool not specified" << std::endl;
7c673cae
FG
253 return -EINVAL;
254 }
255
256 data_pool_id = rados.pool_lookup(data_pool_name.c_str());
257 if (data_pool_id < 0) {
258 std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
259 return -ENOENT;
260 } else {
261 dout(4) << "data pool '" << data_pool_name
262 << "' has ID " << data_pool_id << dendl;
263 }
264
265 if (!fs->mds_map.is_data_pool(data_pool_id)) {
266 std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
267 "CephFS data pool!" << std::endl;
268 if (!force_pool) {
269 std::cerr << "Use --force-pool to continue" << std::endl;
270 return -EINVAL;
271 }
272 }
273
274 dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
275 r = rados.ioctx_create(data_pool_name.c_str(), data_io);
276 if (r != 0) {
277 return r;
278 }
279 }
280
281 // Initialize metadata_io from MDSMap for scan_frags
282 if (command == "scan_frags" || command == "scan_links") {
283 const auto fs = fsmap->get_filesystem(fscid);
284 if (fs == nullptr) {
285 std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
286 return -ENOENT;
287 }
288 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
289
290 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
291 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
292 if (r < 0) {
293 std::cerr << "Pool " << metadata_pool_id
294 << " identified in MDS map not found in RADOS!" << std::endl;
295 return r;
296 }
297
298 r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
299 if (r != 0) {
300 return r;
301 }
11fdf7f2
TL
302
303 data_pools = fs->mds_map.get_data_pools();
7c673cae
FG
304 }
305
306 // Finally, dispatch command
307 if (command == "scan_inodes") {
308 return scan_inodes();
309 } else if (command == "scan_extents") {
310 return scan_extents();
311 } else if (command == "scan_frags") {
312 return scan_frags();
313 } else if (command == "scan_links") {
314 return scan_links();
315 } else if (command == "cleanup") {
316 return cleanup();
317 } else if (command == "init") {
318 return driver->init_roots(fs->mds_map.get_first_data_pool());
319 } else {
320 std::cerr << "Unknown command '" << command << "'" << std::endl;
321 return -EINVAL;
322 }
323}
324
325int MetadataDriver::inject_unlinked_inode(
326 inodeno_t inono, int mode, int64_t data_pool_id)
327{
328 const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
329
330 // Skip if exists
331 bool already_exists = false;
332 int r = root_exists(inono, &already_exists);
333 if (r) {
334 return r;
335 }
336 if (already_exists && !force_init) {
337 std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
338 " exists, skipping create. Use --force-init to overwrite"
339 " the existing object." << std::endl;
340 return 0;
341 }
342
343 // Compose
344 InodeStore inode;
345 inode.inode.ino = inono;
346 inode.inode.version = 1;
347 inode.inode.xattr_version = 1;
348 inode.inode.mode = 0500 | mode;
349 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
350 // (we won't actually give the *correct* dirstat here though)
351 inode.inode.dirstat.nfiles = 1;
352
353 inode.inode.ctime =
354 inode.inode.mtime = ceph_clock_now();
355 inode.inode.nlink = 1;
356 inode.inode.truncate_size = -1ull;
357 inode.inode.truncate_seq = 1;
11fdf7f2
TL
358 inode.inode.uid = g_conf()->mds_root_ino_uid;
359 inode.inode.gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
360
361 // Force layout to default: should we let users override this so that
362 // they don't have to mount the filesystem to correct it?
363 inode.inode.layout = file_layout_t::get_default();
364 inode.inode.layout.pool_id = data_pool_id;
11fdf7f2 365 inode.inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae
FG
366
367 // Assume that we will get our stats wrong, and that we may
368 // be ignoring dirfrags that exist
369 inode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
370
11fdf7f2
TL
371 if (inono == MDS_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
372 sr_t srnode;
373 srnode.seq = 1;
374 encode(srnode, inode.snap_blob);
375 }
376
7c673cae
FG
377 // Serialize
378 bufferlist inode_bl;
11fdf7f2 379 encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
7c673cae
FG
380 inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
381
382 // Write
383 r = metadata_io.write_full(oid.name, inode_bl);
384 if (r != 0) {
385 derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
386 return r;
387 }
388
389 return r;
390}
391
392int MetadataDriver::root_exists(inodeno_t ino, bool *result)
393{
394 object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
395 uint64_t size;
396 time_t mtime;
397 int r = metadata_io.stat(oid.name, &size, &mtime);
398 if (r == -ENOENT) {
399 *result = false;
400 return 0;
401 } else if (r < 0) {
402 return r;
403 }
404
405 *result = true;
406 return 0;
407}
408
409int MetadataDriver::init_roots(int64_t data_pool_id)
410{
411 int r = 0;
412 r = inject_unlinked_inode(MDS_INO_ROOT, S_IFDIR|0755, data_pool_id);
413 if (r != 0) {
414 return r;
415 }
416 r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
417 if (r != 0) {
418 return r;
419 }
420 bool created = false;
421 r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
422 if (r != 0) {
423 return r;
424 }
425
426 return 0;
427}
428
429int MetadataDriver::check_roots(bool *result)
430{
431 int r;
432 r = root_exists(MDS_INO_ROOT, result);
433 if (r != 0) {
434 return r;
435 }
436 if (!*result) {
437 return 0;
438 }
439
440 r = root_exists(MDS_INO_MDSDIR(0), result);
441 if (r != 0) {
442 return r;
443 }
444 if (!*result) {
445 return 0;
446 }
447
448 return 0;
449}
450
451/**
452 * Stages:
453 *
454 * SERIAL init
455 * 0. Create root inodes if they don't exist
456 * PARALLEL scan_extents
457 * 1. Size and mtime recovery: scan ALL objects, and update 0th
458 * objects with max size and max mtime seen.
459 * PARALLEL scan_inodes
460 * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
461 * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
462 * or rstats at this stage. Inodes without backtraces go into
463 * lost+found
464 * TODO: SERIAL "recover stats"
465 * 3. Dirfrag statistics: depth first traverse into metadata tree,
466 * rebuilding dir sizes.
467 * TODO PARALLEL "clean up"
468 * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
469 * anything onto them) and remove any of the xattrs that we
470 * used for accumulating.
471 */
472
473
474int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
475{
476 if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
477 return -EINVAL;
478 }
479
480 std::string err;
481 std::string inode_str = oid.substr(0, oid.find("."));
482 *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
483 if (!err.empty()) {
484 return -EINVAL;
485 }
486
487 std::string pos_string = oid.substr(oid.find(".") + 1);
488 *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
489 if (!err.empty()) {
490 return -EINVAL;
491 }
492
493 return 0;
494}
495
496
497int DataScan::scan_extents()
498{
499 return forall_objects(data_io, false, [this](
500 std::string const &oid,
501 uint64_t obj_name_ino,
502 uint64_t obj_name_offset) -> int
503 {
504 // Read size
505 uint64_t size;
506 time_t mtime;
507 int r = data_io.stat(oid, &size, &mtime);
508 dout(10) << "handling object " << obj_name_ino
509 << "." << obj_name_offset << dendl;
510 if (r != 0) {
511 dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
512 return r;
513 }
514
515 // I need to keep track of
516 // * The highest object ID seen
517 // * The size of the highest object ID seen
518 // * The largest object seen
519 //
520 // Given those things, I can later infer the object chunking
521 // size, the offset of the last object (chunk size * highest ID seen)
522 // and the actual size (offset of last object + size of highest ID seen)
523 //
524 // This logic doesn't take account of striping.
525 r = ClsCephFSClient::accumulate_inode_metadata(
526 data_io,
527 obj_name_ino,
528 obj_name_offset,
529 size,
530 mtime);
531 if (r < 0) {
532 derr << "Failed to accumulate metadata data from '"
533 << oid << "': " << cpp_strerror(r) << dendl;
534 return r;
535 }
536
537 return r;
538 });
539}
540
541int DataScan::probe_filter(librados::IoCtx &ioctx)
542{
543 bufferlist filter_bl;
544 ClsCephFSClient::build_tag_filter("test", &filter_bl);
545 librados::ObjectCursor range_i;
546 librados::ObjectCursor range_end;
547
548 std::vector<librados::ObjectItem> tmp_result;
549 librados::ObjectCursor tmp_next;
550 int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
551 1, filter_bl, &tmp_result, &tmp_next);
552
553 return r >= 0;
554}
555
556int DataScan::forall_objects(
557 librados::IoCtx &ioctx,
558 bool untagged_only,
559 std::function<int(std::string, uint64_t, uint64_t)> handler
560 )
561{
562 librados::ObjectCursor range_i;
563 librados::ObjectCursor range_end;
564 ioctx.object_list_slice(
565 ioctx.object_list_begin(),
566 ioctx.object_list_end(),
567 n,
568 m,
569 &range_i,
570 &range_end);
571
572
573 bufferlist filter_bl;
574
575 bool legacy_filtering = false;
576 if (untagged_only) {
577 // probe to deal with older OSDs that don't support
578 // the cephfs pgls filtering mode
579 legacy_filtering = !probe_filter(ioctx);
580 if (!legacy_filtering) {
581 ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
582 }
583 }
584
585 int r = 0;
586 while(range_i < range_end) {
587 std::vector<librados::ObjectItem> result;
588 int r = ioctx.object_list(range_i, range_end, 1,
589 filter_bl, &result, &range_i);
590 if (r < 0) {
591 derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
592 return r;
593 }
594
595 for (const auto &i : result) {
596 const std::string &oid = i.oid;
597 uint64_t obj_name_ino = 0;
598 uint64_t obj_name_offset = 0;
599 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
600 if (r != 0) {
601 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
602 continue;
603 }
604
605 if (untagged_only && legacy_filtering) {
606 dout(20) << "Applying filter to " << oid << dendl;
607
608 // We are only interested in 0th objects during this phase: we touched
609 // the other objects during scan_extents
610 if (obj_name_offset != 0) {
611 dout(20) << "Non-zeroth object" << dendl;
612 continue;
613 }
614
615 bufferlist scrub_tag_bl;
616 int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
617 if (r >= 0) {
618 std::string read_tag;
11fdf7f2 619 auto q = scrub_tag_bl.cbegin();
7c673cae 620 try {
11fdf7f2 621 decode(read_tag, q);
7c673cae
FG
622 if (read_tag == filter_tag) {
623 dout(20) << "skipping " << oid << " because it has the filter_tag"
624 << dendl;
625 continue;
626 }
627 } catch (const buffer::error &err) {
628 }
629 dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
630 } else {
631 dout(20) << "no tag read (" << r << ")" << dendl;
632 }
633
634 } else if (untagged_only) {
11fdf7f2 635 ceph_assert(obj_name_offset == 0);
7c673cae
FG
636 dout(20) << "OSD matched oid " << oid << dendl;
637 }
638
639 int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
640 if (r == 0 && this_oid_r < 0) {
641 r = this_oid_r;
642 }
643 }
644 }
645
646 return r;
647}
648
649int DataScan::scan_inodes()
650{
651 bool roots_present;
652 int r = driver->check_roots(&roots_present);
653 if (r != 0) {
654 derr << "Unexpected error checking roots: '"
655 << cpp_strerror(r) << "'" << dendl;
656 return r;
657 }
658
659 if (!roots_present) {
660 std::cerr << "Some or all system inodes are absent. Run 'init' from "
661 "one node before running 'scan_inodes'" << std::endl;
662 return -EIO;
663 }
664
665 return forall_objects(data_io, true, [this](
666 std::string const &oid,
667 uint64_t obj_name_ino,
668 uint64_t obj_name_offset) -> int
669 {
670 int r = 0;
671
672 dout(10) << "handling object "
673 << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
674 << dendl;
675
676 AccumulateResult accum_res;
677 inode_backtrace_t backtrace;
678 file_layout_t loaded_layout = file_layout_t::get_default();
679 r = ClsCephFSClient::fetch_inode_accumulate_result(
680 data_io, oid, &backtrace, &loaded_layout, &accum_res);
681
682 if (r == -EINVAL) {
683 dout(4) << "Accumulated metadata missing from '"
684 << oid << ", did you run scan_extents?" << dendl;
685 return r;
686 } else if (r < 0) {
687 dout(4) << "Unexpected error loading accumulated metadata from '"
688 << oid << "': " << cpp_strerror(r) << dendl;
689 // FIXME: this creates situation where if a client has a corrupt
690 // backtrace/layout, we will fail to inject it. We should (optionally)
691 // proceed if the backtrace/layout is corrupt but we have valid
692 // accumulated metadata.
693 return r;
694 }
695
696 const time_t file_mtime = accum_res.max_mtime;
697 uint64_t file_size = 0;
698 bool have_backtrace = !(backtrace.ancestors.empty());
699
700 // This is the layout we will use for injection, populated either
701 // from loaded_layout or from best guesses
702 file_layout_t guessed_layout;
703 guessed_layout.pool_id = data_pool_id;
704
705 // Calculate file_size, guess the layout
706 if (accum_res.ceiling_obj_index > 0) {
707 uint32_t chunk_size = file_layout_t::get_default().object_size;
708 // When there are multiple objects, the largest object probably
709 // indicates the chunk size. But not necessarily, because files
710 // can be sparse. Only make this assumption if size seen
711 // is a power of two, as chunk sizes typically are.
712 if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
713 chunk_size = accum_res.max_obj_size;
714 }
715
716 if (loaded_layout.pool_id == -1) {
717 // If no stashed layout was found, guess it
718 guessed_layout.object_size = chunk_size;
719 guessed_layout.stripe_unit = chunk_size;
720 guessed_layout.stripe_count = 1;
721 } else if (!loaded_layout.is_valid() ||
722 loaded_layout.object_size < accum_res.max_obj_size) {
723 // If the max size seen exceeds what the stashed layout claims, then
724 // disbelieve it. Guess instead. Same for invalid layouts on disk.
725 dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
726 << std::dec << ", ignoring in favour of best guess" << dendl;
727 guessed_layout.object_size = chunk_size;
728 guessed_layout.stripe_unit = chunk_size;
729 guessed_layout.stripe_count = 1;
730 } else {
731 // We have a stashed layout that we can't disprove, so apply it
732 guessed_layout = loaded_layout;
733 dout(20) << "loaded layout from xattr:"
734 << " os: " << guessed_layout.object_size
735 << " sc: " << guessed_layout.stripe_count
736 << " su: " << guessed_layout.stripe_unit
737 << dendl;
738 // User might have transplanted files from a pool with a different
739 // ID, so whatever the loaded_layout says, we'll force the injected
740 // layout to point to the pool we really read from
741 guessed_layout.pool_id = data_pool_id;
742 }
743
744 if (guessed_layout.stripe_count == 1) {
745 // Unstriped file: simple chunking
746 file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
747 + accum_res.ceiling_obj_size;
748 } else {
749 // Striped file: need to examine the last stripe_count objects
750 // in the file to determine the size.
751
752 // How many complete (i.e. not last stripe) objects?
753 uint64_t complete_objs = 0;
754 if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
755 complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
756 } else {
757 complete_objs = 0;
758 }
759
760 // How many potentially-short objects (i.e. last stripe set) objects?
761 uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
762
763 dout(10) << "calculating striped size from complete objs: "
764 << complete_objs << ", partial objs: " << partial_objs
765 << dendl;
766
767 // Maximum amount of data that may be in the incomplete objects
768 uint64_t incomplete_size = 0;
769
770 // For each short object, calculate the max file size within it
771 // and accumulate the maximum
772 for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
773 char buf[60];
774 snprintf(buf, sizeof(buf), "%llx.%08llx",
775 (long long unsigned)obj_name_ino, (long long unsigned)i);
776
777 uint64_t osize(0);
778 time_t omtime(0);
779 r = data_io.stat(std::string(buf), &osize, &omtime);
780 if (r == 0) {
781 if (osize > 0) {
782 // Upper bound within this object
783 uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
784 * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
785 + (i % guessed_layout.stripe_count)
786 * guessed_layout.stripe_unit + (osize - 1)
787 % guessed_layout.stripe_unit + 1;
11fdf7f2 788 incomplete_size = std::max(incomplete_size, upper_size);
7c673cae
FG
789 }
790 } else if (r == -ENOENT) {
791 // Absent object, treat as size 0 and ignore.
792 } else {
793 // Unexpected error, carry r to outer scope for handling.
794 break;
795 }
796 }
797 if (r != 0 && r != -ENOENT) {
798 derr << "Unexpected error checking size of ino 0x" << std::hex
799 << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
800 return r;
801 }
802 file_size = complete_objs * guessed_layout.object_size
803 + incomplete_size;
804 }
805 } else {
806 file_size = accum_res.ceiling_obj_size;
807 if (loaded_layout.pool_id < 0
808 || loaded_layout.object_size < accum_res.max_obj_size) {
809 // No layout loaded, or inconsistent layout, use default
810 guessed_layout = file_layout_t::get_default();
811 guessed_layout.pool_id = data_pool_id;
812 } else {
813 guessed_layout = loaded_layout;
814 }
815 }
816
817 // Santity checking backtrace ino against object name
818 if (have_backtrace && backtrace.ino != obj_name_ino) {
819 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
820 << " doesn't match object name ino 0x" << obj_name_ino
821 << std::dec << dendl;
822 have_backtrace = false;
823 }
824
825 InodeStore dentry;
826 build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
827
828 // Inject inode to the metadata pool
829 if (have_backtrace) {
830 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
831 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
832 /* Special case for strays: even if we have a good backtrace,
833 * don't put it in the stray dir, because while that would technically
834 * give it linkage it would still be invisible to the user */
835 r = driver->inject_lost_and_found(obj_name_ino, dentry);
836 if (r < 0) {
837 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
838 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
839 if (r == -EINVAL) {
840 dout(4) << "Use --force-corrupt to overwrite structures that "
841 "appear to be corrupt" << dendl;
842 }
843 }
844 } else {
845 /* Happy case: we will inject a named dentry for this inode */
846 r = driver->inject_with_backtrace(backtrace, dentry);
847 if (r < 0) {
848 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
849 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
850 if (r == -EINVAL) {
851 dout(4) << "Use --force-corrupt to overwrite structures that "
852 "appear to be corrupt" << dendl;
853 }
854 }
855 }
856 } else {
857 /* Backtrace-less case: we will inject a lost+found dentry */
858 r = driver->inject_lost_and_found(
859 obj_name_ino, dentry);
860 if (r < 0) {
861 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
862 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
863 if (r == -EINVAL) {
864 dout(4) << "Use --force-corrupt to overwrite structures that "
865 "appear to be corrupt" << dendl;
866 }
867 }
868 }
869
870 return r;
871 });
872}
873
874int DataScan::cleanup()
875{
876 // We are looking for only zeroth object
877 //
878 return forall_objects(data_io, true, [this](
879 std::string const &oid,
880 uint64_t obj_name_ino,
881 uint64_t obj_name_offset) -> int
882 {
883 int r = 0;
884 r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
885 if (r < 0) {
886 dout(4) << "Error deleting accumulated metadata from '"
887 << oid << "': " << cpp_strerror(r) << dendl;
888 }
889 return r;
890 });
891}
892
893bool DataScan::valid_ino(inodeno_t ino) const
894{
895 return (ino >= inodeno_t((1ull << 40)))
896 || (MDS_INO_IS_STRAY(ino))
897 || (MDS_INO_IS_MDSDIR(ino))
898 || ino == MDS_INO_ROOT
899 || ino == MDS_INO_CEPH;
900}
901
902int DataScan::scan_links()
903{
904 MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
905 if (!metadata_driver) {
906 derr << "Unexpected --output-dir option for scan_links" << dendl;
907 return -EINVAL;
908 }
909
91327a77 910 interval_set<uint64_t> used_inos;
7c673cae 911 map<inodeno_t, int> remote_links;
11fdf7f2
TL
912 map<snapid_t, SnapInfo> snaps;
913 snapid_t last_snap = 1;
914 snapid_t snaprealm_v2_since = 2;
7c673cae
FG
915
916 struct link_info_t {
917 inodeno_t dirino;
918 frag_t frag;
919 string name;
920 version_t version;
921 int nlink;
922 bool is_dir;
11fdf7f2 923 map<snapid_t, SnapInfo> snaps;
7c673cae 924 link_info_t() : version(0), nlink(0), is_dir(false) {}
94b18763 925 link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::mempool_inode& i) :
7c673cae
FG
926 dirino(di), frag(df), name(n),
927 version(i.version), nlink(i.nlink), is_dir(S_IFDIR & i.mode) {}
928 dirfrag_t dirfrag() const {
929 return dirfrag_t(dirino, frag);
930 }
931 };
932 map<inodeno_t, list<link_info_t> > dup_primaries;
933 map<inodeno_t, link_info_t> bad_nlink_inos;
934
935 map<dirfrag_t, set<string> > to_remove;
936
937 enum {
938 SCAN_INOS = 1,
939 CHECK_LINK,
940 };
941
942 for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
943 const librados::NObjectIterator it_end = metadata_io.nobjects_end();
944 for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
945 const std::string oid = it->get_oid();
946
947 uint64_t dir_ino = 0;
948 uint64_t frag_id = 0;
949 int r = parse_oid(oid, &dir_ino, &frag_id);
950 if (r == -EINVAL) {
951 dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
952 continue;
953 } else {
954 // parse_oid can only do 0 or -EINVAL
11fdf7f2 955 ceph_assert(r == 0);
7c673cae
FG
956 }
957
958 if (!valid_ino(dir_ino)) {
959 dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
960 continue;
961 }
962
963 std::map<std::string, bufferlist> items;
964 r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
965 if (r < 0) {
966 derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
967 return r;
968 }
969
970 for (auto& p : items) {
11fdf7f2 971 auto q = p.second.cbegin();
7c673cae
FG
972 string dname;
973 snapid_t last;
974 dentry_key_t::decode_helper(p.first, dname, last);
975
976 if (last != CEPH_NOSNAP)
977 continue;
978
979 try {
980 snapid_t dnfirst;
11fdf7f2
TL
981 decode(dnfirst, q);
982 if (dnfirst <= CEPH_MAXSNAP) {
983 if (dnfirst - 1 > last_snap)
984 last_snap = dnfirst - 1;
985 }
7c673cae 986 char dentry_type;
11fdf7f2 987 decode(dentry_type, q);
7c673cae
FG
988 if (dentry_type == 'I') {
989 InodeStore inode;
990 inode.decode_bare(q);
991 inodeno_t ino = inode.inode.ino;
992
993 if (step == SCAN_INOS) {
994 if (used_inos.contains(ino, 1)) {
995 dup_primaries[ino].size();
996 } else {
997 used_inos.insert(ino);
998 }
999 } else if (step == CHECK_LINK) {
11fdf7f2
TL
1000 sr_t srnode;
1001 if (inode.snap_blob.length()) {
1002 auto p = inode.snap_blob.cbegin();
1003 decode(srnode, p);
1004 for (auto it = srnode.snaps.begin();
1005 it != srnode.snaps.end(); ) {
1006 if (it->second.ino != ino ||
1007 it->second.snapid != it->first) {
1008 srnode.snaps.erase(it++);
1009 } else {
1010 ++it;
1011 }
1012 }
1013 if (!srnode.past_parents.empty()) {
1014 snapid_t last = srnode.past_parents.rbegin()->first;
1015 if (last + 1 > snaprealm_v2_since)
1016 snaprealm_v2_since = last + 1;
1017 }
1018 }
1019 if (!inode.old_inodes.empty()) {
1020 if (inode.old_inodes.rbegin()->first > last_snap)
1021 last_snap = inode.old_inodes.rbegin()->first;
1022 }
7c673cae
FG
1023 auto q = dup_primaries.find(ino);
1024 if (q != dup_primaries.end()) {
1025 q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
11fdf7f2 1026 q->second.back().snaps.swap(srnode.snaps);
7c673cae
FG
1027 } else {
1028 int nlink = 0;
1029 auto r = remote_links.find(ino);
1030 if (r != remote_links.end())
1031 nlink = r->second;
1032 if (!MDS_INO_IS_STRAY(dir_ino))
1033 nlink++;
1034 if (inode.inode.nlink != nlink) {
1035 derr << "Bad nlink on " << ino << " expected " << nlink
1036 << " has " << inode.inode.nlink << dendl;
1037 bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1038 bad_nlink_inos[ino].nlink = nlink;
1039 }
11fdf7f2
TL
1040 snaps.insert(make_move_iterator(begin(srnode.snaps)),
1041 make_move_iterator(end(srnode.snaps)));
7c673cae
FG
1042 }
1043 }
1044 } else if (dentry_type == 'L') {
1045 inodeno_t ino;
1046 unsigned char d_type;
11fdf7f2
TL
1047 decode(ino, q);
1048 decode(d_type, q);
7c673cae
FG
1049
1050 if (step == SCAN_INOS) {
1051 remote_links[ino]++;
1052 } else if (step == CHECK_LINK) {
1053 if (!used_inos.contains(ino, 1)) {
1054 derr << "Bad remote link dentry 0x" << std::hex << dir_ino
1055 << std::dec << "/" << dname
1056 << ", ino " << ino << " not found" << dendl;
1057 std::string key;
1058 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1059 dn_key.encode(key);
1060 to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
1061 }
1062 }
1063 } else {
1064 derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
1065 << std::dec << "/" << dname << dendl;
1066 return -EINVAL;
1067 }
1068 } catch (const buffer::error &err) {
1069 derr << "Error decoding dentry 0x" << std::hex << dir_ino
1070 << std::dec << "/" << dname << dendl;
1071 return -EINVAL;
1072 }
1073 }
1074 }
1075 }
91327a77
AA
1076
1077 map<unsigned, uint64_t> max_ino_map;
1078 {
1079 auto prev_max_ino = (uint64_t)1 << 40;
1080 for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
1081 auto cur_max = p.get_start() + p.get_len() - 1;
1082 if (cur_max < prev_max_ino)
1083 continue; // system inodes
1084
1085 if ((prev_max_ino >> 40) != (cur_max >> 40)) {
1086 unsigned rank = (prev_max_ino >> 40) - 1;
1087 max_ino_map[rank] = prev_max_ino;
1088 } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
1089 unsigned rank = (p.get_start() >> 40) - 1;
1090 max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
1091 }
1092 prev_max_ino = cur_max;
1093 }
1094 unsigned rank = (prev_max_ino >> 40) - 1;
1095 max_ino_map[rank] = prev_max_ino;
1096 }
1097
7c673cae
FG
1098 used_inos.clear();
1099
1100 for (auto& p : dup_primaries) {
1101 link_info_t newest;
1102 for (auto& q : p.second) {
1103 if (q.version > newest.version) {
1104 newest = q;
1105 } else if (q.version == newest.version &&
1106 !MDS_INO_IS_STRAY(q.dirino) &&
1107 MDS_INO_IS_STRAY(newest.dirino)) {
1108 newest = q;
1109 }
1110 }
1111
1112 for (auto& q : p.second) {
1113 // in the middle of dir fragmentation?
11fdf7f2
TL
1114 if (newest.dirino == q.dirino && newest.name == q.name) {
1115 snaps.insert(make_move_iterator(begin(q.snaps)),
1116 make_move_iterator(end(q.snaps)));
7c673cae 1117 continue;
11fdf7f2 1118 }
7c673cae
FG
1119
1120 std::string key;
1121 dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
1122 dn_key.encode(key);
1123 to_remove[q.dirfrag()].insert(key);
1124 derr << "Remove duplicated ino 0x" << p.first << " from "
1125 << q.dirfrag() << "/" << q.name << dendl;
1126 }
1127
1128 int nlink = 0;
1129 auto q = remote_links.find(p.first);
1130 if (q != remote_links.end())
1131 nlink = q->second;
1132 if (!MDS_INO_IS_STRAY(newest.dirino))
1133 nlink++;
1134
1135 if (nlink != newest.nlink) {
1136 derr << "Bad nlink on " << p.first << " expected " << nlink
1137 << " has " << newest.nlink << dendl;
1138 bad_nlink_inos[p.first] = newest;
1139 bad_nlink_inos[p.first].nlink = nlink;
1140 }
1141 }
1142 dup_primaries.clear();
1143 remote_links.clear();
1144
1145 for (auto& p : to_remove) {
1146 object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
1147
1148 int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
1149 if (r != 0) {
1150 derr << "Error removing duplicated dentries from " << p.first << dendl;
1151 return r;
1152 }
1153 }
1154 to_remove.clear();
1155
1156 for (auto &p : bad_nlink_inos) {
1157 InodeStore inode;
1158 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode);
1159 if (r < 0) {
1160 derr << "Unexpected error reading dentry "
1161 << p.second.dirfrag() << "/" << p.second.name
1162 << ": " << cpp_strerror(r) << dendl;
1163 return r;
1164 }
1165
1166 if (inode.inode.ino != p.first || inode.inode.version != p.second.version)
1167 continue;
1168
1169 inode.inode.nlink = p.second.nlink;
1170 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode);
1171 if (r < 0)
1172 return r;
1173 }
1174
91327a77 1175 for (auto& p : max_ino_map) {
11fdf7f2
TL
1176 InoTable inotable(nullptr);
1177 inotable.set_rank(p.first);
1178 bool dirty = false;
1179 int r = metadata_driver->load_table(&inotable);
1180 if (r < 0) {
1181 inotable.reset_state();
1182 dirty = true;
1183 }
1184 if (inotable.force_consume_to(p.second))
1185 dirty = true;
1186 if (dirty) {
1187 r = metadata_driver->save_table(&inotable);
1188 if (r < 0)
1189 return r;
1190 }
91327a77
AA
1191 }
1192
11fdf7f2
TL
1193 {
1194 objecter->with_osdmap([&](const OSDMap& o) {
1195 for (auto p : data_pools) {
1196 const pg_pool_t *pi = o.get_pg_pool(p);
1197 if (!pi)
1198 continue;
1199 if (pi->snap_seq > last_snap)
1200 last_snap = pi->snap_seq;
1201 }
1202 });
1203
1204 if (!snaps.empty()) {
1205 if (snaps.rbegin()->first > last_snap)
1206 last_snap = snaps.rbegin()->first;
1207 }
1208
1209 SnapServer snaptable;
1210 snaptable.set_rank(0);
1211 bool dirty = false;
1212 int r = metadata_driver->load_table(&snaptable);
1213 if (r < 0) {
1214 snaptable.reset_state();
1215 dirty = true;
1216 }
1217 if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
1218 dirty = true;
1219 if (dirty) {
1220 r = metadata_driver->save_table(&snaptable);
1221 if (r < 0)
1222 return r;
1223 }
1224 }
7c673cae
FG
1225 return 0;
1226}
1227
1228int DataScan::scan_frags()
1229{
1230 bool roots_present;
1231 int r = driver->check_roots(&roots_present);
1232 if (r != 0) {
1233 derr << "Unexpected error checking roots: '"
1234 << cpp_strerror(r) << "'" << dendl;
1235 return r;
1236 }
1237
1238 if (!roots_present) {
1239 std::cerr << "Some or all system inodes are absent. Run 'init' from "
1240 "one node before running 'scan_inodes'" << std::endl;
1241 return -EIO;
1242 }
1243
1244 return forall_objects(metadata_io, true, [this](
1245 std::string const &oid,
1246 uint64_t obj_name_ino,
1247 uint64_t obj_name_offset) -> int
1248 {
1249 int r = 0;
1250 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
1251 if (r != 0) {
1252 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
1253 return r;
1254 }
1255
1256 if (obj_name_ino < (1ULL << 40)) {
1257 // FIXME: we're skipping stray dirs here: if they're
1258 // orphaned then we should be resetting them some other
1259 // way
1260 dout(10) << "Skipping system ino " << obj_name_ino << dendl;
1261 return 0;
1262 }
1263
1264 AccumulateResult accum_res;
1265 inode_backtrace_t backtrace;
1266
1267 // Default to inherit layout (i.e. no explicit layout on dir) which is
1268 // expressed as a zeroed layout struct (see inode_t::has_layout)
1269 file_layout_t loaded_layout;
1270
1271 int parent_r = 0;
1272 bufferlist parent_bl;
1273 int layout_r = 0;
1274 bufferlist layout_bl;
1275 bufferlist op_bl;
1276
1277 librados::ObjectReadOperation op;
1278 op.getxattr("parent", &parent_bl, &parent_r);
1279 op.getxattr("layout", &layout_bl, &layout_r);
1280 r = metadata_io.operate(oid, &op, &op_bl);
1281 if (r != 0 && r != -ENODATA) {
1282 derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
1283 return r;
1284 }
1285
1286 if (parent_r != -ENODATA) {
1287 try {
11fdf7f2 1288 auto q = parent_bl.cbegin();
7c673cae
FG
1289 backtrace.decode(q);
1290 } catch (buffer::error &e) {
1291 dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl;
1292 if (!force_corrupt) {
1293 return -EINVAL;
1294 } else {
1295 // Treat backtrace as absent: we'll inject into lost+found
1296 backtrace = inode_backtrace_t();
1297 }
1298 }
1299 }
1300
1301 if (layout_r != -ENODATA) {
1302 try {
11fdf7f2
TL
1303 auto q = layout_bl.cbegin();
1304 decode(loaded_layout, q);
7c673cae
FG
1305 } catch (buffer::error &e) {
1306 dout(4) << "Corrupt layout on '" << oid << "': " << e << dendl;
1307 if (!force_corrupt) {
1308 return -EINVAL;
1309 }
1310 }
1311 }
1312
1313 bool have_backtrace = !(backtrace.ancestors.empty());
1314
1315 // Santity checking backtrace ino against object name
1316 if (have_backtrace && backtrace.ino != obj_name_ino) {
1317 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
1318 << " doesn't match object name ino 0x" << obj_name_ino
1319 << std::dec << dendl;
1320 have_backtrace = false;
1321 }
1322
1323 uint64_t fnode_version = 0;
1324 fnode_t fnode;
1325 r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
1326 if (r == -EINVAL) {
1327 derr << "Corrupt fnode on " << oid << dendl;
1328 if (force_corrupt) {
1329 fnode.fragstat.mtime = 0;
1330 fnode.fragstat.nfiles = 1;
1331 fnode.fragstat.nsubdirs = 0;
1332 fnode.accounted_fragstat = fnode.fragstat;
1333 } else {
1334 return r;
1335 }
1336 }
1337
1338 InodeStore dentry;
1339 build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
1340 loaded_layout, &dentry);
1341
1342 // Inject inode to the metadata pool
1343 if (have_backtrace) {
1344 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
1345 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
1346 /* Special case for strays: even if we have a good backtrace,
1347 * don't put it in the stray dir, because while that would technically
1348 * give it linkage it would still be invisible to the user */
1349 r = driver->inject_lost_and_found(obj_name_ino, dentry);
1350 if (r < 0) {
1351 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1352 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1353 if (r == -EINVAL) {
1354 dout(4) << "Use --force-corrupt to overwrite structures that "
1355 "appear to be corrupt" << dendl;
1356 }
1357 }
1358 } else {
1359 /* Happy case: we will inject a named dentry for this inode */
1360 r = driver->inject_with_backtrace(backtrace, dentry);
1361 if (r < 0) {
1362 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1363 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
1364 if (r == -EINVAL) {
1365 dout(4) << "Use --force-corrupt to overwrite structures that "
1366 "appear to be corrupt" << dendl;
1367 }
1368 }
1369 }
1370 } else {
1371 /* Backtrace-less case: we will inject a lost+found dentry */
1372 r = driver->inject_lost_and_found(
1373 obj_name_ino, dentry);
1374 if (r < 0) {
1375 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
1376 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1377 if (r == -EINVAL) {
1378 dout(4) << "Use --force-corrupt to overwrite structures that "
1379 "appear to be corrupt" << dendl;
1380 }
1381 }
1382 }
1383
1384 return r;
1385 });
1386}
1387
1388int MetadataTool::read_fnode(
1389 inodeno_t ino, frag_t frag, fnode_t *fnode,
1390 uint64_t *last_version)
1391{
11fdf7f2 1392 ceph_assert(fnode != NULL);
7c673cae
FG
1393
1394 object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
1395 bufferlist fnode_bl;
1396 int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
1397 *last_version = metadata_io.get_last_version();
1398 if (r < 0) {
1399 return r;
1400 }
1401
11fdf7f2 1402 auto old_fnode_iter = fnode_bl.cbegin();
7c673cae
FG
1403 try {
1404 (*fnode).decode(old_fnode_iter);
1405 } catch (const buffer::error &err) {
1406 return -EINVAL;
1407 }
1408
1409 return 0;
1410}
1411
1412int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
1413 const std::string &dname, InodeStore *inode)
1414{
11fdf7f2 1415 ceph_assert(inode != NULL);
7c673cae
FG
1416
1417
1418 std::string key;
1419 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1420 dn_key.encode(key);
1421
1422 std::set<std::string> keys;
1423 keys.insert(key);
1424 std::map<std::string, bufferlist> vals;
1425 object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
1426 int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
1427 dout(20) << "oid=" << frag_oid.name
1428 << " dname=" << dname
1429 << " frag=" << frag
1430 << ", r=" << r << dendl;
1431 if (r < 0) {
1432 return r;
1433 }
1434
1435 if (vals.find(key) == vals.end()) {
1436 dout(20) << key << " not found in result" << dendl;
1437 return -ENOENT;
1438 }
1439
1440 try {
11fdf7f2 1441 auto q = vals[key].cbegin();
7c673cae 1442 snapid_t dnfirst;
11fdf7f2 1443 decode(dnfirst, q);
7c673cae 1444 char dentry_type;
11fdf7f2 1445 decode(dentry_type, q);
7c673cae
FG
1446 if (dentry_type == 'I') {
1447 inode->decode_bare(q);
1448 return 0;
1449 } else {
1450 dout(20) << "dentry type '" << dentry_type << "': cannot"
1451 "read an inode out of that" << dendl;
1452 return -EINVAL;
1453 }
1454 } catch (const buffer::error &err) {
1455 dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
1456 << std::dec << "/" << dname << dendl;
1457 return -EINVAL;
1458 }
1459
1460 return 0;
1461}
1462
11fdf7f2
TL
1463int MetadataDriver::load_table(MDSTable *table)
1464{
1465 object_t table_oid = table->get_object_name();
1466
1467 bufferlist table_bl;
1468 int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
1469 if (r < 0) {
1470 derr << "unable to read mds table '" << table_oid.name << "': "
1471 << cpp_strerror(r) << dendl;
1472 return r;
1473 }
1474
1475 try {
1476 version_t table_ver;
1477 auto p = table_bl.cbegin();
1478 decode(table_ver, p);
1479 table->decode_state(p);
1480 table->force_replay_version(table_ver);
1481 } catch (const buffer::error &err) {
1482 derr << "unable to decode mds table '" << table_oid.name << "': "
1483 << err.what() << dendl;
1484 return -EIO;
1485 }
1486 return 0;
1487}
1488
1489int MetadataDriver::save_table(MDSTable *table)
1490{
1491 object_t table_oid = table->get_object_name();
1492
1493 bufferlist table_bl;
1494 encode(table->get_version(), table_bl);
1495 table->encode_state(table_bl);
1496 int r = metadata_io.write_full(table_oid.name, table_bl);
1497 if (r != 0) {
1498 derr << "error updating mds table " << table_oid.name
1499 << ": " << cpp_strerror(r) << dendl;
1500 return r;
1501 }
1502 return 0;
1503}
1504
7c673cae
FG
1505int MetadataDriver::inject_lost_and_found(
1506 inodeno_t ino, const InodeStore &dentry)
1507{
1508 // Create lost+found if doesn't exist
1509 bool created = false;
1510 int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
1511 if (r < 0) {
1512 return r;
1513 }
1514 InodeStore lf_ino;
1515 r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
1516 if (r == -ENOENT || r == -EINVAL) {
1517 if (r == -EINVAL && !force_corrupt) {
1518 return r;
1519 }
1520
1521 // To have a directory not specify a layout, give it zeros (see
1522 // inode_t::has_layout)
1523 file_layout_t inherit_layout;
1524
1525 // Construct LF inode
1526 frag_info_t fragstat;
1527 fragstat.nfiles = 1,
1528 build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
1529
1530 // Inject link to LF inode in the root dir
1531 r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
1532 if (r < 0) {
1533 return r;
1534 }
1535 } else {
1536 if (!(lf_ino.inode.mode & S_IFDIR)) {
1537 derr << "lost+found exists but is not a directory!" << dendl;
1538 // In this case we error out, and the user should do something about
1539 // this problem.
1540 return -EINVAL;
1541 }
1542 }
1543
1544 r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
1545 if (r < 0) {
1546 return r;
1547 }
1548
1549 InodeStore recovered_ino;
1550
1551
1552 const std::string dname = lost_found_dname(ino);
1553
1554 // Write dentry into lost+found dirfrag
1555 return inject_linkage(lf_ino.inode.ino, dname, frag_t(), dentry);
1556}
1557
1558
1559int MetadataDriver::get_frag_of(
1560 inodeno_t dirino,
1561 const std::string &target_dname,
1562 frag_t *result_ft)
1563{
1564 object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
1565
1566 dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
1567
1568 // Find and load fragtree if existing dirfrag
1569 // ==========================================
1570 bool have_backtrace = false;
1571 bufferlist parent_bl;
1572 int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
1573 if (r == -ENODATA) {
1574 dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
1575 } else if (r < 0) {
1576 dout(4) << "Unexpected error on '" << root_frag_oid << "': "
1577 << cpp_strerror(r) << dendl;
1578 return r;
1579 }
1580
1581 // Deserialize backtrace
1582 inode_backtrace_t backtrace;
1583 if (parent_bl.length()) {
1584 try {
11fdf7f2 1585 auto q = parent_bl.cbegin();
7c673cae
FG
1586 backtrace.decode(q);
1587 have_backtrace = true;
1588 } catch (buffer::error &e) {
1589 dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " << e << dendl;
1590 }
1591 }
1592
1593 if (!(have_backtrace && backtrace.ancestors.size())) {
1594 // Can't work out fragtree without a backtrace
1595 dout(4) << "No backtrace on '" << root_frag_oid
1596 << "': cannot determine fragtree" << dendl;
1597 return -ENOENT;
1598 }
1599
1600 // The parentage of dirino
1601 const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
1602
1603 // The inode of dirino's parent
1604 const inodeno_t parent_ino = bp.dirino;
1605
1606 // The dname of dirino in its parent.
1607 const std::string &parent_dname = bp.dname;
1608
1609 dout(20) << "got backtrace parent " << parent_ino << "/"
1610 << parent_dname << dendl;
1611
1612 // The primary dentry for dirino
1613 InodeStore existing_dentry;
1614
1615 // See if we can find ourselves in dirfrag zero of the parent: this
1616 // is a fast path that avoids needing to go further up the tree
1617 // if the parent isn't fragmented (worst case we would have to
1618 // go all the way to the root)
1619 r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
1620 if (r >= 0) {
1621 // Great, fast path: return the fragtree from here
1622 if (existing_dentry.inode.ino != dirino) {
1623 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
1624 << existing_dentry.inode.ino
1625 << " vs expected 0x" << dirino << std::dec << dendl;
1626 return -ENOENT;
1627 }
1628 dout(20) << "fast path, fragtree is "
1629 << existing_dentry.dirfragtree << dendl;
1630 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1631 dout(20) << "frag is " << *result_ft << dendl;
1632 return 0;
1633 } else if (r != -ENOENT) {
1634 // Dentry not present in 0th frag, must read parent's fragtree
1635 frag_t parent_frag;
1636 r = get_frag_of(parent_ino, parent_dname, &parent_frag);
1637 if (r == 0) {
1638 // We have the parent fragtree, so try again to load our dentry
1639 r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
1640 if (r >= 0) {
1641 // Got it!
1642 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1643 dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
1644 return 0;
1645 } else {
1646 if (r == -EINVAL || r == -ENOENT) {
1647 return -ENOENT; // dentry missing or corrupt, so frag is missing
1648 } else {
1649 return r;
1650 }
1651 }
1652 } else {
1653 // Couldn't resolve parent fragtree, so can't find ours.
1654 return r;
1655 }
1656 } else if (r == -EINVAL) {
1657 // Unreadable dentry, can't know the fragtree.
1658 return -ENOENT;
1659 } else {
1660 // Unexpected error, raise it
1661 return r;
1662 }
1663}
1664
1665
1666int MetadataDriver::inject_with_backtrace(
1667 const inode_backtrace_t &backtrace, const InodeStore &dentry)
1668
1669{
1670
1671 // On dirfrags
1672 // ===========
1673 // In order to insert something into a directory, we first (ideally)
1674 // need to know the fragtree for the directory. Sometimes we can't
1675 // get that, in which case we just go ahead and insert it into
1676 // fragment zero for a good chance of that being the right thing
1677 // anyway (most moderate-sized dirs aren't fragmented!)
1678
1679 // On ancestry
1680 // ===========
1681 // My immediate ancestry should be correct, so if we can find that
1682 // directory's dirfrag then go inject it there. This works well
1683 // in the case that this inode's dentry was somehow lost and we
1684 // are recreating it, because the rest of the hierarchy
1685 // will probably still exist.
1686 //
1687 // It's more of a "better than nothing" approach when rebuilding
1688 // a whole tree, as backtraces will in general not be up to date
1689 // beyond the first parent, if anything in the trace was ever
1690 // moved after the file was created.
1691
1692 // On inode numbers
1693 // ================
1694 // The backtrace tells us inodes for each of the parents. If we are
1695 // creating those parent dirfrags, then there is a risk that somehow
1696 // the inode indicated here was also used for data (not a dirfrag) at
1697 // some stage. That would be a zany situation, and we don't check
1698 // for it here, because to do so would require extra IOs for everything
1699 // we inject, and anyway wouldn't guarantee that the inode number
1700 // wasn't in use in some dentry elsewhere in the metadata tree that
1701 // just happened not to have any data objects.
1702
1703 // On multiple workers touching the same traces
1704 // ============================================
1705 // When creating linkage for a directory, *only* create it if we are
1706 // also creating the object. That way, we might not manage to get the
1707 // *right* linkage for a directory, but at least we won't multiply link
1708 // it. We assume that if a root dirfrag exists for a directory, then
1709 // it is linked somewhere (i.e. that the metadata pool is not already
1710 // inconsistent).
1711 //
1712 // Making sure *that* is true is someone else's job! Probably someone
1713 // who is not going to run in parallel, so that they can self-consistently
1714 // look at versions and move things around as they go.
1715 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1716 // object, next run will fail to create linkage for the dirfrag object
1717 // and leave it orphaned.
1718
1719 inodeno_t ino = backtrace.ino;
1720 dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
1721 for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
1722 i != backtrace.ancestors.end(); ++i) {
1723 const inode_backpointer_t &backptr = *i;
1724 dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
1725 << "/" << backptr.dname << dendl;
1726
1727 // Examine root dirfrag for parent
1728 const inodeno_t parent_ino = backptr.dirino;
1729 const std::string dname = backptr.dname;
1730
1731 frag_t fragment;
1732 int r = get_frag_of(parent_ino, dname, &fragment);
1733 if (r == -ENOENT) {
1734 // Don't know fragment, fall back to assuming root
1735 dout(20) << "don't know fragment for 0x" << std::hex <<
1736 parent_ino << std::dec << "/" << dname << ", will insert to root"
1737 << dendl;
1738 }
1739
1740 // Find or create dirfrag
1741 // ======================
1742 bool created_dirfrag;
1743 r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
1744 if (r < 0) {
1745 return r;
1746 }
1747
1748 // Check if dentry already exists
1749 // ==============================
1750 InodeStore existing_dentry;
1751 r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
1752 bool write_dentry = false;
1753 if (r == -ENOENT || r == -EINVAL) {
1754 if (r == -EINVAL && !force_corrupt) {
1755 return r;
1756 }
1757 // Missing or corrupt dentry
1758 write_dentry = true;
1759 } else if (r < 0) {
1760 derr << "Unexpected error reading dentry 0x" << std::hex
1761 << parent_ino << std::dec << "/"
1762 << dname << ": " << cpp_strerror(r) << dendl;
1763 break;
1764 } else {
1765 // Dentry already present, does it link to me?
1766 if (existing_dentry.inode.ino == ino) {
1767 dout(20) << "Dentry 0x" << std::hex
1768 << parent_ino << std::dec << "/"
1769 << dname << " already exists and points to me" << dendl;
1770 } else {
1771 derr << "Dentry 0x" << std::hex
1772 << parent_ino << std::dec << "/"
1773 << dname << " already exists but points to 0x"
1774 << std::hex << existing_dentry.inode.ino << std::dec << dendl;
1775 // Fall back to lost+found!
1776 return inject_lost_and_found(backtrace.ino, dentry);
1777 }
1778 }
1779
1780 // Inject linkage
1781 // ==============
1782
1783 if (write_dentry) {
1784 if (i == backtrace.ancestors.begin()) {
1785 // This is the linkage for the file of interest
1786 dout(10) << "Linking inode 0x" << std::hex << ino
1787 << " at 0x" << parent_ino << "/" << dname << std::dec
1788 << " with size=" << dentry.inode.size << " bytes" << dendl;
1789
1790 r = inject_linkage(parent_ino, dname, fragment, dentry);
1791 } else {
1792 // This is the linkage for an ancestor directory
1793 InodeStore ancestor_dentry;
1794 ancestor_dentry.inode.mode = 0755 | S_IFDIR;
1795
1796 // Set nfiles to something non-zero, to fool any other code
1797 // that tries to ignore 'empty' directories. This won't be
1798 // accurate, but it should avoid functional issues.
1799
1800 ancestor_dentry.inode.dirstat.nfiles = 1;
1801 ancestor_dentry.inode.dir_layout.dl_dir_hash =
11fdf7f2 1802 g_conf()->mds_default_dir_hash;
7c673cae
FG
1803
1804 ancestor_dentry.inode.nlink = 1;
1805 ancestor_dentry.inode.ino = ino;
11fdf7f2
TL
1806 ancestor_dentry.inode.uid = g_conf()->mds_root_ino_uid;
1807 ancestor_dentry.inode.gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
1808 ancestor_dentry.inode.version = 1;
1809 ancestor_dentry.inode.backtrace_version = 1;
1810 r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
1811 }
1812
1813 if (r < 0) {
1814 return r;
1815 }
1816 }
1817
1818 if (!created_dirfrag) {
1819 // If the parent dirfrag already existed, then stop traversing the
1820 // backtrace: assume that the other ancestors already exist too. This
1821 // is an assumption rather than a truth, but it's a convenient way
1822 // to avoid the risk of creating multiply-linked directories while
1823 // injecting data. If there are in fact missing ancestors, this
1824 // should be fixed up using a separate tool scanning the metadata
1825 // pool.
1826 break;
1827 } else {
1828 // Proceed up the backtrace, creating parents
1829 ino = parent_ino;
1830 }
1831 }
1832
1833 return 0;
1834}
1835
1836int MetadataDriver::find_or_create_dirfrag(
1837 inodeno_t ino,
1838 frag_t fragment,
1839 bool *created)
1840{
11fdf7f2 1841 ceph_assert(created != NULL);
7c673cae
FG
1842
1843 fnode_t existing_fnode;
1844 *created = false;
1845
1846 uint64_t read_version = 0;
1847 int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
1848 dout(10) << "read_version = " << read_version << dendl;
1849
1850 if (r == -ENOENT || r == -EINVAL) {
1851 if (r == -EINVAL && !force_corrupt) {
1852 return r;
1853 }
1854
1855 // Missing or corrupt fnode, create afresh
1856 bufferlist fnode_bl;
1857 fnode_t blank_fnode;
1858 blank_fnode.version = 1;
1859 // mark it as non-empty
1860 blank_fnode.fragstat.nfiles = 1;
1861 blank_fnode.accounted_fragstat = blank_fnode.fragstat;
1862 blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
1863 blank_fnode.encode(fnode_bl);
1864
1865
1866 librados::ObjectWriteOperation op;
1867
1868 if (read_version) {
11fdf7f2 1869 ceph_assert(r == -EINVAL);
7c673cae
FG
1870 // Case A: We must assert that the version isn't changed since we saw the object
1871 // was unreadable, to avoid the possibility of two data-scan processes
1872 // both creating the frag.
1873 op.assert_version(read_version);
1874 } else {
11fdf7f2 1875 ceph_assert(r == -ENOENT);
7c673cae
FG
1876 // Case B: The object didn't exist in read_fnode, so while creating it we must
1877 // use an exclusive create to correctly populate *creating with
1878 // whether we created it ourselves or someone beat us to it.
1879 op.create(true);
1880 }
1881
1882 object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
1883 op.omap_set_header(fnode_bl);
1884 r = metadata_io.operate(frag_oid.name, &op);
1885 if (r == -EOVERFLOW || r == -EEXIST) {
1886 // Someone else wrote it (see case A above)
1887 dout(10) << "Dirfrag creation race: 0x" << std::hex
1888 << ino << " " << fragment << std::dec << dendl;
1889 *created = false;
1890 return 0;
1891 } else if (r < 0) {
1892 // We were unable to create or write it, error out
1893 derr << "Failed to create dirfrag 0x" << std::hex
1894 << ino << std::dec << ": " << cpp_strerror(r) << dendl;
1895 return r;
1896 } else {
1897 // Success: the dirfrag object now exists with a value header
1898 dout(10) << "Created dirfrag: 0x" << std::hex
1899 << ino << std::dec << dendl;
1900 *created = true;
1901 }
1902 } else if (r < 0) {
1903 derr << "Unexpected error reading dirfrag 0x" << std::hex
1904 << ino << std::dec << " : " << cpp_strerror(r) << dendl;
1905 return r;
1906 } else {
1907 dout(20) << "Dirfrag already exists: 0x" << std::hex
1908 << ino << " " << fragment << std::dec << dendl;
1909 }
1910
1911 return 0;
1912}
1913
1914int MetadataDriver::inject_linkage(
1915 inodeno_t dir_ino, const std::string &dname,
1916 const frag_t fragment, const InodeStore &inode)
1917{
1918 // We have no information about snapshots, so everything goes
1919 // in as CEPH_NOSNAP
1920 snapid_t snap = CEPH_NOSNAP;
1921
1922 object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
1923
1924 std::string key;
1925 dentry_key_t dn_key(snap, dname.c_str());
1926 dn_key.encode(key);
1927
1928 bufferlist dentry_bl;
11fdf7f2
TL
1929 encode(snap, dentry_bl);
1930 encode('I', dentry_bl);
7c673cae
FG
1931 inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1932
1933 // Write out
1934 std::map<std::string, bufferlist> vals;
1935 vals[key] = dentry_bl;
1936 int r = metadata_io.omap_set(frag_oid.name, vals);
1937 if (r != 0) {
1938 derr << "Error writing dentry 0x" << std::hex
1939 << dir_ino << std::dec << "/"
1940 << dname << ": " << cpp_strerror(r) << dendl;
1941 return r;
1942 } else {
1943 dout(20) << "Injected dentry 0x" << std::hex
1944 << dir_ino << "/" << dname << " pointing to 0x"
1945 << inode.inode.ino << std::dec << dendl;
1946 return 0;
1947 }
1948}
1949
1950
1951int MetadataDriver::init(
1952 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
1953 fs_cluster_id_t fscid)
1954{
1955 if (metadata_pool_name.empty()) {
1956 auto fs = fsmap->get_filesystem(fscid);
11fdf7f2 1957 ceph_assert(fs != nullptr);
7c673cae
FG
1958 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
1959
1960 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
1961 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
1962 if (r < 0) {
1963 derr << "Pool " << metadata_pool_id
1964 << " identified in MDS map not found in RADOS!" << dendl;
1965 return r;
1966 }
1967 dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
1968 } else {
1969 dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
1970 }
1971 return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
1972}
1973
1974int LocalFileDriver::init(
1975 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
1976 fs_cluster_id_t fscid)
1977{
1978 return 0;
1979}
1980
1981int LocalFileDriver::inject_data(
1982 const std::string &file_path,
1983 uint64_t size,
1984 uint32_t chunk_size,
1985 inodeno_t ino)
1986{
1987 // Scrape the file contents out of the data pool and into the
1988 // local filesystem
1989 std::fstream f;
1990 f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
1991
1992 for (uint64_t offset = 0; offset < size; offset += chunk_size) {
1993 bufferlist bl;
1994
1995 char buf[32];
1996 snprintf(buf, sizeof(buf),
1997 "%llx.%08llx",
1998 (unsigned long long)ino,
1999 (unsigned long long)(offset / chunk_size));
2000 std::string oid(buf);
2001
2002 int r = data_io.read(oid, bl, chunk_size, 0);
2003
2004 if (r <= 0 && r != -ENOENT) {
2005 derr << "error reading data object '" << oid << "': "
2006 << cpp_strerror(r) << dendl;
2007 f.close();
2008 return r;
2009 } else if (r >=0) {
2010
2011 f.seekp(offset);
2012 bl.write_stream(f);
2013 }
2014 }
2015 f.close();
2016
2017 return 0;
2018}
2019
2020
2021int LocalFileDriver::inject_with_backtrace(
2022 const inode_backtrace_t &bt,
2023 const InodeStore &dentry)
2024{
2025 std::string path_builder = path;
2026
2027 // Iterate through backtrace creating directory parents
2028 std::vector<inode_backpointer_t>::const_reverse_iterator i;
2029 for (i = bt.ancestors.rbegin();
2030 i != bt.ancestors.rend(); ++i) {
2031
2032 const inode_backpointer_t &backptr = *i;
2033 path_builder += "/";
2034 path_builder += backptr.dname;
2035
2036 // Last entry is the filename itself
2037 bool is_file = (i + 1 == bt.ancestors.rend());
2038 if (is_file) {
2039 // FIXME: inject_data won't cope with interesting (i.e. striped)
2040 // layouts (need a librados-compatible Filer to read these)
2041 inject_data(path_builder, dentry.inode.size,
2042 dentry.inode.layout.object_size, bt.ino);
2043 } else {
2044 int r = mkdir(path_builder.c_str(), 0755);
2045 if (r != 0 && r != -EPERM) {
2046 derr << "error creating directory: '" << path_builder << "': "
2047 << cpp_strerror(r) << dendl;
2048 return r;
2049 }
2050 }
2051 }
2052
2053 return 0;
2054}
2055
2056int LocalFileDriver::inject_lost_and_found(
2057 inodeno_t ino,
2058 const InodeStore &dentry)
2059{
2060 std::string lf_path = path + "/lost+found";
2061 int r = mkdir(lf_path.c_str(), 0755);
2062 if (r != 0 && r != -EPERM) {
2063 derr << "error creating directory: '" << lf_path << "': "
2064 << cpp_strerror(r) << dendl;
2065 return r;
2066 }
2067
2068 std::string file_path = lf_path + "/" + lost_found_dname(ino);
2069 return inject_data(file_path, dentry.inode.size,
2070 dentry.inode.layout.object_size, ino);
2071}
2072
2073int LocalFileDriver::init_roots(int64_t data_pool_id)
2074{
2075 // Ensure that the path exists and is a directory
2076 bool exists;
2077 int r = check_roots(&exists);
2078 if (r != 0) {
2079 return r;
2080 }
2081
2082 if (exists) {
2083 return 0;
2084 } else {
2085 return ::mkdir(path.c_str(), 0755);
2086 }
2087}
2088
2089int LocalFileDriver::check_roots(bool *result)
2090{
2091 // Check if the path exists and is a directory
2092 DIR *d = ::opendir(path.c_str());
2093 if (d == NULL) {
2094 *result = false;
2095 } else {
2096 int r = closedir(d);
2097 if (r != 0) {
2098 // Weird, but maybe possible with e.g. stale FD on NFS mount?
2099 *result = false;
2100 } else {
2101 *result = true;
2102 }
2103 }
2104
2105 return 0;
2106}
2107
2108void MetadataTool::build_file_dentry(
2109 inodeno_t ino, uint64_t file_size, time_t file_mtime,
2110 const file_layout_t &layout, InodeStore *out)
2111{
11fdf7f2 2112 ceph_assert(out != NULL);
7c673cae
FG
2113
2114 out->inode.mode = 0500 | S_IFREG;
2115 out->inode.size = file_size;
2116 out->inode.max_size_ever = file_size;
2117 out->inode.mtime.tv.tv_sec = file_mtime;
2118 out->inode.atime.tv.tv_sec = file_mtime;
2119 out->inode.ctime.tv.tv_sec = file_mtime;
2120
2121 out->inode.layout = layout;
2122
2123 out->inode.truncate_seq = 1;
2124 out->inode.truncate_size = -1ull;
2125
2126 out->inode.inline_data.version = CEPH_INLINE_NONE;
2127
2128 out->inode.nlink = 1;
2129 out->inode.ino = ino;
2130 out->inode.version = 1;
2131 out->inode.backtrace_version = 1;
11fdf7f2
TL
2132 out->inode.uid = g_conf()->mds_root_ino_uid;
2133 out->inode.gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
2134}
2135
2136void MetadataTool::build_dir_dentry(
2137 inodeno_t ino, const frag_info_t &fragstat,
2138 const file_layout_t &layout, InodeStore *out)
2139{
11fdf7f2 2140 ceph_assert(out != NULL);
7c673cae
FG
2141
2142 out->inode.mode = 0755 | S_IFDIR;
2143 out->inode.dirstat = fragstat;
2144 out->inode.mtime.tv.tv_sec = fragstat.mtime;
2145 out->inode.atime.tv.tv_sec = fragstat.mtime;
2146 out->inode.ctime.tv.tv_sec = fragstat.mtime;
2147
2148 out->inode.layout = layout;
11fdf7f2 2149 out->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae
FG
2150
2151 out->inode.truncate_seq = 1;
2152 out->inode.truncate_size = -1ull;
2153
2154 out->inode.inline_data.version = CEPH_INLINE_NONE;
2155
2156 out->inode.nlink = 1;
2157 out->inode.ino = ino;
2158 out->inode.version = 1;
2159 out->inode.backtrace_version = 1;
11fdf7f2
TL
2160 out->inode.uid = g_conf()->mds_root_ino_uid;
2161 out->inode.gid = g_conf()->mds_root_ino_gid;
7c673cae
FG
2162}
2163