]> git.proxmox.com Git - ceph.git/blob - ceph/src/tools/cephfs/DataScan.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / tools / cephfs / DataScan.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/compat.h"
16 #include "common/errno.h"
17 #include "common/ceph_argparse.h"
18 #include <fstream>
19 #include "include/util.h"
20
21 #include "mds/CInode.h"
22 #include "mds/InoTable.h"
23 #include "mds/SnapServer.h"
24 #include "cls/cephfs/cls_cephfs_client.h"
25
26 #include "PgFiles.h"
27 #include "DataScan.h"
28 #include "include/compat.h"
29
30 #define dout_context g_ceph_context
31 #define dout_subsys ceph_subsys_mds
32 #undef dout_prefix
33 #define dout_prefix *_dout << "datascan." << __func__ << ": "
34
35 void DataScan::usage()
36 {
37 std::cout << "Usage: \n"
38 << " cephfs-data-scan init [--force-init]\n"
39 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
40 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
41 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
42 << " cephfs-data-scan scan_links\n"
43 << "\n"
44 << " --force-corrupt: overrite apparently corrupt structures\n"
45 << " --force-init: write root inodes even if they exist\n"
46 << " --force-pool: use data pool even if it is not in FSMap\n"
47 << " --worker_m: Maximum number of workers\n"
48 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
49 << "\n"
50 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
51 << " cephfs-data-scan cleanup <data pool name>\n"
52 << std::endl;
53
54 generic_client_usage();
55 }
56
57 bool DataScan::parse_kwarg(
58 const std::vector<const char*> &args,
59 std::vector<const char *>::const_iterator &i,
60 int *r)
61 {
62 if (i + 1 == args.end()) {
63 return false;
64 }
65
66 const std::string arg(*i);
67 const std::string val(*(i + 1));
68
69 if (arg == std::string("--output-dir")) {
70 if (driver != NULL) {
71 derr << "Unexpected --output-dir: output already selected!" << dendl;
72 *r = -EINVAL;
73 return false;
74 }
75 dout(4) << "Using local file output to '" << val << "'" << dendl;
76 driver = new LocalFileDriver(val, data_io);
77 return true;
78 } else if (arg == std::string("--worker_n")) {
79 std::string err;
80 n = strict_strtoll(val.c_str(), 10, &err);
81 if (!err.empty()) {
82 std::cerr << "Invalid worker number '" << val << "'" << std::endl;
83 *r = -EINVAL;
84 return false;
85 }
86 return true;
87 } else if (arg == std::string("--worker_m")) {
88 std::string err;
89 m = strict_strtoll(val.c_str(), 10, &err);
90 if (!err.empty()) {
91 std::cerr << "Invalid worker count '" << val << "'" << std::endl;
92 *r = -EINVAL;
93 return false;
94 }
95 return true;
96 } else if (arg == std::string("--filter-tag")) {
97 filter_tag = val;
98 dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
99 return true;
100 } else if (arg == std::string("--filesystem")) {
101 std::shared_ptr<const Filesystem> fs;
102 *r = fsmap->parse_filesystem(val, &fs);
103 if (*r != 0) {
104 std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
105 return false;
106 }
107 fscid = fs->fscid;
108 return true;
109 } else if (arg == std::string("--alternate-pool")) {
110 metadata_pool_name = val;
111 return true;
112 } else {
113 return false;
114 }
115 }
116
117 bool DataScan::parse_arg(
118 const std::vector<const char*> &args,
119 std::vector<const char *>::const_iterator &i)
120 {
121 const std::string arg(*i);
122 if (arg == "--force-pool") {
123 force_pool = true;
124 return true;
125 } else if (arg == "--force-corrupt") {
126 force_corrupt = true;
127 return true;
128 } else if (arg == "--force-init") {
129 force_init = true;
130 return true;
131 } else {
132 return false;
133 }
134 }
135
// Entry point for the cephfs-data-scan commands.
//
// args[0] is the command name; the remaining entries are "--key value"
// pairs, flags and positional arguments.  The function parses them, picks
// the filesystem, sets up the output driver and the RADOS connection, opens
// whichever pools the command needs and finally dispatches to the command.
//
// Returns the command's result, or a negative errno when argument parsing
// or any of the setup steps fails.
136 int DataScan::main(const std::vector<const char*> &args)
137 {
138 // Parse args
139 // ==========
140 if (args.size() < 1) {
141 cerr << "missing position argument" << std::endl;
142 return -EINVAL;
143 }
144
145 // Common RADOS init: open metadata pool
146 // =====================================
147 librados::Rados rados;
148 int r = rados.init_with_context(g_ceph_context);
149 if (r < 0) {
150 derr << "RADOS unavailable" << dendl;
151 return r;
152 }
153
// The const reference binds to a temporary std::string built from args[0]
// and keeps it alive until the end of this function.
154 std::string const &command = args[0];
155 std::string data_pool_name;
156
157 std::string pg_files_path;
158 std::set<pg_t> pg_files_pgs;
159
160 // Consume any known --key val or --flag arguments
161 for (std::vector<const char *>::const_iterator i = args.begin() + 1;
162 i != args.end(); ++i) {
163 if (parse_kwarg(args, i, &r)) {
164 // Skip the kwarg value field
165 ++i;
166 continue;
167 } else if (r) {
// parse_kwarg reports a rejected key/value pair through r
168 return r;
169 }
170
171 if (parse_arg(args, i)) {
172 continue;
173 }
174
175 // Trailing positional argument
176 if (i + 1 == args.end() &&
177 (command == "scan_inodes"
178 || command == "scan_extents"
179 || command == "cleanup")) {
180 data_pool_name = *i;
181 continue;
182 }
183
184 if (command == "pg_files") {
// The path is only accepted as the first argument after the command
// name; every other positional argument must parse as a PG id.
185 if (i == args.begin() + 1) {
186 pg_files_path = *i;
187 continue;
188 } else {
189 pg_t pg;
190 bool parsed = pg.parse(*i);
191 if (!parsed) {
192 std::cerr << "Invalid PG '" << *i << "'" << std::endl;
193 return -EINVAL;
194 } else {
195 pg_files_pgs.insert(pg);
196 continue;
197 }
198 }
199
200 }
201
202 // Fall through: unhandled
203 std::cerr << "Unknown argument '" << *i << "'" << std::endl;
204 return -EINVAL;
205 }
206
207 // If caller didn't specify a namespace, try to pick
208 // one if only one exists
209 if (fscid == FS_CLUSTER_ID_NONE) {
210 if (fsmap->filesystem_count() == 1) {
211 fscid = fsmap->get_filesystem()->fscid;
212 } else {
213 std::cerr << "Specify a filesystem with --filesystem" << std::endl;
214 return -EINVAL;
215 }
216 }
217 auto fs = fsmap->get_filesystem(fscid);
218 ceph_assert(fs != nullptr);
219
220 // Default to output to metadata pool
// The force flags are only passed on to the driver created here; a driver
// already selected through --output-dir is left as it is.
221 if (driver == NULL) {
222 driver = new MetadataDriver();
223 driver->set_force_corrupt(force_corrupt);
224 driver->set_force_init(force_init);
225 dout(4) << "Using metadata pool output" << dendl;
226 }
227
228 dout(4) << "connecting to RADOS..." << dendl;
229 r = rados.connect();
230 if (r < 0) {
231 std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
232 << std::endl;
233 return r;
234 }
235
236 r = driver->init(rados, metadata_pool_name, fsmap, fscid);
237 if (r < 0) {
238 return r;
239 }
240
// pg_files opens no pools here: it hands over to PgFiles and returns the
// result of scanning the given path.
241 if (command == "pg_files") {
242 auto pge = PgFiles(objecter, pg_files_pgs);
243 pge.init();
244 return pge.scan_path(pg_files_path);
245 }
246
247 // Initialize data_io for those commands that need it
248 if (command == "scan_inodes" ||
249 command == "scan_extents" ||
250 command == "cleanup") {
251 if (data_pool_name.empty()) {
252 std::cerr << "Data pool not specified" << std::endl;
253 return -EINVAL;
254 }
255
256 data_pool_id = rados.pool_lookup(data_pool_name.c_str());
257 if (data_pool_id < 0) {
258 std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
259 return -ENOENT;
260 } else {
261 dout(4) << "data pool '" << data_pool_name
262 << "' has ID " << data_pool_id << dendl;
263 }
264
// A pool that is not one of this filesystem's data pools is only accepted
// when --force-pool was given.
265 if (!fs->mds_map.is_data_pool(data_pool_id)) {
266 std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
267 "CephFS data pool!" << std::endl;
268 if (!force_pool) {
269 std::cerr << "Use --force-pool to continue" << std::endl;
270 return -EINVAL;
271 }
272 }
273
274 dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
275 r = rados.ioctx_create(data_pool_name.c_str(), data_io);
276 if (r != 0) {
277 return r;
278 }
279 }
280
281 // Initialize metadata_io from MDSMap for scan_frags and scan_links
282 if (command == "scan_frags" || command == "scan_links") {
// NOTE(review): this looks up the same fscid that was asserted non-null
// above and shadows the outer `fs`, so the null check below looks redundant.
283 const auto fs = fsmap->get_filesystem(fscid);
284 if (fs == nullptr) {
285 std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
286 return -ENOENT;
287 }
288 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
289
290 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
// This `r` shadows the function-level `r`; it is only used inside this block.
291 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
292 if (r < 0) {
293 std::cerr << "Pool " << metadata_pool_id
294 << " identified in MDS map not found in RADOS!" << std::endl;
295 return r;
296 }
297
298 r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
299 if (r != 0) {
300 return r;
301 }
302
303 data_pools = fs->mds_map.get_data_pools();
304 }
305
306 // Finally, dispatch command
// An unrecognised command name is only detected here, after the cluster
// connection and driver initialisation above have already happened.
307 if (command == "scan_inodes") {
308 return scan_inodes();
309 } else if (command == "scan_extents") {
310 return scan_extents();
311 } else if (command == "scan_frags") {
312 return scan_frags();
313 } else if (command == "scan_links") {
314 return scan_links();
315 } else if (command == "cleanup") {
316 return cleanup();
317 } else if (command == "init") {
318 return driver->init_roots(fs->mds_map.get_first_data_pool());
319 } else {
320 std::cerr << "Unknown command '" << command << "'" << std::endl;
321 return -EINVAL;
322 }
323 }
324
325 int MetadataDriver::inject_unlinked_inode(
326 inodeno_t inono, int mode, int64_t data_pool_id)
327 {
328 const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
329
330 // Skip if exists
331 bool already_exists = false;
332 int r = root_exists(inono, &already_exists);
333 if (r) {
334 return r;
335 }
336 if (already_exists && !force_init) {
337 std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
338 " exists, skipping create. Use --force-init to overwrite"
339 " the existing object." << std::endl;
340 return 0;
341 }
342
343 // Compose
344 InodeStore inode;
345 inode.inode.ino = inono;
346 inode.inode.version = 1;
347 inode.inode.xattr_version = 1;
348 inode.inode.mode = 0500 | mode;
349 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
350 // (we won't actually give the *correct* dirstat here though)
351 inode.inode.dirstat.nfiles = 1;
352
353 inode.inode.ctime =
354 inode.inode.mtime = ceph_clock_now();
355 inode.inode.nlink = 1;
356 inode.inode.truncate_size = -1ull;
357 inode.inode.truncate_seq = 1;
358 inode.inode.uid = g_conf()->mds_root_ino_uid;
359 inode.inode.gid = g_conf()->mds_root_ino_gid;
360
361 // Force layout to default: should we let users override this so that
362 // they don't have to mount the filesystem to correct it?
363 inode.inode.layout = file_layout_t::get_default();
364 inode.inode.layout.pool_id = data_pool_id;
365 inode.inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
366
367 // Assume that we will get our stats wrong, and that we may
368 // be ignoring dirfrags that exist
369 inode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
370
371 if (inono == MDS_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
372 sr_t srnode;
373 srnode.seq = 1;
374 encode(srnode, inode.snap_blob);
375 }
376
377 // Serialize
378 bufferlist inode_bl;
379 encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
380 inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
381
382 // Write
383 r = metadata_io.write_full(oid.name, inode_bl);
384 if (r != 0) {
385 derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
386 return r;
387 }
388
389 return r;
390 }
391
392 int MetadataDriver::root_exists(inodeno_t ino, bool *result)
393 {
394 object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
395 uint64_t size;
396 time_t mtime;
397 int r = metadata_io.stat(oid.name, &size, &mtime);
398 if (r == -ENOENT) {
399 *result = false;
400 return 0;
401 } else if (r < 0) {
402 return r;
403 }
404
405 *result = true;
406 return 0;
407 }
408
409 int MetadataDriver::init_roots(int64_t data_pool_id)
410 {
411 int r = 0;
412 r = inject_unlinked_inode(MDS_INO_ROOT, S_IFDIR|0755, data_pool_id);
413 if (r != 0) {
414 return r;
415 }
416 r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
417 if (r != 0) {
418 return r;
419 }
420 bool created = false;
421 r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
422 if (r != 0) {
423 return r;
424 }
425
426 return 0;
427 }
428
429 int MetadataDriver::check_roots(bool *result)
430 {
431 int r;
432 r = root_exists(MDS_INO_ROOT, result);
433 if (r != 0) {
434 return r;
435 }
436 if (!*result) {
437 return 0;
438 }
439
440 r = root_exists(MDS_INO_MDSDIR(0), result);
441 if (r != 0) {
442 return r;
443 }
444 if (!*result) {
445 return 0;
446 }
447
448 return 0;
449 }
450
451 /**
452 * Stages:
453 *
454 * SERIAL init
455 * 0. Create root inodes if they don't exist
456 * PARALLEL scan_extents
457 * 1. Size and mtime recovery: scan ALL objects, and update 0th
458 * objects with max size and max mtime seen.
459 * PARALLEL scan_inodes
460 * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
461 * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
462 * or rstats at this stage. Inodes without backtraces go into
463 * lost+found
464 * TODO: SERIAL "recover stats"
465 * 3. Dirfrag statistics: depth first traverse into metadata tree,
466 * rebuilding dir sizes.
467 * TODO PARALLEL "clean up"
468 * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
469 * anything onto them) and remove any of the xattrs that we
470 * used for accumulating.
471 */
472
473
474 int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
475 {
476 if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
477 return -EINVAL;
478 }
479
480 std::string err;
481 std::string inode_str = oid.substr(0, oid.find("."));
482 *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
483 if (!err.empty()) {
484 return -EINVAL;
485 }
486
487 std::string pos_string = oid.substr(oid.find(".") + 1);
488 *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
489 if (!err.empty()) {
490 return -EINVAL;
491 }
492
493 return 0;
494 }
495
496
497 int DataScan::scan_extents()
498 {
499 return forall_objects(data_io, false, [this](
500 std::string const &oid,
501 uint64_t obj_name_ino,
502 uint64_t obj_name_offset) -> int
503 {
504 // Read size
505 uint64_t size;
506 time_t mtime;
507 int r = data_io.stat(oid, &size, &mtime);
508 dout(10) << "handling object " << obj_name_ino
509 << "." << obj_name_offset << dendl;
510 if (r != 0) {
511 dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
512 return r;
513 }
514
515 // I need to keep track of
516 // * The highest object ID seen
517 // * The size of the highest object ID seen
518 // * The largest object seen
519 //
520 // Given those things, I can later infer the object chunking
521 // size, the offset of the last object (chunk size * highest ID seen)
522 // and the actual size (offset of last object + size of highest ID seen)
523 //
524 // This logic doesn't take account of striping.
525 r = ClsCephFSClient::accumulate_inode_metadata(
526 data_io,
527 obj_name_ino,
528 obj_name_offset,
529 size,
530 mtime);
531 if (r < 0) {
532 derr << "Failed to accumulate metadata data from '"
533 << oid << "': " << cpp_strerror(r) << dendl;
534 return r;
535 }
536
537 return r;
538 });
539 }
540
541 int DataScan::probe_filter(librados::IoCtx &ioctx)
542 {
543 bufferlist filter_bl;
544 ClsCephFSClient::build_tag_filter("test", &filter_bl);
545 librados::ObjectCursor range_i;
546 librados::ObjectCursor range_end;
547
548 std::vector<librados::ObjectItem> tmp_result;
549 librados::ObjectCursor tmp_next;
550 int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
551 1, filter_bl, &tmp_result, &tmp_next);
552
553 return r >= 0;
554 }
555
556 int DataScan::forall_objects(
557 librados::IoCtx &ioctx,
558 bool untagged_only,
559 std::function<int(std::string, uint64_t, uint64_t)> handler
560 )
561 {
562 librados::ObjectCursor range_i;
563 librados::ObjectCursor range_end;
564 ioctx.object_list_slice(
565 ioctx.object_list_begin(),
566 ioctx.object_list_end(),
567 n,
568 m,
569 &range_i,
570 &range_end);
571
572
573 bufferlist filter_bl;
574
575 bool legacy_filtering = false;
576 if (untagged_only) {
577 // probe to deal with older OSDs that don't support
578 // the cephfs pgls filtering mode
579 legacy_filtering = !probe_filter(ioctx);
580 if (!legacy_filtering) {
581 ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
582 }
583 }
584
585 int r = 0;
586 while(range_i < range_end) {
587 std::vector<librados::ObjectItem> result;
588 int r = ioctx.object_list(range_i, range_end, 1,
589 filter_bl, &result, &range_i);
590 if (r < 0) {
591 derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
592 return r;
593 }
594
595 for (const auto &i : result) {
596 const std::string &oid = i.oid;
597 uint64_t obj_name_ino = 0;
598 uint64_t obj_name_offset = 0;
599 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
600 if (r != 0) {
601 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
602 continue;
603 }
604
605 if (untagged_only && legacy_filtering) {
606 dout(20) << "Applying filter to " << oid << dendl;
607
608 // We are only interested in 0th objects during this phase: we touched
609 // the other objects during scan_extents
610 if (obj_name_offset != 0) {
611 dout(20) << "Non-zeroth object" << dendl;
612 continue;
613 }
614
615 bufferlist scrub_tag_bl;
616 int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
617 if (r >= 0) {
618 std::string read_tag;
619 auto q = scrub_tag_bl.cbegin();
620 try {
621 decode(read_tag, q);
622 if (read_tag == filter_tag) {
623 dout(20) << "skipping " << oid << " because it has the filter_tag"
624 << dendl;
625 continue;
626 }
627 } catch (const buffer::error &err) {
628 }
629 dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
630 } else {
631 dout(20) << "no tag read (" << r << ")" << dendl;
632 }
633
634 } else if (untagged_only) {
635 ceph_assert(obj_name_offset == 0);
636 dout(20) << "OSD matched oid " << oid << dendl;
637 }
638
639 int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
640 if (r == 0 && this_oid_r < 0) {
641 r = this_oid_r;
642 }
643 }
644 }
645
646 return r;
647 }
648
// Implementation of the scan_inodes command.
//
// Refuses to run unless the driver reports that the root inodes exist.
// Then, for each object that forall_objects hands to the handler (called
// with untagged_only = true), it loads the accumulated size/mtime result
// together with any stored backtrace and layout, works out a file size and
// a layout to use, builds a dentry and injects it either at the location
// given by the backtrace or into lost+found.
//
// Returns the check_roots error if that check failed, -EIO if roots are
// absent, otherwise whatever forall_objects returns.
649 int DataScan::scan_inodes()
650 {
651 bool roots_present;
652 int r = driver->check_roots(&roots_present);
653 if (r != 0) {
654 derr << "Unexpected error checking roots: '"
655 << cpp_strerror(r) << "'" << dendl;
656 return r;
657 }
658
659 if (!roots_present) {
660 std::cerr << "Some or all system inodes are absent. Run 'init' from "
661 "one node before running 'scan_inodes'" << std::endl;
662 return -EIO;
663 }
664
665 return forall_objects(data_io, true, [this](
666 std::string const &oid,
667 uint64_t obj_name_ino,
668 uint64_t obj_name_offset) -> int
669 {
670 int r = 0;
671
672 dout(10) << "handling object "
673 << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
674 << dendl;
675
// loaded_layout starts out as the default layout; the code below treats
// pool_id == -1 / < 0 as "no stored layout was found".
676 AccumulateResult accum_res;
677 inode_backtrace_t backtrace;
678 file_layout_t loaded_layout = file_layout_t::get_default();
679 r = ClsCephFSClient::fetch_inode_accumulate_result(
680 data_io, oid, &backtrace, &loaded_layout, &accum_res);
681
682 if (r == -EINVAL) {
683 dout(4) << "Accumulated metadata missing from '"
684 << oid << ", did you run scan_extents?" << dendl;
685 return r;
686 } else if (r < 0) {
687 dout(4) << "Unexpected error loading accumulated metadata from '"
688 << oid << "': " << cpp_strerror(r) << dendl;
689 // FIXME: this creates situation where if a client has a corrupt
690 // backtrace/layout, we will fail to inject it. We should (optionally)
691 // proceed if the backtrace/layout is corrupt but we have valid
692 // accumulated metadata.
693 return r;
694 }
695
696 const time_t file_mtime = accum_res.max_mtime;
697 uint64_t file_size = 0;
698 bool have_backtrace = !(backtrace.ancestors.empty());
699
700 // This is the layout we will use for injection, populated either
701 // from loaded_layout or from best guesses
702 file_layout_t guessed_layout;
703 guessed_layout.pool_id = data_pool_id;
704
705 // Calculate file_size, guess the layout
706 if (accum_res.ceiling_obj_index > 0) {
707 uint32_t chunk_size = file_layout_t::get_default().object_size;
708 // When there are multiple objects, the largest object probably
709 // indicates the chunk size. But not necessarily, because files
710 // can be sparse. Only make this assumption if size seen
711 // is a power of two, as chunk sizes typically are.
// NOTE(review): this test also passes when max_obj_size is 0, which would
// set chunk_size to 0 - confirm whether that can happen in this branch.
712 if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
713 chunk_size = accum_res.max_obj_size;
714 }
715
716 if (loaded_layout.pool_id == -1) {
717 // If no stashed layout was found, guess it
718 guessed_layout.object_size = chunk_size;
719 guessed_layout.stripe_unit = chunk_size;
720 guessed_layout.stripe_count = 1;
721 } else if (!loaded_layout.is_valid() ||
722 loaded_layout.object_size < accum_res.max_obj_size) {
723 // If the max size seen exceeds what the stashed layout claims, then
724 // disbelieve it. Guess instead. Same for invalid layouts on disk.
725 dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
726 << std::dec << ", ignoring in favour of best guess" << dendl;
727 guessed_layout.object_size = chunk_size;
728 guessed_layout.stripe_unit = chunk_size;
729 guessed_layout.stripe_count = 1;
730 } else {
731 // We have a stashed layout that we can't disprove, so apply it
732 guessed_layout = loaded_layout;
733 dout(20) << "loaded layout from xattr:"
734 << " os: " << guessed_layout.object_size
735 << " sc: " << guessed_layout.stripe_count
736 << " su: " << guessed_layout.stripe_unit
737 << dendl;
738 // User might have transplanted files from a pool with a different
739 // ID, so whatever the loaded_layout says, we'll force the injected
740 // layout to point to the pool we really read from
741 guessed_layout.pool_id = data_pool_id;
742 }
743
744 if (guessed_layout.stripe_count == 1) {
745 // Unstriped file: simple chunking
746 file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
747 + accum_res.ceiling_obj_size;
748 } else {
749 // Striped file: need to examine the last stripe_count objects
750 // in the file to determine the size.
751
752 // How many complete (i.e. not last stripe) objects?
// complete_objs is ceiling_obj_index rounded down to a multiple of
// stripe_count (0 when the index is below stripe_count).
753 uint64_t complete_objs = 0;
754 if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
755 complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
756 } else {
757 complete_objs = 0;
758 }
759
760 // How many potentially-short objects (i.e. last stripe set) objects?
761 uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
762
763 dout(10) << "calculating striped size from complete objs: "
764 << complete_objs << ", partial objs: " << partial_objs
765 << dendl;
766
767 // Maximum amount of data that may be in the incomplete objects
768 uint64_t incomplete_size = 0;
769
770 // For each short object, calculate the max file size within it
771 // and accumulate the maximum
772 for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
// Object names are "<ino in hex>.<index as 8 hex digits>"
773 char buf[60];
774 snprintf(buf, sizeof(buf), "%llx.%08llx",
775 (long long unsigned)obj_name_ino, (long long unsigned)i);
776
777 uint64_t osize(0);
778 time_t omtime(0);
779 r = data_io.stat(std::string(buf), &osize, &omtime);
780 if (r == 0) {
781 if (osize > 0) {
782 // Upper bound within this object
783 uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
784 * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
785 + (i % guessed_layout.stripe_count)
786 * guessed_layout.stripe_unit + (osize - 1)
787 % guessed_layout.stripe_unit + 1;
788 incomplete_size = std::max(incomplete_size, upper_size);
789 }
790 } else if (r == -ENOENT) {
791 // Absent object, treat as size 0 and ignore.
792 } else {
793 // Unexpected error, carry r to outer scope for handling.
794 break;
795 }
796 }
// r holds the result of the last stat attempted; -ENOENT there is fine
797 if (r != 0 && r != -ENOENT) {
798 derr << "Unexpected error checking size of ino 0x" << std::hex
799 << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
800 return r;
801 }
802 file_size = complete_objs * guessed_layout.object_size
803 + incomplete_size;
804 }
805 } else {
// Only a zeroth object was seen: its size is the file size.
// NOTE(review): unlike the multi-object branch above, this branch does not
// check loaded_layout.is_valid() before trusting the stored layout.
806 file_size = accum_res.ceiling_obj_size;
807 if (loaded_layout.pool_id < 0
808 || loaded_layout.object_size < accum_res.max_obj_size) {
809 // No layout loaded, or inconsistent layout, use default
810 guessed_layout = file_layout_t::get_default();
811 guessed_layout.pool_id = data_pool_id;
812 } else {
813 guessed_layout = loaded_layout;
814 }
815 }
816
817 // Sanity check: backtrace ino must match the ino in the object name,
// otherwise the backtrace is ignored
818 if (have_backtrace && backtrace.ino != obj_name_ino) {
819 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
820 << " doesn't match object name ino 0x" << obj_name_ino
821 << std::dec << dendl;
822 have_backtrace = false;
823 }
824
825 InodeStore dentry;
826 build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
827
828 // Inject inode to the metadata pool
829 if (have_backtrace) {
// The last ancestor in the backtrace decides whether this counts as a stray
830 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
831 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
832 /* Special case for strays: even if we have a good backtrace,
833 * don't put it in the stray dir, because while that would technically
834 * give it linkage it would still be invisible to the user */
835 r = driver->inject_lost_and_found(obj_name_ino, dentry);
836 if (r < 0) {
837 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
838 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
839 if (r == -EINVAL) {
840 dout(4) << "Use --force-corrupt to overwrite structures that "
841 "appear to be corrupt" << dendl;
842 }
843 }
844 } else {
845 /* Happy case: we will inject a named dentry for this inode */
846 r = driver->inject_with_backtrace(backtrace, dentry);
847 if (r < 0) {
848 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
849 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
850 if (r == -EINVAL) {
851 dout(4) << "Use --force-corrupt to overwrite structures that "
852 "appear to be corrupt" << dendl;
853 }
854 }
855 }
856 } else {
857 /* Backtrace-less case: we will inject a lost+found dentry */
858 r = driver->inject_lost_and_found(
859 obj_name_ino, dentry);
860 if (r < 0) {
861 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
862 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
863 if (r == -EINVAL) {
864 dout(4) << "Use --force-corrupt to overwrite structures that "
865 "appear to be corrupt" << dendl;
866 }
867 }
868 }
869
// The handler's result is the result of the injection attempt
870 return r;
871 });
872 }
873
874 int DataScan::cleanup()
875 {
876 // We are looking for only zeroth object
877 //
878 return forall_objects(data_io, true, [this](
879 std::string const &oid,
880 uint64_t obj_name_ino,
881 uint64_t obj_name_offset) -> int
882 {
883 int r = 0;
884 r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
885 if (r < 0) {
886 dout(4) << "Error deleting accumulated metadata from '"
887 << oid << "': " << cpp_strerror(r) << dendl;
888 }
889 return r;
890 });
891 }
892
893 bool DataScan::valid_ino(inodeno_t ino) const
894 {
895 return (ino >= inodeno_t((1ull << 40)))
896 || (MDS_INO_IS_STRAY(ino))
897 || (MDS_INO_IS_MDSDIR(ino))
898 || ino == MDS_INO_ROOT
899 || ino == MDS_INO_CEPH;
900 }
901
902 int DataScan::scan_links()
903 {
904 MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
905 if (!metadata_driver) {
906 derr << "Unexpected --output-dir option for scan_links" << dendl;
907 return -EINVAL;
908 }
909
910 interval_set<uint64_t> used_inos;
911 map<inodeno_t, int> remote_links;
912 map<snapid_t, SnapInfo> snaps;
913 snapid_t last_snap = 1;
914 snapid_t snaprealm_v2_since = 2;
915
916 struct link_info_t {
917 inodeno_t dirino;
918 frag_t frag;
919 string name;
920 version_t version;
921 int nlink;
922 bool is_dir;
923 map<snapid_t, SnapInfo> snaps;
924 link_info_t() : version(0), nlink(0), is_dir(false) {}
925 link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::mempool_inode& i) :
926 dirino(di), frag(df), name(n),
927 version(i.version), nlink(i.nlink), is_dir(S_IFDIR & i.mode) {}
928 dirfrag_t dirfrag() const {
929 return dirfrag_t(dirino, frag);
930 }
931 };
932 map<inodeno_t, list<link_info_t> > dup_primaries;
933 map<inodeno_t, link_info_t> bad_nlink_inos;
934 map<inodeno_t, link_info_t> injected_inos;
935
936 map<dirfrag_t, set<string> > to_remove;
937
938 enum {
939 SCAN_INOS = 1,
940 CHECK_LINK,
941 };
942
943 for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
944 const librados::NObjectIterator it_end = metadata_io.nobjects_end();
945 for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
946 const std::string oid = it->get_oid();
947
948 uint64_t dir_ino = 0;
949 uint64_t frag_id = 0;
950 int r = parse_oid(oid, &dir_ino, &frag_id);
951 if (r == -EINVAL) {
952 dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
953 continue;
954 } else {
955 // parse_oid can only do 0 or -EINVAL
956 ceph_assert(r == 0);
957 }
958
959 if (!valid_ino(dir_ino)) {
960 dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
961 continue;
962 }
963
964 std::map<std::string, bufferlist> items;
965 r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
966 if (r < 0) {
967 derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
968 return r;
969 }
970
971 for (auto& p : items) {
972 auto q = p.second.cbegin();
973 string dname;
974 snapid_t last;
975 dentry_key_t::decode_helper(p.first, dname, last);
976
977 if (last != CEPH_NOSNAP) {
978 if (last > last_snap)
979 last_snap = last;
980 continue;
981 }
982
983 try {
984 snapid_t dnfirst;
985 decode(dnfirst, q);
986 if (dnfirst <= CEPH_MAXSNAP) {
987 if (dnfirst - 1 > last_snap)
988 last_snap = dnfirst - 1;
989 }
990 char dentry_type;
991 decode(dentry_type, q);
992 if (dentry_type == 'I') {
993 InodeStore inode;
994 inode.decode_bare(q);
995 inodeno_t ino = inode.inode.ino;
996
997 if (step == SCAN_INOS) {
998 if (used_inos.contains(ino, 1)) {
999 dup_primaries[ino].size();
1000 } else {
1001 used_inos.insert(ino);
1002 }
1003 } else if (step == CHECK_LINK) {
1004 sr_t srnode;
1005 if (inode.snap_blob.length()) {
1006 auto p = inode.snap_blob.cbegin();
1007 decode(srnode, p);
1008 for (auto it = srnode.snaps.begin();
1009 it != srnode.snaps.end(); ) {
1010 if (it->second.ino != ino ||
1011 it->second.snapid != it->first) {
1012 srnode.snaps.erase(it++);
1013 } else {
1014 ++it;
1015 }
1016 }
1017 if (!srnode.past_parents.empty()) {
1018 snapid_t last = srnode.past_parents.rbegin()->first;
1019 if (last + 1 > snaprealm_v2_since)
1020 snaprealm_v2_since = last + 1;
1021 }
1022 }
1023 if (!inode.old_inodes.empty()) {
1024 if (inode.old_inodes.rbegin()->first > last_snap)
1025 last_snap = inode.old_inodes.rbegin()->first;
1026 }
1027 auto q = dup_primaries.find(ino);
1028 if (q != dup_primaries.end()) {
1029 q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
1030 q->second.back().snaps.swap(srnode.snaps);
1031 } else {
1032 int nlink = 0;
1033 auto r = remote_links.find(ino);
1034 if (r != remote_links.end())
1035 nlink = r->second;
1036 if (!MDS_INO_IS_STRAY(dir_ino))
1037 nlink++;
1038 if (inode.inode.nlink != nlink) {
1039 derr << "Bad nlink on " << ino << " expected " << nlink
1040 << " has " << inode.inode.nlink << dendl;
1041 bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1042 bad_nlink_inos[ino].nlink = nlink;
1043 }
1044 snaps.insert(make_move_iterator(begin(srnode.snaps)),
1045 make_move_iterator(end(srnode.snaps)));
1046 }
1047 if (dnfirst == CEPH_NOSNAP)
1048 injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1049 }
1050 } else if (dentry_type == 'L') {
1051 inodeno_t ino;
1052 unsigned char d_type;
1053 decode(ino, q);
1054 decode(d_type, q);
1055
1056 if (step == SCAN_INOS) {
1057 remote_links[ino]++;
1058 } else if (step == CHECK_LINK) {
1059 if (!used_inos.contains(ino, 1)) {
1060 derr << "Bad remote link dentry 0x" << std::hex << dir_ino
1061 << std::dec << "/" << dname
1062 << ", ino " << ino << " not found" << dendl;
1063 std::string key;
1064 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1065 dn_key.encode(key);
1066 to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
1067 }
1068 }
1069 } else {
1070 derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
1071 << std::dec << "/" << dname << dendl;
1072 return -EINVAL;
1073 }
1074 } catch (const buffer::error &err) {
1075 derr << "Error decoding dentry 0x" << std::hex << dir_ino
1076 << std::dec << "/" << dname << dendl;
1077 return -EINVAL;
1078 }
1079 }
1080 }
1081 }
1082
1083 map<unsigned, uint64_t> max_ino_map;
1084 {
1085 auto prev_max_ino = (uint64_t)1 << 40;
1086 for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
1087 auto cur_max = p.get_start() + p.get_len() - 1;
1088 if (cur_max < prev_max_ino)
1089 continue; // system inodes
1090
1091 if ((prev_max_ino >> 40) != (cur_max >> 40)) {
1092 unsigned rank = (prev_max_ino >> 40) - 1;
1093 max_ino_map[rank] = prev_max_ino;
1094 } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
1095 unsigned rank = (p.get_start() >> 40) - 1;
1096 max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
1097 }
1098 prev_max_ino = cur_max;
1099 }
1100 unsigned rank = (prev_max_ino >> 40) - 1;
1101 max_ino_map[rank] = prev_max_ino;
1102 }
1103
1104 used_inos.clear();
1105
1106 for (auto& p : dup_primaries) {
1107 link_info_t newest;
1108 for (auto& q : p.second) {
1109 if (q.version > newest.version) {
1110 newest = q;
1111 } else if (q.version == newest.version &&
1112 !MDS_INO_IS_STRAY(q.dirino) &&
1113 MDS_INO_IS_STRAY(newest.dirino)) {
1114 newest = q;
1115 }
1116 }
1117
1118 for (auto& q : p.second) {
1119 // in the middle of dir fragmentation?
1120 if (newest.dirino == q.dirino && newest.name == q.name) {
1121 snaps.insert(make_move_iterator(begin(q.snaps)),
1122 make_move_iterator(end(q.snaps)));
1123 continue;
1124 }
1125
1126 std::string key;
1127 dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
1128 dn_key.encode(key);
1129 to_remove[q.dirfrag()].insert(key);
1130 derr << "Remove duplicated ino 0x" << p.first << " from "
1131 << q.dirfrag() << "/" << q.name << dendl;
1132 }
1133
1134 int nlink = 0;
1135 auto q = remote_links.find(p.first);
1136 if (q != remote_links.end())
1137 nlink = q->second;
1138 if (!MDS_INO_IS_STRAY(newest.dirino))
1139 nlink++;
1140
1141 if (nlink != newest.nlink) {
1142 derr << "Bad nlink on " << p.first << " expected " << nlink
1143 << " has " << newest.nlink << dendl;
1144 bad_nlink_inos[p.first] = newest;
1145 bad_nlink_inos[p.first].nlink = nlink;
1146 }
1147 }
1148 dup_primaries.clear();
1149 remote_links.clear();
1150
1151 {
1152 objecter->with_osdmap([&](const OSDMap& o) {
1153 for (auto p : data_pools) {
1154 const pg_pool_t *pi = o.get_pg_pool(p);
1155 if (!pi)
1156 continue;
1157 if (pi->snap_seq > last_snap)
1158 last_snap = pi->snap_seq;
1159 }
1160 });
1161
1162 if (!snaps.empty()) {
1163 if (snaps.rbegin()->first > last_snap)
1164 last_snap = snaps.rbegin()->first;
1165 }
1166 }
1167
1168 for (auto& p : to_remove) {
1169 object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
1170
1171 int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
1172 if (r != 0) {
1173 derr << "Error removing duplicated dentries from " << p.first << dendl;
1174 return r;
1175 }
1176 }
1177 to_remove.clear();
1178
1179 for (auto &p : bad_nlink_inos) {
1180 InodeStore inode;
1181 snapid_t first;
1182 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
1183 if (r < 0) {
1184 derr << "Unexpected error reading dentry "
1185 << p.second.dirfrag() << "/" << p.second.name
1186 << ": " << cpp_strerror(r) << dendl;
1187 return r;
1188 }
1189
1190 if (inode.inode.ino != p.first || inode.inode.version != p.second.version)
1191 continue;
1192
1193 inode.inode.nlink = p.second.nlink;
1194 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
1195 if (r < 0)
1196 return r;
1197 }
1198
1199 for (auto &p : injected_inos) {
1200 InodeStore inode;
1201 snapid_t first;
1202 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
1203 if (r < 0) {
1204 derr << "Unexpected error reading dentry "
1205 << p.second.dirfrag() << "/" << p.second.name
1206 << ": " << cpp_strerror(r) << dendl;
1207 return r;
1208 }
1209
1210 if (first != CEPH_NOSNAP)
1211 continue;
1212
1213 first = last_snap + 1;
1214 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
1215 if (r < 0)
1216 return r;
1217 }
1218
1219 for (auto& p : max_ino_map) {
1220 InoTable inotable(nullptr);
1221 inotable.set_rank(p.first);
1222 bool dirty = false;
1223 int r = metadata_driver->load_table(&inotable);
1224 if (r < 0) {
1225 inotable.reset_state();
1226 dirty = true;
1227 }
1228 if (inotable.force_consume_to(p.second))
1229 dirty = true;
1230 if (dirty) {
1231 r = metadata_driver->save_table(&inotable);
1232 if (r < 0)
1233 return r;
1234 }
1235 }
1236
1237 {
1238 SnapServer snaptable;
1239 snaptable.set_rank(0);
1240 bool dirty = false;
1241 int r = metadata_driver->load_table(&snaptable);
1242 if (r < 0) {
1243 snaptable.reset_state();
1244 dirty = true;
1245 }
1246 if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
1247 dirty = true;
1248 if (dirty) {
1249 r = metadata_driver->save_table(&snaptable);
1250 if (r < 0)
1251 return r;
1252 }
1253 }
1254 return 0;
1255 }
1256
1257 int DataScan::scan_frags()
1258 {
1259 bool roots_present;
1260 int r = driver->check_roots(&roots_present);
1261 if (r != 0) {
1262 derr << "Unexpected error checking roots: '"
1263 << cpp_strerror(r) << "'" << dendl;
1264 return r;
1265 }
1266
1267 if (!roots_present) {
1268 std::cerr << "Some or all system inodes are absent. Run 'init' from "
1269 "one node before running 'scan_inodes'" << std::endl;
1270 return -EIO;
1271 }
1272
1273 return forall_objects(metadata_io, true, [this](
1274 std::string const &oid,
1275 uint64_t obj_name_ino,
1276 uint64_t obj_name_offset) -> int
1277 {
1278 int r = 0;
1279 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
1280 if (r != 0) {
1281 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
1282 return r;
1283 }
1284
1285 if (obj_name_ino < (1ULL << 40)) {
1286 // FIXME: we're skipping stray dirs here: if they're
1287 // orphaned then we should be resetting them some other
1288 // way
1289 dout(10) << "Skipping system ino " << obj_name_ino << dendl;
1290 return 0;
1291 }
1292
1293 AccumulateResult accum_res;
1294 inode_backtrace_t backtrace;
1295
1296 // Default to inherit layout (i.e. no explicit layout on dir) which is
1297 // expressed as a zeroed layout struct (see inode_t::has_layout)
1298 file_layout_t loaded_layout;
1299
1300 int parent_r = 0;
1301 bufferlist parent_bl;
1302 int layout_r = 0;
1303 bufferlist layout_bl;
1304 bufferlist op_bl;
1305
1306 librados::ObjectReadOperation op;
1307 op.getxattr("parent", &parent_bl, &parent_r);
1308 op.getxattr("layout", &layout_bl, &layout_r);
1309 r = metadata_io.operate(oid, &op, &op_bl);
1310 if (r != 0 && r != -ENODATA) {
1311 derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
1312 return r;
1313 }
1314
1315 if (parent_r != -ENODATA) {
1316 try {
1317 auto q = parent_bl.cbegin();
1318 backtrace.decode(q);
1319 } catch (buffer::error &e) {
1320 dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl;
1321 if (!force_corrupt) {
1322 return -EINVAL;
1323 } else {
1324 // Treat backtrace as absent: we'll inject into lost+found
1325 backtrace = inode_backtrace_t();
1326 }
1327 }
1328 }
1329
1330 if (layout_r != -ENODATA) {
1331 try {
1332 auto q = layout_bl.cbegin();
1333 decode(loaded_layout, q);
1334 } catch (buffer::error &e) {
1335 dout(4) << "Corrupt layout on '" << oid << "': " << e << dendl;
1336 if (!force_corrupt) {
1337 return -EINVAL;
1338 }
1339 }
1340 }
1341
1342 bool have_backtrace = !(backtrace.ancestors.empty());
1343
1344 // Santity checking backtrace ino against object name
1345 if (have_backtrace && backtrace.ino != obj_name_ino) {
1346 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
1347 << " doesn't match object name ino 0x" << obj_name_ino
1348 << std::dec << dendl;
1349 have_backtrace = false;
1350 }
1351
1352 uint64_t fnode_version = 0;
1353 fnode_t fnode;
1354 r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
1355 if (r == -EINVAL) {
1356 derr << "Corrupt fnode on " << oid << dendl;
1357 if (force_corrupt) {
1358 fnode.fragstat.mtime = 0;
1359 fnode.fragstat.nfiles = 1;
1360 fnode.fragstat.nsubdirs = 0;
1361 fnode.accounted_fragstat = fnode.fragstat;
1362 } else {
1363 return r;
1364 }
1365 }
1366
1367 InodeStore dentry;
1368 build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
1369 loaded_layout, &dentry);
1370
1371 // Inject inode to the metadata pool
1372 if (have_backtrace) {
1373 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
1374 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
1375 /* Special case for strays: even if we have a good backtrace,
1376 * don't put it in the stray dir, because while that would technically
1377 * give it linkage it would still be invisible to the user */
1378 r = driver->inject_lost_and_found(obj_name_ino, dentry);
1379 if (r < 0) {
1380 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1381 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1382 if (r == -EINVAL) {
1383 dout(4) << "Use --force-corrupt to overwrite structures that "
1384 "appear to be corrupt" << dendl;
1385 }
1386 }
1387 } else {
1388 /* Happy case: we will inject a named dentry for this inode */
1389 r = driver->inject_with_backtrace(backtrace, dentry);
1390 if (r < 0) {
1391 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1392 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
1393 if (r == -EINVAL) {
1394 dout(4) << "Use --force-corrupt to overwrite structures that "
1395 "appear to be corrupt" << dendl;
1396 }
1397 }
1398 }
1399 } else {
1400 /* Backtrace-less case: we will inject a lost+found dentry */
1401 r = driver->inject_lost_and_found(
1402 obj_name_ino, dentry);
1403 if (r < 0) {
1404 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
1405 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1406 if (r == -EINVAL) {
1407 dout(4) << "Use --force-corrupt to overwrite structures that "
1408 "appear to be corrupt" << dendl;
1409 }
1410 }
1411 }
1412
1413 return r;
1414 });
1415 }
1416
1417 int MetadataTool::read_fnode(
1418 inodeno_t ino, frag_t frag, fnode_t *fnode,
1419 uint64_t *last_version)
1420 {
1421 ceph_assert(fnode != NULL);
1422
1423 object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
1424 bufferlist fnode_bl;
1425 int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
1426 *last_version = metadata_io.get_last_version();
1427 if (r < 0) {
1428 return r;
1429 }
1430
1431 auto old_fnode_iter = fnode_bl.cbegin();
1432 try {
1433 (*fnode).decode(old_fnode_iter);
1434 } catch (const buffer::error &err) {
1435 return -EINVAL;
1436 }
1437
1438 return 0;
1439 }
1440
1441 int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
1442 const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
1443 {
1444 ceph_assert(inode != NULL);
1445
1446 std::string key;
1447 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1448 dn_key.encode(key);
1449
1450 std::set<std::string> keys;
1451 keys.insert(key);
1452 std::map<std::string, bufferlist> vals;
1453 object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
1454 int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
1455 dout(20) << "oid=" << frag_oid.name
1456 << " dname=" << dname
1457 << " frag=" << frag
1458 << ", r=" << r << dendl;
1459 if (r < 0) {
1460 return r;
1461 }
1462
1463 if (vals.find(key) == vals.end()) {
1464 dout(20) << key << " not found in result" << dendl;
1465 return -ENOENT;
1466 }
1467
1468 try {
1469 auto q = vals[key].cbegin();
1470 snapid_t first;
1471 decode(first, q);
1472 char dentry_type;
1473 decode(dentry_type, q);
1474 if (dentry_type == 'I') {
1475 inode->decode_bare(q);
1476 } else {
1477 dout(20) << "dentry type '" << dentry_type << "': cannot"
1478 "read an inode out of that" << dendl;
1479 return -EINVAL;
1480 }
1481 if (dnfirst)
1482 *dnfirst = first;
1483 } catch (const buffer::error &err) {
1484 dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
1485 << std::dec << "/" << dname << dendl;
1486 return -EINVAL;
1487 }
1488
1489 return 0;
1490 }
1491
1492 int MetadataDriver::load_table(MDSTable *table)
1493 {
1494 object_t table_oid = table->get_object_name();
1495
1496 bufferlist table_bl;
1497 int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
1498 if (r < 0) {
1499 derr << "unable to read mds table '" << table_oid.name << "': "
1500 << cpp_strerror(r) << dendl;
1501 return r;
1502 }
1503
1504 try {
1505 version_t table_ver;
1506 auto p = table_bl.cbegin();
1507 decode(table_ver, p);
1508 table->decode_state(p);
1509 table->force_replay_version(table_ver);
1510 } catch (const buffer::error &err) {
1511 derr << "unable to decode mds table '" << table_oid.name << "': "
1512 << err.what() << dendl;
1513 return -EIO;
1514 }
1515 return 0;
1516 }
1517
1518 int MetadataDriver::save_table(MDSTable *table)
1519 {
1520 object_t table_oid = table->get_object_name();
1521
1522 bufferlist table_bl;
1523 encode(table->get_version(), table_bl);
1524 table->encode_state(table_bl);
1525 int r = metadata_io.write_full(table_oid.name, table_bl);
1526 if (r != 0) {
1527 derr << "error updating mds table " << table_oid.name
1528 << ": " << cpp_strerror(r) << dendl;
1529 return r;
1530 }
1531 return 0;
1532 }
1533
1534 int MetadataDriver::inject_lost_and_found(
1535 inodeno_t ino, const InodeStore &dentry)
1536 {
1537 // Create lost+found if doesn't exist
1538 bool created = false;
1539 int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
1540 if (r < 0) {
1541 return r;
1542 }
1543 InodeStore lf_ino;
1544 r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
1545 if (r == -ENOENT || r == -EINVAL) {
1546 if (r == -EINVAL && !force_corrupt) {
1547 return r;
1548 }
1549
1550 // To have a directory not specify a layout, give it zeros (see
1551 // inode_t::has_layout)
1552 file_layout_t inherit_layout;
1553
1554 // Construct LF inode
1555 frag_info_t fragstat;
1556 fragstat.nfiles = 1,
1557 build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
1558
1559 // Inject link to LF inode in the root dir
1560 r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
1561 if (r < 0) {
1562 return r;
1563 }
1564 } else {
1565 if (!(lf_ino.inode.mode & S_IFDIR)) {
1566 derr << "lost+found exists but is not a directory!" << dendl;
1567 // In this case we error out, and the user should do something about
1568 // this problem.
1569 return -EINVAL;
1570 }
1571 }
1572
1573 r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
1574 if (r < 0) {
1575 return r;
1576 }
1577
1578 InodeStore recovered_ino;
1579
1580
1581 const std::string dname = lost_found_dname(ino);
1582
1583 // Write dentry into lost+found dirfrag
1584 return inject_linkage(lf_ino.inode.ino, dname, frag_t(), dentry);
1585 }
1586
1587
1588 int MetadataDriver::get_frag_of(
1589 inodeno_t dirino,
1590 const std::string &target_dname,
1591 frag_t *result_ft)
1592 {
1593 object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
1594
1595 dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
1596
1597 // Find and load fragtree if existing dirfrag
1598 // ==========================================
1599 bool have_backtrace = false;
1600 bufferlist parent_bl;
1601 int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
1602 if (r == -ENODATA) {
1603 dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
1604 } else if (r < 0) {
1605 dout(4) << "Unexpected error on '" << root_frag_oid << "': "
1606 << cpp_strerror(r) << dendl;
1607 return r;
1608 }
1609
1610 // Deserialize backtrace
1611 inode_backtrace_t backtrace;
1612 if (parent_bl.length()) {
1613 try {
1614 auto q = parent_bl.cbegin();
1615 backtrace.decode(q);
1616 have_backtrace = true;
1617 } catch (buffer::error &e) {
1618 dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " << e << dendl;
1619 }
1620 }
1621
1622 if (!(have_backtrace && backtrace.ancestors.size())) {
1623 // Can't work out fragtree without a backtrace
1624 dout(4) << "No backtrace on '" << root_frag_oid
1625 << "': cannot determine fragtree" << dendl;
1626 return -ENOENT;
1627 }
1628
1629 // The parentage of dirino
1630 const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
1631
1632 // The inode of dirino's parent
1633 const inodeno_t parent_ino = bp.dirino;
1634
1635 // The dname of dirino in its parent.
1636 const std::string &parent_dname = bp.dname;
1637
1638 dout(20) << "got backtrace parent " << parent_ino << "/"
1639 << parent_dname << dendl;
1640
1641 // The primary dentry for dirino
1642 InodeStore existing_dentry;
1643
1644 // See if we can find ourselves in dirfrag zero of the parent: this
1645 // is a fast path that avoids needing to go further up the tree
1646 // if the parent isn't fragmented (worst case we would have to
1647 // go all the way to the root)
1648 r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
1649 if (r >= 0) {
1650 // Great, fast path: return the fragtree from here
1651 if (existing_dentry.inode.ino != dirino) {
1652 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
1653 << existing_dentry.inode.ino
1654 << " vs expected 0x" << dirino << std::dec << dendl;
1655 return -ENOENT;
1656 }
1657 dout(20) << "fast path, fragtree is "
1658 << existing_dentry.dirfragtree << dendl;
1659 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1660 dout(20) << "frag is " << *result_ft << dendl;
1661 return 0;
1662 } else if (r != -ENOENT) {
1663 // Dentry not present in 0th frag, must read parent's fragtree
1664 frag_t parent_frag;
1665 r = get_frag_of(parent_ino, parent_dname, &parent_frag);
1666 if (r == 0) {
1667 // We have the parent fragtree, so try again to load our dentry
1668 r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
1669 if (r >= 0) {
1670 // Got it!
1671 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1672 dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
1673 return 0;
1674 } else {
1675 if (r == -EINVAL || r == -ENOENT) {
1676 return -ENOENT; // dentry missing or corrupt, so frag is missing
1677 } else {
1678 return r;
1679 }
1680 }
1681 } else {
1682 // Couldn't resolve parent fragtree, so can't find ours.
1683 return r;
1684 }
1685 } else if (r == -EINVAL) {
1686 // Unreadable dentry, can't know the fragtree.
1687 return -ENOENT;
1688 } else {
1689 // Unexpected error, raise it
1690 return r;
1691 }
1692 }
1693
1694
1695 int MetadataDriver::inject_with_backtrace(
1696 const inode_backtrace_t &backtrace, const InodeStore &dentry)
1697
1698 {
1699
1700 // On dirfrags
1701 // ===========
1702 // In order to insert something into a directory, we first (ideally)
1703 // need to know the fragtree for the directory. Sometimes we can't
1704 // get that, in which case we just go ahead and insert it into
1705 // fragment zero for a good chance of that being the right thing
1706 // anyway (most moderate-sized dirs aren't fragmented!)
1707
1708 // On ancestry
1709 // ===========
1710 // My immediate ancestry should be correct, so if we can find that
1711 // directory's dirfrag then go inject it there. This works well
1712 // in the case that this inode's dentry was somehow lost and we
1713 // are recreating it, because the rest of the hierarchy
1714 // will probably still exist.
1715 //
1716 // It's more of a "better than nothing" approach when rebuilding
1717 // a whole tree, as backtraces will in general not be up to date
1718 // beyond the first parent, if anything in the trace was ever
1719 // moved after the file was created.
1720
1721 // On inode numbers
1722 // ================
1723 // The backtrace tells us inodes for each of the parents. If we are
1724 // creating those parent dirfrags, then there is a risk that somehow
1725 // the inode indicated here was also used for data (not a dirfrag) at
1726 // some stage. That would be a zany situation, and we don't check
1727 // for it here, because to do so would require extra IOs for everything
1728 // we inject, and anyway wouldn't guarantee that the inode number
1729 // wasn't in use in some dentry elsewhere in the metadata tree that
1730 // just happened not to have any data objects.
1731
1732 // On multiple workers touching the same traces
1733 // ============================================
1734 // When creating linkage for a directory, *only* create it if we are
1735 // also creating the object. That way, we might not manage to get the
1736 // *right* linkage for a directory, but at least we won't multiply link
1737 // it. We assume that if a root dirfrag exists for a directory, then
1738 // it is linked somewhere (i.e. that the metadata pool is not already
1739 // inconsistent).
1740 //
1741 // Making sure *that* is true is someone else's job! Probably someone
1742 // who is not going to run in parallel, so that they can self-consistently
1743 // look at versions and move things around as they go.
1744 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1745 // object, next run will fail to create linkage for the dirfrag object
1746 // and leave it orphaned.
1747
1748 inodeno_t ino = backtrace.ino;
1749 dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
1750 for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
1751 i != backtrace.ancestors.end(); ++i) {
1752 const inode_backpointer_t &backptr = *i;
1753 dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
1754 << "/" << backptr.dname << dendl;
1755
1756 // Examine root dirfrag for parent
1757 const inodeno_t parent_ino = backptr.dirino;
1758 const std::string dname = backptr.dname;
1759
1760 frag_t fragment;
1761 int r = get_frag_of(parent_ino, dname, &fragment);
1762 if (r == -ENOENT) {
1763 // Don't know fragment, fall back to assuming root
1764 dout(20) << "don't know fragment for 0x" << std::hex <<
1765 parent_ino << std::dec << "/" << dname << ", will insert to root"
1766 << dendl;
1767 }
1768
1769 // Find or create dirfrag
1770 // ======================
1771 bool created_dirfrag;
1772 r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
1773 if (r < 0) {
1774 return r;
1775 }
1776
1777 // Check if dentry already exists
1778 // ==============================
1779 InodeStore existing_dentry;
1780 r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
1781 bool write_dentry = false;
1782 if (r == -ENOENT || r == -EINVAL) {
1783 if (r == -EINVAL && !force_corrupt) {
1784 return r;
1785 }
1786 // Missing or corrupt dentry
1787 write_dentry = true;
1788 } else if (r < 0) {
1789 derr << "Unexpected error reading dentry 0x" << std::hex
1790 << parent_ino << std::dec << "/"
1791 << dname << ": " << cpp_strerror(r) << dendl;
1792 break;
1793 } else {
1794 // Dentry already present, does it link to me?
1795 if (existing_dentry.inode.ino == ino) {
1796 dout(20) << "Dentry 0x" << std::hex
1797 << parent_ino << std::dec << "/"
1798 << dname << " already exists and points to me" << dendl;
1799 } else {
1800 derr << "Dentry 0x" << std::hex
1801 << parent_ino << std::dec << "/"
1802 << dname << " already exists but points to 0x"
1803 << std::hex << existing_dentry.inode.ino << std::dec << dendl;
1804 // Fall back to lost+found!
1805 return inject_lost_and_found(backtrace.ino, dentry);
1806 }
1807 }
1808
1809 // Inject linkage
1810 // ==============
1811
1812 if (write_dentry) {
1813 if (i == backtrace.ancestors.begin()) {
1814 // This is the linkage for the file of interest
1815 dout(10) << "Linking inode 0x" << std::hex << ino
1816 << " at 0x" << parent_ino << "/" << dname << std::dec
1817 << " with size=" << dentry.inode.size << " bytes" << dendl;
1818
1819 r = inject_linkage(parent_ino, dname, fragment, dentry);
1820 } else {
1821 // This is the linkage for an ancestor directory
1822 InodeStore ancestor_dentry;
1823 ancestor_dentry.inode.mode = 0755 | S_IFDIR;
1824
1825 // Set nfiles to something non-zero, to fool any other code
1826 // that tries to ignore 'empty' directories. This won't be
1827 // accurate, but it should avoid functional issues.
1828
1829 ancestor_dentry.inode.dirstat.nfiles = 1;
1830 ancestor_dentry.inode.dir_layout.dl_dir_hash =
1831 g_conf()->mds_default_dir_hash;
1832
1833 ancestor_dentry.inode.nlink = 1;
1834 ancestor_dentry.inode.ino = ino;
1835 ancestor_dentry.inode.uid = g_conf()->mds_root_ino_uid;
1836 ancestor_dentry.inode.gid = g_conf()->mds_root_ino_gid;
1837 ancestor_dentry.inode.version = 1;
1838 ancestor_dentry.inode.backtrace_version = 1;
1839 r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
1840 }
1841
1842 if (r < 0) {
1843 return r;
1844 }
1845 }
1846
1847 if (!created_dirfrag) {
1848 // If the parent dirfrag already existed, then stop traversing the
1849 // backtrace: assume that the other ancestors already exist too. This
1850 // is an assumption rather than a truth, but it's a convenient way
1851 // to avoid the risk of creating multiply-linked directories while
1852 // injecting data. If there are in fact missing ancestors, this
1853 // should be fixed up using a separate tool scanning the metadata
1854 // pool.
1855 break;
1856 } else {
1857 // Proceed up the backtrace, creating parents
1858 ino = parent_ino;
1859 }
1860 }
1861
1862 return 0;
1863 }
1864
1865 int MetadataDriver::find_or_create_dirfrag(
1866 inodeno_t ino,
1867 frag_t fragment,
1868 bool *created)
1869 {
1870 ceph_assert(created != NULL);
1871
1872 fnode_t existing_fnode;
1873 *created = false;
1874
1875 uint64_t read_version = 0;
1876 int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
1877 dout(10) << "read_version = " << read_version << dendl;
1878
1879 if (r == -ENOENT || r == -EINVAL) {
1880 if (r == -EINVAL && !force_corrupt) {
1881 return r;
1882 }
1883
1884 // Missing or corrupt fnode, create afresh
1885 bufferlist fnode_bl;
1886 fnode_t blank_fnode;
1887 blank_fnode.version = 1;
1888 // mark it as non-empty
1889 blank_fnode.fragstat.nfiles = 1;
1890 blank_fnode.accounted_fragstat = blank_fnode.fragstat;
1891 blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
1892 blank_fnode.encode(fnode_bl);
1893
1894
1895 librados::ObjectWriteOperation op;
1896
1897 if (read_version) {
1898 ceph_assert(r == -EINVAL);
1899 // Case A: We must assert that the version isn't changed since we saw the object
1900 // was unreadable, to avoid the possibility of two data-scan processes
1901 // both creating the frag.
1902 op.assert_version(read_version);
1903 } else {
1904 ceph_assert(r == -ENOENT);
1905 // Case B: The object didn't exist in read_fnode, so while creating it we must
1906 // use an exclusive create to correctly populate *creating with
1907 // whether we created it ourselves or someone beat us to it.
1908 op.create(true);
1909 }
1910
1911 object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
1912 op.omap_set_header(fnode_bl);
1913 r = metadata_io.operate(frag_oid.name, &op);
1914 if (r == -EOVERFLOW || r == -EEXIST) {
1915 // Someone else wrote it (see case A above)
1916 dout(10) << "Dirfrag creation race: 0x" << std::hex
1917 << ino << " " << fragment << std::dec << dendl;
1918 *created = false;
1919 return 0;
1920 } else if (r < 0) {
1921 // We were unable to create or write it, error out
1922 derr << "Failed to create dirfrag 0x" << std::hex
1923 << ino << std::dec << ": " << cpp_strerror(r) << dendl;
1924 return r;
1925 } else {
1926 // Success: the dirfrag object now exists with a value header
1927 dout(10) << "Created dirfrag: 0x" << std::hex
1928 << ino << std::dec << dendl;
1929 *created = true;
1930 }
1931 } else if (r < 0) {
1932 derr << "Unexpected error reading dirfrag 0x" << std::hex
1933 << ino << std::dec << " : " << cpp_strerror(r) << dendl;
1934 return r;
1935 } else {
1936 dout(20) << "Dirfrag already exists: 0x" << std::hex
1937 << ino << " " << fragment << std::dec << dendl;
1938 }
1939
1940 return 0;
1941 }
1942
1943 int MetadataDriver::inject_linkage(
1944 inodeno_t dir_ino, const std::string &dname,
1945 const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
1946 {
1947 object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
1948
1949 std::string key;
1950 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1951 dn_key.encode(key);
1952
1953 bufferlist dentry_bl;
1954 encode(dnfirst, dentry_bl);
1955 encode('I', dentry_bl);
1956 inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1957
1958 // Write out
1959 std::map<std::string, bufferlist> vals;
1960 vals[key] = dentry_bl;
1961 int r = metadata_io.omap_set(frag_oid.name, vals);
1962 if (r != 0) {
1963 derr << "Error writing dentry 0x" << std::hex
1964 << dir_ino << std::dec << "/"
1965 << dname << ": " << cpp_strerror(r) << dendl;
1966 return r;
1967 } else {
1968 dout(20) << "Injected dentry 0x" << std::hex
1969 << dir_ino << "/" << dname << " pointing to 0x"
1970 << inode.inode.ino << std::dec << dendl;
1971 return 0;
1972 }
1973 }
1974
1975
1976 int MetadataDriver::init(
1977 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
1978 fs_cluster_id_t fscid)
1979 {
1980 if (metadata_pool_name.empty()) {
1981 auto fs = fsmap->get_filesystem(fscid);
1982 ceph_assert(fs != nullptr);
1983 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
1984
1985 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
1986 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
1987 if (r < 0) {
1988 derr << "Pool " << metadata_pool_id
1989 << " identified in MDS map not found in RADOS!" << dendl;
1990 return r;
1991 }
1992 dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
1993 } else {
1994 dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
1995 }
1996 return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
1997 }
1998
1999 int LocalFileDriver::init(
2000 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
2001 fs_cluster_id_t fscid)
2002 {
2003 return 0;
2004 }
2005
2006 int LocalFileDriver::inject_data(
2007 const std::string &file_path,
2008 uint64_t size,
2009 uint32_t chunk_size,
2010 inodeno_t ino)
2011 {
2012 // Scrape the file contents out of the data pool and into the
2013 // local filesystem
2014 std::fstream f;
2015 f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
2016
2017 for (uint64_t offset = 0; offset < size; offset += chunk_size) {
2018 bufferlist bl;
2019
2020 char buf[32];
2021 snprintf(buf, sizeof(buf),
2022 "%llx.%08llx",
2023 (unsigned long long)ino,
2024 (unsigned long long)(offset / chunk_size));
2025 std::string oid(buf);
2026
2027 int r = data_io.read(oid, bl, chunk_size, 0);
2028
2029 if (r <= 0 && r != -ENOENT) {
2030 derr << "error reading data object '" << oid << "': "
2031 << cpp_strerror(r) << dendl;
2032 f.close();
2033 return r;
2034 } else if (r >=0) {
2035
2036 f.seekp(offset);
2037 bl.write_stream(f);
2038 }
2039 }
2040 f.close();
2041
2042 return 0;
2043 }
2044
2045
2046 int LocalFileDriver::inject_with_backtrace(
2047 const inode_backtrace_t &bt,
2048 const InodeStore &dentry)
2049 {
2050 std::string path_builder = path;
2051
2052 // Iterate through backtrace creating directory parents
2053 std::vector<inode_backpointer_t>::const_reverse_iterator i;
2054 for (i = bt.ancestors.rbegin();
2055 i != bt.ancestors.rend(); ++i) {
2056
2057 const inode_backpointer_t &backptr = *i;
2058 path_builder += "/";
2059 path_builder += backptr.dname;
2060
2061 // Last entry is the filename itself
2062 bool is_file = (i + 1 == bt.ancestors.rend());
2063 if (is_file) {
2064 // FIXME: inject_data won't cope with interesting (i.e. striped)
2065 // layouts (need a librados-compatible Filer to read these)
2066 inject_data(path_builder, dentry.inode.size,
2067 dentry.inode.layout.object_size, bt.ino);
2068 } else {
2069 int r = mkdir(path_builder.c_str(), 0755);
2070 if (r != 0 && r != -EPERM) {
2071 derr << "error creating directory: '" << path_builder << "': "
2072 << cpp_strerror(r) << dendl;
2073 return r;
2074 }
2075 }
2076 }
2077
2078 return 0;
2079 }
2080
2081 int LocalFileDriver::inject_lost_and_found(
2082 inodeno_t ino,
2083 const InodeStore &dentry)
2084 {
2085 std::string lf_path = path + "/lost+found";
2086 int r = mkdir(lf_path.c_str(), 0755);
2087 if (r != 0 && r != -EPERM) {
2088 derr << "error creating directory: '" << lf_path << "': "
2089 << cpp_strerror(r) << dendl;
2090 return r;
2091 }
2092
2093 std::string file_path = lf_path + "/" + lost_found_dname(ino);
2094 return inject_data(file_path, dentry.inode.size,
2095 dentry.inode.layout.object_size, ino);
2096 }
2097
2098 int LocalFileDriver::init_roots(int64_t data_pool_id)
2099 {
2100 // Ensure that the path exists and is a directory
2101 bool exists;
2102 int r = check_roots(&exists);
2103 if (r != 0) {
2104 return r;
2105 }
2106
2107 if (exists) {
2108 return 0;
2109 } else {
2110 return ::mkdir(path.c_str(), 0755);
2111 }
2112 }
2113
2114 int LocalFileDriver::check_roots(bool *result)
2115 {
2116 // Check if the path exists and is a directory
2117 DIR *d = ::opendir(path.c_str());
2118 if (d == NULL) {
2119 *result = false;
2120 } else {
2121 int r = closedir(d);
2122 if (r != 0) {
2123 // Weird, but maybe possible with e.g. stale FD on NFS mount?
2124 *result = false;
2125 } else {
2126 *result = true;
2127 }
2128 }
2129
2130 return 0;
2131 }
2132
2133 void MetadataTool::build_file_dentry(
2134 inodeno_t ino, uint64_t file_size, time_t file_mtime,
2135 const file_layout_t &layout, InodeStore *out)
2136 {
2137 ceph_assert(out != NULL);
2138
2139 out->inode.mode = 0500 | S_IFREG;
2140 out->inode.size = file_size;
2141 out->inode.max_size_ever = file_size;
2142 out->inode.mtime.tv.tv_sec = file_mtime;
2143 out->inode.atime.tv.tv_sec = file_mtime;
2144 out->inode.ctime.tv.tv_sec = file_mtime;
2145
2146 out->inode.layout = layout;
2147
2148 out->inode.truncate_seq = 1;
2149 out->inode.truncate_size = -1ull;
2150
2151 out->inode.inline_data.version = CEPH_INLINE_NONE;
2152
2153 out->inode.nlink = 1;
2154 out->inode.ino = ino;
2155 out->inode.version = 1;
2156 out->inode.backtrace_version = 1;
2157 out->inode.uid = g_conf()->mds_root_ino_uid;
2158 out->inode.gid = g_conf()->mds_root_ino_gid;
2159 }
2160
2161 void MetadataTool::build_dir_dentry(
2162 inodeno_t ino, const frag_info_t &fragstat,
2163 const file_layout_t &layout, InodeStore *out)
2164 {
2165 ceph_assert(out != NULL);
2166
2167 out->inode.mode = 0755 | S_IFDIR;
2168 out->inode.dirstat = fragstat;
2169 out->inode.mtime.tv.tv_sec = fragstat.mtime;
2170 out->inode.atime.tv.tv_sec = fragstat.mtime;
2171 out->inode.ctime.tv.tv_sec = fragstat.mtime;
2172
2173 out->inode.layout = layout;
2174 out->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
2175
2176 out->inode.truncate_seq = 1;
2177 out->inode.truncate_size = -1ull;
2178
2179 out->inode.inline_data.version = CEPH_INLINE_NONE;
2180
2181 out->inode.nlink = 1;
2182 out->inode.ino = ino;
2183 out->inode.version = 1;
2184 out->inode.backtrace_version = 1;
2185 out->inode.uid = g_conf()->mds_root_ino_uid;
2186 out->inode.gid = g_conf()->mds_root_ino_gid;
2187 }
2188