]> git.proxmox.com Git - ceph.git/blob - ceph/src/tools/cephfs/DataScan.cc
import ceph 15.2.14
[ceph.git] / ceph / src / tools / cephfs / DataScan.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/compat.h"
16 #include "common/errno.h"
17 #include "common/ceph_argparse.h"
18 #include <fstream>
19 #include "include/util.h"
20 #include "include/ceph_fs.h"
21
22 #include "mds/CInode.h"
23 #include "mds/InoTable.h"
24 #include "mds/SnapServer.h"
25 #include "cls/cephfs/cls_cephfs_client.h"
26
27 #include "PgFiles.h"
28 #include "DataScan.h"
29 #include "include/compat.h"
30
31 #define dout_context g_ceph_context
32 #define dout_subsys ceph_subsys_mds
33 #undef dout_prefix
34 #define dout_prefix *_dout << "datascan." << __func__ << ": "
35
36 void DataScan::usage()
37 {
38 std::cout << "Usage: \n"
39 << " cephfs-data-scan init [--force-init]\n"
40 << " cephfs-data-scan scan_extents [--force-pool] [--worker_n N --worker_m M] <data pool name>\n"
41 << " cephfs-data-scan scan_inodes [--force-pool] [--force-corrupt] [--worker_n N --worker_m M] <data pool name>\n"
42 << " cephfs-data-scan pg_files <path> <pg id> [<pg id>...]\n"
43 << " cephfs-data-scan scan_links\n"
44 << "\n"
45 << " --force-corrupt: overrite apparently corrupt structures\n"
46 << " --force-init: write root inodes even if they exist\n"
47 << " --force-pool: use data pool even if it is not in FSMap\n"
48 << " --worker_m: Maximum number of workers\n"
49 << " --worker_n: Worker number, range 0-(worker_m-1)\n"
50 << "\n"
51 << " cephfs-data-scan scan_frags [--force-corrupt]\n"
52 << " cephfs-data-scan cleanup <data pool name>\n"
53 << std::endl;
54
55 generic_client_usage();
56 }
57
58 bool DataScan::parse_kwarg(
59 const std::vector<const char*> &args,
60 std::vector<const char *>::const_iterator &i,
61 int *r)
62 {
63 if (i + 1 == args.end()) {
64 return false;
65 }
66
67 const std::string arg(*i);
68 const std::string val(*(i + 1));
69
70 if (arg == std::string("--output-dir")) {
71 if (driver != NULL) {
72 derr << "Unexpected --output-dir: output already selected!" << dendl;
73 *r = -EINVAL;
74 return false;
75 }
76 dout(4) << "Using local file output to '" << val << "'" << dendl;
77 driver = new LocalFileDriver(val, data_io);
78 return true;
79 } else if (arg == std::string("--worker_n")) {
80 std::string err;
81 n = strict_strtoll(val.c_str(), 10, &err);
82 if (!err.empty()) {
83 std::cerr << "Invalid worker number '" << val << "'" << std::endl;
84 *r = -EINVAL;
85 return false;
86 }
87 return true;
88 } else if (arg == std::string("--worker_m")) {
89 std::string err;
90 m = strict_strtoll(val.c_str(), 10, &err);
91 if (!err.empty()) {
92 std::cerr << "Invalid worker count '" << val << "'" << std::endl;
93 *r = -EINVAL;
94 return false;
95 }
96 return true;
97 } else if (arg == std::string("--filter-tag")) {
98 filter_tag = val;
99 dout(10) << "Applying tag filter: '" << filter_tag << "'" << dendl;
100 return true;
101 } else if (arg == std::string("--filesystem")) {
102 std::shared_ptr<const Filesystem> fs;
103 *r = fsmap->parse_filesystem(val, &fs);
104 if (*r != 0) {
105 std::cerr << "Invalid filesystem '" << val << "'" << std::endl;
106 return false;
107 }
108 fscid = fs->fscid;
109 return true;
110 } else if (arg == std::string("--alternate-pool")) {
111 metadata_pool_name = val;
112 return true;
113 } else {
114 return false;
115 }
116 }
117
118 bool DataScan::parse_arg(
119 const std::vector<const char*> &args,
120 std::vector<const char *>::const_iterator &i)
121 {
122 const std::string arg(*i);
123 if (arg == "--force-pool") {
124 force_pool = true;
125 return true;
126 } else if (arg == "--force-corrupt") {
127 force_corrupt = true;
128 return true;
129 } else if (arg == "--force-init") {
130 force_init = true;
131 return true;
132 } else {
133 return false;
134 }
135 }
136
// Entry point of the tool.
//
// args[0] is the sub-command name; the remaining entries are flags,
// "--key value" pairs and positional arguments. The function parses them,
// connects to RADOS, initialises the output driver and dispatches to the
// requested sub-command.
//
// Returns the sub-command's result, or a negative errno when argument
// parsing or any of the setup steps fails.
137 int DataScan::main(const std::vector<const char*> &args)
138 {
139 // Parse args
140 // ==========
141 if (args.size() < 1) {
142 cerr << "missing position argument" << std::endl;
143 return -EINVAL;
144 }
145
146 // Common RADOS init: open metadata pool
147 // =====================================
148 librados::Rados rados;
149 int r = rados.init_with_context(g_ceph_context);
150 if (r < 0) {
151 derr << "RADOS unavailable" << dendl;
152 return r;
153 }
154
// args[0] is a const char*, so this reference binds to a temporary
// std::string whose lifetime is extended to the end of the function.
155 std::string const &command = args[0];
156 std::string data_pool_name;
157
158 std::string pg_files_path;
159 std::set<pg_t> pg_files_pgs;
160
161 // Consume any known --key val or --flag arguments
162 for (std::vector<const char *>::const_iterator i = args.begin() + 1;
163 i != args.end(); ++i) {
// parse_kwarg() signals a rejected value by returning false AND leaving
// a non-zero code in r; a false return with r == 0 just means "not a
// key/value option", so the other argument forms are tried below.
164 if (parse_kwarg(args, i, &r)) {
165 // Skip the kwarg value field
166 ++i;
167 continue;
168 } else if (r) {
169 return r;
170 }
171
172 if (parse_arg(args, i)) {
173 continue;
174 }
175
176 // Trailing positional argument
// Only the very last argument is accepted as the data pool name, and only
// for the commands that operate on a data pool.
177 if (i + 1 == args.end() &&
178 (command == "scan_inodes"
179 || command == "scan_extents"
180 || command == "cleanup")) {
181 data_pool_name = *i;
182 continue;
183 }
184
// pg_files: the argument directly after the command is the path; every
// later positional argument must parse as a PG id.
185 if (command == "pg_files") {
186 if (i == args.begin() + 1) {
187 pg_files_path = *i;
188 continue;
189 } else {
190 pg_t pg;
191 bool parsed = pg.parse(*i);
192 if (!parsed) {
193 std::cerr << "Invalid PG '" << *i << "'" << std::endl;
194 return -EINVAL;
195 } else {
196 pg_files_pgs.insert(pg);
197 continue;
198 }
199 }
200
201 }
202
203 // Fall through: unhandled
204 std::cerr << "Unknown argument '" << *i << "'" << std::endl;
205 return -EINVAL;
206 }
207
208 // If caller didn't specify a namespace, try to pick
209 // one if only one exists
210 if (fscid == FS_CLUSTER_ID_NONE) {
211 if (fsmap->filesystem_count() == 1) {
212 fscid = fsmap->get_filesystem()->fscid;
213 } else {
214 std::cerr << "Specify a filesystem with --filesystem" << std::endl;
215 return -EINVAL;
216 }
217 }
218 auto fs = fsmap->get_filesystem(fscid);
219 ceph_assert(fs != nullptr);
220
221 // Default to output to metadata pool
// force_corrupt / force_init are only forwarded to the driver created
// here; a driver selected earlier via --output-dir is left as it is.
222 if (driver == NULL) {
223 driver = new MetadataDriver();
224 driver->set_force_corrupt(force_corrupt);
225 driver->set_force_init(force_init);
226 dout(4) << "Using metadata pool output" << dendl;
227 }
228
229 dout(4) << "connecting to RADOS..." << dendl;
230 r = rados.connect();
231 if (r < 0) {
232 std::cerr << "couldn't connect to cluster: " << cpp_strerror(r)
233 << std::endl;
234 return r;
235 }
236
237 r = driver->init(rados, metadata_pool_name, fsmap, fscid);
238 if (r < 0) {
239 return r;
240 }
241
// pg_files is handled here and returns before any data/metadata IoCtx
// is opened below.
242 if (command == "pg_files") {
243 auto pge = PgFiles(objecter, pg_files_pgs);
244 pge.init();
245 return pge.scan_path(pg_files_path);
246 }
247
248 // Initialize data_io for those commands that need it
249 if (command == "scan_inodes" ||
250 command == "scan_extents" ||
251 command == "cleanup") {
252 if (data_pool_name.empty()) {
253 std::cerr << "Data pool not specified" << std::endl;
254 return -EINVAL;
255 }
256
// Any negative lookup result is reported as -ENOENT to the caller.
257 data_pool_id = rados.pool_lookup(data_pool_name.c_str());
258 if (data_pool_id < 0) {
259 std::cerr << "Data pool '" << data_pool_name << "' not found!" << std::endl;
260 return -ENOENT;
261 } else {
262 dout(4) << "data pool '" << data_pool_name
263 << "' has ID " << data_pool_id << dendl;
264 }
265
// A pool that is not listed in the filesystem's MDS map is only used
// when --force-pool was given; the warning is printed either way.
266 if (!fs->mds_map.is_data_pool(data_pool_id)) {
267 std::cerr << "Warning: pool '" << data_pool_name << "' is not a "
268 "CephFS data pool!" << std::endl;
269 if (!force_pool) {
270 std::cerr << "Use --force-pool to continue" << std::endl;
271 return -EINVAL;
272 }
273 }
274
275 dout(4) << "opening data pool '" << data_pool_name << "'" << dendl;
276 r = rados.ioctx_create(data_pool_name.c_str(), data_io);
277 if (r != 0) {
278 return r;
279 }
280 }
281
282 // Initialize metadata_io from MDSMap for scan_frags
// (the condition below applies the same setup to scan_links as well)
283 if (command == "scan_frags" || command == "scan_links") {
// NOTE: this `fs` shadows the function-level `fs` that was already
// asserted non-null above.
284 const auto fs = fsmap->get_filesystem(fscid);
285 if (fs == nullptr) {
286 std::cerr << "Filesystem id " << fscid << " does not exist" << std::endl;
287 return -ENOENT;
288 }
289 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
290
291 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
// NOTE: this `r` shadows the function-level `r`; it is only used for the
// early returns inside this block. The lookup overwrites
// metadata_pool_name with the pool named in the MDS map.
292 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
293 if (r < 0) {
294 std::cerr << "Pool " << metadata_pool_id
295 << " identified in MDS map not found in RADOS!" << std::endl;
296 return r;
297 }
298
299 r = rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
300 if (r != 0) {
301 return r;
302 }
303
304 data_pools = fs->mds_map.get_data_pools();
305 }
306
307 // Finally, dispatch command
308 if (command == "scan_inodes") {
309 return scan_inodes();
310 } else if (command == "scan_extents") {
311 return scan_extents();
312 } else if (command == "scan_frags") {
313 return scan_frags();
314 } else if (command == "scan_links") {
315 return scan_links();
316 } else if (command == "cleanup") {
317 return cleanup();
318 } else if (command == "init") {
// Root inodes are created with the filesystem's first data pool as
// their layout pool.
319 return driver->init_roots(fs->mds_map.get_first_data_pool());
320 } else {
321 std::cerr << "Unknown command '" << command << "'" << std::endl;
322 return -EINVAL;
323 }
324 }
325
326 int MetadataDriver::inject_unlinked_inode(
327 inodeno_t inono, int mode, int64_t data_pool_id)
328 {
329 const object_t oid = InodeStore::get_object_name(inono, frag_t(), ".inode");
330
331 // Skip if exists
332 bool already_exists = false;
333 int r = root_exists(inono, &already_exists);
334 if (r) {
335 return r;
336 }
337 if (already_exists && !force_init) {
338 std::cerr << "Inode 0x" << std::hex << inono << std::dec << " already"
339 " exists, skipping create. Use --force-init to overwrite"
340 " the existing object." << std::endl;
341 return 0;
342 }
343
344 // Compose
345 InodeStore inode;
346 inode.inode.ino = inono;
347 inode.inode.version = 1;
348 inode.inode.xattr_version = 1;
349 inode.inode.mode = 0500 | mode;
350 // Fake dirstat.nfiles to 1, so that the directory doesn't appear to be empty
351 // (we won't actually give the *correct* dirstat here though)
352 inode.inode.dirstat.nfiles = 1;
353
354 inode.inode.ctime =
355 inode.inode.mtime = ceph_clock_now();
356 inode.inode.nlink = 1;
357 inode.inode.truncate_size = -1ull;
358 inode.inode.truncate_seq = 1;
359 inode.inode.uid = g_conf()->mds_root_ino_uid;
360 inode.inode.gid = g_conf()->mds_root_ino_gid;
361
362 // Force layout to default: should we let users override this so that
363 // they don't have to mount the filesystem to correct it?
364 inode.inode.layout = file_layout_t::get_default();
365 inode.inode.layout.pool_id = data_pool_id;
366 inode.inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
367
368 // Assume that we will get our stats wrong, and that we may
369 // be ignoring dirfrags that exist
370 inode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS | DAMAGE_FRAGTREE);
371
372 if (inono == CEPH_INO_ROOT || MDS_INO_IS_MDSDIR(inono)) {
373 sr_t srnode;
374 srnode.seq = 1;
375 encode(srnode, inode.snap_blob);
376 }
377
378 // Serialize
379 bufferlist inode_bl;
380 encode(std::string(CEPH_FS_ONDISK_MAGIC), inode_bl);
381 inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
382
383 // Write
384 r = metadata_io.write_full(oid.name, inode_bl);
385 if (r != 0) {
386 derr << "Error writing '" << oid.name << "': " << cpp_strerror(r) << dendl;
387 return r;
388 }
389
390 return r;
391 }
392
393 int MetadataDriver::root_exists(inodeno_t ino, bool *result)
394 {
395 object_t oid = InodeStore::get_object_name(ino, frag_t(), ".inode");
396 uint64_t size;
397 time_t mtime;
398 int r = metadata_io.stat(oid.name, &size, &mtime);
399 if (r == -ENOENT) {
400 *result = false;
401 return 0;
402 } else if (r < 0) {
403 return r;
404 }
405
406 *result = true;
407 return 0;
408 }
409
410 int MetadataDriver::init_roots(int64_t data_pool_id)
411 {
412 int r = 0;
413 r = inject_unlinked_inode(CEPH_INO_ROOT, S_IFDIR|0755, data_pool_id);
414 if (r != 0) {
415 return r;
416 }
417 r = inject_unlinked_inode(MDS_INO_MDSDIR(0), S_IFDIR, data_pool_id);
418 if (r != 0) {
419 return r;
420 }
421 bool created = false;
422 r = find_or_create_dirfrag(MDS_INO_MDSDIR(0), frag_t(), &created);
423 if (r != 0) {
424 return r;
425 }
426
427 return 0;
428 }
429
430 int MetadataDriver::check_roots(bool *result)
431 {
432 int r;
433 r = root_exists(CEPH_INO_ROOT, result);
434 if (r != 0) {
435 return r;
436 }
437 if (!*result) {
438 return 0;
439 }
440
441 r = root_exists(MDS_INO_MDSDIR(0), result);
442 if (r != 0) {
443 return r;
444 }
445 if (!*result) {
446 return 0;
447 }
448
449 return 0;
450 }
451
452 /**
453 * Stages:
454 *
455 * SERIAL init
456 * 0. Create root inodes if they don't exist
457 * PARALLEL scan_extents
458 * 1. Size and mtime recovery: scan ALL objects, and update 0th
459 * objects with max size and max mtime seen.
460 * PARALLEL scan_inodes
461 * 2. Inode recovery: scan ONLY 0th objects, and inject metadata
462 * into dirfrag OMAPs, creating blank dirfrags as needed. No stats
463 * or rstats at this stage. Inodes without backtraces go into
464 * lost+found
465 * TODO: SERIAL "recover stats"
466 * 3. Dirfrag statistics: depth first traverse into metadata tree,
467 * rebuilding dir sizes.
468 * TODO PARALLEL "clean up"
469 * 4. Cleanup; go over all 0th objects (and dirfrags if we tagged
470 * anything onto them) and remove any of the xattrs that we
471 * used for accumulating.
472 */
473
474
475 int parse_oid(const std::string &oid, uint64_t *inode_no, uint64_t *obj_id)
476 {
477 if (oid.find(".") == std::string::npos || oid.find(".") == oid.size() - 1) {
478 return -EINVAL;
479 }
480
481 std::string err;
482 std::string inode_str = oid.substr(0, oid.find("."));
483 *inode_no = strict_strtoll(inode_str.c_str(), 16, &err);
484 if (!err.empty()) {
485 return -EINVAL;
486 }
487
488 std::string pos_string = oid.substr(oid.find(".") + 1);
489 *obj_id = strict_strtoll(pos_string.c_str(), 16, &err);
490 if (!err.empty()) {
491 return -EINVAL;
492 }
493
494 return 0;
495 }
496
497
498 int DataScan::scan_extents()
499 {
500 return forall_objects(data_io, false, [this](
501 std::string const &oid,
502 uint64_t obj_name_ino,
503 uint64_t obj_name_offset) -> int
504 {
505 // Read size
506 uint64_t size;
507 time_t mtime;
508 int r = data_io.stat(oid, &size, &mtime);
509 dout(10) << "handling object " << obj_name_ino
510 << "." << obj_name_offset << dendl;
511 if (r != 0) {
512 dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
513 return r;
514 }
515
516 // I need to keep track of
517 // * The highest object ID seen
518 // * The size of the highest object ID seen
519 // * The largest object seen
520 //
521 // Given those things, I can later infer the object chunking
522 // size, the offset of the last object (chunk size * highest ID seen)
523 // and the actual size (offset of last object + size of highest ID seen)
524 //
525 // This logic doesn't take account of striping.
526 r = ClsCephFSClient::accumulate_inode_metadata(
527 data_io,
528 obj_name_ino,
529 obj_name_offset,
530 size,
531 mtime);
532 if (r < 0) {
533 derr << "Failed to accumulate metadata data from '"
534 << oid << "': " << cpp_strerror(r) << dendl;
535 return r;
536 }
537
538 return r;
539 });
540 }
541
542 int DataScan::probe_filter(librados::IoCtx &ioctx)
543 {
544 bufferlist filter_bl;
545 ClsCephFSClient::build_tag_filter("test", &filter_bl);
546 librados::ObjectCursor range_i;
547 librados::ObjectCursor range_end;
548
549 std::vector<librados::ObjectItem> tmp_result;
550 librados::ObjectCursor tmp_next;
551 int r = ioctx.object_list(ioctx.object_list_begin(), ioctx.object_list_end(),
552 1, filter_bl, &tmp_result, &tmp_next);
553
554 return r >= 0;
555 }
556
557 int DataScan::forall_objects(
558 librados::IoCtx &ioctx,
559 bool untagged_only,
560 std::function<int(std::string, uint64_t, uint64_t)> handler
561 )
562 {
563 librados::ObjectCursor range_i;
564 librados::ObjectCursor range_end;
565 ioctx.object_list_slice(
566 ioctx.object_list_begin(),
567 ioctx.object_list_end(),
568 n,
569 m,
570 &range_i,
571 &range_end);
572
573
574 bufferlist filter_bl;
575
576 bool legacy_filtering = false;
577 if (untagged_only) {
578 // probe to deal with older OSDs that don't support
579 // the cephfs pgls filtering mode
580 legacy_filtering = !probe_filter(ioctx);
581 if (!legacy_filtering) {
582 ClsCephFSClient::build_tag_filter(filter_tag, &filter_bl);
583 }
584 }
585
586 int r = 0;
587 while(range_i < range_end) {
588 std::vector<librados::ObjectItem> result;
589 int r = ioctx.object_list(range_i, range_end, 1,
590 filter_bl, &result, &range_i);
591 if (r < 0) {
592 derr << "Unexpected error listing objects: " << cpp_strerror(r) << dendl;
593 return r;
594 }
595
596 for (const auto &i : result) {
597 const std::string &oid = i.oid;
598 uint64_t obj_name_ino = 0;
599 uint64_t obj_name_offset = 0;
600 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
601 if (r != 0) {
602 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
603 continue;
604 }
605
606 if (untagged_only && legacy_filtering) {
607 dout(20) << "Applying filter to " << oid << dendl;
608
609 // We are only interested in 0th objects during this phase: we touched
610 // the other objects during scan_extents
611 if (obj_name_offset != 0) {
612 dout(20) << "Non-zeroth object" << dendl;
613 continue;
614 }
615
616 bufferlist scrub_tag_bl;
617 int r = ioctx.getxattr(oid, "scrub_tag", scrub_tag_bl);
618 if (r >= 0) {
619 std::string read_tag;
620 auto q = scrub_tag_bl.cbegin();
621 try {
622 decode(read_tag, q);
623 if (read_tag == filter_tag) {
624 dout(20) << "skipping " << oid << " because it has the filter_tag"
625 << dendl;
626 continue;
627 }
628 } catch (const buffer::error &err) {
629 }
630 dout(20) << "read non-matching tag '" << read_tag << "'" << dendl;
631 } else {
632 dout(20) << "no tag read (" << r << ")" << dendl;
633 }
634
635 } else if (untagged_only) {
636 ceph_assert(obj_name_offset == 0);
637 dout(20) << "OSD matched oid " << oid << dendl;
638 }
639
640 int this_oid_r = handler(oid, obj_name_ino, obj_name_offset);
641 if (r == 0 && this_oid_r < 0) {
642 r = this_oid_r;
643 }
644 }
645 }
646
647 return r;
648 }
649
// Inode recovery pass.
//
// After checking that the root inodes exist, visits the objects handed
// out by forall_objects() with untagged_only == true. For each one it
// loads the accumulated metadata, backtrace and layout, works out a file
// size and layout, builds a dentry with build_file_dentry() and injects
// it through the driver: by backtrace when a usable one exists, into
// lost+found otherwise.
//
// Returns the error from check_roots(), -EIO when the roots are missing,
// or otherwise whatever forall_objects() returns.
650 int DataScan::scan_inodes()
651 {
652 bool roots_present;
653 int r = driver->check_roots(&roots_present);
654 if (r != 0) {
655 derr << "Unexpected error checking roots: '"
656 << cpp_strerror(r) << "'" << dendl;
657 return r;
658 }
659
660 if (!roots_present) {
661 std::cerr << "Some or all system inodes are absent. Run 'init' from "
662 "one node before running 'scan_inodes'" << std::endl;
663 return -EIO;
664 }
665
666 return forall_objects(data_io, true, [this](
667 std::string const &oid,
668 uint64_t obj_name_ino,
669 uint64_t obj_name_offset) -> int
670 {
671 int r = 0;
672
673 dout(10) << "handling object "
674 << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
675 << dendl;
676
677 AccumulateResult accum_res;
678 inode_backtrace_t backtrace;
679 file_layout_t loaded_layout = file_layout_t::get_default();
680 r = ClsCephFSClient::fetch_inode_accumulate_result(
681 data_io, oid, &backtrace, &loaded_layout, &accum_res);
682
// Both failure branches give up on this object by returning r.
683 if (r == -EINVAL) {
684 dout(4) << "Accumulated metadata missing from '"
685 << oid << ", did you run scan_extents?" << dendl;
686 return r;
687 } else if (r < 0) {
688 dout(4) << "Unexpected error loading accumulated metadata from '"
689 << oid << "': " << cpp_strerror(r) << dendl;
690 // FIXME: this creates situation where if a client has a corrupt
691 // backtrace/layout, we will fail to inject it. We should (optionally)
692 // proceed if the backtrace/layout is corrupt but we have valid
693 // accumulated metadata.
694 return r;
695 }
696
697 const time_t file_mtime = accum_res.max_mtime;
698 uint64_t file_size = 0;
// An empty ancestor list is treated as "no backtrace".
699 bool have_backtrace = !(backtrace.ancestors.empty());
700
701 // This is the layout we will use for injection, populated either
702 // from loaded_layout or from best guesses
703 file_layout_t guessed_layout;
704 guessed_layout.pool_id = data_pool_id;
705
706 // Calculate file_size, guess the layout
// ceiling_obj_index is presumably the highest object index recorded by
// scan_extents (TODO confirm against AccumulateResult); a value above 0
// selects the multi-object path.
707 if (accum_res.ceiling_obj_index > 0) {
708 uint32_t chunk_size = file_layout_t::get_default().object_size;
709 // When there are multiple objects, the largest object probably
710 // indicates the chunk size. But not necessarily, because files
711 // can be sparse. Only make this assumption if size seen
712 // is a power of two, as chunk sizes typically are.
// NOTE(review): x & (x - 1) is also 0 for x == 0, so a max_obj_size of 0
// passes this test and yields chunk_size == 0.
713 if ((accum_res.max_obj_size & (accum_res.max_obj_size - 1)) == 0) {
714 chunk_size = accum_res.max_obj_size;
715 }
716
717 if (loaded_layout.pool_id == -1) {
718 // If no stashed layout was found, guess it
719 guessed_layout.object_size = chunk_size;
720 guessed_layout.stripe_unit = chunk_size;
721 guessed_layout.stripe_count = 1;
722 } else if (!loaded_layout.is_valid() ||
723 loaded_layout.object_size < accum_res.max_obj_size) {
724 // If the max size seen exceeds what the stashed layout claims, then
725 // disbelieve it. Guess instead. Same for invalid layouts on disk.
726 dout(4) << "bogus xattr layout on 0x" << std::hex << obj_name_ino
727 << std::dec << ", ignoring in favour of best guess" << dendl;
728 guessed_layout.object_size = chunk_size;
729 guessed_layout.stripe_unit = chunk_size;
730 guessed_layout.stripe_count = 1;
731 } else {
732 // We have a stashed layout that we can't disprove, so apply it
733 guessed_layout = loaded_layout;
734 dout(20) << "loaded layout from xattr:"
735 << " os: " << guessed_layout.object_size
736 << " sc: " << guessed_layout.stripe_count
737 << " su: " << guessed_layout.stripe_unit
738 << dendl;
739 // User might have transplanted files from a pool with a different
740 // ID, so whatever the loaded_layout says, we'll force the injected
741 // layout to point to the pool we really read from
742 guessed_layout.pool_id = data_pool_id;
743 }
744
745 if (guessed_layout.stripe_count == 1) {
746 // Unstriped file: simple chunking
747 file_size = guessed_layout.object_size * accum_res.ceiling_obj_index
748 + accum_res.ceiling_obj_size;
749 } else {
750 // Striped file: need to examine the last stripe_count objects
751 // in the file to determine the size.
752
753 // How many complete (i.e. not last stripe) objects?
// Rounds ceiling_obj_index down to a multiple of stripe_count.
754 uint64_t complete_objs = 0;
755 if (accum_res.ceiling_obj_index > guessed_layout.stripe_count - 1) {
756 complete_objs = (accum_res.ceiling_obj_index / guessed_layout.stripe_count) * guessed_layout.stripe_count;
757 } else {
758 complete_objs = 0;
759 }
760
761 // How many potentially-short objects (i.e. last stripe set) objects?
762 uint64_t partial_objs = accum_res.ceiling_obj_index + 1 - complete_objs;
763
764 dout(10) << "calculating striped size from complete objs: "
765 << complete_objs << ", partial objs: " << partial_objs
766 << dendl;
767
768 // Maximum amount of data that may be in the incomplete objects
769 uint64_t incomplete_size = 0;
770
771 // For each short object, calculate the max file size within it
772 // and accumulate the maximum
773 for (uint64_t i = complete_objs; i < complete_objs + partial_objs; ++i) {
// Rebuild the object name as "<ino hex>.<index as 8 hex digits>" and
// stat that object directly.
774 char buf[60];
775 snprintf(buf, sizeof(buf), "%llx.%08llx",
776 (long long unsigned)obj_name_ino, (long long unsigned)i);
777
778 uint64_t osize(0);
779 time_t omtime(0);
780 r = data_io.stat(std::string(buf), &osize, &omtime);
781 if (r == 0) {
782 if (osize > 0) {
783 // Upper bound within this object
// Sum of three terms: the stripe rows before the object's last byte
// ((osize-1)/su rows of su*sc bytes each), the stripe units of the
// objects preceding this one in the row ((i % sc) * su), and the bytes
// used in the last stripe unit ((osize-1) % su + 1).
784 uint64_t upper_size = (osize - 1) / guessed_layout.stripe_unit
785 * (guessed_layout.stripe_unit * guessed_layout.stripe_count)
786 + (i % guessed_layout.stripe_count)
787 * guessed_layout.stripe_unit + (osize - 1)
788 % guessed_layout.stripe_unit + 1;
789 incomplete_size = std::max(incomplete_size, upper_size);
790 }
791 } else if (r == -ENOENT) {
792 // Absent object, treat as size 0 and ignore.
793 } else {
794 // Unexpected error, carry r to outer scope for handling.
795 break;
796 }
797 }
// r may still hold -ENOENT from the last stat here; that is tolerated,
// and r is reassigned by the inject call further down.
798 if (r != 0 && r != -ENOENT) {
799 derr << "Unexpected error checking size of ino 0x" << std::hex
800 << obj_name_ino << std::dec << ": " << cpp_strerror(r) << dendl;
801 return r;
802 }
803 file_size = complete_objs * guessed_layout.object_size
804 + incomplete_size;
805 }
806 } else {
// ceiling_obj_index is 0 here, so the size is taken directly from
// ceiling_obj_size.
807 file_size = accum_res.ceiling_obj_size;
808 if (loaded_layout.pool_id < 0
809 || loaded_layout.object_size < accum_res.max_obj_size) {
810 // No layout loaded, or inconsistent layout, use default
811 guessed_layout = file_layout_t::get_default();
812 guessed_layout.pool_id = data_pool_id;
813 } else {
// NOTE(review): unlike the multi-object branch above, the loaded layout
// is used here without an is_valid() check and without forcing pool_id
// to data_pool_id.
814 guessed_layout = loaded_layout;
815 }
816 }
817
818 // Sanity checking backtrace ino against object name
819 if (have_backtrace && backtrace.ino != obj_name_ino) {
820 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
821 << " doesn't match object name ino 0x" << obj_name_ino
822 << std::dec << dendl;
823 have_backtrace = false;
824 }
825
826 InodeStore dentry;
827 build_file_dentry(obj_name_ino, file_size, file_mtime, guessed_layout, &dentry);
828
829 // Inject inode to the metadata pool
// Every branch below assigns r; injection failures are logged and then
// returned to forall_objects() via the final "return r".
830 if (have_backtrace) {
// The last entry of the ancestor list decides whether the backtrace
// leads into an MDS directory.
831 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
832 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
833 /* Special case for strays: even if we have a good backtrace,
834 * don't put it in the stray dir, because while that would technically
835 * give it linkage it would still be invisible to the user */
836 r = driver->inject_lost_and_found(obj_name_ino, dentry);
837 if (r < 0) {
838 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
839 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
840 if (r == -EINVAL) {
841 dout(4) << "Use --force-corrupt to overwrite structures that "
842 "appear to be corrupt" << dendl;
843 }
844 }
845 } else {
846 /* Happy case: we will inject a named dentry for this inode */
847 r = driver->inject_with_backtrace(backtrace, dentry);
848 if (r < 0) {
849 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
850 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
851 if (r == -EINVAL) {
852 dout(4) << "Use --force-corrupt to overwrite structures that "
853 "appear to be corrupt" << dendl;
854 }
855 }
856 }
857 } else {
858 /* Backtrace-less case: we will inject a lost+found dentry */
859 r = driver->inject_lost_and_found(
860 obj_name_ino, dentry);
861 if (r < 0) {
862 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
863 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
864 if (r == -EINVAL) {
865 dout(4) << "Use --force-corrupt to overwrite structures that "
866 "appear to be corrupt" << dendl;
867 }
868 }
869 }
870
871 return r;
872 });
873 }
874
875 int DataScan::cleanup()
876 {
877 // We are looking for only zeroth object
878 //
879 return forall_objects(data_io, true, [this](
880 std::string const &oid,
881 uint64_t obj_name_ino,
882 uint64_t obj_name_offset) -> int
883 {
884 int r = 0;
885 r = ClsCephFSClient::delete_inode_accumulate_result(data_io, oid);
886 if (r < 0) {
887 dout(4) << "Error deleting accumulated metadata from '"
888 << oid << "': " << cpp_strerror(r) << dendl;
889 }
890 return r;
891 });
892 }
893
894 bool DataScan::valid_ino(inodeno_t ino) const
895 {
896 return (ino >= inodeno_t((1ull << 40)))
897 || (MDS_INO_IS_STRAY(ino))
898 || (MDS_INO_IS_MDSDIR(ino))
899 || ino == CEPH_INO_ROOT
900 || ino == CEPH_INO_CEPH;
901 }
902
903 int DataScan::scan_links()
904 {
905 MetadataDriver *metadata_driver = dynamic_cast<MetadataDriver*>(driver);
906 if (!metadata_driver) {
907 derr << "Unexpected --output-dir option for scan_links" << dendl;
908 return -EINVAL;
909 }
910
911 interval_set<uint64_t> used_inos;
912 map<inodeno_t, int> remote_links;
913 map<snapid_t, SnapInfo> snaps;
914 snapid_t last_snap = 1;
915 snapid_t snaprealm_v2_since = 2;
916
917 struct link_info_t {
918 inodeno_t dirino;
919 frag_t frag;
920 string name;
921 version_t version;
922 int nlink;
923 bool is_dir;
924 map<snapid_t, SnapInfo> snaps;
925 link_info_t() : version(0), nlink(0), is_dir(false) {}
926 link_info_t(inodeno_t di, frag_t df, const string& n, const CInode::mempool_inode& i) :
927 dirino(di), frag(df), name(n),
928 version(i.version), nlink(i.nlink), is_dir(S_IFDIR & i.mode) {}
929 dirfrag_t dirfrag() const {
930 return dirfrag_t(dirino, frag);
931 }
932 };
933 map<inodeno_t, list<link_info_t> > dup_primaries;
934 map<inodeno_t, link_info_t> bad_nlink_inos;
935 map<inodeno_t, link_info_t> injected_inos;
936
937 map<dirfrag_t, set<string> > to_remove;
938
939 enum {
940 SCAN_INOS = 1,
941 CHECK_LINK,
942 };
943
944 for (int step = SCAN_INOS; step <= CHECK_LINK; step++) {
945 const librados::NObjectIterator it_end = metadata_io.nobjects_end();
946 for (auto it = metadata_io.nobjects_begin(); it != it_end; ++it) {
947 const std::string oid = it->get_oid();
948
949 uint64_t dir_ino = 0;
950 uint64_t frag_id = 0;
951 int r = parse_oid(oid, &dir_ino, &frag_id);
952 if (r == -EINVAL) {
953 dout(10) << "Not a dirfrag: '" << oid << "'" << dendl;
954 continue;
955 } else {
956 // parse_oid can only do 0 or -EINVAL
957 ceph_assert(r == 0);
958 }
959
960 if (!valid_ino(dir_ino)) {
961 dout(10) << "Not a dirfrag (invalid ino): '" << oid << "'" << dendl;
962 continue;
963 }
964
965 std::map<std::string, bufferlist> items;
966 r = metadata_io.omap_get_vals(oid, "", (uint64_t)-1, &items);
967 if (r < 0) {
968 derr << "Error getting omap from '" << oid << "': " << cpp_strerror(r) << dendl;
969 return r;
970 }
971
972 for (auto& p : items) {
973 auto q = p.second.cbegin();
974 string dname;
975 snapid_t last;
976 dentry_key_t::decode_helper(p.first, dname, last);
977
978 if (last != CEPH_NOSNAP) {
979 if (last > last_snap)
980 last_snap = last;
981 continue;
982 }
983
984 try {
985 snapid_t dnfirst;
986 decode(dnfirst, q);
987 if (dnfirst <= CEPH_MAXSNAP) {
988 if (dnfirst - 1 > last_snap)
989 last_snap = dnfirst - 1;
990 }
991 char dentry_type;
992 decode(dentry_type, q);
993 if (dentry_type == 'I') {
994 InodeStore inode;
995 inode.decode_bare(q);
996 inodeno_t ino = inode.inode.ino;
997
998 if (step == SCAN_INOS) {
999 if (used_inos.contains(ino, 1)) {
1000 dup_primaries[ino].size();
1001 } else {
1002 used_inos.insert(ino);
1003 }
1004 } else if (step == CHECK_LINK) {
1005 sr_t srnode;
1006 if (inode.snap_blob.length()) {
1007 auto p = inode.snap_blob.cbegin();
1008 decode(srnode, p);
1009 for (auto it = srnode.snaps.begin();
1010 it != srnode.snaps.end(); ) {
1011 if (it->second.ino != ino ||
1012 it->second.snapid != it->first) {
1013 srnode.snaps.erase(it++);
1014 } else {
1015 ++it;
1016 }
1017 }
1018 if (!srnode.past_parents.empty()) {
1019 snapid_t last = srnode.past_parents.rbegin()->first;
1020 if (last + 1 > snaprealm_v2_since)
1021 snaprealm_v2_since = last + 1;
1022 }
1023 }
1024 if (!inode.old_inodes.empty()) {
1025 if (inode.old_inodes.rbegin()->first > last_snap)
1026 last_snap = inode.old_inodes.rbegin()->first;
1027 }
1028 auto q = dup_primaries.find(ino);
1029 if (q != dup_primaries.end()) {
1030 q->second.push_back(link_info_t(dir_ino, frag_id, dname, inode.inode));
1031 q->second.back().snaps.swap(srnode.snaps);
1032 } else {
1033 int nlink = 0;
1034 auto r = remote_links.find(ino);
1035 if (r != remote_links.end())
1036 nlink = r->second;
1037 if (!MDS_INO_IS_STRAY(dir_ino))
1038 nlink++;
1039 if (inode.inode.nlink != nlink) {
1040 derr << "Bad nlink on " << ino << " expected " << nlink
1041 << " has " << inode.inode.nlink << dendl;
1042 bad_nlink_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1043 bad_nlink_inos[ino].nlink = nlink;
1044 }
1045 snaps.insert(make_move_iterator(begin(srnode.snaps)),
1046 make_move_iterator(end(srnode.snaps)));
1047 }
1048 if (dnfirst == CEPH_NOSNAP)
1049 injected_inos[ino] = link_info_t(dir_ino, frag_id, dname, inode.inode);
1050 }
1051 } else if (dentry_type == 'L') {
1052 inodeno_t ino;
1053 unsigned char d_type;
1054 decode(ino, q);
1055 decode(d_type, q);
1056
1057 if (step == SCAN_INOS) {
1058 remote_links[ino]++;
1059 } else if (step == CHECK_LINK) {
1060 if (!used_inos.contains(ino, 1)) {
1061 derr << "Bad remote link dentry 0x" << std::hex << dir_ino
1062 << std::dec << "/" << dname
1063 << ", ino " << ino << " not found" << dendl;
1064 std::string key;
1065 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1066 dn_key.encode(key);
1067 to_remove[dirfrag_t(dir_ino, frag_id)].insert(key);
1068 }
1069 }
1070 } else {
1071 derr << "Invalid tag char '" << dentry_type << "' dentry 0x" << dir_ino
1072 << std::dec << "/" << dname << dendl;
1073 return -EINVAL;
1074 }
1075 } catch (const buffer::error &err) {
1076 derr << "Error decoding dentry 0x" << std::hex << dir_ino
1077 << std::dec << "/" << dname << dendl;
1078 return -EINVAL;
1079 }
1080 }
1081 }
1082 }
1083
1084 map<unsigned, uint64_t> max_ino_map;
1085 {
1086 auto prev_max_ino = (uint64_t)1 << 40;
1087 for (auto p = used_inos.begin(); p != used_inos.end(); ++p) {
1088 auto cur_max = p.get_start() + p.get_len() - 1;
1089 if (cur_max < prev_max_ino)
1090 continue; // system inodes
1091
1092 if ((prev_max_ino >> 40) != (cur_max >> 40)) {
1093 unsigned rank = (prev_max_ino >> 40) - 1;
1094 max_ino_map[rank] = prev_max_ino;
1095 } else if ((p.get_start() >> 40) != (cur_max >> 40)) {
1096 unsigned rank = (p.get_start() >> 40) - 1;
1097 max_ino_map[rank] = ((uint64_t)(rank + 2) << 40) - 1;
1098 }
1099 prev_max_ino = cur_max;
1100 }
1101 unsigned rank = (prev_max_ino >> 40) - 1;
1102 max_ino_map[rank] = prev_max_ino;
1103 }
1104
1105 used_inos.clear();
1106
1107 for (auto& p : dup_primaries) {
1108 link_info_t newest;
1109 for (auto& q : p.second) {
1110 if (q.version > newest.version) {
1111 newest = q;
1112 } else if (q.version == newest.version &&
1113 !MDS_INO_IS_STRAY(q.dirino) &&
1114 MDS_INO_IS_STRAY(newest.dirino)) {
1115 newest = q;
1116 }
1117 }
1118
1119 for (auto& q : p.second) {
1120 // in the middle of dir fragmentation?
1121 if (newest.dirino == q.dirino && newest.name == q.name) {
1122 snaps.insert(make_move_iterator(begin(q.snaps)),
1123 make_move_iterator(end(q.snaps)));
1124 continue;
1125 }
1126
1127 std::string key;
1128 dentry_key_t dn_key(CEPH_NOSNAP, q.name.c_str());
1129 dn_key.encode(key);
1130 to_remove[q.dirfrag()].insert(key);
1131 derr << "Remove duplicated ino 0x" << p.first << " from "
1132 << q.dirfrag() << "/" << q.name << dendl;
1133 }
1134
1135 int nlink = 0;
1136 auto q = remote_links.find(p.first);
1137 if (q != remote_links.end())
1138 nlink = q->second;
1139 if (!MDS_INO_IS_STRAY(newest.dirino))
1140 nlink++;
1141
1142 if (nlink != newest.nlink) {
1143 derr << "Bad nlink on " << p.first << " expected " << nlink
1144 << " has " << newest.nlink << dendl;
1145 bad_nlink_inos[p.first] = newest;
1146 bad_nlink_inos[p.first].nlink = nlink;
1147 }
1148 }
1149 dup_primaries.clear();
1150 remote_links.clear();
1151
1152 {
1153 objecter->with_osdmap([&](const OSDMap& o) {
1154 for (auto p : data_pools) {
1155 const pg_pool_t *pi = o.get_pg_pool(p);
1156 if (!pi)
1157 continue;
1158 if (pi->snap_seq > last_snap)
1159 last_snap = pi->snap_seq;
1160 }
1161 });
1162
1163 if (!snaps.empty()) {
1164 if (snaps.rbegin()->first > last_snap)
1165 last_snap = snaps.rbegin()->first;
1166 }
1167 }
1168
1169 for (auto& p : to_remove) {
1170 object_t frag_oid = InodeStore::get_object_name(p.first.ino, p.first.frag, "");
1171
1172 int r = metadata_io.omap_rm_keys(frag_oid.name, p.second);
1173 if (r != 0) {
1174 derr << "Error removing duplicated dentries from " << p.first << dendl;
1175 return r;
1176 }
1177 }
1178 to_remove.clear();
1179
1180 for (auto &p : bad_nlink_inos) {
1181 InodeStore inode;
1182 snapid_t first;
1183 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
1184 if (r < 0) {
1185 derr << "Unexpected error reading dentry "
1186 << p.second.dirfrag() << "/" << p.second.name
1187 << ": " << cpp_strerror(r) << dendl;
1188 return r;
1189 }
1190
1191 if (inode.inode.ino != p.first || inode.inode.version != p.second.version)
1192 continue;
1193
1194 inode.inode.nlink = p.second.nlink;
1195 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
1196 if (r < 0)
1197 return r;
1198 }
1199
1200 for (auto &p : injected_inos) {
1201 InodeStore inode;
1202 snapid_t first;
1203 int r = read_dentry(p.second.dirino, p.second.frag, p.second.name, &inode, &first);
1204 if (r < 0) {
1205 derr << "Unexpected error reading dentry "
1206 << p.second.dirfrag() << "/" << p.second.name
1207 << ": " << cpp_strerror(r) << dendl;
1208 return r;
1209 }
1210
1211 if (first != CEPH_NOSNAP)
1212 continue;
1213
1214 first = last_snap + 1;
1215 r = metadata_driver->inject_linkage(p.second.dirino, p.second.name, p.second.frag, inode, first);
1216 if (r < 0)
1217 return r;
1218 }
1219
1220 for (auto& p : max_ino_map) {
1221 InoTable inotable(nullptr);
1222 inotable.set_rank(p.first);
1223 bool dirty = false;
1224 int r = metadata_driver->load_table(&inotable);
1225 if (r < 0) {
1226 inotable.reset_state();
1227 dirty = true;
1228 }
1229 if (inotable.force_consume_to(p.second))
1230 dirty = true;
1231 if (dirty) {
1232 r = metadata_driver->save_table(&inotable);
1233 if (r < 0)
1234 return r;
1235 }
1236 }
1237
1238 {
1239 SnapServer snaptable;
1240 snaptable.set_rank(0);
1241 bool dirty = false;
1242 int r = metadata_driver->load_table(&snaptable);
1243 if (r < 0) {
1244 snaptable.reset_state();
1245 dirty = true;
1246 }
1247 if (snaptable.force_update(last_snap, snaprealm_v2_since, snaps))
1248 dirty = true;
1249 if (dirty) {
1250 r = metadata_driver->save_table(&snaptable);
1251 if (r < 0)
1252 return r;
1253 }
1254 }
1255 return 0;
1256 }
1257
1258 int DataScan::scan_frags()
1259 {
1260 bool roots_present;
1261 int r = driver->check_roots(&roots_present);
1262 if (r != 0) {
1263 derr << "Unexpected error checking roots: '"
1264 << cpp_strerror(r) << "'" << dendl;
1265 return r;
1266 }
1267
1268 if (!roots_present) {
1269 std::cerr << "Some or all system inodes are absent. Run 'init' from "
1270 "one node before running 'scan_inodes'" << std::endl;
1271 return -EIO;
1272 }
1273
1274 return forall_objects(metadata_io, true, [this](
1275 std::string const &oid,
1276 uint64_t obj_name_ino,
1277 uint64_t obj_name_offset) -> int
1278 {
1279 int r = 0;
1280 r = parse_oid(oid, &obj_name_ino, &obj_name_offset);
1281 if (r != 0) {
1282 dout(4) << "Bad object name '" << oid << "', skipping" << dendl;
1283 return r;
1284 }
1285
1286 if (obj_name_ino < (1ULL << 40)) {
1287 // FIXME: we're skipping stray dirs here: if they're
1288 // orphaned then we should be resetting them some other
1289 // way
1290 dout(10) << "Skipping system ino " << obj_name_ino << dendl;
1291 return 0;
1292 }
1293
1294 AccumulateResult accum_res;
1295 inode_backtrace_t backtrace;
1296
1297 // Default to inherit layout (i.e. no explicit layout on dir) which is
1298 // expressed as a zeroed layout struct (see inode_t::has_layout)
1299 file_layout_t loaded_layout;
1300
1301 int parent_r = 0;
1302 bufferlist parent_bl;
1303 int layout_r = 0;
1304 bufferlist layout_bl;
1305 bufferlist op_bl;
1306
1307 librados::ObjectReadOperation op;
1308 op.getxattr("parent", &parent_bl, &parent_r);
1309 op.getxattr("layout", &layout_bl, &layout_r);
1310 r = metadata_io.operate(oid, &op, &op_bl);
1311 if (r != 0 && r != -ENODATA) {
1312 derr << "Unexpected error reading backtrace: " << cpp_strerror(parent_r) << dendl;
1313 return r;
1314 }
1315
1316 if (parent_r != -ENODATA) {
1317 try {
1318 auto q = parent_bl.cbegin();
1319 backtrace.decode(q);
1320 } catch (buffer::error &e) {
1321 dout(4) << "Corrupt backtrace on '" << oid << "': " << e << dendl;
1322 if (!force_corrupt) {
1323 return -EINVAL;
1324 } else {
1325 // Treat backtrace as absent: we'll inject into lost+found
1326 backtrace = inode_backtrace_t();
1327 }
1328 }
1329 }
1330
1331 if (layout_r != -ENODATA) {
1332 try {
1333 auto q = layout_bl.cbegin();
1334 decode(loaded_layout, q);
1335 } catch (buffer::error &e) {
1336 dout(4) << "Corrupt layout on '" << oid << "': " << e << dendl;
1337 if (!force_corrupt) {
1338 return -EINVAL;
1339 }
1340 }
1341 }
1342
1343 bool have_backtrace = !(backtrace.ancestors.empty());
1344
1345 // Santity checking backtrace ino against object name
1346 if (have_backtrace && backtrace.ino != obj_name_ino) {
1347 dout(4) << "Backtrace ino 0x" << std::hex << backtrace.ino
1348 << " doesn't match object name ino 0x" << obj_name_ino
1349 << std::dec << dendl;
1350 have_backtrace = false;
1351 }
1352
1353 uint64_t fnode_version = 0;
1354 fnode_t fnode;
1355 r = read_fnode(obj_name_ino, frag_t(), &fnode, &fnode_version);
1356 if (r == -EINVAL) {
1357 derr << "Corrupt fnode on " << oid << dendl;
1358 if (force_corrupt) {
1359 fnode.fragstat.mtime = 0;
1360 fnode.fragstat.nfiles = 1;
1361 fnode.fragstat.nsubdirs = 0;
1362 fnode.accounted_fragstat = fnode.fragstat;
1363 } else {
1364 return r;
1365 }
1366 }
1367
1368 InodeStore dentry;
1369 build_dir_dentry(obj_name_ino, fnode.accounted_fragstat,
1370 loaded_layout, &dentry);
1371
1372 // Inject inode to the metadata pool
1373 if (have_backtrace) {
1374 inode_backpointer_t root_bp = *(backtrace.ancestors.rbegin());
1375 if (MDS_INO_IS_MDSDIR(root_bp.dirino)) {
1376 /* Special case for strays: even if we have a good backtrace,
1377 * don't put it in the stray dir, because while that would technically
1378 * give it linkage it would still be invisible to the user */
1379 r = driver->inject_lost_and_found(obj_name_ino, dentry);
1380 if (r < 0) {
1381 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1382 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1383 if (r == -EINVAL) {
1384 dout(4) << "Use --force-corrupt to overwrite structures that "
1385 "appear to be corrupt" << dendl;
1386 }
1387 }
1388 } else {
1389 /* Happy case: we will inject a named dentry for this inode */
1390 r = driver->inject_with_backtrace(backtrace, dentry);
1391 if (r < 0) {
1392 dout(4) << "Error injecting 0x" << std::hex << backtrace.ino
1393 << std::dec << " with backtrace: " << cpp_strerror(r) << dendl;
1394 if (r == -EINVAL) {
1395 dout(4) << "Use --force-corrupt to overwrite structures that "
1396 "appear to be corrupt" << dendl;
1397 }
1398 }
1399 }
1400 } else {
1401 /* Backtrace-less case: we will inject a lost+found dentry */
1402 r = driver->inject_lost_and_found(
1403 obj_name_ino, dentry);
1404 if (r < 0) {
1405 dout(4) << "Error injecting 0x" << std::hex << obj_name_ino
1406 << std::dec << " into lost+found: " << cpp_strerror(r) << dendl;
1407 if (r == -EINVAL) {
1408 dout(4) << "Use --force-corrupt to overwrite structures that "
1409 "appear to be corrupt" << dendl;
1410 }
1411 }
1412 }
1413
1414 return r;
1415 });
1416 }
1417
1418 int MetadataTool::read_fnode(
1419 inodeno_t ino, frag_t frag, fnode_t *fnode,
1420 uint64_t *last_version)
1421 {
1422 ceph_assert(fnode != NULL);
1423
1424 object_t frag_oid = InodeStore::get_object_name(ino, frag, "");
1425 bufferlist fnode_bl;
1426 int r = metadata_io.omap_get_header(frag_oid.name, &fnode_bl);
1427 *last_version = metadata_io.get_last_version();
1428 if (r < 0) {
1429 return r;
1430 }
1431
1432 auto old_fnode_iter = fnode_bl.cbegin();
1433 try {
1434 (*fnode).decode(old_fnode_iter);
1435 } catch (const buffer::error &err) {
1436 return -EINVAL;
1437 }
1438
1439 return 0;
1440 }
1441
1442 int MetadataTool::read_dentry(inodeno_t parent_ino, frag_t frag,
1443 const std::string &dname, InodeStore *inode, snapid_t *dnfirst)
1444 {
1445 ceph_assert(inode != NULL);
1446
1447 std::string key;
1448 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1449 dn_key.encode(key);
1450
1451 std::set<std::string> keys;
1452 keys.insert(key);
1453 std::map<std::string, bufferlist> vals;
1454 object_t frag_oid = InodeStore::get_object_name(parent_ino, frag, "");
1455 int r = metadata_io.omap_get_vals_by_keys(frag_oid.name, keys, &vals);
1456 dout(20) << "oid=" << frag_oid.name
1457 << " dname=" << dname
1458 << " frag=" << frag
1459 << ", r=" << r << dendl;
1460 if (r < 0) {
1461 return r;
1462 }
1463
1464 if (vals.find(key) == vals.end()) {
1465 dout(20) << key << " not found in result" << dendl;
1466 return -ENOENT;
1467 }
1468
1469 try {
1470 auto q = vals[key].cbegin();
1471 snapid_t first;
1472 decode(first, q);
1473 char dentry_type;
1474 decode(dentry_type, q);
1475 if (dentry_type == 'I') {
1476 inode->decode_bare(q);
1477 } else {
1478 dout(20) << "dentry type '" << dentry_type << "': cannot"
1479 "read an inode out of that" << dendl;
1480 return -EINVAL;
1481 }
1482 if (dnfirst)
1483 *dnfirst = first;
1484 } catch (const buffer::error &err) {
1485 dout(20) << "encoding error in dentry 0x" << std::hex << parent_ino
1486 << std::dec << "/" << dname << dendl;
1487 return -EINVAL;
1488 }
1489
1490 return 0;
1491 }
1492
1493 int MetadataDriver::load_table(MDSTable *table)
1494 {
1495 object_t table_oid = table->get_object_name();
1496
1497 bufferlist table_bl;
1498 int r = metadata_io.read(table_oid.name, table_bl, 0, 0);
1499 if (r < 0) {
1500 derr << "unable to read mds table '" << table_oid.name << "': "
1501 << cpp_strerror(r) << dendl;
1502 return r;
1503 }
1504
1505 try {
1506 version_t table_ver;
1507 auto p = table_bl.cbegin();
1508 decode(table_ver, p);
1509 table->decode_state(p);
1510 table->force_replay_version(table_ver);
1511 } catch (const buffer::error &err) {
1512 derr << "unable to decode mds table '" << table_oid.name << "': "
1513 << err.what() << dendl;
1514 return -EIO;
1515 }
1516 return 0;
1517 }
1518
1519 int MetadataDriver::save_table(MDSTable *table)
1520 {
1521 object_t table_oid = table->get_object_name();
1522
1523 bufferlist table_bl;
1524 encode(table->get_version(), table_bl);
1525 table->encode_state(table_bl);
1526 int r = metadata_io.write_full(table_oid.name, table_bl);
1527 if (r != 0) {
1528 derr << "error updating mds table " << table_oid.name
1529 << ": " << cpp_strerror(r) << dendl;
1530 return r;
1531 }
1532 return 0;
1533 }
1534
1535 int MetadataDriver::inject_lost_and_found(
1536 inodeno_t ino, const InodeStore &dentry)
1537 {
1538 // Create lost+found if doesn't exist
1539 bool created = false;
1540 int r = find_or_create_dirfrag(CEPH_INO_ROOT, frag_t(), &created);
1541 if (r < 0) {
1542 return r;
1543 }
1544 InodeStore lf_ino;
1545 r = read_dentry(CEPH_INO_ROOT, frag_t(), "lost+found", &lf_ino);
1546 if (r == -ENOENT || r == -EINVAL) {
1547 if (r == -EINVAL && !force_corrupt) {
1548 return r;
1549 }
1550
1551 // To have a directory not specify a layout, give it zeros (see
1552 // inode_t::has_layout)
1553 file_layout_t inherit_layout;
1554
1555 // Construct LF inode
1556 frag_info_t fragstat;
1557 fragstat.nfiles = 1,
1558 build_dir_dentry(CEPH_INO_LOST_AND_FOUND, fragstat, inherit_layout, &lf_ino);
1559
1560 // Inject link to LF inode in the root dir
1561 r = inject_linkage(CEPH_INO_ROOT, "lost+found", frag_t(), lf_ino);
1562 if (r < 0) {
1563 return r;
1564 }
1565 } else {
1566 if (!(lf_ino.inode.mode & S_IFDIR)) {
1567 derr << "lost+found exists but is not a directory!" << dendl;
1568 // In this case we error out, and the user should do something about
1569 // this problem.
1570 return -EINVAL;
1571 }
1572 }
1573
1574 r = find_or_create_dirfrag(CEPH_INO_LOST_AND_FOUND, frag_t(), &created);
1575 if (r < 0) {
1576 return r;
1577 }
1578
1579 InodeStore recovered_ino;
1580
1581
1582 const std::string dname = lost_found_dname(ino);
1583
1584 // Write dentry into lost+found dirfrag
1585 return inject_linkage(lf_ino.inode.ino, dname, frag_t(), dentry);
1586 }
1587
1588
1589 int MetadataDriver::get_frag_of(
1590 inodeno_t dirino,
1591 const std::string &target_dname,
1592 frag_t *result_ft)
1593 {
1594 object_t root_frag_oid = InodeStore::get_object_name(dirino, frag_t(), "");
1595
1596 dout(20) << "dirino=" << dirino << " target_dname=" << target_dname << dendl;
1597
1598 // Find and load fragtree if existing dirfrag
1599 // ==========================================
1600 bool have_backtrace = false;
1601 bufferlist parent_bl;
1602 int r = metadata_io.getxattr(root_frag_oid.name, "parent", parent_bl);
1603 if (r == -ENODATA) {
1604 dout(10) << "No backtrace on '" << root_frag_oid << "'" << dendl;
1605 } else if (r < 0) {
1606 dout(4) << "Unexpected error on '" << root_frag_oid << "': "
1607 << cpp_strerror(r) << dendl;
1608 return r;
1609 }
1610
1611 // Deserialize backtrace
1612 inode_backtrace_t backtrace;
1613 if (parent_bl.length()) {
1614 try {
1615 auto q = parent_bl.cbegin();
1616 backtrace.decode(q);
1617 have_backtrace = true;
1618 } catch (buffer::error &e) {
1619 dout(4) << "Corrupt backtrace on '" << root_frag_oid << "': " << e << dendl;
1620 }
1621 }
1622
1623 if (!(have_backtrace && backtrace.ancestors.size())) {
1624 // Can't work out fragtree without a backtrace
1625 dout(4) << "No backtrace on '" << root_frag_oid
1626 << "': cannot determine fragtree" << dendl;
1627 return -ENOENT;
1628 }
1629
1630 // The parentage of dirino
1631 const inode_backpointer_t &bp = *(backtrace.ancestors.begin());
1632
1633 // The inode of dirino's parent
1634 const inodeno_t parent_ino = bp.dirino;
1635
1636 // The dname of dirino in its parent.
1637 const std::string &parent_dname = bp.dname;
1638
1639 dout(20) << "got backtrace parent " << parent_ino << "/"
1640 << parent_dname << dendl;
1641
1642 // The primary dentry for dirino
1643 InodeStore existing_dentry;
1644
1645 // See if we can find ourselves in dirfrag zero of the parent: this
1646 // is a fast path that avoids needing to go further up the tree
1647 // if the parent isn't fragmented (worst case we would have to
1648 // go all the way to the root)
1649 r = read_dentry(parent_ino, frag_t(), parent_dname, &existing_dentry);
1650 if (r >= 0) {
1651 // Great, fast path: return the fragtree from here
1652 if (existing_dentry.inode.ino != dirino) {
1653 dout(4) << "Unexpected inode in dentry! 0x" << std::hex
1654 << existing_dentry.inode.ino
1655 << " vs expected 0x" << dirino << std::dec << dendl;
1656 return -ENOENT;
1657 }
1658 dout(20) << "fast path, fragtree is "
1659 << existing_dentry.dirfragtree << dendl;
1660 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1661 dout(20) << "frag is " << *result_ft << dendl;
1662 return 0;
1663 } else if (r != -ENOENT) {
1664 // Dentry not present in 0th frag, must read parent's fragtree
1665 frag_t parent_frag;
1666 r = get_frag_of(parent_ino, parent_dname, &parent_frag);
1667 if (r == 0) {
1668 // We have the parent fragtree, so try again to load our dentry
1669 r = read_dentry(parent_ino, parent_frag, parent_dname, &existing_dentry);
1670 if (r >= 0) {
1671 // Got it!
1672 *result_ft = existing_dentry.pick_dirfrag(target_dname);
1673 dout(20) << "resolved via parent, frag is " << *result_ft << dendl;
1674 return 0;
1675 } else {
1676 if (r == -EINVAL || r == -ENOENT) {
1677 return -ENOENT; // dentry missing or corrupt, so frag is missing
1678 } else {
1679 return r;
1680 }
1681 }
1682 } else {
1683 // Couldn't resolve parent fragtree, so can't find ours.
1684 return r;
1685 }
1686 } else if (r == -EINVAL) {
1687 // Unreadable dentry, can't know the fragtree.
1688 return -ENOENT;
1689 } else {
1690 // Unexpected error, raise it
1691 return r;
1692 }
1693 }
1694
1695
1696 int MetadataDriver::inject_with_backtrace(
1697 const inode_backtrace_t &backtrace, const InodeStore &dentry)
1698
1699 {
1700
1701 // On dirfrags
1702 // ===========
1703 // In order to insert something into a directory, we first (ideally)
1704 // need to know the fragtree for the directory. Sometimes we can't
1705 // get that, in which case we just go ahead and insert it into
1706 // fragment zero for a good chance of that being the right thing
1707 // anyway (most moderate-sized dirs aren't fragmented!)
1708
1709 // On ancestry
1710 // ===========
1711 // My immediate ancestry should be correct, so if we can find that
1712 // directory's dirfrag then go inject it there. This works well
1713 // in the case that this inode's dentry was somehow lost and we
1714 // are recreating it, because the rest of the hierarchy
1715 // will probably still exist.
1716 //
1717 // It's more of a "better than nothing" approach when rebuilding
1718 // a whole tree, as backtraces will in general not be up to date
1719 // beyond the first parent, if anything in the trace was ever
1720 // moved after the file was created.
1721
1722 // On inode numbers
1723 // ================
1724 // The backtrace tells us inodes for each of the parents. If we are
1725 // creating those parent dirfrags, then there is a risk that somehow
1726 // the inode indicated here was also used for data (not a dirfrag) at
1727 // some stage. That would be a zany situation, and we don't check
1728 // for it here, because to do so would require extra IOs for everything
1729 // we inject, and anyway wouldn't guarantee that the inode number
1730 // wasn't in use in some dentry elsewhere in the metadata tree that
1731 // just happened not to have any data objects.
1732
1733 // On multiple workers touching the same traces
1734 // ============================================
1735 // When creating linkage for a directory, *only* create it if we are
1736 // also creating the object. That way, we might not manage to get the
1737 // *right* linkage for a directory, but at least we won't multiply link
1738 // it. We assume that if a root dirfrag exists for a directory, then
1739 // it is linked somewhere (i.e. that the metadata pool is not already
1740 // inconsistent).
1741 //
1742 // Making sure *that* is true is someone else's job! Probably someone
1743 // who is not going to run in parallel, so that they can self-consistently
1744 // look at versions and move things around as they go.
1745 // Note this isn't 100% safe: if we die immediately after creating dirfrag
1746 // object, next run will fail to create linkage for the dirfrag object
1747 // and leave it orphaned.
1748
1749 inodeno_t ino = backtrace.ino;
1750 dout(10) << " inode: 0x" << std::hex << ino << std::dec << dendl;
1751 for (std::vector<inode_backpointer_t>::const_iterator i = backtrace.ancestors.begin();
1752 i != backtrace.ancestors.end(); ++i) {
1753 const inode_backpointer_t &backptr = *i;
1754 dout(10) << " backptr: 0x" << std::hex << backptr.dirino << std::dec
1755 << "/" << backptr.dname << dendl;
1756
1757 // Examine root dirfrag for parent
1758 const inodeno_t parent_ino = backptr.dirino;
1759 const std::string dname = backptr.dname;
1760
1761 frag_t fragment;
1762 int r = get_frag_of(parent_ino, dname, &fragment);
1763 if (r == -ENOENT) {
1764 // Don't know fragment, fall back to assuming root
1765 dout(20) << "don't know fragment for 0x" << std::hex <<
1766 parent_ino << std::dec << "/" << dname << ", will insert to root"
1767 << dendl;
1768 }
1769
1770 // Find or create dirfrag
1771 // ======================
1772 bool created_dirfrag;
1773 r = find_or_create_dirfrag(parent_ino, fragment, &created_dirfrag);
1774 if (r < 0) {
1775 return r;
1776 }
1777
1778 // Check if dentry already exists
1779 // ==============================
1780 InodeStore existing_dentry;
1781 r = read_dentry(parent_ino, fragment, dname, &existing_dentry);
1782 bool write_dentry = false;
1783 if (r == -ENOENT || r == -EINVAL) {
1784 if (r == -EINVAL && !force_corrupt) {
1785 return r;
1786 }
1787 // Missing or corrupt dentry
1788 write_dentry = true;
1789 } else if (r < 0) {
1790 derr << "Unexpected error reading dentry 0x" << std::hex
1791 << parent_ino << std::dec << "/"
1792 << dname << ": " << cpp_strerror(r) << dendl;
1793 break;
1794 } else {
1795 // Dentry already present, does it link to me?
1796 if (existing_dentry.inode.ino == ino) {
1797 dout(20) << "Dentry 0x" << std::hex
1798 << parent_ino << std::dec << "/"
1799 << dname << " already exists and points to me" << dendl;
1800 } else {
1801 derr << "Dentry 0x" << std::hex
1802 << parent_ino << std::dec << "/"
1803 << dname << " already exists but points to 0x"
1804 << std::hex << existing_dentry.inode.ino << std::dec << dendl;
1805 // Fall back to lost+found!
1806 return inject_lost_and_found(backtrace.ino, dentry);
1807 }
1808 }
1809
1810 // Inject linkage
1811 // ==============
1812
1813 if (write_dentry) {
1814 if (i == backtrace.ancestors.begin()) {
1815 // This is the linkage for the file of interest
1816 dout(10) << "Linking inode 0x" << std::hex << ino
1817 << " at 0x" << parent_ino << "/" << dname << std::dec
1818 << " with size=" << dentry.inode.size << " bytes" << dendl;
1819
1820 r = inject_linkage(parent_ino, dname, fragment, dentry);
1821 } else {
1822 // This is the linkage for an ancestor directory
1823 InodeStore ancestor_dentry;
1824 ancestor_dentry.inode.mode = 0755 | S_IFDIR;
1825
1826 // Set nfiles to something non-zero, to fool any other code
1827 // that tries to ignore 'empty' directories. This won't be
1828 // accurate, but it should avoid functional issues.
1829
1830 ancestor_dentry.inode.dirstat.nfiles = 1;
1831 ancestor_dentry.inode.dir_layout.dl_dir_hash =
1832 g_conf()->mds_default_dir_hash;
1833
1834 ancestor_dentry.inode.nlink = 1;
1835 ancestor_dentry.inode.ino = ino;
1836 ancestor_dentry.inode.uid = g_conf()->mds_root_ino_uid;
1837 ancestor_dentry.inode.gid = g_conf()->mds_root_ino_gid;
1838 ancestor_dentry.inode.version = 1;
1839 ancestor_dentry.inode.backtrace_version = 1;
1840 r = inject_linkage(parent_ino, dname, fragment, ancestor_dentry);
1841 }
1842
1843 if (r < 0) {
1844 return r;
1845 }
1846 }
1847
1848 if (!created_dirfrag) {
1849 // If the parent dirfrag already existed, then stop traversing the
1850 // backtrace: assume that the other ancestors already exist too. This
1851 // is an assumption rather than a truth, but it's a convenient way
1852 // to avoid the risk of creating multiply-linked directories while
1853 // injecting data. If there are in fact missing ancestors, this
1854 // should be fixed up using a separate tool scanning the metadata
1855 // pool.
1856 break;
1857 } else {
1858 // Proceed up the backtrace, creating parents
1859 ino = parent_ino;
1860 }
1861 }
1862
1863 return 0;
1864 }
1865
1866 int MetadataDriver::find_or_create_dirfrag(
1867 inodeno_t ino,
1868 frag_t fragment,
1869 bool *created)
1870 {
1871 ceph_assert(created != NULL);
1872
1873 fnode_t existing_fnode;
1874 *created = false;
1875
1876 uint64_t read_version = 0;
1877 int r = read_fnode(ino, fragment, &existing_fnode, &read_version);
1878 dout(10) << "read_version = " << read_version << dendl;
1879
1880 if (r == -ENOENT || r == -EINVAL) {
1881 if (r == -EINVAL && !force_corrupt) {
1882 return r;
1883 }
1884
1885 // Missing or corrupt fnode, create afresh
1886 bufferlist fnode_bl;
1887 fnode_t blank_fnode;
1888 blank_fnode.version = 1;
1889 // mark it as non-empty
1890 blank_fnode.fragstat.nfiles = 1;
1891 blank_fnode.accounted_fragstat = blank_fnode.fragstat;
1892 blank_fnode.damage_flags |= (DAMAGE_STATS | DAMAGE_RSTATS);
1893 blank_fnode.encode(fnode_bl);
1894
1895
1896 librados::ObjectWriteOperation op;
1897
1898 if (read_version) {
1899 ceph_assert(r == -EINVAL);
1900 // Case A: We must assert that the version isn't changed since we saw the object
1901 // was unreadable, to avoid the possibility of two data-scan processes
1902 // both creating the frag.
1903 op.assert_version(read_version);
1904 } else {
1905 ceph_assert(r == -ENOENT);
1906 // Case B: The object didn't exist in read_fnode, so while creating it we must
1907 // use an exclusive create to correctly populate *creating with
1908 // whether we created it ourselves or someone beat us to it.
1909 op.create(true);
1910 }
1911
1912 object_t frag_oid = InodeStore::get_object_name(ino, fragment, "");
1913 op.omap_set_header(fnode_bl);
1914 r = metadata_io.operate(frag_oid.name, &op);
1915 if (r == -EOVERFLOW || r == -EEXIST) {
1916 // Someone else wrote it (see case A above)
1917 dout(10) << "Dirfrag creation race: 0x" << std::hex
1918 << ino << " " << fragment << std::dec << dendl;
1919 *created = false;
1920 return 0;
1921 } else if (r < 0) {
1922 // We were unable to create or write it, error out
1923 derr << "Failed to create dirfrag 0x" << std::hex
1924 << ino << std::dec << ": " << cpp_strerror(r) << dendl;
1925 return r;
1926 } else {
1927 // Success: the dirfrag object now exists with a value header
1928 dout(10) << "Created dirfrag: 0x" << std::hex
1929 << ino << std::dec << dendl;
1930 *created = true;
1931 }
1932 } else if (r < 0) {
1933 derr << "Unexpected error reading dirfrag 0x" << std::hex
1934 << ino << std::dec << " : " << cpp_strerror(r) << dendl;
1935 return r;
1936 } else {
1937 dout(20) << "Dirfrag already exists: 0x" << std::hex
1938 << ino << " " << fragment << std::dec << dendl;
1939 }
1940
1941 return 0;
1942 }
1943
1944 int MetadataDriver::inject_linkage(
1945 inodeno_t dir_ino, const std::string &dname,
1946 const frag_t fragment, const InodeStore &inode, const snapid_t dnfirst)
1947 {
1948 object_t frag_oid = InodeStore::get_object_name(dir_ino, fragment, "");
1949
1950 std::string key;
1951 dentry_key_t dn_key(CEPH_NOSNAP, dname.c_str());
1952 dn_key.encode(key);
1953
1954 bufferlist dentry_bl;
1955 encode(dnfirst, dentry_bl);
1956 encode('I', dentry_bl);
1957 inode.encode_bare(dentry_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
1958
1959 // Write out
1960 std::map<std::string, bufferlist> vals;
1961 vals[key] = dentry_bl;
1962 int r = metadata_io.omap_set(frag_oid.name, vals);
1963 if (r != 0) {
1964 derr << "Error writing dentry 0x" << std::hex
1965 << dir_ino << std::dec << "/"
1966 << dname << ": " << cpp_strerror(r) << dendl;
1967 return r;
1968 } else {
1969 dout(20) << "Injected dentry 0x" << std::hex
1970 << dir_ino << "/" << dname << " pointing to 0x"
1971 << inode.inode.ino << std::dec << dendl;
1972 return 0;
1973 }
1974 }
1975
1976
1977 int MetadataDriver::init(
1978 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
1979 fs_cluster_id_t fscid)
1980 {
1981 if (metadata_pool_name.empty()) {
1982 auto fs = fsmap->get_filesystem(fscid);
1983 ceph_assert(fs != nullptr);
1984 int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
1985
1986 dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
1987 int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
1988 if (r < 0) {
1989 derr << "Pool " << metadata_pool_id
1990 << " identified in MDS map not found in RADOS!" << dendl;
1991 return r;
1992 }
1993 dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
1994 } else {
1995 dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
1996 }
1997 return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
1998 }
1999
2000 int LocalFileDriver::init(
2001 librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
2002 fs_cluster_id_t fscid)
2003 {
2004 return 0;
2005 }
2006
2007 int LocalFileDriver::inject_data(
2008 const std::string &file_path,
2009 uint64_t size,
2010 uint32_t chunk_size,
2011 inodeno_t ino)
2012 {
2013 // Scrape the file contents out of the data pool and into the
2014 // local filesystem
2015 std::fstream f;
2016 f.open(file_path.c_str(), std::fstream::out | std::fstream::binary);
2017
2018 for (uint64_t offset = 0; offset < size; offset += chunk_size) {
2019 bufferlist bl;
2020
2021 char buf[32];
2022 snprintf(buf, sizeof(buf),
2023 "%llx.%08llx",
2024 (unsigned long long)ino,
2025 (unsigned long long)(offset / chunk_size));
2026 std::string oid(buf);
2027
2028 int r = data_io.read(oid, bl, chunk_size, 0);
2029
2030 if (r <= 0 && r != -ENOENT) {
2031 derr << "error reading data object '" << oid << "': "
2032 << cpp_strerror(r) << dendl;
2033 f.close();
2034 return r;
2035 } else if (r >=0) {
2036
2037 f.seekp(offset);
2038 bl.write_stream(f);
2039 }
2040 }
2041 f.close();
2042
2043 return 0;
2044 }
2045
2046
2047 int LocalFileDriver::inject_with_backtrace(
2048 const inode_backtrace_t &bt,
2049 const InodeStore &dentry)
2050 {
2051 std::string path_builder = path;
2052
2053 // Iterate through backtrace creating directory parents
2054 std::vector<inode_backpointer_t>::const_reverse_iterator i;
2055 for (i = bt.ancestors.rbegin();
2056 i != bt.ancestors.rend(); ++i) {
2057
2058 const inode_backpointer_t &backptr = *i;
2059 path_builder += "/";
2060 path_builder += backptr.dname;
2061
2062 // Last entry is the filename itself
2063 bool is_file = (i + 1 == bt.ancestors.rend());
2064 if (is_file) {
2065 // FIXME: inject_data won't cope with interesting (i.e. striped)
2066 // layouts (need a librados-compatible Filer to read these)
2067 inject_data(path_builder, dentry.inode.size,
2068 dentry.inode.layout.object_size, bt.ino);
2069 } else {
2070 int r = mkdir(path_builder.c_str(), 0755);
2071 if (r != 0 && r != -EPERM) {
2072 derr << "error creating directory: '" << path_builder << "': "
2073 << cpp_strerror(r) << dendl;
2074 return r;
2075 }
2076 }
2077 }
2078
2079 return 0;
2080 }
2081
2082 int LocalFileDriver::inject_lost_and_found(
2083 inodeno_t ino,
2084 const InodeStore &dentry)
2085 {
2086 std::string lf_path = path + "/lost+found";
2087 int r = mkdir(lf_path.c_str(), 0755);
2088 if (r != 0 && r != -EPERM) {
2089 derr << "error creating directory: '" << lf_path << "': "
2090 << cpp_strerror(r) << dendl;
2091 return r;
2092 }
2093
2094 std::string file_path = lf_path + "/" + lost_found_dname(ino);
2095 return inject_data(file_path, dentry.inode.size,
2096 dentry.inode.layout.object_size, ino);
2097 }
2098
2099 int LocalFileDriver::init_roots(int64_t data_pool_id)
2100 {
2101 // Ensure that the path exists and is a directory
2102 bool exists;
2103 int r = check_roots(&exists);
2104 if (r != 0) {
2105 return r;
2106 }
2107
2108 if (exists) {
2109 return 0;
2110 } else {
2111 return ::mkdir(path.c_str(), 0755);
2112 }
2113 }
2114
2115 int LocalFileDriver::check_roots(bool *result)
2116 {
2117 // Check if the path exists and is a directory
2118 DIR *d = ::opendir(path.c_str());
2119 if (d == NULL) {
2120 *result = false;
2121 } else {
2122 int r = closedir(d);
2123 if (r != 0) {
2124 // Weird, but maybe possible with e.g. stale FD on NFS mount?
2125 *result = false;
2126 } else {
2127 *result = true;
2128 }
2129 }
2130
2131 return 0;
2132 }
2133
2134 void MetadataTool::build_file_dentry(
2135 inodeno_t ino, uint64_t file_size, time_t file_mtime,
2136 const file_layout_t &layout, InodeStore *out)
2137 {
2138 ceph_assert(out != NULL);
2139
2140 out->inode.mode = 0500 | S_IFREG;
2141 out->inode.size = file_size;
2142 out->inode.max_size_ever = file_size;
2143 out->inode.mtime.tv.tv_sec = file_mtime;
2144 out->inode.atime.tv.tv_sec = file_mtime;
2145 out->inode.ctime.tv.tv_sec = file_mtime;
2146
2147 out->inode.layout = layout;
2148
2149 out->inode.truncate_seq = 1;
2150 out->inode.truncate_size = -1ull;
2151
2152 out->inode.inline_data.version = CEPH_INLINE_NONE;
2153
2154 out->inode.nlink = 1;
2155 out->inode.ino = ino;
2156 out->inode.version = 1;
2157 out->inode.backtrace_version = 1;
2158 out->inode.uid = g_conf()->mds_root_ino_uid;
2159 out->inode.gid = g_conf()->mds_root_ino_gid;
2160 }
2161
2162 void MetadataTool::build_dir_dentry(
2163 inodeno_t ino, const frag_info_t &fragstat,
2164 const file_layout_t &layout, InodeStore *out)
2165 {
2166 ceph_assert(out != NULL);
2167
2168 out->inode.mode = 0755 | S_IFDIR;
2169 out->inode.dirstat = fragstat;
2170 out->inode.mtime.tv.tv_sec = fragstat.mtime;
2171 out->inode.atime.tv.tv_sec = fragstat.mtime;
2172 out->inode.ctime.tv.tv_sec = fragstat.mtime;
2173
2174 out->inode.layout = layout;
2175 out->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
2176
2177 out->inode.truncate_seq = 1;
2178 out->inode.truncate_size = -1ull;
2179
2180 out->inode.inline_data.version = CEPH_INLINE_NONE;
2181
2182 out->inode.nlink = 1;
2183 out->inode.ino = ino;
2184 out->inode.version = 1;
2185 out->inode.backtrace_version = 1;
2186 out->inode.uid = g_conf()->mds_root_ino_uid;
2187 out->inode.gid = g_conf()->mds_root_ino_gid;
2188 }
2189