// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
17 #include "ScrubStack.h"
18 #include "common/Finisher.h"
19 #include "mds/MDSRank.h"
20 #include "mds/MDCache.h"
21 #include "mds/MDSContinuation.h"
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_mds
26 #define dout_prefix _prefix(_dout, scrubstack->mdcache->mds)
27 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
28 return *_dout
<< "mds." << mds
->get_nodeid() << ".scrubstack ";
31 void ScrubStack::push_inode(CInode
*in
)
33 dout(20) << "pushing " << *in
<< " on top of ScrubStack" << dendl
;
34 if (!in
->item_scrub
.is_on_list()) {
35 in
->get(CInode::PIN_SCRUBQUEUE
);
38 inode_stack
.push_front(&in
->item_scrub
);
41 void ScrubStack::push_inode_bottom(CInode
*in
)
43 dout(20) << "pushing " << *in
<< " on bottom of ScrubStack" << dendl
;
44 if (!in
->item_scrub
.is_on_list()) {
45 in
->get(CInode::PIN_SCRUBQUEUE
);
48 inode_stack
.push_back(&in
->item_scrub
);
51 void ScrubStack::pop_inode(CInode
*in
)
53 dout(20) << "popping " << *in
54 << " off of ScrubStack" << dendl
;
55 assert(in
->item_scrub
.is_on_list());
56 in
->put(CInode::PIN_SCRUBQUEUE
);
57 in
->item_scrub
.remove_myself();
61 void ScrubStack::_enqueue_inode(CInode
*in
, CDentry
*parent
,
62 ScrubHeaderRef
& header
,
63 MDSInternalContextBase
*on_finish
, bool top
)
65 dout(10) << __func__
<< " with {" << *in
<< "}"
66 << ", on_finish=" << on_finish
<< ", top=" << top
<< dendl
;
67 assert(mdcache
->mds
->mds_lock
.is_locked_by_me());
68 in
->scrub_initialize(parent
, header
, on_finish
);
72 push_inode_bottom(in
);
75 void ScrubStack::enqueue_inode(CInode
*in
, ScrubHeaderRef
& header
,
76 MDSInternalContextBase
*on_finish
, bool top
)
78 _enqueue_inode(in
, NULL
, header
, on_finish
, top
);
// Drive the scrub queue: walk inode_stack from the front while fewer than
// g_conf->mds_max_scrub_ops_in_progress operations are in flight, the
// previous step set can_continue, and the iterator is not at the end.
//  - Non-directory inodes are popped from the stack; if the inode has no
//    on_finish yet, scrub_kick is installed as its finisher and
//    scrubs_in_progress is incremented; then scrub_file_inode() is called.
//  - Directory inodes go through scrub_dir_inode(); when it reports progress
//    the iterator is reset to the front of the stack.  can_continue becomes
//    progress || terminal || completed.
// NOTE(review): the statement defining `curi` and the line opening the
// "dir completed" branch are not visible in this view of the file --
// confirm against the complete source before editing the loop body.
82 void ScrubStack::kick_off_scrubs()
84 dout(20) << __func__
<< " entering with " << scrubs_in_progress
<< " in "
85 "progress and " << stack_size
<< " in the stack" << dendl
;
86 bool can_continue
= true;
87 elist
<CInode
*>::iterator i
= inode_stack
.begin();
88 while (g_conf
->mds_max_scrub_ops_in_progress
> scrubs_in_progress
&&
89 can_continue
&& !i
.end()) {
91 ++i
; // we have our reference, push iterator forward
93 dout(20) << __func__
<< " examining " << *curi
<< dendl
;
95 if (!curi
->is_dir()) {
96 // it's a regular file, symlink, or hard link
97 pop_inode(curi
); // we only touch it this once, so remove from stack
99 if (!curi
->scrub_info()->on_finish
) {
100 scrubs_in_progress
++;
101 curi
->scrub_set_finisher(&scrub_kick
);
103 scrub_file_inode(curi
);
106 bool completed
; // it's done, so pop it off the stack
107 bool terminal
; // not done, but we can start ops on other directories
108 bool progress
; // it added new dentries to the top of the stack
109 scrub_dir_inode(curi
, &progress
, &terminal
, &completed
);
111 dout(20) << __func__
<< " dir completed" << dendl
;
113 } else if (progress
) {
114 dout(20) << __func__
<< " dir progressed" << dendl
;
115 // we added new stuff to top of stack, so reset ourselves there
116 i
= inode_stack
.begin();
118 dout(20) << __func__
<< " dir no-op" << dendl
;
121 can_continue
= progress
|| terminal
|| completed
;
// Advance the scrub of directory inode `in`.
// Outputs written by the visible code:
//   *added_children - starts false, then OR-ed with each dirfrag's
//                     "added children" flag
//   *terminal       - set from all_frags_terminal
//   *done           - set from all_frags_done
// When the scrub header is recursive, dirfrags that `in` reports as already
// scrubbing are turned into CDir pointers and visited, and further frags are
// requested through get_next_cdir(), while fewer than
// g_conf->mds_max_scrub_ops_in_progress operations are in flight.  Each
// selected dirfrag is passed to scrub_dirfrag().
// When all_frags_done is set, scrub_dir_inode_final() is invoked (after
// asserting that no children were added).
// NOTE(review): the declarations of `terminal` and `done`, and several
// branch / loop-control lines, are not visible in this view of the file --
// confirm against the complete source before changing the control flow.
126 void ScrubStack::scrub_dir_inode(CInode
*in
,
127 bool *added_children
,
131 dout(10) << __func__
<< *in
<< dendl
;
133 *added_children
= false;
134 bool all_frags_terminal
= true;
135 bool all_frags_done
= true;
137 ScrubHeaderRef header
= in
->get_scrub_header();
138 assert(header
!= nullptr);
140 if (header
->get_recursive()) {
141 list
<frag_t
> scrubbing_frags
;
142 list
<CDir
*> scrubbing_cdirs
;
143 in
->scrub_dirfrags_scrubbing(&scrubbing_frags
);
144 dout(20) << __func__
<< " iterating over " << scrubbing_frags
.size()
145 << " scrubbing frags" << dendl
;
146 for (list
<frag_t
>::iterator i
= scrubbing_frags
.begin();
147 i
!= scrubbing_frags
.end();
149 // turn frags into CDir *
150 CDir
*dir
= in
->get_dirfrag(*i
);
152 scrubbing_cdirs
.push_back(dir
);
153 dout(25) << __func__
<< " got CDir " << *dir
<< " presently scrubbing" << dendl
;
155 in
->scrub_dirfrag_finished(*i
);
156 dout(25) << __func__
<< " missing dirfrag " << *i
<< " skip scrubbing" << dendl
;
160 dout(20) << __func__
<< " consuming from " << scrubbing_cdirs
.size()
161 << " scrubbing cdirs" << dendl
;
163 list
<CDir
*>::iterator i
= scrubbing_cdirs
.begin();
164 while (g_conf
->mds_max_scrub_ops_in_progress
> scrubs_in_progress
) {
166 CDir
*cur_dir
= NULL
;
167 if (i
!= scrubbing_cdirs
.end()) {
170 dout(20) << __func__
<< " got cur_dir = " << *cur_dir
<< dendl
;
172 bool ready
= get_next_cdir(in
, &cur_dir
);
173 dout(20) << __func__
<< " get_next_cdir ready=" << ready
<< dendl
;
175 if (ready
&& cur_dir
) {
176 scrubbing_cdirs
.push_back(cur_dir
);
178 // We are waiting for load of a frag
179 all_frags_done
= false;
180 all_frags_terminal
= false;
183 // Finished with all frags
188 bool frag_added_children
= false;
189 bool frag_terminal
= true;
190 bool frag_done
= false;
191 scrub_dirfrag(cur_dir
, header
,
192 &frag_added_children
, &frag_terminal
, &frag_done
);
194 cur_dir
->inode
->scrub_dirfrag_finished(cur_dir
->frag
);
196 *added_children
|= frag_added_children
;
197 all_frags_terminal
= all_frags_terminal
&& frag_terminal
;
198 all_frags_done
= all_frags_done
&& frag_done
;
201 dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal
202 << ", all_frags_done=" << all_frags_done
<< dendl
;
204 dout(20) << "!scrub_recursive" << dendl
;
207 if (all_frags_done
) {
208 assert (!*added_children
); // can't do this if children are still pending
210 // OK, so now I can... fire off a validate on the dir inode, and
211 // when it completes, come through here again, noticing that we've
212 // set a flag to indicate the validate happened, and
213 scrub_dir_inode_final(in
);
216 *terminal
= all_frags_terminal
;
217 *done
= all_frags_done
;
218 dout(10) << __func__
<< " is exiting " << *terminal
<< " " << *done
<< dendl
;
// Ask `in` for its next dirfrag to scrub (CInode::scrub_dirfrag_next) and
// look it up with get_or_open_dirfrag().  If that dirfrag is not complete,
// scrubs_in_progress is incremented and a fetch is started with scrub_kick
// as its completion.
//   in      - directory inode being scrubbed
//   new_dir - out-parameter for the selected dirfrag
// NOTE(review): the return statements, the declaration of `next_frag` and
// the checks on `r` are not visible in this view of the file.  The caller
// (scrub_dir_inode) treats "ready && cur_dir" as a frag to scrub; presumably
// a false return means a fetch is pending -- confirm the exact contract
// against the complete source.
222 bool ScrubStack::get_next_cdir(CInode
*in
, CDir
**new_dir
)
224 dout(20) << __func__
<< " on " << *in
<< dendl
;
226 int r
= in
->scrub_dirfrag_next(&next_frag
);
230 // we got a frag to scrub, otherwise it would be ENOENT
231 dout(25) << "looking up new frag " << next_frag
<< dendl
;
232 CDir
*next_dir
= in
->get_or_open_dirfrag(mdcache
, next_frag
);
233 if (!next_dir
->is_complete()) {
234 scrubs_in_progress
++;
235 next_dir
->fetch(&scrub_kick
);
236 dout(25) << "fetching frag from RADOS" << dendl
;
240 dout(25) << "returning dir " << *new_dir
<< dendl
;
244 // there are no dirfrags left
249 class C_InodeValidated
: public MDSInternalContext
253 CInode::validated_data result
;
256 C_InodeValidated(MDSRank
*mds
, ScrubStack
*stack_
, CInode
*target_
)
257 : MDSInternalContext(mds
), stack(stack_
), target(target_
)
260 void finish(int r
) override
262 stack
->_validate_inode_done(target
, r
, result
);
267 void ScrubStack::scrub_dir_inode_final(CInode
*in
)
269 dout(20) << __func__
<< *in
<< dendl
;
271 // Two passes through this function. First one triggers inode validation,
272 // second one sets finally_done
273 // FIXME: kind of overloading scrub_in_progress here, using it while
274 // dentry is still on stack to indicate that we have finished
275 // doing our validate_disk_state on the inode
276 // FIXME: the magic-constructing scrub_info() is going to leave
277 // an unneeded scrub_infop lying around here
278 if (!in
->scrub_info()->children_scrubbed
) {
279 if (!in
->scrub_info()->on_finish
) {
280 scrubs_in_progress
++;
281 in
->scrub_set_finisher(&scrub_kick
);
284 in
->scrub_children_finished();
285 C_InodeValidated
*fin
= new C_InodeValidated(mdcache
->mds
, this, in
);
286 in
->validate_disk_state(&fin
->result
, fin
);
// Scrub one dirfrag.
//   dir            - dirfrag to work on
//   header         - scrub header, forwarded to CDir::scrub_initialize()
//                    and to _enqueue_inode() for child dentries
//   added_children - out: reset to false, set to true after a child
//                    dentry's inode has been enqueued
//   is_terminal    - out: reset to false on entry
// If the dirfrag is not yet marked directory_scrubbing it must be complete
// before scrub_initialize(); an incomplete frag is fetched with scrub_kick
// as the completion and scrubs_in_progress is incremented.
// Dentries are pulled with CDir::scrub_dentry_next() and their projected
// inodes enqueued with top=true.  When scrub_dentries_scrubbing() reports
// nothing outstanding, CDir::scrub_finished() is called.
// NOTE(review): the remaining parameter(s), the declarations of `r` and
// `dn`, and the branches on the scrub_dentry_next() result are not visible
// in this view of the file -- confirm against the complete source.
292 void ScrubStack::scrub_dirfrag(CDir
*dir
,
293 ScrubHeaderRef
& header
,
294 bool *added_children
, bool *is_terminal
,
299 dout(20) << __func__
<< " on " << *dir
<< dendl
;
300 *added_children
= false;
301 *is_terminal
= false;
305 if (!dir
->scrub_info()->directory_scrubbing
) {
306 // Get the frag complete before calling
307 // scrub initialize, so that it can populate its lists
309 if (!dir
->is_complete()) {
310 scrubs_in_progress
++;
311 dir
->fetch(&scrub_kick
);
315 dir
->scrub_initialize(header
);
321 scrubs_in_progress
++;
322 r
= dir
->scrub_dentry_next(&scrub_kick
, &dn
);
324 scrubs_in_progress
--;
328 // Drop out, CDir fetcher will call back our kicker context
329 dout(20) << __func__
<< " waiting for fetch on " << *dir
<< dendl
;
334 // Nothing left to scrub, are we done?
335 std::list
<CDentry
*> scrubbing
;
336 dir
->scrub_dentries_scrubbing(&scrubbing
);
337 if (scrubbing
.empty()) {
338 dout(20) << __func__
<< " dirfrag done: " << *dir
<< dendl
;
339 // FIXME: greg: What's the diff meant to be between done and terminal
340 dir
->scrub_finished();
344 dout(20) << __func__
<< " " << scrubbing
.size() << " dentries still "
345 "scrubbing in " << *dir
<< dendl
;
350 // scrub_dentry_next defined to only give EAGAIN, ENOENT, 0 -- we should
351 // never get random IO errors here.
354 _enqueue_inode(dn
->get_projected_inode(), dn
, header
, NULL
, true);
356 *added_children
= true;
360 void ScrubStack::scrub_file_inode(CInode
*in
)
362 C_InodeValidated
*fin
= new C_InodeValidated(mdcache
->mds
, this, in
);
363 // At this stage the DN is already past scrub_initialize, so
364 // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned
365 in
->validate_disk_state(&fin
->result
, fin
);
// Completion handler for inode validation; invoked from
// C_InodeValidated::finish().
//   in     - the inode that was validated
//   r      - completion code passed to the context
//   result - validation details collected by validate_disk_state()
// Visible behaviour:
//  - when validation did not pass, the inode's path string is built for
//    use in messages;
//  - an unrepaired backtrace failure is recorded in the MDS damage table as
//    remote damage; otherwise a failed inode check is recorded as dentry
//    damage against the projected parent dentry;
//  - the cluster log gets an info ("repaired") or warn ("error") entry, and
//    the MDS log gets a derr line, when validation did not pass;
//  - CInode::scrub_finished() is called, and for a non-recursive scrub of
//    the header's origin inode the result (or the return code when r < 0)
//    is dumped into the header's formatter;
//  - the context returned by scrub_finished() is queued on `finisher`.
// NOTE(review): the declaration of `path`, the code that fills `out`, and
// several branch lines are not visible in this view of the file -- confirm
// against the complete source before editing.
368 void ScrubStack::_validate_inode_done(CInode
*in
, int r
,
369 const CInode::validated_data
&result
)
371 LogChannelRef clog
= mdcache
->mds
->clog
;
372 const ScrubHeaderRefConst header
= in
->scrub_info()->header
;
375 if (!result
.passed_validation
) {
376 // Build path string for use in messages
377 in
->make_path_string(path
, true);
380 if (result
.backtrace
.checked
&& !result
.backtrace
.passed
381 && !result
.backtrace
.repaired
)
383 // Record backtrace fails as remote linkage damage, as
384 // we may not be able to resolve hard links to this inode
385 mdcache
->mds
->damage_table
.notify_remote_damaged(in
->inode
.ino
, path
);
386 } else if (result
.inode
.checked
&& !result
.inode
.passed
) {
387 // Record damaged inode structures as damaged dentries as
388 // that is where they are stored
389 auto parent
= in
->get_projected_parent_dn();
391 auto dir
= parent
->get_dir();
392 mdcache
->mds
->damage_table
.notify_dentry(
393 dir
->inode
->ino(), dir
->frag
, parent
->last
, parent
->get_name(), path
);
397 // Inform the cluster log if we found an error
398 if (!result
.passed_validation
) {
399 if (result
.all_damage_repaired()) {
400 clog
->info() << "Scrub repaired inode " << in
->ino()
401 << " (" << path
<< ")";
403 clog
->warn() << "Scrub error on inode " << in
->ino()
404 << " (" << path
<< ") see " << g_conf
->name
405 << " log and `damage ls` output for details";
408 // Put the verbose JSON output into the MDS log for later inspection
411 std::ostringstream out
;
413 derr
<< __func__
<< " scrub error on inode " << *in
<< ": " << out
.str()
416 dout(10) << __func__
<< " scrub passed on inode " << *in
<< dendl
;
419 MDSInternalContextBase
*c
= NULL
;
420 in
->scrub_finished(&c
);
422 if (!header
->get_recursive() && in
== header
->get_origin()) {
423 if (r
>= 0) { // we got into the scrubbing dump it
424 result
.dump(&(header
->get_formatter()));
425 } else { // we failed the lookup or something; dump ourselves
426 header
->get_formatter().open_object_section("results");
427 header
->get_formatter().dump_int("return_code", r
);
428 header
->get_formatter().close_section(); // results
432 finisher
->queue(new MDSIOContextWrapper(mdcache
->mds
, c
), 0);
436 ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache
*mdcache
, ScrubStack
*s
)
437 : MDSInternalContext(mdcache
->mds
), stack(s
) { }