// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2014 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
17 #include "ScrubStack.h"
18 #include "common/Finisher.h"
19 #include "mds/MDSRank.h"
20 #include "mds/MDCache.h"
21 #include "mds/MDSContinuation.h"
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_mds
26 #define dout_prefix _prefix(_dout, scrubstack->mdcache->mds)
27 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
28 return *_dout
<< "mds." << mds
->get_nodeid() << ".scrubstack ";
31 std::ostream
&operator<<(std::ostream
&os
, const ScrubStack::State
&state
) {
33 case ScrubStack::STATE_RUNNING
:
36 case ScrubStack::STATE_IDLE
:
39 case ScrubStack::STATE_PAUSING
:
42 case ScrubStack::STATE_PAUSED
:
52 void ScrubStack::push_inode(CInode
*in
)
54 dout(20) << "pushing " << *in
<< " on top of ScrubStack" << dendl
;
55 if (!in
->item_scrub
.is_on_list()) {
56 in
->get(CInode::PIN_SCRUBQUEUE
);
59 inode_stack
.push_front(&in
->item_scrub
);
62 void ScrubStack::push_inode_bottom(CInode
*in
)
64 dout(20) << "pushing " << *in
<< " on bottom of ScrubStack" << dendl
;
65 if (!in
->item_scrub
.is_on_list()) {
66 in
->get(CInode::PIN_SCRUBQUEUE
);
69 inode_stack
.push_back(&in
->item_scrub
);
72 void ScrubStack::pop_inode(CInode
*in
)
74 dout(20) << "popping " << *in
75 << " off of ScrubStack" << dendl
;
76 ceph_assert(in
->item_scrub
.is_on_list());
77 in
->put(CInode::PIN_SCRUBQUEUE
);
78 in
->item_scrub
.remove_myself();
82 void ScrubStack::_enqueue_inode(CInode
*in
, CDentry
*parent
,
83 ScrubHeaderRef
& header
,
84 MDSContext
*on_finish
, bool top
)
86 dout(10) << __func__
<< " with {" << *in
<< "}"
87 << ", on_finish=" << on_finish
<< ", top=" << top
<< dendl
;
88 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
89 in
->scrub_initialize(parent
, header
, on_finish
);
93 push_inode_bottom(in
);
96 void ScrubStack::enqueue_inode(CInode
*in
, ScrubHeaderRef
& header
,
97 MDSContext
*on_finish
, bool top
)
100 if (clear_inode_stack
) {
101 on_finish
->complete(-EAGAIN
);
105 _enqueue_inode(in
, NULL
, header
, on_finish
, top
);
109 void ScrubStack::kick_off_scrubs()
111 ceph_assert(ceph_mutex_is_locked(mdcache
->mds
->mds_lock
));
112 dout(20) << __func__
<< ": state=" << state
<< dendl
;
114 if (clear_inode_stack
|| state
== STATE_PAUSING
|| state
== STATE_PAUSED
) {
115 if (scrubs_in_progress
== 0) {
116 dout(10) << __func__
<< ": in progress scrub operations finished, "
117 << stack_size
<< " in the stack" << dendl
;
119 State final_state
= state
;
120 if (clear_inode_stack
) {
121 abort_pending_scrubs();
122 final_state
= STATE_IDLE
;
124 if (state
== STATE_PAUSING
) {
125 final_state
= STATE_PAUSED
;
128 set_state(final_state
);
129 complete_control_contexts(0);
135 dout(20) << __func__
<< " entering with " << scrubs_in_progress
<< " in "
136 "progress and " << stack_size
<< " in the stack" << dendl
;
137 bool can_continue
= true;
138 elist
<CInode
*>::iterator i
= inode_stack
.begin();
139 while (g_conf()->mds_max_scrub_ops_in_progress
> scrubs_in_progress
&&
142 if (scrubs_in_progress
== 0) {
143 set_state(STATE_IDLE
);
149 assert(state
== STATE_RUNNING
|| state
== STATE_IDLE
);
150 set_state(STATE_RUNNING
);
153 ++i
; // we have our reference, push iterator forward
155 dout(20) << __func__
<< " examining " << *curi
<< dendl
;
157 if (!curi
->is_dir()) {
158 // it's a regular file, symlink, or hard link
159 pop_inode(curi
); // we only touch it this once, so remove from stack
161 if (!curi
->scrub_info()->on_finish
) {
162 scrubs_in_progress
++;
163 curi
->scrub_set_finisher(&scrub_kick
);
165 scrub_file_inode(curi
);
168 bool completed
; // it's done, so pop it off the stack
169 bool terminal
; // not done, but we can start ops on other directories
170 bool progress
; // it added new dentries to the top of the stack
171 scrub_dir_inode(curi
, &progress
, &terminal
, &completed
);
173 dout(20) << __func__
<< " dir completed" << dendl
;
175 } else if (progress
) {
176 dout(20) << __func__
<< " dir progressed" << dendl
;
177 // we added new stuff to top of stack, so reset ourselves there
178 i
= inode_stack
.begin();
180 dout(20) << __func__
<< " dir no-op" << dendl
;
183 can_continue
= progress
|| terminal
|| completed
;
188 void ScrubStack::scrub_dir_inode(CInode
*in
,
189 bool *added_children
,
193 dout(10) << __func__
<< " " << *in
<< dendl
;
195 *added_children
= false;
196 bool all_frags_terminal
= true;
197 bool all_frags_done
= true;
199 ScrubHeaderRef header
= in
->get_scrub_header();
200 ceph_assert(header
!= nullptr);
202 if (header
->get_recursive()) {
203 frag_vec_t scrubbing_frags
;
204 std::queue
<CDir
*> scrubbing_cdirs
;
205 in
->scrub_dirfrags_scrubbing(&scrubbing_frags
);
206 dout(20) << __func__
<< " iterating over " << scrubbing_frags
.size()
207 << " scrubbing frags" << dendl
;
208 for (const auto& fg
: scrubbing_frags
) {
209 // turn frags into CDir *
210 CDir
*dir
= in
->get_dirfrag(fg
);
212 scrubbing_cdirs
.push(dir
);
213 dout(25) << __func__
<< " got CDir " << *dir
<< " presently scrubbing" << dendl
;
215 in
->scrub_dirfrag_finished(fg
);
216 dout(25) << __func__
<< " missing dirfrag " << fg
<< " skip scrubbing" << dendl
;
220 dout(20) << __func__
<< " consuming from " << scrubbing_cdirs
.size()
221 << " scrubbing cdirs" << dendl
;
223 while (g_conf()->mds_max_scrub_ops_in_progress
> scrubs_in_progress
) {
225 CDir
*cur_dir
= NULL
;
226 if (!scrubbing_cdirs
.empty()) {
227 cur_dir
= scrubbing_cdirs
.front();
228 scrubbing_cdirs
.pop();
229 dout(20) << __func__
<< " got cur_dir = " << *cur_dir
<< dendl
;
231 bool ready
= get_next_cdir(in
, &cur_dir
);
232 dout(20) << __func__
<< " get_next_cdir ready=" << ready
<< dendl
;
234 if (ready
&& cur_dir
) {
235 scrubbing_cdirs
.push(cur_dir
);
237 // We are waiting for load of a frag
238 all_frags_done
= false;
239 all_frags_terminal
= false;
242 // Finished with all frags
247 bool frag_added_children
= false;
248 bool frag_terminal
= true;
249 bool frag_done
= false;
250 scrub_dirfrag(cur_dir
, header
,
251 &frag_added_children
, &frag_terminal
, &frag_done
);
253 cur_dir
->inode
->scrub_dirfrag_finished(cur_dir
->frag
);
255 *added_children
|= frag_added_children
;
256 all_frags_terminal
= all_frags_terminal
&& frag_terminal
;
257 all_frags_done
= all_frags_done
&& frag_done
;
260 dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal
261 << ", all_frags_done=" << all_frags_done
<< dendl
;
263 dout(20) << "!scrub_recursive" << dendl
;
266 if (all_frags_done
) {
267 assert (!*added_children
); // can't do this if children are still pending
269 // OK, so now I can... fire off a validate on the dir inode, and
270 // when it completes, come through here again, noticing that we've
271 // set a flag to indicate the validate happened, and
272 scrub_dir_inode_final(in
);
275 *terminal
= all_frags_terminal
;
276 *done
= all_frags_done
;
277 dout(10) << __func__
<< " is exiting " << *terminal
<< " " << *done
<< dendl
;
281 bool ScrubStack::get_next_cdir(CInode
*in
, CDir
**new_dir
)
283 dout(20) << __func__
<< " on " << *in
<< dendl
;
285 int r
= in
->scrub_dirfrag_next(&next_frag
);
289 // we got a frag to scrub, otherwise it would be ENOENT
290 dout(25) << "looking up new frag " << next_frag
<< dendl
;
291 CDir
*next_dir
= in
->get_or_open_dirfrag(mdcache
, next_frag
);
292 if (!next_dir
->is_complete()) {
293 scrubs_in_progress
++;
294 next_dir
->fetch(&scrub_kick
);
295 dout(25) << "fetching frag from RADOS" << dendl
;
299 dout(25) << "returning dir " << *new_dir
<< dendl
;
302 ceph_assert(r
== ENOENT
);
303 // there are no dirfrags left
308 class C_InodeValidated
: public MDSInternalContext
312 CInode::validated_data result
;
315 C_InodeValidated(MDSRank
*mds
, ScrubStack
*stack_
, CInode
*target_
)
316 : MDSInternalContext(mds
), stack(stack_
), target(target_
)
319 void finish(int r
) override
321 stack
->_validate_inode_done(target
, r
, result
);
326 void ScrubStack::scrub_dir_inode_final(CInode
*in
)
328 dout(20) << __func__
<< " " << *in
<< dendl
;
330 // Two passes through this function. First one triggers inode validation,
331 // second one sets finally_done
332 // FIXME: kind of overloading scrub_in_progress here, using it while
333 // dentry is still on stack to indicate that we have finished
334 // doing our validate_disk_state on the inode
335 // FIXME: the magic-constructing scrub_info() is going to leave
336 // an unneeded scrub_infop lying around here
337 if (!in
->scrub_info()->children_scrubbed
) {
338 if (!in
->scrub_info()->on_finish
) {
339 scrubs_in_progress
++;
340 in
->scrub_set_finisher(&scrub_kick
);
343 in
->scrub_children_finished();
344 C_InodeValidated
*fin
= new C_InodeValidated(mdcache
->mds
, this, in
);
345 in
->validate_disk_state(&fin
->result
, fin
);
351 void ScrubStack::scrub_dirfrag(CDir
*dir
,
352 ScrubHeaderRef
& header
,
353 bool *added_children
, bool *is_terminal
,
356 ceph_assert(dir
!= NULL
);
358 dout(20) << __func__
<< " on " << *dir
<< dendl
;
359 *added_children
= false;
360 *is_terminal
= false;
364 if (!dir
->scrub_info()->directory_scrubbing
) {
365 // Get the frag complete before calling
366 // scrub initialize, so that it can populate its lists
368 if (!dir
->is_complete()) {
369 scrubs_in_progress
++;
370 dir
->fetch(&scrub_kick
);
374 dir
->scrub_initialize(header
);
380 scrubs_in_progress
++;
381 r
= dir
->scrub_dentry_next(&scrub_kick
, &dn
);
383 scrubs_in_progress
--;
387 // Drop out, CDir fetcher will call back our kicker context
388 dout(20) << __func__
<< " waiting for fetch on " << *dir
<< dendl
;
393 // Nothing left to scrub, are we done?
394 auto&& scrubbing
= dir
->scrub_dentries_scrubbing();
395 if (scrubbing
.empty()) {
396 dout(20) << __func__
<< " dirfrag done: " << *dir
<< dendl
;
397 // FIXME: greg: What's the diff meant to be between done and terminal
398 dir
->scrub_finished();
402 dout(20) << __func__
<< " " << scrubbing
.size() << " dentries still "
403 "scrubbing in " << *dir
<< dendl
;
408 // scrub_dentry_next defined to only give EAGAIN, ENOENT, 0 -- we should
409 // never get random IO errors here.
412 _enqueue_inode(dn
->get_projected_inode(), dn
, header
, NULL
, true);
414 *added_children
= true;
418 void ScrubStack::scrub_file_inode(CInode
*in
)
420 C_InodeValidated
*fin
= new C_InodeValidated(mdcache
->mds
, this, in
);
421 // At this stage the DN is already past scrub_initialize, so
422 // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned
423 in
->validate_disk_state(&fin
->result
, fin
);
426 void ScrubStack::_validate_inode_done(CInode
*in
, int r
,
427 const CInode::validated_data
&result
)
429 LogChannelRef clog
= mdcache
->mds
->clog
;
430 const ScrubHeaderRefConst header
= in
->scrub_info()->header
;
433 if (!result
.passed_validation
) {
434 // Build path string for use in messages
435 in
->make_path_string(path
, true);
438 if (result
.backtrace
.checked
&& !result
.backtrace
.passed
&&
439 !result
.backtrace
.repaired
)
441 // Record backtrace fails as remote linkage damage, as
442 // we may not be able to resolve hard links to this inode
443 mdcache
->mds
->damage_table
.notify_remote_damaged(in
->inode
.ino
, path
);
444 } else if (result
.inode
.checked
&& !result
.inode
.passed
&&
445 !result
.inode
.repaired
) {
446 // Record damaged inode structures as damaged dentries as
447 // that is where they are stored
448 auto parent
= in
->get_projected_parent_dn();
450 auto dir
= parent
->get_dir();
451 mdcache
->mds
->damage_table
.notify_dentry(
452 dir
->inode
->ino(), dir
->frag
, parent
->last
, parent
->get_name(), path
);
456 // Inform the cluster log if we found an error
457 if (!result
.passed_validation
) {
458 if (result
.all_damage_repaired()) {
459 clog
->info() << "Scrub repaired inode " << in
->ino()
460 << " (" << path
<< ")";
462 clog
->warn() << "Scrub error on inode " << in
->ino()
463 << " (" << path
<< ") see " << g_conf()->name
464 << " log and `damage ls` output for details";
467 // Put the verbose JSON output into the MDS log for later inspection
470 std::ostringstream out
;
472 derr
<< __func__
<< " scrub error on inode " << *in
<< ": " << out
.str()
475 dout(10) << __func__
<< " scrub passed on inode " << *in
<< dendl
;
478 MDSContext
*c
= NULL
;
479 in
->scrub_finished(&c
);
481 if (in
== header
->get_origin()) {
482 scrub_origins
.erase(in
);
483 clog_scrub_summary(in
);
484 if (!header
->get_recursive()) {
485 if (r
>= 0) { // we got into the scrubbing dump it
486 result
.dump(&(header
->get_formatter()));
487 } else { // we failed the lookup or something; dump ourselves
488 header
->get_formatter().open_object_section("results");
489 header
->get_formatter().dump_int("return_code", r
);
490 header
->get_formatter().close_section(); // results
495 finisher
->queue(new MDSIOContextWrapper(mdcache
->mds
, c
), 0);
499 ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache
*mdcache
, ScrubStack
*s
)
500 : MDSInternalContext(mdcache
->mds
), stack(s
) { }
502 void ScrubStack::complete_control_contexts(int r
) {
503 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
505 for (auto &ctx
: control_ctxs
) {
508 control_ctxs
.clear();
511 void ScrubStack::set_state(State next_state
) {
512 if (state
!= next_state
) {
513 dout(20) << __func__
<< ", from state=" << state
<< ", to state="
514 << next_state
<< dendl
;
516 clog_scrub_summary();
520 bool ScrubStack::scrub_in_transition_state() {
521 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
522 dout(20) << __func__
<< ": state=" << state
<< dendl
;
524 // STATE_RUNNING is considered as a transition state so as to
525 // "delay" the scrub control operation.
526 if (state
== STATE_RUNNING
|| state
== STATE_PAUSING
) {
533 std::string_view
ScrubStack::scrub_summary() {
534 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
536 bool have_more
= false;
537 CachedStackStringStream cs
;
539 if (state
== STATE_IDLE
) {
543 if (state
== STATE_RUNNING
) {
544 if (clear_inode_stack
) {
550 if (state
== STATE_PAUSING
) {
553 } else if (state
== STATE_PAUSED
) {
558 if (clear_inode_stack
) {
566 if (!scrub_origins
.empty()) {
568 for (auto inode
= scrub_origins
.begin(); inode
!= scrub_origins
.end(); ++inode
) {
569 if (inode
!= scrub_origins
.begin()) {
573 *cs
<< scrub_inode_path(*inode
);
581 void ScrubStack::scrub_status(Formatter
*f
) {
582 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
584 f
->open_object_section("result");
586 std::stringstream ss
;
587 bool have_more
= false;
589 if (state
== STATE_IDLE
) {
590 ss
<< "no active scrubs running";
591 } else if (state
== STATE_RUNNING
) {
592 if (clear_inode_stack
) {
595 ss
<< "scrub active";
597 ss
<< " (" << stack_size
<< " inodes in the stack)";
599 if (state
== STATE_PAUSING
|| state
== STATE_PAUSED
) {
603 if (clear_inode_stack
) {
610 ss
<< " (" << stack_size
<< " inodes in the stack)";
612 f
->dump_string("status", ss
.str());
614 f
->open_object_section("scrubs");
615 for (auto &inode
: scrub_origins
) {
617 ScrubHeaderRefConst header
= inode
->get_scrub_header();
619 std::string
tag(header
->get_tag());
620 f
->open_object_section(tag
.c_str()); // scrub id
622 f
->dump_string("path", scrub_inode_path(inode
));
624 std::stringstream optss
;
625 if (header
->get_recursive()) {
626 optss
<< "recursive";
629 if (header
->get_repair()) {
636 if (header
->get_force()) {
643 f
->dump_string("options", optss
.str());
644 f
->close_section(); // scrub id
646 f
->close_section(); // scrubs
647 f
->close_section(); // result
650 void ScrubStack::abort_pending_scrubs() {
651 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
652 ceph_assert(clear_inode_stack
);
654 for (auto inode
= inode_stack
.begin(); !inode
.end(); ++inode
) {
656 if (in
== in
->scrub_info()->header
->get_origin()) {
657 scrub_origins
.erase(in
);
658 clog_scrub_summary(in
);
661 MDSContext
*ctx
= nullptr;
662 in
->scrub_aborted(&ctx
);
663 if (ctx
!= nullptr) {
664 ctx
->complete(-ECANCELED
);
670 clear_inode_stack
= false;
673 void ScrubStack::scrub_abort(Context
*on_finish
) {
674 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
675 ceph_assert(on_finish
!= nullptr);
677 dout(10) << __func__
<< ": aborting with " << scrubs_in_progress
678 << " scrubs in progress and " << stack_size
<< " in the"
679 << " stack" << dendl
;
681 clear_inode_stack
= true;
682 if (scrub_in_transition_state()) {
683 control_ctxs
.push_back(on_finish
);
687 abort_pending_scrubs();
688 if (state
!= STATE_PAUSED
) {
689 set_state(STATE_IDLE
);
691 on_finish
->complete(0);
694 void ScrubStack::scrub_pause(Context
*on_finish
) {
695 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
696 ceph_assert(on_finish
!= nullptr);
698 dout(10) << __func__
<< ": pausing with " << scrubs_in_progress
699 << " scrubs in progress and " << stack_size
<< " in the"
700 << " stack" << dendl
;
702 // abort is in progress
703 if (clear_inode_stack
) {
704 on_finish
->complete(-EINVAL
);
708 bool done
= scrub_in_transition_state();
710 set_state(STATE_PAUSING
);
711 control_ctxs
.push_back(on_finish
);
715 set_state(STATE_PAUSED
);
716 on_finish
->complete(0);
719 bool ScrubStack::scrub_resume() {
720 ceph_assert(ceph_mutex_is_locked_by_me(mdcache
->mds
->mds_lock
));
721 dout(20) << __func__
<< ": state=" << state
<< dendl
;
725 if (clear_inode_stack
) {
727 } else if (state
== STATE_PAUSING
) {
728 set_state(STATE_RUNNING
);
729 complete_control_contexts(-ECANCELED
);
730 } else if (state
== STATE_PAUSED
) {
731 set_state(STATE_RUNNING
);
738 // send current scrub summary to cluster log
739 void ScrubStack::clog_scrub_summary(CInode
*in
) {
742 if (clear_inode_stack
) {
744 } else if (scrub_origins
.count(in
)) {
749 clog
->info() << "scrub " << what
<< " for path: " << scrub_inode_path(in
);
752 clog
->info() << "scrub summary: " << scrub_summary();