1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=2 sw=2 smarttab
4 #include "./pg_scrubber.h" // the '.' notation used to affect clang-format order
12 #include "common/errno.h"
13 #include "messages/MOSDOp.h"
14 #include "messages/MOSDRepScrub.h"
15 #include "messages/MOSDRepScrubMap.h"
16 #include "messages/MOSDScrub.h"
17 #include "messages/MOSDScrubReserve.h"
20 #include "osd/osd_types_fmt.h"
21 #include "ScrubStore.h"
22 #include "scrub_machine.h"
28 using std::stringstream
;
30 using namespace Scrub
;
31 using namespace std::chrono
;
32 using namespace std::chrono_literals
;
33 using namespace std::literals
;
35 #define dout_context (m_osds->cct)
36 #define dout_subsys ceph_subsys_osd
38 #define dout_prefix _prefix(_dout, this)
41 static ostream
& _prefix(std::ostream
* _dout
, T
* t
)
43 return t
->gen_prefix(*_dout
);
46 ostream
& operator<<(ostream
& out
, const scrub_flags_t
& sf
)
49 out
<< " AUTO_REPAIR";
51 out
<< " CHECK_REPAIR";
52 if (sf
.deep_scrub_on_error
)
53 out
<< " DEEP_SCRUB_ON_ERROR";
60 ostream
& operator<<(ostream
& out
, const requested_scrub_t
& sf
)
63 out
<< " MUST_REPAIR";
65 out
<< " planned AUTO_REPAIR";
67 out
<< " planned CHECK_REPAIR";
68 if (sf
.deep_scrub_on_error
)
69 out
<< " planned DEEP_SCRUB_ON_ERROR";
70 if (sf
.must_deep_scrub
)
71 out
<< " MUST_DEEP_SCRUB";
75 out
<< " TIME_FOR_DEEP";
79 out
<< " planned REQ_SCRUB";
85 * if the incoming message is from a previous interval, it must mean
86 * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard
89 bool PgScrubber::check_interval(epoch_t epoch_to_verify
)
91 return epoch_to_verify
>= m_pg
->get_same_interval_since();
94 bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify
)
97 // not scrubbing. We can assume that the scrub was already terminated, and we
98 // can silently discard the incoming event.
102 // is this a message from before we started this scrub?
103 if (epoch_to_verify
< m_epoch_start
) {
107 // has a new interval started?
108 if (!check_interval(epoch_to_verify
)) {
109 // if this is a new interval, on_change() has already terminated that
114 ceph_assert(is_primary());
116 // were we instructed to abort?
117 return verify_against_abort(epoch_to_verify
);
120 bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify
)
122 if (!should_abort()) {
126 dout(10) << __func__
<< " aborting. incoming epoch: " << epoch_to_verify
127 << " vs last-aborted: " << m_last_aborted
<< dendl
;
129 // if we were not aware of the abort before - kill the scrub.
130 if (epoch_to_verify
> m_last_aborted
) {
132 m_last_aborted
= std::max(epoch_to_verify
, m_epoch_start
);
137 bool PgScrubber::should_abort() const
139 if (m_flags
.required
) {
140 return false; // not stopping 'required' scrubs for configuration changes
144 if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB
) ||
145 m_pg
->pool
.info
.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB
)) {
146 dout(10) << "nodeep_scrub set, aborting" << dendl
;
151 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB
) ||
152 m_pg
->pool
.info
.has_flag(pg_pool_t::FLAG_NOSCRUB
)) {
153 dout(10) << "noscrub set, aborting" << dendl
;
// initiating state-machine events --------------------------------

/*
 * a note re the checks performed before sending scrub-initiating messages:
 *
 * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that
 * possibly were in the queue while the PG changed state and became unavailable for
 * scrubbing:
 *
 * The check_interval() catches all major changes to the PG. As for the other conditions
 * we may check (and see is_message_relevant() above):
 *
 * - we are not 'active' yet, so must not check against is_active(), and:
 *
 * - the 'abort' flags were just verified (when the triggering message was queued). As
 *   those are only modified in human speeds - they need not be queried again.
 *
 * Some of the considerations above are also relevant to the replica-side initiation
 * ('StartReplica' & 'StartReplicaNoWait').
 */
181 void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued
)
183 dout(15) << __func__
<< " epoch: " << epoch_queued
<< dendl
;
184 // we may have lost our Primary status while the message languished in the queue
185 if (check_interval(epoch_queued
)) {
186 dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued
<< dendl
;
187 reset_epoch(epoch_queued
);
188 m_fsm
->process_event(StartScrub
{});
189 dout(10) << "scrubber event --<< StartScrub" << dendl
;
191 clear_queued_or_active(); // also restarts snap trimming
195 void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued
)
197 dout(15) << __func__
<< " epoch: " << epoch_queued
<< dendl
;
198 // we may have lost our Primary status while the message languished in the queue
199 if (check_interval(epoch_queued
)) {
200 dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued
<< dendl
;
201 reset_epoch(epoch_queued
);
202 m_fsm
->process_event(AfterRepairScrub
{});
203 dout(10) << "scrubber event --<< AfterRepairScrub" << dendl
;
205 clear_queued_or_active(); // also restarts snap trimming
209 void PgScrubber::send_scrub_unblock(epoch_t epoch_queued
)
211 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
212 if (is_message_relevant(epoch_queued
)) {
213 m_fsm
->process_event(Unblocked
{});
215 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
218 void PgScrubber::send_scrub_resched(epoch_t epoch_queued
)
220 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
221 if (is_message_relevant(epoch_queued
)) {
222 m_fsm
->process_event(InternalSchedScrub
{});
224 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
227 void PgScrubber::send_start_replica(epoch_t epoch_queued
, Scrub::act_token_t token
)
229 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
230 << " token: " << token
<< dendl
;
232 // shouldn't happen. Ignore
233 dout(1) << "got a replica scrub request while Primary!" << dendl
;
237 if (check_interval(epoch_queued
) && is_token_current(token
)) {
238 // save us some time by not waiting for updates if there are none
239 // to wait for. Affects the transition from NotActive into either
240 // ReplicaWaitUpdates or ActiveReplica.
241 if (pending_active_pushes())
242 m_fsm
->process_event(StartReplica
{});
244 m_fsm
->process_event(StartReplicaNoWait
{});
246 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
249 void PgScrubber::send_sched_replica(epoch_t epoch_queued
, Scrub::act_token_t token
)
251 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
252 << " token: " << token
<< dendl
;
253 if (check_interval(epoch_queued
) && is_token_current(token
)) {
254 m_fsm
->process_event(SchedReplica
{}); // retest for map availability
256 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
259 void PgScrubber::active_pushes_notification(epoch_t epoch_queued
)
261 // note: Primary only
262 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
263 if (is_message_relevant(epoch_queued
)) {
264 m_fsm
->process_event(ActivePushesUpd
{});
266 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
269 void PgScrubber::update_applied_notification(epoch_t epoch_queued
)
271 // note: Primary only
272 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
273 if (is_message_relevant(epoch_queued
)) {
274 m_fsm
->process_event(UpdatesApplied
{});
276 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
279 void PgScrubber::digest_update_notification(epoch_t epoch_queued
)
281 // note: Primary only
282 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
283 if (is_message_relevant(epoch_queued
)) {
284 m_fsm
->process_event(DigestUpdate
{});
286 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
289 void PgScrubber::send_local_map_done(epoch_t epoch_queued
)
291 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
292 if (is_message_relevant(epoch_queued
)) {
293 m_fsm
->process_event(Scrub::IntLocalMapDone
{});
295 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
298 void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued
)
300 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
301 if (is_message_relevant(epoch_queued
)) {
302 m_fsm
->process_event(GotReplicas
{});
304 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
307 void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued
)
309 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
310 if (check_interval(epoch_queued
)) {
311 m_fsm
->process_event(ReplicaPushesUpd
{});
313 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
316 void PgScrubber::send_remotes_reserved(epoch_t epoch_queued
)
318 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
319 // note: scrub is not active yet
320 if (check_interval(epoch_queued
)) {
321 m_fsm
->process_event(RemotesReserved
{});
323 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
326 void PgScrubber::send_reservation_failure(epoch_t epoch_queued
)
328 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
329 if (check_interval(epoch_queued
)) { // do not check for 'active'!
330 m_fsm
->process_event(ReservationFailure
{});
332 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
335 void PgScrubber::send_full_reset(epoch_t epoch_queued
)
337 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
339 m_fsm
->process_event(Scrub::FullReset
{});
341 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
344 void PgScrubber::send_chunk_free(epoch_t epoch_queued
)
346 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
347 if (check_interval(epoch_queued
)) {
348 m_fsm
->process_event(Scrub::SelectedChunkFree
{});
350 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
353 void PgScrubber::send_chunk_busy(epoch_t epoch_queued
)
355 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
356 if (check_interval(epoch_queued
)) {
357 m_fsm
->process_event(Scrub::ChunkIsBusy
{});
359 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
362 void PgScrubber::send_get_next_chunk(epoch_t epoch_queued
)
364 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
365 if (is_message_relevant(epoch_queued
)) {
366 m_fsm
->process_event(Scrub::NextChunk
{});
368 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
371 void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued
)
373 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
375 // can't check for "active"
377 m_fsm
->process_event(Scrub::ScrubFinished
{});
379 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
382 void PgScrubber::send_maps_compared(epoch_t epoch_queued
)
384 dout(10) << "scrubber event -->> " << __func__
<< " epoch: " << epoch_queued
<< dendl
;
386 m_fsm
->process_event(Scrub::MapsCompared
{});
388 dout(10) << "scrubber event --<< " << __func__
<< dendl
;
393 bool PgScrubber::is_reserving() const
395 return m_fsm
->is_reserving();
398 void PgScrubber::reset_epoch(epoch_t epoch_queued
)
400 dout(10) << __func__
<< " state deep? " << state_test(PG_STATE_DEEP_SCRUB
) << dendl
;
401 m_fsm
->assert_not_active();
403 m_epoch_start
= epoch_queued
;
404 m_needs_sleep
= true;
405 m_is_deep
= state_test(PG_STATE_DEEP_SCRUB
);
406 update_op_mode_text();
409 unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority
) const
411 unsigned int qu_priority
= m_flags
.priority
;
413 if (with_priority
== Scrub::scrub_prio_t::high_priority
) {
415 std::max(qu_priority
, (unsigned int)m_pg
->get_cct()->_conf
->osd_client_op_priority
);
420 unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority
,
421 unsigned int suggested_priority
) const
423 if (with_priority
== Scrub::scrub_prio_t::high_priority
) {
424 suggested_priority
= std::max(suggested_priority
,
425 (unsigned int)m_pg
->cct
->_conf
->osd_client_op_priority
);
427 return suggested_priority
;
430 // ///////////////////////////////////////////////////////////////////// //
431 // scrub-op registration handling
433 void PgScrubber::unregister_from_osd()
436 dout(15) << __func__
<< " prev. state: " << registration_state() << dendl
;
437 m_osds
->get_scrub_services().remove_from_osd_queue(m_scrub_job
);
441 bool PgScrubber::is_scrub_registered() const
443 return m_scrub_job
&& m_scrub_job
->in_queues
;
446 std::string_view
PgScrubber::registration_state() const
449 return m_scrub_job
->registration_state();
451 return "(no sched job)"sv
;
454 void PgScrubber::rm_from_osd_scrubbing()
456 // make sure the OSD won't try to scrub this one just now
457 unregister_from_osd();
460 void PgScrubber::on_primary_change(const requested_scrub_t
& request_flags
)
462 dout(10) << __func__
<< (is_primary() ? " Primary " : " Replica ")
463 << " flags: " << request_flags
<< dendl
;
469 dout(15) << __func__
<< " scrub-job state: " << m_scrub_job
->state_desc() << dendl
;
472 auto suggested
= determine_scrub_time(request_flags
);
473 m_osds
->get_scrub_services().register_with_osd(m_scrub_job
, suggested
);
475 m_osds
->get_scrub_services().remove_from_osd_queue(m_scrub_job
);
478 dout(15) << __func__
<< " done " << registration_state() << dendl
;
481 void PgScrubber::on_maybe_registration_change(const requested_scrub_t
& request_flags
)
483 dout(10) << __func__
<< (is_primary() ? " Primary " : " Replica/other ")
484 << registration_state() << " flags: " << request_flags
<< dendl
;
486 on_primary_change(request_flags
);
487 dout(15) << __func__
<< " done " << registration_state() << dendl
;
490 void PgScrubber::update_scrub_job(const requested_scrub_t
& request_flags
)
492 dout(10) << __func__
<< " flags: " << request_flags
<< dendl
;
495 // verify that the 'in_q' status matches our "Primariority"
496 if (m_scrub_job
&& is_primary() && !m_scrub_job
->in_queues
) {
497 dout(1) << __func__
<< " !!! primary but not scheduled! " << dendl
;
501 if (is_primary() && m_scrub_job
) {
502 auto suggested
= determine_scrub_time(request_flags
);
503 m_osds
->get_scrub_services().update_job(m_scrub_job
, suggested
);
506 dout(15) << __func__
<< " done " << registration_state() << dendl
;
509 ScrubQueue::sched_params_t
510 PgScrubber::determine_scrub_time(const requested_scrub_t
& request_flags
) const
512 ScrubQueue::sched_params_t res
;
515 return res
; // with ok_to_scrub set to 'false'
518 if (request_flags
.must_scrub
|| request_flags
.need_auto
) {
520 // Set the smallest time that isn't utime_t()
521 res
.proposed_time
= PgScrubber::scrub_must_stamp();
522 res
.is_must
= ScrubQueue::must_scrub_t::mandatory
;
523 // we do not need the interval data in this case
525 } else if (m_pg
->info
.stats
.stats_invalid
&&
526 m_pg
->cct
->_conf
->osd_scrub_invalid_stats
) {
527 res
.proposed_time
= ceph_clock_now();
528 res
.is_must
= ScrubQueue::must_scrub_t::mandatory
;
531 res
.proposed_time
= m_pg
->info
.history
.last_scrub_stamp
;
533 m_pg
->get_pool().info
.opts
.value_or(pool_opts_t::SCRUB_MIN_INTERVAL
, 0.0);
535 m_pg
->get_pool().info
.opts
.value_or(pool_opts_t::SCRUB_MAX_INTERVAL
, 0.0);
538 dout(15) << __func__
<< " suggested: " << res
.proposed_time
<< " hist: "
539 << m_pg
->info
.history
.last_scrub_stamp
<< " v:" << m_pg
->info
.stats
.stats_invalid
540 << " / " << m_pg
->cct
->_conf
->osd_scrub_invalid_stats
<< " must:"
541 << (res
.is_must
==ScrubQueue::must_scrub_t::mandatory
? "y" : "n" )
542 << " pool min: " << res
.min_interval
547 void PgScrubber::scrub_requested(scrub_level_t scrub_level
,
548 scrub_type_t scrub_type
,
549 requested_scrub_t
& req_flags
)
551 dout(10) << __func__
<< (scrub_level
== scrub_level_t::deep
? " deep " : " shallow ")
552 << (scrub_type
== scrub_type_t::do_repair
? " repair-scrub " : " not-repair ")
553 << " prev stamp: " << m_scrub_job
->get_sched_time()
554 << " registered? " << registration_state()
557 req_flags
.must_scrub
= true;
558 req_flags
.must_deep_scrub
=
559 (scrub_level
== scrub_level_t::deep
) || (scrub_type
== scrub_type_t::do_repair
);
560 req_flags
.must_repair
= (scrub_type
== scrub_type_t::do_repair
);
561 // User might intervene, so clear this
562 req_flags
.need_auto
= false;
563 req_flags
.req_scrub
= true;
565 dout(20) << __func__
<< " pg(" << m_pg_id
<< ") planned:" << req_flags
<< dendl
;
567 update_scrub_job(req_flags
);
568 m_pg
->publish_stats_to_osd();
572 void PgScrubber::request_rescrubbing(requested_scrub_t
& request_flags
)
574 dout(10) << __func__
<< " flags: " << request_flags
<< dendl
;
576 request_flags
.need_auto
= true;
577 update_scrub_job(request_flags
);
580 bool PgScrubber::reserve_local()
582 // try to create the reservation object (which translates into asking the
583 // OSD for the local scrub resource). If failing - undo it immediately
585 m_local_osd_resource
.emplace(m_osds
);
586 if (m_local_osd_resource
->is_reserved()) {
587 dout(15) << __func__
<< ": local resources reserved" << dendl
;
591 dout(10) << __func__
<< ": failed to reserve local scrub resources" << dendl
;
592 m_local_osd_resource
.reset();
596 // ----------------------------------------------------------------------------
598 bool PgScrubber::has_pg_marked_new_updates() const
600 auto last_applied
= m_pg
->recovery_state
.get_last_update_applied();
601 dout(10) << __func__
<< " recovery last: " << last_applied
602 << " vs. scrub's: " << m_subset_last_update
<< dendl
;
604 return last_applied
>= m_subset_last_update
;
607 void PgScrubber::set_subset_last_update(eversion_t e
)
609 m_subset_last_update
= e
;
610 dout(15) << __func__
<< " last-update: " << e
<< dendl
;
613 void PgScrubber::on_applied_when_primary(const eversion_t
& applied_version
)
615 // we are only interested in updates if we are the Primary, and in state
617 if (m_fsm
->is_accepting_updates() && (applied_version
>= m_subset_last_update
)) {
618 m_osds
->queue_scrub_applied_update(m_pg
, m_pg
->is_scrub_blocking_ops());
619 dout(15) << __func__
<< " update: " << applied_version
620 << " vs. required: " << m_subset_last_update
<< dendl
;
625 * The selected range is set directly into 'm_start' and 'm_end'
627 * - m_subset_last_update
632 bool PgScrubber::select_range()
634 m_primary_scrubmap
= ScrubMap
{};
635 m_received_maps
.clear();
637 /* get the start and end of our scrub chunk
639 * Our scrub chunk has an important restriction we're going to need to
640 * respect. We can't let head be start or end.
641 * Using a half-open interval means that if end == head,
642 * we'd scrub/lock head and the clone right next to head in different
643 * chunks which would allow us to miss clones created between
644 * scrubbing that chunk and scrubbing the chunk including head.
645 * This isn't true for any of the other clones since clones can
646 * only be created "just to the left of" head. There is one exception
647 * to this: promotion of clones which always happens to the left of the
648 * left-most clone, but promote_object checks the scrubber in that
649 * case, so it should be ok. Also, it's ok to "miss" clones at the
650 * left end of the range if we are a tier because they may legitimately
651 * not exist (see _scrub).
653 int min_idx
= static_cast<int>(std::max
<int64_t>(
654 3, m_pg
->get_cct()->_conf
->osd_scrub_chunk_min
/ (int)preemption_data
.chunk_divisor()));
656 int max_idx
= static_cast<int>(std::max
<int64_t>(min_idx
, m_pg
->get_cct()->_conf
->osd_scrub_chunk_max
/
657 (int)preemption_data
.chunk_divisor()));
659 dout(10) << __func__
<< " Min: " << min_idx
<< " Max: " << max_idx
660 << " Div: " << preemption_data
.chunk_divisor() << dendl
;
662 hobject_t start
= m_start
;
663 hobject_t candidate_end
;
664 std::vector
<hobject_t
> objects
;
665 int ret
= m_pg
->get_pgbackend()->objects_list_partial(start
, min_idx
, max_idx
, &objects
,
667 ceph_assert(ret
>= 0);
669 if (!objects
.empty()) {
671 hobject_t back
= objects
.back();
672 while (candidate_end
.is_head() && candidate_end
== back
.get_head()) {
673 candidate_end
= back
;
675 if (objects
.empty()) {
677 "Somehow we got more than 2 objects which"
678 "have the same head but are not clones");
680 back
= objects
.back();
683 if (candidate_end
.is_head()) {
684 ceph_assert(candidate_end
!= back
.get_head());
685 candidate_end
= candidate_end
.get_object_boundary();
689 ceph_assert(candidate_end
.is_max());
692 // is that range free for us? if not - we will be rescheduled later by whoever
693 // triggered us this time
695 if (!m_pg
->_range_available_for_scrub(m_start
, candidate_end
)) {
696 // we'll be requeued by whatever made us unavailable for scrub
697 dout(10) << __func__
<< ": scrub blocked somewhere in range "
698 << "[" << m_start
<< ", " << candidate_end
<< ")" << dendl
;
702 m_end
= candidate_end
;
703 if (m_end
> m_max_end
)
706 dout(15) << __func__
<< " range selected: " << m_start
<< " //// " << m_end
<< " //// "
707 << m_max_end
<< dendl
;
709 // debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command
710 if (m_debug_blockrange
> 0) {
711 m_debug_blockrange
--;
717 void PgScrubber::select_range_n_notify()
719 if (select_range()) {
720 // the next chunk to handle is not blocked
721 dout(20) << __func__
<< ": selection OK" << dendl
;
722 m_osds
->queue_scrub_chunk_free(m_pg
, Scrub::scrub_prio_t::low_priority
);
725 // we will wait for the objects range to become available for scrubbing
726 dout(10) << __func__
<< ": selected chunk is busy" << dendl
;
727 m_osds
->queue_scrub_chunk_busy(m_pg
, Scrub::scrub_prio_t::low_priority
);
731 bool PgScrubber::write_blocked_by_scrub(const hobject_t
& soid
)
733 if (soid
< m_start
|| soid
>= m_end
) {
737 dout(20) << __func__
<< " " << soid
<< " can preempt? "
738 << preemption_data
.is_preemptable() << " already preempted? "
739 << preemption_data
.was_preempted() << dendl
;
741 if (preemption_data
.was_preempted()) {
742 // otherwise - write requests arriving while 'already preempted' is set
743 // but 'preemptable' is not - will not be allowed to continue, and will
744 // not be requeued on time.
748 if (preemption_data
.is_preemptable()) {
750 dout(10) << __func__
<< " " << soid
<< " preempted" << dendl
;
752 // signal the preemption
753 preemption_data
.do_preempt();
754 m_end
= m_start
; // free the range we were scrubbing
761 bool PgScrubber::range_intersects_scrub(const hobject_t
& start
, const hobject_t
& end
)
763 // does [start, end] intersect [scrubber.start, scrubber.m_max_end)
764 return (start
< m_max_end
&& end
>= m_start
);
767 Scrub::BlockedRangeWarning
PgScrubber::acquire_blocked_alarm()
769 return std::make_unique
<blocked_range_t
>(m_osds
, ceph::timespan
{300s
}, m_pg_id
);
773 * if we are required to sleep:
774 * arrange a callback sometimes later.
775 * be sure to be able to identify a stale callback.
776 * Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue)
779 void PgScrubber::add_delayed_scheduling()
781 m_end
= m_start
; // not blocking any range now
783 milliseconds sleep_time
{0ms
};
786 1000.0 * m_osds
->get_scrub_services().scrub_sleep_time(m_flags
.required
);
787 sleep_time
= milliseconds
{int64_t(scrub_sleep
)};
789 dout(15) << __func__
<< " sleep: " << sleep_time
.count() << "ms. needed? "
790 << m_needs_sleep
<< dendl
;
792 if (sleep_time
.count()) {
793 // schedule a transition for some 'sleep_time' ms in the future
795 m_needs_sleep
= false;
796 m_sleep_started_at
= ceph_clock_now();
798 // the following log line is used by osd-scrub-test.sh
799 dout(20) << __func__
<< " scrub state is PendingTimer, sleeping" << dendl
;
801 // the 'delayer' for crimson is different. Will be factored out.
803 spg_t pgid
= m_pg
->get_pgid();
804 auto callbk
= new LambdaContext([osds
= m_osds
, pgid
, scrbr
= this](
805 [[maybe_unused
]] int r
) mutable {
806 PGRef pg
= osds
->osd
->lookup_lock_pg(pgid
);
808 lgeneric_subdout(g_ceph_context
, osd
, 10)
809 << "scrub_requeue_callback: Could not find "
810 << "PG " << pgid
<< " can't complete scrub requeue after sleep" << dendl
;
813 scrbr
->m_needs_sleep
= true;
814 lgeneric_dout(scrbr
->get_pg_cct(), 7)
815 << "scrub_requeue_callback: slept for "
816 << ceph_clock_now() - scrbr
->m_sleep_started_at
<< ", re-queuing scrub" << dendl
;
818 scrbr
->m_sleep_started_at
= utime_t
{};
819 osds
->queue_for_scrub_resched(&(*pg
), Scrub::scrub_prio_t::low_priority
);
823 std::lock_guard
l(m_osds
->sleep_lock
);
824 m_osds
->sleep_timer
.add_event_after(sleep_time
.count() / 1000.0f
, callbk
);
828 m_osds
->queue_for_scrub_resched(m_pg
, Scrub::scrub_prio_t::high_priority
);
832 eversion_t
PgScrubber::search_log_for_updates() const
834 auto& projected
= m_pg
->projected_log
.log
;
836 projected
.crbegin(), projected
.crend(),
837 [this](const auto& e
) -> bool { return e
.soid
>= m_start
&& e
.soid
< m_end
; });
839 if (pi
!= projected
.crend())
842 // there was no relevant update entry in the log
844 auto& log
= m_pg
->recovery_state
.get_pg_log().get_log().log
;
845 auto p
= find_if(log
.crbegin(), log
.crend(), [this](const auto& e
) -> bool {
846 return e
.soid
>= m_start
&& e
.soid
< m_end
;
849 if (p
== log
.crend())
855 void PgScrubber::get_replicas_maps(bool replica_can_preempt
)
857 dout(10) << __func__
<< " started in epoch/interval: " << m_epoch_start
<< "/"
859 << " pg same_interval_since: " << m_pg
->info
.history
.same_interval_since
862 m_primary_scrubmap_pos
.reset();
864 // ask replicas to scan and send maps
865 for (const auto& i
: m_pg
->get_acting_recovery_backfill()) {
867 if (i
== m_pg_whoami
)
870 m_maps_status
.mark_replica_map_request(i
);
871 _request_scrub_map(i
, m_subset_last_update
, m_start
, m_end
, m_is_deep
,
872 replica_can_preempt
);
875 dout(10) << __func__
<< " awaiting" << m_maps_status
<< dendl
;
878 bool PgScrubber::was_epoch_changed() const
880 // for crimson we have m_pg->get_info().history.same_interval_since
881 dout(10) << __func__
<< " epoch_start: " << m_interval_start
882 << " from pg: " << m_pg
->get_history().same_interval_since
<< dendl
;
884 return m_interval_start
< m_pg
->get_history().same_interval_since
;
887 void PgScrubber::mark_local_map_ready()
889 m_maps_status
.mark_local_map_ready();
892 bool PgScrubber::are_all_maps_available() const
894 return m_maps_status
.are_all_maps_available();
897 std::string
PgScrubber::dump_awaited_maps() const
899 return m_maps_status
.dump();
902 void PgScrubber::update_op_mode_text()
904 auto visible_repair
= state_test(PG_STATE_REPAIR
);
905 m_mode_desc
= (visible_repair
? "repair" : (m_is_deep
? "deep-scrub" : "scrub"));
907 dout(10) << __func__
<< ": repair: visible: " << (visible_repair
? "true" : "false")
908 << ", internal: " << (m_is_repair
? "true" : "false")
909 << ". Displayed: " << m_mode_desc
<< dendl
;
912 void PgScrubber::_request_scrub_map(pg_shard_t replica
,
917 bool allow_preemption
)
919 ceph_assert(replica
!= m_pg_whoami
);
920 dout(10) << __func__
<< " scrubmap from osd." << replica
921 << (deep
? " deep" : " shallow") << dendl
;
924 new MOSDRepScrub(spg_t(m_pg
->info
.pgid
.pgid
, replica
.shard
), version
,
925 get_osdmap_epoch(), m_pg
->get_last_peering_reset(), start
, end
, deep
,
926 allow_preemption
, m_flags
.priority
, m_pg
->ops_blocked_by_scrub());
928 // default priority. We want the replica-scrub processed prior to any recovery
929 // or client io messages (we are holding a lock!)
930 m_osds
->send_message_osd_cluster(replica
.osd
, repscrubop
, get_osdmap_epoch());
933 void PgScrubber::cleanup_store(ObjectStore::Transaction
* t
)
938 struct OnComplete
: Context
{
939 std::unique_ptr
<Scrub::Store
> store
;
940 explicit OnComplete(std::unique_ptr
<Scrub::Store
>&& store
) : store(std::move(store
))
942 void finish(int) override
{}
945 t
->register_on_complete(new OnComplete(std::move(m_store
)));
946 ceph_assert(!m_store
);
949 void PgScrubber::on_init()
951 // going upwards from 'inactive'
952 ceph_assert(!is_scrub_active());
953 m_pg
->reset_objects_scrubbed();
954 preemption_data
.reset();
955 m_pg
->publish_stats_to_osd();
956 m_interval_start
= m_pg
->get_history().same_interval_since
;
958 dout(10) << __func__
<< " start same_interval:" << m_interval_start
<< dendl
;
960 // create a new store
962 ObjectStore::Transaction t
;
965 Scrub::Store::create(m_pg
->osd
->store
, &t
, m_pg
->info
.pgid
, m_pg
->coll
));
966 m_pg
->osd
->store
->queue_transaction(m_pg
->ch
, std::move(t
), nullptr);
969 m_start
= m_pg
->info
.pgid
.pgid
.get_hobj_start();
971 ++m_sessions_counter
;
972 m_pg
->publish_stats_to_osd();
975 void PgScrubber::on_replica_init()
978 ++m_sessions_counter
;
981 void PgScrubber::_scan_snaps(ScrubMap
& smap
)
986 // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
988 dout(15) << "_scan_snaps starts" << dendl
;
990 for (auto i
= smap
.objects
.rbegin(); i
!= smap
.objects
.rend(); ++i
) {
992 const hobject_t
& hoid
= i
->first
;
993 ScrubMap::object
& o
= i
->second
;
995 dout(20) << __func__
<< " " << hoid
<< dendl
;
997 ceph_assert(!hoid
.is_snapdir());
998 if (hoid
.is_head()) {
1001 if (o
.attrs
.find(SS_ATTR
) == o
.attrs
.end()) {
1004 bl
.push_back(o
.attrs
[SS_ATTR
]);
1005 auto p
= bl
.cbegin();
1011 head
= hoid
.get_head();
1015 if (hoid
.snap
< CEPH_MAXSNAP
) {
1016 // check and if necessary fix snap_mapper
1017 if (hoid
.get_head() != head
) {
1018 derr
<< __func__
<< " no head for " << hoid
<< " (have " << head
<< ")" << dendl
;
1021 set
<snapid_t
> obj_snaps
;
1022 auto p
= snapset
.clone_snaps
.find(hoid
.snap
);
1023 if (p
== snapset
.clone_snaps
.end()) {
1024 derr
<< __func__
<< " no clone_snaps for " << hoid
<< " in " << snapset
<< dendl
;
1027 obj_snaps
.insert(p
->second
.begin(), p
->second
.end());
1028 set
<snapid_t
> cur_snaps
;
1029 int r
= m_pg
->snap_mapper
.get_snaps(hoid
, &cur_snaps
);
1030 if (r
!= 0 && r
!= -ENOENT
) {
1031 derr
<< __func__
<< ": get_snaps returned " << cpp_strerror(r
) << dendl
;
1034 if (r
== -ENOENT
|| cur_snaps
!= obj_snaps
) {
1035 ObjectStore::Transaction t
;
1036 OSDriver::OSTransaction
_t(m_pg
->osdriver
.get_transaction(&t
));
1038 r
= m_pg
->snap_mapper
.remove_oid(hoid
, &_t
);
1040 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
1043 m_pg
->osd
->clog
->error()
1044 << "osd." << m_pg
->osd
->whoami
<< " found snap mapper error on pg "
1045 << m_pg
->info
.pgid
<< " oid " << hoid
<< " snaps in mapper: " << cur_snaps
1046 << ", oi: " << obj_snaps
<< "...repaired";
1048 m_pg
->osd
->clog
->error()
1049 << "osd." << m_pg
->osd
->whoami
<< " found snap mapper error on pg "
1050 << m_pg
->info
.pgid
<< " oid " << hoid
<< " snaps missing in mapper"
1051 << ", should be: " << obj_snaps
<< " was " << cur_snaps
<< " r " << r
1054 m_pg
->snap_mapper
.add_oid(hoid
, obj_snaps
, &_t
);
1056 // wait for repair to apply to avoid confusing other bits of the system.
1058 dout(15) << __func__
<< " wait on repair!" << dendl
;
1060 ceph::condition_variable my_cond
;
1061 ceph::mutex my_lock
= ceph::make_mutex("PG::_scan_snaps my_lock");
1065 t
.register_on_applied_sync(new C_SafeCond(my_lock
, my_cond
, &done
, &e
));
1067 e
= m_pg
->osd
->store
->queue_transaction(m_pg
->ch
, std::move(t
));
1069 derr
<< __func__
<< ": queue_transaction got " << cpp_strerror(e
) << dendl
;
1071 std::unique_lock l
{my_lock
};
1072 my_cond
.wait(l
, [&done
] { return done
; });
1080 int PgScrubber::build_primary_map_chunk()
1082 epoch_t map_building_since
= m_pg
->get_osdmap_epoch();
1083 dout(20) << __func__
<< ": initiated at epoch " << map_building_since
<< dendl
;
1085 auto ret
= build_scrub_map_chunk(m_primary_scrubmap
, m_primary_scrubmap_pos
, m_start
,
1088 if (ret
== -EINPROGRESS
) {
1089 // reschedule another round of asking the backend to collect the scrub data
1090 m_osds
->queue_for_scrub_resched(m_pg
, Scrub::scrub_prio_t::low_priority
);
1095 int PgScrubber::build_replica_map_chunk()
1097 dout(10) << __func__
<< " interval start: " << m_interval_start
1098 << " current token: " << m_current_token
<< " epoch: " << m_epoch_start
1099 << " deep: " << m_is_deep
<< dendl
;
1101 auto ret
= build_scrub_map_chunk(replica_scrubmap
, replica_scrubmap_pos
, m_start
, m_end
,
1107 // must wait for the backend to finish. No external event source.
1108 // (note: previous version used low priority here. Now switched to using the
1109 // priority of the original message)
1110 m_osds
->queue_for_rep_scrub_resched(m_pg
, m_replica_request_priority
,
1111 m_flags
.priority
, m_current_token
);
1116 m_cleaned_meta_map
.clear_from(m_start
);
1117 m_cleaned_meta_map
.insert(replica_scrubmap
);
1118 auto for_meta_scrub
= clean_meta_map();
1119 _scan_snaps(for_meta_scrub
);
1121 // the local map has been created. Send it to the primary.
1122 // Note: once the message reaches the Primary, it may ask us for another
1123 // chunk - and we better be done with the current scrub. Thus - the preparation of
1124 // the reply message is separate, and we clear the scrub state before actually
1127 auto reply
= prep_replica_map_msg(PreemptionNoted::no_preemption
);
1128 replica_handling_done();
1129 dout(15) << __func__
<< " chunk map sent " << dendl
;
1130 send_replica_map(reply
);
1134 // negative retval: build_scrub_map_chunk() signalled an error
1135 // Pre-Pacific code ignored this option, treating it as a success.
1136 // \todo Add an error flag in the returning message.
1137 dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
1139 replica_handling_done();
1140 // only in debug mode for now:
1141 assert(false && "backend error");
1148 int PgScrubber::build_scrub_map_chunk(
1149 ScrubMap
& map
, ScrubMapBuilder
& pos
, hobject_t start
, hobject_t end
, bool deep
)
1151 dout(10) << __func__
<< " [" << start
<< "," << end
<< ") "
1152 << " pos " << pos
<< " Deep: " << deep
<< dendl
;
1155 while (pos
.empty()) {
1158 map
.valid_through
= m_pg
->info
.last_update
;
1161 vector
<ghobject_t
> rollback_obs
;
1163 m_pg
->get_pgbackend()->objects_list_range(start
, end
, &pos
.ls
, &rollback_obs
);
1164 dout(10) << __func__
<< " while pos empty " << pos
.ret
<< dendl
;
1166 dout(5) << "objects_list_range error: " << pos
.ret
<< dendl
;
1169 dout(10) << __func__
<< " pos.ls.empty()? " << (pos
.ls
.empty() ? "+" : "-") << dendl
;
1170 if (pos
.ls
.empty()) {
1173 m_pg
->_scan_rollback_obs(rollback_obs
);
1175 return -EINPROGRESS
;
1179 while (!pos
.done()) {
1181 int r
= m_pg
->get_pgbackend()->be_scan_list(map
, pos
);
1182 dout(30) << __func__
<< " BE returned " << r
<< dendl
;
1183 if (r
== -EINPROGRESS
) {
1184 dout(20) << __func__
<< " in progress" << dendl
;
1190 dout(20) << __func__
<< " finishing" << dendl
;
1191 ceph_assert(pos
.done());
1192 m_pg
->_repair_oinfo_oid(map
);
1194 dout(20) << __func__
<< " done, got " << map
.objects
.size() << " items" << dendl
;
1200 * Building a map of objects suitable for snapshot validation.
1201 * The data in m_cleaned_meta_map is the left over partial items that need to
1202 * be completed before they can be processed.
1204 * Snapshots in maps precede the head object, which is why we are scanning backwards.
1206 ScrubMap
PgScrubber::clean_meta_map()
1208 ScrubMap for_meta_scrub
;
1210 if (m_end
.is_max() || m_cleaned_meta_map
.objects
.empty()) {
1211 m_cleaned_meta_map
.swap(for_meta_scrub
);
1213 auto iter
= m_cleaned_meta_map
.objects
.end();
1214 --iter
; // not empty, see 'if' clause
1215 auto begin
= m_cleaned_meta_map
.objects
.begin();
1216 if (iter
->first
.has_snapset()) {
1219 while (iter
!= begin
) {
1221 if (next
->first
.get_head() != iter
->first
.get_head()) {
1227 for_meta_scrub
.objects
.insert(begin
, iter
);
1228 m_cleaned_meta_map
.objects
.erase(begin
, iter
);
1231 return for_meta_scrub
;
1234 void PgScrubber::run_callbacks()
1236 std::list
<Context
*> to_run
;
1237 to_run
.swap(m_callbacks
);
1239 for (auto& tr
: to_run
) {
1244 void PgScrubber::maps_compare_n_cleanup()
1246 scrub_compare_maps();
1250 m_osds
->queue_scrub_maps_compared(m_pg
, Scrub::scrub_prio_t::low_priority
);
1253 Scrub::preemption_t
& PgScrubber::get_preemptor()
1255 return preemption_data
;
1259 * Process note: called for the arriving "give me your map, replica!" request.
1260 * Unlike the original implementation, we do not requeue the Op waiting for
1261 * updates. Instead - we trigger the FSM.
1263 void PgScrubber::replica_scrub_op(OpRequestRef op
)
1266 auto msg
= op
->get_req
<MOSDRepScrub
>();
1267 dout(10) << __func__
<< " pg:" << m_pg
->pg_id
1268 << " Msg: map_epoch:" << msg
->map_epoch
1269 << " min_epoch:" << msg
->min_epoch
<< " deep?" << msg
->deep
<< dendl
;
1271 // are we still processing a previous scrub-map request without noticing that
1272 // the interval changed? won't see it here, but rather at the reservation
1275 if (msg
->map_epoch
< m_pg
->info
.history
.same_interval_since
) {
1276 dout(10) << "replica_scrub_op discarding old replica_scrub from "
1277 << msg
->map_epoch
<< " < "
1278 << m_pg
->info
.history
.same_interval_since
<< dendl
;
1280 // is there a general sync issue? are we holding a stale reservation?
1281 // not checking now - assuming we will actively react to interval change.
1286 if (is_queued_or_active()) {
1288 // Somehow, we have received a new scrub request from our Primary, before
1289 // having finished with the previous one. Did we go through an interval
1290 // change without reseting the FSM? Possible responses:
1291 // - crashing (the original assert_not_active() implemented that one), or
1292 // - trying to recover:
1293 // - (logging enough information to debug this scenario)
1295 m_osds
->clog
->warn() << fmt::format(
1296 "{}: error: a second scrub-op received while handling the previous one",
1299 scrub_clear_state();
1300 m_osds
->clog
->warn() << fmt::format(
1301 "{}: after a reset. Now handling the new OP", __func__
);
1303 // make sure the FSM is at NotActive
1304 m_fsm
->assert_not_active();
1306 replica_scrubmap
= ScrubMap
{};
1307 replica_scrubmap_pos
= ScrubMapBuilder
{};
1309 m_replica_min_epoch
= msg
->min_epoch
;
1310 m_start
= msg
->start
;
1312 m_max_end
= msg
->end
;
1313 m_is_deep
= msg
->deep
;
1314 m_interval_start
= m_pg
->info
.history
.same_interval_since
;
1315 m_replica_request_priority
= msg
->high_priority
1316 ? Scrub::scrub_prio_t::high_priority
1317 : Scrub::scrub_prio_t::low_priority
;
1318 m_flags
.priority
= msg
->priority
? msg
->priority
: m_pg
->get_scrub_priority();
1320 preemption_data
.reset();
1321 preemption_data
.force_preemptability(msg
->allow_preemption
);
1323 replica_scrubmap_pos
.reset();
1325 set_queued_or_active();
1326 m_osds
->queue_for_rep_scrub(m_pg
, m_replica_request_priority
,
1327 m_flags
.priority
, m_current_token
);
1330 void PgScrubber::set_op_parameters(requested_scrub_t
& request
)
1332 dout(10) << __func__
<< " input: " << request
<< dendl
;
1334 set_queued_or_active(); // we are fully committed now.
1336 // write down the epoch of starting a new scrub. Will be used
1337 // to discard stale messages from previous aborted scrubs.
1338 m_epoch_start
= m_pg
->get_osdmap_epoch();
1340 m_flags
.check_repair
= request
.check_repair
;
1341 m_flags
.auto_repair
= request
.auto_repair
|| request
.need_auto
;
1342 m_flags
.required
= request
.req_scrub
|| request
.must_scrub
;
1344 m_flags
.priority
= (request
.must_scrub
|| request
.need_auto
)
1345 ? get_pg_cct()->_conf
->osd_requested_scrub_priority
1346 : m_pg
->get_scrub_priority();
1348 state_set(PG_STATE_SCRUBBING
);
1350 // will we be deep-scrubbing?
1351 if (request
.must_deep_scrub
|| request
.need_auto
|| request
.time_for_deep
) {
1352 state_set(PG_STATE_DEEP_SCRUB
);
1355 // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e.
1356 // deep-scrub with the auto_repair configuration flag set). m_is_repair value
1357 // determines the scrubber behavior.
1358 // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the
1359 // PG status as appearing in the logs).
1360 m_is_repair
= request
.must_repair
|| m_flags
.auto_repair
;
1361 if (request
.must_repair
) {
1362 state_set(PG_STATE_REPAIR
);
1363 // not calling update_op_mode_text() yet, as m_is_deep not set yet
1366 // the publishing here is required for tests synchronization
1367 m_pg
->publish_stats_to_osd();
1368 m_flags
.deep_scrub_on_error
= request
.deep_scrub_on_error
;
1371 void PgScrubber::scrub_compare_maps()
1373 dout(10) << __func__
<< " has maps, analyzing" << dendl
;
1375 // construct authoritative scrub map for type-specific scrubbing
1376 m_cleaned_meta_map
.insert(m_primary_scrubmap
);
1377 map
<hobject_t
, pair
<std::optional
<uint32_t>, std::optional
<uint32_t>>> missing_digest
;
1379 map
<pg_shard_t
, ScrubMap
*> maps
;
1380 maps
[m_pg_whoami
] = &m_primary_scrubmap
;
1382 for (const auto& i
: m_pg
->get_acting_recovery_backfill()) {
1383 if (i
== m_pg_whoami
)
1385 dout(2) << __func__
<< " replica " << i
<< " has "
1386 << m_received_maps
[i
].objects
.size() << " items" << dendl
;
1387 maps
[i
] = &m_received_maps
[i
];
1390 set
<hobject_t
> master_set
;
1392 // Construct master set
1393 for (const auto& map
: maps
) {
1394 for (const auto& i
: map
.second
->objects
) {
1395 master_set
.insert(i
.first
);
1400 m_pg
->get_pgbackend()->be_omap_checks(maps
, master_set
, m_omap_stats
, ss
);
1402 if (!ss
.str().empty()) {
1403 m_osds
->clog
->warn(ss
);
1406 if (m_pg
->recovery_state
.get_acting_recovery_backfill().size() > 1) {
1408 dout(10) << __func__
<< " comparing replica scrub maps" << dendl
;
1410 // Map from object with errors to good peer
1411 map
<hobject_t
, list
<pg_shard_t
>> authoritative
;
1413 dout(2) << __func__
<< ": primary (" << m_pg
->get_primary() << ") has "
1414 << m_primary_scrubmap
.objects
.size() << " items" << dendl
;
1415 m_pg
->add_objects_scrubbed_count(m_primary_scrubmap
.objects
.size());
1420 m_pg
->get_pgbackend()->be_compare_scrubmaps(
1421 maps
, master_set
, m_is_repair
, m_missing
, m_inconsistent
,
1422 authoritative
, missing_digest
, m_shallow_errors
, m_deep_errors
, m_store
.get(),
1423 m_pg
->info
.pgid
, m_pg
->recovery_state
.get_acting(), ss
);
1425 if (!ss
.str().empty()) {
1426 m_osds
->clog
->error(ss
);
1429 for (auto& i
: authoritative
) {
1430 list
<pair
<ScrubMap::object
, pg_shard_t
>> good_peers
;
1431 for (list
<pg_shard_t
>::const_iterator j
= i
.second
.begin(); j
!= i
.second
.end();
1433 good_peers
.emplace_back(maps
[*j
]->objects
[i
.first
], *j
);
1435 m_authoritative
.emplace(i
.first
, good_peers
);
1438 for (auto i
= authoritative
.begin(); i
!= authoritative
.end(); ++i
) {
1439 m_cleaned_meta_map
.objects
.erase(i
->first
);
1440 m_cleaned_meta_map
.objects
.insert(
1441 *(maps
[i
->second
.back()]->objects
.find(i
->first
)));
1445 auto for_meta_scrub
= clean_meta_map();
1447 // ok, do the pg-type specific scrubbing
1449 // (Validates consistency of the object info and snap sets)
1450 scrub_snapshot_metadata(for_meta_scrub
, missing_digest
);
1452 // Called here on the primary can use an authoritative map if it isn't the primary
1453 _scan_snaps(for_meta_scrub
);
1455 if (!m_store
->empty()) {
1458 dout(10) << __func__
<< ": discarding scrub results" << dendl
;
1459 m_store
->flush(nullptr);
1461 dout(10) << __func__
<< ": updating scrub object" << dendl
;
1462 ObjectStore::Transaction t
;
1464 m_pg
->osd
->store
->queue_transaction(m_pg
->ch
, std::move(t
), nullptr);
1469 ScrubMachineListener::MsgAndEpoch
PgScrubber::prep_replica_map_msg(
1470 PreemptionNoted was_preempted
)
1472 dout(10) << __func__
<< " min epoch:" << m_replica_min_epoch
<< dendl
;
1475 make_message
<MOSDRepScrubMap
>(spg_t(m_pg
->info
.pgid
.pgid
, m_pg
->get_primary().shard
),
1476 m_replica_min_epoch
, m_pg_whoami
);
1478 reply
->preempted
= (was_preempted
== PreemptionNoted::preempted
);
1479 ::encode(replica_scrubmap
, reply
->get_data());
1481 return ScrubMachineListener::MsgAndEpoch
{reply
, m_replica_min_epoch
};
1484 void PgScrubber::send_replica_map(const MsgAndEpoch
& preprepared
)
1486 m_pg
->send_cluster_message(m_pg
->get_primary().osd
, preprepared
.m_msg
,
1487 preprepared
.m_epoch
, false);
1490 void PgScrubber::send_preempted_replica()
1493 make_message
<MOSDRepScrubMap
>(spg_t
{m_pg
->info
.pgid
.pgid
, m_pg
->get_primary().shard
},
1494 m_replica_min_epoch
, m_pg_whoami
);
1496 reply
->preempted
= true;
1497 ::encode(replica_scrubmap
, reply
->get_data()); // must not skip this
1498 m_pg
->send_cluster_message(m_pg
->get_primary().osd
, reply
, m_replica_min_epoch
, false);
1502 * - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
1503 * The state-machine will react to that when all replica maps are received.
1504 * - when all maps are received, we signal the FSM with the GotReplicas event (see
1505 * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
1506 * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
1509 void PgScrubber::map_from_replica(OpRequestRef op
)
1511 auto m
= op
->get_req
<MOSDRepScrubMap
>();
1512 dout(15) << __func__
<< " " << *m
<< dendl
;
1514 if (m
->map_epoch
< m_pg
->info
.history
.same_interval_since
) {
1515 dout(10) << __func__
<< " discarding old from " << m
->map_epoch
<< " < "
1516 << m_pg
->info
.history
.same_interval_since
<< dendl
;
1520 auto p
= const_cast<bufferlist
&>(m
->get_data()).cbegin();
1522 m_received_maps
[m
->from
].decode(p
, m_pg
->info
.pgid
.pool());
1523 dout(15) << "map version is " << m_received_maps
[m
->from
].valid_through
<< dendl
;
1525 auto [is_ok
, err_txt
] = m_maps_status
.mark_arriving_map(m
->from
);
1527 // previously an unexpected map was triggering an assert. Now, as scrubs can be
1528 // aborted at any time, the chances of this happening have increased, and aborting is
1530 dout(1) << __func__
<< err_txt
<< " from OSD " << m
->from
<< dendl
;
1535 dout(10) << __func__
<< " replica was preempted, setting flag" << dendl
;
1536 preemption_data
.do_preempt();
1539 if (m_maps_status
.are_all_maps_available()) {
1540 dout(15) << __func__
<< " all repl-maps available" << dendl
;
1541 m_osds
->queue_scrub_got_repl_maps(m_pg
, m_pg
->is_scrub_blocking_ops());
1545 void PgScrubber::handle_scrub_reserve_request(OpRequestRef op
)
1547 dout(10) << __func__
<< " " << *op
->get_req() << dendl
;
1549 auto request_ep
= op
->get_req
<MOSDScrubReserve
>()->get_map_epoch();
1552 * if we are currently holding a reservation, then:
1553 * either (1) we, the scrubber, did not yet notice an interval change. The remembered
1554 * reservation epoch is from before our interval, and we can silently discard the
1555 * reservation (no message is required).
1557 * (2) the interval hasn't changed, but the same Primary that (we think) holds the
1558 * lock just sent us a new request. Note that we know it's the same Primary, as
1559 * otherwise the interval would have changed.
1560 * Ostensibly we can discard & redo the reservation. But then we
1561 * will be temporarily releasing the OSD resource - and might not be able to grab it
1562 * again. Thus, we simply treat this as a successful new request
1563 * (but mark the fact that if there is a previous request from the primary to
1564 * scrub a specific chunk - that request is now defunct).
1567 if (m_remote_osd_resource
.has_value() && m_remote_osd_resource
->is_stale()) {
1568 // we are holding a stale reservation from a past epoch
1569 m_remote_osd_resource
.reset();
1570 dout(10) << __func__
<< " cleared existing stale reservation" << dendl
;
1573 if (request_ep
< m_pg
->get_same_interval_since()) {
1574 // will not ack stale requests
1578 bool granted
{false};
1579 if (m_remote_osd_resource
.has_value()) {
1581 dout(10) << __func__
<< " already reserved." << dendl
;
1584 * it might well be that we did not yet finish handling the latest scrub-op from
1585 * our primary. This happens, for example, if 'noscrub' was set via a command, then
1586 * reset. The primary in this scenario will remain in the same interval, but we do need
1587 * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
1588 * from the primary will see us in active state, crashing the OSD).
1593 } else if (m_pg
->cct
->_conf
->osd_scrub_during_recovery
||
1594 !m_osds
->is_recovery_active()) {
1595 m_remote_osd_resource
.emplace(this, m_pg
, m_osds
, request_ep
);
1596 // OSD resources allocated?
1597 granted
= m_remote_osd_resource
->is_reserved();
1600 m_remote_osd_resource
.reset();
1601 dout(20) << __func__
<< ": failed to reserve remotely" << dendl
;
1605 dout(10) << __func__
<< " reserved? " << (granted
? "yes" : "no") << dendl
;
1607 Message
* reply
= new MOSDScrubReserve(
1608 spg_t(m_pg
->info
.pgid
.pgid
, m_pg
->get_primary().shard
), request_ep
,
1609 granted
? MOSDScrubReserve::GRANT
: MOSDScrubReserve::REJECT
, m_pg_whoami
);
1611 m_osds
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
1614 void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
)
1616 dout(10) << __func__
<< " " << *op
->get_req() << dendl
;
1619 if (m_reservations
.has_value()) {
1620 m_reservations
->handle_reserve_grant(op
, from
);
1622 derr
<< __func__
<< ": received unsolicited reservation grant from osd " << from
1623 << " (" << op
<< ")" << dendl
;
1627 void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
)
1629 dout(10) << __func__
<< " " << *op
->get_req() << dendl
;
1632 if (m_reservations
.has_value()) {
1633 // there is an active reservation process. No action is required otherwise.
1634 m_reservations
->handle_reserve_reject(op
, from
);
1638 void PgScrubber::handle_scrub_reserve_release(OpRequestRef op
)
1640 dout(10) << __func__
<< " " << *op
->get_req() << dendl
;
1644 * this specific scrub session has terminated. All incoming events carrying the old
1645 * tag will be discarded.
1648 m_remote_osd_resource
.reset();
1651 void PgScrubber::discard_replica_reservations()
1653 dout(10) << __func__
<< dendl
;
1654 if (m_reservations
.has_value()) {
1655 m_reservations
->discard_all();
1659 void PgScrubber::clear_scrub_reservations()
1661 dout(10) << __func__
<< dendl
;
1662 m_reservations
.reset(); // the remote reservations
1663 m_local_osd_resource
.reset(); // the local reservation
1664 m_remote_osd_resource
.reset(); // we as replica reserved for a Primary
1667 void PgScrubber::message_all_replicas(int32_t opcode
, std::string_view op_text
)
1669 ceph_assert(m_pg
->recovery_state
.get_backfill_targets().empty());
1671 std::vector
<pair
<int, Message
*>> messages
;
1672 messages
.reserve(m_pg
->get_actingset().size());
1674 epoch_t epch
= get_osdmap_epoch();
1676 for (auto& p
: m_pg
->get_actingset()) {
1678 if (p
== m_pg_whoami
)
1681 dout(10) << "scrub requesting " << op_text
<< " from osd." << p
<< " Epoch: " << epch
1683 Message
* m
= new MOSDScrubReserve(spg_t(m_pg
->info
.pgid
.pgid
, p
.shard
), epch
, opcode
,
1685 messages
.push_back(std::make_pair(p
.osd
, m
));
1688 if (!messages
.empty()) {
1689 m_osds
->send_message_osd_cluster(messages
, epch
);
1693 void PgScrubber::unreserve_replicas()
1695 dout(10) << __func__
<< dendl
;
1696 m_reservations
.reset();
1699 void PgScrubber::set_reserving_now()
1701 m_osds
->get_scrub_services().set_reserving_now();
1704 void PgScrubber::clear_reserving_now()
1706 m_osds
->get_scrub_services().clear_reserving_now();
1709 void PgScrubber::set_queued_or_active()
1711 m_queued_or_active
= true;
1714 void PgScrubber::clear_queued_or_active()
1716 if (m_queued_or_active
) {
1717 m_queued_or_active
= false;
1718 // and just in case snap trimming was blocked by the aborted scrub
1719 m_pg
->snap_trimmer_scrub_complete();
1723 bool PgScrubber::is_queued_or_active() const
1725 return m_queued_or_active
;
1728 [[nodiscard
]] bool PgScrubber::scrub_process_inconsistent()
1730 dout(10) << __func__
<< ": checking authoritative (mode="
1731 << m_mode_desc
<< ", auth remaining #: " << m_authoritative
.size()
1734 // authoritative only store objects which are missing or inconsistent.
1735 if (!m_authoritative
.empty()) {
1738 ss
<< m_pg
->info
.pgid
<< " " << m_mode_desc
<< " " << m_missing
.size() << " missing, "
1739 << m_inconsistent
.size() << " inconsistent objects";
1740 dout(2) << ss
.str() << dendl
;
1741 m_osds
->clog
->error(ss
);
1744 state_clear(PG_STATE_CLEAN
);
1745 // we know we have a problem, so it's OK to set the user-visible flag
1746 // even if we only reached here via auto-repair
1747 state_set(PG_STATE_REPAIR
);
1748 update_op_mode_text();
1750 for (const auto& [hobj
, shrd_list
] : m_authoritative
) {
1752 auto missing_entry
= m_missing
.find(hobj
);
1754 if (missing_entry
!= m_missing
.end()) {
1755 m_pg
->repair_object(hobj
, shrd_list
, missing_entry
->second
);
1756 m_fixed_count
+= missing_entry
->second
.size();
1759 if (m_inconsistent
.count(hobj
)) {
1760 m_pg
->repair_object(hobj
, shrd_list
, m_inconsistent
[hobj
]);
1761 m_fixed_count
+= m_inconsistent
[hobj
].size();
1766 return (!m_authoritative
.empty() && m_is_repair
);
1770 * note: only called for the Primary.
1772 void PgScrubber::scrub_finish()
1774 dout(10) << __func__
<< " before flags: " << m_flags
1775 << ". repair state: " << (state_test(PG_STATE_REPAIR
) ? "repair" : "no-repair")
1776 << ". deep_scrub_on_error: " << m_flags
.deep_scrub_on_error
<< dendl
;
1778 ceph_assert(m_pg
->is_locked());
1779 ceph_assert(is_queued_or_active());
1781 m_pg
->m_planned_scrub
= requested_scrub_t
{};
1783 // if the repair request comes from auto-repair and large number of errors,
1784 // we would like to cancel auto-repair
1785 if (m_is_repair
&& m_flags
.auto_repair
&&
1786 m_authoritative
.size() > m_pg
->cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
1788 dout(10) << __func__
<< " undoing the repair" << dendl
;
1789 state_clear(PG_STATE_REPAIR
); // not expected to be set, anyway
1790 m_is_repair
= false;
1791 update_op_mode_text();
1794 bool do_auto_scrub
= false;
1796 // if a regular scrub had errors within the limit, do a deep scrub to auto repair
1797 if (m_flags
.deep_scrub_on_error
&& !m_authoritative
.empty() &&
1798 m_authoritative
.size() <= m_pg
->cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
1799 ceph_assert(!m_is_deep
);
1800 do_auto_scrub
= true;
1801 dout(15) << __func__
<< " Try to auto repair after scrub errors" << dendl
;
1804 m_flags
.deep_scrub_on_error
= false;
1806 // type-specific finish (can tally more errors)
1809 bool has_error
= scrub_process_inconsistent();
1813 oss
<< m_pg
->info
.pgid
.pgid
<< " " << m_mode_desc
<< " ";
1814 int total_errors
= m_shallow_errors
+ m_deep_errors
;
1816 oss
<< total_errors
<< " errors";
1819 if (!m_is_deep
&& m_pg
->info
.stats
.stats
.sum
.num_deep_scrub_errors
)
1820 oss
<< " ( " << m_pg
->info
.stats
.stats
.sum
.num_deep_scrub_errors
1821 << " remaining deep scrub error details lost)";
1823 oss
<< ", " << m_fixed_count
<< " fixed";
1825 m_osds
->clog
->error(oss
);
1827 m_osds
->clog
->debug(oss
);
1830 // Since we don't know which errors were fixed, we can only clear them
1831 // when every one has been fixed.
1833 if (m_fixed_count
== m_shallow_errors
+ m_deep_errors
) {
1835 ceph_assert(m_is_deep
);
1836 m_shallow_errors
= 0;
1838 dout(20) << __func__
<< " All may be fixed" << dendl
;
1840 } else if (has_error
) {
1842 // Deep scrub in order to get corrected error counts
1843 m_pg
->scrub_after_recovery
= true;
1844 m_pg
->m_planned_scrub
.req_scrub
=
1845 m_pg
->m_planned_scrub
.req_scrub
|| m_flags
.required
;
1847 dout(20) << __func__
<< " Current 'required': " << m_flags
.required
1848 << " Planned 'req_scrub': " << m_pg
->m_planned_scrub
.req_scrub
<< dendl
;
1850 } else if (m_shallow_errors
|| m_deep_errors
) {
1852 // We have errors but nothing can be fixed, so there is no repair
1854 state_set(PG_STATE_FAILED_REPAIR
);
1855 dout(10) << __func__
<< " " << (m_shallow_errors
+ m_deep_errors
)
1856 << " error(s) present with no repair possible" << dendl
;
1862 ObjectStore::Transaction t
;
1863 m_pg
->recovery_state
.update_stats(
1864 [this](auto& history
, auto& stats
) {
1865 dout(10) << "m_pg->recovery_state.update_stats()" << dendl
;
1866 utime_t now
= ceph_clock_now();
1867 history
.last_scrub
= m_pg
->recovery_state
.get_info().last_update
;
1868 history
.last_scrub_stamp
= now
;
1870 history
.last_deep_scrub
= m_pg
->recovery_state
.get_info().last_update
;
1871 history
.last_deep_scrub_stamp
= now
;
1875 if ((m_shallow_errors
== 0) && (m_deep_errors
== 0))
1876 history
.last_clean_scrub_stamp
= now
;
1877 stats
.stats
.sum
.num_shallow_scrub_errors
= m_shallow_errors
;
1878 stats
.stats
.sum
.num_deep_scrub_errors
= m_deep_errors
;
1879 stats
.stats
.sum
.num_large_omap_objects
= m_omap_stats
.large_omap_objects
;
1880 stats
.stats
.sum
.num_omap_bytes
= m_omap_stats
.omap_bytes
;
1881 stats
.stats
.sum
.num_omap_keys
= m_omap_stats
.omap_keys
;
1882 dout(25) << "scrub_finish shard " << m_pg_whoami
1883 << " num_omap_bytes = " << stats
.stats
.sum
.num_omap_bytes
1884 << " num_omap_keys = " << stats
.stats
.sum
.num_omap_keys
<< dendl
;
1886 stats
.stats
.sum
.num_shallow_scrub_errors
= m_shallow_errors
;
1887 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
1888 // because of deep-scrub errors
1889 if (m_shallow_errors
== 0)
1890 history
.last_clean_scrub_stamp
= now
;
1892 stats
.stats
.sum
.num_scrub_errors
= stats
.stats
.sum
.num_shallow_scrub_errors
+
1893 stats
.stats
.sum
.num_deep_scrub_errors
;
1894 if (m_flags
.check_repair
) {
1895 m_flags
.check_repair
= false;
1896 if (m_pg
->info
.stats
.stats
.sum
.num_scrub_errors
) {
1897 state_set(PG_STATE_FAILED_REPAIR
);
1898 dout(10) << "scrub_finish " << m_pg
->info
.stats
.stats
.sum
.num_scrub_errors
1899 << " error(s) still present after re-scrub" << dendl
;
1905 int tr
= m_osds
->store
->queue_transaction(m_pg
->ch
, std::move(t
), nullptr);
1906 ceph_assert(tr
== 0);
1910 m_pg
->queue_peering_event(PGPeeringEventRef(std::make_shared
<PGPeeringEvent
>(
1911 get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
1913 m_is_repair
= false;
1914 state_clear(PG_STATE_REPAIR
);
1915 update_op_mode_text();
1918 cleanup_on_finish();
1919 if (do_auto_scrub
) {
1920 request_rescrubbing(m_pg
->m_planned_scrub
);
1923 if (m_pg
->is_active() && m_pg
->is_primary()) {
1924 m_pg
->recovery_state
.share_pg_info();
1928 void PgScrubber::on_digest_updates()
1930 dout(10) << __func__
<< " #pending: " << num_digest_updates_pending
1931 << (m_end
.is_max() ? " <last chunk>" : " <mid chunk>")
1932 << (is_queued_or_active() ? "" : " ** not marked as scrubbing **")
1935 if (num_digest_updates_pending
> 0) {
1936 // do nothing for now. We will be called again when new updates arrive
1940 // got all updates, and finished with this chunk. Any more?
1941 if (m_end
.is_max()) {
1942 m_osds
->queue_scrub_is_finished(m_pg
);
1944 // go get a new chunk (via "requeue")
1945 preemption_data
.reset();
1946 m_osds
->queue_scrub_next_chunk(m_pg
, m_pg
->is_scrub_blocking_ops());
1951 * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
1952 * is cleared once scrubbing starts; Some of the values dumped here are
1955 void PgScrubber::dump_scrubber(ceph::Formatter
* f
,
1956 const requested_scrub_t
& request_flags
) const
1958 f
->open_object_section("scrubber");
1960 if (m_active
) { // TBD replace with PR#42780's test
1961 f
->dump_bool("active", true);
1962 dump_active_scrubber(f
, state_test(PG_STATE_DEEP_SCRUB
));
1964 f
->dump_bool("active", false);
1965 f
->dump_bool("must_scrub",
1966 (m_pg
->m_planned_scrub
.must_scrub
|| m_flags
.required
));
1967 f
->dump_bool("must_deep_scrub", request_flags
.must_deep_scrub
);
1968 f
->dump_bool("must_repair", request_flags
.must_repair
);
1969 f
->dump_bool("need_auto", request_flags
.need_auto
);
1971 f
->dump_stream("scrub_reg_stamp") << m_scrub_job
->get_sched_time();
1973 // note that we are repeating logic that is coded elsewhere (currently PG.cc).
1974 // This is not optimal.
1975 bool deep_expected
= (ceph_clock_now() >= m_pg
->next_deepscrub_interval()) ||
1976 request_flags
.must_deep_scrub
|| request_flags
.need_auto
;
1978 m_scrub_job
->scheduling_state(ceph_clock_now(), deep_expected
);
1979 f
->dump_string("schedule", sched_state
);
1982 if (m_publish_sessions
) {
1983 f
->dump_int("test_sequence",
1984 m_sessions_counter
); // an ever-increasing number used by tests
1990 void PgScrubber::dump_active_scrubber(ceph::Formatter
* f
, bool is_deep
) const
1992 f
->dump_stream("epoch_start") << m_interval_start
;
1993 f
->dump_stream("start") << m_start
;
1994 f
->dump_stream("end") << m_end
;
1995 f
->dump_stream("max_end") << m_max_end
;
1996 f
->dump_stream("subset_last_update") << m_subset_last_update
;
1997 // note that m_is_deep will be set some time after PG_STATE_DEEP_SCRUB is
1998 // asserted. Thus, using the latter.
1999 f
->dump_bool("deep", is_deep
);
2001 // dump the scrub-type flags
2002 f
->dump_bool("req_scrub", m_flags
.required
);
2003 f
->dump_bool("auto_repair", m_flags
.auto_repair
);
2004 f
->dump_bool("check_repair", m_flags
.check_repair
);
2005 f
->dump_bool("deep_scrub_on_error", m_flags
.deep_scrub_on_error
);
2006 f
->dump_unsigned("priority", m_flags
.priority
);
2008 f
->dump_int("shallow_errors", m_shallow_errors
);
2009 f
->dump_int("deep_errors", m_deep_errors
);
2010 f
->dump_int("fixed", m_fixed_count
);
2012 f
->open_array_section("waiting_on_whom");
2013 for (const auto& p
: m_maps_status
.get_awaited()) {
2014 f
->dump_stream("shard") << p
;
2018 f
->dump_string("schedule", "scrubbing");
2021 pg_scrubbing_status_t
PgScrubber::get_schedule() const
2023 dout(25) << __func__
<< dendl
;
2026 return pg_scrubbing_status_t
{};
2029 auto now_is
= ceph_clock_now();
2032 // report current scrub info, including updated duration
2033 int32_t duration
= (utime_t
{now_is
} - scrub_begin_stamp
).sec();
2035 return pg_scrubbing_status_t
{
2038 pg_scrub_sched_status_t::active
,
2040 (m_is_deep
? scrub_level_t::deep
: scrub_level_t::shallow
),
2041 false /* is periodic? unknown, actually */};
2043 if (m_scrub_job
->state
!= ScrubQueue::qu_state_t::registered
) {
2044 return pg_scrubbing_status_t
{utime_t
{},
2046 pg_scrub_sched_status_t::not_queued
,
2048 scrub_level_t::shallow
,
2052 // Will next scrub surely be a deep one? note that deep-scrub might be
2053 // selected even if we report a regular scrub here.
2054 bool deep_expected
= (now_is
>= m_pg
->next_deepscrub_interval()) ||
2055 m_pg
->m_planned_scrub
.must_deep_scrub
||
2056 m_pg
->m_planned_scrub
.need_auto
;
2057 scrub_level_t expected_level
=
2058 deep_expected
? scrub_level_t::deep
: scrub_level_t::shallow
;
2059 bool periodic
= !m_pg
->m_planned_scrub
.must_scrub
&&
2060 !m_pg
->m_planned_scrub
.need_auto
&&
2061 !m_pg
->m_planned_scrub
.must_deep_scrub
;
2063 // are we ripe for scrubbing?
2064 if (now_is
> m_scrub_job
->schedule
.scheduled_at
) {
2065 // we are waiting for our turn at the OSD.
2066 return pg_scrubbing_status_t
{m_scrub_job
->schedule
.scheduled_at
,
2068 pg_scrub_sched_status_t::queued
,
2074 return pg_scrubbing_status_t
{m_scrub_job
->schedule
.scheduled_at
,
2076 pg_scrub_sched_status_t::scheduled
,
// Dump the scrubber's current state into the supplied Formatter (used by the
// 'pg query' admin command). The output is explicitly marked DEPRECATED below.
// NOTE(review): the extraction of this chunk appears to have dropped the
// matching close_section() calls and closing braces - confirm against upstream.
2082 void PgScrubber::handle_query_state(ceph::Formatter
* f
)
2084 dout(15) << __func__
<< dendl
;
// open the top-level "scrub" object section
2086 f
->open_object_section("scrub");
2087 f
->dump_stream("scrubber.epoch_start") << m_interval_start
;
2088 f
->dump_bool("scrubber.active", m_active
);
2089 f
->dump_stream("scrubber.start") << m_start
;
2090 f
->dump_stream("scrubber.end") << m_end
;
2091 f
->dump_stream("scrubber.max_end") << m_max_end
;
2092 f
->dump_stream("scrubber.subset_last_update") << m_subset_last_update
;
2093 f
->dump_bool("scrubber.deep", m_is_deep
);
// list the replicas we are still awaiting scrub-maps from
2095 f
->open_array_section("scrubber.waiting_on_whom");
2096 for (const auto& p
: m_maps_status
.get_awaited()) {
2097 f
->dump_stream("shard") << p
;
2102 f
->dump_string("comment", "DEPRECATED - may be removed in the next release");
// Destructor: de-register from the OSD's scrub queue before tearing down,
// then drop our reference to the scrub-job object.
// NOTE(review): upstream may guard this with 'if (m_scrub_job)' - the
// extraction dropped surrounding lines; confirm before relying on this text.
2107 PgScrubber::~PgScrubber()
2110 // make sure the OSD won't try to scrub this one just now
2111 rm_from_osd_scrubbing();
2112 m_scrub_job
.reset();
// Construct the scrubber for a specific PG: cache the PG id / own shard id,
// initialize the preemption bookkeeping, create the scrub state-machine and
// register a scrub-job entry keyed by this PG and the local OSD id.
// NOTE(review): the extraction dropped some init-list entries (the body uses
// m_pg and m_osds, which must be initialized here) - confirm against upstream.
2116 PgScrubber::PgScrubber(PG
* pg
)
2118 , m_pg_id
{pg
->pg_id
}
2120 , m_pg_whoami
{pg
->pg_whoami
}
2121 , preemption_data
{pg
}
// create the scrub FSM bound to this PG and this scrubber
2123 m_fsm
= std::make_unique
<ScrubMachine
>(m_pg
, this);
// create the queue entry used by the OSD-level scrub scheduler
2126 m_scrub_job
= ceph::make_ref
<ScrubQueue::ScrubJob
>(m_osds
->cct
, m_pg
->pg_id
,
2127 m_osds
->get_nodeid());
2130 void PgScrubber::set_scrub_begin_time()
2132 scrub_begin_stamp
= ceph_clock_now();
// Compute how long the just-finished scrub took (now minus the stamp taken by
// set_scrub_begin_time()) and record it in the PG stats via update_stats().
// NOTE(review): the lambda's tail (likely a 'return true;' and the closing
// '});') was dropped by the extraction - confirm against upstream.
2135 void PgScrubber::set_scrub_duration()
2137 utime_t stamp
= ceph_clock_now();
2138 utime_t duration
= stamp
- scrub_begin_stamp
;
2139 m_pg
->recovery_state
.update_stats([=](auto& history
, auto& stats
) {
// last_scrub_duration: whole seconds, rounded up from milliseconds
2140 stats
.last_scrub_duration
= ceill(duration
.to_msec()/1000.0);
// scrub_duration: full-precision duration as a double
2141 stats
.scrub_duration
= double(duration
);
2146 void PgScrubber::reserve_replicas()
2148 dout(10) << __func__
<< dendl
;
2149 m_reservations
.emplace(m_pg
, m_pg_whoami
, m_scrub_job
);
// Clean up after a scrub that ran to completion: clear the scrub-related PG
// state bits, release replica reservations, reset internal bookkeeping and
// publish updated stats. Caller must hold the PG lock (asserted below).
// NOTE(review): the extraction dropped some lines here (upstream has e.g. a
// requeue_waiting() call in this function) - confirm against upstream.
2152 void PgScrubber::cleanup_on_finish()
2154 dout(10) << __func__
<< dendl
;
2155 ceph_assert(m_pg
->is_locked());
2157 state_clear(PG_STATE_SCRUBBING
);
2158 state_clear(PG_STATE_DEEP_SCRUB
);
2159 m_pg
->publish_stats_to_osd();
2161 clear_scrub_reservations();
2162 m_pg
->publish_stats_to_osd();
2166 reset_internal_state();
2167 m_pg
->publish_stats_to_osd();
2168 m_flags
= scrub_flags_t
{};
2170 // type-specific state clear
2171 _scrub_clear_state();
2174 // uses process_event(), so must be invoked externally
2175 void PgScrubber::scrub_clear_state()
2177 dout(10) << __func__
<< dendl
;
2179 clear_pgscrub_state();
2180 m_fsm
->process_event(FullReset
{});
2184 * note: does not access the state-machine
// Clear all scrub-related PG state (including REPAIR), release replica
// reservations, reset internal bookkeeping and publish updated stats.
// Caller must hold the PG lock (asserted below).
// NOTE(review): some lines were dropped by the extraction (upstream has e.g.
// a requeue_waiting() call around here) - confirm against upstream.
2186 void PgScrubber::clear_pgscrub_state()
2188 dout(10) << __func__
<< dendl
;
2189 ceph_assert(m_pg
->is_locked());
2191 state_clear(PG_STATE_SCRUBBING
);
2192 state_clear(PG_STATE_DEEP_SCRUB
);
2194 state_clear(PG_STATE_REPAIR
);
2196 clear_scrub_reservations();
2197 m_pg
->publish_stats_to_osd();
2201 reset_internal_state();
2202 m_flags
= scrub_flags_t
{};
2204 // type-specific state clear
2205 _scrub_clear_state();
2206 m_pg
->publish_stats_to_osd();
2209 void PgScrubber::replica_handling_done()
2211 dout(10) << __func__
<< dendl
;
2213 state_clear(PG_STATE_SCRUBBING
);
2214 state_clear(PG_STATE_DEEP_SCRUB
);
2216 reset_internal_state();
2220 * note: performs run_callbacks()
2221 * note: reservations-related variables are not reset here
// Reset all per-session scrub bookkeeping: preemption data, map-collection
// status, chunk boundaries, error counters and the various scrub-maps.
// Bumps the session counter so stale async events can be recognized.
// NOTE(review): the extraction dropped several lines (upstream also resets
// e.g. m_deep_errors and invokes run_callbacks() here, per the note above) -
// confirm against upstream.
2223 void PgScrubber::reset_internal_state()
2225 dout(10) << __func__
<< dendl
;
2227 preemption_data
.reset();
2228 m_maps_status
.reset();
2229 m_received_maps
.clear();
// clear the current chunk boundaries
2231 m_start
= hobject_t
{};
2232 m_end
= hobject_t
{};
2233 m_max_end
= hobject_t
{};
2234 m_subset_last_update
= eversion_t
{};
2235 m_shallow_errors
= 0;
2238 m_omap_stats
= (const struct omap_stat_t
){0};
2242 m_inconsistent
.clear();
2244 m_authoritative
.clear();
2245 num_digest_updates_pending
= 0;
// discard the primary- and replica-side scrub maps and their positions
2246 m_primary_scrubmap
= ScrubMap
{};
2247 m_primary_scrubmap_pos
.reset();
2248 replica_scrubmap
= ScrubMap
{};
2249 replica_scrubmap_pos
.reset();
2250 m_cleaned_meta_map
= ScrubMap
{};
2251 m_needs_sleep
= true;
2252 m_sleep_started_at
= utime_t
{};
2255 clear_queued_or_active();
// new session id - late events carrying the old counter will be discarded
2256 ++m_sessions_counter
;
2259 // note that only applicable to the Replica:
// Invalidate the current activation token, so that any in-flight replica
// scrub operations carrying the old token will be recognized as stale.
// NOTE(review): the extraction dropped the line that actually increments
// m_current_token - confirm against upstream.
2260 void PgScrubber::advance_token()
2262 dout(10) << __func__
<< " was: " << m_current_token
<< dendl
;
2265 // when advance_token() is called, it is assumed that no scrubbing takes place.
2266 // We will, though, verify that. And if we are actually still handling a stale request -
2267 // both our internal state and the FSM state will be cleared.
2268 replica_handling_done();
2269 m_fsm
->process_event(FullReset
{});
// Check an incoming replica-scrub message's activation token against the
// current one. A token of 0 is treated as 'always current' (legacy senders).
// NOTE(review): the 'return true;' / 'return false;' lines were dropped by
// the extraction - confirm against upstream.
2272 bool PgScrubber::is_token_current(Scrub::act_token_t received_token
)
2274 if (received_token
== 0 || received_token
== m_current_token
) {
// the token does not match: log and report the message as stale
2277 dout(5) << __func__
<< " obsolete token (" << received_token
2278 << " vs current " << m_current_token
<< dendl
;
2283 const OSDMapRef
& PgScrubber::get_osdmap() const
2285 return m_pg
->get_osdmap();
2288 ostream
& operator<<(ostream
& out
, const PgScrubber
& scrubber
)
2290 return out
<< scrubber
.m_flags
;
// Build the log-line prefix for this scrubber: the PG's own prefix plus the
// current FSM state description (or "- :" when the FSM does not exist yet).
// NOTE(review): the extraction dropped the branch lines - the two 'return'
// statements below are clearly the arms of an if/else (presumably selected
// on whether m_pg is available) - confirm against upstream.
2293 std::ostream
& PgScrubber::gen_prefix(std::ostream
& out
) const
2295 const auto fsm_state
= m_fsm
? m_fsm
->current_states_desc() : "- :";
2297 return m_pg
->gen_prefix(out
) << "scrubber " << fsm_state
<< ": ";
2299 return out
<< " scrubber [~] " << fsm_state
<< ": ";
2303 void PgScrubber::log_cluster_warning(const std::string
& warning
) const
2305 m_osds
->clog
->do_log(CLOG_WARN
, warning
);
2308 ostream
& PgScrubber::show(ostream
& out
) const
2310 return out
<< " [ " << m_pg_id
<< ": " << m_flags
<< " ] ";
// Debug hook driven by the admin-socket: 'block' / 'unblock' simulate a
// blocked object range; 'set' / 'unset' toggle debug behaviors selected by
// 'param' (currently: 'sessions' counter publication, and 'block').
// NOTE(review): the remainder of the signature (lines 2314-2317 of the
// original, i.e. the 'param' and output parameters) and the function's
// return statement were dropped by the extraction - confirm upstream.
2313 int PgScrubber::asok_debug(std::string_view cmd
,
2318 dout(10) << __func__
<< " cmd: " << cmd
<< " param: " << param
<< dendl
;
2320 if (cmd
== "block") {
2321 // set a flag that will cause the next 'select_range' to report a blocked object
2322 m_debug_blockrange
= 1;
2324 } else if (cmd
== "unblock") {
2325 // send an 'unblock' event, as if a blocked range was freed
2326 m_debug_blockrange
= 0;
2327 m_fsm
->process_event(Unblocked
{});
2329 } else if ((cmd
== "set") || (cmd
== "unset")) {
2331 if (param
== "sessions") {
2332 // set/reset the inclusion of the scrub sessions counter in 'query' output
2333 m_publish_sessions
= (cmd
== "set");
2335 } else if (param
== "block") {
2337 // set a flag that will cause the next 'select_range' to report a blocked object
2338 m_debug_blockrange
= 1;
2340 // send an 'unblock' event, as if a blocked range was freed
2341 m_debug_blockrange
= 0;
2342 m_fsm
->process_event(Unblocked
{});
2349 // ///////////////////// preemption_data_t //////////////////////////////////
2351 PgScrubber::preemption_data_t::preemption_data_t(PG
* pg
) : m_pg
{pg
}
2353 m_left
= static_cast<int>(
2354 m_pg
->get_cct()->_conf
.get_val
<uint64_t>("osd_scrub_max_preemptions"));
// Reset the preemption state under the lock: not preemptable, not preempted,
// and refresh the remaining-preemptions budget from the configuration.
// NOTE(review): the extraction dropped the 'm_left =' prefix of the last
// statement (the static_cast result is clearly assigned to m_left upstream).
2357 void PgScrubber::preemption_data_t::reset()
2359 std::lock_guard
<std::mutex
> lk
{m_preemption_lock
};
2361 m_preemptable
= false;
2362 m_preempted
= false;
2364 static_cast<int>(m_pg
->cct
->_conf
.get_val
<uint64_t>("osd_scrub_max_preemptions"));
2369 // ///////////////////// ReplicaReservations //////////////////////////////////
2372 void ReplicaReservations::release_replica(pg_shard_t peer
, epoch_t epoch
)
2374 auto m
= new MOSDScrubReserve(spg_t(m_pg_info
.pgid
.pgid
, peer
.shard
), epoch
,
2375 MOSDScrubReserve::RELEASE
, m_pg
->pg_whoami
);
2376 m_osds
->send_message_osd_cluster(peer
.osd
, m
, epoch
);
// Construct the reservation tracker and immediately send a REQUEST message to
// every acting-set peer (the primary itself is excluded from m_pending).
// If there are no replicas at all, the state-machine is signalled directly.
// NOTE(review): the extraction dropped several lines here, including the
// first init-list entry (m_pg), the loop's skip-self check, and the
// no-replicas signalling call inside the 'if (m_pending <= 0)' branch -
// confirm against upstream.
2379 ReplicaReservations::ReplicaReservations(PG
* pg
, pg_shard_t whoami
, ScrubQueue::ScrubJobRef scrubjob
)
2381 , m_acting_set
{pg
->get_actingset()}
2382 , m_osds
{m_pg
->get_pg_osd(ScrubberPasskey())}
2383 , m_pending
{static_cast<int>(m_acting_set
.size()) - 1}
2384 , m_pg_info
{m_pg
->get_pg_info(ScrubberPasskey())}
2385 , m_scrub_job
{scrubjob
}
2387 epoch_t epoch
= m_pg
->get_osdmap_epoch();
// build the log-line prefix used by gen_prefix()
2390 std::stringstream prefix
;
2391 prefix
<< "osd." << m_osds
->whoami
<< " ep: " << epoch
2392 << " scrubber::ReplicaReservations pg[" << pg
->pg_id
<< "]: ";
2393 m_log_msg_prefix
= prefix
.str();
2396 // handle the special case of no replicas
2397 if (m_pending
<= 0) {
2398 // just signal the scrub state-machine to continue
// request a reservation from each peer, and remember who we asked
2403 for (auto p
: m_acting_set
) {
2406 auto m
= new MOSDScrubReserve(spg_t(m_pg_info
.pgid
.pgid
, p
.shard
), epoch
,
2407 MOSDScrubReserve::REQUEST
, m_pg
->pg_whoami
);
2408 m_osds
->send_message_osd_cluster(p
.osd
, m
, epoch
);
2409 m_waited_for_peers
.push_back(p
);
2410 dout(10) << __func__
<< ": reserve " << p
.osd
<< dendl
;
2415 void ReplicaReservations::send_all_done()
2417 m_osds
->queue_for_scrub_granted(m_pg
, scrub_prio_t::low_priority
);
2420 void ReplicaReservations::send_reject()
2422 m_scrub_job
->resources_failure
= true;
2423 m_osds
->queue_for_scrub_denied(m_pg
, scrub_prio_t::low_priority
);
2426 void ReplicaReservations::discard_all()
2428 dout(10) << __func__
<< ": " << m_reserved_peers
<< dendl
;
2430 m_had_rejections
= true; // preventing late-coming responses from triggering events
2431 m_reserved_peers
.clear();
2432 m_waited_for_peers
.clear();
2435 ReplicaReservations::~ReplicaReservations()
2437 m_had_rejections
= true; // preventing late-coming responses from triggering events
2439 // send un-reserve messages to all reserved replicas. We do not wait for answer (there
2440 // wouldn't be one). Other incoming messages will be discarded on the way, by our
2442 epoch_t epoch
= m_pg
->get_osdmap_epoch();
2444 for (auto& p
: m_reserved_peers
) {
2445 release_replica(p
, epoch
);
2447 m_reserved_peers
.clear();
2449 // note: the release will follow on the heels of the request. When tried otherwise,
2450 // grants that followed a reject arrived after the whole scrub machine-state was
2451 // reset, causing leaked reservations.
2452 for (auto& p
: m_waited_for_peers
) {
2453 release_replica(p
, epoch
);
2455 m_waited_for_peers
.clear();
2459 * @ATTN we would not reach here if the ReplicaReservation object managed by the
2460 * scrubber was reset.
// Handle a GRANT reply from a replica: remove it from the awaiting list, and
// either (a) release it immediately if a rejection already occurred, (b)
// ignore a duplicate grant, or (c) record the reservation - and when the last
// pending grant arrives, signal overall success.
// NOTE(review): several lines were dropped by the extraction, including the
// 'else' introducing branch (c) and the call that signals completion inside
// 'if (--m_pending == 0)' (presumably send_all_done()) - confirm upstream.
2462 void ReplicaReservations::handle_reserve_grant(OpRequestRef op
, pg_shard_t from
)
2464 dout(10) << __func__
<< ": granted by " << from
<< dendl
;
2468 // reduce the amount of extra release messages. Not a must, but the log is cleaner
2469 auto w
= find(m_waited_for_peers
.begin(), m_waited_for_peers
.end(), from
);
2470 if (w
!= m_waited_for_peers
.end())
2471 m_waited_for_peers
.erase(w
);
2474 // are we forced to reject the reservation?
2475 if (m_had_rejections
) {
2477 dout(10) << __func__
<< ": rejecting late-coming reservation from "
2479 release_replica(from
, m_pg
->get_osdmap_epoch());
2481 } else if (std::find(m_reserved_peers
.begin(), m_reserved_peers
.end(), from
) !=
2482 m_reserved_peers
.end()) {
// duplicate grant: already recorded, nothing more to do
2484 dout(10) << __func__
<< ": already had osd." << from
<< " reserved" << dendl
;
2488 dout(10) << __func__
<< ": osd." << from
<< " scrub reserve = success"
2490 m_reserved_peers
.push_back(from
);
2491 if (--m_pending
== 0) {
// Handle a REJECT reply from a replica: remove it from the awaiting list,
// then either ignore it (a rejection was already processed, or this peer was
// somehow already recorded as reserved) or mark the whole reservation attempt
// as failed.
// NOTE(review): the extraction dropped lines here, including the call that
// reports the failure after setting m_had_rejections (presumably
// send_reject()) - confirm against upstream.
2497 void ReplicaReservations::handle_reserve_reject(OpRequestRef op
, pg_shard_t from
)
2499 dout(10) << __func__
<< ": rejected by " << from
<< dendl
;
2500 dout(15) << __func__
<< ": " << *op
->get_req() << dendl
;
2504 // reduce the amount of extra release messages. Not a must, but the log is cleaner
2505 auto w
= find(m_waited_for_peers
.begin(), m_waited_for_peers
.end(), from
);
2506 if (w
!= m_waited_for_peers
.end())
2507 m_waited_for_peers
.erase(w
);
2510 if (m_had_rejections
) {
2512 // our failure was already handled when the first rejection arrived
2513 dout(15) << __func__
<< ": ignoring late-coming rejection from "
2516 } else if (std::find(m_reserved_peers
.begin(), m_reserved_peers
.end(), from
) !=
2517 m_reserved_peers
.end()) {
// unexpected: a peer we had recorded as reserved now rejects
2519 dout(10) << __func__
<< ": already had osd." << from
<< " reserved" << dendl
;
// first rejection: record it and (upstream) report the failure
2523 dout(10) << __func__
<< ": osd." << from
<< " scrub reserve = fail" << dendl
;
2524 m_had_rejections
= true; // preventing any additional notifications
2529 std::ostream
& ReplicaReservations::gen_prefix(std::ostream
& out
) const
2531 return out
<< m_log_msg_prefix
;
2534 // ///////////////////// LocalReservation //////////////////////////////////
2536 // note: no dout()s in LocalReservation functions. Client logs interactions.
// Try to grab one of the OSD's local scrub-reservation slots. Success is
// signalled solely by m_holding_local_reservation being set.
// NOTE(review): the init-list entry binding m_osds to 'osds' was dropped by
// the extraction - confirm against upstream.
2537 LocalReservation::LocalReservation(OSDService
* osds
)
2540 if (m_osds
->get_scrub_services().inc_scrubs_local()) {
2541 // the failure is signalled by not having m_holding_local_reservation set
2542 m_holding_local_reservation
= true;
2546 LocalReservation::~LocalReservation()
2548 if (m_holding_local_reservation
) {
2549 m_holding_local_reservation
= false;
2550 m_osds
->get_scrub_services().dec_scrubs_local();
2554 // ///////////////////// ReservedByRemotePrimary ///////////////////////////////
// Replica-side reservation made on behalf of a remote primary: try to grab a
// remote-scrub slot on this OSD. Success is signalled solely by
// m_reserved_by_remote_primary being set; the reservation epoch is recorded
// for later staleness checks (see is_stale()).
// NOTE(review): the extraction dropped the rest of the parameter list and
// some init-list entries (the body uses m_osds; is_stale() uses m_pg) -
// confirm against upstream.
2556 ReservedByRemotePrimary::ReservedByRemotePrimary(const PgScrubber
* scrubber
,
2560 : m_scrubber
{scrubber
}
2563 , m_reserved_at
{epoch
}
2565 if (!m_osds
->get_scrub_services().inc_scrubs_remote()) {
2566 dout(10) << __func__
<< ": failed to reserve at Primary request" << dendl
;
2567 // the failure is signalled by not having m_reserved_by_remote_primary set
2571 dout(20) << __func__
<< ": scrub resources reserved at Primary request" << dendl
;
2572 m_reserved_by_remote_primary
= true;
2575 bool ReservedByRemotePrimary::is_stale() const
2577 return m_reserved_at
< m_pg
->get_same_interval_since();
2580 ReservedByRemotePrimary::~ReservedByRemotePrimary()
2582 if (m_reserved_by_remote_primary
) {
2583 m_reserved_by_remote_primary
= false;
2584 m_osds
->get_scrub_services().dec_scrubs_remote();
2588 std::ostream
& ReservedByRemotePrimary::gen_prefix(std::ostream
& out
) const
2590 return m_scrubber
->gen_prefix(out
);
2593 // ///////////////////// MapsCollectionStatus ////////////////////////////////
2595 auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from
)
2596 -> std::tuple
<bool, std::string_view
>
2598 auto fe
= std::find(m_maps_awaited_for
.begin(), m_maps_awaited_for
.end(), from
);
2599 if (fe
!= m_maps_awaited_for
.end()) {
2600 // we are indeed waiting for a map from this replica
2601 m_maps_awaited_for
.erase(fe
);
2602 return std::tuple
{true, ""sv
};
2604 return std::tuple
{false, " unsolicited scrub-map"sv
};
2608 void MapsCollectionStatus::reset()
2610 *this = MapsCollectionStatus
{};
// Render the list of shards we are still awaiting scrub-maps from, as a
// space-separated string of OSD names.
// NOTE(review): the extraction dropped the declaration of the accumulator
// ('all') and the return statement - confirm against upstream.
2613 std::string
MapsCollectionStatus::dump() const
2616 for (const auto& rp
: m_maps_awaited_for
) {
2617 all
.append(rp
.get_osd() + " "s
);
// Stream a MapsCollectionStatus: the shards still awaited, plus (upstream) an
// indication of whether the local map is ready.
// NOTE(review): the extraction dropped the opening output statement and the
// body of the '!m_local_map_ready' branch - confirm against upstream.
2622 ostream
& operator<<(ostream
& out
, const MapsCollectionStatus
& sf
)
2625 for (const auto& rp
: sf
.m_maps_awaited_for
) {
2626 out
<< rp
.get_osd() << " ";
2628 if (!sf
.m_local_map_ready
) {
2631 return out
<< " ] ";
2634 // ///////////////////// blocked_range_t ///////////////////////////////
// Arm a one-shot timer: if the scrubbed range stays blocked for 'waittime',
// the callback logs a warning (both to the OSD log and the cluster log)
// naming the PG and the time the block started. The timer is cancelled by
// the destructor when the range is released in time.
// NOTE(review): the extraction dropped lines here, including the init-list
// entry for m_osds and the declaration of 'buf' used by strftime() -
// confirm against upstream.
2636 blocked_range_t::blocked_range_t(OSDService
* osds
, ceph::timespan waittime
, spg_t pg_id
)
2639 auto now_is
= std::chrono::system_clock::now();
// the callback captures by value - it may fire long after this ctor returns
2640 m_callbk
= new LambdaContext([now_is
, pg_id
, osds
]([[maybe_unused
]] int r
) {
2641 std::time_t now_c
= std::chrono::system_clock::to_time_t(now_is
);
2643 strftime(buf
, sizeof(buf
), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c
));
2644 lgeneric_subdout(g_ceph_context
, osd
, 10)
2645 << "PgScrubber: " << pg_id
<< " blocked on an object for too long (since " << buf
2647 osds
->clog
->warn() << "osd." << osds
->whoami
<< " PgScrubber: " << pg_id
<< " blocked on an object for too long (since " << buf
<< ")";
// register the timer under the OSD's sleep lock
2651 std::lock_guard
l(m_osds
->sleep_lock
)
;
2652 m_osds
->sleep_timer
.add_event_after(waittime
, m_callbk
);
2655 blocked_range_t::~blocked_range_t()
2657 std::lock_guard
l(m_osds
->sleep_lock
);
2658 m_osds
->sleep_timer
.cancel_event(m_callbk
);
2661 } // namespace Scrub