// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include <string_view>

#include "ScrubStore.h"
#include "scrub_machine_lstnr.h"
#include "scrubber_common.h"
namespace Scrub {

/**
 * Reserving/freeing scrub resources at the replicas.
 *
 * When constructed - sends reservation requests to the acting_set.
 * A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
 * All previous requests, whether already granted or not, are explicitly released.
 *
 * A note re performance: I've measured a few container alternatives for
 * m_reserved_peers, with its specific usage pattern. std::set is extremely slow, as
 * expected. flat_set is only slightly better. Surprisingly - std::vector (with no
 * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
 */
class ReplicaReservations {
  using OrigSet = decltype(std::declval<PG>().get_actingset());

  std::vector<pg_shard_t> m_waited_for_peers;
  std::vector<pg_shard_t> m_reserved_peers;
  bool m_had_rejections{false};

  void release_replica(pg_shard_t peer, epoch_t epoch);

  void send_all_done();  ///< all reservations are granted

  /// notify the scrubber that we have failed to reserve replicas' resources
  void send_reject();

 public:
  /**
   * quietly discard all knowledge about existing reservations. No messages
   * are sent to the replicas.
   * To be used upon interval change, as we know that the running scrub is no
   * longer relevant, and that the replicas have reset the reservations on
   * their side.
   */
  void discard_all();

  ReplicaReservations(PG* pg, pg_shard_t whoami);

  ~ReplicaReservations();

  void handle_reserve_grant(OpRequestRef op, pg_shard_t from);

  void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
};
/**
 * wraps the local OSD scrub resource reservation in an RAII wrapper
 */
class LocalReservation {
  PG* m_pg;
  OSDService* m_osds;
  bool m_holding_local_reservation{false};

 public:
  LocalReservation(PG* pg, OSDService* osds);
  ~LocalReservation();
  bool is_reserved() const { return m_holding_local_reservation; }
};
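
/*
 * A minimal sketch of the intended RAII usage, assuming the wrapper is held
 * in a std::optional (as PgScrubber does below with m_local_osd_resource):
 *
 *   m_local_osd_resource.emplace(m_pg, m_osds);  // try to reserve locally
 *   if (!m_local_osd_resource->is_reserved()) {
 *     m_local_osd_resource.reset();  // budget exhausted - nothing is held
 *     return false;
 *   }
 *   // ... destruction (or reset()) releases the reservation
 */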
/**
 * wraps the OSD resource we are using when reserved as a replica by a
 * scrubbing Primary.
 */
class ReservedByRemotePrimary {
  bool m_reserved_by_remote_primary{false};
  const epoch_t m_reserved_at;

 public:
  ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch);
  ~ReservedByRemotePrimary();
  [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }

  /// compare the remembered reserved-at epoch to the current interval
  [[nodiscard]] bool is_stale() const;
};
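
/*
 * A sketch of how a replica might use is_stale() (illustrative; the actual
 * check sites are in the .cc file):
 *
 *   if (m_remote_osd_resource && m_remote_osd_resource->is_stale()) {
 *     // the reservation was granted in a previous interval - discard it
 *     m_remote_osd_resource.reset();
 *   }
 */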
/**
 * Once all replicas' scrub maps are received, we go on to compare the maps.
 * That is - unless we have not yet completed building our own scrub map.
 * MapsCollectionStatus combines the status of waiting for both the local map
 * and the replicas, without resorting to adding dummy entries into a list.
 */
class MapsCollectionStatus {

  bool m_local_map_ready{false};
  std::vector<pg_shard_t> m_maps_awaited_for;

 public:
  [[nodiscard]] bool are_all_maps_available() const
  {
    return m_local_map_ready && m_maps_awaited_for.empty();
  }

  void mark_local_map_ready() { m_local_map_ready = true; }

  void mark_replica_map_request(pg_shard_t from_whom)
  {
    m_maps_awaited_for.push_back(from_whom);
  }

  /// @returns true if indeed waiting for this one. Otherwise: an error string
  auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;

  std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }

  std::string dump() const;

  friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
};

}  // namespace Scrub
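
/*
 * An illustrative flow (hypothetical caller code; the real callers are
 * PgScrubber methods declared below):
 *
 *   m_maps_status.mark_replica_map_request(replica);  // for each replica asked
 *   ...
 *   auto [ok, err] = m_maps_status.mark_arriving_map(from);
 *   if (!ok) {
 *     // 'err' describes the problem, e.g. a map from a shard we never asked
 *   }
 *   m_maps_status.mark_local_map_ready();             // our own map is built
 *   if (m_maps_status.are_all_maps_available()) {
 *     // all maps are in - go compare them
 *   }
 */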
/**
 * the scrub operation flags. Primary only.
 * Set at scrub start. Checked in multiple locations - mostly
 * at scrub end.
 */
struct scrub_flags_t {

  unsigned int priority{0};

  /**
   * set by queue_scrub() if either planned_scrub.auto_repair or
   * need_auto were set.
   * Tested at scrub end.
   */
  bool auto_repair{false};

  /// this flag indicates that we are scrubbing post repair to verify
  /// everything is fixed
  bool check_repair{false};

  /// checked at the end of the scrub, to possibly initiate a deep-scrub
  bool deep_scrub_on_error{false};

  /**
   * scrub must not be aborted.
   * Set for explicitly requested scrubs, and for scrubs originated by the
   * pairing process with the 'repair' flag set (in the RequestScrub event).
   */
  bool required{false};
};

ostream& operator<<(ostream& out, const scrub_flags_t& sf);
/**
 * The part of PG-scrubbing code that isn't state-machine wiring.
 *
 * Why the separation? I wish to move to a different FSM implementation. Thus I
 * am forced to strongly decouple the state-machine implementation details from
 * the actual scrubbing code.
 */
class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 public:
  explicit PgScrubber(PG* pg);

  // ------------------ the I/F exposed to the PG (ScrubPgIF) -------------

  /// are we waiting for resource reservation grants from our replicas?
  [[nodiscard]] bool is_reserving() const final;

  void initiate_regular_scrub(epoch_t epoch_queued) final;

  void initiate_scrub_after_repair(epoch_t epoch_queued) final;

  void send_scrub_resched(epoch_t epoch_queued) final;

  void active_pushes_notification(epoch_t epoch_queued) final;

  void update_applied_notification(epoch_t epoch_queued) final;

  void send_scrub_unblock(epoch_t epoch_queued) final;

  void digest_update_notification(epoch_t epoch_queued) final;

  void send_replica_maps_ready(epoch_t epoch_queued) final;

  void send_start_replica(epoch_t epoch_queued) final;

  void send_sched_replica(epoch_t epoch_queued) final;

  void send_replica_pushes_upd(epoch_t epoch_queued) final;

  /**
   * we allow some number of preemptions of the scrub, which means we do not
   * block the write but preempt the scrub instead. Then we start to block.
   * Once we start blocking, we do not stop until the scrub range is completed.
   */
  bool write_blocked_by_scrub(const hobject_t& soid) final;
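
  /*
   * An illustrative call site (hypothetical; the real caller is the PG's op
   * handling path):
   *
   *   if (m_scrubber->write_blocked_by_scrub(head_obj)) {
   *     // the scrub could not be preempted - park the op until the chunk is done
   *     waiting_for_scrub.push_back(op);
   *     return;
   *   }
   */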

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;

  /**
   * we are a replica being asked by the Primary to reserve OSD resources for
   * scrubbing
   */
  void handle_scrub_reserve_request(OpRequestRef op) final;
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
  void handle_scrub_reserve_release(OpRequestRef op) final;
  void discard_replica_reservations() final;
  void clear_scrub_reservations() final;  // PG::clear... fwds to here
  void unreserve_replicas() final;

  // managing scrub op registration

  void reg_next_scrub(const requested_scrub_t& request_flags) final;

  void unreg_next_scrub() final;

  void scrub_requested(scrub_level_t scrub_level,
                       scrub_type_t scrub_type,
                       requested_scrub_t& req_flags) final;

  /**
   * Reserve local scrub resources (managed by the OSD)
   *
   * Fails if the OSD's local-scrubs budget was exhausted.
   * \returns were local resources reserved?
   */
  bool reserve_local() final;

  void handle_query_state(ceph::Formatter* f) final;

  void dump(ceph::Formatter* f) const override;

  // used if we are a replica

  void replica_scrub_op(OpRequestRef op) final;

  /// the op priority, taken from the primary's request message
  Scrub::scrub_prio_t replica_op_priority() const final
  {
    return m_replica_request_priority;
  }

  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
                                      unsigned int suggested_priority) const final;
  /// the version that refers to m_flags.priority
  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;

  void add_callback(Context* context) final { m_callbacks.push_back(context); }

  /// used for an assert in PG.cc
  [[nodiscard]] bool are_callbacks_pending() const final
  {
    return !m_callbacks.empty();
  }

  /// handle a message carrying a replica map
  void map_from_replica(OpRequestRef op) final;

  /**
   * should we requeue blocked ops?
   * Applicable to the PrimaryLogScrub derived class.
   */
  [[nodiscard]] virtual bool should_requeue_blocked_ops(
    eversion_t last_recovery_applied) const override;

  void scrub_clear_state() final;

  /**
   * add to scrub statistics, but only if the soid is below the scrub start
   */
  virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
                                        const hobject_t& soid) override;

  /**
   * finalize the parameters of the initiated scrubbing session:
   *
   * The "current scrub" flags (m_flags) are set from the 'planned_scrub'
   * flag-set; PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB &
   * PG_STATE_REPAIR are set.
   */
  void set_op_parameters(requested_scrub_t& request) final;

  void cleanup_store(ObjectStore::Transaction* t) final;

  bool get_store_errors(const scrub_ls_arg_t& arg,
                        scrub_ls_result_t& res_inout) const override;

  // -------------------------------------------------------------------------
  // the I/F used by the state-machine (i.e. the implementation of
  // ScrubMachineListener)

  bool select_range() final;

  /// walk the log to find the latest update that affects our chunk
  eversion_t search_log_for_updates() const final;

  eversion_t get_last_update_applied() const final
  {
    return m_pg->recovery_state.get_last_update_applied();
  }

  int pending_active_pushes() const final { return m_pg->active_pushes; }

  void scrub_compare_maps() final;

  void on_init() final;
  void on_replica_init() final;
  void replica_handling_done() final;

  /// the version of 'scrub_clear_state()' that does not try to invoke FSM
  /// services (thus can be called from FSM reactions)
  void clear_pgscrub_state() final;

  void add_delayed_scheduling() final;

  /**
   * @returns have we asked at least one replica?
   * 'false' means we are configured with no replicas, and
   * should expect no maps to arrive.
   */
  bool get_replicas_maps(bool replica_can_preempt) final;

  Scrub::FsmNext on_digest_updates() final;

  void send_replica_map(Scrub::PreemptionNoted was_preempted) final;

  void send_remotes_reserved(epoch_t epoch_queued) final;
  void send_reservation_failure(epoch_t epoch_queued) final;

  /**
   * does the PG have newer updates than what we (the scrubber) know?
   */
  [[nodiscard]] bool has_pg_marked_new_updates() const final;

  void set_subset_last_update(eversion_t e) final;

  void maps_compare_n_cleanup() final;

  Scrub::preemption_t& get_preemptor() final;

  int build_primary_map_chunk() final;

  int build_replica_map_chunk() final;

  void reserve_replicas() final;

  [[nodiscard]] bool was_epoch_changed() const final;

  void mark_local_map_ready() final;

  [[nodiscard]] bool are_all_maps_available() const final;

  std::string dump_awaited_maps() const final;

  bool state_test(uint64_t m) const { return m_pg->state_test(m); }
  void state_set(uint64_t m) { m_pg->state_set(m); }
  void state_clear(uint64_t m) { m_pg->state_clear(m); }

  [[nodiscard]] bool is_primary() const { return m_pg->recovery_state.is_primary(); }

  [[nodiscard]] bool is_scrub_registered() const;

  virtual void _scrub_clear_state() {}

  utime_t m_scrub_reg_stamp;  ///< stamp we registered for

  ostream& show(ostream& out) const override;

  // -------------------------------------------------------------------------

  friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);

  static utime_t scrub_must_stamp() { return utime_t(1, 1); }

  virtual ~PgScrubber();  // must be defined separately, in the .cc file

  [[nodiscard]] bool is_scrub_active() const final { return m_active; }

 private:
  void reset_internal_state();

  void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }

  void _scan_snaps(ScrubMap& smap);

  ScrubMap clean_meta_map();

  /**
   * record some parameters of the initiated scrub:
   * - the epoch when started;
   * - the depth of the scrub requested (from the PG_STATE variable)
   */
  void reset_epoch(epoch_t epoch_queued);

  void run_callbacks();

  // ----- methods used to verify the relevance of incoming events:

  /**
   * is the incoming event still relevant, and should be processed?
   *
   * It is not, if:
   * - (1) we are no longer 'actively scrubbing'; or
   * - (2) the message is from an epoch prior to when we started the current
   *   scrub session; or
   * - (3) the message epoch is from a previous interval; or
   * - (4) the 'abort' configuration flags were set.
   *
   * For (1) & (2) - the incoming message is discarded, w/o further action.
   *
   * For (3): (see check_interval() for a full description) if we have not
   * reacted yet to this specific new interval, we do now:
   * - replica reservations are silently discarded (we count on the replicas
   *   to notice the interval change and un-reserve themselves);
   * - the scrubbing is halted.
   *
   * For (4): the message will be discarded, but also:
   * if this is the first time we've noticed the 'abort' request, we perform
   * the abort. (A sketch of the resulting check order follows below.)
   *
   * \returns should the incoming event be processed?
   */
  bool is_message_relevant(epoch_t epoch_to_verify);
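
  /*
   * A sketch of the check order implied by the list above (illustrative only;
   * the actual implementation is in the .cc file and may differ in detail):
   *
   *   bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
   *   {
   *     if (!m_active)
   *       return false;                      // (1)
   *     if (epoch_to_verify < m_epoch_start)
   *       return false;                      // (2)
   *     if (!check_interval(epoch_to_verify))
   *       return false;                      // (3) - also halts the scrub
   *     return verify_against_abort(epoch_to_verify);  // (4)
   *   }
   */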

  /**
   * check the 'no scrub' configuration options.
   */
  [[nodiscard]] bool should_abort() const;

  /**
   * Check the 'no scrub' configuration flags.
   *
   * Reset everything if the abort was not handled before.
   * @returns false if the message was discarded due to the abort flag.
   */
  [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);

  [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);

  epoch_t m_last_aborted{};  // the last time we've noticed a request to abort

  /**
   * @returns true if any inconsistency/missing was repaired, false otherwise
   */
  [[nodiscard]] bool scrub_process_inconsistent();

  bool m_needs_sleep{true};  ///< should we sleep before being rescheduled?
                             ///< always 'true', unless we just got out of a
                             ///< sleep period

  utime_t m_sleep_started_at;

  // 'optional', as 'ReplicaReservations' & 'LocalReservation' are
  // 'RAII-designed' to guarantee un-reserving when deleted.
  std::optional<Scrub::ReplicaReservations> m_reservations;
  std::optional<Scrub::LocalReservation> m_local_osd_resource;

  /// the 'remote' resource we, as a replica, grant our Primary when it is
  /// scrubbing
  std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;

  void cleanup_on_finish();  // scrub_clear_state() as called for a Primary
                             // when Active->NotActive

  /// the part that actually finalizes a scrub
  void scrub_finish();

  /**
   * the derived-class-specific scrub-finishing touches:
   */
  virtual void _scrub_finish() {}

  /**
   * Validate consistency of the object info and snap sets.
   */
  virtual void scrub_snapshot_metadata(ScrubMap& map,
                                       const missing_map_t& missing_digest)
  {}

  // common code used by build_primary_map_chunk() and build_replica_map_chunk():
  int build_scrub_map_chunk(ScrubMap& map,  // primary or replica?
                            ScrubMapBuilder& pos,
                            hobject_t start,
                            hobject_t end,
                            bool deep);

  std::unique_ptr<Scrub::ScrubMachine> m_fsm;
  const spg_t m_pg_id;  ///< a local copy of m_pg->pg_id
  OSDService* const m_osds;
  const pg_shard_t m_pg_whoami;  ///< a local copy of m_pg->pg_whoami

  epoch_t m_interval_start{0};  ///< the interval's 'from': when the scrubbing
                                ///< was first scheduled

  /**
   * the exact epoch when the scrubbing actually started ('started' here
   * meaning: the no-scrub configuration checks were cleared). Incoming events
   * are verified against this epoch, with stale events discarded.
   */
  epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started

  scrub_flags_t m_flags;

  bool m_active{false};

  eversion_t m_subset_last_update{};

  std::unique_ptr<Scrub::Store> m_store;

  int num_digest_updates_pending{0};

  hobject_t m_start, m_end;  ///< note: half-closed: [start,end)
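
  /*
   * note on the half-closed range: an object X belongs to the current chunk
   * iff m_start <= X < m_end, so consecutive chunks [a,b) and [b,c) never
   * overlap and never leave a gap.
   */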

  /// Returns reference to current osdmap
  const OSDMapRef& get_osdmap() const;

  /// Returns epoch of current osdmap
  epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }

  CephContext* get_pg_cct() const { return m_pg->cct; }

  // collected statistics
  int m_shallow_errors{0};
  int m_deep_errors{0};
  int m_fixed_count{0};

  /// Maps from objects with errors to missing peers
  HobjToShardSetMapping m_missing;

  /**
   * 'm_is_deep' - is the running scrub a deep one?
   *
   * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
   * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep'
   * is meaningful both for the primary and the replicas, and is used as a
   * parameter when building the scrub maps.
   */
  bool m_is_deep{false};

  /**
   * If set: affects the backend & scrubber-backend functions called after all
   * scrub maps are available.
   *
   * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
   * a "user facing" status display only).
   */
  bool m_is_repair{false};

  /**
   * User-readable summary of the scrubber's current mode of operation. Used
   * for both osd.*.log and the cluster log.
   *
   * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. an
   * auto_repair scrub will show as "deep-scrub" and not as "repair" (until the
   * first error is detected).
   */
  std::string_view m_mode_desc;

  void update_op_mode_text();

  /**
   * initiate a deep-scrub after the current scrub ended with errors.
   */
  void request_rescrubbing(requested_scrub_t& req_flags);

  std::list<Context*> m_callbacks;

  /**
   * send a replica (un)reservation request to the acting set
   *
   * @param opcode - one of MOSDScrubReserve::REQUEST
   *                 or MOSDScrubReserve::RELEASE
   */
  void message_all_replicas(int32_t opcode, std::string_view op_text);
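
  /*
   * Illustrative calls (hypothetical - the 'op_text' strings are made up here;
   * the real call sites are in the .cc file):
   *
   *   message_all_replicas(MOSDScrubReserve::REQUEST, "request scrub-resources");
   *   message_all_replicas(MOSDScrubReserve::RELEASE, "release scrub-resources");
   */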

  hobject_t m_max_end;  ///< Largest end that may have been sent to replicas
  ScrubMap m_primary_scrubmap;
  ScrubMapBuilder m_primary_scrubmap_pos;

  std::map<pg_shard_t, ScrubMap> m_received_maps;

  /// a cleaned-up copy of the map, awaiting the snap-metadata scrub
  ScrubMap m_cleaned_meta_map;

  void _request_scrub_map(pg_shard_t replica,
                          eversion_t version,
                          hobject_t start,
                          hobject_t end,
                          bool deep,
                          bool allow_preemption);

  Scrub::MapsCollectionStatus m_maps_status;

  omap_stat_t m_omap_stats = (const struct omap_stat_t){0};

  /// Maps from objects with errors to inconsistent peers
  HobjToShardSetMapping m_inconsistent;

  /// Maps from objects with errors to good peers
  std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>>
    m_authoritative;

  // ------------ members used if we are a replica

  epoch_t m_replica_min_epoch;  ///< the min epoch needed to handle this message

  ScrubMapBuilder replica_scrubmap_pos;
  ScrubMap replica_scrubmap;

  /**
   * the priority of the request is noted as it arrives. It influences the
   * queuing priority when we wait for local updates.
   */
  Scrub::scrub_prio_t m_replica_request_priority;

  /**
   * Queue an event to be sent to the replica, to trigger a re-check of the
   * availability of the scrub map prepared by the backend.
   */
  void requeue_replica(Scrub::scrub_prio_t is_high_priority);

  /**
   * the 'preemption' "state-machine".
   * Note: I was considering an orthogonal sub-machine implementation, but as
   * the state diagram is extremely simple, the added complexity wasn't
   * justified.
   */
  class preemption_data_t : public Scrub::preemption_t {
   public:
    preemption_data_t(PG* pg);  // the PG access is used for conf access (and logs)

    [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }

    bool do_preempt() final
    {
      if (m_preempted || !m_preemptable)
        return false;

      std::lock_guard<std::mutex> lk{m_preemption_lock};
      if (!m_preemptable)
        return false;  // re-checked under the lock

      m_preempted = true;
      return true;
    }

    /// same as 'do_preempt()' but w/o checks (as once a replica
    /// was preempted, we cannot continue)
    void replica_preempted() { m_preempted = true; }

    void enable_preemption()
    {
      std::lock_guard<std::mutex> lk{m_preemption_lock};
      if (are_preemptions_left() && !m_preempted) {
        m_preemptable = true;
      }
    }

    /// used by a replica to set preemptability state according to the
    /// Primary's request
    void force_preemptability(bool is_allowed)
    {
      // note: no need to lock for a replica
      m_preemptable = is_allowed;
    }

    bool disable_and_test() final
    {
      std::lock_guard<std::mutex> lk{m_preemption_lock};
      m_preemptable = false;
      return m_preempted;
    }

    [[nodiscard]] bool was_preempted() const { return m_preempted; }

    [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }

    void adjust_parameters() final
    {
      std::lock_guard<std::mutex> lk{m_preemption_lock};

      if (m_preempted) {
        m_preempted = false;
        m_preemptable = adjust_left();
      } else {
        m_preemptable = are_preemptions_left();
      }
    }

   private:
    mutable std::mutex m_preemption_lock;
    bool m_preemptable{false};
    bool m_preempted{false};
    int m_left;
    size_t m_size_divisor{1};

    bool are_preemptions_left() const { return m_left > 0; }

    /// each preemption consumed lowers the number of preemptions still
    /// allowed, and doubles the chunk-size divisor
    bool adjust_left()
    {
      m_size_divisor *= 2;
      return --m_left > 0;
    }
  };

  preemption_data_t preemption_data;
};