]>
git.proxmox.com Git - ceph.git/blob - ceph/src/osd/scrub_machine.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
7 #include <boost/statechart/custom_reaction.hpp>
8 #include <boost/statechart/deferral.hpp>
9 #include <boost/statechart/event.hpp>
10 #include <boost/statechart/event_base.hpp>
11 #include <boost/statechart/in_state_reaction.hpp>
12 #include <boost/statechart/simple_state.hpp>
13 #include <boost/statechart/state.hpp>
14 #include <boost/statechart/state_machine.hpp>
15 #include <boost/statechart/transition.hpp>
17 #include "common/version.h"
18 #include "include/Context.h"
20 #include "scrub_machine_lstnr.h"
21 #include "scrubber_common.h"
23 using namespace std::string_literals
;
25 class PG
; // holding a pointer to that one - just for testing
29 namespace sc
= ::boost::statechart
;
30 namespace mpl
= ::boost::mpl
;
36 void on_event_creation(std::string_view nm
);
37 void on_event_discard(std::string_view nm
);
40 struct E : sc::event<E> { \
41 inline static int actv{0}; \
45 on_event_creation(#E); \
50 on_event_discard(#E); \
54 MEV(RemotesReserved
) ///< all replicas have granted our reserve request
55 MEV(ReservationFailure
) ///< a reservation request has failed
57 MEV(StartScrub
) ///< initiate a new scrubbing session (relevant if we are a Primary)
58 MEV(AfterRepairScrub
) ///< initiate a new scrubbing session. Only triggered at Recovery
60 MEV(Unblocked
) ///< triggered when the PG unblocked an object that was marked for
61 ///< scrubbing. Via the PGScrubUnblocked op
62 MEV(InternalSchedScrub
)
63 MEV(SelectedChunkFree
)
65 MEV(ActivePushesUpd
) ///< Update to active_pushes. 'active_pushes' represents recovery
66 ///< that is in-flight to the local ObjectStore
67 MEV(UpdatesApplied
) // external
68 MEV(InternalAllUpdates
) ///< the internal counterpart of UpdatesApplied
69 MEV(GotReplicas
) ///< got a map from a replica
71 MEV(IntBmPreempted
) ///< internal - BuildMap preempted. Required, as detected within the
77 MEV(DigestUpdate
) ///< external. called upon success of a MODIFY op. See
78 ///< scrub_snapshot_metadata()
81 MEV(StartReplica
) ///< initiating replica scrub. replica_scrub_op() -> OSD Q ->
83 MEV(StartReplicaNoWait
) ///< 'start replica' when there are no pending updates
86 MEV(ReplicaPushesUpd
) ///< Update to active_pushes. 'active_pushes' represents recovery
87 ///< that is in-flight to the local ObjectStore
89 MEV(FullReset
) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)
92 struct NotActive
; ///< the quiescent state. No active scrubbing.
93 struct ReservingReplicas
; ///< securing scrub resources from replicas' OSDs
94 struct ActiveScrubbing
; ///< the active state for a Primary. A sub-machine.
95 struct ReplicaWaitUpdates
; ///< an active state for a replica. Waiting for all active
96 ///< operations to finish.
97 struct ActiveReplica
; ///< an active state for a replica.
100 class ScrubMachine
: public sc::state_machine
<ScrubMachine
, NotActive
> {
102 friend class PgScrubber
;
105 explicit ScrubMachine(PG
* pg
, ScrubMachineListener
* pg_scrub
);
108 PG
* m_pg
; // only used for dout messages
110 ScrubMachineListener
* m_scrbr
;
112 void my_states() const;
113 void assert_not_active() const;
114 [[nodiscard
]] bool is_reserving() const;
118 * The Scrubber's base (quiescent) state.
119 * Scrubbing is triggered by one of the following events:
120 * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
121 * reservation process. Will be issued by PG::scrub(), following a
122 * queued "PGScrub" op.
123 * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
124 * not required to reserve resources.
125 * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
126 * MOSDRepScrub message.
128 struct NotActive
: sc::state
<NotActive
, ScrubMachine
> {
129 explicit NotActive(my_context ctx
);
131 using reactions
= mpl::list
<sc::transition
<StartScrub
, ReservingReplicas
>,
132 // a scrubbing that was initiated at recovery completion,
133 // and requires no resource reservations:
134 sc::transition
<AfterRepairScrub
, ActiveScrubbing
>,
135 sc::transition
<StartReplica
, ReplicaWaitUpdates
>,
136 sc::transition
<StartReplicaNoWait
, ActiveReplica
>>;
139 struct ReservingReplicas
: sc::state
<ReservingReplicas
, ScrubMachine
> {
141 explicit ReservingReplicas(my_context ctx
);
142 using reactions
= mpl::list
<sc::custom_reaction
<FullReset
>,
143 // all replicas granted our resources request
144 sc::transition
<RemotesReserved
, ActiveScrubbing
>,
145 sc::custom_reaction
<ReservationFailure
>>;
147 sc::result
react(const FullReset
&);
149 /// at least one replica denied us the scrub resources we've requested
150 sc::result
react(const ReservationFailure
&);
154 // the "active" sub-states
156 struct RangeBlocked
; ///< the objects range is blocked
157 struct PendingTimer
; ///< either delaying the scrub by some time and requeuing, or just
159 struct NewChunk
; ///< select a chunk to scrub, and verify its availability
161 struct WaitLastUpdate
;
163 struct DrainReplMaps
; ///< a problem during BuildMap. Wait for all replicas to report,
165 struct WaitReplicas
; ///< wait for all replicas to report
167 struct ActiveScrubbing
: sc::state
<ActiveScrubbing
, ScrubMachine
, PendingTimer
> {
169 explicit ActiveScrubbing(my_context ctx
);
172 using reactions
= mpl::list
<
174 sc::transition
<AllChunksDone
, NotActive
>,
176 sc::custom_reaction
<InternalError
>,
177 sc::custom_reaction
<FullReset
>>;
179 sc::result
react(const AllChunksDone
&);
180 sc::result
react(const FullReset
&);
181 sc::result
react(const InternalError
&);
184 struct RangeBlocked
: sc::state
<RangeBlocked
, ActiveScrubbing
> {
185 explicit RangeBlocked(my_context ctx
);
186 using reactions
= mpl::list
<sc::transition
<Unblocked
, PendingTimer
>>;
189 struct PendingTimer
: sc::state
<PendingTimer
, ActiveScrubbing
> {
191 explicit PendingTimer(my_context ctx
);
193 using reactions
= mpl::list
<sc::transition
<InternalSchedScrub
, NewChunk
>>;
196 struct NewChunk
: sc::state
<NewChunk
, ActiveScrubbing
> {
198 explicit NewChunk(my_context ctx
);
200 using reactions
= mpl::list
<sc::transition
<ChunkIsBusy
, RangeBlocked
>,
201 sc::custom_reaction
<SelectedChunkFree
>>;
203 sc::result
react(const SelectedChunkFree
&);
207 * initiate the update process for this chunk
209 * Wait for 'active_pushes' to clear.
210 * 'active_pushes' represents recovery that is in-flight to the local ObjectStore, hence
211 * scrub waits until the correct data is readable (in-flight data to the ObjectStore is
212 * not readable until written to disk, termed 'applied' here)
214 struct WaitPushes
: sc::state
<WaitPushes
, ActiveScrubbing
> {
216 explicit WaitPushes(my_context ctx
);
218 using reactions
= mpl::list
<sc::custom_reaction
<ActivePushesUpd
>>;
220 sc::result
react(const ActivePushesUpd
&);
223 struct WaitLastUpdate
: sc::state
<WaitLastUpdate
, ActiveScrubbing
> {
225 explicit WaitLastUpdate(my_context ctx
);
227 void on_new_updates(const UpdatesApplied
&);
229 using reactions
= mpl::list
<sc::custom_reaction
<InternalAllUpdates
>,
230 sc::in_state_reaction
<UpdatesApplied
,
232 &WaitLastUpdate::on_new_updates
>>;
234 sc::result
react(const InternalAllUpdates
&);
237 struct BuildMap
: sc::state
<BuildMap
, ActiveScrubbing
> {
238 explicit BuildMap(my_context ctx
);
240 // possible error scenarios:
241 // - an error reported by the backend will trigger an 'InternalError' event,
242 // handled by our parent state;
243 // - if preempted, we switch to DrainReplMaps, where we will wait for all
244 // replicas to send their maps before acknowledging the preemption;
245 // - an interval change will be handled by the relevant 'send-event' functions,
246 // and will be translated into a 'FullReset' event.
248 mpl::list
<sc::transition
<IntBmPreempted
, DrainReplMaps
>,
249 sc::transition
<InternalSchedScrub
, BuildMap
>, // looping, waiting
250 // for the backend to
252 sc::custom_reaction
<IntLocalMapDone
>>;
254 sc::result
react(const IntLocalMapDone
&);
258 * "drain" scrub-maps responses from replicas
260 struct DrainReplMaps
: sc::state
<DrainReplMaps
, ActiveScrubbing
> {
261 explicit DrainReplMaps(my_context ctx
);
264 mpl::list
<sc::custom_reaction
<GotReplicas
> // all replicas are accounted for
267 sc::result
react(const GotReplicas
&);
270 struct WaitReplicas
: sc::state
<WaitReplicas
, ActiveScrubbing
> {
271 explicit WaitReplicas(my_context ctx
);
274 mpl::list
<sc::custom_reaction
<GotReplicas
>, sc::deferral
<DigestUpdate
>>;
276 sc::result
react(const GotReplicas
&);
279 struct WaitDigestUpdate
: sc::state
<WaitDigestUpdate
, ActiveScrubbing
> {
280 explicit WaitDigestUpdate(my_context ctx
);
282 using reactions
= mpl::list
<sc::custom_reaction
<DigestUpdate
>>;
283 sc::result
react(const DigestUpdate
&);
286 // ----------------------------- the "replica active" states -----------------------
289 * Waiting for 'active_pushes' to complete
291 * When in this state:
292 * - the details of the Primary's request were internalized by PgScrubber;
293 * - 'active' scrubbing is set
295 struct ReplicaWaitUpdates
: sc::state
<ReplicaWaitUpdates
, ScrubMachine
> {
296 explicit ReplicaWaitUpdates(my_context ctx
);
298 mpl::list
<sc::custom_reaction
<ReplicaPushesUpd
>, sc::custom_reaction
<FullReset
>>;
300 sc::result
react(const ReplicaPushesUpd
&);
301 sc::result
react(const FullReset
&);
305 struct ActiveReplica
: sc::state
<ActiveReplica
, ScrubMachine
> {
306 explicit ActiveReplica(my_context ctx
);
308 mpl::list
<sc::custom_reaction
<SchedReplica
>, sc::custom_reaction
<FullReset
>>;
310 sc::result
react(const SchedReplica
&);
311 sc::result
react(const FullReset
&);