1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
7 #include <boost/statechart/custom_reaction.hpp>
8 #include <boost/statechart/deferral.hpp>
9 #include <boost/statechart/event.hpp>
10 #include <boost/statechart/event_base.hpp>
11 #include <boost/statechart/in_state_reaction.hpp>
12 #include <boost/statechart/simple_state.hpp>
13 #include <boost/statechart/state.hpp>
14 #include <boost/statechart/state_machine.hpp>
15 #include <boost/statechart/transition.hpp>
17 #include "common/version.h"
18 #include "include/Context.h"
20 #include "scrub_machine_lstnr.h"
21 #include "osd/scrubber_common.h"
// Bring the standard string literal suffix (operator""s) into scope.
23 using namespace std::string_literals
;
// Forward declaration only; PG is not defined in this part of the file.
25 class PG
; // holding a pointer to that one - just for testing
// Short namespace aliases used by the declarations that follow:
// 'sc' for boost::statechart and 'mpl' for boost::mpl.
29 namespace sc
= ::boost::statechart
;
30 namespace mpl
= ::boost::mpl
;
// Hooks that receive an event's name. They are called, with the stringified
// event type name, from the event types generated further down in this file.
// Only declared here; the definitions are not in this part of the file.
36 void on_event_creation(std::string_view nm
);
37 void on_event_discard(std::string_view nm
);
40 struct E : sc::event<E> { /* E is declared as a boost::statechart event type */ \
41 inline static int actv{0}; /* one counter per event type, starts at 0 */ \
45 on_event_creation(#E); /* passes the event type's name to the creation hook */ \
50 on_event_discard(#E); /* passes the event type's name to the discard hook */ \
52 void print(std::ostream* out) const { *out << #E; } /* writes the event type's name to *out */ \
53 std::string_view print() const { return #E; } /* returns the event type's name */ \
// The events understood by the scrub state machine. Each MEV(X) line
// declares one event type named X.
56 MEV(RemotesReserved
) ///< all replicas have granted our reserve request
58 MEV(ReservationFailure
) ///< a reservation request has failed
60 MEV(StartScrub
) ///< initiate a new scrubbing session (relevant if we are a Primary)
62 MEV(AfterRepairScrub
) ///< initiate a new scrubbing session. Only triggered at Recovery
65 MEV(Unblocked
) ///< triggered when the PG unblocked an object that was marked for
66 ///< scrubbing. Via the PGScrubUnblocked op
68 MEV(InternalSchedScrub
) ///< moves PendingTimer to NewChunk; in BuildMap it re-enters BuildMap
70 MEV(SelectedChunkFree
) ///< handled by a custom reaction in NewChunk
74 MEV(ActivePushesUpd
) ///< Update to active_pushes. 'active_pushes' represents recovery
75 ///< that is in-flight to the local ObjectStore
77 MEV(UpdatesApplied
) ///< (Primary only) all updates are committed
79 MEV(InternalAllUpdates
) ///< the internal counterpart of UpdatesApplied
81 MEV(GotReplicas
) ///< got a map from a replica
83 MEV(IntBmPreempted
) ///< internal - BuildMap preempted. Required, as detected within the
90 MEV(DigestUpdate
) ///< external. called upon success of a MODIFY op. See
91 ///< scrub_snapshot_metadata()
93 MEV(MapsCompared
) ///< maps_compare_n_cleanup() transactions are done
95 MEV(StartReplica
) ///< initiating replica scrub.
97 MEV(StartReplicaNoWait
) ///< 'start replica' when there are no pending updates
101 MEV(ReplicaPushesUpd
) ///< Update to active_pushes. 'active_pushes' represents recovery
102 ///< that is in-flight to the local ObjectStore
104 MEV(FullReset
) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)
106 MEV(NextChunk
) ///< finished handling this chunk. Go get the next one
108 MEV(ScrubFinished
) ///< all chunks handled
// Forward declarations of the states that are direct children of
// ScrubMachine. They are needed because the states' reaction lists name
// each other before the full definitions appear.
111 struct NotActive
; ///< the quiescent state. No active scrubbing.
112 struct ReservingReplicas
; ///< securing scrub resources from replicas' OSDs
113 struct ActiveScrubbing
; ///< the active state for a Primary. A sub-machine.
114 struct ReplicaWaitUpdates
; ///< an active state for a replica. Waiting for all active
115 ///< operations to finish.
116 struct ActiveReplica
; ///< an active state for a replica.
// The scrubber's state machine: a boost::statechart state_machine whose
// initial state is NotActive (the second template argument).
119 class ScrubMachine
: public sc::state_machine
<ScrubMachine
, NotActive
> {
// PgScrubber is granted access to this class's non-public members.
121 friend class PgScrubber
;
// Constructed from raw pointers to the PG and to the scrubber's listener
// interface.
// NOTE(review): ownership and lifetime of both pointers are not visible
// here - confirm against the constructor definition.
124 explicit ScrubMachine(PG
* pg
, ScrubMachineListener
* pg_scrub
);
// NOTE(review): presumably initialized from the constructor's 'pg_scrub'
// argument - confirm.
128 ScrubMachineListener
* m_scrbr
;
// Takes an output stream and returns an output stream reference.
// NOTE(review): by its name, this writes a log prefix to 'out' - confirm.
129 std::ostream
& gen_prefix(std::ostream
& out
) const;
// NOTE(review): by its name, returns a textual description of the states the
// machine is currently in - the definition is not in this part of the file.
131 std::string
current_states_desc() const;
// NOTE(review): by its name, asserts that the machine is in NotActive -
// confirm against the definition.
132 void assert_not_active() const;
// State queries; the results must not be ignored ([[nodiscard]]).
133 [[nodiscard
]] bool is_reserving() const;
134 [[nodiscard
]] bool is_accepting_updates() const;
138 * The Scrubber's base (quiescent) state.
139 * Scrubbing is triggered by one of the following events:
140 * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
141 * reservation process. Will be issued by PG::scrub(), following a
142 * queued "PGScrub" op.
143 * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
144 * not required to reserve resources.
145 * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
146 * MOSDRepScrub message.
148 * note (20.8.21): originally, AfterRepairScrub was triggering a scrub without waiting
149 * for replica resources to be acquired. But once replicas started using the
150 * resource-request to identify and tag the scrub session, this bypass cannot be
// The quiescent state, a direct child of ScrubMachine.
// Reactions:
//  - StartScrub: handled by the custom reaction react(const StartScrub&);
//  - AfterRepairScrub: transition to ReservingReplicas;
//  - StartReplica: transition to ReplicaWaitUpdates;
//  - StartReplicaNoWait: transition to ActiveReplica.
153 struct NotActive
: sc::state
<NotActive
, ScrubMachine
> {
154 explicit NotActive(my_context ctx
);
156 using reactions
= mpl::list
<sc::custom_reaction
<StartScrub
>,
157 // a scrubbing that was initiated at recovery completion. It still goes
158 // through the replica reservation state (ReservingReplicas):
159 sc::transition
<AfterRepairScrub
, ReservingReplicas
>,
160 sc::transition
<StartReplica
, ReplicaWaitUpdates
>,
161 sc::transition
<StartReplicaNoWait
, ActiveReplica
>>;
// Custom handling of StartScrub; the resulting transition (if any) is decided
// in the definition, which is not in this part of the file.
162 sc::result
react(const StartScrub
&);
// State in which scrub resources are being secured from the replicas' OSDs.
// A direct child of ScrubMachine.
// Reactions:
//  - FullReset: custom reaction;
//  - RemotesReserved: transition to ActiveScrubbing;
//  - ReservationFailure: custom reaction.
165 struct ReservingReplicas
: sc::state
<ReservingReplicas
, ScrubMachine
> {
167 explicit ReservingReplicas(my_context ctx
);
// A user-declared destructor: some action is taken whenever this state is
// exited. NOTE(review): what it does is not visible here.
168 ~ReservingReplicas();
169 using reactions
= mpl::list
<sc::custom_reaction
<FullReset
>,
170 // all replicas granted our resources request
171 sc::transition
<RemotesReserved
, ActiveScrubbing
>,
172 sc::custom_reaction
<ReservationFailure
>>;
174 sc::result
react(const FullReset
&);
176 /// at least one replica denied us the scrub resources we've requested
177 sc::result
react(const ReservationFailure
&);
// Forward declarations of states nested in ActiveScrubbing; they are named
// in each other's reaction lists before being defined.
181 // the "active" sub-states
183 struct RangeBlocked
; ///< the objects range is blocked
184 struct PendingTimer
; ///< either delaying the scrub by some time and requeuing, or just
186 struct NewChunk
; ///< select a chunk to scrub, and verify its availability
188 struct WaitLastUpdate
; ///< reacts to UpdatesApplied and InternalAllUpdates
190 struct DrainReplMaps
; ///< a problem during BuildMap. Wait for all replicas to report,
192 struct WaitReplicas
; ///< wait for all replicas to report
193 struct WaitDigestUpdate
; ///< reacts to DigestUpdate, ScrubFinished and NextChunk
// The active state for a Primary. A composite state: a direct child of
// ScrubMachine, with PendingTimer as its initial inner state (the third
// template argument).
// Reactions declared here apply to events not consumed by the current inner
// state:
//  - InternalError: custom reaction;
//  - FullReset: custom reaction.
195 struct ActiveScrubbing
: sc::state
<ActiveScrubbing
, ScrubMachine
, PendingTimer
> {
197 explicit ActiveScrubbing(my_context ctx
);
200 using reactions
= mpl::list
<
201 sc::custom_reaction
<InternalError
>,
202 sc::custom_reaction
<FullReset
>>;
204 sc::result
react(const FullReset
&);
205 sc::result
react(const InternalError
&);
// Inner state of ActiveScrubbing: the objects range is blocked.
// Reaction: Unblocked -> transition to PendingTimer.
208 struct RangeBlocked
: sc::state
<RangeBlocked
, ActiveScrubbing
> {
209 explicit RangeBlocked(my_context ctx
);
210 using reactions
= mpl::list
<sc::transition
<Unblocked
, PendingTimer
>>;
// Lives exactly as long as this state object does.
// NOTE(review): by its type name, presumably issues a warning if the range
// stays blocked for too long - confirm against Scrub::BlockedRangeWarning.
212 Scrub::BlockedRangeWarning m_timeout
;
// Inner state of ActiveScrubbing, and its initial inner state.
// Reaction: InternalSchedScrub -> transition to NewChunk.
215 struct PendingTimer
: sc::state
<PendingTimer
, ActiveScrubbing
> {
217 explicit PendingTimer(my_context ctx
);
219 using reactions
= mpl::list
<sc::transition
<InternalSchedScrub
, NewChunk
>>;
// Inner state of ActiveScrubbing: select a chunk to scrub, and verify its
// availability.
// Reactions:
//  - ChunkIsBusy: transition to RangeBlocked;
//  - SelectedChunkFree: custom reaction.
222 struct NewChunk
: sc::state
<NewChunk
, ActiveScrubbing
> {
224 explicit NewChunk(my_context ctx
);
226 using reactions
= mpl::list
<sc::transition
<ChunkIsBusy
, RangeBlocked
>,
227 sc::custom_reaction
<SelectedChunkFree
>>;
229 sc::result
react(const SelectedChunkFree
&);
233 * initiate the update process for this chunk
235 * Wait for 'active_pushes' to clear.
236 * 'active_pushes' represents recovery that is in-flight to the local ObjectStore, hence
237 * scrub waits until the correct data is readable (in-flight data to the ObjectStore is
238 * not readable until written to disk, termed 'applied' here)
// Inner state of ActiveScrubbing.
// Reaction: ActivePushesUpd -> custom reaction.
240 struct WaitPushes
: sc::state
<WaitPushes
, ActiveScrubbing
> {
242 explicit WaitPushes(my_context ctx
);
244 using reactions
= mpl::list
<sc::custom_reaction
<ActivePushesUpd
>>;
246 sc::result
react(const ActivePushesUpd
&);
// Inner state of ActiveScrubbing.
// Reactions:
//  - InternalAllUpdates: custom reaction;
//  - UpdatesApplied: in-state reaction - on_new_updates() is called and the
//    machine stays in this state.
249 struct WaitLastUpdate
: sc::state
<WaitLastUpdate
, ActiveScrubbing
> {
251 explicit WaitLastUpdate(my_context ctx
);
// The handler bound to UpdatesApplied in the reaction list below.
253 void on_new_updates(const UpdatesApplied
&);
255 using reactions
= mpl::list
<sc::custom_reaction
<InternalAllUpdates
>,
256 sc::in_state_reaction
<UpdatesApplied
,
258 &WaitLastUpdate::on_new_updates
>>;
260 sc::result
react(const InternalAllUpdates
&);
// Inner state of ActiveScrubbing.
// Reactions:
//  - IntBmPreempted: transition to DrainReplMaps;
//  - InternalSchedScrub: transition from BuildMap back to BuildMap;
//  - IntLocalMapDone: custom reaction.
263 struct BuildMap
: sc::state
<BuildMap
, ActiveScrubbing
> {
264 explicit BuildMap(my_context ctx
);
266 // possible error scenarios:
267 // - an error reported by the backend will trigger an 'InternalError' event,
268 // handled by our parent state;
269 // - if preempted, we switch to DrainReplMaps, where we will wait for all
270 // replicas to send their maps before acknowledging the preemption;
271 // - an interval change will be handled by the relevant 'send-event' functions,
272 // and will be translated into a 'FullReset' event.
274 mpl::list
<sc::transition
<IntBmPreempted
, DrainReplMaps
>,
275 sc::transition
<InternalSchedScrub
, BuildMap
>, // looping, waiting
276 // for the backend to
278 sc::custom_reaction
<IntLocalMapDone
>>;
280 sc::result
react(const IntLocalMapDone
&);
284 * "drain" scrub-maps responses from replicas
// Inner state of ActiveScrubbing, entered from BuildMap on IntBmPreempted.
// Reaction: GotReplicas -> custom reaction.
286 struct DrainReplMaps
: sc::state
<DrainReplMaps
, ActiveScrubbing
> {
287 explicit DrainReplMaps(my_context ctx
);
290 mpl::list
<sc::custom_reaction
<GotReplicas
> // all replicas are accounted for
293 sc::result
react(const GotReplicas
&);
// Inner state of ActiveScrubbing: wait for all replicas to report.
// Reactions:
//  - GotReplicas: custom reaction;
//  - MapsCompared: transition to WaitDigestUpdate;
//  - DigestUpdate: custom reaction.
296 struct WaitReplicas
: sc::state
<WaitReplicas
, ActiveScrubbing
> {
297 explicit WaitReplicas(my_context ctx
);
300 mpl::list
<sc::custom_reaction
<GotReplicas
>, // all replicas are accounted for
301 sc::transition
<MapsCompared
, WaitDigestUpdate
>,
302 sc::custom_reaction
<DigestUpdate
>
305 sc::result
react(const GotReplicas
&);
306 sc::result
react(const DigestUpdate
&);
// Per-state flag, false whenever this state is (re)entered.
307 bool all_maps_already_called
{false}; // see comment in react code
// Inner state of ActiveScrubbing, entered from WaitReplicas on MapsCompared.
// Reactions:
//  - DigestUpdate: custom reaction;
//  - ScrubFinished: custom reaction;
//  - NextChunk: transition to PendingTimer (i.e. on to the next chunk).
310 struct WaitDigestUpdate
: sc::state
<WaitDigestUpdate
, ActiveScrubbing
> {
311 explicit WaitDigestUpdate(my_context ctx
);
313 using reactions
= mpl::list
<sc::custom_reaction
<DigestUpdate
>,
314 sc::custom_reaction
<ScrubFinished
>,
315 sc::transition
<NextChunk
, PendingTimer
>>;
316 sc::result
react(const DigestUpdate
&);
317 sc::result
react(const ScrubFinished
&);
320 // ----------------------------- the "replica active" states -----------------------
323 * Waiting for 'active_pushes' to complete
325 * When in this state:
326 * - the details of the Primary's request were internalized by PgScrubber;
327 * - 'active' scrubbing is set
// An active state for a replica, a direct child of ScrubMachine. Entered
// from NotActive on StartReplica.
// Reactions:
//  - ReplicaPushesUpd: custom reaction;
//  - FullReset: custom reaction.
329 struct ReplicaWaitUpdates
: sc::state
<ReplicaWaitUpdates
, ScrubMachine
> {
330 explicit ReplicaWaitUpdates(my_context ctx
);
332 mpl::list
<sc::custom_reaction
<ReplicaPushesUpd
>, sc::custom_reaction
<FullReset
>>;
334 sc::result
react(const ReplicaPushesUpd
&);
335 sc::result
react(const FullReset
&);
// An active state for a replica, a direct child of ScrubMachine. Entered
// from NotActive on StartReplicaNoWait.
// Reactions:
//  - SchedReplica: custom reaction;
//  - FullReset: custom reaction;
//  - ScrubFinished: transition to NotActive.
339 struct ActiveReplica
: sc::state
<ActiveReplica
, ScrubMachine
> {
340 explicit ActiveReplica(my_context ctx
);
341 using reactions
= mpl::list
<sc::custom_reaction
<SchedReplica
>,
342 sc::custom_reaction
<FullReset
>,
343 sc::transition
<ScrubFinished
, NotActive
>>;
345 sc::result
react(const SchedReplica
&);
346 sc::result
react(const FullReset
&);