]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/scrub_machine.cc
buildsys: switch source download to quincy
[ceph.git] / ceph / src / osd / scrub_machine.cc
CommitLineData
f67539c2
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "scrub_machine.h"
5
6#include <chrono>
7#include <typeinfo>
8
9#include <boost/core/demangle.hpp>
10
11#include "OSD.h"
12#include "OpRequest.h"
13#include "ScrubStore.h"
14#include "scrub_machine_lstnr.h"
15
16#define dout_context g_ceph_context
17#define dout_subsys ceph_subsys_osd
18#undef dout_prefix
19#define dout_prefix *_dout << " scrubberFSM "
20
21using namespace std::chrono;
22using namespace std::chrono_literals;
23namespace sc = boost::statechart;
24
// Convenience aliases for code running inside a state of the ScrubMachine:
//  - 'scrbr' : the ScrubMachineListener* stored in the machine (m_scrbr);
//  - 'pg_id' : a copy of the machine's m_pg_id.
// Each alias is also assigned to std::ignore, presumably so that a handler
// that uses only one of them does not trigger an unused-variable warning.
#define DECLARE_LOCALS                                           \
  ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
  std::ignore = scrbr;                                           \
  auto pg_id = context<ScrubMachine>().m_pg_id;                  \
  std::ignore = pg_id;
30
31namespace Scrub {
32
33// --------- trace/debug auxiliaries -------------------------------
34
35void on_event_creation(std::string_view nm)
36{
37 dout(20) << " event: --vvvv---- " << nm << dendl;
38}
39
40void on_event_discard(std::string_view nm)
41{
42 dout(20) << " event: --^^^^---- " << nm << dendl;
43}
44
45void ScrubMachine::my_states() const
46{
47 for (auto si = state_begin(); si != state_end(); ++si) {
48 const auto& siw{*si}; // prevents a warning re side-effects
49 dout(20) << " state: " << boost::core::demangle(typeid(siw).name()) << dendl;
50 }
51}
52
53void ScrubMachine::assert_not_active() const
54{
55 ceph_assert(state_cast<const NotActive*>());
56}
57
58bool ScrubMachine::is_reserving() const
59{
60 return state_cast<const ReservingReplicas*>();
61}
62
63// for the rest of the code in this file - we know what PG we are dealing with:
64#undef dout_prefix
65#define dout_prefix _prefix(_dout, this->context<ScrubMachine>().m_pg)
/**
 * Builds the log-line prefix used by dout() for the remainder of this file.
 *
 * Writes t's own prefix (via t->gen_prefix()) followed by
 * " scrubberFSM pg(<t->pg_id>) " into the stream.
 *
 * The return type is spelled std::ostream (the original used an unqualified
 * 'ostream', which only compiles if some header leaks 'using namespace std').
 *
 * @tparam T     any type offering gen_prefix(std::ostream&) that returns a
 *               stream, and a streamable 'pg_id' member
 * @param _dout  the stream being written to (must not be null)
 * @param t      the object supplying the prefix data (must not be null)
 * @return the stream, to allow further chaining
 */
template <class T>
static std::ostream& _prefix(std::ostream* _dout, T* t)
{
  return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") ";
}
70
71// ////////////// the actual actions
72
73// ----------------------- NotActive -----------------------------------------
74
75NotActive::NotActive(my_context ctx) : my_base(ctx)
76{
77 dout(10) << "-- state -->> NotActive" << dendl;
78}
79
80// ----------------------- ReservingReplicas ---------------------------------
81
82ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
83{
84 dout(10) << "-- state -->> ReservingReplicas" << dendl;
85 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
86 scrbr->reserve_replicas();
87}
88
89sc::result ReservingReplicas::react(const ReservationFailure&)
90{
91 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
92 dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
93
94 // the Scrubber must release all resources and abort the scrubbing
95 scrbr->clear_pgscrub_state();
96 return transit<NotActive>();
97}
98
99/**
100 * note: the event poster is handling the scrubber reset
101 */
102sc::result ReservingReplicas::react(const FullReset&)
103{
104 dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
105 return transit<NotActive>();
106}
107
108// ----------------------- ActiveScrubbing -----------------------------------
109
110ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
111{
112 dout(10) << "-- state -->> ActiveScrubbing" << dendl;
113 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
114 scrbr->on_init();
115}
116
117/**
118 * upon exiting the Active state
119 */
120ActiveScrubbing::~ActiveScrubbing()
121{
122 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
123 dout(15) << __func__ << dendl;
124 scrbr->unreserve_replicas();
125}
126
127/*
128 * The only source of an InternalError event as of now is the BuildMap state,
129 * when encountering a backend error.
130 * We kill the scrub and reset the FSM.
131 */
132sc::result ActiveScrubbing::react(const InternalError&)
133{
134 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
135 dout(10) << __func__ << dendl;
136 scrbr->clear_pgscrub_state();
137 return transit<NotActive>();
138}
139
140sc::result ActiveScrubbing::react(const FullReset&)
141{
142 dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
143 // caller takes care of clearing the scrubber & FSM states
144 return transit<NotActive>();
145}
146
147// ----------------------- RangeBlocked -----------------------------------
148
149/*
150 * Blocked. Will be released by kick_object_context_blocked() (or upon
151 * an abort)
152 */
153RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
154{
155 dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
156}
157
158// ----------------------- PendingTimer -----------------------------------
159
160/**
161 * Sleeping till timer reactivation - or just requeuing
162 */
163PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
164{
165 dout(10) << "-- state -->> Act/PendingTimer" << dendl;
166 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
167
168 scrbr->add_delayed_scheduling();
169}
170
171// ----------------------- NewChunk -----------------------------------
172
173/**
174 * Preconditions:
175 * - preemption data was set
176 * - epoch start was updated
177 */
178NewChunk::NewChunk(my_context ctx) : my_base(ctx)
179{
180 dout(10) << "-- state -->> Act/NewChunk" << dendl;
181 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
182
183 scrbr->get_preemptor().adjust_parameters();
184
185 // choose range to work on
186 bool got_a_chunk = scrbr->select_range();
187 if (got_a_chunk) {
188 dout(15) << __func__ << " selection OK" << dendl;
189 post_event(SelectedChunkFree{});
190 } else {
191 dout(10) << __func__ << " selected chunk is busy" << dendl;
192 // wait until we are available (transitioning to Blocked)
193 post_event(ChunkIsBusy{});
194 }
195}
196
197sc::result NewChunk::react(const SelectedChunkFree&)
198{
199 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
200 dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl;
201
202 scrbr->set_subset_last_update(scrbr->search_log_for_updates());
203 return transit<WaitPushes>();
204}
205
206// ----------------------- WaitPushes -----------------------------------
207
208WaitPushes::WaitPushes(my_context ctx) : my_base(ctx)
209{
210 dout(10) << " -- state -->> Act/WaitPushes" << dendl;
211 post_event(ActivePushesUpd{});
212}
213
214/*
215 * Triggered externally, by the entity that had an update re pushes
216 */
217sc::result WaitPushes::react(const ActivePushesUpd&)
218{
219 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
220 dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: "
221 << scrbr->pending_active_pushes() << dendl;
222
223 if (!scrbr->pending_active_pushes()) {
224 // done waiting
225 return transit<WaitLastUpdate>();
226 }
227
228 return discard_event();
229}
230
231// ----------------------- WaitLastUpdate -----------------------------------
232
233WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
234{
235 dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
236 post_event(UpdatesApplied{});
237}
238
239void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
240{
241 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
242 dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl;
243
244 if (scrbr->has_pg_marked_new_updates()) {
245 post_event(InternalAllUpdates{});
246 } else {
247 // will be requeued by op_applied
248 dout(10) << "wait for EC read/modify/writes to queue" << dendl;
249 }
250}
251
252/*
253 * request maps from the replicas in the acting set
254 */
255sc::result WaitLastUpdate::react(const InternalAllUpdates&)
256{
257 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
258 dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
259
260 scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable());
261 return transit<BuildMap>();
262}
263
264// ----------------------- BuildMap -----------------------------------
265
266BuildMap::BuildMap(my_context ctx) : my_base(ctx)
267{
268 dout(10) << " -- state -->> Act/BuildMap" << dendl;
269 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
270
271 // no need to check for an epoch change, as all possible flows that brought us here have
272 // a check_interval() verification of their final event.
273
274 if (scrbr->get_preemptor().was_preempted()) {
275
276 // we were preempted, either directly or by a replica
277 dout(10) << __func__ << " preempted!!!" << dendl;
278 scrbr->mark_local_map_ready();
279 post_event(IntBmPreempted{});
280
281 } else {
282
283 auto ret = scrbr->build_primary_map_chunk();
284
285 if (ret == -EINPROGRESS) {
286 // must wait for the backend to finish. No specific event provided.
287 // build_primary_map_chunk() has already requeued us.
288 dout(20) << "waiting for the backend..." << dendl;
289
290 } else if (ret < 0) {
291
292 dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
293 // scrbr->mark_local_map_ready();
294 post_event(InternalError{});
295
296 } else {
297
298 // the local map was created
299 post_event(IntLocalMapDone{});
300 }
301 }
302}
303
304sc::result BuildMap::react(const IntLocalMapDone&)
305{
306 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
307 dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl;
308
309 scrbr->mark_local_map_ready();
310 return transit<WaitReplicas>();
311}
312
313// ----------------------- DrainReplMaps -----------------------------------
314
315DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
316{
317 dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
318 // we may have received all maps already. Send the event that will make us check.
319 post_event(GotReplicas{});
320}
321
322sc::result DrainReplMaps::react(const GotReplicas&)
323{
324 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
325 dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl;
326
327 if (scrbr->are_all_maps_available()) {
328 // NewChunk will handle the preemption that brought us to this state
329 return transit<PendingTimer>();
330 }
331
332 dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
333 << scrbr->dump_awaited_maps() << dendl;
334 return discard_event();
335}
336
337// ----------------------- WaitReplicas -----------------------------------
338
339WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
340{
341 dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
342 post_event(GotReplicas{});
343}
344
345sc::result WaitReplicas::react(const GotReplicas&)
346{
347 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
348 dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl;
349
350 if (scrbr->are_all_maps_available()) {
351 dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
352
353 // were we preempted?
354 if (scrbr->get_preemptor().disable_and_test()) { // a test&set
355
356
357 dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
358 return transit<PendingTimer>();
359
360 } else {
361
362 scrbr->maps_compare_n_cleanup();
363 return transit<WaitDigestUpdate>();
364 }
365 } else {
366 return discard_event();
367 }
368}
369
370// ----------------------- WaitDigestUpdate -----------------------------------
371
372WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
373{
374 dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
375 // perform an initial check: maybe we already
376 // have all the updates we need:
377 // (note that DigestUpdate is usually an external event)
378 post_event(DigestUpdate{});
379}
380
381sc::result WaitDigestUpdate::react(const DigestUpdate&)
382{
383 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
384 dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl;
385
386 switch (scrbr->on_digest_updates()) {
387
388 case Scrub::FsmNext::goto_notactive:
389 // scrubbing is done
390 return transit<NotActive>();
391
392 case Scrub::FsmNext::next_chunk:
393 // go get the next chunk
394 return transit<PendingTimer>();
395
396 case Scrub::FsmNext::do_discard:
397 // still waiting for more updates
398 return discard_event();
399 }
400 __builtin_unreachable(); // Prevent a gcc warning.
401 // Adding a phony 'default:' above is wrong: (a) prevents a
402 // warning if FsmNext is extended, and (b) elicits a correct
403 // warning from Clang
404}
405
406ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
407 : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub}
408{
409 dout(15) << "ScrubMachine created " << m_pg_id << dendl;
410}
411
412ScrubMachine::~ScrubMachine() = default;
413
414// -------- for replicas -----------------------------------------------------
415
416// ----------------------- ReplicaWaitUpdates --------------------------------
417
418ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
419{
420 dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
421 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
422 scrbr->on_replica_init();
423}
424
425/*
426 * Triggered externally, by the entity that had an update re pushes
427 */
428sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
429{
430 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
431 dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
432 << scrbr->pending_active_pushes() << dendl;
433
434 if (scrbr->pending_active_pushes() == 0) {
435
436 // done waiting
437 return transit<ActiveReplica>();
438 }
439
440 return discard_event();
441}
442
443/**
444 * the event poster is handling the scrubber reset
445 */
446sc::result ReplicaWaitUpdates::react(const FullReset&)
447{
448 dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
449 return transit<NotActive>();
450}
451
452// ----------------------- ActiveReplica -----------------------------------
453
454ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
455{
456 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
457 dout(10) << "-- state -->> ActiveReplica" << dendl;
458 scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates
459 post_event(SchedReplica{});
460}
461
462sc::result ActiveReplica::react(const SchedReplica&)
463{
464 DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
465 dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
466 << scrbr->get_preemptor().is_preemptable() << dendl;
467
468 if (scrbr->get_preemptor().was_preempted()) {
469 dout(10) << "replica scrub job preempted" << dendl;
470
471 scrbr->send_replica_map(PreemptionNoted::preempted);
472 scrbr->replica_handling_done();
473 return transit<NotActive>();
474 }
475
476 // start or check progress of build_replica_map_chunk()
477
478 auto ret = scrbr->build_replica_map_chunk();
479 dout(15) << "ActiveReplica::react(const SchedReplica&) Ret: " << ret << dendl;
480
481 if (ret == -EINPROGRESS) {
482 // must wait for the backend to finish. No external event source.
483 // build_replica_map_chunk() has already requeued a SchedReplica
484 // event.
485
486 dout(20) << "waiting for the backend..." << dendl;
487 return discard_event();
488 }
489
490 if (ret < 0) {
491 // the existing code ignores this option, treating an error
492 // report as a success.
493 dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
494 << dendl;
495 scrbr->replica_handling_done();
496 return transit<NotActive>();
497 }
498
499
500 // the local map was created. Send it to the primary.
501 scrbr->send_replica_map(PreemptionNoted::no_preemption);
502 scrbr->replica_handling_done();
503 return transit<NotActive>();
504}
505
506/**
507 * the event poster is handling the scrubber reset
508 */
509sc::result ActiveReplica::react(const FullReset&)
510{
511 dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
512 return transit<NotActive>();
513}
514
515} // namespace Scrub