]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include "scrub_machine.h" | |
5 | ||
6 | #include <chrono> | |
7 | #include <typeinfo> | |
8 | ||
9 | #include <boost/core/demangle.hpp> | |
10 | ||
11 | #include "OSD.h" | |
12 | #include "OpRequest.h" | |
13 | #include "ScrubStore.h" | |
14 | #include "scrub_machine_lstnr.h" | |
15 | ||
16 | #define dout_context g_ceph_context | |
17 | #define dout_subsys ceph_subsys_osd | |
18 | #undef dout_prefix | |
19 | #define dout_prefix *_dout << " scrubberFSM " | |
20 | ||
21 | using namespace std::chrono; | |
22 | using namespace std::chrono_literals; | |
23 | namespace sc = boost::statechart; | |
24 | ||
// Convenience aliases for the state constructors and reaction handlers:
//  - 'scrbr': the ScrubMachineListener pointer held by the enclosing
//             ScrubMachine (m_scrbr)
//  - 'pg_id': a copy of the machine's m_pg_id
// Each alias is assigned to std::ignore, so a handler that uses only one
// of them does not trigger an unused-variable warning.
#define DECLARE_LOCALS                                           \
  ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
  std::ignore = scrbr;                                           \
  auto pg_id = context<ScrubMachine>().m_pg_id;                  \
  std::ignore = pg_id;
30 | ||
31 | namespace Scrub { | |
32 | ||
33 | // --------- trace/debug auxiliaries ------------------------------- | |
34 | ||
35 | void on_event_creation(std::string_view nm) | |
36 | { | |
37 | dout(20) << " event: --vvvv---- " << nm << dendl; | |
38 | } | |
39 | ||
40 | void on_event_discard(std::string_view nm) | |
41 | { | |
42 | dout(20) << " event: --^^^^---- " << nm << dendl; | |
43 | } | |
44 | ||
45 | void ScrubMachine::my_states() const | |
46 | { | |
47 | for (auto si = state_begin(); si != state_end(); ++si) { | |
48 | const auto& siw{*si}; // prevents a warning re side-effects | |
49 | dout(20) << " state: " << boost::core::demangle(typeid(siw).name()) << dendl; | |
50 | } | |
51 | } | |
52 | ||
53 | void ScrubMachine::assert_not_active() const | |
54 | { | |
55 | ceph_assert(state_cast<const NotActive*>()); | |
56 | } | |
57 | ||
58 | bool ScrubMachine::is_reserving() const | |
59 | { | |
60 | return state_cast<const ReservingReplicas*>(); | |
61 | } | |
62 | ||
63 | // for the rest of the code in this file - we know what PG we are dealing with: | |
64 | #undef dout_prefix | |
65 | #define dout_prefix _prefix(_dout, this->context<ScrubMachine>().m_pg) | |
/**
 * Emits the log-line prefix used by the 'dout_prefix' definition above.
 *
 * Writes whatever t->gen_prefix() produces into '*_dout', followed by
 * " scrubberFSM pg(<t->pg_id>) ".
 *
 * @param _dout the stream to write to (dereferenced unconditionally)
 * @param t     an object providing gen_prefix(std::ostream&) and a
 *              streamable 'pg_id' member (dereferenced unconditionally)
 * @return the stream resulting from the chained insertions
 *
 * Note: the return type is spelled 'std::ostream&'. An unqualified 'ostream'
 * compiles only when some included header happens to inject the name from
 * namespace std into the enclosing scope.
 */
template <class T> static std::ostream& _prefix(std::ostream* _dout, T* t)
{
  return t->gen_prefix(*_dout) << " scrubberFSM pg(" << t->pg_id << ") ";
}
70 | ||
71 | // ////////////// the actual actions | |
72 | ||
73 | // ----------------------- NotActive ----------------------------------------- | |
74 | ||
75 | NotActive::NotActive(my_context ctx) : my_base(ctx) | |
76 | { | |
77 | dout(10) << "-- state -->> NotActive" << dendl; | |
78 | } | |
79 | ||
80 | // ----------------------- ReservingReplicas --------------------------------- | |
81 | ||
82 | ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) | |
83 | { | |
84 | dout(10) << "-- state -->> ReservingReplicas" << dendl; | |
85 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
86 | scrbr->reserve_replicas(); | |
87 | } | |
88 | ||
89 | sc::result ReservingReplicas::react(const ReservationFailure&) | |
90 | { | |
91 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
92 | dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl; | |
93 | ||
94 | // the Scrubber must release all resources and abort the scrubbing | |
95 | scrbr->clear_pgscrub_state(); | |
96 | return transit<NotActive>(); | |
97 | } | |
98 | ||
99 | /** | |
100 | * note: the event poster is handling the scrubber reset | |
101 | */ | |
102 | sc::result ReservingReplicas::react(const FullReset&) | |
103 | { | |
104 | dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl; | |
105 | return transit<NotActive>(); | |
106 | } | |
107 | ||
108 | // ----------------------- ActiveScrubbing ----------------------------------- | |
109 | ||
110 | ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx) | |
111 | { | |
112 | dout(10) << "-- state -->> ActiveScrubbing" << dendl; | |
113 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
114 | scrbr->on_init(); | |
115 | } | |
116 | ||
117 | /** | |
118 | * upon exiting the Active state | |
119 | */ | |
120 | ActiveScrubbing::~ActiveScrubbing() | |
121 | { | |
122 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
123 | dout(15) << __func__ << dendl; | |
124 | scrbr->unreserve_replicas(); | |
125 | } | |
126 | ||
127 | /* | |
128 | * The only source of an InternalError event as of now is the BuildMap state, | |
129 | * when encountering a backend error. | |
130 | * We kill the scrub and reset the FSM. | |
131 | */ | |
132 | sc::result ActiveScrubbing::react(const InternalError&) | |
133 | { | |
134 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
135 | dout(10) << __func__ << dendl; | |
136 | scrbr->clear_pgscrub_state(); | |
137 | return transit<NotActive>(); | |
138 | } | |
139 | ||
140 | sc::result ActiveScrubbing::react(const FullReset&) | |
141 | { | |
142 | dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl; | |
143 | // caller takes care of clearing the scrubber & FSM states | |
144 | return transit<NotActive>(); | |
145 | } | |
146 | ||
147 | // ----------------------- RangeBlocked ----------------------------------- | |
148 | ||
149 | /* | |
150 | * Blocked. Will be released by kick_object_context_blocked() (or upon | |
151 | * an abort) | |
152 | */ | |
153 | RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) | |
154 | { | |
155 | dout(10) << "-- state -->> Act/RangeBlocked" << dendl; | |
156 | } | |
157 | ||
158 | // ----------------------- PendingTimer ----------------------------------- | |
159 | ||
160 | /** | |
161 | * Sleeping till timer reactivation - or just requeuing | |
162 | */ | |
163 | PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) | |
164 | { | |
165 | dout(10) << "-- state -->> Act/PendingTimer" << dendl; | |
166 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
167 | ||
168 | scrbr->add_delayed_scheduling(); | |
169 | } | |
170 | ||
171 | // ----------------------- NewChunk ----------------------------------- | |
172 | ||
173 | /** | |
174 | * Preconditions: | |
175 | * - preemption data was set | |
176 | * - epoch start was updated | |
177 | */ | |
178 | NewChunk::NewChunk(my_context ctx) : my_base(ctx) | |
179 | { | |
180 | dout(10) << "-- state -->> Act/NewChunk" << dendl; | |
181 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
182 | ||
183 | scrbr->get_preemptor().adjust_parameters(); | |
184 | ||
185 | // choose range to work on | |
186 | bool got_a_chunk = scrbr->select_range(); | |
187 | if (got_a_chunk) { | |
188 | dout(15) << __func__ << " selection OK" << dendl; | |
189 | post_event(SelectedChunkFree{}); | |
190 | } else { | |
191 | dout(10) << __func__ << " selected chunk is busy" << dendl; | |
192 | // wait until we are available (transitioning to Blocked) | |
193 | post_event(ChunkIsBusy{}); | |
194 | } | |
195 | } | |
196 | ||
197 | sc::result NewChunk::react(const SelectedChunkFree&) | |
198 | { | |
199 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
200 | dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl; | |
201 | ||
202 | scrbr->set_subset_last_update(scrbr->search_log_for_updates()); | |
203 | return transit<WaitPushes>(); | |
204 | } | |
205 | ||
206 | // ----------------------- WaitPushes ----------------------------------- | |
207 | ||
208 | WaitPushes::WaitPushes(my_context ctx) : my_base(ctx) | |
209 | { | |
210 | dout(10) << " -- state -->> Act/WaitPushes" << dendl; | |
211 | post_event(ActivePushesUpd{}); | |
212 | } | |
213 | ||
214 | /* | |
215 | * Triggered externally, by the entity that had an update re pushes | |
216 | */ | |
217 | sc::result WaitPushes::react(const ActivePushesUpd&) | |
218 | { | |
219 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
220 | dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: " | |
221 | << scrbr->pending_active_pushes() << dendl; | |
222 | ||
223 | if (!scrbr->pending_active_pushes()) { | |
224 | // done waiting | |
225 | return transit<WaitLastUpdate>(); | |
226 | } | |
227 | ||
228 | return discard_event(); | |
229 | } | |
230 | ||
231 | // ----------------------- WaitLastUpdate ----------------------------------- | |
232 | ||
233 | WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx) | |
234 | { | |
235 | dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl; | |
236 | post_event(UpdatesApplied{}); | |
237 | } | |
238 | ||
239 | void WaitLastUpdate::on_new_updates(const UpdatesApplied&) | |
240 | { | |
241 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
242 | dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl; | |
243 | ||
244 | if (scrbr->has_pg_marked_new_updates()) { | |
245 | post_event(InternalAllUpdates{}); | |
246 | } else { | |
247 | // will be requeued by op_applied | |
248 | dout(10) << "wait for EC read/modify/writes to queue" << dendl; | |
249 | } | |
250 | } | |
251 | ||
252 | /* | |
253 | * request maps from the replicas in the acting set | |
254 | */ | |
255 | sc::result WaitLastUpdate::react(const InternalAllUpdates&) | |
256 | { | |
257 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
258 | dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl; | |
259 | ||
260 | scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable()); | |
261 | return transit<BuildMap>(); | |
262 | } | |
263 | ||
264 | // ----------------------- BuildMap ----------------------------------- | |
265 | ||
266 | BuildMap::BuildMap(my_context ctx) : my_base(ctx) | |
267 | { | |
268 | dout(10) << " -- state -->> Act/BuildMap" << dendl; | |
269 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
270 | ||
271 | // no need to check for an epoch change, as all possible flows that brought us here have | |
272 | // a check_interval() verification of their final event. | |
273 | ||
274 | if (scrbr->get_preemptor().was_preempted()) { | |
275 | ||
276 | // we were preempted, either directly or by a replica | |
277 | dout(10) << __func__ << " preempted!!!" << dendl; | |
278 | scrbr->mark_local_map_ready(); | |
279 | post_event(IntBmPreempted{}); | |
280 | ||
281 | } else { | |
282 | ||
283 | auto ret = scrbr->build_primary_map_chunk(); | |
284 | ||
285 | if (ret == -EINPROGRESS) { | |
286 | // must wait for the backend to finish. No specific event provided. | |
287 | // build_primary_map_chunk() has already requeued us. | |
288 | dout(20) << "waiting for the backend..." << dendl; | |
289 | ||
290 | } else if (ret < 0) { | |
291 | ||
292 | dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl; | |
293 | // scrbr->mark_local_map_ready(); | |
294 | post_event(InternalError{}); | |
295 | ||
296 | } else { | |
297 | ||
298 | // the local map was created | |
299 | post_event(IntLocalMapDone{}); | |
300 | } | |
301 | } | |
302 | } | |
303 | ||
304 | sc::result BuildMap::react(const IntLocalMapDone&) | |
305 | { | |
306 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
307 | dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl; | |
308 | ||
309 | scrbr->mark_local_map_ready(); | |
310 | return transit<WaitReplicas>(); | |
311 | } | |
312 | ||
313 | // ----------------------- DrainReplMaps ----------------------------------- | |
314 | ||
315 | DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx) | |
316 | { | |
317 | dout(10) << "-- state -->> Act/DrainReplMaps" << dendl; | |
318 | // we may have received all maps already. Send the event that will make us check. | |
319 | post_event(GotReplicas{}); | |
320 | } | |
321 | ||
322 | sc::result DrainReplMaps::react(const GotReplicas&) | |
323 | { | |
324 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
325 | dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl; | |
326 | ||
327 | if (scrbr->are_all_maps_available()) { | |
328 | // NewChunk will handle the preemption that brought us to this state | |
329 | return transit<PendingTimer>(); | |
330 | } | |
331 | ||
332 | dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: " | |
333 | << scrbr->dump_awaited_maps() << dendl; | |
334 | return discard_event(); | |
335 | } | |
336 | ||
337 | // ----------------------- WaitReplicas ----------------------------------- | |
338 | ||
339 | WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx) | |
340 | { | |
341 | dout(10) << "-- state -->> Act/WaitReplicas" << dendl; | |
342 | post_event(GotReplicas{}); | |
343 | } | |
344 | ||
345 | sc::result WaitReplicas::react(const GotReplicas&) | |
346 | { | |
347 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
348 | dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl; | |
349 | ||
350 | if (scrbr->are_all_maps_available()) { | |
351 | dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl; | |
352 | ||
353 | // were we preempted? | |
354 | if (scrbr->get_preemptor().disable_and_test()) { // a test&set | |
355 | ||
356 | ||
357 | dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl; | |
358 | return transit<PendingTimer>(); | |
359 | ||
360 | } else { | |
361 | ||
362 | scrbr->maps_compare_n_cleanup(); | |
363 | return transit<WaitDigestUpdate>(); | |
364 | } | |
365 | } else { | |
366 | return discard_event(); | |
367 | } | |
368 | } | |
369 | ||
370 | // ----------------------- WaitDigestUpdate ----------------------------------- | |
371 | ||
372 | WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx) | |
373 | { | |
374 | dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl; | |
375 | // perform an initial check: maybe we already | |
376 | // have all the updates we need: | |
377 | // (note that DigestUpdate is usually an external event) | |
378 | post_event(DigestUpdate{}); | |
379 | } | |
380 | ||
381 | sc::result WaitDigestUpdate::react(const DigestUpdate&) | |
382 | { | |
383 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
384 | dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl; | |
385 | ||
386 | switch (scrbr->on_digest_updates()) { | |
387 | ||
388 | case Scrub::FsmNext::goto_notactive: | |
389 | // scrubbing is done | |
390 | return transit<NotActive>(); | |
391 | ||
392 | case Scrub::FsmNext::next_chunk: | |
393 | // go get the next chunk | |
394 | return transit<PendingTimer>(); | |
395 | ||
396 | case Scrub::FsmNext::do_discard: | |
397 | // still waiting for more updates | |
398 | return discard_event(); | |
399 | } | |
400 | __builtin_unreachable(); // Prevent a gcc warning. | |
401 | // Adding a phony 'default:' above is wrong: (a) prevents a | |
402 | // warning if FsmNext is extended, and (b) elicits a correct | |
403 | // warning from Clang | |
404 | } | |
405 | ||
406 | ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub) | |
407 | : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub} | |
408 | { | |
409 | dout(15) << "ScrubMachine created " << m_pg_id << dendl; | |
410 | } | |
411 | ||
412 | ScrubMachine::~ScrubMachine() = default; | |
413 | ||
414 | // -------- for replicas ----------------------------------------------------- | |
415 | ||
416 | // ----------------------- ReplicaWaitUpdates -------------------------------- | |
417 | ||
418 | ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx) | |
419 | { | |
420 | dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl; | |
421 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
422 | scrbr->on_replica_init(); | |
423 | } | |
424 | ||
425 | /* | |
426 | * Triggered externally, by the entity that had an update re pushes | |
427 | */ | |
428 | sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&) | |
429 | { | |
430 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
431 | dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): " | |
432 | << scrbr->pending_active_pushes() << dendl; | |
433 | ||
434 | if (scrbr->pending_active_pushes() == 0) { | |
435 | ||
436 | // done waiting | |
437 | return transit<ActiveReplica>(); | |
438 | } | |
439 | ||
440 | return discard_event(); | |
441 | } | |
442 | ||
443 | /** | |
444 | * the event poster is handling the scrubber reset | |
445 | */ | |
446 | sc::result ReplicaWaitUpdates::react(const FullReset&) | |
447 | { | |
448 | dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl; | |
449 | return transit<NotActive>(); | |
450 | } | |
451 | ||
452 | // ----------------------- ActiveReplica ----------------------------------- | |
453 | ||
454 | ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx) | |
455 | { | |
456 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
457 | dout(10) << "-- state -->> ActiveReplica" << dendl; | |
458 | scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates | |
459 | post_event(SchedReplica{}); | |
460 | } | |
461 | ||
462 | sc::result ActiveReplica::react(const SchedReplica&) | |
463 | { | |
464 | DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases | |
465 | dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? " | |
466 | << scrbr->get_preemptor().is_preemptable() << dendl; | |
467 | ||
468 | if (scrbr->get_preemptor().was_preempted()) { | |
469 | dout(10) << "replica scrub job preempted" << dendl; | |
470 | ||
471 | scrbr->send_replica_map(PreemptionNoted::preempted); | |
472 | scrbr->replica_handling_done(); | |
473 | return transit<NotActive>(); | |
474 | } | |
475 | ||
476 | // start or check progress of build_replica_map_chunk() | |
477 | ||
478 | auto ret = scrbr->build_replica_map_chunk(); | |
479 | dout(15) << "ActiveReplica::react(const SchedReplica&) Ret: " << ret << dendl; | |
480 | ||
481 | if (ret == -EINPROGRESS) { | |
482 | // must wait for the backend to finish. No external event source. | |
483 | // build_replica_map_chunk() has already requeued a SchedReplica | |
484 | // event. | |
485 | ||
486 | dout(20) << "waiting for the backend..." << dendl; | |
487 | return discard_event(); | |
488 | } | |
489 | ||
490 | if (ret < 0) { | |
491 | // the existing code ignores this option, treating an error | |
492 | // report as a success. | |
493 | dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret | |
494 | << dendl; | |
495 | scrbr->replica_handling_done(); | |
496 | return transit<NotActive>(); | |
497 | } | |
498 | ||
499 | ||
500 | // the local map was created. Send it to the primary. | |
501 | scrbr->send_replica_map(PreemptionNoted::no_preemption); | |
502 | scrbr->replica_handling_done(); | |
503 | return transit<NotActive>(); | |
504 | } | |
505 | ||
506 | /** | |
507 | * the event poster is handling the scrubber reset | |
508 | */ | |
509 | sc::result ActiveReplica::react(const FullReset&) | |
510 | { | |
511 | dout(10) << "ActiveReplica::react(const FullReset&)" << dendl; | |
512 | return transit<NotActive>(); | |
513 | } | |
514 | ||
515 | } // namespace Scrub |