]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <atomic> | |
7 | #include <chrono> | |
8 | #include <memory> | |
9 | #include <optional> | |
10 | #include <vector> | |
11 | ||
12 | #include "common/RefCountedObj.h" | |
13 | #include "common/ceph_atomic.h" | |
14 | #include "osd/osd_types.h" | |
15 | #include "osd/scrubber_common.h" | |
16 | ||
17 | #include "utime.h" | |
18 | ||
19 | class PG; | |
20 | ||
namespace Scrub {

using namespace ::std::literals;

/// possible outcome when trying to select a PG and scrub it
enum class schedule_result_t {
  scrub_initiated,     // successfully started a scrub
  none_ready,          // no pg to scrub
  no_local_resources,  // failure to secure local OSD scrub resource
  already_started,     // failed, as already started scrubbing this pg
  no_such_pg,          // can't find this pg
  bad_pg_state,        // pg state (clean, active, etc.)
  preconditions        // time, configuration, etc.
};

}  // namespace Scrub
37 | ||
/**
 * the queue of PGs waiting to be scrubbed.
 * Main operations are scheduling/unscheduling a PG to be scrubbed at a certain
 * time.
 *
 * A "penalty" queue maintains those PGs that have failed to reserve the
 * resources of their replicas. The PGs in this list will be reinstated into
 * the scrub queue when all eligible PGs were already handled, or after a
 * timeout (or if their deadline has passed [[disabled at this time]]).
 */
class ScrubQueue {
 public:
  /// was the scrub requested explicitly (mandatory) or is it periodic?
  enum class must_scrub_t { not_mandatory, mandatory };

  /// the registration state of a scrub-job w.r.t. this OSD's queues
  enum class qu_state_t {
    not_registered,  // not a primary, thus not considered for scrubbing by
                     // this OSD (also the temporary state when just created)
    registered,      // in either of the two queues ('to_scrub' or 'penalized')
    unregistering    // in the process of being unregistered. Will be finalized
                     // under lock
  };

  ScrubQueue(CephContext* cct, OSDService& osds);

  /// a scheduled scrub time, plus the deadline past which delays are no
  /// longer acceptable
  struct scrub_schedule_t {
    utime_t scheduled_at{};
    utime_t deadline{0, 0};
  };

  /// the input used when (re)computing a scrub-job's schedule
  struct sched_params_t {
    utime_t proposed_time{};
    double min_interval{0.0};
    double max_interval{0.0};
    must_scrub_t is_must{ScrubQueue::must_scrub_t::not_mandatory};
  };

  /// a single PG's scrub scheduling entry, ref-counted and shared between
  /// the PG and the OSD's scrub queue
  struct ScrubJob final : public RefCountedObject {

    /**
     * a time scheduled for scrub, and a deadline: The scrub could be delayed
     * if system load is too high (but not if after the deadline), or if
     * trying to scrub out of scrub hours.
     */
    scrub_schedule_t schedule;

    /// pg to be scrubbed
    const spg_t pgid;

    /// the OSD id (for the log)
    const int whoami;

    /// registration state; atomic, as it is read/written by different tasks
    ceph::atomic<qu_state_t> state{qu_state_t::not_registered};

    /**
     * the old 'is_registered'. Set whenever the job is registered with the
     * OSD, i.e. is in either the 'to_scrub' or the 'penalized' vectors.
     */
    std::atomic_bool in_queues{false};

    /// last scrub attempt failed to secure replica resources
    bool resources_failure{false};

    /**
     * 'updated' is a temporary flag, used to create a barrier after
     * 'sched_time' and 'deadline' (or any other job entry) were modified by
     * different task.
     * 'updated' also signals the need to move a job back from the penalized
     * queue to the regular one.
     */
    std::atomic_bool updated{false};

    /// when (if penalized) the penalty expires and the job may be reinstated
    utime_t penalty_timeout{0, 0};

    CephContext* cct;

    ScrubJob(CephContext* cct, const spg_t& pg, int node_id);

    /// the time this job is scheduled to be scrubbed at
    utime_t get_sched_time() const { return schedule.scheduled_at; }

    /**
     * relatively low-cost(*) access to the scrub job's state, to be used in
     * logging.
     * (*) not a low-cost access on x64 architecture
     */
    std::string_view state_desc() const
    {
      return ScrubQueue::qu_state_text(state.load(std::memory_order_relaxed));
    }

    /// replace the job's schedule (scheduled time + deadline) with 'adjusted'
    void update_schedule(const ScrubQueue::scrub_schedule_t& adjusted);

    void dump(ceph::Formatter* f) const;

    /*
     * as the atomic 'in_queues' appears in many log prints, accessing it for
     * display-only should be made less expensive (on ARM. On x86 the _relaxed
     * produces the same code as '_cs')
     */
    std::string_view registration_state() const
    {
      return in_queues.load(std::memory_order_relaxed) ? " in-queue"
							: " not-queued";
    }

    /**
     * a text description of the "scheduling intentions" of this PG:
     * are we already scheduled for a scrub/deep scrub? when?
     */
    std::string scheduling_state(utime_t now_is, bool is_deep_expected) const;

    friend std::ostream& operator<<(std::ostream& out, const ScrubJob& pg);
  };

  friend class TestOSDScrub;

  using ScrubJobRef = ceph::ref_t<ScrubJob>;
  using ScrubQContainer = std::vector<ScrubJobRef>;

  /// translate a qu_state_t value into readable text (for logging)
  static std::string_view qu_state_text(qu_state_t st);

  /**
   * called periodically by the OSD to select the first scrub-eligible PG
   * and scrub it.
   *
   * Selection is affected by:
   * - time of day: scheduled scrubbing might be configured to only happen
   *   during certain hours;
   * - same for days of the week, and for the system load;
   *
   * @param preconds: what types of scrub are allowed, given system status &
   *                  config. Some of the preconditions are calculated here.
   * @return Scrub::attempt_t::scrubbing if a scrub session was successfully
   *         initiated. Otherwise - the failure cause.
   *
   * locking: locks jobs_lock
   */
  Scrub::schedule_result_t select_pg_and_scrub(Scrub::ScrubPreconds& preconds);

  /**
   * Translate attempt_ values into readable text
   */
  static std::string_view attempt_res_text(Scrub::schedule_result_t v);

  /**
   * remove the pg from set of PGs to be scanned for scrubbing.
   * To be used if we are no longer the PG's primary, or if the PG is removed.
   */
  void remove_from_osd_queue(ScrubJobRef sjob);

  /**
   * @return the list (not std::set!) of all scrub jobs registered
   *   (apart from PGs in the process of being removed)
   */
  ScrubQContainer list_registered_jobs() const;

  /**
   * Add the scrub job to the list of jobs (i.e. list of PGs) to be
   * periodically scrubbed by the OSD.
   * The registration is active as long as the PG exists and the OSD is its
   * primary.
   *
   * See update_job() for the handling of the 'suggested' parameter.
   *
   * locking: might lock jobs_lock
   */
  void register_with_osd(ScrubJobRef sjob, const sched_params_t& suggested);

  /**
   * modify a scrub-job's scheduled time and deadline
   *
   * There are 3 argument combinations to consider:
   * - 'must' is asserted, and the suggested time is 'scrub_must_stamp':
   *   the registration will be with "beginning of time" target, making the
   *   scrub-job eligible to immediate scrub (given that external conditions
   *   do not prevent scrubbing)
   *
   * - 'must' is asserted, and the suggested time is 'now':
   *   This happens if our stats are unknown. The results are similar to the
   *   previous scenario.
   *
   * - not a 'must': we take the suggested time as a basis, and add to it some
   *   configuration / random delays.
   *
   * ('must' is sched_params_t.is_must)
   *
   * locking: not using the jobs_lock
   */
  void update_job(ScrubJobRef sjob, const sched_params_t& suggested);

 public:
  /// dump the set of registered scrub jobs (for 'ceph pg dump' etc.)
  void dump_scrubs(ceph::Formatter* f) const;

  /**
   * No new scrub session will start while a scrub was initiated on a PG,
   * and that PG is trying to acquire replica resources.
   */
  void set_reserving_now() { a_pg_is_reserving = true; }
  void clear_reserving_now() { a_pg_is_reserving = false; }
  bool is_reserving_now() const { return a_pg_is_reserving; }

  // management of the local/remote scrub-reservation counters
  bool can_inc_scrubs() const;
  bool inc_scrubs_local();
  void dec_scrubs_local();
  bool inc_scrubs_remote();
  void dec_scrubs_remote();
  void dump_scrub_reservations(ceph::Formatter* f) const;

  /**
   * Pacing the scrub operation by inserting delays (mostly between chunks)
   *
   * Special handling for regular scrubs that continued into "no scrub" times.
   * Scrubbing will continue, but the delays will be controlled by a separate
   * (read - with higher value) configuration element
   * (osd_scrub_extended_sleep).
   */
  double scrub_sleep_time(
    bool must_scrub) const;  /// \todo (future) return milliseconds

  /**
   * called every heartbeat to update the "daily" load average
   *
   * @returns a load value for the logger
   */
  [[nodiscard]] std::optional<double> update_load_average();

 private:
  CephContext* cct;
  OSDService& osd_service;

  /**
   * jobs_lock protects the job containers and the relevant scrub-jobs state
   * variables. Specifically, the following are guaranteed:
   * - 'in_queues' is asserted only if the job is in one of the queues;
   * - a job will only be in state 'registered' if in one of the queues;
   * - no job will be in the two queues simultaneously
   *
   * Note that PG locks should not be acquired while holding jobs_lock.
   */
  mutable ceph::mutex jobs_lock = ceph::make_mutex("ScrubQueue::jobs_lock");

  ScrubQContainer to_scrub;   ///< scrub jobs (i.e. PGs) to scrub
  ScrubQContainer penalized;  ///< those that failed to reserve remote resources
  bool restore_penalized{false};

  /// the "daily" load average, updated by update_load_average()
  double daily_loadavg{0.0};

  /// predicate: the job is registered (i.e. in one of the two queues)
  static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::registered;
  };

  /// predicate: the job is in the 'not_registered' state
  static inline constexpr auto invalid_state = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::not_registered;
  };

  /**
   * Are there scrub jobs that should be reinstated?
   */
  void scan_penalized(bool forgive_all, utime_t time_now);

  /**
   * clear dead entries (unregistered, or belonging to removed PGs) from a
   * queue. Job state is changed to match new status.
   */
  void rm_unregistered_jobs(ScrubQContainer& group);

  /**
   * the set of all scrub jobs in 'group' which are ready to be scrubbed
   * (ready = their scheduled time has passed).
   * The scrub jobs in the new collection are sorted according to
   * their scheduled time.
   *
   * Note that the returned container holds independent refs to the
   * scrub jobs.
   */
  ScrubQContainer collect_ripe_jobs(ScrubQContainer& group, utime_t time_now);


  /// scrub resources management lock (guarding scrubs_local & scrubs_remote)
  mutable ceph::mutex resource_lock =
    ceph::make_mutex("ScrubQueue::resource_lock");

  // the counters used to manage scrub activity parallelism:
  int scrubs_local{0};
  int scrubs_remote{0};

  /// see set_reserving_now() above
  std::atomic_bool a_pg_is_reserving{false};

  [[nodiscard]] bool scrub_load_below_threshold() const;
  [[nodiscard]] bool scrub_time_permit(utime_t now) const;

  /**
   * If the scrub job was not explicitly requested, we postpone it by some
   * random length of time.
   * And if delaying the scrub - we calculate, based on pool parameters, a
   * deadline we should scrub before.
   *
   * @return a pair of values: the determined scrub time, and the deadline
   */
  scrub_schedule_t adjust_target_time(
    const sched_params_t& recomputed_params) const;

  /**
   * Look for scrub jobs that have their 'resources_failure' set. These jobs
   * have failed to acquire remote resources last time we've initiated a scrub
   * session on them. They are now moved from the 'to_scrub' queue to the
   * 'penalized' set.
   *
   * locking: called with job_lock held
   */
  void move_failed_pgs(utime_t now_is);

  /// try to initiate a scrub on one of the (ripe) jobs in 'group'
  Scrub::schedule_result_t select_from_group(ScrubQContainer& group,
					     const Scrub::ScrubPreconds& preconds,
					     utime_t now_is);
};