// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#pragma once

#include <atomic>
#include <chrono>
#include <memory>
#include <optional>
#include <vector>

#include "common/RefCountedObj.h"
#include "common/ceph_atomic.h"
#include "osd/osd_types.h"
#include "osd/scrubber_common.h"

#include "utime.h"

class PG;

namespace Scrub {

using namespace ::std::literals;

// possible outcome when trying to select a PG and scrub it
enum class schedule_result_t {
  scrub_initiated,     // successfully started a scrub
  none_ready,          // no pg to scrub
  no_local_resources,  // failure to secure local OSD scrub resource
  already_started,     // failed, as already started scrubbing this pg
  no_such_pg,          // can't find this pg
  bad_pg_state,        // the PG's state does not permit scrubbing (not clean,
                       // not active, etc.)
  preconditions        // failed the preconditions: time of day, configuration,
                       // etc.
};
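
// Illustrative only (not part of the interface): a caller of the scheduling
// code might branch on the outcome along these lines. 'queue' and 'preconds'
// are hypothetical stand-ins for a ScrubQueue instance and its inputs.
//
//   switch (queue.select_pg_and_scrub(preconds)) {
//     case schedule_result_t::scrub_initiated:
//       break;  // a scrub session is now in progress
//     case schedule_result_t::none_ready:
//       break;  // nothing eligible this tick; retry on the next one
//     default:
//       break;  // a transient failure; the job remains queued (or penalized)
//   }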

}  // namespace Scrub

/**
 * the queue of PGs waiting to be scrubbed.
 * Main operations are scheduling/unscheduling a PG to be scrubbed at a certain
 * time.
 *
 * A "penalty" queue maintains those PGs that have failed to reserve the
 * resources of their replicas. The PGs in this list will be reinstated into
 * the scrub queue when all eligible PGs have already been handled, or after a
 * timeout (or if their deadline has passed [[disabled at this time]]).
 */
class ScrubQueue {
 public:
  enum class must_scrub_t { not_mandatory, mandatory };

  enum class qu_state_t {
    not_registered,  // not a primary, thus not considered for scrubbing by
                     // this OSD (also the temporary state when just created)
    registered,      // in either of the two queues ('to_scrub' or 'penalized')
    unregistering    // in the process of being unregistered. Will be finalized
                     // under lock
  };

  ScrubQueue(CephContext* cct, OSDService& osds);

  struct scrub_schedule_t {
    utime_t scheduled_at{};
    utime_t deadline{0, 0};
  };

  struct sched_params_t {
    utime_t proposed_time{};
    double min_interval{0.0};
    double max_interval{0.0};
    must_scrub_t is_must{ScrubQueue::must_scrub_t::not_mandatory};
  };
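
  // Illustrative only: a sketch of how a primary might fill in these
  // scheduling hints. The stamp source and the interval values below are
  // assumptions, not taken from any actual pool configuration.
  //
  //   ScrubQueue::sched_params_t params;
  //   params.proposed_time = info.history.last_scrub_stamp;  // base on last scrub
  //   params.min_interval = 24 * 60 * 60;      // pool scrub_min_interval: one day
  //   params.max_interval = 7 * 24 * 60 * 60;  // pool scrub_max_interval: one week
  //   params.is_must = ScrubQueue::must_scrub_t::not_mandatory;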

  struct ScrubJob final : public RefCountedObject {

    /**
     * a time scheduled for scrub, and a deadline: the scrub could be delayed
     * if the system load is too high (unless the deadline has passed), or if
     * trying to scrub outside of the allowed scrub hours.
     */
    scrub_schedule_t schedule;

    /// pg to be scrubbed
    const spg_t pgid;

    /// the OSD id (for the log)
    const int whoami;

    ceph::atomic<qu_state_t> state{qu_state_t::not_registered};

    /**
     * the old 'is_registered'. Set whenever the job is registered with the
     * OSD, i.e. is in either the 'to_scrub' or the 'penalized' vectors.
     */
    std::atomic_bool in_queues{false};

    /// last scrub attempt failed to secure replica resources
    bool resources_failure{false};

    /**
     * 'updated' is a temporary flag, used to create a barrier after
     * 'sched_time' and 'deadline' (or any other job entry) were modified by
     * a different task.
     * 'updated' also signals the need to move a job back from the penalized
     * queue to the regular one.
     */
    std::atomic_bool updated{false};

    utime_t penalty_timeout{0, 0};

    CephContext* cct;

    ScrubJob(CephContext* cct, const spg_t& pg, int node_id);

    utime_t get_sched_time() const { return schedule.scheduled_at; }

    /**
     * relatively low-cost(*) access to the scrub job's state, to be used in
     * logging.
     *  (*) not a low-cost access on x64 architecture
     */
    std::string_view state_desc() const
    {
      return ScrubQueue::qu_state_text(state.load(std::memory_order_relaxed));
    }

    void update_schedule(const ScrubQueue::scrub_schedule_t& adjusted);

    void dump(ceph::Formatter* f) const;

    /*
     * as the atomic 'in_queues' appears in many log prints, accessing it for
     * display-only should be made less expensive (on ARM; on x86, a _relaxed
     * load produces the same code as a seq_cst one)
     */
    std::string_view registration_state() const
    {
      return in_queues.load(std::memory_order_relaxed) ? " in-queue"
                                                       : " not-queued";
    }
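
    // Illustrative only: both state_desc() and registration_state() are
    // intended for cheap use inside log lines, e.g. (hypothetical call site):
    //
    //   dout(20) << pgid << registration_state() << " (" << state_desc()
    //            << ")" << dendl;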

    /**
     * a text description of the "scheduling intentions" of this PG:
     * are we already scheduled for a scrub/deep scrub? when?
     */
    std::string scheduling_state(utime_t now_is, bool is_deep_expected) const;

    friend std::ostream& operator<<(std::ostream& out, const ScrubJob& pg);
  };

  friend class TestOSDScrub;

  using ScrubJobRef = ceph::ref_t<ScrubJob>;
  using ScrubQContainer = std::vector<ScrubJobRef>;

  static std::string_view qu_state_text(qu_state_t st);

  /**
   * called periodically by the OSD to select the first scrub-eligible PG
   * and scrub it.
   *
   * Selection is affected by:
   * - time of day: scheduled scrubbing might be configured to only happen
   *   during certain hours;
   * - same for days of the week, and for the system load;
   *
   * @param preconds: what types of scrub are allowed, given system status &
   *                  config. Some of the preconditions are calculated here.
   * @return Scrub::schedule_result_t::scrub_initiated if a scrub session was
   *         successfully initiated. Otherwise - the failure cause.
   *
   * locking: locks jobs_lock
   */
  Scrub::schedule_result_t select_pg_and_scrub(Scrub::ScrubPreconds& preconds);
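
  // Illustrative only: a hypothetical periodic caller (e.g. the OSD's scrub
  // tick) might log a failed attempt along these lines. 'preconds' is assumed
  // to have been filled in by the caller.
  //
  //   Scrub::ScrubPreconds preconds;  // allowed scrub types, per config & status
  //   auto res = scrub_queue.select_pg_and_scrub(preconds);
  //   if (res != Scrub::schedule_result_t::scrub_initiated) {
  //     dout(20) << "scrub not started: "
  //              << ScrubQueue::attempt_res_text(res) << dendl;
  //   }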

  /**
   * Translate schedule_result_t values into readable text
   */
  static std::string_view attempt_res_text(Scrub::schedule_result_t v);

  /**
   * remove the pg from the set of PGs to be scanned for scrubbing.
   * To be used if we are no longer the PG's primary, or if the PG is removed.
   */
  void remove_from_osd_queue(ScrubJobRef sjob);

  /**
   * @return the list (not std::set!) of all scrub jobs registered
   *         (apart from PGs in the process of being removed)
   */
  ScrubQContainer list_registered_jobs() const;

  /**
   * Add the scrub job to the list of jobs (i.e. list of PGs) to be
   * periodically scrubbed by the OSD.
   * The registration is active as long as the PG exists and the OSD is its
   * primary.
   *
   * See update_job() for the handling of the 'suggested' parameter.
   *
   * locking: might lock jobs_lock
   */
  void register_with_osd(ScrubJobRef sjob, const sched_params_t& suggested);

  /**
   * modify a scrub-job's scheduled time and deadline
   *
   * There are 3 argument combinations to consider:
   * - 'must' is asserted, and the suggested time is 'scrub_must_stamp':
   *   the registration will be with a "beginning of time" target, making the
   *   scrub-job eligible for immediate scrubbing (given that external
   *   conditions do not prevent scrubbing)
   *
   * - 'must' is asserted, and the suggested time is 'now':
   *   This happens if our stats are unknown. The results are similar to the
   *   previous scenario.
   *
   * - not a 'must': we take the suggested time as a basis, and add to it some
   *   configuration / random delays.
   *
   * ('must' is sched_params_t.is_must)
   *
   * locking: not using the jobs_lock
   */
  void update_job(ScrubJobRef sjob, const sched_params_t& suggested);
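
  // Illustrative only: requesting an operator-initiated ("must") scrub might
  // look like the sketch below. The "beginning of time" stamp value is an
  // assumption; the helper that actually produces 'scrub_must_stamp' is an
  // implementation detail.
  //
  //   ScrubQueue::sched_params_t req;
  //   req.proposed_time = utime_t{0, 1};  // assumed "beginning of time" value
  //   req.is_must = ScrubQueue::must_scrub_t::mandatory;
  //   scrub_queue.update_job(sjob, req);  // job becomes eligible immediately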

 public:
  void dump_scrubs(ceph::Formatter* f) const;

  /**
   * No new scrub session will start while a scrub was initiated on a PG,
   * and that PG is trying to acquire replica resources.
   */
  void set_reserving_now() { a_pg_is_reserving = true; }
  void clear_reserving_now() { a_pg_is_reserving = false; }
  bool is_reserving_now() const { return a_pg_is_reserving; }
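
  // Illustrative only: the expected usage pattern around a reservation
  // attempt (hypothetical call sites):
  //
  //   if (!scrub_queue.is_reserving_now()) {
  //     scrub_queue.set_reserving_now();
  //     // ... ask the replicas for scrub resources ...
  //     // once all replicas have responded (or the attempt failed):
  //     scrub_queue.clear_reserving_now();
  //   }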

  bool can_inc_scrubs() const;
  bool inc_scrubs_local();
  void dec_scrubs_local();
  bool inc_scrubs_remote();
  void dec_scrubs_remote();
  void dump_scrub_reservations(ceph::Formatter* f) const;

  /**
   * Pacing the scrub operation by inserting delays (mostly between chunks)
   *
   * Special handling for regular scrubs that continued into "no scrub" times:
   * scrubbing will continue, but the delays will be controlled by a separate
   * (read: higher-value) configuration element (osd_scrub_extended_sleep).
   */
  double scrub_sleep_time(
    bool must_scrub) const;  /// \todo (future) return milliseconds
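
  // Illustrative only: a hypothetical call site in the scrubber, pacing the
  // handling of chunks:
  //
  //   double sleep_secs = scrub_queue.scrub_sleep_time(flags.required);
  //   // ... queue the next chunk to be handled 'sleep_secs' seconds from now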

  /**
   * called every heartbeat to update the "daily" load average
   *
   * @returns a load value for the logger
   */
  [[nodiscard]] std::optional<double> update_load_average();

 private:
  CephContext* cct;
  OSDService& osd_service;

  /**
   * jobs_lock protects the job containers and the relevant scrub-jobs state
   * variables. Specifically, the following are guaranteed:
   * - 'in_queues' is asserted only if the job is in one of the queues;
   * - a job will only be in state 'registered' if in one of the queues;
   * - no job will be in the two queues simultaneously.
   *
   * Note that PG locks should not be acquired while holding jobs_lock.
   */
  mutable ceph::mutex jobs_lock = ceph::make_mutex("ScrubQueue::jobs_lock");

  ScrubQContainer to_scrub;   ///< scrub jobs (i.e. PGs) to scrub
  ScrubQContainer penalized;  ///< those that failed to reserve remote resources
  bool restore_penalized{false};

  double daily_loadavg{0.0};

  static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::registered;
  };

  static inline constexpr auto invalid_state = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::not_registered;
  };
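
  // Illustrative only: these predicates are meant for use with standard
  // algorithms over the job containers, e.g. (hypothetical):
  //
  //   auto n_registered =
  //     std::count_if(to_scrub.cbegin(), to_scrub.cend(), registered_job);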

  /**
   * Are there scrub jobs that should be reinstated?
   */
  void scan_penalized(bool forgive_all, utime_t time_now);

  /**
   * clear dead entries (unregistered, or belonging to removed PGs) from a
   * queue. Job state is changed to match new status.
   */
  void rm_unregistered_jobs(ScrubQContainer& group);

  /**
   * the set of all scrub jobs in 'group' which are ready to be scrubbed
   * (ready = their scheduled time has passed).
   * The scrub jobs in the new collection are sorted according to
   * their scheduled time.
   *
   * Note that the returned container holds independent refs to the
   * scrub jobs.
   */
  ScrubQContainer collect_ripe_jobs(ScrubQContainer& group, utime_t time_now);
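
  // Illustrative only: a minimal sketch of the documented behavior (an
  // assumed implementation, not necessarily the actual one):
  //
  //   ScrubQContainer ripe;
  //   std::copy_if(group.cbegin(), group.cend(), std::back_inserter(ripe),
  //                [time_now](const auto& j) {
  //                  return j->schedule.scheduled_at <= time_now;
  //                });
  //   std::sort(ripe.begin(), ripe.end(), [](const auto& a, const auto& b) {
  //     return a->schedule.scheduled_at < b->schedule.scheduled_at;
  //   });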

  /// scrub resources management lock (guarding scrubs_local & scrubs_remote)
  mutable ceph::mutex resource_lock =
    ceph::make_mutex("ScrubQueue::resource_lock");

  // the counters used to manage scrub activity parallelism:
  int scrubs_local{0};
  int scrubs_remote{0};

  std::atomic_bool a_pg_is_reserving{false};

  [[nodiscard]] bool scrub_load_below_threshold() const;
  [[nodiscard]] bool scrub_time_permit(utime_t now) const;

  /**
   * If the scrub job was not explicitly requested, we postpone it by some
   * random length of time.
   * If we do delay the scrub, we also calculate, based on pool parameters, a
   * deadline we should scrub before.
   *
   * @return a pair of values: the determined scrub time, and the deadline
   */
  scrub_schedule_t adjust_target_time(
    const sched_params_t& recomputed_params) const;
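
  // Illustrative only: the rough shape of the adjustment, with times handled
  // as seconds for brevity. The jitter source is an assumption (e.g. the
  // standard osd_scrub_interval_randomize_ratio option); the exact formula is
  // a guess, not the actual implementation.
  //
  //   double scheduled = double(recomputed_params.proposed_time) +
  //                      recomputed_params.min_interval +
  //                      jitter;  // e.g. a random fraction of min_interval
  //   double deadline  = double(recomputed_params.proposed_time) +
  //                      recomputed_params.max_interval;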

  /**
   * Look for scrub jobs that have their 'resources_failure' set. These jobs
   * have failed to acquire remote resources the last time we initiated a
   * scrub session on them. They are now moved from the 'to_scrub' queue to
   * the 'penalized' set.
   *
   * locking: called with jobs_lock held
   */
  void move_failed_pgs(utime_t now_is);

  Scrub::schedule_result_t select_from_group(
    ScrubQContainer& group,
    const Scrub::ScrubPreconds& preconds,
    utime_t now_is);
};