]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #pragma once | |
5 | ||
6 | #include <atomic> | |
7 | #include <chrono> | |
8 | #include <memory> | |
9 | #include <optional> | |
10 | #include <vector> | |
11 | ||
12 | #include "common/RefCountedObj.h" | |
13 | #include "common/ceph_atomic.h" | |
14 | #include "osd/osd_types.h" | |
15 | #include "osd/scrubber_common.h" | |
16 | ||
17 | #include "utime.h" | |
18 | ||
19 | class PG; | |
20 | ||
namespace Scrub {

using namespace ::std::literals;

/// possible outcome when trying to select a PG and scrub it
enum class schedule_result_t {
  scrub_initiated,     // successfully started a scrub
  none_ready,          // no pg to scrub
  no_local_resources,  // failure to secure local OSD scrub resource
  already_started,     // failed, as already started scrubbing this pg
  no_such_pg,          // can't find this pg
  bad_pg_state,        // pg state (clean, active, etc.)
  preconditions        // time, configuration, etc.
};

}  // namespace Scrub
37 | ||
/**
 * the queue of PGs waiting to be scrubbed.
 * Main operations are scheduling/unscheduling a PG to be scrubbed at a certain
 * time.
 *
 * A "penalty" queue maintains those PGs that have failed to reserve the
 * resources of their replicas. The PGs in this list will be reinstated into
 * the scrub queue when all eligible PGs were already handled, or after a
 * timeout (or if their deadline has passed [[disabled at this time]]).
 */
class ScrubQueue {
 public:
  /// was the scrub requested explicitly (mandatory) or is it periodic?
  enum class must_scrub_t { not_mandatory, mandatory };

  /// the registration state of a scrub-job w.r.t. this OSD's queues
  enum class qu_state_t {
    not_registered,  // not a primary, thus not considered for scrubbing by
                     // this OSD (also the temporary state when just created)
    registered,      // in either of the two queues ('to_scrub' or 'penalized')
    unregistering    // in the process of being unregistered. Will be finalized
                     // under lock
  };

  ScrubQueue(CephContext* cct, OSDService& osds);

  /// a scheduled scrub time, plus the deadline past which delays are no
  /// longer acceptable
  struct scrub_schedule_t {
    utime_t scheduled_at{};
    utime_t deadline{0, 0};
  };

  /// the input used when (re)computing a scrub-job's schedule
  struct sched_params_t {
    utime_t proposed_time{};
    double min_interval{0.0};
    double max_interval{0.0};
    must_scrub_t is_must{ScrubQueue::must_scrub_t::not_mandatory};
  };

  /// a single PG's scrub scheduling entry, ref-counted and shared between
  /// the PG and the OSD's scrub queue
  struct ScrubJob final : public RefCountedObject {

    /**
     * a time scheduled for scrub, and a deadline: The scrub could be delayed
     * if system load is too high (but not if after the deadline), or if
     * trying to scrub out of scrub hours.
     */
    scrub_schedule_t schedule;

    /// pg to be scrubbed
    const spg_t pgid;

    /// the OSD id (for the log)
    const int whoami;

    /// registration state; atomic, as it is read/written by different tasks
    ceph::atomic<qu_state_t> state{qu_state_t::not_registered};

    /**
     * the old 'is_registered'. Set whenever the job is registered with the
     * OSD, i.e. is in either the 'to_scrub' or the 'penalized' vectors.
     */
    std::atomic_bool in_queues{false};

    /// last scrub attempt failed to secure replica resources
    bool resources_failure{false};

    /**
     * 'updated' is a temporary flag, used to create a barrier after
     * 'sched_time' and 'deadline' (or any other job entry) were modified by
     * different task.
     * 'updated' also signals the need to move a job back from the penalized
     * queue to the regular one.
     */
    std::atomic_bool updated{false};

    /// when (if penalized) the penalty expires and the job may be reinstated
    utime_t penalty_timeout{0, 0};

    CephContext* cct;

    ScrubJob(CephContext* cct, const spg_t& pg, int node_id);

    /// the time this job is scheduled to be scrubbed at
    utime_t get_sched_time() const { return schedule.scheduled_at; }

    /**
     * relatively low-cost(*) access to the scrub job's state, to be used in
     * logging.
     * (*) not a low-cost access on x64 architecture
     */
    std::string_view state_desc() const
    {
      return ScrubQueue::qu_state_text(state.load(std::memory_order_relaxed));
    }

    /// replace the job's schedule (scheduled time + deadline) with 'adjusted'
    void update_schedule(const ScrubQueue::scrub_schedule_t& adjusted);

    void dump(ceph::Formatter* f) const;

    /*
     * as the atomic 'in_queues' appears in many log prints, accessing it for
     * display-only should be made less expensive (on ARM. On x86 the _relaxed
     * produces the same code as '_cs')
     */
    std::string_view registration_state() const
    {
      return in_queues.load(std::memory_order_relaxed) ? " in-queue"
							: " not-queued";
    }

    /**
     * a text description of the "scheduling intentions" of this PG:
     * are we already scheduled for a scrub/deep scrub? when?
     */
    std::string scheduling_state(utime_t now_is, bool is_deep_expected) const;

    friend std::ostream& operator<<(std::ostream& out, const ScrubJob& pg);
  };

  friend class TestOSDScrub;

  using ScrubJobRef = ceph::ref_t<ScrubJob>;
  using ScrubQContainer = std::vector<ScrubJobRef>;

  /// translate a qu_state_t value into readable text (for logging)
  static std::string_view qu_state_text(qu_state_t st);

  /**
   * called periodically by the OSD to select the first scrub-eligible PG
   * and scrub it.
   *
   * Selection is affected by:
   * - time of day: scheduled scrubbing might be configured to only happen
   *   during certain hours;
   * - same for days of the week, and for the system load;
   *
   * @param preconds: what types of scrub are allowed, given system status &
   *                  config. Some of the preconditions are calculated here.
   * @return Scrub::attempt_t::scrubbing if a scrub session was successfully
   *         initiated. Otherwise - the failure cause.
   *
   * locking: locks jobs_lock
   */
  Scrub::schedule_result_t select_pg_and_scrub(Scrub::ScrubPreconds& preconds);

  /**
   * Translate attempt_ values into readable text
   */
  static std::string_view attempt_res_text(Scrub::schedule_result_t v);

  /**
   * remove the pg from set of PGs to be scanned for scrubbing.
   * To be used if we are no longer the PG's primary, or if the PG is removed.
   */
  void remove_from_osd_queue(ScrubJobRef sjob);

  /**
   * @return the list (not std::set!) of all scrub jobs registered
   *   (apart from PGs in the process of being removed)
   */
  ScrubQContainer list_registered_jobs() const;

  /**
   * Add the scrub job to the list of jobs (i.e. list of PGs) to be
   * periodically scrubbed by the OSD.
   * The registration is active as long as the PG exists and the OSD is its
   * primary.
   *
   * See update_job() for the handling of the 'suggested' parameter.
   *
   * locking: might lock jobs_lock
   */
  void register_with_osd(ScrubJobRef sjob, const sched_params_t& suggested);

  /**
   * modify a scrub-job's scheduled time and deadline
   *
   * There are 3 argument combinations to consider:
   * - 'must' is asserted, and the suggested time is 'scrub_must_stamp':
   *   the registration will be with "beginning of time" target, making the
   *   scrub-job eligible to immediate scrub (given that external conditions
   *   do not prevent scrubbing)
   *
   * - 'must' is asserted, and the suggested time is 'now':
   *   This happens if our stats are unknown. The results are similar to the
   *   previous scenario.
   *
   * - not a 'must': we take the suggested time as a basis, and add to it some
   *   configuration / random delays.
   *
   * ('must' is sched_params_t.is_must)
   *
   * locking: not using the jobs_lock
   */
  void update_job(ScrubJobRef sjob, const sched_params_t& suggested);

 public:
  /// dump the set of registered scrub jobs (for 'ceph pg dump' etc.)
  void dump_scrubs(ceph::Formatter* f) const;

  /**
   * No new scrub session will start while a scrub was initiated on a PG,
   * and that PG is trying to acquire replica resources.
   */
  void set_reserving_now() { a_pg_is_reserving = true; }
  void clear_reserving_now() { a_pg_is_reserving = false; }
  bool is_reserving_now() const { return a_pg_is_reserving; }

  // management of the local/remote scrub-reservation counters
  bool can_inc_scrubs() const;
  bool inc_scrubs_local();
  void dec_scrubs_local();
  bool inc_scrubs_remote();
  void dec_scrubs_remote();
  void dump_scrub_reservations(ceph::Formatter* f) const;

  /**
   * Pacing the scrub operation by inserting delays (mostly between chunks)
   *
   * Special handling for regular scrubs that continued into "no scrub" times.
   * Scrubbing will continue, but the delays will be controlled by a separate
   * (read - with higher value) configuration element
   * (osd_scrub_extended_sleep).
   */
  double scrub_sleep_time(
    bool must_scrub) const;  /// \todo (future) return milliseconds

  /**
   * called every heartbeat to update the "daily" load average
   *
   * @returns a load value for the logger
   */
  [[nodiscard]] std::optional<double> update_load_average();

 private:
  CephContext* cct;
  OSDService& osd_service;

  /**
   * jobs_lock protects the job containers and the relevant scrub-jobs state
   * variables. Specifically, the following are guaranteed:
   * - 'in_queues' is asserted only if the job is in one of the queues;
   * - a job will only be in state 'registered' if in one of the queues;
   * - no job will be in the two queues simultaneously
   *
   * Note that PG locks should not be acquired while holding jobs_lock.
   */
  mutable ceph::mutex jobs_lock = ceph::make_mutex("ScrubQueue::jobs_lock");

  ScrubQContainer to_scrub;   ///< scrub jobs (i.e. PGs) to scrub
  ScrubQContainer penalized;  ///< those that failed to reserve remote resources
  bool restore_penalized{false};

  /// the "daily" load average, updated by update_load_average()
  double daily_loadavg{0.0};

  /// predicate: the job is registered (i.e. in one of the two queues)
  static inline constexpr auto registered_job = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::registered;
  };

  /// predicate: the job is in the 'not_registered' state
  static inline constexpr auto invalid_state = [](const auto& jobref) -> bool {
    return jobref->state == qu_state_t::not_registered;
  };

  /**
   * Are there scrub jobs that should be reinstated?
   */
  void scan_penalized(bool forgive_all, utime_t time_now);

  /**
   * clear dead entries (unregistered, or belonging to removed PGs) from a
   * queue. Job state is changed to match new status.
   */
  void rm_unregistered_jobs(ScrubQContainer& group);

  /**
   * the set of all scrub jobs in 'group' which are ready to be scrubbed
   * (ready = their scheduled time has passed).
   * The scrub jobs in the new collection are sorted according to
   * their scheduled time.
   *
   * Note that the returned container holds independent refs to the
   * scrub jobs.
   */
  ScrubQContainer collect_ripe_jobs(ScrubQContainer& group, utime_t time_now);


  /// scrub resources management lock (guarding scrubs_local & scrubs_remote)
  mutable ceph::mutex resource_lock =
    ceph::make_mutex("ScrubQueue::resource_lock");

  // the counters used to manage scrub activity parallelism:
  int scrubs_local{0};
  int scrubs_remote{0};

  /// see set_reserving_now() above
  std::atomic_bool a_pg_is_reserving{false};

  [[nodiscard]] bool scrub_load_below_threshold() const;
  [[nodiscard]] bool scrub_time_permit(utime_t now) const;

  /**
   * If the scrub job was not explicitly requested, we postpone it by some
   * random length of time.
   * And if delaying the scrub - we calculate, based on pool parameters, a
   * deadline we should scrub before.
   *
   * @return a pair of values: the determined scrub time, and the deadline
   */
  scrub_schedule_t adjust_target_time(
    const sched_params_t& recomputed_params) const;

  /**
   * Look for scrub jobs that have their 'resources_failure' set. These jobs
   * have failed to acquire remote resources last time we've initiated a scrub
   * session on them. They are now moved from the 'to_scrub' queue to the
   * 'penalized' set.
   *
   * locking: called with job_lock held
   */
  void move_failed_pgs(utime_t now_is);

  /// try to initiate a scrub on one of the (ripe) jobs in 'group'
  Scrub::schedule_result_t select_from_group(ScrubQContainer& group,
					     const Scrub::ScrubPreconds& preconds,
					     utime_t now_is);
};