// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#pragma once

#include <array>
#include <cstdint>
#include <functional>
#include <limits>
#include <list>
#include <map>
#include <ostream>
#include <random>
#include <set>
#include <string>

#include "boost/variant.hpp"

#include "dmclock/src/dmclock_server.h"

#include "osd/scheduler/OpScheduler.h"
#include "common/config.h"
#include "common/ceph_context.h"
#include "common/mClockPriorityQueue.h"
#include "osd/scheduler/OpSchedulerItem.h"
namespace ceph::osd::scheduler {

constexpr double default_min = 0.0;
constexpr double default_max = std::numeric_limits<double>::is_iec559 ?
  std::numeric_limits<double>::infinity() :
  std::numeric_limits<double>::max();
/**
 * client_profile_id_t
 *
 * client_id - global id (client.####) for client QoS
 * profile_id - id generated by the client's QoS profile
 *
 * Currently (Reef and below), both members are set to 0, which ensures
 * that all external clients share the mClock profile's allocated
 * reservation and limit bandwidth.
 *
 * Note: Post Reef, both members will be set to non-zero values when the
 * distributed feature of the mClock algorithm is utilized.
 */
struct client_profile_id_t {
  uint64_t client_id = 0;
  uint64_t profile_id = 0;

  client_profile_id_t(uint64_t _client_id, uint64_t _profile_id) :
    client_id(_client_id),
    profile_id(_profile_id) {}

  client_profile_id_t() = default;

  auto operator<=>(const client_profile_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const client_profile_id_t& client_profile) {
    out << " client_id: " << client_profile.client_id
        << " profile_id: " << client_profile.profile_id;
    return out;
  }
};
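
// Illustrative note (an assumption for clarity, not part of the original
// header): with the Reef-and-earlier defaults above, every external client
// produces the same id, so all external ops share one dmclock tag stream:
//
//   client_profile_id_t a, b;  // both default to {0, 0}
//   assert(a == b);            // equality is defaulted via operator<=>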
struct scheduler_id_t {
  op_scheduler_class class_id;
  client_profile_id_t client_profile_id;

  auto operator<=>(const scheduler_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const scheduler_id_t& sched_id) {
    out << "{ class_id: " << sched_id.class_id
        << sched_id.client_profile_id;
    return out << " }";
  }
};
/**
 * Scheduler implementation based on mclock.
 *
 * TODO: explain configs
 */
class mClockScheduler : public OpScheduler, md_config_obs_t {
  CephContext *cct;
  const int whoami;
  const uint32_t num_shards;
  const int shard_id;
  const bool is_rotational;
  MonClient *monc;
  /**
   * osd_bandwidth_cost_per_io
   *
   * mClock expects all queued items to have a uniform expression of
   * "cost". However, IO devices generally have quite different capacity
   * for sequential IO vs small random IO. This implementation handles this
   * by expressing all costs as a number of sequential bytes written, adding
   * additional cost for each random IO equal to osd_bandwidth_cost_per_io.
   *
   * Thus, an IO operation requiring a total of <size> bytes to be written
   * across <iops> different locations will have a cost of
   * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * Has units bytes/io.
   */
  double osd_bandwidth_cost_per_io;
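
  // Worked example with illustrative (non-default) numbers: a device rated
  // at 500 MB/s sequential bandwidth and 12500 random IOPS charges
  // 500000000 / 12500 = 40000 bytes per random IO, so a 4096-byte write to
  // a single location costs 4096 + (40000 * 1) = 44096 bytes.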
  /**
   * osd_bandwidth_capacity_per_shard
   *
   * mClock expects reservation and limit parameters to be expressed in units
   * of cost/second -- which means bytes/second for this implementation.
   *
   * Rather than expecting users to compute appropriate limit and reservation
   * values for each class of OSDs in their cluster, we instead express
   * reservation and limit parameters as ratios of the OSD's maximum capacity.
   * osd_bandwidth_capacity_per_shard is that capacity divided by the number
   * of op shards.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * This value gets passed to ClientRegistry::update_from_config in order
   * to resolve the full reservation and limit parameters for mclock from
   * the configured ratios.
   *
   * Has units bytes/second.
   */
  double osd_bandwidth_capacity_per_shard;
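
  // Worked example with illustrative (non-default) numbers: an OSD rated at
  // 500 MB/s sequential bandwidth split across 5 op shards has a per-shard
  // capacity of 100000000 bytes/second; a configured reservation ratio of
  // 0.2 then resolves to an absolute per-shard reservation of 20 MB/s.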
  class ClientRegistry {
    std::array<
      crimson::dmclock::ClientInfo,
      static_cast<size_t>(op_scheduler_class::immediate)
    > internal_client_infos = {
      // Placeholder, gets replaced with configured values
      crimson::dmclock::ClientInfo(1, 1, 1),
      crimson::dmclock::ClientInfo(1, 1, 1)
    };

    crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
    std::map<client_profile_id_t,
             crimson::dmclock::ClientInfo> external_client_infos;
    const crimson::dmclock::ClientInfo *get_external_client(
      const client_profile_id_t &client) const;
  public:
    /**
     * update_from_config
     *
     * Sets the mclock parameters (reservation, weight, and limit)
     * for each class of IO (background_recovery, background_best_effort,
     * and client).
     */
    void update_from_config(
      const ConfigProxy &conf,
      double capacity_per_shard);
    const crimson::dmclock::ClientInfo *get_info(
      const scheduler_id_t &id) const;
  } client_registry;
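
  // Expected lookup behavior (a sketch inferred from the surrounding
  // declarations, not stated in this header): get_info() presumably serves
  // internal classes (e.g. background_recovery) from internal_client_infos
  // and client ops from external_client_infos via get_external_client(),
  // which would fall back to default_external_client_info for profiles it
  // has not seen.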
  using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
    scheduler_id_t,
    OpSchedulerItem,
    true,
    true,
    2>;
  using priority_t = unsigned;
  using SubQueue = std::map<priority_t,
    std::list<OpSchedulerItem>,
    std::greater<priority_t>>;

  mclock_queue_t scheduler;
  /**
   * high_priority
   *
   * Holds entries to be dequeued in strict order ahead of mClock.
   * Invariant: the per-priority sub-lists are never empty.
   */
  SubQueue high_priority;
  priority_t immediate_class_priority = std::numeric_limits<priority_t>::max();
  static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
    return scheduler_id_t{
      item.get_scheduler_class(),
      client_profile_id_t()
    };
  }
  static unsigned int get_io_prio_cut(CephContext *cct) {
    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
      std::random_device rd;
      std::mt19937 random_gen(rd());
      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
      return CEPH_MSG_PRIO_HIGH;
    } else {
      // default / catch-all is 'low'
      return CEPH_MSG_PRIO_LOW;
    }
  }
  unsigned cutoff_priority = get_io_prio_cut(cct);
  /**
   * set_osd_capacity_params_from_config
   *
   * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
   * and osd_bandwidth_capacity_per_shard, internally. These two
   * parameters are derived from config parameters
   * osd_mclock_max_capacity_iops_(hdd|ssd) and
   * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
   * Invoking set_osd_capacity_params_from_config() resets those derived
   * params based on the current config and should be invoked any time they
   * are modified as well as in the constructor. See handle_conf_change().
   */
  void set_osd_capacity_params_from_config();
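
  // Sketch of the expected derivation (an assumption for illustration; the
  // authoritative logic lives in the implementation file):
  //   osd_bandwidth_capacity_per_shard =
  //     osd_mclock_max_sequential_bandwidth_(hdd|ssd) / num_shards
  //   osd_bandwidth_cost_per_io =
  //     osd_mclock_max_sequential_bandwidth_(hdd|ssd) /
  //     osd_mclock_max_capacity_iops_(hdd|ssd)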
  // Set the mclock related config params based on the profile
  void set_config_defaults_from_profile();
public:
  mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
    int shard_id, bool is_rotational, MonClient *monc);
  ~mClockScheduler() override;
  /// Calculate scaled cost per item
  uint32_t calc_scaled_cost(int cost);

  // Helper method to display mclock queues
  std::string display_queues() const;
  // Enqueue op at the back of the regular queue
  void enqueue(OpSchedulerItem &&item) final;

  // Enqueue the op at the front of the high priority queue
  void enqueue_front(OpSchedulerItem &&item) final;
  // Return an op to be dispatched
  WorkItem dequeue() final;
  // Returns true if both queues are empty
  bool empty() const final {
    return scheduler.empty() && high_priority.empty();
  }
  // Formatted output of the queue
  void dump(ceph::Formatter &f) const final;
  void print(std::ostream &ostream) const final {
    ostream << "mClockScheduler";
  }
  // Update data associated with the modified mclock config key(s)
  void update_configuration() final;

  const char** get_tracked_conf_keys() const final;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) final;
private:
  // Enqueue the op to the high priority queue
  void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
};

}