// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#pragma once

#include <functional>
#include <ostream>
#include <map>
#include <vector>

#include "boost/variant.hpp"

#include "dmclock/src/dmclock_server.h"

#include "osd/scheduler/OpScheduler.h"
#include "common/config.h"
#include "common/ceph_context.h"
#include "common/mClockPriorityQueue.h"
#include "osd/scheduler/OpSchedulerItem.h"


namespace ceph::osd::scheduler {

constexpr double default_min = 0.0;
constexpr double default_max = std::numeric_limits<double>::is_iec559 ?
  std::numeric_limits<double>::infinity() :
  std::numeric_limits<double>::max();

/**
 * client_profile_id_t
 *
 * client_id - global id (client.####) for client QoS
 * profile_id - id generated by client's QoS profile
 *
 * Currently (Reef and below), both members are set to
 * 0, which ensures that all external clients share the
 * reservation and limit bandwidth allocated by the
 * mClock profile.
 *
 * Note: Post Reef, both members will be set to non-zero
 * values when the distributed feature of the mClock
 * algorithm is utilized.
 */
struct client_profile_id_t {
  uint64_t client_id = 0;
  uint64_t profile_id = 0;

  client_profile_id_t(uint64_t _client_id, uint64_t _profile_id) :
    client_id(_client_id),
    profile_id(_profile_id) {}

  client_profile_id_t() = default;

  auto operator<=>(const client_profile_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const client_profile_id_t& client_profile) {
    out << " client_id: " << client_profile.client_id
        << " profile_id: " << client_profile.profile_id;
    return out;
  }
};
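
/*
 * Illustrative sketch (not part of the upstream header): with the defaults
 * described above, every external client currently resolves to the same
 * zero-valued id and therefore shares one dmclock profile:
 *
 *   client_profile_id_t a, b;  // both default to {client_id: 0, profile_id: 0}
 *   assert(a == b);            // equal via the defaulted comparison operator
 */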

struct scheduler_id_t {
  op_scheduler_class class_id;
  client_profile_id_t client_profile_id;

  auto operator<=>(const scheduler_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const scheduler_id_t& sched_id) {
    out << "{ class_id: " << sched_id.class_id
        << sched_id.client_profile_id;
    return out << " }";
  }
};

/**
 * Scheduler implementation based on mclock.
 *
 * TODO: explain configs
 */
class mClockScheduler : public OpScheduler, md_config_obs_t {

  CephContext *cct;
  const int whoami;
  const uint32_t num_shards;
  const int shard_id;
  const bool is_rotational;
  MonClient *monc;

  /**
   * osd_bandwidth_cost_per_io
   *
   * mClock expects all queued items to have a uniform expression of
   * "cost". However, IO devices generally have quite different capacity
   * for sequential IO vs small random IO. This implementation handles this
   * by expressing all costs as a number of sequential bytes written, adding
   * an additional cost of osd_bandwidth_cost_per_io for each random IO.
   *
   * Thus, an IO operation requiring a total of <size> bytes to be written
   * across <iops> different locations will have a cost of
   * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * Has units bytes/io.
   */
  double osd_bandwidth_cost_per_io;
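
  /*
   * Illustrative sketch (not part of the upstream header): the cost formula
   * above for a hypothetical request writing 64 KiB across 3 distinct
   * locations, assuming an example osd_bandwidth_cost_per_io of 40960
   * bytes/io:
   *
   *   double example_cost(double cost_per_io, double size, unsigned iops) {
   *     return size + (cost_per_io * iops);  // cost in sequential bytes
   *   }
   *   // example_cost(40960.0, 65536.0, 3) == 65536 + 40960 * 3 == 188416
   */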

  /**
   * osd_bandwidth_capacity_per_shard
   *
   * mClock expects reservation and limit parameters to be expressed in units
   * of cost/second -- which means bytes/second for this implementation.
   *
   * Rather than expecting users to compute appropriate limit and reservation
   * values for each class of OSDs in their cluster, we instead express
   * reservation and limit parameters as ratios of the OSD's maximum capacity.
   * osd_bandwidth_capacity_per_shard is that capacity divided by the number
   * of shards.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * This value gets passed to ClientRegistry::update_from_config in order
   * to resolve the full reservation and limit parameters for mclock from
   * the configured ratios.
   *
   * Has units bytes/second.
   */
  double osd_bandwidth_capacity_per_shard;
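
  /*
   * Illustrative sketch (not part of the upstream header): assuming a
   * hypothetical OSD rated at 500 MiB/s of sequential bandwidth split
   * across 5 shards:
   *
   *   // capacity_per_shard = (500 * 1024 * 1024) / 5
   *   //                    = 104857600 bytes/second
   */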

  class ClientRegistry {
    std::array<
      crimson::dmclock::ClientInfo,
      static_cast<size_t>(op_scheduler_class::immediate)
    > internal_client_infos = {
      // Placeholder, gets replaced with configured values
      crimson::dmclock::ClientInfo(1, 1, 1),
      crimson::dmclock::ClientInfo(1, 1, 1)
    };

    crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
    std::map<client_profile_id_t,
             crimson::dmclock::ClientInfo> external_client_infos;
    const crimson::dmclock::ClientInfo *get_external_client(
      const client_profile_id_t &client) const;
  public:
    /**
     * update_from_config
     *
     * Sets the mclock parameters (reservation, weight, and limit)
     * for each class of IO (background_recovery, background_best_effort,
     * and client).
     */
    void update_from_config(
      const ConfigProxy &conf,
      double capacity_per_shard);
    const crimson::dmclock::ClientInfo *get_info(
      const scheduler_id_t &id) const;
  } client_registry;
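
  /*
   * Illustrative sketch (not part of the upstream header): update_from_config
   * resolves the configured ratios against capacity_per_shard. For example, a
   * hypothetical reservation ratio of 0.25 and limit ratio of 1.0 with a shard
   * capacity of 104857600 bytes/second would be handed to dmclock roughly as:
   *
   *   // ClientInfo(reservation, weight, limit)
   *   // crimson::dmclock::ClientInfo(0.25 * 104857600,  // 26214400 bytes/s
   *   //                              1,                 // weight (relative)
   *   //                              1.0 * 104857600);  // 104857600 bytes/s
   */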

  using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
    scheduler_id_t,
    OpSchedulerItem,
    true,
    true,
    2>;
  using priority_t = unsigned;
  using SubQueue = std::map<priority_t,
                            std::list<OpSchedulerItem>,
                            std::greater<priority_t>>;
  mclock_queue_t scheduler;
  /**
   * high_priority
   *
   * Holds entries to be dequeued in strict priority order, ahead of the
   * mClock queue.
   * Invariant: the per-priority lists stored here are never empty.
   */
  SubQueue high_priority;
  priority_t immediate_class_priority = std::numeric_limits<priority_t>::max();
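
  /*
   * Illustrative sketch (not part of the upstream header): because SubQueue
   * is keyed with std::greater<priority_t>, iteration visits the highest
   * priority bucket first:
   *
   *   SubQueue q;
   *   q[200];  // create bucket for priority 200
   *   q[63];   // create bucket for priority 63
   *   // q.begin()->first == 200, so priority 200 entries drain first
   */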

  static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
    return scheduler_id_t{
      item.get_scheduler_class(),
      client_profile_id_t()
    };
  }

  static unsigned int get_io_prio_cut(CephContext *cct) {
    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
      std::random_device rd;
      std::mt19937 random_gen(rd());
      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
      return CEPH_MSG_PRIO_HIGH;
    } else {
      // default / catch-all is 'low'
      return CEPH_MSG_PRIO_LOW;
    }
  }

  unsigned cutoff_priority = get_io_prio_cut(cct);

  /**
   * set_osd_capacity_params_from_config
   *
   * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
   * and osd_bandwidth_capacity_per_shard, internally. These two
   * parameters are derived from the config options
   * osd_mclock_max_capacity_iops_(hdd|ssd) and
   * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
   * Invoking set_osd_capacity_params_from_config() recomputes those derived
   * params from the current config; it should be invoked any time those
   * config options change, as well as in the constructor.
   * See handle_conf_change().
   */
  void set_osd_capacity_params_from_config();
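
  /*
   * Illustrative sketch (not part of the upstream header) of the derivation
   * described above, with hypothetical values bandwidth (bytes/second, from
   * osd_mclock_max_sequential_bandwidth_(hdd|ssd)) and iops (from
   * osd_mclock_max_capacity_iops_(hdd|ssd)):
   *
   *   // osd_bandwidth_cost_per_io        ~= bandwidth / iops;       // bytes/io
   *   // osd_bandwidth_capacity_per_shard ~= bandwidth / num_shards; // bytes/s
   */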

  // Set the mclock-related config params based on the profile
  void set_config_defaults_from_profile();

public:
  mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
    int shard_id, bool is_rotational, MonClient *monc);
  ~mClockScheduler() override;

  /// Calculate scaled cost per item
  uint32_t calc_scaled_cost(int cost);

  // Helper method to display mclock queues
  std::string display_queues() const;

  // Enqueue the op at the back of the regular queue
  void enqueue(OpSchedulerItem &&item) final;

  // Enqueue the op at the front of the high priority queue
  void enqueue_front(OpSchedulerItem &&item) final;

  // Return an op to be dispatched
  WorkItem dequeue() final;

  // Returns whether the queue is empty
  bool empty() const final {
    return scheduler.empty() && high_priority.empty();
  }

  // Formatted output of the queue
  void dump(ceph::Formatter &f) const final;

  void print(std::ostream &ostream) const final {
    ostream << "mClockScheduler";
  }

  // Update data associated with the modified mclock config key(s)
  void update_configuration() final;

  const char** get_tracked_conf_keys() const final;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) final;
private:
  // Enqueue the op to the high priority queue
  void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
};
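
/*
 * Illustrative usage sketch (not part of the upstream header): an OSD shard
 * might drive the scheduler roughly as follows. OpSchedulerItem construction
 * is elided because it depends on OSD internals; make_item() below is a
 * hypothetical placeholder.
 *
 *   void drain(mClockScheduler &sched) {
 *     sched.enqueue(make_item());        // regular (mclock-tagged) op
 *     while (!sched.empty()) {
 *       WorkItem wi = sched.dequeue();   // high_priority entries drain ahead
 *                                        // of mclock-scheduled entries
 *       // ... dispatch wi ...
 *     }
 *   }
 */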

}