// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#include <memory>
#include <functional>

#include "osd/scheduler/mClockScheduler.h"
#include "common/dout.h"

namespace dmc = crimson::dmclock;
using namespace std::placeholders;

#define dout_context cct
#define dout_subsys ceph_subsys_mclock
#undef dout_prefix
#define dout_prefix *_dout << "mClockScheduler: "


namespace ceph::osd::scheduler {

mClockScheduler::mClockScheduler(CephContext *cct,
  int whoami,
  uint32_t num_shards,
  int shard_id,
  bool is_rotational,
  MonClient *monc)
  : cct(cct),
    whoami(whoami),
    num_shards(num_shards),
    shard_id(shard_id),
    is_rotational(is_rotational),
    monc(monc),
    scheduler(
      std::bind(&mClockScheduler::ClientRegistry::get_info,
                &client_registry,
                _1),
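      // dmc::AtLimit::Wait: a request that would exceed its client's
      // configured limit waits until it becomes eligible rather than
      // being rejected (see the is_future() handling in dequeue()).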
      dmc::AtLimit::Wait,
      cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
{
  cct->_conf.add_observer(this);
  ceph_assert(num_shards > 0);
  set_osd_capacity_params_from_config();
  set_config_defaults_from_profile();
  client_registry.update_from_config(
    cct->_conf, osd_bandwidth_capacity_per_shard);
}

/* ClientRegistry holds the dmclock::ClientInfo configuration parameters
 * (reservation (bytes/second), weight (unitless), limit (bytes/second))
 * for each IO class in the OSD (client, background_recovery,
 * background_best_effort).
 *
 * mclock expects limit and reservation to have units of <cost>/second
 * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided
 * as ratios of the OSD's capacity. We convert from one to the other
 * using the capacity_per_shard parameter.
 *
 * Note, mclock profile information will already have been set as a default
 * for the osd_mclock_scheduler_client_* parameters prior to calling
 * update_from_config -- see set_config_defaults_from_profile().
 */
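/* Worked example (illustrative numbers, not asserted as defaults): with
 * osd_bandwidth_capacity_per_shard = 31,457,280 bytes/second and
 * osd_mclock_scheduler_client_res = 0.6, update_from_config passes
 * 0.6 * 31,457,280 = 18,874,368 bytes/second to mclock as the client
 * reservation. A ratio of 0 maps to default_min (for reservations) or
 * default_max (for limits).
 */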
void mClockScheduler::ClientRegistry::update_from_config(
  const ConfigProxy &conf,
  const double capacity_per_shard)
{
  auto get_res = [&](double res) {
    if (res) {
      return res * capacity_per_shard;
    } else {
      return default_min; // min reservation
    }
  };

  auto get_lim = [&](double lim) {
    if (lim) {
      return lim * capacity_per_shard;
    } else {
      return default_max; // high limit
    }
  };

  // Set external client infos
  double res = conf.get_val<double>(
    "osd_mclock_scheduler_client_res");
  double lim = conf.get_val<double>(
    "osd_mclock_scheduler_client_lim");
  uint64_t wgt = conf.get_val<uint64_t>(
    "osd_mclock_scheduler_client_wgt");
  default_external_client_info.update(
    get_res(res),
    wgt,
    get_lim(lim));

  // Set background recovery client infos
  res = conf.get_val<double>(
    "osd_mclock_scheduler_background_recovery_res");
  lim = conf.get_val<double>(
    "osd_mclock_scheduler_background_recovery_lim");
  wgt = conf.get_val<uint64_t>(
    "osd_mclock_scheduler_background_recovery_wgt");
  internal_client_infos[
    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
      get_res(res),
      wgt,
      get_lim(lim));

  // Set background best effort client infos
  res = conf.get_val<double>(
    "osd_mclock_scheduler_background_best_effort_res");
  lim = conf.get_val<double>(
    "osd_mclock_scheduler_background_best_effort_lim");
  wgt = conf.get_val<uint64_t>(
    "osd_mclock_scheduler_background_best_effort_wgt");
  internal_client_infos[
    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
      get_res(res),
      wgt,
      get_lim(lim));
}

const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
  const client_profile_id_t &client) const
{
  auto ret = external_client_infos.find(client);
  if (ret == external_client_infos.end())
    return &default_external_client_info;
  else
    return &(ret->second);
}

const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
  const scheduler_id_t &id) const {
  switch (id.class_id) {
  case op_scheduler_class::immediate:
    ceph_assert(0 == "Cannot schedule immediate");
    return nullptr;
  case op_scheduler_class::client:
    return get_external_client(id.client_profile_id);
  default:
    ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
    return &internal_client_infos[static_cast<size_t>(id.class_id)];
  }
}

void mClockScheduler::set_osd_capacity_params_from_config()
{
  uint64_t osd_bandwidth_capacity;
  double osd_iop_capacity;

  std::tie(osd_bandwidth_capacity, osd_iop_capacity) = [&, this] {
    if (is_rotational) {
      return std::make_tuple(
        cct->_conf.get_val<Option::size_t>(
          "osd_mclock_max_sequential_bandwidth_hdd"),
        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd"));
    } else {
      return std::make_tuple(
        cct->_conf.get_val<Option::size_t>(
          "osd_mclock_max_sequential_bandwidth_ssd"),
        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd"));
    }
  }();

  osd_bandwidth_capacity = std::max<uint64_t>(1, osd_bandwidth_capacity);
  osd_iop_capacity = std::max<double>(1.0, osd_iop_capacity);

  osd_bandwidth_cost_per_io =
    static_cast<double>(osd_bandwidth_capacity) / osd_iop_capacity;
  osd_bandwidth_capacity_per_shard = static_cast<double>(osd_bandwidth_capacity)
    / static_cast<double>(num_shards);

  dout(1) << __func__ << ": osd_bandwidth_cost_per_io: "
          << std::fixed << std::setprecision(2)
          << osd_bandwidth_cost_per_io << " bytes/io"
          << ", osd_bandwidth_capacity_per_shard "
          << osd_bandwidth_capacity_per_shard << " bytes/second"
          << dendl;
}
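
/* Illustrative arithmetic (example values, not asserted as defaults): an
 * HDD OSD configured with a sequential bandwidth of 157,286,400
 * bytes/second (150 MiB/s) and 315 IOPS yields
 *   osd_bandwidth_cost_per_io = 157,286,400 / 315 ~= 499,322.86 bytes/io.
 * With num_shards = 5, each shard is assigned
 *   osd_bandwidth_capacity_per_shard = 157,286,400 / 5
 *                                    = 31,457,280 bytes/second.
 */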

/**
 * profile_t
 *
 * mclock profile -- three parameters (reservation, weight, limit) for
 * each of the three client classes. A reservation of 0 means no minimum
 * reservation; a limit of 0 means no upper limit.
 */
struct profile_t {
  struct client_config_t {
    double reservation;
    uint64_t weight;
    double limit;
  };
  client_config_t client;
  client_config_t background_recovery;
  client_config_t background_best_effort;
};

static std::ostream &operator<<(
  std::ostream &lhs, const profile_t::client_config_t &rhs)
{
  return lhs << "{res: " << rhs.reservation
             << ", wgt: " << rhs.weight
             << ", lim: " << rhs.limit
             << "}";
}

static std::ostream &operator<<(std::ostream &lhs, const profile_t &rhs)
{
  return lhs << "[client: " << rhs.client
             << ", background_recovery: " << rhs.background_recovery
             << ", background_best_effort: " << rhs.background_best_effort
             << "]";
}

void mClockScheduler::set_config_defaults_from_profile()
{
  // Let only a single osd shard (id:0) set the profile configs
  if (shard_id > 0) {
    return;
  }

  /**
   * high_client_ops
   *
   * Client Allocation:
   *   reservation: 60% | weight: 2 | limit: 0 (max) |
   * Background Recovery Allocation:
   *   reservation: 40% | weight: 1 | limit: 0 (max) |
   * Background Best Effort Allocation:
   *   reservation: 0 (min) | weight: 1 | limit: 70% |
   */
  static constexpr profile_t high_client_ops_profile{
    { .6, 2, 0 },
    { .4, 1, 0 },
    { 0, 1, .7 }
  };

  /**
   * high_recovery_ops
   *
   * Client Allocation:
   *   reservation: 30% | weight: 1 | limit: 0 (max) |
   * Background Recovery Allocation:
   *   reservation: 70% | weight: 2 | limit: 0 (max) |
   * Background Best Effort Allocation:
   *   reservation: 0 (min) | weight: 1 | limit: 0 (max) |
   */
  static constexpr profile_t high_recovery_ops_profile{
    { .3, 1, 0 },
    { .7, 2, 0 },
    { 0, 1, 0 }
  };

  /**
   * balanced
   *
   * Client Allocation:
   *   reservation: 50% | weight: 1 | limit: 0 (max) |
   * Background Recovery Allocation:
   *   reservation: 50% | weight: 1 | limit: 0 (max) |
   * Background Best Effort Allocation:
   *   reservation: 0 (min) | weight: 1 | limit: 90% |
   */
  static constexpr profile_t balanced_profile{
    { .5, 1, 0 },
    { .5, 1, 0 },
    { 0, 1, .9 }
  };

  const profile_t *profile = nullptr;
  auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
  if (mclock_profile == "high_client_ops") {
    profile = &high_client_ops_profile;
    dout(10) << "Setting high_client_ops profile " << *profile << dendl;
  } else if (mclock_profile == "high_recovery_ops") {
    profile = &high_recovery_ops_profile;
    dout(10) << "Setting high_recovery_ops profile " << *profile << dendl;
  } else if (mclock_profile == "balanced") {
    profile = &balanced_profile;
    dout(10) << "Setting balanced profile " << *profile << dendl;
  } else if (mclock_profile == "custom") {
    dout(10) << "Profile set to custom, not setting defaults" << dendl;
    return;
  } else {
    derr << "Invalid mclock profile: " << mclock_profile << dendl;
    ceph_assert("Invalid choice of mclock profile" == 0);
    return;
  }
  ceph_assert(nullptr != profile);

  auto set_config = [&conf = cct->_conf](const char *key, auto val) {
    conf.set_val_default(key, std::to_string(val));
  };

  set_config("osd_mclock_scheduler_client_res", profile->client.reservation);
  set_config("osd_mclock_scheduler_client_wgt", profile->client.weight);
  set_config("osd_mclock_scheduler_client_lim", profile->client.limit);

  set_config(
    "osd_mclock_scheduler_background_recovery_res",
    profile->background_recovery.reservation);
  set_config(
    "osd_mclock_scheduler_background_recovery_wgt",
    profile->background_recovery.weight);
  set_config(
    "osd_mclock_scheduler_background_recovery_lim",
    profile->background_recovery.limit);

  set_config(
    "osd_mclock_scheduler_background_best_effort_res",
    profile->background_best_effort.reservation);
  set_config(
    "osd_mclock_scheduler_background_best_effort_wgt",
    profile->background_best_effort.weight);
  set_config(
    "osd_mclock_scheduler_background_best_effort_lim",
    profile->background_best_effort.limit);

  cct->_conf.apply_changes(nullptr);
}

uint32_t mClockScheduler::calc_scaled_cost(int item_cost)
{
  auto cost = static_cast<uint32_t>(
    std::max<int>(
      1, // ensure the cost is at least 1
      item_cost));
  auto cost_per_io = static_cast<uint32_t>(osd_bandwidth_cost_per_io);

  return std::max<uint32_t>(cost, cost_per_io);
}
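
/* Example (using the illustrative capacity numbers above): with
 * osd_bandwidth_cost_per_io ~= 499,322 bytes/io, an item with
 * item_cost = 4096 is scaled up to 499,322, while an item with
 * item_cost = 4,194,304 (4 MiB) keeps its own cost of 4,194,304.
 */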

void mClockScheduler::update_configuration()
{
  // Apply configuration change. The expectation is that
  // at least one of the tracked mclock config option keys
  // is modified before calling this method.
  cct->_conf.apply_changes(nullptr);
}

void mClockScheduler::dump(ceph::Formatter &f) const
{
  // Display queue sizes
  f.open_object_section("queue_sizes");
  f.dump_int("high_priority_queue", high_priority.size());
  f.dump_int("scheduler", scheduler.request_count());
  f.close_section();

  // client map and queue tops (res, wgt, lim)
  std::ostringstream out;
  f.open_object_section("mClockClients");
  f.dump_int("client_count", scheduler.client_count());
  out << scheduler;
  f.dump_string("clients", out.str());
  f.close_section();

  // Display sorted queues (res, wgt, lim)
  f.open_object_section("mClockQueues");
  f.dump_string("queues", display_queues());
  f.close_section();

  f.open_object_section("HighPriorityQueue");
  for (auto it = high_priority.begin();
       it != high_priority.end(); ++it) {
    f.dump_int("priority", it->first);
    f.dump_int("queue_size", it->second.size());
  }
  f.close_section();
}

void mClockScheduler::enqueue(OpSchedulerItem&& item)
{
  auto id = get_scheduler_id(item);
  unsigned priority = item.get_priority();

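  // Items of the immediate class, and items at or above cutoff_priority,
  // bypass mclock and go to the strict high_priority queue instead.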
  // TODO: move this check into OpSchedulerItem, handle backwards compat
  if (op_scheduler_class::immediate == id.class_id) {
    enqueue_high(immediate_class_priority, std::move(item));
  } else if (priority >= cutoff_priority) {
    enqueue_high(priority, std::move(item));
  } else {
    auto cost = calc_scaled_cost(item.get_cost());
    item.set_qos_cost(cost);
    dout(20) << __func__ << " " << id
             << " item_cost: " << item.get_cost()
             << " scaled_cost: " << cost
             << dendl;

    // Add item to scheduler queue
    scheduler.add_request(
      std::move(item),
      id,
      cost);
  }

  dout(20) << __func__ << " client_count: " << scheduler.client_count()
           << " queue_sizes: [ "
           << " high_priority_queue: " << high_priority.size()
           << " sched: " << scheduler.request_count() << " ]"
           << dendl;
  dout(30) << __func__ << " mClockClients: "
           << scheduler
           << dendl;
  dout(30) << __func__ << " mClockQueues: { "
           << display_queues() << " }"
           << dendl;
}

void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
{
  unsigned priority = item.get_priority();
  auto id = get_scheduler_id(item);

  if (op_scheduler_class::immediate == id.class_id) {
    enqueue_high(immediate_class_priority, std::move(item), true);
  } else if (priority >= cutoff_priority) {
    enqueue_high(priority, std::move(item), true);
  } else {
    // mClock does not support enqueue at front, so we use
    // the high queue with priority 0
    enqueue_high(0, std::move(item), true);
  }
}

void mClockScheduler::enqueue_high(unsigned priority,
  OpSchedulerItem&& item,
  bool front)
{
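  // Note: dequeue() services the high_priority lists from the back, so
  // push_back places an item at the next-to-dequeue position (the logical
  // front) and push_front places it at the logical back.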
  if (front) {
    high_priority[priority].push_back(std::move(item));
  } else {
    high_priority[priority].push_front(std::move(item));
  }
}

WorkItem mClockScheduler::dequeue()
{
  if (!high_priority.empty()) {
    auto iter = high_priority.begin();
    // invariant: high_priority entries are never empty
    assert(!iter->second.empty());
    WorkItem ret{std::move(iter->second.back())};
    iter->second.pop_back();
    if (iter->second.empty()) {
      // maintain invariant, high priority entries are never empty
      high_priority.erase(iter);
    }
    ceph_assert(std::get_if<OpSchedulerItem>(&ret));
    return ret;
  } else {
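    // No high-priority work: pull from the mclock queue. The pull either
    // yields a request (retn) or reports the time at which the next
    // request becomes eligible (future).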
    mclock_queue_t::PullReq result = scheduler.pull_request();
    if (result.is_future()) {
      return result.getTime();
    } else if (result.is_none()) {
      ceph_assert(
        0 == "Impossible, must have checked empty() first");
      return {};
    } else {
      ceph_assert(result.is_retn());

      auto &retn = result.get_retn();
      return std::move(*retn.request);
    }
  }
}

std::string mClockScheduler::display_queues() const
{
  std::ostringstream out;
  scheduler.display_queues(out);
  return out.str();
}

const char** mClockScheduler::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_mclock_scheduler_client_res",
    "osd_mclock_scheduler_client_wgt",
    "osd_mclock_scheduler_client_lim",
    "osd_mclock_scheduler_background_recovery_res",
    "osd_mclock_scheduler_background_recovery_wgt",
    "osd_mclock_scheduler_background_recovery_lim",
    "osd_mclock_scheduler_background_best_effort_res",
    "osd_mclock_scheduler_background_best_effort_wgt",
    "osd_mclock_scheduler_background_best_effort_lim",
    "osd_mclock_max_capacity_iops_hdd",
    "osd_mclock_max_capacity_iops_ssd",
    "osd_mclock_max_sequential_bandwidth_hdd",
    "osd_mclock_max_sequential_bandwidth_ssd",
    "osd_mclock_profile",
    NULL
  };
  return KEYS;
}

void mClockScheduler::handle_conf_change(
  const ConfigProxy& conf,
  const std::set<std::string> &changed)
{
  if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
      changed.count("osd_mclock_max_capacity_iops_ssd") ||
      changed.count("osd_mclock_max_sequential_bandwidth_hdd") ||
      changed.count("osd_mclock_max_sequential_bandwidth_ssd")) {
    set_osd_capacity_params_from_config();
    client_registry.update_from_config(
      conf, osd_bandwidth_capacity_per_shard);
  }
  if (changed.count("osd_mclock_profile")) {
    set_config_defaults_from_profile();
    client_registry.update_from_config(
      conf, osd_bandwidth_capacity_per_shard);
  }

  auto get_changed_key = [&changed]() -> std::optional<std::string> {
    static const std::vector<std::string> qos_params = {
      "osd_mclock_scheduler_client_res",
      "osd_mclock_scheduler_client_wgt",
      "osd_mclock_scheduler_client_lim",
      "osd_mclock_scheduler_background_recovery_res",
      "osd_mclock_scheduler_background_recovery_wgt",
      "osd_mclock_scheduler_background_recovery_lim",
      "osd_mclock_scheduler_background_best_effort_res",
      "osd_mclock_scheduler_background_best_effort_wgt",
      "osd_mclock_scheduler_background_best_effort_lim"
    };

    for (auto &qp : qos_params) {
      if (changed.count(qp)) {
        return qp;
      }
    }
    return std::nullopt;
  };

  if (auto key = get_changed_key(); key.has_value()) {
    auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
    if (mclock_profile == "custom") {
      client_registry.update_from_config(
        conf, osd_bandwidth_capacity_per_shard);
    } else {
      // An attempt was made to change a QoS parameter of a built-in
      // profile. Restore the profile defaults by having one of the OSD
      // shards remove the key from the config monitor store. Note: monc
      // is included in the check since the mock unit test currently
      // doesn't initialize it.
      if (shard_id == 0 && monc) {
        static const std::vector<std::string> osds = {
          "osd",
          "osd." + std::to_string(whoami)
        };

        for (const auto& osd : osds) {
          std::string cmd =
            "{"
              "\"prefix\": \"config rm\", "
              "\"who\": \"" + osd + "\", "
              "\"name\": \"" + *key + "\""
            "}";
          std::vector<std::string> vcmd{cmd};

          dout(10) << __func__ << " Removing Key: " << *key
                   << " for " << osd << " from Mon db" << dendl;
          monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
        }
      }
    }
    // Alternatively, if the QoS parameter was set ephemerally for this OSD
    // via the 'daemon' or 'tell' interfaces, it must be removed.
    if (!cct->_conf.rm_val(*key)) {
      dout(10) << __func__ << " Restored " << *key << " to default" << dendl;
      cct->_conf.apply_changes(nullptr);
    }
  }
}

mClockScheduler::~mClockScheduler()
{
  cct->_conf.remove_observer(this);
}

}
597 }