// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#include <functional>
#include <memory>

#include "osd/scheduler/mClockScheduler.h"
#include "common/dout.h"
22 namespace dmc
= crimson::dmclock
;
23 using namespace std::placeholders
;
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_mclock
28 #define dout_prefix *_dout << "mClockScheduler: "
31 namespace ceph::osd::scheduler
{
33 mClockScheduler::mClockScheduler(CephContext
*cct
,
41 num_shards(num_shards
),
43 is_rotational(is_rotational
),
46 std::bind(&mClockScheduler::ClientRegistry::get_info
,
50 cct
->_conf
.get_val
<double>("osd_mclock_scheduler_anticipation_timeout"))
52 cct
->_conf
.add_observer(this);
53 ceph_assert(num_shards
> 0);
54 set_osd_capacity_params_from_config();
55 set_config_defaults_from_profile();
56 client_registry
.update_from_config(
57 cct
->_conf
, osd_bandwidth_capacity_per_shard
);
60 /* ClientRegistry holds the dmclock::ClientInfo configuration parameters
61 * (reservation (bytes/second), weight (unitless), limit (bytes/second))
62 * for each IO class in the OSD (client, background_recovery,
63 * background_best_effort).
65 * mclock expects limit and reservation to have units of <cost>/second
66 * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided
67 * as ratios of the OSD's capacity. We convert from the one to the other
68 * using the capacity_per_shard parameter.
70 * Note, mclock profile information will already have been set as a default
71 * for the osd_mclock_scheduler_client_* parameters prior to calling
72 * update_from_config -- see set_config_defaults_from_profile().
74 void mClockScheduler::ClientRegistry::update_from_config(
75 const ConfigProxy
&conf
,
76 const double capacity_per_shard
)
79 auto get_res
= [&](double res
) {
81 return res
* capacity_per_shard
;
83 return default_min
; // min reservation
87 auto get_lim
= [&](double lim
) {
89 return lim
* capacity_per_shard
;
91 return default_max
; // high limit
95 // Set external client infos
96 double res
= conf
.get_val
<double>(
97 "osd_mclock_scheduler_client_res");
98 double lim
= conf
.get_val
<double>(
99 "osd_mclock_scheduler_client_lim");
100 uint64_t wgt
= conf
.get_val
<uint64_t>(
101 "osd_mclock_scheduler_client_wgt");
102 default_external_client_info
.update(
107 // Set background recovery client infos
108 res
= conf
.get_val
<double>(
109 "osd_mclock_scheduler_background_recovery_res");
110 lim
= conf
.get_val
<double>(
111 "osd_mclock_scheduler_background_recovery_lim");
112 wgt
= conf
.get_val
<uint64_t>(
113 "osd_mclock_scheduler_background_recovery_wgt");
114 internal_client_infos
[
115 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
120 // Set background best effort client infos
121 res
= conf
.get_val
<double>(
122 "osd_mclock_scheduler_background_best_effort_res");
123 lim
= conf
.get_val
<double>(
124 "osd_mclock_scheduler_background_best_effort_lim");
125 wgt
= conf
.get_val
<uint64_t>(
126 "osd_mclock_scheduler_background_best_effort_wgt");
127 internal_client_infos
[
128 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
134 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_external_client(
135 const client_profile_id_t
&client
) const
137 auto ret
= external_client_infos
.find(client
);
138 if (ret
== external_client_infos
.end())
139 return &default_external_client_info
;
141 return &(ret
->second
);
144 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_info(
145 const scheduler_id_t
&id
) const {
146 switch (id
.class_id
) {
147 case op_scheduler_class::immediate
:
148 ceph_assert(0 == "Cannot schedule immediate");
149 return (dmc::ClientInfo
*)nullptr;
150 case op_scheduler_class::client
:
151 return get_external_client(id
.client_profile_id
);
153 ceph_assert(static_cast<size_t>(id
.class_id
) < internal_client_infos
.size());
154 return &internal_client_infos
[static_cast<size_t>(id
.class_id
)];
158 void mClockScheduler::set_osd_capacity_params_from_config()
160 uint64_t osd_bandwidth_capacity
;
161 double osd_iop_capacity
;
163 std::tie(osd_bandwidth_capacity
, osd_iop_capacity
) = [&, this] {
165 return std::make_tuple(
166 cct
->_conf
.get_val
<Option::size_t>(
167 "osd_mclock_max_sequential_bandwidth_hdd"),
168 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_hdd"));
170 return std::make_tuple(
171 cct
->_conf
.get_val
<Option::size_t>(
172 "osd_mclock_max_sequential_bandwidth_ssd"),
173 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_ssd"));
177 osd_bandwidth_capacity
= std::max
<uint64_t>(1, osd_bandwidth_capacity
);
178 osd_iop_capacity
= std::max
<double>(1.0, osd_iop_capacity
);
180 osd_bandwidth_cost_per_io
=
181 static_cast<double>(osd_bandwidth_capacity
) / osd_iop_capacity
;
182 osd_bandwidth_capacity_per_shard
= static_cast<double>(osd_bandwidth_capacity
)
183 / static_cast<double>(num_shards
);
185 dout(1) << __func__
<< ": osd_bandwidth_cost_per_io: "
186 << std::fixed
<< std::setprecision(2)
187 << osd_bandwidth_cost_per_io
<< " bytes/io"
188 << ", osd_bandwidth_capacity_per_shard "
189 << osd_bandwidth_capacity_per_shard
<< " bytes/second"
/**
 * profile_t
 *
 * mclock profile -- 3 params for each of 3 client classes
 * 0 (min): specifies no minimum reservation
 * 0 (max): specifies no upper limit
 */
struct profile_t {
  struct client_config_t {
    double reservation;  // fraction of capacity reserved (0 = no minimum)
    uint64_t weight;     // unitless share weight
    double limit;        // fraction of capacity as upper bound (0 = no limit)
  };
  client_config_t client;
  client_config_t background_recovery;
  client_config_t background_best_effort;
};
211 static std::ostream
&operator<<(
212 std::ostream
&lhs
, const profile_t::client_config_t
&rhs
)
214 return lhs
<< "{res: " << rhs
.reservation
215 << ", wgt: " << rhs
.weight
216 << ", lim: " << rhs
.limit
220 static std::ostream
&operator<<(std::ostream
&lhs
, const profile_t
&rhs
)
222 return lhs
<< "[client: " << rhs
.client
223 << ", background_recovery: " << rhs
.background_recovery
224 << ", background_best_effort: " << rhs
.background_best_effort
228 void mClockScheduler::set_config_defaults_from_profile()
230 // Let only a single osd shard (id:0) set the profile configs
239 * reservation: 60% | weight: 2 | limit: 0 (max) |
240 * Background Recovery Allocation:
241 * reservation: 40% | weight: 1 | limit: 0 (max) |
242 * Background Best Effort Allocation:
243 * reservation: 0 (min) | weight: 1 | limit: 70% |
245 static constexpr profile_t high_client_ops_profile
{
255 * reservation: 30% | weight: 1 | limit: 0 (max) |
256 * Background Recovery Allocation:
257 * reservation: 70% | weight: 2 | limit: 0 (max) |
258 * Background Best Effort Allocation:
259 * reservation: 0 (min) | weight: 1 | limit: 0 (max) |
261 static constexpr profile_t high_recovery_ops_profile
{
271 * reservation: 50% | weight: 1 | limit: 0 (max) |
272 * Background Recovery Allocation:
273 * reservation: 50% | weight: 1 | limit: 0 (max) |
274 * Background Best Effort Allocation:
275 * reservation: 0 (min) | weight: 1 | limit: 90% |
277 static constexpr profile_t balanced_profile
{
283 const profile_t
*profile
= nullptr;
284 auto mclock_profile
= cct
->_conf
.get_val
<std::string
>("osd_mclock_profile");
285 if (mclock_profile
== "high_client_ops") {
286 profile
= &high_client_ops_profile
;
287 dout(10) << "Setting high_client_ops profile " << *profile
<< dendl
;
288 } else if (mclock_profile
== "high_recovery_ops") {
289 profile
= &high_recovery_ops_profile
;
290 dout(10) << "Setting high_recovery_ops profile " << *profile
<< dendl
;
291 } else if (mclock_profile
== "balanced") {
292 profile
= &balanced_profile
;
293 dout(10) << "Setting balanced profile " << *profile
<< dendl
;
294 } else if (mclock_profile
== "custom") {
295 dout(10) << "Profile set to custom, not setting defaults" << dendl
;
298 derr
<< "Invalid mclock profile: " << mclock_profile
<< dendl
;
299 ceph_assert("Invalid choice of mclock profile" == 0);
302 ceph_assert(nullptr != profile
);
304 auto set_config
= [&conf
= cct
->_conf
](const char *key
, auto val
) {
305 conf
.set_val_default(key
, std::to_string(val
));
308 set_config("osd_mclock_scheduler_client_res", profile
->client
.reservation
);
309 set_config("osd_mclock_scheduler_client_wgt", profile
->client
.weight
);
310 set_config("osd_mclock_scheduler_client_lim", profile
->client
.limit
);
313 "osd_mclock_scheduler_background_recovery_res",
314 profile
->background_recovery
.reservation
);
316 "osd_mclock_scheduler_background_recovery_wgt",
317 profile
->background_recovery
.weight
);
319 "osd_mclock_scheduler_background_recovery_lim",
320 profile
->background_recovery
.limit
);
323 "osd_mclock_scheduler_background_best_effort_res",
324 profile
->background_best_effort
.reservation
);
326 "osd_mclock_scheduler_background_best_effort_wgt",
327 profile
->background_best_effort
.weight
);
329 "osd_mclock_scheduler_background_best_effort_lim",
330 profile
->background_best_effort
.limit
);
332 cct
->_conf
.apply_changes(nullptr);
335 uint32_t mClockScheduler::calc_scaled_cost(int item_cost
)
337 auto cost
= static_cast<uint32_t>(
339 1, // ensure cost is non-zero and positive
341 auto cost_per_io
= static_cast<uint32_t>(osd_bandwidth_cost_per_io
);
343 return std::max
<uint32_t>(cost
, cost_per_io
);
346 void mClockScheduler::update_configuration()
348 // Apply configuration change. The expectation is that
349 // at least one of the tracked mclock config option keys
350 // is modified before calling this method.
351 cct
->_conf
.apply_changes(nullptr);
354 void mClockScheduler::dump(ceph::Formatter
&f
) const
356 // Display queue sizes
357 f
.open_object_section("queue_sizes");
358 f
.dump_int("high_priority_queue", high_priority
.size());
359 f
.dump_int("scheduler", scheduler
.request_count());
362 // client map and queue tops (res, wgt, lim)
363 std::ostringstream out
;
364 f
.open_object_section("mClockClients");
365 f
.dump_int("client_count", scheduler
.client_count());
367 f
.dump_string("clients", out
.str());
370 // Display sorted queues (res, wgt, lim)
371 f
.open_object_section("mClockQueues");
372 f
.dump_string("queues", display_queues());
375 f
.open_object_section("HighPriorityQueue");
376 for (auto it
= high_priority
.begin();
377 it
!= high_priority
.end(); it
++) {
378 f
.dump_int("priority", it
->first
);
379 f
.dump_int("queue_size", it
->second
.size());
384 void mClockScheduler::enqueue(OpSchedulerItem
&& item
)
386 auto id
= get_scheduler_id(item
);
387 unsigned priority
= item
.get_priority();
389 // TODO: move this check into OpSchedulerItem, handle backwards compat
390 if (op_scheduler_class::immediate
== id
.class_id
) {
391 enqueue_high(immediate_class_priority
, std::move(item
));
392 } else if (priority
>= cutoff_priority
) {
393 enqueue_high(priority
, std::move(item
));
395 auto cost
= calc_scaled_cost(item
.get_cost());
396 item
.set_qos_cost(cost
);
397 dout(20) << __func__
<< " " << id
398 << " item_cost: " << item
.get_cost()
399 << " scaled_cost: " << cost
402 // Add item to scheduler queue
403 scheduler
.add_request(
409 dout(20) << __func__
<< " client_count: " << scheduler
.client_count()
410 << " queue_sizes: [ "
411 << " high_priority_queue: " << high_priority
.size()
412 << " sched: " << scheduler
.request_count() << " ]"
414 dout(30) << __func__
<< " mClockClients: "
417 dout(30) << __func__
<< " mClockQueues: { "
418 << display_queues() << " }"
422 void mClockScheduler::enqueue_front(OpSchedulerItem
&& item
)
424 unsigned priority
= item
.get_priority();
425 auto id
= get_scheduler_id(item
);
427 if (op_scheduler_class::immediate
== id
.class_id
) {
428 enqueue_high(immediate_class_priority
, std::move(item
), true);
429 } else if (priority
>= cutoff_priority
) {
430 enqueue_high(priority
, std::move(item
), true);
432 // mClock does not support enqueue at front, so we use
433 // the high queue with priority 0
434 enqueue_high(0, std::move(item
), true);
438 void mClockScheduler::enqueue_high(unsigned priority
,
439 OpSchedulerItem
&& item
,
443 high_priority
[priority
].push_back(std::move(item
));
445 high_priority
[priority
].push_front(std::move(item
));
449 WorkItem
mClockScheduler::dequeue()
451 if (!high_priority
.empty()) {
452 auto iter
= high_priority
.begin();
453 // invariant: high_priority entries are never empty
454 assert(!iter
->second
.empty());
455 WorkItem ret
{std::move(iter
->second
.back())};
456 iter
->second
.pop_back();
457 if (iter
->second
.empty()) {
458 // maintain invariant, high priority entries are never empty
459 high_priority
.erase(iter
);
461 ceph_assert(std::get_if
<OpSchedulerItem
>(&ret
));
464 mclock_queue_t::PullReq result
= scheduler
.pull_request();
465 if (result
.is_future()) {
466 return result
.getTime();
467 } else if (result
.is_none()) {
469 0 == "Impossible, must have checked empty() first");
472 ceph_assert(result
.is_retn());
474 auto &retn
= result
.get_retn();
475 return std::move(*retn
.request
);
480 std::string
mClockScheduler::display_queues() const
482 std::ostringstream out
;
483 scheduler
.display_queues(out
);
487 const char** mClockScheduler::get_tracked_conf_keys() const
489 static const char* KEYS
[] = {
490 "osd_mclock_scheduler_client_res",
491 "osd_mclock_scheduler_client_wgt",
492 "osd_mclock_scheduler_client_lim",
493 "osd_mclock_scheduler_background_recovery_res",
494 "osd_mclock_scheduler_background_recovery_wgt",
495 "osd_mclock_scheduler_background_recovery_lim",
496 "osd_mclock_scheduler_background_best_effort_res",
497 "osd_mclock_scheduler_background_best_effort_wgt",
498 "osd_mclock_scheduler_background_best_effort_lim",
499 "osd_mclock_max_capacity_iops_hdd",
500 "osd_mclock_max_capacity_iops_ssd",
501 "osd_mclock_max_sequential_bandwidth_hdd",
502 "osd_mclock_max_sequential_bandwidth_ssd",
503 "osd_mclock_profile",
509 void mClockScheduler::handle_conf_change(
510 const ConfigProxy
& conf
,
511 const std::set
<std::string
> &changed
)
513 if (changed
.count("osd_mclock_max_capacity_iops_hdd") ||
514 changed
.count("osd_mclock_max_capacity_iops_ssd")) {
515 set_osd_capacity_params_from_config();
516 client_registry
.update_from_config(
517 conf
, osd_bandwidth_capacity_per_shard
);
519 if (changed
.count("osd_mclock_max_sequential_bandwidth_hdd") ||
520 changed
.count("osd_mclock_max_sequential_bandwidth_ssd")) {
521 set_osd_capacity_params_from_config();
522 client_registry
.update_from_config(
523 conf
, osd_bandwidth_capacity_per_shard
);
525 if (changed
.count("osd_mclock_profile")) {
526 set_config_defaults_from_profile();
527 client_registry
.update_from_config(
528 conf
, osd_bandwidth_capacity_per_shard
);
531 auto get_changed_key
= [&changed
]() -> std::optional
<std::string
> {
532 static const std::vector
<std::string
> qos_params
= {
533 "osd_mclock_scheduler_client_res",
534 "osd_mclock_scheduler_client_wgt",
535 "osd_mclock_scheduler_client_lim",
536 "osd_mclock_scheduler_background_recovery_res",
537 "osd_mclock_scheduler_background_recovery_wgt",
538 "osd_mclock_scheduler_background_recovery_lim",
539 "osd_mclock_scheduler_background_best_effort_res",
540 "osd_mclock_scheduler_background_best_effort_wgt",
541 "osd_mclock_scheduler_background_best_effort_lim"
544 for (auto &qp
: qos_params
) {
545 if (changed
.count(qp
)) {
552 if (auto key
= get_changed_key(); key
.has_value()) {
553 auto mclock_profile
= cct
->_conf
.get_val
<std::string
>("osd_mclock_profile");
554 if (mclock_profile
== "custom") {
555 client_registry
.update_from_config(
556 conf
, osd_bandwidth_capacity_per_shard
);
558 // Attempt to change QoS parameter for a built-in profile. Restore the
559 // profile defaults by making one of the OSD shards remove the key from
560 // config monitor store. Note: monc is included in the check since the
561 // mock unit test currently doesn't initialize it.
562 if (shard_id
== 0 && monc
) {
563 static const std::vector
<std::string
> osds
= {
565 "osd." + std::to_string(whoami
)
568 for (auto osd
: osds
) {
571 "\"prefix\": \"config rm\", "
572 "\"who\": \"" + osd
+ "\", "
573 "\"name\": \"" + *key
+ "\""
575 std::vector
<std::string
> vcmd
{cmd
};
577 dout(10) << __func__
<< " Removing Key: " << *key
578 << " for " << osd
<< " from Mon db" << dendl
;
579 monc
->start_mon_command(vcmd
, {}, nullptr, nullptr, nullptr);
583 // Alternatively, the QoS parameter, if set ephemerally for this OSD via
584 // the 'daemon' or 'tell' interfaces must be removed.
585 if (!cct
->_conf
.rm_val(*key
)) {
586 dout(10) << __func__
<< " Restored " << *key
<< " to default" << dendl
;
587 cct
->_conf
.apply_changes(nullptr);
592 mClockScheduler::~mClockScheduler()
594 cct
->_conf
.remove_observer(this);