1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2016 Red Hat Inc.
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
19 #include "osd/scheduler/mClockScheduler.h"
20 #include "common/dout.h"
// Shorthand for the dmclock library namespace (used below as dmc::ClientInfo).
22 namespace dmc
= crimson::dmclock
;
// NOTE(review): presumably required for the placeholder arguments of the
// std::bind call in the constructor; those argument lines are not visible in
// this extract -- confirm before removing.
23 using namespace std::placeholders
;
// Logging setup for dout(): `cct` is the logging context, messages are filed
// under the mclock subsystem and each one starts with "mClockScheduler: ".
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_mclock
28 #define dout_prefix *_dout << "mClockScheduler: "
31 namespace ceph::osd::scheduler
{
// Constructor.
//
// Visible steps (several initializer-list and body lines are missing from this
// extract, so this list may be incomplete):
//  - stores num_shards and is_rotational;
//  - binds ClientRegistry::get_info (the bound arguments are not visible here);
//  - reads "osd_mclock_scheduler_anticipation_timeout" from the config;
//  - registers this object as a config observer;
//  - derives the capacity and the per-io / per-byte costs from the config;
//  - applies the mclock profile settings and loads the client registry from
//    the config.
33 mClockScheduler::mClockScheduler(CephContext
*cct
,
37 num_shards(num_shards
),
38 is_rotational(is_rotational
),
40 std::bind(&mClockScheduler::ClientRegistry::get_info
,
44 cct
->_conf
.get_val
<double>("osd_mclock_scheduler_anticipation_timeout"))
// Paired with remove_observer(this) in the destructor.
46 cct
->_conf
.add_observer(this);
// num_shards is used as a divisor in set_max_osd_capacity(), so it must be
// positive before that call.
47 ceph_assert(num_shards
> 0);
48 set_max_osd_capacity();
49 set_osd_mclock_cost_per_io();
50 set_osd_mclock_cost_per_byte();
52 enable_mclock_profile_settings();
53 client_registry
.update_from_config(cct
->_conf
);
// Reload the QoS parameters held by the registry from `conf`.
//
// Three entries are refreshed, each with a (res, wgt, lim) triple read as
// uint64_t config values:
//  - default_external_client_info            <- osd_mclock_scheduler_client_*
//  - internal_client_infos[background_recovery]
//                                  <- osd_mclock_scheduler_background_recovery_*
//  - internal_client_infos[background_best_effort]
//                               <- osd_mclock_scheduler_background_best_effort_*
56 void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy
&conf
)
58 default_external_client_info
.update(
59 conf
.get_val
<uint64_t>("osd_mclock_scheduler_client_res"),
60 conf
.get_val
<uint64_t>("osd_mclock_scheduler_client_wgt"),
61 conf
.get_val
<uint64_t>("osd_mclock_scheduler_client_lim"));
// Internal clients are stored in an array indexed by op_scheduler_class.
63 internal_client_infos
[
64 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
65 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
66 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
67 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
69 internal_client_infos
[
70 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
71 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
72 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
73 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
// Look up the QoS info for an external client profile.
//
// Returns a pointer to the entry stored in external_client_infos for `client`,
// or a pointer to default_external_client_info when no such entry exists.
// The returned pointer refers to registry-owned storage; it is never null on
// the paths visible here.
76 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_external_client(
77 const client_profile_id_t
&client
) const
79 auto ret
= external_client_infos
.find(client
);
// Unknown profile: fall back to the shared default.
80 if (ret
== external_client_infos
.end())
81 return &default_external_client_info
;
83 return &(ret
->second
);
// Resolve a scheduler id to the dmclock ClientInfo that should govern it.
//
//  - op_scheduler_class::immediate: asserts ("Cannot schedule immediate");
//    the null return after the assert is only a fallback.
//  - op_scheduler_class::client: delegates to get_external_client() using
//    id.client_profile_id.
//  - remaining classes: returns the internal_client_infos entry indexed by
//    id.class_id, after asserting that the index is within bounds.
//    NOTE(review): the label introducing this last part is not visible in
//    this extract (presumably `default:`) -- confirm against the full file.
86 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_info(
87 const scheduler_id_t
&id
) const {
88 switch (id
.class_id
) {
89 case op_scheduler_class::immediate
:
90 ceph_assert(0 == "Cannot schedule immediate");
91 return (dmc::ClientInfo
*)nullptr;
92 case op_scheduler_class::client
:
93 return get_external_client(id
.client_profile_id
);
95 ceph_assert(static_cast<size_t>(id
.class_id
) < internal_client_infos
.size());
96 return &internal_client_infos
[static_cast<size_t>(id
.class_id
)];
// Recompute max_osd_capacity (IOPS budget per op shard).
//
// Visible behaviour: the config options "osd_mclock_max_capacity_iops_hdd" and
// "osd_mclock_max_capacity_iops_ssd" are read as doubles, max_osd_capacity is
// divided by num_shards, and the result is logged at level 1.
// NOTE(review): the lines that choose between the hdd and ssd value and assign
// it to max_osd_capacity are missing from this extract; presumably the choice
// is made on is_rotational -- confirm against the full file.
100 void mClockScheduler::set_max_osd_capacity()
104 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_hdd");
107 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_ssd");
109 // Set per op-shard iops limit
110 max_osd_capacity
/= num_shards
;
111 dout(1) << __func__
<< " #op shards: " << num_shards
112 << std::fixed
<< std::setprecision(2)
113 << " max osd capacity(iops) per shard: " << max_osd_capacity
// Recompute osd_mclock_cost_per_io from the config and log it at level 1.
//
// Visible behaviour:
//  - if "osd_mclock_cost_per_io_usec" is non-zero it is used as the cost; no
//    unit conversion is visible for this override;
//  - otherwise the device-specific option is used:
//      "osd_mclock_cost_per_io_usec_hdd" divided by
//        std::chrono::microseconds(1s).count()  (1,000,000: usec -> sec)
//      "osd_mclock_cost_per_io_usec_ssd" divided by
//        std::chrono::milliseconds(1s).count()  (1,000: usec -> msec)
// NOTE(review): the lines selecting between the hdd and ssd paths are missing
// from this extract (presumably a test on is_rotational). Also note that the
// three paths leave the value in three different units -- confirm this is
// intended.
117 void mClockScheduler::set_osd_mclock_cost_per_io()
// One second, used below only to derive the unit-conversion divisors.
119 std::chrono::seconds
sec(1);
120 if (cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec")) {
121 osd_mclock_cost_per_io
=
122 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec");
125 osd_mclock_cost_per_io
=
126 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec_hdd");
127 // For HDDs, convert value to seconds
128 osd_mclock_cost_per_io
/= std::chrono::microseconds(sec
).count();
130 // For SSDs, convert value to milliseconds
131 osd_mclock_cost_per_io
=
132 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec_ssd");
133 osd_mclock_cost_per_io
/= std::chrono::milliseconds(sec
).count();
136 dout(1) << __func__
<< " osd_mclock_cost_per_io: "
137 << std::fixed
<< std::setprecision(7) << osd_mclock_cost_per_io
// Recompute osd_mclock_cost_per_byte from the config and log it at level 1.
//
// Visible behaviour (same shape as set_osd_mclock_cost_per_io()):
//  - if "osd_mclock_cost_per_byte_usec" is non-zero it is used as the cost; no
//    unit conversion is visible for this override;
//  - otherwise the device-specific option is used:
//      "osd_mclock_cost_per_byte_usec_hdd" divided by
//        std::chrono::microseconds(1s).count()  (1,000,000: usec -> sec)
//      "osd_mclock_cost_per_byte_usec_ssd" divided by
//        std::chrono::milliseconds(1s).count()  (1,000: usec -> msec)
// NOTE(review): the lines selecting between the hdd and ssd paths are missing
// from this extract (presumably a test on is_rotational) -- confirm.
141 void mClockScheduler::set_osd_mclock_cost_per_byte()
// One second, used below only to derive the unit-conversion divisors.
143 std::chrono::seconds
sec(1);
144 if (cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec")) {
145 osd_mclock_cost_per_byte
=
146 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec");
149 osd_mclock_cost_per_byte
=
150 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec_hdd");
151 // For HDDs, convert value to seconds
152 osd_mclock_cost_per_byte
/= std::chrono::microseconds(sec
).count();
154 osd_mclock_cost_per_byte
=
155 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec_ssd");
156 // For SSDs, convert value to milliseconds
157 osd_mclock_cost_per_byte
/= std::chrono::milliseconds(sec
).count();
160 dout(1) << __func__
<< " osd_mclock_cost_per_byte: "
161 << std::fixed
<< std::setprecision(7) << osd_mclock_cost_per_byte
// Cache the "osd_mclock_profile" config value (a string) in mclock_profile and
// log the selected profile at level 1. No validation happens here; the profile
// name is checked in enable_mclock_profile_settings().
165 void mClockScheduler::set_mclock_profile()
167 mclock_profile
= cct
->_conf
.get_val
<std::string
>("osd_mclock_profile");
168 dout(1) << __func__
<< " mclock profile: " << mclock_profile
<< dendl
;
// Return (by value) the profile name last cached by set_mclock_profile().
171 std::string
mClockScheduler::get_mclock_profile()
173 return mclock_profile
;
// Compute the (res, wgt, lim) allocations for the "balanced" profile.
//
// Reservations and limits are fractions of max_osd_capacity rounded to the
// nearest integer; default_min / default_max are used where the table below
// says "1" / "max".
// NOTE(review): the receiver and the arguments of the three update() calls at
// the end are missing from this extract; presumably they store the values
// computed here into client_allocs (which set_profile_config() reads) --
// confirm against the full file.
176 void mClockScheduler::set_balanced_profile_allocations()
178 // Client Allocation:
179 // reservation: 40% | weight: 1 | limit: 100% |
180 // Background Recovery Allocation:
181 // reservation: 40% | weight: 1 | limit: 150% |
182 // Background Best Effort Allocation:
183 // reservation: 20% | weight: 2 | limit: max |
186 uint64_t client_res
= static_cast<uint64_t>(
187 std::round(0.40 * max_osd_capacity
));
188 uint64_t client_lim
= static_cast<uint64_t>(
189 std::round(max_osd_capacity
));
190 uint64_t client_wgt
= default_min
;
192 // Background Recovery
193 uint64_t rec_res
= static_cast<uint64_t>(
194 std::round(0.40 * max_osd_capacity
));
195 uint64_t rec_lim
= static_cast<uint64_t>(
196 std::round(1.5 * max_osd_capacity
));
197 uint64_t rec_wgt
= default_min
;
199 // Background Best Effort
200 uint64_t best_effort_res
= static_cast<uint64_t>(
201 std::round(0.20 * max_osd_capacity
));
202 uint64_t best_effort_lim
= default_max
;
203 uint64_t best_effort_wgt
= 2;
205 // Set the allocations for the mclock clients
207 static_cast<size_t>(op_scheduler_class::client
)].update(
212 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
217 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
// Compute the (res, wgt, lim) allocations for the "high_recovery_ops" profile,
// which gives background recovery the largest reservation (60%) and limit
// (200%) of the three classes.
//
// Reservations and limits are fractions of max_osd_capacity rounded to the
// nearest integer; default_min / default_max are used where the table below
// says "1" / "max".
// NOTE(review): the receiver and the arguments of the three update() calls at
// the end are missing from this extract; presumably they store the values
// computed here into client_allocs -- confirm against the full file.
223 void mClockScheduler::set_high_recovery_ops_profile_allocations()
225 // Client Allocation:
226 // reservation: 30% | weight: 1 | limit: 80% |
227 // Background Recovery Allocation:
228 // reservation: 60% | weight: 2 | limit: 200% |
229 // Background Best Effort Allocation:
230 // reservation: 1 | weight: 2 | limit: max |
233 uint64_t client_res
= static_cast<uint64_t>(
234 std::round(0.30 * max_osd_capacity
));
235 uint64_t client_lim
= static_cast<uint64_t>(
236 std::round(0.80 * max_osd_capacity
));
237 uint64_t client_wgt
= default_min
;
239 // Background Recovery
240 uint64_t rec_res
= static_cast<uint64_t>(
241 std::round(0.60 * max_osd_capacity
));
242 uint64_t rec_lim
= static_cast<uint64_t>(
243 std::round(2.0 * max_osd_capacity
));
244 uint64_t rec_wgt
= 2;
246 // Background Best Effort
247 uint64_t best_effort_res
= default_min
;
248 uint64_t best_effort_lim
= default_max
;
249 uint64_t best_effort_wgt
= 2;
251 // Set the allocations for the mclock clients
253 static_cast<size_t>(op_scheduler_class::client
)].update(
258 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
263 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
// Compute the (res, wgt, lim) allocations for the "high_client_ops" profile,
// which gives client ops the largest reservation (50%) and an uncapped limit.
//
// Reservations and limits are fractions of max_osd_capacity rounded to the
// nearest integer; default_min / default_max are used where the table below
// says "1" / "max".
// NOTE(review): the receiver and the arguments of the three update() calls at
// the end are missing from this extract; presumably they store the values
// computed here into client_allocs -- confirm against the full file.
269 void mClockScheduler::set_high_client_ops_profile_allocations()
271 // Client Allocation:
272 // reservation: 50% | weight: 2 | limit: max |
273 // Background Recovery Allocation:
274 // reservation: 25% | weight: 1 | limit: 100% |
275 // Background Best Effort Allocation:
276 // reservation: 25% | weight: 2 | limit: max |
279 uint64_t client_res
= static_cast<uint64_t>(
280 std::round(0.50 * max_osd_capacity
));
281 uint64_t client_wgt
= 2;
282 uint64_t client_lim
= default_max
;
284 // Background Recovery
285 uint64_t rec_res
= static_cast<uint64_t>(
286 std::round(0.25 * max_osd_capacity
));
287 uint64_t rec_lim
= static_cast<uint64_t>(
288 std::round(max_osd_capacity
));
289 uint64_t rec_wgt
= default_min
;
291 // Background Best Effort
292 uint64_t best_effort_res
= static_cast<uint64_t>(
293 std::round(0.25 * max_osd_capacity
));
294 uint64_t best_effort_lim
= default_max
;
295 uint64_t best_effort_wgt
= 2;
297 // Set the allocations for the mclock clients
299 static_cast<size_t>(op_scheduler_class::client
)].update(
304 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
309 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
// Apply the allocations of the profile named in mclock_profile.
//
//  - "custom": nothing to do (the statement inside this branch is not visible
//    in this extract; presumably an early return -- confirm).
//  - "balanced" / "high_recovery_ops" / "high_client_ops": call the matching
//    set_*_profile_allocations() helper.
//  - anything else: assertion failure ("Invalid choice of mclock profile").
// Afterwards set_profile_config() pushes the allocations into the config.
315 void mClockScheduler::enable_mclock_profile_settings()
317 // Nothing to do for "custom" profile
318 if (mclock_profile
== "custom") {
322 // Set mclock and ceph config options for the chosen profile
323 if (mclock_profile
== "balanced") {
324 set_balanced_profile_allocations();
325 } else if (mclock_profile
== "high_recovery_ops") {
326 set_high_recovery_ops_profile_allocations();
327 } else if (mclock_profile
== "high_client_ops") {
328 set_high_client_ops_profile_allocations();
330 ceph_assert("Invalid choice of mclock profile" == 0);
334 // Set the mclock config parameters
335 set_profile_config();
// Publish the allocations stored in client_allocs to the config.
//
// For each of the three classes (client, background_recovery,
// background_best_effort) the res / wgt / lim fields are converted with
// std::to_string and written with cct->_conf.set_val() to the matching
// "osd_mclock_scheduler_<class>_{res,wgt,lim}" option, and the triple is
// logged at level 10.
// Only set_val() is called here; no apply step is visible in this function
// (see update_configuration()).
338 void mClockScheduler::set_profile_config()
// Local copies of the per-class allocations.
340 ClientAllocs client
= client_allocs
[
341 static_cast<size_t>(op_scheduler_class::client
)];
342 ClientAllocs rec
= client_allocs
[
343 static_cast<size_t>(op_scheduler_class::background_recovery
)];
344 ClientAllocs best_effort
= client_allocs
[
345 static_cast<size_t>(op_scheduler_class::background_best_effort
)];
347 // Set external client params
348 cct
->_conf
.set_val("osd_mclock_scheduler_client_res",
349 std::to_string(client
.res
));
350 cct
->_conf
.set_val("osd_mclock_scheduler_client_wgt",
351 std::to_string(client
.wgt
));
352 cct
->_conf
.set_val("osd_mclock_scheduler_client_lim",
353 std::to_string(client
.lim
));
354 dout(10) << __func__
<< " client QoS params: " << "["
355 << client
.res
<< "," << client
.wgt
<< "," << client
.lim
358 // Set background recovery client params
359 cct
->_conf
.set_val("osd_mclock_scheduler_background_recovery_res",
360 std::to_string(rec
.res
));
361 cct
->_conf
.set_val("osd_mclock_scheduler_background_recovery_wgt",
362 std::to_string(rec
.wgt
));
363 cct
->_conf
.set_val("osd_mclock_scheduler_background_recovery_lim",
364 std::to_string(rec
.lim
));
365 dout(10) << __func__
<< " Recovery QoS params: " << "["
366 << rec
.res
<< "," << rec
.wgt
<< "," << rec
.lim
369 // Set background best effort client params
370 cct
->_conf
.set_val("osd_mclock_scheduler_background_best_effort_res",
371 std::to_string(best_effort
.res
));
372 cct
->_conf
.set_val("osd_mclock_scheduler_background_best_effort_wgt",
373 std::to_string(best_effort
.wgt
));
374 cct
->_conf
.set_val("osd_mclock_scheduler_background_best_effort_lim",
375 std::to_string(best_effort
.lim
));
376 dout(10) << __func__
<< " Best effort QoS params: " << "["
377 << best_effort
.res
<< "," << best_effort
.wgt
<< "," << best_effort
.lim
// Convert an item's raw cost into the cost handed to mclock.
//
// Computes round(osd_mclock_cost_per_io + osd_mclock_cost_per_byte * item_cost)
// and returns it clamped to a minimum of 1, so the result is never zero or
// negative. (The declaration of scaled_cost is on a line missing from this
// extract.)
// NOTE(review): the comment below says "secs", but set_osd_mclock_cost_per_io()
// and set_osd_mclock_cost_per_byte() divide the SSD values by 1,000 (usec ->
// msec) and only the HDD values by 1,000,000 (usec -> sec) -- confirm the
// intended unit.
381 int mClockScheduler::calc_scaled_cost(int item_cost
)
383 // Calculate total scaled cost in secs
385 std::round(osd_mclock_cost_per_io
+ (osd_mclock_cost_per_byte
* item_cost
));
386 return std::max(scaled_cost
, 1);
// Ask the config proxy to apply pending changes by calling
// cct->_conf.apply_changes(nullptr). See the precondition in the comment below.
389 void mClockScheduler::update_configuration()
391 // Apply configuration change. The expectation is that
392 // at least one of the tracked mclock config option keys
393 // is modified before calling this method.
394 cct
->_conf
.apply_changes(nullptr);
// Write the scheduler state to formatter `f`.
//
// Sections opened here:
//  - "queue_sizes":   "immediate" (immediate.size()) and
//                     "scheduler" (scheduler.request_count())
//  - "mClockClients": "client_count" (scheduler.client_count()) and
//                     "clients" (contents of the local stream `out`)
//  - "mClockQueues":  "queues" (display_queues())
// NOTE(review): the line that fills `out` and the close_section calls are not
// visible in this extract -- confirm against the full file.
397 void mClockScheduler::dump(ceph::Formatter
&f
) const
399 // Display queue sizes
400 f
.open_object_section("queue_sizes");
401 f
.dump_int("immediate", immediate
.size());
402 f
.dump_int("scheduler", scheduler
.request_count());
405 // client map and queue tops (res, wgt, lim)
406 std::ostringstream out
;
407 f
.open_object_section("mClockClients");
408 f
.dump_int("client_count", scheduler
.client_count());
410 f
.dump_string("clients", out
.str());
413 // Display sorted queues (res, wgt, lim)
414 f
.open_object_section("mClockQueues");
415 f
.dump_string("queues", display_queues());
// Queue an item for later dequeue().
//
//  - Items whose scheduler class is `immediate` bypass mclock: they are pushed
//    to the FRONT of the `immediate` container. dequeue() takes from the back,
//    so these come out in arrival order.
//  - Items of any other class get a scaled cost from
//    calc_scaled_cost(item.get_cost()), stored on the item with set_qos_cost(),
//    and are handed to scheduler.add_request() (its argument lines are not
//    visible in this extract).
// Queue state is then logged at levels 20 and 30.
419 void mClockScheduler::enqueue(OpSchedulerItem
&& item
)
421 auto id
= get_scheduler_id(item
);
423 // TODO: move this check into OpSchedulerItem, handle backwards compat
424 if (op_scheduler_class::immediate
== id
.class_id
) {
425 immediate
.push_front(std::move(item
));
427 int cost
= calc_scaled_cost(item
.get_cost());
428 item
.set_qos_cost(cost
);
429 dout(20) << __func__
<< " " << id
430 << " item_cost: " << item
.get_cost()
431 << " scaled_cost: " << cost
434 // Add item to scheduler queue
435 scheduler
.add_request(
441 dout(20) << __func__
<< " client_count: " << scheduler
.client_count()
442 << " queue_sizes: [ imm: " << immediate
.size()
443 << " sched: " << scheduler
.request_count() << " ]"
445 dout(30) << __func__
<< " mClockClients: "
448 dout(30) << __func__
<< " mClockQueues: { "
449 << display_queues() << " }"
// Put an item back so that it is the next one returned by dequeue().
//
// The item is pushed to the BACK of `immediate`, which is the end dequeue()
// reads from. The item's scheduler class is not consulted; it is always
// treated as immediate (see the TODO below).
453 void mClockScheduler::enqueue_front(OpSchedulerItem
&& item
)
455 immediate
.push_back(std::move(item
));
456 // TODO: item may not be immediate, update mclock machinery to permit
457 // putting the item back in the queue
// Produce the next unit of work.
//
//  - If `immediate` is non-empty, its back element is moved out and popped.
//    (The return of that element is on a line missing from this extract.)
//  - Otherwise a request is pulled from the mclock scheduler:
//      is_future(): returns result.getTime() instead of an item;
//      is_none():   assertion failure -- the message states the caller must
//                   have checked empty() first;
//      otherwise:   asserts is_retn() and returns the request moved out of the
//                   pull result.
460 WorkItem
mClockScheduler::dequeue()
462 if (!immediate
.empty()) {
463 WorkItem work_item
{std::move(immediate
.back())};
464 immediate
.pop_back();
467 mclock_queue_t::PullReq result
= scheduler
.pull_request();
468 if (result
.is_future()) {
469 return result
.getTime();
470 } else if (result
.is_none()) {
472 0 == "Impossible, must have checked empty() first");
475 ceph_assert(result
.is_retn());
477 auto &retn
= result
.get_retn();
478 return std::move(*retn
.request
);
// Render the scheduler's queues as text by letting scheduler.display_queues()
// write into a local string stream.
// NOTE(review): the return statement is not visible in this extract;
// presumably it returns out.str() -- confirm.
483 std::string
mClockScheduler::display_queues() const
485 std::ostringstream out
;
486 scheduler
.display_queues(out
);
// Config option names this scheduler tracks; these are the same keys that
// handle_conf_change() tests for. The list has function-local static storage.
// NOTE(review): the array terminator and the return statement are not visible
// in this extract -- confirm against the full file. When adding a key here,
// add the matching handling in handle_conf_change().
490 const char** mClockScheduler::get_tracked_conf_keys() const
492 static const char* KEYS
[] = {
493 "osd_mclock_scheduler_client_res",
494 "osd_mclock_scheduler_client_wgt",
495 "osd_mclock_scheduler_client_lim",
496 "osd_mclock_scheduler_background_recovery_res",
497 "osd_mclock_scheduler_background_recovery_wgt",
498 "osd_mclock_scheduler_background_recovery_lim",
499 "osd_mclock_scheduler_background_best_effort_res",
500 "osd_mclock_scheduler_background_best_effort_wgt",
501 "osd_mclock_scheduler_background_best_effort_lim",
502 "osd_mclock_cost_per_io_usec",
503 "osd_mclock_cost_per_io_usec_hdd",
504 "osd_mclock_cost_per_io_usec_ssd",
505 "osd_mclock_cost_per_byte_usec",
506 "osd_mclock_cost_per_byte_usec_hdd",
507 "osd_mclock_cost_per_byte_usec_ssd",
508 "osd_mclock_max_capacity_iops_hdd",
509 "osd_mclock_max_capacity_iops_ssd",
510 "osd_mclock_profile",
// React to changes of the tracked config options listed in `changed`.
//
// The checks are independent and run in this order:
//  1. any osd_mclock_cost_per_io_usec* key    -> set_osd_mclock_cost_per_io()
//  2. any osd_mclock_cost_per_byte_usec* key  -> set_osd_mclock_cost_per_byte()
//  3. any osd_mclock_max_capacity_iops_* key  -> set_max_osd_capacity(); if the
//     profile is not "custom", re-derive the profile settings and reload the
//     client registry from `conf`
//  4. osd_mclock_profile                      -> set_mclock_profile(); if the
//     new profile is not "custom", enable its settings and reload the registry
//  5. any osd_mclock_scheduler_*_{res,wgt,lim} key -> reload the registry, but
//     only when the profile is "custom"; for the built-in profiles these
//     direct edits are not applied to the registry here.
516 void mClockScheduler::handle_conf_change(
517 const ConfigProxy
& conf
,
518 const std::set
<std::string
> &changed
)
520 if (changed
.count("osd_mclock_cost_per_io_usec") ||
521 changed
.count("osd_mclock_cost_per_io_usec_hdd") ||
522 changed
.count("osd_mclock_cost_per_io_usec_ssd")) {
523 set_osd_mclock_cost_per_io();
525 if (changed
.count("osd_mclock_cost_per_byte_usec") ||
526 changed
.count("osd_mclock_cost_per_byte_usec_hdd") ||
527 changed
.count("osd_mclock_cost_per_byte_usec_ssd")) {
528 set_osd_mclock_cost_per_byte();
// The built-in profiles are fractions of max_osd_capacity, so a capacity
// change has to be followed by a recomputation of the profile allocations.
530 if (changed
.count("osd_mclock_max_capacity_iops_hdd") ||
531 changed
.count("osd_mclock_max_capacity_iops_ssd")) {
532 set_max_osd_capacity();
533 if (mclock_profile
!= "custom") {
534 enable_mclock_profile_settings();
535 client_registry
.update_from_config(conf
);
538 if (changed
.count("osd_mclock_profile")) {
539 set_mclock_profile();
540 if (mclock_profile
!= "custom") {
541 enable_mclock_profile_settings();
542 client_registry
.update_from_config(conf
);
545 if (changed
.count("osd_mclock_scheduler_client_res") ||
546 changed
.count("osd_mclock_scheduler_client_wgt") ||
547 changed
.count("osd_mclock_scheduler_client_lim") ||
548 changed
.count("osd_mclock_scheduler_background_recovery_res") ||
549 changed
.count("osd_mclock_scheduler_background_recovery_wgt") ||
550 changed
.count("osd_mclock_scheduler_background_recovery_lim") ||
551 changed
.count("osd_mclock_scheduler_background_best_effort_res") ||
552 changed
.count("osd_mclock_scheduler_background_best_effort_wgt") ||
553 changed
.count("osd_mclock_scheduler_background_best_effort_lim")) {
554 if (mclock_profile
== "custom") {
555 client_registry
.update_from_config(conf
);
// Destructor: unregisters this object from the config proxy, undoing the
// add_observer(this) call made in the constructor.
560 mClockScheduler::~mClockScheduler()
562 cct
->_conf
.remove_observer(this);