// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
19 #include "osd/scheduler/mClockScheduler.h"
20 #include "common/dout.h"
22 namespace dmc
= crimson::dmclock
;
23 using namespace std::placeholders
;
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_osd
28 #define dout_prefix *_dout << "mClockScheduler: "
31 namespace ceph::osd::scheduler
{
33 mClockScheduler::mClockScheduler(CephContext
*cct
,
37 num_shards(num_shards
),
38 is_rotational(is_rotational
),
40 std::bind(&mClockScheduler::ClientRegistry::get_info
,
44 cct
->_conf
.get_val
<double>("osd_mclock_scheduler_anticipation_timeout"))
46 cct
->_conf
.add_observer(this);
47 ceph_assert(num_shards
> 0);
48 set_max_osd_capacity();
49 set_osd_mclock_cost_per_io();
50 set_osd_mclock_cost_per_byte();
52 enable_mclock_profile_settings();
53 client_registry
.update_from_config(cct
->_conf
);
56 void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy
&conf
)
58 default_external_client_info
.update(
59 conf
.get_val
<uint64_t>("osd_mclock_scheduler_client_res"),
60 conf
.get_val
<uint64_t>("osd_mclock_scheduler_client_wgt"),
61 conf
.get_val
<uint64_t>("osd_mclock_scheduler_client_lim"));
63 internal_client_infos
[
64 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
65 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
66 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
67 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
69 internal_client_infos
[
70 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
71 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
72 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
73 conf
.get_val
<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
76 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_external_client(
77 const client_profile_id_t
&client
) const
79 auto ret
= external_client_infos
.find(client
);
80 if (ret
== external_client_infos
.end())
81 return &default_external_client_info
;
83 return &(ret
->second
);
86 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_info(
87 const scheduler_id_t
&id
) const {
88 switch (id
.class_id
) {
89 case op_scheduler_class::immediate
:
90 ceph_assert(0 == "Cannot schedule immediate");
91 return (dmc::ClientInfo
*)nullptr;
92 case op_scheduler_class::client
:
93 return get_external_client(id
.client_profile_id
);
95 ceph_assert(static_cast<size_t>(id
.class_id
) < internal_client_infos
.size());
96 return &internal_client_infos
[static_cast<size_t>(id
.class_id
)];
100 void mClockScheduler::set_max_osd_capacity()
102 if (cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops")) {
104 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops");
108 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_hdd");
111 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_ssd");
114 // Set per op-shard iops limit
115 max_osd_capacity
/= num_shards
;
116 dout(1) << __func__
<< " #op shards: " << num_shards
117 << " max osd capacity(iops) per shard: " << max_osd_capacity
<< dendl
;
120 void mClockScheduler::set_osd_mclock_cost_per_io()
122 std::chrono::seconds
sec(1);
123 if (cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec")) {
124 osd_mclock_cost_per_io
=
125 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec");
128 osd_mclock_cost_per_io
=
129 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec_hdd");
130 // For HDDs, convert value to seconds
131 osd_mclock_cost_per_io
/= std::chrono::microseconds(sec
).count();
133 // For SSDs, convert value to milliseconds
134 osd_mclock_cost_per_io
=
135 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_io_usec_ssd");
136 osd_mclock_cost_per_io
/= std::chrono::milliseconds(sec
).count();
139 dout(1) << __func__
<< " osd_mclock_cost_per_io: "
140 << std::fixed
<< osd_mclock_cost_per_io
<< dendl
;
143 void mClockScheduler::set_osd_mclock_cost_per_byte()
145 std::chrono::seconds
sec(1);
146 if (cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec")) {
147 osd_mclock_cost_per_byte
=
148 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec");
151 osd_mclock_cost_per_byte
=
152 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec_hdd");
153 // For HDDs, convert value to seconds
154 osd_mclock_cost_per_byte
/= std::chrono::microseconds(sec
).count();
156 osd_mclock_cost_per_byte
=
157 cct
->_conf
.get_val
<double>("osd_mclock_cost_per_byte_usec_ssd");
158 // For SSDs, convert value to milliseconds
159 osd_mclock_cost_per_byte
/= std::chrono::milliseconds(sec
).count();
162 dout(1) << __func__
<< " osd_mclock_cost_per_byte: "
163 << std::fixed
<< osd_mclock_cost_per_byte
<< dendl
;
166 void mClockScheduler::set_mclock_profile()
168 mclock_profile
= cct
->_conf
.get_val
<std::string
>("osd_mclock_profile");
169 dout(1) << __func__
<< " mclock profile: " << mclock_profile
<< dendl
;
172 std::string
mClockScheduler::get_mclock_profile()
174 return mclock_profile
;
177 void mClockScheduler::set_balanced_profile_allocations()
179 // Client Allocation:
180 // reservation: 40% | weight: 1 | limit: 100% |
181 // Background Recovery Allocation:
182 // reservation: 40% | weight: 1 | limit: 150% |
183 // Background Best Effort Allocation:
184 // reservation: 20% | weight: 2 | limit: max |
187 uint64_t client_res
= static_cast<uint64_t>(
188 std::round(0.40 * max_osd_capacity
));
189 uint64_t client_lim
= static_cast<uint64_t>(
190 std::round(max_osd_capacity
));
191 uint64_t client_wgt
= default_min
;
193 // Background Recovery
194 uint64_t rec_res
= static_cast<uint64_t>(
195 std::round(0.40 * max_osd_capacity
));
196 uint64_t rec_lim
= static_cast<uint64_t>(
197 std::round(1.5 * max_osd_capacity
));
198 uint64_t rec_wgt
= default_min
;
200 // Background Best Effort
201 uint64_t best_effort_res
= static_cast<uint64_t>(
202 std::round(0.20 * max_osd_capacity
));
203 uint64_t best_effort_lim
= default_max
;
204 uint64_t best_effort_wgt
= 2;
206 // Set the allocations for the mclock clients
208 static_cast<size_t>(op_scheduler_class::client
)].update(
213 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
218 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
224 void mClockScheduler::set_high_recovery_ops_profile_allocations()
226 // Client Allocation:
227 // reservation: 30% | weight: 1 | limit: 80% |
228 // Background Recovery Allocation:
229 // reservation: 60% | weight: 2 | limit: 200% |
230 // Background Best Effort Allocation:
231 // reservation: 1 | weight: 2 | limit: max |
234 uint64_t client_res
= static_cast<uint64_t>(
235 std::round(0.30 * max_osd_capacity
));
236 uint64_t client_lim
= static_cast<uint64_t>(
237 std::round(0.80 * max_osd_capacity
));
238 uint64_t client_wgt
= default_min
;
240 // Background Recovery
241 uint64_t rec_res
= static_cast<uint64_t>(
242 std::round(0.60 * max_osd_capacity
));
243 uint64_t rec_lim
= static_cast<uint64_t>(
244 std::round(2.0 * max_osd_capacity
));
245 uint64_t rec_wgt
= 2;
247 // Background Best Effort
248 uint64_t best_effort_res
= default_min
;
249 uint64_t best_effort_lim
= default_max
;
250 uint64_t best_effort_wgt
= 2;
252 // Set the allocations for the mclock clients
254 static_cast<size_t>(op_scheduler_class::client
)].update(
259 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
264 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
270 void mClockScheduler::set_high_client_ops_profile_allocations()
272 // Client Allocation:
273 // reservation: 50% | weight: 2 | limit: max |
274 // Background Recovery Allocation:
275 // reservation: 25% | weight: 1 | limit: 100% |
276 // Background Best Effort Allocation:
277 // reservation: 25% | weight: 2 | limit: max |
280 uint64_t client_res
= static_cast<uint64_t>(
281 std::round(0.50 * max_osd_capacity
));
282 uint64_t client_wgt
= 2;
283 uint64_t client_lim
= default_max
;
285 // Background Recovery
286 uint64_t rec_res
= static_cast<uint64_t>(
287 std::round(0.25 * max_osd_capacity
));
288 uint64_t rec_lim
= static_cast<uint64_t>(
289 std::round(max_osd_capacity
));
290 uint64_t rec_wgt
= default_min
;
292 // Background Best Effort
293 uint64_t best_effort_res
= static_cast<uint64_t>(
294 std::round(0.25 * max_osd_capacity
));
295 uint64_t best_effort_lim
= default_max
;
296 uint64_t best_effort_wgt
= 2;
298 // Set the allocations for the mclock clients
300 static_cast<size_t>(op_scheduler_class::client
)].update(
305 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
310 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
316 void mClockScheduler::enable_mclock_profile_settings()
318 // Nothing to do for "custom" profile
319 if (mclock_profile
== "custom") {
323 // Set mclock and ceph config options for the chosen profile
324 if (mclock_profile
== "balanced") {
325 set_balanced_profile_allocations();
326 } else if (mclock_profile
== "high_recovery_ops") {
327 set_high_recovery_ops_profile_allocations();
328 } else if (mclock_profile
== "high_client_ops") {
329 set_high_client_ops_profile_allocations();
331 ceph_assert("Invalid choice of mclock profile" == 0);
335 // Set the mclock config parameters
336 set_profile_config();
337 // Set recovery specific Ceph options
338 set_global_recovery_options();
341 void mClockScheduler::set_profile_config()
343 ClientAllocs client
= client_allocs
[
344 static_cast<size_t>(op_scheduler_class::client
)];
345 ClientAllocs rec
= client_allocs
[
346 static_cast<size_t>(op_scheduler_class::background_recovery
)];
347 ClientAllocs best_effort
= client_allocs
[
348 static_cast<size_t>(op_scheduler_class::background_best_effort
)];
350 // Set external client params
351 cct
->_conf
.set_val("osd_mclock_scheduler_client_res",
352 std::to_string(client
.res
));
353 cct
->_conf
.set_val("osd_mclock_scheduler_client_wgt",
354 std::to_string(client
.wgt
));
355 cct
->_conf
.set_val("osd_mclock_scheduler_client_lim",
356 std::to_string(client
.lim
));
358 // Set background recovery client params
359 cct
->_conf
.set_val("osd_mclock_scheduler_background_recovery_res",
360 std::to_string(rec
.res
));
361 cct
->_conf
.set_val("osd_mclock_scheduler_background_recovery_wgt",
362 std::to_string(rec
.wgt
));
363 cct
->_conf
.set_val("osd_mclock_scheduler_background_recovery_lim",
364 std::to_string(rec
.lim
));
366 // Set background best effort client params
367 cct
->_conf
.set_val("osd_mclock_scheduler_background_best_effort_res",
368 std::to_string(best_effort
.res
));
369 cct
->_conf
.set_val("osd_mclock_scheduler_background_best_effort_wgt",
370 std::to_string(best_effort
.wgt
));
371 cct
->_conf
.set_val("osd_mclock_scheduler_background_best_effort_lim",
372 std::to_string(best_effort
.lim
));
375 void mClockScheduler::set_global_recovery_options()
377 // Set high value for recovery max active and max backfill
378 int rec_max_active
= 1000;
379 int max_backfills
= 1000;
380 cct
->_conf
.set_val("osd_recovery_max_active", std::to_string(rec_max_active
));
381 cct
->_conf
.set_val("osd_max_backfills", std::to_string(max_backfills
));
383 // Disable recovery sleep
384 cct
->_conf
.set_val("osd_recovery_sleep", std::to_string(0));
385 cct
->_conf
.set_val("osd_recovery_sleep_hdd", std::to_string(0));
386 cct
->_conf
.set_val("osd_recovery_sleep_ssd", std::to_string(0));
387 cct
->_conf
.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
389 // Disable delete sleep
390 cct
->_conf
.set_val("osd_delete_sleep", std::to_string(0));
391 cct
->_conf
.set_val("osd_delete_sleep_hdd", std::to_string(0));
392 cct
->_conf
.set_val("osd_delete_sleep_ssd", std::to_string(0));
393 cct
->_conf
.set_val("osd_delete_sleep_hybrid", std::to_string(0));
395 // Disable snap trim sleep
396 cct
->_conf
.set_val("osd_snap_trim_sleep", std::to_string(0));
397 cct
->_conf
.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
398 cct
->_conf
.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
399 cct
->_conf
.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
401 // Disable scrub sleep
402 cct
->_conf
.set_val("osd_scrub_sleep", std::to_string(0));
405 cct
->_conf
.apply_changes(nullptr);
408 int mClockScheduler::calc_scaled_cost(int item_cost
)
410 // Calculate total scaled cost in secs
412 std::round(osd_mclock_cost_per_io
+ (osd_mclock_cost_per_byte
* item_cost
));
413 return std::max(scaled_cost
, 1);
416 void mClockScheduler::dump(ceph::Formatter
&f
) const
420 void mClockScheduler::enqueue(OpSchedulerItem
&& item
)
422 auto id
= get_scheduler_id(item
);
424 // TODO: move this check into OpSchedulerItem, handle backwards compat
425 if (op_scheduler_class::immediate
== id
.class_id
) {
426 immediate
.push_front(std::move(item
));
428 int cost
= calc_scaled_cost(item
.get_cost());
429 // Add item to scheduler queue
430 scheduler
.add_request(
437 void mClockScheduler::enqueue_front(OpSchedulerItem
&& item
)
439 immediate
.push_back(std::move(item
));
440 // TODO: item may not be immediate, update mclock machinery to permit
441 // putting the item back in the queue
444 WorkItem
mClockScheduler::dequeue()
446 if (!immediate
.empty()) {
447 WorkItem work_item
{std::move(immediate
.back())};
448 immediate
.pop_back();
451 mclock_queue_t::PullReq result
= scheduler
.pull_request();
452 if (result
.is_future()) {
453 return result
.getTime();
454 } else if (result
.is_none()) {
456 0 == "Impossible, must have checked empty() first");
459 ceph_assert(result
.is_retn());
461 auto &retn
= result
.get_retn();
462 return std::move(*retn
.request
);
467 const char** mClockScheduler::get_tracked_conf_keys() const
469 static const char* KEYS
[] = {
470 "osd_mclock_scheduler_client_res",
471 "osd_mclock_scheduler_client_wgt",
472 "osd_mclock_scheduler_client_lim",
473 "osd_mclock_scheduler_background_recovery_res",
474 "osd_mclock_scheduler_background_recovery_wgt",
475 "osd_mclock_scheduler_background_recovery_lim",
476 "osd_mclock_scheduler_background_best_effort_res",
477 "osd_mclock_scheduler_background_best_effort_wgt",
478 "osd_mclock_scheduler_background_best_effort_lim",
479 "osd_mclock_cost_per_io_usec",
480 "osd_mclock_cost_per_io_usec_hdd",
481 "osd_mclock_cost_per_io_usec_ssd",
482 "osd_mclock_cost_per_byte_usec",
483 "osd_mclock_cost_per_byte_usec_hdd",
484 "osd_mclock_cost_per_byte_usec_ssd",
485 "osd_mclock_max_capacity_iops",
486 "osd_mclock_max_capacity_iops_hdd",
487 "osd_mclock_max_capacity_iops_ssd",
488 "osd_mclock_profile",
494 void mClockScheduler::handle_conf_change(
495 const ConfigProxy
& conf
,
496 const std::set
<std::string
> &changed
)
498 if (changed
.count("osd_mclock_cost_per_io_usec") ||
499 changed
.count("osd_mclock_cost_per_io_usec_hdd") ||
500 changed
.count("osd_mclock_cost_per_io_usec_ssd")) {
501 set_osd_mclock_cost_per_io();
503 if (changed
.count("osd_mclock_cost_per_byte_usec") ||
504 changed
.count("osd_mclock_cost_per_byte_usec_hdd") ||
505 changed
.count("osd_mclock_cost_per_byte_usec_ssd")) {
506 set_osd_mclock_cost_per_byte();
508 if (changed
.count("osd_mclock_max_capacity_iops") ||
509 changed
.count("osd_mclock_max_capacity_iops_hdd") ||
510 changed
.count("osd_mclock_max_capacity_iops_ssd")) {
511 set_max_osd_capacity();
512 if (mclock_profile
!= "custom") {
513 enable_mclock_profile_settings();
514 client_registry
.update_from_config(conf
);
517 if (changed
.count("osd_mclock_profile")) {
518 set_mclock_profile();
519 if (mclock_profile
!= "custom") {
520 enable_mclock_profile_settings();
521 client_registry
.update_from_config(conf
);
524 if (changed
.count("osd_mclock_scheduler_client_res") ||
525 changed
.count("osd_mclock_scheduler_client_wgt") ||
526 changed
.count("osd_mclock_scheduler_client_lim") ||
527 changed
.count("osd_mclock_scheduler_background_recovery_res") ||
528 changed
.count("osd_mclock_scheduler_background_recovery_wgt") ||
529 changed
.count("osd_mclock_scheduler_background_recovery_lim")) {
530 if (mclock_profile
== "custom") {
531 client_registry
.update_from_config(conf
);
536 mClockScheduler::~mClockScheduler()
538 cct
->_conf
.remove_observer(this);