]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/scheduler/mClockScheduler.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / osd / scheduler / mClockScheduler.cc
CommitLineData
9f95a23c
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 Red Hat Inc.
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#include <memory>
17#include <functional>
18
19#include "osd/scheduler/mClockScheduler.h"
20#include "common/dout.h"
21
22namespace dmc = crimson::dmclock;
23using namespace std::placeholders;
24
25#define dout_context cct
26#define dout_subsys ceph_subsys_osd
27#undef dout_prefix
f67539c2 28#define dout_prefix *_dout << "mClockScheduler: "
9f95a23c
TL
29
30
31namespace ceph::osd::scheduler {
32
f67539c2
TL
// Construct the mClock-based op scheduler for one OSD.
//
// @param cct            Ceph context (config access, logging)
// @param num_shards     number of op shards on this OSD; must be > 0 since
//                       the per-OSD IOPS capacity is divided per shard
// @param is_rotational  device type; selects HDD vs SSD config defaults
mClockScheduler::mClockScheduler(CephContext *cct,
  uint32_t num_shards,
  bool is_rotational)
  : cct(cct),
    num_shards(num_shards),
    is_rotational(is_rotational),
    scheduler(
      // dmclock calls back into our registry to resolve per-client QoS info
      std::bind(&mClockScheduler::ClientRegistry::get_info,
        &client_registry,
        _1),
      dmc::AtLimit::Wait,
      cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
{
  // Register for runtime config updates (see handle_conf_change).
  cct->_conf.add_observer(this);
  ceph_assert(num_shards > 0);
  // Derive scheduler parameters from config and device type; the profile
  // must be set before its settings can be enabled.
  set_max_osd_capacity();
  set_osd_mclock_cost_per_io();
  set_osd_mclock_cost_per_byte();
  set_mclock_profile();
  enable_mclock_profile_settings();
  // Seed per-class QoS info from the (possibly profile-updated) config.
  client_registry.update_from_config(cct->_conf);
}
55
56void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
57{
58 default_external_client_info.update(
59 conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
60 conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
61 conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
62
63 internal_client_infos[
64 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
65 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
66 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
67 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
68
69 internal_client_infos[
70 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
71 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
72 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
73 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
74}
75
76const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
77 const client_profile_id_t &client) const
78{
79 auto ret = external_client_infos.find(client);
80 if (ret == external_client_infos.end())
81 return &default_external_client_info;
82 else
83 return &(ret->second);
84}
85
86const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
87 const scheduler_id_t &id) const {
88 switch (id.class_id) {
89 case op_scheduler_class::immediate:
90 ceph_assert(0 == "Cannot schedule immediate");
91 return (dmc::ClientInfo*)nullptr;
92 case op_scheduler_class::client:
93 return get_external_client(id.client_profile_id);
94 default:
95 ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
96 return &internal_client_infos[static_cast<size_t>(id.class_id)];
97 }
98}
99
f67539c2
TL
100void mClockScheduler::set_max_osd_capacity()
101{
102 if (cct->_conf.get_val<double>("osd_mclock_max_capacity_iops")) {
103 max_osd_capacity =
104 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops");
105 } else {
106 if (is_rotational) {
107 max_osd_capacity =
108 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
109 } else {
110 max_osd_capacity =
111 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
112 }
113 }
114 // Set per op-shard iops limit
115 max_osd_capacity /= num_shards;
116 dout(1) << __func__ << " #op shards: " << num_shards
117 << " max osd capacity(iops) per shard: " << max_osd_capacity << dendl;
118}
119
120void mClockScheduler::set_osd_mclock_cost_per_io()
121{
122 std::chrono::seconds sec(1);
123 if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
124 osd_mclock_cost_per_io =
125 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
126 } else {
127 if (is_rotational) {
128 osd_mclock_cost_per_io =
129 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
130 // For HDDs, convert value to seconds
131 osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
132 } else {
133 // For SSDs, convert value to milliseconds
134 osd_mclock_cost_per_io =
135 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
136 osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
137 }
138 }
139 dout(1) << __func__ << " osd_mclock_cost_per_io: "
140 << std::fixed << osd_mclock_cost_per_io << dendl;
141}
142
143void mClockScheduler::set_osd_mclock_cost_per_byte()
144{
145 std::chrono::seconds sec(1);
146 if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
147 osd_mclock_cost_per_byte =
148 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
149 } else {
150 if (is_rotational) {
151 osd_mclock_cost_per_byte =
152 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
153 // For HDDs, convert value to seconds
154 osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
155 } else {
156 osd_mclock_cost_per_byte =
157 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
158 // For SSDs, convert value to milliseconds
159 osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
160 }
161 }
162 dout(1) << __func__ << " osd_mclock_cost_per_byte: "
163 << std::fixed << osd_mclock_cost_per_byte << dendl;
164}
165
166void mClockScheduler::set_mclock_profile()
167{
168 mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
169 dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl;
170}
171
172std::string mClockScheduler::get_mclock_profile()
173{
174 return mclock_profile;
175}
176
177void mClockScheduler::set_balanced_profile_allocations()
178{
179 // Client Allocation:
180 // reservation: 40% | weight: 1 | limit: 100% |
181 // Background Recovery Allocation:
182 // reservation: 40% | weight: 1 | limit: 150% |
183 // Background Best Effort Allocation:
184 // reservation: 20% | weight: 2 | limit: max |
185
186 // Client
187 uint64_t client_res = static_cast<uint64_t>(
188 std::round(0.40 * max_osd_capacity));
189 uint64_t client_lim = static_cast<uint64_t>(
190 std::round(max_osd_capacity));
191 uint64_t client_wgt = default_min;
192
193 // Background Recovery
194 uint64_t rec_res = static_cast<uint64_t>(
195 std::round(0.40 * max_osd_capacity));
196 uint64_t rec_lim = static_cast<uint64_t>(
197 std::round(1.5 * max_osd_capacity));
198 uint64_t rec_wgt = default_min;
199
200 // Background Best Effort
201 uint64_t best_effort_res = static_cast<uint64_t>(
202 std::round(0.20 * max_osd_capacity));
203 uint64_t best_effort_lim = default_max;
204 uint64_t best_effort_wgt = 2;
205
206 // Set the allocations for the mclock clients
207 client_allocs[
208 static_cast<size_t>(op_scheduler_class::client)].update(
209 client_res,
210 client_wgt,
211 client_lim);
212 client_allocs[
213 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
214 rec_res,
215 rec_wgt,
216 rec_lim);
217 client_allocs[
218 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
219 best_effort_res,
220 best_effort_wgt,
221 best_effort_lim);
222}
223
224void mClockScheduler::set_high_recovery_ops_profile_allocations()
225{
226 // Client Allocation:
227 // reservation: 30% | weight: 1 | limit: 80% |
228 // Background Recovery Allocation:
229 // reservation: 60% | weight: 2 | limit: 200% |
230 // Background Best Effort Allocation:
231 // reservation: 1 | weight: 2 | limit: max |
232
233 // Client
234 uint64_t client_res = static_cast<uint64_t>(
235 std::round(0.30 * max_osd_capacity));
236 uint64_t client_lim = static_cast<uint64_t>(
237 std::round(0.80 * max_osd_capacity));
238 uint64_t client_wgt = default_min;
239
240 // Background Recovery
241 uint64_t rec_res = static_cast<uint64_t>(
242 std::round(0.60 * max_osd_capacity));
243 uint64_t rec_lim = static_cast<uint64_t>(
244 std::round(2.0 * max_osd_capacity));
245 uint64_t rec_wgt = 2;
246
247 // Background Best Effort
248 uint64_t best_effort_res = default_min;
249 uint64_t best_effort_lim = default_max;
250 uint64_t best_effort_wgt = 2;
251
252 // Set the allocations for the mclock clients
253 client_allocs[
254 static_cast<size_t>(op_scheduler_class::client)].update(
255 client_res,
256 client_wgt,
257 client_lim);
258 client_allocs[
259 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
260 rec_res,
261 rec_wgt,
262 rec_lim);
263 client_allocs[
264 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
265 best_effort_res,
266 best_effort_wgt,
267 best_effort_lim);
268}
269
270void mClockScheduler::set_high_client_ops_profile_allocations()
271{
272 // Client Allocation:
273 // reservation: 50% | weight: 2 | limit: max |
274 // Background Recovery Allocation:
275 // reservation: 25% | weight: 1 | limit: 100% |
276 // Background Best Effort Allocation:
277 // reservation: 25% | weight: 2 | limit: max |
278
279 // Client
280 uint64_t client_res = static_cast<uint64_t>(
281 std::round(0.50 * max_osd_capacity));
282 uint64_t client_wgt = 2;
283 uint64_t client_lim = default_max;
284
285 // Background Recovery
286 uint64_t rec_res = static_cast<uint64_t>(
287 std::round(0.25 * max_osd_capacity));
288 uint64_t rec_lim = static_cast<uint64_t>(
289 std::round(max_osd_capacity));
290 uint64_t rec_wgt = default_min;
291
292 // Background Best Effort
293 uint64_t best_effort_res = static_cast<uint64_t>(
294 std::round(0.25 * max_osd_capacity));
295 uint64_t best_effort_lim = default_max;
296 uint64_t best_effort_wgt = 2;
297
298 // Set the allocations for the mclock clients
299 client_allocs[
300 static_cast<size_t>(op_scheduler_class::client)].update(
301 client_res,
302 client_wgt,
303 client_lim);
304 client_allocs[
305 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
306 rec_res,
307 rec_wgt,
308 rec_lim);
309 client_allocs[
310 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
311 best_effort_res,
312 best_effort_wgt,
313 best_effort_lim);
314}
315
316void mClockScheduler::enable_mclock_profile_settings()
317{
318 // Nothing to do for "custom" profile
319 if (mclock_profile == "custom") {
320 return;
321 }
322
323 // Set mclock and ceph config options for the chosen profile
324 if (mclock_profile == "balanced") {
325 set_balanced_profile_allocations();
326 } else if (mclock_profile == "high_recovery_ops") {
327 set_high_recovery_ops_profile_allocations();
328 } else if (mclock_profile == "high_client_ops") {
329 set_high_client_ops_profile_allocations();
330 } else {
331 ceph_assert("Invalid choice of mclock profile" == 0);
332 return;
333 }
334
335 // Set the mclock config parameters
336 set_profile_config();
337 // Set recovery specific Ceph options
338 set_global_recovery_options();
339}
340
341void mClockScheduler::set_profile_config()
342{
343 ClientAllocs client = client_allocs[
344 static_cast<size_t>(op_scheduler_class::client)];
345 ClientAllocs rec = client_allocs[
346 static_cast<size_t>(op_scheduler_class::background_recovery)];
347 ClientAllocs best_effort = client_allocs[
348 static_cast<size_t>(op_scheduler_class::background_best_effort)];
349
350 // Set external client params
351 cct->_conf.set_val("osd_mclock_scheduler_client_res",
352 std::to_string(client.res));
353 cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
354 std::to_string(client.wgt));
355 cct->_conf.set_val("osd_mclock_scheduler_client_lim",
356 std::to_string(client.lim));
357
358 // Set background recovery client params
359 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
360 std::to_string(rec.res));
361 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
362 std::to_string(rec.wgt));
363 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
364 std::to_string(rec.lim));
365
366 // Set background best effort client params
367 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
368 std::to_string(best_effort.res));
369 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
370 std::to_string(best_effort.wgt));
371 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
372 std::to_string(best_effort.lim));
373}
374
375void mClockScheduler::set_global_recovery_options()
376{
377 // Set high value for recovery max active and max backfill
378 int rec_max_active = 1000;
379 int max_backfills = 1000;
380 cct->_conf.set_val("osd_recovery_max_active", std::to_string(rec_max_active));
381 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
382
383 // Disable recovery sleep
384 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
385 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
386 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
387 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
388
389 // Disable delete sleep
390 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
391 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
392 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
393 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
394
395 // Disable snap trim sleep
396 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
397 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
398 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
399 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
400
401 // Disable scrub sleep
402 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
403
404 // Apply the changes
405 cct->_conf.apply_changes(nullptr);
406}
407
408int mClockScheduler::calc_scaled_cost(int item_cost)
409{
410 // Calculate total scaled cost in secs
411 int scaled_cost =
412 std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
413 return std::max(scaled_cost, 1);
414}
415
9f95a23c
TL
// Dump scheduler state into the formatter. Currently a no-op: no mclock
// queue state is exposed yet.
void mClockScheduler::dump(ceph::Formatter &f) const
{
}
419
420void mClockScheduler::enqueue(OpSchedulerItem&& item)
421{
422 auto id = get_scheduler_id(item);
9f95a23c
TL
423
424 // TODO: move this check into OpSchedulerItem, handle backwards compat
f67539c2 425 if (op_scheduler_class::immediate == id.class_id) {
9f95a23c
TL
426 immediate.push_front(std::move(item));
427 } else {
f67539c2
TL
428 int cost = calc_scaled_cost(item.get_cost());
429 // Add item to scheduler queue
9f95a23c
TL
430 scheduler.add_request(
431 std::move(item),
432 id,
433 cost);
434 }
435}
436
437void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
438{
439 immediate.push_back(std::move(item));
440 // TODO: item may not be immediate, update mclock machinery to permit
441 // putting the item back in the queue
442}
443
f67539c2 444WorkItem mClockScheduler::dequeue()
9f95a23c
TL
445{
446 if (!immediate.empty()) {
f67539c2 447 WorkItem work_item{std::move(immediate.back())};
9f95a23c 448 immediate.pop_back();
f67539c2 449 return work_item;
9f95a23c
TL
450 } else {
451 mclock_queue_t::PullReq result = scheduler.pull_request();
452 if (result.is_future()) {
f67539c2 453 return result.getTime();
9f95a23c
TL
454 } else if (result.is_none()) {
455 ceph_assert(
456 0 == "Impossible, must have checked empty() first");
f67539c2 457 return {};
9f95a23c
TL
458 } else {
459 ceph_assert(result.is_retn());
460
461 auto &retn = result.get_retn();
462 return std::move(*retn.request);
463 }
464 }
465}
466
467const char** mClockScheduler::get_tracked_conf_keys() const
468{
469 static const char* KEYS[] = {
470 "osd_mclock_scheduler_client_res",
471 "osd_mclock_scheduler_client_wgt",
472 "osd_mclock_scheduler_client_lim",
473 "osd_mclock_scheduler_background_recovery_res",
474 "osd_mclock_scheduler_background_recovery_wgt",
475 "osd_mclock_scheduler_background_recovery_lim",
476 "osd_mclock_scheduler_background_best_effort_res",
477 "osd_mclock_scheduler_background_best_effort_wgt",
478 "osd_mclock_scheduler_background_best_effort_lim",
f67539c2
TL
479 "osd_mclock_cost_per_io_usec",
480 "osd_mclock_cost_per_io_usec_hdd",
481 "osd_mclock_cost_per_io_usec_ssd",
482 "osd_mclock_cost_per_byte_usec",
483 "osd_mclock_cost_per_byte_usec_hdd",
484 "osd_mclock_cost_per_byte_usec_ssd",
485 "osd_mclock_max_capacity_iops",
486 "osd_mclock_max_capacity_iops_hdd",
487 "osd_mclock_max_capacity_iops_ssd",
488 "osd_mclock_profile",
9f95a23c
TL
489 NULL
490 };
491 return KEYS;
492}
493
494void mClockScheduler::handle_conf_change(
495 const ConfigProxy& conf,
496 const std::set<std::string> &changed)
497{
f67539c2
TL
498 if (changed.count("osd_mclock_cost_per_io_usec") ||
499 changed.count("osd_mclock_cost_per_io_usec_hdd") ||
500 changed.count("osd_mclock_cost_per_io_usec_ssd")) {
501 set_osd_mclock_cost_per_io();
502 }
503 if (changed.count("osd_mclock_cost_per_byte_usec") ||
504 changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
505 changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
506 set_osd_mclock_cost_per_byte();
507 }
508 if (changed.count("osd_mclock_max_capacity_iops") ||
509 changed.count("osd_mclock_max_capacity_iops_hdd") ||
510 changed.count("osd_mclock_max_capacity_iops_ssd")) {
511 set_max_osd_capacity();
512 if (mclock_profile != "custom") {
513 enable_mclock_profile_settings();
514 client_registry.update_from_config(conf);
515 }
516 }
517 if (changed.count("osd_mclock_profile")) {
518 set_mclock_profile();
519 if (mclock_profile != "custom") {
520 enable_mclock_profile_settings();
521 client_registry.update_from_config(conf);
522 }
523 }
524 if (changed.count("osd_mclock_scheduler_client_res") ||
525 changed.count("osd_mclock_scheduler_client_wgt") ||
526 changed.count("osd_mclock_scheduler_client_lim") ||
527 changed.count("osd_mclock_scheduler_background_recovery_res") ||
528 changed.count("osd_mclock_scheduler_background_recovery_wgt") ||
529 changed.count("osd_mclock_scheduler_background_recovery_lim")) {
530 if (mclock_profile == "custom") {
531 client_registry.update_from_config(conf);
532 }
533 }
534}
535
536mClockScheduler::~mClockScheduler()
537{
538 cct->_conf.remove_observer(this);
9f95a23c
TL
539}
540
541}