]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/scheduler/mClockScheduler.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / osd / scheduler / mClockScheduler.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 Red Hat Inc.
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #include <memory>
17 #include <functional>
18
19 #include "osd/scheduler/mClockScheduler.h"
20 #include "common/dout.h"
21
22 namespace dmc = crimson::dmclock;
23 using namespace std::placeholders;
24
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_mclock
27 #undef dout_prefix
28 #define dout_prefix *_dout << "mClockScheduler: "
29
30
31 namespace ceph::osd::scheduler {
32
33 mClockScheduler::mClockScheduler(CephContext *cct,
34 uint32_t num_shards,
35 bool is_rotational)
36 : cct(cct),
37 num_shards(num_shards),
38 is_rotational(is_rotational),
39 scheduler(
40 std::bind(&mClockScheduler::ClientRegistry::get_info,
41 &client_registry,
42 _1),
43 dmc::AtLimit::Wait,
44 cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
45 {
46 cct->_conf.add_observer(this);
47 ceph_assert(num_shards > 0);
48 set_max_osd_capacity();
49 set_osd_mclock_cost_per_io();
50 set_osd_mclock_cost_per_byte();
51 set_mclock_profile();
52 enable_mclock_profile_settings();
53 client_registry.update_from_config(cct->_conf);
54 }
55
56 void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
57 {
58 default_external_client_info.update(
59 conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
60 conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
61 conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
62
63 internal_client_infos[
64 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
65 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
66 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
67 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
68
69 internal_client_infos[
70 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
71 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
72 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
73 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
74 }
75
76 const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
77 const client_profile_id_t &client) const
78 {
79 auto ret = external_client_infos.find(client);
80 if (ret == external_client_infos.end())
81 return &default_external_client_info;
82 else
83 return &(ret->second);
84 }
85
86 const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
87 const scheduler_id_t &id) const {
88 switch (id.class_id) {
89 case op_scheduler_class::immediate:
90 ceph_assert(0 == "Cannot schedule immediate");
91 return (dmc::ClientInfo*)nullptr;
92 case op_scheduler_class::client:
93 return get_external_client(id.client_profile_id);
94 default:
95 ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
96 return &internal_client_infos[static_cast<size_t>(id.class_id)];
97 }
98 }
99
100 void mClockScheduler::set_max_osd_capacity()
101 {
102 if (is_rotational) {
103 max_osd_capacity =
104 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
105 } else {
106 max_osd_capacity =
107 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
108 }
109 // Set per op-shard iops limit
110 max_osd_capacity /= num_shards;
111 dout(1) << __func__ << " #op shards: " << num_shards
112 << std::fixed << std::setprecision(2)
113 << " max osd capacity(iops) per shard: " << max_osd_capacity
114 << dendl;
115 }
116
117 void mClockScheduler::set_osd_mclock_cost_per_io()
118 {
119 std::chrono::seconds sec(1);
120 if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
121 osd_mclock_cost_per_io =
122 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
123 } else {
124 if (is_rotational) {
125 osd_mclock_cost_per_io =
126 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
127 // For HDDs, convert value to seconds
128 osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
129 } else {
130 // For SSDs, convert value to milliseconds
131 osd_mclock_cost_per_io =
132 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
133 osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
134 }
135 }
136 dout(1) << __func__ << " osd_mclock_cost_per_io: "
137 << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
138 << dendl;
139 }
140
141 void mClockScheduler::set_osd_mclock_cost_per_byte()
142 {
143 std::chrono::seconds sec(1);
144 if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
145 osd_mclock_cost_per_byte =
146 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
147 } else {
148 if (is_rotational) {
149 osd_mclock_cost_per_byte =
150 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
151 // For HDDs, convert value to seconds
152 osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
153 } else {
154 osd_mclock_cost_per_byte =
155 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
156 // For SSDs, convert value to milliseconds
157 osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
158 }
159 }
160 dout(1) << __func__ << " osd_mclock_cost_per_byte: "
161 << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
162 << dendl;
163 }
164
void mClockScheduler::set_mclock_profile()
{
  // Cache the profile selected via the "osd_mclock_profile" option;
  // enable_mclock_profile_settings() and handle_conf_change() act on
  // this cached value.
  mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
  dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl;
}
170
// Return the currently active mclock profile name as cached by
// set_mclock_profile() (e.g. "balanced", "high_recovery_ops",
// "high_client_ops" or "custom").
std::string mClockScheduler::get_mclock_profile()
{
  return mclock_profile;
}
175
176 void mClockScheduler::set_balanced_profile_allocations()
177 {
178 // Client Allocation:
179 // reservation: 40% | weight: 1 | limit: 100% |
180 // Background Recovery Allocation:
181 // reservation: 40% | weight: 1 | limit: 150% |
182 // Background Best Effort Allocation:
183 // reservation: 20% | weight: 2 | limit: max |
184
185 // Client
186 uint64_t client_res = static_cast<uint64_t>(
187 std::round(0.40 * max_osd_capacity));
188 uint64_t client_lim = static_cast<uint64_t>(
189 std::round(max_osd_capacity));
190 uint64_t client_wgt = default_min;
191
192 // Background Recovery
193 uint64_t rec_res = static_cast<uint64_t>(
194 std::round(0.40 * max_osd_capacity));
195 uint64_t rec_lim = static_cast<uint64_t>(
196 std::round(1.5 * max_osd_capacity));
197 uint64_t rec_wgt = default_min;
198
199 // Background Best Effort
200 uint64_t best_effort_res = static_cast<uint64_t>(
201 std::round(0.20 * max_osd_capacity));
202 uint64_t best_effort_lim = default_max;
203 uint64_t best_effort_wgt = 2;
204
205 // Set the allocations for the mclock clients
206 client_allocs[
207 static_cast<size_t>(op_scheduler_class::client)].update(
208 client_res,
209 client_wgt,
210 client_lim);
211 client_allocs[
212 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
213 rec_res,
214 rec_wgt,
215 rec_lim);
216 client_allocs[
217 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
218 best_effort_res,
219 best_effort_wgt,
220 best_effort_lim);
221 }
222
223 void mClockScheduler::set_high_recovery_ops_profile_allocations()
224 {
225 // Client Allocation:
226 // reservation: 30% | weight: 1 | limit: 80% |
227 // Background Recovery Allocation:
228 // reservation: 60% | weight: 2 | limit: 200% |
229 // Background Best Effort Allocation:
230 // reservation: 1 | weight: 2 | limit: max |
231
232 // Client
233 uint64_t client_res = static_cast<uint64_t>(
234 std::round(0.30 * max_osd_capacity));
235 uint64_t client_lim = static_cast<uint64_t>(
236 std::round(0.80 * max_osd_capacity));
237 uint64_t client_wgt = default_min;
238
239 // Background Recovery
240 uint64_t rec_res = static_cast<uint64_t>(
241 std::round(0.60 * max_osd_capacity));
242 uint64_t rec_lim = static_cast<uint64_t>(
243 std::round(2.0 * max_osd_capacity));
244 uint64_t rec_wgt = 2;
245
246 // Background Best Effort
247 uint64_t best_effort_res = default_min;
248 uint64_t best_effort_lim = default_max;
249 uint64_t best_effort_wgt = 2;
250
251 // Set the allocations for the mclock clients
252 client_allocs[
253 static_cast<size_t>(op_scheduler_class::client)].update(
254 client_res,
255 client_wgt,
256 client_lim);
257 client_allocs[
258 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
259 rec_res,
260 rec_wgt,
261 rec_lim);
262 client_allocs[
263 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
264 best_effort_res,
265 best_effort_wgt,
266 best_effort_lim);
267 }
268
269 void mClockScheduler::set_high_client_ops_profile_allocations()
270 {
271 // Client Allocation:
272 // reservation: 50% | weight: 2 | limit: max |
273 // Background Recovery Allocation:
274 // reservation: 25% | weight: 1 | limit: 100% |
275 // Background Best Effort Allocation:
276 // reservation: 25% | weight: 2 | limit: max |
277
278 // Client
279 uint64_t client_res = static_cast<uint64_t>(
280 std::round(0.50 * max_osd_capacity));
281 uint64_t client_wgt = 2;
282 uint64_t client_lim = default_max;
283
284 // Background Recovery
285 uint64_t rec_res = static_cast<uint64_t>(
286 std::round(0.25 * max_osd_capacity));
287 uint64_t rec_lim = static_cast<uint64_t>(
288 std::round(max_osd_capacity));
289 uint64_t rec_wgt = default_min;
290
291 // Background Best Effort
292 uint64_t best_effort_res = static_cast<uint64_t>(
293 std::round(0.25 * max_osd_capacity));
294 uint64_t best_effort_lim = default_max;
295 uint64_t best_effort_wgt = 2;
296
297 // Set the allocations for the mclock clients
298 client_allocs[
299 static_cast<size_t>(op_scheduler_class::client)].update(
300 client_res,
301 client_wgt,
302 client_lim);
303 client_allocs[
304 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
305 rec_res,
306 rec_wgt,
307 rec_lim);
308 client_allocs[
309 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
310 best_effort_res,
311 best_effort_wgt,
312 best_effort_lim);
313 }
314
315 void mClockScheduler::enable_mclock_profile_settings()
316 {
317 // Nothing to do for "custom" profile
318 if (mclock_profile == "custom") {
319 return;
320 }
321
322 // Set mclock and ceph config options for the chosen profile
323 if (mclock_profile == "balanced") {
324 set_balanced_profile_allocations();
325 } else if (mclock_profile == "high_recovery_ops") {
326 set_high_recovery_ops_profile_allocations();
327 } else if (mclock_profile == "high_client_ops") {
328 set_high_client_ops_profile_allocations();
329 } else {
330 ceph_assert("Invalid choice of mclock profile" == 0);
331 return;
332 }
333
334 // Set the mclock config parameters
335 set_profile_config();
336 }
337
338 void mClockScheduler::set_profile_config()
339 {
340 ClientAllocs client = client_allocs[
341 static_cast<size_t>(op_scheduler_class::client)];
342 ClientAllocs rec = client_allocs[
343 static_cast<size_t>(op_scheduler_class::background_recovery)];
344 ClientAllocs best_effort = client_allocs[
345 static_cast<size_t>(op_scheduler_class::background_best_effort)];
346
347 // Set external client params
348 cct->_conf.set_val("osd_mclock_scheduler_client_res",
349 std::to_string(client.res));
350 cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
351 std::to_string(client.wgt));
352 cct->_conf.set_val("osd_mclock_scheduler_client_lim",
353 std::to_string(client.lim));
354 dout(10) << __func__ << " client QoS params: " << "["
355 << client.res << "," << client.wgt << "," << client.lim
356 << "]" << dendl;
357
358 // Set background recovery client params
359 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
360 std::to_string(rec.res));
361 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
362 std::to_string(rec.wgt));
363 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
364 std::to_string(rec.lim));
365 dout(10) << __func__ << " Recovery QoS params: " << "["
366 << rec.res << "," << rec.wgt << "," << rec.lim
367 << "]" << dendl;
368
369 // Set background best effort client params
370 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
371 std::to_string(best_effort.res));
372 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
373 std::to_string(best_effort.wgt));
374 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
375 std::to_string(best_effort.lim));
376 dout(10) << __func__ << " Best effort QoS params: " << "["
377 << best_effort.res << "," << best_effort.wgt << "," << best_effort.lim
378 << "]" << dendl;
379 }
380
381 int mClockScheduler::calc_scaled_cost(int item_cost)
382 {
383 // Calculate total scaled cost in secs
384 int scaled_cost =
385 std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
386 return std::max(scaled_cost, 1);
387 }
388
void mClockScheduler::update_configuration()
{
  // Force pending config changes to be applied, which notifies all
  // registered observers (including this scheduler). The expectation is
  // that at least one of the tracked mclock config option keys (see
  // get_tracked_conf_keys()) is modified before calling this method.
  cct->_conf.apply_changes(nullptr);
}
396
397 void mClockScheduler::dump(ceph::Formatter &f) const
398 {
399 // Display queue sizes
400 f.open_object_section("queue_sizes");
401 f.dump_int("immediate", immediate.size());
402 f.dump_int("scheduler", scheduler.request_count());
403 f.close_section();
404
405 // client map and queue tops (res, wgt, lim)
406 std::ostringstream out;
407 f.open_object_section("mClockClients");
408 f.dump_int("client_count", scheduler.client_count());
409 out << scheduler;
410 f.dump_string("clients", out.str());
411 f.close_section();
412
413 // Display sorted queues (res, wgt, lim)
414 f.open_object_section("mClockQueues");
415 f.dump_string("queues", display_queues());
416 f.close_section();
417 }
418
419 void mClockScheduler::enqueue(OpSchedulerItem&& item)
420 {
421 auto id = get_scheduler_id(item);
422
423 // TODO: move this check into OpSchedulerItem, handle backwards compat
424 if (op_scheduler_class::immediate == id.class_id) {
425 immediate.push_front(std::move(item));
426 } else {
427 int cost = calc_scaled_cost(item.get_cost());
428 item.set_qos_cost(cost);
429 dout(20) << __func__ << " " << id
430 << " item_cost: " << item.get_cost()
431 << " scaled_cost: " << cost
432 << dendl;
433
434 // Add item to scheduler queue
435 scheduler.add_request(
436 std::move(item),
437 id,
438 cost);
439 }
440
441 dout(20) << __func__ << " client_count: " << scheduler.client_count()
442 << " queue_sizes: [ imm: " << immediate.size()
443 << " sched: " << scheduler.request_count() << " ]"
444 << dendl;
445 dout(30) << __func__ << " mClockClients: "
446 << scheduler
447 << dendl;
448 dout(30) << __func__ << " mClockQueues: { "
449 << display_queues() << " }"
450 << dendl;
451 }
452
void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
{
  // Re-queued items go to the back of the immediate list; dequeue()
  // pops from the back, so they are serviced before older entries.
  immediate.push_back(std::move(item));
  // TODO: item may not be immediate, update mclock machinery to permit
  // putting the item back in the queue
}
459
460 WorkItem mClockScheduler::dequeue()
461 {
462 if (!immediate.empty()) {
463 WorkItem work_item{std::move(immediate.back())};
464 immediate.pop_back();
465 return work_item;
466 } else {
467 mclock_queue_t::PullReq result = scheduler.pull_request();
468 if (result.is_future()) {
469 return result.getTime();
470 } else if (result.is_none()) {
471 ceph_assert(
472 0 == "Impossible, must have checked empty() first");
473 return {};
474 } else {
475 ceph_assert(result.is_retn());
476
477 auto &retn = result.get_retn();
478 return std::move(*retn.request);
479 }
480 }
481 }
482
// Render the dmclock sub-queue state into a string; used by dump() and
// high-verbosity debug logging.
std::string mClockScheduler::display_queues() const
{
  std::ostringstream out;
  scheduler.display_queues(out);
  return out.str();
}
489
// Config options observed by this scheduler; handle_conf_change() is
// invoked whenever one of them changes. The array is NULL-terminated as
// the config observer interface requires.
const char** mClockScheduler::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_mclock_scheduler_client_res",
    "osd_mclock_scheduler_client_wgt",
    "osd_mclock_scheduler_client_lim",
    "osd_mclock_scheduler_background_recovery_res",
    "osd_mclock_scheduler_background_recovery_wgt",
    "osd_mclock_scheduler_background_recovery_lim",
    "osd_mclock_scheduler_background_best_effort_res",
    "osd_mclock_scheduler_background_best_effort_wgt",
    "osd_mclock_scheduler_background_best_effort_lim",
    "osd_mclock_cost_per_io_usec",
    "osd_mclock_cost_per_io_usec_hdd",
    "osd_mclock_cost_per_io_usec_ssd",
    "osd_mclock_cost_per_byte_usec",
    "osd_mclock_cost_per_byte_usec_hdd",
    "osd_mclock_cost_per_byte_usec_ssd",
    "osd_mclock_max_capacity_iops_hdd",
    "osd_mclock_max_capacity_iops_ssd",
    "osd_mclock_profile",
    NULL
  };
  return KEYS;
}
515
516 void mClockScheduler::handle_conf_change(
517 const ConfigProxy& conf,
518 const std::set<std::string> &changed)
519 {
520 if (changed.count("osd_mclock_cost_per_io_usec") ||
521 changed.count("osd_mclock_cost_per_io_usec_hdd") ||
522 changed.count("osd_mclock_cost_per_io_usec_ssd")) {
523 set_osd_mclock_cost_per_io();
524 }
525 if (changed.count("osd_mclock_cost_per_byte_usec") ||
526 changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
527 changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
528 set_osd_mclock_cost_per_byte();
529 }
530 if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
531 changed.count("osd_mclock_max_capacity_iops_ssd")) {
532 set_max_osd_capacity();
533 if (mclock_profile != "custom") {
534 enable_mclock_profile_settings();
535 client_registry.update_from_config(conf);
536 }
537 }
538 if (changed.count("osd_mclock_profile")) {
539 set_mclock_profile();
540 if (mclock_profile != "custom") {
541 enable_mclock_profile_settings();
542 client_registry.update_from_config(conf);
543 }
544 }
545 if (changed.count("osd_mclock_scheduler_client_res") ||
546 changed.count("osd_mclock_scheduler_client_wgt") ||
547 changed.count("osd_mclock_scheduler_client_lim") ||
548 changed.count("osd_mclock_scheduler_background_recovery_res") ||
549 changed.count("osd_mclock_scheduler_background_recovery_wgt") ||
550 changed.count("osd_mclock_scheduler_background_recovery_lim") ||
551 changed.count("osd_mclock_scheduler_background_best_effort_res") ||
552 changed.count("osd_mclock_scheduler_background_best_effort_wgt") ||
553 changed.count("osd_mclock_scheduler_background_best_effort_lim")) {
554 if (mclock_profile == "custom") {
555 client_registry.update_from_config(conf);
556 }
557 }
558 }
559
mClockScheduler::~mClockScheduler()
{
  // Stop receiving config-change notifications before members are
  // torn down.
  cct->_conf.remove_observer(this);
}
564
565 }