]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/scheduler/mClockScheduler.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / osd / scheduler / mClockScheduler.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 Red Hat Inc.
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #include <memory>
17 #include <functional>
18
19 #include "osd/scheduler/mClockScheduler.h"
20 #include "common/dout.h"
21
22 namespace dmc = crimson::dmclock;
23 using namespace std::placeholders;
24
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_mclock
27 #undef dout_prefix
28 #define dout_prefix *_dout << "mClockScheduler: "
29
30
31 namespace ceph::osd::scheduler {
32
33 mClockScheduler::mClockScheduler(CephContext *cct,
34 uint32_t num_shards,
35 bool is_rotational)
36 : cct(cct),
37 num_shards(num_shards),
38 is_rotational(is_rotational),
39 scheduler(
40 std::bind(&mClockScheduler::ClientRegistry::get_info,
41 &client_registry,
42 _1),
43 dmc::AtLimit::Wait,
44 cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
45 {
46 cct->_conf.add_observer(this);
47 ceph_assert(num_shards > 0);
48 set_max_osd_capacity();
49 set_osd_mclock_cost_per_io();
50 set_osd_mclock_cost_per_byte();
51 set_mclock_profile();
52 enable_mclock_profile_settings();
53 client_registry.update_from_config(cct->_conf);
54 }
55
56 void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
57 {
58 default_external_client_info.update(
59 conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
60 conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
61 conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
62
63 internal_client_infos[
64 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
65 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
66 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
67 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
68
69 internal_client_infos[
70 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
71 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
72 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
73 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
74 }
75
76 const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
77 const client_profile_id_t &client) const
78 {
79 auto ret = external_client_infos.find(client);
80 if (ret == external_client_infos.end())
81 return &default_external_client_info;
82 else
83 return &(ret->second);
84 }
85
86 const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
87 const scheduler_id_t &id) const {
88 switch (id.class_id) {
89 case op_scheduler_class::immediate:
90 ceph_assert(0 == "Cannot schedule immediate");
91 return (dmc::ClientInfo*)nullptr;
92 case op_scheduler_class::client:
93 return get_external_client(id.client_profile_id);
94 default:
95 ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
96 return &internal_client_infos[static_cast<size_t>(id.class_id)];
97 }
98 }
99
100 void mClockScheduler::set_max_osd_capacity()
101 {
102 if (is_rotational) {
103 max_osd_capacity =
104 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
105 } else {
106 max_osd_capacity =
107 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
108 }
109 // Set per op-shard iops limit
110 max_osd_capacity /= num_shards;
111 dout(1) << __func__ << " #op shards: " << num_shards
112 << std::fixed << std::setprecision(2)
113 << " max osd capacity(iops) per shard: " << max_osd_capacity
114 << dendl;
115 }
116
117 void mClockScheduler::set_osd_mclock_cost_per_io()
118 {
119 std::chrono::seconds sec(1);
120 if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
121 osd_mclock_cost_per_io =
122 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
123 } else {
124 if (is_rotational) {
125 osd_mclock_cost_per_io =
126 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
127 // For HDDs, convert value to seconds
128 osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
129 } else {
130 // For SSDs, convert value to milliseconds
131 osd_mclock_cost_per_io =
132 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
133 osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
134 }
135 }
136 dout(1) << __func__ << " osd_mclock_cost_per_io: "
137 << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
138 << dendl;
139 }
140
141 void mClockScheduler::set_osd_mclock_cost_per_byte()
142 {
143 std::chrono::seconds sec(1);
144 if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
145 osd_mclock_cost_per_byte =
146 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
147 } else {
148 if (is_rotational) {
149 osd_mclock_cost_per_byte =
150 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
151 // For HDDs, convert value to seconds
152 osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
153 } else {
154 osd_mclock_cost_per_byte =
155 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
156 // For SSDs, convert value to milliseconds
157 osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
158 }
159 }
160 dout(1) << __func__ << " osd_mclock_cost_per_byte: "
161 << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
162 << dendl;
163 }
164
void mClockScheduler::set_mclock_profile()
{
  // Cache the profile selected via the "osd_mclock_profile" option;
  // enable_mclock_profile_settings() and handle_conf_change() act on
  // this cached value.
  mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
  dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl;
}
170
// Return the currently active mclock profile name as cached by
// set_mclock_profile() (e.g. "balanced", "high_recovery_ops",
// "high_client_ops" or "custom").
std::string mClockScheduler::get_mclock_profile()
{
  return mclock_profile;
}
175
176 void mClockScheduler::set_balanced_profile_allocations()
177 {
178 // Client Allocation:
179 // reservation: 40% | weight: 1 | limit: 100% |
180 // Background Recovery Allocation:
181 // reservation: 40% | weight: 1 | limit: 150% |
182 // Background Best Effort Allocation:
183 // reservation: 20% | weight: 2 | limit: max |
184
185 // Client
186 uint64_t client_res = static_cast<uint64_t>(
187 std::round(0.40 * max_osd_capacity));
188 uint64_t client_lim = static_cast<uint64_t>(
189 std::round(max_osd_capacity));
190 uint64_t client_wgt = default_min;
191
192 // Background Recovery
193 uint64_t rec_res = static_cast<uint64_t>(
194 std::round(0.40 * max_osd_capacity));
195 uint64_t rec_lim = static_cast<uint64_t>(
196 std::round(1.5 * max_osd_capacity));
197 uint64_t rec_wgt = default_min;
198
199 // Background Best Effort
200 uint64_t best_effort_res = static_cast<uint64_t>(
201 std::round(0.20 * max_osd_capacity));
202 uint64_t best_effort_lim = default_max;
203 uint64_t best_effort_wgt = 2;
204
205 // Set the allocations for the mclock clients
206 client_allocs[
207 static_cast<size_t>(op_scheduler_class::client)].update(
208 client_res,
209 client_wgt,
210 client_lim);
211 client_allocs[
212 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
213 rec_res,
214 rec_wgt,
215 rec_lim);
216 client_allocs[
217 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
218 best_effort_res,
219 best_effort_wgt,
220 best_effort_lim);
221 }
222
223 void mClockScheduler::set_high_recovery_ops_profile_allocations()
224 {
225 // Client Allocation:
226 // reservation: 30% | weight: 1 | limit: 80% |
227 // Background Recovery Allocation:
228 // reservation: 60% | weight: 2 | limit: 200% |
229 // Background Best Effort Allocation:
230 // reservation: 1 | weight: 2 | limit: max |
231
232 // Client
233 uint64_t client_res = static_cast<uint64_t>(
234 std::round(0.30 * max_osd_capacity));
235 uint64_t client_lim = static_cast<uint64_t>(
236 std::round(0.80 * max_osd_capacity));
237 uint64_t client_wgt = default_min;
238
239 // Background Recovery
240 uint64_t rec_res = static_cast<uint64_t>(
241 std::round(0.60 * max_osd_capacity));
242 uint64_t rec_lim = static_cast<uint64_t>(
243 std::round(2.0 * max_osd_capacity));
244 uint64_t rec_wgt = 2;
245
246 // Background Best Effort
247 uint64_t best_effort_res = default_min;
248 uint64_t best_effort_lim = default_max;
249 uint64_t best_effort_wgt = 2;
250
251 // Set the allocations for the mclock clients
252 client_allocs[
253 static_cast<size_t>(op_scheduler_class::client)].update(
254 client_res,
255 client_wgt,
256 client_lim);
257 client_allocs[
258 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
259 rec_res,
260 rec_wgt,
261 rec_lim);
262 client_allocs[
263 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
264 best_effort_res,
265 best_effort_wgt,
266 best_effort_lim);
267 }
268
269 void mClockScheduler::set_high_client_ops_profile_allocations()
270 {
271 // Client Allocation:
272 // reservation: 50% | weight: 2 | limit: max |
273 // Background Recovery Allocation:
274 // reservation: 25% | weight: 1 | limit: 100% |
275 // Background Best Effort Allocation:
276 // reservation: 25% | weight: 2 | limit: max |
277
278 // Client
279 uint64_t client_res = static_cast<uint64_t>(
280 std::round(0.50 * max_osd_capacity));
281 uint64_t client_wgt = 2;
282 uint64_t client_lim = default_max;
283
284 // Background Recovery
285 uint64_t rec_res = static_cast<uint64_t>(
286 std::round(0.25 * max_osd_capacity));
287 uint64_t rec_lim = static_cast<uint64_t>(
288 std::round(max_osd_capacity));
289 uint64_t rec_wgt = default_min;
290
291 // Background Best Effort
292 uint64_t best_effort_res = static_cast<uint64_t>(
293 std::round(0.25 * max_osd_capacity));
294 uint64_t best_effort_lim = default_max;
295 uint64_t best_effort_wgt = 2;
296
297 // Set the allocations for the mclock clients
298 client_allocs[
299 static_cast<size_t>(op_scheduler_class::client)].update(
300 client_res,
301 client_wgt,
302 client_lim);
303 client_allocs[
304 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
305 rec_res,
306 rec_wgt,
307 rec_lim);
308 client_allocs[
309 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
310 best_effort_res,
311 best_effort_wgt,
312 best_effort_lim);
313 }
314
315 void mClockScheduler::enable_mclock_profile_settings()
316 {
317 // Nothing to do for "custom" profile
318 if (mclock_profile == "custom") {
319 return;
320 }
321
322 // Set mclock and ceph config options for the chosen profile
323 if (mclock_profile == "balanced") {
324 set_balanced_profile_allocations();
325 } else if (mclock_profile == "high_recovery_ops") {
326 set_high_recovery_ops_profile_allocations();
327 } else if (mclock_profile == "high_client_ops") {
328 set_high_client_ops_profile_allocations();
329 } else {
330 ceph_assert("Invalid choice of mclock profile" == 0);
331 return;
332 }
333
334 // Set the mclock config parameters
335 set_profile_config();
336 }
337
338 void mClockScheduler::set_profile_config()
339 {
340 ClientAllocs client = client_allocs[
341 static_cast<size_t>(op_scheduler_class::client)];
342 ClientAllocs rec = client_allocs[
343 static_cast<size_t>(op_scheduler_class::background_recovery)];
344 ClientAllocs best_effort = client_allocs[
345 static_cast<size_t>(op_scheduler_class::background_best_effort)];
346
347 // Set external client params
348 cct->_conf.set_val("osd_mclock_scheduler_client_res",
349 std::to_string(client.res));
350 cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
351 std::to_string(client.wgt));
352 cct->_conf.set_val("osd_mclock_scheduler_client_lim",
353 std::to_string(client.lim));
354 dout(10) << __func__ << " client QoS params: " << "["
355 << client.res << "," << client.wgt << "," << client.lim
356 << "]" << dendl;
357
358 // Set background recovery client params
359 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
360 std::to_string(rec.res));
361 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
362 std::to_string(rec.wgt));
363 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
364 std::to_string(rec.lim));
365 dout(10) << __func__ << " Recovery QoS params: " << "["
366 << rec.res << "," << rec.wgt << "," << rec.lim
367 << "]" << dendl;
368
369 // Set background best effort client params
370 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
371 std::to_string(best_effort.res));
372 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
373 std::to_string(best_effort.wgt));
374 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
375 std::to_string(best_effort.lim));
376 dout(10) << __func__ << " Best effort QoS params: " << "["
377 << best_effort.res << "," << best_effort.wgt << "," << best_effort.lim
378 << "]" << dendl;
379 }
380
381 int mClockScheduler::calc_scaled_cost(int item_cost)
382 {
383 // Calculate total scaled cost in secs
384 int scaled_cost =
385 std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
386 return std::max(scaled_cost, 1);
387 }
388
void mClockScheduler::update_configuration()
{
  // Force pending config changes to be applied, which notifies all
  // registered observers (including this scheduler). The expectation is
  // that at least one of the tracked mclock config option keys (see
  // get_tracked_conf_keys()) is modified before calling this method.
  cct->_conf.apply_changes(nullptr);
}
396
397 void mClockScheduler::dump(ceph::Formatter &f) const
398 {
399 // Display queue sizes
400 f.open_object_section("queue_sizes");
401 f.dump_int("immediate", immediate.size());
402 f.dump_int("scheduler", scheduler.request_count());
403 f.close_section();
404
405 // client map and queue tops (res, wgt, lim)
406 std::ostringstream out;
407 f.open_object_section("mClockClients");
408 f.dump_int("client_count", scheduler.client_count());
409 out << scheduler;
410 f.dump_string("clients", out.str());
411 f.close_section();
412
413 // Display sorted queues (res, wgt, lim)
414 f.open_object_section("mClockQueues");
415 f.dump_string("queues", display_queues());
416 f.close_section();
417 }
418
419 void mClockScheduler::enqueue(OpSchedulerItem&& item)
420 {
421 auto id = get_scheduler_id(item);
422
423 // TODO: move this check into OpSchedulerItem, handle backwards compat
424 if (op_scheduler_class::immediate == id.class_id) {
425 immediate.push_front(std::move(item));
426 } else {
427 int cost = calc_scaled_cost(item.get_cost());
428 item.set_qos_cost(cost);
429 dout(20) << __func__ << " " << id
430 << " item_cost: " << item.get_cost()
431 << " scaled_cost: " << cost
432 << dendl;
433
434 // Add item to scheduler queue
435 scheduler.add_request(
436 std::move(item),
437 id,
438 cost);
439 }
440
441 dout(20) << __func__ << " client_count: " << scheduler.client_count()
442 << " queue_sizes: [ imm: " << immediate.size()
443 << " sched: " << scheduler.request_count() << " ]"
444 << dendl;
445 dout(30) << __func__ << " mClockClients: "
446 << scheduler
447 << dendl;
448 dout(30) << __func__ << " mClockQueues: { "
449 << display_queues() << " }"
450 << dendl;
451 }
452
void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
{
  // Re-queued items go to the back of the immediate list; dequeue()
  // pops from the back, so they are serviced before older entries.
  immediate.push_back(std::move(item));
  // TODO: item may not be immediate, update mclock machinery to permit
  // putting the item back in the queue
}
459
460 WorkItem mClockScheduler::dequeue()
461 {
462 if (!immediate.empty()) {
463 WorkItem work_item{std::move(immediate.back())};
464 immediate.pop_back();
465 return work_item;
466 } else {
467 mclock_queue_t::PullReq result = scheduler.pull_request();
468 if (result.is_future()) {
469 return result.getTime();
470 } else if (result.is_none()) {
471 ceph_assert(
472 0 == "Impossible, must have checked empty() first");
473 return {};
474 } else {
475 ceph_assert(result.is_retn());
476
477 auto &retn = result.get_retn();
478 return std::move(*retn.request);
479 }
480 }
481 }
482
// Render the dmclock sub-queue state into a string; used by dump() and
// high-verbosity debug logging.
std::string mClockScheduler::display_queues() const
{
  std::ostringstream out;
  scheduler.display_queues(out);
  return out.str();
}
489
// Config options observed by this scheduler; handle_conf_change() is
// invoked whenever one of them changes. The array is NULL-terminated as
// the config observer interface requires.
const char** mClockScheduler::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_mclock_scheduler_client_res",
    "osd_mclock_scheduler_client_wgt",
    "osd_mclock_scheduler_client_lim",
    "osd_mclock_scheduler_background_recovery_res",
    "osd_mclock_scheduler_background_recovery_wgt",
    "osd_mclock_scheduler_background_recovery_lim",
    "osd_mclock_scheduler_background_best_effort_res",
    "osd_mclock_scheduler_background_best_effort_wgt",
    "osd_mclock_scheduler_background_best_effort_lim",
    "osd_mclock_cost_per_io_usec",
    "osd_mclock_cost_per_io_usec_hdd",
    "osd_mclock_cost_per_io_usec_ssd",
    "osd_mclock_cost_per_byte_usec",
    "osd_mclock_cost_per_byte_usec_hdd",
    "osd_mclock_cost_per_byte_usec_ssd",
    "osd_mclock_max_capacity_iops_hdd",
    "osd_mclock_max_capacity_iops_ssd",
    "osd_mclock_profile",
    NULL
  };
  return KEYS;
}
515
516 void mClockScheduler::handle_conf_change(
517 const ConfigProxy& conf,
518 const std::set<std::string> &changed)
519 {
520 if (changed.count("osd_mclock_cost_per_io_usec") ||
521 changed.count("osd_mclock_cost_per_io_usec_hdd") ||
522 changed.count("osd_mclock_cost_per_io_usec_ssd")) {
523 set_osd_mclock_cost_per_io();
524 }
525 if (changed.count("osd_mclock_cost_per_byte_usec") ||
526 changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
527 changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
528 set_osd_mclock_cost_per_byte();
529 }
530 if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
531 changed.count("osd_mclock_max_capacity_iops_ssd")) {
532 set_max_osd_capacity();
533 if (mclock_profile != "custom") {
534 enable_mclock_profile_settings();
535 client_registry.update_from_config(conf);
536 }
537 }
538 if (changed.count("osd_mclock_profile")) {
539 set_mclock_profile();
540 if (mclock_profile != "custom") {
541 enable_mclock_profile_settings();
542 client_registry.update_from_config(conf);
543 }
544 }
545 if (changed.count("osd_mclock_scheduler_client_res") ||
546 changed.count("osd_mclock_scheduler_client_wgt") ||
547 changed.count("osd_mclock_scheduler_client_lim") ||
548 changed.count("osd_mclock_scheduler_background_recovery_res") ||
549 changed.count("osd_mclock_scheduler_background_recovery_wgt") ||
550 changed.count("osd_mclock_scheduler_background_recovery_lim") ||
551 changed.count("osd_mclock_scheduler_background_best_effort_res") ||
552 changed.count("osd_mclock_scheduler_background_best_effort_wgt") ||
553 changed.count("osd_mclock_scheduler_background_best_effort_lim")) {
554 if (mclock_profile == "custom") {
555 client_registry.update_from_config(conf);
556 }
557 }
558 }
559
mClockScheduler::~mClockScheduler()
{
  // Stop receiving config-change notifications before members are
  // torn down.
  cct->_conf.remove_observer(this);
}
564
565 }