// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#include <functional>
#include <memory>

#include "osd/scheduler/mClockScheduler.h"
#include "common/dout.h"
22 namespace dmc
= crimson::dmclock
;
23 using namespace std::placeholders
;
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_mclock
28 #define dout_prefix *_dout << "mClockScheduler: "
31 namespace ceph::osd::scheduler
{
33 mClockScheduler::mClockScheduler(CephContext
*cct
,
41 num_shards(num_shards
),
43 is_rotational(is_rotational
),
46 std::bind(&mClockScheduler::ClientRegistry::get_info
,
50 cct
->_conf
.get_val
<double>("osd_mclock_scheduler_anticipation_timeout"))
52 cct
->_conf
.add_observer(this);
53 ceph_assert(num_shards
> 0);
54 set_osd_capacity_params_from_config();
55 set_config_defaults_from_profile();
56 client_registry
.update_from_config(
57 cct
->_conf
, osd_bandwidth_capacity_per_shard
);
60 /* ClientRegistry holds the dmclock::ClientInfo configuration parameters
61 * (reservation (bytes/second), weight (unitless), limit (bytes/second))
62 * for each IO class in the OSD (client, background_recovery,
63 * background_best_effort).
65 * mclock expects limit and reservation to have units of <cost>/second
66 * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided
67 * as ratios of the OSD's capacity. We convert from the one to the other
68 * using the capacity_per_shard parameter.
70 * Note, mclock profile information will already have been set as a default
71 * for the osd_mclock_scheduler_client_* parameters prior to calling
72 * update_from_config -- see set_config_defaults_from_profile().
74 void mClockScheduler::ClientRegistry::update_from_config(
75 const ConfigProxy
&conf
,
76 const double capacity_per_shard
)
79 auto get_res
= [&](double res
) {
81 return res
* capacity_per_shard
;
83 return default_min
; // min reservation
87 auto get_lim
= [&](double lim
) {
89 return lim
* capacity_per_shard
;
91 return default_max
; // high limit
95 // Set external client infos
96 double res
= conf
.get_val
<double>(
97 "osd_mclock_scheduler_client_res");
98 double lim
= conf
.get_val
<double>(
99 "osd_mclock_scheduler_client_lim");
100 uint64_t wgt
= conf
.get_val
<uint64_t>(
101 "osd_mclock_scheduler_client_wgt");
102 default_external_client_info
.update(
107 // Set background recovery client infos
108 res
= conf
.get_val
<double>(
109 "osd_mclock_scheduler_background_recovery_res");
110 lim
= conf
.get_val
<double>(
111 "osd_mclock_scheduler_background_recovery_lim");
112 wgt
= conf
.get_val
<uint64_t>(
113 "osd_mclock_scheduler_background_recovery_wgt");
114 internal_client_infos
[
115 static_cast<size_t>(op_scheduler_class::background_recovery
)].update(
120 // Set background best effort client infos
121 res
= conf
.get_val
<double>(
122 "osd_mclock_scheduler_background_best_effort_res");
123 lim
= conf
.get_val
<double>(
124 "osd_mclock_scheduler_background_best_effort_lim");
125 wgt
= conf
.get_val
<uint64_t>(
126 "osd_mclock_scheduler_background_best_effort_wgt");
127 internal_client_infos
[
128 static_cast<size_t>(op_scheduler_class::background_best_effort
)].update(
134 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_external_client(
135 const client_profile_id_t
&client
) const
137 auto ret
= external_client_infos
.find(client
);
138 if (ret
== external_client_infos
.end())
139 return &default_external_client_info
;
141 return &(ret
->second
);
144 const dmc::ClientInfo
*mClockScheduler::ClientRegistry::get_info(
145 const scheduler_id_t
&id
) const {
146 switch (id
.class_id
) {
147 case op_scheduler_class::immediate
:
148 ceph_assert(0 == "Cannot schedule immediate");
149 return (dmc::ClientInfo
*)nullptr;
150 case op_scheduler_class::client
:
151 return get_external_client(id
.client_profile_id
);
153 ceph_assert(static_cast<size_t>(id
.class_id
) < internal_client_infos
.size());
154 return &internal_client_infos
[static_cast<size_t>(id
.class_id
)];
158 void mClockScheduler::set_osd_capacity_params_from_config()
160 uint64_t osd_bandwidth_capacity
;
161 double osd_iop_capacity
;
163 std::tie(osd_bandwidth_capacity
, osd_iop_capacity
) = [&, this] {
165 return std::make_tuple(
166 cct
->_conf
.get_val
<Option::size_t>(
167 "osd_mclock_max_sequential_bandwidth_hdd"),
168 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_hdd"));
170 return std::make_tuple(
171 cct
->_conf
.get_val
<Option::size_t>(
172 "osd_mclock_max_sequential_bandwidth_ssd"),
173 cct
->_conf
.get_val
<double>("osd_mclock_max_capacity_iops_ssd"));
177 osd_bandwidth_capacity
= std::max
<uint64_t>(1, osd_bandwidth_capacity
);
178 osd_iop_capacity
= std::max
<double>(1.0, osd_iop_capacity
);
180 osd_bandwidth_cost_per_io
=
181 static_cast<double>(osd_bandwidth_capacity
) / osd_iop_capacity
;
182 osd_bandwidth_capacity_per_shard
= static_cast<double>(osd_bandwidth_capacity
)
183 / static_cast<double>(num_shards
);
185 dout(1) << __func__
<< ": osd_bandwidth_cost_per_io: "
186 << std::fixed
<< std::setprecision(2)
187 << osd_bandwidth_cost_per_io
<< " bytes/io"
188 << ", osd_bandwidth_capacity_per_shard "
189 << osd_bandwidth_capacity_per_shard
<< " bytes/second"
/**
 * profile_t
 *
 * mclock profile -- 3 params for each of 3 client classes
 * 0 (min): specifies no minimum reservation
 * 0 (max): specifies no upper limit
 */
struct profile_t {
  struct client_config_t {
    double reservation;  // fraction of capacity reserved (0 = no minimum)
    uint64_t weight;     // unitless share weight
    double limit;        // fraction of capacity as upper bound (0 = no limit)
  };
  client_config_t client;
  client_config_t background_recovery;
  client_config_t background_best_effort;
};
211 static std::ostream
&operator<<(
212 std::ostream
&lhs
, const profile_t::client_config_t
&rhs
)
214 return lhs
<< "{res: " << rhs
.reservation
215 << ", wgt: " << rhs
.weight
216 << ", lim: " << rhs
.limit
220 static std::ostream
&operator<<(std::ostream
&lhs
, const profile_t
&rhs
)
222 return lhs
<< "[client: " << rhs
.client
223 << ", background_recovery: " << rhs
.background_recovery
224 << ", background_best_effort: " << rhs
.background_best_effort
228 void mClockScheduler::set_config_defaults_from_profile()
230 // Let only a single osd shard (id:0) set the profile configs
239 * reservation: 60% | weight: 2 | limit: 0 (max) |
240 * Background Recovery Allocation:
241 * reservation: 40% | weight: 1 | limit: 0 (max) |
242 * Background Best Effort Allocation:
243 * reservation: 0 (min) | weight: 1 | limit: 70% |
245 static constexpr profile_t high_client_ops_profile
{
255 * reservation: 30% | weight: 1 | limit: 0 (max) |
256 * Background Recovery Allocation:
257 * reservation: 70% | weight: 2 | limit: 0 (max) |
258 * Background Best Effort Allocation:
259 * reservation: 0 (min) | weight: 1 | limit: 0 (max) |
261 static constexpr profile_t high_recovery_ops_profile
{
271 * reservation: 50% | weight: 1 | limit: 0 (max) |
272 * Background Recovery Allocation:
273 * reservation: 50% | weight: 1 | limit: 0 (max) |
274 * Background Best Effort Allocation:
275 * reservation: 0 (min) | weight: 1 | limit: 90% |
277 static constexpr profile_t balanced_profile
{
283 const profile_t
*profile
= nullptr;
284 auto mclock_profile
= cct
->_conf
.get_val
<std::string
>("osd_mclock_profile");
285 if (mclock_profile
== "high_client_ops") {
286 profile
= &high_client_ops_profile
;
287 dout(10) << "Setting high_client_ops profile " << *profile
<< dendl
;
288 } else if (mclock_profile
== "high_recovery_ops") {
289 profile
= &high_recovery_ops_profile
;
290 dout(10) << "Setting high_recovery_ops profile " << *profile
<< dendl
;
291 } else if (mclock_profile
== "balanced") {
292 profile
= &balanced_profile
;
293 dout(10) << "Setting balanced profile " << *profile
<< dendl
;
294 } else if (mclock_profile
== "custom") {
295 dout(10) << "Profile set to custom, not setting defaults" << dendl
;
298 derr
<< "Invalid mclock profile: " << mclock_profile
<< dendl
;
299 ceph_assert("Invalid choice of mclock profile" == 0);
302 ceph_assert(nullptr != profile
);
304 auto set_config
= [&conf
= cct
->_conf
](const char *key
, auto val
) {
305 conf
.set_val_default(key
, std::to_string(val
));
308 set_config("osd_mclock_scheduler_client_res", profile
->client
.reservation
);
309 set_config("osd_mclock_scheduler_client_wgt", profile
->client
.weight
);
310 set_config("osd_mclock_scheduler_client_lim", profile
->client
.limit
);
313 "osd_mclock_scheduler_background_recovery_res",
314 profile
->background_recovery
.reservation
);
316 "osd_mclock_scheduler_background_recovery_wgt",
317 profile
->background_recovery
.weight
);
319 "osd_mclock_scheduler_background_recovery_lim",
320 profile
->background_recovery
.limit
);
323 "osd_mclock_scheduler_background_best_effort_res",
324 profile
->background_best_effort
.reservation
);
326 "osd_mclock_scheduler_background_best_effort_wgt",
327 profile
->background_best_effort
.weight
);
329 "osd_mclock_scheduler_background_best_effort_lim",
330 profile
->background_best_effort
.limit
);
332 cct
->_conf
.apply_changes(nullptr);
335 uint32_t mClockScheduler::calc_scaled_cost(int item_cost
)
337 auto cost
= static_cast<uint32_t>(
339 1, // ensure cost is non-zero and positive
341 auto cost_per_io
= static_cast<uint32_t>(osd_bandwidth_cost_per_io
);
343 return std::max
<uint32_t>(cost
, cost_per_io
);
346 void mClockScheduler::update_configuration()
348 // Apply configuration change. The expectation is that
349 // at least one of the tracked mclock config option keys
350 // is modified before calling this method.
351 cct
->_conf
.apply_changes(nullptr);
354 void mClockScheduler::dump(ceph::Formatter
&f
) const
356 // Display queue sizes
357 f
.open_object_section("queue_sizes");
358 f
.dump_int("high_priority_queue", high_priority
.size());
359 f
.dump_int("scheduler", scheduler
.request_count());
362 // client map and queue tops (res, wgt, lim)
363 std::ostringstream out
;
364 f
.open_object_section("mClockClients");
365 f
.dump_int("client_count", scheduler
.client_count());
367 f
.dump_string("clients", out
.str());
370 // Display sorted queues (res, wgt, lim)
371 f
.open_object_section("mClockQueues");
372 f
.dump_string("queues", display_queues());
375 f
.open_object_section("HighPriorityQueue");
376 for (auto it
= high_priority
.begin();
377 it
!= high_priority
.end(); it
++) {
378 f
.dump_int("priority", it
->first
);
379 f
.dump_int("queue_size", it
->second
.size());
384 void mClockScheduler::enqueue(OpSchedulerItem
&& item
)
386 auto id
= get_scheduler_id(item
);
387 unsigned priority
= item
.get_priority();
389 // TODO: move this check into OpSchedulerItem, handle backwards compat
390 if (op_scheduler_class::immediate
== id
.class_id
) {
391 enqueue_high(immediate_class_priority
, std::move(item
));
392 } else if (priority
>= cutoff_priority
) {
393 enqueue_high(priority
, std::move(item
));
395 auto cost
= calc_scaled_cost(item
.get_cost());
396 item
.set_qos_cost(cost
);
397 dout(20) << __func__
<< " " << id
398 << " item_cost: " << item
.get_cost()
399 << " scaled_cost: " << cost
402 // Add item to scheduler queue
403 scheduler
.add_request(
409 dout(20) << __func__
<< " client_count: " << scheduler
.client_count()
410 << " queue_sizes: [ "
411 << " high_priority_queue: " << high_priority
.size()
412 << " sched: " << scheduler
.request_count() << " ]"
414 dout(30) << __func__
<< " mClockClients: "
417 dout(30) << __func__
<< " mClockQueues: { "
418 << display_queues() << " }"
422 void mClockScheduler::enqueue_front(OpSchedulerItem
&& item
)
424 unsigned priority
= item
.get_priority();
425 auto id
= get_scheduler_id(item
);
427 if (op_scheduler_class::immediate
== id
.class_id
) {
428 enqueue_high(immediate_class_priority
, std::move(item
), true);
429 } else if (priority
>= cutoff_priority
) {
430 enqueue_high(priority
, std::move(item
), true);
432 // mClock does not support enqueue at front, so we use
433 // the high queue with priority 0
434 enqueue_high(0, std::move(item
), true);
438 void mClockScheduler::enqueue_high(unsigned priority
,
439 OpSchedulerItem
&& item
,
443 high_priority
[priority
].push_back(std::move(item
));
445 high_priority
[priority
].push_front(std::move(item
));
449 WorkItem
mClockScheduler::dequeue()
451 if (!high_priority
.empty()) {
452 auto iter
= high_priority
.begin();
453 // invariant: high_priority entries are never empty
454 assert(!iter
->second
.empty());
455 WorkItem ret
{std::move(iter
->second
.back())};
456 iter
->second
.pop_back();
457 if (iter
->second
.empty()) {
458 // maintain invariant, high priority entries are never empty
459 high_priority
.erase(iter
);
461 ceph_assert(std::get_if
<OpSchedulerItem
>(&ret
));
464 mclock_queue_t::PullReq result
= scheduler
.pull_request();
465 if (result
.is_future()) {
466 return result
.getTime();
467 } else if (result
.is_none()) {
469 0 == "Impossible, must have checked empty() first");
472 ceph_assert(result
.is_retn());
474 auto &retn
= result
.get_retn();
475 return std::move(*retn
.request
);
480 std::string
mClockScheduler::display_queues() const
482 std::ostringstream out
;
483 scheduler
.display_queues(out
);
487 const char** mClockScheduler::get_tracked_conf_keys() const
489 static const char* KEYS
[] = {
490 "osd_mclock_scheduler_client_res",
491 "osd_mclock_scheduler_client_wgt",
492 "osd_mclock_scheduler_client_lim",
493 "osd_mclock_scheduler_background_recovery_res",
494 "osd_mclock_scheduler_background_recovery_wgt",
495 "osd_mclock_scheduler_background_recovery_lim",
496 "osd_mclock_scheduler_background_best_effort_res",
497 "osd_mclock_scheduler_background_best_effort_wgt",
498 "osd_mclock_scheduler_background_best_effort_lim",
499 "osd_mclock_max_capacity_iops_hdd",
500 "osd_mclock_max_capacity_iops_ssd",
501 "osd_mclock_max_sequential_bandwidth_hdd",
502 "osd_mclock_max_sequential_bandwidth_ssd",
503 "osd_mclock_profile",
509 void mClockScheduler::handle_conf_change(
510 const ConfigProxy
& conf
,
511 const std::set
<std::string
> &changed
)
513 if (changed
.count("osd_mclock_max_capacity_iops_hdd") ||
514 changed
.count("osd_mclock_max_capacity_iops_ssd")) {
515 set_osd_capacity_params_from_config();
516 client_registry
.update_from_config(
517 conf
, osd_bandwidth_capacity_per_shard
);
519 if (changed
.count("osd_mclock_max_sequential_bandwidth_hdd") ||
520 changed
.count("osd_mclock_max_sequential_bandwidth_ssd")) {
521 set_osd_capacity_params_from_config();
522 client_registry
.update_from_config(
523 conf
, osd_bandwidth_capacity_per_shard
);
525 if (changed
.count("osd_mclock_profile")) {
526 set_config_defaults_from_profile();
527 client_registry
.update_from_config(
528 conf
, osd_bandwidth_capacity_per_shard
);
531 auto get_changed_key
= [&changed
]() -> std::optional
<std::string
> {
532 static const std::vector
<std::string
> qos_params
= {
533 "osd_mclock_scheduler_client_res",
534 "osd_mclock_scheduler_client_wgt",
535 "osd_mclock_scheduler_client_lim",
536 "osd_mclock_scheduler_background_recovery_res",
537 "osd_mclock_scheduler_background_recovery_wgt",
538 "osd_mclock_scheduler_background_recovery_lim",
539 "osd_mclock_scheduler_background_best_effort_res",
540 "osd_mclock_scheduler_background_best_effort_wgt",
541 "osd_mclock_scheduler_background_best_effort_lim"
544 for (auto &qp
: qos_params
) {
545 if (changed
.count(qp
)) {
552 if (auto key
= get_changed_key(); key
.has_value()) {
553 auto mclock_profile
= cct
->_conf
.get_val
<std::string
>("osd_mclock_profile");
554 if (mclock_profile
== "custom") {
555 client_registry
.update_from_config(
556 conf
, osd_bandwidth_capacity_per_shard
);
558 // Attempt to change QoS parameter for a built-in profile. Restore the
559 // profile defaults by making one of the OSD shards remove the key from
560 // config monitor store. Note: monc is included in the check since the
561 // mock unit test currently doesn't initialize it.
562 if (shard_id
== 0 && monc
) {
563 static const std::vector
<std::string
> osds
= {
565 "osd." + std::to_string(whoami
)
568 for (auto osd
: osds
) {
571 "\"prefix\": \"config rm\", "
572 "\"who\": \"" + osd
+ "\", "
573 "\"name\": \"" + *key
+ "\""
575 std::vector
<std::string
> vcmd
{cmd
};
577 dout(10) << __func__
<< " Removing Key: " << *key
578 << " for " << osd
<< " from Mon db" << dendl
;
579 monc
->start_mon_command(vcmd
, {}, nullptr, nullptr, nullptr);
583 // Alternatively, the QoS parameter, if set ephemerally for this OSD via
584 // the 'daemon' or 'tell' interfaces must be removed.
585 if (!cct
->_conf
.rm_val(*key
)) {
586 dout(10) << __func__
<< " Restored " << *key
<< " to default" << dendl
;
587 cct
->_conf
.apply_changes(nullptr);
592 mClockScheduler::~mClockScheduler()
594 cct
->_conf
.remove_observer(this);