git.proxmox.com Git - ceph.git/blobdiff - ceph/src/osd/scheduler/mClockScheduler.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / osd / scheduler / mClockScheduler.cc
index 4b388bd5217aa2de4fd01049c772f663cca5617e..8f76e516530dadaba45b581fc2a76435b085c712 100644 (file)
@@ -51,32 +51,84 @@ mClockScheduler::mClockScheduler(CephContext *cct,
 {
   cct->_conf.add_observer(this);
   ceph_assert(num_shards > 0);
-  set_max_osd_capacity();
-  set_osd_mclock_cost_per_io();
-  set_osd_mclock_cost_per_byte();
-  set_mclock_profile();
-  enable_mclock_profile_settings();
-  client_registry.update_from_config(cct->_conf);
+  set_osd_capacity_params_from_config();
+  set_config_defaults_from_profile();
+  client_registry.update_from_config(
+    cct->_conf, osd_bandwidth_capacity_per_shard);
 }
 
-void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+/* ClientRegistry holds the dmclock::ClientInfo configuration parameters
+ * (reservation (bytes/second), weight (unitless), limit (bytes/second))
+ * for each IO class in the OSD (client, background_recovery,
+ * background_best_effort).
+ *
+ * mclock expects limit and reservation to have units of <cost>/second
+ * (bytes/second), but osd_mclock_scheduler_client_(lim|res) are provided
+ * as ratios of the OSD's capacity.  We convert from the one to the other
+ * using the capacity_per_shard parameter.
+ *
+ * Note, mclock profile information will already have been set as a default
+ * for the osd_mclock_scheduler_client_* parameters prior to calling
+ * update_from_config -- see set_config_defaults_from_profile().
+ */
+void mClockScheduler::ClientRegistry::update_from_config(
+  const ConfigProxy &conf,
+  const double capacity_per_shard)
 {
-  default_external_client_info.update(
-    conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
 
+  auto get_res = [&](double res) {
+    if (res) {
+      return res * capacity_per_shard;
+    } else {
+      return default_min; // min reservation
+    }
+  };
+
+  auto get_lim = [&](double lim) {
+    if (lim) {
+      return lim * capacity_per_shard;
+    } else {
+      return default_max; // high limit
+    }
+  };
+
+  // Set external client infos
+  double res = conf.get_val<double>(
+    "osd_mclock_scheduler_client_res");
+  double lim = conf.get_val<double>(
+    "osd_mclock_scheduler_client_lim");
+  uint64_t wgt = conf.get_val<uint64_t>(
+    "osd_mclock_scheduler_client_wgt");
+  default_external_client_info.update(
+    get_res(res),
+    wgt,
+    get_lim(lim));
+
+  // Set background recovery client infos
+  res = conf.get_val<double>(
+    "osd_mclock_scheduler_background_recovery_res");
+  lim = conf.get_val<double>(
+    "osd_mclock_scheduler_background_recovery_lim");
+  wgt = conf.get_val<uint64_t>(
+    "osd_mclock_scheduler_background_recovery_wgt");
   internal_client_infos[
     static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
-
+      get_res(res),
+      wgt,
+      get_lim(lim));
+
+  // Set background best effort client infos
+  res = conf.get_val<double>(
+    "osd_mclock_scheduler_background_best_effort_res");
+  lim = conf.get_val<double>(
+    "osd_mclock_scheduler_background_best_effort_lim");
+  wgt = conf.get_val<uint64_t>(
+    "osd_mclock_scheduler_background_best_effort_wgt");
   internal_client_infos[
     static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
-    conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
+      get_res(res),
+      wgt,
+      get_lim(lim));
 }
 
 const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
@@ -103,303 +155,193 @@ const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
   }
 }
 
-void mClockScheduler::set_max_osd_capacity()
+void mClockScheduler::set_osd_capacity_params_from_config()
 {
-  if (is_rotational) {
-    max_osd_capacity =
-      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
-    cct->_conf.set_val("osd_mclock_max_capacity_iops_ssd", "0");
-  } else {
-    max_osd_capacity =
-      cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
-    cct->_conf.set_val("osd_mclock_max_capacity_iops_hdd", "0");
-  }
-  // Set per op-shard iops limit
-  max_osd_capacity /= num_shards;
-  dout(1) << __func__ << " #op shards: " << num_shards
-          << std::fixed << std::setprecision(2)
-          << " max osd capacity(iops) per shard: " << max_osd_capacity
-          << dendl;
-}
+  uint64_t osd_bandwidth_capacity;
+  double osd_iop_capacity;
 
-void mClockScheduler::set_osd_mclock_cost_per_io()
-{
-  std::chrono::seconds sec(1);
-  if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
-    osd_mclock_cost_per_io =
-      cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
-  } else {
+  std::tie(osd_bandwidth_capacity, osd_iop_capacity) = [&, this] {
     if (is_rotational) {
-      osd_mclock_cost_per_io =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
-      // For HDDs, convert value to seconds
-      osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
+      return std::make_tuple(
+        cct->_conf.get_val<Option::size_t>(
+          "osd_mclock_max_sequential_bandwidth_hdd"),
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd"));
     } else {
-      // For SSDs, convert value to milliseconds
-      osd_mclock_cost_per_io =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
-      osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
+      return std::make_tuple(
+        cct->_conf.get_val<Option::size_t>(
+          "osd_mclock_max_sequential_bandwidth_ssd"),
+        cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd"));
     }
-  }
-  dout(1) << __func__ << " osd_mclock_cost_per_io: "
-          << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
-          << dendl;
-}
+  }();
 
-void mClockScheduler::set_osd_mclock_cost_per_byte()
-{
-  std::chrono::seconds sec(1);
-  if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
-    osd_mclock_cost_per_byte =
-      cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
-  } else {
-    if (is_rotational) {
-      osd_mclock_cost_per_byte =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
-      // For HDDs, convert value to seconds
-      osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
-    } else {
-      osd_mclock_cost_per_byte =
-        cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
-      // For SSDs, convert value to milliseconds
-      osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
-    }
-  }
-  dout(1) << __func__ << " osd_mclock_cost_per_byte: "
-          << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
-          << dendl;
-}
+  osd_bandwidth_capacity = std::max<uint64_t>(1, osd_bandwidth_capacity);
+  osd_iop_capacity = std::max<double>(1.0, osd_iop_capacity);
 
-void mClockScheduler::set_mclock_profile()
-{
-  mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
-  dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl;
-}
+  osd_bandwidth_cost_per_io =
+    static_cast<double>(osd_bandwidth_capacity) / osd_iop_capacity;
+  osd_bandwidth_capacity_per_shard = static_cast<double>(osd_bandwidth_capacity)
+    / static_cast<double>(num_shards);
 
-std::string mClockScheduler::get_mclock_profile()
-{
-  return mclock_profile;
+  dout(1) << __func__ << ": osd_bandwidth_cost_per_io: "
+          << std::fixed << std::setprecision(2)
+          << osd_bandwidth_cost_per_io << " bytes/io"
+          << ", osd_bandwidth_capacity_per_shard "
+          << osd_bandwidth_capacity_per_shard << " bytes/second"
+          << dendl;
 }
 
-void mClockScheduler::set_balanced_profile_allocations()
-{
-  // Client Allocation:
-  //   reservation: 40% | weight: 1 | limit: 100% |
-  // Background Recovery Allocation:
-  //   reservation: 40% | weight: 1 | limit: 150% |
-  // Background Best Effort Allocation:
-  //   reservation: 20% | weight: 2 | limit: max |
-
-  // Client
-  uint64_t client_res = static_cast<uint64_t>(
-    std::round(0.40 * max_osd_capacity));
-  uint64_t client_lim = static_cast<uint64_t>(
-    std::round(max_osd_capacity));
-  uint64_t client_wgt = default_min;
-
-  // Background Recovery
-  uint64_t rec_res = static_cast<uint64_t>(
-    std::round(0.40 * max_osd_capacity));
-  uint64_t rec_lim = static_cast<uint64_t>(
-    std::round(1.5 * max_osd_capacity));
-  uint64_t rec_wgt = default_min;
-
-  // Background Best Effort
-  uint64_t best_effort_res = static_cast<uint64_t>(
-    std::round(0.20 * max_osd_capacity));
-  uint64_t best_effort_lim = default_max;
-  uint64_t best_effort_wgt = 2;
-
-  // Set the allocations for the mclock clients
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::client)].update(
-      client_res,
-      client_wgt,
-      client_lim);
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-      rec_res,
-      rec_wgt,
-      rec_lim);
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-      best_effort_res,
-      best_effort_wgt,
-      best_effort_lim);
-}
+/**
+ * profile_t
+ *
+ * mclock profile -- 3 params for each of 3 client classes
+ * 0 (min): specifies no minimum reservation
+ * 0 (max): specifies no upper limit
+ */
+struct profile_t {
+  struct client_config_t {
+    double reservation;
+    uint64_t weight;
+    double limit;
+  };
+  client_config_t client;
+  client_config_t background_recovery;
+  client_config_t background_best_effort;
+};
 
-void mClockScheduler::set_high_recovery_ops_profile_allocations()
+static std::ostream &operator<<(
+  std::ostream &lhs, const profile_t::client_config_t &rhs)
 {
-  // Client Allocation:
-  //   reservation: 30% | weight: 1 | limit: 80% |
-  // Background Recovery Allocation:
-  //   reservation: 60% | weight: 2 | limit: 200% |
-  // Background Best Effort Allocation:
-  //   reservation: 1 | weight: 2 | limit: max |
-
-  // Client
-  uint64_t client_res = static_cast<uint64_t>(
-    std::round(0.30 * max_osd_capacity));
-  uint64_t client_lim = static_cast<uint64_t>(
-    std::round(0.80 * max_osd_capacity));
-  uint64_t client_wgt = default_min;
-
-  // Background Recovery
-  uint64_t rec_res = static_cast<uint64_t>(
-    std::round(0.60 * max_osd_capacity));
-  uint64_t rec_lim = static_cast<uint64_t>(
-    std::round(2.0 * max_osd_capacity));
-  uint64_t rec_wgt = 2;
-
-  // Background Best Effort
-  uint64_t best_effort_res = default_min;
-  uint64_t best_effort_lim = default_max;
-  uint64_t best_effort_wgt = 2;
-
-  // Set the allocations for the mclock clients
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::client)].update(
-      client_res,
-      client_wgt,
-      client_lim);
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-      rec_res,
-      rec_wgt,
-      rec_lim);
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-      best_effort_res,
-      best_effort_wgt,
-      best_effort_lim);
+  return lhs << "{res: " << rhs.reservation
+             << ", wgt: " << rhs.weight
+             << ", lim: " << rhs.limit
+             << "}";
 }
 
-void mClockScheduler::set_high_client_ops_profile_allocations()
+static std::ostream &operator<<(std::ostream &lhs, const profile_t &rhs)
 {
-  // Client Allocation:
-  //   reservation: 50% | weight: 2 | limit: max |
-  // Background Recovery Allocation:
-  //   reservation: 25% | weight: 1 | limit: 100% |
-  // Background Best Effort Allocation:
-  //   reservation: 25% | weight: 2 | limit: max |
-
-  // Client
-  uint64_t client_res = static_cast<uint64_t>(
-    std::round(0.50 * max_osd_capacity));
-  uint64_t client_wgt = 2;
-  uint64_t client_lim = default_max;
-
-  // Background Recovery
-  uint64_t rec_res = static_cast<uint64_t>(
-    std::round(0.25 * max_osd_capacity));
-  uint64_t rec_lim = static_cast<uint64_t>(
-    std::round(max_osd_capacity));
-  uint64_t rec_wgt = default_min;
-
-  // Background Best Effort
-  uint64_t best_effort_res = static_cast<uint64_t>(
-    std::round(0.25 * max_osd_capacity));
-  uint64_t best_effort_lim = default_max;
-  uint64_t best_effort_wgt = 2;
-
-  // Set the allocations for the mclock clients
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::client)].update(
-      client_res,
-      client_wgt,
-      client_lim);
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_recovery)].update(
-      rec_res,
-      rec_wgt,
-      rec_lim);
-  client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
-      best_effort_res,
-      best_effort_wgt,
-      best_effort_lim);
+  return lhs << "[client: " << rhs.client
+             << ", background_recovery: " << rhs.background_recovery
+             << ", background_best_effort: " << rhs.background_best_effort
+             << "]";
 }
 
-void mClockScheduler::enable_mclock_profile_settings()
+void mClockScheduler::set_config_defaults_from_profile()
 {
-  // Nothing to do for "custom" profile
-  if (mclock_profile == "custom") {
+  // Let only a single osd shard (id:0) set the profile configs
+  if (shard_id > 0) {
     return;
   }
 
-  // Set mclock and ceph config options for the chosen profile
-  if (mclock_profile == "balanced") {
-    set_balanced_profile_allocations();
+  /**
+   * high_client_ops
+   *
+   * Client Allocation:
+   *   reservation: 60% | weight: 2 | limit: 0 (max) |
+   * Background Recovery Allocation:
+   *   reservation: 40% | weight: 1 | limit: 0 (max) |
+   * Background Best Effort Allocation:
+   *   reservation: 0 (min) | weight: 1 | limit: 70% |
+   */
+  static constexpr profile_t high_client_ops_profile{
+    { .6, 2,  0 },
+    { .4, 1,  0 },
+    {  0, 1, .7 }
+  };
+
+  /**
+   * high_recovery_ops
+   *
+   * Client Allocation:
+   *   reservation: 30% | weight: 1 | limit: 0 (max) |
+   * Background Recovery Allocation:
+   *   reservation: 70% | weight: 2 | limit: 0 (max) |
+   * Background Best Effort Allocation:
+   *   reservation: 0 (min) | weight: 1 | limit: 0 (max) |
+   */
+  static constexpr profile_t high_recovery_ops_profile{
+    { .3, 1, 0 },
+    { .7, 2, 0 },
+    {  0, 1, 0 }
+  };
+
+  /**
+   * balanced
+   *
+   * Client Allocation:
+   *   reservation: 50% | weight: 1 | limit: 0 (max) |
+   * Background Recovery Allocation:
+   *   reservation: 50% | weight: 1 | limit: 0 (max) |
+   * Background Best Effort Allocation:
+   *   reservation: 0 (min) | weight: 1 | limit: 90% |
+   */
+  static constexpr profile_t balanced_profile{
+    { .5, 1, 0 },
+    { .5, 1, 0 },
+    {  0, 1, .9 }
+  };
+
+  const profile_t *profile = nullptr;
+  auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
+  if (mclock_profile == "high_client_ops") {
+    profile = &high_client_ops_profile;
+    dout(10) << "Setting high_client_ops profile " << *profile << dendl;
   } else if (mclock_profile == "high_recovery_ops") {
-    set_high_recovery_ops_profile_allocations();
-  } else if (mclock_profile == "high_client_ops") {
-    set_high_client_ops_profile_allocations();
+    profile = &high_recovery_ops_profile;
+    dout(10) << "Setting high_recovery_ops profile " << *profile << dendl;
+  } else if (mclock_profile == "balanced") {
+    profile = &balanced_profile;
+    dout(10) << "Setting balanced profile " << *profile << dendl;
+  } else if (mclock_profile == "custom") {
+    dout(10) << "Profile set to custom, not setting defaults" << dendl;
+    return;
   } else {
+    derr << "Invalid mclock profile: " << mclock_profile << dendl;
     ceph_assert("Invalid choice of mclock profile" == 0);
     return;
   }
+  ceph_assert(nullptr != profile);
 
-  // Set the mclock config parameters
-  set_profile_config();
-}
+  auto set_config = [&conf = cct->_conf](const char *key, auto val) {
+    conf.set_val_default(key, std::to_string(val));
+  };
 
-void mClockScheduler::set_profile_config()
-{
-  // Let only a single osd shard (id:0) set the profile configs
-  if (shard_id > 0) {
-    return;
-  }
+  set_config("osd_mclock_scheduler_client_res", profile->client.reservation);
+  set_config("osd_mclock_scheduler_client_wgt", profile->client.weight);
+  set_config("osd_mclock_scheduler_client_lim", profile->client.limit);
+
+  set_config(
+    "osd_mclock_scheduler_background_recovery_res",
+    profile->background_recovery.reservation);
+  set_config(
+    "osd_mclock_scheduler_background_recovery_wgt",
+    profile->background_recovery.weight);
+  set_config(
+    "osd_mclock_scheduler_background_recovery_lim",
+    profile->background_recovery.limit);
 
-  ClientAllocs client = client_allocs[
-    static_cast<size_t>(op_scheduler_class::client)];
-  ClientAllocs rec = client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_recovery)];
-  ClientAllocs best_effort = client_allocs[
-    static_cast<size_t>(op_scheduler_class::background_best_effort)];
-
-  // Set external client params
-  cct->_conf.set_val_default("osd_mclock_scheduler_client_res",
-    std::to_string(client.res));
-  cct->_conf.set_val_default("osd_mclock_scheduler_client_wgt",
-    std::to_string(client.wgt));
-  cct->_conf.set_val_default("osd_mclock_scheduler_client_lim",
-    std::to_string(client.lim));
-  dout(10) << __func__ << " client QoS params: " << "["
-           << client.res << "," << client.wgt << "," << client.lim
-           << "]" << dendl;
-
-  // Set background recovery client params
-  cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_res",
-    std::to_string(rec.res));
-  cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_wgt",
-    std::to_string(rec.wgt));
-  cct->_conf.set_val_default("osd_mclock_scheduler_background_recovery_lim",
-    std::to_string(rec.lim));
-  dout(10) << __func__ << " Recovery QoS params: " << "["
-           << rec.res << "," << rec.wgt << "," << rec.lim
-           << "]" << dendl;
-
-  // Set background best effort client params
-  cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_res",
-    std::to_string(best_effort.res));
-  cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_wgt",
-    std::to_string(best_effort.wgt));
-  cct->_conf.set_val_default("osd_mclock_scheduler_background_best_effort_lim",
-    std::to_string(best_effort.lim));
-  dout(10) << __func__ << " Best effort QoS params: " << "["
-    << best_effort.res << "," << best_effort.wgt << "," << best_effort.lim
-    << "]" << dendl;
-
-  // Apply the configuration changes
-  update_configuration();
+  set_config(
+    "osd_mclock_scheduler_background_best_effort_res",
+    profile->background_best_effort.reservation);
+  set_config(
+    "osd_mclock_scheduler_background_best_effort_wgt",
+    profile->background_best_effort.weight);
+  set_config(
+    "osd_mclock_scheduler_background_best_effort_lim",
+    profile->background_best_effort.limit);
+
+  cct->_conf.apply_changes(nullptr);
 }
 
-int mClockScheduler::calc_scaled_cost(int item_cost)
+uint32_t mClockScheduler::calc_scaled_cost(int item_cost)
 {
-  // Calculate total scaled cost in secs
-  int scaled_cost =
-    std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
-  return std::max(scaled_cost, 1);
+  auto cost = static_cast<uint32_t>(
+    std::max<int>(
+      1, // ensure cost is non-zero and positive
+      item_cost));
+  auto cost_per_io = static_cast<uint32_t>(osd_bandwidth_cost_per_io);
+
+  // Calculate total scaled cost in bytes
+  return cost_per_io + cost;
 }
 
 void mClockScheduler::update_configuration()
@@ -414,7 +356,7 @@ void mClockScheduler::dump(ceph::Formatter &f) const
 {
   // Display queue sizes
   f.open_object_section("queue_sizes");
-  f.dump_int("immediate", immediate.size());
+  f.dump_int("high_priority_queue", high_priority.size());
   f.dump_int("scheduler", scheduler.request_count());
   f.close_section();
 
@@ -430,17 +372,28 @@ void mClockScheduler::dump(ceph::Formatter &f) const
   f.open_object_section("mClockQueues");
   f.dump_string("queues", display_queues());
   f.close_section();
+
+  f.open_object_section("HighPriorityQueue");
+  for (auto it = high_priority.begin();
+       it != high_priority.end(); it++) {
+    f.dump_int("priority", it->first);
+    f.dump_int("queue_size", it->second.size());
+  }
+  f.close_section();
 }
 
 void mClockScheduler::enqueue(OpSchedulerItem&& item)
 {
   auto id = get_scheduler_id(item);
-
+  unsigned priority = item.get_priority();
+  
   // TODO: move this check into OpSchedulerItem, handle backwards compat
   if (op_scheduler_class::immediate == id.class_id) {
-    immediate.push_front(std::move(item));
+    enqueue_high(immediate_class_priority, std::move(item));
+  } else if (priority >= cutoff_priority) {
+    enqueue_high(priority, std::move(item));
   } else {
-    int cost = calc_scaled_cost(item.get_cost());
+    auto cost = calc_scaled_cost(item.get_cost());
     item.set_qos_cost(cost);
     dout(20) << __func__ << " " << id
              << " item_cost: " << item.get_cost()
@@ -455,7 +408,8 @@ void mClockScheduler::enqueue(OpSchedulerItem&& item)
   }
 
  dout(20) << __func__ << " client_count: " << scheduler.client_count()
-          << " queue_sizes: [ imm: " << immediate.size()
+          << " queue_sizes: [ "
+         << " high_priority_queue: " << high_priority.size()
           << " sched: " << scheduler.request_count() << " ]"
           << dendl;
  dout(30) << __func__ << " mClockClients: "
@@ -468,17 +422,45 @@ void mClockScheduler::enqueue(OpSchedulerItem&& item)
 
 void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
 {
-  immediate.push_back(std::move(item));
-  // TODO: item may not be immediate, update mclock machinery to permit
-  // putting the item back in the queue
+  unsigned priority = item.get_priority();
+  auto id = get_scheduler_id(item);
+
+  if (op_scheduler_class::immediate == id.class_id) {
+    enqueue_high(immediate_class_priority, std::move(item), true);
+  } else if (priority >= cutoff_priority) {
+    enqueue_high(priority, std::move(item), true);
+  } else {
+    // mClock does not support enqueue at front, so we use
+    // the high queue with priority 0
+    enqueue_high(0, std::move(item), true);
+  }
+}
+
+void mClockScheduler::enqueue_high(unsigned priority,
+                                   OpSchedulerItem&& item,
+                                  bool front)
+{
+  if (front) {
+    high_priority[priority].push_back(std::move(item));
+  } else {
+    high_priority[priority].push_front(std::move(item));
+  }
 }
 
 WorkItem mClockScheduler::dequeue()
 {
-  if (!immediate.empty()) {
-    WorkItem work_item{std::move(immediate.back())};
-    immediate.pop_back();
-    return work_item;
+  if (!high_priority.empty()) {
+    auto iter = high_priority.begin();
+    // invariant: high_priority entries are never empty
+    assert(!iter->second.empty());
+    WorkItem ret{std::move(iter->second.back())};
+    iter->second.pop_back();
+    if (iter->second.empty()) {
+      // maintain invariant, high priority entries are never empty
+      high_priority.erase(iter);
+    }
+    ceph_assert(std::get_if<OpSchedulerItem>(&ret));
+    return ret;
   } else {
     mclock_queue_t::PullReq result = scheduler.pull_request();
     if (result.is_future()) {
@@ -515,14 +497,10 @@ const char** mClockScheduler::get_tracked_conf_keys() const
     "osd_mclock_scheduler_background_best_effort_res",
     "osd_mclock_scheduler_background_best_effort_wgt",
     "osd_mclock_scheduler_background_best_effort_lim",
-    "osd_mclock_cost_per_io_usec",
-    "osd_mclock_cost_per_io_usec_hdd",
-    "osd_mclock_cost_per_io_usec_ssd",
-    "osd_mclock_cost_per_byte_usec",
-    "osd_mclock_cost_per_byte_usec_hdd",
-    "osd_mclock_cost_per_byte_usec_ssd",
     "osd_mclock_max_capacity_iops_hdd",
     "osd_mclock_max_capacity_iops_ssd",
+    "osd_mclock_max_sequential_bandwidth_hdd",
+    "osd_mclock_max_sequential_bandwidth_ssd",
     "osd_mclock_profile",
     NULL
   };
@@ -533,30 +511,22 @@ void mClockScheduler::handle_conf_change(
   const ConfigProxy& conf,
   const std::set<std::string> &changed)
 {
-  if (changed.count("osd_mclock_cost_per_io_usec") ||
-      changed.count("osd_mclock_cost_per_io_usec_hdd") ||
-      changed.count("osd_mclock_cost_per_io_usec_ssd")) {
-    set_osd_mclock_cost_per_io();
-  }
-  if (changed.count("osd_mclock_cost_per_byte_usec") ||
-      changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
-      changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
-    set_osd_mclock_cost_per_byte();
-  }
   if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
       changed.count("osd_mclock_max_capacity_iops_ssd")) {
-    set_max_osd_capacity();
-    if (mclock_profile != "custom") {
-      enable_mclock_profile_settings();
-      client_registry.update_from_config(conf);
-    }
+    set_osd_capacity_params_from_config();
+    client_registry.update_from_config(
+      conf, osd_bandwidth_capacity_per_shard);
+  }
+  if (changed.count("osd_mclock_max_sequential_bandwidth_hdd") ||
+      changed.count("osd_mclock_max_sequential_bandwidth_ssd")) {
+    set_osd_capacity_params_from_config();
+    client_registry.update_from_config(
+      conf, osd_bandwidth_capacity_per_shard);
   }
   if (changed.count("osd_mclock_profile")) {
-    set_mclock_profile();
-    if (mclock_profile != "custom") {
-      enable_mclock_profile_settings();
-      client_registry.update_from_config(conf);
-    }
+    set_config_defaults_from_profile();
+    client_registry.update_from_config(
+      conf, osd_bandwidth_capacity_per_shard);
   }
 
   auto get_changed_key = [&changed]() -> std::optional<std::string> {
@@ -581,8 +551,10 @@ void mClockScheduler::handle_conf_change(
   };
 
   if (auto key = get_changed_key(); key.has_value()) {
+    auto mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
     if (mclock_profile == "custom") {
-      client_registry.update_from_config(conf);
+      client_registry.update_from_config(
+        conf, osd_bandwidth_capacity_per_shard);
     } else {
       // Attempt to change QoS parameter for a built-in profile. Restore the
       // profile defaults by making one of the OSD shards remove the key from
@@ -609,6 +581,12 @@ void mClockScheduler::handle_conf_change(
         }
       }
     }
+    // Alternatively, the QoS parameter, if set ephemerally for this OSD via
+    // the 'daemon' or 'tell' interfaces must be removed.
+    if (!cct->_conf.rm_val(*key)) {
+      dout(10) << __func__ << " Restored " << *key << " to default" << dendl;
+      cct->_conf.apply_changes(nullptr);
+    }
   }
 }