]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/scheduler/mClockScheduler.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / osd / scheduler / mClockScheduler.cc
CommitLineData
9f95a23c
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 Red Hat Inc.
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#include <memory>
17#include <functional>
18
19#include "osd/scheduler/mClockScheduler.h"
20#include "common/dout.h"
21
22namespace dmc = crimson::dmclock;
23using namespace std::placeholders;
24
25#define dout_context cct
26#define dout_subsys ceph_subsys_osd
27#undef dout_prefix
f67539c2 28#define dout_prefix *_dout << "mClockScheduler: "
9f95a23c
TL
29
30
31namespace ceph::osd::scheduler {
32
f67539c2
TL
// Construct the mClock-based op scheduler for one OSD.
//
// @param cct            Ceph context (config access, logging)
// @param num_shards     number of op shards on this OSD; must be > 0 since
//                       the per-OSD IOPS capacity is divided per shard
// @param is_rotational  device type; selects HDD vs SSD config defaults
mClockScheduler::mClockScheduler(CephContext *cct,
  uint32_t num_shards,
  bool is_rotational)
  : cct(cct),
    num_shards(num_shards),
    is_rotational(is_rotational),
    scheduler(
      // dmclock calls back into our registry to resolve per-client QoS info
      std::bind(&mClockScheduler::ClientRegistry::get_info,
        &client_registry,
        _1),
      dmc::AtLimit::Wait,
      cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
{
  // Register for runtime config updates (see handle_conf_change).
  cct->_conf.add_observer(this);
  ceph_assert(num_shards > 0);
  // Derive scheduler parameters from config and device type; the profile
  // must be set before its settings can be enabled.
  set_max_osd_capacity();
  set_osd_mclock_cost_per_io();
  set_osd_mclock_cost_per_byte();
  set_mclock_profile();
  enable_mclock_profile_settings();
  // Seed per-class QoS info from the (possibly profile-updated) config.
  client_registry.update_from_config(cct->_conf);
}
55
56void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
57{
58 default_external_client_info.update(
59 conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
60 conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
61 conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
62
63 internal_client_infos[
64 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
65 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
66 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
67 conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
68
69 internal_client_infos[
70 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
71 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
72 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
73 conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
74}
75
76const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
77 const client_profile_id_t &client) const
78{
79 auto ret = external_client_infos.find(client);
80 if (ret == external_client_infos.end())
81 return &default_external_client_info;
82 else
83 return &(ret->second);
84}
85
86const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
87 const scheduler_id_t &id) const {
88 switch (id.class_id) {
89 case op_scheduler_class::immediate:
90 ceph_assert(0 == "Cannot schedule immediate");
91 return (dmc::ClientInfo*)nullptr;
92 case op_scheduler_class::client:
93 return get_external_client(id.client_profile_id);
94 default:
95 ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
96 return &internal_client_infos[static_cast<size_t>(id.class_id)];
97 }
98}
99
f67539c2
TL
100void mClockScheduler::set_max_osd_capacity()
101{
102 if (cct->_conf.get_val<double>("osd_mclock_max_capacity_iops")) {
103 max_osd_capacity =
104 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops");
105 } else {
106 if (is_rotational) {
107 max_osd_capacity =
108 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
109 } else {
110 max_osd_capacity =
111 cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
112 }
113 }
114 // Set per op-shard iops limit
115 max_osd_capacity /= num_shards;
116 dout(1) << __func__ << " #op shards: " << num_shards
117 << " max osd capacity(iops) per shard: " << max_osd_capacity << dendl;
118}
119
120void mClockScheduler::set_osd_mclock_cost_per_io()
121{
122 std::chrono::seconds sec(1);
123 if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
124 osd_mclock_cost_per_io =
125 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
126 } else {
127 if (is_rotational) {
128 osd_mclock_cost_per_io =
129 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
130 // For HDDs, convert value to seconds
131 osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
132 } else {
133 // For SSDs, convert value to milliseconds
134 osd_mclock_cost_per_io =
135 cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
136 osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
137 }
138 }
139 dout(1) << __func__ << " osd_mclock_cost_per_io: "
140 << std::fixed << osd_mclock_cost_per_io << dendl;
141}
142
143void mClockScheduler::set_osd_mclock_cost_per_byte()
144{
145 std::chrono::seconds sec(1);
146 if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
147 osd_mclock_cost_per_byte =
148 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
149 } else {
150 if (is_rotational) {
151 osd_mclock_cost_per_byte =
152 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
153 // For HDDs, convert value to seconds
154 osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
155 } else {
156 osd_mclock_cost_per_byte =
157 cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
158 // For SSDs, convert value to milliseconds
159 osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
160 }
161 }
162 dout(1) << __func__ << " osd_mclock_cost_per_byte: "
163 << std::fixed << osd_mclock_cost_per_byte << dendl;
164}
165
166void mClockScheduler::set_mclock_profile()
167{
168 mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
169 dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl;
170}
171
172std::string mClockScheduler::get_mclock_profile()
173{
174 return mclock_profile;
175}
176
177void mClockScheduler::set_balanced_profile_allocations()
178{
179 // Client Allocation:
180 // reservation: 40% | weight: 1 | limit: 100% |
181 // Background Recovery Allocation:
182 // reservation: 40% | weight: 1 | limit: 150% |
183 // Background Best Effort Allocation:
184 // reservation: 20% | weight: 2 | limit: max |
185
186 // Client
187 uint64_t client_res = static_cast<uint64_t>(
188 std::round(0.40 * max_osd_capacity));
189 uint64_t client_lim = static_cast<uint64_t>(
190 std::round(max_osd_capacity));
191 uint64_t client_wgt = default_min;
192
193 // Background Recovery
194 uint64_t rec_res = static_cast<uint64_t>(
195 std::round(0.40 * max_osd_capacity));
196 uint64_t rec_lim = static_cast<uint64_t>(
197 std::round(1.5 * max_osd_capacity));
198 uint64_t rec_wgt = default_min;
199
200 // Background Best Effort
201 uint64_t best_effort_res = static_cast<uint64_t>(
202 std::round(0.20 * max_osd_capacity));
203 uint64_t best_effort_lim = default_max;
204 uint64_t best_effort_wgt = 2;
205
206 // Set the allocations for the mclock clients
207 client_allocs[
208 static_cast<size_t>(op_scheduler_class::client)].update(
209 client_res,
210 client_wgt,
211 client_lim);
212 client_allocs[
213 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
214 rec_res,
215 rec_wgt,
216 rec_lim);
217 client_allocs[
218 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
219 best_effort_res,
220 best_effort_wgt,
221 best_effort_lim);
222}
223
224void mClockScheduler::set_high_recovery_ops_profile_allocations()
225{
226 // Client Allocation:
227 // reservation: 30% | weight: 1 | limit: 80% |
228 // Background Recovery Allocation:
229 // reservation: 60% | weight: 2 | limit: 200% |
230 // Background Best Effort Allocation:
231 // reservation: 1 | weight: 2 | limit: max |
232
233 // Client
234 uint64_t client_res = static_cast<uint64_t>(
235 std::round(0.30 * max_osd_capacity));
236 uint64_t client_lim = static_cast<uint64_t>(
237 std::round(0.80 * max_osd_capacity));
238 uint64_t client_wgt = default_min;
239
240 // Background Recovery
241 uint64_t rec_res = static_cast<uint64_t>(
242 std::round(0.60 * max_osd_capacity));
243 uint64_t rec_lim = static_cast<uint64_t>(
244 std::round(2.0 * max_osd_capacity));
245 uint64_t rec_wgt = 2;
246
247 // Background Best Effort
248 uint64_t best_effort_res = default_min;
249 uint64_t best_effort_lim = default_max;
250 uint64_t best_effort_wgt = 2;
251
252 // Set the allocations for the mclock clients
253 client_allocs[
254 static_cast<size_t>(op_scheduler_class::client)].update(
255 client_res,
256 client_wgt,
257 client_lim);
258 client_allocs[
259 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
260 rec_res,
261 rec_wgt,
262 rec_lim);
263 client_allocs[
264 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
265 best_effort_res,
266 best_effort_wgt,
267 best_effort_lim);
268}
269
270void mClockScheduler::set_high_client_ops_profile_allocations()
271{
272 // Client Allocation:
273 // reservation: 50% | weight: 2 | limit: max |
274 // Background Recovery Allocation:
275 // reservation: 25% | weight: 1 | limit: 100% |
276 // Background Best Effort Allocation:
277 // reservation: 25% | weight: 2 | limit: max |
278
279 // Client
280 uint64_t client_res = static_cast<uint64_t>(
281 std::round(0.50 * max_osd_capacity));
282 uint64_t client_wgt = 2;
283 uint64_t client_lim = default_max;
284
285 // Background Recovery
286 uint64_t rec_res = static_cast<uint64_t>(
287 std::round(0.25 * max_osd_capacity));
288 uint64_t rec_lim = static_cast<uint64_t>(
289 std::round(max_osd_capacity));
290 uint64_t rec_wgt = default_min;
291
292 // Background Best Effort
293 uint64_t best_effort_res = static_cast<uint64_t>(
294 std::round(0.25 * max_osd_capacity));
295 uint64_t best_effort_lim = default_max;
296 uint64_t best_effort_wgt = 2;
297
298 // Set the allocations for the mclock clients
299 client_allocs[
300 static_cast<size_t>(op_scheduler_class::client)].update(
301 client_res,
302 client_wgt,
303 client_lim);
304 client_allocs[
305 static_cast<size_t>(op_scheduler_class::background_recovery)].update(
306 rec_res,
307 rec_wgt,
308 rec_lim);
309 client_allocs[
310 static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
311 best_effort_res,
312 best_effort_wgt,
313 best_effort_lim);
314}
315
316void mClockScheduler::enable_mclock_profile_settings()
317{
318 // Nothing to do for "custom" profile
319 if (mclock_profile == "custom") {
320 return;
321 }
322
323 // Set mclock and ceph config options for the chosen profile
324 if (mclock_profile == "balanced") {
325 set_balanced_profile_allocations();
326 } else if (mclock_profile == "high_recovery_ops") {
327 set_high_recovery_ops_profile_allocations();
328 } else if (mclock_profile == "high_client_ops") {
329 set_high_client_ops_profile_allocations();
330 } else {
331 ceph_assert("Invalid choice of mclock profile" == 0);
332 return;
333 }
334
335 // Set the mclock config parameters
336 set_profile_config();
337 // Set recovery specific Ceph options
338 set_global_recovery_options();
339}
340
341void mClockScheduler::set_profile_config()
342{
343 ClientAllocs client = client_allocs[
344 static_cast<size_t>(op_scheduler_class::client)];
345 ClientAllocs rec = client_allocs[
346 static_cast<size_t>(op_scheduler_class::background_recovery)];
347 ClientAllocs best_effort = client_allocs[
348 static_cast<size_t>(op_scheduler_class::background_best_effort)];
349
350 // Set external client params
351 cct->_conf.set_val("osd_mclock_scheduler_client_res",
352 std::to_string(client.res));
353 cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
354 std::to_string(client.wgt));
355 cct->_conf.set_val("osd_mclock_scheduler_client_lim",
356 std::to_string(client.lim));
357
358 // Set background recovery client params
359 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
360 std::to_string(rec.res));
361 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
362 std::to_string(rec.wgt));
363 cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
364 std::to_string(rec.lim));
365
366 // Set background best effort client params
367 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
368 std::to_string(best_effort.res));
369 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
370 std::to_string(best_effort.wgt));
371 cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
372 std::to_string(best_effort.lim));
373}
374
375void mClockScheduler::set_global_recovery_options()
376{
377 // Set high value for recovery max active and max backfill
378 int rec_max_active = 1000;
379 int max_backfills = 1000;
380 cct->_conf.set_val("osd_recovery_max_active", std::to_string(rec_max_active));
381 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
382
383 // Disable recovery sleep
384 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
385 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
386 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
387 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
388
389 // Disable delete sleep
390 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
391 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
392 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
393 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
394
395 // Disable snap trim sleep
396 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
397 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
398 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
399 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
400
401 // Disable scrub sleep
402 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
403
404 // Apply the changes
405 cct->_conf.apply_changes(nullptr);
406}
407
408int mClockScheduler::calc_scaled_cost(int item_cost)
409{
410 // Calculate total scaled cost in secs
411 int scaled_cost =
412 std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
413 return std::max(scaled_cost, 1);
414}
415
9f95a23c
TL
// Dump scheduler state into the formatter. Currently a no-op: no mclock
// queue state is exposed yet.
void mClockScheduler::dump(ceph::Formatter &f) const
{
}
419
420void mClockScheduler::enqueue(OpSchedulerItem&& item)
421{
422 auto id = get_scheduler_id(item);
9f95a23c
TL
423
424 // TODO: move this check into OpSchedulerItem, handle backwards compat
f67539c2 425 if (op_scheduler_class::immediate == id.class_id) {
9f95a23c
TL
426 immediate.push_front(std::move(item));
427 } else {
f67539c2
TL
428 int cost = calc_scaled_cost(item.get_cost());
429 // Add item to scheduler queue
9f95a23c
TL
430 scheduler.add_request(
431 std::move(item),
432 id,
433 cost);
434 }
435}
436
437void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
438{
439 immediate.push_back(std::move(item));
440 // TODO: item may not be immediate, update mclock machinery to permit
441 // putting the item back in the queue
442}
443
f67539c2 444WorkItem mClockScheduler::dequeue()
9f95a23c
TL
445{
446 if (!immediate.empty()) {
f67539c2 447 WorkItem work_item{std::move(immediate.back())};
9f95a23c 448 immediate.pop_back();
f67539c2 449 return work_item;
9f95a23c
TL
450 } else {
451 mclock_queue_t::PullReq result = scheduler.pull_request();
452 if (result.is_future()) {
f67539c2 453 return result.getTime();
9f95a23c
TL
454 } else if (result.is_none()) {
455 ceph_assert(
456 0 == "Impossible, must have checked empty() first");
f67539c2 457 return {};
9f95a23c
TL
458 } else {
459 ceph_assert(result.is_retn());
460
461 auto &retn = result.get_retn();
462 return std::move(*retn.request);
463 }
464 }
465}
466
467const char** mClockScheduler::get_tracked_conf_keys() const
468{
469 static const char* KEYS[] = {
470 "osd_mclock_scheduler_client_res",
471 "osd_mclock_scheduler_client_wgt",
472 "osd_mclock_scheduler_client_lim",
473 "osd_mclock_scheduler_background_recovery_res",
474 "osd_mclock_scheduler_background_recovery_wgt",
475 "osd_mclock_scheduler_background_recovery_lim",
476 "osd_mclock_scheduler_background_best_effort_res",
477 "osd_mclock_scheduler_background_best_effort_wgt",
478 "osd_mclock_scheduler_background_best_effort_lim",
f67539c2
TL
479 "osd_mclock_cost_per_io_usec",
480 "osd_mclock_cost_per_io_usec_hdd",
481 "osd_mclock_cost_per_io_usec_ssd",
482 "osd_mclock_cost_per_byte_usec",
483 "osd_mclock_cost_per_byte_usec_hdd",
484 "osd_mclock_cost_per_byte_usec_ssd",
485 "osd_mclock_max_capacity_iops",
486 "osd_mclock_max_capacity_iops_hdd",
487 "osd_mclock_max_capacity_iops_ssd",
488 "osd_mclock_profile",
9f95a23c
TL
489 NULL
490 };
491 return KEYS;
492}
493
494void mClockScheduler::handle_conf_change(
495 const ConfigProxy& conf,
496 const std::set<std::string> &changed)
497{
f67539c2
TL
498 if (changed.count("osd_mclock_cost_per_io_usec") ||
499 changed.count("osd_mclock_cost_per_io_usec_hdd") ||
500 changed.count("osd_mclock_cost_per_io_usec_ssd")) {
501 set_osd_mclock_cost_per_io();
502 }
503 if (changed.count("osd_mclock_cost_per_byte_usec") ||
504 changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
505 changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
506 set_osd_mclock_cost_per_byte();
507 }
508 if (changed.count("osd_mclock_max_capacity_iops") ||
509 changed.count("osd_mclock_max_capacity_iops_hdd") ||
510 changed.count("osd_mclock_max_capacity_iops_ssd")) {
511 set_max_osd_capacity();
512 if (mclock_profile != "custom") {
513 enable_mclock_profile_settings();
514 client_registry.update_from_config(conf);
515 }
516 }
517 if (changed.count("osd_mclock_profile")) {
518 set_mclock_profile();
519 if (mclock_profile != "custom") {
520 enable_mclock_profile_settings();
521 client_registry.update_from_config(conf);
522 }
523 }
524 if (changed.count("osd_mclock_scheduler_client_res") ||
525 changed.count("osd_mclock_scheduler_client_wgt") ||
526 changed.count("osd_mclock_scheduler_client_lim") ||
527 changed.count("osd_mclock_scheduler_background_recovery_res") ||
528 changed.count("osd_mclock_scheduler_background_recovery_wgt") ||
529 changed.count("osd_mclock_scheduler_background_recovery_lim")) {
530 if (mclock_profile == "custom") {
531 client_registry.update_from_config(conf);
532 }
533 }
534}
535
536mClockScheduler::~mClockScheduler()
537{
538 cct->_conf.remove_observer(this);
9f95a23c
TL
539}
540
541}