// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#pragma once

#include <array>
#include <cstdint>
#include <functional>
#include <limits>
#include <list>
#include <map>
#include <ostream>
#include <random>
#include <set>
#include <string>

#include "boost/variant.hpp"

#include "dmclock/src/dmclock_server.h"

#include "osd/scheduler/OpScheduler.h"
#include "common/config.h"
#include "common/ceph_context.h"
#include "common/mClockPriorityQueue.h"
#include "osd/scheduler/OpSchedulerItem.h"
namespace ceph::osd::scheduler {

constexpr double default_min = 0.0;
constexpr double default_max = std::numeric_limits<double>::is_iec559 ?
  std::numeric_limits<double>::infinity() :
  std::numeric_limits<double>::max();
/**
 * client_profile_id_t
 *
 * client_id - global id (client.####) for client QoS
 * profile_id - id generated by the client's QoS profile
 *
 * Currently (Reef and below), both members are set to 0, which ensures
 * that all external clients share the mClock profile's allocated
 * reservation and limit bandwidth.
 *
 * Note: Post Reef, both members will be set to non-zero values when the
 * distributed feature of the mClock algorithm is utilized.
 */
struct client_profile_id_t {
  uint64_t client_id = 0;
  uint64_t profile_id = 0;

  client_profile_id_t(uint64_t _client_id, uint64_t _profile_id) :
    client_id(_client_id),
    profile_id(_profile_id) {}

  client_profile_id_t() = default;

  auto operator<=>(const client_profile_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const client_profile_id_t& client_profile) {
    out << " client_id: " << client_profile.client_id
        << " profile_id: " << client_profile.profile_id;
    return out;
  }
};
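
// Illustrative note (an assumption for clarity, not part of the original
// header): with the Reef-and-earlier defaults above, every external client
// produces the same id, so all external ops share one dmclock tag stream:
//
//   client_profile_id_t a, b;  // both default to {0, 0}
//   assert(a == b);            // equality is defaulted via operator<=>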
struct scheduler_id_t {
  op_scheduler_class class_id;
  client_profile_id_t client_profile_id;

  auto operator<=>(const scheduler_id_t&) const = default;
  friend std::ostream& operator<<(std::ostream& out,
                                  const scheduler_id_t& sched_id) {
    out << "{ class_id: " << sched_id.class_id
        << sched_id.client_profile_id;
    return out << " }";
  }
};
/**
 * Scheduler implementation based on mclock.
 *
 * TODO: explain configs
 */
class mClockScheduler : public OpScheduler, md_config_obs_t {
  CephContext *cct;
  const int whoami;
  const uint32_t num_shards;
  const int shard_id;
  const bool is_rotational;
  MonClient *monc;
  /**
   * osd_bandwidth_cost_per_io
   *
   * mClock expects all queued items to have a uniform expression of
   * "cost". However, IO devices generally have quite different capacity
   * for sequential IO vs small random IO. This implementation handles this
   * by expressing all costs as a number of sequential bytes written, adding
   * additional cost for each random IO equal to osd_bandwidth_cost_per_io.
   *
   * Thus, an IO operation requiring a total of <size> bytes to be written
   * across <iops> different locations will have a cost of
   * <size> + (osd_bandwidth_cost_per_io * <iops>) bytes.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * Has units bytes/io.
   */
  double osd_bandwidth_cost_per_io;
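
  // Worked example with illustrative (non-default) numbers: a device rated
  // at 500 MB/s sequential bandwidth and 12500 random IOPS charges
  // 500000000 / 12500 = 40000 bytes per random IO, so a 4096-byte write to
  // a single location costs 4096 + (40000 * 1) = 44096 bytes.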
  /**
   * osd_bandwidth_capacity_per_shard
   *
   * mClock expects reservation and limit parameters to be expressed in units
   * of cost/second -- which means bytes/second for this implementation.
   *
   * Rather than expecting users to compute appropriate limit and reservation
   * values for each class of OSDs in their cluster, we instead express
   * reservation and limit parameters as ratios of the OSD's maximum capacity.
   * osd_bandwidth_capacity_per_shard is that capacity divided by the number
   * of op shards.
   *
   * Set in set_osd_capacity_params_from_config in the constructor and upon
   * config change.
   *
   * This value gets passed to ClientRegistry::update_from_config in order
   * to resolve the full reservation and limit parameters for mclock from
   * the configured ratios.
   *
   * Has units bytes/second.
   */
  double osd_bandwidth_capacity_per_shard;
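
  // Worked example with illustrative (non-default) numbers: an OSD rated at
  // 500 MB/s sequential bandwidth split across 5 op shards has a per-shard
  // capacity of 100000000 bytes/second; a configured reservation ratio of
  // 0.2 then resolves to an absolute per-shard reservation of 20 MB/s.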
  class ClientRegistry {
    std::array<
      crimson::dmclock::ClientInfo,
      static_cast<size_t>(op_scheduler_class::immediate)
    > internal_client_infos = {
      // Placeholder, gets replaced with configured values
      crimson::dmclock::ClientInfo(1, 1, 1),
      crimson::dmclock::ClientInfo(1, 1, 1)
    };

    crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
    std::map<client_profile_id_t,
             crimson::dmclock::ClientInfo> external_client_infos;
    const crimson::dmclock::ClientInfo *get_external_client(
      const client_profile_id_t &client) const;
  public:
    /**
     * update_from_config
     *
     * Sets the mclock parameters (reservation, weight, and limit)
     * for each class of IO (background_recovery, background_best_effort,
     * and client).
     */
    void update_from_config(
      const ConfigProxy &conf,
      double capacity_per_shard);
    const crimson::dmclock::ClientInfo *get_info(
      const scheduler_id_t &id) const;
  } client_registry;
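
  // Expected lookup behavior (a sketch inferred from the surrounding
  // declarations, not stated in this header): get_info() presumably serves
  // internal classes (e.g. background_recovery) from internal_client_infos
  // and client ops from external_client_infos via get_external_client(),
  // which would fall back to default_external_client_info for profiles it
  // has not seen.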
  using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
    scheduler_id_t,
    OpSchedulerItem,
    true,
    true,
    2>;
  using priority_t = unsigned;
  using SubQueue = std::map<priority_t,
    std::list<OpSchedulerItem>,
    std::greater<priority_t>>;

  mclock_queue_t scheduler;
  /**
   * high_priority
   *
   * Holds entries to be dequeued in strict order ahead of mClock.
   * Invariant: the per-priority sub-lists are never empty.
   */
  SubQueue high_priority;
  priority_t immediate_class_priority = std::numeric_limits<priority_t>::max();
  static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
    return scheduler_id_t{
      item.get_scheduler_class(),
      client_profile_id_t()
    };
  }
  static unsigned int get_io_prio_cut(CephContext *cct) {
    if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
      std::random_device rd;
      std::mt19937 random_gen(rd());
      return (random_gen() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
    } else if (cct->_conf->osd_op_queue_cut_off == "high") {
      return CEPH_MSG_PRIO_HIGH;
    } else {
      // default / catch-all is 'low'
      return CEPH_MSG_PRIO_LOW;
    }
  }
  unsigned cutoff_priority = get_io_prio_cut(cct);
  /**
   * set_osd_capacity_params_from_config
   *
   * mClockScheduler uses two parameters, osd_bandwidth_cost_per_io
   * and osd_bandwidth_capacity_per_shard, internally. These two
   * parameters are derived from config parameters
   * osd_mclock_max_capacity_iops_(hdd|ssd) and
   * osd_mclock_max_sequential_bandwidth_(hdd|ssd) as well as num_shards.
   * Invoking set_osd_capacity_params_from_config() resets those derived
   * params based on the current config and should be invoked any time they
   * are modified as well as in the constructor. See handle_conf_change().
   */
  void set_osd_capacity_params_from_config();
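
  // Sketch of the expected derivation (an assumption for illustration; the
  // authoritative logic lives in the implementation file):
  //   osd_bandwidth_capacity_per_shard =
  //     osd_mclock_max_sequential_bandwidth_(hdd|ssd) / num_shards
  //   osd_bandwidth_cost_per_io =
  //     osd_mclock_max_sequential_bandwidth_(hdd|ssd) /
  //     osd_mclock_max_capacity_iops_(hdd|ssd)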
  // Set the mclock related config params based on the profile
  void set_config_defaults_from_profile();
public:
  mClockScheduler(CephContext *cct, int whoami, uint32_t num_shards,
    int shard_id, bool is_rotational, MonClient *monc);
  ~mClockScheduler() override;
  /// Calculate scaled cost per item
  uint32_t calc_scaled_cost(int cost);

  // Helper method to display mclock queues
  std::string display_queues() const;
  // Enqueue op at the back of the regular queue
  void enqueue(OpSchedulerItem &&item) final;

  // Enqueue the op at the front of the high priority queue
  void enqueue_front(OpSchedulerItem &&item) final;
  // Return an op to be dispatched
  WorkItem dequeue() final;
  // Returns true if both queues are empty
  bool empty() const final {
    return scheduler.empty() && high_priority.empty();
  }
  // Formatted output of the queue
  void dump(ceph::Formatter &f) const final;
  void print(std::ostream &ostream) const final {
    ostream << "mClockScheduler";
  }
  // Update data associated with the modified mclock config key(s)
  void update_configuration() final;

  const char** get_tracked_conf_keys() const final;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set<std::string> &changed) final;
private:
  // Enqueue the op to the high priority queue
  void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);
};

}