]>
Commit | Line | Data |
---|---|---|
91327a77 AA |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2018 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "PriorityCache.h" | |
eafe8130 TL |
16 | #include "common/dout.h" |
17 | #include "perfglue/heap_profiler.h" | |
18 | #define dout_context cct | |
19 | #define dout_subsys ceph_subsys_prioritycache | |
20 | #undef dout_prefix | |
21 | #define dout_prefix *_dout << "prioritycache " | |
91327a77 | 22 | |
eafe8130 TL |
23 | namespace PriorityCache |
24 | { | |
25 | int64_t get_chunk(uint64_t usage, uint64_t total_bytes) | |
26 | { | |
11fdf7f2 TL |
27 | uint64_t chunk = total_bytes; |
28 | ||
29 | // Find the nearest power of 2 | |
30 | chunk -= 1; | |
31 | chunk |= chunk >> 1; | |
32 | chunk |= chunk >> 2; | |
33 | chunk |= chunk >> 4; | |
34 | chunk |= chunk >> 8; | |
35 | chunk |= chunk >> 16; | |
36 | chunk |= chunk >> 32; | |
37 | chunk += 1; | |
38 | // shrink it to 1/256 of the rounded up cache size | |
39 | chunk /= 256; | |
40 | ||
f67539c2 | 41 | // bound the chunk size to be between 4MB and 64MB |
11fdf7f2 | 42 | chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024; |
f67539c2 | 43 | chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024; |
11fdf7f2 | 44 | |
f67539c2 | 45 | /* FIXME: Hardcoded to force get_chunk to never drop below 64MB. |
11fdf7f2 TL |
46 | * if RocksDB is used, it's a good idea to have N MB of headroom where |
47 | * N is the target_file_size_base value. RocksDB will read SST files | |
48 | * into the block cache during compaction which potentially can force out | |
49 | * all existing cached data. Once compaction is finished, the SST data is | |
50 | * released leaving an empty cache. Having enough headroom to absorb | |
51 | * compaction reads allows the kv cache grow even during extremely heavy | |
52 | * compaction workloads. | |
53 | */ | |
f67539c2 | 54 | uint64_t val = usage + 64*1024*1024; |
11fdf7f2 | 55 | uint64_t r = (val) % chunk; |
91327a77 | 56 | if (r > 0) |
11fdf7f2 | 57 | val = val + chunk - r; |
91327a77 AA |
58 | return val; |
59 | } | |
60 | ||
eafe8130 TL |
  // Construct a priority cache manager that tunes total cache memory between
  // [min, max] toward a process-wide mapped-memory target, and registers the
  // process-level malloc/cache perf counters.
  // NOTE: the counter name strings below must remain string literals —
  // PerfCounters is handed the raw pointers (see PerfCountersBuilder usage).
  Manager::Manager(CephContext *c,
                   uint64_t min,
                   uint64_t max,
                   uint64_t target,
                   bool reserve_extra,
                   const std::string& name) :
    cct(c),
    caches{},
    min_mem(min),
    max_mem(max),
    target_mem(target),
    tuned_mem(min),  // start at the floor; tune_memory() adjusts from here
    reserve_extra(reserve_extra),
    name(name.empty() ? "prioritycache" : name)  // default perf-counter prefix
  {
    PerfCountersBuilder b(cct, this->name, MallocStats::M_FIRST, MallocStats::M_LAST);

    // What the tuner is aiming for (target_mem).
    b.add_u64(MallocStats::M_TARGET_BYTES, "target_bytes",
              "target process memory usage in bytes", "t",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    // heap_size minus unmapped bytes, as sampled in tune_memory().
    b.add_u64(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
              "total bytes mapped by the process", "m",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
              "unmapped bytes that the kernel has yet to reclaim", "u",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_HEAP_BYTES, "heap_bytes",
              "aggregate bytes in use by the heap", "h",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    // The tuned cache budget (tuned_mem) published by tune_memory().
    b.add_u64(MallocStats::M_CACHE_BYTES, "cache_bytes",
              "current memory available for caches.", "c",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);

    // Seed tuned_mem and the counters with an initial sample.
    tune_memory();
  }
103 | ||
  // Tear down all per-cache perf counters first (clear()), then unregister
  // and free the process-wide counters owned by this manager.
  Manager::~Manager()
  {
    clear();
    cct->get_perfcounters_collection()->remove(logger);
    delete logger;
  }
110 | ||
111 | void Manager::tune_memory() | |
112 | { | |
113 | size_t heap_size = 0; | |
114 | size_t unmapped = 0; | |
115 | uint64_t mapped = 0; | |
116 | ||
117 | ceph_heap_release_free_memory(); | |
118 | ceph_heap_get_numeric_property("generic.heap_size", &heap_size); | |
119 | ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped); | |
120 | mapped = heap_size - unmapped; | |
121 | ||
122 | uint64_t new_size = tuned_mem; | |
123 | new_size = (new_size < max_mem) ? new_size : max_mem; | |
124 | new_size = (new_size > min_mem) ? new_size : min_mem; | |
125 | ||
126 | // Approach the min/max slowly, but bounce away quickly. | |
127 | if ((uint64_t) mapped < target_mem) { | |
128 | double ratio = 1 - ((double) mapped / target_mem); | |
129 | new_size += ratio * (max_mem - new_size); | |
130 | } else { | |
131 | double ratio = 1 - ((double) target_mem / mapped); | |
132 | new_size -= ratio * (new_size - min_mem); | |
133 | } | |
134 | ||
135 | ldout(cct, 5) << __func__ | |
136 | << " target: " << target_mem | |
137 | << " mapped: " << mapped | |
138 | << " unmapped: " << unmapped | |
139 | << " heap: " << heap_size | |
140 | << " old mem: " << tuned_mem | |
141 | << " new mem: " << new_size << dendl; | |
142 | ||
143 | tuned_mem = new_size; | |
144 | ||
145 | logger->set(MallocStats::M_TARGET_BYTES, target_mem); | |
146 | logger->set(MallocStats::M_MAPPED_BYTES, mapped); | |
147 | logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped); | |
148 | logger->set(MallocStats::M_HEAP_BYTES, heap_size); | |
149 | logger->set(MallocStats::M_CACHE_BYTES, new_size); | |
150 | } | |
151 | ||
  // Register a cache with the manager under a unique name. When
  // enable_perf_counters is set, also allocates a contiguous range of global
  // perf counter indexes for the cache's per-priority and "extra" stats.
  // NOTE: counter name strings must remain literals — PerfCounters keeps the
  // raw pointers, so generating names in a loop would dangle.
  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
                       bool enable_perf_counters)
  {
    // A cache may only be registered once.
    ceph_assert(!caches.count(name));
    ceph_assert(!indexes.count(name));

    caches.emplace(name, c);

    if (!enable_perf_counters) {
      return;
    }

    // TODO: If we ever assign more than
    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND perf counters for
    // priority caching we could run out of slots. Recycle them some day?
    // Also note that start and end are *exclusive*.
    int start = cur_index++;
    int end = cur_index + Extra::E_LAST + 1;

    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
    // One slot per priority level plus the Extra entries.
    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));

    // Per-cache counters live under "<manager name>:<cache name>".
    PerfCountersBuilder b(cct, this->name + ":" + name, start, end);

    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
              "bytes allocated to pri0", "p0",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
              "bytes allocated to pri1", "p1",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
              "bytes allocated to pri2", "p2",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
              "bytes allocated to pri3", "p3",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
              "bytes allocated to pri4", "p4",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
              "bytes allocated to pri5", "p5",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
              "bytes allocated to pri6", "p6",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
              "bytes allocated to pri7", "p7",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
              "bytes allocated to pri8", "p8",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
              "bytes allocated to pri9", "p9",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
              "bytes allocated to pri10", "p10",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
              "bytes allocated to pri11", "p11",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
              "bytes reserved for future growth.", "r",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
              "total bytes committed,", "c",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    // Remember which global counter index backs each priority/extra slot so
    // balance() can publish values without recomputing offsets.
    for (int i = 0; i < Extra::E_LAST+1; i++) {
      indexes[name][i] = cur_index + i;
    }

    auto l = b.create_perf_counters();
    loggers.emplace(name, l);
    cct->get_perfcounters_collection()->add(l);

    // The next insert() starts allocating indexes after this cache's range.
    cur_index = end;
  }
242 | ||
243 | void Manager::erase(const std::string& name) | |
244 | { | |
245 | auto li = loggers.find(name); | |
246 | if (li != loggers.end()) { | |
247 | cct->get_perfcounters_collection()->remove(li->second); | |
248 | delete li->second; | |
249 | loggers.erase(li); | |
250 | } | |
251 | indexes.erase(name); | |
252 | caches.erase(name); | |
253 | } | |
254 | ||
255 | void Manager::clear() | |
256 | { | |
257 | auto li = loggers.begin(); | |
258 | while (li != loggers.end()) { | |
259 | cct->get_perfcounters_collection()->remove(li->second); | |
260 | delete li->second; | |
261 | li = loggers.erase(li); | |
262 | } | |
263 | indexes.clear(); | |
264 | caches.clear(); | |
265 | } | |
266 | ||
267 | void Manager::balance() | |
268 | { | |
269 | int64_t mem_avail = tuned_mem; | |
270 | // Each cache is going to get a little extra from get_chunk, so shrink the | |
271 | // available memory here to compensate. | |
272 | if (reserve_extra) { | |
273 | mem_avail -= get_chunk(1, tuned_mem) * caches.size(); | |
274 | } | |
275 | ||
276 | if (mem_avail < 0) { | |
277 | // There's so little memory available that just assigning a chunk per | |
278 | // cache pushes us over the limit. Set mem_avail to 0 and continue to | |
279 | // ensure each priority's byte counts are zeroed in balance_priority. | |
280 | mem_avail = 0; | |
281 | } | |
282 | ||
283 | // Assign memory for each priority level | |
284 | for (int i = 0; i < Priority::LAST+1; i++) { | |
285 | ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl; | |
286 | ||
287 | auto pri = static_cast<Priority>(i); | |
288 | balance_priority(&mem_avail, pri); | |
289 | ||
290 | // Update the per-priority perf counters | |
291 | for (auto &l : loggers) { | |
292 | auto it = caches.find(l.first); | |
293 | ceph_assert(it != caches.end()); | |
294 | ||
295 | auto bytes = it->second->get_cache_bytes(pri); | |
296 | l.second->set(indexes[it->first][pri], bytes); | |
297 | } | |
298 | } | |
299 | // assert if we assigned more memory than is available. | |
300 | ceph_assert(mem_avail >= 0); | |
301 | ||
302 | for (auto &l : loggers) { | |
303 | auto it = caches.find(l.first); | |
304 | ceph_assert(it != caches.end()); | |
305 | ||
306 | // Commit the new cache size | |
307 | int64_t committed = it->second->commit_cache_size(tuned_mem); | |
eafe8130 TL |
308 | // Update the perf counters |
309 | int64_t alloc = it->second->get_cache_bytes(); | |
310 | ||
311 | l.second->set(indexes[it->first][Extra::E_RESERVED], committed - alloc); | |
312 | l.second->set(indexes[it->first][Extra::E_COMMITTED], committed); | |
313 | } | |
314 | } | |
315 | ||
20effc67 TL |
316 | void Manager::shift_bins() |
317 | { | |
318 | for (auto &l : loggers) { | |
319 | auto it = caches.find(l.first); | |
320 | it->second->shift_bins(); | |
321 | } | |
322 | } | |
323 | ||
eafe8130 TL |
  // Distribute *mem_avail across all caches for a single priority level.
  // Caches are served round-robin: each round a cache gets at most its
  // ratio-weighted fair share; caches that got everything they asked for
  // drop out, and the remainder is re-split among those still hungry.
  // On return *mem_avail has been reduced by the total bytes assigned.
  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
  {
    // Work on a copy so satisfied caches can be dropped from the round-robin
    // while the full set is still available for zeroing and the LAST pass.
    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
    double cur_ratios = 0;
    double new_ratios = 0;
    uint64_t round = 0;

    // First, zero this priority's bytes, sum the initial ratios.
    for (auto it = caches.begin(); it != caches.end(); it++) {
      it->second->set_cache_bytes(pri, 0);
      cur_ratios += it->second->get_cache_ratio();
    }

    // For other priorities, loop until caches are satisified or we run out of
    // memory (stop if we can't guarantee a full byte allocation).
    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
      uint64_t total_assigned = 0;
      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
        // Usually the ratio should be set to the fraction of the current caches'
        // assigned ratio compared to the total ratio of all caches that still
        // want memory. There is a special case where the only caches left are
        // all assigned 0% ratios but still want memory. In that case, give
        // them an equal shot at the remaining memory for this priority.
        double ratio = 1.0 / tmp_caches.size();
        if (cur_ratios > 0) {
          ratio = it->second->get_cache_ratio() / cur_ratios;
        }
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);

        ldout(cct, 10) << __func__ << " " << it->first
                       << " pri: " << (int) pri
                       << " round: " << round
                       << " wanted: " << cache_wants
                       << " ratio: " << it->second->get_cache_ratio()
                       << " cur_ratios: " << cur_ratios
                       << " fair_share: " << fair_share
                       << " mem_avail: " << *mem_avail
                       << dendl;

        if (cache_wants > fair_share) {
          // If we want too much, take what we can get but stick around for more
          it->second->add_cache_bytes(pri, fair_share);
          total_assigned += fair_share;
          new_ratios += it->second->get_cache_ratio();
          ++it;
        } else {
          // Otherwise assign only what we want
          if (cache_wants > 0) {
            it->second->add_cache_bytes(pri, cache_wants);
            total_assigned += cache_wants;
          }
          // Either the cache didn't want anything or got what it wanted, so
          // remove it from the tmp list. (unordered_map::erase returns the
          // next valid iterator, keeping the traversal safe.)
          it = tmp_caches.erase(it);
        }
      }
      // Reset the ratios: next round weights only the still-hungry caches.
      *mem_avail -= total_assigned;
      cur_ratios = new_ratios;
      new_ratios = 0;
      ++round;
    }

    // If this is the last priority, divide up any remaining memory based
    // solely on the ratios (caches get it whether they asked or not).
    if (pri == Priority::LAST) {
      uint64_t total_assigned = 0;
      for (auto it = caches.begin(); it != caches.end(); it++) {
        double ratio = it->second->get_cache_ratio();
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
        it->second->set_cache_bytes(Priority::LAST, fair_share);
        total_assigned += fair_share;
      }
      *mem_avail -= total_assigned;
      return;
    }
  }
402 | ||
  // NOTE(review): empty out-of-line destructor — presumably kept here to
  // anchor the PriCache vtable in this translation unit; confirm against the
  // class declaration in PriorityCache.h.
  PriCache::~PriCache()
  {
  }
406 | } |