ceph/src/common/PriorityCache.cc
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2018 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "PriorityCache.h"
#include "common/dout.h"
#include "perfglue/heap_profiler.h"
#define dout_context cct
#define dout_subsys ceph_subsys_prioritycache
#undef dout_prefix
#define dout_prefix *_dout << "prioritycache "

namespace PriorityCache
{
  int64_t get_chunk(uint64_t usage, uint64_t total_bytes)
  {
    uint64_t chunk = total_bytes;

    // Round up to the next power of 2
    chunk -= 1;
    chunk |= chunk >> 1;
    chunk |= chunk >> 2;
    chunk |= chunk >> 4;
    chunk |= chunk >> 8;
    chunk |= chunk >> 16;
    chunk |= chunk >> 32;
    chunk += 1;
    // shrink it to 1/256 of the rounded up cache size
    chunk /= 256;

    // bound the chunk size to be between 4MB and 64MB
    chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024;
    chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024;
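    // Illustrative example: a 10 GiB total rounds up to 16 GiB and
    // 16 GiB / 256 = 64 MiB, which sits at the upper clamp; a 512 MiB total
    // gives 512 MiB / 256 = 2 MiB, which is raised to the 4 MiB floor.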

    /* FIXME: Hardcoded to force get_chunk to never drop below 64MB.
     * If RocksDB is used, it's a good idea to have N MB of headroom where
     * N is the target_file_size_base value. RocksDB will read SST files
     * into the block cache during compaction which potentially can force out
     * all existing cached data. Once compaction is finished, the SST data is
     * released, leaving an empty cache. Having enough headroom to absorb
     * compaction reads allows the kv cache to grow even during extremely heavy
     * compaction workloads.
     */
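    // Round (usage plus 64 MiB of headroom) up to the next multiple of the
    // chunk size. For example, a usage of 101 MiB with a 4 MiB chunk gives
    // 165 MiB, which is rounded up to 168 MiB.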
    uint64_t val = usage + 64*1024*1024;
    uint64_t r = (val) % chunk;
    if (r > 0)
      val = val + chunk - r;
    return val;
  }

  Manager::Manager(CephContext *c,
                   uint64_t min,
                   uint64_t max,
                   uint64_t target,
                   bool reserve_extra,
                   const std::string& name) :
      cct(c),
      caches{},
      min_mem(min),
      max_mem(max),
      target_mem(target),
      tuned_mem(min),
      reserve_extra(reserve_extra),
      name(name.empty() ? "prioritycache" : name)
  {
    PerfCountersBuilder b(cct, name,
                          MallocStats::M_FIRST, MallocStats::M_LAST);

    b.add_u64(MallocStats::M_TARGET_BYTES, "target_bytes",
              "target process memory usage in bytes", "t",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
              "total bytes mapped by the process", "m",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
              "unmapped bytes that the kernel has yet to reclaim", "u",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_HEAP_BYTES, "heap_bytes",
              "aggregate bytes in use by the heap", "h",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_CACHE_BYTES, "cache_bytes",
              "current memory available for caches.", "c",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);

    tune_memory();
  }

  Manager::~Manager()
  {
    clear();
    cct->get_perfcounters_collection()->remove(logger);
    delete logger;
  }

  void Manager::tune_memory()
  {
    size_t heap_size = 0;
    size_t unmapped = 0;
    uint64_t mapped = 0;

    ceph_heap_release_free_memory();
    ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
    ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
    mapped = heap_size - unmapped;
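    // mapped is roughly the heap memory tcmalloc is actually holding from the
    // OS: the total heap size minus pages it has already returned.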

    uint64_t new_size = tuned_mem;
    new_size = (new_size < max_mem) ? new_size : max_mem;
    new_size = (new_size > min_mem) ? new_size : min_mem;

    // Approach the min/max slowly, but bounce away quickly.
    if ((uint64_t) mapped < target_mem) {
      double ratio = 1 - ((double) mapped / target_mem);
      new_size += ratio * (max_mem - new_size);
    } else {
      double ratio = 1 - ((double) target_mem / mapped);
      new_size -= ratio * (new_size - min_mem);
    }
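    // Illustrative example: with target_mem = 4 GiB and mapped = 3 GiB the
    // ratio is 0.25, so new_size moves a quarter of the way from its current
    // value toward max_mem; the further usage is from the target, the bigger
    // the step.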

    ldout(cct, 5) << __func__
                  << " target: " << target_mem
                  << " mapped: " << mapped
                  << " unmapped: " << unmapped
                  << " heap: " << heap_size
                  << " old mem: " << tuned_mem
                  << " new mem: " << new_size << dendl;

    tuned_mem = new_size;

    logger->set(MallocStats::M_TARGET_BYTES, target_mem);
    logger->set(MallocStats::M_MAPPED_BYTES, mapped);
    logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped);
    logger->set(MallocStats::M_HEAP_BYTES, heap_size);
    logger->set(MallocStats::M_CACHE_BYTES, new_size);
  }

  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
                       bool enable_perf_counters)
  {
    ceph_assert(!caches.count(name));
    ceph_assert(!indexes.count(name));

    caches.emplace(name, c);

    if (!enable_perf_counters) {
      return;
    }

    // TODO: If we ever assign more than
    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND perf counters for
    // priority caching we could run out of slots. Recycle them some day?
    // Also note that start and end are *exclusive*.
    int start = cur_index++;
    int end = cur_index + Extra::E_LAST + 1;
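    // Each cache gets a contiguous block of counter ids; because start and
    // end are exclusive bounds, every id registered below (cur_index +
    // Priority::PRI0 through cur_index + Extra::E_COMMITTED) falls strictly
    // between them.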

    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));

    PerfCountersBuilder b(cct, this->name + ":" + name, start, end);

    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
              "bytes allocated to pri0", "p0",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
              "bytes allocated to pri1", "p1",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
              "bytes allocated to pri2", "p2",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
              "bytes allocated to pri3", "p3",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
              "bytes allocated to pri4", "p4",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
              "bytes allocated to pri5", "p5",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
              "bytes allocated to pri6", "p6",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
              "bytes allocated to pri7", "p7",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
              "bytes allocated to pri8", "p8",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
              "bytes allocated to pri9", "p9",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
              "bytes allocated to pri10", "p10",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
              "bytes allocated to pri11", "p11",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
              "bytes reserved for future growth.", "r",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
              "total bytes committed.", "c",
              PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));

    for (int i = 0; i < Extra::E_LAST+1; i++) {
      indexes[name][i] = cur_index + i;
    }

    auto l = b.create_perf_counters();
    loggers.emplace(name, l);
    cct->get_perfcounters_collection()->add(l);

    cur_index = end;
  }

  void Manager::erase(const std::string& name)
  {
    auto li = loggers.find(name);
    if (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      loggers.erase(li);
    }
    indexes.erase(name);
    caches.erase(name);
  }

  void Manager::clear()
  {
    auto li = loggers.begin();
    while (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      li = loggers.erase(li);
    }
    indexes.clear();
    caches.clear();
  }

  void Manager::balance()
  {
    int64_t mem_avail = tuned_mem;
    // Each cache is going to get a little extra from get_chunk, so shrink the
    // available memory here to compensate.
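    // get_chunk(1, tuned_mem) is what get_chunk returns for a nearly empty
    // cache: the 64 MiB headroom rounded up to a chunk boundary, i.e. roughly
    // the built-in overhead each registered cache carries.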
    if (reserve_extra) {
      mem_avail -= get_chunk(1, tuned_mem) * caches.size();
    }

    if (mem_avail < 0) {
      // There's so little memory available that just assigning a chunk per
      // cache pushes us over the limit. Set mem_avail to 0 and continue to
      // ensure each priority's byte counts are zeroed in balance_priority.
      mem_avail = 0;
    }

    // Assign memory for each priority level
    for (int i = 0; i < Priority::LAST+1; i++) {
      ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;

      auto pri = static_cast<Priority>(i);
      balance_priority(&mem_avail, pri);

      // Update the per-priority perf counters
      for (auto &l : loggers) {
        auto it = caches.find(l.first);
        ceph_assert(it != caches.end());

        auto bytes = it->second->get_cache_bytes(pri);
        l.second->set(indexes[it->first][pri], bytes);
      }
    }
    // Assert that we didn't assign more memory than is available.
    ceph_assert(mem_avail >= 0);

    for (auto &l : loggers) {
      auto it = caches.find(l.first);
      ceph_assert(it != caches.end());

      // Commit the new cache size
      int64_t committed = it->second->commit_cache_size(tuned_mem);

      // Update the perf counters
      int64_t alloc = it->second->get_cache_bytes();

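      // committed - alloc is the slack the cache has committed but not yet
      // handed to any priority, reported as "reserved_bytes" above.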
      l.second->set(indexes[it->first][Extra::E_RESERVED], committed - alloc);
      l.second->set(indexes[it->first][Extra::E_COMMITTED], committed);
    }
  }

  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
  {
    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
    double cur_ratios = 0;
    double new_ratios = 0;
    uint64_t round = 0;

    // First, zero this priority's bytes, sum the initial ratios.
    for (auto it = caches.begin(); it != caches.end(); it++) {
      it->second->set_cache_bytes(pri, 0);
      cur_ratios += it->second->get_cache_ratio();
    }

    // For other priorities, loop until caches are satisfied or we run out of
    // memory (stop if we can't guarantee a full byte allocation).
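    // Illustrative example: three caches with ratios 0.5/0.3/0.2 and 900 MB
    // available get fair shares of 450/270/180 MB in the first round. A cache
    // that wants less than its share only takes what it wants and drops out,
    // and the leftover is redistributed among the remaining caches (by their
    // ratios) in the next round.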
    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
      uint64_t total_assigned = 0;
      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
        // Usually the ratio should be set to the fraction of the current cache's
        // assigned ratio compared to the total ratio of all caches that still
        // want memory. There is a special case where the only caches left are
        // all assigned 0% ratios but still want memory. In that case, give
        // them an equal shot at the remaining memory for this priority.
        double ratio = 1.0 / tmp_caches.size();
        if (cur_ratios > 0) {
          ratio = it->second->get_cache_ratio() / cur_ratios;
        }
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);

        ldout(cct, 10) << __func__ << " " << it->first
                       << " pri: " << (int) pri
                       << " round: " << round
                       << " wanted: " << cache_wants
                       << " ratio: " << it->second->get_cache_ratio()
                       << " cur_ratios: " << cur_ratios
                       << " fair_share: " << fair_share
                       << " mem_avail: " << *mem_avail
                       << dendl;

        if (cache_wants > fair_share) {
          // If we want too much, take what we can get but stick around for more
          it->second->add_cache_bytes(pri, fair_share);
          total_assigned += fair_share;
          new_ratios += it->second->get_cache_ratio();
          ++it;
        } else {
          // Otherwise assign only what we want
          if (cache_wants > 0) {
            it->second->add_cache_bytes(pri, cache_wants);
            total_assigned += cache_wants;
          }
          // Either the cache didn't want anything or got what it wanted, so
          // remove it from the tmp list.
          it = tmp_caches.erase(it);
        }
      }
      // Reset the ratios
      *mem_avail -= total_assigned;
      cur_ratios = new_ratios;
      new_ratios = 0;
      ++round;
    }

    // If this is the last priority, divide up any remaining memory based
    // solely on the ratios.
    if (pri == Priority::LAST) {
      uint64_t total_assigned = 0;
      for (auto it = caches.begin(); it != caches.end(); it++) {
        double ratio = it->second->get_cache_ratio();
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
        it->second->set_cache_bytes(Priority::LAST, fair_share);
        total_assigned += fair_share;
      }
      *mem_avail -= total_assigned;
      return;
    }
  }

  PriCache::~PriCache()
  {
  }
}