ceph/src/common/PriorityCache.cc
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2018 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "PriorityCache.h"
#include "common/dout.h"
#include "perfglue/heap_profiler.h"
#define dout_context cct
#define dout_subsys ceph_subsys_prioritycache
#undef dout_prefix
#define dout_prefix *_dout << "prioritycache "

namespace PriorityCache
{
  int64_t get_chunk(uint64_t usage, uint64_t total_bytes)
  {
    uint64_t chunk = total_bytes;

    // Round up to the next power of 2
    chunk -= 1;
    chunk |= chunk >> 1;
    chunk |= chunk >> 2;
    chunk |= chunk >> 4;
    chunk |= chunk >> 8;
    chunk |= chunk >> 16;
    chunk |= chunk >> 32;
    chunk += 1;
    // shrink it to 1/256 of the rounded up cache size
    chunk /= 256;

    // bound the chunk size to be between 4MB and 64MB
    chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024;
    chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024;

    /* FIXME: Hardcoded to force get_chunk to never drop below 64MB.
     * If RocksDB is used, it's a good idea to have N MB of headroom where
     * N is the target_file_size_base value. RocksDB will read SST files
     * into the block cache during compaction, which can potentially force out
     * all existing cached data. Once compaction is finished, the SST data is
     * released, leaving an empty cache. Having enough headroom to absorb
     * compaction reads allows the kv cache to grow even during extremely heavy
     * compaction workloads.
     */
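    /* Illustrative example (not part of the original source): with
     * usage = 1 GiB and total_bytes = 10 GiB, the bit-smearing above rounds
     * 10 GiB up to 16 GiB; 1/256 of that is 64 MiB, which survives the
     * [4 MiB, 64 MiB] clamp, so chunk = 64 MiB.  val below then becomes
     * 1024 MiB + 64 MiB = 1088 MiB, already a multiple of 64 MiB, so
     * get_chunk returns 1088 MiB.
     */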
    uint64_t val = usage + 64*1024*1024;
    uint64_t r = (val) % chunk;
    if (r > 0)
      val = val + chunk - r;
    return val;
  }

  Manager::Manager(CephContext *c,
                   uint64_t min,
                   uint64_t max,
                   uint64_t target,
                   bool reserve_extra,
                   const std::string& name) :
    cct(c),
    caches{},
    min_mem(min),
    max_mem(max),
    target_mem(target),
    tuned_mem(min),
    reserve_extra(reserve_extra),
    name(name.empty() ? "prioritycache" : name)
  {
    PerfCountersBuilder b(cct, this->name, MallocStats::M_FIRST, MallocStats::M_LAST);

    b.add_u64(MallocStats::M_TARGET_BYTES, "target_bytes",
              "target process memory usage in bytes", "t",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
              "total bytes mapped by the process", "m",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
              "unmapped bytes that the kernel has yet to reclaim", "u",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_HEAP_BYTES, "heap_bytes",
              "aggregate bytes in use by the heap", "h",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_CACHE_BYTES, "cache_bytes",
              "current memory available for caches.", "c",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);

    tune_memory();
  }

  Manager::~Manager()
  {
    clear();
    cct->get_perfcounters_collection()->remove(logger);
    delete logger;
  }

  void Manager::tune_memory()
  {
    size_t heap_size = 0;
    size_t unmapped = 0;
    uint64_t mapped = 0;

    ceph_heap_release_free_memory();
    ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
    ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
    mapped = heap_size - unmapped;

    uint64_t new_size = tuned_mem;
    new_size = (new_size < max_mem) ? new_size : max_mem;
    new_size = (new_size > min_mem) ? new_size : min_mem;

    // Approach the min/max slowly, but bounce away quickly.
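    // Illustrative example (not from the original source): with
    // target_mem = 4 GiB, mapped = 3 GiB gives ratio = 1 - 3/4 = 0.25, so
    // new_size moves a quarter of the way toward max_mem; mapped = 8 GiB
    // gives ratio = 1 - 4/8 = 0.5, so new_size drops halfway toward min_mem.
    // Small deviations from the target produce small adjustments, while
    // large deviations move new_size quickly toward min_mem or max_mem.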
    if ((uint64_t) mapped < target_mem) {
      double ratio = 1 - ((double) mapped / target_mem);
      new_size += ratio * (max_mem - new_size);
    } else {
      double ratio = 1 - ((double) target_mem / mapped);
      new_size -= ratio * (new_size - min_mem);
    }

    ldout(cct, 5) << __func__
                  << " target: " << target_mem
                  << " mapped: " << mapped
                  << " unmapped: " << unmapped
                  << " heap: " << heap_size
                  << " old mem: " << tuned_mem
                  << " new mem: " << new_size << dendl;

    tuned_mem = new_size;

    logger->set(MallocStats::M_TARGET_BYTES, target_mem);
    logger->set(MallocStats::M_MAPPED_BYTES, mapped);
    logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped);
    logger->set(MallocStats::M_HEAP_BYTES, heap_size);
    logger->set(MallocStats::M_CACHE_BYTES, new_size);
  }

  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
                       bool enable_perf_counters)
  {
    ceph_assert(!caches.count(name));
    ceph_assert(!indexes.count(name));

    caches.emplace(name, c);

    if (!enable_perf_counters) {
      return;
    }

    // TODO: If we ever assign more than
    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND perf counters for
    // priority caching we could run out of slots.  Recycle them some day?
    // Also note that start and end are *exclusive*.
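    // (Added note, not in the original source: as written below, each call
    // to insert() advances cur_index by Extra::E_LAST + 2 slots -- one for
    // the exclusive start bound plus the E_LAST + 1 counter slots -- so the
    // assert against PERF_COUNTER_MAX_BOUND is what ultimately bounds how
    // many caches can have perf counters enabled.)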
    int start = cur_index++;
    int end = cur_index + Extra::E_LAST + 1;

    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));

    PerfCountersBuilder b(cct, this->name + ":" + name, start, end);

    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
              "bytes allocated to pri0", "p0",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
              "bytes allocated to pri1", "p1",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
              "bytes allocated to pri2", "p2",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
              "bytes allocated to pri3", "p3",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
              "bytes allocated to pri4", "p4",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
              "bytes allocated to pri5", "p5",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
              "bytes allocated to pri6", "p6",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
              "bytes allocated to pri7", "p7",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
              "bytes allocated to pri8", "p8",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
              "bytes allocated to pri9", "p9",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
              "bytes allocated to pri10", "p10",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
              "bytes allocated to pri11", "p11",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
              "bytes reserved for future growth.", "r",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
              "total bytes committed.", "c",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    for (int i = 0; i < Extra::E_LAST+1; i++) {
      indexes[name][i] = cur_index + i;
    }

    auto l = b.create_perf_counters();
    loggers.emplace(name, l);
    cct->get_perfcounters_collection()->add(l);

    cur_index = end;
  }

  void Manager::erase(const std::string& name)
  {
    auto li = loggers.find(name);
    if (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      loggers.erase(li);
    }
    indexes.erase(name);
    caches.erase(name);
  }

  void Manager::clear()
  {
    auto li = loggers.begin();
    while (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      li = loggers.erase(li);
    }
    indexes.clear();
    caches.clear();
  }

  void Manager::balance()
  {
    int64_t mem_avail = tuned_mem;
    // Each cache is going to get a little extra from get_chunk, so shrink the
    // available memory here to compensate.
    if (reserve_extra) {
      mem_avail -= get_chunk(1, tuned_mem) * caches.size();
    }
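    // Illustrative example (not from the original source): with
    // tuned_mem = 4 GiB, get_chunk(1, tuned_mem) returns 80 MiB (a 16 MiB
    // chunk plus the 64 MiB headroom, rounded up to a chunk multiple), so
    // three registered caches reduce mem_avail by roughly 240 MiB before
    // any priority balancing happens.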

    if (mem_avail < 0) {
      // There's so little memory available that just assigning a chunk per
      // cache pushes us over the limit.  Set mem_avail to 0 and continue to
      // ensure each priority's byte counts are zeroed in balance_priority.
      mem_avail = 0;
    }

    // Assign memory for each priority level
    for (int i = 0; i < Priority::LAST+1; i++) {
      ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;

      auto pri = static_cast<Priority>(i);
      balance_priority(&mem_avail, pri);

      // Update the per-priority perf counters
      for (auto &l : loggers) {
        auto it = caches.find(l.first);
        ceph_assert(it != caches.end());

        auto bytes = it->second->get_cache_bytes(pri);
        l.second->set(indexes[it->first][pri], bytes);
      }
    }
    // assert if we assigned more memory than is available.
    ceph_assert(mem_avail >= 0);

    for (auto &l : loggers) {
      auto it = caches.find(l.first);
      ceph_assert(it != caches.end());

      // Commit the new cache size
      int64_t committed = it->second->commit_cache_size(tuned_mem);
      // Update the perf counters
      int64_t alloc = it->second->get_cache_bytes();

      l.second->set(indexes[it->first][Extra::E_RESERVED], committed - alloc);
      l.second->set(indexes[it->first][Extra::E_COMMITTED], committed);
    }
  }

  void Manager::shift_bins()
  {
    for (auto &l : loggers) {
      auto it = caches.find(l.first);
      it->second->shift_bins();
    }
  }

  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
  {
    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
    double cur_ratios = 0;
    double new_ratios = 0;
    uint64_t round = 0;

    // First, zero this priority's bytes and sum the initial ratios.
    for (auto it = caches.begin(); it != caches.end(); it++) {
      it->second->set_cache_bytes(pri, 0);
      cur_ratios += it->second->get_cache_ratio();
    }

    // Loop until the caches are satisfied or we run out of memory
    // (stop if we can't guarantee a full byte allocation).
    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
      uint64_t total_assigned = 0;
      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
        // Usually the ratio should be the fraction of the current cache's
        // assigned ratio compared to the total ratio of all caches that still
        // want memory.  There is a special case where the only caches left
        // are all assigned 0% ratios but still want memory.  In that case,
        // give them an equal shot at the remaining memory for this priority.
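        // Illustrative example (not from the original source): with three
        // caches at ratios 0.5, 0.3 and 0.2 and *mem_avail = 1 GiB, the first
        // round offers fair shares of roughly 512 MiB, 307 MiB and 204 MiB.
        // A cache that wants less than its share takes only what it wants and
        // is erased from tmp_caches; whatever it leaves unclaimed is
        // redistributed among the remaining caches in the next round.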
        double ratio = 1.0 / tmp_caches.size();
        if (cur_ratios > 0) {
          ratio = it->second->get_cache_ratio() / cur_ratios;
        }
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);

        ldout(cct, 10) << __func__ << " " << it->first
                       << " pri: " << (int) pri
                       << " round: " << round
                       << " wanted: " << cache_wants
                       << " ratio: " << it->second->get_cache_ratio()
                       << " cur_ratios: " << cur_ratios
                       << " fair_share: " << fair_share
                       << " mem_avail: " << *mem_avail
                       << dendl;

        if (cache_wants > fair_share) {
          // If we want too much, take what we can get but stick around for more
          it->second->add_cache_bytes(pri, fair_share);
          total_assigned += fair_share;
          new_ratios += it->second->get_cache_ratio();
          ++it;
        } else {
          // Otherwise assign only what we want
          if (cache_wants > 0) {
            it->second->add_cache_bytes(pri, cache_wants);
            total_assigned += cache_wants;
          }
          // Either the cache didn't want anything or got what it wanted, so
          // remove it from the tmp list.
          it = tmp_caches.erase(it);
        }
      }
      // Reset the ratios
      *mem_avail -= total_assigned;
      cur_ratios = new_ratios;
      new_ratios = 0;
      ++round;
    }

    // If this is the last priority, divide up any remaining memory based
    // solely on the ratios.
    if (pri == Priority::LAST) {
      uint64_t total_assigned = 0;
      for (auto it = caches.begin(); it != caches.end(); it++) {
        double ratio = it->second->get_cache_ratio();
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
        it->second->set_cache_bytes(Priority::LAST, fair_share);
        total_assigned += fair_share;
      }
      *mem_avail -= total_assigned;
      return;
    }
  }

  PriCache::~PriCache()
  {
  }
}