// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2018 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "PriorityCache.h"
#include "common/dout.h"
#include "perfglue/heap_profiler.h"

#define dout_context cct
#define dout_subsys ceph_subsys_prioritycache
#undef dout_prefix
#define dout_prefix *_dout << "prioritycache "
namespace PriorityCache
{
  int64_t get_chunk(uint64_t usage, uint64_t total_bytes)
  {
    uint64_t chunk = total_bytes;

    // Find the nearest power of 2
    chunk -= 1;
    chunk |= chunk >> 1;
    chunk |= chunk >> 2;
    chunk |= chunk >> 4;
    chunk |= chunk >> 8;
    chunk |= chunk >> 16;
    chunk |= chunk >> 32;
    chunk += 1;
    // shrink it to 1/256 of the rounded up cache size
    chunk /= 256;

    // bound the chunk size to be between 4MB and 64MB
    chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024;
    chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024;

    /* FIXME: Hardcoded to force get_chunk to never drop below 64MB.
     * If RocksDB is used, it's a good idea to have N MB of headroom where
     * N is the target_file_size_base value.  RocksDB will read SST files
     * into the block cache during compaction, which can potentially force
     * out all existing cached data.  Once compaction is finished, the SST
     * data is released, leaving an empty cache.  Having enough headroom to
     * absorb compaction reads allows the kv cache to grow even during
     * extremely heavy compaction workloads.
     */
    uint64_t val = usage + 64*1024*1024;
    uint64_t r = val % chunk;
    if (r > 0) {
      // Round the result up to the next chunk boundary.
      val = val + chunk - r;
    }
    return val;
  }
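
  // Worked example (hypothetical numbers, not part of the original source):
  // for total_bytes = 4 GiB the rounded-up power of two is 4 GiB and 1/256
  // of that is 16 MiB, already inside the [4 MiB, 64 MiB] band, so
  // chunk = 16 MiB.  A usage of 100 MiB then gives
  // val = 100 MiB + 64 MiB = 164 MiB, which rounds up to the next 16 MiB
  // boundary: get_chunk returns 176 MiB.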
  Manager::Manager(CephContext *c,
                   uint64_t min,
                   uint64_t max,
                   uint64_t target,
                   bool reserve_extra,
                   const std::string& name) :
      cct(c),
      caches{},
      min_mem(min),
      max_mem(max),
      target_mem(target),
      tuned_mem(min),
      reserve_extra(reserve_extra),
      name(name.empty() ? "prioritycache" : name)
  {
    PerfCountersBuilder b(cct, name,
                          MallocStats::M_FIRST, MallocStats::M_LAST);

    b.add_u64(MallocStats::M_TARGET_BYTES, "target_bytes",
              "target process memory usage in bytes", "t",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
              "total bytes mapped by the process", "m",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
              "unmapped bytes that the kernel has yet to reclaim", "u",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_HEAP_BYTES, "heap_bytes",
              "aggregate bytes in use by the heap", "h",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_CACHE_BYTES, "cache_bytes",
              "current memory available for caches", "c",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);

    tune_memory();
  }
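
  // Usage sketch (hypothetical caller, not part of this file): a consumer
  // such as an OSD constructs one Manager, registers its caches once, and
  // then periodically re-tunes and re-balances from a housekeeping thread.
  // The cache names and objects below are illustrative only:
  //
  //   PriorityCache::Manager mgr(cct, min_bytes, max_bytes, target_bytes,
  //                              true, "osd_pricache");
  //   mgr.insert("kv", kv_cache, true);
  //   mgr.insert("meta", meta_cache, true);
  //   for (;;) { mgr.tune_memory(); mgr.balance(); /* sleep ... */ }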
  Manager::~Manager()
  {
    clear();
    cct->get_perfcounters_collection()->remove(logger);
    delete logger;
  }
  void Manager::tune_memory()
  {
    size_t heap_size = 0;
    size_t unmapped = 0;
    uint64_t mapped = 0;

    ceph_heap_release_free_memory();
    ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
    ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
    mapped = heap_size - unmapped;

    uint64_t new_size = tuned_mem;
    new_size = (new_size < max_mem) ? new_size : max_mem;
    new_size = (new_size > min_mem) ? new_size : min_mem;

    // Approach the min/max slowly, but bounce away quickly.
    if ((uint64_t) mapped < target_mem) {
      double ratio = 1 - ((double) mapped / target_mem);
      new_size += ratio * (max_mem - new_size);
    } else {
      double ratio = 1 - ((double) target_mem / mapped);
      new_size -= ratio * (new_size - min_mem);
    }

    ldout(cct, 5) << __func__
                  << " target: " << target_mem
                  << " mapped: " << mapped
                  << " unmapped: " << unmapped
                  << " heap: " << heap_size
                  << " old mem: " << tuned_mem
                  << " new mem: " << new_size << dendl;

    tuned_mem = new_size;

    logger->set(MallocStats::M_TARGET_BYTES, target_mem);
    logger->set(MallocStats::M_MAPPED_BYTES, mapped);
    logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped);
    logger->set(MallocStats::M_HEAP_BYTES, heap_size);
    logger->set(MallocStats::M_CACHE_BYTES, new_size);
  }
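
  // Worked example (hypothetical numbers, not part of the original source):
  // with min = 1 GiB, max = 4 GiB, target = 2 GiB, and mapped = 1.5 GiB,
  // mapped < target so ratio = 1 - 1.5/2 = 0.25 and tuned_mem grows by a
  // quarter of the gap up to max.  At mapped = 3 GiB, ratio = 1 - 2/3 ≈ 0.33
  // and tuned_mem drops by a third of the gap down to min, i.e. overshoot
  // backs off much faster than undershoot grows.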
  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
                       bool enable_perf_counters)
  {
    ceph_assert(!caches.count(name));
    ceph_assert(!indexes.count(name));

    caches.emplace(name, c);

    if (!enable_perf_counters) {
      return;
    }

    // TODO: If we ever assign more than
    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND perf counters for
    // priority caching we could run out of slots.  Recycle them some day?
    // Also note that start and end are *exclusive*.
    int start = cur_index++;
    int end = cur_index + Extra::E_LAST + 1;

    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));

    PerfCountersBuilder b(cct, this->name + ":" + name, start, end);

    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
              "bytes allocated to pri0", "p0",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
              "bytes allocated to pri1", "p1",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
              "bytes allocated to pri2", "p2",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
              "bytes allocated to pri3", "p3",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
              "bytes allocated to pri4", "p4",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
              "bytes allocated to pri5", "p5",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
              "bytes allocated to pri6", "p6",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
              "bytes allocated to pri7", "p7",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
              "bytes allocated to pri8", "p8",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
              "bytes allocated to pri9", "p9",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
              "bytes allocated to pri10", "p10",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
              "bytes allocated to pri11", "p11",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
              "bytes reserved for future growth", "r",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
              "total bytes committed", "c",
              PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));

    for (int i = 0; i < Extra::E_LAST + 1; i++) {
      indexes[name][i] = cur_index + i;
    }
    cur_index = end;

    auto l = b.create_perf_counters();
    loggers.emplace(name, l);
    cct->get_perfcounters_collection()->add(l);
  }
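
  // Worked example of the index arithmetic (hypothetical numbers): if
  // cur_index is 1000 on entry, start becomes 1000 and cur_index advances
  // to 1001.  The counters then occupy slots 1001 .. 1001 + E_LAST, and
  // end = 1001 + E_LAST + 1 sits one past the last slot, so both bounds
  // are exclusive as the builder requires; the next insert() begins at end.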
  void Manager::erase(const std::string& name)
  {
    auto li = loggers.find(name);
    if (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      loggers.erase(li);
    }
    indexes.erase(name);
    caches.erase(name);
  }
  void Manager::clear()
  {
    auto li = loggers.begin();
    while (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      li = loggers.erase(li);
    }
    indexes.clear();
    caches.clear();
  }
  void Manager::balance()
  {
    int64_t mem_avail = tuned_mem;
    // Each cache is going to get a little extra from get_chunk, so shrink the
    // available memory here to compensate.
    if (reserve_extra) {
      mem_avail -= get_chunk(1, tuned_mem) * caches.size();
    }

    if (mem_avail < 0) {
      // There's so little memory available that just assigning a chunk per
      // cache pushes us over the limit.  Set mem_avail to 0 and continue to
      // ensure each priority's byte counts are zeroed in balance_priority.
      mem_avail = 0;
    }

    // Assign memory for each priority level
    for (int i = 0; i < Priority::LAST + 1; i++) {
      ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;

      auto pri = static_cast<Priority>(i);
      balance_priority(&mem_avail, pri);

      // Update the per-priority perf counters
      for (auto &l : loggers) {
        auto it = caches.find(l.first);
        ceph_assert(it != caches.end());

        auto bytes = it->second->get_cache_bytes(pri);
        l.second->set(indexes[it->first][pri], bytes);
      }
    }
    // assert if we assigned more memory than is available.
    ceph_assert(mem_avail >= 0);

    for (auto &l : loggers) {
      auto it = caches.find(l.first);
      ceph_assert(it != caches.end());

      // Commit the new cache size
      int64_t committed = it->second->commit_cache_size(tuned_mem);
      // Update the perf counters
      int64_t alloc = it->second->get_cache_bytes();

      l.second->set(indexes[it->first][Extra::E_RESERVED], committed - alloc);
      l.second->set(indexes[it->first][Extra::E_COMMITTED], committed);
    }
  }
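
  // Worked example of the compensation (hypothetical numbers): with
  // tuned_mem = 4 GiB, get_chunk(1, 4 GiB) returns 80 MiB (64 MiB of
  // headroom plus one 16 MiB chunk), so with reserve_extra set and three
  // registered caches, mem_avail is shrunk by 240 MiB before any priority
  // is balanced.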
  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
  {
    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
    double cur_ratios = 0;
    double new_ratios = 0;
    uint64_t round = 0;

    // First, zero this priority's bytes, sum the initial ratios.
    for (auto it = caches.begin(); it != caches.end(); it++) {
      it->second->set_cache_bytes(pri, 0);
      cur_ratios += it->second->get_cache_ratio();
    }

    // Loop until caches are satisfied or we run out of memory
    // (stop if we can't guarantee a full byte allocation).
    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
      uint64_t total_assigned = 0;
      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
        // Usually the ratio should be set to the fraction of the current
        // cache's assigned ratio compared to the total ratio of all caches
        // that still want memory.  There is a special case where the only
        // caches left are all assigned 0% ratios but still want memory.
        // In that case, give them an equal shot at the remaining memory
        // for this priority.
        double ratio = 1.0 / tmp_caches.size();
        if (cur_ratios > 0) {
          ratio = it->second->get_cache_ratio() / cur_ratios;
        }
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);

        ldout(cct, 10) << __func__ << " " << it->first
                       << " pri: " << (int) pri
                       << " round: " << round
                       << " wanted: " << cache_wants
                       << " ratio: " << it->second->get_cache_ratio()
                       << " cur_ratios: " << cur_ratios
                       << " fair_share: " << fair_share
                       << " mem_avail: " << *mem_avail
                       << dendl;

        if (cache_wants > fair_share) {
          // If we want too much, take what we can get but stick around for more
          it->second->add_cache_bytes(pri, fair_share);
          total_assigned += fair_share;
          new_ratios += it->second->get_cache_ratio();
          ++it;
        } else {
          // Otherwise assign only what we want
          if (cache_wants > 0) {
            it->second->add_cache_bytes(pri, cache_wants);
            total_assigned += cache_wants;
          }
          // Either the cache didn't want anything or got what it wanted, so
          // remove it from the tmp list.
          it = tmp_caches.erase(it);
        }
      }
      *mem_avail -= total_assigned;
      cur_ratios = new_ratios;
      new_ratios = 0;
      round++;
    }

    // If this is the last priority, divide up any remaining memory based
    // solely on the ratios.
    if (pri == Priority::LAST) {
      uint64_t total_assigned = 0;
      for (auto it = caches.begin(); it != caches.end(); it++) {
        double ratio = it->second->get_cache_ratio();
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
        it->second->set_cache_bytes(Priority::LAST, fair_share);
        total_assigned += fair_share;
      }
      *mem_avail -= total_assigned;
    }
  }
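
  // Worked example of the fair-share loop (hypothetical numbers): three
  // caches with ratios 0.45, 0.45, and 0.10 and *mem_avail = 100 MiB get
  // fair shares of 45, 45, and 10 MiB on the first round.  If the third
  // cache only wants 2 MiB, it takes them and drops out of tmp_caches;
  // the next round splits the remaining 8 MiB between the first two at
  // 0.45/0.90 = 50% each, i.e. 4 MiB apiece.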
  PriCache::~PriCache()
  {
  }
}