ceph/src/common/PriorityCache.cc
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2018 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "PriorityCache.h"
#include "common/dout.h"
#include "perfglue/heap_profiler.h"
#define dout_context cct
#define dout_subsys ceph_subsys_prioritycache
#undef dout_prefix
#define dout_prefix *_dout << "prioritycache "

namespace PriorityCache
{
  int64_t get_chunk(uint64_t usage, uint64_t total_bytes)
  {
    uint64_t chunk = total_bytes;

    // Round up to the next power of 2
    chunk -= 1;
    chunk |= chunk >> 1;
    chunk |= chunk >> 2;
    chunk |= chunk >> 4;
    chunk |= chunk >> 8;
    chunk |= chunk >> 16;
    chunk |= chunk >> 32;
    chunk += 1;
    // shrink it to 1/256 of the rounded up cache size
    chunk /= 256;

    // bound the chunk size to be between 4MB and 64MB
    chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024;
    chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024;
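    // Illustrative example: a 10 GiB total rounds up to 16 GiB and
    // 16 GiB / 256 = 64 MiB, which sits at the upper clamp; a 512 MiB total
    // gives 512 MiB / 256 = 2 MiB, which is raised to the 4 MiB floor.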

    /* FIXME: Hardcoded to force get_chunk to never drop below 64MB.
     * If RocksDB is used, it's a good idea to have N MB of headroom where
     * N is the target_file_size_base value. RocksDB will read SST files
     * into the block cache during compaction which potentially can force out
     * all existing cached data. Once compaction is finished, the SST data is
     * released, leaving an empty cache. Having enough headroom to absorb
     * compaction reads allows the kv cache to grow even during extremely heavy
     * compaction workloads.
     */
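    // Round (usage plus 64 MiB of headroom) up to the next multiple of the
    // chunk size. For example, a usage of 101 MiB with a 4 MiB chunk gives
    // 165 MiB, which is rounded up to 168 MiB.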
    uint64_t val = usage + 64*1024*1024;
    uint64_t r = (val) % chunk;
    if (r > 0)
      val = val + chunk - r;
    return val;
  }

  Manager::Manager(CephContext *c,
                   uint64_t min,
                   uint64_t max,
                   uint64_t target,
                   bool reserve_extra,
                   const std::string& name) :
      cct(c),
      caches{},
      min_mem(min),
      max_mem(max),
      target_mem(target),
      tuned_mem(min),
      reserve_extra(reserve_extra),
      name(name.empty() ? "prioritycache" : name)
  {
    PerfCountersBuilder b(cct, name,
                          MallocStats::M_FIRST, MallocStats::M_LAST);

    b.add_u64(MallocStats::M_TARGET_BYTES, "target_bytes",
              "target process memory usage in bytes", "t",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
              "total bytes mapped by the process", "m",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
              "unmapped bytes that the kernel has yet to reclaim", "u",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_HEAP_BYTES, "heap_bytes",
              "aggregate bytes in use by the heap", "h",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_CACHE_BYTES, "cache_bytes",
              "current memory available for caches.", "c",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);

    tune_memory();
  }

  Manager::~Manager()
  {
    clear();
    cct->get_perfcounters_collection()->remove(logger);
    delete logger;
  }

  void Manager::tune_memory()
  {
    size_t heap_size = 0;
    size_t unmapped = 0;
    uint64_t mapped = 0;

    ceph_heap_release_free_memory();
    ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
    ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
    mapped = heap_size - unmapped;
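    // mapped is roughly the heap memory tcmalloc is actually holding from the
    // OS: the total heap size minus pages it has already returned.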

    uint64_t new_size = tuned_mem;
    new_size = (new_size < max_mem) ? new_size : max_mem;
    new_size = (new_size > min_mem) ? new_size : min_mem;

    // Approach the min/max slowly, but bounce away quickly.
    if ((uint64_t) mapped < target_mem) {
      double ratio = 1 - ((double) mapped / target_mem);
      new_size += ratio * (max_mem - new_size);
    } else {
      double ratio = 1 - ((double) target_mem / mapped);
      new_size -= ratio * (new_size - min_mem);
    }
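    // Illustrative example: with target_mem = 4 GiB and mapped = 3 GiB the
    // ratio is 0.25, so new_size moves a quarter of the way from its current
    // value toward max_mem; the further usage is from the target, the bigger
    // the step.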

    ldout(cct, 5) << __func__
                  << " target: " << target_mem
                  << " mapped: " << mapped
                  << " unmapped: " << unmapped
                  << " heap: " << heap_size
                  << " old mem: " << tuned_mem
                  << " new mem: " << new_size << dendl;

    tuned_mem = new_size;

    logger->set(MallocStats::M_TARGET_BYTES, target_mem);
    logger->set(MallocStats::M_MAPPED_BYTES, mapped);
    logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped);
    logger->set(MallocStats::M_HEAP_BYTES, heap_size);
    logger->set(MallocStats::M_CACHE_BYTES, new_size);
  }

  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
                       bool enable_perf_counters)
  {
    ceph_assert(!caches.count(name));
    ceph_assert(!indexes.count(name));

    caches.emplace(name, c);

    if (!enable_perf_counters) {
      return;
    }

    // TODO: If we ever assign more than
    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND perf counters for
    // priority caching we could run out of slots. Recycle them some day?
    // Also note that start and end are *exclusive*.
    int start = cur_index++;
    int end = cur_index + Extra::E_LAST + 1;
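    // Each cache gets a contiguous block of counter ids; because start and
    // end are exclusive bounds, every id registered below (cur_index +
    // Priority::PRI0 through cur_index + Extra::E_COMMITTED) falls strictly
    // between them.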

    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));

    PerfCountersBuilder b(cct, this->name + ":" + name, start, end);

    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
              "bytes allocated to pri0", "p0",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
              "bytes allocated to pri1", "p1",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
              "bytes allocated to pri2", "p2",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
              "bytes allocated to pri3", "p3",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
              "bytes allocated to pri4", "p4",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
              "bytes allocated to pri5", "p5",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
              "bytes allocated to pri6", "p6",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
              "bytes allocated to pri7", "p7",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
              "bytes allocated to pri8", "p8",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
              "bytes allocated to pri9", "p9",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
              "bytes allocated to pri10", "p10",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
              "bytes allocated to pri11", "p11",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
              "bytes reserved for future growth.", "r",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
              "total bytes committed.", "c",
              PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));

    for (int i = 0; i < Extra::E_LAST+1; i++) {
      indexes[name][i] = cur_index + i;
    }

    auto l = b.create_perf_counters();
    loggers.emplace(name, l);
    cct->get_perfcounters_collection()->add(l);

    cur_index = end;
  }

  void Manager::erase(const std::string& name)
  {
    auto li = loggers.find(name);
    if (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      loggers.erase(li);
    }
    indexes.erase(name);
    caches.erase(name);
  }

  void Manager::clear()
  {
    auto li = loggers.begin();
    while (li != loggers.end()) {
      cct->get_perfcounters_collection()->remove(li->second);
      delete li->second;
      li = loggers.erase(li);
    }
    indexes.clear();
    caches.clear();
  }

  void Manager::balance()
  {
    int64_t mem_avail = tuned_mem;
    // Each cache is going to get a little extra from get_chunk, so shrink the
    // available memory here to compensate.
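    // get_chunk(1, tuned_mem) is what get_chunk returns for a nearly empty
    // cache: the 64 MiB headroom rounded up to a chunk boundary, i.e. roughly
    // the built-in overhead each registered cache carries.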
    if (reserve_extra) {
      mem_avail -= get_chunk(1, tuned_mem) * caches.size();
    }

    if (mem_avail < 0) {
      // There's so little memory available that just assigning a chunk per
      // cache pushes us over the limit. Set mem_avail to 0 and continue to
      // ensure each priority's byte counts are zeroed in balance_priority.
      mem_avail = 0;
    }

    // Assign memory for each priority level
    for (int i = 0; i < Priority::LAST+1; i++) {
      ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;

      auto pri = static_cast<Priority>(i);
      balance_priority(&mem_avail, pri);

      // Update the per-priority perf counters
      for (auto &l : loggers) {
        auto it = caches.find(l.first);
        ceph_assert(it != caches.end());

        auto bytes = it->second->get_cache_bytes(pri);
        l.second->set(indexes[it->first][pri], bytes);
      }
    }
    // Assert that we didn't assign more memory than is available.
    ceph_assert(mem_avail >= 0);

    for (auto &l : loggers) {
      auto it = caches.find(l.first);
      ceph_assert(it != caches.end());

      // Commit the new cache size
      int64_t committed = it->second->commit_cache_size(tuned_mem);

      // Update the perf counters
      int64_t alloc = it->second->get_cache_bytes();

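      // committed - alloc is the slack the cache has committed but not yet
      // handed to any priority, reported as "reserved_bytes" above.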
      l.second->set(indexes[it->first][Extra::E_RESERVED], committed - alloc);
      l.second->set(indexes[it->first][Extra::E_COMMITTED], committed);
    }
  }

  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
  {
    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
    double cur_ratios = 0;
    double new_ratios = 0;
    uint64_t round = 0;

    // First, zero this priority's bytes, sum the initial ratios.
    for (auto it = caches.begin(); it != caches.end(); it++) {
      it->second->set_cache_bytes(pri, 0);
      cur_ratios += it->second->get_cache_ratio();
    }

    // For other priorities, loop until caches are satisfied or we run out of
    // memory (stop if we can't guarantee a full byte allocation).
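    // Illustrative example: three caches with ratios 0.5/0.3/0.2 and 900 MB
    // available get fair shares of 450/270/180 MB in the first round. A cache
    // that wants less than its share only takes what it wants and drops out,
    // and the leftover is redistributed among the remaining caches (by their
    // ratios) in the next round.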
    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
      uint64_t total_assigned = 0;
      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
        // Usually the ratio should be set to the fraction of the current cache's
        // assigned ratio compared to the total ratio of all caches that still
        // want memory. There is a special case where the only caches left are
        // all assigned 0% ratios but still want memory. In that case, give
        // them an equal shot at the remaining memory for this priority.
        double ratio = 1.0 / tmp_caches.size();
        if (cur_ratios > 0) {
          ratio = it->second->get_cache_ratio() / cur_ratios;
        }
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);

        ldout(cct, 10) << __func__ << " " << it->first
                       << " pri: " << (int) pri
                       << " round: " << round
                       << " wanted: " << cache_wants
                       << " ratio: " << it->second->get_cache_ratio()
                       << " cur_ratios: " << cur_ratios
                       << " fair_share: " << fair_share
                       << " mem_avail: " << *mem_avail
                       << dendl;

        if (cache_wants > fair_share) {
          // If we want too much, take what we can get but stick around for more
          it->second->add_cache_bytes(pri, fair_share);
          total_assigned += fair_share;
          new_ratios += it->second->get_cache_ratio();
          ++it;
        } else {
          // Otherwise assign only what we want
          if (cache_wants > 0) {
            it->second->add_cache_bytes(pri, cache_wants);
            total_assigned += cache_wants;
          }
          // Either the cache didn't want anything or got what it wanted, so
          // remove it from the tmp list.
          it = tmp_caches.erase(it);
        }
      }
      // Reset the ratios
      *mem_avail -= total_assigned;
      cur_ratios = new_ratios;
      new_ratios = 0;
      ++round;
    }

    // If this is the last priority, divide up any remaining memory based
    // solely on the ratios.
    if (pri == Priority::LAST) {
      uint64_t total_assigned = 0;
      for (auto it = caches.begin(); it != caches.end(); it++) {
        double ratio = it->second->get_cache_ratio();
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
        it->second->set_cache_bytes(Priority::LAST, fair_share);
        total_assigned += fair_share;
      }
      *mem_avail -= total_assigned;
      return;
    }
  }

  PriCache::~PriCache()
  {
  }
}