]>
Commit | Line | Data |
---|---|---|
91327a77 AA |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2018 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "PriorityCache.h" | |
eafe8130 TL |
16 | #include "common/dout.h" |
17 | #include "perfglue/heap_profiler.h" | |
18 | #define dout_context cct | |
19 | #define dout_subsys ceph_subsys_prioritycache | |
20 | #undef dout_prefix | |
21 | #define dout_prefix *_dout << "prioritycache " | |
91327a77 | 22 | |
eafe8130 TL |
23 | namespace PriorityCache |
24 | { | |
25 | int64_t get_chunk(uint64_t usage, uint64_t total_bytes) | |
26 | { | |
11fdf7f2 TL |
27 | uint64_t chunk = total_bytes; |
28 | ||
29 | // Find the nearest power of 2 | |
30 | chunk -= 1; | |
31 | chunk |= chunk >> 1; | |
32 | chunk |= chunk >> 2; | |
33 | chunk |= chunk >> 4; | |
34 | chunk |= chunk >> 8; | |
35 | chunk |= chunk >> 16; | |
36 | chunk |= chunk >> 32; | |
37 | chunk += 1; | |
38 | // shrink it to 1/256 of the rounded up cache size | |
39 | chunk /= 256; | |
40 | ||
f67539c2 | 41 | // bound the chunk size to be between 4MB and 64MB |
11fdf7f2 | 42 | chunk = (chunk > 4ul*1024*1024) ? chunk : 4ul*1024*1024; |
f67539c2 | 43 | chunk = (chunk < 64ul*1024*1024) ? chunk : 64ul*1024*1024; |
11fdf7f2 | 44 | |
f67539c2 | 45 | /* FIXME: Hardcoded to force get_chunk to never drop below 64MB. |
11fdf7f2 TL |
46 | * if RocksDB is used, it's a good idea to have N MB of headroom where |
47 | * N is the target_file_size_base value. RocksDB will read SST files | |
48 | * into the block cache during compaction which potentially can force out | |
49 | * all existing cached data. Once compaction is finished, the SST data is | |
50 | * released leaving an empty cache. Having enough headroom to absorb | |
51 | * compaction reads allows the kv cache grow even during extremely heavy | |
52 | * compaction workloads. | |
53 | */ | |
f67539c2 | 54 | uint64_t val = usage + 64*1024*1024; |
11fdf7f2 | 55 | uint64_t r = (val) % chunk; |
91327a77 | 56 | if (r > 0) |
11fdf7f2 | 57 | val = val + chunk - r; |
91327a77 AA |
58 | return val; |
59 | } | |
60 | ||
eafe8130 TL |
  // Construct a priority cache manager that tunes total cache memory between
  // [min, max] toward a process-wide mapped-memory target, and registers the
  // process-level malloc/cache perf counters.
  // NOTE: the counter name strings below must remain string literals —
  // PerfCounters is handed the raw pointers (see PerfCountersBuilder usage).
  Manager::Manager(CephContext *c,
                   uint64_t min,
                   uint64_t max,
                   uint64_t target,
                   bool reserve_extra,
                   const std::string& name) :
    cct(c),
    caches{},
    min_mem(min),
    max_mem(max),
    target_mem(target),
    tuned_mem(min),  // start at the floor; tune_memory() adjusts from here
    reserve_extra(reserve_extra),
    name(name.empty() ? "prioritycache" : name)  // default perf-counter prefix
  {
    PerfCountersBuilder b(cct, this->name, MallocStats::M_FIRST, MallocStats::M_LAST);

    // What the tuner is aiming for (target_mem).
    b.add_u64(MallocStats::M_TARGET_BYTES, "target_bytes",
              "target process memory usage in bytes", "t",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    // heap_size minus unmapped bytes, as sampled in tune_memory().
    b.add_u64(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
              "total bytes mapped by the process", "m",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
              "unmapped bytes that the kernel has yet to reclaim", "u",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    b.add_u64(MallocStats::M_HEAP_BYTES, "heap_bytes",
              "aggregate bytes in use by the heap", "h",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    // The tuned cache budget (tuned_mem) published by tune_memory().
    b.add_u64(MallocStats::M_CACHE_BYTES, "cache_bytes",
              "current memory available for caches.", "c",
              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));

    logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);

    // Seed tuned_mem and the counters with an initial sample.
    tune_memory();
  }
103 | ||
  // Tear down all per-cache perf counters first (clear()), then unregister
  // and free the process-wide counters owned by this manager.
  Manager::~Manager()
  {
    clear();
    cct->get_perfcounters_collection()->remove(logger);
    delete logger;
  }
110 | ||
111 | void Manager::tune_memory() | |
112 | { | |
113 | size_t heap_size = 0; | |
114 | size_t unmapped = 0; | |
115 | uint64_t mapped = 0; | |
116 | ||
117 | ceph_heap_release_free_memory(); | |
118 | ceph_heap_get_numeric_property("generic.heap_size", &heap_size); | |
119 | ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped); | |
120 | mapped = heap_size - unmapped; | |
121 | ||
122 | uint64_t new_size = tuned_mem; | |
123 | new_size = (new_size < max_mem) ? new_size : max_mem; | |
124 | new_size = (new_size > min_mem) ? new_size : min_mem; | |
125 | ||
126 | // Approach the min/max slowly, but bounce away quickly. | |
127 | if ((uint64_t) mapped < target_mem) { | |
128 | double ratio = 1 - ((double) mapped / target_mem); | |
129 | new_size += ratio * (max_mem - new_size); | |
130 | } else { | |
131 | double ratio = 1 - ((double) target_mem / mapped); | |
132 | new_size -= ratio * (new_size - min_mem); | |
133 | } | |
134 | ||
135 | ldout(cct, 5) << __func__ | |
136 | << " target: " << target_mem | |
137 | << " mapped: " << mapped | |
138 | << " unmapped: " << unmapped | |
139 | << " heap: " << heap_size | |
140 | << " old mem: " << tuned_mem | |
141 | << " new mem: " << new_size << dendl; | |
142 | ||
143 | tuned_mem = new_size; | |
144 | ||
145 | logger->set(MallocStats::M_TARGET_BYTES, target_mem); | |
146 | logger->set(MallocStats::M_MAPPED_BYTES, mapped); | |
147 | logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped); | |
148 | logger->set(MallocStats::M_HEAP_BYTES, heap_size); | |
149 | logger->set(MallocStats::M_CACHE_BYTES, new_size); | |
150 | } | |
151 | ||
  // Register a cache with the manager under a unique name. When
  // enable_perf_counters is set, also allocates a contiguous range of global
  // perf counter indexes for the cache's per-priority and "extra" stats.
  // NOTE: counter name strings must remain literals — PerfCounters keeps the
  // raw pointers, so generating names in a loop would dangle.
  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
                       bool enable_perf_counters)
  {
    // A cache may only be registered once.
    ceph_assert(!caches.count(name));
    ceph_assert(!indexes.count(name));

    caches.emplace(name, c);

    if (!enable_perf_counters) {
      return;
    }

    // TODO: If we ever assign more than
    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND perf counters for
    // priority caching we could run out of slots. Recycle them some day?
    // Also note that start and end are *exclusive*.
    int start = cur_index++;
    int end = cur_index + Extra::E_LAST + 1;

    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
    // One slot per priority level plus the Extra entries.
    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));

    // Per-cache counters live under "<manager name>:<cache name>".
    PerfCountersBuilder b(cct, this->name + ":" + name, start, end);

    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
              "bytes allocated to pri0", "p0",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
              "bytes allocated to pri1", "p1",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
              "bytes allocated to pri2", "p2",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
              "bytes allocated to pri3", "p3",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
              "bytes allocated to pri4", "p4",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
              "bytes allocated to pri5", "p5",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
              "bytes allocated to pri6", "p6",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
              "bytes allocated to pri7", "p7",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
              "bytes allocated to pri8", "p8",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
              "bytes allocated to pri9", "p9",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
              "bytes allocated to pri10", "p10",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
              "bytes allocated to pri11", "p11",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
              "bytes reserved for future growth.", "r",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
              "total bytes committed,", "c",
              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

    // Remember which global counter index backs each priority/extra slot so
    // balance() can publish values without recomputing offsets.
    for (int i = 0; i < Extra::E_LAST+1; i++) {
      indexes[name][i] = cur_index + i;
    }

    auto l = b.create_perf_counters();
    loggers.emplace(name, l);
    cct->get_perfcounters_collection()->add(l);

    // The next insert() starts allocating indexes after this cache's range.
    cur_index = end;
  }
242 | ||
243 | void Manager::erase(const std::string& name) | |
244 | { | |
245 | auto li = loggers.find(name); | |
246 | if (li != loggers.end()) { | |
247 | cct->get_perfcounters_collection()->remove(li->second); | |
248 | delete li->second; | |
249 | loggers.erase(li); | |
250 | } | |
251 | indexes.erase(name); | |
252 | caches.erase(name); | |
253 | } | |
254 | ||
255 | void Manager::clear() | |
256 | { | |
257 | auto li = loggers.begin(); | |
258 | while (li != loggers.end()) { | |
259 | cct->get_perfcounters_collection()->remove(li->second); | |
260 | delete li->second; | |
261 | li = loggers.erase(li); | |
262 | } | |
263 | indexes.clear(); | |
264 | caches.clear(); | |
265 | } | |
266 | ||
267 | void Manager::balance() | |
268 | { | |
269 | int64_t mem_avail = tuned_mem; | |
270 | // Each cache is going to get a little extra from get_chunk, so shrink the | |
271 | // available memory here to compensate. | |
272 | if (reserve_extra) { | |
273 | mem_avail -= get_chunk(1, tuned_mem) * caches.size(); | |
274 | } | |
275 | ||
276 | if (mem_avail < 0) { | |
277 | // There's so little memory available that just assigning a chunk per | |
278 | // cache pushes us over the limit. Set mem_avail to 0 and continue to | |
279 | // ensure each priority's byte counts are zeroed in balance_priority. | |
280 | mem_avail = 0; | |
281 | } | |
282 | ||
283 | // Assign memory for each priority level | |
284 | for (int i = 0; i < Priority::LAST+1; i++) { | |
285 | ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl; | |
286 | ||
287 | auto pri = static_cast<Priority>(i); | |
288 | balance_priority(&mem_avail, pri); | |
289 | ||
290 | // Update the per-priority perf counters | |
291 | for (auto &l : loggers) { | |
292 | auto it = caches.find(l.first); | |
293 | ceph_assert(it != caches.end()); | |
294 | ||
295 | auto bytes = it->second->get_cache_bytes(pri); | |
296 | l.second->set(indexes[it->first][pri], bytes); | |
297 | } | |
298 | } | |
299 | // assert if we assigned more memory than is available. | |
300 | ceph_assert(mem_avail >= 0); | |
301 | ||
302 | for (auto &l : loggers) { | |
303 | auto it = caches.find(l.first); | |
304 | ceph_assert(it != caches.end()); | |
305 | ||
306 | // Commit the new cache size | |
307 | int64_t committed = it->second->commit_cache_size(tuned_mem); | |
eafe8130 TL |
308 | // Update the perf counters |
309 | int64_t alloc = it->second->get_cache_bytes(); | |
310 | ||
311 | l.second->set(indexes[it->first][Extra::E_RESERVED], committed - alloc); | |
312 | l.second->set(indexes[it->first][Extra::E_COMMITTED], committed); | |
313 | } | |
314 | } | |
315 | ||
20effc67 TL |
316 | void Manager::shift_bins() |
317 | { | |
318 | for (auto &l : loggers) { | |
319 | auto it = caches.find(l.first); | |
320 | it->second->shift_bins(); | |
321 | } | |
322 | } | |
323 | ||
eafe8130 TL |
  // Distribute *mem_avail across all caches for a single priority level.
  // Caches are served round-robin: each round a cache gets at most its
  // ratio-weighted fair share; caches that got everything they asked for
  // drop out, and the remainder is re-split among those still hungry.
  // On return *mem_avail has been reduced by the total bytes assigned.
  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
  {
    // Work on a copy so satisfied caches can be dropped from the round-robin
    // while the full set is still available for zeroing and the LAST pass.
    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
    double cur_ratios = 0;
    double new_ratios = 0;
    uint64_t round = 0;

    // First, zero this priority's bytes, sum the initial ratios.
    for (auto it = caches.begin(); it != caches.end(); it++) {
      it->second->set_cache_bytes(pri, 0);
      cur_ratios += it->second->get_cache_ratio();
    }

    // For other priorities, loop until caches are satisified or we run out of
    // memory (stop if we can't guarantee a full byte allocation).
    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
      uint64_t total_assigned = 0;
      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
        // Usually the ratio should be set to the fraction of the current caches'
        // assigned ratio compared to the total ratio of all caches that still
        // want memory. There is a special case where the only caches left are
        // all assigned 0% ratios but still want memory. In that case, give
        // them an equal shot at the remaining memory for this priority.
        double ratio = 1.0 / tmp_caches.size();
        if (cur_ratios > 0) {
          ratio = it->second->get_cache_ratio() / cur_ratios;
        }
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);

        ldout(cct, 10) << __func__ << " " << it->first
                       << " pri: " << (int) pri
                       << " round: " << round
                       << " wanted: " << cache_wants
                       << " ratio: " << it->second->get_cache_ratio()
                       << " cur_ratios: " << cur_ratios
                       << " fair_share: " << fair_share
                       << " mem_avail: " << *mem_avail
                       << dendl;

        if (cache_wants > fair_share) {
          // If we want too much, take what we can get but stick around for more
          it->second->add_cache_bytes(pri, fair_share);
          total_assigned += fair_share;
          new_ratios += it->second->get_cache_ratio();
          ++it;
        } else {
          // Otherwise assign only what we want
          if (cache_wants > 0) {
            it->second->add_cache_bytes(pri, cache_wants);
            total_assigned += cache_wants;
          }
          // Either the cache didn't want anything or got what it wanted, so
          // remove it from the tmp list. (unordered_map::erase returns the
          // next valid iterator, keeping the traversal safe.)
          it = tmp_caches.erase(it);
        }
      }
      // Reset the ratios: next round weights only the still-hungry caches.
      *mem_avail -= total_assigned;
      cur_ratios = new_ratios;
      new_ratios = 0;
      ++round;
    }

    // If this is the last priority, divide up any remaining memory based
    // solely on the ratios (caches get it whether they asked or not).
    if (pri == Priority::LAST) {
      uint64_t total_assigned = 0;
      for (auto it = caches.begin(); it != caches.end(); it++) {
        double ratio = it->second->get_cache_ratio();
        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
        it->second->set_cache_bytes(Priority::LAST, fair_share);
        total_assigned += fair_share;
      }
      *mem_avail -= total_assigned;
      return;
    }
  }
402 | ||
  // NOTE(review): empty out-of-line destructor — presumably kept here to
  // anchor the PriCache vtable in this translation unit; confirm against the
  // class declaration in PriorityCache.h.
  PriCache::~PriCache()
  {
  }
406 | } |