// ceph/src/kv/rocksdb_cache/BinnedLRUCache.h (ceph quincy 17.2.1)
// Copyright (c) 2018-Present Red Hat Inc. All rights reserved.
//
// Copyright (c) 2011-2018, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 and Apache 2.0 License
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_BINNED_LRU_CACHE
#define ROCKSDB_BINNED_LRU_CACHE

#include <mutex>
#include <string>

#include <boost/circular_buffer.hpp>

#include "ShardedCache.h"
#include "common/autovector.h"
#include "common/ceph_context.h"
#include "common/dout.h"
#include "include/ceph_assert.h"

namespace rocksdb_cache {

// LRU cache implementation

// An entry is a variable length heap-allocated structure.
// Entries are referenced by cache and/or by any external entity.
// The cache keeps all its entries in a hash table. Some elements
// are also stored on an LRU list.
//
// BinnedLRUHandle can be in these states:
// 1. Referenced externally AND in hash table.
//    In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in hash table. In that case the entry is
//    in the LRU and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in hash table. In that case the entry is
//    not on the LRU and not in the table. (refs >= 1 && in_cache == false)
//
// All newly created BinnedLRUHandles are in state 1. If you call
// BinnedLRUCacheShard::Release on an entry in state 1, it will go into
// state 2. To move from state 1 to state 3, either call
// BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the
// same key.
// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert
// have a matching BinnedLRUCacheShard::Release (to move into state 2) or
// BinnedLRUCacheShard::Erase (for state 3).

51 std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
52 CephContext *c,
53 size_t capacity,
54 int num_shard_bits = -1,
55 bool strict_capacity_limit = false,
56 double high_pri_pool_ratio = 0.0);
57
58 struct BinnedLRUHandle {
59 std::shared_ptr<uint64_t> age_bin;
60 void* value;
61 DeleterFn deleter;
62 BinnedLRUHandle* next_hash;
63 BinnedLRUHandle* next;
64 BinnedLRUHandle* prev;
65 size_t charge; // TODO(opt): Only allow uint32_t?
66 size_t key_length;
67 uint32_t refs; // a number of refs to this entry
68 // cache itself is counted as 1
69
70 // Include the following flags:
71 // in_cache: whether this entry is referenced by the hash table.
72 // is_high_pri: whether this entry is high priority entry.
73 // in_high_pri_pool: whether this entry is in high-pri pool.
74 char flags;
75
76 uint32_t hash; // Hash of key(); used for fast sharding and comparisons
77
78 char* key_data = nullptr; // Beginning of key
79
80 rocksdb::Slice key() const {
81 // For cheaper lookups, we allow a temporary Handle object
82 // to store a pointer to a key in "value".
83 if (next == this) {
84 return *(reinterpret_cast<rocksdb::Slice*>(value));
85 } else {
86 return rocksdb::Slice(key_data, key_length);
87 }
88 }
89
90 bool InCache() { return flags & 1; }
91 bool IsHighPri() { return flags & 2; }
92 bool InHighPriPool() { return flags & 4; }
93 bool HasHit() { return flags & 8; }
94
95 void SetInCache(bool in_cache) {
96 if (in_cache) {
97 flags |= 1;
98 } else {
99 flags &= ~1;
100 }
101 }
102
103 void SetPriority(rocksdb::Cache::Priority priority) {
104 if (priority == rocksdb::Cache::Priority::HIGH) {
105 flags |= 2;
106 } else {
107 flags &= ~2;
108 }
109 }
110
111 void SetInHighPriPool(bool in_high_pri_pool) {
112 if (in_high_pri_pool) {
113 flags |= 4;
114 } else {
115 flags &= ~4;
116 }
117 }
118
119 void SetHit() { flags |= 8; }
120
121 void Free() {
122 ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
123 if (deleter) {
124 (*deleter)(key(), value);
125 }
126 delete[] key_data;
127 delete this;
128 }
129 };
130
131 // We provide our own simple hash table since it removes a whole bunch
132 // of porting hacks and is also faster than some of the built-in hash
133 // table implementations in some of the compiler/runtime combinations
134 // we have tested. E.g., readrandom speeds up by ~5% over the g++
135 // 4.4.3's builtin hashtable.
136 class BinnedLRUHandleTable {
137 public:
138 BinnedLRUHandleTable();
139 ~BinnedLRUHandleTable();
140
141 BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash);
142 BinnedLRUHandle* Insert(BinnedLRUHandle* h);
143 BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash);
144
145 template <typename T>
146 void ApplyToAllCacheEntries(T func) {
147 for (uint32_t i = 0; i < length_; i++) {
148 BinnedLRUHandle* h = list_[i];
149 while (h != nullptr) {
150 auto n = h->next_hash;
151 ceph_assert(h->InCache());
152 func(h);
153 h = n;
154 }
155 }
156 }
157
158 private:
159 // Return a pointer to slot that points to a cache entry that
160 // matches key/hash. If there is no such cache entry, return a
161 // pointer to the trailing slot in the corresponding linked list.
162 BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash);
163
164 void Resize();
165
166 // The table consists of an array of buckets where each bucket is
167 // a linked list of cache entries that hash into the bucket.
168 BinnedLRUHandle** list_;
169 uint32_t length_;
170 uint32_t elems_;
171 };
172
173 // A single shard of sharded cache.
174 class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
175 public:
176 BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
177 double high_pri_pool_ratio);
178 virtual ~BinnedLRUCacheShard();
179
180 // Separate from constructor so caller can easily make an array of BinnedLRUCache
181 // if current usage is more than new capacity, the function will attempt to
182 // free the needed space
183 virtual void SetCapacity(size_t capacity) override;
184
185 // Set the flag to reject insertion if cache if full.
186 virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
187
188 // Set percentage of capacity reserved for high-pri cache entries.
189 void SetHighPriPoolRatio(double high_pri_pool_ratio);
190
191 // Like Cache methods, but with an extra "hash" parameter.
192 virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
193 size_t charge,
194 DeleterFn deleter,
195 rocksdb::Cache::Handle** handle,
196 rocksdb::Cache::Priority priority) override;
197 virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override;
198 virtual bool Ref(rocksdb::Cache::Handle* handle) override;
199 virtual bool Release(rocksdb::Cache::Handle* handle,
200 bool force_erase = false) override;
201 virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override;
202
203 // Although in some platforms the update of size_t is atomic, to make sure
204 // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
205 // protect them with mutex_.
206
207 virtual size_t GetUsage() const override;
208 virtual size_t GetPinnedUsage() const override;
209
210 virtual void ApplyToAllCacheEntries(
211 const std::function<void(const rocksdb::Slice& key,
212 void* value,
213 size_t charge,
214 DeleterFn)>& callback,
215 bool thread_safe) override;
216
217 virtual void EraseUnRefEntries() override;
218
219 virtual std::string GetPrintableOptions() const override;
220
221 virtual DeleterFn GetDeleter(rocksdb::Cache::Handle* handle) const override;
222
223 void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri);
224
225 // Retrieves number of elements in LRU, for unit test purpose only
226 // not threadsafe
227 size_t TEST_GetLRUSize();
228
229 // Retrieves high pri pool ratio
230 double GetHighPriPoolRatio() const;
231
232 // Retrieves high pri pool usage
233 size_t GetHighPriPoolUsage() const;
234
235 // Rotate the bins
236 void shift_bins();
237
238 // Get the bin count
239 uint32_t get_bin_count() const;
240
241 // Set the bin count
242 void set_bin_count(uint32_t count);
243
244 // Get the byte counts for a range of age bins
245 uint64_t sum_bins(uint32_t start, uint32_t end) const;
246
247 private:
248 CephContext *cct;
249 void LRU_Remove(BinnedLRUHandle* e);
250 void LRU_Insert(BinnedLRUHandle* e);
251
252 // Overflow the last entry in high-pri pool to low-pri pool until size of
253 // high-pri pool is no larger than the size specify by high_pri_pool_pct.
254 void MaintainPoolSize();
255
256 // Just reduce the reference count by 1.
257 // Return true if last reference
258 bool Unref(BinnedLRUHandle* e);
259
260 // Free some space following strict LRU policy until enough space
261 // to hold (usage_ + charge) is freed or the lru list is empty
262 // This function is not thread safe - it needs to be executed while
263 // holding the mutex_
264 void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted);
265
266 // Initialized before use.
267 size_t capacity_;
268
269 // Memory size for entries in high-pri pool.
270 size_t high_pri_pool_usage_;
271
272 // Whether to reject insertion if cache reaches its full capacity.
273 bool strict_capacity_limit_;
274
275 // Ratio of capacity reserved for high priority cache entries.
276 double high_pri_pool_ratio_;
277
278 // High-pri pool size, equals to capacity * high_pri_pool_ratio.
279 // Remember the value to avoid recomputing each time.
280 double high_pri_pool_capacity_;
281
282 // Dummy head of LRU list.
283 // lru.prev is newest entry, lru.next is oldest entry.
284 // LRU contains items which can be evicted, ie reference only by cache
285 BinnedLRUHandle lru_;
286
287 // Pointer to head of low-pri pool in LRU list.
288 BinnedLRUHandle* lru_low_pri_;
289
290 // ------------^^^^^^^^^^^^^-----------
291 // Not frequently modified data members
292 // ------------------------------------
293 //
294 // We separate data members that are updated frequently from the ones that
295 // are not frequently updated so that they don't share the same cache line
296 // which will lead into false cache sharing
297 //
298 // ------------------------------------
299 // Frequently modified data members
300 // ------------vvvvvvvvvvvvv-----------
301 BinnedLRUHandleTable table_;
302
303 // Memory size for entries residing in the cache
304 size_t usage_;
305
306 // Memory size for entries residing only in the LRU list
307 size_t lru_usage_;
308
309 // mutex_ protects the following state.
310 // We don't count mutex_ as the cache's internal state so semantically we
311 // don't mind mutex_ invoking the non-const actions.
312 mutable std::mutex mutex_;
313
314 // Circular buffer of byte counters for age binning
315 boost::circular_buffer<std::shared_ptr<uint64_t>> age_bins;
316 };
317
318 class BinnedLRUCache : public ShardedCache {
319 public:
320 BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits,
321 bool strict_capacity_limit, double high_pri_pool_ratio);
322 virtual ~BinnedLRUCache();
323 virtual const char* Name() const override { return "BinnedLRUCache"; }
324 virtual CacheShard* GetShard(int shard) override;
325 virtual const CacheShard* GetShard(int shard) const override;
326 virtual void* Value(Handle* handle) override;
327 virtual size_t GetCharge(Handle* handle) const override;
328 virtual uint32_t GetHash(Handle* handle) const override;
329 virtual void DisownData() override;
330 #if (ROCKSDB_MAJOR >= 7 || (ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR >= 22))
331 virtual DeleterFn GetDeleter(Handle* handle) const override;
332 #endif
333 // Retrieves number of elements in LRU, for unit test purpose only
334 size_t TEST_GetLRUSize();
335 // Sets the high pri pool ratio
336 void SetHighPriPoolRatio(double high_pri_pool_ratio);
337 // Retrieves high pri pool ratio
338 double GetHighPriPoolRatio() const;
339 // Retrieves high pri pool usage
340 size_t GetHighPriPoolUsage() const;
341
342 // PriorityCache
343 virtual int64_t request_cache_bytes(
344 PriorityCache::Priority pri, uint64_t total_cache) const;
345 virtual int64_t commit_cache_size(uint64_t total_cache);
346 virtual int64_t get_committed_size() const {
347 return GetCapacity();
348 }
349 virtual void shift_bins();
350 uint64_t sum_bins(uint32_t start, uint32_t end) const;
351 uint32_t get_bin_count() const;
352 void set_bin_count(uint32_t count);
353
354 virtual std::string get_cache_name() const {
355 return "RocksDB Binned LRU Cache";
356 }
357
358 private:
359 CephContext *cct;
360 BinnedLRUCacheShard* shards_;
361 int num_shards_ = 0;
362 };
363
}  // namespace rocksdb_cache

#endif  // ROCKSDB_BINNED_LRU_CACHE