1 // Copyright (c) 2018-Present Red Hat Inc. All rights reserved.
3 // Copyright (c) 2011-2018, Facebook, Inc. All rights reserved.
4 // This source code is licensed under both the GPLv2 and Apache 2.0 License
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 #ifndef ROCKSDB_BINNED_LRU_CACHE
11 #define ROCKSDB_BINNED_LRU_CACHE
15 #include <boost/circular_buffer.hpp>
17 #include "ShardedCache.h"
18 #include "common/autovector.h"
19 #include "common/dout.h"
20 #include "include/ceph_assert.h"
21 #include "common/ceph_context.h"
23 namespace rocksdb_cache
{
25 // LRU cache implementation
27 // An entry is a variable length heap-allocated structure.
28 // Entries are referenced by cache and/or by any external entity.
29 // The cache keeps all its entries in table. Some elements
30 // are also stored on LRU list.
32 // BinnedLRUHandle can be in these states:
33 // 1. Referenced externally AND in hash table.
34 // In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
35 // 2. Not referenced externally and in hash table. In that case the entry is
36 // in the LRU and can be freed. (refs == 1 && in_cache == true)
37 // 3. Referenced externally and not in hash table. In that case the entry is
38 // not on LRU and not in table. (refs >= 1 && in_cache == false)
40 // All newly created BinnedLRUHandles are in state 1. If you call
41 // BinnedLRUCacheShard::Release
42 // on entry in state 1, it will go into state 2. To move from state 1 to
43 // state 3, either call BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the
45 // To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
46 // Before destruction, make sure that no handles are in state 1. This means
47 // that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert have a
49 // matching BinnedLRUCache::Release (to move into state 2) or BinnedLRUCacheShard::Erase (for state 3)
// Factory: build a sharded, age-binned LRU cache and return it through the
// generic rocksdb::Cache interface.
//   num_shard_bits        log2 of the number of shards (-1 = pick a default).
//   strict_capacity_limit if true, reject inserts rather than exceed capacity.
//   high_pri_pool_ratio   fraction of capacity reserved for high-pri entries.
// NOTE(review): the embedded numbering jumps 51 -> 54, so the leading
// parameters (presumably a CephContext* and the capacity, matching the
// BinnedLRUCache constructor below) appear to have been dropped by
// extraction -- confirm against the upstream header.
51 std::shared_ptr
<rocksdb::Cache
> NewBinnedLRUCache(
54 int num_shard_bits
= -1,
55 bool strict_capacity_limit
= false,
56 double high_pri_pool_ratio
= 0.0);
// A single cache entry. Entries live on the heap and are linked both into
// a shard's hash table (via next_hash) and into the shard's LRU list (via
// next/prev). See the state diagram in the comment block above.
// NOTE(review): gaps in the embedded numbering (60-61, 66, 69, 74-75, 77,
// 79, 83, 85, 87-89, 96-102, 105-110, 113-118, 120-121, 123, 125-130) mean
// several member declarations (e.g. `value`, `deleter`, `key_length`,
// `flags` -- all referenced by the methods below), the bodies of the Set*
// mutators, the Free() signature, and the closing brace appear to have been
// dropped by extraction -- confirm against the upstream header.
58 struct BinnedLRUHandle
{
// Shared byte counter for age binning; presumably the same counters held
// in BinnedLRUCacheShard::age_bins (types match) -- confirm in the .cc.
59 std::shared_ptr
<uint64_t> age_bin
;
// Next entry in the same hash-table bucket (collision chain).
62 BinnedLRUHandle
* next_hash
;
// Doubly-linked LRU list links.
63 BinnedLRUHandle
* next
;
64 BinnedLRUHandle
* prev
;
65 size_t charge
; // TODO(opt): Only allow uint32_t?
67 uint32_t refs
; // a number of refs to this entry
68 // cache itself is counted as 1
70 // Include the following flags:
71 // in_cache: whether this entry is referenced by the hash table.
72 // is_high_pri: whether this entry is high priority entry.
73 // in_high_pri_pool: whether this entry is in high-pri pool.
76 uint32_t hash
; // Hash of key(); used for fast sharding and comparisons
78 char* key_data
= nullptr; // Beginning of key
// Return this entry's key as a Slice. The branch selecting between the two
// returns below is missing from this extraction.
80 rocksdb::Slice
key() const {
81 // For cheaper lookups, we allow a temporary Handle object
82 // to store a pointer to a key in "value".
84 return *(reinterpret_cast<rocksdb::Slice
*>(value
));
// Normal case: the key bytes are stored at key_data with key_length bytes.
86 return rocksdb::Slice(key_data
, key_length
);
// Flag accessors: bit 0 = in_cache, bit 1 = is_high_pri,
// bit 2 = in_high_pri_pool, bit 3 = has_hit (see SetHit below).
90 bool InCache() { return flags
& 1; }
91 bool IsHighPri() { return flags
& 2; }
92 bool InHighPriPool() { return flags
& 4; }
93 bool HasHit() { return flags
& 8; }
// Flag mutators -- bodies truncated by extraction.
95 void SetInCache(bool in_cache
) {
103 void SetPriority(rocksdb::Cache::Priority priority
) {
104 if (priority
== rocksdb::Cache::Priority::HIGH
) {
111 void SetInHighPriPool(bool in_high_pri_pool
) {
112 if (in_high_pri_pool
) {
119 void SetHit() { flags
|= 8; }
// Free(): assert the ref-count/in-cache invariant (entry must be either the
// cache's sole reference or fully unreferenced), then run the user-supplied
// deleter on (key, value). The signature line is missing here.
122 ceph_assert((refs
== 1 && InCache()) || (refs
== 0 && !InCache()));
124 (*deleter
)(key(), value
);
131 // We provide our own simple hash table since it removes a whole bunch
132 // of porting hacks and is also faster than some of the built-in hash
133 // table implementations in some of the compiler/runtime combinations
134 // we have tested. E.g., readrandom speeds up by ~5% over the g++
135 // 4.4.3's builtin hashtable.
// Hand-rolled bucketed hash table mapping (key, hash) -> BinnedLRUHandle*,
// kept for the portability/performance reasons stated in the comment above.
// NOTE(review): gaps in the embedded numbering (137, 140, 144, 152-158,
// 163-165, 169-172) mean the access specifiers, the apply-loop body, the
// Resize()/length_/elems_ declarations and the closing brace appear to have
// been dropped by extraction -- confirm against the upstream header.
136 class BinnedLRUHandleTable
{
138 BinnedLRUHandleTable();
139 ~BinnedLRUHandleTable();
// Find the entry matching key/hash (presumably nullptr when absent --
// confirm in the .cc).
141 BinnedLRUHandle
* Lookup(const rocksdb::Slice
& key
, uint32_t hash
);
// Insert h; presumably returns the displaced entry with the same key, if
// any -- confirm in the .cc.
142 BinnedLRUHandle
* Insert(BinnedLRUHandle
* h
);
// Unlink and return the entry matching key/hash.
143 BinnedLRUHandle
* Remove(const rocksdb::Slice
& key
, uint32_t hash
);
// Invoke func on every handle in every bucket. Each handle's next_hash is
// saved before the call (auto n = h->next_hash), and every visited handle
// is asserted to be InCache().
145 template <typename T
>
146 void ApplyToAllCacheEntries(T func
) {
147 for (uint32_t i
= 0; i
< length_
; i
++) {
148 BinnedLRUHandle
* h
= list_
[i
];
149 while (h
!= nullptr) {
150 auto n
= h
->next_hash
;
151 ceph_assert(h
->InCache());
159 // Return a pointer to slot that points to a cache entry that
160 // matches key/hash. If there is no such cache entry, return a
161 // pointer to the trailing slot in the corresponding linked list.
162 BinnedLRUHandle
** FindPointer(const rocksdb::Slice
& key
, uint32_t hash
);
166 // The table consists of an array of buckets where each bucket is
167 // a linked list of cache entries that hash into the bucket.
168 BinnedLRUHandle
** list_
;
173 // A single shard of sharded cache.
// One shard of the binned LRU cache: a hash table plus an LRU list split
// into a high-priority pool and a low-priority pool, protected by mutex_.
// alignas(CACHE_LINE_SIZE) keeps adjacent shards on distinct cache lines
// (see the false-sharing comment among the data members below).
// NOTE(review): the embedded numbering has many gaps (175, 179, 184, 187,
// 190, 193-194, 202, 206, 209, 212-213, 216, 218, 220, 222, 224, 226, 228,
// 231, 234-238, 240-241, 243, 246-248, 251, 255, 259, 265, 267-268, 271,
// 274, 277, 281, 286, 289, 293, 297, 302, 304-305, 307-308, 313, 316-317),
// so access specifiers, several members (e.g. the usage_ / lru_usage_
// counters the comments at 303/306 describe, and likely a CephContext*)
// and the closing brace appear to have been dropped by extraction --
// confirm against the upstream header.
174 class alignas(CACHE_LINE_SIZE
) BinnedLRUCacheShard
: public CacheShard
{
176 BinnedLRUCacheShard(CephContext
*c
, size_t capacity
, bool strict_capacity_limit
,
177 double high_pri_pool_ratio
);
178 virtual ~BinnedLRUCacheShard();
180 // Separate from constructor so caller can easily make an array of BinnedLRUCache
181 // if current usage is more than new capacity, the function will attempt to
182 // free the needed space
183 virtual void SetCapacity(size_t capacity
) override
;
185 // Set the flag to reject insertion if cache if full.
186 virtual void SetStrictCapacityLimit(bool strict_capacity_limit
) override
;
188 // Set percentage of capacity reserved for high-pri cache entries.
189 void SetHighPriPoolRatio(double high_pri_pool_ratio
);
191 // Like Cache methods, but with an extra "hash" parameter.
192 virtual rocksdb::Status
Insert(const rocksdb::Slice
& key
, uint32_t hash
, void* value
,
195 rocksdb::Cache::Handle
** handle
,
196 rocksdb::Cache::Priority priority
) override
;
197 virtual rocksdb::Cache::Handle
* Lookup(const rocksdb::Slice
& key
, uint32_t hash
) override
;
198 virtual bool Ref(rocksdb::Cache::Handle
* handle
) override
;
// Release a handle; with force_erase the entry is presumably erased even
// if still cached -- confirm semantics in the .cc.
199 virtual bool Release(rocksdb::Cache::Handle
* handle
,
200 bool force_erase
= false) override
;
201 virtual void Erase(const rocksdb::Slice
& key
, uint32_t hash
) override
;
203 // Although in some platforms the update of size_t is atomic, to make sure
204 // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
205 // protect them with mutex_.
207 virtual size_t GetUsage() const override
;
208 virtual size_t GetPinnedUsage() const override
;
// Invoke callback on every cached entry; thread_safe selects whether the
// traversal takes mutex_ -- confirm in the .cc. (Callback parameter list
// is partially missing from this extraction.)
210 virtual void ApplyToAllCacheEntries(
211 const std::function
<void(const rocksdb::Slice
& key
,
214 DeleterFn
)>& callback
,
215 bool thread_safe
) override
;
217 virtual void EraseUnRefEntries() override
;
219 virtual std::string
GetPrintableOptions() const override
;
221 virtual DeleterFn
GetDeleter(rocksdb::Cache::Handle
* handle
) const override
;
// Test hook: expose the LRU list head and the low-pri pool pointer.
223 void TEST_GetLRUList(BinnedLRUHandle
** lru
, BinnedLRUHandle
** lru_low_pri
);
225 // Retrieves number of elements in LRU, for unit test purpose only
227 size_t TEST_GetLRUSize();
229 // Retrieves high pri pool ratio
230 double GetHighPriPoolRatio() const;
232 // Retrieves high pri pool usage
233 size_t GetHighPriPoolUsage() const;
// Number of age bins in the age_bins ring below.
239 uint32_t get_bin_count() const;
242 void set_bin_count(uint32_t count
);
244 // Get the byte counts for a range of age bins
245 uint64_t sum_bins(uint32_t start
, uint32_t end
) const;
// Unlink / link an entry on the LRU list (private helpers; callers hold
// mutex_ per the comments on EvictFromLRU below).
249 void LRU_Remove(BinnedLRUHandle
* e
);
250 void LRU_Insert(BinnedLRUHandle
* e
);
252 // Overflow the last entry in high-pri pool to low-pri pool until size of
253 // high-pri pool is no larger than the size specify by high_pri_pool_pct.
254 void MaintainPoolSize();
256 // Just reduce the reference count by 1.
257 // Return true if last reference
258 bool Unref(BinnedLRUHandle
* e
);
260 // Free some space following strict LRU policy until enough space
261 // to hold (usage_ + charge) is freed or the lru list is empty
262 // This function is not thread safe - it needs to be executed while
263 // holding the mutex_
264 void EvictFromLRU(size_t charge
, ceph::autovector
<BinnedLRUHandle
*>* deleted
);
266 // Initialized before use.
269 // Memory size for entries in high-pri pool.
270 size_t high_pri_pool_usage_
;
272 // Whether to reject insertion if cache reaches its full capacity.
273 bool strict_capacity_limit_
;
275 // Ratio of capacity reserved for high priority cache entries.
276 double high_pri_pool_ratio_
;
278 // High-pri pool size, equals to capacity * high_pri_pool_ratio.
279 // Remember the value to avoid recomputing each time.
280 double high_pri_pool_capacity_
;
282 // Dummy head of LRU list.
283 // lru.prev is newest entry, lru.next is oldest entry.
284 // LRU contains items which can be evicted, ie reference only by cache
285 BinnedLRUHandle lru_
;
287 // Pointer to head of low-pri pool in LRU list.
288 BinnedLRUHandle
* lru_low_pri_
;
290 // ------------^^^^^^^^^^^^^-----------
291 // Not frequently modified data members
292 // ------------------------------------
294 // We separate data members that are updated frequently from the ones that
295 // are not frequently updated so that they don't share the same cache line
296 // which will lead into false cache sharing
298 // ------------------------------------
299 // Frequently modified data members
300 // ------------vvvvvvvvvvvvv-----------
301 BinnedLRUHandleTable table_
;
303 // Memory size for entries residing in the cache
// NOTE(review): the member this comment describes (usage_?) is missing
// from this extraction -- confirm against the upstream header.
306 // Memory size for entries residing only in the LRU list
// NOTE(review): the member this comment describes (lru_usage_?) is also
// missing from this extraction.
309 // mutex_ protects the following state.
310 // We don't count mutex_ as the cache's internal state so semantically we
311 // don't mind mutex_ invoking the non-const actions.
312 mutable std::mutex mutex_
;
314 // Circular buffer of byte counters for age binning
315 boost::circular_buffer
<std::shared_ptr
<uint64_t>> age_bins
;
// The sharded cache facade: owns an array of BinnedLRUCacheShard (shards_)
// and routes the ShardedCache virtual interface to them. Also implements
// the Ceph PriorityCache hooks (request_cache_bytes / commit_cache_size /
// shift_bins) used for priority-based cache sizing.
// NOTE(review): gaps in the embedded numbering (319, 332, 341-342, 348,
// 353, 356-359, 361-363, 365) mean access specifiers, the `#endif` that
// should match the `#if` at embedded line 330, the closing braces of
// get_committed_size()/get_cache_name(), further members and the class's
// closing brace appear to have been dropped by extraction -- confirm
// against the upstream header.
318 class BinnedLRUCache
: public ShardedCache
{
320 BinnedLRUCache(CephContext
*c
, size_t capacity
, int num_shard_bits
,
321 bool strict_capacity_limit
, double high_pri_pool_ratio
);
322 virtual ~BinnedLRUCache();
323 virtual const char* Name() const override
{ return "BinnedLRUCache"; }
324 virtual CacheShard
* GetShard(int shard
) override
;
325 virtual const CacheShard
* GetShard(int shard
) const override
;
326 virtual void* Value(Handle
* handle
) override
;
327 virtual size_t GetCharge(Handle
* handle
) const override
;
328 virtual uint32_t GetHash(Handle
* handle
) const override
;
329 virtual void DisownData() override
;
// GetDeleter is only part of the rocksdb::Cache interface from RocksDB
// 6.22 onward, hence the version guard.
330 #if (ROCKSDB_MAJOR >= 7 || (ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR >= 22))
331 virtual DeleterFn
GetDeleter(Handle
* handle
) const override
;
333 // Retrieves number of elements in LRU, for unit test purpose only
334 size_t TEST_GetLRUSize();
335 // Sets the high pri pool ratio
336 void SetHighPriPoolRatio(double high_pri_pool_ratio
);
337 // Retrieves high pri pool ratio
338 double GetHighPriPoolRatio() const;
339 // Retrieves high pri pool usage
340 size_t GetHighPriPoolUsage() const;
// PriorityCache hooks: how many bytes this cache wants for the given
// priority, and commit the agreed total.
343 virtual int64_t request_cache_bytes(
344 PriorityCache::Priority pri
, uint64_t total_cache
) const;
345 virtual int64_t commit_cache_size(uint64_t total_cache
);
// The committed size is simply the current capacity.
346 virtual int64_t get_committed_size() const {
347 return GetCapacity();
// Rotate the age bins (aggregated across shards -- confirm in the .cc).
349 virtual void shift_bins();
350 uint64_t sum_bins(uint32_t start
, uint32_t end
) const;
351 uint32_t get_bin_count() const;
352 void set_bin_count(uint32_t count
);
354 virtual std::string
get_cache_name() const {
355 return "RocksDB Binned LRU Cache";
// Array of shards; entries are mapped to a shard by key hash.
360 BinnedLRUCacheShard
* shards_
;
364 } // namespace rocksdb_cache
366 #endif // ROCKSDB_BINNED_LRU_CACHE