// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
//
// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 and Apache 2.0 License
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef ROCKSDB_BINNED_LRU_CACHE
#define ROCKSDB_BINNED_LRU_CACHE

#include <string>
#include <mutex>

#include "ShardedCache.h"
#include "common/autovector.h"
#include "common/dout.h"
#include "include/ceph_assert.h"
#include "common/ceph_context.h"
namespace rocksdb_cache {

// LRU cache implementation

// An entry is a variable length heap-allocated structure.
// Entries are referenced by cache and/or by any external entity.
// The cache keeps all its entries in table. Some elements
// are also stored on LRU list.
//
// BinnedLRUHandle can be in these states:
// 1. Referenced externally AND in hash table.
//    In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in hash table. In that case the entry is
//    in the LRU and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in hash table. In that case the entry is
//    not on the LRU and not in the table. (refs >= 1 && in_cache == false)
//
// All newly created BinnedLRUHandles are in state 1. If you call
// BinnedLRUCacheShard::Release on an entry in state 1, it will go into
// state 2. To move from state 1 to state 3, either call
// BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the
// same key.
// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert
// have a matching BinnedLRUCacheShard::Release (to move into state 2) or
// BinnedLRUCacheShard::Erase (for state 3).
50 std::shared_ptr
<rocksdb::Cache
> NewBinnedLRUCache(
53 int num_shard_bits
= -1,
54 bool strict_capacity_limit
= false,
55 double high_pri_pool_ratio
= 0.0);
57 struct BinnedLRUHandle
{
59 void (*deleter
)(const rocksdb::Slice
&, void* value
);
60 BinnedLRUHandle
* next_hash
;
61 BinnedLRUHandle
* next
;
62 BinnedLRUHandle
* prev
;
63 size_t charge
; // TODO(opt): Only allow uint32_t?
65 uint32_t refs
; // a number of refs to this entry
66 // cache itself is counted as 1
68 // Include the following flags:
69 // in_cache: whether this entry is referenced by the hash table.
70 // is_high_pri: whether this entry is high priority entry.
71 // in_high_pri_pool: whether this entry is in high-pri pool.
74 uint32_t hash
; // Hash of key(); used for fast sharding and comparisons
76 char* key_data
= nullptr; // Beginning of key
78 rocksdb::Slice
key() const {
79 // For cheaper lookups, we allow a temporary Handle object
80 // to store a pointer to a key in "value".
82 return *(reinterpret_cast<rocksdb::Slice
*>(value
));
84 return rocksdb::Slice(key_data
, key_length
);
88 bool InCache() { return flags
& 1; }
89 bool IsHighPri() { return flags
& 2; }
90 bool InHighPriPool() { return flags
& 4; }
91 bool HasHit() { return flags
& 8; }
93 void SetInCache(bool in_cache
) {
101 void SetPriority(rocksdb::Cache::Priority priority
) {
102 if (priority
== rocksdb::Cache::Priority::HIGH
) {
109 void SetInHighPriPool(bool in_high_pri_pool
) {
110 if (in_high_pri_pool
) {
117 void SetHit() { flags
|= 8; }
120 ceph_assert((refs
== 1 && InCache()) || (refs
== 0 && !InCache()));
122 (*deleter
)(key(), value
);
129 // We provide our own simple hash table since it removes a whole bunch
130 // of porting hacks and is also faster than some of the built-in hash
131 // table implementations in some of the compiler/runtime combinations
132 // we have tested. E.g., readrandom speeds up by ~5% over the g++
133 // 4.4.3's builtin hashtable.
134 class BinnedLRUHandleTable
{
136 BinnedLRUHandleTable();
137 ~BinnedLRUHandleTable();
139 BinnedLRUHandle
* Lookup(const rocksdb::Slice
& key
, uint32_t hash
);
140 BinnedLRUHandle
* Insert(BinnedLRUHandle
* h
);
141 BinnedLRUHandle
* Remove(const rocksdb::Slice
& key
, uint32_t hash
);
143 template <typename T
>
144 void ApplyToAllCacheEntries(T func
) {
145 for (uint32_t i
= 0; i
< length_
; i
++) {
146 BinnedLRUHandle
* h
= list_
[i
];
147 while (h
!= nullptr) {
148 auto n
= h
->next_hash
;
149 ceph_assert(h
->InCache());
157 // Return a pointer to slot that points to a cache entry that
158 // matches key/hash. If there is no such cache entry, return a
159 // pointer to the trailing slot in the corresponding linked list.
160 BinnedLRUHandle
** FindPointer(const rocksdb::Slice
& key
, uint32_t hash
);
164 // The table consists of an array of buckets where each bucket is
165 // a linked list of cache entries that hash into the bucket.
166 BinnedLRUHandle
** list_
;
171 // A single shard of sharded cache.
172 class alignas(CACHE_LINE_SIZE
) BinnedLRUCacheShard
: public CacheShard
{
174 BinnedLRUCacheShard(CephContext
*c
, size_t capacity
, bool strict_capacity_limit
,
175 double high_pri_pool_ratio
);
176 virtual ~BinnedLRUCacheShard();
178 // Separate from constructor so caller can easily make an array of BinnedLRUCache
179 // if current usage is more than new capacity, the function will attempt to
180 // free the needed space
181 virtual void SetCapacity(size_t capacity
) override
;
183 // Set the flag to reject insertion if cache if full.
184 virtual void SetStrictCapacityLimit(bool strict_capacity_limit
) override
;
186 // Set percentage of capacity reserved for high-pri cache entries.
187 void SetHighPriPoolRatio(double high_pri_pool_ratio
);
189 // Like Cache methods, but with an extra "hash" parameter.
190 virtual rocksdb::Status
Insert(const rocksdb::Slice
& key
, uint32_t hash
, void* value
,
192 void (*deleter
)(const rocksdb::Slice
& key
, void* value
),
193 rocksdb::Cache::Handle
** handle
,
194 rocksdb::Cache::Priority priority
) override
;
195 virtual rocksdb::Cache::Handle
* Lookup(const rocksdb::Slice
& key
, uint32_t hash
) override
;
196 virtual bool Ref(rocksdb::Cache::Handle
* handle
) override
;
197 virtual bool Release(rocksdb::Cache::Handle
* handle
,
198 bool force_erase
= false) override
;
199 virtual void Erase(const rocksdb::Slice
& key
, uint32_t hash
) override
;
201 // Although in some platforms the update of size_t is atomic, to make sure
202 // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
203 // protect them with mutex_.
205 virtual size_t GetUsage() const override
;
206 virtual size_t GetPinnedUsage() const override
;
208 virtual void ApplyToAllCacheEntries(void (*callback
)(void*, size_t),
209 bool thread_safe
) override
;
211 virtual void EraseUnRefEntries() override
;
213 virtual std::string
GetPrintableOptions() const override
;
215 void TEST_GetLRUList(BinnedLRUHandle
** lru
, BinnedLRUHandle
** lru_low_pri
);
217 // Retrieves number of elements in LRU, for unit test purpose only
219 size_t TEST_GetLRUSize();
221 // Retrieves high pri pool ratio
222 double GetHighPriPoolRatio() const;
224 // Retrieves high pri pool usage
225 size_t GetHighPriPoolUsage() const;
229 void LRU_Remove(BinnedLRUHandle
* e
);
230 void LRU_Insert(BinnedLRUHandle
* e
);
232 // Overflow the last entry in high-pri pool to low-pri pool until size of
233 // high-pri pool is no larger than the size specify by high_pri_pool_pct.
234 void MaintainPoolSize();
236 // Just reduce the reference count by 1.
237 // Return true if last reference
238 bool Unref(BinnedLRUHandle
* e
);
240 // Free some space following strict LRU policy until enough space
241 // to hold (usage_ + charge) is freed or the lru list is empty
242 // This function is not thread safe - it needs to be executed while
243 // holding the mutex_
244 void EvictFromLRU(size_t charge
, ceph::autovector
<BinnedLRUHandle
*>* deleted
);
246 // Initialized before use.
249 // Memory size for entries in high-pri pool.
250 size_t high_pri_pool_usage_
;
252 // Whether to reject insertion if cache reaches its full capacity.
253 bool strict_capacity_limit_
;
255 // Ratio of capacity reserved for high priority cache entries.
256 double high_pri_pool_ratio_
;
258 // High-pri pool size, equals to capacity * high_pri_pool_ratio.
259 // Remember the value to avoid recomputing each time.
260 double high_pri_pool_capacity_
;
262 // Dummy head of LRU list.
263 // lru.prev is newest entry, lru.next is oldest entry.
264 // LRU contains items which can be evicted, ie reference only by cache
265 BinnedLRUHandle lru_
;
267 // Pointer to head of low-pri pool in LRU list.
268 BinnedLRUHandle
* lru_low_pri_
;
270 // ------------^^^^^^^^^^^^^-----------
271 // Not frequently modified data members
272 // ------------------------------------
274 // We separate data members that are updated frequently from the ones that
275 // are not frequently updated so that they don't share the same cache line
276 // which will lead into false cache sharing
278 // ------------------------------------
279 // Frequently modified data members
280 // ------------vvvvvvvvvvvvv-----------
281 BinnedLRUHandleTable table_
;
283 // Memory size for entries residing in the cache
286 // Memory size for entries residing only in the LRU list
289 // mutex_ protects the following state.
290 // We don't count mutex_ as the cache's internal state so semantically we
291 // don't mind mutex_ invoking the non-const actions.
292 mutable std::mutex mutex_
;
295 class BinnedLRUCache
: public ShardedCache
{
297 BinnedLRUCache(CephContext
*c
, size_t capacity
, int num_shard_bits
,
298 bool strict_capacity_limit
, double high_pri_pool_ratio
);
299 virtual ~BinnedLRUCache();
300 virtual const char* Name() const override
{ return "BinnedLRUCache"; }
301 virtual CacheShard
* GetShard(int shard
) override
;
302 virtual const CacheShard
* GetShard(int shard
) const override
;
303 virtual void* Value(Handle
* handle
) override
;
304 virtual size_t GetCharge(Handle
* handle
) const override
;
305 virtual uint32_t GetHash(Handle
* handle
) const override
;
306 virtual void DisownData() override
;
308 // Retrieves number of elements in LRU, for unit test purpose only
309 size_t TEST_GetLRUSize();
310 // Sets the high pri pool ratio
311 void SetHighPriPoolRatio(double high_pri_pool_ratio
);
312 // Retrieves high pri pool ratio
313 double GetHighPriPoolRatio() const;
314 // Retrieves high pri pool usage
315 size_t GetHighPriPoolUsage() const;
318 virtual int64_t request_cache_bytes(
319 PriorityCache::Priority pri
, uint64_t total_cache
) const;
320 virtual int64_t commit_cache_size(uint64_t total_cache
);
321 virtual int64_t get_committed_size() const {
322 return GetCapacity();
324 virtual std::string
get_cache_name() const {
325 return "RocksDB Binned LRU Cache";
330 BinnedLRUCacheShard
* shards_
;
}  // namespace rocksdb_cache

#endif // ROCKSDB_BINNED_LRU_CACHE