// Copyright (c) 2018-Present Red Hat Inc. All rights reserved.
//
// Copyright (c) 2011-2018, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 and Apache 2.0 License
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef ROCKSDB_BINNED_LRU_CACHE
#define ROCKSDB_BINNED_LRU_CACHE

#include <string>
#include <mutex>

#include "ShardedCache.h"
#include "common/autovector.h"
#include "common/dout.h"
#include "include/ceph_assert.h"
#include "common/ceph_context.h"

namespace rocksdb_cache {

// LRU cache implementation

// An entry is a variable length heap-allocated structure.
// Entries are referenced by the cache and/or by any external entity.
// The cache keeps all its entries in a hash table. Some entries
// are also stored on the LRU list.
//
// BinnedLRUHandle can be in these states:
// 1. Referenced externally AND in hash table.
//    In that case the entry is *not* on the LRU list. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in hash table. In that case the entry is
//    on the LRU list and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in hash table. In that case the entry is
//    not on the LRU list and not in the hash table. (refs >= 1 && in_cache == false)
//
// All newly created BinnedLRUHandles are in state 1. If you call
// BinnedLRUCacheShard::Release on an entry in state 1, it will go into state 2.
// To move from state 1 to state 3, either call BinnedLRUCacheShard::Erase or
// BinnedLRUCacheShard::Insert with the same key.
// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert
// has a matching BinnedLRUCacheShard::Release (to move into state 2) or
// BinnedLRUCacheShard::Erase (for state 3).

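// A minimal lifecycle sketch of the states above (illustrative only, not part
// of this header; "shard", "key", "hash", "value", "charge" and "deleter" are
// assumed to be supplied by the caller):
//
//   rocksdb::Cache::Handle* handle = nullptr;
//   shard.Insert(key, hash, value, charge, deleter, &handle,
//                rocksdb::Cache::Priority::LOW);   // state 1: referenced and in table
//   shard.Release(handle);                         // state 2: evictable, on LRU list
//   handle = shard.Lookup(key, hash);              // back to state 1 (if still cached)
//   if (handle) {
//     shard.Erase(key, hash);                      // state 3: detached from the table
//     shard.Release(handle);                       // last reference; entry is freed
//   }
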
std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
    CephContext *c,
    size_t capacity,
    int num_shard_bits = -1,
    bool strict_capacity_limit = false,
    double high_pri_pool_ratio = 0.0);
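
// A minimal usage sketch (illustrative only; assumes a valid CephContext*
// "cct" supplied by the caller, and that the returned cache is handed to
// RocksDB as a block cache via rocksdb::BlockBasedTableOptions):
//
//   std::shared_ptr<rocksdb::Cache> cache =
//       NewBinnedLRUCache(cct,
//                         1 << 30,   // 1 GiB capacity
//                         4,         // 2^4 = 16 shards
//                         false,     // do not reject inserts when full
//                         0.5);      // reserve half the capacity for high-pri entries
//   rocksdb::BlockBasedTableOptions table_options;
//   table_options.block_cache = cache;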

struct BinnedLRUHandle {
  void* value;
  void (*deleter)(const rocksdb::Slice&, void* value);
  BinnedLRUHandle* next_hash;
  BinnedLRUHandle* next;
  BinnedLRUHandle* prev;
  size_t charge;      // TODO(opt): Only allow uint32_t?
  size_t key_length;
  uint32_t refs;      // number of references to this entry;
                      // the cache itself is counted as 1

  // Includes the following flags:
  // in_cache:         whether this entry is referenced by the hash table.
  // is_high_pri:      whether this entry is a high-priority entry.
  // in_high_pri_pool: whether this entry is in the high-pri pool.
  // has_hit:          whether this entry has been looked up at least once.
  char flags;

  uint32_t hash;      // Hash of key(); used for fast sharding and comparisons

  char* key_data = nullptr;  // Beginning of key

  rocksdb::Slice key() const {
    // For cheaper lookups, we allow a temporary Handle object
    // to store a pointer to a key in "value".
    if (next == this) {
      return *(reinterpret_cast<rocksdb::Slice*>(value));
    } else {
      return rocksdb::Slice(key_data, key_length);
    }
  }

  bool InCache() { return flags & 1; }
  bool IsHighPri() { return flags & 2; }
  bool InHighPriPool() { return flags & 4; }
  bool HasHit() { return flags & 8; }

  void SetInCache(bool in_cache) {
    if (in_cache) {
      flags |= 1;
    } else {
      flags &= ~1;
    }
  }

  void SetPriority(rocksdb::Cache::Priority priority) {
    if (priority == rocksdb::Cache::Priority::HIGH) {
      flags |= 2;
    } else {
      flags &= ~2;
    }
  }

  void SetInHighPriPool(bool in_high_pri_pool) {
    if (in_high_pri_pool) {
      flags |= 4;
    } else {
      flags &= ~4;
    }
  }

  void SetHit() { flags |= 8; }

  void Free() {
    ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
    if (deleter) {
      (*deleter)(key(), value);
    }
    delete[] key_data;
    delete this;
  }
};

// We provide our own simple hash table since it removes a whole bunch
// of porting hacks and is also faster than some of the built-in hash
// table implementations in some of the compiler/runtime combinations
// we have tested. E.g., readrandom speeds up by ~5% over g++ 4.4.3's
// builtin hashtable.
class BinnedLRUHandleTable {
 public:
  BinnedLRUHandleTable();
  ~BinnedLRUHandleTable();

  BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash);
  BinnedLRUHandle* Insert(BinnedLRUHandle* h);
  BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash);

  // Calls func(h) on every handle currently stored in the table; see the
  // usage sketch after this class.
  template <typename T>
  void ApplyToAllCacheEntries(T func) {
    for (uint32_t i = 0; i < length_; i++) {
      BinnedLRUHandle* h = list_[i];
      while (h != nullptr) {
        auto n = h->next_hash;
        ceph_assert(h->InCache());
        func(h);
        h = n;
      }
    }
  }

 private:
  // Return a pointer to slot that points to a cache entry that
  // matches key/hash. If there is no such cache entry, return a
  // pointer to the trailing slot in the corresponding linked list.
  BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash);

  void Resize();

  // The table consists of an array of buckets where each bucket is
  // a linked list of cache entries that hash into the bucket.
  BinnedLRUHandle** list_;
  uint32_t length_;
  uint32_t elems_;
};
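
// A minimal sketch of iterating the table (illustrative only; "table" is an
// assumed BinnedLRUHandleTable instance, and the lambda receives raw
// BinnedLRUHandle pointers):
//
//   size_t total_charge = 0;
//   table.ApplyToAllCacheEntries([&](BinnedLRUHandle* h) {
//     total_charge += h->charge;   // sum the charge of every resident entry
//   });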

// A single shard of sharded cache.
class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
 public:
  BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
                      double high_pri_pool_ratio);
  virtual ~BinnedLRUCacheShard();

  // Separate from constructor so caller can easily make an array of BinnedLRUCache.
  // If current usage is more than the new capacity, the function will attempt to
  // free the needed space.
  virtual void SetCapacity(size_t capacity) override;

  // Set the flag to reject insertion if the cache is full.
  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;

  // Set percentage of capacity reserved for high-pri cache entries.
  void SetHighPriPoolRatio(double high_pri_pool_ratio);

  // Like Cache methods, but with an extra "hash" parameter.
  virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
                                 size_t charge,
                                 void (*deleter)(const rocksdb::Slice& key, void* value),
                                 rocksdb::Cache::Handle** handle,
                                 rocksdb::Cache::Priority priority) override;
  virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override;
  virtual bool Ref(rocksdb::Cache::Handle* handle) override;
  virtual bool Release(rocksdb::Cache::Handle* handle,
                       bool force_erase = false) override;
  virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override;

  // Although on some platforms the update of size_t is atomic, to make sure
  // GetUsage() and GetPinnedUsage() work correctly on any platform, we
  // protect them with mutex_.

  virtual size_t GetUsage() const override;
  virtual size_t GetPinnedUsage() const override;

  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                      bool thread_safe) override;

  virtual void EraseUnRefEntries() override;

  virtual std::string GetPrintableOptions() const override;

  void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri);

  // Retrieves number of elements in LRU, for unit test purposes only;
  // not thread safe.
  size_t TEST_GetLRUSize();

  // Retrieves high pri pool ratio
  double GetHighPriPoolRatio() const;

  // Retrieves high pri pool usage
  size_t GetHighPriPoolUsage() const;

 private:
  void LRU_Remove(BinnedLRUHandle* e);
  void LRU_Insert(BinnedLRUHandle* e);

  // Overflow the last entry in the high-pri pool to the low-pri pool until the
  // size of the high-pri pool is no larger than the size specified by
  // high_pri_pool_ratio.
  void MaintainPoolSize();

  // Just reduce the reference count by 1.
  // Return true if it was the last reference.
  bool Unref(BinnedLRUHandle* e);

  // Free some space following strict LRU policy until enough space
  // to hold (usage_ + charge) is freed or the LRU list is empty.
  // This function is not thread safe; it needs to be executed while
  // holding the mutex_.
  void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted);

  // Initialized before use.
  size_t capacity_;

  // Memory size for entries in high-pri pool.
  size_t high_pri_pool_usage_;

  // Whether to reject insertion if cache reaches its full capacity.
  bool strict_capacity_limit_;

  // Ratio of capacity reserved for high priority cache entries.
  double high_pri_pool_ratio_;

  // High-pri pool size, equal to capacity * high_pri_pool_ratio.
  // Remember the value to avoid recomputing each time.
  double high_pri_pool_capacity_;

  // Dummy head of LRU list.
  // lru.prev is the newest entry, lru.next is the oldest entry.
  // The LRU list contains items which can be evicted, i.e. referenced only by the cache.
  BinnedLRUHandle lru_;

  // Pointer to head of low-pri pool in LRU list.
  BinnedLRUHandle* lru_low_pri_;

  // ------------^^^^^^^^^^^^^-----------
  // Not frequently modified data members
  // ------------------------------------
  //
  // We separate data members that are updated frequently from the ones that
  // are not frequently updated so that they don't share the same cache line,
  // which would lead to false sharing.
  //
  // ------------------------------------
  // Frequently modified data members
  // ------------vvvvvvvvvvvvv-----------
  BinnedLRUHandleTable table_;

  // Memory size for entries residing in the cache.
  size_t usage_;

  // Memory size for entries residing only in the LRU list.
  size_t lru_usage_;

  // mutex_ protects the following state.
  // We don't count mutex_ as the cache's internal state so semantically we
  // don't mind mutex_ invoking the non-const actions.
  mutable std::mutex mutex_;
};

class BinnedLRUCache : public ShardedCache {
 public:
  BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits,
                 bool strict_capacity_limit, double high_pri_pool_ratio);
  virtual ~BinnedLRUCache();
  virtual const char* Name() const override { return "BinnedLRUCache"; }
  virtual CacheShard* GetShard(int shard) override;
  virtual const CacheShard* GetShard(int shard) const override;
  virtual void* Value(Handle* handle) override;
  virtual size_t GetCharge(Handle* handle) const override;
  virtual uint32_t GetHash(Handle* handle) const override;
  virtual void DisownData() override;

  // Retrieves number of elements in LRU, for unit test purposes only
  size_t TEST_GetLRUSize();
  // Sets the high pri pool ratio
  void SetHighPriPoolRatio(double high_pri_pool_ratio);
  // Retrieves high pri pool ratio
  double GetHighPriPoolRatio() const;
  // Retrieves high pri pool usage
  size_t GetHighPriPoolUsage() const;

  // PriorityCache
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const;
  virtual int64_t commit_cache_size(uint64_t total_cache);
  virtual int64_t get_committed_size() const {
    return GetCapacity();
  }
  virtual std::string get_cache_name() const {
    return "RocksDB Binned LRU Cache";
  }

 private:
  CephContext *cct;
  BinnedLRUCacheShard* shards_;
  int num_shards_ = 0;
};
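
// A minimal sketch of the PriorityCache hooks above (illustrative only; the
// actual call sequence is driven by Ceph's priority cache manager, and
// "cache" and "total" are assumed values chosen by the caller):
//
//   uint64_t total = 1ull << 32;  // hypothetical overall memory budget
//   int64_t wanted  = cache->request_cache_bytes(PriorityCache::Priority::PRI0, total);
//   int64_t granted = cache->commit_cache_size(total);
//   // get_committed_size() simply reports GetCapacity() after the commit.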

} // namespace rocksdb_cache

#endif // ROCKSDB_BINNED_LRU_CACHE