// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
//
// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 and Apache 2.0 License
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef ROCKSDB_BINNED_LRU_CACHE
#define ROCKSDB_BINNED_LRU_CACHE

#include <string>
#include <mutex>

#include "ShardedCache.h"
#include "common/autovector.h"
#include "common/dout.h"
#include "include/ceph_assert.h"
#include "common/ceph_context.h"

namespace rocksdb_cache {

// LRU cache implementation

// An entry is a variable-length heap-allocated structure.
// Entries are referenced by the cache and/or by any external entity.
// The cache keeps all its entries in a hash table. Some elements
// are also stored on the LRU list.
//
// BinnedLRUHandle can be in these states:
// 1. Referenced externally AND in the hash table.
//    In that case the entry is *not* in the LRU list. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in the hash table. In that case the entry
//    is in the LRU list and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in the hash table. In that case the entry
//    is not on the LRU list and not in the table. (refs >= 1 && in_cache == false)
//
// All newly created BinnedLRUHandles are in state 1. If you call
// BinnedLRUCacheShard::Release on an entry in state 1, it will go into
// state 2. To move from state 1 to state 3, either call
// BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the same key.
// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert
// has a matching BinnedLRUCacheShard::Release (to move into state 2) or
// BinnedLRUCacheShard::Erase (for state 3).

std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
    CephContext *c,
    size_t capacity,
    int num_shard_bits = -1,
    bool strict_capacity_limit = false,
    double high_pri_pool_ratio = 0.0);

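// Illustrative usage sketch (not part of this header's API): how a caller
// might create a binned LRU cache and drive the insert/lookup/release cycle
// described above through the generic rocksdb::Cache interface. The `cct`,
// `key`, `value`, `charge`, and `deleter` names below are hypothetical
// placeholders supplied by the caller.
//
//   auto cache = NewBinnedLRUCache(cct,
//                                  64 << 20,  // 64 MiB capacity
//                                  4,         // 2^4 = 16 shards
//                                  false,     // don't reject inserts when full
//                                  0.5);      // half the capacity for high-pri
//   rocksdb::Cache::Handle* handle = nullptr;
//   cache->Insert(key, value, charge, deleter, &handle,
//                 rocksdb::Cache::Priority::HIGH);  // handle pinned: state 1
//   // ... use the entry ...
//   cache->Release(handle);  // unpinned, back on the LRU list: state 2
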
struct BinnedLRUHandle {
  void* value;
  void (*deleter)(const rocksdb::Slice&, void* value);
  BinnedLRUHandle* next_hash;
  BinnedLRUHandle* next;
  BinnedLRUHandle* prev;
  size_t charge;  // TODO(opt): Only allow uint32_t?
  size_t key_length;
  uint32_t refs;  // number of refs to this entry; the cache itself is
                  // counted as 1

  // Includes the following flags:
  //   in_cache:         whether this entry is referenced by the hash table.
  //   is_high_pri:      whether this is a high-priority entry.
  //   in_high_pri_pool: whether this entry is in the high-pri pool.
  char flags;

  uint32_t hash;  // Hash of key(); used for fast sharding and comparisons

  char* key_data = nullptr;  // Beginning of key

  rocksdb::Slice key() const {
    // For cheaper lookups, we allow a temporary Handle object
    // to store a pointer to a key in "value".
    if (next == this) {
      return *(reinterpret_cast<rocksdb::Slice*>(value));
    } else {
      return rocksdb::Slice(key_data, key_length);
    }
  }

  bool InCache() { return flags & 1; }
  bool IsHighPri() { return flags & 2; }
  bool InHighPriPool() { return flags & 4; }
  bool HasHit() { return flags & 8; }

  void SetInCache(bool in_cache) {
    if (in_cache) {
      flags |= 1;
    } else {
      flags &= ~1;
    }
  }

  void SetPriority(rocksdb::Cache::Priority priority) {
    if (priority == rocksdb::Cache::Priority::HIGH) {
      flags |= 2;
    } else {
      flags &= ~2;
    }
  }

  void SetInHighPriPool(bool in_high_pri_pool) {
    if (in_high_pri_pool) {
      flags |= 4;
    } else {
      flags &= ~4;
    }
  }

  void SetHit() { flags |= 8; }

  void Free() {
    ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
    if (deleter) {
      (*deleter)(key(), value);
    }
    delete[] key_data;
    delete this;
  }
};

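// Illustrative sketch of the key() sentinel above: a lookup can build a
// temporary stack-allocated handle whose `value` points at a rocksdb::Slice
// holding the probe key, with `next` pointing at itself so that key() takes
// the "pointer stored in value" branch and no key bytes need to be copied.
// `probe` and `tmp` below are hypothetical names:
//
//   rocksdb::Slice probe("some_key");
//   BinnedLRUHandle tmp;
//   tmp.value = &probe;
//   tmp.next = &tmp;   // sentinel: key() reads the Slice through value
//   ceph_assert(tmp.key() == probe);
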
// We provide our own simple hash table since it removes a whole bunch
// of porting hacks and is also faster than some of the built-in hash
// table implementations in some of the compiler/runtime combinations
// we have tested. E.g., readrandom speeds up by ~5% over g++ 4.4.3's
// built-in hashtable.
class BinnedLRUHandleTable {
 public:
  BinnedLRUHandleTable();
  ~BinnedLRUHandleTable();

  BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash);
  BinnedLRUHandle* Insert(BinnedLRUHandle* h);
  BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash);

  template <typename T>
  void ApplyToAllCacheEntries(T func) {
    for (uint32_t i = 0; i < length_; i++) {
      BinnedLRUHandle* h = list_[i];
      while (h != nullptr) {
        auto n = h->next_hash;
        ceph_assert(h->InCache());
        func(h);
        h = n;
      }
    }
  }

 private:
  // Return a pointer to slot that points to a cache entry that
  // matches key/hash. If there is no such cache entry, return a
  // pointer to the trailing slot in the corresponding linked list.
  BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash);

  void Resize();

  // The table consists of an array of buckets where each bucket is
  // a linked list of cache entries that hash into the bucket.
  BinnedLRUHandle** list_;
  uint32_t length_;
  uint32_t elems_;
};

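// Illustrative sketch: ApplyToAllCacheEntries walks every bucket chain and
// invokes the functor on each live handle, so a caller holding the shard's
// mutex could, for example, sum the charges of all cached entries. The
// `table` object below is a hypothetical instance:
//
//   size_t total_charge = 0;
//   table.ApplyToAllCacheEntries([&total_charge](BinnedLRUHandle* h) {
//     total_charge += h->charge;
//   });
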
// A single shard of sharded cache.
class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
 public:
  BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
                      double high_pri_pool_ratio);
  virtual ~BinnedLRUCacheShard();

  // Separate from the constructor so the caller can easily make an array of
  // BinnedLRUCache. If current usage is more than the new capacity, the
  // function will attempt to free the needed space.
  virtual void SetCapacity(size_t capacity) override;

  // Set the flag to reject insertion if the cache is full.
  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;

  // Set percentage of capacity reserved for high-pri cache entries.
  void SetHighPriPoolRatio(double high_pri_pool_ratio);
  // Like Cache methods, but with an extra "hash" parameter.
  virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
                                 size_t charge,
                                 void (*deleter)(const rocksdb::Slice& key, void* value),
                                 rocksdb::Cache::Handle** handle,
                                 rocksdb::Cache::Priority priority) override;
  virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override;
  virtual bool Ref(rocksdb::Cache::Handle* handle) override;
  virtual bool Release(rocksdb::Cache::Handle* handle,
                       bool force_erase = false) override;
  virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override;

  // Although on some platforms updates of size_t are atomic, to make sure
  // GetUsage() and GetPinnedUsage() work correctly on any platform, we
  // protect them with mutex_.

  virtual size_t GetUsage() const override;
  virtual size_t GetPinnedUsage() const override;

  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                      bool thread_safe) override;

  virtual void EraseUnRefEntries() override;

  virtual std::string GetPrintableOptions() const override;

  void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri);

  // Retrieves number of elements in the LRU list, for unit test purposes only.
  // Not thread safe.
  size_t TEST_GetLRUSize();

  // Retrieves high-pri pool ratio
  double GetHighPriPoolRatio() const;

  // Retrieves high-pri pool usage
  size_t GetHighPriPoolUsage() const;

 private:
  CephContext *cct;
  void LRU_Remove(BinnedLRUHandle* e);
  void LRU_Insert(BinnedLRUHandle* e);

  // Overflow the last entry in the high-pri pool to the low-pri pool until the
  // size of the high-pri pool is no larger than the size specified by
  // high_pri_pool_ratio.
  void MaintainPoolSize();

  // Just reduce the reference count by 1.
  // Return true if it was the last reference.
  bool Unref(BinnedLRUHandle* e);

  // Free some space following strict LRU policy until enough space
  // to hold (usage_ + charge) is freed or the LRU list is empty.
  // This function is not thread safe; it needs to be executed while
  // holding mutex_.
  void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted);

  // Initialized before use.
  size_t capacity_;

  // Memory size for entries in the high-pri pool.
  size_t high_pri_pool_usage_;

  // Whether to reject insertion if the cache reaches its full capacity.
  bool strict_capacity_limit_;

  // Ratio of capacity reserved for high-priority cache entries.
  double high_pri_pool_ratio_;

  // High-pri pool size, equal to capacity * high_pri_pool_ratio.
  // Remember the value to avoid recomputing each time.
  double high_pri_pool_capacity_;

  // Dummy head of LRU list.
  // lru.prev is the newest entry, lru.next is the oldest entry.
  // The LRU list contains items which can be evicted, i.e. referenced only by
  // the cache.
  BinnedLRUHandle lru_;

  // Pointer to head of low-pri pool in LRU list.
  BinnedLRUHandle* lru_low_pri_;

  // ------------^^^^^^^^^^^^^-----------
  // Not frequently modified data members
  // ------------------------------------
  //
  // We separate data members that are updated frequently from the ones that
  // are not frequently updated so that they don't share the same cache line,
  // which would lead to false sharing.
  //
  // ------------------------------------
  // Frequently modified data members
  // ------------vvvvvvvvvvvvv-----------
  BinnedLRUHandleTable table_;

  // Memory size for entries residing in the cache.
  size_t usage_;

  // Memory size for entries residing only in the LRU list.
  size_t lru_usage_;

  // mutex_ protects the following state.
  // We don't count mutex_ as part of the cache's internal state, so
  // semantically we don't mind const methods locking it (hence it is mutable).
  mutable std::mutex mutex_;
};

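// Illustrative sketch: the shard methods take the key's precomputed 32-bit
// hash (the sharding layer uses it to pick the shard, and the shard reuses it
// for its hash-table bucket). A hypothetical caller pairing Lookup with
// Release might look like this; `shard`, `key`, and `hash` are placeholders:
//
//   rocksdb::Cache::Handle* h = shard.Lookup(key, hash);
//   if (h != nullptr) {
//     // ... read the cached value, e.g. via BinnedLRUCache::Value(h) ...
//     shard.Release(h);  // drop the external ref; entry may rejoin the LRU
//   }
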
class BinnedLRUCache : public ShardedCache {
 public:
  BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits,
                 bool strict_capacity_limit, double high_pri_pool_ratio);
  virtual ~BinnedLRUCache();
  virtual const char* Name() const override { return "BinnedLRUCache"; }
  virtual CacheShard* GetShard(int shard) override;
  virtual const CacheShard* GetShard(int shard) const override;
  virtual void* Value(Handle* handle) override;
  virtual size_t GetCharge(Handle* handle) const override;
  virtual uint32_t GetHash(Handle* handle) const override;
  virtual void DisownData() override;

  // Retrieves number of elements in the LRU list, for unit test purposes only
  size_t TEST_GetLRUSize();
  // Sets the high-pri pool ratio
  void SetHighPriPoolRatio(double high_pri_pool_ratio);
  // Retrieves high-pri pool ratio
  double GetHighPriPoolRatio() const;
  // Retrieves high-pri pool usage
  size_t GetHighPriPoolUsage() const;

  // PriorityCache
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const;
  virtual int64_t commit_cache_size(uint64_t total_cache);
  virtual int64_t get_committed_size() const {
    return GetCapacity();
  }
  virtual std::string get_cache_name() const {
    return "RocksDB Binned LRU Cache";
  }

 private:
  CephContext *cct;
  BinnedLRUCacheShard* shards_;
  int num_shards_ = 0;
};

} // namespace rocksdb_cache

#endif // ROCKSDB_BINNED_LRU_CACHE