]>
Commit | Line | Data |
---|---|---|
91327a77 AA |
1 | // Copyright (c) 2018-Present Red Hat Inc. All rights reserved. |
2 | // | |
3 | // Copyright (c) 2011-2018, Facebook, Inc. All rights reserved. | |
4 | // This source code is licensed under both the GPLv2 and Apache 2.0 License | |
5 | // | |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #ifndef ROCKSDB_BINNED_LRU_CACHE | |
11 | #define ROCKSDB_BINNED_LRU_CACHE | |
12 | ||
#include <cstdint>
#include <mutex>
#include <string>

#include "ShardedCache.h"
#include "common/autovector.h"
#include "common/dout.h"
#include "include/ceph_assert.h"
#include "common/ceph_context.h"
91327a77 AA |
21 | |
22 | namespace rocksdb_cache { | |
23 | ||
24 | // LRU cache implementation | |
25 | ||
26 | // An entry is a variable length heap-allocated structure. | |
27 | // Entries are referenced by cache and/or by any external entity. | |
28 | // The cache keeps all its entries in table. Some elements | |
29 | // are also stored on LRU list. | |
30 | // | |
31 | // BinnedLRUHandle can be in these states: | |
32 | // 1. Referenced externally AND in hash table. | |
33 | // In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true) | |
34 | // 2. Not referenced externally and in hash table. In that case the entry is | |
35 | // in the LRU and can be freed. (refs == 1 && in_cache == true) | |
36 | // 3. Referenced externally and not in hash table. In that case the entry is | |
// not on LRU and not in table. (refs >= 1 && in_cache == false)
38 | // | |
39 | // All newly created BinnedLRUHandles are in state 1. If you call | |
40 | // BinnedLRUCacheShard::Release | |
41 | // on entry in state 1, it will go into state 2. To move from state 1 to | |
42 | // state 3, either call BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the | |
43 | // same key. | |
44 | // To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup. | |
45 | // Before destruction, make sure that no handles are in state 1. This means | |
46 | // that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert have a | |
47 | // matching | |
// BinnedLRUCacheShard::Release (to move into state 2) or BinnedLRUCacheShard::Erase (for state 3)
49 | ||
// Create a sharded binned-LRU cache.
//   c                     - Ceph context (configuration / logging).
//   capacity              - total cache size in bytes, split across shards.
//   num_shard_bits        - controls the number of shards; -1 presumably lets
//                           the implementation pick a default — confirm in .cc.
//   strict_capacity_limit - if true, reject insertions once the cache is full.
//   high_pri_pool_ratio   - fraction of capacity reserved for high-priority
//                           entries (see BinnedLRUCacheShard).
std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
    CephContext *c,
    size_t capacity,
    int num_shard_bits = -1,
    bool strict_capacity_limit = false,
    double high_pri_pool_ratio = 0.0);
56 | ||
57 | struct BinnedLRUHandle { | |
58 | void* value; | |
59 | void (*deleter)(const rocksdb::Slice&, void* value); | |
60 | BinnedLRUHandle* next_hash; | |
61 | BinnedLRUHandle* next; | |
62 | BinnedLRUHandle* prev; | |
63 | size_t charge; // TODO(opt): Only allow uint32_t? | |
64 | size_t key_length; | |
65 | uint32_t refs; // a number of refs to this entry | |
66 | // cache itself is counted as 1 | |
67 | ||
68 | // Include the following flags: | |
69 | // in_cache: whether this entry is referenced by the hash table. | |
70 | // is_high_pri: whether this entry is high priority entry. | |
71 | // in_high_pri_pool: whether this entry is in high-pri pool. | |
72 | char flags; | |
73 | ||
74 | uint32_t hash; // Hash of key(); used for fast sharding and comparisons | |
75 | ||
11fdf7f2 | 76 | char* key_data = nullptr; // Beginning of key |
91327a77 AA |
77 | |
78 | rocksdb::Slice key() const { | |
79 | // For cheaper lookups, we allow a temporary Handle object | |
80 | // to store a pointer to a key in "value". | |
81 | if (next == this) { | |
82 | return *(reinterpret_cast<rocksdb::Slice*>(value)); | |
83 | } else { | |
84 | return rocksdb::Slice(key_data, key_length); | |
85 | } | |
86 | } | |
87 | ||
88 | bool InCache() { return flags & 1; } | |
89 | bool IsHighPri() { return flags & 2; } | |
90 | bool InHighPriPool() { return flags & 4; } | |
91 | bool HasHit() { return flags & 8; } | |
92 | ||
93 | void SetInCache(bool in_cache) { | |
94 | if (in_cache) { | |
95 | flags |= 1; | |
96 | } else { | |
97 | flags &= ~1; | |
98 | } | |
99 | } | |
100 | ||
101 | void SetPriority(rocksdb::Cache::Priority priority) { | |
102 | if (priority == rocksdb::Cache::Priority::HIGH) { | |
103 | flags |= 2; | |
104 | } else { | |
105 | flags &= ~2; | |
106 | } | |
107 | } | |
108 | ||
109 | void SetInHighPriPool(bool in_high_pri_pool) { | |
110 | if (in_high_pri_pool) { | |
111 | flags |= 4; | |
112 | } else { | |
113 | flags &= ~4; | |
114 | } | |
115 | } | |
116 | ||
117 | void SetHit() { flags |= 8; } | |
118 | ||
119 | void Free() { | |
11fdf7f2 | 120 | ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); |
91327a77 AA |
121 | if (deleter) { |
122 | (*deleter)(key(), value); | |
123 | } | |
11fdf7f2 TL |
124 | delete[] key_data; |
125 | delete this; | |
91327a77 AA |
126 | } |
127 | }; | |
128 | ||
// We provide our own simple hash table since it removes a whole bunch
// of porting hacks and is also faster than some of the built-in hash
// table implementations in some of the compiler/runtime combinations
// we have tested. E.g., readrandom speeds up by ~5% over the g++
// 4.4.3's builtin hashtable.
class BinnedLRUHandleTable {
 public:
  BinnedLRUHandleTable();
  ~BinnedLRUHandleTable();

  // Find the entry matching key/hash (implementation in the .cc file).
  BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash);
  // Insert h. NOTE(review): presumably returns the previous entry with the
  // same key (nullptr if none) so the caller can release it — confirm
  // against the .cc implementation.
  BinnedLRUHandle* Insert(BinnedLRUHandle* h);
  // Unlink and return the entry matching key/hash, if present.
  BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash);

  // Invoke func(h) on every entry in every bucket. The next_hash pointer is
  // captured before func runs, so func may free or relink h itself.
  template <typename T>
  void ApplyToAllCacheEntries(T func) {
    for (uint32_t i = 0; i < length_; i++) {
      BinnedLRUHandle* h = list_[i];
      while (h != nullptr) {
        auto n = h->next_hash;
        // Every entry reachable from the table must be marked in_cache.
        ceph_assert(h->InCache());
        func(h);
        h = n;
      }
    }
  }

 private:
  // Return a pointer to slot that points to a cache entry that
  // matches key/hash. If there is no such cache entry, return a
  // pointer to the trailing slot in the corresponding linked list.
  BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash);

  // Rebuild the bucket array (used as the table grows; see .cc).
  void Resize();

  // The table consists of an array of buckets where each bucket is
  // a linked list of cache entries that hash into the bucket.
  BinnedLRUHandle** list_;  // bucket array of length length_
  uint32_t length_;         // number of buckets
  uint32_t elems_;          // number of stored entries (maintained in .cc)
};
170 | ||
// A single shard of sharded cache. Aligned to a cache line to avoid false
// sharing between adjacent shards in the shard array.
class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
 public:
  BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
                      double high_pri_pool_ratio);
  virtual ~BinnedLRUCacheShard();

  // Separate from constructor so caller can easily make an array of BinnedLRUCache
  // if current usage is more than new capacity, the function will attempt to
  // free the needed space
  virtual void SetCapacity(size_t capacity) override;

  // Set the flag to reject insertion if cache is full.
  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;

  // Set percentage of capacity reserved for high-pri cache entries.
  void SetHighPriPoolRatio(double high_pri_pool_ratio);

  // Like Cache methods, but with an extra "hash" parameter.
  virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
                                 size_t charge,
                                 void (*deleter)(const rocksdb::Slice& key, void* value),
                                 rocksdb::Cache::Handle** handle,
                                 rocksdb::Cache::Priority priority) override;
  virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override;
  virtual bool Ref(rocksdb::Cache::Handle* handle) override;
  virtual bool Release(rocksdb::Cache::Handle* handle,
                       bool force_erase = false) override;
  virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override;

  // Although in some platforms the update of size_t is atomic, to make sure
  // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
  // protect them with mutex_.

  virtual size_t GetUsage() const override;
  virtual size_t GetPinnedUsage() const override;

  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                      bool thread_safe) override;

  virtual void EraseUnRefEntries() override;

  virtual std::string GetPrintableOptions() const override;

  void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri);

  // Retrieves number of elements in LRU, for unit test purpose only
  // not threadsafe
  size_t TEST_GetLRUSize();

  // Retrieves high pri pool ratio
  double GetHighPriPoolRatio() const;

  // Retrieves high pri pool usage
  size_t GetHighPriPoolUsage() const;

 private:
  // Unlink e from the LRU list.
  void LRU_Remove(BinnedLRUHandle* e);
  // Link e into the LRU list (position depends on priority; see .cc).
  void LRU_Insert(BinnedLRUHandle* e);

  // Overflow the last entry in high-pri pool to low-pri pool until size of
  // high-pri pool is no larger than the size specify by high_pri_pool_pct.
  void MaintainPoolSize();

  // Just reduce the reference count by 1.
  // Return true if last reference
  bool Unref(BinnedLRUHandle* e);

  // Free some space following strict LRU policy until enough space
  // to hold (usage_ + charge) is freed or the lru list is empty
  // This function is not thread safe - it needs to be executed while
  // holding the mutex_
  void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted);

  // Initialized before use.
  size_t capacity_;

  // Memory size for entries in high-pri pool.
  size_t high_pri_pool_usage_;

  // Whether to reject insertion if cache reaches its full capacity.
  bool strict_capacity_limit_;

  // Ratio of capacity reserved for high priority cache entries.
  double high_pri_pool_ratio_;

  // High-pri pool size, equals to capacity * high_pri_pool_ratio.
  // Remember the value to avoid recomputing each time.
  double high_pri_pool_capacity_;

  // Dummy head of LRU list.
  // lru.prev is newest entry, lru.next is oldest entry.
  // LRU contains items which can be evicted, ie reference only by cache
  BinnedLRUHandle lru_;

  // Pointer to head of low-pri pool in LRU list.
  BinnedLRUHandle* lru_low_pri_;

  // ------------^^^^^^^^^^^^^-----------
  // Not frequently modified data members
  // ------------------------------------
  //
  // We separate data members that are updated frequently from the ones that
  // are not frequently updated so that they don't share the same cache line
  // which will lead into false cache sharing
  //
  // ------------------------------------
  // Frequently modified data members
  // ------------vvvvvvvvvvvvv-----------
  BinnedLRUHandleTable table_;

  // Memory size for entries residing in the cache
  size_t usage_;

  // Memory size for entries residing only in the LRU list
  size_t lru_usage_;

  // mutex_ protects the following state.
  // We don't count mutex_ as the cache's internal state so semantically we
  // don't mind mutex_ invoking the non-const actions.
  mutable std::mutex mutex_;
};
293 | ||
// Sharded binned-LRU cache: fans requests out to num_shards_
// BinnedLRUCacheShard instances and also plugs into Ceph's PriorityCache
// machinery (request_cache_bytes / commit_cache_size below).
class BinnedLRUCache : public ShardedCache {
 public:
  BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits,
                 bool strict_capacity_limit, double high_pri_pool_ratio);
  virtual ~BinnedLRUCache();
  virtual const char* Name() const override { return "BinnedLRUCache"; }
  virtual CacheShard* GetShard(int shard) override;
  virtual const CacheShard* GetShard(int shard) const override;
  // Accessors that interpret an opaque Handle (see .cc for the cast).
  virtual void* Value(Handle* handle) override;
  virtual size_t GetCharge(Handle* handle) const override;
  virtual uint32_t GetHash(Handle* handle) const override;
  virtual void DisownData() override;

  // Retrieves number of elements in LRU, for unit test purpose only
  size_t TEST_GetLRUSize();
  // Sets the high pri pool ratio
  void SetHighPriPoolRatio(double high_pri_pool_ratio);
  // Retrieves high pri pool ratio
  double GetHighPriPoolRatio() const;
  // Retrieves high pri pool usage
  size_t GetHighPriPoolUsage() const;

  // PriorityCache
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const;
  virtual int64_t commit_cache_size(uint64_t total_cache);
  // The committed size is simply the currently configured capacity.
  virtual int64_t get_committed_size() const {
    return GetCapacity();
  }
  virtual std::string get_cache_name() const {
    return "RocksDB Binned LRU Cache";
  }

 private:
  CephContext *cct;
  BinnedLRUCacheShard* shards_;  // array of num_shards_ shards
  int num_shards_ = 0;
};
332 | ||
333 | } // namespace rocksdb_cache | |
334 | ||
335 | #endif // ROCKSDB_BINNED_LRU_CACHE |