]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/cache/lru_cache.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / cache / lru_cache.h
CommitLineData
1e59de90 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9#pragma once
10
1e59de90 11#include <memory>
7c673cae
FG
12#include <string>
13
14#include "cache/sharded_cache.h"
1e59de90 15#include "port/lang.h"
f67539c2 16#include "port/malloc.h"
7c673cae 17#include "port/port.h"
1e59de90 18#include "rocksdb/secondary_cache.h"
7c673cae 19#include "util/autovector.h"
1e59de90 20#include "util/distributed_mutex.h"
7c673cae 21
f67539c2 22namespace ROCKSDB_NAMESPACE {
1e59de90 23namespace lru_cache {
7c673cae 24
f67539c2 25// LRU cache implementation. This class is not thread-safe.
7c673cae
FG
26
27// An entry is a variable length heap-allocated structure.
28// Entries are referenced by cache and/or by any external entity.
f67539c2 29// The cache keeps all its entries in a hash table. Some elements
7c673cae
FG
30// are also stored on LRU list.
31//
32// LRUHandle can be in these states:
33// 1. Referenced externally AND in hash table.
f67539c2
TL
34// In that case the entry is *not* in the LRU list
35// (refs >= 1 && in_cache == true)
36// 2. Not referenced externally AND in hash table.
37// In that case the entry is in the LRU list and can be freed.
38// (refs == 0 && in_cache == true)
39// 3. Referenced externally AND not in hash table.
40// In that case the entry is not in the LRU list and not in hash table.
1e59de90 41// The entry must be freed if refs becomes 0 in this state.
f67539c2 42// (refs >= 1 && in_cache == false)
1e59de90
TL
43// If you call LRUCacheShard::Release enough times on an entry in state 1, it
44// will go into state 2. To move from state 1 to state 3, either call
45// LRUCacheShard::Erase or LRUCacheShard::Insert with the same key (but
46// possibly different value). To move from state 2 to state 1, use
47// LRUCacheShard::Lookup.
48// While refs > 0, public properties like value and deleter must not change.
7c673cae
FG
49
50struct LRUHandle {
51 void* value;
1e59de90
TL
52 union Info {
53 Info() {}
54 ~Info() {}
55 Cache::DeleterFn deleter;
56 const Cache::CacheItemHelper* helper;
57 } info_;
58 // An entry is not added to the LRUHandleTable until the secondary cache
59 // lookup is complete, so its safe to have this union.
60 union {
61 LRUHandle* next_hash;
62 SecondaryCacheResultHandle* sec_handle;
63 };
7c673cae
FG
64 LRUHandle* next;
65 LRUHandle* prev;
1e59de90 66 size_t total_charge; // TODO(opt): Only allow uint32_t?
7c673cae 67 size_t key_length;
f67539c2
TL
68 // The hash of key(). Used for fast sharding and comparisons.
69 uint32_t hash;
70 // The number of external refs to this entry. The cache itself is not counted.
71 uint32_t refs;
72
1e59de90
TL
73 // Mutable flags - access controlled by mutex
74 // The m_ and M_ prefixes (and im_ and IM_ later) are to hopefully avoid
75 // checking an M_ flag on im_flags or an IM_ flag on m_flags.
76 uint8_t m_flags;
77 enum MFlags : uint8_t {
f67539c2 78 // Whether this entry is referenced by the hash table.
1e59de90
TL
79 M_IN_CACHE = (1 << 0),
80 // Whether this entry has had any lookups (hits).
81 M_HAS_HIT = (1 << 1),
f67539c2 82 // Whether this entry is in high-pri pool.
1e59de90
TL
83 M_IN_HIGH_PRI_POOL = (1 << 2),
84 // Whether this entry is in low-pri pool.
85 M_IN_LOW_PRI_POOL = (1 << 3),
494da23a
TL
86 };
87
1e59de90
TL
88 // "Immutable" flags - only set in single-threaded context and then
89 // can be accessed without mutex
90 uint8_t im_flags;
91 enum ImFlags : uint8_t {
92 // Whether this entry is high priority entry.
93 IM_IS_HIGH_PRI = (1 << 0),
94 // Whether this entry is low priority entry.
95 IM_IS_LOW_PRI = (1 << 1),
96 // Can this be inserted into the secondary cache.
97 IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2),
98 // Is the handle still being read from a lower tier.
99 IM_IS_PENDING = (1 << 3),
100 // Whether this handle is still in a lower tier
101 IM_IS_IN_SECONDARY_CACHE = (1 << 4),
102 // Marks result handles that should not be inserted into cache
103 IM_IS_STANDALONE = (1 << 5),
104 };
7c673cae 105
f67539c2
TL
106 // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
107 char key_data[1];
7c673cae 108
f67539c2 109 Slice key() const { return Slice(key_data, key_length); }
7c673cae 110
1e59de90
TL
111 // For HandleImpl concept
112 uint32_t GetHash() const { return hash; }
113
f67539c2
TL
114 // Increase the reference count by 1.
115 void Ref() { refs++; }
116
117 // Just reduce the reference count by 1. Return true if it was last reference.
118 bool Unref() {
119 assert(refs > 0);
120 refs--;
121 return refs == 0;
7c673cae
FG
122 }
123
f67539c2
TL
124 // Return true if there are external refs, false otherwise.
125 bool HasRefs() const { return refs > 0; }
126
1e59de90
TL
127 bool InCache() const { return m_flags & M_IN_CACHE; }
128 bool IsHighPri() const { return im_flags & IM_IS_HIGH_PRI; }
129 bool InHighPriPool() const { return m_flags & M_IN_HIGH_PRI_POOL; }
130 bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; }
131 bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; }
132 bool HasHit() const { return m_flags & M_HAS_HIT; }
133 bool IsSecondaryCacheCompatible() const {
134 return im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE;
135 }
136 bool IsPending() const { return im_flags & IM_IS_PENDING; }
137 bool IsInSecondaryCache() const {
138 return im_flags & IM_IS_IN_SECONDARY_CACHE;
139 }
140 bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; }
7c673cae
FG
141
142 void SetInCache(bool in_cache) {
143 if (in_cache) {
1e59de90 144 m_flags |= M_IN_CACHE;
7c673cae 145 } else {
1e59de90 146 m_flags &= ~M_IN_CACHE;
7c673cae
FG
147 }
148 }
149
150 void SetPriority(Cache::Priority priority) {
151 if (priority == Cache::Priority::HIGH) {
1e59de90
TL
152 im_flags |= IM_IS_HIGH_PRI;
153 im_flags &= ~IM_IS_LOW_PRI;
154 } else if (priority == Cache::Priority::LOW) {
155 im_flags &= ~IM_IS_HIGH_PRI;
156 im_flags |= IM_IS_LOW_PRI;
7c673cae 157 } else {
1e59de90
TL
158 im_flags &= ~IM_IS_HIGH_PRI;
159 im_flags &= ~IM_IS_LOW_PRI;
7c673cae
FG
160 }
161 }
162
163 void SetInHighPriPool(bool in_high_pri_pool) {
164 if (in_high_pri_pool) {
1e59de90
TL
165 m_flags |= M_IN_HIGH_PRI_POOL;
166 } else {
167 m_flags &= ~M_IN_HIGH_PRI_POOL;
168 }
169 }
170
171 void SetInLowPriPool(bool in_low_pri_pool) {
172 if (in_low_pri_pool) {
173 m_flags |= M_IN_LOW_PRI_POOL;
7c673cae 174 } else {
1e59de90 175 m_flags &= ~M_IN_LOW_PRI_POOL;
7c673cae
FG
176 }
177 }
178
1e59de90
TL
179 void SetHit() { m_flags |= M_HAS_HIT; }
180
181 void SetSecondaryCacheCompatible(bool compat) {
182 if (compat) {
183 im_flags |= IM_IS_SECONDARY_CACHE_COMPATIBLE;
184 } else {
185 im_flags &= ~IM_IS_SECONDARY_CACHE_COMPATIBLE;
186 }
187 }
188
189 void SetIsPending(bool pending) {
190 if (pending) {
191 im_flags |= IM_IS_PENDING;
192 } else {
193 im_flags &= ~IM_IS_PENDING;
194 }
195 }
196
197 void SetIsInSecondaryCache(bool is_in_secondary_cache) {
198 if (is_in_secondary_cache) {
199 im_flags |= IM_IS_IN_SECONDARY_CACHE;
200 } else {
201 im_flags &= ~IM_IS_IN_SECONDARY_CACHE;
202 }
203 }
204
205 void SetIsStandalone(bool is_standalone) {
206 if (is_standalone) {
207 im_flags |= IM_IS_STANDALONE;
208 } else {
209 im_flags &= ~IM_IS_STANDALONE;
210 }
211 }
11fdf7f2 212
7c673cae 213 void Free() {
f67539c2 214 assert(refs == 0);
1e59de90
TL
215
216 if (!IsSecondaryCacheCompatible() && info_.deleter) {
217 (*info_.deleter)(key(), value);
218 } else if (IsSecondaryCacheCompatible()) {
219 if (IsPending()) {
220 assert(sec_handle != nullptr);
221 SecondaryCacheResultHandle* tmp_sec_handle = sec_handle;
222 tmp_sec_handle->Wait();
223 value = tmp_sec_handle->Value();
224 delete tmp_sec_handle;
225 }
226 if (value) {
227 (*info_.helper->del_cb)(key(), value);
228 }
7c673cae 229 }
1e59de90
TL
230
231 free(this);
7c673cae 232 }
f67539c2 233
1e59de90
TL
234 inline size_t CalcuMetaCharge(
235 CacheMetadataChargePolicy metadata_charge_policy) const {
236 if (metadata_charge_policy != kFullChargeCacheMetadata) {
237 return 0;
238 } else {
f67539c2 239#ifdef ROCKSDB_MALLOC_USABLE_SIZE
1e59de90
TL
240 return malloc_usable_size(
241 const_cast<void*>(static_cast<const void*>(this)));
f67539c2 242#else
1e59de90
TL
243 // This is the size that is used when a new handle is created.
244 return sizeof(LRUHandle) - 1 + key_length;
f67539c2
TL
245#endif
246 }
1e59de90
TL
247 }
248
249 // Calculate the memory usage by metadata.
250 inline void CalcTotalCharge(
251 size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
252 total_charge = charge + CalcuMetaCharge(metadata_charge_policy);
253 }
254
255 inline size_t GetCharge(
256 CacheMetadataChargePolicy metadata_charge_policy) const {
257 size_t meta_charge = CalcuMetaCharge(metadata_charge_policy);
258 assert(total_charge >= meta_charge);
259 return total_charge - meta_charge;
f67539c2 260 }
7c673cae
FG
261};
262
263// We provide our own simple hash table since it removes a whole bunch
264// of porting hacks and is also faster than some of the built-in hash
265// table implementations in some of the compiler/runtime combinations
266// we have tested. E.g., readrandom speeds up by ~5% over the g++
267// 4.4.3's builtin hashtable.
268class LRUHandleTable {
269 public:
1e59de90 270 explicit LRUHandleTable(int max_upper_hash_bits);
7c673cae
FG
271 ~LRUHandleTable();
272
273 LRUHandle* Lookup(const Slice& key, uint32_t hash);
274 LRUHandle* Insert(LRUHandle* h);
275 LRUHandle* Remove(const Slice& key, uint32_t hash);
276
277 template <typename T>
1e59de90
TL
278 void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) {
279 for (size_t i = index_begin; i < index_end; i++) {
7c673cae
FG
280 LRUHandle* h = list_[i];
281 while (h != nullptr) {
282 auto n = h->next_hash;
283 assert(h->InCache());
284 func(h);
285 h = n;
286 }
287 }
288 }
289
1e59de90
TL
290 int GetLengthBits() const { return length_bits_; }
291
292 size_t GetOccupancyCount() const { return elems_; }
293
7c673cae
FG
294 private:
295 // Return a pointer to slot that points to a cache entry that
296 // matches key/hash. If there is no such cache entry, return a
297 // pointer to the trailing slot in the corresponding linked list.
298 LRUHandle** FindPointer(const Slice& key, uint32_t hash);
299
300 void Resize();
301
1e59de90
TL
302 // Number of hash bits (upper because lower bits used for sharding)
303 // used for table index. Length == 1 << length_bits_
304 int length_bits_;
305
7c673cae
FG
306 // The table consists of an array of buckets where each bucket is
307 // a linked list of cache entries that hash into the bucket.
1e59de90
TL
308 std::unique_ptr<LRUHandle*[]> list_;
309
310 // Number of elements currently in the table.
7c673cae 311 uint32_t elems_;
1e59de90
TL
312
313 // Set from max_upper_hash_bits (see constructor).
314 const int max_length_bits_;
7c673cae
FG
315};
316
// A single shard of sharded cache. Not thread-safe by itself except where
// noted: public mutating operations take the internal mutex_.
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
 public:
  LRUCacheShard(size_t capacity, bool strict_capacity_limit,
                double high_pri_pool_ratio, double low_pri_pool_ratio,
                bool use_adaptive_mutex,
                CacheMetadataChargePolicy metadata_charge_policy,
                int max_upper_hash_bits, SecondaryCache* secondary_cache);

 public:  // Type definitions expected as parameter to ShardedCache
  using HandleImpl = LRUHandle;
  using HashVal = uint32_t;
  using HashCref = uint32_t;

 public:  // Function definitions expected as parameter to ShardedCache
  // Hash of the full key; lower bits are used by ShardedCache for sharding.
  static inline HashVal ComputeHash(const Slice& key) {
    return Lower32of64(GetSliceNPHash64(key));
  }

  // Separate from constructor so caller can easily make an array of LRUCache
  // if current usage is more than new capacity, the function will attempt to
  // free the needed space.
  void SetCapacity(size_t capacity);

  // Set the flag to reject insertion if the cache is full.
  void SetStrictCapacityLimit(bool strict_capacity_limit);

  // Set percentage of capacity reserved for high-pri cache entries.
  void SetHighPriorityPoolRatio(double high_pri_pool_ratio);

  // Set percentage of capacity reserved for low-pri cache entries.
  void SetLowPriorityPoolRatio(double low_pri_pool_ratio);

  // Like Cache methods, but with an extra "hash" parameter.
  // Insert with a plain deleter (no secondary cache involvement).
  inline Status Insert(const Slice& key, uint32_t hash, void* value,
                       size_t charge, Cache::DeleterFn deleter,
                       LRUHandle** handle, Cache::Priority priority) {
    return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
  }
  // Insert with a CacheItemHelper, making the entry secondary-cache
  // compatible.
  inline Status Insert(const Slice& key, uint32_t hash, void* value,
                       const Cache::CacheItemHelper* helper, size_t charge,
                       LRUHandle** handle, Cache::Priority priority) {
    assert(helper);
    return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
  }
  // If helper_cb is null, the values of the following arguments don't matter.
  LRUHandle* Lookup(const Slice& key, uint32_t hash,
                    const Cache::CacheItemHelper* helper,
                    const Cache::CreateCallback& create_cb,
                    Cache::Priority priority, bool wait, Statistics* stats);
  // Convenience overload: primary-cache-only lookup.
  inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
    return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
                  nullptr);
  }
  bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
  bool IsReady(LRUHandle* /*handle*/);
  // No-op: synchronous waiting is handled inside Lookup/Release paths.
  void Wait(LRUHandle* /*handle*/) {}
  bool Ref(LRUHandle* handle);
  void Erase(const Slice& key, uint32_t hash);

  // Although in some platforms the update of size_t is atomic, to make sure
  // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
  // protect them with mutex_.

  size_t GetUsage() const;
  size_t GetPinnedUsage() const;
  size_t GetOccupancyCount() const;
  size_t GetTableAddressCount() const;

  // Visit a batch of entries; *state is a cursor advanced across calls so
  // the full table can be covered in chunks of ~average_entries_per_lock.
  void ApplyToSomeEntries(
      const std::function<void(const Slice& key, void* value, size_t charge,
                               DeleterFn deleter)>& callback,
      size_t average_entries_per_lock, size_t* state);

  void EraseUnRefEntries();

 public:  // other function definitions
  void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
                       LRUHandle** lru_bottom_pri);

  // Retrieves number of elements in LRU, for unit test purpose only.
  // Not threadsafe.
  size_t TEST_GetLRUSize();

  // Retrieves high pri pool ratio
  double GetHighPriPoolRatio();

  // Retrieves low pri pool ratio
  double GetLowPriPoolRatio();

  void AppendPrintableOptions(std::string& /*str*/) const;

 private:
  friend class LRUCache;
  // Insert an item into the hash table and, if handle is null, insert into
  // the LRU list. Older items are evicted as necessary. If the cache is full
  // and free_handle_on_fail is true, the item is deleted and handle is set to
  // nullptr.
  Status InsertItem(LRUHandle* item, LRUHandle** handle,
                    bool free_handle_on_fail);
  // Common implementation behind both public Insert overloads.
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                DeleterFn deleter, const Cache::CacheItemHelper* helper,
                LRUHandle** handle, Cache::Priority priority);
  // Promote an item looked up from the secondary cache to the LRU cache.
  // The item may be still in the secondary cache.
  // It is only inserted into the hash table and not the LRU list, and only
  // if the cache is not at full capacity, as is the case during Insert. The
  // caller should hold a reference on the LRUHandle. When the caller releases
  // the last reference, the item is added to the LRU list.
  // The item is promoted to the high pri or low pri pool as specified by the
  // caller in Lookup.
  void Promote(LRUHandle* e);
  void LRU_Remove(LRUHandle* e);
  void LRU_Insert(LRUHandle* e);

  // Overflow the last entry in high-pri pool to low-pri pool until size of
  // high-pri pool is no larger than the size specified by high_pri_pool_pct.
  void MaintainPoolSize();

  // Free some space following strict LRU policy until enough space
  // to hold (usage_ + charge) is freed or the lru list is empty
  // This function is not thread safe - it needs to be executed while
  // holding the mutex_.
  void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);

  // Try to insert the evicted handles into the secondary cache.
  void TryInsertIntoSecondaryCache(autovector<LRUHandle*> evicted_handles);

  // Initialized before use.
  size_t capacity_;

  // Memory size for entries in high-pri pool.
  size_t high_pri_pool_usage_;

  // Memory size for entries in low-pri pool.
  size_t low_pri_pool_usage_;

  // Whether to reject insertion if cache reaches its full capacity.
  bool strict_capacity_limit_;

  // Ratio of capacity reserved for high priority cache entries.
  double high_pri_pool_ratio_;

  // High-pri pool size, equals to capacity * high_pri_pool_ratio.
  // Remember the value to avoid recomputing each time.
  double high_pri_pool_capacity_;

  // Ratio of capacity reserved for low priority cache entries.
  double low_pri_pool_ratio_;

  // Low-pri pool size, equals to capacity * low_pri_pool_ratio.
  // Remember the value to avoid recomputing each time.
  double low_pri_pool_capacity_;

  // Dummy head of LRU list.
  // lru.prev is newest entry, lru.next is oldest entry.
  // LRU contains items which can be evicted, ie reference only by cache
  LRUHandle lru_;

  // Pointer to head of low-pri pool in LRU list.
  LRUHandle* lru_low_pri_;

  // Pointer to head of bottom-pri pool in LRU list.
  LRUHandle* lru_bottom_pri_;

  // ------------^^^^^^^^^^^^^-----------
  // Not frequently modified data members
  // ------------------------------------
  //
  // We separate data members that are updated frequently from the ones that
  // are not frequently updated so that they don't share the same cache line
  // which will lead into false cache sharing
  //
  // ------------------------------------
  // Frequently modified data members
  // ------------vvvvvvvvvvvvv-----------
  LRUHandleTable table_;

  // Memory size for entries residing in the cache.
  size_t usage_;

  // Memory size for entries residing only in the LRU list.
  size_t lru_usage_;

  // mutex_ protects the following state.
  // We don't count mutex_ as the cache's internal state so semantically we
  // don't mind mutex_ invoking the non-const actions.
  mutable DMutex mutex_;

  // Owned by LRUCache
  SecondaryCache* secondary_cache_;
};
509
494da23a
TL
// Sharded LRU cache: forwards to one LRUCacheShard per shard.
// (`final` only in release builds so tests can subclass it.)
class LRUCache
#ifdef NDEBUG
    final
#endif
    : public ShardedCache<LRUCacheShard> {
 public:
  LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
           double high_pri_pool_ratio, double low_pri_pool_ratio,
           std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
           bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
           CacheMetadataChargePolicy metadata_charge_policy =
               kDontChargeCacheMetadata,
           std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
  const char* Name() const override { return "LRUCache"; }
  void* Value(Handle* handle) override;
  size_t GetCharge(Handle* handle) const override;
  DeleterFn GetDeleter(Handle* handle) const override;
  void WaitAll(std::vector<Handle*>& handles) override;

  // Retrieves number of elements in LRU, for unit test purpose only.
  size_t TEST_GetLRUSize();
  // Retrieves high pri pool ratio.
  double GetHighPriPoolRatio();

  void AppendPrintableOptions(std::string& str) const override;

 private:
  // Keeps the secondary cache alive; shards hold raw pointers into it.
  std::shared_ptr<SecondaryCache> secondary_cache_;
};
539
1e59de90
TL
}  // namespace lru_cache

// Re-export the implementation types at ROCKSDB_NAMESPACE scope for callers.
using LRUCache = lru_cache::LRUCache;
using LRUHandle = lru_cache::LRUHandle;
using LRUCacheShard = lru_cache::LRUCacheShard;

}  // namespace ROCKSDB_NAMESPACE