// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
//
// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 and Apache 2.0 License
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef ROCKSDB_BINNED_LRU_CACHE
#define ROCKSDB_BINNED_LRU_CACHE

#include <string>
#include <mutex>

#include "ShardedCache.h"
#include "common/autovector.h"
#include "common/dout.h"
#include "include/ceph_assert.h"
#include "common/ceph_context.h"

namespace rocksdb_cache {

// LRU cache implementation

// An entry is a variable-length heap-allocated structure.
// Entries are referenced by the cache and/or by any external entity.
// The cache keeps all its entries in a hash table. Some elements
// are also stored on the LRU list.
//
// BinnedLRUHandle can be in these states:
// 1. Referenced externally AND in the hash table.
//    In that case the entry is *not* in the LRU list. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in the hash table. In that case the entry
//    is in the LRU list and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in the hash table. In that case the entry
//    is not on the LRU list and not in the table. (refs >= 1 && in_cache == false)
//
// All newly created BinnedLRUHandles are in state 1. If you call
// BinnedLRUCacheShard::Release on an entry in state 1, it will go into
// state 2. To move from state 1 to state 3, either call
// BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the same key.
// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert
// has a matching BinnedLRUCacheShard::Release (to move into state 2) or
// BinnedLRUCacheShard::Erase (for state 3).

std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
    CephContext *c,
    size_t capacity,
    int num_shard_bits = -1,
    bool strict_capacity_limit = false,
    double high_pri_pool_ratio = 0.0);

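// Illustrative usage sketch (not part of this header's API): how a caller
// might create a binned LRU cache and drive the insert/lookup/release cycle
// described above through the generic rocksdb::Cache interface. The `cct`,
// `key`, `value`, `charge`, and `deleter` names below are hypothetical
// placeholders supplied by the caller.
//
//   auto cache = NewBinnedLRUCache(cct,
//                                  64 << 20,  // 64 MiB capacity
//                                  4,         // 2^4 = 16 shards
//                                  false,     // don't reject inserts when full
//                                  0.5);      // half the capacity for high-pri
//   rocksdb::Cache::Handle* handle = nullptr;
//   cache->Insert(key, value, charge, deleter, &handle,
//                 rocksdb::Cache::Priority::HIGH);  // handle pinned: state 1
//   // ... use the entry ...
//   cache->Release(handle);  // unpinned, back on the LRU list: state 2
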
struct BinnedLRUHandle {
  void* value;
  void (*deleter)(const rocksdb::Slice&, void* value);
  BinnedLRUHandle* next_hash;
  BinnedLRUHandle* next;
  BinnedLRUHandle* prev;
  size_t charge;  // TODO(opt): Only allow uint32_t?
  size_t key_length;
  uint32_t refs;  // number of refs to this entry; the cache itself is
                  // counted as 1

  // Includes the following flags:
  //   in_cache:         whether this entry is referenced by the hash table.
  //   is_high_pri:      whether this is a high-priority entry.
  //   in_high_pri_pool: whether this entry is in the high-pri pool.
  char flags;

  uint32_t hash;  // Hash of key(); used for fast sharding and comparisons

  char* key_data = nullptr;  // Beginning of key

  rocksdb::Slice key() const {
    // For cheaper lookups, we allow a temporary Handle object
    // to store a pointer to a key in "value".
    if (next == this) {
      return *(reinterpret_cast<rocksdb::Slice*>(value));
    } else {
      return rocksdb::Slice(key_data, key_length);
    }
  }

  bool InCache() { return flags & 1; }
  bool IsHighPri() { return flags & 2; }
  bool InHighPriPool() { return flags & 4; }
  bool HasHit() { return flags & 8; }

  void SetInCache(bool in_cache) {
    if (in_cache) {
      flags |= 1;
    } else {
      flags &= ~1;
    }
  }

  void SetPriority(rocksdb::Cache::Priority priority) {
    if (priority == rocksdb::Cache::Priority::HIGH) {
      flags |= 2;
    } else {
      flags &= ~2;
    }
  }

  void SetInHighPriPool(bool in_high_pri_pool) {
    if (in_high_pri_pool) {
      flags |= 4;
    } else {
      flags &= ~4;
    }
  }

  void SetHit() { flags |= 8; }

  void Free() {
    ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
    if (deleter) {
      (*deleter)(key(), value);
    }
    delete[] key_data;
    delete this;
  }
};

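// Illustrative sketch of the key() sentinel above: a lookup can build a
// temporary stack-allocated handle whose `value` points at a rocksdb::Slice
// holding the probe key, with `next` pointing at itself so that key() takes
// the "pointer stored in value" branch and no key bytes need to be copied.
// `probe` and `tmp` below are hypothetical names:
//
//   rocksdb::Slice probe("some_key");
//   BinnedLRUHandle tmp;
//   tmp.value = &probe;
//   tmp.next = &tmp;   // sentinel: key() reads the Slice through value
//   ceph_assert(tmp.key() == probe);
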
// We provide our own simple hash table since it removes a whole bunch
// of porting hacks and is also faster than some of the built-in hash
// table implementations in some of the compiler/runtime combinations
// we have tested. E.g., readrandom speeds up by ~5% over g++ 4.4.3's
// built-in hashtable.
class BinnedLRUHandleTable {
 public:
  BinnedLRUHandleTable();
  ~BinnedLRUHandleTable();

  BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash);
  BinnedLRUHandle* Insert(BinnedLRUHandle* h);
  BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash);

  template <typename T>
  void ApplyToAllCacheEntries(T func) {
    for (uint32_t i = 0; i < length_; i++) {
      BinnedLRUHandle* h = list_[i];
      while (h != nullptr) {
        auto n = h->next_hash;
        ceph_assert(h->InCache());
        func(h);
        h = n;
      }
    }
  }

 private:
  // Return a pointer to slot that points to a cache entry that
  // matches key/hash. If there is no such cache entry, return a
  // pointer to the trailing slot in the corresponding linked list.
  BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash);

  void Resize();

  // The table consists of an array of buckets where each bucket is
  // a linked list of cache entries that hash into the bucket.
  BinnedLRUHandle** list_;
  uint32_t length_;
  uint32_t elems_;
};

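// Illustrative sketch: ApplyToAllCacheEntries walks every bucket chain and
// invokes the functor on each live handle, so a caller holding the shard's
// mutex could, for example, sum the charges of all cached entries. The
// `table` object below is a hypothetical instance:
//
//   size_t total_charge = 0;
//   table.ApplyToAllCacheEntries([&total_charge](BinnedLRUHandle* h) {
//     total_charge += h->charge;
//   });
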
// A single shard of sharded cache.
class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
 public:
  BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
                      double high_pri_pool_ratio);
  virtual ~BinnedLRUCacheShard();

  // Separate from the constructor so the caller can easily make an array of
  // BinnedLRUCache. If current usage is more than the new capacity, the
  // function will attempt to free the needed space.
  virtual void SetCapacity(size_t capacity) override;

  // Set the flag to reject insertion if the cache is full.
  virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;

  // Set percentage of capacity reserved for high-pri cache entries.
  void SetHighPriPoolRatio(double high_pri_pool_ratio);
  // Like Cache methods, but with an extra "hash" parameter.
  virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
                                 size_t charge,
                                 void (*deleter)(const rocksdb::Slice& key, void* value),
                                 rocksdb::Cache::Handle** handle,
                                 rocksdb::Cache::Priority priority) override;
  virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override;
  virtual bool Ref(rocksdb::Cache::Handle* handle) override;
  virtual bool Release(rocksdb::Cache::Handle* handle,
                       bool force_erase = false) override;
  virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override;

  // Although on some platforms updates of size_t are atomic, to make sure
  // GetUsage() and GetPinnedUsage() work correctly on any platform, we
  // protect them with mutex_.

  virtual size_t GetUsage() const override;
  virtual size_t GetPinnedUsage() const override;

  virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                      bool thread_safe) override;

  virtual void EraseUnRefEntries() override;

  virtual std::string GetPrintableOptions() const override;

  void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri);

  // Retrieves number of elements in the LRU list, for unit test purposes only.
  // Not thread safe.
  size_t TEST_GetLRUSize();

  // Retrieves high-pri pool ratio
  double GetHighPriPoolRatio() const;

  // Retrieves high-pri pool usage
  size_t GetHighPriPoolUsage() const;

 private:
  CephContext *cct;
  void LRU_Remove(BinnedLRUHandle* e);
  void LRU_Insert(BinnedLRUHandle* e);

  // Overflow the last entry in the high-pri pool to the low-pri pool until the
  // size of the high-pri pool is no larger than the size specified by
  // high_pri_pool_ratio.
  void MaintainPoolSize();

  // Just reduce the reference count by 1.
  // Return true if it was the last reference.
  bool Unref(BinnedLRUHandle* e);

  // Free some space following strict LRU policy until enough space
  // to hold (usage_ + charge) is freed or the LRU list is empty.
  // This function is not thread safe; it needs to be executed while
  // holding mutex_.
  void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted);

  // Initialized before use.
  size_t capacity_;

  // Memory size for entries in the high-pri pool.
  size_t high_pri_pool_usage_;

  // Whether to reject insertion if the cache reaches its full capacity.
  bool strict_capacity_limit_;

  // Ratio of capacity reserved for high-priority cache entries.
  double high_pri_pool_ratio_;

  // High-pri pool size, equal to capacity * high_pri_pool_ratio.
  // Remember the value to avoid recomputing each time.
  double high_pri_pool_capacity_;

  // Dummy head of LRU list.
  // lru.prev is the newest entry, lru.next is the oldest entry.
  // The LRU list contains items which can be evicted, i.e. referenced only by
  // the cache.
  BinnedLRUHandle lru_;

  // Pointer to head of low-pri pool in LRU list.
  BinnedLRUHandle* lru_low_pri_;

  // ------------^^^^^^^^^^^^^-----------
  // Not frequently modified data members
  // ------------------------------------
  //
  // We separate data members that are updated frequently from the ones that
  // are not frequently updated so that they don't share the same cache line,
  // which would lead to false sharing.
  //
  // ------------------------------------
  // Frequently modified data members
  // ------------vvvvvvvvvvvvv-----------
  BinnedLRUHandleTable table_;

  // Memory size for entries residing in the cache.
  size_t usage_;

  // Memory size for entries residing only in the LRU list.
  size_t lru_usage_;

  // mutex_ protects the following state.
  // We don't count mutex_ as part of the cache's internal state, so
  // semantically we don't mind const methods locking it (hence it is mutable).
  mutable std::mutex mutex_;
};

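// Illustrative sketch: the shard methods take the key's precomputed 32-bit
// hash (the sharding layer uses it to pick the shard, and the shard reuses it
// for its hash-table bucket). A hypothetical caller pairing Lookup with
// Release might look like this; `shard`, `key`, and `hash` are placeholders:
//
//   rocksdb::Cache::Handle* h = shard.Lookup(key, hash);
//   if (h != nullptr) {
//     // ... read the cached value, e.g. via BinnedLRUCache::Value(h) ...
//     shard.Release(h);  // drop the external ref; entry may rejoin the LRU
//   }
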
class BinnedLRUCache : public ShardedCache {
 public:
  BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits,
                 bool strict_capacity_limit, double high_pri_pool_ratio);
  virtual ~BinnedLRUCache();
  virtual const char* Name() const override { return "BinnedLRUCache"; }
  virtual CacheShard* GetShard(int shard) override;
  virtual const CacheShard* GetShard(int shard) const override;
  virtual void* Value(Handle* handle) override;
  virtual size_t GetCharge(Handle* handle) const override;
  virtual uint32_t GetHash(Handle* handle) const override;
  virtual void DisownData() override;

  // Retrieves number of elements in the LRU list, for unit test purposes only
  size_t TEST_GetLRUSize();
  // Sets the high-pri pool ratio
  void SetHighPriPoolRatio(double high_pri_pool_ratio);
  // Retrieves high-pri pool ratio
  double GetHighPriPoolRatio() const;
  // Retrieves high-pri pool usage
  size_t GetHighPriPoolUsage() const;

  // PriorityCache
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const;
  virtual int64_t commit_cache_size(uint64_t total_cache);
  virtual int64_t get_committed_size() const {
    return GetCapacity();
  }
  virtual std::string get_cache_name() const {
    return "RocksDB Binned LRU Cache";
  }

 private:
  CephContext *cct;
  BinnedLRUCacheShard* shards_;
  int num_shards_ = 0;
};

} // namespace rocksdb_cache

#endif // ROCKSDB_BINNED_LRU_CACHE