// ceph/src/kv/rocksdb_cache/BinnedLRUCache.h (ceph quincy 17.2.1)
// Copyright (c) 2018-Present Red Hat Inc. All rights reserved.
//
// Copyright (c) 2011-2018, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 and Apache 2.0 License
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_BINNED_LRU_CACHE
#define ROCKSDB_BINNED_LRU_CACHE

#include <mutex>
#include <string>

#include <boost/circular_buffer.hpp>

#include "ShardedCache.h"
#include "common/autovector.h"
#include "common/ceph_context.h"
#include "common/dout.h"
#include "include/ceph_assert.h"

namespace rocksdb_cache {

// LRU cache implementation

// An entry is a variable length heap-allocated structure.
// Entries are referenced by cache and/or by any external entity.
// The cache keeps all its entries in a hash table. Some elements
// are also stored on an LRU list.
//
// BinnedLRUHandle can be in these states:
// 1. Referenced externally AND in hash table.
//    In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true)
// 2. Not referenced externally and in hash table. In that case the entry is
//    in the LRU and can be freed. (refs == 1 && in_cache == true)
// 3. Referenced externally and not in hash table. In that case the entry is
//    not on the LRU and not in the table. (refs >= 1 && in_cache == false)
//
// All newly created BinnedLRUHandles are in state 1. If you call
// BinnedLRUCacheShard::Release on an entry in state 1, it will go into
// state 2. To move from state 1 to state 3, either call
// BinnedLRUCacheShard::Erase or BinnedLRUCacheShard::Insert with the
// same key.
// To move from state 2 to state 1, use BinnedLRUCacheShard::Lookup.
// Before destruction, make sure that no handles are in state 1. This means
// that any successful BinnedLRUCacheShard::Lookup/BinnedLRUCacheShard::Insert
// have a matching BinnedLRUCacheShard::Release (to move into state 2) or
// BinnedLRUCacheShard::Erase (for state 3).

51 std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
52 CephContext *c,
53 size_t capacity,
54 int num_shard_bits = -1,
55 bool strict_capacity_limit = false,
56 double high_pri_pool_ratio = 0.0);
57
58 struct BinnedLRUHandle {
59 std::shared_ptr<uint64_t> age_bin;
60 void* value;
61 DeleterFn deleter;
62 BinnedLRUHandle* next_hash;
63 BinnedLRUHandle* next;
64 BinnedLRUHandle* prev;
65 size_t charge; // TODO(opt): Only allow uint32_t?
66 size_t key_length;
67 uint32_t refs; // a number of refs to this entry
68 // cache itself is counted as 1
69
70 // Include the following flags:
71 // in_cache: whether this entry is referenced by the hash table.
72 // is_high_pri: whether this entry is high priority entry.
73 // in_high_pri_pool: whether this entry is in high-pri pool.
74 char flags;
75
76 uint32_t hash; // Hash of key(); used for fast sharding and comparisons
77
78 char* key_data = nullptr; // Beginning of key
79
80 rocksdb::Slice key() const {
81 // For cheaper lookups, we allow a temporary Handle object
82 // to store a pointer to a key in "value".
83 if (next == this) {
84 return *(reinterpret_cast<rocksdb::Slice*>(value));
85 } else {
86 return rocksdb::Slice(key_data, key_length);
87 }
88 }
89
90 bool InCache() { return flags & 1; }
91 bool IsHighPri() { return flags & 2; }
92 bool InHighPriPool() { return flags & 4; }
93 bool HasHit() { return flags & 8; }
94
95 void SetInCache(bool in_cache) {
96 if (in_cache) {
97 flags |= 1;
98 } else {
99 flags &= ~1;
100 }
101 }
102
103 void SetPriority(rocksdb::Cache::Priority priority) {
104 if (priority == rocksdb::Cache::Priority::HIGH) {
105 flags |= 2;
106 } else {
107 flags &= ~2;
108 }
109 }
110
111 void SetInHighPriPool(bool in_high_pri_pool) {
112 if (in_high_pri_pool) {
113 flags |= 4;
114 } else {
115 flags &= ~4;
116 }
117 }
118
119 void SetHit() { flags |= 8; }
120
121 void Free() {
122 ceph_assert((refs == 1 && InCache()) || (refs == 0 && !InCache()));
123 if (deleter) {
124 (*deleter)(key(), value);
125 }
126 delete[] key_data;
127 delete this;
128 }
129 };
130
131 // We provide our own simple hash table since it removes a whole bunch
132 // of porting hacks and is also faster than some of the built-in hash
133 // table implementations in some of the compiler/runtime combinations
134 // we have tested. E.g., readrandom speeds up by ~5% over the g++
135 // 4.4.3's builtin hashtable.
136 class BinnedLRUHandleTable {
137 public:
138 BinnedLRUHandleTable();
139 ~BinnedLRUHandleTable();
140
141 BinnedLRUHandle* Lookup(const rocksdb::Slice& key, uint32_t hash);
142 BinnedLRUHandle* Insert(BinnedLRUHandle* h);
143 BinnedLRUHandle* Remove(const rocksdb::Slice& key, uint32_t hash);
144
145 template <typename T>
146 void ApplyToAllCacheEntries(T func) {
147 for (uint32_t i = 0; i < length_; i++) {
148 BinnedLRUHandle* h = list_[i];
149 while (h != nullptr) {
150 auto n = h->next_hash;
151 ceph_assert(h->InCache());
152 func(h);
153 h = n;
154 }
155 }
156 }
157
158 private:
159 // Return a pointer to slot that points to a cache entry that
160 // matches key/hash. If there is no such cache entry, return a
161 // pointer to the trailing slot in the corresponding linked list.
162 BinnedLRUHandle** FindPointer(const rocksdb::Slice& key, uint32_t hash);
163
164 void Resize();
165
166 // The table consists of an array of buckets where each bucket is
167 // a linked list of cache entries that hash into the bucket.
168 BinnedLRUHandle** list_;
169 uint32_t length_;
170 uint32_t elems_;
171 };
172
173 // A single shard of sharded cache.
174 class alignas(CACHE_LINE_SIZE) BinnedLRUCacheShard : public CacheShard {
175 public:
176 BinnedLRUCacheShard(CephContext *c, size_t capacity, bool strict_capacity_limit,
177 double high_pri_pool_ratio);
178 virtual ~BinnedLRUCacheShard();
179
180 // Separate from constructor so caller can easily make an array of BinnedLRUCache
181 // if current usage is more than new capacity, the function will attempt to
182 // free the needed space
183 virtual void SetCapacity(size_t capacity) override;
184
185 // Set the flag to reject insertion if cache if full.
186 virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override;
187
188 // Set percentage of capacity reserved for high-pri cache entries.
189 void SetHighPriPoolRatio(double high_pri_pool_ratio);
190
191 // Like Cache methods, but with an extra "hash" parameter.
192 virtual rocksdb::Status Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
193 size_t charge,
194 DeleterFn deleter,
195 rocksdb::Cache::Handle** handle,
196 rocksdb::Cache::Priority priority) override;
197 virtual rocksdb::Cache::Handle* Lookup(const rocksdb::Slice& key, uint32_t hash) override;
198 virtual bool Ref(rocksdb::Cache::Handle* handle) override;
199 virtual bool Release(rocksdb::Cache::Handle* handle,
200 bool force_erase = false) override;
201 virtual void Erase(const rocksdb::Slice& key, uint32_t hash) override;
202
203 // Although in some platforms the update of size_t is atomic, to make sure
204 // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
205 // protect them with mutex_.
206
207 virtual size_t GetUsage() const override;
208 virtual size_t GetPinnedUsage() const override;
209
210 virtual void ApplyToAllCacheEntries(
211 const std::function<void(const rocksdb::Slice& key,
212 void* value,
213 size_t charge,
214 DeleterFn)>& callback,
215 bool thread_safe) override;
216
217 virtual void EraseUnRefEntries() override;
218
219 virtual std::string GetPrintableOptions() const override;
220
221 virtual DeleterFn GetDeleter(rocksdb::Cache::Handle* handle) const override;
222
223 void TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri);
224
225 // Retrieves number of elements in LRU, for unit test purpose only
226 // not threadsafe
227 size_t TEST_GetLRUSize();
228
229 // Retrieves high pri pool ratio
230 double GetHighPriPoolRatio() const;
231
232 // Retrieves high pri pool usage
233 size_t GetHighPriPoolUsage() const;
234
235 // Rotate the bins
236 void shift_bins();
237
238 // Get the bin count
239 uint32_t get_bin_count() const;
240
241 // Set the bin count
242 void set_bin_count(uint32_t count);
243
244 // Get the byte counts for a range of age bins
245 uint64_t sum_bins(uint32_t start, uint32_t end) const;
246
247 private:
248 CephContext *cct;
249 void LRU_Remove(BinnedLRUHandle* e);
250 void LRU_Insert(BinnedLRUHandle* e);
251
252 // Overflow the last entry in high-pri pool to low-pri pool until size of
253 // high-pri pool is no larger than the size specify by high_pri_pool_pct.
254 void MaintainPoolSize();
255
256 // Just reduce the reference count by 1.
257 // Return true if last reference
258 bool Unref(BinnedLRUHandle* e);
259
260 // Free some space following strict LRU policy until enough space
261 // to hold (usage_ + charge) is freed or the lru list is empty
262 // This function is not thread safe - it needs to be executed while
263 // holding the mutex_
264 void EvictFromLRU(size_t charge, ceph::autovector<BinnedLRUHandle*>* deleted);
265
266 // Initialized before use.
267 size_t capacity_;
268
269 // Memory size for entries in high-pri pool.
270 size_t high_pri_pool_usage_;
271
272 // Whether to reject insertion if cache reaches its full capacity.
273 bool strict_capacity_limit_;
274
275 // Ratio of capacity reserved for high priority cache entries.
276 double high_pri_pool_ratio_;
277
278 // High-pri pool size, equals to capacity * high_pri_pool_ratio.
279 // Remember the value to avoid recomputing each time.
280 double high_pri_pool_capacity_;
281
282 // Dummy head of LRU list.
283 // lru.prev is newest entry, lru.next is oldest entry.
284 // LRU contains items which can be evicted, ie reference only by cache
285 BinnedLRUHandle lru_;
286
287 // Pointer to head of low-pri pool in LRU list.
288 BinnedLRUHandle* lru_low_pri_;
289
290 // ------------^^^^^^^^^^^^^-----------
291 // Not frequently modified data members
292 // ------------------------------------
293 //
294 // We separate data members that are updated frequently from the ones that
295 // are not frequently updated so that they don't share the same cache line
296 // which will lead into false cache sharing
297 //
298 // ------------------------------------
299 // Frequently modified data members
300 // ------------vvvvvvvvvvvvv-----------
301 BinnedLRUHandleTable table_;
302
303 // Memory size for entries residing in the cache
304 size_t usage_;
305
306 // Memory size for entries residing only in the LRU list
307 size_t lru_usage_;
308
309 // mutex_ protects the following state.
310 // We don't count mutex_ as the cache's internal state so semantically we
311 // don't mind mutex_ invoking the non-const actions.
312 mutable std::mutex mutex_;
313
314 // Circular buffer of byte counters for age binning
315 boost::circular_buffer<std::shared_ptr<uint64_t>> age_bins;
316 };
317
318 class BinnedLRUCache : public ShardedCache {
319 public:
320 BinnedLRUCache(CephContext *c, size_t capacity, int num_shard_bits,
321 bool strict_capacity_limit, double high_pri_pool_ratio);
322 virtual ~BinnedLRUCache();
323 virtual const char* Name() const override { return "BinnedLRUCache"; }
324 virtual CacheShard* GetShard(int shard) override;
325 virtual const CacheShard* GetShard(int shard) const override;
326 virtual void* Value(Handle* handle) override;
327 virtual size_t GetCharge(Handle* handle) const override;
328 virtual uint32_t GetHash(Handle* handle) const override;
329 virtual void DisownData() override;
330 #if (ROCKSDB_MAJOR >= 7 || (ROCKSDB_MAJOR == 6 && ROCKSDB_MINOR >= 22))
331 virtual DeleterFn GetDeleter(Handle* handle) const override;
332 #endif
333 // Retrieves number of elements in LRU, for unit test purpose only
334 size_t TEST_GetLRUSize();
335 // Sets the high pri pool ratio
336 void SetHighPriPoolRatio(double high_pri_pool_ratio);
337 // Retrieves high pri pool ratio
338 double GetHighPriPoolRatio() const;
339 // Retrieves high pri pool usage
340 size_t GetHighPriPoolUsage() const;
341
342 // PriorityCache
343 virtual int64_t request_cache_bytes(
344 PriorityCache::Priority pri, uint64_t total_cache) const;
345 virtual int64_t commit_cache_size(uint64_t total_cache);
346 virtual int64_t get_committed_size() const {
347 return GetCapacity();
348 }
349 virtual void shift_bins();
350 uint64_t sum_bins(uint32_t start, uint32_t end) const;
351 uint32_t get_bin_count() const;
352 void set_bin_count(uint32_t count);
353
354 virtual std::string get_cache_name() const {
355 return "RocksDB Binned LRU Cache";
356 }
357
358 private:
359 CephContext *cct;
360 BinnedLRUCacheShard* shards_;
361 int num_shards_ = 0;
362 };
363
}  // namespace rocksdb_cache

#endif  // ROCKSDB_BINNED_LRU_CACHE