// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "cache/clock_cache.h"

#ifndef SUPPORT_CLOCK_CACHE

namespace rocksdb {

std::shared_ptr<Cache> NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/,
                                     bool /*strict_capacity_limit*/) {
  // Clock cache not supported.
  return nullptr;
}
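
// When SUPPORT_CLOCK_CACHE is not defined (e.g. when RocksDB is built without
// TBB), NewClockCache() returns nullptr, so callers are expected to check the
// result and fall back to another cache implementation. A minimal sketch,
// added here for illustration (capacity, num_shard_bits and
// strict_capacity_limit stand for the caller's configuration):
//
//   std::shared_ptr<Cache> cache =
//       NewClockCache(capacity, num_shard_bits, strict_capacity_limit);
//   if (!cache) {
//     cache = NewLRUCache(capacity);  // e.g. fall back to the LRU cache
//   }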

}  // namespace rocksdb

#else

#include <assert.h>
#include <atomic>
#include <deque>

// "tbb/concurrent_hash_map.h" requires RTTI if exceptions are enabled.
// Disable them so users can choose to disable RTTI.
#ifndef ROCKSDB_USE_RTTI
#define TBB_USE_EXCEPTIONS 0
#endif
#include "tbb/concurrent_hash_map.h"

#include "cache/sharded_cache.h"
#include "port/port.h"
#include "util/autovector.h"
#include "util/mutexlock.h"

namespace rocksdb {

namespace {

// An implementation of the Cache interface based on the CLOCK algorithm, with
// better concurrent performance than LRUCache. The idea of the CLOCK algorithm
// is to maintain all cache entries in a circular list, with an iterator
// (the "head") pointing to the last examined entry. Eviction starts from the
// current head. Each entry is given a second chance before eviction if it
// has been accessed since it was last examined. In contrast to LRU, no
// modification of the internal data structure (except for flipping the usage
// bit) needs to be done upon lookup. This gives us the opportunity to
// implement a cache with better concurrency.
//
// Each cache entry is represented by a cache handle, and all the handles
// are arranged in a circular list, as described above. Upon erase of an entry,
// we never remove the handle. Instead, the handle is put into a recycle bin
// to be re-used. This avoids memory deallocation, which is hard to deal
// with in a concurrent environment.
//
// The cache also maintains a concurrent hash map for lookup. Any concurrent
// hash map implementation should do the job. We currently use
// tbb::concurrent_hash_map because it supports concurrent erase.
//
// Each cache handle has the following flags and counters, which are squeezed
// into an atomic integer to make sure the handle is always in a consistent
// state:
//
//   * In-cache bit: whether the entry is referenced by the cache itself. If
//     an entry is in cache, its key is also available in the hash map.
//   * Usage bit: whether the entry has been accessed by a user since it was
//     last examined for eviction. Can be reset by eviction.
//   * Reference count: number of references held by users.
//
// An entry can be referenced only when it is in cache. An entry can be evicted
// only when it is in cache, has not been used since it was last examined, and
// its reference count is zero.
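//
// For illustration (an added decoding sketch, using the kInCacheBit, kUsageBit
// and kRefsOffset constants defined in ClockCacheShard below): a flags value
// of 0x7 = 0b111 means the entry is in cache, has been used since the last
// examination, and is held by one user reference:
//
//   uint32_t flags = 0x7;
//   bool in_cache = flags & kInCacheBit;   // true  (lowest bit)
//   bool usage    = flags & kUsageBit;     // true  (second lowest bit)
//   uint32_t refs = flags >> kRefsOffset;  // 1     (remaining bits)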
//
// The following figure shows a possible layout of the cache. Boxes represent
// cache handles, and the numbers in each box are the in-cache bit, usage bit
// and reference count, respectively.
//
//   hash map:
//    +-------+--------+
//    |  key  | handle |
//    +-------+--------+
//    | "foo" |    5   |-------------------------------------+
//    +-------+--------+                                      |
//    | "bar" |    2   |--+                                   |
//    +-------+--------+  |                                   |
//                        |                                   |
//                   head |                                   |
//                     |  |                                   |
//   circular list:    |  |                                   |
//       +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//       |(0,0,0)|---|(1,1,0)|---|(0,0,0)|---|(0,1,3)|---|(1,0,0)|---|  ...
//       +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//           |                       |
//           +-------+   +-----------+
//                   |   |
//                 +---+---+
//   recycle bin:  | 1 | 3 |
//                 +---+---+
//
// Suppose we try to insert "baz" into the cache at this point and the cache is
// full. The cache will first look for entries to evict, starting from where
// the head points to (the second entry). It resets the usage bit of the second
// entry, skips the third and fourth entries since they are not in cache, and
// finally evicts the fifth entry ("foo"). It then looks in the recycle bin for
// an available handle, grabs handle 3, and inserts the key into that handle.
// The following figure shows the resulting layout.
//
//   hash map:
//    +-------+--------+
//    |  key  | handle |
//    +-------+--------+
//    | "baz" |    3   |-------------+
//    +-------+--------+             |
//    | "bar" |    2   |--+          |
//    +-------+--------+  |          |
//                        |          |
//                        |          |                               head
//                        |          |                                 |
//   circular list:       |          |                                 |
//       +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//       |(0,0,0)|---|(1,0,0)|---|(1,0,0)|---|(0,1,3)|---|(0,0,0)|---|  ...
//       +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//           |                                               |
//           +-------+   +-----------------------------------+
//                   |   |
//                 +---+---+
//   recycle bin:  | 1 | 5 |
//                 +---+---+
//
// A global mutex guards the circular list, the head, and the recycle bin.
// We additionally require that modifying the hash map has to hold the mutex.
// As such, modifying the cache (e.g. Insert() and Erase()) requires holding
// the mutex. Lookup() only accesses the hash map and the flags associated
// with each handle, and doesn't require explicit locking. Release() has to
// acquire the mutex only when it releases the last reference to the entry and
// the entry has been erased from the cache explicitly. A future improvement
// could be to remove the mutex completely.
//
// Benchmark:
// We run readrandom db_bench on a test DB of size 13GB, with the size of each
// level:
//
// Level   Files   Size(MB)
// -------------------------
// L0          1       0.01
// L1         18      17.32
// L2        230     182.94
// L3       1186    1833.63
// L4       4602    8140.30
//
// We test with both 32 and 16 read threads, with 2GB cache size (the whole DB
// doesn't fit in) and 64GB cache size (the whole DB can fit in cache), and
// with and without putting index and filter blocks in the block cache. The
// benchmark runs with RocksDB 4.10. We got the following results:
//
// Threads Cache    Cache              ClockCache             LRUCache
//         Size     Index/Filter Throughput(MB/s)  Hit  Throughput(MB/s)  Hit
// 32      2GB      yes                     466.7  85.9%           433.7  86.5%
// 32      2GB      no                      529.9  72.7%           532.7  73.9%
// 32      64GB     yes                     649.9  99.9%           507.9  99.9%
// 32      64GB     no                      740.4  99.9%           662.8  99.9%
// 16      2GB      yes                     278.4  85.9%           283.4  86.5%
// 16      2GB      no                      318.6  72.7%           335.8  73.9%
// 16      64GB     yes                     391.9  99.9%           353.3  99.9%
// 16      64GB     no                      433.8  99.8%           419.4  99.8%
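//
// For reference, a minimal usage sketch of the resulting cache (an added
// illustration; it assumes the generic rocksdb::Cache interface declared in
// include/rocksdb/cache.h, and value, charge and deleter stand in for the
// caller's data; capacity and shard count below are example values):
//
//   std::shared_ptr<Cache> cache =
//       NewClockCache(64 << 20 /* capacity */, 6 /* num_shard_bits */,
//                     false /* strict_capacity_limit */);
//   if (cache != nullptr) {  // nullptr when built without SUPPORT_CLOCK_CACHE
//     cache->Insert("key", value, charge, deleter);
//     Cache::Handle* handle = cache->Lookup("key");
//     if (handle != nullptr) {
//       void* v = cache->Value(handle);
//       cache->Release(handle);
//     }
//   }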

// Cache entry metadata.
struct CacheHandle {
  Slice key;
  uint32_t hash;
  void* value;
  size_t charge;
  void (*deleter)(const Slice&, void* value);

  // Flags and counters associated with the cache handle:
  //   lowest bit: in-cache bit
  //   second lowest bit: usage bit
  //   the rest of the bits: reference count
  // The handle is unused when flags equals 0. The thread that decreases the
  // count to 0 is responsible for putting the handle back into recycle_ and
  // cleaning up memory.
  std::atomic<uint32_t> flags;

  CacheHandle() = default;

  CacheHandle(const CacheHandle& a) { *this = a; }

  CacheHandle(const Slice& k, void* v,
              void (*del)(const Slice& key, void* value))
      : key(k), value(v), deleter(del) {}

  CacheHandle& operator=(const CacheHandle& a) {
    // Only copy members needed for deletion.
    key = a.key;
    value = a.value;
    deleter = a.deleter;
    return *this;
  }
};

// Key of the hash map. We store the hash value with the key for convenience.
struct CacheKey {
  Slice key;
  uint32_t hash_value;

  CacheKey() = default;

  CacheKey(const Slice& k, uint32_t h) {
    key = k;
    hash_value = h;
  }

  static bool equal(const CacheKey& a, const CacheKey& b) {
    return a.hash_value == b.hash_value && a.key == b.key;
  }

  static size_t hash(const CacheKey& a) {
    return static_cast<size_t>(a.hash_value);
  }
};
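
// Note (added): CacheKey also serves as the hash-compare argument of the
// tbb::concurrent_hash_map used below (see the HashTable typedef in
// ClockCacheShard), which needs only the equal() and hash() functions above.
// The instantiation therefore has the shape:
//
//   tbb::concurrent_hash_map<CacheKey, CacheHandle*, CacheKey> table;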

struct CleanupContext {
  // List of values to be deleted, along with the key and deleter.
  autovector<CacheHandle> to_delete_value;

  // List of keys to be deleted.
  autovector<const char*> to_delete_key;
};

// A cache shard which maintains its own CLOCK cache.
class ClockCacheShard final : public CacheShard {
 public:
  // Hash map type.
  typedef tbb::concurrent_hash_map<CacheKey, CacheHandle*, CacheKey> HashTable;

  ClockCacheShard();
  ~ClockCacheShard() override;

  // Interfaces
  void SetCapacity(size_t capacity) override;
  void SetStrictCapacityLimit(bool strict_capacity_limit) override;
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                void (*deleter)(const Slice& key, void* value),
                Cache::Handle** handle, Cache::Priority priority) override;
  Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
  // If the entry is in cache, increase its reference count and return true.
  // Return false otherwise.
  //
  // Not necessary to hold mutex_ before being called.
  bool Ref(Cache::Handle* handle) override;
  bool Release(Cache::Handle* handle, bool force_erase = false) override;
  void Erase(const Slice& key, uint32_t hash) override;
  bool EraseAndConfirm(const Slice& key, uint32_t hash,
                       CleanupContext* context);
  size_t GetUsage() const override;
  size_t GetPinnedUsage() const override;
  void EraseUnRefEntries() override;
  void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                              bool thread_safe) override;

 private:
  static const uint32_t kInCacheBit = 1;
  static const uint32_t kUsageBit = 2;
  static const uint32_t kRefsOffset = 2;
  static const uint32_t kOneRef = 1 << kRefsOffset;

  // Helper functions to extract cache handle flags and counters.
  static bool InCache(uint32_t flags) { return flags & kInCacheBit; }
  static bool HasUsage(uint32_t flags) { return flags & kUsageBit; }
  static uint32_t CountRefs(uint32_t flags) { return flags >> kRefsOffset; }
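
  // Illustrative compile-time check (added for exposition; not in the original
  // RocksDB source): the bits are packed as
  //   [ reference count ... | usage bit | in-cache bit ]
  // so, for example, flags == 0x7 means in cache, recently used, and holding
  // one reference.
  static_assert(kUsageBit == (kInCacheBit << 1) && kOneRef == (kUsageBit << 1),
                "flag bits are expected to be packed contiguously");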

  // Decrease the reference count of the entry. If this decreases the count to
  // 0, recycle the entry. If set_usage is true, also set the usage bit.
  //
  // Returns true if a value is erased.
  //
  // Not necessary to hold mutex_ before being called.
  bool Unref(CacheHandle* handle, bool set_usage, CleanupContext* context);

  // Unset the in-cache bit of the entry. Recycle the handle if necessary.
  //
  // Returns true if a value is erased.
  //
  // Has to hold mutex_ before being called.
  bool UnsetInCache(CacheHandle* handle, CleanupContext* context);

  // Put the handle back into the recycle_ list, and put the value associated
  // with it into the to-be-deleted list. It doesn't clean up the key, as it
  // might be reused by another handle.
  //
  // Has to hold mutex_ before being called.
  void RecycleHandle(CacheHandle* handle, CleanupContext* context);

  // Delete keys and values in the to-be-deleted list. Call the method without
  // holding the mutex, as destructors can be expensive.
  void Cleanup(const CleanupContext& context);

  // Examine the handle for eviction. If the handle is in cache, its usage bit
  // is not set, and its reference count is 0, evict it from the cache.
  // Otherwise unset the usage bit.
  //
  // Has to hold mutex_ before being called.
  bool TryEvict(CacheHandle* value, CleanupContext* context);

  // Scan through the circular list and evict entries until we get enough
  // capacity for a new cache entry of the specified size. Return true on
  // success, false otherwise.
  //
  // Has to hold mutex_ before being called.
  bool EvictFromCache(size_t charge, CleanupContext* context);

  CacheHandle* Insert(const Slice& key, uint32_t hash, void* value,
                      size_t charge,
                      void (*deleter)(const Slice& key, void* value),
                      bool hold_reference, CleanupContext* context);

  // Guards list_, head_, and recycle_. In addition, updating table_ also has
  // to hold the mutex, to avoid the cache being in an inconsistent state.
  mutable port::Mutex mutex_;

  // The circular list of cache handles. Initially the list is empty. Once a
  // handle is needed for insertion and no more handles are available in the
  // recycle bin, one more handle is appended to the end.
  //
  // We use std::deque for the circular list because we want to make sure
  // pointers to handles are valid throughout the life-cycle of the cache
  // (in contrast to std::vector), and to be able to grow the list (in
  // contrast to statically allocated arrays).
  std::deque<CacheHandle> list_;

  // Index of the next handle in the circular list to be examined for
  // eviction.
  size_t head_;

  // Recycle bin of cache handles.
  autovector<CacheHandle*> recycle_;

  // Maximum cache size.
  std::atomic<size_t> capacity_;

  // Current total size of the cache.
  std::atomic<size_t> usage_;

  // Total un-released cache size.
  std::atomic<size_t> pinned_usage_;

  // Whether to allow inserting into the cache when the cache is full.
  std::atomic<bool> strict_capacity_limit_;

  // Hash table (tbb::concurrent_hash_map) for lookup.
  HashTable table_;
};

ClockCacheShard::ClockCacheShard()
    : head_(0), usage_(0), pinned_usage_(0), strict_capacity_limit_(false) {}

ClockCacheShard::~ClockCacheShard() {
  for (auto& handle : list_) {
    uint32_t flags = handle.flags.load(std::memory_order_relaxed);
    if (InCache(flags) || CountRefs(flags) > 0) {
      if (handle.deleter != nullptr) {
        (*handle.deleter)(handle.key, handle.value);
      }
      delete[] handle.key.data();
    }
  }
}

size_t ClockCacheShard::GetUsage() const {
  return usage_.load(std::memory_order_relaxed);
}

size_t ClockCacheShard::GetPinnedUsage() const {
  return pinned_usage_.load(std::memory_order_relaxed);
}

void ClockCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                             bool thread_safe) {
  if (thread_safe) {
    mutex_.Lock();
  }
  for (auto& handle : list_) {
    // Use relaxed semantics instead of acquire semantics since we are either
    // holding the mutex or don't have a thread-safety requirement.
    uint32_t flags = handle.flags.load(std::memory_order_relaxed);
    if (InCache(flags)) {
      callback(handle.value, handle.charge);
    }
  }
  if (thread_safe) {
    mutex_.Unlock();
  }
}

void ClockCacheShard::RecycleHandle(CacheHandle* handle,
                                    CleanupContext* context) {
  mutex_.AssertHeld();
  assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0);
  context->to_delete_key.push_back(handle->key.data());
  context->to_delete_value.emplace_back(*handle);
  handle->key.clear();
  handle->value = nullptr;
  handle->deleter = nullptr;
  recycle_.push_back(handle);
  usage_.fetch_sub(handle->charge, std::memory_order_relaxed);
}

void ClockCacheShard::Cleanup(const CleanupContext& context) {
  for (const CacheHandle& handle : context.to_delete_value) {
    if (handle.deleter) {
      (*handle.deleter)(handle.key, handle.value);
    }
  }
  for (const char* key : context.to_delete_key) {
    delete[] key;
  }
}

bool ClockCacheShard::Ref(Cache::Handle* h) {
  auto handle = reinterpret_cast<CacheHandle*>(h);
  // CAS loop to increase reference count.
  uint32_t flags = handle->flags.load(std::memory_order_relaxed);
  while (InCache(flags)) {
    // Use acquire semantics on success, as further operations on the cache
    // entry have to be ordered after the reference count is increased.
    if (handle->flags.compare_exchange_weak(flags, flags + kOneRef,
                                            std::memory_order_acquire,
                                            std::memory_order_relaxed)) {
      if (CountRefs(flags) == 0) {
        // No reference count before the operation.
        pinned_usage_.fetch_add(handle->charge, std::memory_order_relaxed);
      }
      return true;
    }
  }
  return false;
}

bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage,
                            CleanupContext* context) {
  if (set_usage) {
    handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed);
  }
  // Use acquire-release semantics as previous operations on the cache entry
  // have to be ordered before the reference count is decreased, and potential
  // cleanup of the entry has to be ordered after.
  uint32_t flags = handle->flags.fetch_sub(kOneRef, std::memory_order_acq_rel);
  assert(CountRefs(flags) > 0);
  if (CountRefs(flags) == 1) {
    // This is the last reference.
    pinned_usage_.fetch_sub(handle->charge, std::memory_order_relaxed);
    // Cleanup if it is the last reference.
    if (!InCache(flags)) {
      MutexLock l(&mutex_);
      RecycleHandle(handle, context);
    }
  }
  return context->to_delete_value.size();
}

bool ClockCacheShard::UnsetInCache(CacheHandle* handle,
                                   CleanupContext* context) {
  mutex_.AssertHeld();
  // Use acquire-release semantics as previous operations on the cache entry
  // have to be ordered before the in-cache bit is cleared, and potential
  // cleanup of the entry has to be ordered after.
  uint32_t flags =
      handle->flags.fetch_and(~kInCacheBit, std::memory_order_acq_rel);
  // Cleanup if it is the last reference.
  if (InCache(flags) && CountRefs(flags) == 0) {
    RecycleHandle(handle, context);
  }
  return context->to_delete_value.size();
}

bool ClockCacheShard::TryEvict(CacheHandle* handle, CleanupContext* context) {
  mutex_.AssertHeld();
  uint32_t flags = kInCacheBit;
  if (handle->flags.compare_exchange_strong(flags, 0, std::memory_order_acquire,
                                            std::memory_order_relaxed)) {
    bool erased __attribute__((__unused__)) =
        table_.erase(CacheKey(handle->key, handle->hash));
    assert(erased);
    RecycleHandle(handle, context);
    return true;
  }
  handle->flags.fetch_and(~kUsageBit, std::memory_order_relaxed);
  return false;
}

bool ClockCacheShard::EvictFromCache(size_t charge, CleanupContext* context) {
  size_t usage = usage_.load(std::memory_order_relaxed);
  size_t capacity = capacity_.load(std::memory_order_relaxed);
  if (usage == 0) {
    return charge <= capacity;
  }
  size_t new_head = head_;
  bool second_iteration = false;
  while (usage + charge > capacity) {
    assert(new_head < list_.size());
    if (TryEvict(&list_[new_head], context)) {
      usage = usage_.load(std::memory_order_relaxed);
    }
    new_head = (new_head + 1 >= list_.size()) ? 0 : new_head + 1;
    if (new_head == head_) {
      if (second_iteration) {
        return false;
      } else {
        second_iteration = true;
      }
    }
  }
  head_ = new_head;
  return true;
}

void ClockCacheShard::SetCapacity(size_t capacity) {
  CleanupContext context;
  {
    MutexLock l(&mutex_);
    capacity_.store(capacity, std::memory_order_relaxed);
    EvictFromCache(0, &context);
  }
  Cleanup(context);
}

void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
  strict_capacity_limit_.store(strict_capacity_limit,
                               std::memory_order_relaxed);
}

CacheHandle* ClockCacheShard::Insert(
    const Slice& key, uint32_t hash, void* value, size_t charge,
    void (*deleter)(const Slice& key, void* value), bool hold_reference,
    CleanupContext* context) {
  MutexLock l(&mutex_);
  bool success = EvictFromCache(charge, context);
  bool strict = strict_capacity_limit_.load(std::memory_order_relaxed);
  if (!success && (strict || !hold_reference)) {
    context->to_delete_key.push_back(key.data());
    if (!hold_reference) {
      context->to_delete_value.emplace_back(key, value, deleter);
    }
    return nullptr;
  }
  // Grab an available handle from the recycle bin. If the recycle bin is
  // empty, create a new handle and append it to the end of the circular list.
  CacheHandle* handle = nullptr;
  if (!recycle_.empty()) {
    handle = recycle_.back();
    recycle_.pop_back();
  } else {
    list_.emplace_back();
    handle = &list_.back();
  }
  // Fill the handle.
  handle->key = key;
  handle->hash = hash;
  handle->value = value;
  handle->charge = charge;
  handle->deleter = deleter;
  uint32_t flags = hold_reference ? kInCacheBit + kOneRef : kInCacheBit;
  handle->flags.store(flags, std::memory_order_relaxed);
  HashTable::accessor accessor;
  if (table_.find(accessor, CacheKey(key, hash))) {
    CacheHandle* existing_handle = accessor->second;
    table_.erase(accessor);
    UnsetInCache(existing_handle, context);
  }
  table_.insert(HashTable::value_type(CacheKey(key, hash), handle));
  if (hold_reference) {
    pinned_usage_.fetch_add(charge, std::memory_order_relaxed);
  }
  usage_.fetch_add(charge, std::memory_order_relaxed);
  return handle;
}

Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
                               size_t charge,
                               void (*deleter)(const Slice& key, void* value),
                               Cache::Handle** out_handle,
                               Cache::Priority /*priority*/) {
  CleanupContext context;
  HashTable::accessor accessor;
  char* key_data = new char[key.size()];
  memcpy(key_data, key.data(), key.size());
  Slice key_copy(key_data, key.size());
  CacheHandle* handle = Insert(key_copy, hash, value, charge, deleter,
                               out_handle != nullptr, &context);
  Status s;
  if (out_handle != nullptr) {
    if (handle == nullptr) {
      s = Status::Incomplete("Insert failed due to LRU cache being full.");
    } else {
      *out_handle = reinterpret_cast<Cache::Handle*>(handle);
    }
  }
  Cleanup(context);
  return s;
}

Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {
  HashTable::const_accessor accessor;
  if (!table_.find(accessor, CacheKey(key, hash))) {
    return nullptr;
  }
  CacheHandle* handle = accessor->second;
  accessor.release();
  // Ref() could fail if another thread sneaks in and evicts/erases the cache
  // entry before we are able to hold a reference.
  if (!Ref(reinterpret_cast<Cache::Handle*>(handle))) {
    return nullptr;
  }
  // Double check the key, since the handle may now represent another key if
  // other threads sneak in, evict/erase the entry and re-use the handle for
  // another cache entry.
  if (hash != handle->hash || key != handle->key) {
    CleanupContext context;
    Unref(handle, false, &context);
    // It is possible that Unref() deleted the entry, so we need to clean up.
    Cleanup(context);
    return nullptr;
  }
  return reinterpret_cast<Cache::Handle*>(handle);
}

bool ClockCacheShard::Release(Cache::Handle* h, bool force_erase) {
  CleanupContext context;
  CacheHandle* handle = reinterpret_cast<CacheHandle*>(h);
  bool erased = Unref(handle, true, &context);
  if (force_erase && !erased) {
    erased = EraseAndConfirm(handle->key, handle->hash, &context);
  }
  Cleanup(context);
  return erased;
}

void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {
  CleanupContext context;
  EraseAndConfirm(key, hash, &context);
  Cleanup(context);
}

bool ClockCacheShard::EraseAndConfirm(const Slice& key, uint32_t hash,
                                      CleanupContext* context) {
  MutexLock l(&mutex_);
  HashTable::accessor accessor;
  bool erased = false;
  if (table_.find(accessor, CacheKey(key, hash))) {
    CacheHandle* handle = accessor->second;
    table_.erase(accessor);
    erased = UnsetInCache(handle, context);
  }
  return erased;
}

void ClockCacheShard::EraseUnRefEntries() {
  CleanupContext context;
  {
    MutexLock l(&mutex_);
    table_.clear();
    for (auto& handle : list_) {
      UnsetInCache(&handle, &context);
    }
  }
  Cleanup(context);
}

class ClockCache final : public ShardedCache {
 public:
  ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit)
      : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
    int num_shards = 1 << num_shard_bits;
    shards_ = new ClockCacheShard[num_shards];
    SetCapacity(capacity);
    SetStrictCapacityLimit(strict_capacity_limit);
  }

  ~ClockCache() override { delete[] shards_; }

  const char* Name() const override { return "ClockCache"; }

  CacheShard* GetShard(int shard) override {
    return reinterpret_cast<CacheShard*>(&shards_[shard]);
  }

  const CacheShard* GetShard(int shard) const override {
    return reinterpret_cast<CacheShard*>(&shards_[shard]);
  }

  void* Value(Handle* handle) override {
    return reinterpret_cast<const CacheHandle*>(handle)->value;
  }

  size_t GetCharge(Handle* handle) const override {
    return reinterpret_cast<const CacheHandle*>(handle)->charge;
  }

  uint32_t GetHash(Handle* handle) const override {
    return reinterpret_cast<const CacheHandle*>(handle)->hash;
  }

  void DisownData() override { shards_ = nullptr; }

 private:
  ClockCacheShard* shards_;
};

}  // end anonymous namespace

std::shared_ptr<Cache> NewClockCache(size_t capacity, int num_shard_bits,
                                     bool strict_capacity_limit) {
  if (num_shard_bits < 0) {
    num_shard_bits = GetDefaultCacheShardBits(capacity);
  }
  return std::make_shared<ClockCache>(capacity, num_shard_bits,
                                      strict_capacity_limit);
}

}  // namespace rocksdb

#endif  // SUPPORT_CLOCK_CACHE