1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "cache/clock_cache.h"
11
12 #include <cassert>
13 #include <functional>
14 #include <numeric>
15
16 #include "cache/cache_key.h"
17 #include "logging/logging.h"
18 #include "monitoring/perf_context_imp.h"
19 #include "monitoring/statistics.h"
20 #include "port/lang.h"
21 #include "util/hash.h"
22 #include "util/math.h"
23 #include "util/random.h"
24
25 namespace ROCKSDB_NAMESPACE {
26
27 namespace clock_cache {
28
29 namespace {
30 inline uint64_t GetRefcount(uint64_t meta) {
31 return ((meta >> ClockHandle::kAcquireCounterShift) -
32 (meta >> ClockHandle::kReleaseCounterShift)) &
33 ClockHandle::kCounterMask;
34 }
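// Illustrative note (not from the original source): the external reference
// count is the difference of the two counters, reduced modulo the counter
// width by the final mask. E.g. acquire = 5, release = 3 gives refcount 2,
// and the modular subtraction still yields the right answer if the acquire
// counter has wrapped around while the release counter has not.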
35
36 inline uint64_t GetInitialCountdown(Cache::Priority priority) {
37 // Set initial clock data from priority
38 // TODO: configuration parameters for priority handling and clock cycle
39 // count?
40 switch (priority) {
41 case Cache::Priority::HIGH:
42 return ClockHandle::kHighCountdown;
43 default:
44 assert(false);
45 FALLTHROUGH_INTENDED;
46 case Cache::Priority::LOW:
47 return ClockHandle::kLowCountdown;
48 case Cache::Priority::BOTTOM:
49 return ClockHandle::kBottomCountdown;
50 }
51 }
52
53 inline void FreeDataMarkEmpty(ClockHandle& h) {
54 // NOTE: in theory there's more room for parallelism if we copy the handle
55 // data and delay actions like this until after marking the entry as empty,
56 // but performance tests only show a regression from copying the few words
57 // of data.
58 h.FreeData();
59
60 #ifndef NDEBUG
61 // Mark slot as empty, with assertion
62 uint64_t meta = h.meta.exchange(0, std::memory_order_release);
63 assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
64 #else
65 // Mark slot as empty
66 h.meta.store(0, std::memory_order_release);
67 #endif
68 }
69
70 inline bool ClockUpdate(ClockHandle& h) {
71 uint64_t meta = h.meta.load(std::memory_order_relaxed);
72
73 uint64_t acquire_count =
74 (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
75 uint64_t release_count =
76 (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
77 // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count,
78 // release_count, (unsigned)(meta >> ClockHandle::kStateShift));
79 if (acquire_count != release_count) {
80 // Only clock update entries with no outstanding refs
81 return false;
82 }
83 if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) {
84 // Only clock update Shareable entries
85 return false;
86 }
87 if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
88 acquire_count > 0) {
89 // Decrement clock
90 uint64_t new_count =
91 std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
92 // Compare-exchange in the decremented clock info, but
93 // not aggressively
94 uint64_t new_meta =
95 (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
96 (new_count << ClockHandle::kReleaseCounterShift) |
97 (new_count << ClockHandle::kAcquireCounterShift);
98 h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed);
99 return false;
100 }
101 // Otherwise, remove entry (either unreferenced invisible or
102 // unreferenced and expired visible).
103 if (h.meta.compare_exchange_strong(
104 meta,
105 uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
106 std::memory_order_acquire)) {
107 // Took ownership.
108 return true;
109 } else {
110 // Compare-exchange failing probably
111 // indicates the entry was used, so skip it in that case.
112 return false;
113 }
114 }
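// Illustrative summary of the function above: for an unreferenced Visible
// entry with a nonzero acquire count, each pass rewrites both counters to
// min(acquire_count - 1, kMaxCountdown - 1), i.e. it spends one unit of the
// entry's remaining countdown while capping it at kMaxCountdown - 1. Once
// the countdown is exhausted (both counters zero), or the entry is
// unreferenced and Invisible, the compare-exchange to kStateConstruction
// takes ownership and the caller reclaims the slot.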
115
116 } // namespace
117
118 void ClockHandleBasicData::FreeData() const {
119 if (deleter) {
120 UniqueId64x2 unhashed;
121 (*deleter)(
122 ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed),
123 value);
124 }
125 }
126
127 HyperClockTable::HyperClockTable(
128 size_t capacity, bool /*strict_capacity_limit*/,
129 CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts)
130 : length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
131 metadata_charge_policy)),
132 length_bits_mask_((size_t{1} << length_bits_) - 1),
133 occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
134 kStrictLoadFactor)),
135 array_(new HandleImpl[size_t{1} << length_bits_]) {
136 if (metadata_charge_policy ==
137 CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
138 usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
139 }
140
141 static_assert(sizeof(HandleImpl) == 64U,
142 "Expecting size / alignment with common cache line size");
143 }
144
145 HyperClockTable::~HyperClockTable() {
146 // Assumes there are no references or active operations on any slot/element
147 // in the table.
148 for (size_t i = 0; i < GetTableSize(); i++) {
149 HandleImpl& h = array_[i];
150 switch (h.meta >> ClockHandle::kStateShift) {
151 case ClockHandle::kStateEmpty:
152 // noop
153 break;
154 case ClockHandle::kStateInvisible: // rare but possible
155 case ClockHandle::kStateVisible:
156 assert(GetRefcount(h.meta) == 0);
157 h.FreeData();
158 #ifndef NDEBUG
159 Rollback(h.hashed_key, &h);
160 ReclaimEntryUsage(h.GetTotalCharge());
161 #endif
162 break;
163 // otherwise
164 default:
165 assert(false);
166 break;
167 }
168 }
169
170 #ifndef NDEBUG
171 for (size_t i = 0; i < GetTableSize(); i++) {
172 assert(array_[i].displacements.load() == 0);
173 }
174 #endif
175
176 assert(usage_.load() == 0 ||
177 usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl));
178 assert(occupancy_ == 0);
179 }
180
181 // If an entry doesn't receive clock updates but is repeatedly referenced &
182 // released, the acquire and release counters could overflow without some
183 // intervention. This is that intervention, which should be inexpensive
184 // because it only incurs a simple, very predictable check. (Applying a bit
185 // mask in addition to an increment to every Release likely would be
186 // relatively expensive, because it's an extra atomic update.)
187 //
188 // We do have to assume that we never have many millions of simultaneous
189 // references to a cache handle, because we cannot represent so many
190 // references with the difference in counters, masked to the number of
191 // counter bits. Similarly, we assume there aren't millions of threads
192 // holding transient references (which might be "undone" rather than
193 // released by the way).
194 //
195 // Consider these possible states for each counter:
196 // low: less than kMaxCountdown
197 //   medium: kMaxCountdown to halfway to overflow + kMaxCountdown
198 //   high: halfway to overflow + kMaxCountdown, or greater
199 //
200 // And these possible states for the combination of counters:
201 // acquire / release
202 // ------- -------
203 // low low - Normal / common, with caveats (see below)
204 // medium low - Can happen while holding some refs
205 // high low - Violates assumptions (too many refs)
206 // low medium - Violates assumptions (refs underflow, etc.)
207 // medium medium - Normal (very read heavy cache)
208 // high medium - Can happen while holding some refs
209 // low high - This function is supposed to prevent
210 // medium high - Violates assumptions (refs underflow, etc.)
211 // high high - Needs CorrectNearOverflow
212 //
213 // Basically, this function detects (high, high) state (inferred from
214 // release alone being high) and bumps it back down to (medium, medium)
215 // state with the same refcount and the same logical countdown counter
216 // (everything > kMaxCountdown is logically the same). Note that bumping
217 // down to (low, low) would modify the countdown counter, so is "reserved"
218 // in a sense.
219 //
220 // If near-overflow correction is triggered here, there's no guarantee
221 // that another thread hasn't freed the entry and replaced it with another.
222 // Therefore, it must be the case that the correction does not affect
223 // entries unless they are very old (many millions of acquire-release cycles).
224 // (Our bit manipulation is indeed idempotent and only affects entries in
225 // exceptional cases.) We assume a pre-empted thread will not stall that long.
226 // If it did, the state could be corrupted in the (unlikely) case that the top
227 // bit of the acquire counter is set but not the release counter, and thus
228 // we only clear the top bit of the acquire counter on resumption. It would
229 // then appear that there are too many refs and the entry would be permanently
230 // pinned (which is not terrible for an exceptionally rare occurrence), unless
231 // it is referenced enough (at least kMaxCountdown more times) for the release
232 // counter to reach "high" state again and be bumped back to "medium." (This
233 // motivates only checking for release counter in high state, not both in high
234 // state.)
235 inline void CorrectNearOverflow(uint64_t old_meta,
236 std::atomic<uint64_t>& meta) {
237 // We clear both top-most counter bits at the same time.
238 constexpr uint64_t kCounterTopBit = uint64_t{1}
239 << (ClockHandle::kCounterNumBits - 1);
240 constexpr uint64_t kClearBits =
241 (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
242 (kCounterTopBit << ClockHandle::kReleaseCounterShift);
243 // A simple check that allows us to initiate clearing the top bits for
244 // a large portion of the "high" state space on release counter.
245 constexpr uint64_t kCheckBits =
246 (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
247 << ClockHandle::kReleaseCounterShift;
248
249 if (UNLIKELY(old_meta & kCheckBits)) {
250 meta.fetch_and(~kClearBits, std::memory_order_relaxed);
251 }
252 }
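// Note (illustrative, restating the arithmetic): clearing the same top bit
// from both counters leaves their difference, and hence GetRefcount(),
// unchanged in the expected (high, high) case: (a - T) - (b - T) == a - b.
// The fetch_and is idempotent, so repeating it on an already-corrected or
// since-replaced entry is harmless, as the comment above discusses.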
253
254 inline Status HyperClockTable::ChargeUsageMaybeEvictStrict(
255 size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
256 if (total_charge > capacity) {
257 return Status::MemoryLimit(
258 "Cache entry too large for a single cache shard: " +
259 std::to_string(total_charge) + " > " + std::to_string(capacity));
260 }
261 // Grab any available capacity, and free up any more required.
262 size_t old_usage = usage_.load(std::memory_order_relaxed);
263 size_t new_usage;
264 if (LIKELY(old_usage != capacity)) {
265 do {
266 new_usage = std::min(capacity, old_usage + total_charge);
267 } while (!usage_.compare_exchange_weak(old_usage, new_usage,
268 std::memory_order_relaxed));
269 } else {
270 new_usage = old_usage;
271 }
272 // How much do we need to evict then?
273 size_t need_evict_charge = old_usage + total_charge - new_usage;
274 size_t request_evict_charge = need_evict_charge;
275 if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) {
276 // Require at least 1 eviction.
277 request_evict_charge = 1;
278 }
279 if (request_evict_charge > 0) {
280 size_t evicted_charge = 0;
281 size_t evicted_count = 0;
282 Evict(request_evict_charge, &evicted_charge, &evicted_count);
283 occupancy_.fetch_sub(evicted_count, std::memory_order_release);
284 if (LIKELY(evicted_charge > need_evict_charge)) {
285 assert(evicted_count > 0);
286 // Evicted more than enough
287 usage_.fetch_sub(evicted_charge - need_evict_charge,
288 std::memory_order_relaxed);
289 } else if (evicted_charge < need_evict_charge ||
290 (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) {
291 // Roll back to old usage minus evicted
292 usage_.fetch_sub(evicted_charge + (new_usage - old_usage),
293 std::memory_order_relaxed);
294 if (evicted_charge < need_evict_charge) {
295 return Status::MemoryLimit(
296 "Insert failed because unable to evict entries to stay within "
297 "capacity limit.");
298 } else {
299 return Status::MemoryLimit(
300 "Insert failed because unable to evict entries to stay within "
301 "table occupancy limit.");
302 }
303 }
304 // If we needed to evict something and we are proceeding, we must have
305 // evicted something.
306 assert(evicted_count > 0);
307 }
308 return Status::OK();
309 }
310
311 inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict(
312 size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
313 // For simplicity, we consider that either the cache can accept the insert
314 // with no evictions, or we must evict enough to make (at least) enough
315 // space. It could lead to unnecessary failures or excessive evictions in
316 // some extreme cases, but allows a fast, simple protocol. If we allow a
317 // race to get us over capacity, then we might never get back to capacity
318 // limit if the sizes of entries allow each insertion to evict the minimum
319 // charge. Thus, we should evict some extra if it's not a significant
320 // portion of the shard capacity. This can have the side benefit of
321 // involving fewer threads in eviction.
322 size_t old_usage = usage_.load(std::memory_order_relaxed);
323 size_t need_evict_charge;
324 // NOTE: if total_charge > old_usage, there isn't yet enough to evict
325 // `total_charge` amount. Even if we only try to evict `old_usage` amount,
326 // there's likely something referenced and we would eat CPU looking for
327 // enough to evict.
328 if (old_usage + total_charge <= capacity || total_charge > old_usage) {
329 // Good enough for me (might run over with a race)
330 need_evict_charge = 0;
331 } else {
332 // Try to evict enough space, and maybe some extra
333 need_evict_charge = total_charge;
334 if (old_usage > capacity) {
335 // Not too much extra, to avoid a thundering herd, while still avoiding
336 // strict synchronization, such as the compare_exchange used with strict
337 // capacity limit.
338 need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
339 }
340 }
341 if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
342 // Special case: require at least 1 eviction if we only have to
343 // deal with occupancy
344 need_evict_charge = 1;
345 }
346 size_t evicted_charge = 0;
347 size_t evicted_count = 0;
348 if (need_evict_charge > 0) {
349 Evict(need_evict_charge, &evicted_charge, &evicted_count);
350 // Deal with potential occupancy deficit
351 if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) {
352 assert(evicted_charge == 0);
353 // Can't meet occupancy requirement
354 return false;
355 } else {
356 // Update occupancy for evictions
357 occupancy_.fetch_sub(evicted_count, std::memory_order_release);
358 }
359 }
360 // Track new usage even if we weren't able to evict enough
361 usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed);
362 // No underflow
363 assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
364 // Success
365 return true;
366 }
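// Illustrative example (hypothetical numbers, not from the original source):
// with a 64 MiB shard already at 65 MiB usage and a 16 KiB insert,
// need_evict_charge becomes 16 KiB + min(64 MiB / 1024, 16 KiB) + 1, i.e.
// just over 32 KiB, so this thread evicts roughly double the incoming
// charge to help pull usage back under capacity without strict
// synchronization.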
367
368 inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert(
369 const ClockHandleBasicData& proto) {
370 // Heap allocated separate from table
371 HandleImpl* h = new HandleImpl();
372 ClockHandleBasicData* h_alias = h;
373 *h_alias = proto;
374 h->SetDetached();
375 // Single reference (detached entries only created if returning a refed
376 // Handle back to user)
377 uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
378 << ClockHandle::kStateShift;
379 meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
380 h->meta.store(meta, std::memory_order_release);
381 // Keep track of how much of usage is detached
382 detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed);
383 return h;
384 }
385
386 Status HyperClockTable::Insert(const ClockHandleBasicData& proto,
387 HandleImpl** handle, Cache::Priority priority,
388 size_t capacity, bool strict_capacity_limit) {
389 // Do we have the available occupancy? Optimistically assume we do
390 // and deal with it if we don't.
391 size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
392 auto revert_occupancy_fn = [&]() {
393 occupancy_.fetch_sub(1, std::memory_order_relaxed);
394 };
395 // Whether we over-committed and need an eviction to make up for it
396 bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_;
397
398 // Usage/capacity handling is somewhat different depending on
399 // strict_capacity_limit, but mostly pessimistic.
400 bool use_detached_insert = false;
401 const size_t total_charge = proto.GetTotalCharge();
402 if (strict_capacity_limit) {
403 Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity,
404 need_evict_for_occupancy);
405 if (!s.ok()) {
406 revert_occupancy_fn();
407 return s;
408 }
409 } else {
410 // Case strict_capacity_limit == false
411 bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity,
412 need_evict_for_occupancy);
413 if (!success) {
414 revert_occupancy_fn();
415 if (handle == nullptr) {
416 // Don't insert the entry but still return ok, as if the entry were
417 // inserted into the cache and evicted immediately.
418 proto.FreeData();
419 return Status::OK();
420 } else {
421 // Need to track usage of fallback detached insert
422 usage_.fetch_add(total_charge, std::memory_order_relaxed);
423 use_detached_insert = true;
424 }
425 }
426 }
427 auto revert_usage_fn = [&]() {
428 usage_.fetch_sub(total_charge, std::memory_order_relaxed);
429 // No underflow
430 assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
431 };
432
433 if (!use_detached_insert) {
434 // Attempt a table insert, but abort if we find an existing entry for the
435 // key. If we were to overwrite old entries, we would either
436 // * Have to gain ownership over an existing entry to overwrite it, which
437 // would only work if there are no outstanding (read) references and would
438 // create a small gap in availability of the entry (old or new) to lookups.
439 // * Have to insert into a suboptimal location (more probes) so that the
440 // old entry can be kept around as well.
441
442 uint64_t initial_countdown = GetInitialCountdown(priority);
443 assert(initial_countdown > 0);
444
445 size_t probe = 0;
446 HandleImpl* e = FindSlot(
447 proto.hashed_key,
448 [&](HandleImpl* h) {
449 // Optimistically transition the slot from "empty" to
450 // "under construction" (no effect on other states)
451 uint64_t old_meta =
452 h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit}
453 << ClockHandle::kStateShift,
454 std::memory_order_acq_rel);
455 uint64_t old_state = old_meta >> ClockHandle::kStateShift;
456
457 if (old_state == ClockHandle::kStateEmpty) {
458 // We've started inserting into an available slot, and taken
459 // ownership. Save data fields.
460 ClockHandleBasicData* h_alias = h;
461 *h_alias = proto;
462
463 // Transition from "under construction" state to "visible" state
464 uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
465 << ClockHandle::kStateShift;
466
467 // Maybe with an outstanding reference
468 new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
469 new_meta |= (initial_countdown - (handle != nullptr))
470 << ClockHandle::kReleaseCounterShift;
471
472 #ifndef NDEBUG
473 // Save the state transition, with assertion
474 old_meta = h->meta.exchange(new_meta, std::memory_order_release);
475 assert(old_meta >> ClockHandle::kStateShift ==
476 ClockHandle::kStateConstruction);
477 #else
478 // Save the state transition
479 h->meta.store(new_meta, std::memory_order_release);
480 #endif
481 return true;
482 } else if (old_state != ClockHandle::kStateVisible) {
483 // Slot not usable / touchable now
484 return false;
485 }
486 // Existing, visible entry, which might be a match.
487 // But first, we need to acquire a ref to read it. In fact, we acquire
488 // initial_countdown refs at once, so that we boost the clock state if
489 // this is a match.
490 old_meta = h->meta.fetch_add(
491 ClockHandle::kAcquireIncrement * initial_countdown,
492 std::memory_order_acq_rel);
493 // Like Lookup
494 if ((old_meta >> ClockHandle::kStateShift) ==
495 ClockHandle::kStateVisible) {
496 // Acquired a read reference
497 if (h->hashed_key == proto.hashed_key) {
498 // Match. Release in a way that boosts the clock state
499 old_meta = h->meta.fetch_add(
500 ClockHandle::kReleaseIncrement * initial_countdown,
501 std::memory_order_acq_rel);
502 // Correct for possible (but rare) overflow
503 CorrectNearOverflow(old_meta, h->meta);
504 // Insert detached instead (only if return handle needed)
505 use_detached_insert = true;
506 return true;
507 } else {
508 // Mismatch. Pretend we never took the reference
509 old_meta = h->meta.fetch_sub(
510 ClockHandle::kAcquireIncrement * initial_countdown,
511 std::memory_order_acq_rel);
512 }
513 } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
514 ClockHandle::kStateInvisible)) {
515 // Pretend we never took the reference
516 // WART: there's a tiny chance we release last ref to invisible
517 // entry here. If that happens, we let eviction take care of it.
518 old_meta = h->meta.fetch_sub(
519 ClockHandle::kAcquireIncrement * initial_countdown,
520 std::memory_order_acq_rel);
521 } else {
522 // For other states, incrementing the acquire counter has no effect
523 // so we don't need to undo it.
524 // Slot not usable / touchable now.
525 }
526 (void)old_meta;
527 return false;
528 },
529 [&](HandleImpl* /*h*/) { return false; },
530 [&](HandleImpl* h) {
531 h->displacements.fetch_add(1, std::memory_order_relaxed);
532 },
533 probe);
534 if (e == nullptr) {
535 // The occupancy check, and the fact that FindSlot above never aborts,
536 // should generally prevent this, but it's theoretically possible for other threads
537 // to evict and replace entries in the right order to hit every slot
538 // when it is populated. Assuming random hashing, the chance of that
539 // should be no higher than pow(kStrictLoadFactor, n) for n slots.
540 // That should be infeasible for roughly n >= 256, so if this assertion
541 // fails, that suggests something is going wrong.
542 assert(GetTableSize() < 256);
543 use_detached_insert = true;
544 }
545 if (!use_detached_insert) {
546 // Successfully inserted
547 if (handle) {
548 *handle = e;
549 }
550 return Status::OK();
551 }
552 // Roll back table insertion
553 Rollback(proto.hashed_key, e);
554 revert_occupancy_fn();
555 // Maybe fall back on detached insert
556 if (handle == nullptr) {
557 revert_usage_fn();
558 // As if unrefed entry immediately evicted
559 proto.FreeData();
560 return Status::OK();
561 }
562 }
563
564 // Run detached insert
565 assert(use_detached_insert);
566
567 *handle = DetachedInsert(proto);
568
569 // The OkOverwritten status is used to count "redundant" insertions into
570 // block cache. This implementation doesn't strictly check for redundant
571 // insertions; instead we are probably interested in how many insertions
572 // didn't go into the table ("detached" instead), which could be due to a
573 // redundant Insert or some other reason (see use_detached_insert above).
574 return Status::OkOverwritten();
575 }
576
577 HyperClockTable::HandleImpl* HyperClockTable::Lookup(
578 const UniqueId64x2& hashed_key) {
579 size_t probe = 0;
580 HandleImpl* e = FindSlot(
581 hashed_key,
582 [&](HandleImpl* h) {
583 // Mostly branch-free version (similar performance)
584 /*
585 uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
586 std::memory_order_acquire);
587 bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
588 bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
589 bool match = (h->key == key) & visible;
590 h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
591 ClockHandle::kAcquireCounterShift, std::memory_order_release); return
592 match;
593 */
594 // Optimistic lookup should pay off when the table is relatively
595 // sparse.
596 constexpr bool kOptimisticLookup = true;
597 uint64_t old_meta;
598 if (!kOptimisticLookup) {
599 old_meta = h->meta.load(std::memory_order_acquire);
600 if ((old_meta >> ClockHandle::kStateShift) !=
601 ClockHandle::kStateVisible) {
602 return false;
603 }
604 }
605 // (Optimistically) increment acquire counter
606 old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
607 std::memory_order_acquire);
608 // Check if it's an entry visible to lookups
609 if ((old_meta >> ClockHandle::kStateShift) ==
610 ClockHandle::kStateVisible) {
611 // Acquired a read reference
612 if (h->hashed_key == hashed_key) {
613 // Match
614 return true;
615 } else {
616 // Mismatch. Pretend we never took the reference
617 old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
618 std::memory_order_release);
619 }
620 } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
621 ClockHandle::kStateInvisible)) {
622 // Pretend we never took the reference
623 // WART: there's a tiny chance we release last ref to invisible
624 // entry here. If that happens, we let eviction take care of it.
625 old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
626 std::memory_order_release);
627 } else {
628 // For other states, incrementing the acquire counter has no effect
629 // so we don't need to undo it. Furthermore, we cannot safely undo
630 // it because we did not acquire a read reference to lock the
631 // entry in a Shareable state.
632 }
633 (void)old_meta;
634 return false;
635 },
636 [&](HandleImpl* h) {
637 return h->displacements.load(std::memory_order_relaxed) == 0;
638 },
639 [&](HandleImpl* /*h*/) {}, probe);
640
641 return e;
642 }
643
644 bool HyperClockTable::Release(HandleImpl* h, bool useful,
645 bool erase_if_last_ref) {
646 // In contrast with LRUCache's Release, this function won't delete the handle
647 // when the cache is above capacity and the reference is the last one. Space
648 // is only freed up by EvictFromClock (called by Insert when space is needed)
649 // and Erase. We do this to avoid an extra atomic read of the variable usage_.
650
651 uint64_t old_meta;
652 if (useful) {
653 // Increment release counter to indicate was used
654 old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
655 std::memory_order_release);
656 } else {
657 // Decrement acquire counter to pretend it never happened
658 old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
659 std::memory_order_release);
660 }
661
662 assert((old_meta >> ClockHandle::kStateShift) &
663 ClockHandle::kStateShareableBit);
664 // No underflow
665 assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
666 ClockHandle::kCounterMask) !=
667 ((old_meta >> ClockHandle::kReleaseCounterShift) &
668 ClockHandle::kCounterMask));
669
670 if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
671 ClockHandle::kStateInvisible)) {
672 // Update for last fetch_add op
673 if (useful) {
674 old_meta += ClockHandle::kReleaseIncrement;
675 } else {
676 old_meta -= ClockHandle::kAcquireIncrement;
677 }
678 // Take ownership if no refs
679 do {
680 if (GetRefcount(old_meta) != 0) {
681 // Not last ref at some point in time during this Release call
682 // Correct for possible (but rare) overflow
683 CorrectNearOverflow(old_meta, h->meta);
684 return false;
685 }
686 if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
687 << ClockHandle::kStateShift)) == 0) {
688 // Someone else took ownership
689 return false;
690 }
691 // Note that there's a small chance that we release, another thread
692 // replaces this entry with another, reaches zero refs, and then we end
693 // up erasing that other entry. That's an acceptable risk / imprecision.
694 } while (!h->meta.compare_exchange_weak(
695 old_meta,
696 uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
697 std::memory_order_acquire));
698 // Took ownership
699 size_t total_charge = h->GetTotalCharge();
700 if (UNLIKELY(h->IsDetached())) {
701 h->FreeData();
702 // Delete detached handle
703 delete h;
704 detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
705 usage_.fetch_sub(total_charge, std::memory_order_relaxed);
706 } else {
707 Rollback(h->hashed_key, h);
708 FreeDataMarkEmpty(*h);
709 ReclaimEntryUsage(total_charge);
710 }
711 return true;
712 } else {
713 // Correct for possible (but rare) overflow
714 CorrectNearOverflow(old_meta, h->meta);
715 return false;
716 }
717 }
718
719 void HyperClockTable::Ref(HandleImpl& h) {
720 // Increment acquire counter
721 uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
722 std::memory_order_acquire);
723
724 assert((old_meta >> ClockHandle::kStateShift) &
725 ClockHandle::kStateShareableBit);
726 // Must have already had a reference
727 assert(GetRefcount(old_meta) > 0);
728 (void)old_meta;
729 }
730
731 void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) {
732 // Increment acquire counter
733 uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement,
734 std::memory_order_acquire);
735
736 assert((old_meta >> ClockHandle::kStateShift) &
737 ClockHandle::kStateShareableBit);
738 (void)old_meta;
739 }
740
741 void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
742 if (n > 0) {
743 // Split into n - 1 and 1 steps.
744 uint64_t old_meta = h->meta.fetch_add(
745 (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire);
746 assert((old_meta >> ClockHandle::kStateShift) &
747 ClockHandle::kStateShareableBit);
748 (void)old_meta;
749
750 Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
751 }
752 }
753
754 void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
755 size_t probe = 0;
756 (void)FindSlot(
757 hashed_key,
758 [&](HandleImpl* h) {
759 // Could be multiple entries in rare cases. Erase them all.
760 // Optimistically increment acquire counter
761 uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
762 std::memory_order_acquire);
763 // Check if it's an entry visible to lookups
764 if ((old_meta >> ClockHandle::kStateShift) ==
765 ClockHandle::kStateVisible) {
766 // Acquired a read reference
767 if (h->hashed_key == hashed_key) {
768 // Match. Set invisible.
769 old_meta =
770 h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
771 << ClockHandle::kStateShift),
772 std::memory_order_acq_rel);
773 // Apply update to local copy
774 old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
775 << ClockHandle::kStateShift);
776 for (;;) {
777 uint64_t refcount = GetRefcount(old_meta);
778 assert(refcount > 0);
779 if (refcount > 1) {
780 // Not last ref at some point in time during this Erase call
781 // Pretend we never took the reference
782 h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
783 std::memory_order_release);
784 break;
785 } else if (h->meta.compare_exchange_weak(
786 old_meta,
787 uint64_t{ClockHandle::kStateConstruction}
788 << ClockHandle::kStateShift,
789 std::memory_order_acq_rel)) {
790 // Took ownership
791 assert(hashed_key == h->hashed_key);
792 size_t total_charge = h->GetTotalCharge();
793 FreeDataMarkEmpty(*h);
794 ReclaimEntryUsage(total_charge);
795 // We already have a copy of hashed_key in this case, so OK to
796 // delay Rollback until after releasing the entry
797 Rollback(hashed_key, h);
798 break;
799 }
800 }
801 } else {
802 // Mismatch. Pretend we never took the reference
803 h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
804 std::memory_order_release);
805 }
806 } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
807 ClockHandle::kStateInvisible)) {
808 // Pretend we never took the reference
809 // WART: there's a tiny chance we release last ref to invisible
810 // entry here. If that happens, we let eviction take care of it.
811 h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
812 std::memory_order_release);
813 } else {
814 // For other states, incrementing the acquire counter has no effect
815 // so we don't need to undo it.
816 }
817 return false;
818 },
819 [&](HandleImpl* h) {
820 return h->displacements.load(std::memory_order_relaxed) == 0;
821 },
822 [&](HandleImpl* /*h*/) {}, probe);
823 }
824
825 void HyperClockTable::ConstApplyToEntriesRange(
826 std::function<void(const HandleImpl&)> func, size_t index_begin,
827 size_t index_end, bool apply_if_will_be_deleted) const {
828 uint64_t check_state_mask = ClockHandle::kStateShareableBit;
829 if (!apply_if_will_be_deleted) {
830 check_state_mask |= ClockHandle::kStateVisibleBit;
831 }
832
833 for (size_t i = index_begin; i < index_end; i++) {
834 HandleImpl& h = array_[i];
835
836 // Note: to avoid using compare_exchange, we have to be extra careful.
837 uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
838 // Check if it's an entry visible to lookups
839 if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
840 // Increment acquire counter. Note: it's possible that the entry has
841 // completely changed since we loaded old_meta, but incrementing acquire
842 // count is always safe. (Similar to optimistic Lookup here.)
843 old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
844 std::memory_order_acquire);
845 // Check whether we actually acquired a reference.
846 if ((old_meta >> ClockHandle::kStateShift) &
847 ClockHandle::kStateShareableBit) {
848 // Apply func if appropriate
849 if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
850 func(h);
851 }
852 // Pretend we never took the reference
853 h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
854 std::memory_order_release);
855 // No net change, so don't need to check for overflow
856 } else {
857 // For other states, incrementing the acquire counter has no effect
858 // so we don't need to undo it. Furthermore, we cannot safely undo
859 // it because we did not acquire a read reference to lock the
860 // entry in a Shareable state.
861 }
862 }
863 }
864 }
865
866 void HyperClockTable::EraseUnRefEntries() {
867 for (size_t i = 0; i <= this->length_bits_mask_; i++) {
868 HandleImpl& h = array_[i];
869
870 uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
871 if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
872 << ClockHandle::kStateShift) &&
873 GetRefcount(old_meta) == 0 &&
874 h.meta.compare_exchange_strong(old_meta,
875 uint64_t{ClockHandle::kStateConstruction}
876 << ClockHandle::kStateShift,
877 std::memory_order_acquire)) {
878 // Took ownership
879 size_t total_charge = h.GetTotalCharge();
880 Rollback(h.hashed_key, &h);
881 FreeDataMarkEmpty(h);
882 ReclaimEntryUsage(total_charge);
883 }
884 }
885 }
886
887 inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
888 const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
889 std::function<bool(HandleImpl*)> abort_fn,
890 std::function<void(HandleImpl*)> update_fn, size_t& probe) {
891 // NOTE: upper 32 bits of hashed_key[0] is used for sharding
892 //
893 // We use double-hashing probing. Every probe in the sequence is a
894 // pseudorandom integer, computed as a linear function of two random hashes,
895 // which we call base and increment. Specifically, the i-th probe is base + i
896 // * increment modulo the table size.
897 size_t base = static_cast<size_t>(hashed_key[1]);
898 // We use an odd increment, which is relatively prime with the power-of-two
899 // table size. This implies that we cycle back to the first probe only
900 // after probing every slot exactly once.
901 // TODO: we could also reconsider linear probing, though locality benefits
902 // are limited because each slot is a full cache line
903 size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
904 size_t current = ModTableSize(base + probe * increment);
905 while (probe <= length_bits_mask_) {
906 HandleImpl* h = &array_[current];
907 if (match_fn(h)) {
908 probe++;
909 return h;
910 }
911 if (abort_fn(h)) {
912 return nullptr;
913 }
914 probe++;
915 update_fn(h);
916 current = ModTableSize(current + increment);
917 }
918 // We looped back.
919 return nullptr;
920 }
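// Illustrative example (hypothetical values): with an 8-slot table,
// base = 5 and increment = 3, the probe sequence visits 5, 0, 3, 6, 1, 4,
// 7, 2 -- every slot exactly once -- because an odd increment is coprime
// with the power-of-two table size.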
921
922 inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key,
923 const HandleImpl* h) {
924 size_t current = ModTableSize(hashed_key[1]);
925 size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
926 while (&array_[current] != h) {
927 array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
928 current = ModTableSize(current + increment);
929 }
930 }
931
932 inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) {
933 auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release);
934 (void)old_occupancy;
935 // No underflow
936 assert(old_occupancy > 0);
937 auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed);
938 (void)old_usage;
939 // No underflow
940 assert(old_usage >= total_charge);
941 }
942
943 inline void HyperClockTable::Evict(size_t requested_charge,
944 size_t* freed_charge, size_t* freed_count) {
945 // precondition
946 assert(requested_charge > 0);
947
948 // TODO: make a tuning parameter?
949 constexpr size_t step_size = 4;
950
951 // First (concurrent) increment clock pointer
952 uint64_t old_clock_pointer =
953 clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
954
955 // Cap the eviction effort at this thread (along with those operating in
956 // parallel) circling through the whole structure kMaxCountdown times.
957 // In other words, this eviction run must find something/anything that is
958 // unreferenced at start of and during the eviction run that isn't reclaimed
959 // by a concurrent eviction run.
960 uint64_t max_clock_pointer =
961 old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);
962
963 for (;;) {
964 for (size_t i = 0; i < step_size; i++) {
965 HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
966 bool evicting = ClockUpdate(h);
967 if (evicting) {
968 Rollback(h.hashed_key, &h);
969 *freed_charge += h.GetTotalCharge();
970 *freed_count += 1;
971 FreeDataMarkEmpty(h);
972 }
973 }
974
975 // Loop exit condition
976 if (*freed_charge >= requested_charge) {
977 return;
978 }
979 if (old_clock_pointer >= max_clock_pointer) {
980 return;
981 }
982
983 // Advance clock pointer (concurrently)
984 old_clock_pointer =
985 clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
986 }
987 }
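// Note: kMaxCountdown << length_bits_ equals kMaxCountdown times the table
// size, so the cap above bounds this eviction run (together with concurrent
// runs) to about kMaxCountdown full sweeps of the table, enough for an
// untouched entry's countdown to reach zero.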
988
989 template <class Table>
990 ClockCacheShard<Table>::ClockCacheShard(
991 size_t capacity, bool strict_capacity_limit,
992 CacheMetadataChargePolicy metadata_charge_policy,
993 const typename Table::Opts& opts)
994 : CacheShardBase(metadata_charge_policy),
995 table_(capacity, strict_capacity_limit, metadata_charge_policy, opts),
996 capacity_(capacity),
997 strict_capacity_limit_(strict_capacity_limit) {
998 // Initial charge metadata should not exceed capacity
999 assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl));
1000 }
1001
1002 template <class Table>
1003 void ClockCacheShard<Table>::EraseUnRefEntries() {
1004 table_.EraseUnRefEntries();
1005 }
1006
1007 template <class Table>
1008 void ClockCacheShard<Table>::ApplyToSomeEntries(
1009 const std::function<void(const Slice& key, void* value, size_t charge,
1010 DeleterFn deleter)>& callback,
1011 size_t average_entries_per_lock, size_t* state) {
1012 // The state is essentially going to be the starting hash, which works
1013 // nicely even if we resize between calls because we use upper-most
1014 // hash bits for table indexes.
1015 size_t length_bits = table_.GetLengthBits();
1016 size_t length = table_.GetTableSize();
1017
1018 assert(average_entries_per_lock > 0);
1019 // Assuming we are called with same average_entries_per_lock repeatedly,
1020 // this simplifies some logic (index_end will not overflow).
1021 assert(average_entries_per_lock < length || *state == 0);
1022
1023 size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
1024 size_t index_end = index_begin + average_entries_per_lock;
1025 if (index_end >= length) {
1026 // Going to end.
1027 index_end = length;
1028 *state = SIZE_MAX;
1029 } else {
1030 *state = index_end << (sizeof(size_t) * 8u - length_bits);
1031 }
1032
1033 table_.ConstApplyToEntriesRange(
1034 [callback](const HandleImpl& h) {
1035 UniqueId64x2 unhashed;
1036 callback(ReverseHash(h.hashed_key, &unhashed), h.value,
1037 h.GetTotalCharge(), h.deleter);
1038 },
1039 index_begin, index_end, false);
1040 }
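// Illustrative example (hypothetical values): on a 64-bit size_t with
// length_bits == 20, *state >> 44 recovers the starting table index and the
// new *state stores index_end << 44, so iteration resumes at the following
// slot; keeping the index in the upper-most bits is what makes this robust
// to a change in length_bits between calls.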
1041
1042 int HyperClockTable::CalcHashBits(
1043 size_t capacity, size_t estimated_value_size,
1044 CacheMetadataChargePolicy metadata_charge_policy) {
1045 double average_slot_charge = estimated_value_size * kLoadFactor;
1046 if (metadata_charge_policy == kFullChargeCacheMetadata) {
1047 average_slot_charge += sizeof(HandleImpl);
1048 }
1049 assert(average_slot_charge > 0.0);
1050 uint64_t num_slots =
1051 static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);
1052
1053 int hash_bits = FloorLog2((num_slots << 1) - 1);
1054 if (metadata_charge_policy == kFullChargeCacheMetadata) {
1055 // For very small estimated value sizes, it's possible to overshoot
1056 while (hash_bits > 0 &&
1057 uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
1058 hash_bits--;
1059 }
1060 }
1061 return hash_bits;
1062 }
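// Illustrative example (hypothetical numbers): if capacity is 8 MiB and
// average_slot_charge works out to about 6 KiB, then num_slots ~= 1366 and
// FloorLog2((num_slots << 1) - 1) == 11, i.e. a 2048-slot table. In general
// FloorLog2(2n - 1) is the ceiling of log2(n), so the table size is the
// smallest power of two with at least num_slots slots.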
1063
1064 template <class Table>
1065 void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
1066 capacity_.store(capacity, std::memory_order_relaxed);
1067 // next Insert will take care of any necessary evictions
1068 }
1069
1070 template <class Table>
1071 void ClockCacheShard<Table>::SetStrictCapacityLimit(
1072 bool strict_capacity_limit) {
1073 strict_capacity_limit_.store(strict_capacity_limit,
1074 std::memory_order_relaxed);
1075 // next Insert will take care of any necessary evictions
1076 }
1077
1078 template <class Table>
1079 Status ClockCacheShard<Table>::Insert(const Slice& key,
1080 const UniqueId64x2& hashed_key,
1081 void* value, size_t charge,
1082 Cache::DeleterFn deleter,
1083 HandleImpl** handle,
1084 Cache::Priority priority) {
1085 if (UNLIKELY(key.size() != kCacheKeySize)) {
1086 return Status::NotSupported("ClockCache only supports key size " +
1087 std::to_string(kCacheKeySize) + "B");
1088 }
1089 ClockHandleBasicData proto;
1090 proto.hashed_key = hashed_key;
1091 proto.value = value;
1092 proto.deleter = deleter;
1093 proto.total_charge = charge;
1094 Status s = table_.Insert(
1095 proto, handle, priority, capacity_.load(std::memory_order_relaxed),
1096 strict_capacity_limit_.load(std::memory_order_relaxed));
1097 return s;
1098 }
1099
1100 template <class Table>
1101 typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup(
1102 const Slice& key, const UniqueId64x2& hashed_key) {
1103 if (UNLIKELY(key.size() != kCacheKeySize)) {
1104 return nullptr;
1105 }
1106 return table_.Lookup(hashed_key);
1107 }
1108
1109 template <class Table>
1110 bool ClockCacheShard<Table>::Ref(HandleImpl* h) {
1111 if (h == nullptr) {
1112 return false;
1113 }
1114 table_.Ref(*h);
1115 return true;
1116 }
1117
1118 template <class Table>
1119 bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
1120 bool erase_if_last_ref) {
1121 if (handle == nullptr) {
1122 return false;
1123 }
1124 return table_.Release(handle, useful, erase_if_last_ref);
1125 }
1126
1127 template <class Table>
1128 void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
1129 table_.TEST_RefN(*h, n);
1130 }
1131
1132 template <class Table>
1133 void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
1134 table_.TEST_ReleaseN(h, n);
1135 }
1136
1137 template <class Table>
1138 bool ClockCacheShard<Table>::Release(HandleImpl* handle,
1139 bool erase_if_last_ref) {
1140 return Release(handle, /*useful=*/true, erase_if_last_ref);
1141 }
1142
1143 template <class Table>
1144 void ClockCacheShard<Table>::Erase(const Slice& key,
1145 const UniqueId64x2& hashed_key) {
1146 if (UNLIKELY(key.size() != kCacheKeySize)) {
1147 return;
1148 }
1149 table_.Erase(hashed_key);
1150 }
1151
1152 template <class Table>
1153 size_t ClockCacheShard<Table>::GetUsage() const {
1154 return table_.GetUsage();
1155 }
1156
1157 template <class Table>
1158 size_t ClockCacheShard<Table>::GetDetachedUsage() const {
1159 return table_.GetDetachedUsage();
1160 }
1161
1162 template <class Table>
1163 size_t ClockCacheShard<Table>::GetCapacity() const {
1164 return capacity_;
1165 }
1166
1167 template <class Table>
1168 size_t ClockCacheShard<Table>::GetPinnedUsage() const {
1169 // Computes the pinned usage by scanning the whole hash table. This
1170 // is slow, but avoids keeping an exact counter on the clock usage,
1171 // i.e., the number of not externally referenced elements.
1172 // Why avoid this counter? Because Lookup removes elements from the clock
1173 // list, so it would need to update the pinned usage every time,
1174 // which creates additional synchronization costs.
1175 size_t table_pinned_usage = 0;
1176 const bool charge_metadata =
1177 metadata_charge_policy_ == kFullChargeCacheMetadata;
1178 table_.ConstApplyToEntriesRange(
1179 [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
1180 uint64_t meta = h.meta.load(std::memory_order_relaxed);
1181 uint64_t refcount = GetRefcount(meta);
1182 // Holding one ref for ConstApplyToEntriesRange
1183 assert(refcount > 0);
1184 if (refcount > 1) {
1185 table_pinned_usage += h.GetTotalCharge();
1186 if (charge_metadata) {
1187 table_pinned_usage += sizeof(HandleImpl);
1188 }
1189 }
1190 },
1191 0, table_.GetTableSize(), true);
1192
1193 return table_pinned_usage + table_.GetDetachedUsage();
1194 }
1195
1196 template <class Table>
1197 size_t ClockCacheShard<Table>::GetOccupancyCount() const {
1198 return table_.GetOccupancy();
1199 }
1200
1201 template <class Table>
1202 size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
1203 return table_.GetOccupancyLimit();
1204 }
1205
1206 template <class Table>
1207 size_t ClockCacheShard<Table>::GetTableAddressCount() const {
1208 return table_.GetTableSize();
1209 }
1210
1211 // Explicit instantiation
1212 template class ClockCacheShard<HyperClockTable>;
1213
1214 HyperClockCache::HyperClockCache(
1215 size_t capacity, size_t estimated_value_size, int num_shard_bits,
1216 bool strict_capacity_limit,
1217 CacheMetadataChargePolicy metadata_charge_policy,
1218 std::shared_ptr<MemoryAllocator> memory_allocator)
1219 : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
1220 std::move(memory_allocator)) {
1221 assert(estimated_value_size > 0 ||
1222 metadata_charge_policy != kDontChargeCacheMetadata);
1223 // TODO: should not need to go through two levels of pointer indirection to
1224 // get to table entries
1225 size_t per_shard = GetPerShardCapacity();
1226 InitShards([=](Shard* cs) {
1227 HyperClockTable::Opts opts;
1228 opts.estimated_value_size = estimated_value_size;
1229 new (cs)
1230 Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts);
1231 });
1232 }
1233
1234 void* HyperClockCache::Value(Handle* handle) {
1235 return reinterpret_cast<const HandleImpl*>(handle)->value;
1236 }
1237
1238 size_t HyperClockCache::GetCharge(Handle* handle) const {
1239 return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge();
1240 }
1241
1242 Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
1243 auto h = reinterpret_cast<const HandleImpl*>(handle);
1244 return h->deleter;
1245 }
1246
1247 namespace {
1248
1249 // For each cache shard, estimate what the table load factor would be if
1250 // cache filled to capacity with average entries. This is considered
1251 // indicative of a potential problem if the shard is essentially operating
1252 // "at limit", which we define as high actual usage (>80% of capacity)
1253 // or actual occupancy very close to limit (>95% of limit).
1254 // Also, for each shard compute the recommended estimated_entry_charge,
1255 // and keep the minimum one for use as overall recommendation.
1256 void AddShardEvaluation(const HyperClockCache::Shard& shard,
1257 std::vector<double>& predicted_load_factors,
1258 size_t& min_recommendation) {
1259 size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
1260 size_t capacity = shard.GetCapacity();
1261 double usage_ratio = 1.0 * usage / capacity;
1262
1263 size_t occupancy = shard.GetOccupancyCount();
1264 size_t occ_limit = shard.GetOccupancyLimit();
1265 double occ_ratio = 1.0 * occupancy / occ_limit;
1266 if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
1267 // Skip as described above
1268 return;
1269 }
1270
1271 // If filled to capacity, what would the occupancy ratio be?
1272 double ratio = occ_ratio / usage_ratio;
1273   // Given the max load factor, what would that load factor be?
1274 double lf = ratio * kStrictLoadFactor;
1275 predicted_load_factors.push_back(lf);
1276
1277 // Update min_recommendation also
1278 size_t recommendation = usage / occupancy;
1279 min_recommendation = std::min(min_recommendation, recommendation);
1280 }
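// Illustrative example (hypothetical numbers): a shard at 90% usage and 80%
// occupancy counts as "at limit"; filled to capacity, its occupancy ratio
// would be about 0.8 / 0.9 ~= 0.89, so the predicted load factor is
// 0.89 * kStrictLoadFactor, and usage / occupancy gives the observed average
// entry charge used for the estimated_entry_charge recommendation.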
1281
1282 } // namespace
1283
1284 void HyperClockCache::ReportProblems(
1285 const std::shared_ptr<Logger>& info_log) const {
1286 uint32_t shard_count = GetNumShards();
1287 std::vector<double> predicted_load_factors;
1288 size_t min_recommendation = SIZE_MAX;
1289 const_cast<HyperClockCache*>(this)->ForEachShard(
1290 [&](HyperClockCache::Shard* shard) {
1291 AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
1292 });
1293
1294 if (predicted_load_factors.empty()) {
1295 // None operating "at limit" -> nothing to report
1296 return;
1297 }
1298 std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
1299
1300 // First, if the average load factor is within spec, we aren't going to
1301 // complain about a few shards being out of spec.
1302 // NOTE: this is only the average among cache shards operating "at limit,"
1303 // which should be representative of what we care about. It is normal, even
1304 // desirable, for a cache to operate "at limit" so this should not create
1305 // selection bias. See AddShardEvaluation().
1306 // TODO: Consider detecting cases where decreasing the number of shards
1307 // would be good, e.g. serious imbalance among shards.
1308 double average_load_factor =
1309 std::accumulate(predicted_load_factors.begin(),
1310 predicted_load_factors.end(), 0.0) /
1311 shard_count;
1312
1313 constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
1314 constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
1315 if (average_load_factor > kLoadFactor) {
1316 // Out of spec => Consider reporting load factor too high
1317 // Estimate effective overall capacity loss due to enforcing occupancy limit
1318 double lost_portion = 0.0;
1319 int over_count = 0;
1320 for (double lf : predicted_load_factors) {
1321 if (lf > kStrictLoadFactor) {
1322 ++over_count;
1323 lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
1324 }
1325 }
1326 // >= 20% loss -> error
1327 // >= 10% loss -> consistent warning
1328 // >= 1% loss -> intermittent warning
1329 InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
1330 bool report = true;
1331 if (lost_portion > 0.2) {
1332 level = InfoLogLevel::ERROR_LEVEL;
1333 } else if (lost_portion > 0.1) {
1334 level = InfoLogLevel::WARN_LEVEL;
1335 } else if (lost_portion > 0.01) {
1336 int report_percent = static_cast<int>(lost_portion * 100.0);
1337 if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
1338 level = InfoLogLevel::WARN_LEVEL;
1339 }
1340 } else {
1341 // don't report
1342 report = false;
1343 }
1344 if (report) {
1345 ROCKS_LOG_AT_LEVEL(
1346 info_log, level,
1347 "HyperClockCache@%p unable to use estimated %.1f%% capacity because "
1348 "of "
1349 "full occupancy in %d/%u cache shards (estimated_entry_charge too "
1350 "high). Recommend estimated_entry_charge=%zu",
1351 this, lost_portion * 100.0, over_count, (unsigned)shard_count,
1352 min_recommendation);
1353 }
1354 } else if (average_load_factor < kLowSpecLoadFactor) {
1355 // Out of spec => Consider reporting load factor too low
1356 // But cautiously because low is not as big of a problem.
1357
1358 // Only report if highest occupancy shard is also below
1359 // spec and only if average is substantially out of spec
1360 if (predicted_load_factors.back() < kLowSpecLoadFactor &&
1361 average_load_factor < kLowSpecLoadFactor / 1.414) {
1362 InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
1363 if (average_load_factor < kLowSpecLoadFactor / 2) {
1364 level = InfoLogLevel::WARN_LEVEL;
1365 }
1366 ROCKS_LOG_AT_LEVEL(
1367 info_log, level,
1368 "HyperClockCache@%p table has low occupancy at full capacity. Higher "
1369 "estimated_entry_charge (about %.1fx) would likely improve "
1370 "performance. Recommend estimated_entry_charge=%zu",
1371 this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
1372 }
1373 }
1374 }
1375
1376 } // namespace clock_cache
1377
1378 // DEPRECATED (see public API)
1379 std::shared_ptr<Cache> NewClockCache(
1380 size_t capacity, int num_shard_bits, bool strict_capacity_limit,
1381 CacheMetadataChargePolicy metadata_charge_policy) {
1382 return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
1383 /* high_pri_pool_ratio */ 0.5, nullptr,
1384 kDefaultToAdaptiveMutex, metadata_charge_policy,
1385 /* low_pri_pool_ratio */ 0.0);
1386 }
1387
1388 std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
1389 auto my_num_shard_bits = num_shard_bits;
1390 if (my_num_shard_bits >= 20) {
1391 return nullptr; // The cache cannot be sharded into too many fine pieces.
1392 }
1393 if (my_num_shard_bits < 0) {
1394 // Use larger shard size to reduce risk of large entries clustering
1395 // or skewing individual shards.
1396 constexpr size_t min_shard_size = 32U * 1024U * 1024U;
1397 my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size);
1398 }
1399 return std::make_shared<clock_cache::HyperClockCache>(
1400 capacity, estimated_entry_charge, my_num_shard_bits,
1401 strict_capacity_limit, metadata_charge_policy, memory_allocator);
1402 }
1403
1404 } // namespace ROCKSDB_NAMESPACE