1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "cache/clock_cache.h"
11
12 #include <cassert>
13 #include <functional>
14 #include <numeric>
15
16 #include "cache/cache_key.h"
17 #include "logging/logging.h"
18 #include "monitoring/perf_context_imp.h"
19 #include "monitoring/statistics.h"
20 #include "port/lang.h"
21 #include "util/hash.h"
22 #include "util/math.h"
23 #include "util/random.h"
24
25 namespace ROCKSDB_NAMESPACE {
26
27 namespace clock_cache {
28
29 namespace {
30 inline uint64_t GetRefcount(uint64_t meta) {
31 return ((meta >> ClockHandle::kAcquireCounterShift) -
32 (meta >> ClockHandle::kReleaseCounterShift)) &
33 ClockHandle::kCounterMask;
34 }
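// Illustrative note (not from the original source): the external reference
// count is the difference of the two counters, reduced modulo the counter
// width by the final mask. E.g. acquire = 5, release = 3 gives refcount 2,
// and the modular subtraction still yields the right answer if the acquire
// counter has wrapped around while the release counter has not.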
35
36 inline uint64_t GetInitialCountdown(Cache::Priority priority) {
37 // Set initial clock data from priority
38 // TODO: configuration parameters for priority handling and clock cycle
39 // count?
40 switch (priority) {
41 case Cache::Priority::HIGH:
42 return ClockHandle::kHighCountdown;
43 default:
44 assert(false);
45 FALLTHROUGH_INTENDED;
46 case Cache::Priority::LOW:
47 return ClockHandle::kLowCountdown;
48 case Cache::Priority::BOTTOM:
49 return ClockHandle::kBottomCountdown;
50 }
51 }
52
53 inline void FreeDataMarkEmpty(ClockHandle& h) {
54 // NOTE: in theory there's more room for parallelism if we copy the handle
55 // data and delay actions like this until after marking the entry as empty,
56 // but performance tests only show a regression from copying the few words
57 // of data.
58 h.FreeData();
59
60 #ifndef NDEBUG
61 // Mark slot as empty, with assertion
62 uint64_t meta = h.meta.exchange(0, std::memory_order_release);
63 assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
64 #else
65 // Mark slot as empty
66 h.meta.store(0, std::memory_order_release);
67 #endif
68 }
69
70 inline bool ClockUpdate(ClockHandle& h) {
71 uint64_t meta = h.meta.load(std::memory_order_relaxed);
72
73 uint64_t acquire_count =
74 (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
75 uint64_t release_count =
76 (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
77 // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count,
78 // release_count, (unsigned)(meta >> ClockHandle::kStateShift));
79 if (acquire_count != release_count) {
80 // Only clock update entries with no outstanding refs
81 return false;
82 }
83 if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) {
84 // Only clock update Shareable entries
85 return false;
86 }
87 if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
88 acquire_count > 0) {
89 // Decrement clock
90 uint64_t new_count =
91 std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
92 // Compare-exchange in the decremented clock info, but
93 // not aggressively
94 uint64_t new_meta =
95 (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
96 (new_count << ClockHandle::kReleaseCounterShift) |
97 (new_count << ClockHandle::kAcquireCounterShift);
98 h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed);
99 return false;
100 }
101 // Otherwise, remove entry (either unreferenced invisible or
102 // unreferenced and expired visible).
103 if (h.meta.compare_exchange_strong(
104 meta,
105 uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
106 std::memory_order_acquire)) {
107 // Took ownership.
108 return true;
109 } else {
110 // Compare-exchange failing probably
111 // indicates the entry was used, so skip it in that case.
112 return false;
113 }
114 }
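// Illustrative summary of the function above: for an unreferenced Visible
// entry with a nonzero acquire count, each pass rewrites both counters to
// min(acquire_count - 1, kMaxCountdown - 1), i.e. it spends one unit of the
// entry's remaining countdown while capping it at kMaxCountdown - 1. Once
// the countdown is exhausted (both counters zero), or the entry is
// unreferenced and Invisible, the compare-exchange to kStateConstruction
// takes ownership and the caller reclaims the slot.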
115
116 } // namespace
117
118 void ClockHandleBasicData::FreeData() const {
119 if (deleter) {
120 UniqueId64x2 unhashed;
121 (*deleter)(
122 ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed),
123 value);
124 }
125 }
126
127 HyperClockTable::HyperClockTable(
128 size_t capacity, bool /*strict_capacity_limit*/,
129 CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts)
130 : length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
131 metadata_charge_policy)),
132 length_bits_mask_((size_t{1} << length_bits_) - 1),
133 occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
134 kStrictLoadFactor)),
135 array_(new HandleImpl[size_t{1} << length_bits_]) {
136 if (metadata_charge_policy ==
137 CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
138 usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
139 }
140
141 static_assert(sizeof(HandleImpl) == 64U,
142 "Expecting size / alignment with common cache line size");
143 }
144
145 HyperClockTable::~HyperClockTable() {
146 // Assumes there are no references or active operations on any slot/element
147 // in the table.
148 for (size_t i = 0; i < GetTableSize(); i++) {
149 HandleImpl& h = array_[i];
150 switch (h.meta >> ClockHandle::kStateShift) {
151 case ClockHandle::kStateEmpty:
152 // noop
153 break;
154 case ClockHandle::kStateInvisible: // rare but possible
155 case ClockHandle::kStateVisible:
156 assert(GetRefcount(h.meta) == 0);
157 h.FreeData();
158 #ifndef NDEBUG
159 Rollback(h.hashed_key, &h);
160 ReclaimEntryUsage(h.GetTotalCharge());
161 #endif
162 break;
163 // otherwise
164 default:
165 assert(false);
166 break;
167 }
168 }
169
170 #ifndef NDEBUG
171 for (size_t i = 0; i < GetTableSize(); i++) {
172 assert(array_[i].displacements.load() == 0);
173 }
174 #endif
175
176 assert(usage_.load() == 0 ||
177 usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl));
178 assert(occupancy_ == 0);
179 }
180
181 // If an entry doesn't receive clock updates but is repeatedly referenced &
182 // released, the acquire and release counters could overflow without some
183 // intervention. This is that intervention, which should be inexpensive
184 // because it only incurs a simple, very predictable check. (Applying a bit
185 // mask in addition to an increment to every Release likely would be
186 // relatively expensive, because it's an extra atomic update.)
187 //
188 // We do have to assume that we never have many millions of simultaneous
189 // references to a cache handle, because we cannot represent so many
190 // references with the difference in counters, masked to the number of
191 // counter bits. Similarly, we assume there aren't millions of threads
192 // holding transient references (which might be "undone" rather than
193 // released by the way).
194 //
195 // Consider these possible states for each counter:
196 // low: less than kMaxCountdown
197 //   medium: kMaxCountdown to halfway to overflow + kMaxCountdown
198 //   high: halfway to overflow + kMaxCountdown, or greater
199 //
200 // And these possible states for the combination of counters:
201 // acquire / release
202 // ------- -------
203 // low low - Normal / common, with caveats (see below)
204 // medium low - Can happen while holding some refs
205 // high low - Violates assumptions (too many refs)
206 // low medium - Violates assumptions (refs underflow, etc.)
207 // medium medium - Normal (very read heavy cache)
208 // high medium - Can happen while holding some refs
209 // low high - This function is supposed to prevent
210 // medium high - Violates assumptions (refs underflow, etc.)
211 // high high - Needs CorrectNearOverflow
212 //
213 // Basically, this function detects (high, high) state (inferred from
214 // release alone being high) and bumps it back down to (medium, medium)
215 // state with the same refcount and the same logical countdown counter
216 // (everything > kMaxCountdown is logically the same). Note that bumping
217 // down to (low, low) would modify the countdown counter, so is "reserved"
218 // in a sense.
219 //
220 // If near-overflow correction is triggered here, there's no guarantee
221 // that another thread hasn't freed the entry and replaced it with another.
222 // Therefore, it must be the case that the correction does not affect
223 // entries unless they are very old (many millions of acquire-release cycles).
224 // (Our bit manipulation is indeed idempotent and only affects entries in
225 // exceptional cases.) We assume a pre-empted thread will not stall that long.
226 // If it did, the state could be corrupted in the (unlikely) case that the top
227 // bit of the acquire counter is set but not the release counter, and thus
228 // we only clear the top bit of the acquire counter on resumption. It would
229 // then appear that there are too many refs and the entry would be permanently
230 // pinned (which is not terrible for an exceptionally rare occurrence), unless
231 // it is referenced enough (at least kMaxCountdown more times) for the release
232 // counter to reach "high" state again and be bumped back to "medium." (This
233 // motivates only checking for release counter in high state, not both in high
234 // state.)
235 inline void CorrectNearOverflow(uint64_t old_meta,
236 std::atomic<uint64_t>& meta) {
237 // We clear both top-most counter bits at the same time.
238 constexpr uint64_t kCounterTopBit = uint64_t{1}
239 << (ClockHandle::kCounterNumBits - 1);
240 constexpr uint64_t kClearBits =
241 (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
242 (kCounterTopBit << ClockHandle::kReleaseCounterShift);
243 // A simple check that allows us to initiate clearing the top bits for
244 // a large portion of the "high" state space on release counter.
245 constexpr uint64_t kCheckBits =
246 (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
247 << ClockHandle::kReleaseCounterShift;
248
249 if (UNLIKELY(old_meta & kCheckBits)) {
250 meta.fetch_and(~kClearBits, std::memory_order_relaxed);
251 }
252 }
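// Note (illustrative, restating the arithmetic): clearing the same top bit
// from both counters leaves their difference, and hence GetRefcount(),
// unchanged in the expected (high, high) case: (a - T) - (b - T) == a - b.
// The fetch_and is idempotent, so repeating it on an already-corrected or
// since-replaced entry is harmless, as the comment above discusses.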
253
254 inline Status HyperClockTable::ChargeUsageMaybeEvictStrict(
255 size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
256 if (total_charge > capacity) {
257 return Status::MemoryLimit(
258 "Cache entry too large for a single cache shard: " +
259 std::to_string(total_charge) + " > " + std::to_string(capacity));
260 }
261 // Grab any available capacity, and free up any more required.
262 size_t old_usage = usage_.load(std::memory_order_relaxed);
263 size_t new_usage;
264 if (LIKELY(old_usage != capacity)) {
265 do {
266 new_usage = std::min(capacity, old_usage + total_charge);
267 } while (!usage_.compare_exchange_weak(old_usage, new_usage,
268 std::memory_order_relaxed));
269 } else {
270 new_usage = old_usage;
271 }
272 // How much do we need to evict then?
273 size_t need_evict_charge = old_usage + total_charge - new_usage;
274 size_t request_evict_charge = need_evict_charge;
275 if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) {
276 // Require at least 1 eviction.
277 request_evict_charge = 1;
278 }
279 if (request_evict_charge > 0) {
280 size_t evicted_charge = 0;
281 size_t evicted_count = 0;
282 Evict(request_evict_charge, &evicted_charge, &evicted_count);
283 occupancy_.fetch_sub(evicted_count, std::memory_order_release);
284 if (LIKELY(evicted_charge > need_evict_charge)) {
285 assert(evicted_count > 0);
286 // Evicted more than enough
287 usage_.fetch_sub(evicted_charge - need_evict_charge,
288 std::memory_order_relaxed);
289 } else if (evicted_charge < need_evict_charge ||
290 (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) {
291 // Roll back to old usage minus evicted
292 usage_.fetch_sub(evicted_charge + (new_usage - old_usage),
293 std::memory_order_relaxed);
294 if (evicted_charge < need_evict_charge) {
295 return Status::MemoryLimit(
296 "Insert failed because unable to evict entries to stay within "
297 "capacity limit.");
298 } else {
299 return Status::MemoryLimit(
300 "Insert failed because unable to evict entries to stay within "
301 "table occupancy limit.");
302 }
303 }
304 // If we needed to evict something and we are proceeding, we must have
305 // evicted something.
306 assert(evicted_count > 0);
307 }
308 return Status::OK();
309 }
310
311 inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict(
312 size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
313 // For simplicity, we consider that either the cache can accept the insert
314 // with no evictions, or we must evict enough to make (at least) enough
315 // space. It could lead to unnecessary failures or excessive evictions in
316 // some extreme cases, but allows a fast, simple protocol. If we allow a
317 // race to get us over capacity, then we might never get back to capacity
318 // limit if the sizes of entries allow each insertion to evict the minimum
319 // charge. Thus, we should evict some extra if it's not a significant
320 // portion of the shard capacity. This can have the side benefit of
321 // involving fewer threads in eviction.
322 size_t old_usage = usage_.load(std::memory_order_relaxed);
323 size_t need_evict_charge;
324 // NOTE: if total_charge > old_usage, there isn't yet enough to evict
325 // `total_charge` amount. Even if we only try to evict `old_usage` amount,
326 // there's likely something referenced and we would eat CPU looking for
327 // enough to evict.
328 if (old_usage + total_charge <= capacity || total_charge > old_usage) {
329 // Good enough for me (might run over with a race)
330 need_evict_charge = 0;
331 } else {
332 // Try to evict enough space, and maybe some extra
333 need_evict_charge = total_charge;
334 if (old_usage > capacity) {
335 // Not too much extra, to avoid a thundering herd, while still avoiding
336 // strict synchronization, such as the compare_exchange used with strict
337 // capacity limit.
338 need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
339 }
340 }
341 if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
342 // Special case: require at least 1 eviction if we only have to
343 // deal with occupancy
344 need_evict_charge = 1;
345 }
346 size_t evicted_charge = 0;
347 size_t evicted_count = 0;
348 if (need_evict_charge > 0) {
349 Evict(need_evict_charge, &evicted_charge, &evicted_count);
350 // Deal with potential occupancy deficit
351 if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) {
352 assert(evicted_charge == 0);
353 // Can't meet occupancy requirement
354 return false;
355 } else {
356 // Update occupancy for evictions
357 occupancy_.fetch_sub(evicted_count, std::memory_order_release);
358 }
359 }
360 // Track new usage even if we weren't able to evict enough
361 usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed);
362 // No underflow
363 assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
364 // Success
365 return true;
366 }
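// Illustrative example (hypothetical numbers, not from the original source):
// with a 64 MiB shard already at 65 MiB usage and a 16 KiB insert,
// need_evict_charge becomes 16 KiB + min(64 MiB / 1024, 16 KiB) + 1, i.e.
// just over 32 KiB, so this thread evicts roughly double the incoming
// charge to help pull usage back under capacity without strict
// synchronization.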
367
368 inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert(
369 const ClockHandleBasicData& proto) {
370 // Heap allocated separate from table
371 HandleImpl* h = new HandleImpl();
372 ClockHandleBasicData* h_alias = h;
373 *h_alias = proto;
374 h->SetDetached();
375 // Single reference (detached entries only created if returning a refed
376 // Handle back to user)
377 uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
378 << ClockHandle::kStateShift;
379 meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
380 h->meta.store(meta, std::memory_order_release);
381 // Keep track of how much of usage is detached
382 detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed);
383 return h;
384 }
385
386 Status HyperClockTable::Insert(const ClockHandleBasicData& proto,
387 HandleImpl** handle, Cache::Priority priority,
388 size_t capacity, bool strict_capacity_limit) {
389 // Do we have the available occupancy? Optimistically assume we do
390 // and deal with it if we don't.
391 size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
392 auto revert_occupancy_fn = [&]() {
393 occupancy_.fetch_sub(1, std::memory_order_relaxed);
394 };
395 // Whether we over-committed and need an eviction to make up for it
396 bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_;
397
398 // Usage/capacity handling is somewhat different depending on
399 // strict_capacity_limit, but mostly pessimistic.
400 bool use_detached_insert = false;
401 const size_t total_charge = proto.GetTotalCharge();
402 if (strict_capacity_limit) {
403 Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity,
404 need_evict_for_occupancy);
405 if (!s.ok()) {
406 revert_occupancy_fn();
407 return s;
408 }
409 } else {
410 // Case strict_capacity_limit == false
411 bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity,
412 need_evict_for_occupancy);
413 if (!success) {
414 revert_occupancy_fn();
415 if (handle == nullptr) {
416 // Don't insert the entry but still return ok, as if the entry were
417 // inserted into the cache and evicted immediately.
418 proto.FreeData();
419 return Status::OK();
420 } else {
421 // Need to track usage of fallback detached insert
422 usage_.fetch_add(total_charge, std::memory_order_relaxed);
423 use_detached_insert = true;
424 }
425 }
426 }
427 auto revert_usage_fn = [&]() {
428 usage_.fetch_sub(total_charge, std::memory_order_relaxed);
429 // No underflow
430 assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
431 };
432
433 if (!use_detached_insert) {
434 // Attempt a table insert, but abort if we find an existing entry for the
435 // key. If we were to overwrite old entries, we would either
436 // * Have to gain ownership over an existing entry to overwrite it, which
437 // would only work if there are no outstanding (read) references and would
438 // create a small gap in availability of the entry (old or new) to lookups.
439 // * Have to insert into a suboptimal location (more probes) so that the
440 // old entry can be kept around as well.
441
442 uint64_t initial_countdown = GetInitialCountdown(priority);
443 assert(initial_countdown > 0);
444
445 size_t probe = 0;
446 HandleImpl* e = FindSlot(
447 proto.hashed_key,
448 [&](HandleImpl* h) {
449 // Optimistically transition the slot from "empty" to
450 // "under construction" (no effect on other states)
451 uint64_t old_meta =
452 h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit}
453 << ClockHandle::kStateShift,
454 std::memory_order_acq_rel);
455 uint64_t old_state = old_meta >> ClockHandle::kStateShift;
456
457 if (old_state == ClockHandle::kStateEmpty) {
458 // We've started inserting into an available slot, and taken
459 // ownership. Save data fields.
460 ClockHandleBasicData* h_alias = h;
461 *h_alias = proto;
462
463 // Transition from "under construction" state to "visible" state
464 uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
465 << ClockHandle::kStateShift;
466
467 // Maybe with an outstanding reference
468 new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
469 new_meta |= (initial_countdown - (handle != nullptr))
470 << ClockHandle::kReleaseCounterShift;
471
472 #ifndef NDEBUG
473 // Save the state transition, with assertion
474 old_meta = h->meta.exchange(new_meta, std::memory_order_release);
475 assert(old_meta >> ClockHandle::kStateShift ==
476 ClockHandle::kStateConstruction);
477 #else
478 // Save the state transition
479 h->meta.store(new_meta, std::memory_order_release);
480 #endif
481 return true;
482 } else if (old_state != ClockHandle::kStateVisible) {
483 // Slot not usable / touchable now
484 return false;
485 }
486 // Existing, visible entry, which might be a match.
487 // But first, we need to acquire a ref to read it. In fact, we acquire
488 // initial_countdown refs at once, so that we boost the clock state if
489 // this is a match.
490 old_meta = h->meta.fetch_add(
491 ClockHandle::kAcquireIncrement * initial_countdown,
492 std::memory_order_acq_rel);
493 // Like Lookup
494 if ((old_meta >> ClockHandle::kStateShift) ==
495 ClockHandle::kStateVisible) {
496 // Acquired a read reference
497 if (h->hashed_key == proto.hashed_key) {
498 // Match. Release in a way that boosts the clock state
499 old_meta = h->meta.fetch_add(
500 ClockHandle::kReleaseIncrement * initial_countdown,
501 std::memory_order_acq_rel);
502 // Correct for possible (but rare) overflow
503 CorrectNearOverflow(old_meta, h->meta);
504 // Insert detached instead (only if return handle needed)
505 use_detached_insert = true;
506 return true;
507 } else {
508 // Mismatch. Pretend we never took the reference
509 old_meta = h->meta.fetch_sub(
510 ClockHandle::kAcquireIncrement * initial_countdown,
511 std::memory_order_acq_rel);
512 }
513 } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
514 ClockHandle::kStateInvisible)) {
515 // Pretend we never took the reference
516 // WART: there's a tiny chance we release last ref to invisible
517 // entry here. If that happens, we let eviction take care of it.
518 old_meta = h->meta.fetch_sub(
519 ClockHandle::kAcquireIncrement * initial_countdown,
520 std::memory_order_acq_rel);
521 } else {
522 // For other states, incrementing the acquire counter has no effect
523 // so we don't need to undo it.
524 // Slot not usable / touchable now.
525 }
526 (void)old_meta;
527 return false;
528 },
529 [&](HandleImpl* /*h*/) { return false; },
530 [&](HandleImpl* h) {
531 h->displacements.fetch_add(1, std::memory_order_relaxed);
532 },
533 probe);
534 if (e == nullptr) {
535 // The occupancy check, and the fact that FindSlot above never aborts,
536 // should generally prevent this, but it's theoretically possible for other threads
537 // to evict and replace entries in the right order to hit every slot
538 // when it is populated. Assuming random hashing, the chance of that
539 // should be no higher than pow(kStrictLoadFactor, n) for n slots.
540 // That should be infeasible for roughly n >= 256, so if this assertion
541 // fails, that suggests something is going wrong.
542 assert(GetTableSize() < 256);
543 use_detached_insert = true;
544 }
545 if (!use_detached_insert) {
546 // Successfully inserted
547 if (handle) {
548 *handle = e;
549 }
550 return Status::OK();
551 }
552 // Roll back table insertion
553 Rollback(proto.hashed_key, e);
554 revert_occupancy_fn();
555 // Maybe fall back on detached insert
556 if (handle == nullptr) {
557 revert_usage_fn();
558 // As if unrefed entry immediately evicted
559 proto.FreeData();
560 return Status::OK();
561 }
562 }
563
564 // Run detached insert
565 assert(use_detached_insert);
566
567 *handle = DetachedInsert(proto);
568
569 // The OkOverwritten status is used to count "redundant" insertions into
570 // block cache. This implementation doesn't strictly check for redundant
571 // insertions; instead we are probably interested in how many insertions
572 // didn't go into the table ("detached" instead), which could be due to a
573 // redundant Insert or some other reason (see use_detached_insert above).
574 return Status::OkOverwritten();
575 }
576
577 HyperClockTable::HandleImpl* HyperClockTable::Lookup(
578 const UniqueId64x2& hashed_key) {
579 size_t probe = 0;
580 HandleImpl* e = FindSlot(
581 hashed_key,
582 [&](HandleImpl* h) {
583 // Mostly branch-free version (similar performance)
584 /*
585 uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
586 std::memory_order_acquire);
587 bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
588 bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
589 bool match = (h->key == key) & visible;
590 h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
591 ClockHandle::kAcquireCounterShift, std::memory_order_release); return
592 match;
593 */
594 // Optimistic lookup should pay off when the table is relatively
595 // sparse.
596 constexpr bool kOptimisticLookup = true;
597 uint64_t old_meta;
598 if (!kOptimisticLookup) {
599 old_meta = h->meta.load(std::memory_order_acquire);
600 if ((old_meta >> ClockHandle::kStateShift) !=
601 ClockHandle::kStateVisible) {
602 return false;
603 }
604 }
605 // (Optimistically) increment acquire counter
606 old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
607 std::memory_order_acquire);
608 // Check if it's an entry visible to lookups
609 if ((old_meta >> ClockHandle::kStateShift) ==
610 ClockHandle::kStateVisible) {
611 // Acquired a read reference
612 if (h->hashed_key == hashed_key) {
613 // Match
614 return true;
615 } else {
616 // Mismatch. Pretend we never took the reference
617 old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
618 std::memory_order_release);
619 }
620 } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
621 ClockHandle::kStateInvisible)) {
622 // Pretend we never took the reference
623 // WART: there's a tiny chance we release last ref to invisible
624 // entry here. If that happens, we let eviction take care of it.
625 old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
626 std::memory_order_release);
627 } else {
628 // For other states, incrementing the acquire counter has no effect
629 // so we don't need to undo it. Furthermore, we cannot safely undo
630 // it because we did not acquire a read reference to lock the
631 // entry in a Shareable state.
632 }
633 (void)old_meta;
634 return false;
635 },
636 [&](HandleImpl* h) {
637 return h->displacements.load(std::memory_order_relaxed) == 0;
638 },
639 [&](HandleImpl* /*h*/) {}, probe);
640
641 return e;
642 }
643
644 bool HyperClockTable::Release(HandleImpl* h, bool useful,
645 bool erase_if_last_ref) {
646 // In contrast with LRUCache's Release, this function won't delete the handle
647 // when the cache is above capacity and the reference is the last one. Space
648 // is only freed up by EvictFromClock (called by Insert when space is needed)
649 // and Erase. We do this to avoid an extra atomic read of the variable usage_.
650
651 uint64_t old_meta;
652 if (useful) {
653 // Increment release counter to indicate was used
654 old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
655 std::memory_order_release);
656 } else {
657 // Decrement acquire counter to pretend it never happened
658 old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
659 std::memory_order_release);
660 }
661
662 assert((old_meta >> ClockHandle::kStateShift) &
663 ClockHandle::kStateShareableBit);
664 // No underflow
665 assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
666 ClockHandle::kCounterMask) !=
667 ((old_meta >> ClockHandle::kReleaseCounterShift) &
668 ClockHandle::kCounterMask));
669
670 if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
671 ClockHandle::kStateInvisible)) {
672 // Update for last fetch_add op
673 if (useful) {
674 old_meta += ClockHandle::kReleaseIncrement;
675 } else {
676 old_meta -= ClockHandle::kAcquireIncrement;
677 }
678 // Take ownership if no refs
679 do {
680 if (GetRefcount(old_meta) != 0) {
681 // Not last ref at some point in time during this Release call
682 // Correct for possible (but rare) overflow
683 CorrectNearOverflow(old_meta, h->meta);
684 return false;
685 }
686 if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
687 << ClockHandle::kStateShift)) == 0) {
688 // Someone else took ownership
689 return false;
690 }
691 // Note that there's a small chance that we release, another thread
692 // replaces this entry with another, reaches zero refs, and then we end
693 // up erasing that other entry. That's an acceptable risk / imprecision.
694 } while (!h->meta.compare_exchange_weak(
695 old_meta,
696 uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
697 std::memory_order_acquire));
698 // Took ownership
699 size_t total_charge = h->GetTotalCharge();
700 if (UNLIKELY(h->IsDetached())) {
701 h->FreeData();
702 // Delete detached handle
703 delete h;
704 detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
705 usage_.fetch_sub(total_charge, std::memory_order_relaxed);
706 } else {
707 Rollback(h->hashed_key, h);
708 FreeDataMarkEmpty(*h);
709 ReclaimEntryUsage(total_charge);
710 }
711 return true;
712 } else {
713 // Correct for possible (but rare) overflow
714 CorrectNearOverflow(old_meta, h->meta);
715 return false;
716 }
717 }
718
719 void HyperClockTable::Ref(HandleImpl& h) {
720 // Increment acquire counter
721 uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
722 std::memory_order_acquire);
723
724 assert((old_meta >> ClockHandle::kStateShift) &
725 ClockHandle::kStateShareableBit);
726 // Must have already had a reference
727 assert(GetRefcount(old_meta) > 0);
728 (void)old_meta;
729 }
730
731 void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) {
732 // Increment acquire counter
733 uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement,
734 std::memory_order_acquire);
735
736 assert((old_meta >> ClockHandle::kStateShift) &
737 ClockHandle::kStateShareableBit);
738 (void)old_meta;
739 }
740
741 void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
742 if (n > 0) {
743 // Split into n - 1 and 1 steps.
744 uint64_t old_meta = h->meta.fetch_add(
745 (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire);
746 assert((old_meta >> ClockHandle::kStateShift) &
747 ClockHandle::kStateShareableBit);
748 (void)old_meta;
749
750 Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
751 }
752 }
753
754 void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
755 size_t probe = 0;
756 (void)FindSlot(
757 hashed_key,
758 [&](HandleImpl* h) {
759 // Could be multiple entries in rare cases. Erase them all.
760 // Optimistically increment acquire counter
761 uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
762 std::memory_order_acquire);
763 // Check if it's an entry visible to lookups
764 if ((old_meta >> ClockHandle::kStateShift) ==
765 ClockHandle::kStateVisible) {
766 // Acquired a read reference
767 if (h->hashed_key == hashed_key) {
768 // Match. Set invisible.
769 old_meta =
770 h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
771 << ClockHandle::kStateShift),
772 std::memory_order_acq_rel);
773 // Apply update to local copy
774 old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
775 << ClockHandle::kStateShift);
776 for (;;) {
777 uint64_t refcount = GetRefcount(old_meta);
778 assert(refcount > 0);
779 if (refcount > 1) {
780 // Not last ref at some point in time during this Erase call
781 // Pretend we never took the reference
782 h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
783 std::memory_order_release);
784 break;
785 } else if (h->meta.compare_exchange_weak(
786 old_meta,
787 uint64_t{ClockHandle::kStateConstruction}
788 << ClockHandle::kStateShift,
789 std::memory_order_acq_rel)) {
790 // Took ownership
791 assert(hashed_key == h->hashed_key);
792 size_t total_charge = h->GetTotalCharge();
793 FreeDataMarkEmpty(*h);
794 ReclaimEntryUsage(total_charge);
795 // We already have a copy of hashed_key in this case, so OK to
796 // delay Rollback until after releasing the entry
797 Rollback(hashed_key, h);
798 break;
799 }
800 }
801 } else {
802 // Mismatch. Pretend we never took the reference
803 h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
804 std::memory_order_release);
805 }
806 } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
807 ClockHandle::kStateInvisible)) {
808 // Pretend we never took the reference
809 // WART: there's a tiny chance we release last ref to invisible
810 // entry here. If that happens, we let eviction take care of it.
811 h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
812 std::memory_order_release);
813 } else {
814 // For other states, incrementing the acquire counter has no effect
815 // so we don't need to undo it.
816 }
817 return false;
818 },
819 [&](HandleImpl* h) {
820 return h->displacements.load(std::memory_order_relaxed) == 0;
821 },
822 [&](HandleImpl* /*h*/) {}, probe);
823 }
824
825 void HyperClockTable::ConstApplyToEntriesRange(
826 std::function<void(const HandleImpl&)> func, size_t index_begin,
827 size_t index_end, bool apply_if_will_be_deleted) const {
828 uint64_t check_state_mask = ClockHandle::kStateShareableBit;
829 if (!apply_if_will_be_deleted) {
830 check_state_mask |= ClockHandle::kStateVisibleBit;
831 }
832
833 for (size_t i = index_begin; i < index_end; i++) {
834 HandleImpl& h = array_[i];
835
836 // Note: to avoid using compare_exchange, we have to be extra careful.
837 uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
838 // Check if it's an entry visible to lookups
839 if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
840 // Increment acquire counter. Note: it's possible that the entry has
841 // completely changed since we loaded old_meta, but incrementing acquire
842 // count is always safe. (Similar to optimistic Lookup here.)
843 old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
844 std::memory_order_acquire);
845 // Check whether we actually acquired a reference.
846 if ((old_meta >> ClockHandle::kStateShift) &
847 ClockHandle::kStateShareableBit) {
848 // Apply func if appropriate
849 if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
850 func(h);
851 }
852 // Pretend we never took the reference
853 h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
854 std::memory_order_release);
855 // No net change, so don't need to check for overflow
856 } else {
857 // For other states, incrementing the acquire counter has no effect
858 // so we don't need to undo it. Furthermore, we cannot safely undo
859 // it because we did not acquire a read reference to lock the
860 // entry in a Shareable state.
861 }
862 }
863 }
864 }
865
866 void HyperClockTable::EraseUnRefEntries() {
867 for (size_t i = 0; i <= this->length_bits_mask_; i++) {
868 HandleImpl& h = array_[i];
869
870 uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
871 if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
872 << ClockHandle::kStateShift) &&
873 GetRefcount(old_meta) == 0 &&
874 h.meta.compare_exchange_strong(old_meta,
875 uint64_t{ClockHandle::kStateConstruction}
876 << ClockHandle::kStateShift,
877 std::memory_order_acquire)) {
878 // Took ownership
879 size_t total_charge = h.GetTotalCharge();
880 Rollback(h.hashed_key, &h);
881 FreeDataMarkEmpty(h);
882 ReclaimEntryUsage(total_charge);
883 }
884 }
885 }
886
887 inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
888 const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
889 std::function<bool(HandleImpl*)> abort_fn,
890 std::function<void(HandleImpl*)> update_fn, size_t& probe) {
891 // NOTE: upper 32 bits of hashed_key[0] is used for sharding
892 //
893 // We use double-hashing probing. Every probe in the sequence is a
894 // pseudorandom integer, computed as a linear function of two random hashes,
895 // which we call base and increment. Specifically, the i-th probe is base + i
896 // * increment modulo the table size.
897 size_t base = static_cast<size_t>(hashed_key[1]);
898 // We use an odd increment, which is relatively prime with the power-of-two
899 // table size. This implies that we cycle back to the first probe only
900 // after probing every slot exactly once.
901 // TODO: we could also reconsider linear probing, though locality benefits
902 // are limited because each slot is a full cache line
903 size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
904 size_t current = ModTableSize(base + probe * increment);
905 while (probe <= length_bits_mask_) {
906 HandleImpl* h = &array_[current];
907 if (match_fn(h)) {
908 probe++;
909 return h;
910 }
911 if (abort_fn(h)) {
912 return nullptr;
913 }
914 probe++;
915 update_fn(h);
916 current = ModTableSize(current + increment);
917 }
918 // We looped back.
919 return nullptr;
920 }
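// Illustrative example (hypothetical values): with an 8-slot table,
// base = 5 and increment = 3, the probe sequence visits 5, 0, 3, 6, 1, 4,
// 7, 2 -- every slot exactly once -- because an odd increment is coprime
// with the power-of-two table size.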
921
922 inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key,
923 const HandleImpl* h) {
924 size_t current = ModTableSize(hashed_key[1]);
925 size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
926 while (&array_[current] != h) {
927 array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
928 current = ModTableSize(current + increment);
929 }
930 }
931
932 inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) {
933 auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release);
934 (void)old_occupancy;
935 // No underflow
936 assert(old_occupancy > 0);
937 auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed);
938 (void)old_usage;
939 // No underflow
940 assert(old_usage >= total_charge);
941 }
942
943 inline void HyperClockTable::Evict(size_t requested_charge,
944 size_t* freed_charge, size_t* freed_count) {
945 // precondition
946 assert(requested_charge > 0);
947
948 // TODO: make a tuning parameter?
949 constexpr size_t step_size = 4;
950
951 // First (concurrent) increment clock pointer
952 uint64_t old_clock_pointer =
953 clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
954
955 // Cap the eviction effort at this thread (along with those operating in
956 // parallel) circling through the whole structure kMaxCountdown times.
957 // In other words, this eviction run must find something/anything that is
958 // unreferenced at start of and during the eviction run that isn't reclaimed
959 // by a concurrent eviction run.
960 uint64_t max_clock_pointer =
961 old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);
962
963 for (;;) {
964 for (size_t i = 0; i < step_size; i++) {
965 HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
966 bool evicting = ClockUpdate(h);
967 if (evicting) {
968 Rollback(h.hashed_key, &h);
969 *freed_charge += h.GetTotalCharge();
970 *freed_count += 1;
971 FreeDataMarkEmpty(h);
972 }
973 }
974
975 // Loop exit condition
976 if (*freed_charge >= requested_charge) {
977 return;
978 }
979 if (old_clock_pointer >= max_clock_pointer) {
980 return;
981 }
982
983 // Advance clock pointer (concurrently)
984 old_clock_pointer =
985 clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
986 }
987 }
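// Note: kMaxCountdown << length_bits_ equals kMaxCountdown times the table
// size, so the cap above bounds this eviction run (together with concurrent
// runs) to about kMaxCountdown full sweeps of the table, enough for an
// untouched entry's countdown to reach zero.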
988
989 template <class Table>
990 ClockCacheShard<Table>::ClockCacheShard(
991 size_t capacity, bool strict_capacity_limit,
992 CacheMetadataChargePolicy metadata_charge_policy,
993 const typename Table::Opts& opts)
994 : CacheShardBase(metadata_charge_policy),
995 table_(capacity, strict_capacity_limit, metadata_charge_policy, opts),
996 capacity_(capacity),
997 strict_capacity_limit_(strict_capacity_limit) {
998 // Initial charge metadata should not exceed capacity
999 assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl));
1000 }
1001
1002 template <class Table>
1003 void ClockCacheShard<Table>::EraseUnRefEntries() {
1004 table_.EraseUnRefEntries();
1005 }
1006
1007 template <class Table>
1008 void ClockCacheShard<Table>::ApplyToSomeEntries(
1009 const std::function<void(const Slice& key, void* value, size_t charge,
1010 DeleterFn deleter)>& callback,
1011 size_t average_entries_per_lock, size_t* state) {
1012 // The state is essentially going to be the starting hash, which works
1013 // nicely even if we resize between calls because we use upper-most
1014 // hash bits for table indexes.
1015 size_t length_bits = table_.GetLengthBits();
1016 size_t length = table_.GetTableSize();
1017
1018 assert(average_entries_per_lock > 0);
1019 // Assuming we are called with same average_entries_per_lock repeatedly,
1020 // this simplifies some logic (index_end will not overflow).
1021 assert(average_entries_per_lock < length || *state == 0);
1022
1023 size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
1024 size_t index_end = index_begin + average_entries_per_lock;
1025 if (index_end >= length) {
1026 // Going to end.
1027 index_end = length;
1028 *state = SIZE_MAX;
1029 } else {
1030 *state = index_end << (sizeof(size_t) * 8u - length_bits);
1031 }
1032
1033 table_.ConstApplyToEntriesRange(
1034 [callback](const HandleImpl& h) {
1035 UniqueId64x2 unhashed;
1036 callback(ReverseHash(h.hashed_key, &unhashed), h.value,
1037 h.GetTotalCharge(), h.deleter);
1038 },
1039 index_begin, index_end, false);
1040 }
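// Illustrative example (hypothetical values): on a 64-bit size_t with
// length_bits == 20, *state >> 44 recovers the starting table index and the
// new *state stores index_end << 44, so iteration resumes at the following
// slot; keeping the index in the upper-most bits is what makes this robust
// to a change in length_bits between calls.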
1041
1042 int HyperClockTable::CalcHashBits(
1043 size_t capacity, size_t estimated_value_size,
1044 CacheMetadataChargePolicy metadata_charge_policy) {
1045 double average_slot_charge = estimated_value_size * kLoadFactor;
1046 if (metadata_charge_policy == kFullChargeCacheMetadata) {
1047 average_slot_charge += sizeof(HandleImpl);
1048 }
1049 assert(average_slot_charge > 0.0);
1050 uint64_t num_slots =
1051 static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);
1052
1053 int hash_bits = FloorLog2((num_slots << 1) - 1);
1054 if (metadata_charge_policy == kFullChargeCacheMetadata) {
1055 // For very small estimated value sizes, it's possible to overshoot
1056 while (hash_bits > 0 &&
1057 uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
1058 hash_bits--;
1059 }
1060 }
1061 return hash_bits;
1062 }
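// Illustrative example (hypothetical numbers): if capacity is 8 MiB and
// average_slot_charge works out to about 6 KiB, then num_slots ~= 1366 and
// FloorLog2((num_slots << 1) - 1) == 11, i.e. a 2048-slot table. In general
// FloorLog2(2n - 1) is the ceiling of log2(n), so the table size is the
// smallest power of two with at least num_slots slots.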
1063
1064 template <class Table>
1065 void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
1066 capacity_.store(capacity, std::memory_order_relaxed);
1067 // next Insert will take care of any necessary evictions
1068 }
1069
1070 template <class Table>
1071 void ClockCacheShard<Table>::SetStrictCapacityLimit(
1072 bool strict_capacity_limit) {
1073 strict_capacity_limit_.store(strict_capacity_limit,
1074 std::memory_order_relaxed);
1075 // next Insert will take care of any necessary evictions
1076 }
1077
1078 template <class Table>
1079 Status ClockCacheShard<Table>::Insert(const Slice& key,
1080 const UniqueId64x2& hashed_key,
1081 void* value, size_t charge,
1082 Cache::DeleterFn deleter,
1083 HandleImpl** handle,
1084 Cache::Priority priority) {
1085 if (UNLIKELY(key.size() != kCacheKeySize)) {
1086 return Status::NotSupported("ClockCache only supports key size " +
1087 std::to_string(kCacheKeySize) + "B");
1088 }
1089 ClockHandleBasicData proto;
1090 proto.hashed_key = hashed_key;
1091 proto.value = value;
1092 proto.deleter = deleter;
1093 proto.total_charge = charge;
1094 Status s = table_.Insert(
1095 proto, handle, priority, capacity_.load(std::memory_order_relaxed),
1096 strict_capacity_limit_.load(std::memory_order_relaxed));
1097 return s;
1098 }
1099
1100 template <class Table>
1101 typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup(
1102 const Slice& key, const UniqueId64x2& hashed_key) {
1103 if (UNLIKELY(key.size() != kCacheKeySize)) {
1104 return nullptr;
1105 }
1106 return table_.Lookup(hashed_key);
1107 }
1108
1109 template <class Table>
1110 bool ClockCacheShard<Table>::Ref(HandleImpl* h) {
1111 if (h == nullptr) {
1112 return false;
1113 }
1114 table_.Ref(*h);
1115 return true;
1116 }
1117
1118 template <class Table>
1119 bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
1120 bool erase_if_last_ref) {
1121 if (handle == nullptr) {
1122 return false;
1123 }
1124 return table_.Release(handle, useful, erase_if_last_ref);
1125 }
1126
1127 template <class Table>
1128 void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
1129 table_.TEST_RefN(*h, n);
1130 }
1131
1132 template <class Table>
1133 void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
1134 table_.TEST_ReleaseN(h, n);
1135 }
1136
1137 template <class Table>
1138 bool ClockCacheShard<Table>::Release(HandleImpl* handle,
1139 bool erase_if_last_ref) {
1140 return Release(handle, /*useful=*/true, erase_if_last_ref);
1141 }
1142
1143 template <class Table>
1144 void ClockCacheShard<Table>::Erase(const Slice& key,
1145 const UniqueId64x2& hashed_key) {
1146 if (UNLIKELY(key.size() != kCacheKeySize)) {
1147 return;
1148 }
1149 table_.Erase(hashed_key);
1150 }
1151
1152 template <class Table>
1153 size_t ClockCacheShard<Table>::GetUsage() const {
1154 return table_.GetUsage();
1155 }
1156
1157 template <class Table>
1158 size_t ClockCacheShard<Table>::GetDetachedUsage() const {
1159 return table_.GetDetachedUsage();
1160 }
1161
1162 template <class Table>
1163 size_t ClockCacheShard<Table>::GetCapacity() const {
1164 return capacity_;
1165 }
1166
1167 template <class Table>
1168 size_t ClockCacheShard<Table>::GetPinnedUsage() const {
1169 // Computes the pinned usage by scanning the whole hash table. This
1170 // is slow, but avoids keeping an exact counter on the clock usage,
1171 // i.e., the number of not externally referenced elements.
1172 // Why avoid this counter? Because Lookup removes elements from the clock
1173 // list, so it would need to update the pinned usage every time,
1174 // which creates additional synchronization costs.
1175 size_t table_pinned_usage = 0;
1176 const bool charge_metadata =
1177 metadata_charge_policy_ == kFullChargeCacheMetadata;
1178 table_.ConstApplyToEntriesRange(
1179 [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
1180 uint64_t meta = h.meta.load(std::memory_order_relaxed);
1181 uint64_t refcount = GetRefcount(meta);
1182 // Holding one ref for ConstApplyToEntriesRange
1183 assert(refcount > 0);
1184 if (refcount > 1) {
1185 table_pinned_usage += h.GetTotalCharge();
1186 if (charge_metadata) {
1187 table_pinned_usage += sizeof(HandleImpl);
1188 }
1189 }
1190 },
1191 0, table_.GetTableSize(), true);
1192
1193 return table_pinned_usage + table_.GetDetachedUsage();
1194 }
1195
1196 template <class Table>
1197 size_t ClockCacheShard<Table>::GetOccupancyCount() const {
1198 return table_.GetOccupancy();
1199 }
1200
1201 template <class Table>
1202 size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
1203 return table_.GetOccupancyLimit();
1204 }
1205
1206 template <class Table>
1207 size_t ClockCacheShard<Table>::GetTableAddressCount() const {
1208 return table_.GetTableSize();
1209 }
1210
1211 // Explicit instantiation
1212 template class ClockCacheShard<HyperClockTable>;
1213
1214 HyperClockCache::HyperClockCache(
1215 size_t capacity, size_t estimated_value_size, int num_shard_bits,
1216 bool strict_capacity_limit,
1217 CacheMetadataChargePolicy metadata_charge_policy,
1218 std::shared_ptr<MemoryAllocator> memory_allocator)
1219 : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
1220 std::move(memory_allocator)) {
1221 assert(estimated_value_size > 0 ||
1222 metadata_charge_policy != kDontChargeCacheMetadata);
1223 // TODO: should not need to go through two levels of pointer indirection to
1224 // get to table entries
1225 size_t per_shard = GetPerShardCapacity();
1226 InitShards([=](Shard* cs) {
1227 HyperClockTable::Opts opts;
1228 opts.estimated_value_size = estimated_value_size;
1229 new (cs)
1230 Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts);
1231 });
1232 }
1233
1234 void* HyperClockCache::Value(Handle* handle) {
1235 return reinterpret_cast<const HandleImpl*>(handle)->value;
1236 }
1237
1238 size_t HyperClockCache::GetCharge(Handle* handle) const {
1239 return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge();
1240 }
1241
1242 Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
1243 auto h = reinterpret_cast<const HandleImpl*>(handle);
1244 return h->deleter;
1245 }
1246
1247 namespace {
1248
1249 // For each cache shard, estimate what the table load factor would be if
1250 // cache filled to capacity with average entries. This is considered
1251 // indicative of a potential problem if the shard is essentially operating
1252 // "at limit", which we define as high actual usage (>80% of capacity)
1253 // or actual occupancy very close to limit (>95% of limit).
1254 // Also, for each shard compute the recommended estimated_entry_charge,
1255 // and keep the minimum one for use as overall recommendation.
1256 void AddShardEvaluation(const HyperClockCache::Shard& shard,
1257 std::vector<double>& predicted_load_factors,
1258 size_t& min_recommendation) {
1259 size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
1260 size_t capacity = shard.GetCapacity();
1261 double usage_ratio = 1.0 * usage / capacity;
1262
1263 size_t occupancy = shard.GetOccupancyCount();
1264 size_t occ_limit = shard.GetOccupancyLimit();
1265 double occ_ratio = 1.0 * occupancy / occ_limit;
1266 if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
1267 // Skip as described above
1268 return;
1269 }
1270
1271 // If filled to capacity, what would the occupancy ratio be?
1272 double ratio = occ_ratio / usage_ratio;
1273   // Given the max load factor, what would that load factor be?
1274 double lf = ratio * kStrictLoadFactor;
1275 predicted_load_factors.push_back(lf);
1276
1277 // Update min_recommendation also
1278 size_t recommendation = usage / occupancy;
1279 min_recommendation = std::min(min_recommendation, recommendation);
1280 }
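// Illustrative example (hypothetical numbers): a shard at 90% usage and 80%
// occupancy counts as "at limit"; filled to capacity, its occupancy ratio
// would be about 0.8 / 0.9 ~= 0.89, so the predicted load factor is
// 0.89 * kStrictLoadFactor, and usage / occupancy gives the observed average
// entry charge used for the estimated_entry_charge recommendation.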
1281
1282 } // namespace
1283
1284 void HyperClockCache::ReportProblems(
1285 const std::shared_ptr<Logger>& info_log) const {
1286 uint32_t shard_count = GetNumShards();
1287 std::vector<double> predicted_load_factors;
1288 size_t min_recommendation = SIZE_MAX;
1289 const_cast<HyperClockCache*>(this)->ForEachShard(
1290 [&](HyperClockCache::Shard* shard) {
1291 AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
1292 });
1293
1294 if (predicted_load_factors.empty()) {
1295 // None operating "at limit" -> nothing to report
1296 return;
1297 }
1298 std::sort(predicted_load_factors.begin(), predicted_load_factors.end());
1299
1300 // First, if the average load factor is within spec, we aren't going to
1301 // complain about a few shards being out of spec.
1302 // NOTE: this is only the average among cache shards operating "at limit,"
1303 // which should be representative of what we care about. It is normal, even
1304 // desirable, for a cache to operate "at limit" so this should not create
1305 // selection bias. See AddShardEvaluation().
1306 // TODO: Consider detecting cases where decreasing the number of shards
1307 // would be good, e.g. serious imbalance among shards.
1308 double average_load_factor =
1309 std::accumulate(predicted_load_factors.begin(),
1310 predicted_load_factors.end(), 0.0) /
1311 shard_count;
1312
1313 constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
1314 constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
1315 if (average_load_factor > kLoadFactor) {
1316 // Out of spec => Consider reporting load factor too high
1317 // Estimate effective overall capacity loss due to enforcing occupancy limit
1318 double lost_portion = 0.0;
1319 int over_count = 0;
1320 for (double lf : predicted_load_factors) {
1321 if (lf > kStrictLoadFactor) {
1322 ++over_count;
1323 lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
1324 }
1325 }
1326 // >= 20% loss -> error
1327 // >= 10% loss -> consistent warning
1328 // >= 1% loss -> intermittent warning
1329 InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
1330 bool report = true;
1331 if (lost_portion > 0.2) {
1332 level = InfoLogLevel::ERROR_LEVEL;
1333 } else if (lost_portion > 0.1) {
1334 level = InfoLogLevel::WARN_LEVEL;
1335 } else if (lost_portion > 0.01) {
1336 int report_percent = static_cast<int>(lost_portion * 100.0);
1337 if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
1338 level = InfoLogLevel::WARN_LEVEL;
1339 }
1340 } else {
1341 // don't report
1342 report = false;
1343 }
1344 if (report) {
1345 ROCKS_LOG_AT_LEVEL(
1346 info_log, level,
1347 "HyperClockCache@%p unable to use estimated %.1f%% capacity because "
1348 "of "
1349 "full occupancy in %d/%u cache shards (estimated_entry_charge too "
1350 "high). Recommend estimated_entry_charge=%zu",
1351 this, lost_portion * 100.0, over_count, (unsigned)shard_count,
1352 min_recommendation);
1353 }
1354 } else if (average_load_factor < kLowSpecLoadFactor) {
1355 // Out of spec => Consider reporting load factor too low
1356 // But cautiously because low is not as big of a problem.
1357
1358 // Only report if highest occupancy shard is also below
1359 // spec and only if average is substantially out of spec
1360 if (predicted_load_factors.back() < kLowSpecLoadFactor &&
1361 average_load_factor < kLowSpecLoadFactor / 1.414) {
1362 InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
1363 if (average_load_factor < kLowSpecLoadFactor / 2) {
1364 level = InfoLogLevel::WARN_LEVEL;
1365 }
1366 ROCKS_LOG_AT_LEVEL(
1367 info_log, level,
1368 "HyperClockCache@%p table has low occupancy at full capacity. Higher "
1369 "estimated_entry_charge (about %.1fx) would likely improve "
1370 "performance. Recommend estimated_entry_charge=%zu",
1371 this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
1372 }
1373 }
1374 }
1375
1376 } // namespace clock_cache
1377
1378 // DEPRECATED (see public API)
1379 std::shared_ptr<Cache> NewClockCache(
1380 size_t capacity, int num_shard_bits, bool strict_capacity_limit,
1381 CacheMetadataChargePolicy metadata_charge_policy) {
1382 return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
1383 /* high_pri_pool_ratio */ 0.5, nullptr,
1384 kDefaultToAdaptiveMutex, metadata_charge_policy,
1385 /* low_pri_pool_ratio */ 0.0);
1386 }
1387
1388 std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
1389 auto my_num_shard_bits = num_shard_bits;
1390 if (my_num_shard_bits >= 20) {
1391 return nullptr; // The cache cannot be sharded into too many fine pieces.
1392 }
1393 if (my_num_shard_bits < 0) {
1394 // Use larger shard size to reduce risk of large entries clustering
1395 // or skewing individual shards.
1396 constexpr size_t min_shard_size = 32U * 1024U * 1024U;
1397 my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size);
1398 }
1399 return std::make_shared<clock_cache::HyperClockCache>(
1400 capacity, estimated_entry_charge, my_num_shard_bits,
1401 strict_capacity_limit, metadata_charge_policy, memory_allocator);
1402 }
1403
1404 } // namespace ROCKSDB_NAMESPACE