// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "cache/clock_cache.h"

#include <algorithm>
#include <cassert>
#include <functional>
#include <numeric>

#include "cache/cache_key.h"
#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "port/lang.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/random.h"
namespace ROCKSDB_NAMESPACE {

namespace clock_cache {
inline uint64_t GetRefcount(uint64_t meta) {
  return ((meta >> ClockHandle::kAcquireCounterShift) -
          (meta >> ClockHandle::kReleaseCounterShift)) &
         ClockHandle::kCounterMask;
}
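
// For illustration, with hypothetical values: if the acquire counter has
// advanced to 12 and the release counter to 9, GetRefcount returns
// (12 - 9) & kCounterMask = 3 outstanding references. The mask keeps the
// subtraction correct modulo 2^kCounterNumBits even after either counter
// wraps around.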

inline uint64_t GetInitialCountdown(Cache::Priority priority) {
  // Set initial clock data from priority
  // TODO: configuration parameters for priority handling and clock cycle
  // count?
  switch (priority) {
    case Cache::Priority::HIGH:
      return ClockHandle::kHighCountdown;
    default:
      assert(false);
      FALLTHROUGH_INTENDED;
    case Cache::Priority::LOW:
      return ClockHandle::kLowCountdown;
    case Cache::Priority::BOTTOM:
      return ClockHandle::kBottomCountdown;
  }
}
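
// With the constants in clock_cache.h, HIGH-priority entries start with the
// largest countdown (most clock "lives") and BOTTOM-priority entries the
// smallest, so eviction scans must pass over a HIGH-priority entry more
// times before reclaiming it.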

inline void FreeDataMarkEmpty(ClockHandle& h) {
  // NOTE: in theory there's more room for parallelism if we copy the handle
  // data and delay actions like this until after marking the entry as empty,
  // but performance tests only show a regression by copying the few words
  // of data.
  h.FreeData();

#ifndef NDEBUG
  // Mark slot as empty, with assertion
  uint64_t meta = h.meta.exchange(0, std::memory_order_release);
  assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
#else
  // Mark slot as empty
  h.meta.store(0, std::memory_order_release);
#endif
}

inline bool ClockUpdate(ClockHandle& h) {
  uint64_t meta = h.meta.load(std::memory_order_relaxed);

  uint64_t acquire_count =
      (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
  uint64_t release_count =
      (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
  // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count,
  // release_count, (unsigned)(meta >> ClockHandle::kStateShift));
  if (acquire_count != release_count) {
    // Only clock update entries with no outstanding refs
    return false;
  }
  if (!((meta >> ClockHandle::kStateShift) &
        ClockHandle::kStateShareableBit)) {
    // Only clock update Shareable entries
    return false;
  }
  if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
      acquire_count > 0) {
    // Decrement clock
    uint64_t new_count =
        std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
    // Compare-exchange in the decremented clock info, but
    // only if the metadata is unchanged (otherwise leave the entry alone)
    uint64_t new_meta =
        (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
        (new_count << ClockHandle::kReleaseCounterShift) |
        (new_count << ClockHandle::kAcquireCounterShift);
    h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed);
    return false;
  }
  // Otherwise, remove entry (either unreferenced invisible or
  // unreferenced and expired visible).
  if (h.meta.compare_exchange_strong(
          meta,
          uint64_t{ClockHandle::kStateConstruction}
              << ClockHandle::kStateShift,
          std::memory_order_acquire)) {
    // Took ownership
    return true;
  } else {
    // Compare-exchange failing probably
    // indicates the entry was used, so skip it in that case.
    return false;
  }
}
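
// To summarize ClockUpdate: it returns true only when this thread wins the
// CAS that moves an unreferenced entry into "under construction" state,
// making the caller the exclusive owner responsible for freeing the slot;
// a visible entry that still has countdown left just loses one "life"
// (both counters reset to new_count) and stays put.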

void ClockHandleBasicData::FreeData() const {
  if (deleter) {
    UniqueId64x2 unhashed;
    (*deleter)(
        ClockCacheShard<HyperClockTable>::ReverseHash(hashed_key, &unhashed),
        value);
  }
}

HyperClockTable::HyperClockTable(
    size_t capacity, bool /*strict_capacity_limit*/,
    CacheMetadataChargePolicy metadata_charge_policy, const Opts& opts)
    : length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
                                metadata_charge_policy)),
      length_bits_mask_((size_t{1} << length_bits_) - 1),
      occupancy_limit_(static_cast<size_t>((uint64_t{1} << length_bits_) *
                                           kStrictLoadFactor)),
      array_(new HandleImpl[size_t{1} << length_bits_]) {
  if (metadata_charge_policy ==
      CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
    usage_ += size_t{GetTableSize()} * sizeof(HandleImpl);
  }

  static_assert(sizeof(HandleImpl) == 64U,
                "Expecting size / alignment with common cache line size");
}

HyperClockTable::~HyperClockTable() {
  // Assumes there are no references or active operations on any slot/element
  // in the table
  for (size_t i = 0; i < GetTableSize(); i++) {
    HandleImpl& h = array_[i];
    switch (h.meta >> ClockHandle::kStateShift) {
      case ClockHandle::kStateEmpty:
        // noop
        break;
      case ClockHandle::kStateInvisible:  // rare but possible
      case ClockHandle::kStateVisible:
        assert(GetRefcount(h.meta) == 0);
        h.FreeData();
#ifndef NDEBUG
        Rollback(h.hashed_key, &h);
        ReclaimEntryUsage(h.GetTotalCharge());
#endif
        break;
      // otherwise
      default:
        assert(false);
        break;
    }
  }

#ifndef NDEBUG
  for (size_t i = 0; i < GetTableSize(); i++) {
    assert(array_[i].displacements.load() == 0);
  }
#endif

  assert(usage_.load() == 0 ||
         usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl));
  assert(occupancy_ == 0);
}

// If an entry doesn't receive clock updates but is repeatedly referenced &
// released, the acquire and release counters could overflow without some
// intervention. This is that intervention, which should be inexpensive
// because it only incurs a simple, very predictable check. (Applying a bit
// mask in addition to an increment to every Release likely would be
// relatively expensive, because it's an extra atomic update.)
//
// We do have to assume that we never have many millions of simultaneous
// references to a cache handle, because we cannot represent so many
// references with the difference in counters, masked to the number of
// counter bits. Similarly, we assume there aren't millions of threads
// holding transient references (which might be "undone" rather than
// released by the way).
//
// Consider these possible states for each counter:
// low: less than kMaxCountdown
// medium: kMaxCountdown to half way to overflow + kMaxCountdown
// high: half way to overflow + kMaxCountdown, or greater
//
// And these possible states for the combination of counters:
// acquire / release
// -------   -------
// low       low      - Normal / common, with caveats (see below)
// medium    low      - Can happen while holding some refs
// high      low      - Violates assumptions (too many refs)
// low       medium   - Violates assumptions (refs underflow, etc.)
// medium    medium   - Normal (very read heavy cache)
// high      medium   - Can happen while holding some refs
// low       high     - This function is supposed to prevent
// medium    high     - Violates assumptions (refs underflow, etc.)
// high      high     - Needs CorrectNearOverflow
//
// Basically, this function detects (high, high) state (inferred from
// release alone being high) and bumps it back down to (medium, medium)
// state with the same refcount and the same logical countdown counter
// (everything > kMaxCountdown is logically the same). Note that bumping
// down to (low, low) would modify the countdown counter, so is "reserved"
// in a sense.
//
// If near-overflow correction is triggered here, there's no guarantee
// that another thread hasn't freed the entry and replaced it with another.
// Therefore, it must be the case that the correction does not affect
// entries unless they are very old (many millions of acquire-release cycles).
// (Our bit manipulation is indeed idempotent and only affects entries in
// exceptional cases.) We assume a pre-empted thread will not stall that long.
// If it did, the state could be corrupted in the (unlikely) case that the top
// bit of the acquire counter is set but not the release counter, and thus
// we only clear the top bit of the acquire counter on resumption. It would
// then appear that there are too many refs and the entry would be permanently
// pinned (which is not terrible for an exceptionally rare occurrence), unless
// it is referenced enough (at least kMaxCountdown more times) for the release
// counter to reach "high" state again and bumped back to "medium." (This
// motivates only checking for release counter in high state, not both in high
// state.)
inline void CorrectNearOverflow(uint64_t old_meta,
                                std::atomic<uint64_t>& meta) {
  // We clear both top-most counter bits at the same time.
  constexpr uint64_t kCounterTopBit = uint64_t{1}
                                      << (ClockHandle::kCounterNumBits - 1);
  constexpr uint64_t kClearBits =
      (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
      (kCounterTopBit << ClockHandle::kReleaseCounterShift);
  // A simple check that allows us to initiate clearing the top bits for
  // a large portion of the "high" state space on release counter.
  constexpr uint64_t kCheckBits =
      (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
      << ClockHandle::kReleaseCounterShift;

  if (UNLIKELY(old_meta & kCheckBits)) {
    meta.fetch_and(~kClearBits, std::memory_order_relaxed);
  }
}
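
// For illustration, with a hypothetical 10-bit counter: kCounterTopBit would
// be 1 << 9 = 512. If the release counter reached 516 and the acquire
// counter 517, the check fires and fetch_and(~kClearBits) clears bit 9 of
// both counters at once: acquire 517 -> 5, release 516 -> 4. The difference
// (the refcount, here 1) is preserved, and re-applying the same fetch_and is
// a no-op, which is what makes the lazy correction idempotent and safe.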

inline Status HyperClockTable::ChargeUsageMaybeEvictStrict(
    size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
  if (total_charge > capacity) {
    return Status::MemoryLimit(
        "Cache entry too large for a single cache shard: " +
        std::to_string(total_charge) + " > " + std::to_string(capacity));
  }
  // Grab any available capacity, and free up any more required.
  size_t old_usage = usage_.load(std::memory_order_relaxed);
  size_t new_usage;
  if (LIKELY(old_usage != capacity)) {
    do {
      new_usage = std::min(capacity, old_usage + total_charge);
    } while (!usage_.compare_exchange_weak(old_usage, new_usage,
                                           std::memory_order_relaxed));
  } else {
    new_usage = old_usage;
  }
  // How much do we need to evict then?
  size_t need_evict_charge = old_usage + total_charge - new_usage;
  size_t request_evict_charge = need_evict_charge;
  if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) {
    // Require at least 1 eviction.
    request_evict_charge = 1;
  }
  if (request_evict_charge > 0) {
    size_t evicted_charge = 0;
    size_t evicted_count = 0;
    Evict(request_evict_charge, &evicted_charge, &evicted_count);
    occupancy_.fetch_sub(evicted_count, std::memory_order_release);
    if (LIKELY(evicted_charge > need_evict_charge)) {
      assert(evicted_count > 0);
      // Evicted more than enough
      usage_.fetch_sub(evicted_charge - need_evict_charge,
                       std::memory_order_relaxed);
    } else if (evicted_charge < need_evict_charge ||
               (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) {
      // Roll back to old usage minus evicted
      usage_.fetch_sub(evicted_charge + (new_usage - old_usage),
                       std::memory_order_relaxed);
      if (evicted_charge < need_evict_charge) {
        return Status::MemoryLimit(
            "Insert failed because unable to evict entries to stay within "
            "capacity limit.");
      } else {
        return Status::MemoryLimit(
            "Insert failed because unable to evict entries to stay within "
            "table occupancy limit.");
      }
    }
    // If we needed to evict something and we are proceeding, we must have
    // evicted something.
    assert(evicted_count > 0);
  }
  return Status::OK();
}
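
// For illustration, with hypothetical numbers: capacity = 100, old_usage =
// 95, total_charge = 10. The CAS loop raises usage_ to min(100, 105) = 100,
// so need_evict_charge = 95 + 10 - 100 = 5: the insert claimed 5 units of
// free capacity and must evict the remaining 5, or roll back and return
// MemoryLimit.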

inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict(
    size_t total_charge, size_t capacity, bool need_evict_for_occupancy) {
  // For simplicity, we consider that either the cache can accept the insert
  // with no evictions, or we must evict enough to make (at least) enough
  // space. It could lead to unnecessary failures or excessive evictions in
  // some extreme cases, but allows a fast, simple protocol. If we allow a
  // race to get us over capacity, then we might never get back to capacity
  // limit if the sizes of entries allow each insertion to evict the minimum
  // charge. Thus, we should evict some extra if it's not a significant
  // portion of the shard capacity. This can have the side benefit of
  // involving fewer threads in eviction.
  size_t old_usage = usage_.load(std::memory_order_relaxed);
  size_t need_evict_charge;
  // NOTE: if total_charge > old_usage, there isn't yet enough to evict
  // `total_charge` amount. Even if we only try to evict `old_usage` amount,
  // there's likely something referenced and we would eat CPU looking for
  // enough to evict.
  if (old_usage + total_charge <= capacity || total_charge > old_usage) {
    // Good enough for me (might run over with a race)
    need_evict_charge = 0;
  } else {
    // Try to evict enough space, and maybe some extra
    need_evict_charge = total_charge;
    if (old_usage > capacity) {
      // Not too much to avoid thundering herd while avoiding strict
      // synchronization, such as the compare_exchange used with strict
      // capacity limit.
      need_evict_charge += std::min(capacity / 1024, total_charge) + 1;
    }
  }
  if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) {
    // Special case: require at least 1 eviction if we only have to
    // deal with occupancy
    need_evict_charge = 1;
  }
  size_t evicted_charge = 0;
  size_t evicted_count = 0;
  if (need_evict_charge > 0) {
    Evict(need_evict_charge, &evicted_charge, &evicted_count);
    // Deal with potential occupancy deficit
    if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) {
      assert(evicted_charge == 0);
      // Can't meet occupancy requirement
      return false;
    } else {
      // Update occupancy for evictions
      occupancy_.fetch_sub(evicted_count, std::memory_order_release);
    }
  }
  // Track new usage even if we weren't able to evict enough
  usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed);
  // No underflow
  assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
  // Success
  return true;
}
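
// For illustration, with hypothetical numbers: capacity = 100, old_usage =
// 98, total_charge = 4. Since 98 + 4 > 100 (and 4 <= 98), this path targets
// need_evict_charge = 4; if usage had already run over (say old_usage =
// 103), it would add about min(capacity / 1024, total_charge) + 1 extra to
// help pull usage back under the limit without strict synchronization.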

inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert(
    const ClockHandleBasicData& proto) {
  // Heap allocated separate from table
  HandleImpl* h = new HandleImpl();
  ClockHandleBasicData* h_alias = h;
  *h_alias = proto;
  h->SetDetached();
  // Single reference (detached entries only created if returning a refed
  // Handle back to user)
  uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
                  << ClockHandle::kStateShift;
  meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
  h->meta.store(meta, std::memory_order_release);
  // Keep track of how much of usage is detached
  detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed);
  return h;
}

Status HyperClockTable::Insert(const ClockHandleBasicData& proto,
                               HandleImpl** handle, Cache::Priority priority,
                               size_t capacity, bool strict_capacity_limit) {
  // Do we have the available occupancy? Optimistically assume we do
  // and deal with it if we don't.
  size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire);
  auto revert_occupancy_fn = [&]() {
    occupancy_.fetch_sub(1, std::memory_order_relaxed);
  };
  // Whether we over-committed and need an eviction to make up for it
  bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_;

  // Usage/capacity handling is somewhat different depending on
  // strict_capacity_limit, but mostly pessimistic.
  bool use_detached_insert = false;
  const size_t total_charge = proto.GetTotalCharge();
  if (strict_capacity_limit) {
    Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity,
                                           need_evict_for_occupancy);
    if (!s.ok()) {
      revert_occupancy_fn();
      return s;
    }
  } else {
    // Case strict_capacity_limit == false
    bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity,
                                                  need_evict_for_occupancy);
    if (!success) {
      revert_occupancy_fn();
      if (handle == nullptr) {
        // Don't insert the entry but still return ok, as if the entry
        // inserted into cache and evicted immediately.
        proto.FreeData();
        return Status::OK();
      } else {
        // Need to track usage of fallback detached insert
        usage_.fetch_add(total_charge, std::memory_order_relaxed);
        use_detached_insert = true;
      }
    }
  }
  auto revert_usage_fn = [&]() {
    usage_.fetch_sub(total_charge, std::memory_order_relaxed);
    // No underflow
    assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2);
  };

  if (!use_detached_insert) {
    // Attempt a table insert, but abort if we find an existing entry for the
    // key. If we were to overwrite old entries, we would either
    // * Have to gain ownership over an existing entry to overwrite it, which
    // would only work if there are no outstanding (read) references and would
    // create a small gap in availability of the entry (old or new) to lookups.
    // * Have to insert into a suboptimal location (more probes) so that the
    // old entry can be kept around as well.
    uint64_t initial_countdown = GetInitialCountdown(priority);
    assert(initial_countdown > 0);

    size_t probe = 0;
    HandleImpl* e = FindSlot(
        proto.hashed_key,
        [&](HandleImpl* h) {
          // Optimistically transition the slot from "empty" to
          // "under construction" (no effect on other states)
          uint64_t old_meta =
              h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit}
                                   << ClockHandle::kStateShift,
                               std::memory_order_acq_rel);
          uint64_t old_state = old_meta >> ClockHandle::kStateShift;

          if (old_state == ClockHandle::kStateEmpty) {
            // We've started inserting into an available slot, and taken
            // ownership. Save data fields.
            ClockHandleBasicData* h_alias = h;
            *h_alias = proto;

            // Transition from "under construction" state to "visible" state
            uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
                                << ClockHandle::kStateShift;

            // Maybe with an outstanding reference
            new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
            new_meta |= (initial_countdown - (handle != nullptr))
                        << ClockHandle::kReleaseCounterShift;

#ifndef NDEBUG
            // Save the state transition, with assertion
            old_meta = h->meta.exchange(new_meta, std::memory_order_release);
            assert(old_meta >> ClockHandle::kStateShift ==
                   ClockHandle::kStateConstruction);
#else
            // Save the state transition
            h->meta.store(new_meta, std::memory_order_release);
#endif
            return true;
          } else if (old_state != ClockHandle::kStateVisible) {
            // Slot not usable / touchable now
            return false;
          }
          // Existing, visible entry, which might be a match.
          // But first, we need to acquire a ref to read it. In fact, number of
          // refs for initial countdown, so that we boost the clock state if
          // this is a match.
          old_meta = h->meta.fetch_add(
              ClockHandle::kAcquireIncrement * initial_countdown,
              std::memory_order_acq_rel);
          // Check if it's an entry visible to lookups
          if ((old_meta >> ClockHandle::kStateShift) ==
              ClockHandle::kStateVisible) {
            // Acquired a read reference
            if (h->hashed_key == proto.hashed_key) {
              // Match. Release in a way that boosts the clock state
              old_meta = h->meta.fetch_add(
                  ClockHandle::kReleaseIncrement * initial_countdown,
                  std::memory_order_acq_rel);
              // Correct for possible (but rare) overflow
              CorrectNearOverflow(old_meta, h->meta);
              // Insert detached instead (only if return handle needed)
              use_detached_insert = true;
              return true;
            } else {
              // Mismatch. Pretend we never took the reference
              old_meta = h->meta.fetch_sub(
                  ClockHandle::kAcquireIncrement * initial_countdown,
                  std::memory_order_acq_rel);
            }
          } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
                              ClockHandle::kStateInvisible)) {
            // Pretend we never took the reference
            // WART: there's a tiny chance we release last ref to invisible
            // entry here. If that happens, we let eviction take care of it.
            old_meta = h->meta.fetch_sub(
                ClockHandle::kAcquireIncrement * initial_countdown,
                std::memory_order_acq_rel);
          } else {
            // For other states, incrementing the acquire counter has no effect
            // so we don't need to undo it.
            // Slot not usable / touchable now.
          }
          (void)old_meta;
          return false;
        },
        [&](HandleImpl* /*h*/) { return false; },
        [&](HandleImpl* h) {
          h->displacements.fetch_add(1, std::memory_order_relaxed);
        },
        probe);
    if (e == nullptr) {
      // Occupancy check and never abort FindSlot above should generally
      // prevent this, except it's theoretically possible for other threads
      // to evict and replace entries in the right order to hit every slot
      // when it is populated. Assuming random hashing, the chance of that
      // should be no higher than pow(kStrictLoadFactor, n) for n slots.
      // That should be infeasible for roughly n >= 256, so if this assertion
      // fails, that suggests something is going wrong.
      assert(GetTableSize() < 256);
      use_detached_insert = true;
    }
    if (!use_detached_insert) {
      // Successfully inserted
      if (handle) {
        *handle = e;
      }
      return Status::OK();
    }
    // Roll back table insertion
    Rollback(proto.hashed_key, e);
    revert_occupancy_fn();
    // Maybe fall back on detached insert
    if (handle == nullptr) {
      revert_usage_fn();
      // As if unrefed entry immediately evicted
      proto.FreeData();
      return Status::OK();
    }
  }

  // Run detached insert
  assert(use_detached_insert);

  *handle = DetachedInsert(proto);

  // The OkOverwritten status is used to count "redundant" insertions into
  // block cache. This implementation doesn't strictly check for redundant
  // insertions, but we instead are probably interested in how many insertions
  // didn't go into the table (instead "detached"), which could be redundant
  // Insert or some other reason (use_detached_insert reasons above).
  return Status::OkOverwritten();
}
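
// To summarize Insert's outcomes: (1) a free slot is claimed and the entry
// becomes visible -> OK; (2) the key already exists, or FindSlot fails ->
// heap-allocated "detached" insert (only when a handle is requested),
// reported as OkOverwritten; (3) charging fails in non-strict mode with no
// handle requested -> data freed and OK returned, as if immediately evicted;
// strict mode instead returns MemoryLimit.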

HyperClockTable::HandleImpl* HyperClockTable::Lookup(
    const UniqueId64x2& hashed_key) {
  size_t probe = 0;
  HandleImpl* e = FindSlot(
      hashed_key,
      [&](HandleImpl* h) {
        // Mostly branch-free version (similar performance)
        /*
        uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                     std::memory_order_acquire);
        bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
        bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
        bool match = (h->key == key) & visible;
        h->meta.fetch_sub(static_cast<uint64_t>(Shareable & !match) <<
        ClockHandle::kAcquireCounterShift, std::memory_order_release); return
        match;
        */
        // Optimistic lookup should pay off when the table is relatively
        // populated with relevant entries
        constexpr bool kOptimisticLookup = true;
        uint64_t old_meta;
        if (!kOptimisticLookup) {
          old_meta = h->meta.load(std::memory_order_acquire);
          if ((old_meta >> ClockHandle::kStateShift) !=
              ClockHandle::kStateVisible) {
            return false;
          }
        }
        // (Optimistically) increment acquire counter
        old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                     std::memory_order_acquire);
        // Check if it's an entry visible to lookups
        if ((old_meta >> ClockHandle::kStateShift) ==
            ClockHandle::kStateVisible) {
          // Acquired a read reference
          if (h->hashed_key == hashed_key) {
            // Match
            return true;
          } else {
            // Mismatch. Pretend we never took the reference
            old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                                         std::memory_order_release);
          }
        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
                            ClockHandle::kStateInvisible)) {
          // Pretend we never took the reference
          // WART: there's a tiny chance we release last ref to invisible
          // entry here. If that happens, we let eviction take care of it.
          old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                                       std::memory_order_release);
        } else {
          // For other states, incrementing the acquire counter has no effect
          // so we don't need to undo it. Furthermore, we cannot safely undo
          // it because we did not acquire a read reference to lock the
          // entry in a Shareable state.
        }
        (void)old_meta;
        return false;
      },
      [&](HandleImpl* h) {
        return h->displacements.load(std::memory_order_relaxed) == 0;
      },
      [&](HandleImpl* /*h*/) {}, probe);
  return e;
}

bool HyperClockTable::Release(HandleImpl* h, bool useful,
                              bool erase_if_last_ref) {
  // In contrast with LRUCache's Release, this function won't delete the handle
  // when the cache is above capacity and the reference is the last one. Space
  // is only freed up by EvictFromClock (called by Insert when space is needed)
  // and Erase. We do this to avoid an extra atomic read of the variable usage_.

  uint64_t old_meta;
  if (useful) {
    // Increment release counter to indicate was used
    old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement,
                                 std::memory_order_release);
  } else {
    // Decrement acquire counter to pretend it never happened
    old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                                 std::memory_order_release);
  }

  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  // No underflow
  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
          ClockHandle::kCounterMask) !=
         ((old_meta >> ClockHandle::kReleaseCounterShift) &
          ClockHandle::kCounterMask));

  if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
                                    ClockHandle::kStateInvisible)) {
    // Update for last fetch_add op
    if (useful) {
      old_meta += ClockHandle::kReleaseIncrement;
    } else {
      old_meta -= ClockHandle::kAcquireIncrement;
    }
    // Take ownership if no refs
    do {
      if (GetRefcount(old_meta) != 0) {
        // Not last ref at some point in time during this Release call
        // Correct for possible (but rare) overflow
        CorrectNearOverflow(old_meta, h->meta);
        return false;
      }
      if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                       << ClockHandle::kStateShift)) == 0) {
        // Someone else took ownership
        return false;
      }
      // Note that there's a small chance that we release, another thread
      // replaces this entry with another, reaches zero refs, and then we end
      // up erasing that other entry. That's an acceptable risk / imprecision.
    } while (!h->meta.compare_exchange_weak(
        old_meta,
        uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift,
        std::memory_order_acquire));
    // Took ownership
    size_t total_charge = h->GetTotalCharge();
    if (UNLIKELY(h->IsDetached())) {
      h->FreeData();
      // Delete detached handle
      delete h;
      detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
      usage_.fetch_sub(total_charge, std::memory_order_relaxed);
    } else {
      Rollback(h->hashed_key, h);
      FreeDataMarkEmpty(*h);
      ReclaimEntryUsage(total_charge);
    }
    return true;
  } else {
    // Correct for possible (but rare) overflow
    CorrectNearOverflow(old_meta, h->meta);
    return false;
  }
}
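
// To summarize Release: the initial fetch_add/fetch_sub publishes the
// release (or undoes the acquire), and only when the entry must be erased
// (erase_if_last_ref or invisible) does the CAS loop try to take exclusive
// ownership at refcount zero; losing that race just means another thread
// re-referenced or claimed the entry first, which is fine.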

void HyperClockTable::Ref(HandleImpl& h) {
  // Increment acquire counter
  uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
                                       std::memory_order_acquire);

  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  // Must have already had a reference
  assert(GetRefcount(old_meta) > 0);
  (void)old_meta;
}

void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) {
  // Increment acquire counter
  uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement,
                                       std::memory_order_acquire);

  assert((old_meta >> ClockHandle::kStateShift) &
         ClockHandle::kStateShareableBit);
  (void)old_meta;
}

void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
  if (n > 0) {
    // Split into n - 1 and 1 steps.
    uint64_t old_meta = h->meta.fetch_add(
        (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire);
    assert((old_meta >> ClockHandle::kStateShift) &
           ClockHandle::kStateShareableBit);
    (void)old_meta;

    Release(h, /*useful*/ true, /*erase_if_last_ref*/ false);
  }
}

void HyperClockTable::Erase(const UniqueId64x2& hashed_key) {
  size_t probe = 0;
  (void)FindSlot(
      hashed_key,
      [&](HandleImpl* h) {
        // Could be multiple entries in rare cases. Erase them all.
        // Optimistically increment acquire counter
        uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement,
                                              std::memory_order_acquire);
        // Check if it's an entry visible to lookups
        if ((old_meta >> ClockHandle::kStateShift) ==
            ClockHandle::kStateVisible) {
          // Acquired a read reference
          if (h->hashed_key == hashed_key) {
            // Match. Set invisible.
            old_meta =
                h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
                                    << ClockHandle::kStateShift),
                                  std::memory_order_acq_rel);
            // Apply update to local copy
            old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
                          << ClockHandle::kStateShift);
            for (;;) {
              uint64_t refcount = GetRefcount(old_meta);
              assert(refcount > 0);
              if (refcount > 1) {
                // Not last ref at some point in time during this Erase call
                // Pretend we never took the reference
                h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                                  std::memory_order_release);
                break;
              } else if (h->meta.compare_exchange_weak(
                             old_meta,
                             uint64_t{ClockHandle::kStateConstruction}
                                 << ClockHandle::kStateShift,
                             std::memory_order_acq_rel)) {
                // Took ownership
                assert(hashed_key == h->hashed_key);
                size_t total_charge = h->GetTotalCharge();
                FreeDataMarkEmpty(*h);
                ReclaimEntryUsage(total_charge);
                // We already have a copy of hashed_key in this case, so OK to
                // delay Rollback until after releasing the entry
                Rollback(hashed_key, h);
                break;
              }
            }
          } else {
            // Mismatch. Pretend we never took the reference
            h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                              std::memory_order_release);
          }
        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
                            ClockHandle::kStateInvisible)) {
          // Pretend we never took the reference
          // WART: there's a tiny chance we release last ref to invisible
          // entry here. If that happens, we let eviction take care of it.
          h->meta.fetch_sub(ClockHandle::kAcquireIncrement,
                            std::memory_order_release);
        } else {
          // For other states, incrementing the acquire counter has no effect
          // so we don't need to undo it.
        }
        return false;
      },
      [&](HandleImpl* h) {
        return h->displacements.load(std::memory_order_relaxed) == 0;
      },
      [&](HandleImpl* /*h*/) {}, probe);
}

void HyperClockTable::ConstApplyToEntriesRange(
    std::function<void(const HandleImpl&)> func, size_t index_begin,
    size_t index_end, bool apply_if_will_be_deleted) const {
  uint64_t check_state_mask = ClockHandle::kStateShareableBit;
  if (!apply_if_will_be_deleted) {
    check_state_mask |= ClockHandle::kStateVisibleBit;
  }

  for (size_t i = index_begin; i < index_end; i++) {
    HandleImpl& h = array_[i];

    // Note: to avoid using compare_exchange, we have to be extra careful.
    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
    // Check if it's an entry visible to lookups
    if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
      // Increment acquire counter. Note: it's possible that the entry has
      // completely changed since we loaded old_meta, but incrementing acquire
      // count is always safe. (Similar to optimistic Lookup here.)
      old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
                                  std::memory_order_acquire);
      // Check whether we actually acquired a reference.
      if ((old_meta >> ClockHandle::kStateShift) &
          ClockHandle::kStateShareableBit) {
        // Apply func if appropriate
        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
          func(h);
        }
        // Pretend we never took the reference
        h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
                         std::memory_order_release);
        // No net change, so don't need to check for overflow
      } else {
        // For other states, incrementing the acquire counter has no effect
        // so we don't need to undo it. Furthermore, we cannot safely undo
        // it because we did not acquire a read reference to lock the
        // entry in a Shareable state.
      }
    }
  }
}

void HyperClockTable::EraseUnRefEntries() {
  for (size_t i = 0; i <= this->length_bits_mask_; i++) {
    HandleImpl& h = array_[i];

    uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
                    << ClockHandle::kStateShift) &&
        GetRefcount(old_meta) == 0 &&
        h.meta.compare_exchange_strong(old_meta,
                                       uint64_t{ClockHandle::kStateConstruction}
                                           << ClockHandle::kStateShift,
                                       std::memory_order_acquire)) {
      // Took ownership
      size_t total_charge = h.GetTotalCharge();
      Rollback(h.hashed_key, &h);
      FreeDataMarkEmpty(h);
      ReclaimEntryUsage(total_charge);
    }
  }
}

inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot(
    const UniqueId64x2& hashed_key, std::function<bool(HandleImpl*)> match_fn,
    std::function<bool(HandleImpl*)> abort_fn,
    std::function<void(HandleImpl*)> update_fn, size_t& probe) {
  // NOTE: upper 32 bits of hashed_key[0] is used for sharding
  //
  // We use double-hashing probing. Every probe in the sequence is a
  // pseudorandom integer, computed as a linear function of two random hashes,
  // which we call base and increment. Specifically, the i-th probe is base + i
  // * increment modulo the table size.
  size_t base = static_cast<size_t>(hashed_key[1]);
  // We use an odd increment, which is relatively prime with the power-of-two
  // table size. This implies that we cycle back to the first probe only
  // after probing every slot exactly once.
  // TODO: we could also reconsider linear probing, though locality benefits
  // are limited because each slot is a full cache line
  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
  size_t current = ModTableSize(base + probe * increment);
  while (probe <= length_bits_mask_) {
    HandleImpl* h = &array_[current];
    if (match_fn(h)) {
      probe++;
      return h;
    }
    if (abort_fn(h)) {
      return nullptr;
    }
    probe++;
    update_fn(h);
    current = ModTableSize(current + increment);
  }
  // We looped back.
  return nullptr;
}

inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key,
                                      const HandleImpl* h) {
  size_t current = ModTableSize(hashed_key[1]);
  size_t increment = static_cast<size_t>(hashed_key[0]) | 1U;
  while (&array_[current] != h) {
    array_[current].displacements.fetch_sub(1, std::memory_order_relaxed);
    current = ModTableSize(current + increment);
  }
}

inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) {
  auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release);
  (void)old_occupancy;
  // No underflow
  assert(old_occupancy > 0);
  auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed);
  (void)old_usage;
  // No underflow
  assert(old_usage >= total_charge);
}

inline void HyperClockTable::Evict(size_t requested_charge,
                                   size_t* freed_charge, size_t* freed_count) {
  // precondition
  assert(requested_charge > 0);

  // TODO: make a tuning parameter?
  constexpr size_t step_size = 4;

  // First (concurrent) increment clock pointer
  uint64_t old_clock_pointer =
      clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);

  // Cap the eviction effort at this thread (along with those operating in
  // parallel) circling through the whole structure kMaxCountdown times.
  // In other words, this eviction run must find something/anything that is
  // unreferenced at start of and during the eviction run that isn't reclaimed
  // by a concurrent eviction run.
  uint64_t max_clock_pointer =
      old_clock_pointer + (ClockHandle::kMaxCountdown << length_bits_);

  for (;;) {
    for (size_t i = 0; i < step_size; i++) {
      HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
      bool evicting = ClockUpdate(h);
      if (evicting) {
        Rollback(h.hashed_key, &h);
        *freed_charge += h.GetTotalCharge();
        *freed_count += 1;
        FreeDataMarkEmpty(h);
      }
    }

    // Loop exit condition
    if (*freed_charge >= requested_charge) {
      return;
    }
    if (old_clock_pointer >= max_clock_pointer) {
      return;
    }

    // Advance clock pointer (concurrently)
    old_clock_pointer =
        clock_pointer_.fetch_add(step_size, std::memory_order_relaxed);
  }
}

template <class Table>
ClockCacheShard<Table>::ClockCacheShard(
    size_t capacity, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy,
    const typename Table::Opts& opts)
    : CacheShardBase(metadata_charge_policy),
      table_(capacity, strict_capacity_limit, metadata_charge_policy, opts),
      capacity_(capacity),
      strict_capacity_limit_(strict_capacity_limit) {
  // Initial charge metadata should not exceed capacity
  assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl));
}

template <class Table>
void ClockCacheShard<Table>::EraseUnRefEntries() {
  table_.EraseUnRefEntries();
}

template <class Table>
void ClockCacheShard<Table>::ApplyToSomeEntries(
    const std::function<void(const Slice& key, void* value, size_t charge,
                             DeleterFn deleter)>& callback,
    size_t average_entries_per_lock, size_t* state) {
  // The state is essentially going to be the starting hash, which works
  // nicely even if we resize between calls because we use upper-most
  // hash bits for table indexes.
  size_t length_bits = table_.GetLengthBits();
  size_t length = table_.GetTableSize();

  assert(average_entries_per_lock > 0);
  // Assuming we are called with same average_entries_per_lock repeatedly,
  // this simplifies some logic (index_end will not overflow).
  assert(average_entries_per_lock < length || *state == 0);

  size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
  size_t index_end = index_begin + average_entries_per_lock;
  if (index_end >= length) {
    // Going to end
    index_end = length;
    *state = SIZE_MAX;
  } else {
    *state = index_end << (sizeof(size_t) * 8u - length_bits);
  }

  table_.ConstApplyToEntriesRange(
      [callback](const HandleImpl& h) {
        UniqueId64x2 unhashed;
        callback(ReverseHash(h.hashed_key, &unhashed), h.value,
                 h.GetTotalCharge(), h.deleter);
      },
      index_begin, index_end, false);
}

int HyperClockTable::CalcHashBits(
    size_t capacity, size_t estimated_value_size,
    CacheMetadataChargePolicy metadata_charge_policy) {
  double average_slot_charge = estimated_value_size * kLoadFactor;
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    average_slot_charge += sizeof(HandleImpl);
  }
  assert(average_slot_charge > 0.0);
  uint64_t num_slots =
      static_cast<uint64_t>(capacity / average_slot_charge + 0.999999);

  int hash_bits = FloorLog2((num_slots << 1) - 1);
  if (metadata_charge_policy == kFullChargeCacheMetadata) {
    // For very small estimated value sizes, it's possible to overshoot
    while (hash_bits > 0 &&
           uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) {
      hash_bits--;
    }
  }
  return hash_bits;
}

template <class Table>
void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
  capacity_.store(capacity, std::memory_order_relaxed);
  // next Insert will take care of any necessary evictions
}

template <class Table>
void ClockCacheShard<Table>::SetStrictCapacityLimit(
    bool strict_capacity_limit) {
  strict_capacity_limit_.store(strict_capacity_limit,
                               std::memory_order_relaxed);
  // next Insert will take care of any necessary evictions
}

template <class Table>
Status ClockCacheShard<Table>::Insert(const Slice& key,
                                      const UniqueId64x2& hashed_key,
                                      void* value, size_t charge,
                                      Cache::DeleterFn deleter,
                                      HandleImpl** handle,
                                      Cache::Priority priority) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return Status::NotSupported("ClockCache only supports key size " +
                                std::to_string(kCacheKeySize) + "B");
  }
  ClockHandleBasicData proto;
  proto.hashed_key = hashed_key;
  proto.value = value;
  proto.deleter = deleter;
  proto.total_charge = charge;
  Status s = table_.Insert(
      proto, handle, priority, capacity_.load(std::memory_order_relaxed),
      strict_capacity_limit_.load(std::memory_order_relaxed));
  return s;
}

template <class Table>
typename ClockCacheShard<Table>::HandleImpl* ClockCacheShard<Table>::Lookup(
    const Slice& key, const UniqueId64x2& hashed_key) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return nullptr;
  }
  return table_.Lookup(hashed_key);
}

template <class Table>
bool ClockCacheShard<Table>::Ref(HandleImpl* h) {
  if (h == nullptr) {
    return false;
  }
  table_.Ref(*h);
  return true;
}

template <class Table>
bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
                                     bool erase_if_last_ref) {
  if (handle == nullptr) {
    return false;
  }
  return table_.Release(handle, useful, erase_if_last_ref);
}

template <class Table>
void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
  table_.TEST_RefN(*h, n);
}

template <class Table>
void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
  table_.TEST_ReleaseN(h, n);
}

template <class Table>
bool ClockCacheShard<Table>::Release(HandleImpl* handle,
                                     bool erase_if_last_ref) {
  return Release(handle, /*useful=*/true, erase_if_last_ref);
}

template <class Table>
void ClockCacheShard<Table>::Erase(const Slice& key,
                                   const UniqueId64x2& hashed_key) {
  if (UNLIKELY(key.size() != kCacheKeySize)) {
    return;
  }
  table_.Erase(hashed_key);
}

template <class Table>
size_t ClockCacheShard<Table>::GetUsage() const {
  return table_.GetUsage();
}

template <class Table>
size_t ClockCacheShard<Table>::GetDetachedUsage() const {
  return table_.GetDetachedUsage();
}

template <class Table>
size_t ClockCacheShard<Table>::GetCapacity() const {
  return capacity_;
}

template <class Table>
size_t ClockCacheShard<Table>::GetPinnedUsage() const {
  // Computes the pinned usage by scanning the whole hash table. This
  // is slow, but avoids keeping an exact counter on the clock usage,
  // i.e., the number of not externally referenced elements.
  // Why avoid this counter? Because Lookup removes elements from the clock
  // list, so it would need to update the pinned usage every time,
  // which creates additional synchronization costs.
  size_t table_pinned_usage = 0;
  const bool charge_metadata =
      metadata_charge_policy_ == kFullChargeCacheMetadata;
  table_.ConstApplyToEntriesRange(
      [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
        uint64_t meta = h.meta.load(std::memory_order_relaxed);
        uint64_t refcount = GetRefcount(meta);
        // Holding one ref for ConstApplyToEntriesRange
        assert(refcount > 0);
        if (refcount > 1) {
          table_pinned_usage += h.GetTotalCharge();
          if (charge_metadata) {
            table_pinned_usage += sizeof(HandleImpl);
          }
        }
      },
      0, table_.GetTableSize(), true);

  return table_pinned_usage + table_.GetDetachedUsage();
}

template <class Table>
size_t ClockCacheShard<Table>::GetOccupancyCount() const {
  return table_.GetOccupancy();
}

template <class Table>
size_t ClockCacheShard<Table>::GetOccupancyLimit() const {
  return table_.GetOccupancyLimit();
}

template <class Table>
size_t ClockCacheShard<Table>::GetTableAddressCount() const {
  return table_.GetTableSize();
}

// Explicit instantiation
template class ClockCacheShard<HyperClockTable>;

HyperClockCache::HyperClockCache(
    size_t capacity, size_t estimated_value_size, int num_shard_bits,
    bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy,
    std::shared_ptr<MemoryAllocator> memory_allocator)
    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
                   std::move(memory_allocator)) {
  assert(estimated_value_size > 0 ||
         metadata_charge_policy != kDontChargeCacheMetadata);
  // TODO: should not need to go through two levels of pointer indirection to
  // get to table entries
  size_t per_shard = GetPerShardCapacity();
  InitShards([=](Shard* cs) {
    HyperClockTable::Opts opts;
    opts.estimated_value_size = estimated_value_size;
    new (cs)
        Shard(per_shard, strict_capacity_limit, metadata_charge_policy, opts);
  });
}

void* HyperClockCache::Value(Handle* handle) {
  return reinterpret_cast<const HandleImpl*>(handle)->value;
}

size_t HyperClockCache::GetCharge(Handle* handle) const {
  return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge();
}

Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
  auto h = reinterpret_cast<const HandleImpl*>(handle);
  return h->deleter;
}

// For each cache shard, estimate what the table load factor would be if
// cache filled to capacity with average entries. This is considered
// indicative of a potential problem if the shard is essentially operating
// "at limit", which we define as high actual usage (>80% of capacity)
// or actual occupancy very close to limit (>95% of limit).
// Also, for each shard compute the recommended estimated_entry_charge,
// and keep the minimum one for use as overall recommendation.
void AddShardEvaluation(const HyperClockCache::Shard& shard,
                        std::vector<double>& predicted_load_factors,
                        size_t& min_recommendation) {
  size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
  size_t capacity = shard.GetCapacity();
  double usage_ratio = 1.0 * usage / capacity;

  size_t occupancy = shard.GetOccupancyCount();
  size_t occ_limit = shard.GetOccupancyLimit();
  double occ_ratio = 1.0 * occupancy / occ_limit;
  if (usage == 0 || occupancy == 0 ||
      (usage_ratio < 0.8 && occ_ratio < 0.95)) {
    // Skip as described above
    return;
  }

  // If filled to capacity, what would the occupancy ratio be?
  double ratio = occ_ratio / usage_ratio;
  // Given max load factor, what would that load factor be?
  double lf = ratio * kStrictLoadFactor;
  predicted_load_factors.push_back(lf);

  // Update min_recommendation also
  size_t recommendation = usage / occupancy;
  min_recommendation = std::min(min_recommendation, recommendation);
}

void HyperClockCache::ReportProblems(
    const std::shared_ptr<Logger>& info_log) const {
  uint32_t shard_count = GetNumShards();
  std::vector<double> predicted_load_factors;
  size_t min_recommendation = SIZE_MAX;
  const_cast<HyperClockCache*>(this)->ForEachShard(
      [&](HyperClockCache::Shard* shard) {
        AddShardEvaluation(*shard, predicted_load_factors, min_recommendation);
      });

  if (predicted_load_factors.empty()) {
    // None operating "at limit" -> nothing to report
    return;
  }
  std::sort(predicted_load_factors.begin(), predicted_load_factors.end());

  // First, if the average load factor is within spec, we aren't going to
  // complain about a few shards being out of spec.
  // NOTE: this is only the average among cache shards operating "at limit,"
  // which should be representative of what we care about. It is normal, even
  // desirable, for a cache to operate "at limit" so this should not create
  // selection bias. See AddShardEvaluation().
  // TODO: Consider detecting cases where decreasing the number of shards
  // would be good, e.g. serious imbalance among shards.
  double average_load_factor =
      std::accumulate(predicted_load_factors.begin(),
                      predicted_load_factors.end(), 0.0) /
      shard_count;

  constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
  constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
  if (average_load_factor > kLoadFactor) {
    // Out of spec => Consider reporting load factor too high
    // Estimate effective overall capacity loss due to enforcing
    // occupancy limit
    double lost_portion = 0.0;
    int over_count = 0;
    for (double lf : predicted_load_factors) {
      if (lf > kStrictLoadFactor) {
        ++over_count;
        lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
      }
    }
    // >= 20% loss -> error
    // >= 10% loss -> consistent warning
    // >= 1% loss -> intermittent warning
    InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
    bool report = true;
    if (lost_portion > 0.2) {
      level = InfoLogLevel::ERROR_LEVEL;
    } else if (lost_portion > 0.1) {
      level = InfoLogLevel::WARN_LEVEL;
    } else if (lost_portion > 0.01) {
      int report_percent = static_cast<int>(lost_portion * 100.0);
      if (Random::GetTLSInstance()->PercentTrue(report_percent)) {
        level = InfoLogLevel::WARN_LEVEL;
      }
    } else {
      // don't report
      report = false;
    }
    if (report) {
      ROCKS_LOG_AT_LEVEL(
          info_log, level,
          "HyperClockCache@%p unable to use estimated %.1f%% capacity because "
          "of full occupancy in %d/%u cache shards (estimated_entry_charge "
          "too high). Recommend estimated_entry_charge=%zu",
          this, lost_portion * 100.0, over_count, (unsigned)shard_count,
          min_recommendation);
    }
  } else if (average_load_factor < kLowSpecLoadFactor) {
    // Out of spec => Consider reporting load factor too low
    // But cautiously because low is not as big of a problem.

    // Only report if highest occupancy shard is also below
    // spec and only if average is substantially out of spec
    if (predicted_load_factors.back() < kLowSpecLoadFactor &&
        average_load_factor < kLowSpecLoadFactor / 1.414) {
      InfoLogLevel level = InfoLogLevel::INFO_LEVEL;
      if (average_load_factor < kLowSpecLoadFactor / 2) {
        level = InfoLogLevel::WARN_LEVEL;
      }
      ROCKS_LOG_AT_LEVEL(
          info_log, level,
          "HyperClockCache@%p table has low occupancy at full capacity. "
          "Higher estimated_entry_charge (about %.1fx) would likely improve "
          "performance. Recommend estimated_entry_charge=%zu",
          this, kMidSpecLoadFactor / average_load_factor, min_recommendation);
    }
  }
}

}  // namespace clock_cache

// DEPRECATED (see public API)
std::shared_ptr<Cache> NewClockCache(
    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy) {
  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
                     /* high_pri_pool_ratio */ 0.5, nullptr,
                     kDefaultToAdaptiveMutex, metadata_charge_policy,
                     /* low_pri_pool_ratio */ 0.0);
}

std::shared_ptr<Cache> HyperClockCacheOptions::MakeSharedCache() const {
  auto my_num_shard_bits = num_shard_bits;
  if (my_num_shard_bits >= 20) {
    return nullptr;  // The cache cannot be sharded into too many fine pieces.
  }
  if (my_num_shard_bits < 0) {
    // Use larger shard size to reduce risk of large entries clustering
    // or skewing individual shards.
    constexpr size_t min_shard_size = 32U * 1024U * 1024U;
    my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size);
  }
  return std::make_shared<clock_cache::HyperClockCache>(
      capacity, estimated_entry_charge, my_num_shard_bits,
      strict_capacity_limit, metadata_charge_policy, memory_allocator);
}
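
// Example usage (a sketch; see HyperClockCacheOptions in rocksdb/cache.h for
// the authoritative parameters):
//
//   HyperClockCacheOptions opts(/*capacity=*/1 << 30,
//                               /*estimated_entry_charge=*/32 * 1024);
//   std::shared_ptr<Cache> cache = opts.MakeSharedCache();
//   BlockBasedTableOptions table_opts;
//   table_opts.block_cache = cache;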

}  // namespace ROCKSDB_NAMESPACE