// Copyright (c) 2018-Present Red Hat Inc. All rights reserved.
//
// Copyright (c) 2011-2018, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 and Apache 2.0 License
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include "BinnedLRUCache.h"

#include <stdio.h>
#include <stdlib.h>
#include <string>

#define dout_context cct
#define dout_subsys ceph_subsys_rocksdb
#undef dout_prefix
#define dout_prefix *_dout << "rocksdb: "

namespace rocksdb_cache {

BinnedLRUHandleTable::BinnedLRUHandleTable() : list_(nullptr), length_(0), elems_(0) {
  Resize();
}

BinnedLRUHandleTable::~BinnedLRUHandleTable() {
  ApplyToAllCacheEntries([](BinnedLRUHandle* h) {
    if (h->refs == 1) {
      h->Free();
    }
  });
  delete[] list_;
}

BinnedLRUHandle* BinnedLRUHandleTable::Lookup(const rocksdb::Slice& key, uint32_t hash) {
  return *FindPointer(key, hash);
}

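// Insert "h" into the table. If an entry with the same key and hash already
// exists, it is unlinked and returned so the caller can drop its reference;
// otherwise nullptr is returned.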
BinnedLRUHandle* BinnedLRUHandleTable::Insert(BinnedLRUHandle* h) {
  BinnedLRUHandle** ptr = FindPointer(h->key(), h->hash);
  BinnedLRUHandle* old = *ptr;
  h->next_hash = (old == nullptr ? nullptr : old->next_hash);
  *ptr = h;
  if (old == nullptr) {
    ++elems_;
    if (elems_ > length_) {
      // Since each cache entry is fairly large, we aim for a small
      // average linked list length (<= 1).
      Resize();
    }
  }
  return old;
}

BinnedLRUHandle* BinnedLRUHandleTable::Remove(const rocksdb::Slice& key, uint32_t hash) {
  BinnedLRUHandle** ptr = FindPointer(key, hash);
  BinnedLRUHandle* result = *ptr;
  if (result != nullptr) {
    *ptr = result->next_hash;
    --elems_;
  }
  return result;
}

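// Return a pointer to the slot that points to a cache entry matching
// key/hash. If there is no such entry, return a pointer to the trailing
// slot in the corresponding hash chain, so callers can insert or unlink
// in place.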
BinnedLRUHandle** BinnedLRUHandleTable::FindPointer(const rocksdb::Slice& key, uint32_t hash) {
  BinnedLRUHandle** ptr = &list_[hash & (length_ - 1)];
  while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) {
    ptr = &(*ptr)->next_hash;
  }
  return ptr;
}

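// Grow the bucket array to the next power of two that keeps the load
// factor at or below ~2/3 (new_length >= elems_ * 1.5) and rehash all
// entries. Power-of-two lengths let FindPointer() mask the hash rather
// than compute a modulo.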
void BinnedLRUHandleTable::Resize() {
  uint32_t new_length = 16;
  while (new_length < elems_ * 1.5) {
    new_length *= 2;
  }
  BinnedLRUHandle** new_list = new BinnedLRUHandle*[new_length];
  memset(new_list, 0, sizeof(new_list[0]) * new_length);
  uint32_t count = 0;
  for (uint32_t i = 0; i < length_; i++) {
    BinnedLRUHandle* h = list_[i];
    while (h != nullptr) {
      BinnedLRUHandle* next = h->next_hash;
      uint32_t hash = h->hash;
      BinnedLRUHandle** ptr = &new_list[hash & (new_length - 1)];
      h->next_hash = *ptr;
      *ptr = h;
      h = next;
      count++;
    }
  }
  ceph_assert(elems_ == count);
  delete[] list_;
  list_ = new_list;
  length_ = new_length;
}

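// lru_ is the sentinel of a circular doubly-linked list: lru_.next is the
// least-recently-used entry (the eviction candidate) and lru_.prev the
// most-recently-used one. lru_low_pri_ tracks the most-recently-used entry
// of the low-priority pool, i.e. the boundary with the high-priority pool.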
BinnedLRUCacheShard::BinnedLRUCacheShard(size_t capacity, bool strict_capacity_limit,
                                         double high_pri_pool_ratio)
    : capacity_(0),
      high_pri_pool_usage_(0),
      strict_capacity_limit_(strict_capacity_limit),
      high_pri_pool_ratio_(high_pri_pool_ratio),
      high_pri_pool_capacity_(0),
      usage_(0),
      lru_usage_(0) {
  // Make empty circular linked list
  lru_.next = &lru_;
  lru_.prev = &lru_;
  lru_low_pri_ = &lru_;
  SetCapacity(capacity);
}

BinnedLRUCacheShard::~BinnedLRUCacheShard() {}

bool BinnedLRUCacheShard::Unref(BinnedLRUHandle* e) {
  ceph_assert(e->refs > 0);
  e->refs--;
  return e->refs == 0;
}

// Call deleter and free
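// every entry on the LRU list. Only entries whose sole remaining reference
// is the cache's own (refs == 1) live on that list, so nothing a client
// still holds is freed; the Free() calls happen outside the mutex.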
void BinnedLRUCacheShard::EraseUnRefEntries() {
  ceph::autovector<BinnedLRUHandle*> last_reference_list;
  {
    std::lock_guard<std::mutex> l(mutex_);
    while (lru_.next != &lru_) {
      BinnedLRUHandle* old = lru_.next;
      ceph_assert(old->InCache());
      ceph_assert(old->refs == 1);  // LRU list contains elements which may be evicted
      LRU_Remove(old);
      table_.Remove(old->key(), old->hash);
      old->SetInCache(false);
      Unref(old);
      usage_ -= old->charge;
      last_reference_list.push_back(old);
    }
  }

  for (auto entry : last_reference_list) {
    entry->Free();
  }
}

void BinnedLRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                                 bool thread_safe) {
  if (thread_safe) {
    mutex_.lock();
  }
  table_.ApplyToAllCacheEntries(
      [callback](BinnedLRUHandle* h) { callback(h->value, h->charge); });
  if (thread_safe) {
    mutex_.unlock();
  }
}

void BinnedLRUCacheShard::TEST_GetLRUList(BinnedLRUHandle** lru, BinnedLRUHandle** lru_low_pri) {
  *lru = &lru_;
  *lru_low_pri = lru_low_pri_;
}

size_t BinnedLRUCacheShard::TEST_GetLRUSize() {
  BinnedLRUHandle* lru_handle = lru_.next;
  size_t lru_size = 0;
  while (lru_handle != &lru_) {
    lru_size++;
    lru_handle = lru_handle->next;
  }
  return lru_size;
}

double BinnedLRUCacheShard::GetHighPriPoolRatio() const {
  std::lock_guard<std::mutex> l(mutex_);
  return high_pri_pool_ratio_;
}

size_t BinnedLRUCacheShard::GetHighPriPoolUsage() const {
  std::lock_guard<std::mutex> l(mutex_);
  return high_pri_pool_usage_;
}

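// Unlink "e" from the LRU list and update the usage accounting; if "e" was
// the low-pri boundary, the boundary moves back to its predecessor.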
void BinnedLRUCacheShard::LRU_Remove(BinnedLRUHandle* e) {
  ceph_assert(e->next != nullptr);
  ceph_assert(e->prev != nullptr);
  if (lru_low_pri_ == e) {
    lru_low_pri_ = e->prev;
  }
  e->next->prev = e->prev;
  e->prev->next = e->next;
  e->prev = e->next = nullptr;
  lru_usage_ -= e->charge;
  if (e->InHighPriPool()) {
    ceph_assert(high_pri_pool_usage_ >= e->charge);
    high_pri_pool_usage_ -= e->charge;
  }
}

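// Link "e" into the LRU list according to its priority: high-pri entries go
// to the head of the whole list, low-pri entries to the head of the low-pri
// pool tracked by lru_low_pri_.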
void BinnedLRUCacheShard::LRU_Insert(BinnedLRUHandle* e) {
  ceph_assert(e->next == nullptr);
  ceph_assert(e->prev == nullptr);
  if (high_pri_pool_ratio_ > 0 && e->IsHighPri()) {
    // Insert "e" at the head of the LRU list.
    e->next = &lru_;
    e->prev = lru_.prev;
    e->prev->next = e;
    e->next->prev = e;
    e->SetInHighPriPool(true);
    high_pri_pool_usage_ += e->charge;
    MaintainPoolSize();
  } else {
    // Insert "e" at the head of the low-pri pool. Note that when
    // high_pri_pool_ratio is 0, the head of the low-pri pool is also the
    // head of the LRU list.
    e->next = lru_low_pri_->next;
    e->prev = lru_low_pri_;
    e->prev->next = e;
    e->next->prev = e;
    e->SetInHighPriPool(false);
    lru_low_pri_ = e;
  }
  lru_usage_ += e->charge;
}

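// Bring the high-pri pool back under its capacity by demoting its oldest
// entries into the low-pri pool (advancing lru_low_pri_ toward the head).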
void BinnedLRUCacheShard::MaintainPoolSize() {
  while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
    // Overflow last entry in high-pri pool to low-pri pool.
    lru_low_pri_ = lru_low_pri_->next;
    ceph_assert(lru_low_pri_ != &lru_);
    lru_low_pri_->SetInHighPriPool(false);
    high_pri_pool_usage_ -= lru_low_pri_->charge;
  }
}

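// Evict from the cold end of the LRU list until "charge" more bytes would
// fit within capacity_, or the list is exhausted. Evicted handles are
// appended to "deleted" so the caller can free them after dropping the
// mutex.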
void BinnedLRUCacheShard::EvictFromLRU(size_t charge,
                                       ceph::autovector<BinnedLRUHandle*>* deleted) {
  while (usage_ + charge > capacity_ && lru_.next != &lru_) {
    BinnedLRUHandle* old = lru_.next;
    ceph_assert(old->InCache());
    ceph_assert(old->refs == 1);  // LRU list contains elements which may be evicted
    LRU_Remove(old);
    table_.Remove(old->key(), old->hash);
    old->SetInCache(false);
    Unref(old);
    usage_ -= old->charge;
    deleted->push_back(old);
  }
}

void BinnedLRUCacheShard::SetCapacity(size_t capacity) {
  ceph::autovector<BinnedLRUHandle*> last_reference_list;
  {
    std::lock_guard<std::mutex> l(mutex_);
    capacity_ = capacity;
    high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
    EvictFromLRU(0, &last_reference_list);
  }
  // we free the entries here outside of mutex for
  // performance reasons
  for (auto entry : last_reference_list) {
    entry->Free();
  }
}

void BinnedLRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
  std::lock_guard<std::mutex> l(mutex_);
  strict_capacity_limit_ = strict_capacity_limit;
}

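// A hit takes the entry off the LRU list while the caller holds a
// reference; Release() puts it back once the last external reference is
// dropped. Entries are therefore only evictable while unreferenced.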
rocksdb::Cache::Handle* BinnedLRUCacheShard::Lookup(const rocksdb::Slice& key, uint32_t hash) {
  std::lock_guard<std::mutex> l(mutex_);
  BinnedLRUHandle* e = table_.Lookup(key, hash);
  if (e != nullptr) {
    ceph_assert(e->InCache());
    if (e->refs == 1) {
      LRU_Remove(e);
    }
    e->refs++;
    e->SetHit();
  }
  return reinterpret_cast<rocksdb::Cache::Handle*>(e);
}

bool BinnedLRUCacheShard::Ref(rocksdb::Cache::Handle* h) {
  BinnedLRUHandle* handle = reinterpret_cast<BinnedLRUHandle*>(h);
  std::lock_guard<std::mutex> l(mutex_);
  if (handle->InCache() && handle->refs == 1) {
    LRU_Remove(handle);
  }
  handle->refs++;
  return true;
}

void BinnedLRUCacheShard::SetHighPriPoolRatio(double high_pri_pool_ratio) {
  std::lock_guard<std::mutex> l(mutex_);
  high_pri_pool_ratio_ = high_pri_pool_ratio;
  high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
  MaintainPoolSize();
}

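// Drop one reference to "handle". If that was the last reference, or if the
// entry's only remaining user is the cache and the cache is over capacity
// (or force_erase is set), the entry is removed and freed; otherwise it
// goes back on the LRU list. Returns true iff the entry was freed.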
bool BinnedLRUCacheShard::Release(rocksdb::Cache::Handle* handle, bool force_erase) {
  if (handle == nullptr) {
    return false;
  }
  BinnedLRUHandle* e = reinterpret_cast<BinnedLRUHandle*>(handle);
  bool last_reference = false;
  {
    std::lock_guard<std::mutex> l(mutex_);
    last_reference = Unref(e);
    if (last_reference) {
      usage_ -= e->charge;
    }
    if (e->refs == 1 && e->InCache()) {
      // The item is still in cache, and nobody else holds a reference to it
      if (usage_ > capacity_ || force_erase) {
        // The cache is over capacity (or erasure was forced). If it is over
        // capacity, the LRU list must be empty, since any unreferenced
        // entries would already have been evicted.
        ceph_assert(!(usage_ > capacity_) || lru_.next == &lru_);
        // take this opportunity and remove the item
        table_.Remove(e->key(), e->hash);
        e->SetInCache(false);
        Unref(e);
        usage_ -= e->charge;
        last_reference = true;
      } else {
        // put the item on the list to be potentially freed
        LRU_Insert(e);
      }
    }
  }

  // free outside of mutex
  if (last_reference) {
    e->Free();
  }
  return last_reference;
}

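// Create a handle for (key, value) and insert it, evicting cold entries
// first to make room. If the insert still does not fit and the capacity
// limit is strict (or no handle is returned to pin the entry), the insert
// is abandoned; otherwise the cache may temporarily exceed capacity_. An
// existing entry with the same key is displaced.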
rocksdb::Status BinnedLRUCacheShard::Insert(const rocksdb::Slice& key, uint32_t hash, void* value,
                                            size_t charge,
                                            void (*deleter)(const rocksdb::Slice& key, void* value),
                                            rocksdb::Cache::Handle** handle, rocksdb::Cache::Priority priority) {
  auto e = new BinnedLRUHandle();
  rocksdb::Status s;
  ceph::autovector<BinnedLRUHandle*> last_reference_list;

  e->value = value;
  e->deleter = deleter;
  e->charge = charge;
  e->key_length = key.size();
  e->key_data = new char[e->key_length];
  e->flags = 0;
  e->hash = hash;
  e->refs = (handle == nullptr
                 ? 1
                 : 2);  // One from BinnedLRUCache, one for the returned handle
  e->next = e->prev = nullptr;
  e->SetInCache(true);
  e->SetPriority(priority);
  std::copy_n(key.data(), e->key_length, e->key_data);

  {
    std::lock_guard<std::mutex> l(mutex_);
    // Free the space following strict LRU policy until enough space
    // is freed or the lru list is empty
    EvictFromLRU(charge, &last_reference_list);

    if (usage_ - lru_usage_ + charge > capacity_ &&
        (strict_capacity_limit_ || handle == nullptr)) {
      if (handle == nullptr) {
        // Don't insert the entry but still return ok, as if the entry were
        // inserted into the cache and evicted immediately.
        last_reference_list.push_back(e);
      } else {
        delete e;
        *handle = nullptr;
        s = rocksdb::Status::Incomplete("Insert failed due to LRU cache being full.");
      }
    } else {
      // Insert into the cache. Note that the cache might get larger than its
      // capacity if not enough space was freed up.
      BinnedLRUHandle* old = table_.Insert(e);
      usage_ += e->charge;
      if (old != nullptr) {
        old->SetInCache(false);
        if (Unref(old)) {
          usage_ -= old->charge;
          // old is on the LRU list because it was in the cache and its
          // reference count was 1 (Unref() just dropped it to 0)
          LRU_Remove(old);
          last_reference_list.push_back(old);
        }
      }
      if (handle == nullptr) {
        LRU_Insert(e);
      } else {
        *handle = reinterpret_cast<rocksdb::Cache::Handle*>(e);
      }
      s = rocksdb::Status::OK();
    }
  }

  // we free the entries here outside of mutex for
  // performance reasons
  for (auto entry : last_reference_list) {
    entry->Free();
  }

  return s;
}

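// Remove the entry for (key, hash) from the table, if present. It is freed
// immediately only when the cache held the last reference; otherwise the
// final Release() frees it.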
void BinnedLRUCacheShard::Erase(const rocksdb::Slice& key, uint32_t hash) {
  BinnedLRUHandle* e;
  bool last_reference = false;
  {
    std::lock_guard<std::mutex> l(mutex_);
    e = table_.Remove(key, hash);
    if (e != nullptr) {
      last_reference = Unref(e);
      if (last_reference) {
        usage_ -= e->charge;
      }
      if (last_reference && e->InCache()) {
        LRU_Remove(e);
      }
      e->SetInCache(false);
    }
  }

  // mutex not held here
  // last_reference will only be true if e != nullptr
  if (last_reference) {
    e->Free();
  }
}

size_t BinnedLRUCacheShard::GetUsage() const {
  std::lock_guard<std::mutex> l(mutex_);
  return usage_;
}

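// Pinned usage is the charge of entries currently referenced by clients:
// everything counted in usage_ that is not sitting on the LRU list.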
size_t BinnedLRUCacheShard::GetPinnedUsage() const {
  std::lock_guard<std::mutex> l(mutex_);
  ceph_assert(usage_ >= lru_usage_);
  return usage_ - lru_usage_;
}

std::string BinnedLRUCacheShard::GetPrintableOptions() const {
  const int kBufferSize = 200;
  char buffer[kBufferSize];
  {
    std::lock_guard<std::mutex> l(mutex_);
    snprintf(buffer, kBufferSize, "    high_pri_pool_ratio: %.3lf\n",
             high_pri_pool_ratio_);
  }
  return std::string(buffer);
}

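// The cache is split into 2^num_shard_bits independently locked shards,
// each getting an equal slice of the capacity. Shards are constructed by
// placement new into a single cache-line-aligned allocation.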
BinnedLRUCache::BinnedLRUCache(CephContext *c,
                               size_t capacity,
                               int num_shard_bits,
                               bool strict_capacity_limit,
                               double high_pri_pool_ratio)
    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit), cct(c) {
  num_shards_ = 1 << num_shard_bits;
  // TODO: Switch over to use mempool
  int rc = posix_memalign((void**) &shards_,
                          CACHE_LINE_SIZE,
                          sizeof(BinnedLRUCacheShard) * num_shards_);
  if (rc != 0) {
    throw std::bad_alloc();
  }
  size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;
  for (int i = 0; i < num_shards_; i++) {
    new (&shards_[i])
        BinnedLRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio);
  }
}

BinnedLRUCache::~BinnedLRUCache() {
  for (int i = 0; i < num_shards_; i++) {
    shards_[i].~BinnedLRUCacheShard();
  }
  free(shards_);
}

CacheShard* BinnedLRUCache::GetShard(int shard) {
  return reinterpret_cast<CacheShard*>(&shards_[shard]);
}

const CacheShard* BinnedLRUCache::GetShard(int shard) const {
  return reinterpret_cast<CacheShard*>(&shards_[shard]);
}

void* BinnedLRUCache::Value(Handle* handle) {
  return reinterpret_cast<const BinnedLRUHandle*>(handle)->value;
}

size_t BinnedLRUCache::GetCharge(Handle* handle) const {
  return reinterpret_cast<const BinnedLRUHandle*>(handle)->charge;
}

uint32_t BinnedLRUCache::GetHash(Handle* handle) const {
  return reinterpret_cast<const BinnedLRUHandle*>(handle)->hash;
}

void BinnedLRUCache::DisownData() {
// Do not drop data if compiled with ASAN, to suppress leak warnings.
#ifndef __SANITIZE_ADDRESS__
  shards_ = nullptr;
#endif  // !__SANITIZE_ADDRESS__
}

size_t BinnedLRUCache::TEST_GetLRUSize() {
  size_t lru_size_of_all_shards = 0;
  for (int i = 0; i < num_shards_; i++) {
    lru_size_of_all_shards += shards_[i].TEST_GetLRUSize();
  }
  return lru_size_of_all_shards;
}

void BinnedLRUCache::SetHighPriPoolRatio(double high_pri_pool_ratio) {
  for (int i = 0; i < num_shards_; i++) {
    shards_[i].SetHighPriPoolRatio(high_pri_pool_ratio);
  }
}

double BinnedLRUCache::GetHighPriPoolRatio() const {
  double result = 0.0;
  if (num_shards_ > 0) {
    result = shards_[0].GetHighPriPoolRatio();
  }
  return result;
}

size_t BinnedLRUCache::GetHighPriPoolUsage() const {
  // We will not lock the cache when getting the usage from shards.
  size_t usage = 0;
  for (int s = 0; s < num_shards_; s++) {
    usage += shards_[s].GetHighPriPoolUsage();
  }
  return usage;
}

// PriCache

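// Report how many bytes this cache wants at the given priority beyond what
// has already been assigned to it. RocksDB's high-priority pool
// (index/filter blocks) maps to PRI0; everything else is lumped into LAST.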
int64_t BinnedLRUCache::request_cache_bytes(PriorityCache::Priority pri, uint64_t total_cache) const
{
  int64_t assigned = get_cache_bytes(pri);
  int64_t request = 0;

  switch (pri) {
  // PRI0 is for rocksdb's high priority items (indexes/filters)
  case PriorityCache::Priority::PRI0:
    {
      request = GetHighPriPoolUsage();
      break;
    }
  // All other cache items are currently shoved into the LAST priority.
  case PriorityCache::Priority::LAST:
    {
      request = GetUsage();
      request -= GetHighPriPoolUsage();
      break;
    }
  default:
    break;
  }
  request = (request > assigned) ? request - assigned : 0;
  ldout(cct, 10) << __func__ << " Priority: " << static_cast<uint32_t>(pri)
                 << " Request: " << request << dendl;
  return request;
}

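// Apply the byte total granted by the priority cache balancer: resize the
// cache to the size computed by PriorityCache::get_chunk() and rescale the
// high-pri pool ratio so the PRI0 bytes keep their share of the new
// capacity.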
int64_t BinnedLRUCache::commit_cache_size(uint64_t total_bytes)
{
  size_t old_bytes = GetCapacity();
  int64_t new_bytes = PriorityCache::get_chunk(
      get_cache_bytes(), total_bytes);
  ldout(cct, 10) << __func__ << " old: " << old_bytes
                 << " new: " << new_bytes << dendl;
  SetCapacity((size_t) new_bytes);
  double ratio =
      (double) get_cache_bytes(PriorityCache::Priority::PRI0) / new_bytes;
  ldout(cct, 10) << __func__ << " High Pri Pool Ratio set to " << ratio << dendl;
  SetHighPriPoolRatio(ratio);
  return new_bytes;
}

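// Factory function: validates the shard count and the high-pri ratio, and
// derives a default shard count from the capacity when num_shard_bits is
// negative.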
std::shared_ptr<rocksdb::Cache> NewBinnedLRUCache(
    CephContext *c,
    size_t capacity,
    int num_shard_bits,
    bool strict_capacity_limit,
    double high_pri_pool_ratio) {
  if (num_shard_bits >= 20) {
    return nullptr;  // the cache cannot be sharded into too many fine pieces
  }
  if (high_pri_pool_ratio < 0.0 || high_pri_pool_ratio > 1.0) {
    // invalid high_pri_pool_ratio
    return nullptr;
  }
  if (num_shard_bits < 0) {
    num_shard_bits = GetDefaultCacheShardBits(capacity);
  }
  return std::make_shared<BinnedLRUCache>(
      c, capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio);
}

} // namespace rocksdb_cache