// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#pragma once

#ifndef ROCKSDB_LITE

#include <assert.h>

#include <list>
#include <vector>

#ifdef OS_LINUX
#include <sys/mman.h>
#endif

#include "rocksdb/env.h"
#include "util/mutexlock.h"

namespace ROCKSDB_NAMESPACE {

// HashTable<T, Hash, Equal>
//
// Traditional implementations of a hash table with synchronization built on
// top don't perform very well in multi-core scenarios. This implementation is
// designed for multi-core scenarios with high lock contention.
//
//                          |<-------- alpha ------------->|
//               Buckets                 Collision list
//          ---- +----+    +---+---+--- ...... ---+---+---+
//         /     |    |--->|   |   |              |   |   |
//        /      +----+    +---+---+--- ...... ---+---+---+
//       /       |    |
// Locks/        +----+
// +--+/         .  .
// |  |          .  .
// +--+          .  .
// |  |          .  .
// +--+          .  .
// |  |          .  .
// +--+          .  .
//     \         +----+
//      \        |    |
//       \       +----+
//        \      |    |
//         \---- +----+
//
// The lock contention is spread over an array of locks. This helps improve
// concurrent access. The spine is designed for a certain capacity and load
// factor. When the capacity planning is done correctly, the load factor stays
// around 1 and we can expect O(1) insert, access and remove time.
//
// A micro-benchmark on a debug build gives a rate of about 0.5 million/sec
// each for insert, erase and lookup running in parallel (about 1.5 million
// ops/sec in total). With 4 KB blocks, the hash table can support a virtual
// throughput of about 6 GB/s.
//
// T     Object type (contains both key and value)
// Hash  Function that returns a hash from type T
// Equal Returns whether two objects are equal
//       (We need an explicit equal for pointer types)
//
template <class T, class Hash, class Equal>
class HashTable {
 public:
  explicit HashTable(const size_t capacity = 1024 * 1024,
                     const float load_factor = 2.0, const uint32_t nlocks = 256)
      : nbuckets_(
            static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
        nlocks_(nlocks) {
    // pre-conditions
    assert(capacity);
    assert(load_factor);
    assert(nbuckets_);
    assert(nlocks_);

    // allocate the spine; mlock pins it in RAM as a best effort (the return
    // value is ignored)
    buckets_.reset(new Bucket[nbuckets_]);
#ifdef OS_LINUX
    mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
#endif

    // initialize locks
    locks_.reset(new port::RWMutex[nlocks_]);
#ifdef OS_LINUX
    mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
#endif

    // post-conditions
    assert(buckets_);
    assert(locks_);
  }

  virtual ~HashTable() { AssertEmptyBuckets(); }

  //
  // Insert the given record into the hash table
  //
  bool Insert(const T& t) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    WriteLock _(&locks_[lock_idx]);
    auto& bucket = buckets_[bucket_idx];
    return Insert(&bucket, t);
  }

  // Lookup hash table
  //
  // Please note that on a successful lookup the read lock is returned to the
  // caller still held (via ret_lock). This is because the caller owns the
  // data and should keep the read lock held for as long as it operates on
  // the data, releasing it afterwards.
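  // A typical lookup under this contract might look like the following
  // (illustrative sketch only; `table`, `key` and `found` are hypothetical):
  //
  //   T found;
  //   port::RWMutex* lock = nullptr;
  //   if (table.Find(key, &found, &lock)) {
  //     // ... use `found` while the read lock is held ...
  //     lock->ReadUnlock();
  //   }
  //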
  bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    port::RWMutex& lock = locks_[lock_idx];
    lock.ReadLock();

    auto& bucket = buckets_[bucket_idx];
    if (Find(&bucket, t, ret)) {
      *ret_lock = &lock;
      return true;
    }

    lock.ReadUnlock();
    return false;
  }

  //
  // Erase a given key from the hash table
  //
  bool Erase(const T& t, T* ret) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    WriteLock _(&locks_[lock_idx]);

    auto& bucket = buckets_[bucket_idx];
    return Erase(&bucket, t, ret);
  }

  // Fetch the mutex associated with a key
  // This call is used when the lock over a given piece of data needs to be
  // held for an extended period of time.
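  // For example (illustrative sketch only; `table` and `key` are
  // hypothetical), an entry can be pinned across a longer operation with the
  // RAII guards from util/mutexlock.h:
  //
  //   WriteLock guard(table.GetMutex(key));
  //   // ... the bucket that `key` hashes to stays write-locked while
  //   // `guard` is alive ...
  //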
  port::RWMutex* GetMutex(const T& t) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    return &locks_[lock_idx];
  }

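  // Remove all entries from the hash table, invoking `fn` on each entry
  // before its bucket is cleared. Note that the destructor asserts that all
  // buckets are empty, so a table that still owns entries should be drained
  // (for example via Clear()) before it is destroyed. As an illustrative
  // sketch (the element type and deleter are hypothetical), a table of owned
  // pointers could be drained with:
  //
  //   table.Clear([](CacheRecord* r) { delete r; });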
  void Clear(void (*fn)(T)) {
    for (uint32_t i = 0; i < nbuckets_; ++i) {
      const uint32_t lock_idx = i % nlocks_;
      WriteLock _(&locks_[lock_idx]);
      for (auto& t : buckets_[i].list_) {
        (*fn)(t);
      }
      buckets_[i].list_.clear();
    }
  }

 protected:
  // Models a bucket of keys that hash to the same bucket number
  struct Bucket {
    std::list<T> list_;
  };

  // Substitute for std::find with a custom comparator
  typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
    for (auto it = list->begin(); it != list->end(); ++it) {
      if (Equal()(*it, t)) {
        return it;
      }
    }
    return list->end();
  }

  bool Insert(Bucket* bucket, const T& t) {
    // Check if the key already exists
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      return false;
    }

    // insert to bucket
    bucket->list_.push_back(t);
    return true;
  }

  bool Find(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }
      return true;
    }
    return false;
  }

  bool Erase(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }

      bucket->list_.erase(it);
      return true;
    }
    return false;
  }

  // assert that all buckets are empty
  void AssertEmptyBuckets() {
#ifndef NDEBUG
    for (size_t i = 0; i < nbuckets_; ++i) {
      WriteLock _(&locks_[i % nlocks_]);
      assert(buckets_[i].list_.empty());
    }
#endif
  }

  const uint32_t nbuckets_;                 // No. of buckets in the spine
  std::unique_ptr<Bucket[]> buckets_;       // Spine of the hash buckets
  const uint32_t nlocks_;                   // No. of locks
  std::unique_ptr<port::RWMutex[]> locks_;  // Granular locks
};

}  // namespace ROCKSDB_NAMESPACE

#endif