]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/utilities/persistent_cache/hash_table.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / rocksdb / utilities / persistent_cache / hash_table.h
1 // Copyright (c) 2013, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
5 //
6 #pragma once
7
8 #ifndef ROCKSDB_LITE
9
10 #include <assert.h>
11 #include <list>
12 #include <vector>
13
14 #ifdef OS_LINUX
15 #include <sys/mman.h>
16 #endif
17
18 #include "include/rocksdb/env.h"
19 #include "util/mutexlock.h"
20
21 namespace rocksdb {
22
23 // HashTable<T, Hash, Equal>
24 //
25 // Traditional implementation of hash table with synchronization built on top
26 // don't perform very well in multi-core scenarios. This is an implementation
27 // designed for multi-core scenarios with high lock contention.
28 //
29 // |<-------- alpha ------------->|
30 // Buckets Collision list
31 // ---- +----+ +---+---+--- ...... ---+---+---+
32 // / | |--->| | | | | |
33 // / +----+ +---+---+--- ...... ---+---+---+
34 // / | |
35 // Locks/ +----+
36 // +--+/ . .
37 // | | . .
38 // +--+ . .
39 // | | . .
40 // +--+ . .
41 // | | . .
42 // +--+ . .
43 // \ +----+
44 // \ | |
45 // \ +----+
46 // \ | |
47 // \---- +----+
48 //
49 // The lock contention is spread over an array of locks. This helps improve
50 // concurrent access. The spine is designed for a certain capacity and load
51 // factor. When the capacity planning is done correctly we can expect
52 // O(load_factor = 1) insert, access and remove time.
53 //
54 // Micro benchmark on debug build gives about .5 Million/sec rate of insert,
55 // erase and lookup in parallel (total of about 1.5 Million ops/sec). If the
56 // blocks were of 4K, the hash table can support a virtual throughput of
57 // 6 GB/s.
58 //
59 // T Object type (contains both key and value)
60 // Hash Function that returns an hash from type T
61 // Equal Returns if two objects are equal
62 // (We need explicit equal for pointer type)
63 //
64 template <class T, class Hash, class Equal>
65 class HashTable {
66 public:
67 explicit HashTable(const size_t capacity = 1024 * 1024,
68 const float load_factor = 2.0, const uint32_t nlocks = 256)
69 : nbuckets_(
70 static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
71 nlocks_(nlocks) {
72 // pre-conditions
73 assert(capacity);
74 assert(load_factor);
75 assert(nbuckets_);
76 assert(nlocks_);
77
78 buckets_.reset(new Bucket[nbuckets_]);
79 #ifdef OS_LINUX
80 mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
81 #endif
82
83 // initialize locks
84 locks_.reset(new port::RWMutex[nlocks_]);
85 #ifdef OS_LINUX
86 mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
87 #endif
88
89 // post-conditions
90 assert(buckets_);
91 assert(locks_);
92 }
93
94 virtual ~HashTable() { AssertEmptyBuckets(); }
95
96 //
97 // Insert given record to hash table
98 //
99 bool Insert(const T& t) {
100 const uint64_t h = Hash()(t);
101 const uint32_t bucket_idx = h % nbuckets_;
102 const uint32_t lock_idx = bucket_idx % nlocks_;
103
104 WriteLock _(&locks_[lock_idx]);
105 auto& bucket = buckets_[bucket_idx];
106 return Insert(&bucket, t);
107 }
108
109 // Lookup hash table
110 //
111 // Please note that read lock should be held by the caller. This is because
112 // the caller owns the data, and should hold the read lock as long as he
113 // operates on the data.
114 bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
115 const uint64_t h = Hash()(t);
116 const uint32_t bucket_idx = h % nbuckets_;
117 const uint32_t lock_idx = bucket_idx % nlocks_;
118
119 port::RWMutex& lock = locks_[lock_idx];
120 lock.ReadLock();
121
122 auto& bucket = buckets_[bucket_idx];
123 if (Find(&bucket, t, ret)) {
124 *ret_lock = &lock;
125 return true;
126 }
127
128 lock.ReadUnlock();
129 return false;
130 }
131
132 //
133 // Erase a given key from the hash table
134 //
135 bool Erase(const T& t, T* ret) {
136 const uint64_t h = Hash()(t);
137 const uint32_t bucket_idx = h % nbuckets_;
138 const uint32_t lock_idx = bucket_idx % nlocks_;
139
140 WriteLock _(&locks_[lock_idx]);
141
142 auto& bucket = buckets_[bucket_idx];
143 return Erase(&bucket, t, ret);
144 }
145
146 // Fetch the mutex associated with a key
147 // This call is used to hold the lock for a given data for extended period of
148 // time.
149 port::RWMutex* GetMutex(const T& t) {
150 const uint64_t h = Hash()(t);
151 const uint32_t bucket_idx = h % nbuckets_;
152 const uint32_t lock_idx = bucket_idx % nlocks_;
153
154 return &locks_[lock_idx];
155 }
156
157 void Clear(void (*fn)(T)) {
158 for (uint32_t i = 0; i < nbuckets_; ++i) {
159 const uint32_t lock_idx = i % nlocks_;
160 WriteLock _(&locks_[lock_idx]);
161 for (auto& t : buckets_[i].list_) {
162 (*fn)(t);
163 }
164 buckets_[i].list_.clear();
165 }
166 }
167
168 protected:
169 // Models bucket of keys that hash to the same bucket number
170 struct Bucket {
171 std::list<T> list_;
172 };
173
174 // Substitute for std::find with custom comparator operator
175 typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
176 for (auto it = list->begin(); it != list->end(); ++it) {
177 if (Equal()(*it, t)) {
178 return it;
179 }
180 }
181 return list->end();
182 }
183
184 bool Insert(Bucket* bucket, const T& t) {
185 // Check if the key already exists
186 auto it = Find(&bucket->list_, t);
187 if (it != bucket->list_.end()) {
188 return false;
189 }
190
191 // insert to bucket
192 bucket->list_.push_back(t);
193 return true;
194 }
195
196 bool Find(Bucket* bucket, const T& t, T* ret) {
197 auto it = Find(&bucket->list_, t);
198 if (it != bucket->list_.end()) {
199 if (ret) {
200 *ret = *it;
201 }
202 return true;
203 }
204 return false;
205 }
206
207 bool Erase(Bucket* bucket, const T& t, T* ret) {
208 auto it = Find(&bucket->list_, t);
209 if (it != bucket->list_.end()) {
210 if (ret) {
211 *ret = *it;
212 }
213
214 bucket->list_.erase(it);
215 return true;
216 }
217 return false;
218 }
219
220 // assert that all buckets are empty
221 void AssertEmptyBuckets() {
222 #ifndef NDEBUG
223 for (size_t i = 0; i < nbuckets_; ++i) {
224 WriteLock _(&locks_[i % nlocks_]);
225 assert(buckets_[i].list_.empty());
226 }
227 #endif
228 }
229
230 const uint32_t nbuckets_; // No. of buckets in the spine
231 std::unique_ptr<Bucket[]> buckets_; // Spine of the hash buckets
232 const uint32_t nlocks_; // No. of locks
233 std::unique_ptr<port::RWMutex[]> locks_; // Granular locks
234 };
235
236 } // namespace rocksdb
237
238 #endif