]>
Commit | Line | Data |
---|---|---|
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
6 | #pragma once | |
7 | ||
8 | #ifndef ROCKSDB_LITE | |
9 | ||
#include <assert.h>

#include <list>
#include <memory>
#include <vector>

#ifdef OS_LINUX
#include <sys/mman.h>
#endif

#include "rocksdb/env.h"
#include "util/mutexlock.h"
21 | ||
f67539c2 | 22 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
// HashTable<T, Hash, Equal>
//
// Traditional implementations of hash tables with synchronization built on
// top don't perform very well in multi-core scenarios. This is an
// implementation designed for multi-core scenarios with high lock contention.
//
//          |<-------- alpha ------------->|
//              Buckets                  Collision list
//          ---- +----+    +---+---+--- ...... ---+---+---+
//         /     |    |--->|   |   |              |   |   |
//        /      +----+    +---+---+--- ...... ---+---+---+
//       /       |    |
// Locks/        +----+
// +--+/         .    .
// |  |          .    .
// +--+          .    .
// |  |          .    .
// +--+          .    .
// |  |          .    .
// +--+          .    .
//     \         +----+
//      \        |    |
//       \       +----+
//        \      |    |
//         \---- +----+
//
// The lock contention is spread over an array of locks. This helps improve
// concurrent access. The spine is designed for a certain capacity and load
// factor. When the capacity planning is done correctly we can expect
// O(load_factor = 1) insert, access and remove time.
//
// Micro benchmark on debug build gives about .5 Million/sec rate of insert,
// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the
// blocks were of 4K, the hash table can support a virtual throughput of
// 6 GB/s.
//
// T      Object type (contains both key and value)
// Hash   Function that returns an hash from type T
// Equal  Returns if two objects are equal
//        (We need explicit equal for pointer type)
//
65 | template <class T, class Hash, class Equal> | |
66 | class HashTable { | |
67 | public: | |
68 | explicit HashTable(const size_t capacity = 1024 * 1024, | |
69 | const float load_factor = 2.0, const uint32_t nlocks = 256) | |
70 | : nbuckets_( | |
71 | static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)), | |
72 | nlocks_(nlocks) { | |
73 | // pre-conditions | |
74 | assert(capacity); | |
75 | assert(load_factor); | |
76 | assert(nbuckets_); | |
77 | assert(nlocks_); | |
78 | ||
79 | buckets_.reset(new Bucket[nbuckets_]); | |
80 | #ifdef OS_LINUX | |
81 | mlock(buckets_.get(), nbuckets_ * sizeof(Bucket)); | |
82 | #endif | |
83 | ||
84 | // initialize locks | |
85 | locks_.reset(new port::RWMutex[nlocks_]); | |
86 | #ifdef OS_LINUX | |
87 | mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex)); | |
88 | #endif | |
89 | ||
90 | // post-conditions | |
91 | assert(buckets_); | |
92 | assert(locks_); | |
93 | } | |
94 | ||
95 | virtual ~HashTable() { AssertEmptyBuckets(); } | |
96 | ||
97 | // | |
98 | // Insert given record to hash table | |
99 | // | |
100 | bool Insert(const T& t) { | |
101 | const uint64_t h = Hash()(t); | |
102 | const uint32_t bucket_idx = h % nbuckets_; | |
103 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
104 | ||
105 | WriteLock _(&locks_[lock_idx]); | |
106 | auto& bucket = buckets_[bucket_idx]; | |
107 | return Insert(&bucket, t); | |
108 | } | |
109 | ||
110 | // Lookup hash table | |
111 | // | |
112 | // Please note that read lock should be held by the caller. This is because | |
113 | // the caller owns the data, and should hold the read lock as long as he | |
114 | // operates on the data. | |
115 | bool Find(const T& t, T* ret, port::RWMutex** ret_lock) { | |
116 | const uint64_t h = Hash()(t); | |
117 | const uint32_t bucket_idx = h % nbuckets_; | |
118 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
119 | ||
120 | port::RWMutex& lock = locks_[lock_idx]; | |
121 | lock.ReadLock(); | |
122 | ||
123 | auto& bucket = buckets_[bucket_idx]; | |
124 | if (Find(&bucket, t, ret)) { | |
125 | *ret_lock = &lock; | |
126 | return true; | |
127 | } | |
128 | ||
129 | lock.ReadUnlock(); | |
130 | return false; | |
131 | } | |
132 | ||
133 | // | |
134 | // Erase a given key from the hash table | |
135 | // | |
136 | bool Erase(const T& t, T* ret) { | |
137 | const uint64_t h = Hash()(t); | |
138 | const uint32_t bucket_idx = h % nbuckets_; | |
139 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
140 | ||
141 | WriteLock _(&locks_[lock_idx]); | |
142 | ||
143 | auto& bucket = buckets_[bucket_idx]; | |
144 | return Erase(&bucket, t, ret); | |
145 | } | |
146 | ||
147 | // Fetch the mutex associated with a key | |
148 | // This call is used to hold the lock for a given data for extended period of | |
149 | // time. | |
150 | port::RWMutex* GetMutex(const T& t) { | |
151 | const uint64_t h = Hash()(t); | |
152 | const uint32_t bucket_idx = h % nbuckets_; | |
153 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
154 | ||
155 | return &locks_[lock_idx]; | |
156 | } | |
157 | ||
158 | void Clear(void (*fn)(T)) { | |
159 | for (uint32_t i = 0; i < nbuckets_; ++i) { | |
160 | const uint32_t lock_idx = i % nlocks_; | |
161 | WriteLock _(&locks_[lock_idx]); | |
162 | for (auto& t : buckets_[i].list_) { | |
163 | (*fn)(t); | |
164 | } | |
165 | buckets_[i].list_.clear(); | |
166 | } | |
167 | } | |
168 | ||
169 | protected: | |
170 | // Models bucket of keys that hash to the same bucket number | |
171 | struct Bucket { | |
172 | std::list<T> list_; | |
173 | }; | |
174 | ||
175 | // Substitute for std::find with custom comparator operator | |
176 | typename std::list<T>::iterator Find(std::list<T>* list, const T& t) { | |
177 | for (auto it = list->begin(); it != list->end(); ++it) { | |
178 | if (Equal()(*it, t)) { | |
179 | return it; | |
180 | } | |
181 | } | |
182 | return list->end(); | |
183 | } | |
184 | ||
185 | bool Insert(Bucket* bucket, const T& t) { | |
186 | // Check if the key already exists | |
187 | auto it = Find(&bucket->list_, t); | |
188 | if (it != bucket->list_.end()) { | |
189 | return false; | |
190 | } | |
191 | ||
192 | // insert to bucket | |
193 | bucket->list_.push_back(t); | |
194 | return true; | |
195 | } | |
196 | ||
197 | bool Find(Bucket* bucket, const T& t, T* ret) { | |
198 | auto it = Find(&bucket->list_, t); | |
199 | if (it != bucket->list_.end()) { | |
200 | if (ret) { | |
201 | *ret = *it; | |
202 | } | |
203 | return true; | |
204 | } | |
205 | return false; | |
206 | } | |
207 | ||
208 | bool Erase(Bucket* bucket, const T& t, T* ret) { | |
209 | auto it = Find(&bucket->list_, t); | |
210 | if (it != bucket->list_.end()) { | |
211 | if (ret) { | |
212 | *ret = *it; | |
213 | } | |
214 | ||
215 | bucket->list_.erase(it); | |
216 | return true; | |
217 | } | |
218 | return false; | |
219 | } | |
220 | ||
221 | // assert that all buckets are empty | |
222 | void AssertEmptyBuckets() { | |
223 | #ifndef NDEBUG | |
224 | for (size_t i = 0; i < nbuckets_; ++i) { | |
225 | WriteLock _(&locks_[i % nlocks_]); | |
226 | assert(buckets_[i].list_.empty()); | |
227 | } | |
228 | #endif | |
229 | } | |
230 | ||
231 | const uint32_t nbuckets_; // No. of buckets in the spine | |
232 | std::unique_ptr<Bucket[]> buckets_; // Spine of the hash buckets | |
233 | const uint32_t nlocks_; // No. of locks | |
234 | std::unique_ptr<port::RWMutex[]> locks_; // Granular locks | |
235 | }; | |
236 | ||
f67539c2 | 237 | } // namespace ROCKSDB_NAMESPACE |
7c673cae FG |
238 | |
239 | #endif |