]>
Commit | Line | Data |
---|---|---|
// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#pragma once

#ifndef ROCKSDB_LITE

#include <assert.h>
#include <stdint.h>

#include <list>
#include <memory>
#include <vector>

#ifdef OS_LINUX
#include <sys/mman.h>
#endif

#include "include/rocksdb/env.h"
#include "util/mutexlock.h"
20 | ||
21 | namespace rocksdb { | |
22 | ||
// HashTable<T, Hash, Equal>
//
// Traditional implementations of hash tables with synchronization built on
// top do not perform very well in multi-core scenarios. This is an
// implementation designed for multi-core scenarios with high lock contention.
//
//          |<-------- alpha ------------->|
//               Buckets   Collision list
//          ---- +----+    +---+---+--- ...... ---+---+---+
//         /     |    |--->|   |   |              |   |   |
//        /      +----+    +---+---+--- ...... ---+---+---+
//       /       |    |
// Locks/        +----+
// +--+/         .  .
// |  |          .  .
// +--+          .  .
// |  |          .  .
// +--+          .  .
// |  |          .  .
// +--+          .  .
//     \         +----+
//      \        |    |
//       \       +----+
//        \      |    |
//         \---- +----+
//
// The lock contention is spread over an array of locks. This helps improve
// concurrent access. The spine is designed for a certain capacity and load
// factor. When the capacity planning is done correctly we can expect
// O(load_factor = 1) insert, access and remove time.
//
// Micro benchmark on debug build gives about .5 Million/sec rate of insert,
// erase and lookup in parallel (total of about 1.5 Million ops/sec). If the
// blocks were of 4K, the hash table can support a virtual throughput of
// 6 GB/s.
//
// T      Object type (contains both key and value)
// Hash   Function that returns an hash from type T
// Equal  Returns if two objects are equal
//        (We need explicit equal for pointer type)
//
64 | template <class T, class Hash, class Equal> | |
65 | class HashTable { | |
66 | public: | |
67 | explicit HashTable(const size_t capacity = 1024 * 1024, | |
68 | const float load_factor = 2.0, const uint32_t nlocks = 256) | |
69 | : nbuckets_( | |
70 | static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)), | |
71 | nlocks_(nlocks) { | |
72 | // pre-conditions | |
73 | assert(capacity); | |
74 | assert(load_factor); | |
75 | assert(nbuckets_); | |
76 | assert(nlocks_); | |
77 | ||
78 | buckets_.reset(new Bucket[nbuckets_]); | |
79 | #ifdef OS_LINUX | |
80 | mlock(buckets_.get(), nbuckets_ * sizeof(Bucket)); | |
81 | #endif | |
82 | ||
83 | // initialize locks | |
84 | locks_.reset(new port::RWMutex[nlocks_]); | |
85 | #ifdef OS_LINUX | |
86 | mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex)); | |
87 | #endif | |
88 | ||
89 | // post-conditions | |
90 | assert(buckets_); | |
91 | assert(locks_); | |
92 | } | |
93 | ||
94 | virtual ~HashTable() { AssertEmptyBuckets(); } | |
95 | ||
96 | // | |
97 | // Insert given record to hash table | |
98 | // | |
99 | bool Insert(const T& t) { | |
100 | const uint64_t h = Hash()(t); | |
101 | const uint32_t bucket_idx = h % nbuckets_; | |
102 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
103 | ||
104 | WriteLock _(&locks_[lock_idx]); | |
105 | auto& bucket = buckets_[bucket_idx]; | |
106 | return Insert(&bucket, t); | |
107 | } | |
108 | ||
109 | // Lookup hash table | |
110 | // | |
111 | // Please note that read lock should be held by the caller. This is because | |
112 | // the caller owns the data, and should hold the read lock as long as he | |
113 | // operates on the data. | |
114 | bool Find(const T& t, T* ret, port::RWMutex** ret_lock) { | |
115 | const uint64_t h = Hash()(t); | |
116 | const uint32_t bucket_idx = h % nbuckets_; | |
117 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
118 | ||
119 | port::RWMutex& lock = locks_[lock_idx]; | |
120 | lock.ReadLock(); | |
121 | ||
122 | auto& bucket = buckets_[bucket_idx]; | |
123 | if (Find(&bucket, t, ret)) { | |
124 | *ret_lock = &lock; | |
125 | return true; | |
126 | } | |
127 | ||
128 | lock.ReadUnlock(); | |
129 | return false; | |
130 | } | |
131 | ||
132 | // | |
133 | // Erase a given key from the hash table | |
134 | // | |
135 | bool Erase(const T& t, T* ret) { | |
136 | const uint64_t h = Hash()(t); | |
137 | const uint32_t bucket_idx = h % nbuckets_; | |
138 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
139 | ||
140 | WriteLock _(&locks_[lock_idx]); | |
141 | ||
142 | auto& bucket = buckets_[bucket_idx]; | |
143 | return Erase(&bucket, t, ret); | |
144 | } | |
145 | ||
146 | // Fetch the mutex associated with a key | |
147 | // This call is used to hold the lock for a given data for extended period of | |
148 | // time. | |
149 | port::RWMutex* GetMutex(const T& t) { | |
150 | const uint64_t h = Hash()(t); | |
151 | const uint32_t bucket_idx = h % nbuckets_; | |
152 | const uint32_t lock_idx = bucket_idx % nlocks_; | |
153 | ||
154 | return &locks_[lock_idx]; | |
155 | } | |
156 | ||
157 | void Clear(void (*fn)(T)) { | |
158 | for (uint32_t i = 0; i < nbuckets_; ++i) { | |
159 | const uint32_t lock_idx = i % nlocks_; | |
160 | WriteLock _(&locks_[lock_idx]); | |
161 | for (auto& t : buckets_[i].list_) { | |
162 | (*fn)(t); | |
163 | } | |
164 | buckets_[i].list_.clear(); | |
165 | } | |
166 | } | |
167 | ||
168 | protected: | |
169 | // Models bucket of keys that hash to the same bucket number | |
170 | struct Bucket { | |
171 | std::list<T> list_; | |
172 | }; | |
173 | ||
174 | // Substitute for std::find with custom comparator operator | |
175 | typename std::list<T>::iterator Find(std::list<T>* list, const T& t) { | |
176 | for (auto it = list->begin(); it != list->end(); ++it) { | |
177 | if (Equal()(*it, t)) { | |
178 | return it; | |
179 | } | |
180 | } | |
181 | return list->end(); | |
182 | } | |
183 | ||
184 | bool Insert(Bucket* bucket, const T& t) { | |
185 | // Check if the key already exists | |
186 | auto it = Find(&bucket->list_, t); | |
187 | if (it != bucket->list_.end()) { | |
188 | return false; | |
189 | } | |
190 | ||
191 | // insert to bucket | |
192 | bucket->list_.push_back(t); | |
193 | return true; | |
194 | } | |
195 | ||
196 | bool Find(Bucket* bucket, const T& t, T* ret) { | |
197 | auto it = Find(&bucket->list_, t); | |
198 | if (it != bucket->list_.end()) { | |
199 | if (ret) { | |
200 | *ret = *it; | |
201 | } | |
202 | return true; | |
203 | } | |
204 | return false; | |
205 | } | |
206 | ||
207 | bool Erase(Bucket* bucket, const T& t, T* ret) { | |
208 | auto it = Find(&bucket->list_, t); | |
209 | if (it != bucket->list_.end()) { | |
210 | if (ret) { | |
211 | *ret = *it; | |
212 | } | |
213 | ||
214 | bucket->list_.erase(it); | |
215 | return true; | |
216 | } | |
217 | return false; | |
218 | } | |
219 | ||
220 | // assert that all buckets are empty | |
221 | void AssertEmptyBuckets() { | |
222 | #ifndef NDEBUG | |
223 | for (size_t i = 0; i < nbuckets_; ++i) { | |
224 | WriteLock _(&locks_[i % nlocks_]); | |
225 | assert(buckets_[i].list_.empty()); | |
226 | } | |
227 | #endif | |
228 | } | |
229 | ||
230 | const uint32_t nbuckets_; // No. of buckets in the spine | |
231 | std::unique_ptr<Bucket[]> buckets_; // Spine of the hash buckets | |
232 | const uint32_t nlocks_; // No. of locks | |
233 | std::unique_ptr<port::RWMutex[]> locks_; // Granular locks | |
234 | }; | |
235 | ||
236 | } // namespace rocksdb | |
237 | ||
238 | #endif |