]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | ||
7 | #ifndef ROCKSDB_LITE | |
8 | #include "memtable/hash_linklist_rep.h" | |
9 | ||
10 | #include <algorithm> | |
11 | #include <atomic> | |
12 | #include "db/memtable.h" | |
f67539c2 | 13 | #include "memory/arena.h" |
7c673cae FG |
14 | #include "memtable/skiplist.h" |
15 | #include "monitoring/histogram.h" | |
16 | #include "port/port.h" | |
17 | #include "rocksdb/memtablerep.h" | |
18 | #include "rocksdb/slice.h" | |
19 | #include "rocksdb/slice_transform.h" | |
494da23a | 20 | #include "util/hash.h" |
7c673cae | 21 | |
f67539c2 | 22 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
23 | namespace { |
24 | ||
25 | typedef const char* Key; | |
26 | typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList; | |
27 | typedef std::atomic<void*> Pointer; | |
28 | ||
// A data structure used as the header of a link list of a hash bucket.
struct BucketHeader {
  // Points at the first Node of the bucket's linked list, or at this header
  // itself when the bucket has been converted to a skip list (Case 4).
  std::atomic<void*> next;
  // Number of entries currently stored in this bucket.
  std::atomic<uint32_t> num_entries;

  explicit BucketHeader(void* n, uint32_t count)
      : next(n), num_entries(count) {}

  // Returns true iff this bucket has been converted to a skip list, which is
  // signalled by `next` pointing back at the header itself.
  // NOTE: marked const — it only performs a relaxed read, matching
  // GetNumEntries() below.
  bool IsSkipListBucket() const {
    return next.load(std::memory_order_relaxed) == this;
  }

  uint32_t GetNumEntries() const {
    return num_entries.load(std::memory_order_relaxed);
  }

  // REQUIRES: called from single-threaded Insert()
  void IncNumEntries() {
    // Only one thread can do write at one time. No need to do atomic
    // incremental. Update it with relaxed load and store.
    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
  }
};
52 | ||
53 | // A data structure used as the header of a skip list of a hash bucket. | |
54 | struct SkipListBucketHeader { | |
55 | BucketHeader Counting_header; | |
56 | MemtableSkipList skip_list; | |
57 | ||
58 | explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp, | |
11fdf7f2 | 59 | Allocator* allocator, uint32_t count) |
7c673cae FG |
60 | : Counting_header(this, // Pointing to itself to indicate header type. |
61 | count), | |
62 | skip_list(cmp, allocator) {} | |
63 | }; | |
64 | ||
struct Node {
  // Accessors/mutators for links.  Wrapped in methods so we can add
  // the appropriate barriers as necessary.
  Node* Next() {
    // An acquire load guarantees the returned node is observed fully
    // initialized.
    return next_.load(std::memory_order_acquire);
  }
  void SetNext(Node* x) {
    // A release store guarantees anybody reading through this pointer
    // observes a fully initialized node.
    next_.store(x, std::memory_order_release);
  }
  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next() { return next_.load(std::memory_order_relaxed); }

  void NoBarrier_SetNext(Node* x) {
    next_.store(x, std::memory_order_relaxed);
  }

  // Needed for placement new below which is fine
  Node() {}

 private:
  std::atomic<Node*> next_;

  // Prohibit copying due to the below
  Node(const Node&) = delete;
  Node& operator=(const Node&) = delete;

 public:
  // Flexible-array-style key storage; the node is allocated with extra
  // trailing bytes and the caller writes the length-prefixed key here.
  char key[1];
};
98 | ||
99 | // Memory structure of the mem table: | |
100 | // It is a hash table, each bucket points to one entry, a linked list or a | |
101 | // skip list. In order to track total number of records in a bucket to determine | |
102 | // whether should switch to skip list, a header is added just to indicate | |
103 | // number of entries in the bucket. | |
104 | // | |
105 | // | |
106 | // +-----> NULL Case 1. Empty bucket | |
107 | // | | |
108 | // | | |
109 | // | +---> +-------+ | |
110 | // | | | Next +--> NULL | |
111 | // | | +-------+ | |
112 | // +-----+ | | | | Case 2. One Entry in bucket. | |
113 | // | +-+ | | Data | next pointer points to | |
114 | // +-----+ | | | NULL. All other cases | |
115 | // | | | | | next pointer is not NULL. | |
116 | // +-----+ | +-------+ | |
117 | // | +---+ | |
118 | // +-----+ +-> +-------+ +> +-------+ +-> +-------+ | |
119 | // | | | | Next +--+ | Next +--+ | Next +-->NULL | |
120 | // +-----+ | +-------+ +-------+ +-------+ | |
121 | // | +-----+ | Count | | | | | | |
122 | // +-----+ +-------+ | Data | | Data | | |
123 | // | | | | | | | |
124 | // +-----+ Case 3. | | | | | |
125 | // | | A header +-------+ +-------+ | |
126 | // +-----+ points to | |
127 | // | | a linked list. Count indicates total number | |
128 | // +-----+ of rows in this bucket. | |
129 | // | | | |
130 | // +-----+ +-> +-------+ <--+ | |
131 | // | | | | Next +----+ | |
132 | // +-----+ | +-------+ Case 4. A header points to a skip | |
133 | // | +----+ | Count | list and next pointer points to | |
134 | // +-----+ +-------+ itself, to distinguish case 3 or 4. | |
135 | // | | | | Count still is kept to indicates total | |
136 | // +-----+ | Skip +--> of entries in the bucket for debugging | |
137 | // | | | List | Data purpose. | |
138 | // | | | +--> | |
139 | // +-----+ | | | |
140 | // | | +-------+ | |
141 | // +-----+ | |
142 | // | |
143 | // We don't have data race when changing cases because: | |
144 | // (1) When changing from case 2->3, we create a new bucket header, put the | |
145 | // single node there first without changing the original node, and do a | |
146 | // release store when changing the bucket pointer. In that case, a reader | |
147 | // who sees a stale value of the bucket pointer will read this node, while | |
148 | // a reader sees the correct value because of the release store. | |
// (2) When changing case 3->4, a new header is created with a skip list that
//     points to the data, before doing a release store to change the bucket
//     pointer.
151 | // The old header and nodes are never changed, so any reader sees any | |
152 | // of those existing pointers will guarantee to be able to iterate to the | |
153 | // end of the linked list. | |
154 | // (3) Header's next pointer in case 3 might change, but they are never equal | |
155 | // to itself, so no matter a reader sees any stale or newer value, it will | |
156 | // be able to correctly distinguish case 3 and 4. | |
157 | // | |
158 | // The reason that we use case 2 is we want to make the format to be efficient | |
159 | // when the utilization of buckets is relatively low. If we use case 3 for | |
160 | // single entry bucket, we will need to waste 12 bytes for every entry, | |
// which can be a significant decrease in memory utilization.
162 | class HashLinkListRep : public MemTableRep { | |
163 | public: | |
164 | HashLinkListRep(const MemTableRep::KeyComparator& compare, | |
11fdf7f2 | 165 | Allocator* allocator, const SliceTransform* transform, |
7c673cae FG |
166 | size_t bucket_size, uint32_t threshold_use_skiplist, |
167 | size_t huge_page_tlb_size, Logger* logger, | |
168 | int bucket_entries_logging_threshold, | |
169 | bool if_log_bucket_dist_when_flash); | |
170 | ||
494da23a | 171 | KeyHandle Allocate(const size_t len, char** buf) override; |
7c673cae | 172 | |
494da23a | 173 | void Insert(KeyHandle handle) override; |
7c673cae | 174 | |
494da23a | 175 | bool Contains(const char* key) const override; |
7c673cae | 176 | |
494da23a | 177 | size_t ApproximateMemoryUsage() override; |
7c673cae | 178 | |
494da23a TL |
179 | void Get(const LookupKey& k, void* callback_args, |
180 | bool (*callback_func)(void* arg, const char* entry)) override; | |
7c673cae | 181 | |
494da23a | 182 | ~HashLinkListRep() override; |
7c673cae | 183 | |
494da23a | 184 | MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; |
7c673cae | 185 | |
494da23a TL |
186 | MemTableRep::Iterator* GetDynamicPrefixIterator( |
187 | Arena* arena = nullptr) override; | |
7c673cae FG |
188 | |
189 | private: | |
190 | friend class DynamicIterator; | |
191 | ||
192 | size_t bucket_size_; | |
193 | ||
194 | // Maps slices (which are transformed user keys) to buckets of keys sharing | |
195 | // the same transform. | |
196 | Pointer* buckets_; | |
197 | ||
198 | const uint32_t threshold_use_skiplist_; | |
199 | ||
200 | // The user-supplied transform whose domain is the user keys. | |
201 | const SliceTransform* transform_; | |
202 | ||
203 | const MemTableRep::KeyComparator& compare_; | |
204 | ||
205 | Logger* logger_; | |
206 | int bucket_entries_logging_threshold_; | |
207 | bool if_log_bucket_dist_when_flash_; | |
208 | ||
209 | bool LinkListContains(Node* head, const Slice& key) const; | |
210 | ||
211 | SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer) | |
212 | const; | |
213 | ||
214 | Node* GetLinkListFirstNode(Pointer* first_next_pointer) const; | |
215 | ||
216 | Slice GetPrefix(const Slice& internal_key) const { | |
217 | return transform_->Transform(ExtractUserKey(internal_key)); | |
218 | } | |
219 | ||
220 | size_t GetHash(const Slice& slice) const { | |
20effc67 | 221 | return GetSliceRangedNPHash(slice, bucket_size_); |
7c673cae FG |
222 | } |
223 | ||
224 | Pointer* GetBucket(size_t i) const { | |
225 | return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire)); | |
226 | } | |
227 | ||
228 | Pointer* GetBucket(const Slice& slice) const { | |
229 | return GetBucket(GetHash(slice)); | |
230 | } | |
231 | ||
232 | bool Equal(const Slice& a, const Key& b) const { | |
233 | return (compare_(b, a) == 0); | |
234 | } | |
235 | ||
236 | bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } | |
237 | ||
238 | bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { | |
239 | // nullptr n is considered infinite | |
240 | return (n != nullptr) && (compare_(n->key, internal_key) < 0); | |
241 | } | |
242 | ||
243 | bool KeyIsAfterNode(const Key& key, const Node* n) const { | |
244 | // nullptr n is considered infinite | |
245 | return (n != nullptr) && (compare_(n->key, key) < 0); | |
246 | } | |
247 | ||
248 | bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const { | |
249 | // nullptr n is considered infinite | |
250 | return (n != nullptr) && (compare_(n->key, internal_key) <= 0); | |
251 | } | |
252 | ||
253 | bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const { | |
254 | // nullptr n is considered infinite | |
255 | return (n != nullptr) && (compare_(n->key, key) <= 0); | |
256 | } | |
257 | ||
258 | Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; | |
259 | Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const; | |
260 | ||
261 | class FullListIterator : public MemTableRep::Iterator { | |
262 | public: | |
263 | explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) | |
264 | : iter_(list), full_list_(list), allocator_(allocator) {} | |
265 | ||
494da23a | 266 | ~FullListIterator() override {} |
7c673cae FG |
267 | |
268 | // Returns true iff the iterator is positioned at a valid node. | |
494da23a | 269 | bool Valid() const override { return iter_.Valid(); } |
7c673cae FG |
270 | |
271 | // Returns the key at the current position. | |
272 | // REQUIRES: Valid() | |
494da23a | 273 | const char* key() const override { |
7c673cae FG |
274 | assert(Valid()); |
275 | return iter_.key(); | |
276 | } | |
277 | ||
278 | // Advances to the next position. | |
279 | // REQUIRES: Valid() | |
494da23a | 280 | void Next() override { |
7c673cae FG |
281 | assert(Valid()); |
282 | iter_.Next(); | |
283 | } | |
284 | ||
285 | // Advances to the previous position. | |
286 | // REQUIRES: Valid() | |
494da23a | 287 | void Prev() override { |
7c673cae FG |
288 | assert(Valid()); |
289 | iter_.Prev(); | |
290 | } | |
291 | ||
292 | // Advance to the first entry with a key >= target | |
494da23a | 293 | void Seek(const Slice& internal_key, const char* memtable_key) override { |
7c673cae FG |
294 | const char* encoded_key = |
295 | (memtable_key != nullptr) ? | |
296 | memtable_key : EncodeKey(&tmp_, internal_key); | |
297 | iter_.Seek(encoded_key); | |
298 | } | |
299 | ||
300 | // Retreat to the last entry with a key <= target | |
494da23a TL |
301 | void SeekForPrev(const Slice& internal_key, |
302 | const char* memtable_key) override { | |
7c673cae FG |
303 | const char* encoded_key = (memtable_key != nullptr) |
304 | ? memtable_key | |
305 | : EncodeKey(&tmp_, internal_key); | |
306 | iter_.SeekForPrev(encoded_key); | |
307 | } | |
308 | ||
309 | // Position at the first entry in collection. | |
310 | // Final state of iterator is Valid() iff collection is not empty. | |
494da23a | 311 | void SeekToFirst() override { iter_.SeekToFirst(); } |
7c673cae FG |
312 | |
313 | // Position at the last entry in collection. | |
314 | // Final state of iterator is Valid() iff collection is not empty. | |
494da23a TL |
315 | void SeekToLast() override { iter_.SeekToLast(); } |
316 | ||
7c673cae FG |
317 | private: |
318 | MemtableSkipList::Iterator iter_; | |
319 | // To destruct with the iterator. | |
320 | std::unique_ptr<MemtableSkipList> full_list_; | |
321 | std::unique_ptr<Allocator> allocator_; | |
322 | std::string tmp_; // For passing to EncodeKey | |
323 | }; | |
324 | ||
325 | class LinkListIterator : public MemTableRep::Iterator { | |
326 | public: | |
327 | explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep, | |
328 | Node* head) | |
329 | : hash_link_list_rep_(hash_link_list_rep), | |
330 | head_(head), | |
331 | node_(nullptr) {} | |
332 | ||
494da23a | 333 | ~LinkListIterator() override {} |
7c673cae FG |
334 | |
335 | // Returns true iff the iterator is positioned at a valid node. | |
494da23a | 336 | bool Valid() const override { return node_ != nullptr; } |
7c673cae FG |
337 | |
338 | // Returns the key at the current position. | |
339 | // REQUIRES: Valid() | |
494da23a | 340 | const char* key() const override { |
7c673cae FG |
341 | assert(Valid()); |
342 | return node_->key; | |
343 | } | |
344 | ||
345 | // Advances to the next position. | |
346 | // REQUIRES: Valid() | |
494da23a | 347 | void Next() override { |
7c673cae FG |
348 | assert(Valid()); |
349 | node_ = node_->Next(); | |
350 | } | |
351 | ||
352 | // Advances to the previous position. | |
353 | // REQUIRES: Valid() | |
494da23a | 354 | void Prev() override { |
7c673cae FG |
355 | // Prefix iterator does not support total order. |
356 | // We simply set the iterator to invalid state | |
357 | Reset(nullptr); | |
358 | } | |
359 | ||
360 | // Advance to the first entry with a key >= target | |
494da23a TL |
361 | void Seek(const Slice& internal_key, |
362 | const char* /*memtable_key*/) override { | |
7c673cae FG |
363 | node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, |
364 | internal_key); | |
365 | } | |
366 | ||
367 | // Retreat to the last entry with a key <= target | |
494da23a TL |
368 | void SeekForPrev(const Slice& /*internal_key*/, |
369 | const char* /*memtable_key*/) override { | |
7c673cae FG |
370 | // Since we do not support Prev() |
371 | // We simply do not support SeekForPrev | |
372 | Reset(nullptr); | |
373 | } | |
374 | ||
375 | // Position at the first entry in collection. | |
376 | // Final state of iterator is Valid() iff collection is not empty. | |
494da23a | 377 | void SeekToFirst() override { |
7c673cae FG |
378 | // Prefix iterator does not support total order. |
379 | // We simply set the iterator to invalid state | |
380 | Reset(nullptr); | |
381 | } | |
382 | ||
383 | // Position at the last entry in collection. | |
384 | // Final state of iterator is Valid() iff collection is not empty. | |
494da23a | 385 | void SeekToLast() override { |
7c673cae FG |
386 | // Prefix iterator does not support total order. |
387 | // We simply set the iterator to invalid state | |
388 | Reset(nullptr); | |
389 | } | |
390 | ||
391 | protected: | |
392 | void Reset(Node* head) { | |
393 | head_ = head; | |
394 | node_ = nullptr; | |
395 | } | |
396 | private: | |
397 | friend class HashLinkListRep; | |
398 | const HashLinkListRep* const hash_link_list_rep_; | |
399 | Node* head_; | |
400 | Node* node_; | |
401 | ||
402 | virtual void SeekToHead() { | |
403 | node_ = head_; | |
404 | } | |
405 | }; | |
406 | ||
407 | class DynamicIterator : public HashLinkListRep::LinkListIterator { | |
408 | public: | |
409 | explicit DynamicIterator(HashLinkListRep& memtable_rep) | |
410 | : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr), | |
411 | memtable_rep_(memtable_rep) {} | |
412 | ||
413 | // Advance to the first entry with a key >= target | |
494da23a | 414 | void Seek(const Slice& k, const char* memtable_key) override { |
7c673cae FG |
415 | auto transformed = memtable_rep_.GetPrefix(k); |
416 | auto* bucket = memtable_rep_.GetBucket(transformed); | |
417 | ||
418 | SkipListBucketHeader* skip_list_header = | |
419 | memtable_rep_.GetSkipListBucketHeader(bucket); | |
420 | if (skip_list_header != nullptr) { | |
421 | // The bucket is organized as a skip list | |
422 | if (!skip_list_iter_) { | |
423 | skip_list_iter_.reset( | |
424 | new MemtableSkipList::Iterator(&skip_list_header->skip_list)); | |
425 | } else { | |
426 | skip_list_iter_->SetList(&skip_list_header->skip_list); | |
427 | } | |
428 | if (memtable_key != nullptr) { | |
429 | skip_list_iter_->Seek(memtable_key); | |
430 | } else { | |
431 | IterKey encoded_key; | |
432 | encoded_key.EncodeLengthPrefixedKey(k); | |
433 | skip_list_iter_->Seek(encoded_key.GetUserKey().data()); | |
434 | } | |
435 | } else { | |
436 | // The bucket is organized as a linked list | |
437 | skip_list_iter_.reset(); | |
438 | Reset(memtable_rep_.GetLinkListFirstNode(bucket)); | |
439 | HashLinkListRep::LinkListIterator::Seek(k, memtable_key); | |
440 | } | |
441 | } | |
442 | ||
494da23a | 443 | bool Valid() const override { |
7c673cae FG |
444 | if (skip_list_iter_) { |
445 | return skip_list_iter_->Valid(); | |
446 | } | |
447 | return HashLinkListRep::LinkListIterator::Valid(); | |
448 | } | |
449 | ||
494da23a | 450 | const char* key() const override { |
7c673cae FG |
451 | if (skip_list_iter_) { |
452 | return skip_list_iter_->key(); | |
453 | } | |
454 | return HashLinkListRep::LinkListIterator::key(); | |
455 | } | |
456 | ||
494da23a | 457 | void Next() override { |
7c673cae FG |
458 | if (skip_list_iter_) { |
459 | skip_list_iter_->Next(); | |
460 | } else { | |
461 | HashLinkListRep::LinkListIterator::Next(); | |
462 | } | |
463 | } | |
464 | ||
465 | private: | |
466 | // the underlying memtable | |
467 | const HashLinkListRep& memtable_rep_; | |
468 | std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_; | |
469 | }; | |
470 | ||
471 | class EmptyIterator : public MemTableRep::Iterator { | |
472 | // This is used when there wasn't a bucket. It is cheaper than | |
473 | // instantiating an empty bucket over which to iterate. | |
474 | public: | |
475 | EmptyIterator() { } | |
494da23a TL |
476 | bool Valid() const override { return false; } |
477 | const char* key() const override { | |
7c673cae FG |
478 | assert(false); |
479 | return nullptr; | |
480 | } | |
494da23a TL |
481 | void Next() override {} |
482 | void Prev() override {} | |
483 | void Seek(const Slice& /*user_key*/, | |
484 | const char* /*memtable_key*/) override {} | |
485 | void SeekForPrev(const Slice& /*user_key*/, | |
486 | const char* /*memtable_key*/) override {} | |
487 | void SeekToFirst() override {} | |
488 | void SeekToLast() override {} | |
7c673cae FG |
489 | |
490 | private: | |
491 | }; | |
492 | }; | |
493 | ||
11fdf7f2 TL |
494 | HashLinkListRep::HashLinkListRep( |
495 | const MemTableRep::KeyComparator& compare, Allocator* allocator, | |
496 | const SliceTransform* transform, size_t bucket_size, | |
497 | uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger, | |
498 | int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash) | |
7c673cae FG |
499 | : MemTableRep(allocator), |
500 | bucket_size_(bucket_size), | |
501 | // Threshold to use skip list doesn't make sense if less than 3, so we | |
502 | // force it to be minimum of 3 to simplify implementation. | |
503 | threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)), | |
504 | transform_(transform), | |
505 | compare_(compare), | |
506 | logger_(logger), | |
507 | bucket_entries_logging_threshold_(bucket_entries_logging_threshold), | |
508 | if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { | |
509 | char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size, | |
510 | huge_page_tlb_size, logger); | |
511 | ||
512 | buckets_ = new (mem) Pointer[bucket_size]; | |
513 | ||
514 | for (size_t i = 0; i < bucket_size_; ++i) { | |
515 | buckets_[i].store(nullptr, std::memory_order_relaxed); | |
516 | } | |
517 | } | |
518 | ||
519 | HashLinkListRep::~HashLinkListRep() { | |
520 | } | |
521 | ||
522 | KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { | |
523 | char* mem = allocator_->AllocateAligned(sizeof(Node) + len); | |
524 | Node* x = new (mem) Node(); | |
525 | *buf = x->key; | |
526 | return static_cast<void*>(x); | |
527 | } | |
528 | ||
529 | SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( | |
530 | Pointer* first_next_pointer) const { | |
531 | if (first_next_pointer == nullptr) { | |
532 | return nullptr; | |
533 | } | |
534 | if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { | |
535 | // Single entry bucket | |
536 | return nullptr; | |
537 | } | |
538 | // Counting header | |
539 | BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); | |
540 | if (header->IsSkipListBucket()) { | |
541 | assert(header->GetNumEntries() > threshold_use_skiplist_); | |
542 | auto* skip_list_bucket_header = | |
543 | reinterpret_cast<SkipListBucketHeader*>(header); | |
544 | assert(skip_list_bucket_header->Counting_header.next.load( | |
545 | std::memory_order_relaxed) == header); | |
546 | return skip_list_bucket_header; | |
547 | } | |
548 | assert(header->GetNumEntries() <= threshold_use_skiplist_); | |
549 | return nullptr; | |
550 | } | |
551 | ||
552 | Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { | |
553 | if (first_next_pointer == nullptr) { | |
554 | return nullptr; | |
555 | } | |
556 | if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { | |
557 | // Single entry bucket | |
558 | return reinterpret_cast<Node*>(first_next_pointer); | |
559 | } | |
560 | // Counting header | |
561 | BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); | |
562 | if (!header->IsSkipListBucket()) { | |
563 | assert(header->GetNumEntries() <= threshold_use_skiplist_); | |
564 | return reinterpret_cast<Node*>( | |
565 | header->next.load(std::memory_order_acquire)); | |
566 | } | |
567 | assert(header->GetNumEntries() > threshold_use_skiplist_); | |
568 | return nullptr; | |
569 | } | |
570 | ||
// Inserts the node previously handed out by Allocate() (the caller has
// already written its length-prefixed key) into the appropriate hash bucket,
// converting the bucket's representation (empty -> single node -> counted
// linked list -> skip list) as it grows.
// REQUIRES: external synchronization — at most one Insert() at a time;
// concurrent readers are tolerated via the release stores below.
void HashLinkListRep::Insert(KeyHandle handle) {
  Node* x = static_cast<Node*>(handle);
  assert(!Contains(x->key));
  Slice internal_key = GetLengthPrefixedSlice(x->key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
  // Relaxed load is fine: the single writer wrote this pointer itself.
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));

  if (first_next_pointer == nullptr) {
    // Case 1. empty bucket
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(nullptr);
    // Release store publishes the fully initialized node to readers.
    bucket.store(x, std::memory_order_release);
    return;
  }

  BucketHeader* header = nullptr;
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Case 2. only one entry in the bucket
    // Need to convert to a Counting bucket and turn to case 4.
    Node* first = reinterpret_cast<Node*>(first_next_pointer);
    // Need to add a bucket header.
    // We have to first convert it to a bucket with header before inserting
    // the new node. Otherwise, we might need to change next pointer of first.
    // In that case, a reader might sees the next pointer is NULL and wrongly
    // think the node is a bucket header.
    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
    header = new (mem) BucketHeader(first, 1);
    bucket.store(header, std::memory_order_release);
  } else {
    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
    if (header->IsSkipListBucket()) {
      // Case 4. Bucket is already a skip list
      assert(header->GetNumEntries() > threshold_use_skiplist_);
      auto* skip_list_bucket_header =
          reinterpret_cast<SkipListBucketHeader*>(header);
      // Only one thread can execute Insert() at one time. No need to do atomic
      // incremental.
      skip_list_bucket_header->Counting_header.IncNumEntries();
      skip_list_bucket_header->skip_list.Insert(x->key);
      return;
    }
  }

  // Optional diagnostics: warn once when a bucket crosses the configured
  // logging threshold (exact equality so it fires only once per bucket).
  if (bucket_entries_logging_threshold_ > 0 &&
      header->GetNumEntries() ==
          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
    Info(logger_, "HashLinkedList bucket %" ROCKSDB_PRIszt
                  " has more than %d "
                  "entries. Key to insert: %s",
         GetHash(transformed), header->GetNumEntries(),
         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
  }

  if (header->GetNumEntries() == threshold_use_skiplist_) {
    // Case 3. number of entries reaches the threshold so need to convert to
    // skip list.
    LinkListIterator bucket_iter(
        this, reinterpret_cast<Node*>(
                  first_next_pointer->load(std::memory_order_relaxed)));
    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
    SkipListBucketHeader* new_skip_list_header = new (mem)
        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
    auto& skip_list = new_skip_list_header->skip_list;

    // Add all current entries to the skip list
    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
      skip_list.Insert(bucket_iter.key());
    }

    // insert the new entry
    skip_list.Insert(x->key);
    // Set the bucket
    bucket.store(new_skip_list_header, std::memory_order_release);
  } else {
    // Case 5. Need to insert to the sorted linked list without changing the
    // header.
    Node* first =
        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
    assert(first != nullptr);
    // Advance counter unless the bucket needs to be advanced to skip list.
    // In that case, we need to make sure the previous count never exceeds
    // threshold_use_skiplist_ to avoid readers to cast to wrong format.
    header->IncNumEntries();

    // Walk the sorted list to find the insertion point for internal_key.
    Node* cur = first;
    Node* prev = nullptr;
    while (true) {
      if (cur == nullptr) {
        break;
      }
      Node* next = cur->Next();
      // Make sure the lists are sorted.
      // If x points to head_ or next points nullptr, it is trivially satisfied.
      assert((cur == first) || (next == nullptr) ||
             KeyIsAfterNode(next->key, cur));
      if (KeyIsAfterNode(internal_key, cur)) {
        // Keep searching in this list
        prev = cur;
        cur = next;
      } else {
        break;
      }
    }

    // Our data structure does not allow duplicate insertion
    assert(cur == nullptr || !Equal(x->key, cur->key));

    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(cur);

    // Publish the new node with a release store (either through the previous
    // node's next pointer or through the header) so readers see it fully
    // initialized.
    if (prev) {
      prev->SetNext(x);
    } else {
      header->next.store(static_cast<void*>(x), std::memory_order_release);
    }
  }
}
692 | ||
693 | bool HashLinkListRep::Contains(const char* key) const { | |
694 | Slice internal_key = GetLengthPrefixedSlice(key); | |
695 | ||
696 | auto transformed = GetPrefix(internal_key); | |
697 | auto bucket = GetBucket(transformed); | |
698 | if (bucket == nullptr) { | |
699 | return false; | |
700 | } | |
701 | ||
702 | SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket); | |
703 | if (skip_list_header != nullptr) { | |
704 | return skip_list_header->skip_list.Contains(key); | |
705 | } else { | |
706 | return LinkListContains(GetLinkListFirstNode(bucket), internal_key); | |
707 | } | |
708 | } | |
709 | ||
710 | size_t HashLinkListRep::ApproximateMemoryUsage() { | |
711 | // Memory is always allocated from the allocator. | |
712 | return 0; | |
713 | } | |
714 | ||
715 | void HashLinkListRep::Get(const LookupKey& k, void* callback_args, | |
716 | bool (*callback_func)(void* arg, const char* entry)) { | |
717 | auto transformed = transform_->Transform(k.user_key()); | |
718 | auto bucket = GetBucket(transformed); | |
719 | ||
720 | auto* skip_list_header = GetSkipListBucketHeader(bucket); | |
721 | if (skip_list_header != nullptr) { | |
722 | // Is a skip list | |
723 | MemtableSkipList::Iterator iter(&skip_list_header->skip_list); | |
724 | for (iter.Seek(k.memtable_key().data()); | |
725 | iter.Valid() && callback_func(callback_args, iter.key()); | |
726 | iter.Next()) { | |
727 | } | |
728 | } else { | |
729 | auto* link_list_head = GetLinkListFirstNode(bucket); | |
730 | if (link_list_head != nullptr) { | |
731 | LinkListIterator iter(this, link_list_head); | |
732 | for (iter.Seek(k.internal_key(), nullptr); | |
733 | iter.Valid() && callback_func(callback_args, iter.key()); | |
734 | iter.Next()) { | |
735 | } | |
736 | } | |
737 | } | |
738 | } | |
739 | ||
740 | MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { | |
741 | // allocate a new arena of similar size to the one currently in use | |
742 | Arena* new_arena = new Arena(allocator_->BlockSize()); | |
743 | auto list = new MemtableSkipList(compare_, new_arena); | |
744 | HistogramImpl keys_per_bucket_hist; | |
745 | ||
746 | for (size_t i = 0; i < bucket_size_; ++i) { | |
747 | int count = 0; | |
748 | auto* bucket = GetBucket(i); | |
749 | if (bucket != nullptr) { | |
750 | auto* skip_list_header = GetSkipListBucketHeader(bucket); | |
751 | if (skip_list_header != nullptr) { | |
752 | // Is a skip list | |
753 | MemtableSkipList::Iterator itr(&skip_list_header->skip_list); | |
754 | for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { | |
755 | list->Insert(itr.key()); | |
756 | count++; | |
757 | } | |
758 | } else { | |
759 | auto* link_list_head = GetLinkListFirstNode(bucket); | |
760 | if (link_list_head != nullptr) { | |
761 | LinkListIterator itr(this, link_list_head); | |
762 | for (itr.SeekToHead(); itr.Valid(); itr.Next()) { | |
763 | list->Insert(itr.key()); | |
764 | count++; | |
765 | } | |
766 | } | |
767 | } | |
768 | } | |
769 | if (if_log_bucket_dist_when_flash_) { | |
770 | keys_per_bucket_hist.Add(count); | |
771 | } | |
772 | } | |
773 | if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) { | |
774 | Info(logger_, "hashLinkedList Entry distribution among buckets: %s", | |
775 | keys_per_bucket_hist.ToString().c_str()); | |
776 | } | |
777 | ||
778 | if (alloc_arena == nullptr) { | |
779 | return new FullListIterator(list, new_arena); | |
780 | } else { | |
781 | auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator)); | |
782 | return new (mem) FullListIterator(list, new_arena); | |
783 | } | |
784 | } | |
785 | ||
786 | MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator( | |
787 | Arena* alloc_arena) { | |
788 | if (alloc_arena == nullptr) { | |
789 | return new DynamicIterator(*this); | |
790 | } else { | |
791 | auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator)); | |
792 | return new (mem) DynamicIterator(*this); | |
793 | } | |
794 | } | |
795 | ||
796 | bool HashLinkListRep::LinkListContains(Node* head, | |
797 | const Slice& user_key) const { | |
798 | Node* x = FindGreaterOrEqualInBucket(head, user_key); | |
799 | return (x != nullptr && Equal(user_key, x->key)); | |
800 | } | |
801 | ||
802 | Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, | |
803 | const Slice& key) const { | |
804 | Node* x = head; | |
805 | while (true) { | |
806 | if (x == nullptr) { | |
807 | return x; | |
808 | } | |
809 | Node* next = x->Next(); | |
810 | // Make sure the lists are sorted. | |
811 | // If x points to head_ or next points nullptr, it is trivially satisfied. | |
812 | assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); | |
813 | if (KeyIsAfterNode(key, x)) { | |
814 | // Keep searching in this list | |
815 | x = next; | |
816 | } else { | |
817 | break; | |
818 | } | |
819 | } | |
820 | return x; | |
821 | } | |
822 | ||
823 | } // anon namespace | |
824 | ||
825 | MemTableRep* HashLinkListRepFactory::CreateMemTableRep( | |
11fdf7f2 | 826 | const MemTableRep::KeyComparator& compare, Allocator* allocator, |
7c673cae FG |
827 | const SliceTransform* transform, Logger* logger) { |
828 | return new HashLinkListRep(compare, allocator, transform, bucket_count_, | |
829 | threshold_use_skiplist_, huge_page_tlb_size_, | |
830 | logger, bucket_entries_logging_threshold_, | |
831 | if_log_bucket_dist_when_flash_); | |
832 | } | |
833 | ||
834 | MemTableRepFactory* NewHashLinkListRepFactory( | |
835 | size_t bucket_count, size_t huge_page_tlb_size, | |
836 | int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash, | |
837 | uint32_t threshold_use_skiplist) { | |
838 | return new HashLinkListRepFactory( | |
839 | bucket_count, threshold_use_skiplist, huge_page_tlb_size, | |
840 | bucket_entries_logging_threshold, if_log_bucket_dist_when_flash); | |
841 | } | |
842 | ||
f67539c2 | 843 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 844 | #endif // ROCKSDB_LITE |