// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//

#ifndef ROCKSDB_LITE
#include "memtable/hash_linklist_rep.h"

#include <algorithm>
#include <atomic>
#include "db/memtable.h"
#include "memory/arena.h"
#include "memtable/skiplist.h"
#include "monitoring/histogram.h"
#include "port/port.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {
namespace {

typedef const char* Key;
typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList;
typedef std::atomic<void*> Pointer;

// A data structure used as the header of a link list of a hash bucket.
struct BucketHeader {
  Pointer next;
  std::atomic<uint32_t> num_entries;

  explicit BucketHeader(void* n, uint32_t count)
      : next(n), num_entries(count) {}

  bool IsSkipListBucket() {
    return next.load(std::memory_order_relaxed) == this;
  }

  uint32_t GetNumEntries() const {
    return num_entries.load(std::memory_order_relaxed);
  }

  // REQUIRES: called from single-threaded Insert()
  void IncNumEntries() {
    // Only one thread can write at a time, so no atomic increment is needed;
    // a relaxed load and store suffice.
    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
  }
};

// A data structure used as the header of a skip list of a hash bucket.
struct SkipListBucketHeader {
  BucketHeader Counting_header;
  MemtableSkipList skip_list;

  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
                                Allocator* allocator, uint32_t count)
      : Counting_header(this,  // Pointing to itself to indicate header type.
                        count),
        skip_list(cmp, allocator) {}
};

struct Node {
  // Accessors/mutators for links. Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next() {
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return next_.load(std::memory_order_acquire);
  }
  void SetNext(Node* x) {
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_.store(x, std::memory_order_release);
  }
  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next() {
    return next_.load(std::memory_order_relaxed);
  }

  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }

  // Default constructor is needed for the placement new in Allocate() below.
  Node() {}

 private:
  std::atomic<Node*> next_;

  // Prohibit copying: each node is followed by a variable-length key buffer
  // (see the trailing `key` member below), which a copy would slice off.
  Node(const Node&) = delete;
  Node& operator=(const Node&) = delete;

 public:
  char key[1];
};
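
// Note on layout: Allocate() below obtains sizeof(Node) + len bytes, so the
// one-byte `key` member is really the start of an inline, length-prefixed key
// buffer that extends past the end of the struct.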

// Memory structure of the mem table:
// It is a hash table; each bucket points to one entry, a linked list, or a
// skip list. To track the total number of records in a bucket (used to
// decide when to switch to a skip list), a header is added that records the
// number of entries in the bucket.
//
// A bucket pointer is in one of four states:
//
//   Case 1. nullptr: the bucket is empty.
//
//   Case 2. The bucket points directly to a single Node whose next pointer
//           is NULL. In all other non-empty cases the first word of the
//           pointed-to object is non-NULL, which is how readers tell a bare
//           node apart from a header.
//
//   Case 3. The bucket points to a BucketHeader that heads a sorted linked
//           list of Nodes. The header's Count records the total number of
//           entries in the bucket.
//
//   Case 4. The bucket points to a SkipListBucketHeader, whose embedded
//           BucketHeader has a next pointer that points to itself; this
//           self-reference is what distinguishes case 4 from case 3. Count
//           is still maintained, for debugging purposes, to record the total
//           number of entries in the bucket.
//
// There is no data race when changing cases because:
// (1) When changing from case 2 to case 3, we create a new bucket header,
//     put the single node there first without changing the original node,
//     and do a release store when changing the bucket pointer. A reader that
//     sees a stale value of the bucket pointer still reads a valid single
//     node, while a reader that sees the new value is correct because of the
//     release store.
// (2) When changing from case 3 to case 4, a new header is created with the
//     skip list pointing to the data before a release store changes the
//     bucket pointer. The old header and nodes are never modified, so any
//     reader that sees any of those existing pointers is guaranteed to be
//     able to iterate to the end of the linked list.
// (3) The header's next pointer in case 3 might change, but it is never
//     equal to the header itself, so no matter whether a reader sees a stale
//     or a newer value, it can correctly distinguish case 3 from case 4.
//
// The reason we use case 2 at all is to keep the format efficient when
// bucket utilization is relatively low: if case 3 were used for single-entry
// buckets, every such entry would waste an extra 12 bytes, which can
// significantly decrease memory utilization.
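//
// For illustration only (not part of the implementation), a reader can
// classify a bucket using the invariants above, mirroring the logic of
// GetSkipListBucketHeader() and GetLinkListFirstNode() further down:
//
//   Pointer* p = GetBucket(i);
//   if (p == nullptr) {
//     // Case 1: empty bucket.
//   } else if (p->load(std::memory_order_relaxed) == nullptr) {
//     // Case 2: p is actually a single Node whose next pointer is NULL.
//   } else {
//     BucketHeader* h = reinterpret_cast<BucketHeader*>(p);
//     if (h->IsSkipListBucket()) {
//       // Case 4: h is the Counting_header of a SkipListBucketHeader.
//     } else {
//       // Case 3: h heads a sorted linked list of Nodes.
//     }
//   }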
class HashLinkListRep : public MemTableRep {
 public:
  HashLinkListRep(const MemTableRep::KeyComparator& compare,
                  Allocator* allocator, const SliceTransform* transform,
                  size_t bucket_size, uint32_t threshold_use_skiplist,
                  size_t huge_page_tlb_size, Logger* logger,
                  int bucket_entries_logging_threshold,
                  bool if_log_bucket_dist_when_flash);

  KeyHandle Allocate(const size_t len, char** buf) override;

  void Insert(KeyHandle handle) override;

  bool Contains(const char* key) const override;

  size_t ApproximateMemoryUsage() override;

  void Get(const LookupKey& k, void* callback_args,
           bool (*callback_func)(void* arg, const char* entry)) override;

  ~HashLinkListRep() override;

  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;

  MemTableRep::Iterator* GetDynamicPrefixIterator(
      Arena* arena = nullptr) override;

 private:
  friend class DynamicIterator;

  size_t bucket_size_;

  // Maps slices (which are transformed user keys) to buckets of keys sharing
  // the same transform.
  Pointer* buckets_;

  const uint32_t threshold_use_skiplist_;

  // The user-supplied transform whose domain is the user keys.
  const SliceTransform* transform_;

  const MemTableRep::KeyComparator& compare_;

  Logger* logger_;
  int bucket_entries_logging_threshold_;
  bool if_log_bucket_dist_when_flash_;

  bool LinkListContains(Node* head, const Slice& key) const;

  SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
      const;

  Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;

  Slice GetPrefix(const Slice& internal_key) const {
    return transform_->Transform(ExtractUserKey(internal_key));
  }

  size_t GetHash(const Slice& slice) const {
    return GetSliceRangedNPHash(slice, bucket_size_);
  }

  Pointer* GetBucket(size_t i) const {
    return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
  }

  Pointer* GetBucket(const Slice& slice) const {
    return GetBucket(GetHash(slice));
  }

  bool Equal(const Slice& a, const Key& b) const {
    return (compare_(b, a) == 0);
  }

  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
  }

  bool KeyIsAfterNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) < 0);
  }

  bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) <= 0);
  }

  bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) <= 0);
  }

  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
  Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const;

  class FullListIterator : public MemTableRep::Iterator {
   public:
    explicit FullListIterator(MemtableSkipList* list, Allocator* allocator)
        : iter_(list), full_list_(list), allocator_(allocator) {}

    ~FullListIterator() override {}

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return iter_.Valid(); }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return iter_.key();
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      iter_.Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      assert(Valid());
      iter_.Prev();
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key, const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.Seek(encoded_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& internal_key,
                     const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.SeekForPrev(encoded_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override { iter_.SeekToFirst(); }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override { iter_.SeekToLast(); }

   private:
    MemtableSkipList::Iterator iter_;
    // To destruct with the iterator.
    std::unique_ptr<MemtableSkipList> full_list_;
    std::unique_ptr<Allocator> allocator_;
    std::string tmp_;  // For passing to EncodeKey
  };

  class LinkListIterator : public MemTableRep::Iterator {
   public:
    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
                              Node* head)
        : hash_link_list_rep_(hash_link_list_rep),
          head_(head),
          node_(nullptr) {}

    ~LinkListIterator() override {}

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return node_ != nullptr; }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return node_->key;
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      node_ = node_->Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key,
              const char* /*memtable_key*/) override {
      node_ =
          hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& /*internal_key*/,
                     const char* /*memtable_key*/) override {
      // Since we do not support Prev()
      // We simply do not support SeekForPrev
      Reset(nullptr);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

   protected:
    void Reset(Node* head) {
      head_ = head;
      node_ = nullptr;
    }

   private:
    friend class HashLinkListRep;
    const HashLinkListRep* const hash_link_list_rep_;
    Node* head_;
    Node* node_;

    virtual void SeekToHead() {
      node_ = head_;
    }
  };

  class DynamicIterator : public HashLinkListRep::LinkListIterator {
   public:
    explicit DynamicIterator(HashLinkListRep& memtable_rep)
        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
          memtable_rep_(memtable_rep) {}

    // Advance to the first entry with a key >= target
    void Seek(const Slice& k, const char* memtable_key) override {
      auto transformed = memtable_rep_.GetPrefix(k);
      auto* bucket = memtable_rep_.GetBucket(transformed);

      SkipListBucketHeader* skip_list_header =
          memtable_rep_.GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // The bucket is organized as a skip list
        if (!skip_list_iter_) {
          skip_list_iter_.reset(
              new MemtableSkipList::Iterator(&skip_list_header->skip_list));
        } else {
          skip_list_iter_->SetList(&skip_list_header->skip_list);
        }
        if (memtable_key != nullptr) {
          skip_list_iter_->Seek(memtable_key);
        } else {
          IterKey encoded_key;
          encoded_key.EncodeLengthPrefixedKey(k);
          skip_list_iter_->Seek(encoded_key.GetUserKey().data());
        }
      } else {
        // The bucket is organized as a linked list
        skip_list_iter_.reset();
        Reset(memtable_rep_.GetLinkListFirstNode(bucket));
        HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
      }
    }

    bool Valid() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->Valid();
      }
      return HashLinkListRep::LinkListIterator::Valid();
    }

    const char* key() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->key();
      }
      return HashLinkListRep::LinkListIterator::key();
    }

    void Next() override {
      if (skip_list_iter_) {
        skip_list_iter_->Next();
      } else {
        HashLinkListRep::LinkListIterator::Next();
      }
    }

   private:
    // the underlying memtable
    const HashLinkListRep& memtable_rep_;
    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
  };

  class EmptyIterator : public MemTableRep::Iterator {
    // This is used when there wasn't a bucket. It is cheaper than
    // instantiating an empty bucket over which to iterate.
   public:
    EmptyIterator() {}
    bool Valid() const override { return false; }
    const char* key() const override {
      assert(false);
      return nullptr;
    }
    void Next() override {}
    void Prev() override {}
    void Seek(const Slice& /*user_key*/,
              const char* /*memtable_key*/) override {}
    void SeekForPrev(const Slice& /*user_key*/,
                     const char* /*memtable_key*/) override {}
    void SeekToFirst() override {}
    void SeekToLast() override {}

   private:
  };
};

HashLinkListRep::HashLinkListRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, size_t bucket_size,
    uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash)
    : MemTableRep(allocator),
      bucket_size_(bucket_size),
      // Threshold to use skip list doesn't make sense if less than 3, so we
      // force it to be minimum of 3 to simplify implementation.
      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
      transform_(transform),
      compare_(compare),
      logger_(logger),
      bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
      if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {
  char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size,
                                          huge_page_tlb_size, logger);

  buckets_ = new (mem) Pointer[bucket_size];

  for (size_t i = 0; i < bucket_size_; ++i) {
    buckets_[i].store(nullptr, std::memory_order_relaxed);
  }
}

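// The destructor is intentionally empty: buckets_, the bucket headers and the
// nodes all live in memory obtained from allocator_, which is reclaimed
// wholesale when the memtable is torn down, so there is nothing to free here.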
HashLinkListRep::~HashLinkListRep() {
}

KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
  char* mem = allocator_->AllocateAligned(sizeof(Node) + len);
  Node* x = new (mem) Node();
  *buf = x->key;
  return static_cast<void*>(x);
}

SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
    Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return nullptr;
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (header->IsSkipListBucket()) {
    assert(header->GetNumEntries() > threshold_use_skiplist_);
    auto* skip_list_bucket_header =
        reinterpret_cast<SkipListBucketHeader*>(header);
    assert(skip_list_bucket_header->Counting_header.next.load(
               std::memory_order_relaxed) == header);
    return skip_list_bucket_header;
  }
  assert(header->GetNumEntries() <= threshold_use_skiplist_);
  return nullptr;
}

Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return reinterpret_cast<Node*>(first_next_pointer);
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (!header->IsSkipListBucket()) {
    assert(header->GetNumEntries() <= threshold_use_skiplist_);
    return reinterpret_cast<Node*>(
        header->next.load(std::memory_order_acquire));
  }
  assert(header->GetNumEntries() > threshold_use_skiplist_);
  return nullptr;
}

void HashLinkListRep::Insert(KeyHandle handle) {
  Node* x = static_cast<Node*>(handle);
  assert(!Contains(x->key));
  Slice internal_key = GetLengthPrefixedSlice(x->key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));

  if (first_next_pointer == nullptr) {
    // Case 1. empty bucket
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(nullptr);
    bucket.store(x, std::memory_order_release);
    return;
  }

  BucketHeader* header = nullptr;
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Case 2. only one entry in the bucket
    // Need to convert it to a counting bucket, i.e. case 3.
    Node* first = reinterpret_cast<Node*>(first_next_pointer);
    // Need to add a bucket header.
    // We have to convert the bucket to one with a header before inserting
    // the new node. Otherwise, we would have to change the next pointer of
    // first, and a reader that sees the non-NULL next pointer would wrongly
    // treat that node as a bucket header.
    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
    header = new (mem) BucketHeader(first, 1);
    bucket.store(header, std::memory_order_release);
  } else {
    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
    if (header->IsSkipListBucket()) {
      // Case 4. Bucket is already a skip list
      assert(header->GetNumEntries() > threshold_use_skiplist_);
      auto* skip_list_bucket_header =
          reinterpret_cast<SkipListBucketHeader*>(header);
      // Only one thread can execute Insert() at a time, so no atomic
      // increment is needed.
      skip_list_bucket_header->Counting_header.IncNumEntries();
      skip_list_bucket_header->skip_list.Insert(x->key);
      return;
    }
  }

  if (bucket_entries_logging_threshold_ > 0 &&
      header->GetNumEntries() ==
          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
    Info(logger_,
         "HashLinkedList bucket %" ROCKSDB_PRIszt
         " has more than %d "
         "entries. Key to insert: %s",
         GetHash(transformed), header->GetNumEntries(),
         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
  }

  if (header->GetNumEntries() == threshold_use_skiplist_) {
    // Case 3. The number of entries reaches the threshold, so the bucket
    // needs to be converted to a skip list.
    LinkListIterator bucket_iter(
        this, reinterpret_cast<Node*>(
                  first_next_pointer->load(std::memory_order_relaxed)));
    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
    SkipListBucketHeader* new_skip_list_header = new (mem)
        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
    auto& skip_list = new_skip_list_header->skip_list;

    // Add all current entries to the skip list
    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
      skip_list.Insert(bucket_iter.key());
    }

    // insert the new entry
    skip_list.Insert(x->key);
    // Set the bucket
    bucket.store(new_skip_list_header, std::memory_order_release);
  } else {
    // Case 5. Need to insert to the sorted linked list without changing the
    // header.
    Node* first =
        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
    assert(first != nullptr);
    // Advance the counter unless the bucket is being converted to a skip
    // list. The stored count must never exceed threshold_use_skiplist_, or
    // readers could cast the header to the wrong format.
    header->IncNumEntries();

    Node* cur = first;
    Node* prev = nullptr;
    while (true) {
      if (cur == nullptr) {
        break;
      }
      Node* next = cur->Next();
      // Make sure the list is sorted.
      // If cur is the first node or next is nullptr, this is trivially
      // satisfied.
      assert((cur == first) || (next == nullptr) ||
             KeyIsAfterNode(next->key, cur));
      if (KeyIsAfterNode(internal_key, cur)) {
        // Keep searching in this list
        prev = cur;
        cur = next;
      } else {
        break;
      }
    }

    // Our data structure does not allow duplicate insertion
    assert(cur == nullptr || !Equal(x->key, cur->key));

    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(cur);

    if (prev) {
      prev->SetNext(x);
    } else {
      header->next.store(static_cast<void*>(x), std::memory_order_release);
    }
  }
}

bool HashLinkListRep::Contains(const char* key) const {
  Slice internal_key = GetLengthPrefixedSlice(key);

  auto transformed = GetPrefix(internal_key);
  auto bucket = GetBucket(transformed);
  if (bucket == nullptr) {
    return false;
  }

  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    return skip_list_header->skip_list.Contains(key);
  } else {
    return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
  }
}

size_t HashLinkListRep::ApproximateMemoryUsage() {
  // Memory is always allocated from the allocator.
  return 0;
}

void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
                          bool (*callback_func)(void* arg,
                                                const char* entry)) {
  auto transformed = transform_->Transform(k.user_key());
  auto bucket = GetBucket(transformed);

  auto* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    // Is a skip list
    MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
    for (iter.Seek(k.memtable_key().data());
         iter.Valid() && callback_func(callback_args, iter.key());
         iter.Next()) {
    }
  } else {
    auto* link_list_head = GetLinkListFirstNode(bucket);
    if (link_list_head != nullptr) {
      LinkListIterator iter(this, link_list_head);
      for (iter.Seek(k.internal_key(), nullptr);
           iter.Valid() && callback_func(callback_args, iter.key());
           iter.Next()) {
      }
    }
  }
}

MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
  // allocate a new arena of similar size to the one currently in use
  Arena* new_arena = new Arena(allocator_->BlockSize());
  auto list = new MemtableSkipList(compare_, new_arena);
  HistogramImpl keys_per_bucket_hist;

  for (size_t i = 0; i < bucket_size_; ++i) {
    int count = 0;
    auto* bucket = GetBucket(i);
    if (bucket != nullptr) {
      auto* skip_list_header = GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // Is a skip list
        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
          list->Insert(itr.key());
          count++;
        }
      } else {
        auto* link_list_head = GetLinkListFirstNode(bucket);
        if (link_list_head != nullptr) {
          LinkListIterator itr(this, link_list_head);
          for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
            list->Insert(itr.key());
            count++;
          }
        }
      }
    }
    if (if_log_bucket_dist_when_flash_) {
      keys_per_bucket_hist.Add(count);
    }
  }
  if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) {
    Info(logger_, "hashLinkedList Entry distribution among buckets: %s",
         keys_per_bucket_hist.ToString().c_str());
  }

  if (alloc_arena == nullptr) {
    return new FullListIterator(list, new_arena);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
    return new (mem) FullListIterator(list, new_arena);
  }
}

MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
    Arena* alloc_arena) {
  if (alloc_arena == nullptr) {
    return new DynamicIterator(*this);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
    return new (mem) DynamicIterator(*this);
  }
}

bool HashLinkListRep::LinkListContains(Node* head,
                                       const Slice& user_key) const {
  Node* x = FindGreaterOrEqualInBucket(head, user_key);
  return (x != nullptr && Equal(user_key, x->key));
}

Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
                                                  const Slice& key) const {
  Node* x = head;
  while (true) {
    if (x == nullptr) {
      return x;
    }
    Node* next = x->Next();
    // Make sure the list is sorted.
    // If x points to head or next is nullptr, this is trivially satisfied.
    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, x)) {
      // Keep searching in this list
      x = next;
    } else {
      break;
    }
  }
  return x;
}

}  // anon namespace

MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, Logger* logger) {
  return new HashLinkListRep(compare, allocator, transform, bucket_count_,
                             threshold_use_skiplist_, huge_page_tlb_size_,
                             logger, bucket_entries_logging_threshold_,
                             if_log_bucket_dist_when_flash_);
}

MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count, size_t huge_page_tlb_size,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
    uint32_t threshold_use_skiplist) {
  return new HashLinkListRepFactory(
      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
}
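
// Illustrative usage (a sketch, not part of this translation unit): the
// factory is normally installed on Options together with a prefix extractor,
// since this rep hashes transformed user keys into buckets. The option names
// below are the public RocksDB ones; the prefix length and bucket count are
// arbitrary example values.
//
//   Options options;
//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));
//   options.memtable_factory.reset(
//       NewHashLinkListRepFactory(50000 /* bucket_count */));
//   // Hash-based memtables do not support concurrent memtable writes.
//   options.allow_concurrent_memtable_write = false;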

}  // namespace ROCKSDB_NAMESPACE
#endif  // ROCKSDB_LITE