]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/memtable/skiplistrep.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / memtable / skiplistrep.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 #include <random>
7
8 #include "db/memtable.h"
9 #include "memory/arena.h"
10 #include "memtable/inlineskiplist.h"
11 #include "rocksdb/memtablerep.h"
12 #include "rocksdb/utilities/options_type.h"
13 #include "util/string_util.h"
14
15 namespace ROCKSDB_NAMESPACE {
16 namespace {
17 class SkipListRep : public MemTableRep {
18 InlineSkipList<const MemTableRep::KeyComparator&> skip_list_;
19 const MemTableRep::KeyComparator& cmp_;
20 const SliceTransform* transform_;
21 const size_t lookahead_;
22
23 friend class LookaheadIterator;
24
25 public:
26 explicit SkipListRep(const MemTableRep::KeyComparator& compare,
27 Allocator* allocator, const SliceTransform* transform,
28 const size_t lookahead)
29 : MemTableRep(allocator),
30 skip_list_(compare, allocator),
31 cmp_(compare),
32 transform_(transform),
33 lookahead_(lookahead) {}
34
35 KeyHandle Allocate(const size_t len, char** buf) override {
36 *buf = skip_list_.AllocateKey(len);
37 return static_cast<KeyHandle>(*buf);
38 }
39
40 // Insert key into the list.
41 // REQUIRES: nothing that compares equal to key is currently in the list.
42 void Insert(KeyHandle handle) override {
43 skip_list_.Insert(static_cast<char*>(handle));
44 }
45
46 bool InsertKey(KeyHandle handle) override {
47 return skip_list_.Insert(static_cast<char*>(handle));
48 }
49
50 void InsertWithHint(KeyHandle handle, void** hint) override {
51 skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
52 }
53
54 bool InsertKeyWithHint(KeyHandle handle, void** hint) override {
55 return skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
56 }
57
58 void InsertWithHintConcurrently(KeyHandle handle, void** hint) override {
59 skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle), hint);
60 }
61
62 bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override {
63 return skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle),
64 hint);
65 }
66
67 void InsertConcurrently(KeyHandle handle) override {
68 skip_list_.InsertConcurrently(static_cast<char*>(handle));
69 }
70
71 bool InsertKeyConcurrently(KeyHandle handle) override {
72 return skip_list_.InsertConcurrently(static_cast<char*>(handle));
73 }
74
75 // Returns true iff an entry that compares equal to key is in the list.
76 bool Contains(const char* key) const override {
77 return skip_list_.Contains(key);
78 }
79
80 size_t ApproximateMemoryUsage() override {
81 // All memory is allocated through allocator; nothing to report here
82 return 0;
83 }
84
85 void Get(const LookupKey& k, void* callback_args,
86 bool (*callback_func)(void* arg, const char* entry)) override {
87 SkipListRep::Iterator iter(&skip_list_);
88 Slice dummy_slice;
89 for (iter.Seek(dummy_slice, k.memtable_key().data());
90 iter.Valid() && callback_func(callback_args, iter.key());
91 iter.Next()) {
92 }
93 }
94
95 uint64_t ApproximateNumEntries(const Slice& start_ikey,
96 const Slice& end_ikey) override {
97 std::string tmp;
98 uint64_t start_count =
99 skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey));
100 uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey));
101 return (end_count >= start_count) ? (end_count - start_count) : 0;
102 }
103
104 void UniqueRandomSample(const uint64_t num_entries,
105 const uint64_t target_sample_size,
106 std::unordered_set<const char*>* entries) override {
107 entries->clear();
108 // Avoid divide-by-0.
109 assert(target_sample_size > 0);
110 assert(num_entries > 0);
111 // NOTE: the size of entries is not enforced to be exactly
112 // target_sample_size at the end of this function, it might be slightly
113 // greater or smaller.
114 SkipListRep::Iterator iter(&skip_list_);
115 // There are two methods to create the subset of samples (size m)
116 // from the table containing N elements:
117 // 1-Iterate linearly through the N memtable entries. For each entry i,
118 // add it to the sample set with a probability
119 // (target_sample_size - entries.size() ) / (N-i).
120 //
121 // 2-Pick m random elements without repetition.
122 // We pick Option 2 when m<sqrt(N) and
123 // Option 1 when m > sqrt(N).
124 if (target_sample_size >
125 static_cast<uint64_t>(std::sqrt(1.0 * num_entries))) {
126 Random* rnd = Random::GetTLSInstance();
127 iter.SeekToFirst();
128 uint64_t counter = 0, num_samples_left = target_sample_size;
129 for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) {
130 // Add entry to sample set with probability
131 // num_samples_left/(num_entries - counter).
132 if (rnd->Next() % (num_entries - counter) < num_samples_left) {
133 entries->insert(iter.key());
134 num_samples_left--;
135 }
136 }
137 } else {
138 // Option 2: pick m random elements with no duplicates.
139 // If Option 2 is picked, then target_sample_size<sqrt(N)
140 // Using a set spares the need to check for duplicates.
141 for (uint64_t i = 0; i < target_sample_size; i++) {
142 // We give it 5 attempts to find a non-duplicate
143 // With 5 attempts, the chances of returning `entries` set
144 // of size target_sample_size is:
145 // PROD_{i=1}^{target_sample_size-1} [1-(i/N)^5]
146 // which is monotonically increasing with N in the worse case
147 // of target_sample_size=sqrt(N), and is always >99.9% for N>4.
148 // At worst, for the final pick , when m=sqrt(N) there is
149 // a probability of p= 1/sqrt(N) chances to find a duplicate.
150 for (uint64_t j = 0; j < 5; j++) {
151 iter.RandomSeek();
152 // unordered_set::insert returns pair<iterator, bool>.
153 // The second element is true if an insert successfully happened.
154 // If element is already in the set, this bool will be false, and
155 // true otherwise.
156 if ((entries->insert(iter.key())).second) {
157 break;
158 }
159 }
160 }
161 }
162 }
163
164 ~SkipListRep() override {}
165
166 // Iteration over the contents of a skip list
167 class Iterator : public MemTableRep::Iterator {
168 InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
169
170 public:
171 // Initialize an iterator over the specified list.
172 // The returned iterator is not valid.
173 explicit Iterator(
174 const InlineSkipList<const MemTableRep::KeyComparator&>* list)
175 : iter_(list) {}
176
177 ~Iterator() override {}
178
179 // Returns true iff the iterator is positioned at a valid node.
180 bool Valid() const override { return iter_.Valid(); }
181
182 // Returns the key at the current position.
183 // REQUIRES: Valid()
184 const char* key() const override { return iter_.key(); }
185
186 // Advances to the next position.
187 // REQUIRES: Valid()
188 void Next() override { iter_.Next(); }
189
190 // Advances to the previous position.
191 // REQUIRES: Valid()
192 void Prev() override { iter_.Prev(); }
193
194 // Advance to the first entry with a key >= target
195 void Seek(const Slice& user_key, const char* memtable_key) override {
196 if (memtable_key != nullptr) {
197 iter_.Seek(memtable_key);
198 } else {
199 iter_.Seek(EncodeKey(&tmp_, user_key));
200 }
201 }
202
203 // Retreat to the last entry with a key <= target
204 void SeekForPrev(const Slice& user_key, const char* memtable_key) override {
205 if (memtable_key != nullptr) {
206 iter_.SeekForPrev(memtable_key);
207 } else {
208 iter_.SeekForPrev(EncodeKey(&tmp_, user_key));
209 }
210 }
211
212 void RandomSeek() override { iter_.RandomSeek(); }
213
214 // Position at the first entry in list.
215 // Final state of iterator is Valid() iff list is not empty.
216 void SeekToFirst() override { iter_.SeekToFirst(); }
217
218 // Position at the last entry in list.
219 // Final state of iterator is Valid() iff list is not empty.
220 void SeekToLast() override { iter_.SeekToLast(); }
221
222 protected:
223 std::string tmp_; // For passing to EncodeKey
224 };
225
226 // Iterator over the contents of a skip list which also keeps track of the
227 // previously visited node. In Seek(), it examines a few nodes after it
228 // first, falling back to O(log n) search from the head of the list only if
229 // the target key hasn't been found.
230 class LookaheadIterator : public MemTableRep::Iterator {
231 public:
232 explicit LookaheadIterator(const SkipListRep& rep)
233 : rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}
234
235 ~LookaheadIterator() override {}
236
237 bool Valid() const override { return iter_.Valid(); }
238
239 const char* key() const override {
240 assert(Valid());
241 return iter_.key();
242 }
243
244 void Next() override {
245 assert(Valid());
246
247 bool advance_prev = true;
248 if (prev_.Valid()) {
249 auto k1 = rep_.UserKey(prev_.key());
250 auto k2 = rep_.UserKey(iter_.key());
251
252 if (k1.compare(k2) == 0) {
253 // same user key, don't move prev_
254 advance_prev = false;
255 } else if (rep_.transform_) {
256 // only advance prev_ if it has the same prefix as iter_
257 auto t1 = rep_.transform_->Transform(k1);
258 auto t2 = rep_.transform_->Transform(k2);
259 advance_prev = t1.compare(t2) == 0;
260 }
261 }
262
263 if (advance_prev) {
264 prev_ = iter_;
265 }
266 iter_.Next();
267 }
268
269 void Prev() override {
270 assert(Valid());
271 iter_.Prev();
272 prev_ = iter_;
273 }
274
275 void Seek(const Slice& internal_key, const char* memtable_key) override {
276 const char* encoded_key = (memtable_key != nullptr)
277 ? memtable_key
278 : EncodeKey(&tmp_, internal_key);
279
280 if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
281 // prev_.key() is smaller or equal to our target key; do a quick
282 // linear search (at most lookahead_ steps) starting from prev_
283 iter_ = prev_;
284
285 size_t cur = 0;
286 while (cur++ <= rep_.lookahead_ && iter_.Valid()) {
287 if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
288 return;
289 }
290 Next();
291 }
292 }
293
294 iter_.Seek(encoded_key);
295 prev_ = iter_;
296 }
297
298 void SeekForPrev(const Slice& internal_key,
299 const char* memtable_key) override {
300 const char* encoded_key = (memtable_key != nullptr)
301 ? memtable_key
302 : EncodeKey(&tmp_, internal_key);
303 iter_.SeekForPrev(encoded_key);
304 prev_ = iter_;
305 }
306
307 void SeekToFirst() override {
308 iter_.SeekToFirst();
309 prev_ = iter_;
310 }
311
312 void SeekToLast() override {
313 iter_.SeekToLast();
314 prev_ = iter_;
315 }
316
317 protected:
318 std::string tmp_; // For passing to EncodeKey
319
320 private:
321 const SkipListRep& rep_;
322 InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
323 InlineSkipList<const MemTableRep::KeyComparator&>::Iterator prev_;
324 };
325
326 MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
327 if (lookahead_ > 0) {
328 void* mem =
329 arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator))
330 :
331 operator new(sizeof(SkipListRep::LookaheadIterator));
332 return new (mem) SkipListRep::LookaheadIterator(*this);
333 } else {
334 void* mem = arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator))
335 :
336 operator new(sizeof(SkipListRep::Iterator));
337 return new (mem) SkipListRep::Iterator(&skip_list_);
338 }
339 }
340 };
341 } // namespace
342
343 static std::unordered_map<std::string, OptionTypeInfo> skiplist_factory_info = {
344 #ifndef ROCKSDB_LITE
345 {"lookahead",
346 {0, OptionType::kSizeT, OptionVerificationType::kNormal,
347 OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}},
348 #endif
349 };
350
351 SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) {
352 RegisterOptions("SkipListFactoryOptions", &lookahead_,
353 &skiplist_factory_info);
354 }
355
356 std::string SkipListFactory::GetId() const {
357 std::string id = Name();
358 if (lookahead_ > 0) {
359 id.append(":").append(std::to_string(lookahead_));
360 }
361 return id;
362 }
363
364 MemTableRep* SkipListFactory::CreateMemTableRep(
365 const MemTableRep::KeyComparator& compare, Allocator* allocator,
366 const SliceTransform* transform, Logger* /*logger*/) {
367 return new SkipListRep(compare, allocator, transform, lookahead_);
368 }
369
370 } // namespace ROCKSDB_NAMESPACE