]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // This file contains the interface that must be implemented by any collection | |
7 | // to be used as the backing store for a MemTable. Such a collection must | |
8 | // satisfy the following properties: | |
9 | // (1) It does not store duplicate items. | |
10 | // (2) It uses MemTableRep::KeyComparator to compare items for iteration and | |
11 | // equality. | |
12 | // (3) It can be accessed concurrently by multiple readers and can support | |
13 | // during reads. However, it needn't support multiple concurrent writes. | |
14 | // (4) Items are never deleted. | |
15 | // The liberal use of assertions is encouraged to enforce (1). | |
16 | // | |
17 | // The factory will be passed an MemTableAllocator object when a new MemTableRep | |
18 | // is requested. | |
19 | // | |
20 | // Users can implement their own memtable representations. We include three | |
21 | // types built in: | |
22 | // - SkipListRep: This is the default; it is backed by a skip list. | |
23 | // - HashSkipListRep: The memtable rep that is best used for keys that are | |
24 | // structured like "prefix:suffix" where iteration within a prefix is | |
25 | // common and iteration across different prefixes is rare. It is backed by | |
26 | // a hash map where each bucket is a skip list. | |
27 | // - VectorRep: This is backed by an unordered std::vector. On iteration, the | |
28 | // vector is sorted. It is intelligent about sorting; once the MarkReadOnly() | |
29 | // has been called, the vector will only be sorted once. It is optimized for | |
30 | // random-write-heavy workloads. | |
31 | // | |
32 | // The last four implementations are designed for situations in which | |
33 | // iteration over the entire collection is rare since doing so requires all the | |
34 | // keys to be copied into a sorted data structure. | |
35 | ||
36 | #pragma once | |
37 | ||
7c673cae FG |
38 | #include <stdint.h> |
39 | #include <stdlib.h> | |
1e59de90 | 40 | |
494da23a TL |
41 | #include <memory> |
42 | #include <stdexcept> | |
1e59de90 TL |
43 | #include <unordered_set> |
44 | ||
45 | #include "rocksdb/customizable.h" | |
46 | #include "rocksdb/slice.h" | |
7c673cae | 47 | |
f67539c2 | 48 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
49 | |
50 | class Arena; | |
11fdf7f2 | 51 | class Allocator; |
7c673cae | 52 | class LookupKey; |
7c673cae FG |
53 | class SliceTransform; |
54 | class Logger; | |
1e59de90 | 55 | struct DBOptions; |
7c673cae | 56 | |
1e59de90 | 57 | using KeyHandle = void*; |
7c673cae | 58 | |
11fdf7f2 TL |
59 | extern Slice GetLengthPrefixedSlice(const char* data); |
60 | ||
7c673cae FG |
61 | class MemTableRep { |
62 | public: | |
63 | // KeyComparator provides a means to compare keys, which are internal keys | |
64 | // concatenated with values. | |
65 | class KeyComparator { | |
66 | public: | |
1e59de90 | 67 | using DecodedType = ROCKSDB_NAMESPACE::Slice; |
11fdf7f2 TL |
68 | |
69 | virtual DecodedType decode_key(const char* key) const { | |
1e59de90 | 70 | // The format of key is frozen and can be treated as a part of the API |
11fdf7f2 TL |
71 | // contract. Refer to MemTable::Add for details. |
72 | return GetLengthPrefixedSlice(key); | |
73 | } | |
74 | ||
7c673cae FG |
75 | // Compare a and b. Return a negative value if a is less than b, 0 if they |
76 | // are equal, and a positive value if a is greater than b | |
77 | virtual int operator()(const char* prefix_len_key1, | |
78 | const char* prefix_len_key2) const = 0; | |
79 | ||
80 | virtual int operator()(const char* prefix_len_key, | |
81 | const Slice& key) const = 0; | |
82 | ||
494da23a | 83 | virtual ~KeyComparator() {} |
7c673cae FG |
84 | }; |
85 | ||
11fdf7f2 | 86 | explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {} |
7c673cae FG |
87 | |
88 | // Allocate a buf of len size for storing key. The idea is that a | |
89 | // specific memtable representation knows its underlying data structure | |
90 | // better. By allowing it to allocate memory, it can possibly put | |
91 | // correlated stuff in consecutive memory area to make processor | |
92 | // prefetching more efficient. | |
93 | virtual KeyHandle Allocate(const size_t len, char** buf); | |
94 | ||
95 | // Insert key into the collection. (The caller will pack key and value into a | |
96 | // single buffer and pass that in as the parameter to Insert). | |
97 | // REQUIRES: nothing that compares equal to key is currently in the | |
98 | // collection, and no concurrent modifications to the table in progress | |
99 | virtual void Insert(KeyHandle handle) = 0; | |
100 | ||
11fdf7f2 TL |
101 | // Same as ::Insert |
102 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
103 | // the <key, seq> already exists. | |
104 | virtual bool InsertKey(KeyHandle handle) { | |
105 | Insert(handle); | |
106 | return true; | |
107 | } | |
108 | ||
7c673cae FG |
109 | // Same as Insert(), but in additional pass a hint to insert location for |
110 | // the key. If hint points to nullptr, a new hint will be populated. | |
111 | // otherwise the hint will be updated to reflect the last insert location. | |
112 | // | |
113 | // Currently only skip-list based memtable implement the interface. Other | |
114 | // implementations will fallback to Insert() by default. | |
11fdf7f2 | 115 | virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) { |
7c673cae FG |
116 | // Ignore the hint by default. |
117 | Insert(handle); | |
118 | } | |
119 | ||
11fdf7f2 TL |
120 | // Same as ::InsertWithHint |
121 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
122 | // the <key, seq> already exists. | |
123 | virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) { | |
124 | InsertWithHint(handle, hint); | |
125 | return true; | |
126 | } | |
127 | ||
1e59de90 | 128 | // Same as ::InsertWithHint, but allow concurrent write |
f67539c2 TL |
129 | // |
130 | // If hint points to nullptr, a new hint will be allocated on heap, otherwise | |
131 | // the hint will be updated to reflect the last insert location. The hint is | |
132 | // owned by the caller and it is the caller's responsibility to delete the | |
133 | // hint later. | |
134 | // | |
135 | // Currently only skip-list based memtable implement the interface. Other | |
136 | // implementations will fallback to InsertConcurrently() by default. | |
137 | virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) { | |
138 | // Ignore the hint by default. | |
139 | InsertConcurrently(handle); | |
140 | } | |
141 | ||
142 | // Same as ::InsertWithHintConcurrently | |
143 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
144 | // the <key, seq> already exists. | |
145 | virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) { | |
146 | InsertWithHintConcurrently(handle, hint); | |
147 | return true; | |
148 | } | |
149 | ||
7c673cae | 150 | // Like Insert(handle), but may be called concurrent with other calls |
11fdf7f2 TL |
151 | // to InsertConcurrently for other handles. |
152 | // | |
153 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
154 | // the <key, seq> already exists. | |
155 | virtual void InsertConcurrently(KeyHandle handle); | |
156 | ||
157 | // Same as ::InsertConcurrently | |
158 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
159 | // the <key, seq> already exists. | |
160 | virtual bool InsertKeyConcurrently(KeyHandle handle) { | |
161 | InsertConcurrently(handle); | |
162 | return true; | |
7c673cae FG |
163 | } |
164 | ||
165 | // Returns true iff an entry that compares equal to key is in the collection. | |
166 | virtual bool Contains(const char* key) const = 0; | |
167 | ||
168 | // Notify this table rep that it will no longer be added to. By default, | |
169 | // does nothing. After MarkReadOnly() is called, this table rep will | |
170 | // not be written to (ie No more calls to Allocate(), Insert(), | |
171 | // or any writes done directly to entries accessed through the iterator.) | |
494da23a | 172 | virtual void MarkReadOnly() {} |
7c673cae | 173 | |
11fdf7f2 TL |
174 | // Notify this table rep that it has been flushed to stable storage. |
175 | // By default, does nothing. | |
176 | // | |
177 | // Invariant: MarkReadOnly() is called, before MarkFlushed(). | |
178 | // Note that this method if overridden, should not run for an extended period | |
179 | // of time. Otherwise, RocksDB may be blocked. | |
494da23a | 180 | virtual void MarkFlushed() {} |
11fdf7f2 | 181 | |
7c673cae FG |
182 | // Look up key from the mem table, since the first key in the mem table whose |
183 | // user_key matches the one given k, call the function callback_func(), with | |
184 | // callback_args directly forwarded as the first parameter, and the mem table | |
185 | // key as the second parameter. If the return value is false, then terminates. | |
186 | // Otherwise, go through the next key. | |
187 | // | |
188 | // It's safe for Get() to terminate after having finished all the potential | |
189 | // key for the k.user_key(), or not. | |
190 | // | |
191 | // Default: | |
192 | // Get() function with a default value of dynamically construct an iterator, | |
193 | // seek and call the call back function. | |
194 | virtual void Get(const LookupKey& k, void* callback_args, | |
195 | bool (*callback_func)(void* arg, const char* entry)); | |
196 | ||
11fdf7f2 TL |
197 | virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, |
198 | const Slice& /*end_key*/) { | |
7c673cae FG |
199 | return 0; |
200 | } | |
201 | ||
1e59de90 TL |
202 | // Returns a vector of unique random memtable entries of approximate |
203 | // size 'target_sample_size' (this size is not strictly enforced). | |
204 | virtual void UniqueRandomSample(const uint64_t num_entries, | |
205 | const uint64_t target_sample_size, | |
206 | std::unordered_set<const char*>* entries) { | |
207 | (void)num_entries; | |
208 | (void)target_sample_size; | |
209 | (void)entries; | |
210 | assert(false); | |
211 | } | |
212 | ||
7c673cae FG |
213 | // Report an approximation of how much memory has been used other than memory |
214 | // that was allocated through the allocator. Safe to call from any thread. | |
215 | virtual size_t ApproximateMemoryUsage() = 0; | |
216 | ||
494da23a | 217 | virtual ~MemTableRep() {} |
7c673cae FG |
218 | |
219 | // Iteration over the contents of a skip collection | |
220 | class Iterator { | |
221 | public: | |
222 | // Initialize an iterator over the specified collection. | |
223 | // The returned iterator is not valid. | |
224 | // explicit Iterator(const MemTableRep* collection); | |
225 | virtual ~Iterator() {} | |
226 | ||
227 | // Returns true iff the iterator is positioned at a valid node. | |
228 | virtual bool Valid() const = 0; | |
229 | ||
230 | // Returns the key at the current position. | |
231 | // REQUIRES: Valid() | |
232 | virtual const char* key() const = 0; | |
233 | ||
234 | // Advances to the next position. | |
235 | // REQUIRES: Valid() | |
236 | virtual void Next() = 0; | |
237 | ||
238 | // Advances to the previous position. | |
239 | // REQUIRES: Valid() | |
240 | virtual void Prev() = 0; | |
241 | ||
242 | // Advance to the first entry with a key >= target | |
243 | virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; | |
244 | ||
245 | // retreat to the first entry with a key <= target | |
246 | virtual void SeekForPrev(const Slice& internal_key, | |
247 | const char* memtable_key) = 0; | |
248 | ||
1e59de90 TL |
249 | virtual void RandomSeek() {} |
250 | ||
7c673cae FG |
251 | // Position at the first entry in collection. |
252 | // Final state of iterator is Valid() iff collection is not empty. | |
253 | virtual void SeekToFirst() = 0; | |
254 | ||
255 | // Position at the last entry in collection. | |
256 | // Final state of iterator is Valid() iff collection is not empty. | |
257 | virtual void SeekToLast() = 0; | |
258 | }; | |
259 | ||
260 | // Return an iterator over the keys in this representation. | |
261 | // arena: If not null, the arena needs to be used to allocate the Iterator. | |
262 | // When destroying the iterator, the caller will not call "delete" | |
263 | // but Iterator::~Iterator() directly. The destructor needs to destroy | |
264 | // all the states but those allocated in arena. | |
265 | virtual Iterator* GetIterator(Arena* arena = nullptr) = 0; | |
266 | ||
267 | // Return an iterator that has a special Seek semantics. The result of | |
268 | // a Seek might only include keys with the same prefix as the target key. | |
269 | // arena: If not null, the arena is used to allocate the Iterator. | |
270 | // When destroying the iterator, the caller will not call "delete" | |
271 | // but Iterator::~Iterator() directly. The destructor needs to destroy | |
272 | // all the states but those allocated in arena. | |
273 | virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) { | |
274 | return GetIterator(arena); | |
275 | } | |
276 | ||
277 | // Return true if the current MemTableRep supports merge operator. | |
278 | // Default: true | |
279 | virtual bool IsMergeOperatorSupported() const { return true; } | |
280 | ||
281 | // Return true if the current MemTableRep supports snapshot | |
282 | // Default: true | |
283 | virtual bool IsSnapshotSupported() const { return true; } | |
284 | ||
285 | protected: | |
286 | // When *key is an internal key concatenated with the value, returns the | |
287 | // user key. | |
288 | virtual Slice UserKey(const char* key) const; | |
289 | ||
11fdf7f2 | 290 | Allocator* allocator_; |
7c673cae FG |
291 | }; |
292 | ||
293 | // This is the base class for all factories that are used by RocksDB to create | |
294 | // new MemTableRep objects | |
1e59de90 | 295 | class MemTableRepFactory : public Customizable { |
7c673cae | 296 | public: |
1e59de90 TL |
297 | ~MemTableRepFactory() override {} |
298 | ||
299 | static const char* Type() { return "MemTableRepFactory"; } | |
300 | static Status CreateFromString(const ConfigOptions& config_options, | |
301 | const std::string& id, | |
302 | std::unique_ptr<MemTableRepFactory>* factory); | |
303 | static Status CreateFromString(const ConfigOptions& config_options, | |
304 | const std::string& id, | |
305 | std::shared_ptr<MemTableRepFactory>* factory); | |
11fdf7f2 | 306 | |
7c673cae | 307 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 308 | Allocator*, const SliceTransform*, |
7c673cae | 309 | Logger* logger) = 0; |
11fdf7f2 TL |
310 | virtual MemTableRep* CreateMemTableRep( |
311 | const MemTableRep::KeyComparator& key_cmp, Allocator* allocator, | |
312 | const SliceTransform* slice_transform, Logger* logger, | |
313 | uint32_t /* column_family_id */) { | |
314 | return CreateMemTableRep(key_cmp, allocator, slice_transform, logger); | |
315 | } | |
316 | ||
1e59de90 | 317 | const char* Name() const override = 0; |
7c673cae FG |
318 | |
319 | // Return true if the current MemTableRep supports concurrent inserts | |
320 | // Default: false | |
321 | virtual bool IsInsertConcurrentlySupported() const { return false; } | |
11fdf7f2 TL |
322 | |
323 | // Return true if the current MemTableRep supports detecting duplicate | |
324 | // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns | |
325 | // false when if the <key,seq> already exists. | |
326 | // Default: false | |
327 | virtual bool CanHandleDuplicatedKey() const { return false; } | |
7c673cae FG |
328 | }; |
329 | ||
330 | // This uses a skip list to store keys. It is the default. | |
331 | // | |
332 | // Parameters: | |
333 | // lookahead: If non-zero, each iterator's seek operation will start the | |
334 | // search from the previously visited record (doing at most 'lookahead' | |
335 | // steps). This is an optimization for the access pattern including many | |
336 | // seeks with consecutive keys. | |
337 | class SkipListFactory : public MemTableRepFactory { | |
338 | public: | |
1e59de90 TL |
339 | explicit SkipListFactory(size_t lookahead = 0); |
340 | ||
341 | // Methods for Configurable/Customizable class overrides | |
342 | static const char* kClassName() { return "SkipListFactory"; } | |
343 | static const char* kNickName() { return "skip_list"; } | |
344 | virtual const char* Name() const override { return kClassName(); } | |
345 | virtual const char* NickName() const override { return kNickName(); } | |
346 | std::string GetId() const override; | |
7c673cae | 347 | |
1e59de90 | 348 | // Methods for MemTableRepFactory class overrides |
11fdf7f2 | 349 | using MemTableRepFactory::CreateMemTableRep; |
7c673cae | 350 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 351 | Allocator*, const SliceTransform*, |
7c673cae | 352 | Logger* logger) override; |
7c673cae FG |
353 | |
354 | bool IsInsertConcurrentlySupported() const override { return true; } | |
355 | ||
11fdf7f2 TL |
356 | bool CanHandleDuplicatedKey() const override { return true; } |
357 | ||
7c673cae | 358 | private: |
1e59de90 | 359 | size_t lookahead_; |
7c673cae FG |
360 | }; |
361 | ||
362 | #ifndef ROCKSDB_LITE | |
363 | // This creates MemTableReps that are backed by an std::vector. On iteration, | |
364 | // the vector is sorted. This is useful for workloads where iteration is very | |
365 | // rare and writes are generally not issued after reads begin. | |
366 | // | |
367 | // Parameters: | |
368 | // count: Passed to the constructor of the underlying std::vector of each | |
369 | // VectorRep. On initialization, the underlying array will be at least count | |
370 | // bytes reserved for usage. | |
371 | class VectorRepFactory : public MemTableRepFactory { | |
1e59de90 | 372 | size_t count_; |
7c673cae FG |
373 | |
374 | public: | |
1e59de90 | 375 | explicit VectorRepFactory(size_t count = 0); |
11fdf7f2 | 376 | |
1e59de90 TL |
377 | // Methods for Configurable/Customizable class overrides |
378 | static const char* kClassName() { return "VectorRepFactory"; } | |
379 | static const char* kNickName() { return "vector"; } | |
380 | const char* Name() const override { return kClassName(); } | |
381 | const char* NickName() const override { return kNickName(); } | |
382 | ||
383 | // Methods for MemTableRepFactory class overrides | |
11fdf7f2 | 384 | using MemTableRepFactory::CreateMemTableRep; |
7c673cae | 385 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 386 | Allocator*, const SliceTransform*, |
7c673cae | 387 | Logger* logger) override; |
7c673cae FG |
388 | }; |
389 | ||
390 | // This class contains a fixed array of buckets, each | |
391 | // pointing to a skiplist (null if the bucket is empty). | |
392 | // bucket_count: number of fixed array buckets | |
393 | // skiplist_height: the max height of the skiplist | |
394 | // skiplist_branching_factor: probabilistic size ratio between adjacent | |
395 | // link lists in the skiplist | |
396 | extern MemTableRepFactory* NewHashSkipListRepFactory( | |
397 | size_t bucket_count = 1000000, int32_t skiplist_height = 4, | |
494da23a | 398 | int32_t skiplist_branching_factor = 4); |
7c673cae FG |
399 | |
400 | // The factory is to create memtables based on a hash table: | |
401 | // it contains a fixed array of buckets, each pointing to either a linked list | |
402 | // or a skip list if number of entries inside the bucket exceeds | |
403 | // threshold_use_skiplist. | |
404 | // @bucket_count: number of fixed array buckets | |
405 | // @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc. | |
406 | // Otherwise from huge page TLB. The user needs to reserve | |
407 | // huge pages for it to be allocated, like: | |
408 | // sysctl -w vm.nr_hugepages=20 | |
409 | // See linux doc Documentation/vm/hugetlbpage.txt | |
410 | // @bucket_entries_logging_threshold: if number of entries in one bucket | |
411 | // exceeds this number, log about it. | |
412 | // @if_log_bucket_dist_when_flash: if true, log distribution of number of | |
413 | // entries when flushing. | |
414 | // @threshold_use_skiplist: a bucket switches to skip list if number of | |
415 | // entries exceed this parameter. | |
416 | extern MemTableRepFactory* NewHashLinkListRepFactory( | |
417 | size_t bucket_count = 50000, size_t huge_page_tlb_size = 0, | |
418 | int bucket_entries_logging_threshold = 4096, | |
419 | bool if_log_bucket_dist_when_flash = true, | |
420 | uint32_t threshold_use_skiplist = 256); | |
421 | ||
7c673cae | 422 | #endif // ROCKSDB_LITE |
f67539c2 | 423 | } // namespace ROCKSDB_NAMESPACE |