1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
// This file contains the interface that must be implemented by any collection
// to be used as the backing store for a MemTable. Such a collection must
// satisfy the following properties:
//  (1) It does not store duplicate items.
//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
//      equality.
//  (3) It can be accessed concurrently by multiple readers while a write is
//      in progress. However, it needn't support multiple concurrent writes.
//  (4) Items are never deleted.
// The liberal use of assertions is encouraged to enforce (1).
//
// The factory will be passed a MemTableAllocator object when a new MemTableRep
// is requested.
20 // Users can implement their own memtable representations. We include three
22 // - SkipListRep: This is the default; it is backed by a skip list.
23 // - HashSkipListRep: The memtable rep that is best used for keys that are
24 // structured like "prefix:suffix" where iteration within a prefix is
25 // common and iteration across different prefixes is rare. It is backed by
26 // a hash map where each bucket is a skip list.
27 // - VectorRep: This is backed by an unordered std::vector. On iteration, the
28 // vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
29 // has been called, the vector will only be sorted once. It is optimized for
30 // random-write-heavy workloads.
//
// The last two implementations are designed for situations in which
// iteration over the entire collection is rare since doing so requires all the
// keys to be copied into a sorted data structure.
#include <stdint.h>
#include <stdlib.h>

#include <rocksdb/slice.h>
// Opaque handle to an entry previously allocated via MemTableRep::Allocate()
// and later passed to the MemTableRep Insert* methods.
using KeyHandle = void*;
// Decode the length-prefixed slice that starts at `data` -- the entry
// encoding referred to by KeyComparator::decode_key (see MemTable::Add).
extern Slice GetLengthPrefixedSlice(const char* data);
58 // KeyComparator provides a means to compare keys, which are internal keys
59 // concatenated with values.
62 typedef rocksdb::Slice DecodedType
;
64 virtual DecodedType
decode_key(const char* key
) const {
65 // The format of key is frozen and can be terated as a part of the API
66 // contract. Refer to MemTable::Add for details.
67 return GetLengthPrefixedSlice(key
);
70 // Compare a and b. Return a negative value if a is less than b, 0 if they
71 // are equal, and a positive value if a is greater than b
72 virtual int operator()(const char* prefix_len_key1
,
73 const char* prefix_len_key2
) const = 0;
75 virtual int operator()(const char* prefix_len_key
,
76 const Slice
& key
) const = 0;
78 virtual ~KeyComparator() {}
  // Construct a rep that allocates from `allocator`. The pointer is retained
  // in allocator_ for use by Allocate(); the rep does not appear to manage
  // the allocator's lifetime (NOTE(review): confirm ownership with callers).
  explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
  // Allocate a buf of len size for storing key. The idea is that a
  // specific memtable representation knows its underlying data structure
  // better. By allowing it to allocate memory, it can possibly put
  // correlated stuff in consecutive memory area to make processor
  // prefetching more efficient.
  // On return, *buf points at the allocated bytes; the returned KeyHandle
  // identifies the entry for a later Insert* call.
  virtual KeyHandle Allocate(const size_t len, char** buf);
  // Insert key into the collection. (The caller will pack key and value into a
  // single buffer and pass that in as the parameter to Insert).
  // REQUIRES: nothing that compares equal to key is currently in the
  // collection, and no concurrent modifications to the table in progress
  virtual void Insert(KeyHandle handle) = 0;
97 // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
98 // the <key, seq> already exists.
99 virtual bool InsertKey(KeyHandle handle
) {
104 // Same as Insert(), but in additional pass a hint to insert location for
105 // the key. If hint points to nullptr, a new hint will be populated.
106 // otherwise the hint will be updated to reflect the last insert location.
108 // Currently only skip-list based memtable implement the interface. Other
109 // implementations will fallback to Insert() by default.
110 virtual void InsertWithHint(KeyHandle handle
, void** /*hint*/) {
111 // Ignore the hint by default.
115 // Same as ::InsertWithHint
116 // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
117 // the <key, seq> already exists.
118 virtual bool InsertKeyWithHint(KeyHandle handle
, void** hint
) {
119 InsertWithHint(handle
, hint
);
  // Like Insert(handle), but may be called concurrent with other calls
  // to InsertConcurrently for other handles.
  //
  // NOTE(review): this overload returns void, so it cannot report duplicate
  // <key, seq> detection (CanHandleDuplicatedKey); that is only surfaced by
  // InsertKeyConcurrently.
  virtual void InsertConcurrently(KeyHandle handle);
130 // Same as ::InsertConcurrently
131 // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
132 // the <key, seq> already exists.
133 virtual bool InsertKeyConcurrently(KeyHandle handle
) {
134 InsertConcurrently(handle
);
  // Returns true iff an entry that compares equal to key is in the collection.
  virtual bool Contains(const char* key) const = 0;
  // Notify this table rep that it will no longer be added to. By default,
  // does nothing. After MarkReadOnly() is called, this table rep will
  // not be written to (i.e. no more calls to Allocate(), Insert(),
  // or any writes done directly to entries accessed through the iterator.)
  virtual void MarkReadOnly() {}
  // Notify this table rep that it has been flushed to stable storage.
  // By default, does nothing.
  //
  // Invariant: MarkReadOnly() is called before MarkFlushed().
  // Note that this method, if overridden, should not run for an extended
  // period of time. Otherwise, RocksDB may be blocked.
  virtual void MarkFlushed() {}
  // Look up `k` in the mem table: starting from the first entry whose
  // user_key matches k's, invoke callback_func(callback_args, entry) for each
  // mem table key in order, where `entry` is the encoded mem table key.
  // Iteration stops when callback_func returns false.
  //
  // It's safe for Get() to terminate after having visited all the potential
  // entries for k.user_key(), or to keep going -- implementations may choose
  // either.
  //
  // The default implementation dynamically constructs an iterator, seeks to
  // k, and walks forward invoking the callback.
  virtual void Get(const LookupKey& k, void* callback_args,
                   bool (*callback_func)(void* arg, const char* entry));
170 virtual uint64_t ApproximateNumEntries(const Slice
& /*start_ikey*/,
171 const Slice
& /*end_key*/) {
  // Report an approximation of how much memory has been used other than memory
  // that was allocated through the allocator. Safe to call from any thread.
  virtual size_t ApproximateMemoryUsage() = 0;

  virtual ~MemTableRep() {}
181 // Iteration over the contents of a skip collection
184 // Initialize an iterator over the specified collection.
185 // The returned iterator is not valid.
186 // explicit Iterator(const MemTableRep* collection);
187 virtual ~Iterator() {}
189 // Returns true iff the iterator is positioned at a valid node.
190 virtual bool Valid() const = 0;
192 // Returns the key at the current position.
194 virtual const char* key() const = 0;
196 // Advances to the next position.
198 virtual void Next() = 0;
200 // Advances to the previous position.
202 virtual void Prev() = 0;
204 // Advance to the first entry with a key >= target
205 virtual void Seek(const Slice
& internal_key
, const char* memtable_key
) = 0;
207 // retreat to the first entry with a key <= target
208 virtual void SeekForPrev(const Slice
& internal_key
,
209 const char* memtable_key
) = 0;
211 // Position at the first entry in collection.
212 // Final state of iterator is Valid() iff collection is not empty.
213 virtual void SeekToFirst() = 0;
215 // Position at the last entry in collection.
216 // Final state of iterator is Valid() iff collection is not empty.
217 virtual void SeekToLast() = 0;
  // Return an iterator over the keys in this representation.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        When destroying the iterator, the caller will not call "delete"
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
  //        all the states but those allocated in arena.
  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
227 // Return an iterator that has a special Seek semantics. The result of
228 // a Seek might only include keys with the same prefix as the target key.
229 // arena: If not null, the arena is used to allocate the Iterator.
230 // When destroying the iterator, the caller will not call "delete"
231 // but Iterator::~Iterator() directly. The destructor needs to destroy
232 // all the states but those allocated in arena.
233 virtual Iterator
* GetDynamicPrefixIterator(Arena
* arena
= nullptr) {
234 return GetIterator(arena
);
  // Return true if the current MemTableRep supports merge operator.
  // Default: true
  virtual bool IsMergeOperatorSupported() const { return true; }

  // Return true if the current MemTableRep supports snapshot
  // Default: true
  virtual bool IsSnapshotSupported() const { return true; }
246 // When *key is an internal key concatenated with the value, returns the
248 virtual Slice
UserKey(const char* key
) const;
250 Allocator
* allocator_
;
253 // This is the base class for all factories that are used by RocksDB to create
254 // new MemTableRep objects
255 class MemTableRepFactory
{
257 virtual ~MemTableRepFactory() {}
259 virtual MemTableRep
* CreateMemTableRep(const MemTableRep::KeyComparator
&,
260 Allocator
*, const SliceTransform
*,
262 virtual MemTableRep
* CreateMemTableRep(
263 const MemTableRep::KeyComparator
& key_cmp
, Allocator
* allocator
,
264 const SliceTransform
* slice_transform
, Logger
* logger
,
265 uint32_t /* column_family_id */) {
266 return CreateMemTableRep(key_cmp
, allocator
, slice_transform
, logger
);
269 virtual const char* Name() const = 0;
271 // Return true if the current MemTableRep supports concurrent inserts
273 virtual bool IsInsertConcurrentlySupported() const { return false; }
275 // Return true if the current MemTableRep supports detecting duplicate
276 // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
277 // false when if the <key,seq> already exists.
279 virtual bool CanHandleDuplicatedKey() const { return false; }
282 // This uses a skip list to store keys. It is the default.
285 // lookahead: If non-zero, each iterator's seek operation will start the
286 // search from the previously visited record (doing at most 'lookahead'
287 // steps). This is an optimization for the access pattern including many
288 // seeks with consecutive keys.
289 class SkipListFactory
: public MemTableRepFactory
{
291 explicit SkipListFactory(size_t lookahead
= 0) : lookahead_(lookahead
) {}
293 using MemTableRepFactory::CreateMemTableRep
;
294 virtual MemTableRep
* CreateMemTableRep(const MemTableRep::KeyComparator
&,
295 Allocator
*, const SliceTransform
*,
296 Logger
* logger
) override
;
297 virtual const char* Name() const override
{ return "SkipListFactory"; }
299 bool IsInsertConcurrentlySupported() const override
{ return true; }
301 bool CanHandleDuplicatedKey() const override
{ return true; }
304 const size_t lookahead_
;
308 // This creates MemTableReps that are backed by an std::vector. On iteration,
309 // the vector is sorted. This is useful for workloads where iteration is very
310 // rare and writes are generally not issued after reads begin.
313 // count: Passed to the constructor of the underlying std::vector of each
314 // VectorRep. On initialization, the underlying array will be at least count
315 // bytes reserved for usage.
316 class VectorRepFactory
: public MemTableRepFactory
{
320 explicit VectorRepFactory(size_t count
= 0) : count_(count
) {}
322 using MemTableRepFactory::CreateMemTableRep
;
323 virtual MemTableRep
* CreateMemTableRep(const MemTableRep::KeyComparator
&,
324 Allocator
*, const SliceTransform
*,
325 Logger
* logger
) override
;
327 virtual const char* Name() const override
{ return "VectorRepFactory"; }
// This factory creates a hash-of-skiplists memtable: a fixed array of
// buckets, each pointing to a skiplist (null if the bucket is empty).
// bucket_count: number of fixed array buckets
// skiplist_height: the max height of the skiplist
// skiplist_branching_factor: probabilistic size ratio between adjacent
//   link lists in the skiplist
extern MemTableRepFactory* NewHashSkipListRepFactory(
    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
    int32_t skiplist_branching_factor = 4);
// The factory is to create memtables based on a hash table:
// it contains a fixed array of buckets, each pointing to either a linked list
// or a skip list if number of entries inside the bucket exceeds
// threshold_use_skiplist.
// @bucket_count: number of fixed array buckets
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
//   Otherwise from huge page TLB. The user needs to reserve
//   huge pages for it to be allocated, like:
//       sysctl -w vm.nr_hugepages=20
//   See linux doc Documentation/vm/hugetlbpage.txt
// @bucket_entries_logging_threshold: if number of entries in one bucket
//   exceeds this number, log about it.
// @if_log_bucket_dist_when_flash: if true, log distribution of number of
//   entries when flushing. (NOTE(review): "flash" looks like a typo for
//   "flush", kept because the parameter name is part of the API.)
// @threshold_use_skiplist: a bucket switches to skip list if number of
//   entries exceed this parameter.
extern MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
    int bucket_entries_logging_threshold = 4096,
    bool if_log_bucket_dist_when_flash = true,
    uint32_t threshold_use_skiplist = 256);
362 #endif // ROCKSDB_LITE
363 } // namespace rocksdb