// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// This file contains the interface that must be implemented by any collection
// to be used as the backing store for a MemTable. Such a collection must
// satisfy the following properties:
// (1) It does not store duplicate items.
// (2) It uses MemTableRep::KeyComparator to compare items for iteration and
//     equality.
// (3) It can be accessed concurrently by multiple readers and can support
//     writes during reads. However, it needn't support multiple concurrent
//     writes.
// (4) Items are never deleted.
// The liberal use of assertions is encouraged to enforce (1).
//
// The factory will be passed a MemTableAllocator object when a new MemTableRep
// is requested.
//
// Users can implement their own memtable representations. We include three
// types built in:
//  - SkipListRep: This is the default; it is backed by a skip list.
//  - HashSkipListRep: The memtable rep that is best used for keys that are
//    structured like "prefix:suffix" where iteration within a prefix is
//    common and iteration across different prefixes is rare. It is backed by
//    a hash map where each bucket is a skip list.
//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
//    vector is sorted. It is intelligent about sorting; once MarkReadOnly()
//    has been called, the vector will only be sorted once. It is optimized
//    for random-write-heavy workloads.
//
// The last two implementations are designed for situations in which
// iteration over the entire collection is rare, since doing so requires all
// the keys to be copied into a sorted data structure.
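//
// Example (illustrative sketch; Options and its memtable_factory field come
// from the public RocksDB API, not this header):
//
//   Options options;
//   // Default skip-list memtable:
//   options.memtable_factory.reset(new SkipListFactory);
//   // Or a vector-backed memtable with reserved space:
//   options.memtable_factory.reset(new VectorRepFactory(/*count=*/10000));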
#pragma once

#include <stdint.h>
#include <stdlib.h>

#include <memory>
#include <stdexcept>
#include <unordered_set>

#include "rocksdb/customizable.h"
#include "rocksdb/slice.h"
namespace ROCKSDB_NAMESPACE {

class Arena;
class Allocator;
class LookupKey;
class SliceTransform;
class Logger;
struct DBOptions;

using KeyHandle = void*;

extern Slice GetLengthPrefixedSlice(const char* data);
class MemTableRep {
 public:
  // KeyComparator provides a means to compare keys, which are internal keys
  // concatenated with values.
  class KeyComparator {
   public:
    using DecodedType = ROCKSDB_NAMESPACE::Slice;

    virtual DecodedType decode_key(const char* key) const {
      // The format of key is frozen and can be treated as a part of the API
      // contract. Refer to MemTable::Add for details.
      return GetLengthPrefixedSlice(key);
    }

    // Compare a and b. Return a negative value if a is less than b, 0 if they
    // are equal, and a positive value if a is greater than b.
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const = 0;

    virtual int operator()(const char* prefix_len_key,
                           const Slice& key) const = 0;

    virtual ~KeyComparator() {}
  };
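
  // Example (illustrative sketch of a comparator that orders entries by the
  // raw bytes of their decoded keys; real memtable comparators must honor
  // the internal-key ordering, so treat "BytewiseKeyComparator" as an
  // assumption for illustration):
  //
  //   class BytewiseKeyComparator : public MemTableRep::KeyComparator {
  //    public:
  //     int operator()(const char* a, const char* b) const override {
  //       return decode_key(a).compare(decode_key(b));
  //     }
  //     int operator()(const char* a, const Slice& b) const override {
  //       return decode_key(a).compare(b);
  //     }
  //   };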

  explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}

  // Allocate a buf of len size for storing key. The idea is that a
  // specific memtable representation knows its underlying data structure
  // better. By allowing it to allocate memory, it can possibly put
  // correlated stuff in consecutive memory areas to make processor
  // prefetching more efficient.
  virtual KeyHandle Allocate(const size_t len, char** buf);

  // Insert key into the collection. (The caller will pack key and value into
  // a single buffer and pass that in as the parameter to Insert.)
  // REQUIRES: nothing that compares equal to key is currently in the
  // collection, and no concurrent modifications to the table are in progress.
  virtual void Insert(KeyHandle handle) = 0;
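
  // Example (illustrative sketch of the typical write path; "encoded" and
  // "encoded_len" stand in for an entry already packed in the memtable
  // entry format, and "rep" is a MemTableRep*):
  //
  //   char* buf = nullptr;
  //   KeyHandle handle = rep->Allocate(encoded_len, &buf);
  //   memcpy(buf, encoded, encoded_len);
  //   rep->Insert(handle);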

  // Same as ::Insert.
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKey(KeyHandle handle) {
    Insert(handle);
    return true;
  }

  // Same as Insert(), but in addition pass a hint to insert location for
  // the key. If hint points to nullptr, a new hint will be populated;
  // otherwise the hint will be updated to reflect the last insert location.
  //
  // Currently only skip-list based memtables implement this interface. Other
  // implementations will fall back to Insert() by default.
  virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
    // Ignore the hint by default.
    Insert(handle);
  }
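
  // Example (illustrative sketch; reusing one hint across a run of nearby
  // keys lets a skip-list rep start its search from the previous insert):
  //
  //   void* hint = nullptr;
  //   rep->InsertWithHint(handle1, &hint);  // populates the hint
  //   rep->InsertWithHint(handle2, &hint);  // may search from the hint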

  // Same as ::InsertWithHint.
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
    InsertWithHint(handle, hint);
    return true;
  }

  // Same as ::InsertWithHint, but allows concurrent writes.
  //
  // If hint points to nullptr, a new hint will be allocated on the heap;
  // otherwise the hint will be updated to reflect the last insert location.
  // The hint is owned by the caller and it is the caller's responsibility to
  // delete the hint later.
  //
  // Currently only skip-list based memtables implement this interface. Other
  // implementations will fall back to InsertConcurrently() by default.
  virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
    // Ignore the hint by default.
    InsertConcurrently(handle);
  }
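
  // Example (illustrative sketch; the hint is heap-allocated by the rep and
  // owned by the caller, as described above):
  //
  //   void* hint = nullptr;
  //   rep->InsertWithHintConcurrently(handle1, &hint);
  //   rep->InsertWithHintConcurrently(handle2, &hint);
  //   // ... once done inserting, the caller must delete the hint.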

  // Same as ::InsertWithHintConcurrently.
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
    InsertWithHintConcurrently(handle, hint);
    return true;
  }

  // Like Insert(handle), but may be called concurrently with other calls
  // to InsertConcurrently for other handles.
  virtual void InsertConcurrently(KeyHandle handle);

  // Same as ::InsertConcurrently.
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKeyConcurrently(KeyHandle handle) {
    InsertConcurrently(handle);
    return true;
  }

  // Returns true iff an entry that compares equal to key is in the collection.
  virtual bool Contains(const char* key) const = 0;

  // Notify this table rep that it will no longer be added to. By default,
  // does nothing. After MarkReadOnly() is called, this table rep will
  // not be written to (i.e., no more calls to Allocate(), Insert(),
  // or any writes done directly to entries accessed through the iterator).
  virtual void MarkReadOnly() {}

  // Notify this table rep that it has been flushed to stable storage.
  // By default, does nothing.
  //
  // Invariant: MarkReadOnly() is called before MarkFlushed().
  // Note that this method, if overridden, should not run for an extended
  // period of time; otherwise, RocksDB may be blocked.
  virtual void MarkFlushed() {}

  // Look up key from the mem table. Starting from the first key in the mem
  // table whose user_key matches the one given in k, call callback_func()
  // with callback_args forwarded as the first parameter and the mem table
  // key as the second parameter. If the callback returns false, terminate;
  // otherwise, move on to the next key.
  //
  // It is safe for Get() to terminate after it has visited all the potential
  // keys for k.user_key(), or earlier.
  //
  // Default: dynamically construct an iterator, seek to the key, and invoke
  // the callback for each matching entry.
  virtual void Get(const LookupKey& k, void* callback_args,
                   bool (*callback_func)(void* arg, const char* entry));
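
  // Example (illustrative sketch of a callback; "MatchState", the limit of
  // 10 entries, and the names are assumptions for illustration):
  //
  //   struct MatchState {
  //     int seen = 0;
  //   };
  //   static bool CollectEntry(void* arg, const char* entry) {
  //     auto* state = static_cast<MatchState*>(arg);
  //     ++state->seen;
  //     return state->seen < 10;  // returning false stops the scan
  //   }
  //   ...
  //   MatchState state;
  //   rep->Get(lookup_key, &state, &CollectEntry);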

  // Return an approximation of the number of entries in the range
  // [start_ikey, end_key). The default implementation provides no
  // estimate and returns 0.
  virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
                                         const Slice& /*end_key*/) {
    return 0;
  }

  // Returns a set of unique random memtable entries of approximate
  // size 'target_sample_size' (this size is not strictly enforced).
  // The default implementation is unsupported and asserts.
  virtual void UniqueRandomSample(const uint64_t num_entries,
                                  const uint64_t target_sample_size,
                                  std::unordered_set<const char*>* entries) {
    (void)num_entries;
    (void)target_sample_size;
    (void)entries;
    assert(false);
  }

  // Report an approximation of how much memory has been used other than memory
  // that was allocated through the allocator. Safe to call from any thread.
  virtual size_t ApproximateMemoryUsage() = 0;

  virtual ~MemTableRep() {}

  // Iteration over the contents of the collection.
  class Iterator {
   public:
    // Initialize an iterator over the specified collection.
    // The returned iterator is not valid.
    // explicit Iterator(const MemTableRep* collection);
    virtual ~Iterator() {}

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const = 0;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    virtual const char* key() const = 0;

    // Advances to the next position.
    // REQUIRES: Valid()
    virtual void Next() = 0;

    // Advances to the previous position.
    // REQUIRES: Valid()
    virtual void Prev() = 0;

    // Advance to the first entry with a key >= target.
    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;

    // Retreat to the last entry with a key <= target.
    virtual void SeekForPrev(const Slice& internal_key,
                             const char* memtable_key) = 0;

    // Position at a random entry in the collection. Optional; by default,
    // does nothing.
    virtual void RandomSeek() {}

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToFirst() = 0;

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToLast() = 0;
  };

  // Return an iterator over the keys in this representation.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        When destroying the iterator, the caller will not call "delete"
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
  //        all the states but those allocated in arena.
  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
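
  // Example (illustrative sketch of the two ownership modes; "arena" is an
  // Arena owned by the caller):
  //
  //   // Heap-allocated: destroyed with delete as usual.
  //   std::unique_ptr<MemTableRep::Iterator> iter(rep->GetIterator());
  //
  //   // Arena-allocated: destroy in place; never call delete.
  //   MemTableRep::Iterator* arena_iter = rep->GetIterator(&arena);
  //   arena_iter->~Iterator();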

  // Return an iterator that has a special Seek semantics. The result of
  // a Seek might only include keys with the same prefix as the target key.
  // arena: If not null, the arena is used to allocate the Iterator.
  //        When destroying the iterator, the caller will not call "delete"
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
  //        all the states but those allocated in arena.
  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
    return GetIterator(arena);
  }

  // Return true if the current MemTableRep supports merge operator.
  // Default: true
  virtual bool IsMergeOperatorSupported() const { return true; }

  // Return true if the current MemTableRep supports snapshots.
  // Default: true
  virtual bool IsSnapshotSupported() const { return true; }

 protected:
  // When *key is an internal key concatenated with the value, returns the
  // user key.
  virtual Slice UserKey(const char* key) const;

  Allocator* allocator_;
};

// This is the base class for all factories that are used by RocksDB to create
// new MemTableRep objects.
class MemTableRepFactory : public Customizable {
 public:
  ~MemTableRepFactory() override {}

  static const char* Type() { return "MemTableRepFactory"; }
  static Status CreateFromString(const ConfigOptions& config_options,
                                 const std::string& id,
                                 std::unique_ptr<MemTableRepFactory>* factory);
  static Status CreateFromString(const ConfigOptions& config_options,
                                 const std::string& id,
                                 std::shared_ptr<MemTableRepFactory>* factory);
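
  // Example (illustrative sketch; "skip_list" is the nickname registered by
  // SkipListFactory below, and error handling is elided):
  //
  //   ConfigOptions config_options;
  //   std::shared_ptr<MemTableRepFactory> factory;
  //   Status s = MemTableRepFactory::CreateFromString(config_options,
  //                                                   "skip_list", &factory);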

  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         Allocator*, const SliceTransform*,
                                         Logger* logger) = 0;
  virtual MemTableRep* CreateMemTableRep(
      const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
      const SliceTransform* slice_transform, Logger* logger,
      uint32_t /* column_family_id */) {
    return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
  }

  const char* Name() const override = 0;

  // Return true if the current MemTableRep supports concurrent inserts.
  // Default: false
  virtual bool IsInsertConcurrentlySupported() const { return false; }

  // Return true if the current MemTableRep supports detecting duplicate
  // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
  // false if the <key,seq> already exists.
  // Default: false
  virtual bool CanHandleDuplicatedKey() const { return false; }
};

// This uses a skip list to store keys. It is the default.
//
// Parameters:
//   lookahead: If non-zero, each iterator's seek operation will start the
//     search from the previously visited record (doing at most 'lookahead'
//     steps). This is an optimization for the access pattern including many
//     seeks with consecutive keys.
class SkipListFactory : public MemTableRepFactory {
 public:
  explicit SkipListFactory(size_t lookahead = 0);

  // Methods for Configurable/Customizable class overrides
  static const char* kClassName() { return "SkipListFactory"; }
  static const char* kNickName() { return "skip_list"; }
  virtual const char* Name() const override { return kClassName(); }
  virtual const char* NickName() const override { return kNickName(); }
  std::string GetId() const override;

  // Methods for MemTableRepFactory class overrides
  using MemTableRepFactory::CreateMemTableRep;
  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         Allocator*, const SliceTransform*,
                                         Logger* logger) override;

  bool IsInsertConcurrentlySupported() const override { return true; }

  bool CanHandleDuplicatedKey() const override { return true; }

 private:
  size_t lookahead_;
};
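
// Example (illustrative sketch; a non-zero lookahead trades a little extra
// bookkeeping for faster seeks over mostly-consecutive keys, and the value 8
// is an assumption for illustration):
//
//   options.memtable_factory.reset(new SkipListFactory(/*lookahead=*/8));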

#ifndef ROCKSDB_LITE
// This creates MemTableReps that are backed by an std::vector. On iteration,
// the vector is sorted. This is useful for workloads where iteration is very
// rare and writes are generally not issued after reads begin.
//
// Parameters:
//   count: Passed to the constructor of the underlying std::vector of each
//     VectorRep. On initialization, the underlying array will have at least
//     count bytes reserved for usage.
class VectorRepFactory : public MemTableRepFactory {
  size_t count_;

 public:
  explicit VectorRepFactory(size_t count = 0);

  // Methods for Configurable/Customizable class overrides
  static const char* kClassName() { return "VectorRepFactory"; }
  static const char* kNickName() { return "vector"; }
  const char* Name() const override { return kClassName(); }
  const char* NickName() const override { return kNickName(); }

  // Methods for MemTableRepFactory class overrides
  using MemTableRepFactory::CreateMemTableRep;
  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         Allocator*, const SliceTransform*,
                                         Logger* logger) override;
};

// This class contains a fixed array of buckets, each
// pointing to a skiplist (null if the bucket is empty).
//   bucket_count: number of fixed array buckets
//   skiplist_height: the max height of the skiplist
//   skiplist_branching_factor: probabilistic size ratio between adjacent
//     link lists in the skiplist
extern MemTableRepFactory* NewHashSkipListRepFactory(
    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
    int32_t skiplist_branching_factor = 4);
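
// Example (illustrative sketch; hash-based memtables bucket keys by prefix,
// so a prefix_extractor should also be configured. The 3-byte fixed prefix
// is an assumption for illustration):
//
//   options.prefix_extractor.reset(NewFixedPrefixTransform(3));
//   options.memtable_factory.reset(
//       NewHashSkipListRepFactory(/*bucket_count=*/50000));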

// The factory is to create memtables based on a hash table:
// it contains a fixed array of buckets, each pointing to either a linked list
// or a skip list if number of entries inside the bucket exceeds
// threshold_use_skiplist.
// @bucket_count: number of fixed array buckets
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
//     Otherwise from huge page TLB. The user needs to reserve
//     huge pages for it to be allocated, like:
//       sysctl -w vm.nr_hugepages=20
//     See linux doc Documentation/vm/hugetlbpage.txt
// @bucket_entries_logging_threshold: if number of entries in one bucket
//     exceeds this number, log about it.
// @if_log_bucket_dist_when_flash: if true, log distribution of number of
//     entries when flushing.
// @threshold_use_skiplist: a bucket switches to skip list if number of
//     entries exceeds this parameter.
extern MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
    int bucket_entries_logging_threshold = 4096,
    bool if_log_bucket_dist_when_flash = true,
    uint32_t threshold_use_skiplist = 256);
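
// Example (illustrative sketch; like the hash skip-list factory above, this
// is meant to be used together with a prefix_extractor, and the parameters
// shown are the declared defaults):
//
//   options.prefix_extractor.reset(NewFixedPrefixTransform(3));
//   options.memtable_factory.reset(
//       NewHashLinkListRepFactory(/*bucket_count=*/50000));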

#endif  // ROCKSDB_LITE
}  // namespace ROCKSDB_NAMESPACE