]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/include/rocksdb/memtablerep.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / include / rocksdb / memtablerep.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// This file contains the interface that must be implemented by any collection
7// to be used as the backing store for a MemTable. Such a collection must
8// satisfy the following properties:
9// (1) It does not store duplicate items.
10// (2) It uses MemTableRep::KeyComparator to compare items for iteration and
11// equality.
// (3) It can be accessed concurrently by multiple readers and can support
// writes during reads. However, it needn't support multiple concurrent writes.
14// (4) Items are never deleted.
15// The liberal use of assertions is encouraged to enforce (1).
16//
// The factory will be passed an Allocator object when a new MemTableRep
// is requested.
19//
20// Users can implement their own memtable representations. We include three
21// types built in:
22// - SkipListRep: This is the default; it is backed by a skip list.
23// - HashSkipListRep: The memtable rep that is best used for keys that are
24// structured like "prefix:suffix" where iteration within a prefix is
25// common and iteration across different prefixes is rare. It is backed by
26// a hash map where each bucket is a skip list.
27// - VectorRep: This is backed by an unordered std::vector. On iteration, the
28// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
29// has been called, the vector will only be sorted once. It is optimized for
30// random-write-heavy workloads.
31//
// The last two implementations are designed for situations in which
33// iteration over the entire collection is rare since doing so requires all the
34// keys to be copied into a sorted data structure.
35
36#pragma once
37
38#include <memory>
39#include <stdexcept>
40#include <stdint.h>
41#include <stdlib.h>
11fdf7f2 42#include <rocksdb/slice.h>
7c673cae
FG
43
44namespace rocksdb {
45
46class Arena;
11fdf7f2 47class Allocator;
7c673cae 48class LookupKey;
7c673cae
FG
49class SliceTransform;
50class Logger;
51
52typedef void* KeyHandle;
53
11fdf7f2
TL
54extern Slice GetLengthPrefixedSlice(const char* data);
55
7c673cae
FG
56class MemTableRep {
57 public:
58 // KeyComparator provides a means to compare keys, which are internal keys
59 // concatenated with values.
60 class KeyComparator {
61 public:
11fdf7f2
TL
62 typedef rocksdb::Slice DecodedType;
63
64 virtual DecodedType decode_key(const char* key) const {
65 // The format of key is frozen and can be terated as a part of the API
66 // contract. Refer to MemTable::Add for details.
67 return GetLengthPrefixedSlice(key);
68 }
69
7c673cae
FG
70 // Compare a and b. Return a negative value if a is less than b, 0 if they
71 // are equal, and a positive value if a is greater than b
72 virtual int operator()(const char* prefix_len_key1,
73 const char* prefix_len_key2) const = 0;
74
75 virtual int operator()(const char* prefix_len_key,
76 const Slice& key) const = 0;
77
78 virtual ~KeyComparator() { }
79 };
80
11fdf7f2 81 explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
7c673cae
FG
82
83 // Allocate a buf of len size for storing key. The idea is that a
84 // specific memtable representation knows its underlying data structure
85 // better. By allowing it to allocate memory, it can possibly put
86 // correlated stuff in consecutive memory area to make processor
87 // prefetching more efficient.
88 virtual KeyHandle Allocate(const size_t len, char** buf);
89
90 // Insert key into the collection. (The caller will pack key and value into a
91 // single buffer and pass that in as the parameter to Insert).
92 // REQUIRES: nothing that compares equal to key is currently in the
93 // collection, and no concurrent modifications to the table in progress
94 virtual void Insert(KeyHandle handle) = 0;
95
11fdf7f2
TL
96 // Same as ::Insert
97 // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
98 // the <key, seq> already exists.
99 virtual bool InsertKey(KeyHandle handle) {
100 Insert(handle);
101 return true;
102 }
103
7c673cae
FG
104 // Same as Insert(), but in additional pass a hint to insert location for
105 // the key. If hint points to nullptr, a new hint will be populated.
106 // otherwise the hint will be updated to reflect the last insert location.
107 //
108 // Currently only skip-list based memtable implement the interface. Other
109 // implementations will fallback to Insert() by default.
11fdf7f2 110 virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
7c673cae
FG
111 // Ignore the hint by default.
112 Insert(handle);
113 }
114
11fdf7f2
TL
115 // Same as ::InsertWithHint
116 // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
117 // the <key, seq> already exists.
118 virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
119 InsertWithHint(handle, hint);
120 return true;
121 }
122
7c673cae 123 // Like Insert(handle), but may be called concurrent with other calls
11fdf7f2
TL
124 // to InsertConcurrently for other handles.
125 //
126 // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
127 // the <key, seq> already exists.
128 virtual void InsertConcurrently(KeyHandle handle);
129
130 // Same as ::InsertConcurrently
131 // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
132 // the <key, seq> already exists.
133 virtual bool InsertKeyConcurrently(KeyHandle handle) {
134 InsertConcurrently(handle);
135 return true;
7c673cae
FG
136 }
137
138 // Returns true iff an entry that compares equal to key is in the collection.
139 virtual bool Contains(const char* key) const = 0;
140
141 // Notify this table rep that it will no longer be added to. By default,
142 // does nothing. After MarkReadOnly() is called, this table rep will
143 // not be written to (ie No more calls to Allocate(), Insert(),
144 // or any writes done directly to entries accessed through the iterator.)
145 virtual void MarkReadOnly() { }
146
11fdf7f2
TL
147 // Notify this table rep that it has been flushed to stable storage.
148 // By default, does nothing.
149 //
150 // Invariant: MarkReadOnly() is called, before MarkFlushed().
151 // Note that this method if overridden, should not run for an extended period
152 // of time. Otherwise, RocksDB may be blocked.
153 virtual void MarkFlushed() { }
154
7c673cae
FG
155 // Look up key from the mem table, since the first key in the mem table whose
156 // user_key matches the one given k, call the function callback_func(), with
157 // callback_args directly forwarded as the first parameter, and the mem table
158 // key as the second parameter. If the return value is false, then terminates.
159 // Otherwise, go through the next key.
160 //
161 // It's safe for Get() to terminate after having finished all the potential
162 // key for the k.user_key(), or not.
163 //
164 // Default:
165 // Get() function with a default value of dynamically construct an iterator,
166 // seek and call the call back function.
167 virtual void Get(const LookupKey& k, void* callback_args,
168 bool (*callback_func)(void* arg, const char* entry));
169
11fdf7f2
TL
170 virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
171 const Slice& /*end_key*/) {
7c673cae
FG
172 return 0;
173 }
174
175 // Report an approximation of how much memory has been used other than memory
176 // that was allocated through the allocator. Safe to call from any thread.
177 virtual size_t ApproximateMemoryUsage() = 0;
178
179 virtual ~MemTableRep() { }
180
181 // Iteration over the contents of a skip collection
182 class Iterator {
183 public:
184 // Initialize an iterator over the specified collection.
185 // The returned iterator is not valid.
186 // explicit Iterator(const MemTableRep* collection);
187 virtual ~Iterator() {}
188
189 // Returns true iff the iterator is positioned at a valid node.
190 virtual bool Valid() const = 0;
191
192 // Returns the key at the current position.
193 // REQUIRES: Valid()
194 virtual const char* key() const = 0;
195
196 // Advances to the next position.
197 // REQUIRES: Valid()
198 virtual void Next() = 0;
199
200 // Advances to the previous position.
201 // REQUIRES: Valid()
202 virtual void Prev() = 0;
203
204 // Advance to the first entry with a key >= target
205 virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
206
207 // retreat to the first entry with a key <= target
208 virtual void SeekForPrev(const Slice& internal_key,
209 const char* memtable_key) = 0;
210
211 // Position at the first entry in collection.
212 // Final state of iterator is Valid() iff collection is not empty.
213 virtual void SeekToFirst() = 0;
214
215 // Position at the last entry in collection.
216 // Final state of iterator is Valid() iff collection is not empty.
217 virtual void SeekToLast() = 0;
218 };
219
220 // Return an iterator over the keys in this representation.
221 // arena: If not null, the arena needs to be used to allocate the Iterator.
222 // When destroying the iterator, the caller will not call "delete"
223 // but Iterator::~Iterator() directly. The destructor needs to destroy
224 // all the states but those allocated in arena.
225 virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
226
227 // Return an iterator that has a special Seek semantics. The result of
228 // a Seek might only include keys with the same prefix as the target key.
229 // arena: If not null, the arena is used to allocate the Iterator.
230 // When destroying the iterator, the caller will not call "delete"
231 // but Iterator::~Iterator() directly. The destructor needs to destroy
232 // all the states but those allocated in arena.
233 virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
234 return GetIterator(arena);
235 }
236
237 // Return true if the current MemTableRep supports merge operator.
238 // Default: true
239 virtual bool IsMergeOperatorSupported() const { return true; }
240
241 // Return true if the current MemTableRep supports snapshot
242 // Default: true
243 virtual bool IsSnapshotSupported() const { return true; }
244
245 protected:
246 // When *key is an internal key concatenated with the value, returns the
247 // user key.
248 virtual Slice UserKey(const char* key) const;
249
11fdf7f2 250 Allocator* allocator_;
7c673cae
FG
251};
252
253// This is the base class for all factories that are used by RocksDB to create
254// new MemTableRep objects
255class MemTableRepFactory {
256 public:
257 virtual ~MemTableRepFactory() {}
11fdf7f2 258
7c673cae 259 virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
11fdf7f2 260 Allocator*, const SliceTransform*,
7c673cae 261 Logger* logger) = 0;
11fdf7f2
TL
262 virtual MemTableRep* CreateMemTableRep(
263 const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
264 const SliceTransform* slice_transform, Logger* logger,
265 uint32_t /* column_family_id */) {
266 return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
267 }
268
7c673cae
FG
269 virtual const char* Name() const = 0;
270
271 // Return true if the current MemTableRep supports concurrent inserts
272 // Default: false
273 virtual bool IsInsertConcurrentlySupported() const { return false; }
11fdf7f2
TL
274
275 // Return true if the current MemTableRep supports detecting duplicate
276 // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
277 // false when if the <key,seq> already exists.
278 // Default: false
279 virtual bool CanHandleDuplicatedKey() const { return false; }
7c673cae
FG
280};
281
282// This uses a skip list to store keys. It is the default.
283//
284// Parameters:
285// lookahead: If non-zero, each iterator's seek operation will start the
286// search from the previously visited record (doing at most 'lookahead'
287// steps). This is an optimization for the access pattern including many
288// seeks with consecutive keys.
289class SkipListFactory : public MemTableRepFactory {
290 public:
291 explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {}
292
11fdf7f2 293 using MemTableRepFactory::CreateMemTableRep;
7c673cae 294 virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
11fdf7f2 295 Allocator*, const SliceTransform*,
7c673cae
FG
296 Logger* logger) override;
297 virtual const char* Name() const override { return "SkipListFactory"; }
298
299 bool IsInsertConcurrentlySupported() const override { return true; }
300
11fdf7f2
TL
301 bool CanHandleDuplicatedKey() const override { return true; }
302
7c673cae
FG
303 private:
304 const size_t lookahead_;
305};
306
307#ifndef ROCKSDB_LITE
308// This creates MemTableReps that are backed by an std::vector. On iteration,
309// the vector is sorted. This is useful for workloads where iteration is very
310// rare and writes are generally not issued after reads begin.
311//
312// Parameters:
313// count: Passed to the constructor of the underlying std::vector of each
314// VectorRep. On initialization, the underlying array will be at least count
315// bytes reserved for usage.
316class VectorRepFactory : public MemTableRepFactory {
317 const size_t count_;
318
319 public:
320 explicit VectorRepFactory(size_t count = 0) : count_(count) { }
11fdf7f2
TL
321
322 using MemTableRepFactory::CreateMemTableRep;
7c673cae 323 virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
11fdf7f2 324 Allocator*, const SliceTransform*,
7c673cae 325 Logger* logger) override;
11fdf7f2 326
7c673cae
FG
327 virtual const char* Name() const override {
328 return "VectorRepFactory";
329 }
330};
331
332// This class contains a fixed array of buckets, each
333// pointing to a skiplist (null if the bucket is empty).
334// bucket_count: number of fixed array buckets
335// skiplist_height: the max height of the skiplist
336// skiplist_branching_factor: probabilistic size ratio between adjacent
337// link lists in the skiplist
338extern MemTableRepFactory* NewHashSkipListRepFactory(
339 size_t bucket_count = 1000000, int32_t skiplist_height = 4,
340 int32_t skiplist_branching_factor = 4
341);
342
343// The factory is to create memtables based on a hash table:
344// it contains a fixed array of buckets, each pointing to either a linked list
345// or a skip list if number of entries inside the bucket exceeds
346// threshold_use_skiplist.
347// @bucket_count: number of fixed array buckets
348// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
349// Otherwise from huge page TLB. The user needs to reserve
350// huge pages for it to be allocated, like:
351// sysctl -w vm.nr_hugepages=20
352// See linux doc Documentation/vm/hugetlbpage.txt
353// @bucket_entries_logging_threshold: if number of entries in one bucket
354// exceeds this number, log about it.
355// @if_log_bucket_dist_when_flash: if true, log distribution of number of
356// entries when flushing.
357// @threshold_use_skiplist: a bucket switches to skip list if number of
358// entries exceed this parameter.
359extern MemTableRepFactory* NewHashLinkListRepFactory(
360 size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
361 int bucket_entries_logging_threshold = 4096,
362 bool if_log_bucket_dist_when_flash = true,
363 uint32_t threshold_use_skiplist = 256);
364
365// This factory creates a cuckoo-hashing based mem-table representation.
366// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs
11fdf7f2 367// are stored in the bucket array itself instead of in some data structures
7c673cae
FG
368// external to the bucket array. In addition, each key in cuckoo hash
369// has a constant number of possible buckets in the bucket array. These
// two properties together make cuckoo hash more memory efficient and
// give it a constant worst-case read time. Cuckoo hash is best suited
// for point-lookup workloads.
373//
374// When inserting a key / value, it first checks whether one of its possible
375// buckets is empty. If so, the key / value will be inserted to that vacant
376// bucket. Otherwise, one of the keys originally stored in one of these
377// possible buckets will be "kicked out" and move to one of its possible
378// buckets (and possibly kicks out another victim.) In the current
379// implementation, such "kick-out" path is bounded. If it cannot find a
380// "kick-out" path for a specific key, this key will be stored in a backup
// structure, and the current memtable will be forced to become immutable.
382//
383// Note that currently this mem-table representation does not support
384// snapshot (i.e., it only queries latest state) and iterators. In addition,
385// MultiGet operation might also lose its atomicity due to the lack of
386// snapshot support.
387//
388// Parameters:
389// write_buffer_size: the write buffer size in bytes.
390// average_data_size: the average size of key + value in bytes. This value
391// together with write_buffer_size will be used to compute the number
392// of buckets.
393// hash_function_count: the number of hash functions that will be used by
394// the cuckoo-hash. The number also equals to the number of possible
395// buckets each key will have.
396extern MemTableRepFactory* NewHashCuckooRepFactory(
397 size_t write_buffer_size, size_t average_data_size = 64,
398 unsigned int hash_function_count = 4);
399#endif // ROCKSDB_LITE
400} // namespace rocksdb