// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// This file contains the interface that must be implemented by any collection
// to be used as the backing store for a MemTable. Such a collection must
// satisfy the following properties:
//  (1) It does not store duplicate items.
//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
//      equality.
//  (3) It can be accessed concurrently by multiple readers and can support
//      writes during reads. However, it needn't support multiple concurrent
//      writes.
//  (4) Items are never deleted.
// The liberal use of assertions is encouraged to enforce (1).
//
// The factory will be passed a MemTableAllocator object when a new MemTableRep
// is requested.
//
// Users can implement their own memtable representations. We include three
// types built in:
//  - SkipListRep: This is the default; it is backed by a skip list.
//  - HashSkipListRep: The memtable rep that is best used for keys that are
//    structured like "prefix:suffix" where iteration within a prefix is
//    common and iteration across different prefixes is rare. It is backed by
//    a hash map where each bucket is a skip list.
//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
//    vector is sorted. It is intelligent about sorting; once MarkReadOnly()
//    has been called, the vector will only be sorted once. It is optimized for
//    random-write-heavy workloads.
//
// The hash-based and vector implementations are designed for situations in
// which iteration over the entire collection is rare since doing so requires
// all the keys to be copied into a sorted data structure.
36 | #pragma once | |
37 | ||
38 | #include <memory> | |
39 | #include <stdexcept> | |
40 | #include <stdint.h> | |
41 | #include <stdlib.h> | |
11fdf7f2 | 42 | #include <rocksdb/slice.h> |
7c673cae FG |
43 | |
44 | namespace rocksdb { | |
45 | ||
46 | class Arena; | |
11fdf7f2 | 47 | class Allocator; |
7c673cae | 48 | class LookupKey; |
7c673cae FG |
49 | class SliceTransform; |
50 | class Logger; | |
51 | ||
52 | typedef void* KeyHandle; | |
53 | ||
11fdf7f2 TL |
54 | extern Slice GetLengthPrefixedSlice(const char* data); |
55 | ||
7c673cae FG |
56 | class MemTableRep { |
57 | public: | |
58 | // KeyComparator provides a means to compare keys, which are internal keys | |
59 | // concatenated with values. | |
60 | class KeyComparator { | |
61 | public: | |
11fdf7f2 TL |
62 | typedef rocksdb::Slice DecodedType; |
63 | ||
64 | virtual DecodedType decode_key(const char* key) const { | |
65 | // The format of key is frozen and can be terated as a part of the API | |
66 | // contract. Refer to MemTable::Add for details. | |
67 | return GetLengthPrefixedSlice(key); | |
68 | } | |
69 | ||
7c673cae FG |
70 | // Compare a and b. Return a negative value if a is less than b, 0 if they |
71 | // are equal, and a positive value if a is greater than b | |
72 | virtual int operator()(const char* prefix_len_key1, | |
73 | const char* prefix_len_key2) const = 0; | |
74 | ||
75 | virtual int operator()(const char* prefix_len_key, | |
76 | const Slice& key) const = 0; | |
77 | ||
78 | virtual ~KeyComparator() { } | |
79 | }; | |
80 | ||
11fdf7f2 | 81 | explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {} |
7c673cae FG |
82 | |
83 | // Allocate a buf of len size for storing key. The idea is that a | |
84 | // specific memtable representation knows its underlying data structure | |
85 | // better. By allowing it to allocate memory, it can possibly put | |
86 | // correlated stuff in consecutive memory area to make processor | |
87 | // prefetching more efficient. | |
88 | virtual KeyHandle Allocate(const size_t len, char** buf); | |
89 | ||
90 | // Insert key into the collection. (The caller will pack key and value into a | |
91 | // single buffer and pass that in as the parameter to Insert). | |
92 | // REQUIRES: nothing that compares equal to key is currently in the | |
93 | // collection, and no concurrent modifications to the table in progress | |
94 | virtual void Insert(KeyHandle handle) = 0; | |
95 | ||
11fdf7f2 TL |
96 | // Same as ::Insert |
97 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
98 | // the <key, seq> already exists. | |
99 | virtual bool InsertKey(KeyHandle handle) { | |
100 | Insert(handle); | |
101 | return true; | |
102 | } | |
103 | ||
7c673cae FG |
104 | // Same as Insert(), but in additional pass a hint to insert location for |
105 | // the key. If hint points to nullptr, a new hint will be populated. | |
106 | // otherwise the hint will be updated to reflect the last insert location. | |
107 | // | |
108 | // Currently only skip-list based memtable implement the interface. Other | |
109 | // implementations will fallback to Insert() by default. | |
11fdf7f2 | 110 | virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) { |
7c673cae FG |
111 | // Ignore the hint by default. |
112 | Insert(handle); | |
113 | } | |
114 | ||
11fdf7f2 TL |
115 | // Same as ::InsertWithHint |
116 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
117 | // the <key, seq> already exists. | |
118 | virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) { | |
119 | InsertWithHint(handle, hint); | |
120 | return true; | |
121 | } | |
122 | ||
7c673cae | 123 | // Like Insert(handle), but may be called concurrent with other calls |
11fdf7f2 TL |
124 | // to InsertConcurrently for other handles. |
125 | // | |
126 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
127 | // the <key, seq> already exists. | |
128 | virtual void InsertConcurrently(KeyHandle handle); | |
129 | ||
130 | // Same as ::InsertConcurrently | |
131 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
132 | // the <key, seq> already exists. | |
133 | virtual bool InsertKeyConcurrently(KeyHandle handle) { | |
134 | InsertConcurrently(handle); | |
135 | return true; | |
7c673cae FG |
136 | } |
137 | ||
138 | // Returns true iff an entry that compares equal to key is in the collection. | |
139 | virtual bool Contains(const char* key) const = 0; | |
140 | ||
141 | // Notify this table rep that it will no longer be added to. By default, | |
142 | // does nothing. After MarkReadOnly() is called, this table rep will | |
143 | // not be written to (ie No more calls to Allocate(), Insert(), | |
144 | // or any writes done directly to entries accessed through the iterator.) | |
145 | virtual void MarkReadOnly() { } | |
146 | ||
11fdf7f2 TL |
147 | // Notify this table rep that it has been flushed to stable storage. |
148 | // By default, does nothing. | |
149 | // | |
150 | // Invariant: MarkReadOnly() is called, before MarkFlushed(). | |
151 | // Note that this method if overridden, should not run for an extended period | |
152 | // of time. Otherwise, RocksDB may be blocked. | |
153 | virtual void MarkFlushed() { } | |
154 | ||
7c673cae FG |
155 | // Look up key from the mem table, since the first key in the mem table whose |
156 | // user_key matches the one given k, call the function callback_func(), with | |
157 | // callback_args directly forwarded as the first parameter, and the mem table | |
158 | // key as the second parameter. If the return value is false, then terminates. | |
159 | // Otherwise, go through the next key. | |
160 | // | |
161 | // It's safe for Get() to terminate after having finished all the potential | |
162 | // key for the k.user_key(), or not. | |
163 | // | |
164 | // Default: | |
165 | // Get() function with a default value of dynamically construct an iterator, | |
166 | // seek and call the call back function. | |
167 | virtual void Get(const LookupKey& k, void* callback_args, | |
168 | bool (*callback_func)(void* arg, const char* entry)); | |
169 | ||
11fdf7f2 TL |
170 | virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, |
171 | const Slice& /*end_key*/) { | |
7c673cae FG |
172 | return 0; |
173 | } | |
174 | ||
175 | // Report an approximation of how much memory has been used other than memory | |
176 | // that was allocated through the allocator. Safe to call from any thread. | |
177 | virtual size_t ApproximateMemoryUsage() = 0; | |
178 | ||
179 | virtual ~MemTableRep() { } | |
180 | ||
181 | // Iteration over the contents of a skip collection | |
182 | class Iterator { | |
183 | public: | |
184 | // Initialize an iterator over the specified collection. | |
185 | // The returned iterator is not valid. | |
186 | // explicit Iterator(const MemTableRep* collection); | |
187 | virtual ~Iterator() {} | |
188 | ||
189 | // Returns true iff the iterator is positioned at a valid node. | |
190 | virtual bool Valid() const = 0; | |
191 | ||
192 | // Returns the key at the current position. | |
193 | // REQUIRES: Valid() | |
194 | virtual const char* key() const = 0; | |
195 | ||
196 | // Advances to the next position. | |
197 | // REQUIRES: Valid() | |
198 | virtual void Next() = 0; | |
199 | ||
200 | // Advances to the previous position. | |
201 | // REQUIRES: Valid() | |
202 | virtual void Prev() = 0; | |
203 | ||
204 | // Advance to the first entry with a key >= target | |
205 | virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; | |
206 | ||
207 | // retreat to the first entry with a key <= target | |
208 | virtual void SeekForPrev(const Slice& internal_key, | |
209 | const char* memtable_key) = 0; | |
210 | ||
211 | // Position at the first entry in collection. | |
212 | // Final state of iterator is Valid() iff collection is not empty. | |
213 | virtual void SeekToFirst() = 0; | |
214 | ||
215 | // Position at the last entry in collection. | |
216 | // Final state of iterator is Valid() iff collection is not empty. | |
217 | virtual void SeekToLast() = 0; | |
218 | }; | |
219 | ||
220 | // Return an iterator over the keys in this representation. | |
221 | // arena: If not null, the arena needs to be used to allocate the Iterator. | |
222 | // When destroying the iterator, the caller will not call "delete" | |
223 | // but Iterator::~Iterator() directly. The destructor needs to destroy | |
224 | // all the states but those allocated in arena. | |
225 | virtual Iterator* GetIterator(Arena* arena = nullptr) = 0; | |
226 | ||
227 | // Return an iterator that has a special Seek semantics. The result of | |
228 | // a Seek might only include keys with the same prefix as the target key. | |
229 | // arena: If not null, the arena is used to allocate the Iterator. | |
230 | // When destroying the iterator, the caller will not call "delete" | |
231 | // but Iterator::~Iterator() directly. The destructor needs to destroy | |
232 | // all the states but those allocated in arena. | |
233 | virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) { | |
234 | return GetIterator(arena); | |
235 | } | |
236 | ||
237 | // Return true if the current MemTableRep supports merge operator. | |
238 | // Default: true | |
239 | virtual bool IsMergeOperatorSupported() const { return true; } | |
240 | ||
241 | // Return true if the current MemTableRep supports snapshot | |
242 | // Default: true | |
243 | virtual bool IsSnapshotSupported() const { return true; } | |
244 | ||
245 | protected: | |
246 | // When *key is an internal key concatenated with the value, returns the | |
247 | // user key. | |
248 | virtual Slice UserKey(const char* key) const; | |
249 | ||
11fdf7f2 | 250 | Allocator* allocator_; |
7c673cae FG |
251 | }; |
252 | ||
253 | // This is the base class for all factories that are used by RocksDB to create | |
254 | // new MemTableRep objects | |
255 | class MemTableRepFactory { | |
256 | public: | |
257 | virtual ~MemTableRepFactory() {} | |
11fdf7f2 | 258 | |
7c673cae | 259 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 260 | Allocator*, const SliceTransform*, |
7c673cae | 261 | Logger* logger) = 0; |
11fdf7f2 TL |
262 | virtual MemTableRep* CreateMemTableRep( |
263 | const MemTableRep::KeyComparator& key_cmp, Allocator* allocator, | |
264 | const SliceTransform* slice_transform, Logger* logger, | |
265 | uint32_t /* column_family_id */) { | |
266 | return CreateMemTableRep(key_cmp, allocator, slice_transform, logger); | |
267 | } | |
268 | ||
7c673cae FG |
269 | virtual const char* Name() const = 0; |
270 | ||
271 | // Return true if the current MemTableRep supports concurrent inserts | |
272 | // Default: false | |
273 | virtual bool IsInsertConcurrentlySupported() const { return false; } | |
11fdf7f2 TL |
274 | |
275 | // Return true if the current MemTableRep supports detecting duplicate | |
276 | // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns | |
277 | // false when if the <key,seq> already exists. | |
278 | // Default: false | |
279 | virtual bool CanHandleDuplicatedKey() const { return false; } | |
7c673cae FG |
280 | }; |
281 | ||
282 | // This uses a skip list to store keys. It is the default. | |
283 | // | |
284 | // Parameters: | |
285 | // lookahead: If non-zero, each iterator's seek operation will start the | |
286 | // search from the previously visited record (doing at most 'lookahead' | |
287 | // steps). This is an optimization for the access pattern including many | |
288 | // seeks with consecutive keys. | |
289 | class SkipListFactory : public MemTableRepFactory { | |
290 | public: | |
291 | explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} | |
292 | ||
11fdf7f2 | 293 | using MemTableRepFactory::CreateMemTableRep; |
7c673cae | 294 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 295 | Allocator*, const SliceTransform*, |
7c673cae FG |
296 | Logger* logger) override; |
297 | virtual const char* Name() const override { return "SkipListFactory"; } | |
298 | ||
299 | bool IsInsertConcurrentlySupported() const override { return true; } | |
300 | ||
11fdf7f2 TL |
301 | bool CanHandleDuplicatedKey() const override { return true; } |
302 | ||
7c673cae FG |
303 | private: |
304 | const size_t lookahead_; | |
305 | }; | |
306 | ||
307 | #ifndef ROCKSDB_LITE | |
308 | // This creates MemTableReps that are backed by an std::vector. On iteration, | |
309 | // the vector is sorted. This is useful for workloads where iteration is very | |
310 | // rare and writes are generally not issued after reads begin. | |
311 | // | |
312 | // Parameters: | |
313 | // count: Passed to the constructor of the underlying std::vector of each | |
314 | // VectorRep. On initialization, the underlying array will be at least count | |
315 | // bytes reserved for usage. | |
316 | class VectorRepFactory : public MemTableRepFactory { | |
317 | const size_t count_; | |
318 | ||
319 | public: | |
320 | explicit VectorRepFactory(size_t count = 0) : count_(count) { } | |
11fdf7f2 TL |
321 | |
322 | using MemTableRepFactory::CreateMemTableRep; | |
7c673cae | 323 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 324 | Allocator*, const SliceTransform*, |
7c673cae | 325 | Logger* logger) override; |
11fdf7f2 | 326 | |
7c673cae FG |
327 | virtual const char* Name() const override { |
328 | return "VectorRepFactory"; | |
329 | } | |
330 | }; | |
331 | ||
332 | // This class contains a fixed array of buckets, each | |
333 | // pointing to a skiplist (null if the bucket is empty). | |
334 | // bucket_count: number of fixed array buckets | |
335 | // skiplist_height: the max height of the skiplist | |
336 | // skiplist_branching_factor: probabilistic size ratio between adjacent | |
337 | // link lists in the skiplist | |
338 | extern MemTableRepFactory* NewHashSkipListRepFactory( | |
339 | size_t bucket_count = 1000000, int32_t skiplist_height = 4, | |
340 | int32_t skiplist_branching_factor = 4 | |
341 | ); | |
342 | ||
343 | // The factory is to create memtables based on a hash table: | |
344 | // it contains a fixed array of buckets, each pointing to either a linked list | |
345 | // or a skip list if number of entries inside the bucket exceeds | |
346 | // threshold_use_skiplist. | |
347 | // @bucket_count: number of fixed array buckets | |
348 | // @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc. | |
349 | // Otherwise from huge page TLB. The user needs to reserve | |
350 | // huge pages for it to be allocated, like: | |
351 | // sysctl -w vm.nr_hugepages=20 | |
352 | // See linux doc Documentation/vm/hugetlbpage.txt | |
353 | // @bucket_entries_logging_threshold: if number of entries in one bucket | |
354 | // exceeds this number, log about it. | |
355 | // @if_log_bucket_dist_when_flash: if true, log distribution of number of | |
356 | // entries when flushing. | |
357 | // @threshold_use_skiplist: a bucket switches to skip list if number of | |
358 | // entries exceed this parameter. | |
359 | extern MemTableRepFactory* NewHashLinkListRepFactory( | |
360 | size_t bucket_count = 50000, size_t huge_page_tlb_size = 0, | |
361 | int bucket_entries_logging_threshold = 4096, | |
362 | bool if_log_bucket_dist_when_flash = true, | |
363 | uint32_t threshold_use_skiplist = 256); | |
364 | ||
365 | // This factory creates a cuckoo-hashing based mem-table representation. | |
366 | // Cuckoo-hash is a closed-hash strategy, in which all key/value pairs | |
11fdf7f2 | 367 | // are stored in the bucket array itself instead of in some data structures |
7c673cae FG |
368 | // external to the bucket array. In addition, each key in cuckoo hash |
369 | // has a constant number of possible buckets in the bucket array. These | |
370 | // two properties together makes cuckoo hash more memory efficient and | |
371 | // a constant worst-case read time. Cuckoo hash is best suitable for | |
372 | // point-lookup workload. | |
373 | // | |
374 | // When inserting a key / value, it first checks whether one of its possible | |
375 | // buckets is empty. If so, the key / value will be inserted to that vacant | |
376 | // bucket. Otherwise, one of the keys originally stored in one of these | |
377 | // possible buckets will be "kicked out" and move to one of its possible | |
378 | // buckets (and possibly kicks out another victim.) In the current | |
379 | // implementation, such "kick-out" path is bounded. If it cannot find a | |
380 | // "kick-out" path for a specific key, this key will be stored in a backup | |
381 | // structure, and the current memtable to be forced to immutable. | |
382 | // | |
383 | // Note that currently this mem-table representation does not support | |
384 | // snapshot (i.e., it only queries latest state) and iterators. In addition, | |
385 | // MultiGet operation might also lose its atomicity due to the lack of | |
386 | // snapshot support. | |
387 | // | |
388 | // Parameters: | |
389 | // write_buffer_size: the write buffer size in bytes. | |
390 | // average_data_size: the average size of key + value in bytes. This value | |
391 | // together with write_buffer_size will be used to compute the number | |
392 | // of buckets. | |
393 | // hash_function_count: the number of hash functions that will be used by | |
394 | // the cuckoo-hash. The number also equals to the number of possible | |
395 | // buckets each key will have. | |
396 | extern MemTableRepFactory* NewHashCuckooRepFactory( | |
397 | size_t write_buffer_size, size_t average_data_size = 64, | |
398 | unsigned int hash_function_count = 4); | |
399 | #endif // ROCKSDB_LITE | |
400 | } // namespace rocksdb |