]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // This file contains the interface that must be implemented by any collection | |
7 | // to be used as the backing store for a MemTable. Such a collection must | |
8 | // satisfy the following properties: | |
9 | // (1) It does not store duplicate items. | |
10 | // (2) It uses MemTableRep::KeyComparator to compare items for iteration and | |
11 | // equality. | |
12 | // (3) It can be accessed concurrently by multiple readers and can support | |
13 | // concurrent reads. However, it needn't support multiple concurrent writes. | |
14 | // (4) Items are never deleted. | |
15 | // The liberal use of assertions is encouraged to enforce (1). | |
16 | // | |
17 | // The factory will be passed a MemTableAllocator object when a new MemTableRep | |
18 | // is requested. | |
19 | // | |
20 | // Users can implement their own memtable representations. We include three | |
21 | // types built in: | |
22 | // - SkipListRep: This is the default; it is backed by a skip list. | |
23 | // - HashSkipListRep: The memtable rep that is best used for keys that are | |
24 | // structured like "prefix:suffix" where iteration within a prefix is | |
25 | // common and iteration across different prefixes is rare. It is backed by | |
26 | // a hash map where each bucket is a skip list. | |
27 | // - VectorRep: This is backed by an unordered std::vector. On iteration, the | |
28 | // vector is sorted. It is intelligent about sorting; once the MarkReadOnly() | |
29 | // has been called, the vector will only be sorted once. It is optimized for | |
30 | // random-write-heavy workloads. | |
31 | // | |
32 | // The last two implementations are designed for situations in which | |
33 | // iteration over the entire collection is rare since doing so requires all the | |
34 | // keys to be copied into a sorted data structure. | |
35 | ||
36 | #pragma once | |
37 | ||
494da23a | 38 | #include <rocksdb/slice.h> |
7c673cae FG |
39 | #include <stdint.h> |
40 | #include <stdlib.h> | |
494da23a TL |
41 | #include <memory> |
42 | #include <stdexcept> | |
7c673cae FG |
43 | |
44 | namespace rocksdb { | |
45 | ||
46 | class Arena; | |
11fdf7f2 | 47 | class Allocator; |
7c673cae | 48 | class LookupKey; |
7c673cae FG |
49 | class SliceTransform; |
50 | class Logger; | |
51 | ||
52 | typedef void* KeyHandle; | |
53 | ||
11fdf7f2 TL |
54 | extern Slice GetLengthPrefixedSlice(const char* data); |
55 | ||
7c673cae FG |
56 | class MemTableRep { |
57 | public: | |
58 | // KeyComparator provides a means to compare keys, which are internal keys | |
59 | // concatenated with values. | |
60 | class KeyComparator { | |
61 | public: | |
11fdf7f2 TL |
62 | typedef rocksdb::Slice DecodedType; |
63 | ||
64 | virtual DecodedType decode_key(const char* key) const { | |
65 | // The format of key is frozen and can be terated as a part of the API | |
66 | // contract. Refer to MemTable::Add for details. | |
67 | return GetLengthPrefixedSlice(key); | |
68 | } | |
69 | ||
7c673cae FG |
70 | // Compare a and b. Return a negative value if a is less than b, 0 if they |
71 | // are equal, and a positive value if a is greater than b | |
72 | virtual int operator()(const char* prefix_len_key1, | |
73 | const char* prefix_len_key2) const = 0; | |
74 | ||
75 | virtual int operator()(const char* prefix_len_key, | |
76 | const Slice& key) const = 0; | |
77 | ||
494da23a | 78 | virtual ~KeyComparator() {} |
7c673cae FG |
79 | }; |
80 | ||
11fdf7f2 | 81 | explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {} |
7c673cae FG |
82 | |
83 | // Allocate a buf of len size for storing key. The idea is that a | |
84 | // specific memtable representation knows its underlying data structure | |
85 | // better. By allowing it to allocate memory, it can possibly put | |
86 | // correlated stuff in consecutive memory area to make processor | |
87 | // prefetching more efficient. | |
88 | virtual KeyHandle Allocate(const size_t len, char** buf); | |
89 | ||
90 | // Insert key into the collection. (The caller will pack key and value into a | |
91 | // single buffer and pass that in as the parameter to Insert). | |
92 | // REQUIRES: nothing that compares equal to key is currently in the | |
93 | // collection, and no concurrent modifications to the table in progress | |
94 | virtual void Insert(KeyHandle handle) = 0; | |
95 | ||
11fdf7f2 TL |
96 | // Same as ::Insert |
97 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
98 | // the <key, seq> already exists. | |
99 | virtual bool InsertKey(KeyHandle handle) { | |
100 | Insert(handle); | |
101 | return true; | |
102 | } | |
103 | ||
7c673cae FG |
104 | // Same as Insert(), but in additional pass a hint to insert location for |
105 | // the key. If hint points to nullptr, a new hint will be populated. | |
106 | // otherwise the hint will be updated to reflect the last insert location. | |
107 | // | |
108 | // Currently only skip-list based memtable implement the interface. Other | |
109 | // implementations will fallback to Insert() by default. | |
11fdf7f2 | 110 | virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) { |
7c673cae FG |
111 | // Ignore the hint by default. |
112 | Insert(handle); | |
113 | } | |
114 | ||
11fdf7f2 TL |
115 | // Same as ::InsertWithHint |
116 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
117 | // the <key, seq> already exists. | |
118 | virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) { | |
119 | InsertWithHint(handle, hint); | |
120 | return true; | |
121 | } | |
122 | ||
7c673cae | 123 | // Like Insert(handle), but may be called concurrent with other calls |
11fdf7f2 TL |
124 | // to InsertConcurrently for other handles. |
125 | // | |
126 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
127 | // the <key, seq> already exists. | |
128 | virtual void InsertConcurrently(KeyHandle handle); | |
129 | ||
130 | // Same as ::InsertConcurrently | |
131 | // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and | |
132 | // the <key, seq> already exists. | |
133 | virtual bool InsertKeyConcurrently(KeyHandle handle) { | |
134 | InsertConcurrently(handle); | |
135 | return true; | |
7c673cae FG |
136 | } |
137 | ||
138 | // Returns true iff an entry that compares equal to key is in the collection. | |
139 | virtual bool Contains(const char* key) const = 0; | |
140 | ||
141 | // Notify this table rep that it will no longer be added to. By default, | |
142 | // does nothing. After MarkReadOnly() is called, this table rep will | |
143 | // not be written to (ie No more calls to Allocate(), Insert(), | |
144 | // or any writes done directly to entries accessed through the iterator.) | |
494da23a | 145 | virtual void MarkReadOnly() {} |
7c673cae | 146 | |
11fdf7f2 TL |
147 | // Notify this table rep that it has been flushed to stable storage. |
148 | // By default, does nothing. | |
149 | // | |
150 | // Invariant: MarkReadOnly() is called, before MarkFlushed(). | |
151 | // Note that this method if overridden, should not run for an extended period | |
152 | // of time. Otherwise, RocksDB may be blocked. | |
494da23a | 153 | virtual void MarkFlushed() {} |
11fdf7f2 | 154 | |
7c673cae FG |
155 | // Look up key from the mem table, since the first key in the mem table whose |
156 | // user_key matches the one given k, call the function callback_func(), with | |
157 | // callback_args directly forwarded as the first parameter, and the mem table | |
158 | // key as the second parameter. If the return value is false, then terminates. | |
159 | // Otherwise, go through the next key. | |
160 | // | |
161 | // It's safe for Get() to terminate after having finished all the potential | |
162 | // key for the k.user_key(), or not. | |
163 | // | |
164 | // Default: | |
165 | // Get() function with a default value of dynamically construct an iterator, | |
166 | // seek and call the call back function. | |
167 | virtual void Get(const LookupKey& k, void* callback_args, | |
168 | bool (*callback_func)(void* arg, const char* entry)); | |
169 | ||
11fdf7f2 TL |
170 | virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, |
171 | const Slice& /*end_key*/) { | |
7c673cae FG |
172 | return 0; |
173 | } | |
174 | ||
175 | // Report an approximation of how much memory has been used other than memory | |
176 | // that was allocated through the allocator. Safe to call from any thread. | |
177 | virtual size_t ApproximateMemoryUsage() = 0; | |
178 | ||
494da23a | 179 | virtual ~MemTableRep() {} |
7c673cae FG |
180 | |
181 | // Iteration over the contents of a skip collection | |
182 | class Iterator { | |
183 | public: | |
184 | // Initialize an iterator over the specified collection. | |
185 | // The returned iterator is not valid. | |
186 | // explicit Iterator(const MemTableRep* collection); | |
187 | virtual ~Iterator() {} | |
188 | ||
189 | // Returns true iff the iterator is positioned at a valid node. | |
190 | virtual bool Valid() const = 0; | |
191 | ||
192 | // Returns the key at the current position. | |
193 | // REQUIRES: Valid() | |
194 | virtual const char* key() const = 0; | |
195 | ||
196 | // Advances to the next position. | |
197 | // REQUIRES: Valid() | |
198 | virtual void Next() = 0; | |
199 | ||
200 | // Advances to the previous position. | |
201 | // REQUIRES: Valid() | |
202 | virtual void Prev() = 0; | |
203 | ||
204 | // Advance to the first entry with a key >= target | |
205 | virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; | |
206 | ||
207 | // retreat to the first entry with a key <= target | |
208 | virtual void SeekForPrev(const Slice& internal_key, | |
209 | const char* memtable_key) = 0; | |
210 | ||
211 | // Position at the first entry in collection. | |
212 | // Final state of iterator is Valid() iff collection is not empty. | |
213 | virtual void SeekToFirst() = 0; | |
214 | ||
215 | // Position at the last entry in collection. | |
216 | // Final state of iterator is Valid() iff collection is not empty. | |
217 | virtual void SeekToLast() = 0; | |
218 | }; | |
219 | ||
220 | // Return an iterator over the keys in this representation. | |
221 | // arena: If not null, the arena needs to be used to allocate the Iterator. | |
222 | // When destroying the iterator, the caller will not call "delete" | |
223 | // but Iterator::~Iterator() directly. The destructor needs to destroy | |
224 | // all the states but those allocated in arena. | |
225 | virtual Iterator* GetIterator(Arena* arena = nullptr) = 0; | |
226 | ||
227 | // Return an iterator that has a special Seek semantics. The result of | |
228 | // a Seek might only include keys with the same prefix as the target key. | |
229 | // arena: If not null, the arena is used to allocate the Iterator. | |
230 | // When destroying the iterator, the caller will not call "delete" | |
231 | // but Iterator::~Iterator() directly. The destructor needs to destroy | |
232 | // all the states but those allocated in arena. | |
233 | virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) { | |
234 | return GetIterator(arena); | |
235 | } | |
236 | ||
237 | // Return true if the current MemTableRep supports merge operator. | |
238 | // Default: true | |
239 | virtual bool IsMergeOperatorSupported() const { return true; } | |
240 | ||
241 | // Return true if the current MemTableRep supports snapshot | |
242 | // Default: true | |
243 | virtual bool IsSnapshotSupported() const { return true; } | |
244 | ||
245 | protected: | |
246 | // When *key is an internal key concatenated with the value, returns the | |
247 | // user key. | |
248 | virtual Slice UserKey(const char* key) const; | |
249 | ||
11fdf7f2 | 250 | Allocator* allocator_; |
7c673cae FG |
251 | }; |
252 | ||
253 | // This is the base class for all factories that are used by RocksDB to create | |
254 | // new MemTableRep objects | |
255 | class MemTableRepFactory { | |
256 | public: | |
257 | virtual ~MemTableRepFactory() {} | |
11fdf7f2 | 258 | |
7c673cae | 259 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 260 | Allocator*, const SliceTransform*, |
7c673cae | 261 | Logger* logger) = 0; |
11fdf7f2 TL |
262 | virtual MemTableRep* CreateMemTableRep( |
263 | const MemTableRep::KeyComparator& key_cmp, Allocator* allocator, | |
264 | const SliceTransform* slice_transform, Logger* logger, | |
265 | uint32_t /* column_family_id */) { | |
266 | return CreateMemTableRep(key_cmp, allocator, slice_transform, logger); | |
267 | } | |
268 | ||
7c673cae FG |
269 | virtual const char* Name() const = 0; |
270 | ||
271 | // Return true if the current MemTableRep supports concurrent inserts | |
272 | // Default: false | |
273 | virtual bool IsInsertConcurrentlySupported() const { return false; } | |
11fdf7f2 TL |
274 | |
275 | // Return true if the current MemTableRep supports detecting duplicate | |
276 | // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns | |
277 | // false when if the <key,seq> already exists. | |
278 | // Default: false | |
279 | virtual bool CanHandleDuplicatedKey() const { return false; } | |
7c673cae FG |
280 | }; |
281 | ||
282 | // This uses a skip list to store keys. It is the default. | |
283 | // | |
284 | // Parameters: | |
285 | // lookahead: If non-zero, each iterator's seek operation will start the | |
286 | // search from the previously visited record (doing at most 'lookahead' | |
287 | // steps). This is an optimization for the access pattern including many | |
288 | // seeks with consecutive keys. | |
289 | class SkipListFactory : public MemTableRepFactory { | |
290 | public: | |
291 | explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} | |
292 | ||
11fdf7f2 | 293 | using MemTableRepFactory::CreateMemTableRep; |
7c673cae | 294 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 295 | Allocator*, const SliceTransform*, |
7c673cae FG |
296 | Logger* logger) override; |
297 | virtual const char* Name() const override { return "SkipListFactory"; } | |
298 | ||
299 | bool IsInsertConcurrentlySupported() const override { return true; } | |
300 | ||
11fdf7f2 TL |
301 | bool CanHandleDuplicatedKey() const override { return true; } |
302 | ||
7c673cae FG |
303 | private: |
304 | const size_t lookahead_; | |
305 | }; | |
306 | ||
307 | #ifndef ROCKSDB_LITE | |
308 | // This creates MemTableReps that are backed by an std::vector. On iteration, | |
309 | // the vector is sorted. This is useful for workloads where iteration is very | |
310 | // rare and writes are generally not issued after reads begin. | |
311 | // | |
312 | // Parameters: | |
313 | // count: Passed to the constructor of the underlying std::vector of each | |
314 | // VectorRep. On initialization, the underlying array will be at least count | |
315 | // bytes reserved for usage. | |
316 | class VectorRepFactory : public MemTableRepFactory { | |
317 | const size_t count_; | |
318 | ||
319 | public: | |
494da23a | 320 | explicit VectorRepFactory(size_t count = 0) : count_(count) {} |
11fdf7f2 TL |
321 | |
322 | using MemTableRepFactory::CreateMemTableRep; | |
7c673cae | 323 | virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, |
11fdf7f2 | 324 | Allocator*, const SliceTransform*, |
7c673cae | 325 | Logger* logger) override; |
11fdf7f2 | 326 | |
494da23a | 327 | virtual const char* Name() const override { return "VectorRepFactory"; } |
7c673cae FG |
328 | }; |
329 | ||
// Creates a memtable rep factory backed by a fixed array of buckets, each
// pointing to a skiplist (null if the bucket is empty). Per the file-header
// comment, this rep suits "prefix:suffix"-structured keys where iteration
// within a prefix is common and iteration across prefixes is rare.
// bucket_count: number of fixed array buckets
// skiplist_height: the max height of the skiplist
// skiplist_branching_factor: probabilistic size ratio between adjacent
//   link lists in the skiplist
extern MemTableRepFactory* NewHashSkipListRepFactory(
    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
    int32_t skiplist_branching_factor = 4);

// The factory is to create memtables based on a hash table:
// it contains a fixed array of buckets, each pointing to either a linked list
// or a skip list if the number of entries inside the bucket exceeds
// threshold_use_skiplist.
// @bucket_count: number of fixed array buckets
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
//   Otherwise from huge page TLB. The user needs to reserve
//   huge pages for it to be allocated, like:
//     sysctl -w vm.nr_hugepages=20
//   See linux doc Documentation/vm/hugetlbpage.txt
// @bucket_entries_logging_threshold: if the number of entries in one bucket
//   exceeds this number, log about it.
// @if_log_bucket_dist_when_flash: if true, log the distribution of the number
//   of entries when flushing. (NOTE(review): "flash" is presumably a typo for
//   "flush", but the parameter name is part of the public API.)
// @threshold_use_skiplist: a bucket switches to skip list if the number of
//   entries exceeds this parameter.
extern MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
    int bucket_entries_logging_threshold = 4096,
    bool if_log_bucket_dist_when_flash = true,
    uint32_t threshold_use_skiplist = 256);
361 | ||
7c673cae FG |
362 | #endif // ROCKSDB_LITE |
363 | } // namespace rocksdb |