]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | // This source code is licensed under the BSD-style license found in the | |
3 | // LICENSE file in the root directory of this source tree. An additional grant | |
4 | // of patent rights can be found in the PATENTS file in the same directory. | |
5 | // | |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | #include <atomic> | |
12 | #include <deque> | |
13 | #include <functional> | |
14 | #include <memory> | |
15 | #include <string> | |
16 | #include <unordered_map> | |
17 | #include <vector> | |
18 | #include "db/dbformat.h" | |
19 | #include "db/range_del_aggregator.h" | |
20 | #include "db/version_edit.h" | |
21 | #include "memtable/memtable_allocator.h" | |
22 | #include "monitoring/instrumented_mutex.h" | |
23 | #include "options/cf_options.h" | |
24 | #include "rocksdb/db.h" | |
25 | #include "rocksdb/env.h" | |
26 | #include "rocksdb/memtablerep.h" | |
27 | #include "util/concurrent_arena.h" | |
28 | #include "util/dynamic_bloom.h" | |
29 | #include "util/hash.h" | |
30 | ||
31 | namespace rocksdb { | |
32 | ||
33 | class Mutex; | |
34 | class MemTableIterator; | |
35 | class MergeContext; | |
36 | class InternalIterator; | |
37 | ||
38 | struct MemTableOptions { | |
39 | explicit MemTableOptions( | |
40 | const ImmutableCFOptions& ioptions, | |
41 | const MutableCFOptions& mutable_cf_options); | |
42 | size_t write_buffer_size; | |
43 | size_t arena_block_size; | |
44 | uint32_t memtable_prefix_bloom_bits; | |
45 | size_t memtable_huge_page_size; | |
46 | bool inplace_update_support; | |
47 | size_t inplace_update_num_locks; | |
48 | UpdateStatus (*inplace_callback)(char* existing_value, | |
49 | uint32_t* existing_value_size, | |
50 | Slice delta_value, | |
51 | std::string* merged_value); | |
52 | size_t max_successive_merges; | |
53 | Statistics* statistics; | |
54 | MergeOperator* merge_operator; | |
55 | Logger* info_log; | |
56 | }; | |
57 | ||
// Batched counters to be updated when inserting keys in one write batch.
// In post process of the write batch, these can be updated together.
// Only used in the concurrent memtable insert case.
struct MemTablePostProcessInfo {
  uint64_t data_size = 0;    // total bytes of key/value data inserted
  uint64_t num_entries = 0;  // number of entries inserted
  uint64_t num_deletes = 0;  // number of deletion entries inserted
};
66 | ||
67 | // Note: Many of the methods in this class have comments indicating that | |
68 | // external synchromization is required as these methods are not thread-safe. | |
69 | // It is up to higher layers of code to decide how to prevent concurrent | |
70 | // invokation of these methods. This is usually done by acquiring either | |
71 | // the db mutex or the single writer thread. | |
72 | // | |
73 | // Some of these methods are documented to only require external | |
74 | // synchronization if this memtable is immutable. Calling MarkImmutable() is | |
75 | // not sufficient to guarantee immutability. It is up to higher layers of | |
76 | // code to determine if this MemTable can still be modified by other threads. | |
77 | // Eg: The Superversion stores a pointer to the current MemTable (that can | |
78 | // be modified) and a separate list of the MemTables that can no longer be | |
79 | // written to (aka the 'immutable memtables'). | |
80 | class MemTable { | |
81 | public: | |
82 | struct KeyComparator : public MemTableRep::KeyComparator { | |
83 | const InternalKeyComparator comparator; | |
84 | explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } | |
85 | virtual int operator()(const char* prefix_len_key1, | |
86 | const char* prefix_len_key2) const override; | |
87 | virtual int operator()(const char* prefix_len_key, | |
88 | const Slice& key) const override; | |
89 | }; | |
90 | ||
91 | // MemTables are reference counted. The initial reference count | |
92 | // is zero and the caller must call Ref() at least once. | |
93 | // | |
94 | // earliest_seq should be the current SequenceNumber in the db such that any | |
95 | // key inserted into this memtable will have an equal or larger seq number. | |
96 | // (When a db is first created, the earliest sequence number will be 0). | |
97 | // If the earliest sequence number is not known, kMaxSequenceNumber may be | |
98 | // used, but this may prevent some transactions from succeeding until the | |
99 | // first key is inserted into the memtable. | |
100 | explicit MemTable(const InternalKeyComparator& comparator, | |
101 | const ImmutableCFOptions& ioptions, | |
102 | const MutableCFOptions& mutable_cf_options, | |
103 | WriteBufferManager* write_buffer_manager, | |
104 | SequenceNumber earliest_seq); | |
105 | ||
106 | // Do not delete this MemTable unless Unref() indicates it not in use. | |
107 | ~MemTable(); | |
108 | ||
109 | // Increase reference count. | |
110 | // REQUIRES: external synchronization to prevent simultaneous | |
111 | // operations on the same MemTable. | |
112 | void Ref() { ++refs_; } | |
113 | ||
114 | // Drop reference count. | |
115 | // If the refcount goes to zero return this memtable, otherwise return null. | |
116 | // REQUIRES: external synchronization to prevent simultaneous | |
117 | // operations on the same MemTable. | |
118 | MemTable* Unref() { | |
119 | --refs_; | |
120 | assert(refs_ >= 0); | |
121 | if (refs_ <= 0) { | |
122 | return this; | |
123 | } | |
124 | return nullptr; | |
125 | } | |
126 | ||
127 | // Returns an estimate of the number of bytes of data in use by this | |
128 | // data structure. | |
129 | // | |
130 | // REQUIRES: external synchronization to prevent simultaneous | |
131 | // operations on the same MemTable (unless this Memtable is immutable). | |
132 | size_t ApproximateMemoryUsage(); | |
133 | ||
134 | // This method heuristically determines if the memtable should continue to | |
135 | // host more data. | |
136 | bool ShouldScheduleFlush() const { | |
137 | return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED; | |
138 | } | |
139 | ||
140 | // Returns true if a flush should be scheduled and the caller should | |
141 | // be the one to schedule it | |
142 | bool MarkFlushScheduled() { | |
143 | auto before = FLUSH_REQUESTED; | |
144 | return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED, | |
145 | std::memory_order_relaxed, | |
146 | std::memory_order_relaxed); | |
147 | } | |
148 | ||
149 | // Return an iterator that yields the contents of the memtable. | |
150 | // | |
151 | // The caller must ensure that the underlying MemTable remains live | |
152 | // while the returned iterator is live. The keys returned by this | |
153 | // iterator are internal keys encoded by AppendInternalKey in the | |
154 | // db/dbformat.{h,cc} module. | |
155 | // | |
156 | // By default, it returns an iterator for prefix seek if prefix_extractor | |
157 | // is configured in Options. | |
158 | // arena: If not null, the arena needs to be used to allocate the Iterator. | |
159 | // Calling ~Iterator of the iterator will destroy all the states but | |
160 | // those allocated in arena. | |
161 | InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena); | |
162 | ||
163 | InternalIterator* NewRangeTombstoneIterator(const ReadOptions& read_options); | |
164 | ||
165 | // Add an entry into memtable that maps key to value at the | |
166 | // specified sequence number and with the specified type. | |
167 | // Typically value will be empty if type==kTypeDeletion. | |
168 | // | |
169 | // REQUIRES: if allow_concurrent = false, external synchronization to prevent | |
170 | // simultaneous operations on the same MemTable. | |
171 | void Add(SequenceNumber seq, ValueType type, const Slice& key, | |
172 | const Slice& value, bool allow_concurrent = false, | |
173 | MemTablePostProcessInfo* post_process_info = nullptr); | |
174 | ||
175 | // If memtable contains a value for key, store it in *value and return true. | |
176 | // If memtable contains a deletion for key, store a NotFound() error | |
177 | // in *status and return true. | |
178 | // If memtable contains Merge operation as the most recent entry for a key, | |
179 | // and the merge process does not stop (not reaching a value or delete), | |
180 | // prepend the current merge operand to *operands. | |
181 | // store MergeInProgress in s, and return false. | |
182 | // Else, return false. | |
183 | // If any operation was found, its most recent sequence number | |
184 | // will be stored in *seq on success (regardless of whether true/false is | |
185 | // returned). Otherwise, *seq will be set to kMaxSequenceNumber. | |
186 | // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other | |
187 | // status returned indicates a corruption or other unexpected error. | |
188 | bool Get(const LookupKey& key, std::string* value, Status* s, | |
189 | MergeContext* merge_context, RangeDelAggregator* range_del_agg, | |
190 | SequenceNumber* seq, const ReadOptions& read_opts); | |
191 | ||
192 | bool Get(const LookupKey& key, std::string* value, Status* s, | |
193 | MergeContext* merge_context, RangeDelAggregator* range_del_agg, | |
194 | const ReadOptions& read_opts) { | |
195 | SequenceNumber seq; | |
196 | return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts); | |
197 | } | |
198 | ||
199 | // Attempts to update the new_value inplace, else does normal Add | |
200 | // Pseudocode | |
201 | // if key exists in current memtable && prev_value is of type kTypeValue | |
202 | // if new sizeof(new_value) <= sizeof(prev_value) | |
203 | // update inplace | |
204 | // else add(key, new_value) | |
205 | // else add(key, new_value) | |
206 | // | |
207 | // REQUIRES: external synchronization to prevent simultaneous | |
208 | // operations on the same MemTable. | |
209 | void Update(SequenceNumber seq, | |
210 | const Slice& key, | |
211 | const Slice& value); | |
212 | ||
213 | // If prev_value for key exists, attempts to update it inplace. | |
214 | // else returns false | |
215 | // Pseudocode | |
216 | // if key exists in current memtable && prev_value is of type kTypeValue | |
217 | // new_value = delta(prev_value) | |
218 | // if sizeof(new_value) <= sizeof(prev_value) | |
219 | // update inplace | |
220 | // else add(key, new_value) | |
221 | // else return false | |
222 | // | |
223 | // REQUIRES: external synchronization to prevent simultaneous | |
224 | // operations on the same MemTable. | |
225 | bool UpdateCallback(SequenceNumber seq, | |
226 | const Slice& key, | |
227 | const Slice& delta); | |
228 | ||
229 | // Returns the number of successive merge entries starting from the newest | |
230 | // entry for the key up to the last non-merge entry or last entry for the | |
231 | // key in the memtable. | |
232 | size_t CountSuccessiveMergeEntries(const LookupKey& key); | |
233 | ||
234 | // Update counters and flush status after inserting a whole write batch | |
235 | // Used in concurrent memtable inserts. | |
236 | void BatchPostProcess(const MemTablePostProcessInfo& update_counters) { | |
237 | num_entries_.fetch_add(update_counters.num_entries, | |
238 | std::memory_order_relaxed); | |
239 | data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed); | |
240 | if (update_counters.num_deletes != 0) { | |
241 | num_deletes_.fetch_add(update_counters.num_deletes, | |
242 | std::memory_order_relaxed); | |
243 | } | |
244 | UpdateFlushState(); | |
245 | } | |
246 | ||
247 | // Get total number of entries in the mem table. | |
248 | // REQUIRES: external synchronization to prevent simultaneous | |
249 | // operations on the same MemTable (unless this Memtable is immutable). | |
250 | uint64_t num_entries() const { | |
251 | return num_entries_.load(std::memory_order_relaxed); | |
252 | } | |
253 | ||
254 | // Get total number of deletes in the mem table. | |
255 | // REQUIRES: external synchronization to prevent simultaneous | |
256 | // operations on the same MemTable (unless this Memtable is immutable). | |
257 | uint64_t num_deletes() const { | |
258 | return num_deletes_.load(std::memory_order_relaxed); | |
259 | } | |
260 | ||
261 | // Returns the edits area that is needed for flushing the memtable | |
262 | VersionEdit* GetEdits() { return &edit_; } | |
263 | ||
264 | // Returns if there is no entry inserted to the mem table. | |
265 | // REQUIRES: external synchronization to prevent simultaneous | |
266 | // operations on the same MemTable (unless this Memtable is immutable). | |
267 | bool IsEmpty() const { return first_seqno_ == 0; } | |
268 | ||
269 | // Returns the sequence number of the first element that was inserted | |
270 | // into the memtable. | |
271 | // REQUIRES: external synchronization to prevent simultaneous | |
272 | // operations on the same MemTable (unless this Memtable is immutable). | |
273 | SequenceNumber GetFirstSequenceNumber() { | |
274 | return first_seqno_.load(std::memory_order_relaxed); | |
275 | } | |
276 | ||
277 | // Returns the sequence number that is guaranteed to be smaller than or equal | |
278 | // to the sequence number of any key that could be inserted into this | |
279 | // memtable. It can then be assumed that any write with a larger(or equal) | |
280 | // sequence number will be present in this memtable or a later memtable. | |
281 | // | |
282 | // If the earliest sequence number could not be determined, | |
283 | // kMaxSequenceNumber will be returned. | |
284 | SequenceNumber GetEarliestSequenceNumber() { | |
285 | return earliest_seqno_.load(std::memory_order_relaxed); | |
286 | } | |
287 | ||
288 | // DB's latest sequence ID when the memtable is created. This number | |
289 | // may be updated to a more recent one before any key is inserted. | |
290 | SequenceNumber GetCreationSeq() const { return creation_seq_; } | |
291 | ||
292 | void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; } | |
293 | ||
294 | // Returns the next active logfile number when this memtable is about to | |
295 | // be flushed to storage | |
296 | // REQUIRES: external synchronization to prevent simultaneous | |
297 | // operations on the same MemTable. | |
298 | uint64_t GetNextLogNumber() { return mem_next_logfile_number_; } | |
299 | ||
300 | // Sets the next active logfile number when this memtable is about to | |
301 | // be flushed to storage | |
302 | // REQUIRES: external synchronization to prevent simultaneous | |
303 | // operations on the same MemTable. | |
304 | void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } | |
305 | ||
306 | // if this memtable contains data from a committed | |
307 | // two phase transaction we must take note of the | |
308 | // log which contains that data so we can know | |
309 | // when to relese that log | |
310 | void RefLogContainingPrepSection(uint64_t log); | |
311 | uint64_t GetMinLogContainingPrepSection(); | |
312 | ||
313 | // Notify the underlying storage that no more items will be added. | |
314 | // REQUIRES: external synchronization to prevent simultaneous | |
315 | // operations on the same MemTable. | |
316 | // After MarkImmutable() is called, you should not attempt to | |
317 | // write anything to this MemTable(). (Ie. do not call Add() or Update()). | |
318 | void MarkImmutable() { | |
319 | table_->MarkReadOnly(); | |
320 | allocator_.DoneAllocating(); | |
321 | } | |
322 | ||
323 | // return true if the current MemTableRep supports merge operator. | |
324 | bool IsMergeOperatorSupported() const { | |
325 | return table_->IsMergeOperatorSupported(); | |
326 | } | |
327 | ||
328 | // return true if the current MemTableRep supports snapshots. | |
329 | // inplace update prevents snapshots, | |
330 | bool IsSnapshotSupported() const { | |
331 | return table_->IsSnapshotSupported() && !moptions_.inplace_update_support; | |
332 | } | |
333 | ||
334 | struct MemTableStats { | |
335 | uint64_t size; | |
336 | uint64_t count; | |
337 | }; | |
338 | ||
339 | MemTableStats ApproximateStats(const Slice& start_ikey, | |
340 | const Slice& end_ikey); | |
341 | ||
342 | // Get the lock associated for the key | |
343 | port::RWMutex* GetLock(const Slice& key); | |
344 | ||
345 | const InternalKeyComparator& GetInternalKeyComparator() const { | |
346 | return comparator_.comparator; | |
347 | } | |
348 | ||
349 | const MemTableOptions* GetMemTableOptions() const { return &moptions_; } | |
350 | ||
351 | private: | |
352 | enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; | |
353 | ||
354 | friend class MemTableIterator; | |
355 | friend class MemTableBackwardIterator; | |
356 | friend class MemTableList; | |
357 | ||
358 | KeyComparator comparator_; | |
359 | const MemTableOptions moptions_; | |
360 | int refs_; | |
361 | const size_t kArenaBlockSize; | |
362 | ConcurrentArena arena_; | |
363 | MemTableAllocator allocator_; | |
364 | unique_ptr<MemTableRep> table_; | |
365 | unique_ptr<MemTableRep> range_del_table_; | |
366 | bool is_range_del_table_empty_; | |
367 | ||
368 | // Total data size of all data inserted | |
369 | std::atomic<uint64_t> data_size_; | |
370 | std::atomic<uint64_t> num_entries_; | |
371 | std::atomic<uint64_t> num_deletes_; | |
372 | ||
373 | // These are used to manage memtable flushes to storage | |
374 | bool flush_in_progress_; // started the flush | |
375 | bool flush_completed_; // finished the flush | |
376 | uint64_t file_number_; // filled up after flush is complete | |
377 | ||
378 | // The updates to be applied to the transaction log when this | |
379 | // memtable is flushed to storage. | |
380 | VersionEdit edit_; | |
381 | ||
382 | // The sequence number of the kv that was inserted first | |
383 | std::atomic<SequenceNumber> first_seqno_; | |
384 | ||
385 | // The db sequence number at the time of creation or kMaxSequenceNumber | |
386 | // if not set. | |
387 | std::atomic<SequenceNumber> earliest_seqno_; | |
388 | ||
389 | SequenceNumber creation_seq_; | |
390 | ||
391 | // The log files earlier than this number can be deleted. | |
392 | uint64_t mem_next_logfile_number_; | |
393 | ||
394 | // the earliest log containing a prepared section | |
395 | // which has been inserted into this memtable. | |
396 | std::atomic<uint64_t> min_prep_log_referenced_; | |
397 | ||
398 | // rw locks for inplace updates | |
399 | std::vector<port::RWMutex> locks_; | |
400 | ||
401 | const SliceTransform* const prefix_extractor_; | |
402 | std::unique_ptr<DynamicBloom> prefix_bloom_; | |
403 | ||
404 | std::atomic<FlushStateEnum> flush_state_; | |
405 | ||
406 | Env* env_; | |
407 | ||
408 | // Extract sequential insert prefixes. | |
409 | const SliceTransform* insert_with_hint_prefix_extractor_; | |
410 | ||
411 | // Insert hints for each prefix. | |
412 | std::unordered_map<Slice, void*, SliceHasher> insert_hints_; | |
413 | ||
414 | // Returns a heuristic flush decision | |
415 | bool ShouldFlushNow() const; | |
416 | ||
417 | // Updates flush_state_ using ShouldFlushNow() | |
418 | void UpdateFlushState(); | |
419 | ||
420 | // No copying allowed | |
421 | MemTable(const MemTable&); | |
422 | MemTable& operator=(const MemTable&); | |
423 | }; | |
424 | ||
425 | extern const char* EncodeKey(std::string* scratch, const Slice& target); | |
426 | ||
427 | } // namespace rocksdb |