// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "db/dbformat.h"
#include "db/kv_checksum.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/read_callback.h"
#include "db/version_edit.h"
#include "memory/allocator.h"
#include "memory/concurrent_arena.h"
#include "monitoring/instrumented_mutex.h"
#include "options/cf_options.h"
#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"
#include "table/multiget_context.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
#include "util/hash_containers.h"

namespace ROCKSDB_NAMESPACE {

struct FlushJobInfo;
class Mutex;
class MemTableIterator;
class MergeContext;
class SystemClock;

struct ImmutableMemTableOptions {
  explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions,
                                    const MutableCFOptions& mutable_cf_options);
  size_t arena_block_size;
  uint32_t memtable_prefix_bloom_bits;
  size_t memtable_huge_page_size;
  bool memtable_whole_key_filtering;
  bool inplace_update_support;
  size_t inplace_update_num_locks;
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value);
  size_t max_successive_merges;
  Statistics* statistics;
  MergeOperator* merge_operator;
  Logger* info_log;
  bool allow_data_in_errors;
  uint32_t protection_bytes_per_key;
};
// Batched counters to be updated when inserting keys in one write batch.
// In post-processing of the write batch, these can be updated together.
// Only used in the concurrent memtable insert case.
struct MemTablePostProcessInfo {
  uint64_t data_size = 0;
  uint64_t num_entries = 0;
  uint64_t num_deletes = 0;
};
72 | ||
73 | using MultiGetRange = MultiGetContext::Range; | |
74 | // Note: Many of the methods in this class have comments indicating that | |
75 | // external synchronization is required as these methods are not thread-safe. | |
76 | // It is up to higher layers of code to decide how to prevent concurrent | |
77 | // invocation of these methods. This is usually done by acquiring either | |
78 | // the db mutex or the single writer thread. | |
79 | // | |
80 | // Some of these methods are documented to only require external | |
81 | // synchronization if this memtable is immutable. Calling MarkImmutable() is | |
82 | // not sufficient to guarantee immutability. It is up to higher layers of | |
83 | // code to determine if this MemTable can still be modified by other threads. | |
84 | // Eg: The Superversion stores a pointer to the current MemTable (that can | |
85 | // be modified) and a separate list of the MemTables that can no longer be | |
86 | // written to (aka the 'immutable memtables'). | |
87 | class MemTable { | |
88 | public: | |
89 | struct KeyComparator : public MemTableRep::KeyComparator { | |
90 | const InternalKeyComparator comparator; | |
91 | explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {} | |
92 | virtual int operator()(const char* prefix_len_key1, | |
93 | const char* prefix_len_key2) const override; | |
94 | virtual int operator()(const char* prefix_len_key, | |
95 | const DecodedType& key) const override; | |
96 | }; | |
97 | ||
98 | // MemTables are reference counted. The initial reference count | |
99 | // is zero and the caller must call Ref() at least once. | |
100 | // | |
101 | // earliest_seq should be the current SequenceNumber in the db such that any | |
102 | // key inserted into this memtable will have an equal or larger seq number. | |
103 | // (When a db is first created, the earliest sequence number will be 0). | |
104 | // If the earliest sequence number is not known, kMaxSequenceNumber may be | |
105 | // used, but this may prevent some transactions from succeeding until the | |
106 | // first key is inserted into the memtable. | |
107 | explicit MemTable(const InternalKeyComparator& comparator, | |
108 | const ImmutableOptions& ioptions, | |
109 | const MutableCFOptions& mutable_cf_options, | |
110 | WriteBufferManager* write_buffer_manager, | |
111 | SequenceNumber earliest_seq, uint32_t column_family_id); | |
112 | // No copying allowed | |
113 | MemTable(const MemTable&) = delete; | |
114 | MemTable& operator=(const MemTable&) = delete; | |
115 | ||
  // Do not delete this MemTable unless Unref() indicates that it is no
  // longer in use.
  ~MemTable();

  // Increase reference count.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Ref() { ++refs_; }

  // Drop reference count.
  // If the refcount goes to zero return this memtable, otherwise return null.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  MemTable* Unref() {
    --refs_;
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      return this;
    }
    return nullptr;
  }

  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  size_t ApproximateMemoryUsage();

  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
  // require external synchronization. The value may be less accurate, though.
  size_t ApproximateMemoryUsageFast() const {
    return approximate_memory_usage_.load(std::memory_order_relaxed);
  }

  // Used by MemTableListVersion::MemoryAllocatedBytesExcludingLast.
  size_t MemoryAllocatedBytes() const {
    return table_->ApproximateMemoryUsage() +
           range_del_table_->ApproximateMemoryUsage() +
           arena_.MemoryAllocatedBytes();
  }

  // Populates 'entries' with a set of unique random memtable entries of
  // approximately 'target_sample_size' elements.
  //
  // Note: the entries are stored in the unordered_set as length-prefixed keys,
  // hence their representation in the set as "const char*".
  // Note2: the size of the output set 'entries' is not enforced to be strictly
  // equal to 'target_sample_size'. Its final size might be slightly
  // greater or slightly less than 'target_sample_size'.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  // REQUIRES: SkipList memtable representation. This function is not
  // implemented for any other type of memtable representation (vectorrep,
  // hashskiplist, ...).
  void UniqueRandomSample(const uint64_t& target_sample_size,
                          std::unordered_set<const char*>* entries) {
    // TODO(bjlemaire): at the moment, only supported by skiplistrep.
    // Extend it to all other memtable representations.
    table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
  }

  // This method heuristically determines if the memtable should continue to
  // host more data.
  bool ShouldScheduleFlush() const {
    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
  }

  // Returns true if a flush should be scheduled and the caller should
  // be the one to schedule it.
  bool MarkFlushScheduled() {
    auto before = FLUSH_REQUESTED;
    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
                                                std::memory_order_relaxed,
                                                std::memory_order_relaxed);
  }

  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live. The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/dbformat.{h,cc} module.
  //
  // By default, it returns an iterator for prefix seek if prefix_extractor
  // is configured in Options.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the iterator will destroy all the states but
  //        those allocated in arena.
  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);

  // Returns an iterator that yields the range tombstones of the memtable.
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live.
  // @param immutable_memtable Whether this memtable is an immutable memtable.
  // This information is not stored in the memtable itself, so it needs to be
  // specified by the caller. This flag is used internally to decide whether a
  // cached fragmented range tombstone list can be returned. This cached
  // version is constructed when a memtable becomes immutable. Setting the
  // flag to false will always yield a correct result, but may incur a
  // performance penalty as it always creates a new fragmented range tombstone
  // list.
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options, SequenceNumber read_seq,
      bool immutable_memtable);

  Status VerifyEncodedEntry(Slice encoded,
                            const ProtectionInfoKVOS64& kv_prot_info);

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  //
  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
  // simultaneous operations on the same MemTable.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  Status Add(SequenceNumber seq, ValueType type, const Slice& key,
             const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
             bool allow_concurrent = false,
             MemTablePostProcessInfo* post_process_info = nullptr,
             void** hint = nullptr);

  // Used to get the value associated with a key, or to get the merge operands
  // associated with a key.
  // If do_merge = true, the default behavior, which is to get the value for
  // the key, is executed. The expected behavior is described right below.
  // If the memtable contains a value for key, store it in *value and return
  // true.
  // If the memtable contains a deletion for key, store a NotFound() error
  // in *status and return true.
  // If the memtable contains a Merge operation as the most recent entry for a
  // key, and the merge process does not stop (does not reach a value or
  // delete), prepend the current merge operand to *operands, store
  // MergeInProgress in *s, and return false.
  // Else, return false.
  // If any operation was found, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
  // status returned indicates a corruption or other unexpected error.
  // If do_merge = false, then any merge operands encountered for the key are
  // simply stored in merge_context.operands_list and never actually merged
  // to get a final value. The raw merge operands are eventually returned to
  // the user.
  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, bool immutable_memtable,
           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
           bool do_merge = true);

  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq,
           const ReadOptions& read_opts, bool immutable_memtable,
           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
           bool do_merge = true) {
    SequenceNumber seq;
    return Get(key, value, columns, timestamp, s, merge_context,
               max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
               callback, is_blob_index, do_merge);
  }

  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                ReadCallback* callback, bool immutable_memtable);

  // If `key` exists in the current memtable with type `value_type` and the
  // existing value is at least as large as the new value, updates it
  // in-place. Otherwise adds the new value to the memtable out-of-place.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  Status Update(SequenceNumber seq, ValueType value_type, const Slice& key,
                const Slice& value, const ProtectionInfoKVOS64* kv_prot_info);

  // If `key` exists in the current memtable with type `kTypeValue` and the
  // existing value is at least as large as the new value, updates it
  // in-place. Otherwise if `key` exists in the current memtable with type
  // `kTypeValue`, adds the new value to the memtable out-of-place.
  //
  // Returns `Status::NotFound` if `key` does not exist in the current
  // memtable or the latest version of `key` does not have `kTypeValue`.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  Status UpdateCallback(SequenceNumber seq, const Slice& key,
                        const Slice& delta,
                        const ProtectionInfoKVOS64* kv_prot_info);

  // Returns the number of successive merge entries starting from the newest
  // entry for the key up to the last non-merge entry or last entry for the
  // key in the memtable.
  size_t CountSuccessiveMergeEntries(const LookupKey& key);

  // Update counters and flush status after inserting a whole write batch.
  // Used in concurrent memtable inserts.
  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
    num_entries_.fetch_add(update_counters.num_entries,
                           std::memory_order_relaxed);
    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
    if (update_counters.num_deletes != 0) {
      num_deletes_.fetch_add(update_counters.num_deletes,
                             std::memory_order_relaxed);
    }
    UpdateFlushState();
  }

  // Get total number of entries in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_entries() const {
    return num_entries_.load(std::memory_order_relaxed);
  }

  // Get total number of deletes in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_deletes() const {
    return num_deletes_.load(std::memory_order_relaxed);
  }

  uint64_t get_data_size() const {
    return data_size_.load(std::memory_order_relaxed);
  }

  // Dynamically change the memtable's capacity. If set below the current
  // usage, the next key added will trigger a flush. Can only increase size
  // when memtable prefix bloom is disabled, since we can't easily allocate
  // more space.
  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
    if (bloom_filter_ == nullptr ||
        new_write_buffer_size < write_buffer_size_) {
      write_buffer_size_.store(new_write_buffer_size,
                               std::memory_order_relaxed);
    }
  }

  // Returns the edits area that is needed for flushing the memtable.
  VersionEdit* GetEdits() { return &edit_; }

  // Returns true if no entry has been inserted into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  bool IsEmpty() const { return first_seqno_ == 0; }

  // Returns the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  SequenceNumber GetFirstSequenceNumber() {
    return first_seqno_.load(std::memory_order_relaxed);
  }

  // Sets the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  void SetFirstSequenceNumber(SequenceNumber first_seqno) {
    first_seqno_.store(first_seqno, std::memory_order_relaxed);
  }

  // Returns the sequence number that is guaranteed to be smaller than or
  // equal to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  //
  // If the earliest sequence number could not be determined,
  // kMaxSequenceNumber will be returned.
  SequenceNumber GetEarliestSequenceNumber() {
    return earliest_seqno_.load(std::memory_order_relaxed);
  }

  // Sets the sequence number that is guaranteed to be smaller than or equal
  // to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  // Used only for the MemPurge operation.
  void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
    earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
  }

  // DB's latest sequence ID when the memtable is created. This number
  // may be updated to a more recent one before any key is inserted.
  SequenceNumber GetCreationSeq() const { return creation_seq_; }

  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }

  // Returns the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }

  // Sets the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }

  // If this memtable contains data from a committed two-phase transaction,
  // we must take note of the log which contains that data so we can know
  // when to release that log.
  void RefLogContainingPrepSection(uint64_t log);
  uint64_t GetMinLogContainingPrepSection();

  // Notify the underlying storage that no more items will be added.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  // After MarkImmutable() is called, you should not attempt to
  // write anything to this MemTable (i.e., do not call Add() or Update()).
  void MarkImmutable() {
    table_->MarkReadOnly();
    mem_tracker_.DoneAllocating();
  }
446 | ||
447 | // Notify the underlying storage that all data it contained has been | |
448 | // persisted. | |
449 | // REQUIRES: external synchronization to prevent simultaneous | |
450 | // operations on the same MemTable. | |
451 | void MarkFlushed() { table_->MarkFlushed(); } | |
452 | ||
453 | // return true if the current MemTableRep supports merge operator. | |
454 | bool IsMergeOperatorSupported() const { | |
455 | return table_->IsMergeOperatorSupported(); | |
456 | } | |
457 | ||
458 | // return true if the current MemTableRep supports snapshots. | |
459 | // inplace update prevents snapshots, | |
460 | bool IsSnapshotSupported() const { | |
461 | return table_->IsSnapshotSupported() && !moptions_.inplace_update_support; | |
462 | } | |
463 | ||
  struct MemTableStats {
    uint64_t size;
    uint64_t count;
  };

  MemTableStats ApproximateStats(const Slice& start_ikey,
                                 const Slice& end_ikey);

  // Get the lock associated with the key.
  port::RWMutex* GetLock(const Slice& key);

  const InternalKeyComparator& GetInternalKeyComparator() const {
    return comparator_.comparator;
  }

  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
    return &moptions_;
  }

  uint64_t ApproximateOldestKeyTime() const {
    return oldest_key_time_.load(std::memory_order_relaxed);
  }

  // REQUIRES: db_mutex held.
  void SetID(uint64_t id) { id_ = id; }

  uint64_t GetID() const { return id_; }

  void SetFlushCompleted(bool completed) { flush_completed_ = completed; }

  uint64_t GetFileNumber() const { return file_number_; }

  void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }

  void SetFlushInProgress(bool in_progress) {
    flush_in_progress_ = in_progress;
  }

#ifndef ROCKSDB_LITE
  void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
    flush_job_info_ = std::move(info);
  }

  std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
    return std::move(flush_job_info_);
  }
#endif  // !ROCKSDB_LITE

  // Returns a heuristic flush decision.
  bool ShouldFlushNow();

  void ConstructFragmentedRangeTombstones();

  // Returns whether a fragmented range tombstone list is already constructed
  // for this memtable. It should be constructed right before a memtable is
  // added to an immutable memtable list. Note that if a memtable does not
  // have any range tombstone, then no range tombstone list will ever be
  // constructed.
  // @param allow_empty Specifies whether a memtable with no range tombstone
  // is considered to have its fragmented range tombstone list constructed.
  bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
    if (allow_empty) {
      return fragmented_range_tombstone_list_.get() != nullptr ||
             is_range_del_table_empty_;
    } else {
      return fragmented_range_tombstone_list_.get() != nullptr;
    }
  }

  // Returns Corruption status if verification fails.
  static Status VerifyEntryChecksum(const char* entry,
                                    size_t protection_bytes_per_key,
                                    bool allow_data_in_errors = false);

 private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

  friend class MemTableIterator;
  friend class MemTableBackwardIterator;
  friend class MemTableList;

  KeyComparator comparator_;
  const ImmutableMemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
  AllocTracker mem_tracker_;
  ConcurrentArena arena_;
  std::unique_ptr<MemTableRep> table_;
  std::unique_ptr<MemTableRep> range_del_table_;
  std::atomic_bool is_range_del_table_empty_;

  // Total data size of all data inserted.
  std::atomic<uint64_t> data_size_;
  std::atomic<uint64_t> num_entries_;
  std::atomic<uint64_t> num_deletes_;

  // Dynamically changeable memtable option.
  std::atomic<size_t> write_buffer_size_;

  // These are used to manage memtable flushes to storage.
  bool flush_in_progress_;  // started the flush
  bool flush_completed_;    // finished the flush
  uint64_t file_number_;    // filled up after flush is complete

  // The updates to be applied to the transaction log when this
  // memtable is flushed to storage.
  VersionEdit edit_;

  // The sequence number of the kv that was inserted first.
  std::atomic<SequenceNumber> first_seqno_;

  // The db sequence number at the time of creation, or kMaxSequenceNumber
  // if not set.
  std::atomic<SequenceNumber> earliest_seqno_;

  SequenceNumber creation_seq_;

  // The log files earlier than this number can be deleted.
  uint64_t mem_next_logfile_number_;

  // The earliest log containing a prepared section
  // which has been inserted into this memtable.
  std::atomic<uint64_t> min_prep_log_referenced_;

  // rw locks for inplace updates
  std::vector<port::RWMutex> locks_;

  const SliceTransform* const prefix_extractor_;
  std::unique_ptr<DynamicBloom> bloom_filter_;

  std::atomic<FlushStateEnum> flush_state_;

  SystemClock* clock_;

  // Extract sequential insert prefixes.
  const SliceTransform* insert_with_hint_prefix_extractor_;

  // Insert hints for each prefix.
  UnorderedMapH<Slice, void*, SliceHasher> insert_hints_;

  // Timestamp of oldest key.
  std::atomic<uint64_t> oldest_key_time_;

  // Memtable id to track flush.
  uint64_t id_ = 0;

  // Sequence number of the atomic flush that is responsible for this
  // memtable. The sequence number of atomic flush is a seq, such that no
  // writes with sequence numbers greater than or equal to seq are flushed,
  // while all writes with sequence numbers smaller than seq are flushed.
  SequenceNumber atomic_flush_seqno_;

  // Keeps track of memory usage in table_, arena_, and range_del_table_.
  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow()`.
  std::atomic<uint64_t> approximate_memory_usage_;

#ifndef ROCKSDB_LITE
  // Flush job info of the current memtable.
  std::unique_ptr<FlushJobInfo> flush_job_info_;
#endif  // !ROCKSDB_LITE

  // Updates flush_state_ using ShouldFlushNow().
  void UpdateFlushState();

  void UpdateOldestKeyTime();

  void GetFromTable(const LookupKey& key,
                    SequenceNumber max_covering_tombstone_seq, bool do_merge,
                    ReadCallback* callback, bool* is_blob_index,
                    std::string* value, PinnableWideColumns* columns,
                    std::string* timestamp, Status* s,
                    MergeContext* merge_context, SequenceNumber* seq,
                    bool* found_final_value, bool* merge_in_progress);

  // Always returns non-null and assumes certain pre-checks (e.g.,
  // is_range_del_table_empty_) are done. This is only valid during the
  // lifetime of the underlying memtable.
  // read_seq and read_options.timestamp will be used as the upper bound
  // for range tombstones.
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
      const ReadOptions& read_options, SequenceNumber read_seq,
      bool immutable_memtable);

  // The fragmented range tombstones of this memtable.
  // This is constructed when this memtable becomes immutable
  // if !is_range_del_table_empty_.
  std::unique_ptr<FragmentedRangeTombstoneList>
      fragmented_range_tombstone_list_;

  // Makes sure there is a single range tombstone writer to invalidate cache.
  std::mutex range_del_mutex_;
  CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
      cached_range_tombstone_;

  void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
                           const Slice& key, const Slice& value,
                           ValueType type, SequenceNumber s,
                           char* checksum_ptr);
};

extern const char* EncodeKey(std::string* scratch, const Slice& target);

}  // namespace ROCKSDB_NAMESPACE