// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "db/dbformat.h"
#include "db/kv_checksum.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/read_callback.h"
#include "db/version_edit.h"
#include "memory/allocator.h"
#include "memory/concurrent_arena.h"
#include "monitoring/instrumented_mutex.h"
#include "options/cf_options.h"
#include "rocksdb/db.h"
#include "rocksdb/memtablerep.h"
#include "table/multiget_context.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"
#include "util/hash_containers.h"

namespace ROCKSDB_NAMESPACE {

struct FlushJobInfo;
class Mutex;
class MemTableIterator;
class MergeContext;
class SystemClock;

struct ImmutableMemTableOptions {
  explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions,
                                    const MutableCFOptions& mutable_cf_options);
  size_t arena_block_size;
  uint32_t memtable_prefix_bloom_bits;
  size_t memtable_huge_page_size;
  bool memtable_whole_key_filtering;
  bool inplace_update_support;
  size_t inplace_update_num_locks;
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value);
  size_t max_successive_merges;
  Statistics* statistics;
  MergeOperator* merge_operator;
  Logger* info_log;
  bool allow_data_in_errors;
  uint32_t protection_bytes_per_key;
};

// Batched counters to be updated when inserting keys in one write batch.
// In the post-process step of the write batch, these can be updated together.
// Only used in the concurrent memtable insert case.
struct MemTablePostProcessInfo {
  uint64_t data_size = 0;
  uint64_t num_entries = 0;
  uint64_t num_deletes = 0;
};

using MultiGetRange = MultiGetContext::Range;
// Note: Many of the methods in this class have comments indicating that
// external synchronization is required as these methods are not thread-safe.
// It is up to higher layers of code to decide how to prevent concurrent
// invocation of these methods. This is usually done by acquiring either
// the db mutex or the single writer thread.
//
// Some of these methods are documented to only require external
// synchronization if this memtable is immutable. Calling MarkImmutable() is
// not sufficient to guarantee immutability. It is up to higher layers of
// code to determine if this MemTable can still be modified by other threads.
// E.g.: the SuperVersion stores a pointer to the current MemTable (that can
// be modified) and a separate list of the MemTables that can no longer be
// written to (aka the 'immutable memtables').
class MemTable {
 public:
  struct KeyComparator : public MemTableRep::KeyComparator {
    const InternalKeyComparator comparator;
    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const override;
    virtual int operator()(const char* prefix_len_key,
                           const DecodedType& key) const override;
  };

  // MemTables are reference counted. The initial reference count
  // is zero and the caller must call Ref() at least once.
  //
  // earliest_seq should be the current SequenceNumber in the db such that any
  // key inserted into this memtable will have an equal or larger seq number.
  // (When a db is first created, the earliest sequence number will be 0.)
  // If the earliest sequence number is not known, kMaxSequenceNumber may be
  // used, but this may prevent some transactions from succeeding until the
  // first key is inserted into the memtable.
  explicit MemTable(const InternalKeyComparator& comparator,
                    const ImmutableOptions& ioptions,
                    const MutableCFOptions& mutable_cf_options,
                    WriteBufferManager* write_buffer_manager,
                    SequenceNumber earliest_seq, uint32_t column_family_id);
  // No copying allowed
  MemTable(const MemTable&) = delete;
  MemTable& operator=(const MemTable&) = delete;

  // Do not delete this MemTable unless Unref() indicates it is not in use.
  ~MemTable();

  // Increase reference count.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void Ref() { ++refs_; }

  // Drop reference count.
  // If the refcount goes to zero return this memtable, otherwise return null.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  MemTable* Unref() {
    --refs_;
    assert(refs_ >= 0);
    if (refs_ <= 0) {
      return this;
    }
    return nullptr;
  }

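  // Illustrative sketch (not part of this header): a typical owner-managed
  // lifecycle using the reference counting described above. The surrounding
  // setup (comparator, options, write buffer manager, column family id) is
  // assumed.
  //
  //   MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options,
  //                                write_buffer_manager, earliest_seq, cf_id);
  //   mem->Ref();                          // initial refcount is 0; take a ref
  //   ...                                  // reads/writes under external sync
  //   MemTable* to_delete = mem->Unref();  // drop the ref
  //   if (to_delete != nullptr) {
  //     delete to_delete;                  // refcount reached zero
  //   }
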
  // Returns an estimate of the number of bytes of data in use by this
  // data structure.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  size_t ApproximateMemoryUsage();

  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
  // require external synchronization. The value may be less accurate though.
  size_t ApproximateMemoryUsageFast() const {
    return approximate_memory_usage_.load(std::memory_order_relaxed);
  }

  // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
  size_t MemoryAllocatedBytes() const {
    return table_->ApproximateMemoryUsage() +
           range_del_table_->ApproximateMemoryUsage() +
           arena_.MemoryAllocatedBytes();
  }

  // Returns a set of unique random memtable entries of approximately
  // 'target_sample_size' size.
  //
  // Note: the entries are stored in the unordered_set as length-prefixed keys,
  // hence their representation in the set as "const char*".
  // Note2: the size of the output set 'entries' is not enforced to be strictly
  // equal to 'target_sample_size'. Its final size might be slightly
  // greater or slightly less than 'target_sample_size'.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  // REQUIRES: SkipList memtable representation. This function is not
  // implemented for any other type of memtable representation (vectorrep,
  // hashskiplist, ...).
  void UniqueRandomSample(const uint64_t& target_sample_size,
                          std::unordered_set<const char*>* entries) {
    // TODO(bjlemaire): at the moment, only supported by skiplistrep.
    // Extend it to all other memtable representations.
    table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
  }

  // This method heuristically determines if the memtable should continue to
  // host more data.
  bool ShouldScheduleFlush() const {
    return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
  }

  // Returns true if a flush should be scheduled and the caller should
  // be the one to schedule it.
  bool MarkFlushScheduled() {
    auto before = FLUSH_REQUESTED;
    return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
                                                std::memory_order_relaxed,
                                                std::memory_order_relaxed);
  }

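  // Illustrative sketch (assumption, not from this header): how a caller might
  // combine ShouldScheduleFlush() with MarkFlushScheduled() so that exactly one
  // thread wins the race to schedule the flush.
  //
  //   if (mem->ShouldScheduleFlush() && mem->MarkFlushScheduled()) {
  //     // Only the thread whose compare-exchange succeeded gets here; hand
  //     // the memtable over to whatever flush mechanism is in use.
  //     ScheduleFlush(mem);  // hypothetical scheduling hook
  //   }
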
  // Return an iterator that yields the contents of the memtable.
  //
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live. The keys returned by this
  // iterator are internal keys encoded by AppendInternalKey in the
  // db/dbformat.{h,cc} module.
  //
  // By default, it returns an iterator for prefix seek if prefix_extractor
  // is configured in Options.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the iterator will destroy all the states but
  //        those allocated in arena.
  InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);

  // Returns an iterator that yields the range tombstones of the memtable.
  // The caller must ensure that the underlying MemTable remains live
  // while the returned iterator is live.
  // @param immutable_memtable Whether this memtable is an immutable memtable.
  // This information is not stored in the memtable itself, so it needs to be
  // specified by the caller. This flag is used internally to decide whether a
  // cached fragmented range tombstone list can be returned. This cached
  // version is constructed when a memtable becomes immutable. Setting the flag
  // to false will always yield a correct result, but may incur a performance
  // penalty as it always creates a new fragmented range tombstone list.
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options, SequenceNumber read_seq,
      bool immutable_memtable);

  Status VerifyEncodedEntry(Slice encoded,
                            const ProtectionInfoKVOS64& kv_prot_info);

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
  // Typically value will be empty if type==kTypeDeletion.
  //
  // REQUIRES: if allow_concurrent = false, external synchronization to prevent
  // simultaneous operations on the same MemTable.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  Status Add(SequenceNumber seq, ValueType type, const Slice& key,
             const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
             bool allow_concurrent = false,
             MemTablePostProcessInfo* post_process_info = nullptr,
             void** hint = nullptr);

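  // Illustrative sketch (assumption, not from this header): retrying Add()
  // when the memtable rep reports a duplicate (seq, key) pair via TryAgain.
  // `GetNextSequence()` stands in for however the caller allocates sequence
  // numbers.
  //
  //   Status s;
  //   do {
  //     s = mem->Add(seq, kTypeValue, key, value,
  //                  /*kv_prot_info=*/nullptr, /*allow_concurrent=*/false);
  //     if (s.IsTryAgain()) {
  //       seq = GetNextSequence();  // hypothetical: pick a larger seq
  //     }
  //   } while (s.IsTryAgain());
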
  // Used to Get value associated with key or Get Merge Operands associated
  // with key.
  // If do_merge = true, the default behavior, which is to get the value for
  // key, is executed. The expected behavior is described right below.
  // If memtable contains a value for key, store it in *value and return true.
  // If memtable contains a deletion for key, store a NotFound() error
  // in *status and return true.
  // If memtable contains a Merge operation as the most recent entry for a key,
  //   and the merge process does not stop (not reaching a value or delete),
  //   prepend the current merge operand to *operands,
  //   store MergeInProgress in s, and return false.
  // Else, return false.
  // If any operation was found, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
  // status returned indicates a corruption or other unexpected error.
  // If do_merge = false then any Merge Operands encountered for key are simply
  // stored in merge_context.operands_list and never actually merged to get a
  // final value. The raw Merge Operands are eventually returned to the user.
  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, bool immutable_memtable,
           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
           bool do_merge = true);

  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq,
           const ReadOptions& read_opts, bool immutable_memtable,
           ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
           bool do_merge = true) {
    SequenceNumber seq;
    return Get(key, value, columns, timestamp, s, merge_context,
               max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
               callback, is_blob_index, do_merge);
  }

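  // Illustrative sketch (assumption, not from this header): a point lookup
  // against a mutable memtable using the convenience overload above. The
  // LookupKey is built from a user key and a read snapshot/sequence number.
  //
  //   LookupKey lkey(user_key, read_seq);
  //   std::string value;
  //   Status s;
  //   MergeContext merge_context;
  //   SequenceNumber max_covering_tombstone_seq = 0;
  //   bool found = mem->Get(lkey, &value, /*columns=*/nullptr,
  //                         /*timestamp=*/nullptr, &s, &merge_context,
  //                         &max_covering_tombstone_seq, ReadOptions(),
  //                         /*immutable_memtable=*/false);
  //   // found && s.ok(): value holds the result.
  //   // found && s.IsNotFound(): the key was deleted in this memtable.
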
  // @param immutable_memtable Whether this memtable is immutable. Used
  // internally by NewRangeTombstoneIterator(). See comment above
  // NewRangeTombstoneIterator() for more detail.
  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                ReadCallback* callback, bool immutable_memtable);

  // If `key` exists in the current memtable with type value_type and the
  // existing value is at least as large as the new value, updates it in-place.
  // Otherwise adds the new value to the memtable out-of-place.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  Status Update(SequenceNumber seq, ValueType value_type, const Slice& key,
                const Slice& value, const ProtectionInfoKVOS64* kv_prot_info);

  // If `key` exists in the current memtable with type `kTypeValue` and the
  // existing value is at least as large as the new value, updates it in-place.
  // Otherwise, if `key` exists in the current memtable with type `kTypeValue`,
  // adds the new value to the memtable out-of-place.
  //
  // Returns `Status::NotFound` if `key` does not exist in the current memtable
  // or the latest version of `key` does not have `kTypeValue`.
  //
  // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
  // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is
  // true. The next attempt should try a larger value for `seq`.
  //
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  Status UpdateCallback(SequenceNumber seq, const Slice& key,
                        const Slice& delta,
                        const ProtectionInfoKVOS64* kv_prot_info);

  // Returns the number of successive merge entries starting from the newest
  // entry for the key, up to the last non-merge entry or last entry for the
  // key in the memtable.
  size_t CountSuccessiveMergeEntries(const LookupKey& key);

  // Update counters and flush status after inserting a whole write batch.
  // Used in concurrent memtable inserts.
  void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
    num_entries_.fetch_add(update_counters.num_entries,
                           std::memory_order_relaxed);
    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
    if (update_counters.num_deletes != 0) {
      num_deletes_.fetch_add(update_counters.num_deletes,
                             std::memory_order_relaxed);
    }
    UpdateFlushState();
  }

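  // Illustrative sketch (assumption, not from this header): how a concurrent
  // writer thread might accumulate per-batch counters and publish them once,
  // as the comment above describes.
  //
  //   MemTablePostProcessInfo info;
  //   for (const auto& kv : batch) {   // hypothetical decoded write batch
  //     mem->Add(kv.seq, kv.type, kv.key, kv.value, /*kv_prot_info=*/nullptr,
  //              /*allow_concurrent=*/true, &info);
  //   }
  //   mem->BatchPostProcess(info);     // single atomic update of the counters
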
  // Get total number of entries in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_entries() const {
    return num_entries_.load(std::memory_order_relaxed);
  }

  // Get total number of deletes in the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  uint64_t num_deletes() const {
    return num_deletes_.load(std::memory_order_relaxed);
  }

  uint64_t get_data_size() const {
    return data_size_.load(std::memory_order_relaxed);
  }

  // Dynamically change the memtable's capacity. If set below the current
  // usage, the next key added will trigger a flush. Can only increase size
  // when memtable prefix bloom is disabled, since we can't easily allocate
  // more space.
  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
    if (bloom_filter_ == nullptr ||
        new_write_buffer_size < write_buffer_size_) {
      write_buffer_size_.store(new_write_buffer_size,
                               std::memory_order_relaxed);
    }
  }

  // Returns the edits area that is needed for flushing the memtable.
  VersionEdit* GetEdits() { return &edit_; }

  // Returns true if no entry has been inserted into the mem table.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  bool IsEmpty() const { return first_seqno_ == 0; }

  // Returns the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  SequenceNumber GetFirstSequenceNumber() {
    return first_seqno_.load(std::memory_order_relaxed);
  }

  // Sets the sequence number of the first element that was inserted
  // into the memtable.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable (unless this Memtable is immutable).
  void SetFirstSequenceNumber(SequenceNumber first_seqno) {
    return first_seqno_.store(first_seqno, std::memory_order_relaxed);
  }

  // Returns the sequence number that is guaranteed to be smaller than or equal
  // to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  //
  // If the earliest sequence number could not be determined,
  // kMaxSequenceNumber will be returned.
  SequenceNumber GetEarliestSequenceNumber() {
    return earliest_seqno_.load(std::memory_order_relaxed);
  }

  // Sets the sequence number that is guaranteed to be smaller than or equal
  // to the sequence number of any key that could be inserted into this
  // memtable. It can then be assumed that any write with a larger (or equal)
  // sequence number will be present in this memtable or a later memtable.
  // Used only for the MemPurge operation.
  void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
    return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
  }

  // DB's latest sequence ID when the memtable is created. This number
  // may be updated to a more recent one before any key is inserted.
  SequenceNumber GetCreationSeq() const { return creation_seq_; }

  void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }

  // Returns the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }

  // Sets the next active logfile number when this memtable is about to
  // be flushed to storage.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }

  // If this memtable contains data from a committed
  // two phase transaction, we must take note of the
  // log which contains that data so we can know
  // when to release that log.
  void RefLogContainingPrepSection(uint64_t log);
  uint64_t GetMinLogContainingPrepSection();

  // Notify the underlying storage that no more items will be added.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  // After MarkImmutable() is called, you should not attempt to
  // write anything to this MemTable (i.e. do not call Add() or Update()).
  void MarkImmutable() {
    table_->MarkReadOnly();
    mem_tracker_.DoneAllocating();
  }

  // Notify the underlying storage that all data it contained has been
  // persisted.
  // REQUIRES: external synchronization to prevent simultaneous
  // operations on the same MemTable.
  void MarkFlushed() { table_->MarkFlushed(); }

  // Return true if the current MemTableRep supports merge operator.
  bool IsMergeOperatorSupported() const {
    return table_->IsMergeOperatorSupported();
  }

  // Return true if the current MemTableRep supports snapshots.
  // Inplace update prevents snapshots.
  bool IsSnapshotSupported() const {
    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
  }

  struct MemTableStats {
    uint64_t size;
    uint64_t count;
  };

  MemTableStats ApproximateStats(const Slice& start_ikey,
                                 const Slice& end_ikey);

  // Get the lock associated with the key.
  port::RWMutex* GetLock(const Slice& key);

  const InternalKeyComparator& GetInternalKeyComparator() const {
    return comparator_.comparator;
  }

  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
    return &moptions_;
  }

  uint64_t ApproximateOldestKeyTime() const {
    return oldest_key_time_.load(std::memory_order_relaxed);
  }

  // REQUIRES: db_mutex held.
  void SetID(uint64_t id) { id_ = id; }

  uint64_t GetID() const { return id_; }

  void SetFlushCompleted(bool completed) { flush_completed_ = completed; }

  uint64_t GetFileNumber() const { return file_number_; }

  void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }

  void SetFlushInProgress(bool in_progress) {
    flush_in_progress_ = in_progress;
  }

#ifndef ROCKSDB_LITE
  void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
    flush_job_info_ = std::move(info);
  }

  std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
    return std::move(flush_job_info_);
  }
#endif  // !ROCKSDB_LITE

  // Returns a heuristic flush decision.
  bool ShouldFlushNow();

  void ConstructFragmentedRangeTombstones();

  // Returns whether a fragmented range tombstone list is already constructed
  // for this memtable. It should be constructed right before a memtable is
  // added to an immutable memtable list. Note that if a memtable does not have
  // any range tombstone, then no range tombstone list will ever be
  // constructed.
  // @param allow_empty Specifies whether a memtable with no range tombstone is
  // considered to have its fragmented range tombstone list constructed.
  bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
    if (allow_empty) {
      return fragmented_range_tombstone_list_.get() != nullptr ||
             is_range_del_table_empty_;
    } else {
      return fragmented_range_tombstone_list_.get() != nullptr;
    }
  }

  // Returns Corruption status if verification fails.
  static Status VerifyEntryChecksum(const char* entry,
                                    size_t protection_bytes_per_key,
                                    bool allow_data_in_errors = false);

 private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

  friend class MemTableIterator;
  friend class MemTableBackwardIterator;
  friend class MemTableList;

  KeyComparator comparator_;
  const ImmutableMemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
  AllocTracker mem_tracker_;
  ConcurrentArena arena_;
  std::unique_ptr<MemTableRep> table_;
  std::unique_ptr<MemTableRep> range_del_table_;
  std::atomic_bool is_range_del_table_empty_;

  // Total data size of all data inserted
  std::atomic<uint64_t> data_size_;
  std::atomic<uint64_t> num_entries_;
  std::atomic<uint64_t> num_deletes_;

  // Dynamically changeable memtable option
  std::atomic<size_t> write_buffer_size_;

  // These are used to manage memtable flushes to storage
  bool flush_in_progress_;  // started the flush
  bool flush_completed_;    // finished the flush
  uint64_t file_number_;    // filled up after flush is complete

  // The updates to be applied to the transaction log when this
  // memtable is flushed to storage.
  VersionEdit edit_;

  // The sequence number of the kv that was inserted first
  std::atomic<SequenceNumber> first_seqno_;

  // The db sequence number at the time of creation or kMaxSequenceNumber
  // if not set.
  std::atomic<SequenceNumber> earliest_seqno_;

  SequenceNumber creation_seq_;

  // The log files earlier than this number can be deleted.
  uint64_t mem_next_logfile_number_;

  // The earliest log containing a prepared section
  // which has been inserted into this memtable.
  std::atomic<uint64_t> min_prep_log_referenced_;

  // rw locks for inplace updates
  std::vector<port::RWMutex> locks_;

  const SliceTransform* const prefix_extractor_;
  std::unique_ptr<DynamicBloom> bloom_filter_;

  std::atomic<FlushStateEnum> flush_state_;

  SystemClock* clock_;

  // Extract sequential insert prefixes.
  const SliceTransform* insert_with_hint_prefix_extractor_;

  // Insert hints for each prefix.
  UnorderedMapH<Slice, void*, SliceHasher> insert_hints_;

  // Timestamp of oldest key
  std::atomic<uint64_t> oldest_key_time_;

  // Memtable id to track flush.
  uint64_t id_ = 0;

  // Sequence number of the atomic flush that is responsible for this memtable.
  // The sequence number of atomic flush is a seq, such that no writes with
  // sequence numbers greater than or equal to seq are flushed, while all
  // writes with sequence number smaller than seq are flushed.
  SequenceNumber atomic_flush_seqno_;

  // Keeps track of memory usage in table_, arena_, and range_del_table_.
  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`.
  std::atomic<uint64_t> approximate_memory_usage_;

#ifndef ROCKSDB_LITE
  // Flush job info of the current memtable.
  std::unique_ptr<FlushJobInfo> flush_job_info_;
#endif  // !ROCKSDB_LITE

  // Updates flush_state_ using ShouldFlushNow()
  void UpdateFlushState();

  void UpdateOldestKeyTime();

  void GetFromTable(const LookupKey& key,
                    SequenceNumber max_covering_tombstone_seq, bool do_merge,
                    ReadCallback* callback, bool* is_blob_index,
                    std::string* value, PinnableWideColumns* columns,
                    std::string* timestamp, Status* s,
                    MergeContext* merge_context, SequenceNumber* seq,
                    bool* found_final_value, bool* merge_in_progress);

  // Always returns non-null and assumes certain pre-checks (e.g.,
  // is_range_del_table_empty_) are done. This is only valid during the
  // lifetime of the underlying memtable.
  // read_seq and read_options.timestamp will be used as the upper bound
  // for range tombstones.
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
      const ReadOptions& read_options, SequenceNumber read_seq,
      bool immutable_memtable);

  // The fragmented range tombstones of this memtable.
  // This is constructed when this memtable becomes immutable
  // if !is_range_del_table_empty_.
  std::unique_ptr<FragmentedRangeTombstoneList>
      fragmented_range_tombstone_list_;

  // Makes sure there is a single range tombstone writer to invalidate cache.
  std::mutex range_del_mutex_;
  CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
      cached_range_tombstone_;

  void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
                           const Slice& key, const Slice& value,
                           ValueType type, SequenceNumber s,
                           char* checksum_ptr);
};

extern const char* EncodeKey(std::string* scratch, const Slice& target);

}  // namespace ROCKSDB_NAMESPACE