1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
15 #include "db/dbformat.h"
16 #include "db/logs_with_prep_tracker.h"
17 #include "db/memtable.h"
18 #include "db/range_del_aggregator.h"
19 #include "file/filename.h"
20 #include "logging/log_buffer.h"
21 #include "monitoring/instrumented_mutex.h"
22 #include "rocksdb/db.h"
23 #include "rocksdb/iterator.h"
24 #include "rocksdb/options.h"
25 #include "rocksdb/types.h"
26 #include "util/autovector.h"
28 namespace ROCKSDB_NAMESPACE
{
// Forward declarations: the types below are referenced only by pointer or
// reference in this header, so their full definitions are not required here.
30 class ColumnFamilyData
;
31 class InternalKeyComparator
;
32 class InstrumentedMutex
;
33 class MergeIteratorBuilder
;
38 // keeps a list of immutable memtables in a vector. the list is immutable
39 // if refcount is bigger than one. It is used as a state for Get() and
40 // Iterator code paths
42 // This class is not thread-safe. External synchronization is required
43 // (such as holding the db mutex or being on the write thread.)
44 class MemTableListVersion
{
// NOTE(review): this region is extraction-damaged — the access-specifier
// lines ("public:"/"private:"), the Ref()/Unref() pair's full declarations,
// and the closing "};" of the class, as well as the tail of several inline
// bodies below, appear to have been dropped. Compare with the upstream
// RocksDB header before relying on exact declaration boundaries.
46 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage
,
47 const MemTableListVersion
& old
);
48 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage
,
49 int max_write_buffer_number_to_maintain
,
50 int64_t max_write_buffer_size_to_maintain
);
53 void Unref(autovector
<MemTable
*>* to_delete
= nullptr);
55 // Search all the memtables starting from the most recent one.
56 // Return the most recent value found, if any.
58 // If any operation was found for this key, its most recent sequence number
59 // will be stored in *seq on success (regardless of whether true/false is
60 // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
61 bool Get(const LookupKey
& key
, std::string
* value
, std::string
* timestamp
,
62 Status
* s
, MergeContext
* merge_context
,
63 SequenceNumber
* max_covering_tombstone_seq
, SequenceNumber
* seq
,
64 const ReadOptions
& read_opts
, ReadCallback
* callback
= nullptr,
65 bool* is_blob_index
= nullptr);
// Convenience overload that discards the found sequence number.
67 bool Get(const LookupKey
& key
, std::string
* value
, std::string
* timestamp
,
68 Status
* s
, MergeContext
* merge_context
,
69 SequenceNumber
* max_covering_tombstone_seq
,
70 const ReadOptions
& read_opts
, ReadCallback
* callback
= nullptr,
71 bool* is_blob_index
= nullptr) {
73 return Get(key
, value
, timestamp
, s
, merge_context
,
74 max_covering_tombstone_seq
, &seq
, read_opts
, callback
,
// NOTE(review): this inline body is truncated (the local SequenceNumber
// declaration and the closing "is_blob_index); }" are missing) — confirm
// against the upstream header.
// Batched variant of Get(): looks up every key in *range.
78 void MultiGet(const ReadOptions
& read_options
, MultiGetRange
* range
,
79 ReadCallback
* callback
, bool* is_blob
);
81 // Returns all the merge operands corresponding to the key by searching all
82 // memtables starting from the most recent one.
83 bool GetMergeOperands(const LookupKey
& key
, Status
* s
,
84 MergeContext
* merge_context
,
85 SequenceNumber
* max_covering_tombstone_seq
,
86 const ReadOptions
& read_opts
);
88 // Similar to Get(), but searches the Memtable history of memtables that
89 // have already been flushed. Should only be used from in-memory only
90 // queries (such as Transaction validation) as the history may contain
91 // writes that are also present in the SST files.
92 bool GetFromHistory(const LookupKey
& key
, std::string
* value
,
93 std::string
* timestamp
, Status
* s
,
94 MergeContext
* merge_context
,
95 SequenceNumber
* max_covering_tombstone_seq
,
96 SequenceNumber
* seq
, const ReadOptions
& read_opts
,
97 bool* is_blob_index
= nullptr);
// Convenience overload that discards the found sequence number.
98 bool GetFromHistory(const LookupKey
& key
, std::string
* value
,
99 std::string
* timestamp
, Status
* s
,
100 MergeContext
* merge_context
,
101 SequenceNumber
* max_covering_tombstone_seq
,
102 const ReadOptions
& read_opts
,
103 bool* is_blob_index
= nullptr) {
105 return GetFromHistory(key
, value
, timestamp
, s
, merge_context
,
106 max_covering_tombstone_seq
, &seq
, read_opts
,
// NOTE(review): body truncated in extraction (missing closing arguments
// and brace) — confirm against the upstream header.
110 Status
AddRangeTombstoneIterators(const ReadOptions
& read_opts
, Arena
* arena
,
111 RangeDelAggregator
* range_del_agg
);
113 void AddIterators(const ReadOptions
& options
,
114 std::vector
<InternalIterator
*>* iterator_list
,
// NOTE(review): the remaining parameter(s) of this overload were dropped in
// extraction (the declaration ends with a comma).
117 void AddIterators(const ReadOptions
& options
,
118 MergeIteratorBuilder
* merge_iter_builder
);
120 uint64_t GetTotalNumEntries() const;
122 uint64_t GetTotalNumDeletes() const;
124 MemTable::MemTableStats
ApproximateStats(const Slice
& start_ikey
,
125 const Slice
& end_ikey
);
127 // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
128 // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
129 // If include_history=true, will also search Memtables in MemTableList
// History().
131 SequenceNumber
GetEarliestSequenceNumber(bool include_history
= false) const;
134 friend class MemTableList
;
// Friend access lets the namespace-scope installer update this version's
// private lists when committing atomic flush results.
136 friend Status
InstallMemtableAtomicFlushResults(
137 const autovector
<MemTableList
*>* imm_lists
,
138 const autovector
<ColumnFamilyData
*>& cfds
,
139 const autovector
<const MutableCFOptions
*>& mutable_cf_options_list
,
140 const autovector
<const autovector
<MemTable
*>*>& mems_list
,
141 VersionSet
* vset
, InstrumentedMutex
* mu
,
142 const autovector
<FileMetaData
*>& file_meta
,
143 autovector
<MemTable
*>* to_delete
, FSDirectory
* db_directory
,
144 LogBuffer
* log_buffer
);
146 // REQUIRE: m is an immutable memtable
147 void Add(MemTable
* m
, autovector
<MemTable
*>* to_delete
);
148 // REQUIRE: m is an immutable memtable
149 void Remove(MemTable
* m
, autovector
<MemTable
*>* to_delete
);
151 // Return true if memtable is trimmed
152 bool TrimHistory(autovector
<MemTable
*>* to_delete
, size_t usage
);
// NOTE(review): presumably the shared lookup helper behind Get() and
// GetFromHistory(), scanning the given list newest-to-oldest — confirm in
// memtable_list.cc.
154 bool GetFromList(std::list
<MemTable
*>* list
, const LookupKey
& key
,
155 std::string
* value
, std::string
* timestamp
, Status
* s
,
156 MergeContext
* merge_context
,
157 SequenceNumber
* max_covering_tombstone_seq
,
158 SequenceNumber
* seq
, const ReadOptions
& read_opts
,
159 ReadCallback
* callback
= nullptr,
160 bool* is_blob_index
= nullptr);
162 void AddMemTable(MemTable
* m
);
164 void UnrefMemTable(autovector
<MemTable
*>* to_delete
, MemTable
* m
);
166 // Calculate the total amount of memory used by memlist_ and memlist_history_
167 // excluding the last MemTable in memlist_history_. The reason for excluding
168 // the last MemTable is to see if dropping the last MemTable will keep total
169 // memory usage above or equal to max_write_buffer_size_to_maintain_
170 size_t ApproximateMemoryUsageExcludingLast() const;
172 // Whether this version contains flushed memtables that are only kept around
173 // for transaction conflict checking.
174 bool HasHistory() const { return !memlist_history_
.empty(); }
// NOTE(review): presumably returns true when total memtable memory plus
// `usage` exceeds the configured maintain limits — confirm in
// memtable_list.cc.
176 bool MemtableLimitExceeded(size_t usage
);
178 // Immutable MemTables that have not yet been flushed.
179 std::list
<MemTable
*> memlist_
;
181 // MemTables that have already been flushed
182 // (used during Transaction validation)
183 std::list
<MemTable
*> memlist_history_
;
185 // Maximum number of MemTables to keep in memory (including both flushed
// and not-yet-flushed tables).
186 const int max_write_buffer_number_to_maintain_
;
187 // Maximum size of MemTables to keep in memory (including both flushed
188 // and not-yet-flushed tables).
189 const int64_t max_write_buffer_size_to_maintain_
;
// NOTE(review): points back at the owning MemTableList's
// current_memory_usage_ (the MemTableList constructor passes its address)
// so this version can keep the parent's total up to date — update sites are
// in the .cc file.
193 size_t* parent_memtable_list_memory_usage_
;
196 // This class stores references to all the immutable memtables.
197 // The memtables are flushed to L0 as soon as possible and in
198 // any order. If there are more than one immutable memtable, their
199 // flushes can occur concurrently. However, they are 'committed'
200 // to the manifest in FIFO order to maintain correctness and
201 // recoverability from a crash.
204 // Other than imm_flush_needed and imm_trim_needed, this class is not
205 // thread-safe and requires external synchronization (such as holding the db
206 // mutex or being on the write thread.)
209 // A list of memtables.
210 explicit MemTableList(int min_write_buffer_number_to_merge
,
211 int max_write_buffer_number_to_maintain
,
212 int64_t max_write_buffer_size_to_maintain
)
213 : imm_flush_needed(false),
214 imm_trim_needed(false),
215 min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge
),
216 current_(new MemTableListVersion(¤t_memory_usage_
,
217 max_write_buffer_number_to_maintain
,
218 max_write_buffer_size_to_maintain
)),
219 num_flush_not_started_(0),
220 commit_in_progress_(false),
221 flush_requested_(false),
222 current_memory_usage_(0),
223 current_memory_usage_excluding_last_(0),
224 current_has_history_(false) {
228 // Should not delete MemTableList without making sure MemTableList::current()
232 MemTableListVersion
* current() const { return current_
; }
// Atomic so that background threads may poll without holding the DB mutex;
234 // so that background threads can detect non-nullptr pointer to
235 // determine whether there is anything more to start flushing.
236 std::atomic
<bool> imm_flush_needed
;
// Set when the memtable history should be trimmed; see MarkTrimHistoryNeeded().
238 std::atomic
<bool> imm_trim_needed
;
240 // Returns the total number of memtables in the list that haven't yet
241 // been flushed and logged.
242 int NumNotFlushed() const;
244 // Returns total number of memtables in the list that have been
245 // completely flushed and logged.
246 int NumFlushed() const;
248 // Returns true if there is at least one memtable on which flush has
// not yet started.
250 bool IsFlushPending() const;
252 // Returns the earliest memtables that needs to be flushed. The returned
253 // memtables are guaranteed to be in the ascending order of created time.
254 void PickMemtablesToFlush(const uint64_t* max_memtable_id
,
255 autovector
<MemTable
*>* mems
);
257 // Reset status of the given memtable list back to pending state so that
258 // they can get picked up again on the next round of flush.
259 void RollbackMemtableFlush(const autovector
<MemTable
*>& mems
,
260 uint64_t file_number
);
262 // Try commit a successful flush in the manifest file. It might just return
263 // Status::OK letting a concurrent flush to do the actual the recording.
264 Status
TryInstallMemtableFlushResults(
265 ColumnFamilyData
* cfd
, const MutableCFOptions
& mutable_cf_options
,
266 const autovector
<MemTable
*>& m
, LogsWithPrepTracker
* prep_tracker
,
267 VersionSet
* vset
, InstrumentedMutex
* mu
, uint64_t file_number
,
268 autovector
<MemTable
*>* to_delete
, FSDirectory
* db_directory
,
269 LogBuffer
* log_buffer
,
270 std::list
<std::unique_ptr
<FlushJobInfo
>>* committed_flush_jobs_info
,
// NOTE(review): the declaration above is truncated (it ends with a comma);
// its trailing parameter(s) and closing ");" were lost in extraction —
// confirm against the upstream header.
273 // New memtables are inserted at the front of the list.
274 // Takes ownership of the referenced held on *m by the caller of Add().
275 void Add(MemTable
* m
, autovector
<MemTable
*>* to_delete
);
277 // Returns an estimate of the number of bytes of data in use.
278 size_t ApproximateMemoryUsage();
280 // Returns the cached current_memory_usage_excluding_last_ value.
281 size_t ApproximateMemoryUsageExcludingLast() const;
283 // Returns the cached current_has_history_ value.
284 bool HasHistory() const;
286 // Updates current_memory_usage_excluding_last_ and current_has_history_
287 // from MemTableListVersion. Must be called whenever InstallNewVersion is
// called.
289 void UpdateCachedValuesFromMemTableListVersion();
291 // `usage` is the current size of the mutable Memtable. When
292 // max_write_buffer_size_to_maintain is used, total size of mutable and
293 // immutable memtables is checked against it to decide whether to trim
// the memtable history.
296 // Return true if memtable is trimmed
297 bool TrimHistory(autovector
<MemTable
*>* to_delete
, size_t usage
);
299 // Returns an estimate of the number of bytes of data used by
300 // the unflushed mem-tables.
301 size_t ApproximateUnflushedMemTablesMemoryUsage();
303 // Returns an estimate of the timestamp of the earliest key.
304 uint64_t ApproximateOldestKeyTime() const;
306 // Request a flush of all existing memtables to storage. This will
307 // cause future calls to IsFlushPending() to return true if this list is
308 // non-empty (regardless of the min_write_buffer_number_to_merge
309 // parameter). This flush request will persist until the next time
310 // PickMemtablesToFlush() is called.
311 void FlushRequested() { flush_requested_
= true; }
313 bool HasFlushRequested() { return flush_requested_
; }
315 // Returns true if a trim history should be scheduled and the caller should
316 // be the one to schedule it
317 bool MarkTrimHistoryNeeded() {
318 auto expected
= false;
319 return imm_trim_needed
.compare_exchange_strong(
320 expected
, true, std::memory_order_relaxed
, std::memory_order_relaxed
);
323 void ResetTrimHistoryNeeded() {
324 auto expected
= true;
325 imm_trim_needed
.compare_exchange_strong(
326 expected
, false, std::memory_order_relaxed
, std::memory_order_relaxed
);
330 // MemTableList(const MemTableList&);
331 // void operator=(const MemTableList&);
333 size_t* current_memory_usage() { return ¤t_memory_usage_
; }
335 // Returns the min log containing the prep section after memtables listed in
336 // `memtables_to_flush` are flushed and their status is persisted in manifest.
337 uint64_t PrecomputeMinLogContainingPrepSection(
338 const autovector
<MemTable
*>& memtables_to_flush
);
340 uint64_t GetEarliestMemTableID() const {
341 auto& memlist
= current_
->memlist_
;
342 if (memlist
.empty()) {
343 return std::numeric_limits
<uint64_t>::max();
345 return memlist
.back()->GetID();
348 uint64_t GetLatestMemTableID() const {
349 auto& memlist
= current_
->memlist_
;
350 if (memlist
.empty()) {
353 return memlist
.front()->GetID();
356 void AssignAtomicFlushSeq(const SequenceNumber
& seq
) {
357 const auto& memlist
= current_
->memlist_
;
358 // Scan the memtable list from new to old
359 for (auto it
= memlist
.begin(); it
!= memlist
.end(); ++it
) {
361 if (mem
->atomic_flush_seqno_
== kMaxSequenceNumber
) {
362 mem
->atomic_flush_seqno_
= seq
;
364 // Earlier memtables must have been assigned a atomic flush seq, no
365 // need to continue scan.
371 // Used only by DBImplSecondary during log replay.
372 // Remove memtables whose data were written before the WAL with log_number
373 // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
374 // not freed, but put into a vector for future deref and reclamation.
375 void RemoveOldMemTables(uint64_t log_number
,
376 autovector
<MemTable
*>* to_delete
);
// Friend access lets the namespace-scope installer (declared after this
// class) manipulate private state when committing atomic flush results
// across multiple lists.
379 friend Status
InstallMemtableAtomicFlushResults(
380 const autovector
<MemTableList
*>* imm_lists
,
381 const autovector
<ColumnFamilyData
*>& cfds
,
382 const autovector
<const MutableCFOptions
*>& mutable_cf_options_list
,
383 const autovector
<const autovector
<MemTable
*>*>& mems_list
,
384 VersionSet
* vset
, InstrumentedMutex
* mu
,
385 const autovector
<FileMetaData
*>& file_meta
,
386 autovector
<MemTable
*>* to_delete
, FSDirectory
* db_directory
,
387 LogBuffer
* log_buffer
);
// Replaces current_ with a new MemTableListVersion; defined in
// memtable_list.cc.
390 void InstallNewVersion();
393 // Called after writing to MANIFEST
394 void RemoveMemTablesOrRestoreFlags(const Status
& s
, ColumnFamilyData
* cfd
,
395 size_t batch_count
, LogBuffer
* log_buffer
,
396 autovector
<MemTable
*>* to_delete
,
397 InstrumentedMutex
* mu
);
// Immutable after construction; see the constructor parameter of the same
// name.
399 const int min_write_buffer_number_to_merge_
;
// The current, reference-counted version of the immutable-memtable list.
401 MemTableListVersion
* current_
;
403 // the number of elements that still need flushing
404 int num_flush_not_started_
;
406 // committing in progress
407 bool commit_in_progress_
;
409 // Requested a flush of memtables to storage. It's possible to request that
410 // a subset of memtables be flushed.
411 bool flush_requested_
;
413 // The current memory usage.
414 size_t current_memory_usage_
;
416 // Cached value of current_->ApproximateMemoryUsageExcludingLast().
417 std::atomic
<size_t> current_memory_usage_excluding_last_
;
419 // Cached value of current_->HasHistory().
420 std::atomic
<bool> current_has_history_
;
423 // Installs memtable atomic flush results.
424 // In most cases, imm_lists is nullptr, and the function simply uses the
425 // immutable memtable lists associated with the cfds. There are unit tests that
426 // install flush results for external immutable memtable lists other than the
427 // cfds' own immutable memtable lists, e.g. MemTableListTest. In this case,
428 // imm_lists parameter is not nullptr.
429 extern Status
InstallMemtableAtomicFlushResults(
430 const autovector
<MemTableList
*>* imm_lists
,
431 const autovector
<ColumnFamilyData
*>& cfds
,
432 const autovector
<const MutableCFOptions
*>& mutable_cf_options_list
,
433 const autovector
<const autovector
<MemTable
*>*>& mems_list
, VersionSet
* vset
,
434 InstrumentedMutex
* mu
, const autovector
<FileMetaData
*>& file_meta
,
435 autovector
<MemTable
*>* to_delete
, FSDirectory
* db_directory
,
436 LogBuffer
* log_buffer
);
437 } // namespace ROCKSDB_NAMESPACE