1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
#pragma once

#include <atomic>
#include <limits>
#include <list>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "db/logs_with_prep_tracker.h"
#include "db/memtable.h"
#include "db/range_del_aggregator.h"
#include "monitoring/instrumented_mutex.h"
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/types.h"
#include "util/autovector.h"
#include "util/filename.h"
#include "util/log_buffer.h"
30 class ColumnFamilyData
;
31 class InternalKeyComparator
;
32 class InstrumentedMutex
;
33 class MergeIteratorBuilder
;
36 // keeps a list of immutable memtables in a vector. the list is immutable
37 // if refcount is bigger than one. It is used as a state for Get() and
38 // Iterator code paths
40 // This class is not thread-safe. External synchronization is required
41 // (such as holding the db mutex or being on the write thread).
42 class MemTableListVersion
{
44 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage
,
45 MemTableListVersion
* old
= nullptr);
46 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage
,
47 int max_write_buffer_number_to_maintain
);
50 void Unref(autovector
<MemTable
*>* to_delete
= nullptr);
52 // Search all the memtables starting from the most recent one.
53 // Return the most recent value found, if any.
55 // If any operation was found for this key, its most recent sequence number
56 // will be stored in *seq on success (regardless of whether true/false is
57 // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
58 bool Get(const LookupKey
& key
, std::string
* value
, Status
* s
,
59 MergeContext
* merge_context
,
60 SequenceNumber
* max_covering_tombstone_seq
, SequenceNumber
* seq
,
61 const ReadOptions
& read_opts
, ReadCallback
* callback
= nullptr,
62 bool* is_blob_index
= nullptr);
64 bool Get(const LookupKey
& key
, std::string
* value
, Status
* s
,
65 MergeContext
* merge_context
,
66 SequenceNumber
* max_covering_tombstone_seq
,
67 const ReadOptions
& read_opts
, ReadCallback
* callback
= nullptr,
68 bool* is_blob_index
= nullptr) {
70 return Get(key
, value
, s
, merge_context
, max_covering_tombstone_seq
, &seq
,
71 read_opts
, callback
, is_blob_index
);
74 // Similar to Get(), but searches the Memtable history of memtables that
75 // have already been flushed. Should only be used from in-memory only
76 // queries (such as Transaction validation) as the history may contain
77 // writes that are also present in the SST files.
78 bool GetFromHistory(const LookupKey
& key
, std::string
* value
, Status
* s
,
79 MergeContext
* merge_context
,
80 SequenceNumber
* max_covering_tombstone_seq
,
81 SequenceNumber
* seq
, const ReadOptions
& read_opts
,
82 bool* is_blob_index
= nullptr);
83 bool GetFromHistory(const LookupKey
& key
, std::string
* value
, Status
* s
,
84 MergeContext
* merge_context
,
85 SequenceNumber
* max_covering_tombstone_seq
,
86 const ReadOptions
& read_opts
,
87 bool* is_blob_index
= nullptr) {
89 return GetFromHistory(key
, value
, s
, merge_context
,
90 max_covering_tombstone_seq
, &seq
, read_opts
,
94 Status
AddRangeTombstoneIterators(const ReadOptions
& read_opts
, Arena
* arena
,
95 RangeDelAggregator
* range_del_agg
);
97 void AddIterators(const ReadOptions
& options
,
98 std::vector
<InternalIterator
*>* iterator_list
,
101 void AddIterators(const ReadOptions
& options
,
102 MergeIteratorBuilder
* merge_iter_builder
);
104 uint64_t GetTotalNumEntries() const;
106 uint64_t GetTotalNumDeletes() const;
108 MemTable::MemTableStats
ApproximateStats(const Slice
& start_ikey
,
109 const Slice
& end_ikey
);
111 // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
112 // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
113 // If include_history=true, will also search Memtables in MemTableList
115 SequenceNumber
GetEarliestSequenceNumber(bool include_history
= false) const;
118 friend class MemTableList
;
120 friend Status
InstallMemtableAtomicFlushResults(
121 const autovector
<MemTableList
*>* imm_lists
,
122 const autovector
<ColumnFamilyData
*>& cfds
,
123 const autovector
<const MutableCFOptions
*>& mutable_cf_options_list
,
124 const autovector
<const autovector
<MemTable
*>*>& mems_list
,
125 VersionSet
* vset
, InstrumentedMutex
* mu
,
126 const autovector
<FileMetaData
*>& file_meta
,
127 autovector
<MemTable
*>* to_delete
, Directory
* db_directory
,
128 LogBuffer
* log_buffer
);
130 // REQUIRE: m is an immutable memtable
131 void Add(MemTable
* m
, autovector
<MemTable
*>* to_delete
);
132 // REQUIRE: m is an immutable memtable
133 void Remove(MemTable
* m
, autovector
<MemTable
*>* to_delete
);
135 void TrimHistory(autovector
<MemTable
*>* to_delete
);
137 bool GetFromList(std::list
<MemTable
*>* list
, const LookupKey
& key
,
138 std::string
* value
, Status
* s
, MergeContext
* merge_context
,
139 SequenceNumber
* max_covering_tombstone_seq
,
140 SequenceNumber
* seq
, const ReadOptions
& read_opts
,
141 ReadCallback
* callback
= nullptr,
142 bool* is_blob_index
= nullptr);
144 void AddMemTable(MemTable
* m
);
146 void UnrefMemTable(autovector
<MemTable
*>* to_delete
, MemTable
* m
);
148 // Immutable MemTables that have not yet been flushed.
149 std::list
<MemTable
*> memlist_
;
151 // MemTables that have already been flushed
152 // (used during Transaction validation)
153 std::list
<MemTable
*> memlist_history_
;
155 // Maximum number of MemTables to keep in memory (including both flushed
156 // and not-yet-flushed tables).
157 const int max_write_buffer_number_to_maintain_
;
161 size_t* parent_memtable_list_memory_usage_
;
164 // This class stores references to all the immutable memtables.
165 // The memtables are flushed to L0 as soon as possible and in
166 // any order. If there are more than one immutable memtable, their
167 // flushes can occur concurrently. However, they are 'committed'
168 // to the manifest in FIFO order to maintain correctness and
169 // recoverability from a crash.
172 // Other than imm_flush_needed, this class is not thread-safe and requires
173 // external synchronization (such as holding the db mutex or being on the
177 // A list of memtables.
178 explicit MemTableList(int min_write_buffer_number_to_merge
,
179 int max_write_buffer_number_to_maintain
)
180 : imm_flush_needed(false),
181 min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge
),
182 current_(new MemTableListVersion(¤t_memory_usage_
,
183 max_write_buffer_number_to_maintain
)),
184 num_flush_not_started_(0),
185 commit_in_progress_(false),
186 flush_requested_(false) {
188 current_memory_usage_
= 0;
191 // Should not delete MemTableList without making sure MemTableList::current()
195 MemTableListVersion
* current() { return current_
; }
197 // so that background threads can detect non-nullptr pointer to
198 // determine whether there is anything more to start flushing.
199 std::atomic
<bool> imm_flush_needed
;
201 // Returns the total number of memtables in the list that haven't yet
202 // been flushed and logged.
203 int NumNotFlushed() const;
205 // Returns total number of memtables in the list that have been
206 // completely flushed and logged.
207 int NumFlushed() const;
209 // Returns true if there is at least one memtable on which flush has
211 bool IsFlushPending() const;
213 // Returns the earliest memtables that needs to be flushed. The returned
214 // memtables are guaranteed to be in the ascending order of created time.
215 void PickMemtablesToFlush(const uint64_t* max_memtable_id
,
216 autovector
<MemTable
*>* mems
);
218 // Reset status of the given memtable list back to pending state so that
219 // they can get picked up again on the next round of flush.
220 void RollbackMemtableFlush(const autovector
<MemTable
*>& mems
,
221 uint64_t file_number
);
223 // Try commit a successful flush in the manifest file. It might just return
224 // Status::OK letting a concurrent flush to do the actual the recording.
225 Status
TryInstallMemtableFlushResults(
226 ColumnFamilyData
* cfd
, const MutableCFOptions
& mutable_cf_options
,
227 const autovector
<MemTable
*>& m
, LogsWithPrepTracker
* prep_tracker
,
228 VersionSet
* vset
, InstrumentedMutex
* mu
, uint64_t file_number
,
229 autovector
<MemTable
*>* to_delete
, Directory
* db_directory
,
230 LogBuffer
* log_buffer
);
232 // New memtables are inserted at the front of the list.
233 // Takes ownership of the referenced held on *m by the caller of Add().
234 void Add(MemTable
* m
, autovector
<MemTable
*>* to_delete
);
236 // Returns an estimate of the number of bytes of data in use.
237 size_t ApproximateMemoryUsage();
239 // Returns an estimate of the number of bytes of data used by
240 // the unflushed mem-tables.
241 size_t ApproximateUnflushedMemTablesMemoryUsage();
243 // Returns an estimate of the timestamp of the earliest key.
244 uint64_t ApproximateOldestKeyTime() const;
246 // Request a flush of all existing memtables to storage. This will
247 // cause future calls to IsFlushPending() to return true if this list is
248 // non-empty (regardless of the min_write_buffer_number_to_merge
249 // parameter). This flush request will persist until the next time
250 // PickMemtablesToFlush() is called.
251 void FlushRequested() { flush_requested_
= true; }
253 bool HasFlushRequested() { return flush_requested_
; }
256 // MemTableList(const MemTableList&);
257 // void operator=(const MemTableList&);
259 size_t* current_memory_usage() { return ¤t_memory_usage_
; }
261 // Returns the min log containing the prep section after memtables listsed in
262 // `memtables_to_flush` are flushed and their status is persisted in manifest.
263 uint64_t PrecomputeMinLogContainingPrepSection(
264 const autovector
<MemTable
*>& memtables_to_flush
);
266 uint64_t GetEarliestMemTableID() const {
267 auto& memlist
= current_
->memlist_
;
268 if (memlist
.empty()) {
269 return std::numeric_limits
<uint64_t>::max();
271 return memlist
.back()->GetID();
274 uint64_t GetLatestMemTableID() const {
275 auto& memlist
= current_
->memlist_
;
276 if (memlist
.empty()) {
279 return memlist
.front()->GetID();
282 void AssignAtomicFlushSeq(const SequenceNumber
& seq
) {
283 const auto& memlist
= current_
->memlist_
;
284 // Scan the memtable list from new to old
285 for (auto it
= memlist
.begin(); it
!= memlist
.end(); ++it
) {
287 if (mem
->atomic_flush_seqno_
== kMaxSequenceNumber
) {
288 mem
->atomic_flush_seqno_
= seq
;
290 // Earlier memtables must have been assigned a atomic flush seq, no
291 // need to continue scan.
298 friend Status
InstallMemtableAtomicFlushResults(
299 const autovector
<MemTableList
*>* imm_lists
,
300 const autovector
<ColumnFamilyData
*>& cfds
,
301 const autovector
<const MutableCFOptions
*>& mutable_cf_options_list
,
302 const autovector
<const autovector
<MemTable
*>*>& mems_list
,
303 VersionSet
* vset
, InstrumentedMutex
* mu
,
304 const autovector
<FileMetaData
*>& file_meta
,
305 autovector
<MemTable
*>* to_delete
, Directory
* db_directory
,
306 LogBuffer
* log_buffer
);
309 void InstallNewVersion();
311 const int min_write_buffer_number_to_merge_
;
313 MemTableListVersion
* current_
;
315 // the number of elements that still need flushing
316 int num_flush_not_started_
;
318 // committing in progress
319 bool commit_in_progress_
;
321 // Requested a flush of memtables to storage. It's possible to request that
322 // a subset of memtables be flushed.
323 bool flush_requested_
;
325 // The current memory usage.
326 size_t current_memory_usage_
;
329 // Installs memtable atomic flush results.
330 // In most cases, imm_lists is nullptr, and the function simply uses the
331 // immutable memtable lists associated with the cfds. There are unit tests that
332 // installs flush results for external immutable memtable lists other than the
333 // cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case,
334 // imm_lists parameter is not nullptr.
335 extern Status
InstallMemtableAtomicFlushResults(
336 const autovector
<MemTableList
*>* imm_lists
,
337 const autovector
<ColumnFamilyData
*>& cfds
,
338 const autovector
<const MutableCFOptions
*>& mutable_cf_options_list
,
339 const autovector
<const autovector
<MemTable
*>*>& mems_list
, VersionSet
* vset
,
340 InstrumentedMutex
* mu
, const autovector
<FileMetaData
*>& file_meta
,
341 autovector
<MemTable
*>* to_delete
, Directory
* db_directory
,
342 LogBuffer
* log_buffer
);
343 } // namespace rocksdb