]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/db/memtable_list.h
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / db / memtable_list.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 #pragma once
7
8 #include <deque>
9 #include <limits>
10 #include <list>
11 #include <set>
12 #include <string>
13 #include <vector>
14
15 #include "db/dbformat.h"
16 #include "db/logs_with_prep_tracker.h"
17 #include "db/memtable.h"
18 #include "db/range_del_aggregator.h"
19 #include "monitoring/instrumented_mutex.h"
20 #include "rocksdb/db.h"
21 #include "rocksdb/iterator.h"
22 #include "rocksdb/options.h"
23 #include "rocksdb/types.h"
24 #include "util/autovector.h"
25 #include "util/filename.h"
26 #include "util/log_buffer.h"
27
28 namespace rocksdb {
29
30 class ColumnFamilyData;
31 class InternalKeyComparator;
32 class InstrumentedMutex;
33 class MergeIteratorBuilder;
34 class MemTableList;
35
36 // keeps a list of immutable memtables. the list is immutable
37 // if refcount is bigger than one. It is used as a state for Get() and
38 // Iterator code paths
39 //
40 // This class is not thread-safe. External synchronization is required
41 // (such as holding the db mutex or being on the write thread).
42 class MemTableListVersion {
43  public:
// Construct a new version, either by copying an existing one ("old") or
// fresh with a history-size limit. *parent_memtable_list_memory_usage is
// shared accounting owned by the parent MemTableList.
44 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
45 MemTableListVersion* old = nullptr);
46 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
47 int max_write_buffer_number_to_maintain);
48
// Reference counting for this version. Unref() may append memtables whose
// own refcount drops to zero to *to_delete (if non-null) so the caller can
// free them.
49 void Ref();
50 void Unref(autovector<MemTable*>* to_delete = nullptr);
51
52 // Search all the memtables starting from the most recent one.
53 // Return the most recent value found, if any.
54 //
55 // If any operation was found for this key, its most recent sequence number
56 // will be stored in *seq on success (regardless of whether true/false is
57 // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
58 bool Get(const LookupKey& key, std::string* value, Status* s,
59 MergeContext* merge_context,
60 SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
61 const ReadOptions& read_opts, ReadCallback* callback = nullptr,
62 bool* is_blob_index = nullptr);
63
// Convenience overload for callers that do not need the sequence number.
64 bool Get(const LookupKey& key, std::string* value, Status* s,
65 MergeContext* merge_context,
66 SequenceNumber* max_covering_tombstone_seq,
67 const ReadOptions& read_opts, ReadCallback* callback = nullptr,
68 bool* is_blob_index = nullptr) {
69 SequenceNumber seq;
70 return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq,
71 read_opts, callback, is_blob_index);
72 }
73
74 // Similar to Get(), but searches the Memtable history of memtables that
75 // have already been flushed. Should only be used from in-memory only
76 // queries (such as Transaction validation) as the history may contain
77 // writes that are also present in the SST files.
78 bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
79 MergeContext* merge_context,
80 SequenceNumber* max_covering_tombstone_seq,
81 SequenceNumber* seq, const ReadOptions& read_opts,
82 bool* is_blob_index = nullptr);
// Convenience overload for callers that do not need the sequence number.
83 bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
84 MergeContext* merge_context,
85 SequenceNumber* max_covering_tombstone_seq,
86 const ReadOptions& read_opts,
87 bool* is_blob_index = nullptr) {
88 SequenceNumber seq;
89 return GetFromHistory(key, value, s, merge_context,
90 max_covering_tombstone_seq, &seq, read_opts,
91 is_blob_index);
92 }
93
// Collects range-tombstone iterators from the unflushed memtables into
// *range_del_agg, allocating from *arena.
94 Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
95 RangeDelAggregator* range_del_agg);
96
// Appends one internal iterator per unflushed memtable to *iterator_list.
97 void AddIterators(const ReadOptions& options,
98 std::vector<InternalIterator*>* iterator_list,
99 Arena* arena);
100
// Same as above, but registers the iterators with a merging-iterator builder.
101 void AddIterators(const ReadOptions& options,
102 MergeIteratorBuilder* merge_iter_builder);
103
// Total number of entries across the unflushed memtables.
104 uint64_t GetTotalNumEntries() const;
105
// Total number of deletion entries across the unflushed memtables.
106 uint64_t GetTotalNumDeletes() const;
107
// Approximate stats for the internal-key range [start_ikey, end_ikey).
108 MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
109 const Slice& end_ikey);
110
111 // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
112 // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
113 // If include_history=true, will also search Memtables in MemTableList
114 // History.
115 SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
116
117 private:
118 friend class MemTableList;
119
120 friend Status InstallMemtableAtomicFlushResults(
121 const autovector<MemTableList*>* imm_lists,
122 const autovector<ColumnFamilyData*>& cfds,
123 const autovector<const MutableCFOptions*>& mutable_cf_options_list,
124 const autovector<const autovector<MemTable*>*>& mems_list,
125 VersionSet* vset, InstrumentedMutex* mu,
126 const autovector<FileMetaData*>& file_meta,
127 autovector<MemTable*>* to_delete, Directory* db_directory,
128 LogBuffer* log_buffer);
129
130 // REQUIRE: m is an immutable memtable
131 void Add(MemTable* m, autovector<MemTable*>* to_delete);
132 // REQUIRE: m is an immutable memtable
133 void Remove(MemTable* m, autovector<MemTable*>* to_delete);
134
// Presumably trims memlist_history_ down to the
// max_write_buffer_number_to_maintain_ budget, handing dropped memtables to
// *to_delete — confirm against memtable_list.cc.
135 void TrimHistory(autovector<MemTable*>* to_delete);
136
// Shared lookup helper used by Get()/GetFromHistory(); searches *list
// (most recent memtable first, per the Get() contract above).
137 bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
138 std::string* value, Status* s, MergeContext* merge_context,
139 SequenceNumber* max_covering_tombstone_seq,
140 SequenceNumber* seq, const ReadOptions& read_opts,
141 ReadCallback* callback = nullptr,
142 bool* is_blob_index = nullptr);
143
144 void AddMemTable(MemTable* m);
145
146 void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
147
148 // Immutable MemTables that have not yet been flushed.
149 std::list<MemTable*> memlist_;
150
151 // MemTables that have already been flushed
152 // (used during Transaction validation)
153 std::list<MemTable*> memlist_history_;
154
155 // Maximum number of MemTables to keep in memory (including both flushed
156 // and not-yet-flushed tables).
157 const int max_write_buffer_number_to_maintain_;
158
// Number of references to this version (see Ref()/Unref()).
159 int refs_ = 0;
160
// Points at the owning MemTableList's current_memory_usage_ (passed in by
// the MemTableList constructor) so shared memory accounting stays accurate.
161 size_t* parent_memtable_list_memory_usage_;
162 };
163
164 // This class stores references to all the immutable memtables.
165 // The memtables are flushed to L0 as soon as possible and in
166 // any order. If there are more than one immutable memtable, their
167 // flushes can occur concurrently. However, they are 'committed'
168 // to the manifest in FIFO order to maintain correctness and
169 // recoverability from a crash.
170 //
171 //
172 // Other than imm_flush_needed, this class is not thread-safe and requires
173 // external synchronization (such as holding the db mutex or being on the
174 // write thread.)
175 class MemTableList {
176  public:
177 // A list of memtables.
178 explicit MemTableList(int min_write_buffer_number_to_merge,
179 int max_write_buffer_number_to_maintain)
180 : imm_flush_needed(false),
181 min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
182 current_(new MemTableListVersion(&current_memory_usage_,
183 max_write_buffer_number_to_maintain)),
184 num_flush_not_started_(0),
185 commit_in_progress_(false),
186 flush_requested_(false) {
187 current_->Ref();
188 current_memory_usage_ = 0;
189 }
190
191 // Should not delete MemTableList without making sure MemTableList::current()
192 // is Unref()'d.
193 ~MemTableList() {}
194
// Returns the current (most recent) version of the immutable memtable list.
195 MemTableListVersion* current() { return current_; }
196
197 // so that background threads can detect non-nullptr pointer to
198 // determine whether there is anything more to start flushing.
199 std::atomic<bool> imm_flush_needed;
200
201 // Returns the total number of memtables in the list that haven't yet
202 // been flushed and logged.
203 int NumNotFlushed() const;
204
205 // Returns total number of memtables in the list that have been
206 // completely flushed and logged.
207 int NumFlushed() const;
208
209 // Returns true if there is at least one memtable on which flush has
210 // not yet started.
211 bool IsFlushPending() const;
212
213 // Returns the earliest memtables that needs to be flushed. The returned
214 // memtables are guaranteed to be in the ascending order of created time.
215 void PickMemtablesToFlush(const uint64_t* max_memtable_id,
216 autovector<MemTable*>* mems);
217
218 // Reset status of the given memtable list back to pending state so that
219 // they can get picked up again on the next round of flush.
220 void RollbackMemtableFlush(const autovector<MemTable*>& mems,
221 uint64_t file_number);
222
223 // Try commit a successful flush in the manifest file. It might just return
224 // Status::OK letting a concurrent flush do the actual recording.
225 Status TryInstallMemtableFlushResults(
226 ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
227 const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
228 VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
229 autovector<MemTable*>* to_delete, Directory* db_directory,
230 LogBuffer* log_buffer);
231
232 // New memtables are inserted at the front of the list.
233 // Takes ownership of the referenced held on *m by the caller of Add().
234 void Add(MemTable* m, autovector<MemTable*>* to_delete);
235
236 // Returns an estimate of the number of bytes of data in use.
237 size_t ApproximateMemoryUsage();
238
239 // Returns an estimate of the number of bytes of data used by
240 // the unflushed mem-tables.
241 size_t ApproximateUnflushedMemTablesMemoryUsage();
242
243 // Returns an estimate of the timestamp of the earliest key.
244 uint64_t ApproximateOldestKeyTime() const;
245
246 // Request a flush of all existing memtables to storage. This will
247 // cause future calls to IsFlushPending() to return true if this list is
248 // non-empty (regardless of the min_write_buffer_number_to_merge
249 // parameter). This flush request will persist until the next time
250 // PickMemtablesToFlush() is called.
251 void FlushRequested() { flush_requested_ = true; }
252
253 bool HasFlushRequested() { return flush_requested_; }
254
255 // Copying allowed
256 // MemTableList(const MemTableList&);
257 // void operator=(const MemTableList&);
258
// Exposes the shared memory-usage counter (also referenced by every
// MemTableListVersion created by this list).
259 size_t* current_memory_usage() { return &current_memory_usage_; }
260
261 // Returns the min log containing the prep section after memtables listed in
262 // `memtables_to_flush` are flushed and their status is persisted in manifest.
263 uint64_t PrecomputeMinLogContainingPrepSection(
264 const autovector<MemTable*>& memtables_to_flush);
265
// Returns the ID of the oldest unflushed memtable (the back of memlist_,
// which is ordered new-to-old), or UINT64_MAX if there is none.
266 uint64_t GetEarliestMemTableID() const {
267 auto& memlist = current_->memlist_;
268 if (memlist.empty()) {
269 return std::numeric_limits<uint64_t>::max();
270 }
271 return memlist.back()->GetID();
272 }
273
// Returns the ID of the most recent unflushed memtable (the front of
// memlist_), or 0 if there is none.
274 uint64_t GetLatestMemTableID() const {
275 auto& memlist = current_->memlist_;
276 if (memlist.empty()) {
277 return 0;
278 }
279 return memlist.front()->GetID();
280 }
281
// Assigns seq as the atomic flush sequence number to every memtable that
// does not yet have one, scanning newest-to-oldest and stopping at the
// first memtable that is already assigned.
282 void AssignAtomicFlushSeq(const SequenceNumber& seq) {
283 const auto& memlist = current_->memlist_;
284 // Scan the memtable list from new to old
285 for (auto it = memlist.begin(); it != memlist.end(); ++it) {
286 MemTable* mem = *it;
287 if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
288 mem->atomic_flush_seqno_ = seq;
289 } else {
290 // Earlier memtables must have been assigned an atomic flush seq, no
291 // need to continue scan.
292 break;
293 }
294 }
295 }
296
297 private:
298 friend Status InstallMemtableAtomicFlushResults(
299 const autovector<MemTableList*>* imm_lists,
300 const autovector<ColumnFamilyData*>& cfds,
301 const autovector<const MutableCFOptions*>& mutable_cf_options_list,
302 const autovector<const autovector<MemTable*>*>& mems_list,
303 VersionSet* vset, InstrumentedMutex* mu,
304 const autovector<FileMetaData*>& file_meta,
305 autovector<MemTable*>* to_delete, Directory* db_directory,
306 LogBuffer* log_buffer);
307
308 // DB mutex held
309 void InstallNewVersion();
310
// Set at construction from min_write_buffer_number_to_merge.
311 const int min_write_buffer_number_to_merge_;
312
// The current (latest) version; Ref()'d in the constructor.
313 MemTableListVersion* current_;
314
315 // the number of elements that still need flushing
316 int num_flush_not_started_;
317
318 // committing in progress
319 bool commit_in_progress_;
320
321 // Requested a flush of memtables to storage. It's possible to request that
322 // a subset of memtables be flushed.
323 bool flush_requested_;
324
325 // The current memory usage.
326 size_t current_memory_usage_;
327 };
328
329 // Installs memtable atomic flush results.
330 // In most cases, imm_lists is nullptr, and the function simply uses the
331 // immutable memtable lists associated with the cfds. There are unit tests that
332 // install flush results for external immutable memtable lists other than the
333 // cfds' own immutable memtable lists, e.g. MemTableListTest. In this case,
334 // the imm_lists parameter is not nullptr.
335 extern Status InstallMemtableAtomicFlushResults(
336 const autovector<MemTableList*>* imm_lists,
337 const autovector<ColumnFamilyData*>& cfds,
338 const autovector<const MutableCFOptions*>& mutable_cf_options_list,
339 const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
340 InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
341 autovector<MemTable*>* to_delete, Directory* db_directory,
342 LogBuffer* log_buffer);
343 } // namespace rocksdb