// Source: ceph.git — ceph/src/rocksdb/db/memtable_list.h
// (git blame export; commit: "update ceph source to reef 18.1.2")
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6#pragma once
7
11fdf7f2
TL
8#include <deque>
9#include <limits>
7c673cae 10#include <list>
7c673cae 11#include <set>
11fdf7f2
TL
12#include <string>
13#include <vector>
7c673cae 14
11fdf7f2 15#include "db/logs_with_prep_tracker.h"
7c673cae
FG
16#include "db/memtable.h"
17#include "db/range_del_aggregator.h"
f67539c2
TL
18#include "file/filename.h"
19#include "logging/log_buffer.h"
7c673cae
FG
20#include "monitoring/instrumented_mutex.h"
21#include "rocksdb/db.h"
22#include "rocksdb/iterator.h"
23#include "rocksdb/options.h"
24#include "rocksdb/types.h"
25#include "util/autovector.h"
7c673cae 26
f67539c2 27namespace ROCKSDB_NAMESPACE {
7c673cae
FG
28
29class ColumnFamilyData;
30class InternalKeyComparator;
31class InstrumentedMutex;
32class MergeIteratorBuilder;
494da23a 33class MemTableList;
7c673cae 34
f67539c2
TL
35struct FlushJobInfo;
36
7c673cae
FG
// keeps a list of immutable memtables in a vector. the list is immutable
// if refcount is bigger than one. It is used as a state for Get() and
// Iterator code paths
//
// This class is not thread-safe. External synchronization is required
// (such as holding the db mutex or being on the write thread).
class MemTableListVersion {
 public:
  // Create a new version based on `old`, sharing the parent MemTableList's
  // memory-usage counter.
  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
                               const MemTableListVersion& old);
  // Create an empty version with the given history-retention limits.
  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
                               int max_write_buffer_number_to_maintain,
                               int64_t max_write_buffer_size_to_maintain);

  // Reference counting for this version object. On Unref(), memtables this
  // version no longer needs may be handed back via *to_delete so the caller
  // can dispose of them (see UnrefMemTable()).
  void Ref();
  void Unref(autovector<MemTable*>* to_delete = nullptr);

  // Search all the memtables starting from the most recent one.
  // Return the most recent value found, if any.
  //
  // If any operation was found for this key, its most recent sequence number
  // will be stored in *seq on success (regardless of whether true/false is
  // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr);

  // Convenience overload for callers that do not need the sequence number of
  // the most recent operation; forwards to the overload above.
  bool Get(const LookupKey& key, std::string* value,
           PinnableWideColumns* columns, std::string* timestamp, Status* s,
           MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq,
           const ReadOptions& read_opts, ReadCallback* callback = nullptr,
           bool* is_blob_index = nullptr) {
    SequenceNumber seq;
    return Get(key, value, columns, timestamp, s, merge_context,
               max_covering_tombstone_seq, &seq, read_opts, callback,
               is_blob_index);
  }

  // Batched lookup over the keys in *range against this version's memtables.
  void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                ReadCallback* callback);

  // Returns all the merge operands corresponding to the key by searching all
  // memtables starting from the most recent one.
  bool GetMergeOperands(const LookupKey& key, Status* s,
                        MergeContext* merge_context,
                        SequenceNumber* max_covering_tombstone_seq,
                        const ReadOptions& read_opts);

  // Similar to Get(), but searches the Memtable history of memtables that
  // have already been flushed. Should only be used from in-memory only
  // queries (such as Transaction validation) as the history may contain
  // writes that are also present in the SST files.
  bool GetFromHistory(const LookupKey& key, std::string* value,
                      PinnableWideColumns* columns, std::string* timestamp,
                      Status* s, MergeContext* merge_context,
                      SequenceNumber* max_covering_tombstone_seq,
                      SequenceNumber* seq, const ReadOptions& read_opts,
                      bool* is_blob_index = nullptr);
  // Overload without the *seq out-parameter; forwards to the one above.
  bool GetFromHistory(const LookupKey& key, std::string* value,
                      PinnableWideColumns* columns, std::string* timestamp,
                      Status* s, MergeContext* merge_context,
                      SequenceNumber* max_covering_tombstone_seq,
                      const ReadOptions& read_opts,
                      bool* is_blob_index = nullptr) {
    SequenceNumber seq;
    return GetFromHistory(key, value, columns, timestamp, s, merge_context,
                          max_covering_tombstone_seq, &seq, read_opts,
                          is_blob_index);
  }

  // Collect range-deletion (tombstone) iterators from this version's
  // memtables into *range_del_agg.
  Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
                                    RangeDelAggregator* range_del_agg);

  // Append one internal iterator per memtable to *iterator_list.
  void AddIterators(const ReadOptions& options,
                    std::vector<InternalIterator*>* iterator_list,
                    Arena* arena);

  // Register this version's memtable iterators with a merging-iterator
  // builder (optionally including range-tombstone iterators).
  void AddIterators(const ReadOptions& options,
                    MergeIteratorBuilder* merge_iter_builder,
                    bool add_range_tombstone_iter);

  // Aggregate entry count across the memtables tracked by this version.
  uint64_t GetTotalNumEntries() const;

  // Aggregate deletion count across the memtables tracked by this version.
  uint64_t GetTotalNumDeletes() const;

  // Combined approximate stats for the given internal-key range.
  MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
                                           const Slice& end_ikey);

  // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
  // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
  // If include_history=true, will also search Memtables in MemTableList
  // History.
  SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;

  // Return the first sequence number from the memtable list, which is the
  // smallest sequence number of all FirstSequenceNumber.
  // Return kMaxSequenceNumber if the list is empty.
  SequenceNumber GetFirstSequenceNumber() const;

 private:
  friend class MemTableList;

  friend Status InstallMemtableAtomicFlushResults(
      const autovector<MemTableList*>* imm_lists,
      const autovector<ColumnFamilyData*>& cfds,
      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
      const autovector<const autovector<MemTable*>*>& mems_list,
      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
          committed_flush_jobs_info,
      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
      LogBuffer* log_buffer);

  // REQUIRE: m is an immutable memtable
  void Add(MemTable* m, autovector<MemTable*>* to_delete);
  // REQUIRE: m is an immutable memtable
  void Remove(MemTable* m, autovector<MemTable*>* to_delete);

  // Return true if memtable is trimmed
  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);

  // Shared lookup helper behind Get()/GetFromHistory(): searches `list`
  // starting from the most recent memtable.
  bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
                   std::string* value, PinnableWideColumns* columns,
                   std::string* timestamp, Status* s,
                   MergeContext* merge_context,
                   SequenceNumber* max_covering_tombstone_seq,
                   SequenceNumber* seq, const ReadOptions& read_opts,
                   ReadCallback* callback = nullptr,
                   bool* is_blob_index = nullptr);

  void AddMemTable(MemTable* m);

  void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);

  // Calculate the total amount of memory used by memlist_ and memlist_history_
  // excluding the last MemTable in memlist_history_. The reason for excluding
  // the last MemTable is to see if dropping the last MemTable will keep total
  // memory usage above or equal to max_write_buffer_size_to_maintain_
  size_t MemoryAllocatedBytesExcludingLast() const;

  // Whether this version contains flushed memtables that are only kept around
  // for transaction conflict checking.
  bool HasHistory() const { return !memlist_history_.empty(); }

  // Whether the configured write-buffer-maintain limits are exceeded, given
  // `usage` (the current size of the mutable memtable — see
  // MemTableList::TrimHistory()).
  bool MemtableLimitExceeded(size_t usage);

  // Immutable MemTables that have not yet been flushed.
  std::list<MemTable*> memlist_;

  // MemTables that have already been flushed
  // (used during Transaction validation)
  std::list<MemTable*> memlist_history_;

  // Maximum number of MemTables to keep in memory (including both flushed
  // and not-yet-flushed tables).
  const int max_write_buffer_number_to_maintain_;
  // Maximum size of MemTables to keep in memory (including both flushed
  // and not-yet-flushed tables).
  const int64_t max_write_buffer_size_to_maintain_;

  // Reference count for this version object (not the individual memtables).
  int refs_ = 0;

  // Shared memory-usage counter owned by the parent MemTableList
  // (it is constructed with &MemTableList::current_memory_usage_).
  size_t* parent_memtable_list_memory_usage_;
};
205
// This class stores references to all the immutable memtables.
// The memtables are flushed to L0 as soon as possible and in
// any order. If there are more than one immutable memtable, their
// flushes can occur concurrently. However, they are 'committed'
// to the manifest in FIFO order to maintain correctness and
// recoverability from a crash.
//
//
// Other than imm_flush_needed and imm_trim_needed, this class is not
// thread-safe and requires external synchronization (such as holding the db
// mutex or being on the write thread.)
class MemTableList {
 public:
  // A list of memtables.
  explicit MemTableList(int min_write_buffer_number_to_merge,
                        int max_write_buffer_number_to_maintain,
                        int64_t max_write_buffer_size_to_maintain)
      : imm_flush_needed(false),
        imm_trim_needed(false),
        min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
        current_(new MemTableListVersion(&current_memory_usage_,
                                         max_write_buffer_number_to_maintain,
                                         max_write_buffer_size_to_maintain)),
        num_flush_not_started_(0),
        commit_in_progress_(false),
        flush_requested_(false),
        current_memory_usage_(0),
        current_memory_allocted_bytes_excluding_last_(0),
        current_has_history_(false) {
    current_->Ref();
  }

  // Should not delete MemTableList without making sure MemTableList::current()
  // is Unref()'d.
  ~MemTableList() {}

  MemTableListVersion* current() const { return current_; }

  // so that background threads can detect non-nullptr pointer to
  // determine whether there is anything more to start flushing.
  std::atomic<bool> imm_flush_needed;

  // Set when the memtable history may need trimming; manipulated via
  // MarkTrimHistoryNeeded()/ResetTrimHistoryNeeded() below.
  std::atomic<bool> imm_trim_needed;

  // Returns the total number of memtables in the list that haven't yet
  // been flushed and logged.
  int NumNotFlushed() const;

  // Returns total number of memtables in the list that have been
  // completely flushed and logged.
  int NumFlushed() const;

  // Returns true if there is at least one memtable on which flush has
  // not yet started.
  bool IsFlushPending() const;

  // Returns true if there is at least one memtable that is pending flush or
  // flushing.
  bool IsFlushPendingOrRunning() const;

  // Returns the earliest memtables that needs to be flushed. The returned
  // memtables are guaranteed to be in the ascending order of created time.
  void PickMemtablesToFlush(uint64_t max_memtable_id,
                            autovector<MemTable*>* mems,
                            uint64_t* max_next_log_number = nullptr);

  // Reset status of the given memtable list back to pending state so that
  // they can get picked up again on the next round of flush.
  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
                             uint64_t file_number);

  // Try commit a successful flush in the manifest file. It might just return
  // Status::OK letting a concurrent flush to do the actual the recording.
  Status TryInstallMemtableFlushResults(
      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
      const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
      VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
      LogBuffer* log_buffer,
      std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
      bool write_edits = true);

  // New memtables are inserted at the front of the list.
  // Takes ownership of the referenced held on *m by the caller of Add().
  // By default, adding memtables will flag that the memtable list needs to be
  // flushed, but in certain situations, like after a mempurge, we may want to
  // avoid flushing the memtable list upon addition of a memtable.
  void Add(MemTable* m, autovector<MemTable*>* to_delete);

  // Returns an estimate of the number of bytes of data in use.
  size_t ApproximateMemoryUsage();

  // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
  size_t MemoryAllocatedBytesExcludingLast() const;

  // Returns the cached current_has_history_ value.
  bool HasHistory() const;

  // Updates current_memory_allocted_bytes_excluding_last_ and
  // current_has_history_ from MemTableListVersion. Must be called whenever
  // InstallNewVersion is called.
  void UpdateCachedValuesFromMemTableListVersion();

  // `usage` is the current size of the mutable Memtable. When
  // max_write_buffer_size_to_maintain is used, total size of mutable and
  // immutable memtables is checked against it to decide whether to trim
  // memtable list.
  //
  // Return true if memtable is trimmed
  bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);

  // Returns an estimate of the number of bytes of data used by
  // the unflushed mem-tables.
  size_t ApproximateUnflushedMemTablesMemoryUsage();

  // Returns an estimate of the timestamp of the earliest key.
  uint64_t ApproximateOldestKeyTime() const;

  // Request a flush of all existing memtables to storage. This will
  // cause future calls to IsFlushPending() to return true if this list is
  // non-empty (regardless of the min_write_buffer_number_to_merge
  // parameter). This flush request will persist until the next time
  // PickMemtablesToFlush() is called.
  void FlushRequested() {
    flush_requested_ = true;
    // If there are some memtables stored in imm() that don't trigger
    // flush (eg: mempurge output memtable), then update imm_flush_needed.
    // Note: if race condition and imm_flush_needed is set to true
    // when there is num_flush_not_started_==0, then there is no
    // impact whatsoever. Imm_flush_needed is only used in an assert
    // in IsFlushPending().
    if (num_flush_not_started_ > 0) {
      imm_flush_needed.store(true, std::memory_order_release);
    }
  }

  bool HasFlushRequested() { return flush_requested_; }

  // Returns true if a trim history should be scheduled and the caller should
  // be the one to schedule it
  bool MarkTrimHistoryNeeded() {
    auto expected = false;
    return imm_trim_needed.compare_exchange_strong(
        expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
  }

  void ResetTrimHistoryNeeded() {
    auto expected = true;
    imm_trim_needed.compare_exchange_strong(
        expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
  }

  // Copying allowed
  // MemTableList(const MemTableList&);
  // void operator=(const MemTableList&);
  // NOTE(review): the commented-out declarations above look like a disabled
  // copy interface, so "Copying allowed" may be a stale/inverted comment —
  // confirm against the implementation's intent.

  size_t* current_memory_usage() { return &current_memory_usage_; }

  // Returns the min log containing the prep section after memtables listed in
  // `memtables_to_flush` are flushed and their status is persisted in manifest.
  uint64_t PrecomputeMinLogContainingPrepSection(
      const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);

  // ID of the oldest unflushed memtable (back of memlist_), or
  // uint64_t max if there are no unflushed memtables.
  uint64_t GetEarliestMemTableID() const {
    auto& memlist = current_->memlist_;
    if (memlist.empty()) {
      return std::numeric_limits<uint64_t>::max();
    }
    return memlist.back()->GetID();
  }

  // ID of the newest unflushed memtable (front of memlist_), or 0 if there
  // are no unflushed memtables.
  uint64_t GetLatestMemTableID() const {
    auto& memlist = current_->memlist_;
    if (memlist.empty()) {
      return 0;
    }
    return memlist.front()->GetID();
  }

  // Assign `seq` as the atomic-flush sequence number to every memtable
  // (scanning newest to oldest) that does not yet have one; stops at the
  // first memtable that already has an assigned sequence number.
  void AssignAtomicFlushSeq(const SequenceNumber& seq) {
    const auto& memlist = current_->memlist_;
    // Scan the memtable list from new to old
    for (auto it = memlist.begin(); it != memlist.end(); ++it) {
      MemTable* mem = *it;
      if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
        mem->atomic_flush_seqno_ = seq;
      } else {
        // Earlier memtables must have been assigned a atomic flush seq, no
        // need to continue scan.
        break;
      }
    }
  }

  // Used only by DBImplSecondary during log replay.
  // Remove memtables whose data were written before the WAL with log_number
  // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
  // not freed, but put into a vector for future deref and reclamation.
  void RemoveOldMemTables(uint64_t log_number,
                          autovector<MemTable*>* to_delete);

 private:
  friend Status InstallMemtableAtomicFlushResults(
      const autovector<MemTableList*>* imm_lists,
      const autovector<ColumnFamilyData*>& cfds,
      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
      const autovector<const autovector<MemTable*>*>& mems_list,
      VersionSet* vset, LogsWithPrepTracker* prep_tracker,
      InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
      const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
          committed_flush_jobs_info,
      autovector<MemTable*>* to_delete, FSDirectory* db_directory,
      LogBuffer* log_buffer);

  // DB mutex held
  void InstallNewVersion();

  // DB mutex held
  // Called after writing to MANIFEST
  void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
                                     size_t batch_count, LogBuffer* log_buffer,
                                     autovector<MemTable*>* to_delete,
                                     InstrumentedMutex* mu);

  const int min_write_buffer_number_to_merge_;

  MemTableListVersion* current_;

  // the number of elements that still need flushing
  int num_flush_not_started_;

  // committing in progress
  bool commit_in_progress_;

  // Requested a flush of memtables to storage. It's possible to request that
  // a subset of memtables be flushed.
  bool flush_requested_;

  // The current memory usage.
  size_t current_memory_usage_;

  // Cached value of current_->MemoryAllocatedBytesExcludingLast().
  // (sic: "allocted" typo kept byte-identical — renaming would require
  // matching changes in the implementation file.)
  std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;

  // Cached value of current_->HasHistory().
  std::atomic<bool> current_has_history_;
};
453
494da23a
TL
// Installs memtable atomic flush results.
// In most cases, imm_lists is nullptr, and the function simply uses the
// immutable memtable lists associated with the cfds. There are unit tests
// that install flush results for external immutable memtable lists other
// than the cfds' own immutable memtable lists, e.g. MemTableListTest. In
// this case, the imm_lists parameter is not nullptr.
extern Status InstallMemtableAtomicFlushResults(
    const autovector<MemTableList*>* imm_lists,
    const autovector<ColumnFamilyData*>& cfds,
    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
    const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
    LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
    const autovector<FileMetaData*>& file_meta,
    const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
        committed_flush_jobs_info,
    autovector<MemTable*>* to_delete, FSDirectory* db_directory,
    LogBuffer* log_buffer);
f67539c2 471} // namespace ROCKSDB_NAMESPACE