]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/memtable_list.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / db / memtable_list.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6#pragma once
7
11fdf7f2
TL
8#include <deque>
9#include <limits>
7c673cae 10#include <list>
7c673cae 11#include <set>
11fdf7f2
TL
12#include <string>
13#include <vector>
7c673cae
FG
14
15#include "db/dbformat.h"
11fdf7f2 16#include "db/logs_with_prep_tracker.h"
7c673cae
FG
17#include "db/memtable.h"
18#include "db/range_del_aggregator.h"
19#include "monitoring/instrumented_mutex.h"
20#include "rocksdb/db.h"
21#include "rocksdb/iterator.h"
22#include "rocksdb/options.h"
23#include "rocksdb/types.h"
24#include "util/autovector.h"
25#include "util/filename.h"
26#include "util/log_buffer.h"
27
28namespace rocksdb {
29
30class ColumnFamilyData;
31class InternalKeyComparator;
32class InstrumentedMutex;
33class MergeIteratorBuilder;
34
35// keeps a list of immutable memtables in a vector. the list is immutable
36// if refcount is bigger than one. It is used as a state for Get() and
37// Iterator code paths
38//
39// This class is not thread-safe. External synchronization is required
40// (such as holding the db mutex or being on the write thread).
41class MemTableListVersion {
42 public:
43 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
44 MemTableListVersion* old = nullptr);
45 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
46 int max_write_buffer_number_to_maintain);
47
48 void Ref();
49 void Unref(autovector<MemTable*>* to_delete = nullptr);
50
51 // Search all the memtables starting from the most recent one.
52 // Return the most recent value found, if any.
53 //
54 // If any operation was found for this key, its most recent sequence number
55 // will be stored in *seq on success (regardless of whether true/false is
56 // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
57 bool Get(const LookupKey& key, std::string* value, Status* s,
58 MergeContext* merge_context, RangeDelAggregator* range_del_agg,
11fdf7f2
TL
59 SequenceNumber* seq, const ReadOptions& read_opts,
60 ReadCallback* callback = nullptr, bool* is_blob_index = nullptr);
7c673cae
FG
61
62 bool Get(const LookupKey& key, std::string* value, Status* s,
63 MergeContext* merge_context, RangeDelAggregator* range_del_agg,
11fdf7f2
TL
64 const ReadOptions& read_opts, ReadCallback* callback = nullptr,
65 bool* is_blob_index = nullptr) {
7c673cae 66 SequenceNumber seq;
11fdf7f2
TL
67 return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts,
68 callback, is_blob_index);
7c673cae
FG
69 }
70
71 // Similar to Get(), but searches the Memtable history of memtables that
72 // have already been flushed. Should only be used from in-memory only
73 // queries (such as Transaction validation) as the history may contain
74 // writes that are also present in the SST files.
75 bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
76 MergeContext* merge_context,
77 RangeDelAggregator* range_del_agg, SequenceNumber* seq,
11fdf7f2
TL
78 const ReadOptions& read_opts,
79 bool* is_blob_index = nullptr);
7c673cae
FG
80 bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
81 MergeContext* merge_context,
82 RangeDelAggregator* range_del_agg,
11fdf7f2
TL
83 const ReadOptions& read_opts,
84 bool* is_blob_index = nullptr) {
7c673cae
FG
85 SequenceNumber seq;
86 return GetFromHistory(key, value, s, merge_context, range_del_agg, &seq,
11fdf7f2 87 read_opts, is_blob_index);
7c673cae
FG
88 }
89
90 Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
91 RangeDelAggregator* range_del_agg);
11fdf7f2
TL
92 Status AddRangeTombstoneIterators(
93 const ReadOptions& read_opts,
94 std::vector<InternalIterator*>* range_del_iters);
7c673cae
FG
95
96 void AddIterators(const ReadOptions& options,
97 std::vector<InternalIterator*>* iterator_list,
98 Arena* arena);
99
100 void AddIterators(const ReadOptions& options,
101 MergeIteratorBuilder* merge_iter_builder);
102
103 uint64_t GetTotalNumEntries() const;
104
105 uint64_t GetTotalNumDeletes() const;
106
107 MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
108 const Slice& end_ikey);
109
110 // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
111 // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
112 // If include_history=true, will also search Memtables in MemTableList
113 // History.
114 SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
115
116 private:
117 // REQUIRE: m is an immutable memtable
118 void Add(MemTable* m, autovector<MemTable*>* to_delete);
119 // REQUIRE: m is an immutable memtable
120 void Remove(MemTable* m, autovector<MemTable*>* to_delete);
121
122 void TrimHistory(autovector<MemTable*>* to_delete);
123
124 bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
125 std::string* value, Status* s, MergeContext* merge_context,
126 RangeDelAggregator* range_del_agg, SequenceNumber* seq,
11fdf7f2
TL
127 const ReadOptions& read_opts,
128 ReadCallback* callback = nullptr,
129 bool* is_blob_index = nullptr);
7c673cae
FG
130
131 void AddMemTable(MemTable* m);
132
133 void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
134
135 friend class MemTableList;
136
137 // Immutable MemTables that have not yet been flushed.
138 std::list<MemTable*> memlist_;
139
140 // MemTables that have already been flushed
141 // (used during Transaction validation)
142 std::list<MemTable*> memlist_history_;
143
144 // Maximum number of MemTables to keep in memory (including both flushed
145 // and not-yet-flushed tables).
146 const int max_write_buffer_number_to_maintain_;
147
148 int refs_ = 0;
149
150 size_t* parent_memtable_list_memory_usage_;
151};
152
153// This class stores references to all the immutable memtables.
154// The memtables are flushed to L0 as soon as possible and in
155// any order. If there are more than one immutable memtable, their
156// flushes can occur concurrently. However, they are 'committed'
157// to the manifest in FIFO order to maintain correctness and
158// recoverability from a crash.
159//
160//
161// Other than imm_flush_needed, this class is not thread-safe and requires
162// external synchronization (such as holding the db mutex or being on the
163// write thread.)
164class MemTableList {
165 public:
166 // A list of memtables.
167 explicit MemTableList(int min_write_buffer_number_to_merge,
168 int max_write_buffer_number_to_maintain)
169 : imm_flush_needed(false),
170 min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
171 current_(new MemTableListVersion(&current_memory_usage_,
172 max_write_buffer_number_to_maintain)),
173 num_flush_not_started_(0),
174 commit_in_progress_(false),
175 flush_requested_(false) {
176 current_->Ref();
177 current_memory_usage_ = 0;
178 }
179
180 // Should not delete MemTableList without making sure MemTableList::current()
181 // is Unref()'d.
182 ~MemTableList() {}
183
184 MemTableListVersion* current() { return current_; }
185
186 // so that background threads can detect non-nullptr pointer to
187 // determine whether there is anything more to start flushing.
188 std::atomic<bool> imm_flush_needed;
189
190 // Returns the total number of memtables in the list that haven't yet
191 // been flushed and logged.
192 int NumNotFlushed() const;
193
194 // Returns total number of memtables in the list that have been
195 // completely flushed and logged.
196 int NumFlushed() const;
197
198 // Returns true if there is at least one memtable on which flush has
199 // not yet started.
200 bool IsFlushPending() const;
201
202 // Returns the earliest memtables that needs to be flushed. The returned
203 // memtables are guaranteed to be in the ascending order of created time.
204 void PickMemtablesToFlush(autovector<MemTable*>* mems);
205
206 // Reset status of the given memtable list back to pending state so that
207 // they can get picked up again on the next round of flush.
208 void RollbackMemtableFlush(const autovector<MemTable*>& mems,
209 uint64_t file_number);
210
211 // Commit a successful flush in the manifest file
212 Status InstallMemtableFlushResults(
213 ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
11fdf7f2
TL
214 const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
215 VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
216 autovector<MemTable*>* to_delete, Directory* db_directory,
217 LogBuffer* log_buffer);
7c673cae
FG
218
219 // New memtables are inserted at the front of the list.
220 // Takes ownership of the referenced held on *m by the caller of Add().
221 void Add(MemTable* m, autovector<MemTable*>* to_delete);
222
223 // Returns an estimate of the number of bytes of data in use.
224 size_t ApproximateMemoryUsage();
225
226 // Returns an estimate of the number of bytes of data used by
227 // the unflushed mem-tables.
228 size_t ApproximateUnflushedMemTablesMemoryUsage();
229
11fdf7f2
TL
230 // Returns an estimate of the timestamp of the earliest key.
231 uint64_t ApproximateOldestKeyTime() const;
232
7c673cae
FG
233 // Request a flush of all existing memtables to storage. This will
234 // cause future calls to IsFlushPending() to return true if this list is
235 // non-empty (regardless of the min_write_buffer_number_to_merge
236 // parameter). This flush request will persist until the next time
237 // PickMemtablesToFlush() is called.
238 void FlushRequested() { flush_requested_ = true; }
239
240 bool HasFlushRequested() { return flush_requested_; }
241
242 // Copying allowed
243 // MemTableList(const MemTableList&);
244 // void operator=(const MemTableList&);
245
246 size_t* current_memory_usage() { return &current_memory_usage_; }
247
11fdf7f2
TL
248 // Returns the min log containing the prep section after memtables listsed in
249 // `memtables_to_flush` are flushed and their status is persisted in manifest.
250 uint64_t PrecomputeMinLogContainingPrepSection(
251 const autovector<MemTable*>& memtables_to_flush);
252
253 uint64_t GetEarliestMemTableID() const {
254 auto& memlist = current_->memlist_;
255 if (memlist.empty()) {
256 return std::numeric_limits<uint64_t>::max();
257 }
258 return memlist.back()->GetID();
259 }
260
261 uint64_t GetLatestMemTableID() const {
262 auto& memlist = current_->memlist_;
263 if (memlist.empty()) {
264 return 0;
265 }
266 return memlist.front()->GetID();
267 }
7c673cae
FG
268
269 private:
270 // DB mutex held
271 void InstallNewVersion();
272
273 const int min_write_buffer_number_to_merge_;
274
275 MemTableListVersion* current_;
276
277 // the number of elements that still need flushing
278 int num_flush_not_started_;
279
280 // committing in progress
281 bool commit_in_progress_;
282
283 // Requested a flush of all memtables to storage
284 bool flush_requested_;
285
286 // The current memory usage.
287 size_t current_memory_usage_;
288};
289
290} // namespace rocksdb