]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/db/memtable_list.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / db / memtable_list.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 #pragma once
7
8 #include <deque>
9 #include <limits>
10 #include <list>
11 #include <set>
12 #include <string>
13 #include <vector>
14
15 #include "db/dbformat.h"
16 #include "db/logs_with_prep_tracker.h"
17 #include "db/memtable.h"
18 #include "db/range_del_aggregator.h"
19 #include "file/filename.h"
20 #include "logging/log_buffer.h"
21 #include "monitoring/instrumented_mutex.h"
22 #include "rocksdb/db.h"
23 #include "rocksdb/iterator.h"
24 #include "rocksdb/options.h"
25 #include "rocksdb/types.h"
26 #include "util/autovector.h"
27
28 namespace ROCKSDB_NAMESPACE {
29
30 class ColumnFamilyData;
31 class InternalKeyComparator;
32 class InstrumentedMutex;
33 class MergeIteratorBuilder;
34 class MemTableList;
35
36 struct FlushJobInfo;
37
38 // keeps a list of immutable memtables in a vector. the list is immutable
39 // if refcount is bigger than one. It is used as a state for Get() and
40 // Iterator code paths
41 //
42 // This class is not thread-safe. External synchronization is required
43 // (such as holding the db mutex or being on the write thread).
44 class MemTableListVersion {
45  public:
46 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
47 const MemTableListVersion& old);
48 explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
49 int max_write_buffer_number_to_maintain,
50 int64_t max_write_buffer_size_to_maintain);
51
52 void Ref();
53 void Unref(autovector<MemTable*>* to_delete = nullptr);
54
55 // Search all the memtables starting from the most recent one.
56 // Return the most recent value found, if any.
57 //
58 // If any operation was found for this key, its most recent sequence number
59 // will be stored in *seq on success (regardless of whether true/false is
60 // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
61 bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
62 Status* s, MergeContext* merge_context,
63 SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
64 const ReadOptions& read_opts, ReadCallback* callback = nullptr,
65 bool* is_blob_index = nullptr);
66
// Overload of Get() for callers that do not need the matching entry's
// sequence number; forwards to the primary overload with a local seq.
67 bool Get(const LookupKey& key, std::string* value, std::string* timestamp,
68 Status* s, MergeContext* merge_context,
69 SequenceNumber* max_covering_tombstone_seq,
70 const ReadOptions& read_opts, ReadCallback* callback = nullptr,
71 bool* is_blob_index = nullptr) {
72 SequenceNumber seq;
73 return Get(key, value, timestamp, s, merge_context,
74 max_covering_tombstone_seq, &seq, read_opts, callback,
75 is_blob_index);
76 }
77
// Batched lookup over the memtables in this version for all keys in *range.
78 void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
79 ReadCallback* callback, bool* is_blob);
80
81 // Returns all the merge operands corresponding to the key by searching all
82 // memtables starting from the most recent one.
83 bool GetMergeOperands(const LookupKey& key, Status* s,
84 MergeContext* merge_context,
85 SequenceNumber* max_covering_tombstone_seq,
86 const ReadOptions& read_opts);
87
88 // Similar to Get(), but searches the Memtable history of memtables that
89 // have already been flushed. Should only be used from in-memory only
90 // queries (such as Transaction validation) as the history may contain
91 // writes that are also present in the SST files.
92 bool GetFromHistory(const LookupKey& key, std::string* value,
93 std::string* timestamp, Status* s,
94 MergeContext* merge_context,
95 SequenceNumber* max_covering_tombstone_seq,
96 SequenceNumber* seq, const ReadOptions& read_opts,
97 bool* is_blob_index = nullptr);
// Overload of GetFromHistory() that discards the matching sequence number.
98 bool GetFromHistory(const LookupKey& key, std::string* value,
99 std::string* timestamp, Status* s,
100 MergeContext* merge_context,
101 SequenceNumber* max_covering_tombstone_seq,
102 const ReadOptions& read_opts,
103 bool* is_blob_index = nullptr) {
104 SequenceNumber seq;
105 return GetFromHistory(key, value, timestamp, s, merge_context,
106 max_covering_tombstone_seq, &seq, read_opts,
107 is_blob_index);
108 }
109
110 Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
111 RangeDelAggregator* range_del_agg);
112
113 void AddIterators(const ReadOptions& options,
114 std::vector<InternalIterator*>* iterator_list,
115 Arena* arena);
116
117 void AddIterators(const ReadOptions& options,
118 MergeIteratorBuilder* merge_iter_builder);
119
120 uint64_t GetTotalNumEntries() const;
121
122 uint64_t GetTotalNumDeletes() const;
123
124 MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
125 const Slice& end_ikey);
126
127 // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
128 // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
129 // If include_history=true, will also search Memtables in MemTableList
130 // History.
131 SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
132
133 private:
134 friend class MemTableList;
135
136 friend Status InstallMemtableAtomicFlushResults(
137 const autovector<MemTableList*>* imm_lists,
138 const autovector<ColumnFamilyData*>& cfds,
139 const autovector<const MutableCFOptions*>& mutable_cf_options_list,
140 const autovector<const autovector<MemTable*>*>& mems_list,
141 VersionSet* vset, InstrumentedMutex* mu,
142 const autovector<FileMetaData*>& file_meta,
143 autovector<MemTable*>* to_delete, FSDirectory* db_directory,
144 LogBuffer* log_buffer);
145
146 // REQUIRE: m is an immutable memtable
147 void Add(MemTable* m, autovector<MemTable*>* to_delete);
148 // REQUIRE: m is an immutable memtable
149 void Remove(MemTable* m, autovector<MemTable*>* to_delete);
150
151 // Return true if memtable is trimmed
152 bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
153
// Shared lookup helper used by Get()/GetFromHistory() to scan one of the
// two lists (memlist_ or memlist_history_) from newest to oldest.
154 bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
155 std::string* value, std::string* timestamp, Status* s,
156 MergeContext* merge_context,
157 SequenceNumber* max_covering_tombstone_seq,
158 SequenceNumber* seq, const ReadOptions& read_opts,
159 ReadCallback* callback = nullptr,
160 bool* is_blob_index = nullptr);
161
162 void AddMemTable(MemTable* m);
163
164 void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
165
166 // Calculate the total amount of memory used by memlist_ and memlist_history_
167 // excluding the last MemTable in memlist_history_. The reason for excluding
168 // the last MemTable is to see if dropping the last MemTable will keep total
169 // memory usage above or equal to max_write_buffer_size_to_maintain_
170 size_t ApproximateMemoryUsageExcludingLast() const;
171
172 // Whether this version contains flushed memtables that are only kept around
173 // for transaction conflict checking.
174 bool HasHistory() const { return !memlist_history_.empty(); }
175
// Whether the configured maintain limits are exceeded at the given mutable
// memtable usage. NOTE(review): implementation not visible in this header —
// confirm exact semantics against memtable_list.cc.
176 bool MemtableLimitExceeded(size_t usage);
177
178 // Immutable MemTables that have not yet been flushed.
179 std::list<MemTable*> memlist_;
180
181 // MemTables that have already been flushed
182 // (used during Transaction validation)
183 std::list<MemTable*> memlist_history_;
184
185 // Maximum number of MemTables to keep in memory (including both flushed
// and not-yet-flushed tables).
186 const int max_write_buffer_number_to_maintain_;
187 // Maximum size of MemTables to keep in memory (including both flushed
188 // and not-yet-flushed tables).
189 const int64_t max_write_buffer_size_to_maintain_;
190
// Number of users of this version; see Ref()/Unref().
191 int refs_ = 0;
192
// Pointer to the owning MemTableList's total-memory-usage counter
// (passed in via the constructors above).
193 size_t* parent_memtable_list_memory_usage_;
194 };
195
196 // This class stores references to all the immutable memtables.
197 // The memtables are flushed to L0 as soon as possible and in
198 // any order. If there are more than one immutable memtable, their
199 // flushes can occur concurrently. However, they are 'committed'
200 // to the manifest in FIFO order to maintain correctness and
201 // recoverability from a crash.
202 //
203 //
204 // Other than imm_flush_needed and imm_trim_needed, this class is not
205 // thread-safe and requires external synchronization (such as holding the db
206 // mutex or being on the write thread.)
207 class MemTableList {
208 public:
209 // A list of memtables.
210 explicit MemTableList(int min_write_buffer_number_to_merge,
211 int max_write_buffer_number_to_maintain,
212 int64_t max_write_buffer_size_to_maintain)
213 : imm_flush_needed(false),
214 imm_trim_needed(false),
215 min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
216 current_(new MemTableListVersion(&current_memory_usage_,
217 max_write_buffer_number_to_maintain,
218 max_write_buffer_size_to_maintain)),
219 num_flush_not_started_(0),
220 commit_in_progress_(false),
221 flush_requested_(false),
222 current_memory_usage_(0),
223 current_memory_usage_excluding_last_(0),
224 current_has_history_(false) {
225 current_->Ref();
226 }
227
228 // Should not delete MemTableList without making sure MemTableList::current()
229 // is Unref()'d.
230 ~MemTableList() {}
231
232 MemTableListVersion* current() const { return current_; }
233
234 // So that background threads can detect non-nullptr pointer to
235 // determine whether there is anything more to start flushing.
236 std::atomic<bool> imm_flush_needed;
237
// Set when a history trim should be scheduled; toggled via
// MarkTrimHistoryNeeded()/ResetTrimHistoryNeeded() below.
238 std::atomic<bool> imm_trim_needed;
239
240 // Returns the total number of memtables in the list that haven't yet
241 // been flushed and logged.
242 int NumNotFlushed() const;
243
244 // Returns total number of memtables in the list that have been
245 // completely flushed and logged.
246 int NumFlushed() const;
247
248 // Returns true if there is at least one memtable on which flush has
249 // not yet started.
250 bool IsFlushPending() const;
251
252 // Returns the earliest memtables that needs to be flushed. The returned
253 // memtables are guaranteed to be in the ascending order of created time.
254 void PickMemtablesToFlush(const uint64_t* max_memtable_id,
255 autovector<MemTable*>* mems);
256
257 // Reset status of the given memtable list back to pending state so that
258 // they can get picked up again on the next round of flush.
259 void RollbackMemtableFlush(const autovector<MemTable*>& mems,
260 uint64_t file_number);
261
262 // Try to commit a successful flush in the manifest file. It might just
263 // return Status::OK letting a concurrent flush do the actual recording.
264 Status TryInstallMemtableFlushResults(
265 ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
266 const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
267 VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
268 autovector<MemTable*>* to_delete, FSDirectory* db_directory,
269 LogBuffer* log_buffer,
270 std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
271 IOStatus* io_s);
272
273 // New memtables are inserted at the front of the list.
274 // Takes ownership of the reference held on *m by the caller of Add().
275 void Add(MemTable* m, autovector<MemTable*>* to_delete);
276
277 // Returns an estimate of the number of bytes of data in use.
278 size_t ApproximateMemoryUsage();
279
280 // Returns the cached current_memory_usage_excluding_last_ value.
281 size_t ApproximateMemoryUsageExcludingLast() const;
282
283 // Returns the cached current_has_history_ value.
284 bool HasHistory() const;
285
286 // Updates current_memory_usage_excluding_last_ and current_has_history_
287 // from MemTableListVersion. Must be called whenever InstallNewVersion is
288 // called.
289 void UpdateCachedValuesFromMemTableListVersion();
290
291 // `usage` is the current size of the mutable Memtable. When
292 // max_write_buffer_size_to_maintain is used, total size of mutable and
293 // immutable memtables is checked against it to decide whether to trim
294 // memtable list.
295 //
296 // Return true if memtable is trimmed
297 bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
298
299 // Returns an estimate of the number of bytes of data used by
300 // the unflushed mem-tables.
301 size_t ApproximateUnflushedMemTablesMemoryUsage();
302
303 // Returns an estimate of the timestamp of the earliest key.
304 uint64_t ApproximateOldestKeyTime() const;
305
306 // Request a flush of all existing memtables to storage. This will
307 // cause future calls to IsFlushPending() to return true if this list is
308 // non-empty (regardless of the min_write_buffer_number_to_merge
309 // parameter). This flush request will persist until the next time
310 // PickMemtablesToFlush() is called.
311 void FlushRequested() { flush_requested_ = true; }
312
313 bool HasFlushRequested() { return flush_requested_; }
314
315 // Returns true if a trim history should be scheduled and the caller should
316 // be the one to schedule it
317 bool MarkTrimHistoryNeeded() {
318 auto expected = false;
319 return imm_trim_needed.compare_exchange_strong(
320 expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
321 }
322
// Clears imm_trim_needed (only if it is currently set).
323 void ResetTrimHistoryNeeded() {
324 auto expected = true;
325 imm_trim_needed.compare_exchange_strong(
326 expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
327 }
328
329 // Copying allowed
330 // MemTableList(const MemTableList&);
331 // void operator=(const MemTableList&);
332
333 size_t* current_memory_usage() { return &current_memory_usage_; }
334
335 // Returns the min log containing the prep section after memtables listed in
336 // `memtables_to_flush` are flushed and their status is persisted in manifest.
337 uint64_t PrecomputeMinLogContainingPrepSection(
338 const autovector<MemTable*>& memtables_to_flush);
339
// Returns the ID of the oldest immutable memtable (back of memlist_), or
// UINT64_MAX if the list is empty.
340 uint64_t GetEarliestMemTableID() const {
341 auto& memlist = current_->memlist_;
342 if (memlist.empty()) {
343 return std::numeric_limits<uint64_t>::max();
344 }
345 return memlist.back()->GetID();
346 }
347
// Returns the ID of the most recent immutable memtable (front of memlist_),
// or 0 if the list is empty.
348 uint64_t GetLatestMemTableID() const {
349 auto& memlist = current_->memlist_;
350 if (memlist.empty()) {
351 return 0;
352 }
353 return memlist.front()->GetID();
354 }
355
// Assigns `seq` as the atomic flush sequence number to each memtable that
// does not have one yet, scanning from newest to oldest and stopping at the
// first memtable already assigned.
356 void AssignAtomicFlushSeq(const SequenceNumber& seq) {
357 const auto& memlist = current_->memlist_;
358 // Scan the memtable list from new to old
359 for (auto it = memlist.begin(); it != memlist.end(); ++it) {
360 MemTable* mem = *it;
361 if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
362 mem->atomic_flush_seqno_ = seq;
363 } else {
364 // Earlier memtables must have been assigned an atomic flush seq, no
365 // need to continue scan.
366 break;
367 }
368 }
369 }
370
371 // Used only by DBImplSecondary during log replay.
372 // Remove memtables whose data were written before the WAL with log_number
373 // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
374 // not freed, but put into a vector for future deref and reclamation.
375 void RemoveOldMemTables(uint64_t log_number,
376 autovector<MemTable*>* to_delete);
377
378 private:
379 friend Status InstallMemtableAtomicFlushResults(
380 const autovector<MemTableList*>* imm_lists,
381 const autovector<ColumnFamilyData*>& cfds,
382 const autovector<const MutableCFOptions*>& mutable_cf_options_list,
383 const autovector<const autovector<MemTable*>*>& mems_list,
384 VersionSet* vset, InstrumentedMutex* mu,
385 const autovector<FileMetaData*>& file_meta,
386 autovector<MemTable*>* to_delete, FSDirectory* db_directory,
387 LogBuffer* log_buffer);
388
389 // DB mutex held
390 void InstallNewVersion();
391
392 // DB mutex held
393 // Called after writing to MANIFEST
394 void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
395 size_t batch_count, LogBuffer* log_buffer,
396 autovector<MemTable*>* to_delete,
397 InstrumentedMutex* mu);
398
399 const int min_write_buffer_number_to_merge_;
400
// The current (most recent) version of the immutable memtable list.
401 MemTableListVersion* current_;
402
403 // the number of elements that still need flushing
404 int num_flush_not_started_;
405
406 // committing in progress
407 bool commit_in_progress_;
408
409 // Requested a flush of memtables to storage. It's possible to request that
410 // a subset of memtables be flushed.
411 bool flush_requested_;
412
413 // The current memory usage.
414 size_t current_memory_usage_;
415
416 // Cached value of current_->ApproximateMemoryUsageExcludingLast().
417 std::atomic<size_t> current_memory_usage_excluding_last_;
418
419 // Cached value of current_->HasHistory().
420 std::atomic<bool> current_has_history_;
421 };
422
423 // Installs memtable atomic flush results.
424 // In most cases, imm_lists is nullptr, and the function simply uses the
425 // immutable memtable lists associated with the cfds. There are unit tests
426 // that install flush results for external immutable memtable lists other
427 // than the cfds' own immutable memtable lists, e.g. MemTableListTest. In
428 // this case, the imm_lists parameter is not nullptr.
429 extern Status InstallMemtableAtomicFlushResults(
430 const autovector<MemTableList*>* imm_lists,
431 const autovector<ColumnFamilyData*>& cfds,
432 const autovector<const MutableCFOptions*>& mutable_cf_options_list,
433 const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
434 InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
435 autovector<MemTable*>* to_delete, FSDirectory* db_directory,
436 LogBuffer* log_buffer);
437 } // namespace ROCKSDB_NAMESPACE