]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | #pragma once | |
7 | ||
11fdf7f2 TL |
8 | #include <deque> |
9 | #include <limits> | |
7c673cae | 10 | #include <list> |
7c673cae | 11 | #include <set> |
11fdf7f2 TL |
12 | #include <string> |
13 | #include <vector> | |
7c673cae FG |
14 | |
15 | #include "db/dbformat.h" | |
11fdf7f2 | 16 | #include "db/logs_with_prep_tracker.h" |
7c673cae FG |
17 | #include "db/memtable.h" |
18 | #include "db/range_del_aggregator.h" | |
19 | #include "monitoring/instrumented_mutex.h" | |
20 | #include "rocksdb/db.h" | |
21 | #include "rocksdb/iterator.h" | |
22 | #include "rocksdb/options.h" | |
23 | #include "rocksdb/types.h" | |
24 | #include "util/autovector.h" | |
25 | #include "util/filename.h" | |
26 | #include "util/log_buffer.h" | |
27 | ||
28 | namespace rocksdb { | |
29 | ||
30 | class ColumnFamilyData; | |
31 | class InternalKeyComparator; | |
32 | class InstrumentedMutex; | |
33 | class MergeIteratorBuilder; | |
34 | ||
35 | // keeps a list of immutable memtables in a vector. the list is immutable | |
36 | // if refcount is bigger than one. It is used as a state for Get() and | |
37 | // Iterator code paths | |
38 | // | |
39 | // This class is not thread-safe. External synchronization is required | |
40 | // (such as holding the db mutex or being on the write thread). | |
41 | class MemTableListVersion { | |
42 | public: | |
43 | explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, | |
44 | MemTableListVersion* old = nullptr); | |
45 | explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, | |
46 | int max_write_buffer_number_to_maintain); | |
47 | ||
48 | void Ref(); | |
49 | void Unref(autovector<MemTable*>* to_delete = nullptr); | |
50 | ||
51 | // Search all the memtables starting from the most recent one. | |
52 | // Return the most recent value found, if any. | |
53 | // | |
54 | // If any operation was found for this key, its most recent sequence number | |
55 | // will be stored in *seq on success (regardless of whether true/false is | |
56 | // returned). Otherwise, *seq will be set to kMaxSequenceNumber. | |
57 | bool Get(const LookupKey& key, std::string* value, Status* s, | |
58 | MergeContext* merge_context, RangeDelAggregator* range_del_agg, | |
11fdf7f2 TL |
59 | SequenceNumber* seq, const ReadOptions& read_opts, |
60 | ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); | |
7c673cae FG |
61 | |
62 | bool Get(const LookupKey& key, std::string* value, Status* s, | |
63 | MergeContext* merge_context, RangeDelAggregator* range_del_agg, | |
11fdf7f2 TL |
64 | const ReadOptions& read_opts, ReadCallback* callback = nullptr, |
65 | bool* is_blob_index = nullptr) { | |
7c673cae | 66 | SequenceNumber seq; |
11fdf7f2 TL |
67 | return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts, |
68 | callback, is_blob_index); | |
7c673cae FG |
69 | } |
70 | ||
71 | // Similar to Get(), but searches the Memtable history of memtables that | |
72 | // have already been flushed. Should only be used from in-memory only | |
73 | // queries (such as Transaction validation) as the history may contain | |
74 | // writes that are also present in the SST files. | |
75 | bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, | |
76 | MergeContext* merge_context, | |
77 | RangeDelAggregator* range_del_agg, SequenceNumber* seq, | |
11fdf7f2 TL |
78 | const ReadOptions& read_opts, |
79 | bool* is_blob_index = nullptr); | |
7c673cae FG |
80 | bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, |
81 | MergeContext* merge_context, | |
82 | RangeDelAggregator* range_del_agg, | |
11fdf7f2 TL |
83 | const ReadOptions& read_opts, |
84 | bool* is_blob_index = nullptr) { | |
7c673cae FG |
85 | SequenceNumber seq; |
86 | return GetFromHistory(key, value, s, merge_context, range_del_agg, &seq, | |
11fdf7f2 | 87 | read_opts, is_blob_index); |
7c673cae FG |
88 | } |
89 | ||
90 | Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena, | |
91 | RangeDelAggregator* range_del_agg); | |
11fdf7f2 TL |
92 | Status AddRangeTombstoneIterators( |
93 | const ReadOptions& read_opts, | |
94 | std::vector<InternalIterator*>* range_del_iters); | |
7c673cae FG |
95 | |
96 | void AddIterators(const ReadOptions& options, | |
97 | std::vector<InternalIterator*>* iterator_list, | |
98 | Arena* arena); | |
99 | ||
100 | void AddIterators(const ReadOptions& options, | |
101 | MergeIteratorBuilder* merge_iter_builder); | |
102 | ||
103 | uint64_t GetTotalNumEntries() const; | |
104 | ||
105 | uint64_t GetTotalNumDeletes() const; | |
106 | ||
107 | MemTable::MemTableStats ApproximateStats(const Slice& start_ikey, | |
108 | const Slice& end_ikey); | |
109 | ||
110 | // Returns the value of MemTable::GetEarliestSequenceNumber() on the most | |
111 | // recent MemTable in this list or kMaxSequenceNumber if the list is empty. | |
112 | // If include_history=true, will also search Memtables in MemTableList | |
113 | // History. | |
114 | SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const; | |
115 | ||
116 | private: | |
117 | // REQUIRE: m is an immutable memtable | |
118 | void Add(MemTable* m, autovector<MemTable*>* to_delete); | |
119 | // REQUIRE: m is an immutable memtable | |
120 | void Remove(MemTable* m, autovector<MemTable*>* to_delete); | |
121 | ||
122 | void TrimHistory(autovector<MemTable*>* to_delete); | |
123 | ||
124 | bool GetFromList(std::list<MemTable*>* list, const LookupKey& key, | |
125 | std::string* value, Status* s, MergeContext* merge_context, | |
126 | RangeDelAggregator* range_del_agg, SequenceNumber* seq, | |
11fdf7f2 TL |
127 | const ReadOptions& read_opts, |
128 | ReadCallback* callback = nullptr, | |
129 | bool* is_blob_index = nullptr); | |
7c673cae FG |
130 | |
131 | void AddMemTable(MemTable* m); | |
132 | ||
133 | void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m); | |
134 | ||
135 | friend class MemTableList; | |
136 | ||
137 | // Immutable MemTables that have not yet been flushed. | |
138 | std::list<MemTable*> memlist_; | |
139 | ||
140 | // MemTables that have already been flushed | |
141 | // (used during Transaction validation) | |
142 | std::list<MemTable*> memlist_history_; | |
143 | ||
144 | // Maximum number of MemTables to keep in memory (including both flushed | |
145 | // and not-yet-flushed tables). | |
146 | const int max_write_buffer_number_to_maintain_; | |
147 | ||
148 | int refs_ = 0; | |
149 | ||
150 | size_t* parent_memtable_list_memory_usage_; | |
151 | }; | |
152 | ||
153 | // This class stores references to all the immutable memtables. | |
154 | // The memtables are flushed to L0 as soon as possible and in | |
155 | // any order. If there are more than one immutable memtable, their | |
156 | // flushes can occur concurrently. However, they are 'committed' | |
157 | // to the manifest in FIFO order to maintain correctness and | |
158 | // recoverability from a crash. | |
159 | // | |
160 | // | |
161 | // Other than imm_flush_needed, this class is not thread-safe and requires | |
162 | // external synchronization (such as holding the db mutex or being on the | |
163 | // write thread.) | |
164 | class MemTableList { | |
165 | public: | |
166 | // A list of memtables. | |
167 | explicit MemTableList(int min_write_buffer_number_to_merge, | |
168 | int max_write_buffer_number_to_maintain) | |
169 | : imm_flush_needed(false), | |
170 | min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), | |
171 | current_(new MemTableListVersion(¤t_memory_usage_, | |
172 | max_write_buffer_number_to_maintain)), | |
173 | num_flush_not_started_(0), | |
174 | commit_in_progress_(false), | |
175 | flush_requested_(false) { | |
176 | current_->Ref(); | |
177 | current_memory_usage_ = 0; | |
178 | } | |
179 | ||
180 | // Should not delete MemTableList without making sure MemTableList::current() | |
181 | // is Unref()'d. | |
182 | ~MemTableList() {} | |
183 | ||
184 | MemTableListVersion* current() { return current_; } | |
185 | ||
186 | // so that background threads can detect non-nullptr pointer to | |
187 | // determine whether there is anything more to start flushing. | |
188 | std::atomic<bool> imm_flush_needed; | |
189 | ||
190 | // Returns the total number of memtables in the list that haven't yet | |
191 | // been flushed and logged. | |
192 | int NumNotFlushed() const; | |
193 | ||
194 | // Returns total number of memtables in the list that have been | |
195 | // completely flushed and logged. | |
196 | int NumFlushed() const; | |
197 | ||
198 | // Returns true if there is at least one memtable on which flush has | |
199 | // not yet started. | |
200 | bool IsFlushPending() const; | |
201 | ||
202 | // Returns the earliest memtables that needs to be flushed. The returned | |
203 | // memtables are guaranteed to be in the ascending order of created time. | |
204 | void PickMemtablesToFlush(autovector<MemTable*>* mems); | |
205 | ||
206 | // Reset status of the given memtable list back to pending state so that | |
207 | // they can get picked up again on the next round of flush. | |
208 | void RollbackMemtableFlush(const autovector<MemTable*>& mems, | |
209 | uint64_t file_number); | |
210 | ||
211 | // Commit a successful flush in the manifest file | |
212 | Status InstallMemtableFlushResults( | |
213 | ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, | |
11fdf7f2 TL |
214 | const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker, |
215 | VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, | |
216 | autovector<MemTable*>* to_delete, Directory* db_directory, | |
217 | LogBuffer* log_buffer); | |
7c673cae FG |
218 | |
219 | // New memtables are inserted at the front of the list. | |
220 | // Takes ownership of the referenced held on *m by the caller of Add(). | |
221 | void Add(MemTable* m, autovector<MemTable*>* to_delete); | |
222 | ||
223 | // Returns an estimate of the number of bytes of data in use. | |
224 | size_t ApproximateMemoryUsage(); | |
225 | ||
226 | // Returns an estimate of the number of bytes of data used by | |
227 | // the unflushed mem-tables. | |
228 | size_t ApproximateUnflushedMemTablesMemoryUsage(); | |
229 | ||
11fdf7f2 TL |
230 | // Returns an estimate of the timestamp of the earliest key. |
231 | uint64_t ApproximateOldestKeyTime() const; | |
232 | ||
7c673cae FG |
233 | // Request a flush of all existing memtables to storage. This will |
234 | // cause future calls to IsFlushPending() to return true if this list is | |
235 | // non-empty (regardless of the min_write_buffer_number_to_merge | |
236 | // parameter). This flush request will persist until the next time | |
237 | // PickMemtablesToFlush() is called. | |
238 | void FlushRequested() { flush_requested_ = true; } | |
239 | ||
240 | bool HasFlushRequested() { return flush_requested_; } | |
241 | ||
242 | // Copying allowed | |
243 | // MemTableList(const MemTableList&); | |
244 | // void operator=(const MemTableList&); | |
245 | ||
246 | size_t* current_memory_usage() { return ¤t_memory_usage_; } | |
247 | ||
11fdf7f2 TL |
248 | // Returns the min log containing the prep section after memtables listsed in |
249 | // `memtables_to_flush` are flushed and their status is persisted in manifest. | |
250 | uint64_t PrecomputeMinLogContainingPrepSection( | |
251 | const autovector<MemTable*>& memtables_to_flush); | |
252 | ||
253 | uint64_t GetEarliestMemTableID() const { | |
254 | auto& memlist = current_->memlist_; | |
255 | if (memlist.empty()) { | |
256 | return std::numeric_limits<uint64_t>::max(); | |
257 | } | |
258 | return memlist.back()->GetID(); | |
259 | } | |
260 | ||
261 | uint64_t GetLatestMemTableID() const { | |
262 | auto& memlist = current_->memlist_; | |
263 | if (memlist.empty()) { | |
264 | return 0; | |
265 | } | |
266 | return memlist.front()->GetID(); | |
267 | } | |
7c673cae FG |
268 | |
269 | private: | |
270 | // DB mutex held | |
271 | void InstallNewVersion(); | |
272 | ||
273 | const int min_write_buffer_number_to_merge_; | |
274 | ||
275 | MemTableListVersion* current_; | |
276 | ||
277 | // the number of elements that still need flushing | |
278 | int num_flush_not_started_; | |
279 | ||
280 | // committing in progress | |
281 | bool commit_in_progress_; | |
282 | ||
283 | // Requested a flush of all memtables to storage | |
284 | bool flush_requested_; | |
285 | ||
286 | // The current memory usage. | |
287 | size_t current_memory_usage_; | |
288 | }; | |
289 | ||
290 | } // namespace rocksdb |