]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | // | |
10 | // The representation of a DBImpl consists of a set of Versions. The | |
11 | // newest version is called "current". Older versions may be kept | |
12 | // around to provide a consistent view to live iterators. | |
13 | // | |
20effc67 TL |
14 | // Each Version keeps track of a set of table files per level, as well as a |
15 | // set of blob files. The entire set of versions is maintained in a | |
16 | // VersionSet. | |
7c673cae FG |
17 | // |
18 | // Version,VersionSet are thread-compatible, but require external | |
19 | // synchronization on all accesses. | |
20 | ||
21 | #pragma once | |
22 | #include <atomic> | |
23 | #include <deque> | |
24 | #include <limits> | |
25 | #include <map> | |
26 | #include <memory> | |
1e59de90 | 27 | #include <optional> |
7c673cae FG |
28 | #include <set> |
29 | #include <string> | |
1e59de90 | 30 | #include <unordered_set> |
7c673cae FG |
31 | #include <utility> |
32 | #include <vector> | |
33 | ||
20effc67 TL |
34 | #include "cache/cache_helpers.h" |
35 | #include "db/blob/blob_file_meta.h" | |
7c673cae | 36 | #include "db/column_family.h" |
f67539c2 TL |
37 | #include "db/compaction/compaction.h" |
38 | #include "db/compaction/compaction_picker.h" | |
7c673cae FG |
39 | #include "db/dbformat.h" |
40 | #include "db/file_indexer.h" | |
41 | #include "db/log_reader.h" | |
42 | #include "db/range_del_aggregator.h" | |
11fdf7f2 | 43 | #include "db/read_callback.h" |
7c673cae FG |
44 | #include "db/table_cache.h" |
45 | #include "db/version_builder.h" | |
46 | #include "db/version_edit.h" | |
47 | #include "db/write_controller.h" | |
20effc67 | 48 | #include "env/file_system_tracer.h" |
1e59de90 TL |
49 | #if USE_COROUTINES |
50 | #include "folly/experimental/coro/BlockingWait.h" | |
51 | #include "folly/experimental/coro/Collect.h" | |
52 | #endif | |
7c673cae FG |
53 | #include "monitoring/instrumented_mutex.h" |
54 | #include "options/db_options.h" | |
55 | #include "port/port.h" | |
56 | #include "rocksdb/env.h" | |
f67539c2 TL |
57 | #include "rocksdb/file_checksum.h" |
58 | #include "table/get_context.h" | |
59 | #include "table/multiget_context.h" | |
60 | #include "trace_replay/block_cache_tracer.h" | |
1e59de90 TL |
61 | #include "util/autovector.h" |
62 | #include "util/coro_utils.h" | |
63 | #include "util/hash_containers.h" | |
7c673cae | 64 | |
f67539c2 | 65 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
66 | |
67 | namespace log { | |
68 | class Writer; | |
69 | } | |
70 | ||
1e59de90 | 71 | class BlobIndex; |
7c673cae | 72 | class Compaction; |
7c673cae FG |
73 | class LogBuffer; |
74 | class LookupKey; | |
75 | class MemTable; | |
76 | class Version; | |
77 | class VersionSet; | |
78 | class WriteBufferManager; | |
79 | class MergeContext; | |
80 | class ColumnFamilySet; | |
7c673cae | 81 | class MergeIteratorBuilder; |
1e59de90 TL |
82 | class SystemClock; |
83 | class ManifestTailer; | |
84 | class FilePickerMultiGet; | |
7c673cae | 85 | |
f67539c2 TL |
86 | // VersionEdit is always supposed to be valid and it is used to point at |
87 | // entries in Manifest. Ideally it should not be used as a container to | |
88 | // carry around few of its fields as function params because it can cause | |
89 | // readers to think it's a valid entry from Manifest. To avoid that confusion | |
90 | // introducing VersionEditParams to simply carry around multiple VersionEdit | |
91 | // params. It need not point to a valid record in Manifest. | |
92 | using VersionEditParams = VersionEdit; | |
93 | ||
7c673cae FG |
94 | // Return the smallest index i such that file_level.files[i]->largest >= key. |
95 | // Return file_level.num_files if there is no such file. | |
96 | // REQUIRES: "file_level.files" contains a sorted list of | |
97 | // non-overlapping files. | |
98 | extern int FindFile(const InternalKeyComparator& icmp, | |
99 | const LevelFilesBrief& file_level, const Slice& key); | |
100 | ||
101 | // Returns true iff some file in "files" overlaps the user key range | |
102 | // [*smallest,*largest]. | |
103 | // smallest==nullptr represents a key smaller than all keys in the DB. | |
104 | // largest==nullptr represents a key largest than all keys in the DB. | |
105 | // REQUIRES: If disjoint_sorted_files, file_level.files[] | |
106 | // contains disjoint ranges in sorted order. | |
107 | extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, | |
108 | bool disjoint_sorted_files, | |
109 | const LevelFilesBrief& file_level, | |
110 | const Slice* smallest_user_key, | |
111 | const Slice* largest_user_key); | |
112 | ||
113 | // Generate LevelFilesBrief from vector<FdWithKeyRange*> | |
114 | // Would copy smallest_key and largest_key data to sequential memory | |
115 | // arena: Arena used to allocate the memory | |
116 | extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, | |
117 | const std::vector<FileMetaData*>& files, | |
118 | Arena* arena); | |
119 | ||
f67539c2 TL |
120 | // Information of the storage associated with each Version, including number of |
121 | // levels of LSM tree, files information at each level, files marked for | |
20effc67 | 122 | // compaction, blob files, etc. |
7c673cae FG |
123 | class VersionStorageInfo { |
124 | public: | |
125 | VersionStorageInfo(const InternalKeyComparator* internal_comparator, | |
126 | const Comparator* user_comparator, int num_levels, | |
127 | CompactionStyle compaction_style, | |
128 | VersionStorageInfo* src_vstorage, | |
129 | bool _force_consistency_checks); | |
f67539c2 TL |
130 | // No copying allowed |
131 | VersionStorageInfo(const VersionStorageInfo&) = delete; | |
132 | void operator=(const VersionStorageInfo&) = delete; | |
7c673cae FG |
133 | ~VersionStorageInfo(); |
134 | ||
135 | void Reserve(int level, size_t size) { files_[level].reserve(size); } | |
136 | ||
20effc67 TL |
137 | void AddFile(int level, FileMetaData* f); |
138 | ||
1e59de90 TL |
139 | // Resize/Initialize the space for compact_cursor_ |
140 | void ResizeCompactCursors(int level) { | |
141 | compact_cursor_.resize(level, InternalKey()); | |
142 | } | |
7c673cae | 143 | |
1e59de90 TL |
144 | const std::vector<InternalKey>& GetCompactCursors() const { |
145 | return compact_cursor_; | |
146 | } | |
7c673cae | 147 | |
1e59de90 TL |
148 | // REQUIRES: ResizeCompactCursors has been called |
149 | void AddCursorForOneLevel(int level, | |
150 | const InternalKey& smallest_uncompacted_key) { | |
151 | compact_cursor_[level] = smallest_uncompacted_key; | |
152 | } | |
7c673cae | 153 | |
1e59de90 TL |
154 | // REQUIRES: lock is held |
155 | // Update the compact cursor and advance the file index using increment | |
156 | // so that it can point to the next cursor (increment means the number of | |
157 | // input files in this level of the last compaction) | |
158 | const InternalKey& GetNextCompactCursor(int level, size_t increment) { | |
159 | int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment; | |
160 | assert(cmp_idx <= (int)files_by_compaction_pri_[level].size()); | |
161 | // TODO(zichen): may need to update next_file_to_compact_by_size_ | |
162 | // for parallel compaction. | |
163 | InternalKey new_cursor; | |
164 | if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) { | |
165 | cmp_idx = 0; | |
166 | } | |
167 | // TODO(zichen): rethink if this strategy gives us some good guarantee | |
168 | return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest; | |
7c673cae FG |
169 | } |
170 | ||
1e59de90 TL |
171 | void ReserveBlob(size_t size) { blob_files_.reserve(size); } |
172 | ||
173 | void AddBlobFile(std::shared_ptr<BlobFileMetaData> blob_file_meta); | |
174 | ||
175 | void PrepareForVersionAppend(const ImmutableOptions& immutable_options, | |
176 | const MutableCFOptions& mutable_cf_options); | |
177 | ||
178 | // REQUIRES: PrepareForVersionAppend has been called | |
179 | void SetFinalized(); | |
180 | ||
7c673cae FG |
181 | // Update the accumulated stats from a file-meta. |
182 | void UpdateAccumulatedStats(FileMetaData* file_meta); | |
183 | ||
11fdf7f2 | 184 | // Decrease the current stat from a to-be-deleted file-meta |
7c673cae FG |
185 | void RemoveCurrentStats(FileMetaData* file_meta); |
186 | ||
7c673cae FG |
187 | // Updates internal structures that keep track of compaction scores |
188 | // We use compaction scores to figure out which compaction to do next | |
189 | // REQUIRES: db_mutex held!! | |
190 | // TODO find a better way to pass compaction_options_fifo. | |
1e59de90 | 191 | void ComputeCompactionScore(const ImmutableOptions& immutable_options, |
7c673cae FG |
192 | const MutableCFOptions& mutable_cf_options); |
193 | ||
194 | // Estimate est_comp_needed_bytes_ | |
195 | void EstimateCompactionBytesNeeded( | |
196 | const MutableCFOptions& mutable_cf_options); | |
197 | ||
198 | // This computes files_marked_for_compaction_ and is called by | |
199 | // ComputeCompactionScore() | |
200 | void ComputeFilesMarkedForCompaction(); | |
201 | ||
11fdf7f2 TL |
202 | // This computes ttl_expired_files_ and is called by |
203 | // ComputeCompactionScore() | |
1e59de90 | 204 | void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions, |
11fdf7f2 TL |
205 | const uint64_t ttl); |
206 | ||
f67539c2 TL |
207 | // This computes files_marked_for_periodic_compaction_ and is called by |
208 | // ComputeCompactionScore() | |
209 | void ComputeFilesMarkedForPeriodicCompaction( | |
1e59de90 | 210 | const ImmutableOptions& ioptions, |
f67539c2 TL |
211 | const uint64_t periodic_compaction_seconds); |
212 | ||
11fdf7f2 TL |
213 | // This computes bottommost_files_marked_for_compaction_ and is called by |
214 | // ComputeCompactionScore() or UpdateOldestSnapshot(). | |
215 | // | |
216 | // Among bottommost files (assumes they've already been computed), marks the | |
217 | // ones that have keys that would be eliminated if recompacted, according to | |
218 | // the seqnum of the oldest existing snapshot. Must be called every time | |
219 | // oldest snapshot changes as that is when bottom-level files can become | |
220 | // eligible for compaction. | |
221 | // | |
222 | // REQUIRES: DB mutex held | |
223 | void ComputeBottommostFilesMarkedForCompaction(); | |
224 | ||
1e59de90 TL |
225 | // This computes files_marked_for_forced_blob_gc_ and is called by |
226 | // ComputeCompactionScore() | |
227 | // | |
228 | // REQUIRES: DB mutex held | |
229 | void ComputeFilesMarkedForForcedBlobGC( | |
230 | double blob_garbage_collection_age_cutoff, | |
231 | double blob_garbage_collection_force_threshold); | |
7c673cae | 232 | |
1e59de90 | 233 | bool level0_non_overlapping() const { return level0_non_overlapping_; } |
11fdf7f2 TL |
234 | |
235 | // Updates the oldest snapshot and related internal state, like the bottommost | |
236 | // files marked for compaction. | |
237 | // REQUIRES: DB mutex held | |
238 | void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum); | |
239 | ||
7c673cae | 240 | int MaxInputLevel() const; |
11fdf7f2 | 241 | int MaxOutputLevel(bool allow_ingest_behind) const; |
7c673cae FG |
242 | |
243 | // Return level number that has idx'th highest score | |
244 | int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } | |
245 | ||
246 | // Return idx'th highest score | |
247 | double CompactionScore(int idx) const { return compaction_score_[idx]; } | |
248 | ||
249 | void GetOverlappingInputs( | |
250 | int level, const InternalKey* begin, // nullptr means before all keys | |
251 | const InternalKey* end, // nullptr means after all keys | |
252 | std::vector<FileMetaData*>* inputs, | |
253 | int hint_index = -1, // index of overlap file | |
254 | int* file_index = nullptr, // return index of overlap file | |
11fdf7f2 TL |
255 | bool expand_range = true, // if set, returns files which overlap the |
256 | // range and overlap each other. If false, | |
7c673cae | 257 | // then just files intersecting the range |
11fdf7f2 TL |
258 | InternalKey** next_smallest = nullptr) // if non-null, returns the |
259 | const; // smallest key of next file not included | |
7c673cae FG |
260 | void GetCleanInputsWithinInterval( |
261 | int level, const InternalKey* begin, // nullptr means before all keys | |
262 | const InternalKey* end, // nullptr means after all keys | |
263 | std::vector<FileMetaData*>* inputs, | |
264 | int hint_index = -1, // index of overlap file | |
265 | int* file_index = nullptr) // return index of overlap file | |
266 | const; | |
267 | ||
268 | void GetOverlappingInputsRangeBinarySearch( | |
11fdf7f2 TL |
269 | int level, // level > 0 |
270 | const InternalKey* begin, // nullptr means before all keys | |
271 | const InternalKey* end, // nullptr means after all keys | |
7c673cae FG |
272 | std::vector<FileMetaData*>* inputs, |
273 | int hint_index, // index of overlap file | |
274 | int* file_index, // return index of overlap file | |
11fdf7f2 TL |
275 | bool within_interval = false, // if set, force the inputs within interval |
276 | InternalKey** next_smallest = nullptr) // if non-null, returns the | |
277 | const; // smallest key of next file not included | |
7c673cae | 278 | |
7c673cae FG |
279 | // Returns true iff some file in the specified level overlaps |
280 | // some part of [*smallest_user_key,*largest_user_key]. | |
281 | // smallest_user_key==NULL represents a key smaller than all keys in the DB. | |
282 | // largest_user_key==NULL represents a key largest than all keys in the DB. | |
283 | bool OverlapInLevel(int level, const Slice* smallest_user_key, | |
284 | const Slice* largest_user_key); | |
285 | ||
286 | // Returns true iff the first or last file in inputs contains | |
287 | // an overlapping user key to the file "just outside" of it (i.e. | |
288 | // just after the last file, or just before the first file) | |
289 | // REQUIRES: "*inputs" is a sorted list of non-overlapping files | |
290 | bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs, | |
291 | int level); | |
292 | ||
293 | int num_levels() const { return num_levels_; } | |
294 | ||
1e59de90 | 295 | // REQUIRES: PrepareForVersionAppend has been called |
7c673cae FG |
296 | int num_non_empty_levels() const { |
297 | assert(finalized_); | |
298 | return num_non_empty_levels_; | |
299 | } | |
300 | ||
1e59de90 | 301 | // REQUIRES: PrepareForVersionAppend has been called |
7c673cae FG |
302 | // This may or may not return number of level files. It is to keep backward |
303 | // compatible behavior in universal compaction. | |
304 | int l0_delay_trigger_count() const { return l0_delay_trigger_count_; } | |
305 | ||
306 | void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; } | |
307 | ||
1e59de90 | 308 | // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) |
7c673cae FG |
309 | int NumLevelFiles(int level) const { |
310 | assert(finalized_); | |
311 | return static_cast<int>(files_[level].size()); | |
312 | } | |
313 | ||
314 | // Return the combined file size of all files at the specified level. | |
315 | uint64_t NumLevelBytes(int level) const; | |
316 | ||
1e59de90 | 317 | // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) |
7c673cae FG |
318 | const std::vector<FileMetaData*>& LevelFiles(int level) const { |
319 | return files_[level]; | |
320 | } | |
321 | ||
20effc67 TL |
322 | class FileLocation { |
323 | public: | |
324 | FileLocation() = default; | |
325 | FileLocation(int level, size_t position) | |
326 | : level_(level), position_(position) {} | |
327 | ||
328 | int GetLevel() const { return level_; } | |
329 | size_t GetPosition() const { return position_; } | |
330 | ||
331 | bool IsValid() const { return level_ >= 0; } | |
332 | ||
333 | bool operator==(const FileLocation& rhs) const { | |
334 | return level_ == rhs.level_ && position_ == rhs.position_; | |
335 | } | |
336 | ||
337 | bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); } | |
338 | ||
339 | static FileLocation Invalid() { return FileLocation(); } | |
340 | ||
341 | private: | |
342 | int level_ = -1; | |
343 | size_t position_ = 0; | |
344 | }; | |
345 | ||
1e59de90 | 346 | // REQUIRES: PrepareForVersionAppend has been called |
20effc67 TL |
347 | FileLocation GetFileLocation(uint64_t file_number) const { |
348 | const auto it = file_locations_.find(file_number); | |
349 | ||
350 | if (it == file_locations_.end()) { | |
351 | return FileLocation::Invalid(); | |
352 | } | |
353 | ||
354 | assert(it->second.GetLevel() < num_levels_); | |
355 | assert(it->second.GetPosition() < files_[it->second.GetLevel()].size()); | |
356 | assert(files_[it->second.GetLevel()][it->second.GetPosition()]); | |
357 | assert(files_[it->second.GetLevel()][it->second.GetPosition()] | |
358 | ->fd.GetNumber() == file_number); | |
359 | ||
360 | return it->second; | |
361 | } | |
362 | ||
1e59de90 | 363 | // REQUIRES: PrepareForVersionAppend has been called |
20effc67 TL |
364 | FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const { |
365 | auto location = GetFileLocation(file_number); | |
366 | ||
367 | if (!location.IsValid()) { | |
368 | return nullptr; | |
369 | } | |
370 | ||
371 | return files_[location.GetLevel()][location.GetPosition()]; | |
372 | } | |
373 | ||
1e59de90 TL |
374 | // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) |
375 | using BlobFiles = std::vector<std::shared_ptr<BlobFileMetaData>>; | |
20effc67 TL |
376 | const BlobFiles& GetBlobFiles() const { return blob_files_; } |
377 | ||
1e59de90 TL |
378 | // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) |
379 | BlobFiles::const_iterator GetBlobFileMetaDataLB( | |
380 | uint64_t blob_file_number) const; | |
381 | ||
382 | // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) | |
383 | std::shared_ptr<BlobFileMetaData> GetBlobFileMetaData( | |
384 | uint64_t blob_file_number) const { | |
385 | const auto it = GetBlobFileMetaDataLB(blob_file_number); | |
386 | ||
387 | assert(it == blob_files_.end() || *it); | |
388 | ||
389 | if (it != blob_files_.end() && | |
390 | (*it)->GetBlobFileNumber() == blob_file_number) { | |
391 | return *it; | |
392 | } | |
393 | ||
394 | return std::shared_ptr<BlobFileMetaData>(); | |
395 | } | |
396 | ||
397 | // REQUIRES: This version has been saved (see VersionBuilder::SaveTo) | |
398 | struct BlobStats { | |
399 | uint64_t total_file_size = 0; | |
400 | uint64_t total_garbage_size = 0; | |
401 | double space_amp = 0.0; | |
402 | }; | |
403 | ||
404 | BlobStats GetBlobStats() const { | |
405 | uint64_t total_file_size = 0; | |
406 | uint64_t total_garbage_size = 0; | |
407 | ||
408 | for (const auto& meta : blob_files_) { | |
409 | assert(meta); | |
410 | ||
411 | total_file_size += meta->GetBlobFileSize(); | |
412 | total_garbage_size += meta->GetGarbageBlobBytes(); | |
413 | } | |
414 | ||
415 | double space_amp = 0.0; | |
416 | if (total_file_size > total_garbage_size) { | |
417 | space_amp = static_cast<double>(total_file_size) / | |
418 | (total_file_size - total_garbage_size); | |
419 | } | |
420 | ||
421 | return BlobStats{total_file_size, total_garbage_size, space_amp}; | |
422 | } | |
423 | ||
f67539c2 | 424 | const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const { |
7c673cae FG |
425 | assert(level < static_cast<int>(level_files_brief_.size())); |
426 | return level_files_brief_[level]; | |
427 | } | |
428 | ||
1e59de90 | 429 | // REQUIRES: PrepareForVersionAppend has been called |
7c673cae FG |
430 | const std::vector<int>& FilesByCompactionPri(int level) const { |
431 | assert(finalized_); | |
432 | return files_by_compaction_pri_[level]; | |
433 | } | |
434 | ||
1e59de90 | 435 | // REQUIRES: ComputeCompactionScore has been called |
7c673cae FG |
436 | // REQUIRES: DB mutex held during access |
437 | const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction() | |
438 | const { | |
439 | assert(finalized_); | |
440 | return files_marked_for_compaction_; | |
441 | } | |
442 | ||
1e59de90 | 443 | // REQUIRES: ComputeCompactionScore has been called |
11fdf7f2 TL |
444 | // REQUIRES: DB mutex held during access |
445 | const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const { | |
446 | assert(finalized_); | |
447 | return expired_ttl_files_; | |
448 | } | |
449 | ||
1e59de90 | 450 | // REQUIRES: ComputeCompactionScore has been called |
f67539c2 TL |
451 | // REQUIRES: DB mutex held during access |
452 | const autovector<std::pair<int, FileMetaData*>>& | |
453 | FilesMarkedForPeriodicCompaction() const { | |
454 | assert(finalized_); | |
455 | return files_marked_for_periodic_compaction_; | |
456 | } | |
457 | ||
458 | void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) { | |
459 | files_marked_for_periodic_compaction_.emplace_back(level, f); | |
460 | } | |
461 | ||
1e59de90 | 462 | // REQUIRES: ComputeCompactionScore has been called |
11fdf7f2 TL |
463 | // REQUIRES: DB mutex held during access |
464 | const autovector<std::pair<int, FileMetaData*>>& | |
465 | BottommostFilesMarkedForCompaction() const { | |
466 | assert(finalized_); | |
467 | return bottommost_files_marked_for_compaction_; | |
468 | } | |
469 | ||
1e59de90 TL |
470 | // REQUIRES: ComputeCompactionScore has been called |
471 | // REQUIRES: DB mutex held during access | |
472 | const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC() | |
473 | const { | |
474 | assert(finalized_); | |
475 | return files_marked_for_forced_blob_gc_; | |
476 | } | |
477 | ||
7c673cae | 478 | int base_level() const { return base_level_; } |
494da23a | 479 | double level_multiplier() const { return level_multiplier_; } |
7c673cae FG |
480 | |
481 | // REQUIRES: lock is held | |
482 | // Set the index that is used to offset into files_by_compaction_pri_ to find | |
483 | // the next compaction candidate file. | |
484 | void SetNextCompactionIndex(int level, int index) { | |
485 | next_file_to_compact_by_size_[level] = index; | |
486 | } | |
487 | ||
488 | // REQUIRES: lock is held | |
489 | int NextCompactionIndex(int level) const { | |
490 | return next_file_to_compact_by_size_[level]; | |
491 | } | |
492 | ||
1e59de90 | 493 | // REQUIRES: PrepareForVersionAppend has been called |
7c673cae FG |
494 | const FileIndexer& file_indexer() const { |
495 | assert(finalized_); | |
496 | return file_indexer_; | |
497 | } | |
498 | ||
499 | // Only the first few entries of files_by_compaction_pri_ are sorted. | |
500 | // There is no need to sort all the files because it is likely | |
501 | // that on a running system, we need to look at only the first | |
502 | // few largest files because a new version is created every few | |
503 | // seconds/minutes (because of concurrent compactions). | |
504 | static const size_t kNumberFilesToSort = 50; | |
505 | ||
506 | // Return a human-readable short (single-line) summary of the number | |
507 | // of files per level. Uses *scratch as backing store. | |
508 | struct LevelSummaryStorage { | |
509 | char buffer[1000]; | |
510 | }; | |
511 | struct FileSummaryStorage { | |
512 | char buffer[3000]; | |
513 | }; | |
514 | const char* LevelSummary(LevelSummaryStorage* scratch) const; | |
515 | // Return a human-readable short (single-line) summary of files | |
516 | // in a specified level. Uses *scratch as backing store. | |
517 | const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; | |
518 | ||
519 | // Return the maximum overlapping data (in bytes) at next level for any | |
520 | // file at a level >= 1. | |
1e59de90 | 521 | uint64_t MaxNextLevelOverlappingBytes(); |
7c673cae FG |
522 | |
523 | // Return a human readable string that describes this version's contents. | |
524 | std::string DebugString(bool hex = false) const; | |
525 | ||
526 | uint64_t GetAverageValueSize() const { | |
527 | if (accumulated_num_non_deletions_ == 0) { | |
528 | return 0; | |
529 | } | |
530 | assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0); | |
531 | assert(accumulated_file_size_ > 0); | |
532 | return accumulated_raw_value_size_ / accumulated_num_non_deletions_ * | |
533 | accumulated_file_size_ / | |
534 | (accumulated_raw_key_size_ + accumulated_raw_value_size_); | |
535 | } | |
536 | ||
537 | uint64_t GetEstimatedActiveKeys() const; | |
538 | ||
539 | double GetEstimatedCompressionRatioAtLevel(int level) const; | |
540 | ||
541 | // re-initializes the index that is used to offset into | |
542 | // files_by_compaction_pri_ | |
543 | // to find the next compaction candidate file. | |
544 | void ResetNextCompactionIndex(int level) { | |
545 | next_file_to_compact_by_size_[level] = 0; | |
546 | } | |
547 | ||
1e59de90 | 548 | const InternalKeyComparator* InternalComparator() const { |
7c673cae FG |
549 | return internal_comparator_; |
550 | } | |
551 | ||
552 | // Returns maximum total bytes of data on a given level. | |
553 | uint64_t MaxBytesForLevel(int level) const; | |
554 | ||
7c673cae FG |
555 | // Returns an estimate of the amount of live data in bytes. |
556 | uint64_t EstimateLiveDataSize() const; | |
557 | ||
558 | uint64_t estimated_compaction_needed_bytes() const { | |
559 | return estimated_compaction_needed_bytes_; | |
560 | } | |
561 | ||
562 | void TEST_set_estimated_compaction_needed_bytes(uint64_t v) { | |
563 | estimated_compaction_needed_bytes_ = v; | |
564 | } | |
565 | ||
566 | bool force_consistency_checks() const { return force_consistency_checks_; } | |
567 | ||
494da23a TL |
568 | SequenceNumber bottommost_files_mark_threshold() const { |
569 | return bottommost_files_mark_threshold_; | |
570 | } | |
571 | ||
11fdf7f2 TL |
572 | // Returns whether any key in [`smallest_key`, `largest_key`] could appear in |
573 | // an older L0 file than `last_l0_idx` or in a greater level than `last_level` | |
574 | // | |
575 | // @param last_level Level after which we check for overlap | |
576 | // @param last_l0_idx If `last_level == 0`, index of L0 file after which we | |
577 | // check for overlap; otherwise, must be -1 | |
494da23a TL |
578 | bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key, |
579 | const Slice& largest_user_key, | |
580 | int last_level, int last_l0_idx); | |
11fdf7f2 | 581 | |
7c673cae | 582 | private: |
1e59de90 TL |
583 | void ComputeCompensatedSizes(); |
584 | void UpdateNumNonEmptyLevels(); | |
585 | void CalculateBaseBytes(const ImmutableOptions& ioptions, | |
586 | const MutableCFOptions& options); | |
587 | void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options, | |
588 | const MutableCFOptions& mutable_cf_options); | |
589 | ||
590 | void GenerateFileIndexer() { | |
591 | file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); | |
592 | } | |
593 | ||
594 | void GenerateLevelFilesBrief(); | |
595 | void GenerateLevel0NonOverlapping(); | |
596 | void GenerateBottommostFiles(); | |
597 | void GenerateFileLocationIndex(); | |
598 | ||
7c673cae FG |
599 | const InternalKeyComparator* internal_comparator_; |
600 | const Comparator* user_comparator_; | |
601 | int num_levels_; // Number of levels | |
602 | int num_non_empty_levels_; // Number of levels. Any level larger than it | |
603 | // is guaranteed to be empty. | |
604 | // Per-level max bytes | |
605 | std::vector<uint64_t> level_max_bytes_; | |
606 | ||
607 | // A short brief metadata of files per level | |
f67539c2 | 608 | autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_; |
7c673cae FG |
609 | FileIndexer file_indexer_; |
610 | Arena arena_; // Used to allocate space for file_levels_ | |
611 | ||
612 | CompactionStyle compaction_style_; | |
613 | ||
614 | // List of files per level, files in each level are arranged | |
615 | // in increasing order of keys | |
616 | std::vector<FileMetaData*>* files_; | |
617 | ||
20effc67 TL |
618 | // Map of all table files in version. Maps file number to (level, position on |
619 | // level). | |
1e59de90 | 620 | using FileLocations = UnorderedMap<uint64_t, FileLocation>; |
20effc67 TL |
621 | FileLocations file_locations_; |
622 | ||
1e59de90 | 623 | // Vector of blob files in version sorted by blob file number. |
20effc67 TL |
624 | BlobFiles blob_files_; |
625 | ||
7c673cae FG |
626 | // Level that L0 data should be compacted to. All levels < base_level_ should |
627 | // be empty. -1 if it is not level-compaction so it's not applicable. | |
628 | int base_level_; | |
629 | ||
494da23a TL |
630 | double level_multiplier_; |
631 | ||
7c673cae FG |
632 | // A list for the same set of files that are stored in files_, |
633 | // but files in each level are now sorted based on file | |
634 | // size. The file with the largest size is at the front. | |
635 | // This vector stores the index of the file from files_. | |
636 | std::vector<std::vector<int>> files_by_compaction_pri_; | |
637 | ||
638 | // If true, means that files in L0 have keys with non overlapping ranges | |
639 | bool level0_non_overlapping_; | |
640 | ||
641 | // An index into files_by_compaction_pri_ that specifies the first | |
642 | // file that is not yet compacted | |
643 | std::vector<int> next_file_to_compact_by_size_; | |
644 | ||
645 | // Only the first few entries of files_by_compaction_pri_ are sorted. | |
646 | // There is no need to sort all the files because it is likely | |
647 | // that on a running system, we need to look at only the first | |
648 | // few largest files because a new version is created every few | |
649 | // seconds/minutes (because of concurrent compactions). | |
650 | static const size_t number_of_files_to_sort_ = 50; | |
651 | ||
652 | // This vector contains list of files marked for compaction and also not | |
653 | // currently being compacted. It is protected by DB mutex. It is calculated in | |
654 | // ComputeCompactionScore() | |
655 | autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_; | |
656 | ||
11fdf7f2 TL |
657 | autovector<std::pair<int, FileMetaData*>> expired_ttl_files_; |
658 | ||
f67539c2 TL |
659 | autovector<std::pair<int, FileMetaData*>> |
660 | files_marked_for_periodic_compaction_; | |
661 | ||
11fdf7f2 TL |
662 | // These files are considered bottommost because none of their keys can exist |
663 | // at lower levels. They are not necessarily all in the same level. The marked | |
664 | // ones are eligible for compaction because they contain duplicate key | |
665 | // versions that are no longer protected by snapshot. These variables are | |
666 | // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and | |
667 | // `ComputeBottommostFilesMarkedForCompaction()`. | |
668 | autovector<std::pair<int, FileMetaData*>> bottommost_files_; | |
669 | autovector<std::pair<int, FileMetaData*>> | |
670 | bottommost_files_marked_for_compaction_; | |
671 | ||
1e59de90 TL |
672 | autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_; |
673 | ||
11fdf7f2 TL |
674 | // Threshold for needing to mark another bottommost file. Maintain it so we |
675 | // can quickly check when releasing a snapshot whether more bottommost files | |
676 | // became eligible for compaction. It's defined as the min of the max nonzero | |
677 | // seqnums of unmarked bottommost files. | |
678 | SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber; | |
679 | ||
680 | // Monotonically increases as we release old snapshots. Zero indicates no | |
681 | // snapshots have been released yet. When no snapshots remain we set it to the | |
682 | // current seqnum, which needs to be protected as a snapshot can still be | |
683 | // created that references it. | |
684 | SequenceNumber oldest_snapshot_seqnum_ = 0; | |
685 | ||
7c673cae FG |
686 | // Level that should be compacted next and its compaction score. |
687 | // Score < 1 means compaction is not strictly needed. These fields | |
1e59de90 | 688 | // are initialized by ComputeCompactionScore. |
7c673cae FG |
689 | // The most critical level to be compacted is listed first |
690 | // These are used to pick the best compaction level | |
691 | std::vector<double> compaction_score_; | |
692 | std::vector<int> compaction_level_; | |
693 | int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop | |
694 | // for number of L0 files. | |
695 | ||
1e59de90 TL |
696 | // Compact cursors for round-robin compactions in each level |
697 | std::vector<InternalKey> compact_cursor_; | |
698 | ||
7c673cae FG |
699 | // the following are the sampled temporary stats. |
700 | // the current accumulated size of sampled files. | |
701 | uint64_t accumulated_file_size_; | |
702 | // the current accumulated size of all raw keys based on the sampled files. | |
703 | uint64_t accumulated_raw_key_size_; | |
704 | // the current accumulated size of all raw keys based on the sampled files. | |
705 | uint64_t accumulated_raw_value_size_; | |
706 | // total number of non-deletion entries | |
707 | uint64_t accumulated_num_non_deletions_; | |
708 | // total number of deletion entries | |
709 | uint64_t accumulated_num_deletions_; | |
710 | // current number of non_deletion entries | |
711 | uint64_t current_num_non_deletions_; | |
11fdf7f2 | 712 | // current number of deletion entries |
7c673cae FG |
713 | uint64_t current_num_deletions_; |
714 | // current number of file samples | |
715 | uint64_t current_num_samples_; | |
716 | // Estimated bytes needed to be compacted until all levels' size is down to | |
717 | // target sizes. | |
718 | uint64_t estimated_compaction_needed_bytes_; | |
719 | ||
720 | bool finalized_; | |
721 | ||
722 | // If set to true, we will run consistency checks even if RocksDB | |
723 | // is compiled in release mode | |
724 | bool force_consistency_checks_; | |
725 | ||
726 | friend class Version; | |
727 | friend class VersionSet; | |
7c673cae FG |
728 | }; |
729 | ||
1e59de90 TL |
730 | struct ObsoleteFileInfo { |
731 | FileMetaData* metadata; | |
732 | std::string path; | |
733 | // If true, the FileMataData should be destroyed but the file should | |
734 | // not be deleted. This is because another FileMetaData still references | |
735 | // the file, usually because the file is trivial moved so two FileMetadata | |
736 | // is managing the file. | |
737 | bool only_delete_metadata = false; | |
738 | ||
739 | ObsoleteFileInfo() noexcept | |
740 | : metadata(nullptr), only_delete_metadata(false) {} | |
741 | ObsoleteFileInfo(FileMetaData* f, const std::string& file_path, | |
742 | std::shared_ptr<CacheReservationManager> | |
743 | file_metadata_cache_res_mgr_arg = nullptr) | |
744 | : metadata(f), | |
745 | path(file_path), | |
746 | only_delete_metadata(false), | |
747 | file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {} | |
748 | ||
749 | ObsoleteFileInfo(const ObsoleteFileInfo&) = delete; | |
750 | ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete; | |
751 | ||
752 | ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() { | |
753 | *this = std::move(rhs); | |
754 | } | |
755 | ||
756 | ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept { | |
757 | path = std::move(rhs.path); | |
758 | metadata = rhs.metadata; | |
759 | rhs.metadata = nullptr; | |
760 | file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr; | |
761 | rhs.file_metadata_cache_res_mgr = nullptr; | |
762 | ||
763 | return *this; | |
764 | } | |
765 | void DeleteMetadata() { | |
766 | if (file_metadata_cache_res_mgr) { | |
767 | Status s = file_metadata_cache_res_mgr->UpdateCacheReservation( | |
768 | metadata->ApproximateMemoryUsage(), false /* increase */); | |
769 | s.PermitUncheckedError(); | |
770 | } | |
771 | delete metadata; | |
772 | metadata = nullptr; | |
773 | } | |
774 | ||
775 | private: | |
776 | std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr; | |
777 | }; | |
778 | ||
779 | class ObsoleteBlobFileInfo { | |
780 | public: | |
781 | ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path) | |
782 | : blob_file_number_(blob_file_number), path_(std::move(path)) {} | |
783 | ||
784 | uint64_t GetBlobFileNumber() const { return blob_file_number_; } | |
785 | const std::string& GetPath() const { return path_; } | |
786 | ||
787 | private: | |
788 | uint64_t blob_file_number_; | |
789 | std::string path_; | |
790 | }; | |
791 | ||
f67539c2 | 792 | using MultiGetRange = MultiGetContext::Range; |
20effc67 TL |
793 | // A column family's version consists of the table and blob files owned by |
794 | // the column family at a certain point in time. | |
7c673cae FG |
795 | class Version { |
796 | public: | |
797 | // Append to *iters a sequence of iterators that will | |
798 | // yield the contents of this Version when merged together. | |
20effc67 TL |
799 | // @param read_options Must outlive any iterator built by |
800 | // `merger_iter_builder`. | |
20effc67 TL |
801 | void AddIterators(const ReadOptions& read_options, |
802 | const FileOptions& soptions, | |
7c673cae | 803 | MergeIteratorBuilder* merger_iter_builder, |
20effc67 | 804 | bool allow_unprepared_value); |
7c673cae | 805 | |
20effc67 TL |
806 | // @param read_options Must outlive any iterator built by |
807 | // `merger_iter_builder`. | |
808 | void AddIteratorsForLevel(const ReadOptions& read_options, | |
809 | const FileOptions& soptions, | |
7c673cae | 810 | MergeIteratorBuilder* merger_iter_builder, |
1e59de90 | 811 | int level, bool allow_unprepared_value); |
7c673cae | 812 | |
f67539c2 | 813 | Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&, |
11fdf7f2 | 814 | const Slice& smallest_user_key, |
1e59de90 TL |
815 | const Slice& largest_user_key, int level, |
816 | bool* overlap); | |
11fdf7f2 | 817 | |
f67539c2 TL |
818 | // Lookup the value for key or get all merge operands for key. |
819 | // If do_merge = true (default) then lookup value for key. | |
820 | // Behavior if do_merge = true: | |
821 | // If found, store it in *value and | |
822 | // return OK. Else return a non-OK status. | |
823 | // Uses *operands to store merge_operator operations to apply later. | |
7c673cae | 824 | // |
f67539c2 TL |
825 | // If the ReadOptions.read_tier is set to do a read-only fetch, then |
826 | // *value_found will be set to false if it cannot be determined whether | |
827 | // this value exists without doing IO. | |
7c673cae | 828 | // |
f67539c2 | 829 | // If the key is Deleted, *status will be set to NotFound and |
7c673cae | 830 | // *key_exists will be set to true. |
f67539c2 | 831 | // If no key was found, *status will be set to NotFound and |
7c673cae | 832 | // *key_exists will be set to false. |
f67539c2 TL |
833 | // If seq is non-null, *seq will be set to the sequence number found |
834 | // for the key if a key was found. | |
835 | // Behavior if do_merge = false | |
836 | // If the key has any merge operands then store them in | |
837 | // merge_context.operands_list and don't merge the operands | |
7c673cae | 838 | // REQUIRES: lock is not held |
1e59de90 | 839 | // REQUIRES: pinned_iters_mgr != nullptr |
7c673cae | 840 | void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, |
1e59de90 TL |
841 | PinnableWideColumns* columns, std::string* timestamp, Status* status, |
842 | MergeContext* merge_context, | |
494da23a | 843 | SequenceNumber* max_covering_tombstone_seq, |
1e59de90 | 844 | PinnedIteratorsManager* pinned_iters_mgr, |
494da23a TL |
845 | bool* value_found = nullptr, bool* key_exists = nullptr, |
846 | SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, | |
f67539c2 TL |
847 | bool* is_blob = nullptr, bool do_merge = true); |
848 | ||
849 | void MultiGet(const ReadOptions&, MultiGetRange* range, | |
1e59de90 | 850 | ReadCallback* callback = nullptr); |
7c673cae | 851 | |
1e59de90 TL |
852 | // Interprets blob_index_slice as a blob reference, and (assuming the |
853 | // corresponding blob file is part of this Version) retrieves the blob and | |
854 | // saves it in *value. | |
855 | // REQUIRES: blob_index_slice stores an encoded blob reference | |
856 | Status GetBlob(const ReadOptions& read_options, const Slice& user_key, | |
857 | const Slice& blob_index_slice, | |
858 | FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, | |
859 | uint64_t* bytes_read) const; | |
860 | ||
861 | // Retrieves a blob using a blob reference and saves it in *value, | |
862 | // assuming the corresponding blob file is part of this Version. | |
863 | Status GetBlob(const ReadOptions& read_options, const Slice& user_key, | |
864 | const BlobIndex& blob_index, | |
865 | FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, | |
866 | uint64_t* bytes_read) const; | |
867 | ||
868 | using BlobReadContext = | |
869 | std::pair<BlobIndex, std::reference_wrapper<const KeyContext>>; | |
870 | using BlobReadContexts = std::vector<BlobReadContext>; | |
871 | void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range, | |
872 | std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs); | |
873 | ||
874 | // Loads some stats information from files (if update_stats is set) and | |
875 | // populates derived data structures. Call without mutex held. It needs to be | |
876 | // called before appending the version to the version set. | |
877 | void PrepareAppend(const MutableCFOptions& mutable_cf_options, | |
878 | bool update_stats); | |
7c673cae FG |
879 | |
880 | // Reference count management (so Versions do not disappear out from | |
881 | // under live iterators) | |
882 | void Ref(); | |
883 | // Decrease reference count. Delete the object if no reference left | |
884 | // and return true. Otherwise, return false. | |
885 | bool Unref(); | |
886 | ||
20effc67 TL |
887 | // Add all files listed in the current version to *live_table_files and |
888 | // *live_blob_files. | |
889 | void AddLiveFiles(std::vector<uint64_t>* live_table_files, | |
890 | std::vector<uint64_t>* live_blob_files) const; | |
7c673cae | 891 | |
1e59de90 TL |
892 | // Remove live files that are in the delete candidate lists. |
893 | void RemoveLiveFiles( | |
894 | std::vector<ObsoleteFileInfo>& sst_delete_candidates, | |
895 | std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const; | |
896 | ||
7c673cae | 897 | // Return a human readable string that describes this version's contents. |
11fdf7f2 | 898 | std::string DebugString(bool hex = false, bool print_stats = false) const; |
7c673cae | 899 | |
11fdf7f2 | 900 | // Returns the version number of this version |
7c673cae FG |
901 | uint64_t GetVersionNumber() const { return version_number_; } |
902 | ||
903 | // REQUIRES: lock is held | |
904 | // On success, "tp" will contains the table properties of the file | |
905 | // specified in "file_meta". If the file name of "file_meta" is | |
11fdf7f2 | 906 | // known ahead, passing it by a non-null "fname" can save a |
7c673cae FG |
907 | // file-name conversion. |
908 | Status GetTableProperties(std::shared_ptr<const TableProperties>* tp, | |
909 | const FileMetaData* file_meta, | |
910 | const std::string* fname = nullptr) const; | |
911 | ||
912 | // REQUIRES: lock is held | |
913 | // On success, *props will be populated with all SSTables' table properties. | |
914 | // The keys of `props` are the sst file name, the values of `props` are the | |
494da23a | 915 | // tables' properties, represented as std::shared_ptr. |
7c673cae FG |
916 | Status GetPropertiesOfAllTables(TablePropertiesCollection* props); |
917 | Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); | |
918 | Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, | |
919 | TablePropertiesCollection* props) const; | |
920 | ||
f67539c2 TL |
921 | // Print summary of range delete tombstones in SST files into out_str, |
922 | // with maximum max_entries_to_print entries printed out. | |
923 | Status TablesRangeTombstoneSummary(int max_entries_to_print, | |
924 | std::string* out_str); | |
925 | ||
7c673cae | 926 | // REQUIRES: lock is held |
11fdf7f2 | 927 | // On success, "tp" will contains the aggregated table property among |
7c673cae FG |
928 | // the table properties of all sst files in this version. |
929 | Status GetAggregatedTableProperties( | |
930 | std::shared_ptr<const TableProperties>* tp, int level = -1); | |
931 | ||
932 | uint64_t GetEstimatedActiveKeys() { | |
933 | return storage_info_.GetEstimatedActiveKeys(); | |
934 | } | |
935 | ||
936 | size_t GetMemoryUsageByTableReaders(); | |
937 | ||
938 | ColumnFamilyData* cfd() const { return cfd_; } | |
939 | ||
1e59de90 TL |
940 | // Return the next Version in the linked list. |
941 | Version* Next() const { return next_; } | |
7c673cae FG |
942 | |
943 | int TEST_refs() const { return refs_; } | |
944 | ||
945 | VersionStorageInfo* storage_info() { return &storage_info_; } | |
1e59de90 | 946 | const VersionStorageInfo* storage_info() const { return &storage_info_; } |
7c673cae FG |
947 | |
948 | VersionSet* version_set() { return vset_; } | |
949 | ||
950 | void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); | |
951 | ||
11fdf7f2 TL |
952 | uint64_t GetSstFilesSize(); |
953 | ||
f67539c2 TL |
954 | // Retrieves the file_creation_time of the oldest file in the DB. |
955 | // Prerequisite for this API is max_open_files = -1 | |
956 | void GetCreationTimeOfOldestFile(uint64_t* creation_time); | |
957 | ||
958 | const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; } | |
11fdf7f2 | 959 | |
1e59de90 TL |
960 | InternalIterator* TEST_GetLevelIterator( |
961 | const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder, | |
962 | int level, bool allow_unprepared_value); | |
963 | ||
7c673cae FG |
964 | private: |
965 | Env* env_; | |
1e59de90 TL |
966 | SystemClock* clock_; |
967 | ||
494da23a | 968 | friend class ReactiveVersionSet; |
7c673cae | 969 | friend class VersionSet; |
20effc67 TL |
970 | friend class VersionEditHandler; |
971 | friend class VersionEditHandlerPointInTime; | |
7c673cae FG |
972 | |
973 | const InternalKeyComparator* internal_comparator() const { | |
974 | return storage_info_.internal_comparator_; | |
975 | } | |
976 | const Comparator* user_comparator() const { | |
977 | return storage_info_.user_comparator_; | |
978 | } | |
979 | ||
7c673cae FG |
980 | // Returns true if the filter blocks in the specified level will not be |
981 | // checked during read operations. In certain cases (trivial move or preload), | |
982 | // the filter block may already be cached, but we still do not access it such | |
983 | // that it eventually expires from the cache. | |
984 | bool IsFilterSkipped(int level, bool is_file_last_in_level = false); | |
985 | ||
986 | // The helper function of UpdateAccumulatedStats, which may fill the missing | |
11fdf7f2 | 987 | // fields of file_meta from its associated TableProperties. |
7c673cae FG |
988 | // Returns true if it does initialize FileMetaData. |
989 | bool MaybeInitializeFileMetaData(FileMetaData* file_meta); | |
990 | ||
991 | // Update the accumulated stats associated with the current version. | |
992 | // This accumulated stats will be used in compaction. | |
1e59de90 TL |
993 | void UpdateAccumulatedStats(); |
994 | ||
995 | DECLARE_SYNC_AND_ASYNC( | |
996 | /* ret_type */ Status, /* func_name */ MultiGetFromSST, | |
997 | const ReadOptions& read_options, MultiGetRange file_range, | |
998 | int hit_file_level, bool skip_filters, bool skip_range_deletions, | |
999 | FdWithKeyRange* f, | |
1000 | std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs, | |
1001 | Cache::Handle* table_handle, uint64_t& num_filter_read, | |
1002 | uint64_t& num_index_read, uint64_t& num_sst_read); | |
1003 | ||
1004 | #ifdef USE_COROUTINES | |
1005 | // MultiGet using async IO to read data blocks from SST files in parallel | |
1006 | // within and across levels | |
1007 | Status MultiGetAsync( | |
1008 | const ReadOptions& options, MultiGetRange* range, | |
1009 | std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs); | |
1010 | ||
1011 | // A helper function to lookup a batch of keys in a single level. It will | |
1012 | // queue coroutine tasks to mget_tasks. It may also split the input batch | |
1013 | // by creating a new batch with keys definitely not in this level and | |
1014 | // enqueuing it to to_process. | |
1015 | Status ProcessBatch( | |
1016 | const ReadOptions& read_options, FilePickerMultiGet* batch, | |
1017 | std::vector<folly::coro::Task<Status>>& mget_tasks, | |
1018 | std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs, | |
1019 | autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting, | |
1020 | std::deque<size_t>& to_process, unsigned int& num_tasks_queued, | |
1021 | std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>& | |
1022 | mget_stats); | |
1023 | #endif | |
7c673cae FG |
1024 | |
1025 | ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs | |
1026 | Logger* info_log_; | |
1027 | Statistics* db_statistics_; | |
1028 | TableCache* table_cache_; | |
1e59de90 | 1029 | BlobSource* blob_source_; |
7c673cae FG |
1030 | const MergeOperator* merge_operator_; |
1031 | ||
1032 | VersionStorageInfo storage_info_; | |
1e59de90 TL |
1033 | VersionSet* vset_; // VersionSet to which this Version belongs |
1034 | Version* next_; // Next version in linked list | |
1035 | Version* prev_; // Previous version in linked list | |
1036 | int refs_; // Number of live refs to this version | |
f67539c2 | 1037 | const FileOptions file_options_; |
11fdf7f2 | 1038 | const MutableCFOptions mutable_cf_options_; |
20effc67 TL |
1039 | // Cached value to avoid recomputing it on every read. |
1040 | const size_t max_file_size_for_l0_meta_pin_; | |
7c673cae FG |
1041 | |
1042 | // A version number that uniquely represents this version. This is | |
1043 | // used for debugging and logging purposes only. | |
1044 | uint64_t version_number_; | |
20effc67 | 1045 | std::shared_ptr<IOTracer> io_tracer_; |
7c673cae | 1046 | |
f67539c2 | 1047 | Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, |
20effc67 TL |
1048 | MutableCFOptions mutable_cf_options, |
1049 | const std::shared_ptr<IOTracer>& io_tracer, | |
1050 | uint64_t version_number = 0); | |
7c673cae FG |
1051 | |
1052 | ~Version(); | |
1053 | ||
1054 | // No copying allowed | |
f67539c2 TL |
1055 | Version(const Version&) = delete; |
1056 | void operator=(const Version&) = delete; | |
7c673cae FG |
1057 | }; |
1058 | ||
11fdf7f2 | 1059 | class BaseReferencedVersionBuilder; |
11fdf7f2 | 1060 | |
f67539c2 TL |
1061 | class AtomicGroupReadBuffer { |
1062 | public: | |
1e59de90 | 1063 | AtomicGroupReadBuffer() = default; |
f67539c2 TL |
1064 | Status AddEdit(VersionEdit* edit); |
1065 | void Clear(); | |
1066 | bool IsFull() const; | |
1067 | bool IsEmpty() const; | |
1068 | ||
1069 | uint64_t TEST_read_edits_in_atomic_group() const { | |
1070 | return read_edits_in_atomic_group_; | |
1071 | } | |
1072 | std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; } | |
1073 | ||
1074 | private: | |
1075 | uint64_t read_edits_in_atomic_group_ = 0; | |
1076 | std::vector<VersionEdit> replay_buffer_; | |
1077 | }; | |
1078 | ||
1079 | // VersionSet is the collection of versions of all the column families of the | |
1080 | // database. Each database owns one VersionSet. A VersionSet has access to all | |
1081 | // column families via ColumnFamilySet, i.e. set of the column families. | |
7c673cae FG |
1082 | class VersionSet { |
1083 | public: | |
1084 | VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, | |
f67539c2 | 1085 | const FileOptions& file_options, Cache* table_cache, |
7c673cae | 1086 | WriteBufferManager* write_buffer_manager, |
f67539c2 | 1087 | WriteController* write_controller, |
20effc67 | 1088 | BlockCacheTracer* const block_cache_tracer, |
1e59de90 TL |
1089 | const std::shared_ptr<IOTracer>& io_tracer, |
1090 | const std::string& db_id, const std::string& db_session_id); | |
f67539c2 TL |
1091 | // No copying allowed |
1092 | VersionSet(const VersionSet&) = delete; | |
1093 | void operator=(const VersionSet&) = delete; | |
1094 | ||
494da23a | 1095 | virtual ~VersionSet(); |
7c673cae | 1096 | |
20effc67 TL |
1097 | Status LogAndApplyToDefaultColumnFamily( |
1098 | VersionEdit* edit, InstrumentedMutex* mu, | |
1e59de90 | 1099 | FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, |
20effc67 TL |
1100 | const ColumnFamilyOptions* column_family_options = nullptr) { |
1101 | ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); | |
1102 | const MutableCFOptions* cf_options = | |
1103 | default_cf->GetLatestMutableCFOptions(); | |
1e59de90 TL |
1104 | return LogAndApply(default_cf, *cf_options, edit, mu, |
1105 | dir_contains_current_file, new_descriptor_log, | |
1106 | column_family_options); | |
20effc67 TL |
1107 | } |
1108 | ||
7c673cae FG |
1109 | // Apply *edit to the current version to form a new descriptor that |
1110 | // is both saved to persistent state and installed as the new | |
1111 | // current version. Will release *mu while actually writing to the file. | |
1112 | // column_family_options has to be set if edit is column family add | |
1113 | // REQUIRES: *mu is held on entry. | |
1114 | // REQUIRES: no other thread concurrently calls LogAndApply() | |
1115 | Status LogAndApply( | |
1116 | ColumnFamilyData* column_family_data, | |
1117 | const MutableCFOptions& mutable_cf_options, VersionEdit* edit, | |
1e59de90 | 1118 | InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, |
7c673cae FG |
1119 | bool new_descriptor_log = false, |
1120 | const ColumnFamilyOptions* column_family_options = nullptr) { | |
494da23a TL |
1121 | autovector<ColumnFamilyData*> cfds; |
1122 | cfds.emplace_back(column_family_data); | |
1123 | autovector<const MutableCFOptions*> mutable_cf_options_list; | |
1124 | mutable_cf_options_list.emplace_back(&mutable_cf_options); | |
1125 | autovector<autovector<VersionEdit*>> edit_lists; | |
1126 | autovector<VersionEdit*> edit_list; | |
1127 | edit_list.emplace_back(edit); | |
1128 | edit_lists.emplace_back(edit_list); | |
11fdf7f2 | 1129 | return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, |
1e59de90 TL |
1130 | dir_contains_current_file, new_descriptor_log, |
1131 | column_family_options); | |
7c673cae FG |
1132 | } |
1133 | // The batch version. If edit_list.size() > 1, caller must ensure that | |
1134 | // no edit in the list column family add or drop | |
1135 | Status LogAndApply( | |
1136 | ColumnFamilyData* column_family_data, | |
1137 | const MutableCFOptions& mutable_cf_options, | |
1138 | const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu, | |
1e59de90 | 1139 | FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, |
20effc67 TL |
1140 | const ColumnFamilyOptions* column_family_options = nullptr, |
1141 | const std::function<void(const Status&)>& manifest_wcb = {}) { | |
494da23a TL |
1142 | autovector<ColumnFamilyData*> cfds; |
1143 | cfds.emplace_back(column_family_data); | |
1144 | autovector<const MutableCFOptions*> mutable_cf_options_list; | |
1145 | mutable_cf_options_list.emplace_back(&mutable_cf_options); | |
1146 | autovector<autovector<VersionEdit*>> edit_lists; | |
1147 | edit_lists.emplace_back(edit_list); | |
11fdf7f2 | 1148 | return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, |
1e59de90 TL |
1149 | dir_contains_current_file, new_descriptor_log, |
1150 | column_family_options, {manifest_wcb}); | |
11fdf7f2 TL |
1151 | } |
1152 | ||
1153 | // The across-multi-cf batch version. If edit_lists contain more than | |
1154 | // 1 version edits, caller must ensure that no edit in the []list is column | |
1155 | // family manipulation. | |
494da23a TL |
1156 | virtual Status LogAndApply( |
1157 | const autovector<ColumnFamilyData*>& cfds, | |
1158 | const autovector<const MutableCFOptions*>& mutable_cf_options_list, | |
1159 | const autovector<autovector<VersionEdit*>>& edit_lists, | |
1e59de90 | 1160 | InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, |
494da23a | 1161 | bool new_descriptor_log = false, |
20effc67 TL |
1162 | const ColumnFamilyOptions* new_cf_options = nullptr, |
1163 | const std::vector<std::function<void(const Status&)>>& manifest_wcbs = | |
1164 | {}); | |
494da23a | 1165 | |
f67539c2 TL |
1166 | static Status GetCurrentManifestPath(const std::string& dbname, |
1167 | FileSystem* fs, | |
1168 | std::string* manifest_filename, | |
1169 | uint64_t* manifest_file_number); | |
1e59de90 | 1170 | void WakeUpWaitingManifestWriters(); |
7c673cae FG |
1171 | |
1172 | // Recover the last saved descriptor from persistent storage. | |
1173 | // If read_only == true, Recover() will not complain if some column families | |
1174 | // are not opened | |
1175 | Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families, | |
1e59de90 TL |
1176 | bool read_only = false, std::string* db_id = nullptr, |
1177 | bool no_error_if_files_missing = false); | |
7c673cae | 1178 | |
20effc67 TL |
1179 | Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families, |
1180 | bool read_only, | |
1181 | const std::vector<std::string>& files_in_dbname, | |
1182 | std::string* db_id, bool* has_missing_table_file); | |
1183 | ||
1184 | // Try to recover the version set to the most recent consistent state | |
1185 | // recorded in the specified manifest. | |
1186 | Status TryRecoverFromOneManifest( | |
1187 | const std::string& manifest_path, | |
1188 | const std::vector<ColumnFamilyDescriptor>& column_families, | |
1189 | bool read_only, std::string* db_id, bool* has_missing_table_file); | |
1190 | ||
7c673cae FG |
1191 | // Reads a manifest file and returns a list of column families in |
1192 | // column_families. | |
1193 | static Status ListColumnFamilies(std::vector<std::string>* column_families, | |
f67539c2 | 1194 | const std::string& dbname, FileSystem* fs); |
1e59de90 TL |
1195 | static Status ListColumnFamiliesFromManifest( |
1196 | const std::string& manifest_path, FileSystem* fs, | |
1197 | std::vector<std::string>* column_families); | |
7c673cae FG |
1198 | |
1199 | #ifndef ROCKSDB_LITE | |
1200 | // Try to reduce the number of levels. This call is valid when | |
1201 | // only one level from the new max level to the old | |
1202 | // max level containing files. | |
1203 | // The call is static, since number of levels is immutable during | |
1204 | // the lifetime of a RocksDB instance. It reduces number of levels | |
1205 | // in a DB by applying changes to manifest. | |
1206 | // For example, a db currently has 7 levels [0-6], and a call to | |
1207 | // to reduce to 5 [0-4] can only be executed when only one level | |
1208 | // among [4-6] contains files. | |
1209 | static Status ReduceNumberOfLevels(const std::string& dbname, | |
1210 | const Options* options, | |
f67539c2 | 1211 | const FileOptions& file_options, |
7c673cae FG |
1212 | int new_levels); |
1213 | ||
f67539c2 TL |
1214 | // Get the checksum information of all live files |
1215 | Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list); | |
1216 | ||
7c673cae FG |
1217 | // printf contents (for debugging) |
1218 | Status DumpManifest(Options& options, std::string& manifestFileName, | |
1219 | bool verbose, bool hex = false, bool json = false); | |
1220 | ||
1221 | #endif // ROCKSDB_LITE | |
1222 | ||
1e59de90 TL |
1223 | const std::string& DbSessionId() const { return db_session_id_; } |
1224 | ||
7c673cae FG |
1225 | // Return the current manifest file number |
1226 | uint64_t manifest_file_number() const { return manifest_file_number_; } | |
1227 | ||
1228 | uint64_t options_file_number() const { return options_file_number_; } | |
1229 | ||
1230 | uint64_t pending_manifest_file_number() const { | |
1231 | return pending_manifest_file_number_; | |
1232 | } | |
1233 | ||
1234 | uint64_t current_next_file_number() const { return next_file_number_.load(); } | |
1235 | ||
1e59de90 TL |
1236 | uint64_t min_log_number_to_keep() const { |
1237 | return min_log_number_to_keep_.load(); | |
11fdf7f2 TL |
1238 | } |
1239 | ||
7c673cae FG |
1240 | // Allocate and return a new file number |
1241 | uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } | |
1242 | ||
11fdf7f2 TL |
1243 | // Fetch And Add n new file number |
1244 | uint64_t FetchAddFileNumber(uint64_t n) { | |
1245 | return next_file_number_.fetch_add(n); | |
1246 | } | |
1247 | ||
7c673cae FG |
1248 | // Return the last sequence number. |
1249 | uint64_t LastSequence() const { | |
1250 | return last_sequence_.load(std::memory_order_acquire); | |
1251 | } | |
1252 | ||
11fdf7f2 TL |
1253 | // Note: memory_order_acquire must be sufficient. |
1254 | uint64_t LastAllocatedSequence() const { | |
1255 | return last_allocated_sequence_.load(std::memory_order_seq_cst); | |
1256 | } | |
1257 | ||
1258 | // Note: memory_order_acquire must be sufficient. | |
1259 | uint64_t LastPublishedSequence() const { | |
1260 | return last_published_sequence_.load(std::memory_order_seq_cst); | |
1261 | } | |
1262 | ||
7c673cae FG |
1263 | // Set the last sequence number to s. |
1264 | void SetLastSequence(uint64_t s) { | |
1265 | assert(s >= last_sequence_); | |
11fdf7f2 TL |
1266 | // Last visible sequence must always be less than last written seq |
1267 | assert(!db_options_->two_write_queues || s <= last_allocated_sequence_); | |
7c673cae FG |
1268 | last_sequence_.store(s, std::memory_order_release); |
1269 | } | |
1270 | ||
11fdf7f2 TL |
1271 | // Note: memory_order_release must be sufficient |
1272 | void SetLastPublishedSequence(uint64_t s) { | |
1273 | assert(s >= last_published_sequence_); | |
1274 | last_published_sequence_.store(s, std::memory_order_seq_cst); | |
1275 | } | |
1276 | ||
1277 | // Note: memory_order_release must be sufficient | |
1278 | void SetLastAllocatedSequence(uint64_t s) { | |
1279 | assert(s >= last_allocated_sequence_); | |
1280 | last_allocated_sequence_.store(s, std::memory_order_seq_cst); | |
1281 | } | |
1282 | ||
1283 | // Note: memory_order_release must be sufficient | |
1284 | uint64_t FetchAddLastAllocatedSequence(uint64_t s) { | |
1285 | return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst); | |
1286 | } | |
1287 | ||
7c673cae | 1288 | // Mark the specified file number as used. |
11fdf7f2 TL |
1289 | // REQUIRED: this is only called during single-threaded recovery or repair. |
1290 | void MarkFileNumberUsed(uint64_t number); | |
1291 | ||
1292 | // Mark the specified log number as deleted | |
1293 | // REQUIRED: this is only called during single-threaded recovery or repair, or | |
1294 | // from ::LogAndApply where the global mutex is held. | |
1e59de90 | 1295 | void MarkMinLogNumberToKeep(uint64_t number); |
7c673cae FG |
1296 | |
1297 | // Return the log file number for the log file that is currently | |
1298 | // being compacted, or zero if there is no such log file. | |
1299 | uint64_t prev_log_number() const { return prev_log_number_; } | |
1300 | ||
11fdf7f2 TL |
1301 | // Returns the minimum log number which still has data not flushed to any SST |
1302 | // file. | |
1303 | // In non-2PC mode, all the log numbers smaller than this number can be safely | |
1e59de90 TL |
1304 | // deleted, although we still use `min_log_number_to_keep_` to determine when |
1305 | // to delete a WAL file. | |
11fdf7f2 TL |
1306 | uint64_t MinLogNumberWithUnflushedData() const { |
1307 | return PreComputeMinLogNumberWithUnflushedData(nullptr); | |
1308 | } | |
1e59de90 TL |
1309 | |
1310 | // Returns the minimum log number which still has data not flushed to any SST | |
1311 | // file. | |
1312 | // Empty column families' log number is considered to be | |
1313 | // new_log_number_for_empty_cf. | |
1314 | uint64_t PreComputeMinLogNumberWithUnflushedData( | |
1315 | uint64_t new_log_number_for_empty_cf) const { | |
1316 | uint64_t min_log_num = std::numeric_limits<uint64_t>::max(); | |
1317 | for (auto cfd : *column_family_set_) { | |
1318 | // It's safe to ignore dropped column families here: | |
1319 | // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. | |
1320 | uint64_t num = | |
1321 | cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber(); | |
1322 | if (min_log_num > num && !cfd->IsDropped()) { | |
1323 | min_log_num = num; | |
1324 | } | |
1325 | } | |
1326 | return min_log_num; | |
1327 | } | |
11fdf7f2 TL |
1328 | // Returns the minimum log number which still has data not flushed to any SST |
1329 | // file, except data from `cfd_to_skip`. | |
1330 | uint64_t PreComputeMinLogNumberWithUnflushedData( | |
1331 | const ColumnFamilyData* cfd_to_skip) const { | |
7c673cae FG |
1332 | uint64_t min_log_num = std::numeric_limits<uint64_t>::max(); |
1333 | for (auto cfd : *column_family_set_) { | |
11fdf7f2 TL |
1334 | if (cfd == cfd_to_skip) { |
1335 | continue; | |
1336 | } | |
7c673cae FG |
1337 | // It's safe to ignore dropped column families here: |
1338 | // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. | |
1339 | if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { | |
1340 | min_log_num = cfd->GetLogNumber(); | |
1341 | } | |
1342 | } | |
1343 | return min_log_num; | |
1344 | } | |
1e59de90 TL |
1345 | // Returns the minimum log number which still has data not flushed to any SST |
1346 | // file, except data from `cfds_to_skip`. | |
1347 | uint64_t PreComputeMinLogNumberWithUnflushedData( | |
1348 | const std::unordered_set<const ColumnFamilyData*>& cfds_to_skip) const { | |
1349 | uint64_t min_log_num = std::numeric_limits<uint64_t>::max(); | |
1350 | for (auto cfd : *column_family_set_) { | |
1351 | if (cfds_to_skip.count(cfd)) { | |
1352 | continue; | |
1353 | } | |
1354 | // It's safe to ignore dropped column families here: | |
1355 | // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. | |
1356 | if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { | |
1357 | min_log_num = cfd->GetLogNumber(); | |
1358 | } | |
1359 | } | |
1360 | return min_log_num; | |
1361 | } | |
7c673cae FG |
1362 | |
1363 | // Create an iterator that reads over the compaction inputs for "*c". | |
1364 | // The caller should delete the iterator when no longer needed. | |
20effc67 | 1365 | // @param read_options Must outlive the returned iterator. |
1e59de90 | 1366 | // @param start, end indicates compaction range |
11fdf7f2 | 1367 | InternalIterator* MakeInputIterator( |
20effc67 TL |
1368 | const ReadOptions& read_options, const Compaction* c, |
1369 | RangeDelAggregator* range_del_agg, | |
1e59de90 TL |
1370 | const FileOptions& file_options_compactions, |
1371 | const std::optional<const Slice>& start, | |
1372 | const std::optional<const Slice>& end); | |
7c673cae | 1373 | |
20effc67 TL |
1374 | // Add all files listed in any live version to *live_table_files and |
1375 | // *live_blob_files. Note that these lists may contain duplicates. | |
1376 | void AddLiveFiles(std::vector<uint64_t>* live_table_files, | |
1377 | std::vector<uint64_t>* live_blob_files) const; | |
7c673cae | 1378 | |
1e59de90 TL |
1379 | // Remove live files that are in the delete candidate lists. |
1380 | void RemoveLiveFiles( | |
1381 | std::vector<ObsoleteFileInfo>& sst_delete_candidates, | |
1382 | std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const; | |
1383 | ||
7c673cae | 1384 | // Return the approximate size of data to be scanned for range [start, end) |
f67539c2 | 1385 | // in levels [start_level, end_level). If end_level == -1 it will search |
7c673cae | 1386 | // through all non-empty levels |
f67539c2 TL |
1387 | uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, |
1388 | const Slice& start, const Slice& end, | |
1389 | int start_level, int end_level, | |
1390 | TableReaderCaller caller); | |
7c673cae FG |
1391 | |
1392 | // Return the size of the current manifest file | |
1393 | uint64_t manifest_file_size() const { return manifest_file_size_; } | |
1394 | ||
7c673cae FG |
1395 | Status GetMetadataForFile(uint64_t number, int* filelevel, |
1396 | FileMetaData** metadata, ColumnFamilyData** cfd); | |
1397 | ||
1398 | // This function doesn't support leveldb SST filenames | |
1e59de90 | 1399 | void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata); |
7c673cae | 1400 | |
20effc67 TL |
1401 | void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) { |
1402 | assert(table_cache_); | |
1403 | ||
1404 | table_cache_->Erase(GetSlice(&blob_file_number)); | |
1405 | ||
1406 | obsolete_blob_files_.emplace_back(blob_file_number, std::move(path)); | |
1407 | } | |
1408 | ||
11fdf7f2 | 1409 | void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files, |
20effc67 | 1410 | std::vector<ObsoleteBlobFileInfo>* blob_files, |
7c673cae FG |
1411 | std::vector<std::string>* manifest_filenames, |
1412 | uint64_t min_pending_output); | |
1413 | ||
1414 | ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } | |
1e59de90 TL |
1415 | RefedColumnFamilySet GetRefedColumnFamilySet() { |
1416 | return RefedColumnFamilySet(GetColumnFamilySet()); | |
1417 | } | |
1418 | ||
f67539c2 TL |
1419 | const FileOptions& file_options() { return file_options_; } |
1420 | void ChangeFileOptions(const MutableDBOptions& new_options) { | |
1421 | file_options_.writable_file_max_buffer_size = | |
11fdf7f2 TL |
1422 | new_options.writable_file_max_buffer_size; |
1423 | } | |
1424 | ||
1425 | const ImmutableDBOptions* db_options() const { return db_options_; } | |
7c673cae FG |
1426 | |
1427 | static uint64_t GetNumLiveVersions(Version* dummy_versions); | |
1428 | ||
1429 | static uint64_t GetTotalSstFilesSize(Version* dummy_versions); | |
1430 | ||
1e59de90 TL |
1431 | static uint64_t GetTotalBlobFileSize(Version* dummy_versions); |
1432 | ||
20effc67 TL |
1433 | // Get the IO Status returned by written Manifest. |
1434 | const IOStatus& io_status() const { return io_status_; } | |
1435 | ||
1436 | // The returned WalSet needs to be accessed with DB mutex held. | |
1437 | const WalSet& GetWalSet() const { return wals_; } | |
1438 | ||
1439 | void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) { | |
1440 | assert(cfd); | |
1441 | ||
1442 | const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); | |
1443 | Version* const version = | |
1444 | new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); | |
1445 | ||
1446 | constexpr bool update_stats = false; | |
1e59de90 | 1447 | version->PrepareAppend(mutable_cf_options, update_stats); |
20effc67 TL |
1448 | AppendVersion(cfd, version); |
1449 | } | |
1450 | ||
494da23a | 1451 | protected: |
20effc67 | 1452 | using VersionBuilderMap = |
1e59de90 | 1453 | UnorderedMap<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>; |
20effc67 | 1454 | |
7c673cae FG |
1455 | struct ManifestWriter; |
1456 | ||
1457 | friend class Version; | |
20effc67 TL |
1458 | friend class VersionEditHandler; |
1459 | friend class VersionEditHandlerPointInTime; | |
1460 | friend class DumpManifestHandler; | |
7c673cae | 1461 | friend class DBImpl; |
494da23a | 1462 | friend class DBImplReadOnly; |
7c673cae FG |
1463 | |
1464 | struct LogReporter : public log::Reader::Reporter { | |
1465 | Status* status; | |
11fdf7f2 | 1466 | virtual void Corruption(size_t /*bytes*/, const Status& s) override { |
20effc67 TL |
1467 | if (status->ok()) { |
1468 | *status = s; | |
1469 | } | |
7c673cae FG |
1470 | } |
1471 | }; | |
1472 | ||
20effc67 TL |
1473 | void Reset(); |
1474 | ||
f67539c2 TL |
1475 | // Returns approximated offset of a key in a file for a given version. |
1476 | uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, | |
1477 | const Slice& key, TableReaderCaller caller); | |
7c673cae | 1478 | |
f67539c2 TL |
1479 | // Returns approximated data size between start and end keys in a file |
1480 | // for a given version. | |
7c673cae | 1481 | uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, |
f67539c2 TL |
1482 | const Slice& start, const Slice& end, |
1483 | TableReaderCaller caller); | |
1484 | ||
1485 | struct MutableCFState { | |
1486 | uint64_t log_number; | |
1e59de90 TL |
1487 | std::string full_history_ts_low; |
1488 | ||
1489 | explicit MutableCFState() = default; | |
1490 | explicit MutableCFState(uint64_t _log_number, std::string ts_low) | |
1491 | : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {} | |
f67539c2 | 1492 | }; |
7c673cae FG |
1493 | |
1494 | // Save current contents to *log | |
f67539c2 TL |
1495 | Status WriteCurrentStateToManifest( |
1496 | const std::unordered_map<uint32_t, MutableCFState>& curr_state, | |
20effc67 | 1497 | const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s); |
7c673cae FG |
1498 | |
1499 | void AppendVersion(ColumnFamilyData* column_family_data, Version* v); | |
1500 | ||
1501 | ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, | |
20effc67 | 1502 | const VersionEdit* edit); |
7c673cae | 1503 | |
1e59de90 TL |
1504 | Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath, |
1505 | int level, const FileMetaData& meta); | |
20effc67 TL |
1506 | |
1507 | // Protected by DB mutex. | |
1508 | WalSet wals_; | |
7c673cae | 1509 | |
20effc67 TL |
1510 | std::unique_ptr<ColumnFamilySet> column_family_set_; |
1511 | Cache* table_cache_; | |
7c673cae | 1512 | Env* const env_; |
20effc67 | 1513 | FileSystemPtr const fs_; |
1e59de90 | 1514 | SystemClock* const clock_; |
7c673cae | 1515 | const std::string dbname_; |
f67539c2 | 1516 | std::string db_id_; |
7c673cae FG |
1517 | const ImmutableDBOptions* const db_options_; |
1518 | std::atomic<uint64_t> next_file_number_; | |
1e59de90 TL |
1519 | // Any WAL number smaller than this should be ignored during recovery, |
1520 | // and is qualified for being deleted. | |
1521 | std::atomic<uint64_t> min_log_number_to_keep_ = {0}; | |
7c673cae FG |
1522 | uint64_t manifest_file_number_; |
1523 | uint64_t options_file_number_; | |
1e59de90 | 1524 | uint64_t options_file_size_; |
7c673cae | 1525 | uint64_t pending_manifest_file_number_; |
11fdf7f2 TL |
1526 | // The last seq visible to reads. It normally indicates the last sequence in |
1527 | // the memtable but when using two write queues it could also indicate the | |
1528 | // last sequence in the WAL visible to reads. | |
7c673cae | 1529 | std::atomic<uint64_t> last_sequence_; |
1e59de90 TL |
1530 | // The last sequence number of data committed to the descriptor (manifest |
1531 | // file). | |
1532 | SequenceNumber descriptor_last_sequence_ = 0; | |
11fdf7f2 TL |
1533 | // The last seq that is already allocated. It is applicable only when we have |
1534 | // two write queues. In that case seq might or might not have appreated in | |
1535 | // memtable but it is expected to appear in the WAL. | |
1536 | // We have last_sequence <= last_allocated_sequence_ | |
1537 | std::atomic<uint64_t> last_allocated_sequence_; | |
1538 | // The last allocated sequence that is also published to the readers. This is | |
1539 | // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise | |
1540 | // last_sequence_ also indicates the last published seq. | |
1541 | // We have last_sequence <= last_published_sequence_ <= | |
1542 | // last_allocated_sequence_ | |
1543 | std::atomic<uint64_t> last_published_sequence_; | |
7c673cae FG |
1544 | uint64_t prev_log_number_; // 0 or backing store for memtable being compacted |
1545 | ||
1546 | // Opened lazily | |
494da23a | 1547 | std::unique_ptr<log::Writer> descriptor_log_; |
7c673cae FG |
1548 | |
1549 | // generates a increasing version number for every new version | |
1550 | uint64_t current_version_number_; | |
1551 | ||
1552 | // Queue of writers to the manifest file | |
1553 | std::deque<ManifestWriter*> manifest_writers_; | |
1554 | ||
1555 | // Current size of manifest file | |
1556 | uint64_t manifest_file_size_; | |
1557 | ||
11fdf7f2 | 1558 | std::vector<ObsoleteFileInfo> obsolete_files_; |
20effc67 | 1559 | std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_; |
7c673cae FG |
1560 | std::vector<std::string> obsolete_manifests_; |
1561 | ||
1562 | // env options for all reads and writes except compactions | |
f67539c2 | 1563 | FileOptions file_options_; |
7c673cae | 1564 | |
f67539c2 | 1565 | BlockCacheTracer* const block_cache_tracer_; |
7c673cae | 1566 | |
20effc67 TL |
1567 | // Store the IO status when Manifest is written |
1568 | IOStatus io_status_; | |
1569 | ||
1570 | std::shared_ptr<IOTracer> io_tracer_; | |
1571 | ||
1e59de90 TL |
1572 | std::string db_session_id_; |
1573 | ||
f67539c2 | 1574 | private: |
494da23a TL |
1575 | // REQUIRES db mutex at beginning. may release and re-acquire db mutex |
1576 | Status ProcessManifestWrites(std::deque<ManifestWriter>& writers, | |
1e59de90 TL |
1577 | InstrumentedMutex* mu, |
1578 | FSDirectory* dir_contains_current_file, | |
494da23a TL |
1579 | bool new_descriptor_log, |
1580 | const ColumnFamilyOptions* new_cf_options); | |
1581 | ||
1e59de90 TL |
1582 | void LogAndApplyCFHelper(VersionEdit* edit, |
1583 | SequenceNumber* max_last_sequence); | |
f67539c2 | 1584 | Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, |
1e59de90 TL |
1585 | VersionEdit* edit, SequenceNumber* max_last_sequence, |
1586 | InstrumentedMutex* mu); | |
7c673cae FG |
1587 | }; |
1588 | ||
f67539c2 TL |
1589 | // ReactiveVersionSet represents a collection of versions of the column |
1590 | // families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary, | |
1591 | // need to replay the MANIFEST (description log in older terms) in order to | |
1592 | // reconstruct and install versions. | |
494da23a TL |
1593 | class ReactiveVersionSet : public VersionSet { |
1594 | public: | |
1595 | ReactiveVersionSet(const std::string& dbname, | |
1596 | const ImmutableDBOptions* _db_options, | |
f67539c2 | 1597 | const FileOptions& _file_options, Cache* table_cache, |
494da23a | 1598 | WriteBufferManager* write_buffer_manager, |
20effc67 TL |
1599 | WriteController* write_controller, |
1600 | const std::shared_ptr<IOTracer>& io_tracer); | |
494da23a TL |
1601 | |
1602 | ~ReactiveVersionSet() override; | |
1603 | ||
1604 | Status ReadAndApply( | |
1605 | InstrumentedMutex* mu, | |
1606 | std::unique_ptr<log::FragmentBufferedReader>* manifest_reader, | |
1e59de90 | 1607 | Status* manifest_read_status, |
494da23a TL |
1608 | std::unordered_set<ColumnFamilyData*>* cfds_changed); |
1609 | ||
1610 | Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families, | |
1611 | std::unique_ptr<log::FragmentBufferedReader>* manifest_reader, | |
1612 | std::unique_ptr<log::Reader::Reporter>* manifest_reporter, | |
1613 | std::unique_ptr<Status>* manifest_reader_status); | |
1e59de90 TL |
1614 | #ifndef NDEBUG |
1615 | uint64_t TEST_read_edits_in_atomic_group() const; | |
1616 | #endif //! NDEBUG | |
494da23a | 1617 | |
1e59de90 | 1618 | std::vector<VersionEdit>& replay_buffer(); |
f67539c2 | 1619 | |
494da23a | 1620 | protected: |
494da23a TL |
1621 | // REQUIRES db mutex |
1622 | Status ApplyOneVersionEditToBuilder( | |
f67539c2 TL |
1623 | VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed, |
1624 | VersionEdit* version_edit); | |
494da23a TL |
1625 | |
1626 | Status MaybeSwitchManifest( | |
1627 | log::Reader::Reporter* reporter, | |
1628 | std::unique_ptr<log::FragmentBufferedReader>* manifest_reader); | |
1629 | ||
1630 | private: | |
1e59de90 | 1631 | std::unique_ptr<ManifestTailer> manifest_tailer_; |
494da23a TL |
1632 | |
1633 | using VersionSet::LogAndApply; | |
1634 | using VersionSet::Recover; | |
1635 | ||
1636 | Status LogAndApply( | |
1637 | const autovector<ColumnFamilyData*>& /*cfds*/, | |
1638 | const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/, | |
1639 | const autovector<autovector<VersionEdit*>>& /*edit_lists*/, | |
1e59de90 | 1640 | InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/, |
20effc67 TL |
1641 | bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, |
1642 | const std::vector<std::function<void(const Status&)>>& /*manifest_wcbs*/) | |
1643 | override { | |
494da23a TL |
1644 | return Status::NotSupported("not supported in reactive mode"); |
1645 | } | |
1646 | ||
1647 | // No copy allowed | |
1648 | ReactiveVersionSet(const ReactiveVersionSet&); | |
1649 | ReactiveVersionSet& operator=(const ReactiveVersionSet&); | |
1650 | }; | |
1651 | ||
f67539c2 | 1652 | } // namespace ROCKSDB_NAMESPACE |