]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/version_set.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / db / version_set.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9//
10// The representation of a DBImpl consists of a set of Versions. The
11// newest version is called "current". Older versions may be kept
12// around to provide a consistent view to live iterators.
13//
20effc67
TL
14// Each Version keeps track of a set of table files per level, as well as a
15// set of blob files. The entire set of versions is maintained in a
16// VersionSet.
7c673cae
FG
17//
18// Version,VersionSet are thread-compatible, but require external
19// synchronization on all accesses.
20
21#pragma once
22#include <atomic>
23#include <deque>
24#include <limits>
25#include <map>
26#include <memory>
1e59de90 27#include <optional>
7c673cae
FG
28#include <set>
29#include <string>
1e59de90 30#include <unordered_set>
7c673cae
FG
31#include <utility>
32#include <vector>
33
20effc67
TL
34#include "cache/cache_helpers.h"
35#include "db/blob/blob_file_meta.h"
7c673cae 36#include "db/column_family.h"
f67539c2
TL
37#include "db/compaction/compaction.h"
38#include "db/compaction/compaction_picker.h"
7c673cae
FG
39#include "db/dbformat.h"
40#include "db/file_indexer.h"
41#include "db/log_reader.h"
42#include "db/range_del_aggregator.h"
11fdf7f2 43#include "db/read_callback.h"
7c673cae
FG
44#include "db/table_cache.h"
45#include "db/version_builder.h"
46#include "db/version_edit.h"
47#include "db/write_controller.h"
20effc67 48#include "env/file_system_tracer.h"
1e59de90
TL
49#if USE_COROUTINES
50#include "folly/experimental/coro/BlockingWait.h"
51#include "folly/experimental/coro/Collect.h"
52#endif
7c673cae
FG
53#include "monitoring/instrumented_mutex.h"
54#include "options/db_options.h"
55#include "port/port.h"
56#include "rocksdb/env.h"
f67539c2
TL
57#include "rocksdb/file_checksum.h"
58#include "table/get_context.h"
59#include "table/multiget_context.h"
60#include "trace_replay/block_cache_tracer.h"
1e59de90
TL
61#include "util/autovector.h"
62#include "util/coro_utils.h"
63#include "util/hash_containers.h"
7c673cae 64
f67539c2 65namespace ROCKSDB_NAMESPACE {
7c673cae
FG
66
67namespace log {
68class Writer;
69}
70
1e59de90 71class BlobIndex;
7c673cae 72class Compaction;
7c673cae
FG
73class LogBuffer;
74class LookupKey;
75class MemTable;
76class Version;
77class VersionSet;
78class WriteBufferManager;
79class MergeContext;
80class ColumnFamilySet;
7c673cae 81class MergeIteratorBuilder;
1e59de90
TL
82class SystemClock;
83class ManifestTailer;
84class FilePickerMultiGet;
7c673cae 85
f67539c2
TL
86// VersionEdit is always supposed to be valid and it is used to point at
87// entries in Manifest. Ideally it should not be used as a container to
88// carry around few of its fields as function params because it can cause
89// readers to think it's a valid entry from Manifest. To avoid that confusion
90// introducing VersionEditParams to simply carry around multiple VersionEdit
91// params. It need not point to a valid record in Manifest.
92using VersionEditParams = VersionEdit;
93
7c673cae
FG
94// Return the smallest index i such that file_level.files[i]->largest >= key.
95// Return file_level.num_files if there is no such file.
96// REQUIRES: "file_level.files" contains a sorted list of
97// non-overlapping files.
98extern int FindFile(const InternalKeyComparator& icmp,
99 const LevelFilesBrief& file_level, const Slice& key);
100
101// Returns true iff some file in "files" overlaps the user key range
102// [*smallest,*largest].
103// smallest==nullptr represents a key smaller than all keys in the DB.
104// largest==nullptr represents a key largest than all keys in the DB.
105// REQUIRES: If disjoint_sorted_files, file_level.files[]
106// contains disjoint ranges in sorted order.
107extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
108 bool disjoint_sorted_files,
109 const LevelFilesBrief& file_level,
110 const Slice* smallest_user_key,
111 const Slice* largest_user_key);
112
113// Generate LevelFilesBrief from vector<FdWithKeyRange*>
114// Would copy smallest_key and largest_key data to sequential memory
115// arena: Arena used to allocate the memory
116extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
117 const std::vector<FileMetaData*>& files,
118 Arena* arena);
119
f67539c2
TL
120// Information of the storage associated with each Version, including number of
121// levels of LSM tree, files information at each level, files marked for
20effc67 122// compaction, blob files, etc.
7c673cae
FG
123class VersionStorageInfo {
124 public:
125 VersionStorageInfo(const InternalKeyComparator* internal_comparator,
126 const Comparator* user_comparator, int num_levels,
127 CompactionStyle compaction_style,
128 VersionStorageInfo* src_vstorage,
129 bool _force_consistency_checks);
f67539c2
TL
130 // No copying allowed
131 VersionStorageInfo(const VersionStorageInfo&) = delete;
132 void operator=(const VersionStorageInfo&) = delete;
7c673cae
FG
133 ~VersionStorageInfo();
134
135 void Reserve(int level, size_t size) { files_[level].reserve(size); }
136
20effc67
TL
137 void AddFile(int level, FileMetaData* f);
138
1e59de90
TL
139 // Resize/Initialize the space for compact_cursor_
140 void ResizeCompactCursors(int level) {
141 compact_cursor_.resize(level, InternalKey());
142 }
7c673cae 143
1e59de90
TL
144 const std::vector<InternalKey>& GetCompactCursors() const {
145 return compact_cursor_;
146 }
7c673cae 147
1e59de90
TL
148 // REQUIRES: ResizeCompactCursors has been called
149 void AddCursorForOneLevel(int level,
150 const InternalKey& smallest_uncompacted_key) {
151 compact_cursor_[level] = smallest_uncompacted_key;
152 }
7c673cae 153
1e59de90
TL
154 // REQUIRES: lock is held
155 // Update the compact cursor and advance the file index using increment
156 // so that it can point to the next cursor (increment means the number of
157 // input files in this level of the last compaction)
158 const InternalKey& GetNextCompactCursor(int level, size_t increment) {
159 int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment;
160 assert(cmp_idx <= (int)files_by_compaction_pri_[level].size());
161 // TODO(zichen): may need to update next_file_to_compact_by_size_
162 // for parallel compaction.
163 InternalKey new_cursor;
164 if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) {
165 cmp_idx = 0;
166 }
167 // TODO(zichen): rethink if this strategy gives us some good guarantee
168 return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest;
7c673cae
FG
169 }
170
1e59de90
TL
171 void ReserveBlob(size_t size) { blob_files_.reserve(size); }
172
173 void AddBlobFile(std::shared_ptr<BlobFileMetaData> blob_file_meta);
174
175 void PrepareForVersionAppend(const ImmutableOptions& immutable_options,
176 const MutableCFOptions& mutable_cf_options);
177
178 // REQUIRES: PrepareForVersionAppend has been called
179 void SetFinalized();
180
7c673cae
FG
181 // Update the accumulated stats from a file-meta.
182 void UpdateAccumulatedStats(FileMetaData* file_meta);
183
11fdf7f2 184 // Decrease the current stat from a to-be-deleted file-meta
7c673cae
FG
185 void RemoveCurrentStats(FileMetaData* file_meta);
186
7c673cae
FG
187 // Updates internal structures that keep track of compaction scores
188 // We use compaction scores to figure out which compaction to do next
189 // REQUIRES: db_mutex held!!
190 // TODO find a better way to pass compaction_options_fifo.
1e59de90 191 void ComputeCompactionScore(const ImmutableOptions& immutable_options,
7c673cae
FG
192 const MutableCFOptions& mutable_cf_options);
193
194 // Estimate est_comp_needed_bytes_
195 void EstimateCompactionBytesNeeded(
196 const MutableCFOptions& mutable_cf_options);
197
198 // This computes files_marked_for_compaction_ and is called by
199 // ComputeCompactionScore()
200 void ComputeFilesMarkedForCompaction();
201
11fdf7f2
TL
202 // This computes ttl_expired_files_ and is called by
203 // ComputeCompactionScore()
1e59de90 204 void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions,
11fdf7f2
TL
205 const uint64_t ttl);
206
f67539c2
TL
207 // This computes files_marked_for_periodic_compaction_ and is called by
208 // ComputeCompactionScore()
209 void ComputeFilesMarkedForPeriodicCompaction(
1e59de90 210 const ImmutableOptions& ioptions,
f67539c2
TL
211 const uint64_t periodic_compaction_seconds);
212
11fdf7f2
TL
213 // This computes bottommost_files_marked_for_compaction_ and is called by
214 // ComputeCompactionScore() or UpdateOldestSnapshot().
215 //
216 // Among bottommost files (assumes they've already been computed), marks the
217 // ones that have keys that would be eliminated if recompacted, according to
218 // the seqnum of the oldest existing snapshot. Must be called every time
219 // oldest snapshot changes as that is when bottom-level files can become
220 // eligible for compaction.
221 //
222 // REQUIRES: DB mutex held
223 void ComputeBottommostFilesMarkedForCompaction();
224
1e59de90
TL
225 // This computes files_marked_for_forced_blob_gc_ and is called by
226 // ComputeCompactionScore()
227 //
228 // REQUIRES: DB mutex held
229 void ComputeFilesMarkedForForcedBlobGC(
230 double blob_garbage_collection_age_cutoff,
231 double blob_garbage_collection_force_threshold);
7c673cae 232
1e59de90 233 bool level0_non_overlapping() const { return level0_non_overlapping_; }
11fdf7f2
TL
234
235 // Updates the oldest snapshot and related internal state, like the bottommost
236 // files marked for compaction.
237 // REQUIRES: DB mutex held
238 void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum);
239
7c673cae 240 int MaxInputLevel() const;
11fdf7f2 241 int MaxOutputLevel(bool allow_ingest_behind) const;
7c673cae
FG
242
243 // Return level number that has idx'th highest score
244 int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
245
246 // Return idx'th highest score
247 double CompactionScore(int idx) const { return compaction_score_[idx]; }
248
249 void GetOverlappingInputs(
250 int level, const InternalKey* begin, // nullptr means before all keys
251 const InternalKey* end, // nullptr means after all keys
252 std::vector<FileMetaData*>* inputs,
253 int hint_index = -1, // index of overlap file
254 int* file_index = nullptr, // return index of overlap file
11fdf7f2
TL
255 bool expand_range = true, // if set, returns files which overlap the
256 // range and overlap each other. If false,
7c673cae 257 // then just files intersecting the range
11fdf7f2
TL
258 InternalKey** next_smallest = nullptr) // if non-null, returns the
259 const; // smallest key of next file not included
7c673cae
FG
260 void GetCleanInputsWithinInterval(
261 int level, const InternalKey* begin, // nullptr means before all keys
262 const InternalKey* end, // nullptr means after all keys
263 std::vector<FileMetaData*>* inputs,
264 int hint_index = -1, // index of overlap file
265 int* file_index = nullptr) // return index of overlap file
266 const;
267
268 void GetOverlappingInputsRangeBinarySearch(
11fdf7f2
TL
269 int level, // level > 0
270 const InternalKey* begin, // nullptr means before all keys
271 const InternalKey* end, // nullptr means after all keys
7c673cae
FG
272 std::vector<FileMetaData*>* inputs,
273 int hint_index, // index of overlap file
274 int* file_index, // return index of overlap file
11fdf7f2
TL
275 bool within_interval = false, // if set, force the inputs within interval
276 InternalKey** next_smallest = nullptr) // if non-null, returns the
277 const; // smallest key of next file not included
7c673cae 278
7c673cae
FG
279 // Returns true iff some file in the specified level overlaps
280 // some part of [*smallest_user_key,*largest_user_key].
281 // smallest_user_key==NULL represents a key smaller than all keys in the DB.
282 // largest_user_key==NULL represents a key largest than all keys in the DB.
283 bool OverlapInLevel(int level, const Slice* smallest_user_key,
284 const Slice* largest_user_key);
285
286 // Returns true iff the first or last file in inputs contains
287 // an overlapping user key to the file "just outside" of it (i.e.
288 // just after the last file, or just before the first file)
289 // REQUIRES: "*inputs" is a sorted list of non-overlapping files
290 bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
291 int level);
292
293 int num_levels() const { return num_levels_; }
294
1e59de90 295 // REQUIRES: PrepareForVersionAppend has been called
7c673cae
FG
296 int num_non_empty_levels() const {
297 assert(finalized_);
298 return num_non_empty_levels_;
299 }
300
1e59de90 301 // REQUIRES: PrepareForVersionAppend has been called
7c673cae
FG
302 // This may or may not return number of level files. It is to keep backward
303 // compatible behavior in universal compaction.
304 int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
305
306 void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
307
1e59de90 308 // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
7c673cae
FG
309 int NumLevelFiles(int level) const {
310 assert(finalized_);
311 return static_cast<int>(files_[level].size());
312 }
313
314 // Return the combined file size of all files at the specified level.
315 uint64_t NumLevelBytes(int level) const;
316
1e59de90 317 // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
7c673cae
FG
318 const std::vector<FileMetaData*>& LevelFiles(int level) const {
319 return files_[level];
320 }
321
20effc67
TL
322 class FileLocation {
323 public:
324 FileLocation() = default;
325 FileLocation(int level, size_t position)
326 : level_(level), position_(position) {}
327
328 int GetLevel() const { return level_; }
329 size_t GetPosition() const { return position_; }
330
331 bool IsValid() const { return level_ >= 0; }
332
333 bool operator==(const FileLocation& rhs) const {
334 return level_ == rhs.level_ && position_ == rhs.position_;
335 }
336
337 bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); }
338
339 static FileLocation Invalid() { return FileLocation(); }
340
341 private:
342 int level_ = -1;
343 size_t position_ = 0;
344 };
345
1e59de90 346 // REQUIRES: PrepareForVersionAppend has been called
20effc67
TL
347 FileLocation GetFileLocation(uint64_t file_number) const {
348 const auto it = file_locations_.find(file_number);
349
350 if (it == file_locations_.end()) {
351 return FileLocation::Invalid();
352 }
353
354 assert(it->second.GetLevel() < num_levels_);
355 assert(it->second.GetPosition() < files_[it->second.GetLevel()].size());
356 assert(files_[it->second.GetLevel()][it->second.GetPosition()]);
357 assert(files_[it->second.GetLevel()][it->second.GetPosition()]
358 ->fd.GetNumber() == file_number);
359
360 return it->second;
361 }
362
1e59de90 363 // REQUIRES: PrepareForVersionAppend has been called
20effc67
TL
364 FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const {
365 auto location = GetFileLocation(file_number);
366
367 if (!location.IsValid()) {
368 return nullptr;
369 }
370
371 return files_[location.GetLevel()][location.GetPosition()];
372 }
373
1e59de90
TL
374 // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
375 using BlobFiles = std::vector<std::shared_ptr<BlobFileMetaData>>;
20effc67
TL
376 const BlobFiles& GetBlobFiles() const { return blob_files_; }
377
1e59de90
TL
378 // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
379 BlobFiles::const_iterator GetBlobFileMetaDataLB(
380 uint64_t blob_file_number) const;
381
382 // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
383 std::shared_ptr<BlobFileMetaData> GetBlobFileMetaData(
384 uint64_t blob_file_number) const {
385 const auto it = GetBlobFileMetaDataLB(blob_file_number);
386
387 assert(it == blob_files_.end() || *it);
388
389 if (it != blob_files_.end() &&
390 (*it)->GetBlobFileNumber() == blob_file_number) {
391 return *it;
392 }
393
394 return std::shared_ptr<BlobFileMetaData>();
395 }
396
397 // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
398 struct BlobStats {
399 uint64_t total_file_size = 0;
400 uint64_t total_garbage_size = 0;
401 double space_amp = 0.0;
402 };
403
404 BlobStats GetBlobStats() const {
405 uint64_t total_file_size = 0;
406 uint64_t total_garbage_size = 0;
407
408 for (const auto& meta : blob_files_) {
409 assert(meta);
410
411 total_file_size += meta->GetBlobFileSize();
412 total_garbage_size += meta->GetGarbageBlobBytes();
413 }
414
415 double space_amp = 0.0;
416 if (total_file_size > total_garbage_size) {
417 space_amp = static_cast<double>(total_file_size) /
418 (total_file_size - total_garbage_size);
419 }
420
421 return BlobStats{total_file_size, total_garbage_size, space_amp};
422 }
423
f67539c2 424 const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
7c673cae
FG
425 assert(level < static_cast<int>(level_files_brief_.size()));
426 return level_files_brief_[level];
427 }
428
1e59de90 429 // REQUIRES: PrepareForVersionAppend has been called
7c673cae
FG
430 const std::vector<int>& FilesByCompactionPri(int level) const {
431 assert(finalized_);
432 return files_by_compaction_pri_[level];
433 }
434
1e59de90 435 // REQUIRES: ComputeCompactionScore has been called
7c673cae
FG
436 // REQUIRES: DB mutex held during access
437 const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
438 const {
439 assert(finalized_);
440 return files_marked_for_compaction_;
441 }
442
1e59de90 443 // REQUIRES: ComputeCompactionScore has been called
11fdf7f2
TL
444 // REQUIRES: DB mutex held during access
445 const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
446 assert(finalized_);
447 return expired_ttl_files_;
448 }
449
1e59de90 450 // REQUIRES: ComputeCompactionScore has been called
f67539c2
TL
451 // REQUIRES: DB mutex held during access
452 const autovector<std::pair<int, FileMetaData*>>&
453 FilesMarkedForPeriodicCompaction() const {
454 assert(finalized_);
455 return files_marked_for_periodic_compaction_;
456 }
457
458 void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
459 files_marked_for_periodic_compaction_.emplace_back(level, f);
460 }
461
1e59de90 462 // REQUIRES: ComputeCompactionScore has been called
11fdf7f2
TL
463 // REQUIRES: DB mutex held during access
464 const autovector<std::pair<int, FileMetaData*>>&
465 BottommostFilesMarkedForCompaction() const {
466 assert(finalized_);
467 return bottommost_files_marked_for_compaction_;
468 }
469
1e59de90
TL
470 // REQUIRES: ComputeCompactionScore has been called
471 // REQUIRES: DB mutex held during access
472 const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC()
473 const {
474 assert(finalized_);
475 return files_marked_for_forced_blob_gc_;
476 }
477
7c673cae 478 int base_level() const { return base_level_; }
494da23a 479 double level_multiplier() const { return level_multiplier_; }
7c673cae
FG
480
481 // REQUIRES: lock is held
482 // Set the index that is used to offset into files_by_compaction_pri_ to find
483 // the next compaction candidate file.
484 void SetNextCompactionIndex(int level, int index) {
485 next_file_to_compact_by_size_[level] = index;
486 }
487
488 // REQUIRES: lock is held
489 int NextCompactionIndex(int level) const {
490 return next_file_to_compact_by_size_[level];
491 }
492
1e59de90 493 // REQUIRES: PrepareForVersionAppend has been called
7c673cae
FG
494 const FileIndexer& file_indexer() const {
495 assert(finalized_);
496 return file_indexer_;
497 }
498
499 // Only the first few entries of files_by_compaction_pri_ are sorted.
500 // There is no need to sort all the files because it is likely
501 // that on a running system, we need to look at only the first
502 // few largest files because a new version is created every few
503 // seconds/minutes (because of concurrent compactions).
504 static const size_t kNumberFilesToSort = 50;
505
506 // Return a human-readable short (single-line) summary of the number
507 // of files per level. Uses *scratch as backing store.
508 struct LevelSummaryStorage {
509 char buffer[1000];
510 };
511 struct FileSummaryStorage {
512 char buffer[3000];
513 };
514 const char* LevelSummary(LevelSummaryStorage* scratch) const;
515 // Return a human-readable short (single-line) summary of files
516 // in a specified level. Uses *scratch as backing store.
517 const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
518
519 // Return the maximum overlapping data (in bytes) at next level for any
520 // file at a level >= 1.
1e59de90 521 uint64_t MaxNextLevelOverlappingBytes();
7c673cae
FG
522
523 // Return a human readable string that describes this version's contents.
524 std::string DebugString(bool hex = false) const;
525
526 uint64_t GetAverageValueSize() const {
527 if (accumulated_num_non_deletions_ == 0) {
528 return 0;
529 }
530 assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
531 assert(accumulated_file_size_ > 0);
532 return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
533 accumulated_file_size_ /
534 (accumulated_raw_key_size_ + accumulated_raw_value_size_);
535 }
536
537 uint64_t GetEstimatedActiveKeys() const;
538
539 double GetEstimatedCompressionRatioAtLevel(int level) const;
540
541 // re-initializes the index that is used to offset into
542 // files_by_compaction_pri_
543 // to find the next compaction candidate file.
544 void ResetNextCompactionIndex(int level) {
545 next_file_to_compact_by_size_[level] = 0;
546 }
547
1e59de90 548 const InternalKeyComparator* InternalComparator() const {
7c673cae
FG
549 return internal_comparator_;
550 }
551
552 // Returns maximum total bytes of data on a given level.
553 uint64_t MaxBytesForLevel(int level) const;
554
7c673cae
FG
555 // Returns an estimate of the amount of live data in bytes.
556 uint64_t EstimateLiveDataSize() const;
557
558 uint64_t estimated_compaction_needed_bytes() const {
559 return estimated_compaction_needed_bytes_;
560 }
561
562 void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
563 estimated_compaction_needed_bytes_ = v;
564 }
565
566 bool force_consistency_checks() const { return force_consistency_checks_; }
567
494da23a
TL
568 SequenceNumber bottommost_files_mark_threshold() const {
569 return bottommost_files_mark_threshold_;
570 }
571
11fdf7f2
TL
572 // Returns whether any key in [`smallest_key`, `largest_key`] could appear in
573 // an older L0 file than `last_l0_idx` or in a greater level than `last_level`
574 //
575 // @param last_level Level after which we check for overlap
576 // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
577 // check for overlap; otherwise, must be -1
494da23a
TL
578 bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
579 const Slice& largest_user_key,
580 int last_level, int last_l0_idx);
11fdf7f2 581
7c673cae 582 private:
1e59de90
TL
583 void ComputeCompensatedSizes();
584 void UpdateNumNonEmptyLevels();
585 void CalculateBaseBytes(const ImmutableOptions& ioptions,
586 const MutableCFOptions& options);
587 void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options,
588 const MutableCFOptions& mutable_cf_options);
589
590 void GenerateFileIndexer() {
591 file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
592 }
593
594 void GenerateLevelFilesBrief();
595 void GenerateLevel0NonOverlapping();
596 void GenerateBottommostFiles();
597 void GenerateFileLocationIndex();
598
7c673cae
FG
599 const InternalKeyComparator* internal_comparator_;
600 const Comparator* user_comparator_;
601 int num_levels_; // Number of levels
602 int num_non_empty_levels_; // Number of levels. Any level larger than it
603 // is guaranteed to be empty.
604 // Per-level max bytes
605 std::vector<uint64_t> level_max_bytes_;
606
607 // A short brief metadata of files per level
f67539c2 608 autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
7c673cae
FG
609 FileIndexer file_indexer_;
610 Arena arena_; // Used to allocate space for file_levels_
611
612 CompactionStyle compaction_style_;
613
614 // List of files per level, files in each level are arranged
615 // in increasing order of keys
616 std::vector<FileMetaData*>* files_;
617
20effc67
TL
618 // Map of all table files in version. Maps file number to (level, position on
619 // level).
1e59de90 620 using FileLocations = UnorderedMap<uint64_t, FileLocation>;
20effc67
TL
621 FileLocations file_locations_;
622
1e59de90 623 // Vector of blob files in version sorted by blob file number.
20effc67
TL
624 BlobFiles blob_files_;
625
7c673cae
FG
626 // Level that L0 data should be compacted to. All levels < base_level_ should
627 // be empty. -1 if it is not level-compaction so it's not applicable.
628 int base_level_;
629
494da23a
TL
630 double level_multiplier_;
631
7c673cae
FG
632 // A list for the same set of files that are stored in files_,
633 // but files in each level are now sorted based on file
634 // size. The file with the largest size is at the front.
635 // This vector stores the index of the file from files_.
636 std::vector<std::vector<int>> files_by_compaction_pri_;
637
638 // If true, means that files in L0 have keys with non overlapping ranges
639 bool level0_non_overlapping_;
640
641 // An index into files_by_compaction_pri_ that specifies the first
642 // file that is not yet compacted
643 std::vector<int> next_file_to_compact_by_size_;
644
645 // Only the first few entries of files_by_compaction_pri_ are sorted.
646 // There is no need to sort all the files because it is likely
647 // that on a running system, we need to look at only the first
648 // few largest files because a new version is created every few
649 // seconds/minutes (because of concurrent compactions).
650 static const size_t number_of_files_to_sort_ = 50;
651
652 // This vector contains list of files marked for compaction and also not
653 // currently being compacted. It is protected by DB mutex. It is calculated in
654 // ComputeCompactionScore()
655 autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
656
11fdf7f2
TL
657 autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
658
f67539c2
TL
659 autovector<std::pair<int, FileMetaData*>>
660 files_marked_for_periodic_compaction_;
661
11fdf7f2
TL
662 // These files are considered bottommost because none of their keys can exist
663 // at lower levels. They are not necessarily all in the same level. The marked
664 // ones are eligible for compaction because they contain duplicate key
665 // versions that are no longer protected by snapshot. These variables are
666 // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
667 // `ComputeBottommostFilesMarkedForCompaction()`.
668 autovector<std::pair<int, FileMetaData*>> bottommost_files_;
669 autovector<std::pair<int, FileMetaData*>>
670 bottommost_files_marked_for_compaction_;
671
1e59de90
TL
672 autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
673
11fdf7f2
TL
674 // Threshold for needing to mark another bottommost file. Maintain it so we
675 // can quickly check when releasing a snapshot whether more bottommost files
676 // became eligible for compaction. It's defined as the min of the max nonzero
677 // seqnums of unmarked bottommost files.
678 SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
679
680 // Monotonically increases as we release old snapshots. Zero indicates no
681 // snapshots have been released yet. When no snapshots remain we set it to the
682 // current seqnum, which needs to be protected as a snapshot can still be
683 // created that references it.
684 SequenceNumber oldest_snapshot_seqnum_ = 0;
685
7c673cae
FG
686 // Level that should be compacted next and its compaction score.
687 // Score < 1 means compaction is not strictly needed. These fields
1e59de90 688 // are initialized by ComputeCompactionScore.
7c673cae
FG
689 // The most critical level to be compacted is listed first
690 // These are used to pick the best compaction level
691 std::vector<double> compaction_score_;
692 std::vector<int> compaction_level_;
693 int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop
694 // for number of L0 files.
695
1e59de90
TL
696 // Compact cursors for round-robin compactions in each level
697 std::vector<InternalKey> compact_cursor_;
698
7c673cae
FG
699 // the following are the sampled temporary stats.
700 // the current accumulated size of sampled files.
701 uint64_t accumulated_file_size_;
702 // the current accumulated size of all raw keys based on the sampled files.
703 uint64_t accumulated_raw_key_size_;
704 // the current accumulated size of all raw keys based on the sampled files.
705 uint64_t accumulated_raw_value_size_;
706 // total number of non-deletion entries
707 uint64_t accumulated_num_non_deletions_;
708 // total number of deletion entries
709 uint64_t accumulated_num_deletions_;
710 // current number of non_deletion entries
711 uint64_t current_num_non_deletions_;
11fdf7f2 712 // current number of deletion entries
7c673cae
FG
713 uint64_t current_num_deletions_;
714 // current number of file samples
715 uint64_t current_num_samples_;
716 // Estimated bytes needed to be compacted until all levels' size is down to
717 // target sizes.
718 uint64_t estimated_compaction_needed_bytes_;
719
720 bool finalized_;
721
722 // If set to true, we will run consistency checks even if RocksDB
723 // is compiled in release mode
724 bool force_consistency_checks_;
725
726 friend class Version;
727 friend class VersionSet;
7c673cae
FG
728};
729
1e59de90
TL
730struct ObsoleteFileInfo {
731 FileMetaData* metadata;
732 std::string path;
733 // If true, the FileMataData should be destroyed but the file should
734 // not be deleted. This is because another FileMetaData still references
735 // the file, usually because the file is trivial moved so two FileMetadata
736 // is managing the file.
737 bool only_delete_metadata = false;
738
739 ObsoleteFileInfo() noexcept
740 : metadata(nullptr), only_delete_metadata(false) {}
741 ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
742 std::shared_ptr<CacheReservationManager>
743 file_metadata_cache_res_mgr_arg = nullptr)
744 : metadata(f),
745 path(file_path),
746 only_delete_metadata(false),
747 file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
748
749 ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
750 ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
751
752 ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() {
753 *this = std::move(rhs);
754 }
755
756 ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
757 path = std::move(rhs.path);
758 metadata = rhs.metadata;
759 rhs.metadata = nullptr;
760 file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
761 rhs.file_metadata_cache_res_mgr = nullptr;
762
763 return *this;
764 }
765 void DeleteMetadata() {
766 if (file_metadata_cache_res_mgr) {
767 Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
768 metadata->ApproximateMemoryUsage(), false /* increase */);
769 s.PermitUncheckedError();
770 }
771 delete metadata;
772 metadata = nullptr;
773 }
774
775 private:
776 std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
777};
778
779class ObsoleteBlobFileInfo {
780 public:
781 ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path)
782 : blob_file_number_(blob_file_number), path_(std::move(path)) {}
783
784 uint64_t GetBlobFileNumber() const { return blob_file_number_; }
785 const std::string& GetPath() const { return path_; }
786
787 private:
788 uint64_t blob_file_number_;
789 std::string path_;
790};
791
f67539c2 792using MultiGetRange = MultiGetContext::Range;
20effc67
TL
793// A column family's version consists of the table and blob files owned by
794// the column family at a certain point in time.
7c673cae
FG
795class Version {
796 public:
797 // Append to *iters a sequence of iterators that will
798 // yield the contents of this Version when merged together.
20effc67
TL
799 // @param read_options Must outlive any iterator built by
800 // `merger_iter_builder`.
20effc67
TL
801 void AddIterators(const ReadOptions& read_options,
802 const FileOptions& soptions,
7c673cae 803 MergeIteratorBuilder* merger_iter_builder,
20effc67 804 bool allow_unprepared_value);
7c673cae 805
20effc67
TL
806 // @param read_options Must outlive any iterator built by
807 // `merger_iter_builder`.
808 void AddIteratorsForLevel(const ReadOptions& read_options,
809 const FileOptions& soptions,
7c673cae 810 MergeIteratorBuilder* merger_iter_builder,
1e59de90 811 int level, bool allow_unprepared_value);
7c673cae 812
f67539c2 813 Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
11fdf7f2 814 const Slice& smallest_user_key,
1e59de90
TL
815 const Slice& largest_user_key, int level,
816 bool* overlap);
11fdf7f2 817
f67539c2
TL
818 // Lookup the value for key or get all merge operands for key.
819 // If do_merge = true (default) then lookup value for key.
820 // Behavior if do_merge = true:
821 // If found, store it in *value and
822 // return OK. Else return a non-OK status.
823 // Uses *operands to store merge_operator operations to apply later.
7c673cae 824 //
f67539c2
TL
825 // If the ReadOptions.read_tier is set to do a read-only fetch, then
826 // *value_found will be set to false if it cannot be determined whether
827 // this value exists without doing IO.
7c673cae 828 //
f67539c2 829 // If the key is Deleted, *status will be set to NotFound and
7c673cae 830 // *key_exists will be set to true.
f67539c2 831 // If no key was found, *status will be set to NotFound and
7c673cae 832 // *key_exists will be set to false.
f67539c2
TL
833 // If seq is non-null, *seq will be set to the sequence number found
834 // for the key if a key was found.
835 // Behavior if do_merge = false
836 // If the key has any merge operands then store them in
837 // merge_context.operands_list and don't merge the operands
7c673cae 838 // REQUIRES: lock is not held
1e59de90 839 // REQUIRES: pinned_iters_mgr != nullptr
7c673cae 840 void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
1e59de90
TL
841 PinnableWideColumns* columns, std::string* timestamp, Status* status,
842 MergeContext* merge_context,
494da23a 843 SequenceNumber* max_covering_tombstone_seq,
1e59de90 844 PinnedIteratorsManager* pinned_iters_mgr,
494da23a
TL
845 bool* value_found = nullptr, bool* key_exists = nullptr,
846 SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
f67539c2
TL
847 bool* is_blob = nullptr, bool do_merge = true);
848
849 void MultiGet(const ReadOptions&, MultiGetRange* range,
1e59de90 850 ReadCallback* callback = nullptr);
7c673cae 851
1e59de90
TL
852 // Interprets blob_index_slice as a blob reference, and (assuming the
853 // corresponding blob file is part of this Version) retrieves the blob and
854 // saves it in *value.
855 // REQUIRES: blob_index_slice stores an encoded blob reference
856 Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
857 const Slice& blob_index_slice,
858 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
859 uint64_t* bytes_read) const;
860
861 // Retrieves a blob using a blob reference and saves it in *value,
862 // assuming the corresponding blob file is part of this Version.
863 Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
864 const BlobIndex& blob_index,
865 FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
866 uint64_t* bytes_read) const;
867
868 using BlobReadContext =
869 std::pair<BlobIndex, std::reference_wrapper<const KeyContext>>;
870 using BlobReadContexts = std::vector<BlobReadContext>;
871 void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range,
872 std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs);
873
874 // Loads some stats information from files (if update_stats is set) and
875 // populates derived data structures. Call without mutex held. It needs to be
876 // called before appending the version to the version set.
877 void PrepareAppend(const MutableCFOptions& mutable_cf_options,
878 bool update_stats);
7c673cae
FG
879
880 // Reference count management (so Versions do not disappear out from
881 // under live iterators)
882 void Ref();
883 // Decrease reference count. Delete the object if no reference left
884 // and return true. Otherwise, return false.
885 bool Unref();
886
20effc67
TL
887 // Add all files listed in the current version to *live_table_files and
888 // *live_blob_files.
889 void AddLiveFiles(std::vector<uint64_t>* live_table_files,
890 std::vector<uint64_t>* live_blob_files) const;
7c673cae 891
1e59de90
TL
892 // Remove live files that are in the delete candidate lists.
893 void RemoveLiveFiles(
894 std::vector<ObsoleteFileInfo>& sst_delete_candidates,
895 std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
896
7c673cae 897 // Return a human readable string that describes this version's contents.
11fdf7f2 898 std::string DebugString(bool hex = false, bool print_stats = false) const;
7c673cae 899
11fdf7f2 900 // Returns the version number of this version
7c673cae
FG
901 uint64_t GetVersionNumber() const { return version_number_; }
902
903 // REQUIRES: lock is held
904 // On success, "tp" will contains the table properties of the file
905 // specified in "file_meta". If the file name of "file_meta" is
11fdf7f2 906 // known ahead, passing it by a non-null "fname" can save a
7c673cae
FG
907 // file-name conversion.
908 Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
909 const FileMetaData* file_meta,
910 const std::string* fname = nullptr) const;
911
912 // REQUIRES: lock is held
913 // On success, *props will be populated with all SSTables' table properties.
914 // The keys of `props` are the sst file name, the values of `props` are the
494da23a 915 // tables' properties, represented as std::shared_ptr.
7c673cae
FG
916 Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
917 Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
918 Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
919 TablePropertiesCollection* props) const;
920
f67539c2
TL
921 // Print summary of range delete tombstones in SST files into out_str,
922 // with maximum max_entries_to_print entries printed out.
923 Status TablesRangeTombstoneSummary(int max_entries_to_print,
924 std::string* out_str);
925
7c673cae 926 // REQUIRES: lock is held
11fdf7f2 927 // On success, "tp" will contains the aggregated table property among
7c673cae
FG
928 // the table properties of all sst files in this version.
929 Status GetAggregatedTableProperties(
930 std::shared_ptr<const TableProperties>* tp, int level = -1);
931
932 uint64_t GetEstimatedActiveKeys() {
933 return storage_info_.GetEstimatedActiveKeys();
934 }
935
936 size_t GetMemoryUsageByTableReaders();
937
938 ColumnFamilyData* cfd() const { return cfd_; }
939
1e59de90
TL
940 // Return the next Version in the linked list.
941 Version* Next() const { return next_; }
7c673cae
FG
942
943 int TEST_refs() const { return refs_; }
944
945 VersionStorageInfo* storage_info() { return &storage_info_; }
1e59de90 946 const VersionStorageInfo* storage_info() const { return &storage_info_; }
7c673cae
FG
947
948 VersionSet* version_set() { return vset_; }
949
950 void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
951
11fdf7f2
TL
952 uint64_t GetSstFilesSize();
953
f67539c2
TL
954 // Retrieves the file_creation_time of the oldest file in the DB.
955 // Prerequisite for this API is max_open_files = -1
956 void GetCreationTimeOfOldestFile(uint64_t* creation_time);
957
958 const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
11fdf7f2 959
1e59de90
TL
960 InternalIterator* TEST_GetLevelIterator(
961 const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
962 int level, bool allow_unprepared_value);
963
7c673cae
FG
964 private:
965 Env* env_;
1e59de90
TL
966 SystemClock* clock_;
967
494da23a 968 friend class ReactiveVersionSet;
7c673cae 969 friend class VersionSet;
20effc67
TL
970 friend class VersionEditHandler;
971 friend class VersionEditHandlerPointInTime;
7c673cae
FG
972
973 const InternalKeyComparator* internal_comparator() const {
974 return storage_info_.internal_comparator_;
975 }
976 const Comparator* user_comparator() const {
977 return storage_info_.user_comparator_;
978 }
979
7c673cae
FG
980 // Returns true if the filter blocks in the specified level will not be
981 // checked during read operations. In certain cases (trivial move or preload),
982 // the filter block may already be cached, but we still do not access it such
983 // that it eventually expires from the cache.
984 bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
985
986 // The helper function of UpdateAccumulatedStats, which may fill the missing
11fdf7f2 987 // fields of file_meta from its associated TableProperties.
7c673cae
FG
988 // Returns true if it does initialize FileMetaData.
989 bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
990
991 // Update the accumulated stats associated with the current version.
992 // This accumulated stats will be used in compaction.
1e59de90
TL
993 void UpdateAccumulatedStats();
994
995 DECLARE_SYNC_AND_ASYNC(
996 /* ret_type */ Status, /* func_name */ MultiGetFromSST,
997 const ReadOptions& read_options, MultiGetRange file_range,
998 int hit_file_level, bool skip_filters, bool skip_range_deletions,
999 FdWithKeyRange* f,
1000 std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
1001 Cache::Handle* table_handle, uint64_t& num_filter_read,
1002 uint64_t& num_index_read, uint64_t& num_sst_read);
1003
1004#ifdef USE_COROUTINES
1005 // MultiGet using async IO to read data blocks from SST files in parallel
1006 // within and across levels
1007 Status MultiGetAsync(
1008 const ReadOptions& options, MultiGetRange* range,
1009 std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs);
1010
1011 // A helper function to lookup a batch of keys in a single level. It will
1012 // queue coroutine tasks to mget_tasks. It may also split the input batch
1013 // by creating a new batch with keys definitely not in this level and
1014 // enqueuing it to to_process.
1015 Status ProcessBatch(
1016 const ReadOptions& read_options, FilePickerMultiGet* batch,
1017 std::vector<folly::coro::Task<Status>>& mget_tasks,
1018 std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
1019 autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
1020 std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
1021 std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
1022 mget_stats);
1023#endif
7c673cae
FG
1024
1025 ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
1026 Logger* info_log_;
1027 Statistics* db_statistics_;
1028 TableCache* table_cache_;
1e59de90 1029 BlobSource* blob_source_;
7c673cae
FG
1030 const MergeOperator* merge_operator_;
1031
1032 VersionStorageInfo storage_info_;
1e59de90
TL
1033 VersionSet* vset_; // VersionSet to which this Version belongs
1034 Version* next_; // Next version in linked list
1035 Version* prev_; // Previous version in linked list
1036 int refs_; // Number of live refs to this version
f67539c2 1037 const FileOptions file_options_;
11fdf7f2 1038 const MutableCFOptions mutable_cf_options_;
20effc67
TL
1039 // Cached value to avoid recomputing it on every read.
1040 const size_t max_file_size_for_l0_meta_pin_;
7c673cae
FG
1041
1042 // A version number that uniquely represents this version. This is
1043 // used for debugging and logging purposes only.
1044 uint64_t version_number_;
20effc67 1045 std::shared_ptr<IOTracer> io_tracer_;
7c673cae 1046
f67539c2 1047 Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
20effc67
TL
1048 MutableCFOptions mutable_cf_options,
1049 const std::shared_ptr<IOTracer>& io_tracer,
1050 uint64_t version_number = 0);
7c673cae
FG
1051
1052 ~Version();
1053
1054 // No copying allowed
f67539c2
TL
1055 Version(const Version&) = delete;
1056 void operator=(const Version&) = delete;
7c673cae
FG
1057};
1058
11fdf7f2 1059class BaseReferencedVersionBuilder;
11fdf7f2 1060
f67539c2
TL
1061class AtomicGroupReadBuffer {
1062 public:
1e59de90 1063 AtomicGroupReadBuffer() = default;
f67539c2
TL
1064 Status AddEdit(VersionEdit* edit);
1065 void Clear();
1066 bool IsFull() const;
1067 bool IsEmpty() const;
1068
1069 uint64_t TEST_read_edits_in_atomic_group() const {
1070 return read_edits_in_atomic_group_;
1071 }
1072 std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
1073
1074 private:
1075 uint64_t read_edits_in_atomic_group_ = 0;
1076 std::vector<VersionEdit> replay_buffer_;
1077};
1078
1079// VersionSet is the collection of versions of all the column families of the
1080// database. Each database owns one VersionSet. A VersionSet has access to all
1081// column families via ColumnFamilySet, i.e. set of the column families.
7c673cae
FG
1082class VersionSet {
1083 public:
1084 VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
f67539c2 1085 const FileOptions& file_options, Cache* table_cache,
7c673cae 1086 WriteBufferManager* write_buffer_manager,
f67539c2 1087 WriteController* write_controller,
20effc67 1088 BlockCacheTracer* const block_cache_tracer,
1e59de90
TL
1089 const std::shared_ptr<IOTracer>& io_tracer,
1090 const std::string& db_id, const std::string& db_session_id);
f67539c2
TL
1091 // No copying allowed
1092 VersionSet(const VersionSet&) = delete;
1093 void operator=(const VersionSet&) = delete;
1094
494da23a 1095 virtual ~VersionSet();
7c673cae 1096
20effc67
TL
1097 Status LogAndApplyToDefaultColumnFamily(
1098 VersionEdit* edit, InstrumentedMutex* mu,
1e59de90 1099 FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
20effc67
TL
1100 const ColumnFamilyOptions* column_family_options = nullptr) {
1101 ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault();
1102 const MutableCFOptions* cf_options =
1103 default_cf->GetLatestMutableCFOptions();
1e59de90
TL
1104 return LogAndApply(default_cf, *cf_options, edit, mu,
1105 dir_contains_current_file, new_descriptor_log,
1106 column_family_options);
20effc67
TL
1107 }
1108
7c673cae
FG
1109 // Apply *edit to the current version to form a new descriptor that
1110 // is both saved to persistent state and installed as the new
1111 // current version. Will release *mu while actually writing to the file.
1112 // column_family_options has to be set if edit is column family add
1113 // REQUIRES: *mu is held on entry.
1114 // REQUIRES: no other thread concurrently calls LogAndApply()
1115 Status LogAndApply(
1116 ColumnFamilyData* column_family_data,
1117 const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
1e59de90 1118 InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
7c673cae
FG
1119 bool new_descriptor_log = false,
1120 const ColumnFamilyOptions* column_family_options = nullptr) {
494da23a
TL
1121 autovector<ColumnFamilyData*> cfds;
1122 cfds.emplace_back(column_family_data);
1123 autovector<const MutableCFOptions*> mutable_cf_options_list;
1124 mutable_cf_options_list.emplace_back(&mutable_cf_options);
1125 autovector<autovector<VersionEdit*>> edit_lists;
1126 autovector<VersionEdit*> edit_list;
1127 edit_list.emplace_back(edit);
1128 edit_lists.emplace_back(edit_list);
11fdf7f2 1129 return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
1e59de90
TL
1130 dir_contains_current_file, new_descriptor_log,
1131 column_family_options);
7c673cae
FG
1132 }
1133 // The batch version. If edit_list.size() > 1, caller must ensure that
1134 // no edit in the list column family add or drop
1135 Status LogAndApply(
1136 ColumnFamilyData* column_family_data,
1137 const MutableCFOptions& mutable_cf_options,
1138 const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
1e59de90 1139 FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
20effc67
TL
1140 const ColumnFamilyOptions* column_family_options = nullptr,
1141 const std::function<void(const Status&)>& manifest_wcb = {}) {
494da23a
TL
1142 autovector<ColumnFamilyData*> cfds;
1143 cfds.emplace_back(column_family_data);
1144 autovector<const MutableCFOptions*> mutable_cf_options_list;
1145 mutable_cf_options_list.emplace_back(&mutable_cf_options);
1146 autovector<autovector<VersionEdit*>> edit_lists;
1147 edit_lists.emplace_back(edit_list);
11fdf7f2 1148 return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
1e59de90
TL
1149 dir_contains_current_file, new_descriptor_log,
1150 column_family_options, {manifest_wcb});
11fdf7f2
TL
1151 }
1152
1153 // The across-multi-cf batch version. If edit_lists contain more than
1154 // 1 version edits, caller must ensure that no edit in the []list is column
1155 // family manipulation.
494da23a
TL
1156 virtual Status LogAndApply(
1157 const autovector<ColumnFamilyData*>& cfds,
1158 const autovector<const MutableCFOptions*>& mutable_cf_options_list,
1159 const autovector<autovector<VersionEdit*>>& edit_lists,
1e59de90 1160 InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
494da23a 1161 bool new_descriptor_log = false,
20effc67
TL
1162 const ColumnFamilyOptions* new_cf_options = nullptr,
1163 const std::vector<std::function<void(const Status&)>>& manifest_wcbs =
1164 {});
494da23a 1165
f67539c2
TL
1166 static Status GetCurrentManifestPath(const std::string& dbname,
1167 FileSystem* fs,
1168 std::string* manifest_filename,
1169 uint64_t* manifest_file_number);
1e59de90 1170 void WakeUpWaitingManifestWriters();
7c673cae
FG
1171
1172 // Recover the last saved descriptor from persistent storage.
1173 // If read_only == true, Recover() will not complain if some column families
1174 // are not opened
1175 Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
1e59de90
TL
1176 bool read_only = false, std::string* db_id = nullptr,
1177 bool no_error_if_files_missing = false);
7c673cae 1178
20effc67
TL
1179 Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
1180 bool read_only,
1181 const std::vector<std::string>& files_in_dbname,
1182 std::string* db_id, bool* has_missing_table_file);
1183
1184 // Try to recover the version set to the most recent consistent state
1185 // recorded in the specified manifest.
1186 Status TryRecoverFromOneManifest(
1187 const std::string& manifest_path,
1188 const std::vector<ColumnFamilyDescriptor>& column_families,
1189 bool read_only, std::string* db_id, bool* has_missing_table_file);
1190
7c673cae
FG
1191 // Reads a manifest file and returns a list of column families in
1192 // column_families.
1193 static Status ListColumnFamilies(std::vector<std::string>* column_families,
f67539c2 1194 const std::string& dbname, FileSystem* fs);
1e59de90
TL
1195 static Status ListColumnFamiliesFromManifest(
1196 const std::string& manifest_path, FileSystem* fs,
1197 std::vector<std::string>* column_families);
7c673cae
FG
1198
1199#ifndef ROCKSDB_LITE
1200 // Try to reduce the number of levels. This call is valid when
1201 // only one level from the new max level to the old
1202 // max level containing files.
1203 // The call is static, since number of levels is immutable during
1204 // the lifetime of a RocksDB instance. It reduces number of levels
1205 // in a DB by applying changes to manifest.
1206 // For example, a db currently has 7 levels [0-6], and a call to
1207 // to reduce to 5 [0-4] can only be executed when only one level
1208 // among [4-6] contains files.
1209 static Status ReduceNumberOfLevels(const std::string& dbname,
1210 const Options* options,
f67539c2 1211 const FileOptions& file_options,
7c673cae
FG
1212 int new_levels);
1213
f67539c2
TL
1214 // Get the checksum information of all live files
1215 Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
1216
7c673cae
FG
1217 // printf contents (for debugging)
1218 Status DumpManifest(Options& options, std::string& manifestFileName,
1219 bool verbose, bool hex = false, bool json = false);
1220
1221#endif // ROCKSDB_LITE
1222
1e59de90
TL
1223 const std::string& DbSessionId() const { return db_session_id_; }
1224
7c673cae
FG
1225 // Return the current manifest file number
1226 uint64_t manifest_file_number() const { return manifest_file_number_; }
1227
1228 uint64_t options_file_number() const { return options_file_number_; }
1229
1230 uint64_t pending_manifest_file_number() const {
1231 return pending_manifest_file_number_;
1232 }
1233
1234 uint64_t current_next_file_number() const { return next_file_number_.load(); }
1235
1e59de90
TL
1236 uint64_t min_log_number_to_keep() const {
1237 return min_log_number_to_keep_.load();
11fdf7f2
TL
1238 }
1239
7c673cae
FG
1240 // Allocate and return a new file number
1241 uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
1242
11fdf7f2
TL
1243 // Fetch And Add n new file number
1244 uint64_t FetchAddFileNumber(uint64_t n) {
1245 return next_file_number_.fetch_add(n);
1246 }
1247
7c673cae
FG
1248 // Return the last sequence number.
1249 uint64_t LastSequence() const {
1250 return last_sequence_.load(std::memory_order_acquire);
1251 }
1252
11fdf7f2
TL
1253 // Note: memory_order_acquire must be sufficient.
1254 uint64_t LastAllocatedSequence() const {
1255 return last_allocated_sequence_.load(std::memory_order_seq_cst);
1256 }
1257
1258 // Note: memory_order_acquire must be sufficient.
1259 uint64_t LastPublishedSequence() const {
1260 return last_published_sequence_.load(std::memory_order_seq_cst);
1261 }
1262
7c673cae
FG
1263 // Set the last sequence number to s.
1264 void SetLastSequence(uint64_t s) {
1265 assert(s >= last_sequence_);
11fdf7f2
TL
1266 // Last visible sequence must always be less than last written seq
1267 assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
7c673cae
FG
1268 last_sequence_.store(s, std::memory_order_release);
1269 }
1270
11fdf7f2
TL
1271 // Note: memory_order_release must be sufficient
1272 void SetLastPublishedSequence(uint64_t s) {
1273 assert(s >= last_published_sequence_);
1274 last_published_sequence_.store(s, std::memory_order_seq_cst);
1275 }
1276
1277 // Note: memory_order_release must be sufficient
1278 void SetLastAllocatedSequence(uint64_t s) {
1279 assert(s >= last_allocated_sequence_);
1280 last_allocated_sequence_.store(s, std::memory_order_seq_cst);
1281 }
1282
1283 // Note: memory_order_release must be sufficient
1284 uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
1285 return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
1286 }
1287
7c673cae 1288 // Mark the specified file number as used.
11fdf7f2
TL
1289 // REQUIRED: this is only called during single-threaded recovery or repair.
1290 void MarkFileNumberUsed(uint64_t number);
1291
1292 // Mark the specified log number as deleted
1293 // REQUIRED: this is only called during single-threaded recovery or repair, or
1294 // from ::LogAndApply where the global mutex is held.
1e59de90 1295 void MarkMinLogNumberToKeep(uint64_t number);
7c673cae
FG
1296
1297 // Return the log file number for the log file that is currently
1298 // being compacted, or zero if there is no such log file.
1299 uint64_t prev_log_number() const { return prev_log_number_; }
1300
11fdf7f2
TL
1301 // Returns the minimum log number which still has data not flushed to any SST
1302 // file.
1303 // In non-2PC mode, all the log numbers smaller than this number can be safely
1e59de90
TL
1304 // deleted, although we still use `min_log_number_to_keep_` to determine when
1305 // to delete a WAL file.
11fdf7f2
TL
1306 uint64_t MinLogNumberWithUnflushedData() const {
1307 return PreComputeMinLogNumberWithUnflushedData(nullptr);
1308 }
1e59de90
TL
1309
1310 // Returns the minimum log number which still has data not flushed to any SST
1311 // file.
1312 // Empty column families' log number is considered to be
1313 // new_log_number_for_empty_cf.
1314 uint64_t PreComputeMinLogNumberWithUnflushedData(
1315 uint64_t new_log_number_for_empty_cf) const {
1316 uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
1317 for (auto cfd : *column_family_set_) {
1318 // It's safe to ignore dropped column families here:
1319 // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
1320 uint64_t num =
1321 cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber();
1322 if (min_log_num > num && !cfd->IsDropped()) {
1323 min_log_num = num;
1324 }
1325 }
1326 return min_log_num;
1327 }
11fdf7f2
TL
1328 // Returns the minimum log number which still has data not flushed to any SST
1329 // file, except data from `cfd_to_skip`.
1330 uint64_t PreComputeMinLogNumberWithUnflushedData(
1331 const ColumnFamilyData* cfd_to_skip) const {
7c673cae
FG
1332 uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
1333 for (auto cfd : *column_family_set_) {
11fdf7f2
TL
1334 if (cfd == cfd_to_skip) {
1335 continue;
1336 }
7c673cae
FG
1337 // It's safe to ignore dropped column families here:
1338 // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
1339 if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
1340 min_log_num = cfd->GetLogNumber();
1341 }
1342 }
1343 return min_log_num;
1344 }
1e59de90
TL
1345 // Returns the minimum log number which still has data not flushed to any SST
1346 // file, except data from `cfds_to_skip`.
1347 uint64_t PreComputeMinLogNumberWithUnflushedData(
1348 const std::unordered_set<const ColumnFamilyData*>& cfds_to_skip) const {
1349 uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
1350 for (auto cfd : *column_family_set_) {
1351 if (cfds_to_skip.count(cfd)) {
1352 continue;
1353 }
1354 // It's safe to ignore dropped column families here:
1355 // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
1356 if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
1357 min_log_num = cfd->GetLogNumber();
1358 }
1359 }
1360 return min_log_num;
1361 }
7c673cae
FG
1362
1363 // Create an iterator that reads over the compaction inputs for "*c".
1364 // The caller should delete the iterator when no longer needed.
20effc67 1365 // @param read_options Must outlive the returned iterator.
1e59de90 1366 // @param start, end indicates compaction range
11fdf7f2 1367 InternalIterator* MakeInputIterator(
20effc67
TL
1368 const ReadOptions& read_options, const Compaction* c,
1369 RangeDelAggregator* range_del_agg,
1e59de90
TL
1370 const FileOptions& file_options_compactions,
1371 const std::optional<const Slice>& start,
1372 const std::optional<const Slice>& end);
7c673cae 1373
20effc67
TL
1374 // Add all files listed in any live version to *live_table_files and
1375 // *live_blob_files. Note that these lists may contain duplicates.
1376 void AddLiveFiles(std::vector<uint64_t>* live_table_files,
1377 std::vector<uint64_t>* live_blob_files) const;
7c673cae 1378
1e59de90
TL
1379 // Remove live files that are in the delete candidate lists.
1380 void RemoveLiveFiles(
1381 std::vector<ObsoleteFileInfo>& sst_delete_candidates,
1382 std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
1383
7c673cae 1384 // Return the approximate size of data to be scanned for range [start, end)
f67539c2 1385 // in levels [start_level, end_level). If end_level == -1 it will search
7c673cae 1386 // through all non-empty levels
f67539c2
TL
1387 uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
1388 const Slice& start, const Slice& end,
1389 int start_level, int end_level,
1390 TableReaderCaller caller);
7c673cae
FG
1391
1392 // Return the size of the current manifest file
1393 uint64_t manifest_file_size() const { return manifest_file_size_; }
1394
7c673cae
FG
1395 Status GetMetadataForFile(uint64_t number, int* filelevel,
1396 FileMetaData** metadata, ColumnFamilyData** cfd);
1397
1398 // This function doesn't support leveldb SST filenames
1e59de90 1399 void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
7c673cae 1400
20effc67
TL
1401 void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
1402 assert(table_cache_);
1403
1404 table_cache_->Erase(GetSlice(&blob_file_number));
1405
1406 obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
1407 }
1408
11fdf7f2 1409 void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
20effc67 1410 std::vector<ObsoleteBlobFileInfo>* blob_files,
7c673cae
FG
1411 std::vector<std::string>* manifest_filenames,
1412 uint64_t min_pending_output);
1413
1414 ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
1e59de90
TL
1415 RefedColumnFamilySet GetRefedColumnFamilySet() {
1416 return RefedColumnFamilySet(GetColumnFamilySet());
1417 }
1418
f67539c2
TL
1419 const FileOptions& file_options() { return file_options_; }
1420 void ChangeFileOptions(const MutableDBOptions& new_options) {
1421 file_options_.writable_file_max_buffer_size =
11fdf7f2
TL
1422 new_options.writable_file_max_buffer_size;
1423 }
1424
1425 const ImmutableDBOptions* db_options() const { return db_options_; }
7c673cae
FG
1426
1427 static uint64_t GetNumLiveVersions(Version* dummy_versions);
1428
1429 static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
1430
1e59de90
TL
1431 static uint64_t GetTotalBlobFileSize(Version* dummy_versions);
1432
20effc67
TL
1433 // Get the IO Status returned by written Manifest.
1434 const IOStatus& io_status() const { return io_status_; }
1435
1436 // The returned WalSet needs to be accessed with DB mutex held.
1437 const WalSet& GetWalSet() const { return wals_; }
1438
1439 void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) {
1440 assert(cfd);
1441
1442 const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
1443 Version* const version =
1444 new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_);
1445
1446 constexpr bool update_stats = false;
1e59de90 1447 version->PrepareAppend(mutable_cf_options, update_stats);
20effc67
TL
1448 AppendVersion(cfd, version);
1449 }
1450
494da23a 1451 protected:
20effc67 1452 using VersionBuilderMap =
1e59de90 1453 UnorderedMap<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>;
20effc67 1454
7c673cae
FG
1455 struct ManifestWriter;
1456
1457 friend class Version;
20effc67
TL
1458 friend class VersionEditHandler;
1459 friend class VersionEditHandlerPointInTime;
1460 friend class DumpManifestHandler;
7c673cae 1461 friend class DBImpl;
494da23a 1462 friend class DBImplReadOnly;
7c673cae
FG
1463
1464 struct LogReporter : public log::Reader::Reporter {
1465 Status* status;
11fdf7f2 1466 virtual void Corruption(size_t /*bytes*/, const Status& s) override {
20effc67
TL
1467 if (status->ok()) {
1468 *status = s;
1469 }
7c673cae
FG
1470 }
1471 };
1472
20effc67
TL
1473 void Reset();
1474
f67539c2
TL
1475 // Returns approximated offset of a key in a file for a given version.
1476 uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
1477 const Slice& key, TableReaderCaller caller);
7c673cae 1478
f67539c2
TL
1479 // Returns approximated data size between start and end keys in a file
1480 // for a given version.
7c673cae 1481 uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
f67539c2
TL
1482 const Slice& start, const Slice& end,
1483 TableReaderCaller caller);
1484
1485 struct MutableCFState {
1486 uint64_t log_number;
1e59de90
TL
1487 std::string full_history_ts_low;
1488
1489 explicit MutableCFState() = default;
1490 explicit MutableCFState(uint64_t _log_number, std::string ts_low)
1491 : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
f67539c2 1492 };
7c673cae
FG
1493
1494 // Save current contents to *log
f67539c2
TL
1495 Status WriteCurrentStateToManifest(
1496 const std::unordered_map<uint32_t, MutableCFState>& curr_state,
20effc67 1497 const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s);
7c673cae
FG
1498
1499 void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
1500
1501 ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
20effc67 1502 const VersionEdit* edit);
7c673cae 1503
1e59de90
TL
1504 Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
1505 int level, const FileMetaData& meta);
20effc67
TL
1506
1507 // Protected by DB mutex.
1508 WalSet wals_;
7c673cae 1509
20effc67
TL
1510 std::unique_ptr<ColumnFamilySet> column_family_set_;
1511 Cache* table_cache_;
7c673cae 1512 Env* const env_;
20effc67 1513 FileSystemPtr const fs_;
1e59de90 1514 SystemClock* const clock_;
7c673cae 1515 const std::string dbname_;
f67539c2 1516 std::string db_id_;
7c673cae
FG
1517 const ImmutableDBOptions* const db_options_;
1518 std::atomic<uint64_t> next_file_number_;
1e59de90
TL
1519 // Any WAL number smaller than this should be ignored during recovery,
1520 // and is qualified for being deleted.
1521 std::atomic<uint64_t> min_log_number_to_keep_ = {0};
7c673cae
FG
1522 uint64_t manifest_file_number_;
1523 uint64_t options_file_number_;
1e59de90 1524 uint64_t options_file_size_;
7c673cae 1525 uint64_t pending_manifest_file_number_;
11fdf7f2
TL
1526 // The last seq visible to reads. It normally indicates the last sequence in
1527 // the memtable but when using two write queues it could also indicate the
1528 // last sequence in the WAL visible to reads.
7c673cae 1529 std::atomic<uint64_t> last_sequence_;
1e59de90
TL
1530 // The last sequence number of data committed to the descriptor (manifest
1531 // file).
1532 SequenceNumber descriptor_last_sequence_ = 0;
11fdf7f2
TL
1533 // The last seq that is already allocated. It is applicable only when we have
1534 // two write queues. In that case seq might or might not have appreated in
1535 // memtable but it is expected to appear in the WAL.
1536 // We have last_sequence <= last_allocated_sequence_
1537 std::atomic<uint64_t> last_allocated_sequence_;
1538 // The last allocated sequence that is also published to the readers. This is
1539 // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
1540 // last_sequence_ also indicates the last published seq.
1541 // We have last_sequence <= last_published_sequence_ <=
1542 // last_allocated_sequence_
1543 std::atomic<uint64_t> last_published_sequence_;
7c673cae
FG
1544 uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
1545
1546 // Opened lazily
494da23a 1547 std::unique_ptr<log::Writer> descriptor_log_;
7c673cae
FG
1548
1549 // generates a increasing version number for every new version
1550 uint64_t current_version_number_;
1551
1552 // Queue of writers to the manifest file
1553 std::deque<ManifestWriter*> manifest_writers_;
1554
1555 // Current size of manifest file
1556 uint64_t manifest_file_size_;
1557
11fdf7f2 1558 std::vector<ObsoleteFileInfo> obsolete_files_;
20effc67 1559 std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_;
7c673cae
FG
1560 std::vector<std::string> obsolete_manifests_;
1561
1562 // env options for all reads and writes except compactions
f67539c2 1563 FileOptions file_options_;
7c673cae 1564
f67539c2 1565 BlockCacheTracer* const block_cache_tracer_;
7c673cae 1566
20effc67
TL
1567 // Store the IO status when Manifest is written
1568 IOStatus io_status_;
1569
1570 std::shared_ptr<IOTracer> io_tracer_;
1571
1e59de90
TL
1572 std::string db_session_id_;
1573
f67539c2 1574 private:
494da23a
TL
1575 // REQUIRES db mutex at beginning. may release and re-acquire db mutex
1576 Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
1e59de90
TL
1577 InstrumentedMutex* mu,
1578 FSDirectory* dir_contains_current_file,
494da23a
TL
1579 bool new_descriptor_log,
1580 const ColumnFamilyOptions* new_cf_options);
1581
1e59de90
TL
1582 void LogAndApplyCFHelper(VersionEdit* edit,
1583 SequenceNumber* max_last_sequence);
f67539c2 1584 Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
1e59de90
TL
1585 VersionEdit* edit, SequenceNumber* max_last_sequence,
1586 InstrumentedMutex* mu);
7c673cae
FG
1587};
1588
f67539c2
TL
1589// ReactiveVersionSet represents a collection of versions of the column
1590// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
1591// need to replay the MANIFEST (description log in older terms) in order to
1592// reconstruct and install versions.
494da23a
TL
1593class ReactiveVersionSet : public VersionSet {
1594 public:
1595 ReactiveVersionSet(const std::string& dbname,
1596 const ImmutableDBOptions* _db_options,
f67539c2 1597 const FileOptions& _file_options, Cache* table_cache,
494da23a 1598 WriteBufferManager* write_buffer_manager,
20effc67
TL
1599 WriteController* write_controller,
1600 const std::shared_ptr<IOTracer>& io_tracer);
494da23a
TL
1601
1602 ~ReactiveVersionSet() override;
1603
1604 Status ReadAndApply(
1605 InstrumentedMutex* mu,
1606 std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
1e59de90 1607 Status* manifest_read_status,
494da23a
TL
1608 std::unordered_set<ColumnFamilyData*>* cfds_changed);
1609
1610 Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
1611 std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
1612 std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
1613 std::unique_ptr<Status>* manifest_reader_status);
1e59de90
TL
1614#ifndef NDEBUG
1615 uint64_t TEST_read_edits_in_atomic_group() const;
1616#endif //! NDEBUG
494da23a 1617
1e59de90 1618 std::vector<VersionEdit>& replay_buffer();
f67539c2 1619
494da23a 1620 protected:
494da23a
TL
1621 // REQUIRES db mutex
1622 Status ApplyOneVersionEditToBuilder(
f67539c2
TL
1623 VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
1624 VersionEdit* version_edit);
494da23a
TL
1625
1626 Status MaybeSwitchManifest(
1627 log::Reader::Reporter* reporter,
1628 std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
1629
1630 private:
1e59de90 1631 std::unique_ptr<ManifestTailer> manifest_tailer_;
494da23a
TL
1632
1633 using VersionSet::LogAndApply;
1634 using VersionSet::Recover;
1635
1636 Status LogAndApply(
1637 const autovector<ColumnFamilyData*>& /*cfds*/,
1638 const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
1639 const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
1e59de90 1640 InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/,
20effc67
TL
1641 bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/,
1642 const std::vector<std::function<void(const Status&)>>& /*manifest_wcbs*/)
1643 override {
494da23a
TL
1644 return Status::NotSupported("not supported in reactive mode");
1645 }
1646
1647 // No copy allowed
1648 ReactiveVersionSet(const ReactiveVersionSet&);
1649 ReactiveVersionSet& operator=(const ReactiveVersionSet&);
1650};
1651
f67539c2 1652} // namespace ROCKSDB_NAMESPACE