1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 // The representation of a DBImpl consists of a set of Versions. The
11 // newest version is called "current". Older versions may be kept
12 // around to provide a consistent view to live iterators.
14 // Each Version keeps track of a set of Table files per level. The
15 // entire set of versions is maintained in a VersionSet.
17 // Version,VersionSet are thread-compatible, but require external
18 // synchronization on all accesses.
31 #include "db/column_family.h"
32 #include "db/compaction.h"
33 #include "db/compaction_picker.h"
34 #include "db/dbformat.h"
35 #include "db/file_indexer.h"
36 #include "db/log_reader.h"
37 #include "db/range_del_aggregator.h"
38 #include "db/table_cache.h"
39 #include "db/version_builder.h"
40 #include "db/version_edit.h"
41 #include "db/write_controller.h"
42 #include "monitoring/instrumented_mutex.h"
43 #include "options/db_options.h"
44 #include "port/port.h"
45 #include "rocksdb/env.h"
54 class InternalIterator
;
60 class WriteBufferManager
;
62 class ColumnFamilySet
;
64 class MergeIteratorBuilder
;
66 // Return the smallest index i such that file_level.files[i]->largest >= key.
67 // Return file_level.num_files if there is no such file.
68 // REQUIRES: "file_level.files" contains a sorted list of
69 // non-overlapping files.
70 extern int FindFile(const InternalKeyComparator
& icmp
,
71 const LevelFilesBrief
& file_level
, const Slice
& key
);
73 // Returns true iff some file in "files" overlaps the user key range
74 // [*smallest,*largest].
75 // smallest==nullptr represents a key smaller than all keys in the DB.
76 // largest==nullptr represents a key larger than all keys in the DB.
77 // REQUIRES: If disjoint_sorted_files, file_level.files[]
78 // contains disjoint ranges in sorted order.
79 extern bool SomeFileOverlapsRange(const InternalKeyComparator
& icmp
,
80 bool disjoint_sorted_files
,
81 const LevelFilesBrief
& file_level
,
82 const Slice
* smallest_user_key
,
83 const Slice
* largest_user_key
);
85 // Generate LevelFilesBrief from vector<FdWithKeyRange*>
86 // Would copy smallest_key and largest_key data to sequential memory
87 // arena: Arena used to allocate the memory
88 extern void DoGenerateLevelFilesBrief(LevelFilesBrief
* file_level
,
89 const std::vector
<FileMetaData
*>& files
,
92 class VersionStorageInfo
{
94 VersionStorageInfo(const InternalKeyComparator
* internal_comparator
,
95 const Comparator
* user_comparator
, int num_levels
,
96 CompactionStyle compaction_style
,
97 VersionStorageInfo
* src_vstorage
,
98 bool _force_consistency_checks
);
99 ~VersionStorageInfo();
// Pre-allocates capacity for `size` FileMetaData entries at `level` so that
// subsequent AddFile() calls do not reallocate.
// NOTE(review): no bounds check on `level`; presumably level < num_levels_ —
// confirm with callers.
101 void Reserve(int level
, size_t size
) { files_
[level
].reserve(size
); }
103 void AddFile(int level
, FileMetaData
* f
, Logger
* info_log
= nullptr);
107 // Update num_non_empty_levels_.
108 void UpdateNumNonEmptyLevels();
110 void GenerateFileIndexer() {
111 file_indexer_
.UpdateIndex(&arena_
, num_non_empty_levels_
, files_
);
114 // Update the accumulated stats from a file-meta.
115 void UpdateAccumulatedStats(FileMetaData
* file_meta
);
117 // Decrease the current stats from a to-be-deleted file-meta
118 void RemoveCurrentStats(FileMetaData
* file_meta
);
120 void ComputeCompensatedSizes();
122 // Updates internal structures that keep track of compaction scores
123 // We use compaction scores to figure out which compaction to do next
124 // REQUIRES: db_mutex held!!
125 // TODO find a better way to pass compaction_options_fifo.
126 void ComputeCompactionScore(const ImmutableCFOptions
& immutable_cf_options
,
127 const MutableCFOptions
& mutable_cf_options
);
129 // Estimate est_comp_needed_bytes_
130 void EstimateCompactionBytesNeeded(
131 const MutableCFOptions
& mutable_cf_options
);
133 // This computes files_marked_for_compaction_ and is called by
134 // ComputeCompactionScore()
135 void ComputeFilesMarkedForCompaction();
137 // Generate level_files_brief_ from files_
138 void GenerateLevelFilesBrief();
139 // Sort all files for this version based on their file size and
140 // record results in files_by_compaction_pri_. The largest files are listed
142 void UpdateFilesByCompactionPri(CompactionPri compaction_pri
);
144 void GenerateLevel0NonOverlapping();
145 bool level0_non_overlapping() const {
146 return level0_non_overlapping_
;
149 int MaxInputLevel() const;
151 // Return level number that has idx'th highest score
152 int CompactionScoreLevel(int idx
) const { return compaction_level_
[idx
]; }
154 // Return idx'th highest score.
// Scores are listed most-critical-first (see compaction_score_ below);
// a score < 1 means compaction is not strictly needed for that entry.
155 double CompactionScore(int idx
) const { return compaction_score_
[idx
]; }
157 void GetOverlappingInputs(
158 int level
, const InternalKey
* begin
, // nullptr means before all keys
159 const InternalKey
* end
, // nullptr means after all keys
160 std::vector
<FileMetaData
*>* inputs
,
161 int hint_index
= -1, // index of overlap file
162 int* file_index
= nullptr, // return index of overlap file
163 bool expand_range
= true) // if set, returns files which overlap the
164 const; // range and overlap each other. If false,
165 // then just files intersecting the range
166 void GetCleanInputsWithinInterval(
167 int level
, const InternalKey
* begin
, // nullptr means before all keys
168 const InternalKey
* end
, // nullptr means after all keys
169 std::vector
<FileMetaData
*>* inputs
,
170 int hint_index
= -1, // index of overlap file
171 int* file_index
= nullptr) // return index of overlap file
174 void GetOverlappingInputsRangeBinarySearch(
175 int level
, // level > 0
176 const Slice
& begin
, // nullptr means before all keys
177 const Slice
& end
, // nullptr means after all keys
178 std::vector
<FileMetaData
*>* inputs
,
179 int hint_index
, // index of overlap file
180 int* file_index
, // return index of overlap file
181 bool within_interval
= false) // if set, force the inputs within interval
184 void ExtendFileRangeOverlappingInterval(
186 const Slice
& begin
, // nullptr means before all keys
187 const Slice
& end
, // nullptr means after all keys
188 unsigned int index
, // start extending from this index
189 int* startIndex
, // return the startIndex of input range
190 int* endIndex
) // return the endIndex of input range
193 void ExtendFileRangeWithinInterval(
195 const Slice
& begin
, // nullptr means before all keys
196 const Slice
& end
, // nullptr means after all keys
197 unsigned int index
, // start extending from this index
198 int* startIndex
, // return the startIndex of input range
199 int* endIndex
) // return the endIndex of input range
202 // Returns true iff some file in the specified level overlaps
203 // some part of [*smallest_user_key,*largest_user_key].
204 // smallest_user_key==NULL represents a key smaller than all keys in the DB.
205 // largest_user_key==NULL represents a key larger than all keys in the DB.
206 bool OverlapInLevel(int level
, const Slice
* smallest_user_key
,
207 const Slice
* largest_user_key
);
209 // Returns true iff the first or last file in inputs contains
210 // an overlapping user key to the file "just outside" of it (i.e.
211 // just after the last file, or just before the first file)
212 // REQUIRES: "*inputs" is a sorted list of non-overlapping files
213 bool HasOverlappingUserKey(const std::vector
<FileMetaData
*>* inputs
,
216 int num_levels() const { return num_levels_
; }
218 // REQUIRES: This version has been saved (see VersionSet::SaveTo)
219 int num_non_empty_levels() const {
221 return num_non_empty_levels_
;
224 // REQUIRES: This version has been finalized.
225 // (CalculateBaseBytes() is called)
226 // This may or may not return number of level files. It is to keep backward
227 // compatible behavior in universal compaction.
228 int l0_delay_trigger_count() const { return l0_delay_trigger_count_
; }
230 void set_l0_delay_trigger_count(int v
) { l0_delay_trigger_count_
= v
; }
232 // REQUIRES: This version has been saved (see VersionSet::SaveTo)
233 int NumLevelFiles(int level
) const {
235 return static_cast<int>(files_
[level
].size());
238 // Return the combined file size of all files at the specified level.
239 uint64_t NumLevelBytes(int level
) const;
241 // REQUIRES: This version has been saved (see VersionSet::SaveTo)
242 const std::vector
<FileMetaData
*>& LevelFiles(int level
) const {
243 return files_
[level
];
246 const rocksdb::LevelFilesBrief
& LevelFilesBrief(int level
) const {
247 assert(level
< static_cast<int>(level_files_brief_
.size()));
248 return level_files_brief_
[level
];
251 // REQUIRES: This version has been saved (see VersionSet::SaveTo)
252 const std::vector
<int>& FilesByCompactionPri(int level
) const {
254 return files_by_compaction_pri_
[level
];
257 // REQUIRES: This version has been saved (see VersionSet::SaveTo)
258 // REQUIRES: DB mutex held during access
259 const autovector
<std::pair
<int, FileMetaData
*>>& FilesMarkedForCompaction()
262 return files_marked_for_compaction_
;
265 int base_level() const { return base_level_
; }
267 // REQUIRES: lock is held
268 // Set the index that is used to offset into files_by_compaction_pri_ to find
269 // the next compaction candidate file.
270 void SetNextCompactionIndex(int level
, int index
) {
271 next_file_to_compact_by_size_
[level
] = index
;
274 // REQUIRES: lock is held
275 int NextCompactionIndex(int level
) const {
276 return next_file_to_compact_by_size_
[level
];
279 // REQUIRES: This version has been saved (see VersionSet::SaveTo)
280 const FileIndexer
& file_indexer() const {
282 return file_indexer_
;
285 // Only the first few entries of files_by_compaction_pri_ are sorted.
286 // There is no need to sort all the files because it is likely
287 // that on a running system, we need to look at only the first
288 // few largest files because a new version is created every few
289 // seconds/minutes (because of concurrent compactions).
290 static const size_t kNumberFilesToSort
= 50;
292 // Return a human-readable short (single-line) summary of the number
293 // of files per level. Uses *scratch as backing store.
294 struct LevelSummaryStorage
{
297 struct FileSummaryStorage
{
300 const char* LevelSummary(LevelSummaryStorage
* scratch
) const;
301 // Return a human-readable short (single-line) summary of files
302 // in a specified level. Uses *scratch as backing store.
303 const char* LevelFileSummary(FileSummaryStorage
* scratch
, int level
) const;
305 // Return the maximum overlapping data (in bytes) at next level for any
306 // file at a level >= 1.
307 int64_t MaxNextLevelOverlappingBytes();
309 // Return a human readable string that describes this version's contents.
310 std::string
DebugString(bool hex
= false) const;
312 uint64_t GetAverageValueSize() const {
313 if (accumulated_num_non_deletions_
== 0) {
316 assert(accumulated_raw_key_size_
+ accumulated_raw_value_size_
> 0);
317 assert(accumulated_file_size_
> 0);
318 return accumulated_raw_value_size_
/ accumulated_num_non_deletions_
*
319 accumulated_file_size_
/
320 (accumulated_raw_key_size_
+ accumulated_raw_value_size_
);
323 uint64_t GetEstimatedActiveKeys() const;
325 double GetEstimatedCompressionRatioAtLevel(int level
) const;
327 // re-initializes the index that is used to offset into
328 // files_by_compaction_pri_
329 // to find the next compaction candidate file.
330 void ResetNextCompactionIndex(int level
) {
331 next_file_to_compact_by_size_
[level
] = 0;
334 const InternalKeyComparator
* InternalComparator() {
335 return internal_comparator_
;
338 // Returns maximum total bytes of data on a given level.
339 uint64_t MaxBytesForLevel(int level
) const;
341 // Must be called after any change to MutableCFOptions.
342 void CalculateBaseBytes(const ImmutableCFOptions
& ioptions
,
343 const MutableCFOptions
& options
);
345 // Returns an estimate of the amount of live data in bytes.
346 uint64_t EstimateLiveDataSize() const;
348 uint64_t estimated_compaction_needed_bytes() const {
349 return estimated_compaction_needed_bytes_
;
352 void TEST_set_estimated_compaction_needed_bytes(uint64_t v
) {
353 estimated_compaction_needed_bytes_
= v
;
356 bool force_consistency_checks() const { return force_consistency_checks_
; }
359 const InternalKeyComparator
* internal_comparator_
;
360 const Comparator
* user_comparator_
;
361 int num_levels_
; // Number of levels
362 int num_non_empty_levels_
; // Number of levels. Any level larger than it
363 // is guaranteed to be empty.
364 // Per-level max bytes
365 std::vector
<uint64_t> level_max_bytes_
;
367 // A short brief metadata of files per level
368 autovector
<rocksdb::LevelFilesBrief
> level_files_brief_
;
369 FileIndexer file_indexer_
;
370 Arena arena_
; // Used to allocate space for file_levels_
372 CompactionStyle compaction_style_
;
374 // List of files per level, files in each level are arranged
375 // in increasing order of keys
376 std::vector
<FileMetaData
*>* files_
;
378 // Level that L0 data should be compacted to. All levels < base_level_ should
379 // be empty. -1 if it is not level-compaction so it's not applicable.
382 // A list for the same set of files that are stored in files_,
383 // but files in each level are now sorted based on file
384 // size. The file with the largest size is at the front.
385 // This vector stores the index of the file from files_.
386 std::vector
<std::vector
<int>> files_by_compaction_pri_
;
388 // If true, means that files in L0 have keys with non overlapping ranges
389 bool level0_non_overlapping_
;
391 // An index into files_by_compaction_pri_ that specifies the first
392 // file that is not yet compacted
393 std::vector
<int> next_file_to_compact_by_size_
;
395 // Only the first few entries of files_by_compaction_pri_ are sorted.
396 // There is no need to sort all the files because it is likely
397 // that on a running system, we need to look at only the first
398 // few largest files because a new version is created every few
399 // seconds/minutes (because of concurrent compactions).
400 static const size_t number_of_files_to_sort_
= 50;
402 // This vector contains list of files marked for compaction and also not
403 // currently being compacted. It is protected by DB mutex. It is calculated in
404 // ComputeCompactionScore()
405 autovector
<std::pair
<int, FileMetaData
*>> files_marked_for_compaction_
;
407 // Level that should be compacted next and its compaction score.
408 // Score < 1 means compaction is not strictly needed. These fields
409 // are initialized by Finalize().
410 // The most critical level to be compacted is listed first
411 // These are used to pick the best compaction level
412 std::vector
<double> compaction_score_
;
413 std::vector
<int> compaction_level_
;
414 int l0_delay_trigger_count_
= 0; // Count used to trigger slow down and stop
415 // for number of L0 files.
417 // the following are the sampled temporary stats.
418 // the current accumulated size of sampled files.
419 uint64_t accumulated_file_size_
;
420 // the current accumulated size of all raw keys based on the sampled files.
421 uint64_t accumulated_raw_key_size_
;
422 // the current accumulated size of all raw values based on the sampled files.
423 uint64_t accumulated_raw_value_size_
;
424 // total number of non-deletion entries
425 uint64_t accumulated_num_non_deletions_
;
426 // total number of deletion entries
427 uint64_t accumulated_num_deletions_
;
428 // current number of non_deletion entries
429 uint64_t current_num_non_deletions_
;
430 // current number of deletion entries
431 uint64_t current_num_deletions_
;
432 // current number of file samples
433 uint64_t current_num_samples_
;
434 // Estimated bytes needed to be compacted until all levels' size is down to
436 uint64_t estimated_compaction_needed_bytes_
;
440 // If set to true, we will run consistency checks even if RocksDB
441 // is compiled in release mode
442 bool force_consistency_checks_
;
444 friend class Version
;
445 friend class VersionSet
;
446 // No copying allowed
447 VersionStorageInfo(const VersionStorageInfo
&) = delete;
448 void operator=(const VersionStorageInfo
&) = delete;
453 // Append to *iters a sequence of iterators that will
454 // yield the contents of this Version when merged together.
455 // REQUIRES: This version has been saved (see VersionSet::SaveTo)
456 void AddIterators(const ReadOptions
&, const EnvOptions
& soptions
,
457 MergeIteratorBuilder
* merger_iter_builder
,
458 RangeDelAggregator
* range_del_agg
);
460 void AddIteratorsForLevel(const ReadOptions
&, const EnvOptions
& soptions
,
461 MergeIteratorBuilder
* merger_iter_builder
,
462 int level
, RangeDelAggregator
* range_del_agg
);
464 // Lookup the value for key. If found, store it in *val and
465 // return OK. Else return a non-OK status.
466 // Uses *operands to store merge_operator operations to apply later.
468 // If the ReadOptions.read_tier is set to do a read-only fetch, then
469 // *value_found will be set to false if it cannot be determined whether
470 // this value exists without doing IO.
472 // If the key is Deleted, *status will be set to NotFound and
473 // *key_exists will be set to true.
474 // If no key was found, *status will be set to NotFound and
475 // *key_exists will be set to false.
476 // If seq is non-null, *seq will be set to the sequence number found
477 // for the key if a key was found.
479 // REQUIRES: lock is not held
480 void Get(const ReadOptions
&, const LookupKey
& key
, PinnableSlice
* value
,
481 Status
* status
, MergeContext
* merge_context
,
482 RangeDelAggregator
* range_del_agg
, bool* value_found
= nullptr,
483 bool* key_exists
= nullptr, SequenceNumber
* seq
= nullptr);
485 // Loads some stats information from files. Call without mutex held. It needs
486 // to be called before applying the version to the version set.
487 void PrepareApply(const MutableCFOptions
& mutable_cf_options
,
490 // Reference count management (so Versions do not disappear out from
491 // under live iterators)
493 // Decrease reference count. Delete the object if no reference left
494 // and return true. Otherwise, return false.
497 // Add all files listed in the current version to *live.
498 void AddLiveFiles(std::vector
<FileDescriptor
>* live
);
500 // Return a human readable string that describes this version's contents.
501 std::string
DebugString(bool hex
= false) const;
503 // Returns the version number of this version
504 uint64_t GetVersionNumber() const { return version_number_
; }
506 // REQUIRES: lock is held
507 // On success, "tp" will contain the table properties of the file
508 // specified in "file_meta". If the file name of "file_meta" is
509 // known ahead, passing it by a non-null "fname" can save a
510 // file-name conversion.
511 Status
GetTableProperties(std::shared_ptr
<const TableProperties
>* tp
,
512 const FileMetaData
* file_meta
,
513 const std::string
* fname
= nullptr) const;
515 // REQUIRES: lock is held
516 // On success, *props will be populated with all SSTables' table properties.
517 // The keys of `props` are the sst file name, the values of `props` are the
518 // tables' properties, represented as shared_ptr.
519 Status
GetPropertiesOfAllTables(TablePropertiesCollection
* props
);
520 Status
GetPropertiesOfAllTables(TablePropertiesCollection
* props
, int level
);
521 Status
GetPropertiesOfTablesInRange(const Range
* range
, std::size_t n
,
522 TablePropertiesCollection
* props
) const;
524 // REQUIRES: lock is held
525 // On success, "tp" will contain the aggregated table property among
526 // the table properties of all sst files in this version.
527 Status
GetAggregatedTableProperties(
528 std::shared_ptr
<const TableProperties
>* tp
, int level
= -1);
530 uint64_t GetEstimatedActiveKeys() {
531 return storage_info_
.GetEstimatedActiveKeys();
534 size_t GetMemoryUsageByTableReaders();
536 ColumnFamilyData
* cfd() const { return cfd_
; }
538 // Return the next Version in the linked list. Used for debug only
539 Version
* TEST_Next() const {
543 int TEST_refs() const { return refs_
; }
545 VersionStorageInfo
* storage_info() { return &storage_info_
; }
547 VersionSet
* version_set() { return vset_
; }
549 void GetColumnFamilyMetaData(ColumnFamilyMetaData
* cf_meta
);
553 friend class VersionSet
;
555 const InternalKeyComparator
* internal_comparator() const {
556 return storage_info_
.internal_comparator_
;
558 const Comparator
* user_comparator() const {
559 return storage_info_
.user_comparator_
;
562 bool PrefixMayMatch(const ReadOptions
& read_options
,
563 InternalIterator
* level_iter
,
564 const Slice
& internal_prefix
) const;
566 // Returns true if the filter blocks in the specified level will not be
567 // checked during read operations. In certain cases (trivial move or preload),
568 // the filter block may already be cached, but we still do not access it such
569 // that it eventually expires from the cache.
570 bool IsFilterSkipped(int level
, bool is_file_last_in_level
= false);
572 // The helper function of UpdateAccumulatedStats, which may fill the missing
573 // fields of file_meta from its associated TableProperties.
574 // Returns true if it does initialize FileMetaData.
575 bool MaybeInitializeFileMetaData(FileMetaData
* file_meta
);
577 // Update the accumulated stats associated with the current version.
578 // This accumulated stats will be used in compaction.
579 void UpdateAccumulatedStats(bool update_stats
);
581 // Sort all files for this version based on their file size and
582 // record results in files_by_compaction_pri_. The largest files are listed
584 void UpdateFilesByCompactionPri();
586 ColumnFamilyData
* cfd_
; // ColumnFamilyData to which this Version belongs
588 Statistics
* db_statistics_
;
589 TableCache
* table_cache_
;
590 const MergeOperator
* merge_operator_
;
592 VersionStorageInfo storage_info_
;
593 VersionSet
* vset_
; // VersionSet to which this Version belongs
594 Version
* next_
; // Next version in linked list
595 Version
* prev_
; // Previous version in linked list
596 int refs_
; // Number of live refs to this version
598 // A version number that uniquely represents this version. This is
599 // used for debugging and logging purposes only.
600 uint64_t version_number_
;
602 Version(ColumnFamilyData
* cfd
, VersionSet
* vset
, uint64_t version_number
= 0);
606 // No copying allowed
607 Version(const Version
&);
608 void operator=(const Version
&);
613 VersionSet(const std::string
& dbname
, const ImmutableDBOptions
* db_options
,
614 const EnvOptions
& env_options
, Cache
* table_cache
,
615 WriteBufferManager
* write_buffer_manager
,
616 WriteController
* write_controller
);
619 // Apply *edit to the current version to form a new descriptor that
620 // is both saved to persistent state and installed as the new
621 // current version. Will release *mu while actually writing to the file.
622 // column_family_options has to be set if edit is column family add
623 // REQUIRES: *mu is held on entry.
624 // REQUIRES: no other thread concurrently calls LogAndApply()
626 ColumnFamilyData
* column_family_data
,
627 const MutableCFOptions
& mutable_cf_options
, VersionEdit
* edit
,
628 InstrumentedMutex
* mu
, Directory
* db_directory
= nullptr,
629 bool new_descriptor_log
= false,
630 const ColumnFamilyOptions
* column_family_options
= nullptr) {
631 autovector
<VersionEdit
*> edit_list
;
632 edit_list
.push_back(edit
);
633 return LogAndApply(column_family_data
, mutable_cf_options
, edit_list
, mu
,
634 db_directory
, new_descriptor_log
, column_family_options
);
636 // The batch version. If edit_list.size() > 1, caller must ensure that
637 // no edit in the list column family add or drop
639 ColumnFamilyData
* column_family_data
,
640 const MutableCFOptions
& mutable_cf_options
,
641 const autovector
<VersionEdit
*>& edit_list
, InstrumentedMutex
* mu
,
642 Directory
* db_directory
= nullptr, bool new_descriptor_log
= false,
643 const ColumnFamilyOptions
* column_family_options
= nullptr);
645 // Recover the last saved descriptor from persistent storage.
646 // If read_only == true, Recover() will not complain if some column families
648 Status
Recover(const std::vector
<ColumnFamilyDescriptor
>& column_families
,
649 bool read_only
= false);
651 // Reads a manifest file and returns a list of column families in
653 static Status
ListColumnFamilies(std::vector
<std::string
>* column_families
,
654 const std::string
& dbname
, Env
* env
);
657 // Try to reduce the number of levels. This call is valid when
658 // only one level from the new max level to the old
659 // max level containing files.
660 // The call is static, since number of levels is immutable during
661 // the lifetime of a RocksDB instance. It reduces number of levels
662 // in a DB by applying changes to manifest.
663 // For example, a db currently has 7 levels [0-6], and a call to
664 // to reduce to 5 [0-4] can only be executed when only one level
665 // among [4-6] contains files.
666 static Status
ReduceNumberOfLevels(const std::string
& dbname
,
667 const Options
* options
,
668 const EnvOptions
& env_options
,
671 // printf contents (for debugging)
672 Status
DumpManifest(Options
& options
, std::string
& manifestFileName
,
673 bool verbose
, bool hex
= false, bool json
= false);
675 #endif // ROCKSDB_LITE
677 // Return the current manifest file number
678 uint64_t manifest_file_number() const { return manifest_file_number_
; }
680 uint64_t options_file_number() const { return options_file_number_
; }
682 uint64_t pending_manifest_file_number() const {
683 return pending_manifest_file_number_
;
686 uint64_t current_next_file_number() const { return next_file_number_
.load(); }
688 // Allocate and return a new file number. Uses an atomic fetch_add on
// next_file_number_, so concurrent callers each receive a distinct number.
689 uint64_t NewFileNumber() { return next_file_number_
.fetch_add(1); }
691 // Return the last sequence number.
692 uint64_t LastSequence() const {
693 return last_sequence_
.load(std::memory_order_acquire
);
696 // Set the last sequence number to s.
697 void SetLastSequence(uint64_t s
) {
698 assert(s
>= last_sequence_
);
699 last_sequence_
.store(s
, std::memory_order_release
);
702 // Mark the specified file number as used.
703 // REQUIRED: this is only called during single-threaded recovery
704 void MarkFileNumberUsedDuringRecovery(uint64_t number
);
706 // Return the log file number for the log file that is currently
707 // being compacted, or zero if there is no such log file.
708 uint64_t prev_log_number() const { return prev_log_number_
; }
710 // Returns the minimum log number such that all
711 // log numbers less than or equal to it can be deleted
712 uint64_t MinLogNumber() const {
713 uint64_t min_log_num
= std::numeric_limits
<uint64_t>::max();
714 for (auto cfd
: *column_family_set_
) {
715 // It's safe to ignore dropped column families here:
716 // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
717 if (min_log_num
> cfd
->GetLogNumber() && !cfd
->IsDropped()) {
718 min_log_num
= cfd
->GetLogNumber();
724 // Create an iterator that reads over the compaction inputs for "*c".
725 // The caller should delete the iterator when no longer needed.
726 InternalIterator
* MakeInputIterator(const Compaction
* c
,
727 RangeDelAggregator
* range_del_agg
);
729 // Add all files listed in any live version to *live.
730 void AddLiveFiles(std::vector
<FileDescriptor
>* live_list
);
732 // Return the approximate size of data to be scanned for range [start, end)
733 // in levels [start_level, end_level). If end_level == 0 it will search
734 // through all non-empty levels
735 uint64_t ApproximateSize(Version
* v
, const Slice
& start
, const Slice
& end
,
736 int start_level
= 0, int end_level
= -1);
738 // Return the size of the current manifest file
739 uint64_t manifest_file_size() const { return manifest_file_size_
; }
741 // verify that the files that we started with for a compaction
742 // still exist in the current version and in the same original level.
743 // This ensures that a concurrent compaction did not erroneously
744 // pick the same files to compact.
745 bool VerifyCompactionFileConsistency(Compaction
* c
);
747 Status
GetMetadataForFile(uint64_t number
, int* filelevel
,
748 FileMetaData
** metadata
, ColumnFamilyData
** cfd
);
750 // This function doesn't support leveldb SST filenames
751 void GetLiveFilesMetaData(std::vector
<LiveFileMetaData
> *metadata
);
753 void GetObsoleteFiles(std::vector
<FileMetaData
*>* files
,
754 std::vector
<std::string
>* manifest_filenames
,
755 uint64_t min_pending_output
);
757 ColumnFamilySet
* GetColumnFamilySet() { return column_family_set_
.get(); }
758 const EnvOptions
& env_options() { return env_options_
; }
760 static uint64_t GetNumLiveVersions(Version
* dummy_versions
);
762 static uint64_t GetTotalSstFilesSize(Version
* dummy_versions
);
765 struct ManifestWriter
;
767 friend class Version
;
770 struct LogReporter
: public log::Reader::Reporter
{
772 virtual void Corruption(size_t bytes
, const Status
& s
) override
{
773 if (this->status
->ok()) *this->status
= s
;
777 // ApproximateSize helper
778 uint64_t ApproximateSizeLevel0(Version
* v
, const LevelFilesBrief
& files_brief
,
779 const Slice
& start
, const Slice
& end
);
781 uint64_t ApproximateSize(Version
* v
, const FdWithKeyRange
& f
,
784 // Save current contents to *log
785 Status
WriteSnapshot(log::Writer
* log
);
787 void AppendVersion(ColumnFamilyData
* column_family_data
, Version
* v
);
789 ColumnFamilyData
* CreateColumnFamily(const ColumnFamilyOptions
& cf_options
,
792 std::unique_ptr
<ColumnFamilySet
> column_family_set_
;
795 const std::string dbname_
;
796 const ImmutableDBOptions
* const db_options_
;
797 std::atomic
<uint64_t> next_file_number_
;
798 uint64_t manifest_file_number_
;
799 uint64_t options_file_number_
;
800 uint64_t pending_manifest_file_number_
;
801 std::atomic
<uint64_t> last_sequence_
;
802 uint64_t prev_log_number_
; // 0 or backing store for memtable being compacted
805 unique_ptr
<log::Writer
> descriptor_log_
;
807 // generates an increasing version number for every new version
808 uint64_t current_version_number_
;
810 // Queue of writers to the manifest file
811 std::deque
<ManifestWriter
*> manifest_writers_
;
813 // Current size of manifest file
814 uint64_t manifest_file_size_
;
816 std::vector
<FileMetaData
*> obsolete_files_
;
817 std::vector
<std::string
> obsolete_manifests_
;
819 // env options for all reads and writes except compactions
820 const EnvOptions
& env_options_
;
822 // env options used for compactions. This is a copy of
823 // env_options_ but with readaheads set to readahead_compactions_.
824 const EnvOptions env_options_compactions_
;
826 // No copying allowed
827 VersionSet(const VersionSet
&);
828 void operator=(const VersionSet
&);
830 void LogAndApplyCFHelper(VersionEdit
* edit
);
831 void LogAndApplyHelper(ColumnFamilyData
* cfd
, VersionBuilder
* b
, Version
* v
,
832 VersionEdit
* edit
, InstrumentedMutex
* mu
);
835 } // namespace rocksdb