// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version, VersionSet are thread-compatible, but require external
// synchronization on all accesses.

#pragma once
#include <atomic>
#include <deque>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "db/column_family.h"
#include "db/compaction.h"
#include "db/compaction_picker.h"
#include "db/dbformat.h"
#include "db/file_indexer.h"
#include "db/log_reader.h"
#include "db/range_del_aggregator.h"
#include "db/table_cache.h"
#include "db/version_builder.h"
#include "db/version_edit.h"
#include "db/write_controller.h"
#include "monitoring/instrumented_mutex.h"
#include "options/db_options.h"
#include "port/port.h"
#include "rocksdb/env.h"

namespace rocksdb {

namespace log {
class Writer;
}

class Compaction;
class InternalIterator;
class LogBuffer;
class LookupKey;
class MemTable;
class Version;
class VersionSet;
class WriteBufferManager;
class MergeContext;
class ColumnFamilySet;
class TableCache;
class MergeIteratorBuilder;

// Return the smallest index i such that file_level.files[i]->largest >= key.
// Return file_level.num_files if there is no such file.
// REQUIRES: "file_level.files" contains a sorted list of
// non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
                    const LevelFilesBrief& file_level, const Slice& key);
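
// Illustrative sketch: a hypothetical caller that wants the first file whose
// key range might contain 'ikey' could combine FindFile with a check against
// the file's smallest key. The variable names below are made up.
//
//   int idx = FindFile(icmp, file_level, ikey);
//   if (idx < static_cast<int>(file_level.num_files)) {
//     const FdWithKeyRange& f = file_level.files[idx];
//     // 'f' is the leftmost file with largest >= ikey; the key can be inside
//     // this file only if it also compares >= f.smallest_key.
//   }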

// Returns true iff some file in "files" overlaps the user key range
// [*smallest,*largest].
// smallest==nullptr represents a key smaller than all keys in the DB.
// largest==nullptr represents a key larger than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, file_level.files[]
// contains disjoint ranges in sorted order.
extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
                                  bool disjoint_sorted_files,
                                  const LevelFilesBrief& file_level,
                                  const Slice* smallest_user_key,
                                  const Slice* largest_user_key);

// Generate LevelFilesBrief from a vector<FileMetaData*>.
// Copies smallest_key and largest_key data into sequential memory.
// arena: Arena used to allocate the memory
extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
                                      const std::vector<FileMetaData*>& files,
                                      Arena* arena);

class VersionStorageInfo {
 public:
  VersionStorageInfo(const InternalKeyComparator* internal_comparator,
                     const Comparator* user_comparator, int num_levels,
                     CompactionStyle compaction_style,
                     VersionStorageInfo* src_vstorage,
                     bool _force_consistency_checks);
  ~VersionStorageInfo();

  void Reserve(int level, size_t size) { files_[level].reserve(size); }

  void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr);

  void SetFinalized();

  // Update num_non_empty_levels_.
  void UpdateNumNonEmptyLevels();

  void GenerateFileIndexer() {
    file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
  }

  // Update the accumulated stats from a file-meta.
  void UpdateAccumulatedStats(FileMetaData* file_meta);

  // Decrease the current stats from a to-be-deleted file-meta.
  void RemoveCurrentStats(FileMetaData* file_meta);

  void ComputeCompensatedSizes();

  // Updates internal structures that keep track of compaction scores.
  // We use compaction scores to figure out which compaction to do next.
  // REQUIRES: db_mutex held!!
  // TODO find a better way to pass compaction_options_fifo.
  void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options,
                              const MutableCFOptions& mutable_cf_options);

  // Estimate estimated_compaction_needed_bytes_.
  void EstimateCompactionBytesNeeded(
      const MutableCFOptions& mutable_cf_options);

  // This computes files_marked_for_compaction_ and is called by
  // ComputeCompactionScore().
  void ComputeFilesMarkedForCompaction();

  // Generate level_files_brief_ from files_.
  void GenerateLevelFilesBrief();
  // Sort all files for this version based on their file size and
  // record results in files_by_compaction_pri_. The largest files are listed
  // first.
  void UpdateFilesByCompactionPri(CompactionPri compaction_pri);

  void GenerateLevel0NonOverlapping();
  bool level0_non_overlapping() const {
    return level0_non_overlapping_;
  }

  int MaxInputLevel() const;

  // Return the level number that has the idx'th highest score.
  int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }

  // Return the idx'th highest score.
  double CompactionScore(int idx) const { return compaction_score_[idx]; }
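
  // Illustrative sketch: after ComputeCompactionScore() runs, a compaction
  // picker typically walks the (score, level) pairs in descending score
  // order; a hypothetical loop (names made up) might look like:
  //
  //   for (int i = 0; i < vstorage->num_levels() - 1; ++i) {
  //     double score = vstorage->CompactionScore(i);
  //     int level = vstorage->CompactionScoreLevel(i);
  //     if (score < 1.0) break;  // remaining levels do not need compaction
  //     // ... try to pick a compaction from 'level' ...
  //   }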

  void GetOverlappingInputs(
      int level, const InternalKey* begin,  // nullptr means before all keys
      const InternalKey* end,               // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      int hint_index = -1,        // index of overlap file
      int* file_index = nullptr,  // return index of overlap file
      bool expand_range = true)   // if set, returns files which overlap the
      const;                      // range and overlap each other. If false,
                                  // then just files intersecting the range
  void GetCleanInputsWithinInterval(
      int level, const InternalKey* begin,  // nullptr means before all keys
      const InternalKey* end,               // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      int hint_index = -1,        // index of overlap file
      int* file_index = nullptr)  // return index of overlap file
      const;

  void GetOverlappingInputsRangeBinarySearch(
      int level,           // level > 0
      const Slice& begin,  // nullptr means before all keys
      const Slice& end,    // nullptr means after all keys
      std::vector<FileMetaData*>* inputs,
      int hint_index,                // index of overlap file
      int* file_index,               // return index of overlap file
      bool within_interval = false)  // if set, force the inputs within interval
      const;

  void ExtendFileRangeOverlappingInterval(
      int level,
      const Slice& begin,  // nullptr means before all keys
      const Slice& end,    // nullptr means after all keys
      unsigned int index,  // start extending from this index
      int* startIndex,     // return the startIndex of input range
      int* endIndex)       // return the endIndex of input range
      const;

  void ExtendFileRangeWithinInterval(
      int level,
      const Slice& begin,  // nullptr means before all keys
      const Slice& end,    // nullptr means after all keys
      unsigned int index,  // start extending from this index
      int* startIndex,     // return the startIndex of input range
      int* endIndex)       // return the endIndex of input range
      const;

  // Returns true iff some file in the specified level overlaps
  // some part of [*smallest_user_key,*largest_user_key].
  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
  // largest_user_key==NULL represents a key larger than all keys in the DB.
  bool OverlapInLevel(int level, const Slice* smallest_user_key,
                      const Slice* largest_user_key);

  // Returns true iff the first or last file in inputs contains
  // an overlapping user key to the file "just outside" of it (i.e.
  // just after the last file, or just before the first file).
  // REQUIRES: "*inputs" is a sorted list of non-overlapping files.
  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
                             int level);

  int num_levels() const { return num_levels_; }

  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  int num_non_empty_levels() const {
    assert(finalized_);
    return num_non_empty_levels_;
  }

  // REQUIRES: This version has been finalized.
  // (CalculateBaseBytes() has been called)
  // This may or may not return the number of level files; it is kept this way
  // to preserve backward-compatible behavior in universal compaction.
  int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }

  void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }

  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  int NumLevelFiles(int level) const {
    assert(finalized_);
    return static_cast<int>(files_[level].size());
  }

  // Return the combined file size of all files at the specified level.
  uint64_t NumLevelBytes(int level) const;

  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  const std::vector<FileMetaData*>& LevelFiles(int level) const {
    return files_[level];
  }

  const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const {
    assert(level < static_cast<int>(level_files_brief_.size()));
    return level_files_brief_[level];
  }

  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  const std::vector<int>& FilesByCompactionPri(int level) const {
    assert(finalized_);
    return files_by_compaction_pri_[level];
  }

  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  // REQUIRES: DB mutex held during access
  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
      const {
    assert(finalized_);
    return files_marked_for_compaction_;
  }

  int base_level() const { return base_level_; }

  // REQUIRES: lock is held
  // Set the index that is used to offset into files_by_compaction_pri_ to find
  // the next compaction candidate file.
  void SetNextCompactionIndex(int level, int index) {
    next_file_to_compact_by_size_[level] = index;
  }

  // REQUIRES: lock is held
  int NextCompactionIndex(int level) const {
    return next_file_to_compact_by_size_[level];
  }
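
  // Illustrative sketch: the next-compaction index acts as a cursor into
  // FilesByCompactionPri(level). A hypothetical picker could resume where it
  // left off along these lines (names made up):
  //
  //   const std::vector<int>& order = vstorage->FilesByCompactionPri(level);
  //   int cursor = vstorage->NextCompactionIndex(level);
  //   while (cursor < static_cast<int>(order.size()) &&
  //          vstorage->LevelFiles(level)[order[cursor]]->being_compacted) {
  //     ++cursor;  // skip files that are already being compacted
  //   }
  //   vstorage->SetNextCompactionIndex(level, cursor);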

  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  const FileIndexer& file_indexer() const {
    assert(finalized_);
    return file_indexer_;
  }

  // Only the first few entries of files_by_compaction_pri_ are sorted.
  // There is no need to sort all the files because it is likely
  // that on a running system, we need to look at only the first
  // few largest files because a new version is created every few
  // seconds/minutes (because of concurrent compactions).
  static const size_t kNumberFilesToSort = 50;

  // Return a human-readable short (single-line) summary of the number
  // of files per level. Uses *scratch as backing store.
  struct LevelSummaryStorage {
    char buffer[1000];
  };
  struct FileSummaryStorage {
    char buffer[3000];
  };
  const char* LevelSummary(LevelSummaryStorage* scratch) const;
  // Return a human-readable short (single-line) summary of files
  // in a specified level. Uses *scratch as backing store.
  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
  int64_t MaxNextLevelOverlappingBytes();

  // Return a human readable string that describes this version's contents.
  std::string DebugString(bool hex = false) const;

  uint64_t GetAverageValueSize() const {
    if (accumulated_num_non_deletions_ == 0) {
      return 0;
    }
    assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
    assert(accumulated_file_size_ > 0);
    return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
           accumulated_file_size_ /
           (accumulated_raw_key_size_ + accumulated_raw_value_size_);
  }
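
  // Illustrative worked example with made-up numbers: if the sampled files
  // hold 1,000,000 non-deletion entries, accumulated raw key size 20 MB,
  // accumulated raw value size 180 MB, and accumulated on-disk file size
  // 100 MB, then (approximately):
  //   raw value size per entry = 180 MB / 1,000,000 entries ~= 180 bytes
  //   on-disk / raw size ratio = 100 MB / (20 MB + 180 MB)   = 0.5
  //   estimated value size    ~= 180 * 0.5                   = 90 bytes
  // i.e. the average raw value size scaled by the ratio of file size to the
  // total raw (key + value) size.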

  uint64_t GetEstimatedActiveKeys() const;

  double GetEstimatedCompressionRatioAtLevel(int level) const;

  // Re-initializes the index that is used to offset into
  // files_by_compaction_pri_ to find the next compaction candidate file.
  void ResetNextCompactionIndex(int level) {
    next_file_to_compact_by_size_[level] = 0;
  }

  const InternalKeyComparator* InternalComparator() {
    return internal_comparator_;
  }

  // Returns maximum total bytes of data on a given level.
  uint64_t MaxBytesForLevel(int level) const;

  // Must be called after any change to MutableCFOptions.
  void CalculateBaseBytes(const ImmutableCFOptions& ioptions,
                          const MutableCFOptions& options);

  // Returns an estimate of the amount of live data in bytes.
  uint64_t EstimateLiveDataSize() const;

  uint64_t estimated_compaction_needed_bytes() const {
    return estimated_compaction_needed_bytes_;
  }

  void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
    estimated_compaction_needed_bytes_ = v;
  }

  bool force_consistency_checks() const { return force_consistency_checks_; }

 private:
  const InternalKeyComparator* internal_comparator_;
  const Comparator* user_comparator_;
  int num_levels_;            // Number of levels
  int num_non_empty_levels_;  // Number of non-empty levels. Any level with an
                              // index >= this is guaranteed to be empty.
  // Per-level max bytes
  std::vector<uint64_t> level_max_bytes_;

  // A brief summary of the file metadata per level
  autovector<rocksdb::LevelFilesBrief> level_files_brief_;
  FileIndexer file_indexer_;
  Arena arena_;  // Used to allocate space for level_files_brief_

  CompactionStyle compaction_style_;

  // List of files per level; files in each level are arranged
  // in increasing order of keys
  std::vector<FileMetaData*>* files_;

  // Level that L0 data should be compacted to. All levels < base_level_ should
  // be empty. -1 if it is not level-based compaction, so it is not applicable.
  int base_level_;

  // A list for the same set of files that are stored in files_,
  // but files in each level are now sorted based on file
  // size. The file with the largest size is at the front.
  // This vector stores the index of the file from files_.
  std::vector<std::vector<int>> files_by_compaction_pri_;

  // If true, files in L0 have keys with non-overlapping ranges
  bool level0_non_overlapping_;

  // An index into files_by_compaction_pri_ that specifies the first
  // file that is not yet compacted
  std::vector<int> next_file_to_compact_by_size_;

  // Only the first few entries of files_by_compaction_pri_ are sorted.
  // There is no need to sort all the files because it is likely
  // that on a running system, we need to look at only the first
  // few largest files because a new version is created every few
  // seconds/minutes (because of concurrent compactions).
  static const size_t number_of_files_to_sort_ = 50;

  // This vector contains the list of files marked for compaction and not
  // currently being compacted. It is protected by the DB mutex. It is
  // calculated in ComputeCompactionScore().
  autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;

  // Level that should be compacted next and its compaction score.
  // Score < 1 means compaction is not strictly needed. These fields
  // are initialized by Finalize().
  // The most critical level to be compacted is listed first.
  // These are used to pick the best compaction level.
  std::vector<double> compaction_score_;
  std::vector<int> compaction_level_;
  int l0_delay_trigger_count_ = 0;  // Count used to trigger slowdown and stop
                                    // based on the number of L0 files.

  // The following are the sampled temporary stats.
  // The current accumulated size of sampled files.
  uint64_t accumulated_file_size_;
  // The current accumulated size of all raw keys based on the sampled files.
  uint64_t accumulated_raw_key_size_;
  // The current accumulated size of all raw values based on the sampled files.
  uint64_t accumulated_raw_value_size_;
  // Total number of non-deletion entries
  uint64_t accumulated_num_non_deletions_;
  // Total number of deletion entries
  uint64_t accumulated_num_deletions_;
  // Current number of non-deletion entries
  uint64_t current_num_non_deletions_;
  // Current number of deletion entries
  uint64_t current_num_deletions_;
  // Current number of file samples
  uint64_t current_num_samples_;
  // Estimated bytes needed to be compacted until all levels' sizes are down to
  // the target sizes.
  uint64_t estimated_compaction_needed_bytes_;

  bool finalized_;

  // If set to true, we will run consistency checks even if RocksDB
  // is compiled in release mode
  bool force_consistency_checks_;

  friend class Version;
  friend class VersionSet;
  // No copying allowed
  VersionStorageInfo(const VersionStorageInfo&) = delete;
  void operator=(const VersionStorageInfo&) = delete;
};

class Version {
 public:
  // Append to *iters a sequence of iterators that will
  // yield the contents of this Version when merged together.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
                    MergeIteratorBuilder* merger_iter_builder,
                    RangeDelAggregator* range_del_agg);

  void AddIteratorsForLevel(const ReadOptions&, const EnvOptions& soptions,
                            MergeIteratorBuilder* merger_iter_builder,
                            int level, RangeDelAggregator* range_del_agg);

  // Lookup the value for key. If found, store it in *val and
  // return OK. Else return a non-OK status.
  // Uses *operands to store merge_operator operations to apply later.
  //
  // If the ReadOptions.read_tier is set to do a read-only fetch, then
  // *value_found will be set to false if it cannot be determined whether
  // this value exists without doing IO.
  //
  // If the key is Deleted, *status will be set to NotFound and
  // *key_exists will be set to true.
  // If no key was found, *status will be set to NotFound and
  // *key_exists will be set to false.
  // If seq is non-null, *seq will be set to the sequence number found
  // for the key if a key was found.
  //
  // REQUIRES: lock is not held
  void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
           Status* status, MergeContext* merge_context,
           RangeDelAggregator* range_del_agg, bool* value_found = nullptr,
           bool* key_exists = nullptr, SequenceNumber* seq = nullptr);
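
  // Illustrative sketch of a point lookup against this version, ignoring
  // merge operands and range deletions for brevity. The variable names and
  // the exact RangeDelAggregator construction are hypothetical:
  //
  //   LookupKey lkey(user_key, snapshot_seq);
  //   PinnableSlice value;
  //   Status s;
  //   MergeContext merge_context;
  //   RangeDelAggregator range_del_agg(icmp, {snapshot_seq});
  //   current->Get(read_options, lkey, &value, &s, &merge_context,
  //                &range_del_agg);
  //   if (s.ok()) { /* 'value' holds the result */ }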

  // Loads some stats information from files. Call without mutex held. It needs
  // to be called before applying the version to the version set.
  void PrepareApply(const MutableCFOptions& mutable_cf_options,
                    bool update_stats);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
  void Ref();
  // Decrease reference count. Delete the object if no reference left
  // and return true. Otherwise, return false.
  bool Unref();
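
  // Illustrative sketch: a hypothetical consumer pins the version while
  // reading from it and releases it afterwards. Unref() may delete 'v', so
  // 'v' must not be used after Unref() returns true.
  //
  //   Version* v = cfd->current();
  //   v->Ref();
  //   // ... read from v while holding the reference ...
  //   bool deleted = v->Unref();  // true if this was the last reference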

  // Add all files listed in the current version to *live.
  void AddLiveFiles(std::vector<FileDescriptor>* live);

  // Return a human readable string that describes this version's contents.
  std::string DebugString(bool hex = false) const;

  // Returns the version number of this version
  uint64_t GetVersionNumber() const { return version_number_; }

  // REQUIRES: lock is held
  // On success, "tp" will contain the table properties of the file
  // specified in "file_meta". If the file name of "file_meta" is
  // known ahead of time, passing it via a non-null "fname" can save a
  // file-name conversion.
  Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
                            const FileMetaData* file_meta,
                            const std::string* fname = nullptr) const;

  // REQUIRES: lock is held
  // On success, *props will be populated with all SSTables' table properties.
  // The keys of `props` are the sst file names; the values of `props` are the
  // tables' properties, represented as shared_ptr.
  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
  Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
  Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
                                      TablePropertiesCollection* props) const;

  // REQUIRES: lock is held
  // On success, "tp" will contain the aggregated table properties among
  // the table properties of all sst files in this version.
  Status GetAggregatedTableProperties(
      std::shared_ptr<const TableProperties>* tp, int level = -1);

  uint64_t GetEstimatedActiveKeys() {
    return storage_info_.GetEstimatedActiveKeys();
  }

  size_t GetMemoryUsageByTableReaders();

  ColumnFamilyData* cfd() const { return cfd_; }

  // Return the next Version in the linked list. Used for debug only
  Version* TEST_Next() const {
    return next_;
  }

  int TEST_refs() const { return refs_; }

  VersionStorageInfo* storage_info() { return &storage_info_; }

  VersionSet* version_set() { return vset_; }

  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);

 private:
  Env* env_;
  friend class VersionSet;

  const InternalKeyComparator* internal_comparator() const {
    return storage_info_.internal_comparator_;
  }
  const Comparator* user_comparator() const {
    return storage_info_.user_comparator_;
  }

  bool PrefixMayMatch(const ReadOptions& read_options,
                      InternalIterator* level_iter,
                      const Slice& internal_prefix) const;

  // Returns true if the filter blocks in the specified level will not be
  // checked during read operations. In certain cases (trivial move or
  // preload), the filter block may already be cached, but we still do not
  // access it such that it eventually expires from the cache.
  bool IsFilterSkipped(int level, bool is_file_last_in_level = false);

  // The helper function of UpdateAccumulatedStats, which may fill the missing
  // fields of file_meta from its associated TableProperties.
  // Returns true if it does initialize FileMetaData.
  bool MaybeInitializeFileMetaData(FileMetaData* file_meta);

  // Update the accumulated stats associated with the current version.
  // These accumulated stats will be used in compaction.
  void UpdateAccumulatedStats(bool update_stats);

  // Sort all files for this version based on their file size and
  // record results in files_by_compaction_pri_. The largest files are listed
  // first.
  void UpdateFilesByCompactionPri();

  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
  Logger* info_log_;
  Statistics* db_statistics_;
  TableCache* table_cache_;
  const MergeOperator* merge_operator_;

  VersionStorageInfo storage_info_;
  VersionSet* vset_;  // VersionSet to which this Version belongs
  Version* next_;     // Next version in linked list
  Version* prev_;     // Previous version in linked list
  int refs_;          // Number of live refs to this version

  // A version number that uniquely represents this version. This is
  // used for debugging and logging purposes only.
  uint64_t version_number_;

  Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);

  ~Version();

  // No copying allowed
  Version(const Version&);
  void operator=(const Version&);
};

class VersionSet {
 public:
  VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
             const EnvOptions& env_options, Cache* table_cache,
             WriteBufferManager* write_buffer_manager,
             WriteController* write_controller);
  ~VersionSet();

  // Apply *edit to the current version to form a new descriptor that
  // is both saved to persistent state and installed as the new
  // current version. Will release *mu while actually writing to the file.
  // column_family_options has to be set if the edit is a column family add.
  // REQUIRES: *mu is held on entry.
  // REQUIRES: no other thread concurrently calls LogAndApply()
  Status LogAndApply(
      ColumnFamilyData* column_family_data,
      const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
      InstrumentedMutex* mu, Directory* db_directory = nullptr,
      bool new_descriptor_log = false,
      const ColumnFamilyOptions* column_family_options = nullptr) {
    autovector<VersionEdit*> edit_list;
    edit_list.push_back(edit);
    return LogAndApply(column_family_data, mutable_cf_options, edit_list, mu,
                       db_directory, new_descriptor_log,
                       column_family_options);
  }
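
  // Illustrative sketch: a hypothetical flush-install step that records a
  // newly written SST file in the MANIFEST and makes it part of the new
  // "current" version. All variable names are made up.
  //
  //   VersionEdit edit;
  //   edit.SetLogNumber(new_log_number);
  //   edit.AddFile(0 /* level */, file_number, path_id, file_size,
  //                smallest_key, largest_key, smallest_seqno, largest_seqno,
  //                false /* marked_for_compaction */);
  //   mu->Lock();
  //   Status s = versions->LogAndApply(cfd, mutable_cf_options, &edit, mu,
  //                                    db_directory);
  //   mu->Unlock();
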
  // The batch version. If edit_list.size() > 1, the caller must ensure that
  // no edit in the list is a column family add or drop.
  Status LogAndApply(
      ColumnFamilyData* column_family_data,
      const MutableCFOptions& mutable_cf_options,
      const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
      Directory* db_directory = nullptr, bool new_descriptor_log = false,
      const ColumnFamilyOptions* column_family_options = nullptr);

  // Recover the last saved descriptor from persistent storage.
  // If read_only == true, Recover() will not complain if some column families
  // are not opened
  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
                 bool read_only = false);

  // Reads a manifest file and returns a list of column families in
  // column_families.
  static Status ListColumnFamilies(std::vector<std::string>* column_families,
                                   const std::string& dbname, Env* env);

#ifndef ROCKSDB_LITE
  // Try to reduce the number of levels. This call is valid when
  // only one level from the new max level to the old
  // max level contains files.
  // The call is static, since the number of levels is immutable during
  // the lifetime of a RocksDB instance. It reduces the number of levels
  // in a DB by applying changes to the manifest.
  // For example, a db currently has 7 levels [0-6], and a call
  // to reduce to 5 [0-4] can only be executed when only one level
  // among [4-6] contains files.
  static Status ReduceNumberOfLevels(const std::string& dbname,
                                     const Options* options,
                                     const EnvOptions& env_options,
                                     int new_levels);

  // Print contents (for debugging)
  Status DumpManifest(Options& options, std::string& manifestFileName,
                      bool verbose, bool hex = false, bool json = false);

#endif  // ROCKSDB_LITE

  // Return the current manifest file number
  uint64_t manifest_file_number() const { return manifest_file_number_; }

  uint64_t options_file_number() const { return options_file_number_; }

  uint64_t pending_manifest_file_number() const {
    return pending_manifest_file_number_;
  }

  uint64_t current_next_file_number() const { return next_file_number_.load(); }

  // Allocate and return a new file number
  uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }

  // Return the last sequence number.
  uint64_t LastSequence() const {
    return last_sequence_.load(std::memory_order_acquire);
  }

  // Set the last sequence number to s.
  void SetLastSequence(uint64_t s) {
    assert(s >= last_sequence_);
    last_sequence_.store(s, std::memory_order_release);
  }
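
  // Illustrative sketch: the write path publishes new data by advancing the
  // last sequence number, and readers take a snapshot by reading it. A
  // hypothetical pairing:
  //
  //   // writer, after inserting a batch whose last entry has 'last_seq':
  //   versions->SetLastSequence(last_seq);
  //
  //   // reader, when constructing a snapshot / read view:
  //   SequenceNumber snapshot_seq = versions->LastSequence();
  //
  // The release store / acquire load pairing makes the batch's earlier
  // writes visible to a reader that observes the new sequence number.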

  // Mark the specified file number as used.
  // REQUIRED: this is only called during single-threaded recovery
  void MarkFileNumberUsedDuringRecovery(uint64_t number);

  // Return the log file number for the log file that is currently
  // being compacted, or zero if there is no such log file.
  uint64_t prev_log_number() const { return prev_log_number_; }

  // Returns the minimum log number such that all
  // log numbers less than or equal to it can be deleted
  uint64_t MinLogNumber() const {
    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
    for (auto cfd : *column_family_set_) {
      // It's safe to ignore dropped column families here:
      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
      if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
        min_log_num = cfd->GetLogNumber();
      }
    }
    return min_log_num;
  }
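
  // Illustrative worked example with made-up numbers: if three column
  // families report log numbers 12, 15 and 9 (each roughly the earliest WAL
  // that column family may still need), MinLogNumber() returns 9, the
  // smallest log number still needed by any column family.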

  // Create an iterator that reads over the compaction inputs for "*c".
  // The caller should delete the iterator when no longer needed.
  InternalIterator* MakeInputIterator(const Compaction* c,
                                      RangeDelAggregator* range_del_agg);

  // Add all files listed in any live version to *live.
  void AddLiveFiles(std::vector<FileDescriptor>* live_list);

  // Return the approximate size of data to be scanned for range [start, end)
  // in levels [start_level, end_level). If end_level == -1 (the default) it
  // will search through all non-empty levels
  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end,
                           int start_level = 0, int end_level = -1);
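
  // Illustrative sketch: estimating how much data a scan of a key range would
  // touch in the current version, across all non-empty levels. 'start_key'
  // and 'end_key' are hypothetical pre-built keys for the range of interest.
  //
  //   Version* v = cfd->current();
  //   uint64_t bytes = versions->ApproximateSize(v, start_key, end_key);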

  // Return the size of the current manifest file
  uint64_t manifest_file_size() const { return manifest_file_size_; }

  // Verify that the files that we started with for a compaction
  // still exist in the current version and in the same original level.
  // This ensures that a concurrent compaction did not erroneously
  // pick the same files to compact.
  bool VerifyCompactionFileConsistency(Compaction* c);

  Status GetMetadataForFile(uint64_t number, int* filelevel,
                            FileMetaData** metadata, ColumnFamilyData** cfd);

  // This function doesn't support leveldb SST filenames
  void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);

  void GetObsoleteFiles(std::vector<FileMetaData*>* files,
                        std::vector<std::string>* manifest_filenames,
                        uint64_t min_pending_output);

  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
  const EnvOptions& env_options() { return env_options_; }

  static uint64_t GetNumLiveVersions(Version* dummy_versions);

  static uint64_t GetTotalSstFilesSize(Version* dummy_versions);

 private:
  struct ManifestWriter;

  friend class Version;
  friend class DBImpl;

  struct LogReporter : public log::Reader::Reporter {
    Status* status;
    virtual void Corruption(size_t bytes, const Status& s) override {
      if (this->status->ok()) *this->status = s;
    }
  };

  // ApproximateSize helper
  uint64_t ApproximateSizeLevel0(Version* v,
                                 const LevelFilesBrief& files_brief,
                                 const Slice& start, const Slice& end);

  uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
                           const Slice& key);

  // Save current contents to *log
  Status WriteSnapshot(log::Writer* log);

  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);

  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
                                       VersionEdit* edit);

  std::unique_ptr<ColumnFamilySet> column_family_set_;

  Env* const env_;
  const std::string dbname_;
  const ImmutableDBOptions* const db_options_;
  std::atomic<uint64_t> next_file_number_;
  uint64_t manifest_file_number_;
  uint64_t options_file_number_;
  uint64_t pending_manifest_file_number_;
  std::atomic<uint64_t> last_sequence_;
  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted

  // Opened lazily
  unique_ptr<log::Writer> descriptor_log_;

  // Generates an increasing version number for every new version
  uint64_t current_version_number_;

  // Queue of writers to the manifest file
  std::deque<ManifestWriter*> manifest_writers_;

  // Current size of manifest file
  uint64_t manifest_file_size_;

  std::vector<FileMetaData*> obsolete_files_;
  std::vector<std::string> obsolete_manifests_;

  // env options for all reads and writes except compactions
  const EnvOptions& env_options_;

  // env options used for compactions. This is a copy of
  // env_options_ but with readaheads set to readahead_compactions_.
  const EnvOptions env_options_compactions_;

  // No copying allowed
  VersionSet(const VersionSet&);
  void operator=(const VersionSet&);

  void LogAndApplyCFHelper(VersionEdit* edit);
  void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v,
                         VersionEdit* edit, InstrumentedMutex* mu);
};

}  // namespace rocksdb