]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/version_edit.h
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / rocksdb / db / version_edit.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11#include <algorithm>
12#include <set>
f67539c2 13#include <string>
7c673cae
FG
14#include <utility>
15#include <vector>
7c673cae 16#include "db/dbformat.h"
f67539c2
TL
17#include "memory/arena.h"
18#include "rocksdb/cache.h"
19#include "table/table_reader.h"
7c673cae
FG
20#include "util/autovector.h"
21
f67539c2 22namespace ROCKSDB_NAMESPACE {
7c673cae
FG
23
24class VersionSet;
25
f67539c2
TL
26constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
27constexpr uint64_t kInvalidBlobFileNumber = 0;
28constexpr uint64_t kUnknownOldestAncesterTime = 0;
29constexpr uint64_t kUnknownFileCreationTime = 0;
30
31extern const std::string kUnknownFileChecksum;
32extern const std::string kUnknownFileChecksumFuncName;
7c673cae
FG
33
34extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
35
36// A copyable structure contains information needed to read data from an SST
11fdf7f2 37// file. It can contain a pointer to a table reader opened for the file, or
7c673cae
FG
38// file number and size, which can be used to create a new table reader for it.
39// The behavior is undefined when a copied of the structure is used when the
40// file is not in any live version any more.
41struct FileDescriptor {
42 // Table reader in table_reader_handle
43 TableReader* table_reader;
44 uint64_t packed_number_and_path_id;
45 uint64_t file_size; // File size in bytes
11fdf7f2
TL
46 SequenceNumber smallest_seqno; // The smallest seqno in this file
47 SequenceNumber largest_seqno; // The largest seqno in this file
7c673cae
FG
48
49 FileDescriptor() : FileDescriptor(0, 0, 0) {}
50
51 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
11fdf7f2
TL
52 : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
53
54 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
55 SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
7c673cae
FG
56 : table_reader(nullptr),
57 packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
11fdf7f2
TL
58 file_size(_file_size),
59 smallest_seqno(_smallest_seqno),
60 largest_seqno(_largest_seqno) {}
7c673cae 61
f67539c2
TL
62 FileDescriptor(const FileDescriptor& fd) { *this = fd; }
63
7c673cae
FG
64 FileDescriptor& operator=(const FileDescriptor& fd) {
65 table_reader = fd.table_reader;
66 packed_number_and_path_id = fd.packed_number_and_path_id;
67 file_size = fd.file_size;
11fdf7f2
TL
68 smallest_seqno = fd.smallest_seqno;
69 largest_seqno = fd.largest_seqno;
7c673cae
FG
70 return *this;
71 }
72
73 uint64_t GetNumber() const {
74 return packed_number_and_path_id & kFileNumberMask;
75 }
76 uint32_t GetPathId() const {
77 return static_cast<uint32_t>(
78 packed_number_and_path_id / (kFileNumberMask + 1));
79 }
80 uint64_t GetFileSize() const { return file_size; }
81};
82
11fdf7f2
TL
83struct FileSampledStats {
84 FileSampledStats() : num_reads_sampled(0) {}
85 FileSampledStats(const FileSampledStats& other) { *this = other; }
86 FileSampledStats& operator=(const FileSampledStats& other) {
87 num_reads_sampled = other.num_reads_sampled.load();
88 return *this;
89 }
90
91 // number of user reads to this file.
92 mutable std::atomic<uint64_t> num_reads_sampled;
93};
94
7c673cae 95struct FileMetaData {
7c673cae
FG
96 FileDescriptor fd;
97 InternalKey smallest; // Smallest internal key served by table
98 InternalKey largest; // Largest internal key served by table
7c673cae
FG
99
100 // Needs to be disposed when refs becomes 0.
f67539c2 101 Cache::Handle* table_reader_handle = nullptr;
7c673cae 102
11fdf7f2
TL
103 FileSampledStats stats;
104
7c673cae
FG
105 // Stats for compensating deletion entries during compaction
106
107 // File size compensated by deletion entry.
108 // This is updated in Version::UpdateAccumulatedStats() first time when the
109 // file is created or loaded. After it is updated (!= 0), it is immutable.
f67539c2 110 uint64_t compensated_file_size = 0;
7c673cae
FG
111 // These values can mutate, but they can only be read or written from
112 // single-threaded LogAndApply thread
f67539c2
TL
113 uint64_t num_entries = 0; // the number of entries.
114 uint64_t num_deletions = 0; // the number of deletion entries.
115 uint64_t raw_key_size = 0; // total uncompressed key size.
116 uint64_t raw_value_size = 0; // total uncompressed value size.
117
118 int refs = 0; // Reference count
119
120 bool being_compacted = false; // Is this file undergoing compaction?
121 bool init_stats_from_file = false; // true if the data-entry stats of this
122 // file has initialized from file.
123
124 bool marked_for_compaction = false; // True if client asked us nicely to
125 // compact this file.
126
127 // Used only in BlobDB. The file number of the oldest blob file this SST file
128 // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
129 uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
130
131 // The file could be the compaction output from other SST files, which could
132 // in turn be outputs for compact older SST files. We track the memtable
133 // flush timestamp for the oldest SST file that eventaully contribute data
134 // to this file. 0 means the information is not available.
135 uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
136
137 // Unix time when the SST file is created.
138 uint64_t file_creation_time = kUnknownFileCreationTime;
139
140 // File checksum
141 std::string file_checksum = kUnknownFileChecksum;
142
143 // File checksum function name
144 std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
145
146 FileMetaData() = default;
147
148 FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
149 const InternalKey& smallest_key, const InternalKey& largest_key,
150 const SequenceNumber& smallest_seq,
151 const SequenceNumber& largest_seq, bool marked_for_compact,
152 uint64_t oldest_blob_file, uint64_t _oldest_ancester_time,
153 uint64_t _file_creation_time, const std::string& _file_checksum,
154 const std::string& _file_checksum_func_name)
155 : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
156 smallest(smallest_key),
157 largest(largest_key),
158 marked_for_compaction(marked_for_compact),
159 oldest_blob_file_number(oldest_blob_file),
160 oldest_ancester_time(_oldest_ancester_time),
161 file_creation_time(_file_creation_time),
162 file_checksum(_file_checksum),
163 file_checksum_func_name(_file_checksum_func_name) {
164 TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
165 }
7c673cae
FG
166
167 // REQUIRED: Keys must be given to the function in sorted order (it expects
168 // the last key to be the largest).
f67539c2
TL
169 void UpdateBoundaries(const Slice& key, const Slice& value,
170 SequenceNumber seqno, ValueType value_type);
11fdf7f2
TL
171
172 // Unlike UpdateBoundaries, ranges do not need to be presented in any
173 // particular order.
174 void UpdateBoundariesForRange(const InternalKey& start,
175 const InternalKey& end, SequenceNumber seqno,
176 const InternalKeyComparator& icmp) {
177 if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
178 smallest = start;
179 }
180 if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
181 largest = end;
182 }
183 fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
184 fd.largest_seqno = std::max(fd.largest_seqno, seqno);
7c673cae 185 }
f67539c2
TL
186
187 // Try to get oldest ancester time from the class itself or table properties
188 // if table reader is already pinned.
189 // 0 means the information is not available.
190 uint64_t TryGetOldestAncesterTime() {
191 if (oldest_ancester_time != kUnknownOldestAncesterTime) {
192 return oldest_ancester_time;
193 } else if (fd.table_reader != nullptr &&
194 fd.table_reader->GetTableProperties() != nullptr) {
195 return fd.table_reader->GetTableProperties()->creation_time;
196 }
197 return kUnknownOldestAncesterTime;
198 }
199
200 uint64_t TryGetFileCreationTime() {
201 if (file_creation_time != kUnknownFileCreationTime) {
202 return file_creation_time;
203 } else if (fd.table_reader != nullptr &&
204 fd.table_reader->GetTableProperties() != nullptr) {
205 return fd.table_reader->GetTableProperties()->file_creation_time;
206 }
207 return kUnknownFileCreationTime;
208 }
7c673cae
FG
209};
210
11fdf7f2
TL
211// A compressed copy of file meta data that just contain minimum data needed
212// to server read operations, while still keeping the pointer to full metadata
213// of the file in case it is needed.
7c673cae
FG
214struct FdWithKeyRange {
215 FileDescriptor fd;
11fdf7f2 216 FileMetaData* file_metadata; // Point to all metadata
7c673cae
FG
217 Slice smallest_key; // slice that contain smallest key
218 Slice largest_key; // slice that contain largest key
219
220 FdWithKeyRange()
221 : fd(),
11fdf7f2 222 file_metadata(nullptr),
7c673cae
FG
223 smallest_key(),
224 largest_key() {
225 }
226
11fdf7f2
TL
227 FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
228 FileMetaData* _file_metadata)
229 : fd(_fd),
230 file_metadata(_file_metadata),
231 smallest_key(_smallest_key),
232 largest_key(_largest_key) {}
7c673cae
FG
233};
234
235// Data structure to store an array of FdWithKeyRange in one level
236// Actual data is guaranteed to be stored closely
237struct LevelFilesBrief {
238 size_t num_files;
239 FdWithKeyRange* files;
240 LevelFilesBrief() {
241 num_files = 0;
242 files = nullptr;
243 }
244};
245
f67539c2
TL
246// The state of a DB at any given time is referred to as a Version.
247// Any modification to the Version is considered a Version Edit. A Version is
248// constructed by joining a sequence of Version Edits. Version Edits are written
249// to the MANIFEST file.
7c673cae
FG
250class VersionEdit {
251 public:
7c673cae
FG
252 void Clear();
253
f67539c2
TL
254 void SetDBId(const std::string& db_id) {
255 has_db_id_ = true;
256 db_id_ = db_id;
257 }
258 bool HasDbId() const { return has_db_id_; }
259 const std::string& GetDbId() const { return db_id_; }
260
7c673cae
FG
261 void SetComparatorName(const Slice& name) {
262 has_comparator_ = true;
263 comparator_ = name.ToString();
264 }
f67539c2
TL
265 bool HasComparatorName() const { return has_comparator_; }
266 const std::string& GetComparatorName() const { return comparator_; }
267
7c673cae
FG
268 void SetLogNumber(uint64_t num) {
269 has_log_number_ = true;
270 log_number_ = num;
271 }
f67539c2
TL
272 bool HasLogNumber() const { return has_log_number_; }
273 uint64_t GetLogNumber() const { return log_number_; }
274
7c673cae
FG
275 void SetPrevLogNumber(uint64_t num) {
276 has_prev_log_number_ = true;
277 prev_log_number_ = num;
278 }
f67539c2
TL
279 bool HasPrevLogNumber() const { return has_prev_log_number_; }
280 uint64_t GetPrevLogNumber() const { return prev_log_number_; }
281
7c673cae
FG
282 void SetNextFile(uint64_t num) {
283 has_next_file_number_ = true;
284 next_file_number_ = num;
285 }
f67539c2
TL
286 bool HasNextFile() const { return has_next_file_number_; }
287 uint64_t GetNextFile() const { return next_file_number_; }
288
7c673cae
FG
289 void SetMaxColumnFamily(uint32_t max_column_family) {
290 has_max_column_family_ = true;
291 max_column_family_ = max_column_family;
292 }
f67539c2
TL
293 bool HasMaxColumnFamily() const { return has_max_column_family_; }
294 uint32_t GetMaxColumnFamily() const { return max_column_family_; }
295
11fdf7f2
TL
296 void SetMinLogNumberToKeep(uint64_t num) {
297 has_min_log_number_to_keep_ = true;
298 min_log_number_to_keep_ = num;
299 }
f67539c2
TL
300 bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
301 uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
11fdf7f2 302
f67539c2
TL
303 void SetLastSequence(SequenceNumber seq) {
304 has_last_sequence_ = true;
305 last_sequence_ = seq;
306 }
307 bool HasLastSequence() const { return has_last_sequence_; }
308 SequenceNumber GetLastSequence() const { return last_sequence_; }
7c673cae 309
f67539c2
TL
310 // Delete the specified "file" from the specified "level".
311 void DeleteFile(int level, uint64_t file) {
312 deleted_files_.emplace(level, file);
313 }
494da23a 314
f67539c2
TL
315 // Retrieve the files deleted as well as their associated levels.
316 using DeletedFiles = std::set<std::pair<int, uint64_t>>;
317 const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
494da23a 318
f67539c2 319 // Add the specified file at the specified level.
7c673cae
FG
320 // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
321 // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
f67539c2
TL
322 // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
323 // referred to by this file if any, kInvalidBlobFileNumber otherwise.
7c673cae
FG
324 void AddFile(int level, uint64_t file, uint32_t file_path_id,
325 uint64_t file_size, const InternalKey& smallest,
326 const InternalKey& largest, const SequenceNumber& smallest_seqno,
f67539c2
TL
327 const SequenceNumber& largest_seqno, bool marked_for_compaction,
328 uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time,
329 uint64_t file_creation_time, const std::string& file_checksum,
330 const std::string& file_checksum_func_name) {
7c673cae 331 assert(smallest_seqno <= largest_seqno);
f67539c2
TL
332 new_files_.emplace_back(
333 level, FileMetaData(file, file_path_id, file_size, smallest, largest,
334 smallest_seqno, largest_seqno,
335 marked_for_compaction, oldest_blob_file_number,
336 oldest_ancester_time, file_creation_time,
337 file_checksum, file_checksum_func_name));
7c673cae
FG
338 }
339
340 void AddFile(int level, const FileMetaData& f) {
11fdf7f2 341 assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
7c673cae
FG
342 new_files_.emplace_back(level, f);
343 }
344
f67539c2
TL
345 // Retrieve the files added as well as their associated levels.
346 using NewFiles = std::vector<std::pair<int, FileMetaData>>;
347 const NewFiles& GetNewFiles() const { return new_files_; }
7c673cae
FG
348
349 // Number of edits
f67539c2 350 size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); }
7c673cae
FG
351
352 void SetColumnFamily(uint32_t column_family_id) {
353 column_family_ = column_family_id;
354 }
f67539c2 355 uint32_t GetColumnFamily() const { return column_family_; }
7c673cae
FG
356
357 // set column family ID by calling SetColumnFamily()
358 void AddColumnFamily(const std::string& name) {
359 assert(!is_column_family_drop_);
360 assert(!is_column_family_add_);
361 assert(NumEntries() == 0);
362 is_column_family_add_ = true;
363 column_family_name_ = name;
364 }
365
366 // set column family ID by calling SetColumnFamily()
367 void DropColumnFamily() {
368 assert(!is_column_family_drop_);
369 assert(!is_column_family_add_);
370 assert(NumEntries() == 0);
371 is_column_family_drop_ = true;
372 }
373
f67539c2
TL
374 bool IsColumnFamilyManipulation() const {
375 return is_column_family_add_ || is_column_family_drop_;
7c673cae
FG
376 }
377
11fdf7f2
TL
378 void MarkAtomicGroup(uint32_t remaining_entries) {
379 is_in_atomic_group_ = true;
380 remaining_entries_ = remaining_entries;
381 }
f67539c2
TL
382 bool IsInAtomicGroup() const { return is_in_atomic_group_; }
383 uint32_t GetRemainingEntries() const { return remaining_entries_; }
384
385 // return true on success.
386 bool EncodeTo(std::string* dst) const;
387 Status DecodeFrom(const Slice& src);
11fdf7f2 388
7c673cae
FG
389 std::string DebugString(bool hex_key = false) const;
390 std::string DebugJSON(int edit_num, bool hex_key = false) const;
391
392 private:
494da23a 393 friend class ReactiveVersionSet;
7c673cae
FG
394 friend class VersionSet;
395 friend class Version;
f67539c2 396 friend class AtomicGroupReadBuffer;
7c673cae
FG
397
398 bool GetLevel(Slice* input, int* level, const char** msg);
399
f67539c2
TL
400 const char* DecodeNewFile4From(Slice* input);
401
402 int max_level_ = 0;
403 std::string db_id_;
7c673cae 404 std::string comparator_;
f67539c2
TL
405 uint64_t log_number_ = 0;
406 uint64_t prev_log_number_ = 0;
407 uint64_t next_file_number_ = 0;
408 uint32_t max_column_family_ = 0;
11fdf7f2 409 // The most recent WAL log number that is deleted
f67539c2
TL
410 uint64_t min_log_number_to_keep_ = 0;
411 SequenceNumber last_sequence_ = 0;
412 bool has_db_id_ = false;
413 bool has_comparator_ = false;
414 bool has_log_number_ = false;
415 bool has_prev_log_number_ = false;
416 bool has_next_file_number_ = false;
417 bool has_max_column_family_ = false;
418 bool has_min_log_number_to_keep_ = false;
419 bool has_last_sequence_ = false;
420
421 DeletedFiles deleted_files_;
422 NewFiles new_files_;
7c673cae 423
11fdf7f2 424 // Each version edit record should have column_family_ set
7c673cae 425 // If it's not set, it is default (0)
f67539c2 426 uint32_t column_family_ = 0;
7c673cae
FG
427 // a version edit can be either column_family add or
428 // column_family drop. If it's column family add,
429 // it also includes column family name.
f67539c2
TL
430 bool is_column_family_drop_ = false;
431 bool is_column_family_add_ = false;
7c673cae 432 std::string column_family_name_;
11fdf7f2 433
f67539c2
TL
434 bool is_in_atomic_group_ = false;
435 uint32_t remaining_entries_ = 0;
7c673cae
FG
436};
437
f67539c2 438} // namespace ROCKSDB_NAMESPACE