]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/version_edit.h
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / rocksdb / db / version_edit.h
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11#include <algorithm>
12#include <set>
13#include <utility>
14#include <vector>
15#include <string>
16#include "rocksdb/cache.h"
17#include "db/dbformat.h"
18#include "util/arena.h"
19#include "util/autovector.h"
20
21namespace rocksdb {
22
23class VersionSet;
24
25const uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
26
27extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
28
29// A copyable structure contains information needed to read data from an SST
11fdf7f2 30// file. It can contain a pointer to a table reader opened for the file, or
7c673cae
FG
31// file number and size, which can be used to create a new table reader for it.
32// The behavior is undefined when a copied of the structure is used when the
33// file is not in any live version any more.
34struct FileDescriptor {
35 // Table reader in table_reader_handle
36 TableReader* table_reader;
37 uint64_t packed_number_and_path_id;
38 uint64_t file_size; // File size in bytes
11fdf7f2
TL
39 SequenceNumber smallest_seqno; // The smallest seqno in this file
40 SequenceNumber largest_seqno; // The largest seqno in this file
7c673cae
FG
41
42 FileDescriptor() : FileDescriptor(0, 0, 0) {}
43
44 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
11fdf7f2
TL
45 : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
46
47 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
48 SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
7c673cae
FG
49 : table_reader(nullptr),
50 packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
11fdf7f2
TL
51 file_size(_file_size),
52 smallest_seqno(_smallest_seqno),
53 largest_seqno(_largest_seqno) {}
7c673cae
FG
54
55 FileDescriptor& operator=(const FileDescriptor& fd) {
56 table_reader = fd.table_reader;
57 packed_number_and_path_id = fd.packed_number_and_path_id;
58 file_size = fd.file_size;
11fdf7f2
TL
59 smallest_seqno = fd.smallest_seqno;
60 largest_seqno = fd.largest_seqno;
7c673cae
FG
61 return *this;
62 }
63
64 uint64_t GetNumber() const {
65 return packed_number_and_path_id & kFileNumberMask;
66 }
67 uint32_t GetPathId() const {
68 return static_cast<uint32_t>(
69 packed_number_and_path_id / (kFileNumberMask + 1));
70 }
71 uint64_t GetFileSize() const { return file_size; }
72};
73
11fdf7f2
TL
74struct FileSampledStats {
75 FileSampledStats() : num_reads_sampled(0) {}
76 FileSampledStats(const FileSampledStats& other) { *this = other; }
77 FileSampledStats& operator=(const FileSampledStats& other) {
78 num_reads_sampled = other.num_reads_sampled.load();
79 return *this;
80 }
81
82 // number of user reads to this file.
83 mutable std::atomic<uint64_t> num_reads_sampled;
84};
85
7c673cae 86struct FileMetaData {
7c673cae
FG
87 FileDescriptor fd;
88 InternalKey smallest; // Smallest internal key served by table
89 InternalKey largest; // Largest internal key served by table
7c673cae
FG
90
91 // Needs to be disposed when refs becomes 0.
92 Cache::Handle* table_reader_handle;
93
11fdf7f2
TL
94 FileSampledStats stats;
95
7c673cae
FG
96 // Stats for compensating deletion entries during compaction
97
98 // File size compensated by deletion entry.
99 // This is updated in Version::UpdateAccumulatedStats() first time when the
100 // file is created or loaded. After it is updated (!= 0), it is immutable.
101 uint64_t compensated_file_size;
102 // These values can mutate, but they can only be read or written from
103 // single-threaded LogAndApply thread
104 uint64_t num_entries; // the number of entries.
105 uint64_t num_deletions; // the number of deletion entries.
106 uint64_t raw_key_size; // total uncompressed key size.
107 uint64_t raw_value_size; // total uncompressed value size.
11fdf7f2
TL
108
109 int refs; // Reference count
110
111 bool being_compacted; // Is this file undergoing compaction?
7c673cae
FG
112 bool init_stats_from_file; // true if the data-entry stats of this file
113 // has initialized from file.
114
115 bool marked_for_compaction; // True if client asked us nicely to compact this
116 // file.
117
118 FileMetaData()
11fdf7f2 119 : table_reader_handle(nullptr),
7c673cae
FG
120 compensated_file_size(0),
121 num_entries(0),
122 num_deletions(0),
123 raw_key_size(0),
124 raw_value_size(0),
11fdf7f2
TL
125 refs(0),
126 being_compacted(false),
7c673cae
FG
127 init_stats_from_file(false),
128 marked_for_compaction(false) {}
129
130 // REQUIRED: Keys must be given to the function in sorted order (it expects
131 // the last key to be the largest).
132 void UpdateBoundaries(const Slice& key, SequenceNumber seqno) {
133 if (smallest.size() == 0) {
134 smallest.DecodeFrom(key);
135 }
136 largest.DecodeFrom(key);
11fdf7f2
TL
137 fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
138 fd.largest_seqno = std::max(fd.largest_seqno, seqno);
139 }
140
141 // Unlike UpdateBoundaries, ranges do not need to be presented in any
142 // particular order.
143 void UpdateBoundariesForRange(const InternalKey& start,
144 const InternalKey& end, SequenceNumber seqno,
145 const InternalKeyComparator& icmp) {
146 if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
147 smallest = start;
148 }
149 if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
150 largest = end;
151 }
152 fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
153 fd.largest_seqno = std::max(fd.largest_seqno, seqno);
7c673cae
FG
154 }
155};
156
11fdf7f2
TL
157// A compressed copy of file meta data that just contain minimum data needed
158// to server read operations, while still keeping the pointer to full metadata
159// of the file in case it is needed.
7c673cae
FG
160struct FdWithKeyRange {
161 FileDescriptor fd;
11fdf7f2 162 FileMetaData* file_metadata; // Point to all metadata
7c673cae
FG
163 Slice smallest_key; // slice that contain smallest key
164 Slice largest_key; // slice that contain largest key
165
166 FdWithKeyRange()
167 : fd(),
11fdf7f2 168 file_metadata(nullptr),
7c673cae
FG
169 smallest_key(),
170 largest_key() {
171 }
172
11fdf7f2
TL
173 FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
174 FileMetaData* _file_metadata)
175 : fd(_fd),
176 file_metadata(_file_metadata),
177 smallest_key(_smallest_key),
178 largest_key(_largest_key) {}
7c673cae
FG
179};
180
181// Data structure to store an array of FdWithKeyRange in one level
182// Actual data is guaranteed to be stored closely
183struct LevelFilesBrief {
184 size_t num_files;
185 FdWithKeyRange* files;
186 LevelFilesBrief() {
187 num_files = 0;
188 files = nullptr;
189 }
190};
191
192class VersionEdit {
193 public:
194 VersionEdit() { Clear(); }
195 ~VersionEdit() { }
196
197 void Clear();
198
199 void SetComparatorName(const Slice& name) {
200 has_comparator_ = true;
201 comparator_ = name.ToString();
202 }
203 void SetLogNumber(uint64_t num) {
204 has_log_number_ = true;
205 log_number_ = num;
206 }
207 void SetPrevLogNumber(uint64_t num) {
208 has_prev_log_number_ = true;
209 prev_log_number_ = num;
210 }
211 void SetNextFile(uint64_t num) {
212 has_next_file_number_ = true;
213 next_file_number_ = num;
214 }
215 void SetLastSequence(SequenceNumber seq) {
216 has_last_sequence_ = true;
217 last_sequence_ = seq;
218 }
219 void SetMaxColumnFamily(uint32_t max_column_family) {
220 has_max_column_family_ = true;
221 max_column_family_ = max_column_family;
222 }
11fdf7f2
TL
223 void SetMinLogNumberToKeep(uint64_t num) {
224 has_min_log_number_to_keep_ = true;
225 min_log_number_to_keep_ = num;
226 }
227
228 bool has_log_number() { return has_log_number_; }
229
230 uint64_t log_number() { return log_number_; }
7c673cae 231
494da23a
TL
232 bool has_next_file_number() const { return has_next_file_number_; }
233
234 uint64_t next_file_number() const { return next_file_number_; }
235
7c673cae
FG
236 // Add the specified file at the specified number.
237 // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
238 // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
239 void AddFile(int level, uint64_t file, uint32_t file_path_id,
240 uint64_t file_size, const InternalKey& smallest,
241 const InternalKey& largest, const SequenceNumber& smallest_seqno,
242 const SequenceNumber& largest_seqno,
243 bool marked_for_compaction) {
244 assert(smallest_seqno <= largest_seqno);
245 FileMetaData f;
11fdf7f2
TL
246 f.fd = FileDescriptor(file, file_path_id, file_size, smallest_seqno,
247 largest_seqno);
7c673cae
FG
248 f.smallest = smallest;
249 f.largest = largest;
11fdf7f2
TL
250 f.fd.smallest_seqno = smallest_seqno;
251 f.fd.largest_seqno = largest_seqno;
7c673cae
FG
252 f.marked_for_compaction = marked_for_compaction;
253 new_files_.emplace_back(level, std::move(f));
254 }
255
256 void AddFile(int level, const FileMetaData& f) {
11fdf7f2 257 assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
7c673cae
FG
258 new_files_.emplace_back(level, f);
259 }
260
261 // Delete the specified "file" from the specified "level".
262 void DeleteFile(int level, uint64_t file) {
263 deleted_files_.insert({level, file});
264 }
265
266 // Number of edits
267 size_t NumEntries() { return new_files_.size() + deleted_files_.size(); }
268
269 bool IsColumnFamilyManipulation() {
270 return is_column_family_add_ || is_column_family_drop_;
271 }
272
273 void SetColumnFamily(uint32_t column_family_id) {
274 column_family_ = column_family_id;
275 }
276
277 // set column family ID by calling SetColumnFamily()
278 void AddColumnFamily(const std::string& name) {
279 assert(!is_column_family_drop_);
280 assert(!is_column_family_add_);
281 assert(NumEntries() == 0);
282 is_column_family_add_ = true;
283 column_family_name_ = name;
284 }
285
286 // set column family ID by calling SetColumnFamily()
287 void DropColumnFamily() {
288 assert(!is_column_family_drop_);
289 assert(!is_column_family_add_);
290 assert(NumEntries() == 0);
291 is_column_family_drop_ = true;
292 }
293
294 // return true on success.
295 bool EncodeTo(std::string* dst) const;
296 Status DecodeFrom(const Slice& src);
297
298 const char* DecodeNewFile4From(Slice* input);
299
300 typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
301
302 const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }
303 const std::vector<std::pair<int, FileMetaData>>& GetNewFiles() {
304 return new_files_;
305 }
306
11fdf7f2
TL
307 void MarkAtomicGroup(uint32_t remaining_entries) {
308 is_in_atomic_group_ = true;
309 remaining_entries_ = remaining_entries;
310 }
311
7c673cae
FG
312 std::string DebugString(bool hex_key = false) const;
313 std::string DebugJSON(int edit_num, bool hex_key = false) const;
314
315 private:
494da23a 316 friend class ReactiveVersionSet;
7c673cae
FG
317 friend class VersionSet;
318 friend class Version;
319
320 bool GetLevel(Slice* input, int* level, const char** msg);
321
322 int max_level_;
323 std::string comparator_;
324 uint64_t log_number_;
325 uint64_t prev_log_number_;
326 uint64_t next_file_number_;
327 uint32_t max_column_family_;
11fdf7f2
TL
328 // The most recent WAL log number that is deleted
329 uint64_t min_log_number_to_keep_;
7c673cae
FG
330 SequenceNumber last_sequence_;
331 bool has_comparator_;
332 bool has_log_number_;
333 bool has_prev_log_number_;
334 bool has_next_file_number_;
335 bool has_last_sequence_;
336 bool has_max_column_family_;
11fdf7f2 337 bool has_min_log_number_to_keep_;
7c673cae
FG
338
339 DeletedFileSet deleted_files_;
340 std::vector<std::pair<int, FileMetaData>> new_files_;
341
11fdf7f2 342 // Each version edit record should have column_family_ set
7c673cae
FG
343 // If it's not set, it is default (0)
344 uint32_t column_family_;
345 // a version edit can be either column_family add or
346 // column_family drop. If it's column family add,
347 // it also includes column family name.
348 bool is_column_family_drop_;
349 bool is_column_family_add_;
350 std::string column_family_name_;
11fdf7f2
TL
351
352 bool is_in_atomic_group_;
353 uint32_t remaining_entries_;
7c673cae
FG
354};
355
356} // namespace rocksdb