]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #pragma once | |
11 | #include <algorithm> | |
12 | #include <set> | |
f67539c2 | 13 | #include <string> |
7c673cae FG |
14 | #include <utility> |
15 | #include <vector> | |
7c673cae | 16 | #include "db/dbformat.h" |
f67539c2 TL |
17 | #include "memory/arena.h" |
18 | #include "rocksdb/cache.h" | |
19 | #include "table/table_reader.h" | |
7c673cae FG |
20 | #include "util/autovector.h" |
21 | ||
f67539c2 | 22 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
23 | |
24 | class VersionSet; | |
25 | ||
f67539c2 TL |
26 | constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; |
27 | constexpr uint64_t kInvalidBlobFileNumber = 0; | |
28 | constexpr uint64_t kUnknownOldestAncesterTime = 0; | |
29 | constexpr uint64_t kUnknownFileCreationTime = 0; | |
30 | ||
31 | extern const std::string kUnknownFileChecksum; | |
32 | extern const std::string kUnknownFileChecksumFuncName; | |
7c673cae FG |
33 | |
34 | extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); | |
35 | ||
36 | // A copyable structure contains information needed to read data from an SST | |
11fdf7f2 | 37 | // file. It can contain a pointer to a table reader opened for the file, or |
7c673cae FG |
38 | // file number and size, which can be used to create a new table reader for it. |
39 | // The behavior is undefined when a copied of the structure is used when the | |
40 | // file is not in any live version any more. | |
41 | struct FileDescriptor { | |
42 | // Table reader in table_reader_handle | |
43 | TableReader* table_reader; | |
44 | uint64_t packed_number_and_path_id; | |
45 | uint64_t file_size; // File size in bytes | |
11fdf7f2 TL |
46 | SequenceNumber smallest_seqno; // The smallest seqno in this file |
47 | SequenceNumber largest_seqno; // The largest seqno in this file | |
7c673cae FG |
48 | |
49 | FileDescriptor() : FileDescriptor(0, 0, 0) {} | |
50 | ||
51 | FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) | |
11fdf7f2 TL |
52 | : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {} |
53 | ||
54 | FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size, | |
55 | SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno) | |
7c673cae FG |
56 | : table_reader(nullptr), |
57 | packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), | |
11fdf7f2 TL |
58 | file_size(_file_size), |
59 | smallest_seqno(_smallest_seqno), | |
60 | largest_seqno(_largest_seqno) {} | |
7c673cae | 61 | |
f67539c2 TL |
62 | FileDescriptor(const FileDescriptor& fd) { *this = fd; } |
63 | ||
7c673cae FG |
64 | FileDescriptor& operator=(const FileDescriptor& fd) { |
65 | table_reader = fd.table_reader; | |
66 | packed_number_and_path_id = fd.packed_number_and_path_id; | |
67 | file_size = fd.file_size; | |
11fdf7f2 TL |
68 | smallest_seqno = fd.smallest_seqno; |
69 | largest_seqno = fd.largest_seqno; | |
7c673cae FG |
70 | return *this; |
71 | } | |
72 | ||
73 | uint64_t GetNumber() const { | |
74 | return packed_number_and_path_id & kFileNumberMask; | |
75 | } | |
76 | uint32_t GetPathId() const { | |
77 | return static_cast<uint32_t>( | |
78 | packed_number_and_path_id / (kFileNumberMask + 1)); | |
79 | } | |
80 | uint64_t GetFileSize() const { return file_size; } | |
81 | }; | |
82 | ||
11fdf7f2 TL |
83 | struct FileSampledStats { |
84 | FileSampledStats() : num_reads_sampled(0) {} | |
85 | FileSampledStats(const FileSampledStats& other) { *this = other; } | |
86 | FileSampledStats& operator=(const FileSampledStats& other) { | |
87 | num_reads_sampled = other.num_reads_sampled.load(); | |
88 | return *this; | |
89 | } | |
90 | ||
91 | // number of user reads to this file. | |
92 | mutable std::atomic<uint64_t> num_reads_sampled; | |
93 | }; | |
94 | ||
7c673cae | 95 | struct FileMetaData { |
7c673cae FG |
96 | FileDescriptor fd; |
97 | InternalKey smallest; // Smallest internal key served by table | |
98 | InternalKey largest; // Largest internal key served by table | |
7c673cae FG |
99 | |
100 | // Needs to be disposed when refs becomes 0. | |
f67539c2 | 101 | Cache::Handle* table_reader_handle = nullptr; |
7c673cae | 102 | |
11fdf7f2 TL |
103 | FileSampledStats stats; |
104 | ||
7c673cae FG |
105 | // Stats for compensating deletion entries during compaction |
106 | ||
107 | // File size compensated by deletion entry. | |
108 | // This is updated in Version::UpdateAccumulatedStats() first time when the | |
109 | // file is created or loaded. After it is updated (!= 0), it is immutable. | |
f67539c2 | 110 | uint64_t compensated_file_size = 0; |
7c673cae FG |
111 | // These values can mutate, but they can only be read or written from |
112 | // single-threaded LogAndApply thread | |
f67539c2 TL |
113 | uint64_t num_entries = 0; // the number of entries. |
114 | uint64_t num_deletions = 0; // the number of deletion entries. | |
115 | uint64_t raw_key_size = 0; // total uncompressed key size. | |
116 | uint64_t raw_value_size = 0; // total uncompressed value size. | |
117 | ||
118 | int refs = 0; // Reference count | |
119 | ||
120 | bool being_compacted = false; // Is this file undergoing compaction? | |
121 | bool init_stats_from_file = false; // true if the data-entry stats of this | |
122 | // file has initialized from file. | |
123 | ||
124 | bool marked_for_compaction = false; // True if client asked us nicely to | |
125 | // compact this file. | |
126 | ||
127 | // Used only in BlobDB. The file number of the oldest blob file this SST file | |
128 | // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. | |
129 | uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; | |
130 | ||
131 | // The file could be the compaction output from other SST files, which could | |
132 | // in turn be outputs for compact older SST files. We track the memtable | |
133 | // flush timestamp for the oldest SST file that eventaully contribute data | |
134 | // to this file. 0 means the information is not available. | |
135 | uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; | |
136 | ||
137 | // Unix time when the SST file is created. | |
138 | uint64_t file_creation_time = kUnknownFileCreationTime; | |
139 | ||
140 | // File checksum | |
141 | std::string file_checksum = kUnknownFileChecksum; | |
142 | ||
143 | // File checksum function name | |
144 | std::string file_checksum_func_name = kUnknownFileChecksumFuncName; | |
145 | ||
146 | FileMetaData() = default; | |
147 | ||
148 | FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, | |
149 | const InternalKey& smallest_key, const InternalKey& largest_key, | |
150 | const SequenceNumber& smallest_seq, | |
151 | const SequenceNumber& largest_seq, bool marked_for_compact, | |
152 | uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, | |
153 | uint64_t _file_creation_time, const std::string& _file_checksum, | |
154 | const std::string& _file_checksum_func_name) | |
155 | : fd(file, file_path_id, file_size, smallest_seq, largest_seq), | |
156 | smallest(smallest_key), | |
157 | largest(largest_key), | |
158 | marked_for_compaction(marked_for_compact), | |
159 | oldest_blob_file_number(oldest_blob_file), | |
160 | oldest_ancester_time(_oldest_ancester_time), | |
161 | file_creation_time(_file_creation_time), | |
162 | file_checksum(_file_checksum), | |
163 | file_checksum_func_name(_file_checksum_func_name) { | |
164 | TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); | |
165 | } | |
7c673cae FG |
166 | |
167 | // REQUIRED: Keys must be given to the function in sorted order (it expects | |
168 | // the last key to be the largest). | |
f67539c2 TL |
169 | void UpdateBoundaries(const Slice& key, const Slice& value, |
170 | SequenceNumber seqno, ValueType value_type); | |
11fdf7f2 TL |
171 | |
172 | // Unlike UpdateBoundaries, ranges do not need to be presented in any | |
173 | // particular order. | |
174 | void UpdateBoundariesForRange(const InternalKey& start, | |
175 | const InternalKey& end, SequenceNumber seqno, | |
176 | const InternalKeyComparator& icmp) { | |
177 | if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) { | |
178 | smallest = start; | |
179 | } | |
180 | if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { | |
181 | largest = end; | |
182 | } | |
183 | fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); | |
184 | fd.largest_seqno = std::max(fd.largest_seqno, seqno); | |
7c673cae | 185 | } |
f67539c2 TL |
186 | |
187 | // Try to get oldest ancester time from the class itself or table properties | |
188 | // if table reader is already pinned. | |
189 | // 0 means the information is not available. | |
190 | uint64_t TryGetOldestAncesterTime() { | |
191 | if (oldest_ancester_time != kUnknownOldestAncesterTime) { | |
192 | return oldest_ancester_time; | |
193 | } else if (fd.table_reader != nullptr && | |
194 | fd.table_reader->GetTableProperties() != nullptr) { | |
195 | return fd.table_reader->GetTableProperties()->creation_time; | |
196 | } | |
197 | return kUnknownOldestAncesterTime; | |
198 | } | |
199 | ||
200 | uint64_t TryGetFileCreationTime() { | |
201 | if (file_creation_time != kUnknownFileCreationTime) { | |
202 | return file_creation_time; | |
203 | } else if (fd.table_reader != nullptr && | |
204 | fd.table_reader->GetTableProperties() != nullptr) { | |
205 | return fd.table_reader->GetTableProperties()->file_creation_time; | |
206 | } | |
207 | return kUnknownFileCreationTime; | |
208 | } | |
7c673cae FG |
209 | }; |
210 | ||
11fdf7f2 TL |
211 | // A compressed copy of file meta data that just contain minimum data needed |
212 | // to server read operations, while still keeping the pointer to full metadata | |
213 | // of the file in case it is needed. | |
7c673cae FG |
214 | struct FdWithKeyRange { |
215 | FileDescriptor fd; | |
11fdf7f2 | 216 | FileMetaData* file_metadata; // Point to all metadata |
7c673cae FG |
217 | Slice smallest_key; // slice that contain smallest key |
218 | Slice largest_key; // slice that contain largest key | |
219 | ||
220 | FdWithKeyRange() | |
221 | : fd(), | |
11fdf7f2 | 222 | file_metadata(nullptr), |
7c673cae FG |
223 | smallest_key(), |
224 | largest_key() { | |
225 | } | |
226 | ||
11fdf7f2 TL |
227 | FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key, |
228 | FileMetaData* _file_metadata) | |
229 | : fd(_fd), | |
230 | file_metadata(_file_metadata), | |
231 | smallest_key(_smallest_key), | |
232 | largest_key(_largest_key) {} | |
7c673cae FG |
233 | }; |
234 | ||
235 | // Data structure to store an array of FdWithKeyRange in one level | |
236 | // Actual data is guaranteed to be stored closely | |
237 | struct LevelFilesBrief { | |
238 | size_t num_files; | |
239 | FdWithKeyRange* files; | |
240 | LevelFilesBrief() { | |
241 | num_files = 0; | |
242 | files = nullptr; | |
243 | } | |
244 | }; | |
245 | ||
f67539c2 TL |
246 | // The state of a DB at any given time is referred to as a Version. |
247 | // Any modification to the Version is considered a Version Edit. A Version is | |
248 | // constructed by joining a sequence of Version Edits. Version Edits are written | |
249 | // to the MANIFEST file. | |
7c673cae FG |
250 | class VersionEdit { |
251 | public: | |
7c673cae FG |
252 | void Clear(); |
253 | ||
f67539c2 TL |
254 | void SetDBId(const std::string& db_id) { |
255 | has_db_id_ = true; | |
256 | db_id_ = db_id; | |
257 | } | |
258 | bool HasDbId() const { return has_db_id_; } | |
259 | const std::string& GetDbId() const { return db_id_; } | |
260 | ||
7c673cae FG |
261 | void SetComparatorName(const Slice& name) { |
262 | has_comparator_ = true; | |
263 | comparator_ = name.ToString(); | |
264 | } | |
f67539c2 TL |
265 | bool HasComparatorName() const { return has_comparator_; } |
266 | const std::string& GetComparatorName() const { return comparator_; } | |
267 | ||
7c673cae FG |
268 | void SetLogNumber(uint64_t num) { |
269 | has_log_number_ = true; | |
270 | log_number_ = num; | |
271 | } | |
f67539c2 TL |
272 | bool HasLogNumber() const { return has_log_number_; } |
273 | uint64_t GetLogNumber() const { return log_number_; } | |
274 | ||
7c673cae FG |
275 | void SetPrevLogNumber(uint64_t num) { |
276 | has_prev_log_number_ = true; | |
277 | prev_log_number_ = num; | |
278 | } | |
f67539c2 TL |
279 | bool HasPrevLogNumber() const { return has_prev_log_number_; } |
280 | uint64_t GetPrevLogNumber() const { return prev_log_number_; } | |
281 | ||
7c673cae FG |
282 | void SetNextFile(uint64_t num) { |
283 | has_next_file_number_ = true; | |
284 | next_file_number_ = num; | |
285 | } | |
f67539c2 TL |
286 | bool HasNextFile() const { return has_next_file_number_; } |
287 | uint64_t GetNextFile() const { return next_file_number_; } | |
288 | ||
7c673cae FG |
289 | void SetMaxColumnFamily(uint32_t max_column_family) { |
290 | has_max_column_family_ = true; | |
291 | max_column_family_ = max_column_family; | |
292 | } | |
f67539c2 TL |
293 | bool HasMaxColumnFamily() const { return has_max_column_family_; } |
294 | uint32_t GetMaxColumnFamily() const { return max_column_family_; } | |
295 | ||
11fdf7f2 TL |
296 | void SetMinLogNumberToKeep(uint64_t num) { |
297 | has_min_log_number_to_keep_ = true; | |
298 | min_log_number_to_keep_ = num; | |
299 | } | |
f67539c2 TL |
300 | bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; } |
301 | uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; } | |
11fdf7f2 | 302 | |
f67539c2 TL |
303 | void SetLastSequence(SequenceNumber seq) { |
304 | has_last_sequence_ = true; | |
305 | last_sequence_ = seq; | |
306 | } | |
307 | bool HasLastSequence() const { return has_last_sequence_; } | |
308 | SequenceNumber GetLastSequence() const { return last_sequence_; } | |
7c673cae | 309 | |
f67539c2 TL |
310 | // Delete the specified "file" from the specified "level". |
311 | void DeleteFile(int level, uint64_t file) { | |
312 | deleted_files_.emplace(level, file); | |
313 | } | |
494da23a | 314 | |
f67539c2 TL |
315 | // Retrieve the files deleted as well as their associated levels. |
316 | using DeletedFiles = std::set<std::pair<int, uint64_t>>; | |
317 | const DeletedFiles& GetDeletedFiles() const { return deleted_files_; } | |
494da23a | 318 | |
f67539c2 | 319 | // Add the specified file at the specified level. |
7c673cae FG |
320 | // REQUIRES: This version has not been saved (see VersionSet::SaveTo) |
321 | // REQUIRES: "smallest" and "largest" are smallest and largest keys in file | |
f67539c2 TL |
322 | // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file |
323 | // referred to by this file if any, kInvalidBlobFileNumber otherwise. | |
7c673cae FG |
324 | void AddFile(int level, uint64_t file, uint32_t file_path_id, |
325 | uint64_t file_size, const InternalKey& smallest, | |
326 | const InternalKey& largest, const SequenceNumber& smallest_seqno, | |
f67539c2 TL |
327 | const SequenceNumber& largest_seqno, bool marked_for_compaction, |
328 | uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, | |
329 | uint64_t file_creation_time, const std::string& file_checksum, | |
330 | const std::string& file_checksum_func_name) { | |
7c673cae | 331 | assert(smallest_seqno <= largest_seqno); |
f67539c2 TL |
332 | new_files_.emplace_back( |
333 | level, FileMetaData(file, file_path_id, file_size, smallest, largest, | |
334 | smallest_seqno, largest_seqno, | |
335 | marked_for_compaction, oldest_blob_file_number, | |
336 | oldest_ancester_time, file_creation_time, | |
337 | file_checksum, file_checksum_func_name)); | |
7c673cae FG |
338 | } |
339 | ||
340 | void AddFile(int level, const FileMetaData& f) { | |
11fdf7f2 | 341 | assert(f.fd.smallest_seqno <= f.fd.largest_seqno); |
7c673cae FG |
342 | new_files_.emplace_back(level, f); |
343 | } | |
344 | ||
f67539c2 TL |
345 | // Retrieve the files added as well as their associated levels. |
346 | using NewFiles = std::vector<std::pair<int, FileMetaData>>; | |
347 | const NewFiles& GetNewFiles() const { return new_files_; } | |
7c673cae FG |
348 | |
349 | // Number of edits | |
f67539c2 | 350 | size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); } |
7c673cae FG |
351 | |
352 | void SetColumnFamily(uint32_t column_family_id) { | |
353 | column_family_ = column_family_id; | |
354 | } | |
f67539c2 | 355 | uint32_t GetColumnFamily() const { return column_family_; } |
7c673cae FG |
356 | |
357 | // set column family ID by calling SetColumnFamily() | |
358 | void AddColumnFamily(const std::string& name) { | |
359 | assert(!is_column_family_drop_); | |
360 | assert(!is_column_family_add_); | |
361 | assert(NumEntries() == 0); | |
362 | is_column_family_add_ = true; | |
363 | column_family_name_ = name; | |
364 | } | |
365 | ||
366 | // set column family ID by calling SetColumnFamily() | |
367 | void DropColumnFamily() { | |
368 | assert(!is_column_family_drop_); | |
369 | assert(!is_column_family_add_); | |
370 | assert(NumEntries() == 0); | |
371 | is_column_family_drop_ = true; | |
372 | } | |
373 | ||
f67539c2 TL |
374 | bool IsColumnFamilyManipulation() const { |
375 | return is_column_family_add_ || is_column_family_drop_; | |
7c673cae FG |
376 | } |
377 | ||
11fdf7f2 TL |
378 | void MarkAtomicGroup(uint32_t remaining_entries) { |
379 | is_in_atomic_group_ = true; | |
380 | remaining_entries_ = remaining_entries; | |
381 | } | |
f67539c2 TL |
382 | bool IsInAtomicGroup() const { return is_in_atomic_group_; } |
383 | uint32_t GetRemainingEntries() const { return remaining_entries_; } | |
384 | ||
385 | // return true on success. | |
386 | bool EncodeTo(std::string* dst) const; | |
387 | Status DecodeFrom(const Slice& src); | |
11fdf7f2 | 388 | |
7c673cae FG |
389 | std::string DebugString(bool hex_key = false) const; |
390 | std::string DebugJSON(int edit_num, bool hex_key = false) const; | |
391 | ||
392 | private: | |
494da23a | 393 | friend class ReactiveVersionSet; |
7c673cae FG |
394 | friend class VersionSet; |
395 | friend class Version; | |
f67539c2 | 396 | friend class AtomicGroupReadBuffer; |
7c673cae FG |
397 | |
398 | bool GetLevel(Slice* input, int* level, const char** msg); | |
399 | ||
f67539c2 TL |
400 | const char* DecodeNewFile4From(Slice* input); |
401 | ||
402 | int max_level_ = 0; | |
403 | std::string db_id_; | |
7c673cae | 404 | std::string comparator_; |
f67539c2 TL |
405 | uint64_t log_number_ = 0; |
406 | uint64_t prev_log_number_ = 0; | |
407 | uint64_t next_file_number_ = 0; | |
408 | uint32_t max_column_family_ = 0; | |
11fdf7f2 | 409 | // The most recent WAL log number that is deleted |
f67539c2 TL |
410 | uint64_t min_log_number_to_keep_ = 0; |
411 | SequenceNumber last_sequence_ = 0; | |
412 | bool has_db_id_ = false; | |
413 | bool has_comparator_ = false; | |
414 | bool has_log_number_ = false; | |
415 | bool has_prev_log_number_ = false; | |
416 | bool has_next_file_number_ = false; | |
417 | bool has_max_column_family_ = false; | |
418 | bool has_min_log_number_to_keep_ = false; | |
419 | bool has_last_sequence_ = false; | |
420 | ||
421 | DeletedFiles deleted_files_; | |
422 | NewFiles new_files_; | |
7c673cae | 423 | |
11fdf7f2 | 424 | // Each version edit record should have column_family_ set |
7c673cae | 425 | // If it's not set, it is default (0) |
f67539c2 | 426 | uint32_t column_family_ = 0; |
7c673cae FG |
427 | // a version edit can be either column_family add or |
428 | // column_family drop. If it's column family add, | |
429 | // it also includes column family name. | |
f67539c2 TL |
430 | bool is_column_family_drop_ = false; |
431 | bool is_column_family_add_ = false; | |
7c673cae | 432 | std::string column_family_name_; |
11fdf7f2 | 433 | |
f67539c2 TL |
434 | bool is_in_atomic_group_ = false; |
435 | uint32_t remaining_entries_ = 0; | |
7c673cae FG |
436 | }; |
437 | ||
f67539c2 | 438 | } // namespace ROCKSDB_NAMESPACE |