]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #include "db/version_builder.h" | |
11 | ||
7c673cae FG |
12 | #include <algorithm> |
13 | #include <atomic> | |
f67539c2 | 14 | #include <cinttypes> |
7c673cae | 15 | #include <functional> |
11fdf7f2 | 16 | #include <map> |
20effc67 | 17 | #include <memory> |
7c673cae | 18 | #include <set> |
20effc67 | 19 | #include <sstream> |
7c673cae FG |
20 | #include <thread> |
21 | #include <unordered_map> | |
22 | #include <unordered_set> | |
23 | #include <utility> | |
24 | #include <vector> | |
25 | ||
1e59de90 | 26 | #include "cache/cache_reservation_manager.h" |
20effc67 | 27 | #include "db/blob/blob_file_meta.h" |
7c673cae FG |
28 | #include "db/dbformat.h" |
29 | #include "db/internal_stats.h" | |
30 | #include "db/table_cache.h" | |
31 | #include "db/version_set.h" | |
32 | #include "port/port.h" | |
33 | #include "table/table_reader.h" | |
f67539c2 | 34 | #include "util/string_util.h" |
7c673cae | 35 | |
f67539c2 | 36 | namespace ROCKSDB_NAMESPACE { |
7c673cae | 37 | |
1e59de90 TL |
38 | class VersionBuilder::Rep { |
39 | class NewestFirstBySeqNo { | |
40 | public: | |
41 | bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { | |
42 | assert(lhs); | |
43 | assert(rhs); | |
7c673cae | 44 | |
1e59de90 TL |
45 | if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { |
46 | return lhs->fd.largest_seqno > rhs->fd.largest_seqno; | |
47 | } | |
7c673cae | 48 | |
1e59de90 TL |
49 | if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { |
50 | return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; | |
7c673cae | 51 | } |
1e59de90 TL |
52 | |
53 | // Break ties by file number | |
54 | return lhs->fd.GetNumber() > rhs->fd.GetNumber(); | |
55 | } | |
56 | }; | |
57 | ||
58 | class BySmallestKey { | |
59 | public: | |
60 | explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {} | |
61 | ||
62 | bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { | |
63 | assert(lhs); | |
64 | assert(rhs); | |
65 | assert(cmp_); | |
66 | ||
67 | const int r = cmp_->Compare(lhs->smallest, rhs->smallest); | |
68 | if (r != 0) { | |
69 | return (r < 0); | |
70 | } | |
71 | ||
72 | // Break ties by file number | |
73 | return (lhs->fd.GetNumber() < rhs->fd.GetNumber()); | |
7c673cae | 74 | } |
1e59de90 TL |
75 | |
76 | private: | |
77 | const InternalKeyComparator* cmp_; | |
7c673cae FG |
78 | }; |
79 | ||
80 | struct LevelState { | |
81 | std::unordered_set<uint64_t> deleted_files; | |
82 | // Map from file number to file meta data. | |
83 | std::unordered_map<uint64_t, FileMetaData*> added_files; | |
84 | }; | |
85 | ||
1e59de90 TL |
86 | // A class that represents the accumulated changes (like additional garbage or |
87 | // newly linked/unlinked SST files) for a given blob file after applying a | |
88 | // series of VersionEdits. | |
20effc67 TL |
89 | class BlobFileMetaDataDelta { |
90 | public: | |
91 | bool IsEmpty() const { | |
1e59de90 TL |
92 | return !additional_garbage_count_ && !additional_garbage_bytes_ && |
93 | newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty(); | |
20effc67 TL |
94 | } |
95 | ||
96 | uint64_t GetAdditionalGarbageCount() const { | |
97 | return additional_garbage_count_; | |
98 | } | |
99 | ||
100 | uint64_t GetAdditionalGarbageBytes() const { | |
101 | return additional_garbage_bytes_; | |
102 | } | |
103 | ||
104 | const std::unordered_set<uint64_t>& GetNewlyLinkedSsts() const { | |
105 | return newly_linked_ssts_; | |
106 | } | |
107 | ||
108 | const std::unordered_set<uint64_t>& GetNewlyUnlinkedSsts() const { | |
109 | return newly_unlinked_ssts_; | |
110 | } | |
111 | ||
20effc67 TL |
112 | void AddGarbage(uint64_t count, uint64_t bytes) { |
113 | additional_garbage_count_ += count; | |
114 | additional_garbage_bytes_ += bytes; | |
115 | } | |
116 | ||
117 | void LinkSst(uint64_t sst_file_number) { | |
118 | assert(newly_linked_ssts_.find(sst_file_number) == | |
119 | newly_linked_ssts_.end()); | |
120 | ||
121 | // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be | |
122 | // linked to and unlinked from the same blob file in the case of a trivial | |
123 | // move.) | |
124 | auto it = newly_unlinked_ssts_.find(sst_file_number); | |
125 | ||
126 | if (it != newly_unlinked_ssts_.end()) { | |
127 | newly_unlinked_ssts_.erase(it); | |
128 | } else { | |
129 | newly_linked_ssts_.emplace(sst_file_number); | |
130 | } | |
131 | } | |
132 | ||
133 | void UnlinkSst(uint64_t sst_file_number) { | |
134 | assert(newly_unlinked_ssts_.find(sst_file_number) == | |
135 | newly_unlinked_ssts_.end()); | |
136 | ||
137 | // Reconcile with newly linked SSTs on the fly. (Note: an SST can be | |
138 | // linked to and unlinked from the same blob file in the case of a trivial | |
139 | // move.) | |
140 | auto it = newly_linked_ssts_.find(sst_file_number); | |
141 | ||
142 | if (it != newly_linked_ssts_.end()) { | |
143 | newly_linked_ssts_.erase(it); | |
144 | } else { | |
145 | newly_unlinked_ssts_.emplace(sst_file_number); | |
146 | } | |
147 | } | |
148 | ||
149 | private: | |
20effc67 TL |
150 | uint64_t additional_garbage_count_ = 0; |
151 | uint64_t additional_garbage_bytes_ = 0; | |
152 | std::unordered_set<uint64_t> newly_linked_ssts_; | |
153 | std::unordered_set<uint64_t> newly_unlinked_ssts_; | |
154 | }; | |
155 | ||
1e59de90 TL |
156 | // A class that represents the state of a blob file after applying a series of |
157 | // VersionEdits. In addition to the resulting state, it also contains the | |
158 | // delta (see BlobFileMetaDataDelta above). The resulting state can be used to | |
159 | // identify obsolete blob files, while the delta makes it possible to | |
160 | // efficiently detect trivial moves. | |
161 | class MutableBlobFileMetaData { | |
162 | public: | |
163 | // To be used for brand new blob files | |
164 | explicit MutableBlobFileMetaData( | |
165 | std::shared_ptr<SharedBlobFileMetaData>&& shared_meta) | |
166 | : shared_meta_(std::move(shared_meta)) {} | |
167 | ||
168 | // To be used for pre-existing blob files | |
169 | explicit MutableBlobFileMetaData( | |
170 | const std::shared_ptr<BlobFileMetaData>& meta) | |
171 | : shared_meta_(meta->GetSharedMeta()), | |
172 | linked_ssts_(meta->GetLinkedSsts()), | |
173 | garbage_blob_count_(meta->GetGarbageBlobCount()), | |
174 | garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {} | |
175 | ||
176 | const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const { | |
177 | return shared_meta_; | |
178 | } | |
179 | ||
180 | uint64_t GetBlobFileNumber() const { | |
181 | assert(shared_meta_); | |
182 | return shared_meta_->GetBlobFileNumber(); | |
183 | } | |
184 | ||
185 | bool HasDelta() const { return !delta_.IsEmpty(); } | |
186 | ||
187 | const std::unordered_set<uint64_t>& GetLinkedSsts() const { | |
188 | return linked_ssts_; | |
189 | } | |
190 | ||
191 | uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; } | |
192 | ||
193 | uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; } | |
194 | ||
195 | bool AddGarbage(uint64_t count, uint64_t bytes) { | |
196 | assert(shared_meta_); | |
197 | ||
198 | if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() || | |
199 | garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) { | |
200 | return false; | |
201 | } | |
202 | ||
203 | delta_.AddGarbage(count, bytes); | |
204 | ||
205 | garbage_blob_count_ += count; | |
206 | garbage_blob_bytes_ += bytes; | |
207 | ||
208 | return true; | |
209 | } | |
210 | ||
211 | void LinkSst(uint64_t sst_file_number) { | |
212 | delta_.LinkSst(sst_file_number); | |
213 | ||
214 | assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end()); | |
215 | linked_ssts_.emplace(sst_file_number); | |
216 | } | |
217 | ||
218 | void UnlinkSst(uint64_t sst_file_number) { | |
219 | delta_.UnlinkSst(sst_file_number); | |
220 | ||
221 | assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end()); | |
222 | linked_ssts_.erase(sst_file_number); | |
223 | } | |
224 | ||
225 | private: | |
226 | std::shared_ptr<SharedBlobFileMetaData> shared_meta_; | |
227 | // Accumulated changes | |
228 | BlobFileMetaDataDelta delta_; | |
229 | // Resulting state after applying the changes | |
230 | BlobFileMetaData::LinkedSsts linked_ssts_; | |
231 | uint64_t garbage_blob_count_ = 0; | |
232 | uint64_t garbage_blob_bytes_ = 0; | |
233 | }; | |
234 | ||
f67539c2 | 235 | const FileOptions& file_options_; |
20effc67 | 236 | const ImmutableCFOptions* const ioptions_; |
7c673cae FG |
237 | TableCache* table_cache_; |
238 | VersionStorageInfo* base_vstorage_; | |
20effc67 | 239 | VersionSet* version_set_; |
11fdf7f2 | 240 | int num_levels_; |
7c673cae | 241 | LevelState* levels_; |
20effc67 | 242 | // Store sizes of levels larger than num_levels_. We do this instead of |
11fdf7f2 TL |
243 | // storing them in levels_ to avoid regression in case there are no files |
244 | // on invalid levels. The version is not consistent if in the end the files | |
245 | // on invalid levels don't cancel out. | |
20effc67 | 246 | std::unordered_map<int, size_t> invalid_level_sizes_; |
11fdf7f2 TL |
247 | // Whether there are invalid new files or invalid deletion on levels larger |
248 | // than num_levels_. | |
249 | bool has_invalid_levels_; | |
20effc67 TL |
250 | // Current levels of table files affected by additions/deletions. |
251 | std::unordered_map<uint64_t, int> table_file_levels_; | |
1e59de90 TL |
252 | // Current compact cursors that should be changed after the last compaction |
253 | std::unordered_map<int, InternalKey> updated_compact_cursors_; | |
254 | NewestFirstBySeqNo level_zero_cmp_; | |
255 | BySmallestKey level_nonzero_cmp_; | |
256 | ||
257 | // Mutable metadata objects for all blob files affected by the series of | |
258 | // version edits. | |
259 | std::map<uint64_t, MutableBlobFileMetaData> mutable_blob_file_metas_; | |
7c673cae | 260 | |
1e59de90 | 261 | std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_; |
20effc67 | 262 | |
7c673cae | 263 | public: |
20effc67 TL |
264 | Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, |
265 | TableCache* table_cache, VersionStorageInfo* base_vstorage, | |
1e59de90 TL |
266 | VersionSet* version_set, |
267 | std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr) | |
f67539c2 | 268 | : file_options_(file_options), |
20effc67 | 269 | ioptions_(ioptions), |
7c673cae | 270 | table_cache_(table_cache), |
11fdf7f2 | 271 | base_vstorage_(base_vstorage), |
20effc67 | 272 | version_set_(version_set), |
11fdf7f2 | 273 | num_levels_(base_vstorage->num_levels()), |
1e59de90 TL |
274 | has_invalid_levels_(false), |
275 | level_nonzero_cmp_(base_vstorage_->InternalComparator()), | |
276 | file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) { | |
20effc67 TL |
277 | assert(ioptions_); |
278 | ||
11fdf7f2 | 279 | levels_ = new LevelState[num_levels_]; |
7c673cae FG |
280 | } |
281 | ||
282 | ~Rep() { | |
11fdf7f2 | 283 | for (int level = 0; level < num_levels_; level++) { |
7c673cae FG |
284 | const auto& added = levels_[level].added_files; |
285 | for (auto& pair : added) { | |
286 | UnrefFile(pair.second); | |
287 | } | |
288 | } | |
289 | ||
290 | delete[] levels_; | |
291 | } | |
292 | ||
293 | void UnrefFile(FileMetaData* f) { | |
294 | f->refs--; | |
295 | if (f->refs <= 0) { | |
296 | if (f->table_reader_handle) { | |
297 | assert(table_cache_ != nullptr); | |
298 | table_cache_->ReleaseHandle(f->table_reader_handle); | |
299 | f->table_reader_handle = nullptr; | |
300 | } | |
7c673cae | 301 | |
1e59de90 TL |
302 | if (file_metadata_cache_res_mgr_) { |
303 | Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( | |
304 | f->ApproximateMemoryUsage(), false /* increase */); | |
305 | s.PermitUncheckedError(); | |
20effc67 | 306 | } |
1e59de90 | 307 | delete f; |
7c673cae | 308 | } |
20effc67 TL |
309 | } |
310 | ||
1e59de90 TL |
311 | // Mapping used for checking the consistency of links between SST files and |
312 | // blob files. It is built using the forward links (table file -> blob file), | |
313 | // and is subsequently compared with the inverse mapping stored in the | |
314 | // BlobFileMetaData objects. | |
20effc67 TL |
315 | using ExpectedLinkedSsts = |
316 | std::unordered_map<uint64_t, BlobFileMetaData::LinkedSsts>; | |
317 | ||
318 | static void UpdateExpectedLinkedSsts( | |
319 | uint64_t table_file_number, uint64_t blob_file_number, | |
320 | ExpectedLinkedSsts* expected_linked_ssts) { | |
321 | assert(expected_linked_ssts); | |
322 | ||
323 | if (blob_file_number == kInvalidBlobFileNumber) { | |
324 | return; | |
325 | } | |
326 | ||
327 | (*expected_linked_ssts)[blob_file_number].emplace(table_file_number); | |
328 | } | |
329 | ||
1e59de90 TL |
330 | template <typename Checker> |
331 | Status CheckConsistencyDetailsForLevel( | |
332 | const VersionStorageInfo* vstorage, int level, Checker checker, | |
333 | const std::string& sync_point, | |
334 | ExpectedLinkedSsts* expected_linked_ssts) const { | |
335 | #ifdef NDEBUG | |
336 | (void)sync_point; | |
337 | #endif | |
20effc67 | 338 | |
1e59de90 TL |
339 | assert(vstorage); |
340 | assert(level >= 0 && level < num_levels_); | |
341 | assert(expected_linked_ssts); | |
20effc67 | 342 | |
1e59de90 TL |
343 | const auto& level_files = vstorage->LevelFiles(level); |
344 | ||
345 | if (level_files.empty()) { | |
346 | return Status::OK(); | |
347 | } | |
348 | ||
349 | assert(level_files[0]); | |
350 | UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(), | |
351 | level_files[0]->oldest_blob_file_number, | |
352 | expected_linked_ssts); | |
353 | ||
354 | for (size_t i = 1; i < level_files.size(); ++i) { | |
355 | assert(level_files[i]); | |
356 | UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(), | |
357 | level_files[i]->oldest_blob_file_number, | |
358 | expected_linked_ssts); | |
359 | ||
360 | auto lhs = level_files[i - 1]; | |
361 | auto rhs = level_files[i]; | |
20effc67 | 362 | |
f67539c2 | 363 | #ifndef NDEBUG |
1e59de90 TL |
364 | auto pair = std::make_pair(&lhs, &rhs); |
365 | TEST_SYNC_POINT_CALLBACK(sync_point, &pair); | |
f67539c2 | 366 | #endif |
1e59de90 TL |
367 | |
368 | const Status s = checker(lhs, rhs); | |
369 | if (!s.ok()) { | |
370 | return s; | |
371 | } | |
372 | } | |
373 | ||
374 | return Status::OK(); | |
375 | } | |
376 | ||
377 | // Make sure table files are sorted correctly and that the links between | |
378 | // table files and blob files are consistent. | |
379 | Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const { | |
380 | assert(vstorage); | |
381 | ||
382 | ExpectedLinkedSsts expected_linked_ssts; | |
383 | ||
384 | if (num_levels_ > 0) { | |
385 | // Check L0 | |
386 | { | |
387 | auto l0_checker = [this](const FileMetaData* lhs, | |
388 | const FileMetaData* rhs) { | |
389 | assert(lhs); | |
390 | assert(rhs); | |
391 | ||
392 | if (!level_zero_cmp_(lhs, rhs)) { | |
393 | std::ostringstream oss; | |
394 | oss << "L0 files are not sorted properly: files #" | |
395 | << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); | |
396 | ||
397 | return Status::Corruption("VersionBuilder", oss.str()); | |
7c673cae FG |
398 | } |
399 | ||
1e59de90 | 400 | if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) { |
7c673cae | 401 | // This is an external file that we ingested |
1e59de90 TL |
402 | const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno; |
403 | ||
404 | if (!(external_file_seqno < lhs->fd.largest_seqno || | |
7c673cae | 405 | external_file_seqno == 0)) { |
1e59de90 TL |
406 | std::ostringstream oss; |
407 | oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " | |
408 | << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno | |
409 | << " vs. file #" << rhs->fd.GetNumber() | |
410 | << " with global_seqno " << external_file_seqno; | |
411 | ||
412 | return Status::Corruption("VersionBuilder", oss.str()); | |
7c673cae | 413 | } |
1e59de90 TL |
414 | } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) { |
415 | std::ostringstream oss; | |
416 | oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " | |
417 | << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno | |
418 | << " vs. file #" << rhs->fd.GetNumber() << " with seqno " | |
419 | << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno; | |
420 | ||
421 | return Status::Corruption("VersionBuilder", oss.str()); | |
7c673cae | 422 | } |
1e59de90 TL |
423 | |
424 | return Status::OK(); | |
425 | }; | |
426 | ||
427 | const Status s = CheckConsistencyDetailsForLevel( | |
428 | vstorage, /* level */ 0, l0_checker, | |
429 | "VersionBuilder::CheckConsistency0", &expected_linked_ssts); | |
430 | if (!s.ok()) { | |
431 | return s; | |
432 | } | |
433 | } | |
434 | ||
435 | // Check L1 and up | |
436 | const InternalKeyComparator* const icmp = vstorage->InternalComparator(); | |
437 | assert(icmp); | |
438 | ||
439 | for (int level = 1; level < num_levels_; ++level) { | |
440 | auto checker = [this, level, icmp](const FileMetaData* lhs, | |
441 | const FileMetaData* rhs) { | |
442 | assert(lhs); | |
443 | assert(rhs); | |
444 | ||
445 | if (!level_nonzero_cmp_(lhs, rhs)) { | |
446 | std::ostringstream oss; | |
447 | oss << 'L' << level << " files are not sorted properly: files #" | |
448 | << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); | |
449 | ||
450 | return Status::Corruption("VersionBuilder", oss.str()); | |
7c673cae FG |
451 | } |
452 | ||
1e59de90 TL |
453 | // Make sure there is no overlap in level |
454 | if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) { | |
455 | std::ostringstream oss; | |
456 | oss << 'L' << level << " has overlapping ranges: file #" | |
457 | << lhs->fd.GetNumber() | |
458 | << " largest key: " << lhs->largest.DebugString(true) | |
459 | << " vs. file #" << rhs->fd.GetNumber() | |
460 | << " smallest key: " << rhs->smallest.DebugString(true); | |
461 | ||
462 | return Status::Corruption("VersionBuilder", oss.str()); | |
7c673cae | 463 | } |
1e59de90 TL |
464 | |
465 | return Status::OK(); | |
466 | }; | |
467 | ||
468 | const Status s = CheckConsistencyDetailsForLevel( | |
469 | vstorage, level, checker, "VersionBuilder::CheckConsistency1", | |
470 | &expected_linked_ssts); | |
471 | if (!s.ok()) { | |
472 | return s; | |
7c673cae FG |
473 | } |
474 | } | |
475 | } | |
20effc67 | 476 | |
1e59de90 TL |
477 | // Make sure that all blob files in the version have non-garbage data and |
478 | // the links between them and the table files are consistent. | |
20effc67 | 479 | const auto& blob_files = vstorage->GetBlobFiles(); |
1e59de90 | 480 | for (const auto& blob_file_meta : blob_files) { |
20effc67 TL |
481 | assert(blob_file_meta); |
482 | ||
1e59de90 TL |
483 | const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber(); |
484 | ||
20effc67 TL |
485 | if (blob_file_meta->GetGarbageBlobCount() >= |
486 | blob_file_meta->GetTotalBlobCount()) { | |
487 | std::ostringstream oss; | |
488 | oss << "Blob file #" << blob_file_number | |
489 | << " consists entirely of garbage"; | |
490 | ||
491 | return Status::Corruption("VersionBuilder", oss.str()); | |
492 | } | |
493 | ||
494 | if (blob_file_meta->GetLinkedSsts() != | |
495 | expected_linked_ssts[blob_file_number]) { | |
496 | std::ostringstream oss; | |
497 | oss << "Links are inconsistent between table files and blob file #" | |
498 | << blob_file_number; | |
499 | ||
500 | return Status::Corruption("VersionBuilder", oss.str()); | |
501 | } | |
502 | } | |
503 | ||
504 | Status ret_s; | |
505 | TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn", | |
506 | &ret_s); | |
507 | return ret_s; | |
7c673cae FG |
508 | } |
509 | ||
1e59de90 TL |
510 | Status CheckConsistency(const VersionStorageInfo* vstorage) const { |
511 | assert(vstorage); | |
512 | ||
20effc67 | 513 | // Always run consistency checks in debug build |
7c673cae | 514 | #ifdef NDEBUG |
20effc67 | 515 | if (!vstorage->force_consistency_checks()) { |
f67539c2 | 516 | return Status::OK(); |
7c673cae FG |
517 | } |
518 | #endif | |
20effc67 TL |
519 | Status s = CheckConsistencyDetails(vstorage); |
520 | if (s.IsCorruption() && s.getState()) { | |
521 | // Make it clear the error is due to force_consistency_checks = 1 or | |
522 | // debug build | |
523 | #ifdef NDEBUG | |
524 | auto prefix = "force_consistency_checks"; | |
525 | #else | |
526 | auto prefix = "force_consistency_checks(DEBUG)"; | |
527 | #endif | |
528 | s = Status::Corruption(prefix, s.getState()); | |
529 | } else { | |
530 | // was only expecting corruption with message, or OK | |
531 | assert(s.ok()); | |
532 | } | |
533 | return s; | |
534 | } | |
535 | ||
536 | bool CheckConsistencyForNumLevels() const { | |
537 | // Make sure there are no files on or beyond num_levels(). | |
538 | if (has_invalid_levels_) { | |
539 | return false; | |
540 | } | |
541 | ||
542 | for (const auto& pair : invalid_level_sizes_) { | |
543 | const size_t level_size = pair.second; | |
544 | if (level_size != 0) { | |
545 | return false; | |
7c673cae FG |
546 | } |
547 | } | |
20effc67 TL |
548 | |
549 | return true; | |
550 | } | |
551 | ||
1e59de90 TL |
552 | bool IsBlobFileInVersion(uint64_t blob_file_number) const { |
553 | auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); | |
554 | if (mutable_it != mutable_blob_file_metas_.end()) { | |
555 | return true; | |
556 | } | |
557 | ||
558 | assert(base_vstorage_); | |
559 | const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number); | |
560 | ||
561 | return !!meta; | |
562 | } | |
563 | ||
564 | MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData( | |
565 | uint64_t blob_file_number) { | |
566 | auto mutable_it = mutable_blob_file_metas_.find(blob_file_number); | |
567 | if (mutable_it != mutable_blob_file_metas_.end()) { | |
568 | return &mutable_it->second; | |
569 | } | |
570 | ||
571 | assert(base_vstorage_); | |
572 | const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number); | |
573 | ||
574 | if (meta) { | |
575 | mutable_it = mutable_blob_file_metas_ | |
576 | .emplace(blob_file_number, MutableBlobFileMetaData(meta)) | |
577 | .first; | |
578 | return &mutable_it->second; | |
579 | } | |
580 | ||
581 | return nullptr; | |
582 | } | |
583 | ||
20effc67 TL |
584 | Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) { |
585 | const uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber(); | |
586 | ||
587 | if (IsBlobFileInVersion(blob_file_number)) { | |
588 | std::ostringstream oss; | |
589 | oss << "Blob file #" << blob_file_number << " already added"; | |
590 | ||
591 | return Status::Corruption("VersionBuilder", oss.str()); | |
592 | } | |
593 | ||
594 | // Note: we use C++11 for now but in C++14, this could be done in a more | |
595 | // elegant way using generalized lambda capture. | |
596 | VersionSet* const vs = version_set_; | |
597 | const ImmutableCFOptions* const ioptions = ioptions_; | |
598 | ||
599 | auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) { | |
600 | if (vs) { | |
601 | assert(ioptions); | |
602 | assert(!ioptions->cf_paths.empty()); | |
603 | assert(shared_meta); | |
604 | ||
605 | vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(), | |
606 | ioptions->cf_paths.front().path); | |
7c673cae | 607 | } |
20effc67 TL |
608 | |
609 | delete shared_meta; | |
610 | }; | |
611 | ||
612 | auto shared_meta = SharedBlobFileMetaData::Create( | |
613 | blob_file_number, blob_file_addition.GetTotalBlobCount(), | |
614 | blob_file_addition.GetTotalBlobBytes(), | |
615 | blob_file_addition.GetChecksumMethod(), | |
616 | blob_file_addition.GetChecksumValue(), deleter); | |
617 | ||
1e59de90 TL |
618 | mutable_blob_file_metas_.emplace( |
619 | blob_file_number, MutableBlobFileMetaData(std::move(shared_meta))); | |
20effc67 TL |
620 | |
621 | return Status::OK(); | |
622 | } | |
623 | ||
624 | Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) { | |
625 | const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber(); | |
626 | ||
1e59de90 TL |
627 | MutableBlobFileMetaData* const mutable_meta = |
628 | GetOrCreateMutableBlobFileMetaData(blob_file_number); | |
629 | ||
630 | if (!mutable_meta) { | |
20effc67 TL |
631 | std::ostringstream oss; |
632 | oss << "Blob file #" << blob_file_number << " not found"; | |
633 | ||
634 | return Status::Corruption("VersionBuilder", oss.str()); | |
635 | } | |
636 | ||
1e59de90 TL |
637 | if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(), |
638 | blob_file_garbage.GetGarbageBlobBytes())) { | |
639 | std::ostringstream oss; | |
640 | oss << "Garbage overflow for blob file #" << blob_file_number; | |
641 | return Status::Corruption("VersionBuilder", oss.str()); | |
642 | } | |
20effc67 TL |
643 | |
644 | return Status::OK(); | |
645 | } | |
646 | ||
647 | int GetCurrentLevelForTableFile(uint64_t file_number) const { | |
648 | auto it = table_file_levels_.find(file_number); | |
649 | if (it != table_file_levels_.end()) { | |
650 | return it->second; | |
651 | } | |
652 | ||
653 | assert(base_vstorage_); | |
654 | return base_vstorage_->GetFileLocation(file_number).GetLevel(); | |
655 | } | |
656 | ||
657 | uint64_t GetOldestBlobFileNumberForTableFile(int level, | |
658 | uint64_t file_number) const { | |
659 | assert(level < num_levels_); | |
660 | ||
661 | const auto& added_files = levels_[level].added_files; | |
662 | ||
663 | auto it = added_files.find(file_number); | |
664 | if (it != added_files.end()) { | |
665 | const FileMetaData* const meta = it->second; | |
666 | assert(meta); | |
667 | ||
668 | return meta->oldest_blob_file_number; | |
7c673cae FG |
669 | } |
670 | ||
20effc67 TL |
671 | assert(base_vstorage_); |
672 | const FileMetaData* const meta = | |
673 | base_vstorage_->GetFileMetaDataByNumber(file_number); | |
674 | assert(meta); | |
675 | ||
676 | return meta->oldest_blob_file_number; | |
677 | } | |
678 | ||
679 | Status ApplyFileDeletion(int level, uint64_t file_number) { | |
680 | assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); | |
681 | ||
682 | const int current_level = GetCurrentLevelForTableFile(file_number); | |
683 | ||
684 | if (level != current_level) { | |
685 | if (level >= num_levels_) { | |
686 | has_invalid_levels_ = true; | |
7c673cae | 687 | } |
20effc67 TL |
688 | |
689 | std::ostringstream oss; | |
690 | oss << "Cannot delete table file #" << file_number << " from level " | |
691 | << level << " since it is "; | |
692 | if (current_level == | |
693 | VersionStorageInfo::FileLocation::Invalid().GetLevel()) { | |
694 | oss << "not in the LSM tree"; | |
695 | } else { | |
696 | oss << "on level " << current_level; | |
697 | } | |
698 | ||
699 | return Status::Corruption("VersionBuilder", oss.str()); | |
7c673cae | 700 | } |
20effc67 TL |
701 | |
702 | if (level >= num_levels_) { | |
703 | assert(invalid_level_sizes_[level] > 0); | |
704 | --invalid_level_sizes_[level]; | |
705 | ||
706 | table_file_levels_[file_number] = | |
707 | VersionStorageInfo::FileLocation::Invalid().GetLevel(); | |
708 | ||
709 | return Status::OK(); | |
7c673cae | 710 | } |
20effc67 TL |
711 | |
712 | const uint64_t blob_file_number = | |
713 | GetOldestBlobFileNumberForTableFile(level, file_number); | |
714 | ||
1e59de90 TL |
715 | if (blob_file_number != kInvalidBlobFileNumber) { |
716 | MutableBlobFileMetaData* const mutable_meta = | |
717 | GetOrCreateMutableBlobFileMetaData(blob_file_number); | |
718 | if (mutable_meta) { | |
719 | mutable_meta->UnlinkSst(file_number); | |
720 | } | |
20effc67 TL |
721 | } |
722 | ||
723 | auto& level_state = levels_[level]; | |
724 | ||
725 | auto& add_files = level_state.added_files; | |
726 | auto add_it = add_files.find(file_number); | |
727 | if (add_it != add_files.end()) { | |
728 | UnrefFile(add_it->second); | |
729 | add_files.erase(add_it); | |
730 | } | |
731 | ||
732 | auto& del_files = level_state.deleted_files; | |
733 | assert(del_files.find(file_number) == del_files.end()); | |
734 | del_files.emplace(file_number); | |
735 | ||
736 | table_file_levels_[file_number] = | |
737 | VersionStorageInfo::FileLocation::Invalid().GetLevel(); | |
738 | ||
f67539c2 | 739 | return Status::OK(); |
7c673cae FG |
740 | } |
741 | ||
20effc67 TL |
742 | Status ApplyFileAddition(int level, const FileMetaData& meta) { |
743 | assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); | |
744 | ||
745 | const uint64_t file_number = meta.fd.GetNumber(); | |
746 | ||
747 | const int current_level = GetCurrentLevelForTableFile(file_number); | |
748 | ||
749 | if (current_level != | |
750 | VersionStorageInfo::FileLocation::Invalid().GetLevel()) { | |
751 | if (level >= num_levels_) { | |
752 | has_invalid_levels_ = true; | |
11fdf7f2 | 753 | } |
20effc67 TL |
754 | |
755 | std::ostringstream oss; | |
756 | oss << "Cannot add table file #" << file_number << " to level " << level | |
757 | << " since it is already in the LSM tree on level " << current_level; | |
758 | return Status::Corruption("VersionBuilder", oss.str()); | |
11fdf7f2 | 759 | } |
20effc67 TL |
760 | |
761 | if (level >= num_levels_) { | |
762 | ++invalid_level_sizes_[level]; | |
763 | table_file_levels_[file_number] = level; | |
764 | ||
765 | return Status::OK(); | |
766 | } | |
767 | ||
768 | auto& level_state = levels_[level]; | |
769 | ||
770 | auto& del_files = level_state.deleted_files; | |
771 | auto del_it = del_files.find(file_number); | |
772 | if (del_it != del_files.end()) { | |
773 | del_files.erase(del_it); | |
774 | } | |
775 | ||
776 | FileMetaData* const f = new FileMetaData(meta); | |
777 | f->refs = 1; | |
778 | ||
1e59de90 TL |
779 | if (file_metadata_cache_res_mgr_) { |
780 | Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( | |
781 | f->ApproximateMemoryUsage(), true /* increase */); | |
782 | if (!s.ok()) { | |
783 | delete f; | |
784 | s = Status::MemoryLimit( | |
785 | "Can't allocate " + | |
786 | kCacheEntryRoleToCamelString[static_cast<std::uint32_t>( | |
787 | CacheEntryRole::kFileMetadata)] + | |
788 | " due to exceeding the memory limit " | |
789 | "based on " | |
790 | "cache capacity"); | |
791 | return s; | |
792 | } | |
793 | } | |
794 | ||
20effc67 TL |
795 | auto& add_files = level_state.added_files; |
796 | assert(add_files.find(file_number) == add_files.end()); | |
797 | add_files.emplace(file_number, f); | |
798 | ||
799 | const uint64_t blob_file_number = f->oldest_blob_file_number; | |
800 | ||
1e59de90 TL |
801 | if (blob_file_number != kInvalidBlobFileNumber) { |
802 | MutableBlobFileMetaData* const mutable_meta = | |
803 | GetOrCreateMutableBlobFileMetaData(blob_file_number); | |
804 | if (mutable_meta) { | |
805 | mutable_meta->LinkSst(file_number); | |
806 | } | |
20effc67 TL |
807 | } |
808 | ||
809 | table_file_levels_[file_number] = level; | |
810 | ||
811 | return Status::OK(); | |
11fdf7f2 TL |
812 | } |
813 | ||
1e59de90 TL |
814 | Status ApplyCompactCursors(int level, |
815 | const InternalKey& smallest_uncompacted_key) { | |
816 | if (level < 0) { | |
817 | std::ostringstream oss; | |
818 | oss << "Cannot add compact cursor (" << level << "," | |
819 | << smallest_uncompacted_key.Encode().ToString() | |
820 | << " due to invalid level (level = " << level << ")"; | |
821 | return Status::Corruption("VersionBuilder", oss.str()); | |
822 | } | |
823 | if (level < num_levels_) { | |
824 | // Omit levels (>= num_levels_) when re-open with shrinking num_levels_ | |
825 | updated_compact_cursors_[level] = smallest_uncompacted_key; | |
826 | } | |
827 | return Status::OK(); | |
828 | } | |
829 | ||
7c673cae | 830 | // Apply all of the edits in *edit to the current state. |
1e59de90 | 831 | Status Apply(const VersionEdit* edit) { |
20effc67 TL |
832 | { |
833 | const Status s = CheckConsistency(base_vstorage_); | |
834 | if (!s.ok()) { | |
835 | return s; | |
836 | } | |
f67539c2 | 837 | } |
7c673cae | 838 | |
20effc67 TL |
839 | // Note: we process the blob file related changes first because the |
840 | // table file addition/deletion logic depends on the blob files | |
841 | // already being there. | |
11fdf7f2 | 842 | |
20effc67 TL |
843 | // Add new blob files |
844 | for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) { | |
845 | const Status s = ApplyBlobFileAddition(blob_file_addition); | |
846 | if (!s.ok()) { | |
847 | return s; | |
848 | } | |
849 | } | |
850 | ||
851 | // Increase the amount of garbage for blob files affected by GC | |
852 | for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) { | |
853 | const Status s = ApplyBlobFileGarbage(blob_file_garbage); | |
854 | if (!s.ok()) { | |
855 | return s; | |
856 | } | |
857 | } | |
858 | ||
859 | // Delete table files | |
860 | for (const auto& deleted_file : edit->GetDeletedFiles()) { | |
861 | const int level = deleted_file.first; | |
862 | const uint64_t file_number = deleted_file.second; | |
863 | ||
864 | const Status s = ApplyFileDeletion(level, file_number); | |
865 | if (!s.ok()) { | |
866 | return s; | |
7c673cae FG |
867 | } |
868 | } | |
869 | ||
20effc67 | 870 | // Add new table files |
7c673cae FG |
871 | for (const auto& new_file : edit->GetNewFiles()) { |
872 | const int level = new_file.first; | |
20effc67 TL |
873 | const FileMetaData& meta = new_file.second; |
874 | ||
875 | const Status s = ApplyFileAddition(level, meta); | |
876 | if (!s.ok()) { | |
877 | return s; | |
878 | } | |
879 | } | |
880 | ||
1e59de90 TL |
881 | // Populate compact cursors for round-robin compaction, leave |
882 | // the cursor to be empty to indicate it is invalid | |
883 | for (const auto& cursor : edit->GetCompactCursors()) { | |
884 | const int level = cursor.first; | |
885 | const InternalKey smallest_uncompacted_key = cursor.second; | |
886 | const Status s = ApplyCompactCursors(level, smallest_uncompacted_key); | |
887 | if (!s.ok()) { | |
888 | return s; | |
889 | } | |
890 | } | |
20effc67 TL |
891 | return Status::OK(); |
892 | } | |
893 | ||
1e59de90 TL |
894 | // Helper function template for merging the blob file metadata from the base |
895 | // version with the mutable metadata representing the state after applying the | |
896 | // edits. The function objects process_base and process_mutable are | |
897 | // respectively called to handle a base version object when there is no | |
898 | // matching mutable object, and a mutable object when there is no matching | |
899 | // base version object. process_both is called to perform the merge when a | |
900 | // given blob file appears both in the base version and the mutable list. The | |
901 | // helper stops processing objects if a function object returns false. Blob | |
902 | // files with a file number below first_blob_file are not processed. | |
903 | template <typename ProcessBase, typename ProcessMutable, typename ProcessBoth> | |
904 | void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base, | |
905 | ProcessMutable process_mutable, | |
906 | ProcessBoth process_both) const { | |
907 | assert(base_vstorage_); | |
908 | ||
909 | auto base_it = base_vstorage_->GetBlobFileMetaDataLB(first_blob_file); | |
910 | const auto base_it_end = base_vstorage_->GetBlobFiles().end(); | |
911 | ||
912 | auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file); | |
913 | const auto mutable_it_end = mutable_blob_file_metas_.end(); | |
914 | ||
915 | while (base_it != base_it_end && mutable_it != mutable_it_end) { | |
916 | const auto& base_meta = *base_it; | |
917 | assert(base_meta); | |
918 | ||
919 | const uint64_t base_blob_file_number = base_meta->GetBlobFileNumber(); | |
920 | const uint64_t mutable_blob_file_number = mutable_it->first; | |
921 | ||
922 | if (base_blob_file_number < mutable_blob_file_number) { | |
923 | if (!process_base(base_meta)) { | |
924 | return; | |
925 | } | |
926 | ||
927 | ++base_it; | |
928 | } else if (mutable_blob_file_number < base_blob_file_number) { | |
929 | const auto& mutable_meta = mutable_it->second; | |
930 | ||
931 | if (!process_mutable(mutable_meta)) { | |
932 | return; | |
933 | } | |
934 | ||
935 | ++mutable_it; | |
936 | } else { | |
937 | assert(base_blob_file_number == mutable_blob_file_number); | |
20effc67 | 938 | |
1e59de90 TL |
939 | const auto& mutable_meta = mutable_it->second; |
940 | ||
941 | if (!process_both(base_meta, mutable_meta)) { | |
942 | return; | |
943 | } | |
20effc67 | 944 | |
1e59de90 TL |
945 | ++base_it; |
946 | ++mutable_it; | |
947 | } | |
20effc67 TL |
948 | } |
949 | ||
1e59de90 TL |
950 | while (base_it != base_it_end) { |
951 | const auto& base_meta = *base_it; | |
20effc67 | 952 | |
1e59de90 TL |
953 | if (!process_base(base_meta)) { |
954 | return; | |
955 | } | |
956 | ||
957 | ++base_it; | |
20effc67 TL |
958 | } |
959 | ||
1e59de90 TL |
960 | while (mutable_it != mutable_it_end) { |
961 | const auto& mutable_meta = mutable_it->second; | |
962 | ||
963 | if (!process_mutable(mutable_meta)) { | |
964 | return; | |
965 | } | |
966 | ||
967 | ++mutable_it; | |
968 | } | |
20effc67 TL |
969 | } |
970 | ||
1e59de90 TL |
971 | // Helper function template for finding the first blob file that has linked |
972 | // SSTs. | |
973 | template <typename Meta> | |
974 | static bool CheckLinkedSsts(const Meta& meta, | |
975 | uint64_t* min_oldest_blob_file_num) { | |
976 | assert(min_oldest_blob_file_num); | |
977 | ||
978 | if (!meta.GetLinkedSsts().empty()) { | |
979 | assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber); | |
20effc67 | 980 | |
1e59de90 | 981 | *min_oldest_blob_file_num = meta.GetBlobFileNumber(); |
20effc67 | 982 | |
1e59de90 TL |
983 | return false; |
984 | } | |
20effc67 | 985 | |
1e59de90 | 986 | return true; |
20effc67 TL |
987 | } |
988 | ||
1e59de90 TL |
989 | // Find the oldest blob file that has linked SSTs. |
990 | uint64_t GetMinOldestBlobFileNumber() const { | |
991 | uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber; | |
20effc67 | 992 | |
1e59de90 TL |
993 | auto process_base = |
994 | [&min_oldest_blob_file_num]( | |
995 | const std::shared_ptr<BlobFileMetaData>& base_meta) { | |
996 | assert(base_meta); | |
20effc67 | 997 | |
1e59de90 TL |
998 | return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num); |
999 | }; | |
20effc67 | 1000 | |
1e59de90 TL |
1001 | auto process_mutable = [&min_oldest_blob_file_num]( |
1002 | const MutableBlobFileMetaData& mutable_meta) { | |
1003 | return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num); | |
1004 | }; | |
1005 | ||
1006 | auto process_both = [&min_oldest_blob_file_num]( | |
1007 | const std::shared_ptr<BlobFileMetaData>& base_meta, | |
1008 | const MutableBlobFileMetaData& mutable_meta) { | |
1009 | #ifndef NDEBUG | |
1010 | assert(base_meta); | |
1011 | assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta()); | |
1012 | #else | |
1013 | (void)base_meta; | |
1014 | #endif | |
1015 | ||
1016 | // Look at mutable_meta since it supersedes *base_meta | |
1017 | return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num); | |
1018 | }; | |
20effc67 | 1019 | |
1e59de90 TL |
1020 | MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable, |
1021 | process_both); | |
20effc67 | 1022 | |
1e59de90 TL |
1023 | return min_oldest_blob_file_num; |
1024 | } | |
1025 | ||
1026 | static std::shared_ptr<BlobFileMetaData> CreateBlobFileMetaData( | |
1027 | const MutableBlobFileMetaData& mutable_meta) { | |
1028 | return BlobFileMetaData::Create( | |
1029 | mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(), | |
1030 | mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes()); | |
20effc67 TL |
1031 | } |
1032 | ||
1033 | // Add the blob file specified by meta to *vstorage if it is determined to | |
1e59de90 TL |
1034 | // contain valid data (blobs). |
1035 | template <typename Meta> | |
1036 | static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) { | |
20effc67 TL |
1037 | assert(vstorage); |
1038 | assert(meta); | |
20effc67 | 1039 | |
1e59de90 TL |
1040 | if (meta->GetLinkedSsts().empty() && |
1041 | meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) { | |
20effc67 TL |
1042 | return; |
1043 | } | |
1044 | ||
1e59de90 | 1045 | vstorage->AddBlobFile(std::forward<Meta>(meta)); |
20effc67 TL |
1046 | } |
1047 | ||
1048 | // Merge the blob file metadata from the base version with the changes (edits) | |
1049 | // applied, and save the result into *vstorage. | |
1050 | void SaveBlobFilesTo(VersionStorageInfo* vstorage) const { | |
20effc67 TL |
1051 | assert(vstorage); |
1052 | ||
1e59de90 TL |
1053 | assert(base_vstorage_); |
1054 | vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() + | |
1055 | mutable_blob_file_metas_.size()); | |
20effc67 | 1056 | |
1e59de90 TL |
1057 | const uint64_t oldest_blob_file_with_linked_ssts = |
1058 | GetMinOldestBlobFileNumber(); | |
20effc67 | 1059 | |
1e59de90 TL |
1060 | auto process_base = |
1061 | [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) { | |
1062 | assert(base_meta); | |
20effc67 | 1063 | |
1e59de90 | 1064 | AddBlobFileIfNeeded(vstorage, base_meta); |
20effc67 | 1065 | |
1e59de90 TL |
1066 | return true; |
1067 | }; | |
20effc67 | 1068 | |
1e59de90 TL |
1069 | auto process_mutable = |
1070 | [vstorage](const MutableBlobFileMetaData& mutable_meta) { | |
1071 | AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); | |
20effc67 | 1072 | |
1e59de90 TL |
1073 | return true; |
1074 | }; | |
20effc67 | 1075 | |
1e59de90 TL |
1076 | auto process_both = [vstorage]( |
1077 | const std::shared_ptr<BlobFileMetaData>& base_meta, | |
1078 | const MutableBlobFileMetaData& mutable_meta) { | |
1079 | assert(base_meta); | |
1080 | assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta()); | |
20effc67 | 1081 | |
1e59de90 TL |
1082 | if (!mutable_meta.HasDelta()) { |
1083 | assert(base_meta->GetGarbageBlobCount() == | |
1084 | mutable_meta.GetGarbageBlobCount()); | |
1085 | assert(base_meta->GetGarbageBlobBytes() == | |
1086 | mutable_meta.GetGarbageBlobBytes()); | |
1087 | assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts()); | |
20effc67 | 1088 | |
1e59de90 | 1089 | AddBlobFileIfNeeded(vstorage, base_meta); |
20effc67 | 1090 | |
1e59de90 TL |
1091 | return true; |
1092 | } | |
20effc67 | 1093 | |
1e59de90 | 1094 | AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta)); |
20effc67 | 1095 | |
1e59de90 TL |
1096 | return true; |
1097 | }; | |
20effc67 | 1098 | |
1e59de90 TL |
1099 | MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base, |
1100 | process_mutable, process_both); | |
1101 | } | |
1102 | ||
1103 | void MaybeAddFile(VersionStorageInfo* vstorage, int level, | |
1104 | FileMetaData* f) const { | |
1105 | const uint64_t file_number = f->fd.GetNumber(); | |
1106 | ||
1107 | const auto& level_state = levels_[level]; | |
1108 | ||
1109 | const auto& del_files = level_state.deleted_files; | |
1110 | const auto del_it = del_files.find(file_number); | |
1111 | ||
1112 | if (del_it != del_files.end()) { | |
1113 | // f is to-be-deleted table file | |
1114 | vstorage->RemoveCurrentStats(f); | |
1115 | } else { | |
1116 | const auto& add_files = level_state.added_files; | |
1117 | const auto add_it = add_files.find(file_number); | |
1118 | ||
1119 | // Note: if the file appears both in the base version and in the added | |
1120 | // list, the added FileMetaData supersedes the one in the base version. | |
1121 | if (add_it != add_files.end() && add_it->second != f) { | |
1122 | vstorage->RemoveCurrentStats(f); | |
1123 | } else { | |
1124 | vstorage->AddFile(level, f); | |
11fdf7f2 | 1125 | } |
7c673cae | 1126 | } |
1e59de90 | 1127 | } |
20effc67 | 1128 | |
1e59de90 TL |
1129 | template <typename Cmp> |
1130 | void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const { | |
1131 | // Merge the set of added files with the set of pre-existing files. | |
1132 | // Drop any deleted files. Store the result in *vstorage. | |
1133 | const auto& base_files = base_vstorage_->LevelFiles(level); | |
1134 | const auto& unordered_added_files = levels_[level].added_files; | |
1135 | vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); | |
1136 | ||
1137 | // Sort added files for the level. | |
1138 | std::vector<FileMetaData*> added_files; | |
1139 | added_files.reserve(unordered_added_files.size()); | |
1140 | for (const auto& pair : unordered_added_files) { | |
1141 | added_files.push_back(pair.second); | |
1142 | } | |
1143 | std::sort(added_files.begin(), added_files.end(), cmp); | |
1144 | ||
1145 | auto base_iter = base_files.begin(); | |
1146 | auto base_end = base_files.end(); | |
1147 | auto added_iter = added_files.begin(); | |
1148 | auto added_end = added_files.end(); | |
1149 | while (added_iter != added_end || base_iter != base_end) { | |
1150 | if (base_iter == base_end || | |
1151 | (added_iter != added_end && cmp(*added_iter, *base_iter))) { | |
1152 | MaybeAddFile(vstorage, level, *added_iter++); | |
1153 | } else { | |
1154 | MaybeAddFile(vstorage, level, *base_iter++); | |
1155 | } | |
1156 | } | |
1157 | } | |
20effc67 | 1158 | |
1e59de90 TL |
1159 | void SaveSSTFilesTo(VersionStorageInfo* vstorage) const { |
1160 | assert(vstorage); | |
20effc67 | 1161 | |
1e59de90 TL |
1162 | if (!num_levels_) { |
1163 | return; | |
20effc67 TL |
1164 | } |
1165 | ||
1e59de90 | 1166 | SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_); |
20effc67 | 1167 | |
1e59de90 TL |
1168 | for (int level = 1; level < num_levels_; ++level) { |
1169 | SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_); | |
1170 | } | |
1171 | } | |
20effc67 | 1172 | |
1e59de90 TL |
1173 | void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const { |
1174 | for (auto iter = updated_compact_cursors_.begin(); | |
1175 | iter != updated_compact_cursors_.end(); iter++) { | |
1176 | vstorage->AddCursorForOneLevel(iter->first, iter->second); | |
20effc67 | 1177 | } |
7c673cae FG |
1178 | } |
1179 | ||
1e59de90 TL |
1180 | // Save the current state in *vstorage. |
1181 | Status SaveTo(VersionStorageInfo* vstorage) const { | |
1182 | Status s; | |
1183 | ||
1184 | #ifndef NDEBUG | |
1185 | // The same check is done within Apply() so we skip it in release mode. | |
1186 | s = CheckConsistency(base_vstorage_); | |
f67539c2 TL |
1187 | if (!s.ok()) { |
1188 | return s; | |
1189 | } | |
1e59de90 | 1190 | #endif // NDEBUG |
f67539c2 TL |
1191 | |
1192 | s = CheckConsistency(vstorage); | |
1193 | if (!s.ok()) { | |
1194 | return s; | |
1195 | } | |
7c673cae | 1196 | |
1e59de90 | 1197 | SaveSSTFilesTo(vstorage); |
7c673cae | 1198 | |
20effc67 TL |
1199 | SaveBlobFilesTo(vstorage); |
1200 | ||
1e59de90 TL |
1201 | SaveCompactCursorsTo(vstorage); |
1202 | ||
f67539c2 TL |
1203 | s = CheckConsistency(vstorage); |
1204 | return s; | |
7c673cae FG |
1205 | } |
1206 | ||
1e59de90 TL |
1207 | Status LoadTableHandlers( |
1208 | InternalStats* internal_stats, int max_threads, | |
1209 | bool prefetch_index_and_filter_in_cache, bool is_initial_load, | |
1210 | const std::shared_ptr<const SliceTransform>& prefix_extractor, | |
1211 | size_t max_file_size_for_l0_meta_pin) { | |
7c673cae | 1212 | assert(table_cache_ != nullptr); |
494da23a TL |
1213 | |
1214 | size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity(); | |
1215 | bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity); | |
1e59de90 | 1216 | size_t max_load = std::numeric_limits<size_t>::max(); |
494da23a TL |
1217 | |
1218 | if (!always_load) { | |
20effc67 | 1219 | // If it is initial loading and not set to always loading all the |
494da23a TL |
1220 | // files, we only load up to kInitialLoadLimit files, to limit the |
1221 | // time reopening the DB. | |
1222 | const size_t kInitialLoadLimit = 16; | |
1223 | size_t load_limit; | |
1224 | // If the table cache is not 1/4 full, we pin the table handle to | |
1225 | // file metadata to avoid the cache read costs when reading the file. | |
1226 | // The downside of pinning those files is that LRU won't be followed | |
1227 | // for those files. This doesn't matter much because if number of files | |
1228 | // of the DB excceeds table cache capacity, eventually no table reader | |
1229 | // will be pinned and LRU will be followed. | |
1230 | if (is_initial_load) { | |
1231 | load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4); | |
1232 | } else { | |
1233 | load_limit = table_cache_capacity / 4; | |
1234 | } | |
1235 | ||
1236 | size_t table_cache_usage = table_cache_->get_cache()->GetUsage(); | |
1237 | if (table_cache_usage >= load_limit) { | |
1238 | // TODO (yanqin) find a suitable status code. | |
1239 | return Status::OK(); | |
1240 | } else { | |
1241 | max_load = load_limit - table_cache_usage; | |
1242 | } | |
1243 | } | |
1244 | ||
7c673cae FG |
1245 | // <file metadata, level> |
1246 | std::vector<std::pair<FileMetaData*, int>> files_meta; | |
494da23a | 1247 | std::vector<Status> statuses; |
11fdf7f2 | 1248 | for (int level = 0; level < num_levels_; level++) { |
7c673cae FG |
1249 | for (auto& file_meta_pair : levels_[level].added_files) { |
1250 | auto* file_meta = file_meta_pair.second; | |
494da23a TL |
1251 | // If the file has been opened before, just skip it. |
1252 | if (!file_meta->table_reader_handle) { | |
1253 | files_meta.emplace_back(file_meta, level); | |
1254 | statuses.emplace_back(Status::OK()); | |
1255 | } | |
1256 | if (files_meta.size() >= max_load) { | |
1257 | break; | |
1258 | } | |
1259 | } | |
1260 | if (files_meta.size() >= max_load) { | |
1261 | break; | |
7c673cae FG |
1262 | } |
1263 | } | |
1264 | ||
1265 | std::atomic<size_t> next_file_meta_idx(0); | |
11fdf7f2 | 1266 | std::function<void()> load_handlers_func([&]() { |
7c673cae FG |
1267 | while (true) { |
1268 | size_t file_idx = next_file_meta_idx.fetch_add(1); | |
1269 | if (file_idx >= files_meta.size()) { | |
1270 | break; | |
1271 | } | |
1272 | ||
1273 | auto* file_meta = files_meta[file_idx].first; | |
1274 | int level = files_meta[file_idx].second; | |
494da23a | 1275 | statuses[file_idx] = table_cache_->FindTable( |
20effc67 | 1276 | ReadOptions(), file_options_, |
1e59de90 | 1277 | *(base_vstorage_->InternalComparator()), *file_meta, |
20effc67 TL |
1278 | &file_meta->table_reader_handle, prefix_extractor, false /*no_io */, |
1279 | true /* record_read_stats */, | |
11fdf7f2 | 1280 | internal_stats->GetFileReadHist(level), false, level, |
1e59de90 TL |
1281 | prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, |
1282 | file_meta->temperature); | |
7c673cae FG |
1283 | if (file_meta->table_reader_handle != nullptr) { |
1284 | // Load table_reader | |
1285 | file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( | |
1286 | file_meta->table_reader_handle); | |
1287 | } | |
1288 | } | |
11fdf7f2 | 1289 | }); |
7c673cae | 1290 | |
11fdf7f2 TL |
1291 | std::vector<port::Thread> threads; |
1292 | for (int i = 1; i < max_threads; i++) { | |
1293 | threads.emplace_back(load_handlers_func); | |
1294 | } | |
1295 | load_handlers_func(); | |
1296 | for (auto& t : threads) { | |
1297 | t.join(); | |
7c673cae | 1298 | } |
20effc67 | 1299 | Status ret; |
494da23a TL |
1300 | for (const auto& s : statuses) { |
1301 | if (!s.ok()) { | |
20effc67 TL |
1302 | if (ret.ok()) { |
1303 | ret = s; | |
1304 | } | |
494da23a TL |
1305 | } |
1306 | } | |
20effc67 | 1307 | return ret; |
7c673cae | 1308 | } |
7c673cae FG |
1309 | }; |
1310 | ||
1e59de90 TL |
1311 | VersionBuilder::VersionBuilder( |
1312 | const FileOptions& file_options, const ImmutableCFOptions* ioptions, | |
1313 | TableCache* table_cache, VersionStorageInfo* base_vstorage, | |
1314 | VersionSet* version_set, | |
1315 | std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr) | |
20effc67 | 1316 | : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage, |
1e59de90 | 1317 | version_set, file_metadata_cache_res_mgr)) {} |
11fdf7f2 | 1318 | |
20effc67 | 1319 | VersionBuilder::~VersionBuilder() = default; |
11fdf7f2 TL |
1320 | |
1321 | bool VersionBuilder::CheckConsistencyForNumLevels() { | |
1322 | return rep_->CheckConsistencyForNumLevels(); | |
1323 | } | |
1324 | ||
1e59de90 TL |
1325 | Status VersionBuilder::Apply(const VersionEdit* edit) { |
1326 | return rep_->Apply(edit); | |
1327 | } | |
11fdf7f2 | 1328 | |
1e59de90 | 1329 | Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const { |
f67539c2 | 1330 | return rep_->SaveTo(vstorage); |
7c673cae | 1331 | } |
11fdf7f2 | 1332 | |
494da23a TL |
1333 | Status VersionBuilder::LoadTableHandlers( |
1334 | InternalStats* internal_stats, int max_threads, | |
1335 | bool prefetch_index_and_filter_in_cache, bool is_initial_load, | |
1e59de90 | 1336 | const std::shared_ptr<const SliceTransform>& prefix_extractor, |
20effc67 TL |
1337 | size_t max_file_size_for_l0_meta_pin) { |
1338 | return rep_->LoadTableHandlers( | |
1339 | internal_stats, max_threads, prefetch_index_and_filter_in_cache, | |
1340 | is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); | |
1341 | } | |
1342 | ||
1e59de90 TL |
1343 | uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { |
1344 | return rep_->GetMinOldestBlobFileNumber(); | |
1345 | } | |
1346 | ||
20effc67 TL |
1347 | BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( |
1348 | ColumnFamilyData* cfd) | |
1349 | : version_builder_(new VersionBuilder( | |
1350 | cfd->current()->version_set()->file_options(), cfd->ioptions(), | |
1351 | cfd->table_cache(), cfd->current()->storage_info(), | |
1e59de90 TL |
1352 | cfd->current()->version_set(), |
1353 | cfd->GetFileMetadataCacheReservationManager())), | |
20effc67 TL |
1354 | version_(cfd->current()) { |
1355 | version_->Ref(); | |
1356 | } | |
1357 | ||
1358 | BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( | |
1359 | ColumnFamilyData* cfd, Version* v) | |
1360 | : version_builder_(new VersionBuilder( | |
1361 | cfd->current()->version_set()->file_options(), cfd->ioptions(), | |
1e59de90 TL |
1362 | cfd->table_cache(), v->storage_info(), v->version_set(), |
1363 | cfd->GetFileMetadataCacheReservationManager())), | |
20effc67 TL |
1364 | version_(v) { |
1365 | assert(version_ != cfd->current()); | |
7c673cae | 1366 | } |
11fdf7f2 | 1367 | |
20effc67 TL |
1368 | BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() { |
1369 | version_->Unref(); | |
7c673cae FG |
1370 | } |
1371 | ||
f67539c2 | 1372 | } // namespace ROCKSDB_NAMESPACE |