]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/version_builder.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / db / version_builder.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#include "db/version_builder.h"
11
7c673cae
FG
12#include <algorithm>
13#include <atomic>
f67539c2 14#include <cinttypes>
7c673cae 15#include <functional>
11fdf7f2 16#include <map>
20effc67 17#include <memory>
7c673cae 18#include <set>
20effc67 19#include <sstream>
7c673cae
FG
20#include <thread>
21#include <unordered_map>
22#include <unordered_set>
23#include <utility>
24#include <vector>
25
1e59de90 26#include "cache/cache_reservation_manager.h"
20effc67 27#include "db/blob/blob_file_meta.h"
7c673cae
FG
28#include "db/dbformat.h"
29#include "db/internal_stats.h"
30#include "db/table_cache.h"
31#include "db/version_set.h"
32#include "port/port.h"
33#include "table/table_reader.h"
f67539c2 34#include "util/string_util.h"
7c673cae 35
f67539c2 36namespace ROCKSDB_NAMESPACE {
7c673cae 37
1e59de90
TL
38class VersionBuilder::Rep {
39 class NewestFirstBySeqNo {
40 public:
41 bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
42 assert(lhs);
43 assert(rhs);
7c673cae 44
1e59de90
TL
45 if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
46 return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
47 }
7c673cae 48
1e59de90
TL
49 if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
50 return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
7c673cae 51 }
1e59de90
TL
52
53 // Break ties by file number
54 return lhs->fd.GetNumber() > rhs->fd.GetNumber();
55 }
56 };
57
58 class BySmallestKey {
59 public:
60 explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {}
61
62 bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
63 assert(lhs);
64 assert(rhs);
65 assert(cmp_);
66
67 const int r = cmp_->Compare(lhs->smallest, rhs->smallest);
68 if (r != 0) {
69 return (r < 0);
70 }
71
72 // Break ties by file number
73 return (lhs->fd.GetNumber() < rhs->fd.GetNumber());
7c673cae 74 }
1e59de90
TL
75
76 private:
77 const InternalKeyComparator* cmp_;
7c673cae
FG
78 };
79
80 struct LevelState {
81 std::unordered_set<uint64_t> deleted_files;
82 // Map from file number to file meta data.
83 std::unordered_map<uint64_t, FileMetaData*> added_files;
84 };
85
1e59de90
TL
86 // A class that represents the accumulated changes (like additional garbage or
87 // newly linked/unlinked SST files) for a given blob file after applying a
88 // series of VersionEdits.
20effc67
TL
89 class BlobFileMetaDataDelta {
90 public:
91 bool IsEmpty() const {
1e59de90
TL
92 return !additional_garbage_count_ && !additional_garbage_bytes_ &&
93 newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty();
20effc67
TL
94 }
95
96 uint64_t GetAdditionalGarbageCount() const {
97 return additional_garbage_count_;
98 }
99
100 uint64_t GetAdditionalGarbageBytes() const {
101 return additional_garbage_bytes_;
102 }
103
104 const std::unordered_set<uint64_t>& GetNewlyLinkedSsts() const {
105 return newly_linked_ssts_;
106 }
107
108 const std::unordered_set<uint64_t>& GetNewlyUnlinkedSsts() const {
109 return newly_unlinked_ssts_;
110 }
111
20effc67
TL
112 void AddGarbage(uint64_t count, uint64_t bytes) {
113 additional_garbage_count_ += count;
114 additional_garbage_bytes_ += bytes;
115 }
116
117 void LinkSst(uint64_t sst_file_number) {
118 assert(newly_linked_ssts_.find(sst_file_number) ==
119 newly_linked_ssts_.end());
120
121 // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be
122 // linked to and unlinked from the same blob file in the case of a trivial
123 // move.)
124 auto it = newly_unlinked_ssts_.find(sst_file_number);
125
126 if (it != newly_unlinked_ssts_.end()) {
127 newly_unlinked_ssts_.erase(it);
128 } else {
129 newly_linked_ssts_.emplace(sst_file_number);
130 }
131 }
132
133 void UnlinkSst(uint64_t sst_file_number) {
134 assert(newly_unlinked_ssts_.find(sst_file_number) ==
135 newly_unlinked_ssts_.end());
136
137 // Reconcile with newly linked SSTs on the fly. (Note: an SST can be
138 // linked to and unlinked from the same blob file in the case of a trivial
139 // move.)
140 auto it = newly_linked_ssts_.find(sst_file_number);
141
142 if (it != newly_linked_ssts_.end()) {
143 newly_linked_ssts_.erase(it);
144 } else {
145 newly_unlinked_ssts_.emplace(sst_file_number);
146 }
147 }
148
149 private:
20effc67
TL
150 uint64_t additional_garbage_count_ = 0;
151 uint64_t additional_garbage_bytes_ = 0;
152 std::unordered_set<uint64_t> newly_linked_ssts_;
153 std::unordered_set<uint64_t> newly_unlinked_ssts_;
154 };
155
1e59de90
TL
156 // A class that represents the state of a blob file after applying a series of
157 // VersionEdits. In addition to the resulting state, it also contains the
158 // delta (see BlobFileMetaDataDelta above). The resulting state can be used to
159 // identify obsolete blob files, while the delta makes it possible to
160 // efficiently detect trivial moves.
161 class MutableBlobFileMetaData {
162 public:
163 // To be used for brand new blob files
164 explicit MutableBlobFileMetaData(
165 std::shared_ptr<SharedBlobFileMetaData>&& shared_meta)
166 : shared_meta_(std::move(shared_meta)) {}
167
168 // To be used for pre-existing blob files
169 explicit MutableBlobFileMetaData(
170 const std::shared_ptr<BlobFileMetaData>& meta)
171 : shared_meta_(meta->GetSharedMeta()),
172 linked_ssts_(meta->GetLinkedSsts()),
173 garbage_blob_count_(meta->GetGarbageBlobCount()),
174 garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {}
175
176 const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
177 return shared_meta_;
178 }
179
180 uint64_t GetBlobFileNumber() const {
181 assert(shared_meta_);
182 return shared_meta_->GetBlobFileNumber();
183 }
184
185 bool HasDelta() const { return !delta_.IsEmpty(); }
186
187 const std::unordered_set<uint64_t>& GetLinkedSsts() const {
188 return linked_ssts_;
189 }
190
191 uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
192
193 uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
194
195 bool AddGarbage(uint64_t count, uint64_t bytes) {
196 assert(shared_meta_);
197
198 if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() ||
199 garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) {
200 return false;
201 }
202
203 delta_.AddGarbage(count, bytes);
204
205 garbage_blob_count_ += count;
206 garbage_blob_bytes_ += bytes;
207
208 return true;
209 }
210
211 void LinkSst(uint64_t sst_file_number) {
212 delta_.LinkSst(sst_file_number);
213
214 assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end());
215 linked_ssts_.emplace(sst_file_number);
216 }
217
218 void UnlinkSst(uint64_t sst_file_number) {
219 delta_.UnlinkSst(sst_file_number);
220
221 assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end());
222 linked_ssts_.erase(sst_file_number);
223 }
224
225 private:
226 std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
227 // Accumulated changes
228 BlobFileMetaDataDelta delta_;
229 // Resulting state after applying the changes
230 BlobFileMetaData::LinkedSsts linked_ssts_;
231 uint64_t garbage_blob_count_ = 0;
232 uint64_t garbage_blob_bytes_ = 0;
233 };
234
f67539c2 235 const FileOptions& file_options_;
20effc67 236 const ImmutableCFOptions* const ioptions_;
7c673cae
FG
237 TableCache* table_cache_;
238 VersionStorageInfo* base_vstorage_;
20effc67 239 VersionSet* version_set_;
11fdf7f2 240 int num_levels_;
7c673cae 241 LevelState* levels_;
20effc67 242 // Store sizes of levels larger than num_levels_. We do this instead of
11fdf7f2
TL
243 // storing them in levels_ to avoid regression in case there are no files
244 // on invalid levels. The version is not consistent if in the end the files
245 // on invalid levels don't cancel out.
20effc67 246 std::unordered_map<int, size_t> invalid_level_sizes_;
11fdf7f2
TL
247 // Whether there are invalid new files or invalid deletion on levels larger
248 // than num_levels_.
249 bool has_invalid_levels_;
20effc67
TL
250 // Current levels of table files affected by additions/deletions.
251 std::unordered_map<uint64_t, int> table_file_levels_;
1e59de90
TL
252 // Current compact cursors that should be changed after the last compaction
253 std::unordered_map<int, InternalKey> updated_compact_cursors_;
254 NewestFirstBySeqNo level_zero_cmp_;
255 BySmallestKey level_nonzero_cmp_;
256
257 // Mutable metadata objects for all blob files affected by the series of
258 // version edits.
259 std::map<uint64_t, MutableBlobFileMetaData> mutable_blob_file_metas_;
7c673cae 260
1e59de90 261 std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
20effc67 262
7c673cae 263 public:
20effc67
TL
264 Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
265 TableCache* table_cache, VersionStorageInfo* base_vstorage,
1e59de90
TL
266 VersionSet* version_set,
267 std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
f67539c2 268 : file_options_(file_options),
20effc67 269 ioptions_(ioptions),
7c673cae 270 table_cache_(table_cache),
11fdf7f2 271 base_vstorage_(base_vstorage),
20effc67 272 version_set_(version_set),
11fdf7f2 273 num_levels_(base_vstorage->num_levels()),
1e59de90
TL
274 has_invalid_levels_(false),
275 level_nonzero_cmp_(base_vstorage_->InternalComparator()),
276 file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {
20effc67
TL
277 assert(ioptions_);
278
11fdf7f2 279 levels_ = new LevelState[num_levels_];
7c673cae
FG
280 }
281
282 ~Rep() {
11fdf7f2 283 for (int level = 0; level < num_levels_; level++) {
7c673cae
FG
284 const auto& added = levels_[level].added_files;
285 for (auto& pair : added) {
286 UnrefFile(pair.second);
287 }
288 }
289
290 delete[] levels_;
291 }
292
293 void UnrefFile(FileMetaData* f) {
294 f->refs--;
295 if (f->refs <= 0) {
296 if (f->table_reader_handle) {
297 assert(table_cache_ != nullptr);
298 table_cache_->ReleaseHandle(f->table_reader_handle);
299 f->table_reader_handle = nullptr;
300 }
7c673cae 301
1e59de90
TL
302 if (file_metadata_cache_res_mgr_) {
303 Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
304 f->ApproximateMemoryUsage(), false /* increase */);
305 s.PermitUncheckedError();
20effc67 306 }
1e59de90 307 delete f;
7c673cae 308 }
20effc67
TL
309 }
310
1e59de90
TL
311 // Mapping used for checking the consistency of links between SST files and
312 // blob files. It is built using the forward links (table file -> blob file),
313 // and is subsequently compared with the inverse mapping stored in the
314 // BlobFileMetaData objects.
20effc67
TL
315 using ExpectedLinkedSsts =
316 std::unordered_map<uint64_t, BlobFileMetaData::LinkedSsts>;
317
318 static void UpdateExpectedLinkedSsts(
319 uint64_t table_file_number, uint64_t blob_file_number,
320 ExpectedLinkedSsts* expected_linked_ssts) {
321 assert(expected_linked_ssts);
322
323 if (blob_file_number == kInvalidBlobFileNumber) {
324 return;
325 }
326
327 (*expected_linked_ssts)[blob_file_number].emplace(table_file_number);
328 }
329
1e59de90
TL
330 template <typename Checker>
331 Status CheckConsistencyDetailsForLevel(
332 const VersionStorageInfo* vstorage, int level, Checker checker,
333 const std::string& sync_point,
334 ExpectedLinkedSsts* expected_linked_ssts) const {
335#ifdef NDEBUG
336 (void)sync_point;
337#endif
20effc67 338
1e59de90
TL
339 assert(vstorage);
340 assert(level >= 0 && level < num_levels_);
341 assert(expected_linked_ssts);
20effc67 342
1e59de90
TL
343 const auto& level_files = vstorage->LevelFiles(level);
344
345 if (level_files.empty()) {
346 return Status::OK();
347 }
348
349 assert(level_files[0]);
350 UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(),
351 level_files[0]->oldest_blob_file_number,
352 expected_linked_ssts);
353
354 for (size_t i = 1; i < level_files.size(); ++i) {
355 assert(level_files[i]);
356 UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(),
357 level_files[i]->oldest_blob_file_number,
358 expected_linked_ssts);
359
360 auto lhs = level_files[i - 1];
361 auto rhs = level_files[i];
20effc67 362
f67539c2 363#ifndef NDEBUG
1e59de90
TL
364 auto pair = std::make_pair(&lhs, &rhs);
365 TEST_SYNC_POINT_CALLBACK(sync_point, &pair);
f67539c2 366#endif
1e59de90
TL
367
368 const Status s = checker(lhs, rhs);
369 if (!s.ok()) {
370 return s;
371 }
372 }
373
374 return Status::OK();
375 }
376
377 // Make sure table files are sorted correctly and that the links between
378 // table files and blob files are consistent.
379 Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const {
380 assert(vstorage);
381
382 ExpectedLinkedSsts expected_linked_ssts;
383
384 if (num_levels_ > 0) {
385 // Check L0
386 {
387 auto l0_checker = [this](const FileMetaData* lhs,
388 const FileMetaData* rhs) {
389 assert(lhs);
390 assert(rhs);
391
392 if (!level_zero_cmp_(lhs, rhs)) {
393 std::ostringstream oss;
394 oss << "L0 files are not sorted properly: files #"
395 << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
396
397 return Status::Corruption("VersionBuilder", oss.str());
7c673cae
FG
398 }
399
1e59de90 400 if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) {
7c673cae 401 // This is an external file that we ingested
1e59de90
TL
402 const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno;
403
404 if (!(external_file_seqno < lhs->fd.largest_seqno ||
7c673cae 405 external_file_seqno == 0)) {
1e59de90
TL
406 std::ostringstream oss;
407 oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
408 << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
409 << " vs. file #" << rhs->fd.GetNumber()
410 << " with global_seqno " << external_file_seqno;
411
412 return Status::Corruption("VersionBuilder", oss.str());
7c673cae 413 }
1e59de90
TL
414 } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) {
415 std::ostringstream oss;
416 oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
417 << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
418 << " vs. file #" << rhs->fd.GetNumber() << " with seqno "
419 << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno;
420
421 return Status::Corruption("VersionBuilder", oss.str());
7c673cae 422 }
1e59de90
TL
423
424 return Status::OK();
425 };
426
427 const Status s = CheckConsistencyDetailsForLevel(
428 vstorage, /* level */ 0, l0_checker,
429 "VersionBuilder::CheckConsistency0", &expected_linked_ssts);
430 if (!s.ok()) {
431 return s;
432 }
433 }
434
435 // Check L1 and up
436 const InternalKeyComparator* const icmp = vstorage->InternalComparator();
437 assert(icmp);
438
439 for (int level = 1; level < num_levels_; ++level) {
440 auto checker = [this, level, icmp](const FileMetaData* lhs,
441 const FileMetaData* rhs) {
442 assert(lhs);
443 assert(rhs);
444
445 if (!level_nonzero_cmp_(lhs, rhs)) {
446 std::ostringstream oss;
447 oss << 'L' << level << " files are not sorted properly: files #"
448 << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
449
450 return Status::Corruption("VersionBuilder", oss.str());
7c673cae
FG
451 }
452
1e59de90
TL
453 // Make sure there is no overlap in level
454 if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) {
455 std::ostringstream oss;
456 oss << 'L' << level << " has overlapping ranges: file #"
457 << lhs->fd.GetNumber()
458 << " largest key: " << lhs->largest.DebugString(true)
459 << " vs. file #" << rhs->fd.GetNumber()
460 << " smallest key: " << rhs->smallest.DebugString(true);
461
462 return Status::Corruption("VersionBuilder", oss.str());
7c673cae 463 }
1e59de90
TL
464
465 return Status::OK();
466 };
467
468 const Status s = CheckConsistencyDetailsForLevel(
469 vstorage, level, checker, "VersionBuilder::CheckConsistency1",
470 &expected_linked_ssts);
471 if (!s.ok()) {
472 return s;
7c673cae
FG
473 }
474 }
475 }
20effc67 476
1e59de90
TL
477 // Make sure that all blob files in the version have non-garbage data and
478 // the links between them and the table files are consistent.
20effc67 479 const auto& blob_files = vstorage->GetBlobFiles();
1e59de90 480 for (const auto& blob_file_meta : blob_files) {
20effc67
TL
481 assert(blob_file_meta);
482
1e59de90
TL
483 const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber();
484
20effc67
TL
485 if (blob_file_meta->GetGarbageBlobCount() >=
486 blob_file_meta->GetTotalBlobCount()) {
487 std::ostringstream oss;
488 oss << "Blob file #" << blob_file_number
489 << " consists entirely of garbage";
490
491 return Status::Corruption("VersionBuilder", oss.str());
492 }
493
494 if (blob_file_meta->GetLinkedSsts() !=
495 expected_linked_ssts[blob_file_number]) {
496 std::ostringstream oss;
497 oss << "Links are inconsistent between table files and blob file #"
498 << blob_file_number;
499
500 return Status::Corruption("VersionBuilder", oss.str());
501 }
502 }
503
504 Status ret_s;
505 TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn",
506 &ret_s);
507 return ret_s;
7c673cae
FG
508 }
509
1e59de90
TL
510 Status CheckConsistency(const VersionStorageInfo* vstorage) const {
511 assert(vstorage);
512
20effc67 513 // Always run consistency checks in debug build
7c673cae 514#ifdef NDEBUG
20effc67 515 if (!vstorage->force_consistency_checks()) {
f67539c2 516 return Status::OK();
7c673cae
FG
517 }
518#endif
20effc67
TL
519 Status s = CheckConsistencyDetails(vstorage);
520 if (s.IsCorruption() && s.getState()) {
521 // Make it clear the error is due to force_consistency_checks = 1 or
522 // debug build
523#ifdef NDEBUG
524 auto prefix = "force_consistency_checks";
525#else
526 auto prefix = "force_consistency_checks(DEBUG)";
527#endif
528 s = Status::Corruption(prefix, s.getState());
529 } else {
530 // was only expecting corruption with message, or OK
531 assert(s.ok());
532 }
533 return s;
534 }
535
536 bool CheckConsistencyForNumLevels() const {
537 // Make sure there are no files on or beyond num_levels().
538 if (has_invalid_levels_) {
539 return false;
540 }
541
542 for (const auto& pair : invalid_level_sizes_) {
543 const size_t level_size = pair.second;
544 if (level_size != 0) {
545 return false;
7c673cae
FG
546 }
547 }
20effc67
TL
548
549 return true;
550 }
551
1e59de90
TL
552 bool IsBlobFileInVersion(uint64_t blob_file_number) const {
553 auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
554 if (mutable_it != mutable_blob_file_metas_.end()) {
555 return true;
556 }
557
558 assert(base_vstorage_);
559 const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
560
561 return !!meta;
562 }
563
564 MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData(
565 uint64_t blob_file_number) {
566 auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
567 if (mutable_it != mutable_blob_file_metas_.end()) {
568 return &mutable_it->second;
569 }
570
571 assert(base_vstorage_);
572 const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
573
574 if (meta) {
575 mutable_it = mutable_blob_file_metas_
576 .emplace(blob_file_number, MutableBlobFileMetaData(meta))
577 .first;
578 return &mutable_it->second;
579 }
580
581 return nullptr;
582 }
583
20effc67
TL
584 Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) {
585 const uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber();
586
587 if (IsBlobFileInVersion(blob_file_number)) {
588 std::ostringstream oss;
589 oss << "Blob file #" << blob_file_number << " already added";
590
591 return Status::Corruption("VersionBuilder", oss.str());
592 }
593
594 // Note: we use C++11 for now but in C++14, this could be done in a more
595 // elegant way using generalized lambda capture.
596 VersionSet* const vs = version_set_;
597 const ImmutableCFOptions* const ioptions = ioptions_;
598
599 auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) {
600 if (vs) {
601 assert(ioptions);
602 assert(!ioptions->cf_paths.empty());
603 assert(shared_meta);
604
605 vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(),
606 ioptions->cf_paths.front().path);
7c673cae 607 }
20effc67
TL
608
609 delete shared_meta;
610 };
611
612 auto shared_meta = SharedBlobFileMetaData::Create(
613 blob_file_number, blob_file_addition.GetTotalBlobCount(),
614 blob_file_addition.GetTotalBlobBytes(),
615 blob_file_addition.GetChecksumMethod(),
616 blob_file_addition.GetChecksumValue(), deleter);
617
1e59de90
TL
618 mutable_blob_file_metas_.emplace(
619 blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
20effc67
TL
620
621 return Status::OK();
622 }
623
624 Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) {
625 const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber();
626
1e59de90
TL
627 MutableBlobFileMetaData* const mutable_meta =
628 GetOrCreateMutableBlobFileMetaData(blob_file_number);
629
630 if (!mutable_meta) {
20effc67
TL
631 std::ostringstream oss;
632 oss << "Blob file #" << blob_file_number << " not found";
633
634 return Status::Corruption("VersionBuilder", oss.str());
635 }
636
1e59de90
TL
637 if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(),
638 blob_file_garbage.GetGarbageBlobBytes())) {
639 std::ostringstream oss;
640 oss << "Garbage overflow for blob file #" << blob_file_number;
641 return Status::Corruption("VersionBuilder", oss.str());
642 }
20effc67
TL
643
644 return Status::OK();
645 }
646
647 int GetCurrentLevelForTableFile(uint64_t file_number) const {
648 auto it = table_file_levels_.find(file_number);
649 if (it != table_file_levels_.end()) {
650 return it->second;
651 }
652
653 assert(base_vstorage_);
654 return base_vstorage_->GetFileLocation(file_number).GetLevel();
655 }
656
657 uint64_t GetOldestBlobFileNumberForTableFile(int level,
658 uint64_t file_number) const {
659 assert(level < num_levels_);
660
661 const auto& added_files = levels_[level].added_files;
662
663 auto it = added_files.find(file_number);
664 if (it != added_files.end()) {
665 const FileMetaData* const meta = it->second;
666 assert(meta);
667
668 return meta->oldest_blob_file_number;
7c673cae
FG
669 }
670
20effc67
TL
671 assert(base_vstorage_);
672 const FileMetaData* const meta =
673 base_vstorage_->GetFileMetaDataByNumber(file_number);
674 assert(meta);
675
676 return meta->oldest_blob_file_number;
677 }
678
679 Status ApplyFileDeletion(int level, uint64_t file_number) {
680 assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
681
682 const int current_level = GetCurrentLevelForTableFile(file_number);
683
684 if (level != current_level) {
685 if (level >= num_levels_) {
686 has_invalid_levels_ = true;
7c673cae 687 }
20effc67
TL
688
689 std::ostringstream oss;
690 oss << "Cannot delete table file #" << file_number << " from level "
691 << level << " since it is ";
692 if (current_level ==
693 VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
694 oss << "not in the LSM tree";
695 } else {
696 oss << "on level " << current_level;
697 }
698
699 return Status::Corruption("VersionBuilder", oss.str());
7c673cae 700 }
20effc67
TL
701
702 if (level >= num_levels_) {
703 assert(invalid_level_sizes_[level] > 0);
704 --invalid_level_sizes_[level];
705
706 table_file_levels_[file_number] =
707 VersionStorageInfo::FileLocation::Invalid().GetLevel();
708
709 return Status::OK();
7c673cae 710 }
20effc67
TL
711
712 const uint64_t blob_file_number =
713 GetOldestBlobFileNumberForTableFile(level, file_number);
714
1e59de90
TL
715 if (blob_file_number != kInvalidBlobFileNumber) {
716 MutableBlobFileMetaData* const mutable_meta =
717 GetOrCreateMutableBlobFileMetaData(blob_file_number);
718 if (mutable_meta) {
719 mutable_meta->UnlinkSst(file_number);
720 }
20effc67
TL
721 }
722
723 auto& level_state = levels_[level];
724
725 auto& add_files = level_state.added_files;
726 auto add_it = add_files.find(file_number);
727 if (add_it != add_files.end()) {
728 UnrefFile(add_it->second);
729 add_files.erase(add_it);
730 }
731
732 auto& del_files = level_state.deleted_files;
733 assert(del_files.find(file_number) == del_files.end());
734 del_files.emplace(file_number);
735
736 table_file_levels_[file_number] =
737 VersionStorageInfo::FileLocation::Invalid().GetLevel();
738
f67539c2 739 return Status::OK();
7c673cae
FG
740 }
741
20effc67
TL
742 Status ApplyFileAddition(int level, const FileMetaData& meta) {
743 assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
744
745 const uint64_t file_number = meta.fd.GetNumber();
746
747 const int current_level = GetCurrentLevelForTableFile(file_number);
748
749 if (current_level !=
750 VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
751 if (level >= num_levels_) {
752 has_invalid_levels_ = true;
11fdf7f2 753 }
20effc67
TL
754
755 std::ostringstream oss;
756 oss << "Cannot add table file #" << file_number << " to level " << level
757 << " since it is already in the LSM tree on level " << current_level;
758 return Status::Corruption("VersionBuilder", oss.str());
11fdf7f2 759 }
20effc67
TL
760
761 if (level >= num_levels_) {
762 ++invalid_level_sizes_[level];
763 table_file_levels_[file_number] = level;
764
765 return Status::OK();
766 }
767
768 auto& level_state = levels_[level];
769
770 auto& del_files = level_state.deleted_files;
771 auto del_it = del_files.find(file_number);
772 if (del_it != del_files.end()) {
773 del_files.erase(del_it);
774 }
775
776 FileMetaData* const f = new FileMetaData(meta);
777 f->refs = 1;
778
1e59de90
TL
779 if (file_metadata_cache_res_mgr_) {
780 Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
781 f->ApproximateMemoryUsage(), true /* increase */);
782 if (!s.ok()) {
783 delete f;
784 s = Status::MemoryLimit(
785 "Can't allocate " +
786 kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
787 CacheEntryRole::kFileMetadata)] +
788 " due to exceeding the memory limit "
789 "based on "
790 "cache capacity");
791 return s;
792 }
793 }
794
20effc67
TL
795 auto& add_files = level_state.added_files;
796 assert(add_files.find(file_number) == add_files.end());
797 add_files.emplace(file_number, f);
798
799 const uint64_t blob_file_number = f->oldest_blob_file_number;
800
1e59de90
TL
801 if (blob_file_number != kInvalidBlobFileNumber) {
802 MutableBlobFileMetaData* const mutable_meta =
803 GetOrCreateMutableBlobFileMetaData(blob_file_number);
804 if (mutable_meta) {
805 mutable_meta->LinkSst(file_number);
806 }
20effc67
TL
807 }
808
809 table_file_levels_[file_number] = level;
810
811 return Status::OK();
11fdf7f2
TL
812 }
813
1e59de90
TL
814 Status ApplyCompactCursors(int level,
815 const InternalKey& smallest_uncompacted_key) {
816 if (level < 0) {
817 std::ostringstream oss;
818 oss << "Cannot add compact cursor (" << level << ","
819 << smallest_uncompacted_key.Encode().ToString()
820 << " due to invalid level (level = " << level << ")";
821 return Status::Corruption("VersionBuilder", oss.str());
822 }
823 if (level < num_levels_) {
824 // Omit levels (>= num_levels_) when re-open with shrinking num_levels_
825 updated_compact_cursors_[level] = smallest_uncompacted_key;
826 }
827 return Status::OK();
828 }
829
7c673cae 830 // Apply all of the edits in *edit to the current state.
1e59de90 831 Status Apply(const VersionEdit* edit) {
20effc67
TL
832 {
833 const Status s = CheckConsistency(base_vstorage_);
834 if (!s.ok()) {
835 return s;
836 }
f67539c2 837 }
7c673cae 838
20effc67
TL
839 // Note: we process the blob file related changes first because the
840 // table file addition/deletion logic depends on the blob files
841 // already being there.
11fdf7f2 842
20effc67
TL
843 // Add new blob files
844 for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) {
845 const Status s = ApplyBlobFileAddition(blob_file_addition);
846 if (!s.ok()) {
847 return s;
848 }
849 }
850
851 // Increase the amount of garbage for blob files affected by GC
852 for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) {
853 const Status s = ApplyBlobFileGarbage(blob_file_garbage);
854 if (!s.ok()) {
855 return s;
856 }
857 }
858
859 // Delete table files
860 for (const auto& deleted_file : edit->GetDeletedFiles()) {
861 const int level = deleted_file.first;
862 const uint64_t file_number = deleted_file.second;
863
864 const Status s = ApplyFileDeletion(level, file_number);
865 if (!s.ok()) {
866 return s;
7c673cae
FG
867 }
868 }
869
20effc67 870 // Add new table files
7c673cae
FG
871 for (const auto& new_file : edit->GetNewFiles()) {
872 const int level = new_file.first;
20effc67
TL
873 const FileMetaData& meta = new_file.second;
874
875 const Status s = ApplyFileAddition(level, meta);
876 if (!s.ok()) {
877 return s;
878 }
879 }
880
1e59de90
TL
881 // Populate compact cursors for round-robin compaction, leave
882 // the cursor to be empty to indicate it is invalid
883 for (const auto& cursor : edit->GetCompactCursors()) {
884 const int level = cursor.first;
885 const InternalKey smallest_uncompacted_key = cursor.second;
886 const Status s = ApplyCompactCursors(level, smallest_uncompacted_key);
887 if (!s.ok()) {
888 return s;
889 }
890 }
20effc67
TL
891 return Status::OK();
892 }
893
1e59de90
TL
894 // Helper function template for merging the blob file metadata from the base
895 // version with the mutable metadata representing the state after applying the
896 // edits. The function objects process_base and process_mutable are
897 // respectively called to handle a base version object when there is no
898 // matching mutable object, and a mutable object when there is no matching
899 // base version object. process_both is called to perform the merge when a
900 // given blob file appears both in the base version and the mutable list. The
901 // helper stops processing objects if a function object returns false. Blob
902 // files with a file number below first_blob_file are not processed.
903 template <typename ProcessBase, typename ProcessMutable, typename ProcessBoth>
904 void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base,
905 ProcessMutable process_mutable,
906 ProcessBoth process_both) const {
907 assert(base_vstorage_);
908
909 auto base_it = base_vstorage_->GetBlobFileMetaDataLB(first_blob_file);
910 const auto base_it_end = base_vstorage_->GetBlobFiles().end();
911
912 auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file);
913 const auto mutable_it_end = mutable_blob_file_metas_.end();
914
915 while (base_it != base_it_end && mutable_it != mutable_it_end) {
916 const auto& base_meta = *base_it;
917 assert(base_meta);
918
919 const uint64_t base_blob_file_number = base_meta->GetBlobFileNumber();
920 const uint64_t mutable_blob_file_number = mutable_it->first;
921
922 if (base_blob_file_number < mutable_blob_file_number) {
923 if (!process_base(base_meta)) {
924 return;
925 }
926
927 ++base_it;
928 } else if (mutable_blob_file_number < base_blob_file_number) {
929 const auto& mutable_meta = mutable_it->second;
930
931 if (!process_mutable(mutable_meta)) {
932 return;
933 }
934
935 ++mutable_it;
936 } else {
937 assert(base_blob_file_number == mutable_blob_file_number);
20effc67 938
1e59de90
TL
939 const auto& mutable_meta = mutable_it->second;
940
941 if (!process_both(base_meta, mutable_meta)) {
942 return;
943 }
20effc67 944
1e59de90
TL
945 ++base_it;
946 ++mutable_it;
947 }
20effc67
TL
948 }
949
1e59de90
TL
950 while (base_it != base_it_end) {
951 const auto& base_meta = *base_it;
20effc67 952
1e59de90
TL
953 if (!process_base(base_meta)) {
954 return;
955 }
956
957 ++base_it;
20effc67
TL
958 }
959
1e59de90
TL
960 while (mutable_it != mutable_it_end) {
961 const auto& mutable_meta = mutable_it->second;
962
963 if (!process_mutable(mutable_meta)) {
964 return;
965 }
966
967 ++mutable_it;
968 }
20effc67
TL
969 }
970
1e59de90
TL
971 // Helper function template for finding the first blob file that has linked
972 // SSTs.
973 template <typename Meta>
974 static bool CheckLinkedSsts(const Meta& meta,
975 uint64_t* min_oldest_blob_file_num) {
976 assert(min_oldest_blob_file_num);
977
978 if (!meta.GetLinkedSsts().empty()) {
979 assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber);
20effc67 980
1e59de90 981 *min_oldest_blob_file_num = meta.GetBlobFileNumber();
20effc67 982
1e59de90
TL
983 return false;
984 }
20effc67 985
1e59de90 986 return true;
20effc67
TL
987 }
988
1e59de90
TL
989 // Find the oldest blob file that has linked SSTs.
990 uint64_t GetMinOldestBlobFileNumber() const {
991 uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber;
20effc67 992
1e59de90
TL
993 auto process_base =
994 [&min_oldest_blob_file_num](
995 const std::shared_ptr<BlobFileMetaData>& base_meta) {
996 assert(base_meta);
20effc67 997
1e59de90
TL
998 return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num);
999 };
20effc67 1000
1e59de90
TL
1001 auto process_mutable = [&min_oldest_blob_file_num](
1002 const MutableBlobFileMetaData& mutable_meta) {
1003 return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
1004 };
1005
1006 auto process_both = [&min_oldest_blob_file_num](
1007 const std::shared_ptr<BlobFileMetaData>& base_meta,
1008 const MutableBlobFileMetaData& mutable_meta) {
1009#ifndef NDEBUG
1010 assert(base_meta);
1011 assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
1012#else
1013 (void)base_meta;
1014#endif
1015
1016 // Look at mutable_meta since it supersedes *base_meta
1017 return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
1018 };
20effc67 1019
1e59de90
TL
1020 MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable,
1021 process_both);
20effc67 1022
1e59de90
TL
1023 return min_oldest_blob_file_num;
1024 }
1025
1026 static std::shared_ptr<BlobFileMetaData> CreateBlobFileMetaData(
1027 const MutableBlobFileMetaData& mutable_meta) {
1028 return BlobFileMetaData::Create(
1029 mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(),
1030 mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
20effc67
TL
1031 }
1032
1033 // Add the blob file specified by meta to *vstorage if it is determined to
1e59de90
TL
1034 // contain valid data (blobs).
1035 template <typename Meta>
1036 static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
20effc67
TL
1037 assert(vstorage);
1038 assert(meta);
20effc67 1039
1e59de90
TL
1040 if (meta->GetLinkedSsts().empty() &&
1041 meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
20effc67
TL
1042 return;
1043 }
1044
1e59de90 1045 vstorage->AddBlobFile(std::forward<Meta>(meta));
20effc67
TL
1046 }
1047
1048 // Merge the blob file metadata from the base version with the changes (edits)
1049 // applied, and save the result into *vstorage.
1050 void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
20effc67
TL
1051 assert(vstorage);
1052
1e59de90
TL
1053 assert(base_vstorage_);
1054 vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
1055 mutable_blob_file_metas_.size());
20effc67 1056
1e59de90
TL
1057 const uint64_t oldest_blob_file_with_linked_ssts =
1058 GetMinOldestBlobFileNumber();
20effc67 1059
1e59de90
TL
1060 auto process_base =
1061 [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
1062 assert(base_meta);
20effc67 1063
1e59de90 1064 AddBlobFileIfNeeded(vstorage, base_meta);
20effc67 1065
1e59de90
TL
1066 return true;
1067 };
20effc67 1068
1e59de90
TL
1069 auto process_mutable =
1070 [vstorage](const MutableBlobFileMetaData& mutable_meta) {
1071 AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
20effc67 1072
1e59de90
TL
1073 return true;
1074 };
20effc67 1075
1e59de90
TL
1076 auto process_both = [vstorage](
1077 const std::shared_ptr<BlobFileMetaData>& base_meta,
1078 const MutableBlobFileMetaData& mutable_meta) {
1079 assert(base_meta);
1080 assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
20effc67 1081
1e59de90
TL
1082 if (!mutable_meta.HasDelta()) {
1083 assert(base_meta->GetGarbageBlobCount() ==
1084 mutable_meta.GetGarbageBlobCount());
1085 assert(base_meta->GetGarbageBlobBytes() ==
1086 mutable_meta.GetGarbageBlobBytes());
1087 assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
20effc67 1088
1e59de90 1089 AddBlobFileIfNeeded(vstorage, base_meta);
20effc67 1090
1e59de90
TL
1091 return true;
1092 }
20effc67 1093
1e59de90 1094 AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
20effc67 1095
1e59de90
TL
1096 return true;
1097 };
20effc67 1098
1e59de90
TL
1099 MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base,
1100 process_mutable, process_both);
1101 }
1102
1103 void MaybeAddFile(VersionStorageInfo* vstorage, int level,
1104 FileMetaData* f) const {
1105 const uint64_t file_number = f->fd.GetNumber();
1106
1107 const auto& level_state = levels_[level];
1108
1109 const auto& del_files = level_state.deleted_files;
1110 const auto del_it = del_files.find(file_number);
1111
1112 if (del_it != del_files.end()) {
1113 // f is to-be-deleted table file
1114 vstorage->RemoveCurrentStats(f);
1115 } else {
1116 const auto& add_files = level_state.added_files;
1117 const auto add_it = add_files.find(file_number);
1118
1119 // Note: if the file appears both in the base version and in the added
1120 // list, the added FileMetaData supersedes the one in the base version.
1121 if (add_it != add_files.end() && add_it->second != f) {
1122 vstorage->RemoveCurrentStats(f);
1123 } else {
1124 vstorage->AddFile(level, f);
11fdf7f2 1125 }
7c673cae 1126 }
1e59de90 1127 }
20effc67 1128
1e59de90
TL
1129 template <typename Cmp>
1130 void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
1131 // Merge the set of added files with the set of pre-existing files.
1132 // Drop any deleted files. Store the result in *vstorage.
1133 const auto& base_files = base_vstorage_->LevelFiles(level);
1134 const auto& unordered_added_files = levels_[level].added_files;
1135 vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
1136
1137 // Sort added files for the level.
1138 std::vector<FileMetaData*> added_files;
1139 added_files.reserve(unordered_added_files.size());
1140 for (const auto& pair : unordered_added_files) {
1141 added_files.push_back(pair.second);
1142 }
1143 std::sort(added_files.begin(), added_files.end(), cmp);
1144
1145 auto base_iter = base_files.begin();
1146 auto base_end = base_files.end();
1147 auto added_iter = added_files.begin();
1148 auto added_end = added_files.end();
1149 while (added_iter != added_end || base_iter != base_end) {
1150 if (base_iter == base_end ||
1151 (added_iter != added_end && cmp(*added_iter, *base_iter))) {
1152 MaybeAddFile(vstorage, level, *added_iter++);
1153 } else {
1154 MaybeAddFile(vstorage, level, *base_iter++);
1155 }
1156 }
1157 }
20effc67 1158
1e59de90
TL
1159 void SaveSSTFilesTo(VersionStorageInfo* vstorage) const {
1160 assert(vstorage);
20effc67 1161
1e59de90
TL
1162 if (!num_levels_) {
1163 return;
20effc67
TL
1164 }
1165
1e59de90 1166 SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_);
20effc67 1167
1e59de90
TL
1168 for (int level = 1; level < num_levels_; ++level) {
1169 SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
1170 }
1171 }
20effc67 1172
1e59de90
TL
1173 void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const {
1174 for (auto iter = updated_compact_cursors_.begin();
1175 iter != updated_compact_cursors_.end(); iter++) {
1176 vstorage->AddCursorForOneLevel(iter->first, iter->second);
20effc67 1177 }
7c673cae
FG
1178 }
1179
1e59de90
TL
1180 // Save the current state in *vstorage.
1181 Status SaveTo(VersionStorageInfo* vstorage) const {
1182 Status s;
1183
1184#ifndef NDEBUG
1185 // The same check is done within Apply() so we skip it in release mode.
1186 s = CheckConsistency(base_vstorage_);
f67539c2
TL
1187 if (!s.ok()) {
1188 return s;
1189 }
1e59de90 1190#endif // NDEBUG
f67539c2
TL
1191
1192 s = CheckConsistency(vstorage);
1193 if (!s.ok()) {
1194 return s;
1195 }
7c673cae 1196
1e59de90 1197 SaveSSTFilesTo(vstorage);
7c673cae 1198
20effc67
TL
1199 SaveBlobFilesTo(vstorage);
1200
1e59de90
TL
1201 SaveCompactCursorsTo(vstorage);
1202
f67539c2
TL
1203 s = CheckConsistency(vstorage);
1204 return s;
7c673cae
FG
1205 }
1206
1e59de90
TL
1207 Status LoadTableHandlers(
1208 InternalStats* internal_stats, int max_threads,
1209 bool prefetch_index_and_filter_in_cache, bool is_initial_load,
1210 const std::shared_ptr<const SliceTransform>& prefix_extractor,
1211 size_t max_file_size_for_l0_meta_pin) {
7c673cae 1212 assert(table_cache_ != nullptr);
494da23a
TL
1213
1214 size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
1215 bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
1e59de90 1216 size_t max_load = std::numeric_limits<size_t>::max();
494da23a
TL
1217
1218 if (!always_load) {
20effc67 1219 // If it is initial loading and not set to always loading all the
494da23a
TL
1220 // files, we only load up to kInitialLoadLimit files, to limit the
1221 // time reopening the DB.
1222 const size_t kInitialLoadLimit = 16;
1223 size_t load_limit;
1224 // If the table cache is not 1/4 full, we pin the table handle to
1225 // file metadata to avoid the cache read costs when reading the file.
1226 // The downside of pinning those files is that LRU won't be followed
1227 // for those files. This doesn't matter much because if number of files
1228 // of the DB excceeds table cache capacity, eventually no table reader
1229 // will be pinned and LRU will be followed.
1230 if (is_initial_load) {
1231 load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
1232 } else {
1233 load_limit = table_cache_capacity / 4;
1234 }
1235
1236 size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
1237 if (table_cache_usage >= load_limit) {
1238 // TODO (yanqin) find a suitable status code.
1239 return Status::OK();
1240 } else {
1241 max_load = load_limit - table_cache_usage;
1242 }
1243 }
1244
7c673cae
FG
1245 // <file metadata, level>
1246 std::vector<std::pair<FileMetaData*, int>> files_meta;
494da23a 1247 std::vector<Status> statuses;
11fdf7f2 1248 for (int level = 0; level < num_levels_; level++) {
7c673cae
FG
1249 for (auto& file_meta_pair : levels_[level].added_files) {
1250 auto* file_meta = file_meta_pair.second;
494da23a
TL
1251 // If the file has been opened before, just skip it.
1252 if (!file_meta->table_reader_handle) {
1253 files_meta.emplace_back(file_meta, level);
1254 statuses.emplace_back(Status::OK());
1255 }
1256 if (files_meta.size() >= max_load) {
1257 break;
1258 }
1259 }
1260 if (files_meta.size() >= max_load) {
1261 break;
7c673cae
FG
1262 }
1263 }
1264
1265 std::atomic<size_t> next_file_meta_idx(0);
11fdf7f2 1266 std::function<void()> load_handlers_func([&]() {
7c673cae
FG
1267 while (true) {
1268 size_t file_idx = next_file_meta_idx.fetch_add(1);
1269 if (file_idx >= files_meta.size()) {
1270 break;
1271 }
1272
1273 auto* file_meta = files_meta[file_idx].first;
1274 int level = files_meta[file_idx].second;
494da23a 1275 statuses[file_idx] = table_cache_->FindTable(
20effc67 1276 ReadOptions(), file_options_,
1e59de90 1277 *(base_vstorage_->InternalComparator()), *file_meta,
20effc67
TL
1278 &file_meta->table_reader_handle, prefix_extractor, false /*no_io */,
1279 true /* record_read_stats */,
11fdf7f2 1280 internal_stats->GetFileReadHist(level), false, level,
1e59de90
TL
1281 prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
1282 file_meta->temperature);
7c673cae
FG
1283 if (file_meta->table_reader_handle != nullptr) {
1284 // Load table_reader
1285 file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
1286 file_meta->table_reader_handle);
1287 }
1288 }
11fdf7f2 1289 });
7c673cae 1290
11fdf7f2
TL
1291 std::vector<port::Thread> threads;
1292 for (int i = 1; i < max_threads; i++) {
1293 threads.emplace_back(load_handlers_func);
1294 }
1295 load_handlers_func();
1296 for (auto& t : threads) {
1297 t.join();
7c673cae 1298 }
20effc67 1299 Status ret;
494da23a
TL
1300 for (const auto& s : statuses) {
1301 if (!s.ok()) {
20effc67
TL
1302 if (ret.ok()) {
1303 ret = s;
1304 }
494da23a
TL
1305 }
1306 }
20effc67 1307 return ret;
7c673cae 1308 }
7c673cae
FG
1309};
1310
1e59de90
TL
1311VersionBuilder::VersionBuilder(
1312 const FileOptions& file_options, const ImmutableCFOptions* ioptions,
1313 TableCache* table_cache, VersionStorageInfo* base_vstorage,
1314 VersionSet* version_set,
1315 std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
20effc67 1316 : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
1e59de90 1317 version_set, file_metadata_cache_res_mgr)) {}
11fdf7f2 1318
20effc67 1319VersionBuilder::~VersionBuilder() = default;
11fdf7f2
TL
1320
1321bool VersionBuilder::CheckConsistencyForNumLevels() {
1322 return rep_->CheckConsistencyForNumLevels();
1323}
1324
1e59de90
TL
1325Status VersionBuilder::Apply(const VersionEdit* edit) {
1326 return rep_->Apply(edit);
1327}
11fdf7f2 1328
1e59de90 1329Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const {
f67539c2 1330 return rep_->SaveTo(vstorage);
7c673cae 1331}
11fdf7f2 1332
494da23a
TL
1333Status VersionBuilder::LoadTableHandlers(
1334 InternalStats* internal_stats, int max_threads,
1335 bool prefetch_index_and_filter_in_cache, bool is_initial_load,
1e59de90 1336 const std::shared_ptr<const SliceTransform>& prefix_extractor,
20effc67
TL
1337 size_t max_file_size_for_l0_meta_pin) {
1338 return rep_->LoadTableHandlers(
1339 internal_stats, max_threads, prefetch_index_and_filter_in_cache,
1340 is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin);
1341}
1342
1e59de90
TL
1343uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
1344 return rep_->GetMinOldestBlobFileNumber();
1345}
1346
20effc67
TL
1347BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
1348 ColumnFamilyData* cfd)
1349 : version_builder_(new VersionBuilder(
1350 cfd->current()->version_set()->file_options(), cfd->ioptions(),
1351 cfd->table_cache(), cfd->current()->storage_info(),
1e59de90
TL
1352 cfd->current()->version_set(),
1353 cfd->GetFileMetadataCacheReservationManager())),
20effc67
TL
1354 version_(cfd->current()) {
1355 version_->Ref();
1356}
1357
1358BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
1359 ColumnFamilyData* cfd, Version* v)
1360 : version_builder_(new VersionBuilder(
1361 cfd->current()->version_set()->file_options(), cfd->ioptions(),
1e59de90
TL
1362 cfd->table_cache(), v->storage_info(), v->version_set(),
1363 cfd->GetFileMetadataCacheReservationManager())),
20effc67
TL
1364 version_(v) {
1365 assert(version_ != cfd->current());
7c673cae 1366}
11fdf7f2 1367
20effc67
TL
1368BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() {
1369 version_->Unref();
7c673cae
FG
1370}
1371
f67539c2 1372} // namespace ROCKSDB_NAMESPACE