]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/version_builder.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / db / version_builder.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#include "db/version_builder.h"
11
12#ifndef __STDC_FORMAT_MACROS
13#define __STDC_FORMAT_MACROS
14#endif
15
16#include <inttypes.h>
17#include <algorithm>
18#include <atomic>
19#include <functional>
11fdf7f2 20#include <map>
7c673cae
FG
21#include <set>
22#include <thread>
23#include <unordered_map>
24#include <unordered_set>
25#include <utility>
26#include <vector>
27
28#include "db/dbformat.h"
29#include "db/internal_stats.h"
30#include "db/table_cache.h"
31#include "db/version_set.h"
32#include "port/port.h"
33#include "table/table_reader.h"
34
35namespace rocksdb {
36
37bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
11fdf7f2
TL
38 if (a->fd.largest_seqno != b->fd.largest_seqno) {
39 return a->fd.largest_seqno > b->fd.largest_seqno;
7c673cae 40 }
11fdf7f2
TL
41 if (a->fd.smallest_seqno != b->fd.smallest_seqno) {
42 return a->fd.smallest_seqno > b->fd.smallest_seqno;
7c673cae
FG
43 }
44 // Break ties by file number
45 return a->fd.GetNumber() > b->fd.GetNumber();
46}
47
48namespace {
49bool BySmallestKey(FileMetaData* a, FileMetaData* b,
50 const InternalKeyComparator* cmp) {
51 int r = cmp->Compare(a->smallest, b->smallest);
52 if (r != 0) {
53 return (r < 0);
54 }
55 // Break ties by file number
56 return (a->fd.GetNumber() < b->fd.GetNumber());
57}
58} // namespace
59
60class VersionBuilder::Rep {
61 private:
62 // Helper to sort files_ in v
63 // kLevel0 -- NewestFirstBySeqNo
64 // kLevelNon0 -- BySmallestKey
65 struct FileComparator {
66 enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method;
67 const InternalKeyComparator* internal_comparator;
68
11fdf7f2
TL
69 FileComparator() : internal_comparator(nullptr) {}
70
7c673cae
FG
71 bool operator()(FileMetaData* f1, FileMetaData* f2) const {
72 switch (sort_method) {
73 case kLevel0:
74 return NewestFirstBySeqNo(f1, f2);
75 case kLevelNon0:
76 return BySmallestKey(f1, f2, internal_comparator);
77 }
78 assert(false);
79 return false;
80 }
81 };
82
83 struct LevelState {
84 std::unordered_set<uint64_t> deleted_files;
85 // Map from file number to file meta data.
86 std::unordered_map<uint64_t, FileMetaData*> added_files;
87 };
88
89 const EnvOptions& env_options_;
90 Logger* info_log_;
91 TableCache* table_cache_;
92 VersionStorageInfo* base_vstorage_;
11fdf7f2 93 int num_levels_;
7c673cae 94 LevelState* levels_;
11fdf7f2
TL
95 // Store states of levels larger than num_levels_. We do this instead of
96 // storing them in levels_ to avoid regression in case there are no files
97 // on invalid levels. The version is not consistent if in the end the files
98 // on invalid levels don't cancel out.
99 std::map<int, std::unordered_set<uint64_t>> invalid_levels_;
100 // Whether there are invalid new files or invalid deletion on levels larger
101 // than num_levels_.
102 bool has_invalid_levels_;
7c673cae
FG
103 FileComparator level_zero_cmp_;
104 FileComparator level_nonzero_cmp_;
105
106 public:
107 Rep(const EnvOptions& env_options, Logger* info_log, TableCache* table_cache,
108 VersionStorageInfo* base_vstorage)
109 : env_options_(env_options),
110 info_log_(info_log),
111 table_cache_(table_cache),
11fdf7f2
TL
112 base_vstorage_(base_vstorage),
113 num_levels_(base_vstorage->num_levels()),
114 has_invalid_levels_(false) {
115 levels_ = new LevelState[num_levels_];
7c673cae
FG
116 level_zero_cmp_.sort_method = FileComparator::kLevel0;
117 level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
118 level_nonzero_cmp_.internal_comparator =
119 base_vstorage_->InternalComparator();
120 }
121
122 ~Rep() {
11fdf7f2 123 for (int level = 0; level < num_levels_; level++) {
7c673cae
FG
124 const auto& added = levels_[level].added_files;
125 for (auto& pair : added) {
126 UnrefFile(pair.second);
127 }
128 }
129
130 delete[] levels_;
131 }
132
133 void UnrefFile(FileMetaData* f) {
134 f->refs--;
135 if (f->refs <= 0) {
136 if (f->table_reader_handle) {
137 assert(table_cache_ != nullptr);
138 table_cache_->ReleaseHandle(f->table_reader_handle);
139 f->table_reader_handle = nullptr;
140 }
141 delete f;
142 }
143 }
144
145 void CheckConsistency(VersionStorageInfo* vstorage) {
146#ifdef NDEBUG
147 if (!vstorage->force_consistency_checks()) {
148 // Dont run consistency checks in release mode except if
149 // explicitly asked to
150 return;
151 }
152#endif
153 // make sure the files are sorted correctly
11fdf7f2 154 for (int level = 0; level < num_levels_; level++) {
7c673cae
FG
155 auto& level_files = vstorage->LevelFiles(level);
156 for (size_t i = 1; i < level_files.size(); i++) {
157 auto f1 = level_files[i - 1];
158 auto f2 = level_files[i];
159 if (level == 0) {
160 if (!level_zero_cmp_(f1, f2)) {
161 fprintf(stderr, "L0 files are not sorted properly");
162 abort();
163 }
164
11fdf7f2 165 if (f2->fd.smallest_seqno == f2->fd.largest_seqno) {
7c673cae 166 // This is an external file that we ingested
11fdf7f2
TL
167 SequenceNumber external_file_seqno = f2->fd.smallest_seqno;
168 if (!(external_file_seqno < f1->fd.largest_seqno ||
7c673cae 169 external_file_seqno == 0)) {
11fdf7f2
TL
170 fprintf(stderr,
171 "L0 file with seqno %" PRIu64 " %" PRIu64
172 " vs. file with global_seqno %" PRIu64 "\n",
173 f1->fd.smallest_seqno, f1->fd.largest_seqno,
7c673cae
FG
174 external_file_seqno);
175 abort();
176 }
11fdf7f2
TL
177 } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) {
178 fprintf(stderr,
179 "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64
180 " %" PRIu64 "\n",
181 f1->fd.smallest_seqno, f1->fd.largest_seqno,
182 f2->fd.smallest_seqno, f2->fd.largest_seqno);
7c673cae
FG
183 abort();
184 }
185 } else {
186 if (!level_nonzero_cmp_(f1, f2)) {
187 fprintf(stderr, "L%d files are not sorted properly", level);
188 abort();
189 }
190
191 // Make sure there is no overlap in levels > 0
192 if (vstorage->InternalComparator()->Compare(f1->largest,
193 f2->smallest) >= 0) {
194 fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level,
195 (f1->largest).DebugString(true).c_str(),
196 (f2->smallest).DebugString(true).c_str());
197 abort();
198 }
199 }
200 }
201 }
202 }
203
11fdf7f2 204 void CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number,
7c673cae
FG
205 int level) {
206#ifdef NDEBUG
207 if (!base_vstorage_->force_consistency_checks()) {
208 // Dont run consistency checks in release mode except if
209 // explicitly asked to
210 return;
211 }
212#endif
213 // a file to be deleted better exist in the previous version
214 bool found = false;
11fdf7f2 215 for (int l = 0; !found && l < num_levels_; l++) {
7c673cae
FG
216 const std::vector<FileMetaData*>& base_files =
217 base_vstorage_->LevelFiles(l);
218 for (size_t i = 0; i < base_files.size(); i++) {
219 FileMetaData* f = base_files[i];
220 if (f->fd.GetNumber() == number) {
221 found = true;
222 break;
223 }
224 }
225 }
226 // if the file did not exist in the previous version, then it
227 // is possibly moved from lower level to higher level in current
228 // version
11fdf7f2 229 for (int l = level + 1; !found && l < num_levels_; l++) {
7c673cae
FG
230 auto& level_added = levels_[l].added_files;
231 auto got = level_added.find(number);
232 if (got != level_added.end()) {
233 found = true;
234 break;
235 }
236 }
237
238 // maybe this file was added in a previous edit that was Applied
239 if (!found) {
240 auto& level_added = levels_[level].added_files;
241 auto got = level_added.find(number);
242 if (got != level_added.end()) {
243 found = true;
244 }
245 }
246 if (!found) {
247 fprintf(stderr, "not found %" PRIu64 "\n", number);
248 abort();
249 }
250 }
251
11fdf7f2
TL
252 bool CheckConsistencyForNumLevels() {
253 // Make sure there are no files on or beyond num_levels().
254 if (has_invalid_levels_) {
255 return false;
256 }
257 for (auto& level : invalid_levels_) {
258 if (level.second.size() > 0) {
259 return false;
260 }
261 }
262 return true;
263 }
264
7c673cae
FG
265 // Apply all of the edits in *edit to the current state.
266 void Apply(VersionEdit* edit) {
267 CheckConsistency(base_vstorage_);
268
269 // Delete files
270 const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles();
271 for (const auto& del_file : del) {
272 const auto level = del_file.first;
273 const auto number = del_file.second;
11fdf7f2
TL
274 if (level < num_levels_) {
275 levels_[level].deleted_files.insert(number);
276 CheckConsistencyForDeletes(edit, number, level);
277
278 auto exising = levels_[level].added_files.find(number);
279 if (exising != levels_[level].added_files.end()) {
280 UnrefFile(exising->second);
281 levels_[level].added_files.erase(exising);
282 }
283 } else {
284 auto exising = invalid_levels_[level].find(number);
285 if (exising != invalid_levels_[level].end()) {
286 invalid_levels_[level].erase(exising);
287 } else {
288 // Deleting an non-existing file on invalid level.
289 has_invalid_levels_ = true;
290 }
7c673cae
FG
291 }
292 }
293
294 // Add new files
295 for (const auto& new_file : edit->GetNewFiles()) {
296 const int level = new_file.first;
11fdf7f2
TL
297 if (level < num_levels_) {
298 FileMetaData* f = new FileMetaData(new_file.second);
299 f->refs = 1;
300
301 assert(levels_[level].added_files.find(f->fd.GetNumber()) ==
302 levels_[level].added_files.end());
303 levels_[level].deleted_files.erase(f->fd.GetNumber());
304 levels_[level].added_files[f->fd.GetNumber()] = f;
305 } else {
306 uint64_t number = new_file.second.fd.GetNumber();
307 if (invalid_levels_[level].count(number) == 0) {
308 invalid_levels_[level].insert(number);
309 } else {
310 // Creating an already existing file on invalid level.
311 has_invalid_levels_ = true;
312 }
313 }
7c673cae
FG
314 }
315 }
316
317 // Save the current state in *v.
318 void SaveTo(VersionStorageInfo* vstorage) {
319 CheckConsistency(base_vstorage_);
320 CheckConsistency(vstorage);
321
11fdf7f2 322 for (int level = 0; level < num_levels_; level++) {
7c673cae
FG
323 const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
324 // Merge the set of added files with the set of pre-existing files.
325 // Drop any deleted files. Store the result in *v.
326 const auto& base_files = base_vstorage_->LevelFiles(level);
7c673cae
FG
327 const auto& unordered_added_files = levels_[level].added_files;
328 vstorage->Reserve(level,
329 base_files.size() + unordered_added_files.size());
330
331 // Sort added files for the level.
332 std::vector<FileMetaData*> added_files;
333 added_files.reserve(unordered_added_files.size());
334 for (const auto& pair : unordered_added_files) {
335 added_files.push_back(pair.second);
336 }
337 std::sort(added_files.begin(), added_files.end(), cmp);
338
339#ifndef NDEBUG
11fdf7f2 340 FileMetaData* prev_added_file = nullptr;
7c673cae 341 for (const auto& added : added_files) {
11fdf7f2 342 if (level > 0 && prev_added_file != nullptr) {
7c673cae 343 assert(base_vstorage_->InternalComparator()->Compare(
11fdf7f2 344 prev_added_file->smallest, added->smallest) <= 0);
7c673cae 345 }
11fdf7f2
TL
346 prev_added_file = added;
347 }
7c673cae
FG
348#endif
349
11fdf7f2
TL
350 auto base_iter = base_files.begin();
351 auto base_end = base_files.end();
352 auto added_iter = added_files.begin();
353 auto added_end = added_files.end();
354 while (added_iter != added_end || base_iter != base_end) {
355 if (base_iter == base_end ||
356 (added_iter != added_end && cmp(*added_iter, *base_iter))) {
357 MaybeAddFile(vstorage, level, *added_iter++);
358 } else {
359 MaybeAddFile(vstorage, level, *base_iter++);
7c673cae 360 }
7c673cae
FG
361 }
362 }
363
364 CheckConsistency(vstorage);
365 }
366
367 void LoadTableHandlers(InternalStats* internal_stats, int max_threads,
11fdf7f2
TL
368 bool prefetch_index_and_filter_in_cache,
369 const SliceTransform* prefix_extractor) {
7c673cae
FG
370 assert(table_cache_ != nullptr);
371 // <file metadata, level>
372 std::vector<std::pair<FileMetaData*, int>> files_meta;
11fdf7f2 373 for (int level = 0; level < num_levels_; level++) {
7c673cae
FG
374 for (auto& file_meta_pair : levels_[level].added_files) {
375 auto* file_meta = file_meta_pair.second;
376 assert(!file_meta->table_reader_handle);
377 files_meta.emplace_back(file_meta, level);
378 }
379 }
380
381 std::atomic<size_t> next_file_meta_idx(0);
11fdf7f2 382 std::function<void()> load_handlers_func([&]() {
7c673cae
FG
383 while (true) {
384 size_t file_idx = next_file_meta_idx.fetch_add(1);
385 if (file_idx >= files_meta.size()) {
386 break;
387 }
388
389 auto* file_meta = files_meta[file_idx].first;
390 int level = files_meta[file_idx].second;
11fdf7f2
TL
391 table_cache_->FindTable(
392 env_options_, *(base_vstorage_->InternalComparator()),
393 file_meta->fd, &file_meta->table_reader_handle, prefix_extractor,
394 false /*no_io */, true /* record_read_stats */,
395 internal_stats->GetFileReadHist(level), false, level,
396 prefetch_index_and_filter_in_cache);
7c673cae
FG
397 if (file_meta->table_reader_handle != nullptr) {
398 // Load table_reader
399 file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
400 file_meta->table_reader_handle);
401 }
402 }
11fdf7f2 403 });
7c673cae 404
11fdf7f2
TL
405 std::vector<port::Thread> threads;
406 for (int i = 1; i < max_threads; i++) {
407 threads.emplace_back(load_handlers_func);
408 }
409 load_handlers_func();
410 for (auto& t : threads) {
411 t.join();
7c673cae
FG
412 }
413 }
414
415 void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) {
416 if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
11fdf7f2 417 // f is to-be-deleted table file
7c673cae
FG
418 vstorage->RemoveCurrentStats(f);
419 } else {
420 vstorage->AddFile(level, f, info_log_);
421 }
422 }
423};
424
425VersionBuilder::VersionBuilder(const EnvOptions& env_options,
426 TableCache* table_cache,
427 VersionStorageInfo* base_vstorage,
428 Logger* info_log)
429 : rep_(new Rep(env_options, info_log, table_cache, base_vstorage)) {}
11fdf7f2 430
7c673cae 431VersionBuilder::~VersionBuilder() { delete rep_; }
11fdf7f2 432
7c673cae
FG
433void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
434 rep_->CheckConsistency(vstorage);
435}
11fdf7f2 436
7c673cae
FG
437void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit,
438 uint64_t number, int level) {
439 rep_->CheckConsistencyForDeletes(edit, number, level);
440}
11fdf7f2
TL
441
442bool VersionBuilder::CheckConsistencyForNumLevels() {
443 return rep_->CheckConsistencyForNumLevels();
444}
445
7c673cae 446void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); }
11fdf7f2 447
7c673cae
FG
448void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
449 rep_->SaveTo(vstorage);
450}
11fdf7f2
TL
451
452void VersionBuilder::LoadTableHandlers(InternalStats* internal_stats,
453 int max_threads,
454 bool prefetch_index_and_filter_in_cache,
455 const SliceTransform* prefix_extractor) {
7c673cae 456 rep_->LoadTableHandlers(internal_stats, max_threads,
11fdf7f2 457 prefetch_index_and_filter_in_cache, prefix_extractor);
7c673cae 458}
11fdf7f2 459
7c673cae
FG
460void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
461 FileMetaData* f) {
462 rep_->MaybeAddFile(vstorage, level, f);
463}
464
465} // namespace rocksdb