]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. | |
7 | // Use of this source code is governed by a BSD-style license that can be | |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
9 | ||
10 | #include "db/version_builder.h" | |
11 | ||
12 | #ifndef __STDC_FORMAT_MACROS | |
13 | #define __STDC_FORMAT_MACROS | |
14 | #endif | |
15 | ||
16 | #include <inttypes.h> | |
17 | #include <algorithm> | |
18 | #include <atomic> | |
19 | #include <functional> | |
11fdf7f2 | 20 | #include <map> |
7c673cae FG |
21 | #include <set> |
22 | #include <thread> | |
23 | #include <unordered_map> | |
24 | #include <unordered_set> | |
25 | #include <utility> | |
26 | #include <vector> | |
27 | ||
28 | #include "db/dbformat.h" | |
29 | #include "db/internal_stats.h" | |
30 | #include "db/table_cache.h" | |
31 | #include "db/version_set.h" | |
32 | #include "port/port.h" | |
33 | #include "table/table_reader.h" | |
34 | ||
35 | namespace rocksdb { | |
36 | ||
37 | bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { | |
11fdf7f2 TL |
38 | if (a->fd.largest_seqno != b->fd.largest_seqno) { |
39 | return a->fd.largest_seqno > b->fd.largest_seqno; | |
7c673cae | 40 | } |
11fdf7f2 TL |
41 | if (a->fd.smallest_seqno != b->fd.smallest_seqno) { |
42 | return a->fd.smallest_seqno > b->fd.smallest_seqno; | |
7c673cae FG |
43 | } |
44 | // Break ties by file number | |
45 | return a->fd.GetNumber() > b->fd.GetNumber(); | |
46 | } | |
47 | ||
48 | namespace { | |
49 | bool BySmallestKey(FileMetaData* a, FileMetaData* b, | |
50 | const InternalKeyComparator* cmp) { | |
51 | int r = cmp->Compare(a->smallest, b->smallest); | |
52 | if (r != 0) { | |
53 | return (r < 0); | |
54 | } | |
55 | // Break ties by file number | |
56 | return (a->fd.GetNumber() < b->fd.GetNumber()); | |
57 | } | |
58 | } // namespace | |
59 | ||
60 | class VersionBuilder::Rep { | |
61 | private: | |
62 | // Helper to sort files_ in v | |
63 | // kLevel0 -- NewestFirstBySeqNo | |
64 | // kLevelNon0 -- BySmallestKey | |
65 | struct FileComparator { | |
66 | enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method; | |
67 | const InternalKeyComparator* internal_comparator; | |
68 | ||
11fdf7f2 TL |
69 | FileComparator() : internal_comparator(nullptr) {} |
70 | ||
7c673cae FG |
71 | bool operator()(FileMetaData* f1, FileMetaData* f2) const { |
72 | switch (sort_method) { | |
73 | case kLevel0: | |
74 | return NewestFirstBySeqNo(f1, f2); | |
75 | case kLevelNon0: | |
76 | return BySmallestKey(f1, f2, internal_comparator); | |
77 | } | |
78 | assert(false); | |
79 | return false; | |
80 | } | |
81 | }; | |
82 | ||
83 | struct LevelState { | |
84 | std::unordered_set<uint64_t> deleted_files; | |
85 | // Map from file number to file meta data. | |
86 | std::unordered_map<uint64_t, FileMetaData*> added_files; | |
87 | }; | |
88 | ||
89 | const EnvOptions& env_options_; | |
90 | Logger* info_log_; | |
91 | TableCache* table_cache_; | |
92 | VersionStorageInfo* base_vstorage_; | |
11fdf7f2 | 93 | int num_levels_; |
7c673cae | 94 | LevelState* levels_; |
11fdf7f2 TL |
95 | // Store states of levels larger than num_levels_. We do this instead of |
96 | // storing them in levels_ to avoid regression in case there are no files | |
97 | // on invalid levels. The version is not consistent if in the end the files | |
98 | // on invalid levels don't cancel out. | |
99 | std::map<int, std::unordered_set<uint64_t>> invalid_levels_; | |
100 | // Whether there are invalid new files or invalid deletion on levels larger | |
101 | // than num_levels_. | |
102 | bool has_invalid_levels_; | |
7c673cae FG |
103 | FileComparator level_zero_cmp_; |
104 | FileComparator level_nonzero_cmp_; | |
105 | ||
106 | public: | |
107 | Rep(const EnvOptions& env_options, Logger* info_log, TableCache* table_cache, | |
108 | VersionStorageInfo* base_vstorage) | |
109 | : env_options_(env_options), | |
110 | info_log_(info_log), | |
111 | table_cache_(table_cache), | |
11fdf7f2 TL |
112 | base_vstorage_(base_vstorage), |
113 | num_levels_(base_vstorage->num_levels()), | |
114 | has_invalid_levels_(false) { | |
115 | levels_ = new LevelState[num_levels_]; | |
7c673cae FG |
116 | level_zero_cmp_.sort_method = FileComparator::kLevel0; |
117 | level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; | |
118 | level_nonzero_cmp_.internal_comparator = | |
119 | base_vstorage_->InternalComparator(); | |
120 | } | |
121 | ||
122 | ~Rep() { | |
11fdf7f2 | 123 | for (int level = 0; level < num_levels_; level++) { |
7c673cae FG |
124 | const auto& added = levels_[level].added_files; |
125 | for (auto& pair : added) { | |
126 | UnrefFile(pair.second); | |
127 | } | |
128 | } | |
129 | ||
130 | delete[] levels_; | |
131 | } | |
132 | ||
133 | void UnrefFile(FileMetaData* f) { | |
134 | f->refs--; | |
135 | if (f->refs <= 0) { | |
136 | if (f->table_reader_handle) { | |
137 | assert(table_cache_ != nullptr); | |
138 | table_cache_->ReleaseHandle(f->table_reader_handle); | |
139 | f->table_reader_handle = nullptr; | |
140 | } | |
141 | delete f; | |
142 | } | |
143 | } | |
144 | ||
145 | void CheckConsistency(VersionStorageInfo* vstorage) { | |
146 | #ifdef NDEBUG | |
147 | if (!vstorage->force_consistency_checks()) { | |
148 | // Dont run consistency checks in release mode except if | |
149 | // explicitly asked to | |
150 | return; | |
151 | } | |
152 | #endif | |
153 | // make sure the files are sorted correctly | |
11fdf7f2 | 154 | for (int level = 0; level < num_levels_; level++) { |
7c673cae FG |
155 | auto& level_files = vstorage->LevelFiles(level); |
156 | for (size_t i = 1; i < level_files.size(); i++) { | |
157 | auto f1 = level_files[i - 1]; | |
158 | auto f2 = level_files[i]; | |
159 | if (level == 0) { | |
160 | if (!level_zero_cmp_(f1, f2)) { | |
161 | fprintf(stderr, "L0 files are not sorted properly"); | |
162 | abort(); | |
163 | } | |
164 | ||
11fdf7f2 | 165 | if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { |
7c673cae | 166 | // This is an external file that we ingested |
11fdf7f2 TL |
167 | SequenceNumber external_file_seqno = f2->fd.smallest_seqno; |
168 | if (!(external_file_seqno < f1->fd.largest_seqno || | |
7c673cae | 169 | external_file_seqno == 0)) { |
11fdf7f2 TL |
170 | fprintf(stderr, |
171 | "L0 file with seqno %" PRIu64 " %" PRIu64 | |
172 | " vs. file with global_seqno %" PRIu64 "\n", | |
173 | f1->fd.smallest_seqno, f1->fd.largest_seqno, | |
7c673cae FG |
174 | external_file_seqno); |
175 | abort(); | |
176 | } | |
11fdf7f2 TL |
177 | } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { |
178 | fprintf(stderr, | |
179 | "L0 files seqno %" PRIu64 " %" PRIu64 " vs. %" PRIu64 | |
180 | " %" PRIu64 "\n", | |
181 | f1->fd.smallest_seqno, f1->fd.largest_seqno, | |
182 | f2->fd.smallest_seqno, f2->fd.largest_seqno); | |
7c673cae FG |
183 | abort(); |
184 | } | |
185 | } else { | |
186 | if (!level_nonzero_cmp_(f1, f2)) { | |
187 | fprintf(stderr, "L%d files are not sorted properly", level); | |
188 | abort(); | |
189 | } | |
190 | ||
191 | // Make sure there is no overlap in levels > 0 | |
192 | if (vstorage->InternalComparator()->Compare(f1->largest, | |
193 | f2->smallest) >= 0) { | |
194 | fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level, | |
195 | (f1->largest).DebugString(true).c_str(), | |
196 | (f2->smallest).DebugString(true).c_str()); | |
197 | abort(); | |
198 | } | |
199 | } | |
200 | } | |
201 | } | |
202 | } | |
203 | ||
11fdf7f2 | 204 | void CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, |
7c673cae FG |
205 | int level) { |
206 | #ifdef NDEBUG | |
207 | if (!base_vstorage_->force_consistency_checks()) { | |
208 | // Dont run consistency checks in release mode except if | |
209 | // explicitly asked to | |
210 | return; | |
211 | } | |
212 | #endif | |
213 | // a file to be deleted better exist in the previous version | |
214 | bool found = false; | |
11fdf7f2 | 215 | for (int l = 0; !found && l < num_levels_; l++) { |
7c673cae FG |
216 | const std::vector<FileMetaData*>& base_files = |
217 | base_vstorage_->LevelFiles(l); | |
218 | for (size_t i = 0; i < base_files.size(); i++) { | |
219 | FileMetaData* f = base_files[i]; | |
220 | if (f->fd.GetNumber() == number) { | |
221 | found = true; | |
222 | break; | |
223 | } | |
224 | } | |
225 | } | |
226 | // if the file did not exist in the previous version, then it | |
227 | // is possibly moved from lower level to higher level in current | |
228 | // version | |
11fdf7f2 | 229 | for (int l = level + 1; !found && l < num_levels_; l++) { |
7c673cae FG |
230 | auto& level_added = levels_[l].added_files; |
231 | auto got = level_added.find(number); | |
232 | if (got != level_added.end()) { | |
233 | found = true; | |
234 | break; | |
235 | } | |
236 | } | |
237 | ||
238 | // maybe this file was added in a previous edit that was Applied | |
239 | if (!found) { | |
240 | auto& level_added = levels_[level].added_files; | |
241 | auto got = level_added.find(number); | |
242 | if (got != level_added.end()) { | |
243 | found = true; | |
244 | } | |
245 | } | |
246 | if (!found) { | |
247 | fprintf(stderr, "not found %" PRIu64 "\n", number); | |
248 | abort(); | |
249 | } | |
250 | } | |
251 | ||
11fdf7f2 TL |
252 | bool CheckConsistencyForNumLevels() { |
253 | // Make sure there are no files on or beyond num_levels(). | |
254 | if (has_invalid_levels_) { | |
255 | return false; | |
256 | } | |
257 | for (auto& level : invalid_levels_) { | |
258 | if (level.second.size() > 0) { | |
259 | return false; | |
260 | } | |
261 | } | |
262 | return true; | |
263 | } | |
264 | ||
7c673cae FG |
265 | // Apply all of the edits in *edit to the current state. |
266 | void Apply(VersionEdit* edit) { | |
267 | CheckConsistency(base_vstorage_); | |
268 | ||
269 | // Delete files | |
270 | const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); | |
271 | for (const auto& del_file : del) { | |
272 | const auto level = del_file.first; | |
273 | const auto number = del_file.second; | |
11fdf7f2 TL |
274 | if (level < num_levels_) { |
275 | levels_[level].deleted_files.insert(number); | |
276 | CheckConsistencyForDeletes(edit, number, level); | |
277 | ||
278 | auto exising = levels_[level].added_files.find(number); | |
279 | if (exising != levels_[level].added_files.end()) { | |
280 | UnrefFile(exising->second); | |
281 | levels_[level].added_files.erase(exising); | |
282 | } | |
283 | } else { | |
284 | auto exising = invalid_levels_[level].find(number); | |
285 | if (exising != invalid_levels_[level].end()) { | |
286 | invalid_levels_[level].erase(exising); | |
287 | } else { | |
288 | // Deleting an non-existing file on invalid level. | |
289 | has_invalid_levels_ = true; | |
290 | } | |
7c673cae FG |
291 | } |
292 | } | |
293 | ||
294 | // Add new files | |
295 | for (const auto& new_file : edit->GetNewFiles()) { | |
296 | const int level = new_file.first; | |
11fdf7f2 TL |
297 | if (level < num_levels_) { |
298 | FileMetaData* f = new FileMetaData(new_file.second); | |
299 | f->refs = 1; | |
300 | ||
301 | assert(levels_[level].added_files.find(f->fd.GetNumber()) == | |
302 | levels_[level].added_files.end()); | |
303 | levels_[level].deleted_files.erase(f->fd.GetNumber()); | |
304 | levels_[level].added_files[f->fd.GetNumber()] = f; | |
305 | } else { | |
306 | uint64_t number = new_file.second.fd.GetNumber(); | |
307 | if (invalid_levels_[level].count(number) == 0) { | |
308 | invalid_levels_[level].insert(number); | |
309 | } else { | |
310 | // Creating an already existing file on invalid level. | |
311 | has_invalid_levels_ = true; | |
312 | } | |
313 | } | |
7c673cae FG |
314 | } |
315 | } | |
316 | ||
317 | // Save the current state in *v. | |
318 | void SaveTo(VersionStorageInfo* vstorage) { | |
319 | CheckConsistency(base_vstorage_); | |
320 | CheckConsistency(vstorage); | |
321 | ||
11fdf7f2 | 322 | for (int level = 0; level < num_levels_; level++) { |
7c673cae FG |
323 | const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; |
324 | // Merge the set of added files with the set of pre-existing files. | |
325 | // Drop any deleted files. Store the result in *v. | |
326 | const auto& base_files = base_vstorage_->LevelFiles(level); | |
7c673cae FG |
327 | const auto& unordered_added_files = levels_[level].added_files; |
328 | vstorage->Reserve(level, | |
329 | base_files.size() + unordered_added_files.size()); | |
330 | ||
331 | // Sort added files for the level. | |
332 | std::vector<FileMetaData*> added_files; | |
333 | added_files.reserve(unordered_added_files.size()); | |
334 | for (const auto& pair : unordered_added_files) { | |
335 | added_files.push_back(pair.second); | |
336 | } | |
337 | std::sort(added_files.begin(), added_files.end(), cmp); | |
338 | ||
339 | #ifndef NDEBUG | |
11fdf7f2 | 340 | FileMetaData* prev_added_file = nullptr; |
7c673cae | 341 | for (const auto& added : added_files) { |
11fdf7f2 | 342 | if (level > 0 && prev_added_file != nullptr) { |
7c673cae | 343 | assert(base_vstorage_->InternalComparator()->Compare( |
11fdf7f2 | 344 | prev_added_file->smallest, added->smallest) <= 0); |
7c673cae | 345 | } |
11fdf7f2 TL |
346 | prev_added_file = added; |
347 | } | |
7c673cae FG |
348 | #endif |
349 | ||
11fdf7f2 TL |
350 | auto base_iter = base_files.begin(); |
351 | auto base_end = base_files.end(); | |
352 | auto added_iter = added_files.begin(); | |
353 | auto added_end = added_files.end(); | |
354 | while (added_iter != added_end || base_iter != base_end) { | |
355 | if (base_iter == base_end || | |
356 | (added_iter != added_end && cmp(*added_iter, *base_iter))) { | |
357 | MaybeAddFile(vstorage, level, *added_iter++); | |
358 | } else { | |
359 | MaybeAddFile(vstorage, level, *base_iter++); | |
7c673cae | 360 | } |
7c673cae FG |
361 | } |
362 | } | |
363 | ||
364 | CheckConsistency(vstorage); | |
365 | } | |
366 | ||
367 | void LoadTableHandlers(InternalStats* internal_stats, int max_threads, | |
11fdf7f2 TL |
368 | bool prefetch_index_and_filter_in_cache, |
369 | const SliceTransform* prefix_extractor) { | |
7c673cae FG |
370 | assert(table_cache_ != nullptr); |
371 | // <file metadata, level> | |
372 | std::vector<std::pair<FileMetaData*, int>> files_meta; | |
11fdf7f2 | 373 | for (int level = 0; level < num_levels_; level++) { |
7c673cae FG |
374 | for (auto& file_meta_pair : levels_[level].added_files) { |
375 | auto* file_meta = file_meta_pair.second; | |
376 | assert(!file_meta->table_reader_handle); | |
377 | files_meta.emplace_back(file_meta, level); | |
378 | } | |
379 | } | |
380 | ||
381 | std::atomic<size_t> next_file_meta_idx(0); | |
11fdf7f2 | 382 | std::function<void()> load_handlers_func([&]() { |
7c673cae FG |
383 | while (true) { |
384 | size_t file_idx = next_file_meta_idx.fetch_add(1); | |
385 | if (file_idx >= files_meta.size()) { | |
386 | break; | |
387 | } | |
388 | ||
389 | auto* file_meta = files_meta[file_idx].first; | |
390 | int level = files_meta[file_idx].second; | |
11fdf7f2 TL |
391 | table_cache_->FindTable( |
392 | env_options_, *(base_vstorage_->InternalComparator()), | |
393 | file_meta->fd, &file_meta->table_reader_handle, prefix_extractor, | |
394 | false /*no_io */, true /* record_read_stats */, | |
395 | internal_stats->GetFileReadHist(level), false, level, | |
396 | prefetch_index_and_filter_in_cache); | |
7c673cae FG |
397 | if (file_meta->table_reader_handle != nullptr) { |
398 | // Load table_reader | |
399 | file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( | |
400 | file_meta->table_reader_handle); | |
401 | } | |
402 | } | |
11fdf7f2 | 403 | }); |
7c673cae | 404 | |
11fdf7f2 TL |
405 | std::vector<port::Thread> threads; |
406 | for (int i = 1; i < max_threads; i++) { | |
407 | threads.emplace_back(load_handlers_func); | |
408 | } | |
409 | load_handlers_func(); | |
410 | for (auto& t : threads) { | |
411 | t.join(); | |
7c673cae FG |
412 | } |
413 | } | |
414 | ||
415 | void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { | |
416 | if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { | |
11fdf7f2 | 417 | // f is to-be-deleted table file |
7c673cae FG |
418 | vstorage->RemoveCurrentStats(f); |
419 | } else { | |
420 | vstorage->AddFile(level, f, info_log_); | |
421 | } | |
422 | } | |
423 | }; | |
424 | ||
425 | VersionBuilder::VersionBuilder(const EnvOptions& env_options, | |
426 | TableCache* table_cache, | |
427 | VersionStorageInfo* base_vstorage, | |
428 | Logger* info_log) | |
429 | : rep_(new Rep(env_options, info_log, table_cache, base_vstorage)) {} | |
11fdf7f2 | 430 | |
7c673cae | 431 | VersionBuilder::~VersionBuilder() { delete rep_; } |
11fdf7f2 | 432 | |
7c673cae FG |
433 | void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { |
434 | rep_->CheckConsistency(vstorage); | |
435 | } | |
11fdf7f2 | 436 | |
7c673cae FG |
437 | void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, |
438 | uint64_t number, int level) { | |
439 | rep_->CheckConsistencyForDeletes(edit, number, level); | |
440 | } | |
11fdf7f2 TL |
441 | |
442 | bool VersionBuilder::CheckConsistencyForNumLevels() { | |
443 | return rep_->CheckConsistencyForNumLevels(); | |
444 | } | |
445 | ||
7c673cae | 446 | void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } |
11fdf7f2 | 447 | |
7c673cae FG |
448 | void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { |
449 | rep_->SaveTo(vstorage); | |
450 | } | |
11fdf7f2 TL |
451 | |
452 | void VersionBuilder::LoadTableHandlers(InternalStats* internal_stats, | |
453 | int max_threads, | |
454 | bool prefetch_index_and_filter_in_cache, | |
455 | const SliceTransform* prefix_extractor) { | |
7c673cae | 456 | rep_->LoadTableHandlers(internal_stats, max_threads, |
11fdf7f2 | 457 | prefetch_index_and_filter_in_cache, prefix_extractor); |
7c673cae | 458 | } |
11fdf7f2 | 459 | |
7c673cae FG |
460 | void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, |
461 | FileMetaData* f) { | |
462 | rep_->MaybeAddFile(vstorage, level, f); | |
463 | } | |
464 | ||
465 | } // namespace rocksdb |