]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/repair.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / db / repair.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9//
10// Repairer does best effort recovery to recover as much data as possible after
11// a disaster without compromising consistency. It does not guarantee bringing
12// the database to a time consistent state.
13//
14// Repair process is broken into 4 phases:
15// (a) Find files
16// (b) Convert logs to tables
17// (c) Extract metadata
18// (d) Write Descriptor
19//
20// (a) Find files
21//
22// The repairer goes through all the files in the directory, and classifies them
23// based on their file name. Any file that cannot be identified by name will be
24// ignored.
25//
26// (b) Convert logs to table
27//
28// Every log file that is active is replayed. All sections of the file where the
29// checksum does not match are skipped over. We intentionally give preference to
30// data consistency.
31//
32// (c) Extract metadata
33//
34// We scan every table to compute
35// (1) smallest/largest for the table
36// (2) largest sequence number in the table
f67539c2 37// (3) oldest blob file referred to by the table (if applicable)
7c673cae
FG
38//
39// If we are unable to scan the file, then we ignore the table.
40//
41// (d) Write Descriptor
42//
43// We generate descriptor contents:
44// - log number is set to zero
45// - next-file-number is set to 1 + largest file number we found
46// - last-sequence-number is set to largest sequence# found across
47// all tables (see 2c)
48// - compaction pointers are cleared
49// - every table file is added at level 0
50//
51// Possible optimization 1:
52// (a) Compute total size and use to pick appropriate max-level M
53// (b) Sort tables by largest sequence# in the table
54// (c) For each table: if it overlaps earlier table, place in level-0,
55// else place in level-M.
56// (d) We can provide options for time consistent recovery and unsafe recovery
57// (ignore checksum failure when applicable)
58// Possible optimization 2:
59// Store per-table metadata (smallest, largest, largest-seq#, ...)
60// in the table's meta section to speed up ScanTable.
61
62#ifndef ROCKSDB_LITE
63
f67539c2 64#include <cinttypes>
1e59de90 65
7c673cae 66#include "db/builder.h"
f67539c2 67#include "db/db_impl/db_impl.h"
7c673cae
FG
68#include "db/dbformat.h"
69#include "db/log_reader.h"
70#include "db/log_writer.h"
71#include "db/memtable.h"
72#include "db/table_cache.h"
73#include "db/version_edit.h"
74#include "db/write_batch_internal.h"
f67539c2
TL
75#include "file/filename.h"
76#include "file/writable_file_writer.h"
1e59de90 77#include "logging/logging.h"
7c673cae
FG
78#include "options/cf_options.h"
79#include "rocksdb/comparator.h"
80#include "rocksdb/db.h"
81#include "rocksdb/env.h"
82#include "rocksdb/options.h"
83#include "rocksdb/write_buffer_manager.h"
84#include "table/scoped_arena_iterator.h"
1e59de90 85#include "table/unique_id_impl.h"
7c673cae
FG
86#include "util/string_util.h"
87
f67539c2 88namespace ROCKSDB_NAMESPACE {
7c673cae
FG
89
90namespace {
91
92class Repairer {
93 public:
94 Repairer(const std::string& dbname, const DBOptions& db_options,
95 const std::vector<ColumnFamilyDescriptor>& column_families,
96 const ColumnFamilyOptions& default_cf_opts,
97 const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
98 : dbname_(dbname),
1e59de90 99 db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)),
7c673cae 100 env_(db_options.env),
1e59de90 101 file_options_(),
7c673cae 102 db_options_(SanitizeOptions(dbname_, db_options)),
11fdf7f2 103 immutable_db_options_(ImmutableDBOptions(db_options_)),
7c673cae 104 icmp_(default_cf_opts.comparator),
11fdf7f2
TL
105 default_cf_opts_(
106 SanitizeOptions(immutable_db_options_, default_cf_opts)),
1e59de90
TL
107 default_iopts_(
108 ImmutableOptions(immutable_db_options_, default_cf_opts_)),
11fdf7f2
TL
109 unknown_cf_opts_(
110 SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
7c673cae
FG
111 create_unknown_cfs_(create_unknown_cfs),
112 raw_table_cache_(
113 // TableCache can be small since we expect each table to be opened
114 // once.
115 NewLRUCache(10, db_options_.table_cache_numshardbits)),
1e59de90
TL
116 table_cache_(new TableCache(default_iopts_, &file_options_,
117 raw_table_cache_.get(),
118 /*block_cache_tracer=*/nullptr,
119 /*io_tracer=*/nullptr, db_session_id_)),
7c673cae
FG
120 wb_(db_options_.db_write_buffer_size),
121 wc_(db_options_.delayed_write_rate),
1e59de90 122 vset_(dbname_, &immutable_db_options_, file_options_,
f67539c2 123 raw_table_cache_.get(), &wb_, &wc_,
1e59de90
TL
124 /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
125 /*db_id=*/"", db_session_id_),
494da23a 126 next_file_number_(1),
20effc67
TL
127 db_lock_(nullptr),
128 closed_(false) {
7c673cae
FG
129 for (const auto& cfd : column_families) {
130 cf_name_to_opts_[cfd.name] = cfd.options;
131 }
132 }
133
134 const ColumnFamilyOptions* GetColumnFamilyOptions(
135 const std::string& cf_name) {
136 if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
137 if (create_unknown_cfs_) {
138 return &unknown_cf_opts_;
139 }
140 return nullptr;
141 }
142 return &cf_name_to_opts_[cf_name];
143 }
144
145 // Adds a column family to the VersionSet with cf_options_ and updates
146 // manifest.
147 Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
148 const auto* cf_opts = GetColumnFamilyOptions(cf_name);
149 if (cf_opts == nullptr) {
150 return Status::Corruption("Encountered unknown column family with name=" +
1e59de90 151 cf_name + ", id=" + std::to_string(cf_id));
7c673cae
FG
152 }
153 Options opts(db_options_, *cf_opts);
154 MutableCFOptions mut_cf_opts(opts);
155
156 VersionEdit edit;
157 edit.SetComparatorName(opts.comparator->Name());
158 edit.SetLogNumber(0);
159 edit.SetColumnFamily(cf_id);
160 ColumnFamilyData* cfd;
161 cfd = nullptr;
162 edit.AddColumnFamily(cf_name);
163
164 mutex_.Lock();
1e59de90
TL
165 std::unique_ptr<FSDirectory> db_dir;
166 Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
167 &db_dir, nullptr);
168 if (status.ok()) {
169 status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(),
170 false /* new_descriptor_log */, cf_opts);
171 }
7c673cae
FG
172 mutex_.Unlock();
173 return status;
174 }
175
20effc67
TL
176 Status Close() {
177 Status s = Status::OK();
178 if (!closed_) {
179 if (db_lock_ != nullptr) {
180 s = env_->UnlockFile(db_lock_);
181 db_lock_ = nullptr;
182 }
183 closed_ = true;
494da23a 184 }
20effc67 185 return s;
7c673cae
FG
186 }
187
20effc67
TL
188 ~Repairer() { Close().PermitUncheckedError(); }
189
7c673cae 190 Status Run() {
494da23a
TL
191 Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
192 if (!status.ok()) {
193 return status;
194 }
195 status = FindFiles();
20effc67 196 DBImpl* db_impl = nullptr;
7c673cae
FG
197 if (status.ok()) {
198 // Discard older manifests and start a fresh one
199 for (size_t i = 0; i < manifests_.size(); i++) {
200 ArchiveFile(dbname_ + "/" + manifests_[i]);
201 }
202 // Just create a DBImpl temporarily so we can reuse NewDB()
20effc67 203 db_impl = new DBImpl(db_options_, dbname_);
20effc67
TL
204 status = db_impl->NewDB(/*new_filenames=*/nullptr);
205 }
206 delete db_impl;
11fdf7f2 207
7c673cae
FG
208 if (status.ok()) {
209 // Recover using the fresh manifest created by NewDB()
210 status =
211 vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false);
212 }
213 if (status.ok()) {
214 // Need to scan existing SST files first so the column families are
215 // created before we process WAL files
216 ExtractMetaData();
217
218 // ExtractMetaData() uses table_fds_ to know which SST files' metadata to
219 // extract -- we need to clear it here since metadata for existing SST
220 // files has been extracted already
221 table_fds_.clear();
222 ConvertLogFilesToTables();
223 ExtractMetaData();
224 status = AddTables();
225 }
226 if (status.ok()) {
227 uint64_t bytes = 0;
228 for (size_t i = 0; i < tables_.size(); i++) {
229 bytes += tables_[i].meta.fd.GetFileSize();
230 }
231 ROCKS_LOG_WARN(db_options_.info_log,
232 "**** Repaired rocksdb %s; "
233 "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
11fdf7f2 234 " bytes. "
7c673cae
FG
235 "Some data may have been lost. "
236 "****",
237 dbname_.c_str(), tables_.size(), bytes);
238 }
239 return status;
240 }
241
242 private:
243 struct TableInfo {
244 FileMetaData meta;
245 uint32_t column_family_id;
246 std::string column_family_name;
7c673cae
FG
247 };
248
249 std::string const dbname_;
20effc67 250 std::string db_session_id_;
7c673cae 251 Env* const env_;
1e59de90 252 const FileOptions file_options_;
7c673cae
FG
253 const DBOptions db_options_;
254 const ImmutableDBOptions immutable_db_options_;
255 const InternalKeyComparator icmp_;
256 const ColumnFamilyOptions default_cf_opts_;
1e59de90 257 const ImmutableOptions default_iopts_; // table_cache_ holds reference
7c673cae
FG
258 const ColumnFamilyOptions unknown_cf_opts_;
259 const bool create_unknown_cfs_;
260 std::shared_ptr<Cache> raw_table_cache_;
20effc67 261 std::unique_ptr<TableCache> table_cache_;
7c673cae
FG
262 WriteBufferManager wb_;
263 WriteController wc_;
264 VersionSet vset_;
265 std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_opts_;
266 InstrumentedMutex mutex_;
267
268 std::vector<std::string> manifests_;
269 std::vector<FileDescriptor> table_fds_;
270 std::vector<uint64_t> logs_;
271 std::vector<TableInfo> tables_;
272 uint64_t next_file_number_;
494da23a
TL
273 // Lock over the persistent DB state. Non-nullptr iff successfully
274 // acquired.
275 FileLock* db_lock_;
20effc67 276 bool closed_;
7c673cae
FG
277
278 Status FindFiles() {
279 std::vector<std::string> filenames;
280 bool found_file = false;
11fdf7f2
TL
281 std::vector<std::string> to_search_paths;
282
7c673cae 283 for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
1e59de90 284 to_search_paths.push_back(db_options_.db_paths[path_id].path);
11fdf7f2
TL
285 }
286
287 // search wal_dir if user uses a customize wal_dir
1e59de90 288 bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_);
11fdf7f2 289 if (!same) {
1e59de90 290 to_search_paths.push_back(immutable_db_options_.wal_dir);
11fdf7f2
TL
291 }
292
293 for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
1e59de90
TL
294 ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n",
295 to_search_paths[path_id].c_str());
296 Status status = env_->GetChildren(to_search_paths[path_id], &filenames);
7c673cae
FG
297 if (!status.ok()) {
298 return status;
299 }
300 if (!filenames.empty()) {
301 found_file = true;
302 }
303
304 uint64_t number;
305 FileType type;
306 for (size_t i = 0; i < filenames.size(); i++) {
307 if (ParseFileName(filenames[i], &number, &type)) {
308 if (type == kDescriptorFile) {
7c673cae
FG
309 manifests_.push_back(filenames[i]);
310 } else {
311 if (number + 1 > next_file_number_) {
312 next_file_number_ = number + 1;
313 }
20effc67 314 if (type == kWalFile) {
7c673cae
FG
315 logs_.push_back(number);
316 } else if (type == kTableFile) {
317 table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
318 0);
319 } else {
320 // Ignore other files
321 }
322 }
323 }
324 }
325 }
326 if (!found_file) {
327 return Status::Corruption(dbname_, "repair found no files");
328 }
329 return Status::OK();
330 }
331
332 void ConvertLogFilesToTables() {
1e59de90 333 const auto& wal_dir = immutable_db_options_.GetWalDir();
7c673cae 334 for (size_t i = 0; i < logs_.size(); i++) {
1e59de90
TL
335 // we should use LogFileName(wal_dir, logs_[i]) here. user might uses
336 // wal_dir option.
337 std::string logname = LogFileName(wal_dir, logs_[i]);
338 Status status = ConvertLogToTable(wal_dir, logs_[i]);
7c673cae
FG
339 if (!status.ok()) {
340 ROCKS_LOG_WARN(db_options_.info_log,
341 "Log #%" PRIu64 ": ignoring conversion error: %s",
342 logs_[i], status.ToString().c_str());
343 }
344 ArchiveFile(logname);
345 }
346 }
347
1e59de90 348 Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) {
7c673cae
FG
349 struct LogReporter : public log::Reader::Reporter {
350 Env* env;
351 std::shared_ptr<Logger> info_log;
352 uint64_t lognum;
494da23a 353 void Corruption(size_t bytes, const Status& s) override {
7c673cae
FG
354 // We print error messages for corruption, but continue repairing.
355 ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
356 lognum, static_cast<int>(bytes), s.ToString().c_str());
357 }
358 };
359
360 // Open the log file
1e59de90
TL
361 std::string logname = LogFileName(wal_dir, log);
362 const auto& fs = env_->GetFileSystem();
363 std::unique_ptr<SequentialFileReader> lfile_reader;
364 Status status = SequentialFileReader::Create(
365 fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader,
366 nullptr /* dbg */, nullptr /* rate limiter */);
7c673cae
FG
367 if (!status.ok()) {
368 return status;
369 }
7c673cae
FG
370
371 // Create the log reader.
372 LogReporter reporter;
373 reporter.env = env_;
374 reporter.info_log = db_options_.info_log;
375 reporter.lognum = log;
376 // We intentionally make log::Reader do checksumming so that
377 // corruptions cause entire commits to be skipped instead of
378 // propagating bad information (like overly large sequence
379 // numbers).
380 log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
11fdf7f2 381 true /*enable checksum*/, log);
7c673cae
FG
382
383 // Initialize per-column family memtables
384 for (auto* cfd : *vset_.GetColumnFamilySet()) {
385 cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
386 kMaxSequenceNumber);
387 }
388 auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
389
390 // Read all the records and add to a memtable
391 std::string scratch;
392 Slice record;
393 WriteBatch batch;
394 int counter = 0;
395 while (reader.ReadRecord(&record, &scratch)) {
396 if (record.size() < WriteBatchInternal::kHeader) {
1e59de90
TL
397 reporter.Corruption(record.size(),
398 Status::Corruption("log record too small"));
7c673cae
FG
399 continue;
400 }
20effc67
TL
401 Status record_status = WriteBatchInternal::SetContents(&batch, record);
402 if (record_status.ok()) {
403 record_status =
404 WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
405 }
406 if (record_status.ok()) {
7c673cae
FG
407 counter += WriteBatchInternal::Count(&batch);
408 } else {
409 ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
20effc67 410 log, record_status.ToString().c_str());
7c673cae
FG
411 }
412 }
413
414 // Dump a table for each column family with entries in this log file.
415 for (auto* cfd : *vset_.GetColumnFamilySet()) {
416 // Do not record a version edit for this conversion to a Table
417 // since ExtractMetaData() will also generate edits.
418 MemTable* mem = cfd->mem();
419 if (mem->IsEmpty()) {
420 continue;
421 }
422
423 FileMetaData meta;
424 meta.fd = FileDescriptor(next_file_number_++, 0, 0);
425 ReadOptions ro;
426 ro.total_order_seek = true;
427 Arena arena;
428 ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
11fdf7f2 429 int64_t _current_time = 0;
1e59de90
TL
430 immutable_db_options_.clock->GetCurrentTime(&_current_time)
431 .PermitUncheckedError(); // ignore error
11fdf7f2 432 const uint64_t current_time = static_cast<uint64_t>(_current_time);
1e59de90 433 meta.file_creation_time = current_time;
11fdf7f2
TL
434 SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
435
436 auto write_hint = cfd->CalculateSSTWriteHint(0);
494da23a
TL
437 std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
438 range_del_iters;
1e59de90
TL
439 auto range_del_iter = mem->NewRangeTombstoneIterator(
440 ro, kMaxSequenceNumber, false /* immutable_memtable */);
494da23a
TL
441 if (range_del_iter != nullptr) {
442 range_del_iters.emplace_back(range_del_iter);
443 }
f67539c2 444
20effc67 445 IOStatus io_s;
1e59de90
TL
446 CompressionOptions default_compression;
447 TableBuilderOptions tboptions(
448 *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
449 cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
450 kNoCompression, default_compression, cfd->GetID(), cfd->GetName(),
451 -1 /* level */, false /* is_bottommost */,
452 TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
453 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_,
454 0 /*target_file_size*/, meta.fd.GetNumber());
455
456 SeqnoToTimeMapping empty_seqno_time_mapping;
7c673cae 457 status = BuildTable(
1e59de90
TL
458 dbname_, /* versions */ nullptr, immutable_db_options_, tboptions,
459 file_options_, table_cache_.get(), iter.get(),
460 std::move(range_del_iters), &meta, nullptr /* blob_file_additions */,
461 {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker,
462 false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s,
463 nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery,
464 empty_seqno_time_mapping, nullptr /* event_logger */, 0 /* job_id */,
465 Env::IO_HIGH, nullptr /* table_properties */, write_hint);
7c673cae
FG
466 ROCKS_LOG_INFO(db_options_.info_log,
467 "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
468 log, counter, meta.fd.GetNumber(),
469 status.ToString().c_str());
470 if (status.ok()) {
471 if (meta.fd.GetFileSize() > 0) {
472 table_fds_.push_back(meta.fd);
473 }
474 } else {
475 break;
476 }
477 }
478 delete cf_mems;
479 return status;
480 }
481
482 void ExtractMetaData() {
483 for (size_t i = 0; i < table_fds_.size(); i++) {
484 TableInfo t;
485 t.meta.fd = table_fds_[i];
486 Status status = ScanTable(&t);
487 if (!status.ok()) {
488 std::string fname = TableFileName(
489 db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
490 char file_num_buf[kFormatFileNumberBufSize];
491 FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
492 file_num_buf, sizeof(file_num_buf));
493 ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
494 file_num_buf, status.ToString().c_str());
495 ArchiveFile(fname);
496 } else {
497 tables_.push_back(t);
498 }
499 }
500 }
501
502 Status ScanTable(TableInfo* t) {
503 std::string fname = TableFileName(
504 db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
505 int counter = 0;
506 uint64_t file_size;
507 Status status = env_->GetFileSize(fname, &file_size);
508 t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
509 file_size);
510 std::shared_ptr<const TableProperties> props;
511 if (status.ok()) {
1e59de90 512 status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta,
7c673cae
FG
513 &props);
514 }
515 if (status.ok()) {
1e59de90
TL
516 auto s =
517 GetSstInternalUniqueId(props->db_id, props->db_session_id,
518 props->orig_file_number, &t->meta.unique_id);
519 if (!s.ok()) {
520 ROCKS_LOG_WARN(db_options_.info_log,
521 "Table #%" PRIu64
522 ": unable to get unique id, default to Unknown.",
523 t->meta.fd.GetNumber());
524 }
7c673cae
FG
525 t->column_family_id = static_cast<uint32_t>(props->column_family_id);
526 if (t->column_family_id ==
527 TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
528 ROCKS_LOG_WARN(
529 db_options_.info_log,
530 "Table #%" PRIu64
531 ": column family unknown (probably due to legacy format); "
532 "adding to default column family id 0.",
533 t->meta.fd.GetNumber());
534 t->column_family_id = 0;
535 }
536
537 if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
538 nullptr) {
539 status =
540 AddColumnFamily(props->column_family_name, t->column_family_id);
541 }
f67539c2 542 t->meta.oldest_ancester_time = props->creation_time;
7c673cae
FG
543 }
544 ColumnFamilyData* cfd = nullptr;
545 if (status.ok()) {
546 cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
547 if (cfd->GetName() != props->column_family_name) {
548 ROCKS_LOG_ERROR(
549 db_options_.info_log,
550 "Table #%" PRIu64
551 ": inconsistent column family name '%s'; expected '%s' for column "
552 "family id %" PRIu32 ".",
553 t->meta.fd.GetNumber(), props->column_family_name.c_str(),
554 cfd->GetName().c_str(), t->column_family_id);
555 status = Status::Corruption(dbname_, "inconsistent column family name");
556 }
557 }
558 if (status.ok()) {
f67539c2
TL
559 ReadOptions ropts;
560 ropts.total_order_seek = true;
7c673cae 561 InternalIterator* iter = table_cache_->NewIterator(
1e59de90 562 ropts, file_options_, cfd->internal_comparator(), t->meta,
11fdf7f2 563 nullptr /* range_del_agg */,
1e59de90 564 cfd->GetLatestMutableCFOptions()->prefix_extractor,
f67539c2
TL
565 /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
566 TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
20effc67
TL
567 /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
568 /*smallest_compaction_key=*/nullptr,
569 /*largest_compaction_key=*/nullptr,
570 /*allow_unprepared_value=*/false);
7c673cae 571 ParsedInternalKey parsed;
7c673cae
FG
572 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
573 Slice key = iter->key();
20effc67
TL
574 Status pik_status =
575 ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors);
576 if (!pik_status.ok()) {
7c673cae 577 ROCKS_LOG_ERROR(db_options_.info_log,
20effc67
TL
578 "Table #%" PRIu64 ": unparsable key - %s",
579 t->meta.fd.GetNumber(), pik_status.getState());
7c673cae
FG
580 continue;
581 }
582
583 counter++;
f67539c2 584
1e59de90
TL
585 status = t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
586 parsed.type);
587 if (!status.ok()) {
588 break;
589 }
7c673cae 590 }
1e59de90 591 if (status.ok() && !iter->status().ok()) {
7c673cae
FG
592 status = iter->status();
593 }
594 delete iter;
595
596 ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
597 t->meta.fd.GetNumber(), counter,
598 status.ToString().c_str());
599 }
1e59de90
TL
600 if (status.ok()) {
601 // XXX/FIXME: This is just basic, naive handling of range tombstones,
602 // like call to UpdateBoundariesForRange in builder.cc where we assume
603 // an SST file is a full sorted run. This probably needs the extra logic
604 // from compaction_job.cc around call to UpdateBoundariesForRange (to
605 // handle range tombstones extendingg beyond range of other entries).
606 ReadOptions ropts;
607 std::unique_ptr<FragmentedRangeTombstoneIterator> r_iter;
608 status = table_cache_->GetRangeTombstoneIterator(
609 ropts, cfd->internal_comparator(), t->meta, &r_iter);
610
611 if (r_iter) {
612 r_iter->SeekToFirst();
613
614 while (r_iter->Valid()) {
615 auto tombstone = r_iter->Tombstone();
616 auto kv = tombstone.Serialize();
617 t->meta.UpdateBoundariesForRange(
618 kv.first, tombstone.SerializeEndKey(), tombstone.seq_,
619 cfd->internal_comparator());
620 r_iter->Next();
621 }
622 }
623 }
7c673cae
FG
624 return status;
625 }
626
627 Status AddTables() {
628 std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
629 SequenceNumber max_sequence = 0;
630 for (size_t i = 0; i < tables_.size(); i++) {
631 cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
f67539c2
TL
632 if (max_sequence < tables_[i].meta.fd.largest_seqno) {
633 max_sequence = tables_[i].meta.fd.largest_seqno;
7c673cae
FG
634 }
635 }
11fdf7f2
TL
636 vset_.SetLastAllocatedSequence(max_sequence);
637 vset_.SetLastPublishedSequence(max_sequence);
7c673cae
FG
638 vset_.SetLastSequence(max_sequence);
639
640 for (const auto& cf_id_and_tables : cf_id_to_tables) {
641 auto* cfd =
642 vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
643 VersionEdit edit;
644 edit.SetComparatorName(cfd->user_comparator()->Name());
645 edit.SetLogNumber(0);
646 edit.SetNextFile(next_file_number_);
647 edit.SetColumnFamily(cfd->GetID());
648
649 // TODO(opt): separate out into multiple levels
650 for (const auto* table : cf_id_and_tables.second) {
f67539c2
TL
651 edit.AddFile(
652 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
653 table->meta.fd.GetFileSize(), table->meta.smallest,
654 table->meta.largest, table->meta.fd.smallest_seqno,
655 table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
1e59de90 656 table->meta.temperature, table->meta.oldest_blob_file_number,
f67539c2 657 table->meta.oldest_ancester_time, table->meta.file_creation_time,
1e59de90
TL
658 table->meta.file_checksum, table->meta.file_checksum_func_name,
659 table->meta.unique_id);
7c673cae 660 }
11fdf7f2
TL
661 assert(next_file_number_ > 0);
662 vset_.MarkFileNumberUsed(next_file_number_ - 1);
7c673cae 663 mutex_.Lock();
1e59de90
TL
664 std::unique_ptr<FSDirectory> db_dir;
665 Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
666 &db_dir, nullptr);
667 if (status.ok()) {
668 status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
669 &edit, &mutex_, db_dir.get(),
670 false /* new_descriptor_log */);
671 }
7c673cae
FG
672 mutex_.Unlock();
673 if (!status.ok()) {
674 return status;
675 }
676 }
677 return Status::OK();
678 }
679
680 void ArchiveFile(const std::string& fname) {
681 // Move into another directory. E.g., for
682 // dir/foo
683 // rename to
684 // dir/lost/foo
685 const char* slash = strrchr(fname.c_str(), '/');
686 std::string new_dir;
687 if (slash != nullptr) {
688 new_dir.assign(fname.data(), slash - fname.data());
689 }
690 new_dir.append("/lost");
20effc67 691 env_->CreateDir(new_dir).PermitUncheckedError(); // Ignore error
7c673cae
FG
692 std::string new_file = new_dir;
693 new_file.append("/");
694 new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
695 Status s = env_->RenameFile(fname, new_file);
696 ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
697 s.ToString().c_str());
698 }
699};
700
701Status GetDefaultCFOptions(
702 const std::vector<ColumnFamilyDescriptor>& column_families,
703 ColumnFamilyOptions* res) {
704 assert(res != nullptr);
705 auto iter = std::find_if(column_families.begin(), column_families.end(),
706 [](const ColumnFamilyDescriptor& cfd) {
707 return cfd.name == kDefaultColumnFamilyName;
708 });
709 if (iter == column_families.end()) {
710 return Status::InvalidArgument(
711 "column_families", "Must contain entry for default column family");
712 }
713 *res = iter->options;
714 return Status::OK();
715}
716} // anonymous namespace
717
718Status RepairDB(const std::string& dbname, const DBOptions& db_options,
1e59de90 719 const std::vector<ColumnFamilyDescriptor>& column_families) {
7c673cae
FG
720 ColumnFamilyOptions default_cf_opts;
721 Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
20effc67
TL
722 if (!status.ok()) {
723 return status;
724 }
725
726 Repairer repairer(dbname, db_options, column_families, default_cf_opts,
727 ColumnFamilyOptions() /* unknown_cf_opts */,
728 false /* create_unknown_cfs */);
729 status = repairer.Run();
7c673cae 730 if (status.ok()) {
20effc67 731 status = repairer.Close();
7c673cae
FG
732 }
733 return status;
734}
735
736Status RepairDB(const std::string& dbname, const DBOptions& db_options,
737 const std::vector<ColumnFamilyDescriptor>& column_families,
738 const ColumnFamilyOptions& unknown_cf_opts) {
739 ColumnFamilyOptions default_cf_opts;
740 Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
20effc67
TL
741 if (!status.ok()) {
742 return status;
743 }
744
745 Repairer repairer(dbname, db_options, column_families, default_cf_opts,
746 unknown_cf_opts, true /* create_unknown_cfs */);
747 status = repairer.Run();
7c673cae 748 if (status.ok()) {
20effc67 749 status = repairer.Close();
7c673cae
FG
750 }
751 return status;
752}
753
754Status RepairDB(const std::string& dbname, const Options& options) {
f67539c2 755 Options opts(options);
f67539c2
TL
756 DBOptions db_options(opts);
757 ColumnFamilyOptions cf_options(opts);
20effc67 758
1e59de90 759 Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */,
7c673cae
FG
760 cf_options /* unknown_cf_opts */,
761 true /* create_unknown_cfs */);
20effc67
TL
762 Status status = repairer.Run();
763 if (status.ok()) {
764 status = repairer.Close();
765 }
766 return status;
7c673cae
FG
767}
768
f67539c2 769} // namespace ROCKSDB_NAMESPACE
7c673cae
FG
770
771#endif // ROCKSDB_LITE