]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/corruption_test.cc
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / db / corruption_test.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#ifndef ROCKSDB_LITE
11
12#include "rocksdb/db.h"
13
14#include <errno.h>
15#include <fcntl.h>
16#include <inttypes.h>
17#include <sys/stat.h>
18#include <sys/types.h>
19#include "db/db_impl.h"
20#include "db/log_format.h"
21#include "db/version_set.h"
22#include "rocksdb/cache.h"
11fdf7f2 23#include "rocksdb/convenience.h"
7c673cae
FG
24#include "rocksdb/env.h"
25#include "rocksdb/table.h"
26#include "rocksdb/write_batch.h"
494da23a
TL
27#include "table/block_based_table_builder.h"
28#include "table/meta_blocks.h"
7c673cae
FG
29#include "util/filename.h"
30#include "util/string_util.h"
31#include "util/testharness.h"
32#include "util/testutil.h"
33
34namespace rocksdb {
35
// Length, in bytes, of every value produced by CorruptionTest::Value().
static constexpr int kValueSize = 1000;
37
38class CorruptionTest : public testing::Test {
39 public:
40 test::ErrorEnv env_;
41 std::string dbname_;
494da23a 42 std::shared_ptr<Cache> tiny_cache_;
7c673cae
FG
43 Options options_;
44 DB* db_;
45
46 CorruptionTest() {
47 // If LRU cache shard bit is smaller than 2 (or -1 which will automatically
48 // set it to 0), test SequenceNumberRecovery will fail, likely because of a
49 // bug in recovery code. Keep it 4 for now to make the test passes.
50 tiny_cache_ = NewLRUCache(100, 4);
51 options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
52 options_.env = &env_;
11fdf7f2 53 dbname_ = test::PerThreadDBPath("corruption_test");
7c673cae
FG
54 DestroyDB(dbname_, options_);
55
56 db_ = nullptr;
57 options_.create_if_missing = true;
58 BlockBasedTableOptions table_options;
59 table_options.block_size_deviation = 0; // make unit test pass for now
60 options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
61 Reopen();
62 options_.create_if_missing = false;
63 }
64
494da23a
TL
65 ~CorruptionTest() override {
66 delete db_;
67 DestroyDB(dbname_, Options());
7c673cae
FG
68 }
69
70 void CloseDb() {
71 delete db_;
72 db_ = nullptr;
73 }
74
75 Status TryReopen(Options* options = nullptr) {
76 delete db_;
77 db_ = nullptr;
78 Options opt = (options ? *options : options_);
79 opt.env = &env_;
80 opt.arena_block_size = 4096;
81 BlockBasedTableOptions table_options;
82 table_options.block_cache = tiny_cache_;
83 table_options.block_size_deviation = 0;
84 opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
85 return DB::Open(opt, dbname_, &db_);
86 }
87
88 void Reopen(Options* options = nullptr) {
89 ASSERT_OK(TryReopen(options));
90 }
91
92 void RepairDB() {
93 delete db_;
94 db_ = nullptr;
95 ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
96 }
97
98 void Build(int n, int flush_every = 0) {
99 std::string key_space, value_space;
100 WriteBatch batch;
101 for (int i = 0; i < n; i++) {
102 if (flush_every != 0 && i != 0 && i % flush_every == 0) {
103 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
104 dbi->TEST_FlushMemTable();
105 }
106 //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
107 Slice key = Key(i, &key_space);
108 batch.Clear();
109 batch.Put(key, Value(i, &value_space));
110 ASSERT_OK(db_->Write(WriteOptions(), &batch));
111 }
112 }
113
114 void Check(int min_expected, int max_expected) {
115 uint64_t next_expected = 0;
116 uint64_t missed = 0;
117 int bad_keys = 0;
118 int bad_values = 0;
119 int correct = 0;
120 std::string value_space;
121 // Do not verify checksums. If we verify checksums then the
122 // db itself will raise errors because data is corrupted.
123 // Instead, we want the reads to be successful and this test
124 // will detect whether the appropriate corruptions have
125 // occurred.
126 Iterator* iter = db_->NewIterator(ReadOptions(false, true));
127 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
128 uint64_t key;
129 Slice in(iter->key());
130 if (!ConsumeDecimalNumber(&in, &key) ||
131 !in.empty() ||
132 key < next_expected) {
133 bad_keys++;
134 continue;
135 }
136 missed += (key - next_expected);
137 next_expected = key + 1;
138 if (iter->value() != Value(static_cast<int>(key), &value_space)) {
139 bad_values++;
140 } else {
141 correct++;
142 }
143 }
144 delete iter;
145
146 fprintf(stderr,
147 "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
148 min_expected, max_expected, correct, bad_keys, bad_values,
149 static_cast<unsigned long long>(missed));
150 ASSERT_LE(min_expected, correct);
151 ASSERT_GE(max_expected, correct);
152 }
153
154 void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) {
155 struct stat sbuf;
156 if (stat(fname.c_str(), &sbuf) != 0) {
157 const char* msg = strerror(errno);
11fdf7f2 158 FAIL() << fname << ": " << msg;
7c673cae
FG
159 }
160
161 if (offset < 0) {
162 // Relative to end of file; make it absolute
163 if (-offset > sbuf.st_size) {
164 offset = 0;
165 } else {
166 offset = static_cast<int>(sbuf.st_size + offset);
167 }
168 }
169 if (offset > sbuf.st_size) {
170 offset = static_cast<int>(sbuf.st_size);
171 }
172 if (offset + bytes_to_corrupt > sbuf.st_size) {
173 bytes_to_corrupt = static_cast<int>(sbuf.st_size - offset);
174 }
175
176 // Do it
177 std::string contents;
178 Status s = ReadFileToString(Env::Default(), fname, &contents);
179 ASSERT_TRUE(s.ok()) << s.ToString();
180 for (int i = 0; i < bytes_to_corrupt; i++) {
181 contents[i + offset] ^= 0x80;
182 }
183 s = WriteStringToFile(Env::Default(), contents, fname);
184 ASSERT_TRUE(s.ok()) << s.ToString();
11fdf7f2
TL
185 Options options;
186 EnvOptions env_options;
187 ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname));
7c673cae
FG
188 }
189
190 void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
191 // Pick file to corrupt
192 std::vector<std::string> filenames;
193 ASSERT_OK(env_.GetChildren(dbname_, &filenames));
194 uint64_t number;
195 FileType type;
196 std::string fname;
197 int picked_number = -1;
198 for (size_t i = 0; i < filenames.size(); i++) {
199 if (ParseFileName(filenames[i], &number, &type) &&
200 type == filetype &&
201 static_cast<int>(number) > picked_number) { // Pick latest file
202 fname = dbname_ + "/" + filenames[i];
203 picked_number = static_cast<int>(number);
204 }
205 }
206 ASSERT_TRUE(!fname.empty()) << filetype;
207
208 CorruptFile(fname, offset, bytes_to_corrupt);
209 }
210
211 // corrupts exactly one file at level `level`. if no file found at level,
212 // asserts
213 void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
214 std::vector<LiveFileMetaData> metadata;
215 db_->GetLiveFilesMetaData(&metadata);
216 for (const auto& m : metadata) {
217 if (m.level == level) {
218 CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
219 return;
220 }
221 }
11fdf7f2 222 FAIL() << "no file found at level";
7c673cae
FG
223 }
224
225
226 int Property(const std::string& name) {
227 std::string property;
228 int result;
229 if (db_->GetProperty(name, &property) &&
230 sscanf(property.c_str(), "%d", &result) == 1) {
231 return result;
232 } else {
233 return -1;
234 }
235 }
236
237 // Return the ith key
238 Slice Key(int i, std::string* storage) {
239 char buf[100];
240 snprintf(buf, sizeof(buf), "%016d", i);
241 storage->assign(buf, strlen(buf));
242 return Slice(*storage);
243 }
244
245 // Return the value to associate with the specified key
246 Slice Value(int k, std::string* storage) {
247 if (k == 0) {
248 // Ugh. Random seed of 0 used to produce no entropy. This code
249 // preserves the implementation that was in place when all of the
250 // magic values in this file were picked.
251 *storage = std::string(kValueSize, ' ');
252 return Slice(*storage);
253 } else {
254 Random r(k);
255 return test::RandomString(&r, kValueSize, storage);
256 }
257 }
258};
259
260TEST_F(CorruptionTest, Recovery) {
261 Build(100);
262 Check(100, 100);
263#ifdef OS_WIN
264 // On Wndows OS Disk cache does not behave properly
265 // We do not call FlushBuffers on every Flush. If we do not close
266 // the log file prior to the corruption we end up with the first
267 // block not corrupted but only the second. However, under the debugger
268 // things work just fine but never pass when running normally
269 // For that reason people may want to run with unbuffered I/O. That option
270 // is not available for WAL though.
271 CloseDb();
272#endif
273 Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
274 Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
275 ASSERT_TRUE(!TryReopen().ok());
276 options_.paranoid_checks = false;
277 Reopen(&options_);
278
279 // The 64 records in the first two log blocks are completely lost.
280 Check(36, 36);
281}
282
283TEST_F(CorruptionTest, RecoverWriteError) {
284 env_.writable_file_error_ = true;
285 Status s = TryReopen();
286 ASSERT_TRUE(!s.ok());
287}
288
289TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
290 // Do enough writing to force minor compaction
291 env_.writable_file_error_ = true;
292 const int num =
293 static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
294 std::string value_storage;
295 Status s;
296 bool failed = false;
297 for (int i = 0; i < num; i++) {
298 WriteBatch batch;
299 batch.Put("a", Value(100, &value_storage));
300 s = db_->Write(WriteOptions(), &batch);
301 if (!s.ok()) {
302 failed = true;
303 }
304 ASSERT_TRUE(!failed || !s.ok());
305 }
306 ASSERT_TRUE(!s.ok());
307 ASSERT_GE(env_.num_writable_file_errors_, 1);
308 env_.writable_file_error_ = false;
309 Reopen();
310}
311
312TEST_F(CorruptionTest, TableFile) {
313 Build(100);
314 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
315 dbi->TEST_FlushMemTable();
316 dbi->TEST_CompactRange(0, nullptr, nullptr);
317 dbi->TEST_CompactRange(1, nullptr, nullptr);
318
319 Corrupt(kTableFile, 100, 1);
320 Check(99, 99);
11fdf7f2 321 ASSERT_NOK(dbi->VerifyChecksum());
7c673cae
FG
322}
323
324TEST_F(CorruptionTest, TableFileIndexData) {
325 Options options;
326 // very big, we'll trigger flushes manually
327 options.write_buffer_size = 100 * 1024 * 1024;
328 Reopen(&options);
329 // build 2 tables, flush at 5000
330 Build(10000, 5000);
331 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
332 dbi->TEST_FlushMemTable();
333
334 // corrupt an index block of an entire file
335 Corrupt(kTableFile, -2000, 500);
336 Reopen();
11fdf7f2
TL
337 dbi = reinterpret_cast<DBImpl*>(db_);
338 // one full file may be readable, since only one was corrupted
7c673cae 339 // the other file should be fully non-readable, since index was corrupted
11fdf7f2
TL
340 Check(0, 5000);
341 ASSERT_NOK(dbi->VerifyChecksum());
7c673cae
FG
342}
343
344TEST_F(CorruptionTest, MissingDescriptor) {
345 Build(1000);
346 RepairDB();
347 Reopen();
348 Check(1000, 1000);
349}
350
351TEST_F(CorruptionTest, SequenceNumberRecovery) {
352 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
353 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
354 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
355 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
356 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
357 RepairDB();
358 Reopen();
359 std::string v;
360 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
361 ASSERT_EQ("v5", v);
362 // Write something. If sequence number was not recovered properly,
363 // it will be hidden by an earlier write.
364 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
365 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
366 ASSERT_EQ("v6", v);
367 Reopen();
368 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
369 ASSERT_EQ("v6", v);
370}
371
372TEST_F(CorruptionTest, CorruptedDescriptor) {
373 ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
374 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
375 dbi->TEST_FlushMemTable();
376 dbi->TEST_CompactRange(0, nullptr, nullptr);
377
378 Corrupt(kDescriptorFile, 0, 1000);
379 Status s = TryReopen();
380 ASSERT_TRUE(!s.ok());
381
382 RepairDB();
383 Reopen();
384 std::string v;
385 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
386 ASSERT_EQ("hello", v);
387}
388
389TEST_F(CorruptionTest, CompactionInputError) {
390 Options options;
391 Reopen(&options);
392 Build(10);
393 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
394 dbi->TEST_FlushMemTable();
395 dbi->TEST_CompactRange(0, nullptr, nullptr);
396 dbi->TEST_CompactRange(1, nullptr, nullptr);
397 ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
398
399 Corrupt(kTableFile, 100, 1);
400 Check(9, 9);
11fdf7f2 401 ASSERT_NOK(dbi->VerifyChecksum());
7c673cae
FG
402
403 // Force compactions by writing lots of values
404 Build(10000);
405 Check(10000, 10000);
11fdf7f2 406 ASSERT_NOK(dbi->VerifyChecksum());
7c673cae
FG
407}
408
409TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
410 Options options;
411 options.paranoid_checks = true;
412 options.write_buffer_size = 131072;
413 options.max_write_buffer_number = 2;
414 Reopen(&options);
415 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
416
417 // Fill levels >= 1
418 for (int level = 1; level < dbi->NumberLevels(); level++) {
419 dbi->Put(WriteOptions(), "", "begin");
420 dbi->Put(WriteOptions(), "~", "end");
421 dbi->TEST_FlushMemTable();
422 for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
423 ++comp_level) {
424 dbi->TEST_CompactRange(comp_level, nullptr, nullptr);
425 }
426 }
427
428 Reopen(&options);
429
430 dbi = reinterpret_cast<DBImpl*>(db_);
431 Build(10);
432 dbi->TEST_FlushMemTable();
433 dbi->TEST_WaitForCompact();
434 ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
435
436 CorruptTableFileAtLevel(0, 100, 1);
437 Check(9, 9);
11fdf7f2 438 ASSERT_NOK(dbi->VerifyChecksum());
7c673cae
FG
439
440 // Write must eventually fail because of corrupted table
441 Status s;
442 std::string tmp1, tmp2;
443 bool failed = false;
444 for (int i = 0; i < 10000; i++) {
445 s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
446 if (!s.ok()) {
447 failed = true;
448 }
449 // if one write failed, every subsequent write must fail, too
450 ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
451 }
452 ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
453}
454
455TEST_F(CorruptionTest, UnrelatedKeys) {
456 Build(10);
457 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
458 dbi->TEST_FlushMemTable();
459 Corrupt(kTableFile, 100, 1);
11fdf7f2 460 ASSERT_NOK(dbi->VerifyChecksum());
7c673cae
FG
461
462 std::string tmp1, tmp2;
463 ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
464 std::string v;
465 ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
466 ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
467 dbi->TEST_FlushMemTable();
468 ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
469 ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
470}
471
494da23a
TL
472TEST_F(CorruptionTest, RangeDeletionCorrupted) {
473 ASSERT_OK(
474 db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
475 ASSERT_OK(db_->Flush(FlushOptions()));
476 std::vector<LiveFileMetaData> metadata;
477 db_->GetLiveFilesMetaData(&metadata);
478 ASSERT_EQ(static_cast<size_t>(1), metadata.size());
479 std::string filename = dbname_ + metadata[0].name;
480
481 std::unique_ptr<RandomAccessFile> file;
482 ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions()));
483 std::unique_ptr<RandomAccessFileReader> file_reader(
484 new RandomAccessFileReader(std::move(file), filename));
485
486 uint64_t file_size;
487 ASSERT_OK(options_.env->GetFileSize(filename, &file_size));
488
489 BlockHandle range_del_handle;
490 ASSERT_OK(FindMetaBlock(
491 file_reader.get(), file_size, kBlockBasedTableMagicNumber,
492 ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle));
493
494 ASSERT_OK(TryReopen());
495 CorruptFile(filename, static_cast<int>(range_del_handle.offset()), 1);
496 // The test case does not fail on TryReopen because failure to preload table
497 // handlers is not considered critical.
498 ASSERT_OK(TryReopen());
499 std::string val;
500 // However, it does fail on any read involving that file since that file
501 // cannot be opened with a corrupt range deletion meta-block.
502 ASSERT_TRUE(db_->Get(ReadOptions(), "a", &val).IsCorruption());
503}
504
7c673cae
FG
505TEST_F(CorruptionTest, FileSystemStateCorrupted) {
506 for (int iter = 0; iter < 2; ++iter) {
507 Options options;
508 options.paranoid_checks = true;
509 options.create_if_missing = true;
510 Reopen(&options);
511 Build(10);
512 ASSERT_OK(db_->Flush(FlushOptions()));
513 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
514 std::vector<LiveFileMetaData> metadata;
515 dbi->GetLiveFilesMetaData(&metadata);
516 ASSERT_GT(metadata.size(), size_t(0));
517 std::string filename = dbname_ + metadata[0].name;
518
519 delete db_;
520 db_ = nullptr;
521
522 if (iter == 0) { // corrupt file size
494da23a 523 std::unique_ptr<WritableFile> file;
7c673cae
FG
524 env_.NewWritableFile(filename, &file, EnvOptions());
525 file->Append(Slice("corrupted sst"));
526 file.reset();
527 } else { // delete the file
528 env_.DeleteFile(filename);
529 }
530
531 Status x = TryReopen(&options);
532 ASSERT_TRUE(x.IsCorruption());
533 DestroyDB(dbname_, options_);
534 Reopen(&options);
535 }
536}
537
538} // namespace rocksdb
539
540int main(int argc, char** argv) {
541 ::testing::InitGoogleTest(&argc, argv);
542 return RUN_ALL_TESTS();
543}
544
545#else
546#include <stdio.h>
547
// ROCKSDB_LITE build: report that the tests are skipped and exit successfully.
int main(int /*argc*/, char** /*argv*/) {
  fputs("SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n", stderr);
  return 0;
}
552
553#endif // !ROCKSDB_LITE