1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
12 #include "rocksdb/db.h"
18 #include <sys/types.h>
19 #include "db/db_impl.h"
20 #include "db/log_format.h"
21 #include "db/version_set.h"
22 #include "rocksdb/cache.h"
23 #include "rocksdb/convenience.h"
24 #include "rocksdb/env.h"
25 #include "rocksdb/table.h"
26 #include "rocksdb/write_batch.h"
27 #include "util/filename.h"
28 #include "util/string_util.h"
29 #include "util/testharness.h"
30 #include "util/testutil.h"
// Length, in bytes, of every value string produced by Value().
34 static const int kValueSize
= 1000;
36 class CorruptionTest
: public testing::Test
{
40 shared_ptr
<Cache
> tiny_cache_
;
45 // If LRU cache shard bit is smaller than 2 (or -1 which will automatically
46 // set it to 0), test SequenceNumberRecovery will fail, likely because of a
47 // bug in recovery code. Keep it at 4 for now so that the test passes.
48 tiny_cache_
= NewLRUCache(100, 4);
49 options_
.wal_recovery_mode
= WALRecoveryMode::kTolerateCorruptedTailRecords
;
51 dbname_
= test::PerThreadDBPath("corruption_test");
52 DestroyDB(dbname_
, options_
);
55 options_
.create_if_missing
= true;
56 BlockBasedTableOptions table_options
;
57 table_options
.block_size_deviation
= 0; // make unit test pass for now
58 options_
.table_factory
.reset(NewBlockBasedTableFactory(table_options
));
60 options_
.create_if_missing
= false;
65 DestroyDB(dbname_
, Options());
// Opens the database at dbname_ into db_ and returns the Status of
// DB::Open() without asserting on it.
//
// `options`: options to open with; when null, a copy of options_ is used.
// The copy is always adjusted before opening: arena_block_size is set to
// 4096 and the table factory is replaced by a block-based factory that uses
// tiny_cache_ as its block cache and a block_size_deviation of 0.
73 Status
TryReopen(Options
* options
= nullptr) {
76 Options opt
= (options
? *options
: options_
);
78 opt
.arena_block_size
= 4096;
79 BlockBasedTableOptions table_options
;
80 table_options
.block_cache
= tiny_cache_
;
81 table_options
.block_size_deviation
= 0;
82 opt
.table_factory
.reset(NewBlockBasedTableFactory(table_options
));
83 return DB::Open(opt
, dbname_
, &db_
);
// Same as TryReopen(options), but asserts that the open succeeded.
86 void Reopen(Options
* options
= nullptr) {
87 ASSERT_OK(TryReopen(options
));
93 ASSERT_OK(::rocksdb::RepairDB(dbname_
, options_
));
// Populates the database with keys 0 .. n-1, storing Value(i) under Key(i)
// and asserting that each Write() succeeds.
//
// `flush_every`: when non-zero, the memtable is flushed (via
// DBImpl::TEST_FlushMemTable) before writing every key whose index is a
// non-zero multiple of flush_every. The default of 0 disables these flushes.
96 void Build(int n
, int flush_every
= 0) {
97 std::string key_space
, value_space
;
99 for (int i
= 0; i
< n
; i
++) {
100 if (flush_every
!= 0 && i
!= 0 && i
% flush_every
== 0) {
101 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
102 dbi
->TEST_FlushMemTable();
104 //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
105 Slice key
= Key(i
, &key_space
);
107 batch
.Put(key
, Value(i
, &value_space
));
108 ASSERT_OK(db_
->Write(WriteOptions(), &batch
));
// Walks the whole database with an iterator and asserts that the number of
// correct entries lies in [min_expected, max_expected].
//
// Each key is parsed as a decimal number. Gaps between next_expected and the
// parsed key are accumulated in `missed`, and each value is compared against
// Value(key). A summary (expected range, correct, bad_keys, bad_values,
// missed) is formatted before the final range assertions.
112 void Check(int min_expected
, int max_expected
) {
113 uint64_t next_expected
= 0;
118 std::string value_space
;
119 // Do not verify checksums. If we verify checksums then the
120 // db itself will raise errors because data is corrupted.
121 // Instead, we want the reads to be successful and this test
122 // will detect whether the appropriate corruptions have
// NOTE(review): ReadOptions(false, true) is presumably
// (verify_checksums=false, fill_cache=true), matching the comment above —
// confirm against the ReadOptions constructor.
124 Iterator
* iter
= db_
->NewIterator(ReadOptions(false, true));
125 for (iter
->SeekToFirst(); iter
->Valid(); iter
->Next()) {
127 Slice
in(iter
->key());
128 if (!ConsumeDecimalNumber(&in
, &key
) ||
130 key
< next_expected
) {
// Every key skipped between the expected one and the one found is missing.
134 missed
+= (key
- next_expected
);
135 next_expected
= key
+ 1;
136 if (iter
->value() != Value(static_cast<int>(key
), &value_space
)) {
145 "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
146 min_expected
, max_expected
, correct
, bad_keys
, bad_values
,
147 static_cast<unsigned long long>(missed
));
148 ASSERT_LE(min_expected
, correct
);
149 ASSERT_GE(max_expected
, correct
);
// Corrupts `bytes_to_corrupt` bytes of the file `fname`, starting at
// `offset`, by XOR-ing each byte with 0x80 and writing the file back.
//
// The file is stat()-ed first; a stat failure fails the test with the
// strerror() text. `offset` and `bytes_to_corrupt` are clamped so that the
// corrupted range stays within the file size. The "Relative to end of file"
// branch turns an offset into an absolute one by adding it to the file size.
// After the rewrite, VerifySstFileChecksum() is asserted to fail on the file.
152 void CorruptFile(const std::string
& fname
, int offset
, int bytes_to_corrupt
) {
154 if (stat(fname
.c_str(), &sbuf
) != 0) {
155 const char* msg
= strerror(errno
);
156 FAIL() << fname
<< ": " << msg
;
160 // Relative to end of file; make it absolute
161 if (-offset
> sbuf
.st_size
) {
164 offset
= static_cast<int>(sbuf
.st_size
+ offset
);
// Clamp the start position to the end of the file.
167 if (offset
> sbuf
.st_size
) {
168 offset
= static_cast<int>(sbuf
.st_size
);
// Clamp the length so the corrupted range does not run past the end.
170 if (offset
+ bytes_to_corrupt
> sbuf
.st_size
) {
171 bytes_to_corrupt
= static_cast<int>(sbuf
.st_size
- offset
);
175 std::string contents
;
176 Status s
= ReadFileToString(Env::Default(), fname
, &contents
);
177 ASSERT_TRUE(s
.ok()) << s
.ToString();
// Flip the top bit of every byte in the selected range.
178 for (int i
= 0; i
< bytes_to_corrupt
; i
++) {
179 contents
[i
+ offset
] ^= 0x80;
181 s
= WriteStringToFile(Env::Default(), contents
, fname
);
182 ASSERT_TRUE(s
.ok()) << s
.ToString();
184 EnvOptions env_options
;
185 ASSERT_NOK(VerifySstFileChecksum(options
, env_options
, fname
));
// Selects one file among the children of dbname_ and corrupts it with
// CorruptFile(fname, offset, bytes_to_corrupt).
//
// Candidates are names that ParseFileName() accepts; among them the one with
// the highest file number wins ("latest file"). The test fails if no file
// was selected.
// NOTE(review): `filetype` is expected to restrict the candidates to that
// file type — confirm against the full selection condition.
188 void Corrupt(FileType filetype
, int offset
, int bytes_to_corrupt
) {
189 // Pick file to corrupt
190 std::vector
<std::string
> filenames
;
191 ASSERT_OK(env_
.GetChildren(dbname_
, &filenames
));
195 int picked_number
= -1;
196 for (size_t i
= 0; i
< filenames
.size(); i
++) {
197 if (ParseFileName(filenames
[i
], &number
, &type
) &&
199 static_cast<int>(number
) > picked_number
) { // Pick latest file
200 fname
= dbname_
+ "/" + filenames
[i
];
201 picked_number
= static_cast<int>(number
);
204 ASSERT_TRUE(!fname
.empty()) << filetype
;
206 CorruptFile(fname
, offset
, bytes_to_corrupt
);
// Looks through the live-file metadata reported by the DB for a file whose
// level equals `level` and corrupts it with CorruptFile(), using the path
// dbname_ + "/" + m.name. Fails the test with "no file found at level" when
// execution reaches the end of the function.
209 // corrupts exactly one file at level `level`. if no file found at level,
211 void CorruptTableFileAtLevel(int level
, int offset
, int bytes_to_corrupt
) {
212 std::vector
<LiveFileMetaData
> metadata
;
213 db_
->GetLiveFilesMetaData(&metadata
);
214 for (const auto& m
: metadata
) {
215 if (m
.level
== level
) {
216 CorruptFile(dbname_
+ "/" + m
.name
, offset
, bytes_to_corrupt
);
220 FAIL() << "no file found at level";
// Reads the DB property `name` and parses its text as a decimal integer
// ("%d"). The success path requires both GetProperty() to succeed and
// sscanf() to convert exactly one field.
224 int Property(const std::string
& name
) {
225 std::string property
;
227 if (db_
->GetProperty(name
, &property
) &&
228 sscanf(property
.c_str(), "%d", &result
) == 1) {
235 // Return the ith key
// The key text is `i` formatted with "%016d" (zero-padded to 16 digits). It
// is stored in *storage and the returned Slice is constructed from
// *storage, so the caller must keep `storage` alive while using the Slice.
236 Slice
Key(int i
, std::string
* storage
) {
238 snprintf(buf
, sizeof(buf
), "%016d", i
);
239 storage
->assign(buf
, strlen(buf
));
240 return Slice(*storage
);
243 // Return the value to associate with the specified key
// Two return paths are visible: one fills *storage with kValueSize spaces
// and returns a Slice of it, the other returns test::RandomString() of
// kValueSize bytes written into `storage`.
244 Slice
Value(int k
, std::string
* storage
) {
246 // Ugh. Random seed of 0 used to produce no entropy. This code
247 // preserves the implementation that was in place when all of the
248 // magic values in this file were picked.
249 *storage
= std::string(kValueSize
, ' ');
250 return Slice(*storage
);
253 return test::RandomString(&r
, kValueSize
, storage
);
// Corrupts the WAL in two places (one byte at offset 19 and one byte inside
// the second log block), asserts that reopening fails, and then turns
// paranoid_checks off in options_.
258 TEST_F(CorruptionTest
, Recovery
) {
262 // On Windows, the OS disk cache does not behave properly.
263 // We do not call FlushBuffers on every Flush. If we do not close
264 // the log file prior to the corruption we end up with the first
265 // block not corrupted but only the second. However, under the debugger
266 // things work just fine but never pass when running normally.
267 // For that reason people may want to run with unbuffered I/O. That option
268 // is not available for WAL though.
271 Corrupt(kLogFile
, 19, 1); // WriteBatch tag for first record
272 Corrupt(kLogFile
, log::kBlockSize
+ 1000, 1); // Somewhere in second block
273 ASSERT_TRUE(!TryReopen().ok());
274 options_
.paranoid_checks
= false;
277 // The 64 records in the first two log blocks are completely lost.
// Sets env_.writable_file_error_ and asserts that reopening the DB then
// fails.
281 TEST_F(CorruptionTest
, RecoverWriteError
) {
282 env_
.writable_file_error_
= true;
283 Status s
= TryReopen();
284 ASSERT_TRUE(!s
.ok());
// With env_.writable_file_error_ set, repeatedly writes key "a" with
// Value(100). The iteration count is derived from the default
// write_buffer_size divided by kValueSize, plus 3. The test expects that
// once a write has failed the current one is not OK, that the last write
// failed, and that the env recorded at least one writable-file error. The
// error flag is cleared at the end.
287 TEST_F(CorruptionTest
, NewFileErrorDuringWrite
) {
288 // Do enough writing to force minor compaction
289 env_
.writable_file_error_
= true;
291 static_cast<int>(3 + (Options().write_buffer_size
/ kValueSize
));
292 std::string value_storage
;
295 for (int i
= 0; i
< num
; i
++) {
297 batch
.Put("a", Value(100, &value_storage
));
298 s
= db_
->Write(WriteOptions(), &batch
);
// Once a failure has been recorded, the current write must not be OK.
302 ASSERT_TRUE(!failed
|| !s
.ok());
304 ASSERT_TRUE(!s
.ok());
305 ASSERT_GE(env_
.num_writable_file_errors_
, 1);
306 env_
.writable_file_error_
= false;
// Flushes the memtable, runs TEST_CompactRange on levels 0 and 1, flips one
// byte at offset 100 of a table file, and asserts that VerifyChecksum()
// reports an error.
310 TEST_F(CorruptionTest
, TableFile
) {
312 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
313 dbi
->TEST_FlushMemTable();
314 dbi
->TEST_CompactRange(0, nullptr, nullptr);
315 dbi
->TEST_CompactRange(1, nullptr, nullptr);
317 Corrupt(kTableFile
, 100, 1);
319 ASSERT_NOK(dbi
->VerifyChecksum());
// Uses a 100 MiB write buffer so that flushes only happen when requested,
// then corrupts 500 bytes starting 2000 bytes before the end of a table file
// (offset -2000 is relative to the end of the file) and asserts that
// VerifyChecksum() reports an error.
322 TEST_F(CorruptionTest
, TableFileIndexData
) {
324 // very big, we'll trigger flushes manually
325 options
.write_buffer_size
= 100 * 1024 * 1024;
327 // build 2 tables, flush at 5000
329 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
330 dbi
->TEST_FlushMemTable();
332 // corrupt an index block of an entire file
333 Corrupt(kTableFile
, -2000, 500);
// Refresh the cast from the current db_ pointer before using it again.
335 dbi
= reinterpret_cast<DBImpl
*>(db_
);
336 // one full file may be readable, since only one was corrupted
337 // the other file should be fully non-readable, since index was corrupted
339 ASSERT_NOK(dbi
->VerifyChecksum());
342 TEST_F(CorruptionTest
, MissingDescriptor
) {
// Writes five successive versions of key "foo" ("v1".."v5") and reads the
// key back, then writes "v6" and reads it back twice more. Every Put and Get
// is asserted to succeed. Per the inline comment, the "v6" write is the
// probe: with a wrongly recovered sequence number it would be hidden by an
// earlier write.
349 TEST_F(CorruptionTest
, SequenceNumberRecovery
) {
350 ASSERT_OK(db_
->Put(WriteOptions(), "foo", "v1"));
351 ASSERT_OK(db_
->Put(WriteOptions(), "foo", "v2"));
352 ASSERT_OK(db_
->Put(WriteOptions(), "foo", "v3"));
353 ASSERT_OK(db_
->Put(WriteOptions(), "foo", "v4"));
354 ASSERT_OK(db_
->Put(WriteOptions(), "foo", "v5"));
358 ASSERT_OK(db_
->Get(ReadOptions(), "foo", &v
));
360 // Write something. If sequence number was not recovered properly,
361 // it will be hidden by an earlier write.
362 ASSERT_OK(db_
->Put(WriteOptions(), "foo", "v6"));
363 ASSERT_OK(db_
->Get(ReadOptions(), "foo", &v
));
366 ASSERT_OK(db_
->Get(ReadOptions(), "foo", &v
));
// Stores "foo" -> "hello", flushes and compacts level 0, then corrupts the
// first 1000 bytes of the descriptor file and asserts that reopening fails.
// The test ends by asserting that "foo" can be read and still maps to
// "hello".
370 TEST_F(CorruptionTest
, CorruptedDescriptor
) {
371 ASSERT_OK(db_
->Put(WriteOptions(), "foo", "hello"));
372 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
373 dbi
->TEST_FlushMemTable();
374 dbi
->TEST_CompactRange(0, nullptr, nullptr);
376 Corrupt(kDescriptorFile
, 0, 1000);
377 Status s
= TryReopen();
378 ASSERT_TRUE(!s
.ok());
383 ASSERT_OK(db_
->Get(ReadOptions(), "foo", &v
));
384 ASSERT_EQ("hello", v
);
// Flushes and compacts levels 0 and 1, asserts that exactly one file is
// reported at level 2, then flips one byte at offset 100 of a table file.
// VerifyChecksum() is asserted to fail right after the corruption and again
// after the "force compactions" step.
387 TEST_F(CorruptionTest
, CompactionInputError
) {
391 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
392 dbi
->TEST_FlushMemTable();
393 dbi
->TEST_CompactRange(0, nullptr, nullptr);
394 dbi
->TEST_CompactRange(1, nullptr, nullptr);
395 ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
397 Corrupt(kTableFile
, 100, 1);
399 ASSERT_NOK(dbi
->VerifyChecksum());
401 // Force compactions by writing lots of values
404 ASSERT_NOK(dbi
->VerifyChecksum());
// Runs with paranoid_checks enabled, a 131072-byte write buffer and at most
// two write buffers. For each level from 1 upward it writes the boundary
// keys "" and "~", flushes, and compacts levels 0 .. NumberLevels()-level-1.
// It then asserts a single level-0 file, corrupts it, asserts that
// VerifyChecksum() fails, and issues up to 10000 Puts, expecting that writes
// eventually fail and keep failing.
407 TEST_F(CorruptionTest
, CompactionInputErrorParanoid
) {
409 options
.paranoid_checks
= true;
410 options
.write_buffer_size
= 131072;
411 options
.max_write_buffer_number
= 2;
413 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
416 for (int level
= 1; level
< dbi
->NumberLevels(); level
++) {
// NOTE(review): the Status returned by these Put() calls is ignored.
417 dbi
->Put(WriteOptions(), "", "begin");
418 dbi
->Put(WriteOptions(), "~", "end");
419 dbi
->TEST_FlushMemTable();
420 for (int comp_level
= 0; comp_level
< dbi
->NumberLevels() - level
;
422 dbi
->TEST_CompactRange(comp_level
, nullptr, nullptr);
// Refresh the cast from the current db_ pointer before using it again.
428 dbi
= reinterpret_cast<DBImpl
*>(db_
);
430 dbi
->TEST_FlushMemTable();
431 dbi
->TEST_WaitForCompact();
432 ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
434 CorruptTableFileAtLevel(0, 100, 1);
436 ASSERT_NOK(dbi
->VerifyChecksum());
438 // Write must eventually fail because of corrupted table
440 std::string tmp1
, tmp2
;
442 for (int i
= 0; i
< 10000; i
++) {
443 s
= db_
->Put(WriteOptions(), Key(i
, &tmp1
), Value(i
, &tmp2
));
447 // if one write failed, every subsequent write must fail, too
448 ASSERT_TRUE(!failed
|| !s
.ok()) << "write did not fail in a corrupted db";
450 ASSERT_TRUE(!s
.ok()) << "write did not fail in corrupted paranoid db";
// After flushing and corrupting one byte of a table file (VerifyChecksum()
// must fail), writes key 1000 and asserts that it reads back with the
// expected value, both before and after another memtable flush.
453 TEST_F(CorruptionTest
, UnrelatedKeys
) {
455 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
456 dbi
->TEST_FlushMemTable();
457 Corrupt(kTableFile
, 100, 1);
458 ASSERT_NOK(dbi
->VerifyChecksum());
460 std::string tmp1
, tmp2
;
461 ASSERT_OK(db_
->Put(WriteOptions(), Key(1000, &tmp1
), Value(1000, &tmp2
)));
463 ASSERT_OK(db_
->Get(ReadOptions(), Key(1000, &tmp1
), &v
));
464 ASSERT_EQ(Value(1000, &tmp2
).ToString(), v
);
// Repeat the read after a flush.
465 dbi
->TEST_FlushMemTable();
466 ASSERT_OK(db_
->Get(ReadOptions(), Key(1000, &tmp1
), &v
));
467 ASSERT_EQ(Value(1000, &tmp2
).ToString(), v
);
// Runs two scenarios with paranoid_checks and create_if_missing enabled.
// After a flush, the first live file reported by GetLiveFilesMetaData() is
// either overwritten with the short string "corrupted sst" (iter 0, which
// changes the file size) or deleted (iter 1). Reopening must then return a
// Corruption status, after which the database is destroyed.
470 TEST_F(CorruptionTest
, FileSystemStateCorrupted
) {
471 for (int iter
= 0; iter
< 2; ++iter
) {
473 options
.paranoid_checks
= true;
474 options
.create_if_missing
= true;
477 ASSERT_OK(db_
->Flush(FlushOptions()));
478 DBImpl
* dbi
= reinterpret_cast<DBImpl
*>(db_
);
479 std::vector
<LiveFileMetaData
> metadata
;
480 dbi
->GetLiveFilesMetaData(&metadata
);
481 ASSERT_GT(metadata
.size(), size_t(0));
// NOTE(review): the path is built without a "/" separator here, unlike
// CorruptTableFileAtLevel(); presumably metadata names already start with
// '/' — confirm.
482 std::string filename
= dbname_
+ metadata
[0].name
;
487 if (iter
== 0) { // corrupt file size
488 unique_ptr
<WritableFile
> file
;
// NOTE(review): the results of NewWritableFile(), Append() and
// DeleteFile() below are not checked.
489 env_
.NewWritableFile(filename
, &file
, EnvOptions());
490 file
->Append(Slice("corrupted sst"));
492 } else { // delete the file
493 env_
.DeleteFile(filename
);
496 Status x
= TryReopen(&options
);
497 ASSERT_TRUE(x
.IsCorruption());
498 DestroyDB(dbname_
, options_
);
503 } // namespace rocksdb
// Test binary entry point: hands the command-line arguments to Google Test
// and returns the result of running every registered test.
505 int main(int argc
, char** argv
) {
506 ::testing::InitGoogleTest(&argc
, argv
);
507 return RUN_ALL_TESTS();
// Alternative entry point that ignores its arguments and only prints a
// SKIPPED notice to stderr, explaining that RepairDB() is not supported in
// ROCKSDB_LITE builds.
513 int main(int /*argc*/, char** /*argv*/) {
514 fprintf(stderr
, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
518 #endif // !ROCKSDB_LITE