]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/corruption_test.cc
bump version to 12.2.12-pve1
[ceph.git] / ceph / src / rocksdb / db / corruption_test.cc
CommitLineData
7c673cae
FG
1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2// This source code is licensed under the BSD-style license found in the
3// LICENSE file in the root directory of this source tree. An additional grant
4// of patent rights can be found in the PATENTS file in the same directory.
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#ifndef ROCKSDB_LITE
11
12#include "rocksdb/db.h"
13
14#include <errno.h>
15#include <fcntl.h>
16#include <inttypes.h>
17#include <sys/stat.h>
18#include <sys/types.h>
19#include "db/db_impl.h"
20#include "db/log_format.h"
21#include "db/version_set.h"
22#include "rocksdb/cache.h"
23#include "rocksdb/env.h"
24#include "rocksdb/table.h"
25#include "rocksdb/write_batch.h"
26#include "util/filename.h"
27#include "util/string_util.h"
28#include "util/testharness.h"
29#include "util/testutil.h"
30
31namespace rocksdb {
32
// Length in bytes of every value the tests store.
static constexpr int kValueSize = 1000;
34
35class CorruptionTest : public testing::Test {
36 public:
37 test::ErrorEnv env_;
38 std::string dbname_;
39 shared_ptr<Cache> tiny_cache_;
40 Options options_;
41 DB* db_;
42
43 CorruptionTest() {
44 // If LRU cache shard bit is smaller than 2 (or -1 which will automatically
45 // set it to 0), test SequenceNumberRecovery will fail, likely because of a
46 // bug in recovery code. Keep it 4 for now to make the test passes.
47 tiny_cache_ = NewLRUCache(100, 4);
48 options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
49 options_.env = &env_;
50 dbname_ = test::TmpDir() + "/corruption_test";
51 DestroyDB(dbname_, options_);
52
53 db_ = nullptr;
54 options_.create_if_missing = true;
55 BlockBasedTableOptions table_options;
56 table_options.block_size_deviation = 0; // make unit test pass for now
57 options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
58 Reopen();
59 options_.create_if_missing = false;
60 }
61
62 ~CorruptionTest() {
63 delete db_;
64 DestroyDB(dbname_, Options());
65 }
66
67 void CloseDb() {
68 delete db_;
69 db_ = nullptr;
70 }
71
72 Status TryReopen(Options* options = nullptr) {
73 delete db_;
74 db_ = nullptr;
75 Options opt = (options ? *options : options_);
76 opt.env = &env_;
77 opt.arena_block_size = 4096;
78 BlockBasedTableOptions table_options;
79 table_options.block_cache = tiny_cache_;
80 table_options.block_size_deviation = 0;
81 opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
82 return DB::Open(opt, dbname_, &db_);
83 }
84
85 void Reopen(Options* options = nullptr) {
86 ASSERT_OK(TryReopen(options));
87 }
88
89 void RepairDB() {
90 delete db_;
91 db_ = nullptr;
92 ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
93 }
94
95 void Build(int n, int flush_every = 0) {
96 std::string key_space, value_space;
97 WriteBatch batch;
98 for (int i = 0; i < n; i++) {
99 if (flush_every != 0 && i != 0 && i % flush_every == 0) {
100 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
101 dbi->TEST_FlushMemTable();
102 }
103 //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
104 Slice key = Key(i, &key_space);
105 batch.Clear();
106 batch.Put(key, Value(i, &value_space));
107 ASSERT_OK(db_->Write(WriteOptions(), &batch));
108 }
109 }
110
111 void Check(int min_expected, int max_expected) {
112 uint64_t next_expected = 0;
113 uint64_t missed = 0;
114 int bad_keys = 0;
115 int bad_values = 0;
116 int correct = 0;
117 std::string value_space;
118 // Do not verify checksums. If we verify checksums then the
119 // db itself will raise errors because data is corrupted.
120 // Instead, we want the reads to be successful and this test
121 // will detect whether the appropriate corruptions have
122 // occurred.
123 Iterator* iter = db_->NewIterator(ReadOptions(false, true));
124 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
125 uint64_t key;
126 Slice in(iter->key());
127 if (!ConsumeDecimalNumber(&in, &key) ||
128 !in.empty() ||
129 key < next_expected) {
130 bad_keys++;
131 continue;
132 }
133 missed += (key - next_expected);
134 next_expected = key + 1;
135 if (iter->value() != Value(static_cast<int>(key), &value_space)) {
136 bad_values++;
137 } else {
138 correct++;
139 }
140 }
141 delete iter;
142
143 fprintf(stderr,
144 "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
145 min_expected, max_expected, correct, bad_keys, bad_values,
146 static_cast<unsigned long long>(missed));
147 ASSERT_LE(min_expected, correct);
148 ASSERT_GE(max_expected, correct);
149 }
150
151 void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) {
152 struct stat sbuf;
153 if (stat(fname.c_str(), &sbuf) != 0) {
154 const char* msg = strerror(errno);
155 ASSERT_TRUE(false) << fname << ": " << msg;
156 }
157
158 if (offset < 0) {
159 // Relative to end of file; make it absolute
160 if (-offset > sbuf.st_size) {
161 offset = 0;
162 } else {
163 offset = static_cast<int>(sbuf.st_size + offset);
164 }
165 }
166 if (offset > sbuf.st_size) {
167 offset = static_cast<int>(sbuf.st_size);
168 }
169 if (offset + bytes_to_corrupt > sbuf.st_size) {
170 bytes_to_corrupt = static_cast<int>(sbuf.st_size - offset);
171 }
172
173 // Do it
174 std::string contents;
175 Status s = ReadFileToString(Env::Default(), fname, &contents);
176 ASSERT_TRUE(s.ok()) << s.ToString();
177 for (int i = 0; i < bytes_to_corrupt; i++) {
178 contents[i + offset] ^= 0x80;
179 }
180 s = WriteStringToFile(Env::Default(), contents, fname);
181 ASSERT_TRUE(s.ok()) << s.ToString();
182 }
183
184 void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
185 // Pick file to corrupt
186 std::vector<std::string> filenames;
187 ASSERT_OK(env_.GetChildren(dbname_, &filenames));
188 uint64_t number;
189 FileType type;
190 std::string fname;
191 int picked_number = -1;
192 for (size_t i = 0; i < filenames.size(); i++) {
193 if (ParseFileName(filenames[i], &number, &type) &&
194 type == filetype &&
195 static_cast<int>(number) > picked_number) { // Pick latest file
196 fname = dbname_ + "/" + filenames[i];
197 picked_number = static_cast<int>(number);
198 }
199 }
200 ASSERT_TRUE(!fname.empty()) << filetype;
201
202 CorruptFile(fname, offset, bytes_to_corrupt);
203 }
204
205 // corrupts exactly one file at level `level`. if no file found at level,
206 // asserts
207 void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
208 std::vector<LiveFileMetaData> metadata;
209 db_->GetLiveFilesMetaData(&metadata);
210 for (const auto& m : metadata) {
211 if (m.level == level) {
212 CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt);
213 return;
214 }
215 }
216 ASSERT_TRUE(false) << "no file found at level";
217 }
218
219
220 int Property(const std::string& name) {
221 std::string property;
222 int result;
223 if (db_->GetProperty(name, &property) &&
224 sscanf(property.c_str(), "%d", &result) == 1) {
225 return result;
226 } else {
227 return -1;
228 }
229 }
230
231 // Return the ith key
232 Slice Key(int i, std::string* storage) {
233 char buf[100];
234 snprintf(buf, sizeof(buf), "%016d", i);
235 storage->assign(buf, strlen(buf));
236 return Slice(*storage);
237 }
238
239 // Return the value to associate with the specified key
240 Slice Value(int k, std::string* storage) {
241 if (k == 0) {
242 // Ugh. Random seed of 0 used to produce no entropy. This code
243 // preserves the implementation that was in place when all of the
244 // magic values in this file were picked.
245 *storage = std::string(kValueSize, ' ');
246 return Slice(*storage);
247 } else {
248 Random r(k);
249 return test::RandomString(&r, kValueSize, storage);
250 }
251 }
252};
253
254TEST_F(CorruptionTest, Recovery) {
255 Build(100);
256 Check(100, 100);
257#ifdef OS_WIN
258 // On Wndows OS Disk cache does not behave properly
259 // We do not call FlushBuffers on every Flush. If we do not close
260 // the log file prior to the corruption we end up with the first
261 // block not corrupted but only the second. However, under the debugger
262 // things work just fine but never pass when running normally
263 // For that reason people may want to run with unbuffered I/O. That option
264 // is not available for WAL though.
265 CloseDb();
266#endif
267 Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
268 Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
269 ASSERT_TRUE(!TryReopen().ok());
270 options_.paranoid_checks = false;
271 Reopen(&options_);
272
273 // The 64 records in the first two log blocks are completely lost.
274 Check(36, 36);
275}
276
277TEST_F(CorruptionTest, RecoverWriteError) {
278 env_.writable_file_error_ = true;
279 Status s = TryReopen();
280 ASSERT_TRUE(!s.ok());
281}
282
283TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
284 // Do enough writing to force minor compaction
285 env_.writable_file_error_ = true;
286 const int num =
287 static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
288 std::string value_storage;
289 Status s;
290 bool failed = false;
291 for (int i = 0; i < num; i++) {
292 WriteBatch batch;
293 batch.Put("a", Value(100, &value_storage));
294 s = db_->Write(WriteOptions(), &batch);
295 if (!s.ok()) {
296 failed = true;
297 }
298 ASSERT_TRUE(!failed || !s.ok());
299 }
300 ASSERT_TRUE(!s.ok());
301 ASSERT_GE(env_.num_writable_file_errors_, 1);
302 env_.writable_file_error_ = false;
303 Reopen();
304}
305
306TEST_F(CorruptionTest, TableFile) {
307 Build(100);
308 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
309 dbi->TEST_FlushMemTable();
310 dbi->TEST_CompactRange(0, nullptr, nullptr);
311 dbi->TEST_CompactRange(1, nullptr, nullptr);
312
313 Corrupt(kTableFile, 100, 1);
314 Check(99, 99);
315}
316
317TEST_F(CorruptionTest, TableFileIndexData) {
318 Options options;
319 // very big, we'll trigger flushes manually
320 options.write_buffer_size = 100 * 1024 * 1024;
321 Reopen(&options);
322 // build 2 tables, flush at 5000
323 Build(10000, 5000);
324 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
325 dbi->TEST_FlushMemTable();
326
327 // corrupt an index block of an entire file
328 Corrupt(kTableFile, -2000, 500);
329 Reopen();
330 // one full file should be readable, since only one was corrupted
331 // the other file should be fully non-readable, since index was corrupted
332 Check(5000, 5000);
333}
334
335TEST_F(CorruptionTest, MissingDescriptor) {
336 Build(1000);
337 RepairDB();
338 Reopen();
339 Check(1000, 1000);
340}
341
342TEST_F(CorruptionTest, SequenceNumberRecovery) {
343 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
344 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
345 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
346 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
347 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
348 RepairDB();
349 Reopen();
350 std::string v;
351 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
352 ASSERT_EQ("v5", v);
353 // Write something. If sequence number was not recovered properly,
354 // it will be hidden by an earlier write.
355 ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
356 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
357 ASSERT_EQ("v6", v);
358 Reopen();
359 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
360 ASSERT_EQ("v6", v);
361}
362
363TEST_F(CorruptionTest, CorruptedDescriptor) {
364 ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
365 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
366 dbi->TEST_FlushMemTable();
367 dbi->TEST_CompactRange(0, nullptr, nullptr);
368
369 Corrupt(kDescriptorFile, 0, 1000);
370 Status s = TryReopen();
371 ASSERT_TRUE(!s.ok());
372
373 RepairDB();
374 Reopen();
375 std::string v;
376 ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
377 ASSERT_EQ("hello", v);
378}
379
380TEST_F(CorruptionTest, CompactionInputError) {
381 Options options;
382 Reopen(&options);
383 Build(10);
384 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
385 dbi->TEST_FlushMemTable();
386 dbi->TEST_CompactRange(0, nullptr, nullptr);
387 dbi->TEST_CompactRange(1, nullptr, nullptr);
388 ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
389
390 Corrupt(kTableFile, 100, 1);
391 Check(9, 9);
392
393 // Force compactions by writing lots of values
394 Build(10000);
395 Check(10000, 10000);
396}
397
398TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
399 Options options;
400 options.paranoid_checks = true;
401 options.write_buffer_size = 131072;
402 options.max_write_buffer_number = 2;
403 Reopen(&options);
404 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
405
406 // Fill levels >= 1
407 for (int level = 1; level < dbi->NumberLevels(); level++) {
408 dbi->Put(WriteOptions(), "", "begin");
409 dbi->Put(WriteOptions(), "~", "end");
410 dbi->TEST_FlushMemTable();
411 for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
412 ++comp_level) {
413 dbi->TEST_CompactRange(comp_level, nullptr, nullptr);
414 }
415 }
416
417 Reopen(&options);
418
419 dbi = reinterpret_cast<DBImpl*>(db_);
420 Build(10);
421 dbi->TEST_FlushMemTable();
422 dbi->TEST_WaitForCompact();
423 ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
424
425 CorruptTableFileAtLevel(0, 100, 1);
426 Check(9, 9);
427
428 // Write must eventually fail because of corrupted table
429 Status s;
430 std::string tmp1, tmp2;
431 bool failed = false;
432 for (int i = 0; i < 10000; i++) {
433 s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
434 if (!s.ok()) {
435 failed = true;
436 }
437 // if one write failed, every subsequent write must fail, too
438 ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
439 }
440 ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
441}
442
443TEST_F(CorruptionTest, UnrelatedKeys) {
444 Build(10);
445 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
446 dbi->TEST_FlushMemTable();
447 Corrupt(kTableFile, 100, 1);
448
449 std::string tmp1, tmp2;
450 ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
451 std::string v;
452 ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
453 ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
454 dbi->TEST_FlushMemTable();
455 ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
456 ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
457}
458
459TEST_F(CorruptionTest, FileSystemStateCorrupted) {
460 for (int iter = 0; iter < 2; ++iter) {
461 Options options;
462 options.paranoid_checks = true;
463 options.create_if_missing = true;
464 Reopen(&options);
465 Build(10);
466 ASSERT_OK(db_->Flush(FlushOptions()));
467 DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
468 std::vector<LiveFileMetaData> metadata;
469 dbi->GetLiveFilesMetaData(&metadata);
470 ASSERT_GT(metadata.size(), size_t(0));
471 std::string filename = dbname_ + metadata[0].name;
472
473 delete db_;
474 db_ = nullptr;
475
476 if (iter == 0) { // corrupt file size
477 unique_ptr<WritableFile> file;
478 env_.NewWritableFile(filename, &file, EnvOptions());
479 file->Append(Slice("corrupted sst"));
480 file.reset();
481 } else { // delete the file
482 env_.DeleteFile(filename);
483 }
484
485 Status x = TryReopen(&options);
486 ASSERT_TRUE(x.IsCorruption());
487 DestroyDB(dbname_, options_);
488 Reopen(&options);
489 }
490}
491
492} // namespace rocksdb
493
494int main(int argc, char** argv) {
495 ::testing::InitGoogleTest(&argc, argv);
496 return RUN_ALL_TESTS();
497}
498
499#else
500#include <stdio.h>
501
// ROCKSDB_LITE build: there is nothing to run, so report that on stderr and
// exit successfully. The parameters are left unnamed because they are
// unused, which keeps unused-parameter warnings quiet.
int main(int /*argc*/, char** /*argv*/) {
  fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
  return 0;
}
506
507#endif // !ROCKSDB_LITE