// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// Introduction of SyncPoint effectively disabled building and running this
// test in Release build, which is a pity because it is a good test.
#include <fcntl.h>

#include <algorithm>
#include <set>
#include <thread>
#include <unordered_set>
#include <utility>

#ifndef OS_WIN
#include <unistd.h>
#endif
#ifdef OS_SOLARIS
#include <alloca.h>
#endif

#include "cache/lru_cache.h"
#include "db/blob/blob_index.h"
#include "db/blob/blob_log_format.h"
#include "db/db_impl/db_impl.h"
#include "db/db_test_util.h"
#include "db/dbformat.h"
#include "db/job_context.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
#include "env/mock_env.h"
#include "file/filename.h"
#include "monitoring/thread_status_util.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/experimental.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/snapshot.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/thread_status.h"
#include "rocksdb/types.h"
#include "rocksdb/utilities/checkpoint.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "table/mock_table.h"
#include "table/scoped_arena_iterator.h"
#include "test_util/sync_point.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/compression.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/rate_limiter.h"
#include "util/string_util.h"
#include "utilities/merge_operators.h"

namespace ROCKSDB_NAMESPACE {

// Note that whole DBTest and its child classes disable fsync on files
// and directories for speed.
// If fsync needs to be covered in a test, put it in other places.
class DBTest : public DBTestBase {
 public:
  DBTest() : DBTestBase("db_test", /*env_do_fsync=*/false) {}
};

class DBTestWithParam
    : public DBTest,
      public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
 public:
  DBTestWithParam() {
    max_subcompactions_ = std::get<0>(GetParam());
    exclusive_manual_compaction_ = std::get<1>(GetParam());
  }

  // Required if inheriting from testing::WithParamInterface<>
  static void SetUpTestCase() {}
  static void TearDownTestCase() {}

  uint32_t max_subcompactions_;
  bool exclusive_manual_compaction_;
};
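
// For reference, a parameterized suite like DBTestWithParam is instantiated
// with INSTANTIATE_TEST_CASE_P. A minimal sketch (the parameter values below
// are illustrative assumptions, not the values this file actually uses):
//
//   INSTANTIATE_TEST_CASE_P(
//       DBTestWithParam, DBTestWithParam,
//       ::testing::Combine(::testing::Values(1, 4),  // max_subcompactions_
//                          ::testing::Bool()));      // exclusive manual compaction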

TEST_F(DBTest, MockEnvTest) {
  std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
  Options options;
  options.create_if_missing = true;
  options.env = env.get();
  DB* db;

  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};

  ASSERT_OK(DB::Open(options, "/dir/db", &db));
  for (size_t i = 0; i < 3; ++i) {
    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
  }

  for (size_t i = 0; i < 3; ++i) {
    std::string res;
    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
    ASSERT_TRUE(res == vals[i]);
  }

  Iterator* iterator = db->NewIterator(ReadOptions());
  iterator->SeekToFirst();
  for (size_t i = 0; i < 3; ++i) {
    ASSERT_TRUE(iterator->Valid());
    ASSERT_TRUE(keys[i] == iterator->key());
    ASSERT_TRUE(vals[i] == iterator->value());
    iterator->Next();
  }
  ASSERT_TRUE(!iterator->Valid());
  delete iterator;

// TEST_FlushMemTable() is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
  ASSERT_OK(dbi->TEST_FlushMemTable());

  for (size_t i = 0; i < 3; ++i) {
    std::string res;
    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
    ASSERT_TRUE(res == vals[i]);
  }
#endif  // ROCKSDB_LITE

  delete db;
}

// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't
// defined.
#ifndef ROCKSDB_LITE
TEST_F(DBTest, MemEnvTest) {
  std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
  Options options;
  options.create_if_missing = true;
  options.env = env.get();
  DB* db;

  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};

  ASSERT_OK(DB::Open(options, "/dir/db", &db));
  for (size_t i = 0; i < 3; ++i) {
    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
  }

  for (size_t i = 0; i < 3; ++i) {
    std::string res;
    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
    ASSERT_TRUE(res == vals[i]);
  }

  Iterator* iterator = db->NewIterator(ReadOptions());
  iterator->SeekToFirst();
  for (size_t i = 0; i < 3; ++i) {
    ASSERT_TRUE(iterator->Valid());
    ASSERT_TRUE(keys[i] == iterator->key());
    ASSERT_TRUE(vals[i] == iterator->value());
    iterator->Next();
  }
  ASSERT_TRUE(!iterator->Valid());
  delete iterator;

  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
  ASSERT_OK(dbi->TEST_FlushMemTable());

  for (size_t i = 0; i < 3; ++i) {
    std::string res;
    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
    ASSERT_TRUE(res == vals[i]);
  }

  delete db;

  options.create_if_missing = false;
  ASSERT_OK(DB::Open(options, "/dir/db", &db));
  for (size_t i = 0; i < 3; ++i) {
    std::string res;
    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
    ASSERT_TRUE(res == vals[i]);
  }
  delete db;
}
#endif  // ROCKSDB_LITE
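
// The second DB::Open above succeeds with create_if_missing = false because
// the in-memory Env outlives the DB object and keeps its files around. A
// minimal sketch of the pattern (every open/reopen must reuse the same Env):
//
//   std::unique_ptr<Env> mem_env{NewMemEnv(Env::Default())};
//   Options opts;
//   opts.env = mem_env.get();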

TEST_F(DBTest, WriteEmptyBatch) {
  Options options = CurrentOptions();
  options.env = env_;
  options.write_buffer_size = 100000;
  CreateAndReopenWithCF({"pikachu"}, options);

  ASSERT_OK(Put(1, "foo", "bar"));
  WriteOptions wo;
  wo.sync = true;
  wo.disableWAL = false;
  WriteBatch empty_batch;
  ASSERT_OK(dbfull()->Write(wo, &empty_batch));

  // make sure we can re-open it.
  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
  ASSERT_EQ("bar", Get(1, "foo"));
}

TEST_F(DBTest, SkipDelay) {
  Options options = CurrentOptions();
  options.env = env_;
  options.write_buffer_size = 100000;
  CreateAndReopenWithCF({"pikachu"}, options);

  for (bool sync : {true, false}) {
    for (bool disableWAL : {true, false}) {
      if (sync && disableWAL) {
        // sync and disableWAL are incompatible.
        continue;
      }
      // Use a small number to ensure a large delay that is still effective
      // when we do Put
      // TODO(myabandeh): this is time dependent and could potentially make
      // the test flaky
      auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
      std::atomic<int> sleep_count(0);
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
          "DBImpl::DelayWrite:Sleep",
          [&](void* /*arg*/) { sleep_count.fetch_add(1); });
      std::atomic<int> wait_count(0);
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
          "DBImpl::DelayWrite:Wait",
          [&](void* /*arg*/) { wait_count.fetch_add(1); });
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

      WriteOptions wo;
      wo.sync = sync;
      wo.disableWAL = disableWAL;
      wo.no_slowdown = true;
      // Large enough to exceed allowance for one time interval
      std::string large_value(1024, 'x');
      // Perhaps ideally this first write would fail because of delay, but
      // the current implementation does not guarantee that.
      dbfull()->Put(wo, "foo", large_value).PermitUncheckedError();
      // We need the 2nd write to trigger delay. This is because delay is
      // estimated based on the last write size which is 0 for the first write.
      ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value));
      ASSERT_GE(sleep_count.load(), 0);
      ASSERT_GE(wait_count.load(), 0);
      token.reset();

      token = dbfull()->TEST_write_controler().GetDelayToken(1000000);
      wo.no_slowdown = false;
      ASSERT_OK(dbfull()->Put(wo, "foo3", large_value));
      ASSERT_GE(sleep_count.load(), 1);
      token.reset();
    }
  }
}
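
// A note on the SyncPoint pattern used throughout these tests: callbacks are
// registered per named marker, then EnableProcessing() turns them on. The
// tests here leave the cleanup to the test-harness teardown; standalone code
// would disable and clear explicitly. A minimal sketch ("MyClass::MyMarker"
// is a hypothetical marker name):
//
//   SyncPoint::GetInstance()->SetCallBack(
//       "MyClass::MyMarker", [](void* /*arg*/) { /* observe or inject */ });
//   SyncPoint::GetInstance()->EnableProcessing();
//   // ... exercise the code under test ...
//   SyncPoint::GetInstance()->DisableProcessing();
//   SyncPoint::GetInstance()->ClearAllCallBacks();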

TEST_F(DBTest, MixedSlowdownOptions) {
  Options options = CurrentOptions();
  options.env = env_;
  options.write_buffer_size = 100000;
  CreateAndReopenWithCF({"pikachu"}, options);
  std::vector<port::Thread> threads;
  std::atomic<int> thread_num(0);

  std::function<void()> write_slowdown_func = [&]() {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions wo;
    wo.no_slowdown = false;
    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
  };
  std::function<void()> write_no_slowdown_func = [&]() {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions wo;
    wo.no_slowdown = true;
    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
  };
  // Use a small number to ensure a large delay that is still effective
  // when we do Put
  // TODO(myabandeh): this is time dependent and could potentially make
  // the test flaky
  auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
  std::atomic<int> sleep_count(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) {
        sleep_count.fetch_add(1);
        if (threads.empty()) {
          for (int i = 0; i < 2; ++i) {
            threads.emplace_back(write_slowdown_func);
          }
          for (int i = 0; i < 2; ++i) {
            threads.emplace_back(write_no_slowdown_func);
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  WriteOptions wo;
  wo.sync = false;
  wo.disableWAL = false;
  wo.no_slowdown = false;
  ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
  // We need the 2nd write to trigger delay. This is because delay is
  // estimated based on the last write size which is 0 for the first write.
  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
  token.reset();

  for (auto& t : threads) {
    t.join();
  }
  ASSERT_GE(sleep_count.load(), 1);

  wo.no_slowdown = true;
  ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
}

TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
  Options options = CurrentOptions();
  options.env = env_;
  options.write_buffer_size = 100000;
  CreateAndReopenWithCF({"pikachu"}, options);
  std::vector<port::Thread> threads;
  std::atomic<int> thread_num(0);

  std::function<void()> write_no_slowdown_func = [&]() {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions wo;
    wo.no_slowdown = true;
    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
  };
  // Use a small number to ensure a large delay that is still effective
  // when we do Put
  // TODO(myabandeh): this is time dependent and could potentially make
  // the test flaky
  auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
  std::atomic<int> sleep_count(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) {
        sleep_count.fetch_add(1);
        if (threads.empty()) {
          for (int i = 0; i < 2; ++i) {
            threads.emplace_back(write_no_slowdown_func);
          }
          // Sleep for 3s to allow the threads to insert themselves into the
          // write queue
          env_->SleepForMicroseconds(3000000ULL);
        }
      });
  std::atomic<int> wait_count(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::DelayWrite:Wait",
      [&](void* /*arg*/) { wait_count.fetch_add(1); });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  WriteOptions wo;
  wo.sync = false;
  wo.disableWAL = false;
  wo.no_slowdown = false;
  ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
  // We need the 2nd write to trigger delay. This is because delay is
  // estimated based on the last write size which is 0 for the first write.
  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
  token.reset();

  for (auto& t : threads) {
    t.join();
  }
  ASSERT_EQ(sleep_count.load(), 1);
  ASSERT_GE(wait_count.load(), 0);
}

TEST_F(DBTest, MixedSlowdownOptionsStop) {
  Options options = CurrentOptions();
  options.env = env_;
  options.write_buffer_size = 100000;
  CreateAndReopenWithCF({"pikachu"}, options);
  std::vector<port::Thread> threads;
  std::atomic<int> thread_num(0);

  std::function<void()> write_slowdown_func = [&]() {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions wo;
    wo.no_slowdown = false;
    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
  };
  std::function<void()> write_no_slowdown_func = [&]() {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions wo;
    wo.no_slowdown = true;
    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
  };
  std::function<void()> wakeup_writer = [&]() {
    dbfull()->mutex_.Lock();
    dbfull()->bg_cv_.SignalAll();
    dbfull()->mutex_.Unlock();
  };
  // Use a small number to ensure a large delay that is still effective
  // when we do Put
  // TODO(myabandeh): this is time dependent and could potentially make
  // the test flaky
  auto token = dbfull()->TEST_write_controler().GetStopToken();
  std::atomic<int> wait_count(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
        wait_count.fetch_add(1);
        if (threads.empty()) {
          for (int i = 0; i < 2; ++i) {
            threads.emplace_back(write_slowdown_func);
          }
          for (int i = 0; i < 2; ++i) {
            threads.emplace_back(write_no_slowdown_func);
          }
          // Sleep for 3s to allow the threads to insert themselves into the
          // write queue
          env_->SleepForMicroseconds(3000000ULL);
        }
        token.reset();
        threads.emplace_back(wakeup_writer);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  WriteOptions wo;
  wo.sync = false;
  wo.disableWAL = false;
  wo.no_slowdown = false;
  ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
  // We need the 2nd write to trigger delay. This is because delay is
  // estimated based on the last write size which is 0 for the first write.
  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
  token.reset();

  for (auto& t : threads) {
    t.join();
  }
  ASSERT_GE(wait_count.load(), 1);

  wo.no_slowdown = true;
  ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
}
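
// A note on the WriteController tokens used above: GetDelayToken(rate) and
// GetStopToken() return RAII handles; writes are delayed (to roughly `rate`
// bytes/sec) or stopped for as long as a token is alive, and releasing the
// token lifts the throttling. A minimal sketch of the pattern:
//
//   {
//     auto token = dbfull()->TEST_write_controler().GetDelayToken(1 << 20);
//     // writes issued here are delayed to ~1MB/s
//   }  // token destroyed; normal write speed resumes
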
#ifndef ROCKSDB_LITE

TEST_F(DBTest, LevelLimitReopen) {
  Options options = CurrentOptions();
  CreateAndReopenWithCF({"pikachu"}, options);

  const std::string value(1024 * 1024, ' ');
  int i = 0;
  while (NumTableFilesAtLevel(2, 1) == 0) {
    ASSERT_OK(Put(1, Key(i++), value));
  }

  options.num_levels = 1;
  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
  ASSERT_EQ(s.IsInvalidArgument(), true);
  ASSERT_EQ(s.ToString(),
            "Invalid argument: db has more levels than options.num_levels");

  options.num_levels = 10;
  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
}
#endif  // ROCKSDB_LITE

#ifndef ROCKSDB_LITE
TEST_F(DBTest, LevelReopenWithFIFO) {
  const int kLevelCount = 4;
  const int kKeyCount = 5;
  const int kTotalSstFileCount = kLevelCount * kKeyCount;
  const int kCF = 1;

  Options options = CurrentOptions();
  // Config level0_file_num_compaction_trigger to prevent L0 files from being
  // automatically compacted while we are constructing an LSM tree structure
  // to test multi-level FIFO compaction.
  options.level0_file_num_compaction_trigger = kKeyCount + 1;
  CreateAndReopenWithCF({"pikachu"}, options);

  // The expected number of files per level after each file creation.
  const std::string expected_files_per_level[kLevelCount][kKeyCount] = {
      {"0,0,0,1", "0,0,0,2", "0,0,0,3", "0,0,0,4", "0,0,0,5"},
      {"0,0,1,5", "0,0,2,5", "0,0,3,5", "0,0,4,5", "0,0,5,5"},
      {"0,1,5,5", "0,2,5,5", "0,3,5,5", "0,4,5,5", "0,5,5,5"},
      {"1,5,5,5", "2,5,5,5", "3,5,5,5", "4,5,5,5", "5,5,5,5"},
  };

  const std::string expected_entries[kKeyCount][kLevelCount + 1] = {
      {"[ ]", "[ a3 ]", "[ a2, a3 ]", "[ a1, a2, a3 ]", "[ a0, a1, a2, a3 ]"},
      {"[ ]", "[ b3 ]", "[ b2, b3 ]", "[ b1, b2, b3 ]", "[ b0, b1, b2, b3 ]"},
      {"[ ]", "[ c3 ]", "[ c2, c3 ]", "[ c1, c2, c3 ]", "[ c0, c1, c2, c3 ]"},
      {"[ ]", "[ d3 ]", "[ d2, d3 ]", "[ d1, d2, d3 ]", "[ d0, d1, d2, d3 ]"},
      {"[ ]", "[ e3 ]", "[ e2, e3 ]", "[ e1, e2, e3 ]", "[ e0, e1, e2, e3 ]"},
  };

  // The loop below creates the following LSM tree where each (k, v) pair
  // represents a file that contains that entry. Each time a file is created,
  // the db is reopened with FIFO compaction and we verify that the LSM tree
  // structure is still the same.
  //
  // The resulting LSM tree will contain 5 different keys. Each key has
  // 4 different versions, each located in a different level.
  //
  // L0: (e, e0) (d, d0) (c, c0) (b, b0) (a, a0)
  // L1: (a, a1) (b, b1) (c, c1) (d, d1) (e, e1)
  // L2: (a, a2) (b, b2) (c, c2) (d, d2) (e, e2)
  // L3: (a, a3) (b, b3) (c, c3) (d, d3) (e, e3)
  for (int l = 0; l < kLevelCount; ++l) {
    int level = kLevelCount - 1 - l;
    for (int p = 0; p < kKeyCount; ++p) {
      std::string put_key = std::string(1, char('a' + p));
      ASSERT_OK(Put(kCF, put_key, put_key + std::to_string(level)));
      ASSERT_OK(Flush(kCF));
      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
      for (int g = 0; g < kKeyCount; ++g) {
        int entry_count = (p >= g) ? l + 1 : l;
        std::string get_key = std::string(1, char('a' + g));
        CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count], get_key,
                                      kCF, {"pikachu"}, options);
      }
      if (level != 0) {
        MoveFilesToLevel(level, kCF);
        for (int g = 0; g < kKeyCount; ++g) {
          int entry_count = (p >= g) ? l + 1 : l;
          std::string get_key = std::string(1, char('a' + g));
          CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count],
                                        get_key, kCF, {"pikachu"}, options);
        }
      }
      ASSERT_EQ(expected_files_per_level[l][p], FilesPerLevel(kCF));
    }
  }

  // The expected number of sst files in each level after each FIFO compaction
  // that deletes the oldest sst file.
  const std::string expected_files_per_level_after_fifo[] = {
      "5,5,5,4", "5,5,5,3", "5,5,5,2", "5,5,5,1", "5,5,5", "5,5,4", "5,5,3",
      "5,5,2",   "5,5,1",   "5,5",     "5,4",     "5,3",   "5,2",   "5,1",
      "5",       "4",       "3",       "2",       "1",     "",
  };

  // The expected value entries of each key after each FIFO compaction.
  // This verifies whether FIFO removes the file with the smallest key in
  // non-L0 levels first, and then the oldest files in L0.
  const std::string expected_entries_after_fifo[kKeyCount][kLevelCount + 1] = {
      {"[ a0, a1, a2, a3 ]", "[ a0, a1, a2 ]", "[ a0, a1 ]", "[ a0 ]", "[ ]"},
      {"[ b0, b1, b2, b3 ]", "[ b0, b1, b2 ]", "[ b0, b1 ]", "[ b0 ]", "[ ]"},
      {"[ c0, c1, c2, c3 ]", "[ c0, c1, c2 ]", "[ c0, c1 ]", "[ c0 ]", "[ ]"},
      {"[ d0, d1, d2, d3 ]", "[ d0, d1, d2 ]", "[ d0, d1 ]", "[ d0 ]", "[ ]"},
      {"[ e0, e1, e2, e3 ]", "[ e0, e1, e2 ]", "[ e0, e1 ]", "[ e0 ]", "[ ]"},
  };

  // In the 2nd phase, we reopen the DB with FIFO compaction. In each reopen,
  // we configure max_table_files_size so that FIFO will remove exactly one
  // file at a time upon compaction, and we will use it to verify whether the
  // sst files are deleted in the correct order.
  for (int i = 0; i < kTotalSstFileCount; ++i) {
    uint64_t total_sst_files_size = 0;
    ASSERT_TRUE(dbfull()->GetIntProperty(
        handles_[1], "rocksdb.total-sst-files-size", &total_sst_files_size));
    ASSERT_TRUE(total_sst_files_size > 0);

    Options fifo_options(options);
    fifo_options.compaction_style = kCompactionStyleFIFO;
    options.create_if_missing = false;
    fifo_options.max_open_files = -1;
    fifo_options.disable_auto_compactions = false;
    // Config max_table_files_size to be total_sst_files_size - 1 so that
    // FIFO will delete one file.
    fifo_options.compaction_options_fifo.max_table_files_size =
        total_sst_files_size - 1;
    ASSERT_OK(
        TryReopenWithColumnFamilies({"default", "pikachu"}, fifo_options));
    // For FIFO to pick a compaction
    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
    ASSERT_OK(dbfull()->TEST_WaitForCompact(false));
    for (int g = 0; g < kKeyCount; ++g) {
      std::string get_key = std::string(1, char('a' + g));
      int status_index = i / kKeyCount;
      if ((i % kKeyCount) >= g) {
        // If true, then it means the sst file containing the get_key in the
        // current level has already been deleted, so we need to move the
        // status_index for checking the expected value.
        status_index++;
      }
      CheckAllEntriesWithFifoReopen(
          expected_entries_after_fifo[g][status_index], get_key, kCF,
          {"pikachu"}, options);
    }
    ASSERT_EQ(expected_files_per_level_after_fifo[i], FilesPerLevel(kCF));
  }
}
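
// For reference, a plain FIFO-compaction configuration outside of this test
// resembles the fifo_options above. A minimal sketch (the size bound is an
// illustrative assumption; tune it per workload):
//
//   Options fifo_opts;
//   fifo_opts.compaction_style = kCompactionStyleFIFO;
//   // Oldest files are dropped once total SST size exceeds this bound.
//   fifo_opts.compaction_options_fifo.max_table_files_size =
//       uint64_t{1} << 30;  // 1GB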
#endif  // !ROCKSDB_LITE

TEST_F(DBTest, PutSingleDeleteGet) {
  do {
    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
    ASSERT_OK(Put(1, "foo", "v1"));
    ASSERT_EQ("v1", Get(1, "foo"));
    ASSERT_OK(Put(1, "foo2", "v2"));
    ASSERT_EQ("v2", Get(1, "foo2"));
    ASSERT_OK(SingleDelete(1, "foo"));
    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
    // Skip FIFO and universal compaction because they do not apply to the test
    // case. Skip MergePut because single delete does not get removed when it
    // encounters a merge.
  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
                         kSkipMergePut));
}
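
// A note on the SingleDelete semantics relied on above and in the tests
// below: SingleDelete is only guaranteed to work when the key was written
// exactly once since the last deletion and was never merged. A minimal
// usage sketch:
//
//   ASSERT_OK(db->Put(WriteOptions(), "k", "v"));      // exactly one Put
//   ASSERT_OK(db->SingleDelete(WriteOptions(), "k"));  // cancels that Put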

TEST_F(DBTest, ReadFromPersistedTier) {
  do {
    Random rnd(301);
    Options options = CurrentOptions();
    for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
      CreateAndReopenWithCF({"pikachu"}, options);
      WriteOptions wopt;
      wopt.disableWAL = (disableWAL == 1);
      // 1st round: put but not flush
      ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
      ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
      ASSERT_EQ("first", Get(1, "foo"));
      ASSERT_EQ("one", Get(1, "bar"));

      // Read directly from persisted data.
      ReadOptions ropt;
      ropt.read_tier = kPersistedTier;
      std::string value;
      if (wopt.disableWAL) {
        // as data has not yet been flushed, we expect not found.
        ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
      } else {
        ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
        ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
      }

      // Multiget
      std::vector<ColumnFamilyHandle*> multiget_cfs;
      multiget_cfs.push_back(handles_[1]);
      multiget_cfs.push_back(handles_[1]);
      std::vector<Slice> multiget_keys;
      multiget_keys.push_back("foo");
      multiget_keys.push_back("bar");
      std::vector<std::string> multiget_values;
      auto statuses =
          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
      if (wopt.disableWAL) {
        ASSERT_TRUE(statuses[0].IsNotFound());
        ASSERT_TRUE(statuses[1].IsNotFound());
      } else {
        ASSERT_OK(statuses[0]);
        ASSERT_OK(statuses[1]);
      }

      // 2nd round: flush and put a new value in memtable.
      ASSERT_OK(Flush(1));
      ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));

      // once the data has been flushed, we are able to get the
      // data when kPersistedTier is used.
      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
      ASSERT_EQ(value, "first");
      ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
      ASSERT_EQ(value, "one");
      if (wopt.disableWAL) {
        ASSERT_TRUE(
            db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
      } else {
        ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
        ASSERT_EQ(value, "hello");
      }

      // Expect same result in multiget
      multiget_cfs.push_back(handles_[1]);
      multiget_keys.push_back("rocksdb");
      statuses =
          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
      ASSERT_TRUE(statuses[0].ok());
      ASSERT_EQ("first", multiget_values[0]);
      ASSERT_TRUE(statuses[1].ok());
      ASSERT_EQ("one", multiget_values[1]);
      if (wopt.disableWAL) {
        ASSERT_TRUE(statuses[2].IsNotFound());
      } else {
        ASSERT_OK(statuses[2]);
      }

      // 3rd round: delete and flush
      ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
      ASSERT_OK(Flush(1));
      ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));

      ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
      if (wopt.disableWAL) {
        // Still expect finding the value as its delete has not yet been
        // flushed.
        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
        ASSERT_EQ(value, "one");
      } else {
        ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
      }
      ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
      ASSERT_EQ(value, "hello");

      statuses =
          db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
      ASSERT_TRUE(statuses[0].IsNotFound());
      if (wopt.disableWAL) {
        ASSERT_TRUE(statuses[1].ok());
        ASSERT_EQ("one", multiget_values[1]);
      } else {
        ASSERT_TRUE(statuses[1].IsNotFound());
      }
      ASSERT_TRUE(statuses[2].ok());
      ASSERT_EQ("hello", multiget_values[2]);
      if (wopt.disableWAL == 0) {
        DestroyAndReopen(options);
      }
    }
  } while (ChangeOptions());
}
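
// kPersistedTier reads skip anything that lives only in the memtable (and,
// with the WAL disabled, anything not yet flushed). A minimal sketch of the
// read-side setup exercised above:
//
//   ReadOptions ro;
//   ro.read_tier = kPersistedTier;
//   std::string value;
//   Status s = db->Get(ro, "key", &value);  // NotFound until data persists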

TEST_F(DBTest, SingleDeleteFlush) {
  // Test to check whether flushing preserves a single delete hidden
  // behind a put.
  do {
    Random rnd(301);

    Options options = CurrentOptions();
    options.disable_auto_compactions = true;
    CreateAndReopenWithCF({"pikachu"}, options);

    // Put values on second level (so that they will not be in the same
    // compaction as the other operations).
    ASSERT_OK(Put(1, "foo", "first"));
    ASSERT_OK(Put(1, "bar", "one"));
    ASSERT_OK(Flush(1));
    MoveFilesToLevel(2, 1);

    // (Single) delete hidden by a put
    ASSERT_OK(SingleDelete(1, "foo"));
    ASSERT_OK(Put(1, "foo", "second"));
    ASSERT_OK(Delete(1, "bar"));
    ASSERT_OK(Put(1, "bar", "two"));
    ASSERT_OK(Flush(1));

    ASSERT_OK(SingleDelete(1, "foo"));
    ASSERT_OK(Delete(1, "bar"));
    ASSERT_OK(Flush(1));

    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
                                     nullptr, nullptr));

    ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
    // Skip FIFO and universal compaction because they do not apply to the test
    // case. Skip MergePut because single delete does not get removed when it
    // encounters a merge.
  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
                         kSkipMergePut));
}

TEST_F(DBTest, SingleDeletePutFlush) {
  // Single deletes that encounter the matching put in a flush should get
  // removed.
  do {
    Random rnd(301);

    Options options = CurrentOptions();
    options.disable_auto_compactions = true;
    CreateAndReopenWithCF({"pikachu"}, options);

    ASSERT_OK(Put(1, "foo", Slice()));
    ASSERT_OK(Put(1, "a", Slice()));
    ASSERT_OK(SingleDelete(1, "a"));
    ASSERT_OK(Flush(1));

    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
    // Skip FIFO and universal compaction because they do not apply to the test
    // case. Skip MergePut because single delete does not get removed when it
    // encounters a merge.
  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
                         kSkipMergePut));
}

// Disabled because not all platforms can run it.
// It requires more than 9GB of memory to run, with a single allocation
// of more than 3GB.
TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
  const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024};  // 4GB value
  std::string raw(kValueSize, 'v');
  Options options = CurrentOptions();
  options.env = env_;
  options.merge_operator = MergeOperators::CreatePutOperator();
  options.write_buffer_size = 100000;  // Small write buffer
  options.paranoid_checks = true;
  DestroyAndReopen(options);

  ASSERT_OK(Put("boo", "v1"));
  ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
  ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());

  WriteBatch wb;
  ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
  ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());

  Slice value_slice = raw;
  Slice key_slice = "foo";
  SliceParts sp_key(&key_slice, 1);
  SliceParts sp_value(&value_slice, 1);

  ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
  ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
}

// Disabled because not all platforms can run it.
// It requires more than 9GB of memory to run, with a single allocation
// of more than 3GB.
TEST_F(DBTest, DISABLED_VeryLargeValue) {
  const size_t kValueSize = 3221225472u;  // 3GB value
  const size_t kKeySize = 8388608u;       // 8MB key
  std::string raw(kValueSize, 'v');
  std::string key1(kKeySize, 'c');
  std::string key2(kKeySize, 'd');

  Options options = CurrentOptions();
  options.env = env_;
  options.write_buffer_size = 100000;  // Small write buffer
  options.paranoid_checks = true;
  DestroyAndReopen(options);

  ASSERT_OK(Put("boo", "v1"));
  ASSERT_OK(Put("foo", "v1"));
  ASSERT_OK(Put(key1, raw));
  raw[0] = 'w';
  ASSERT_OK(Put(key2, raw));
  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());

#ifndef ROCKSDB_LITE
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
#endif  // !ROCKSDB_LITE

  std::string value;
  Status s = db_->Get(ReadOptions(), key1, &value);
  ASSERT_OK(s);
  ASSERT_EQ(kValueSize, value.size());
  ASSERT_EQ('v', value[0]);

  s = db_->Get(ReadOptions(), key2, &value);
  ASSERT_OK(s);
  ASSERT_EQ(kValueSize, value.size());
  ASSERT_EQ('w', value[0]);

  // Compact all files.
  ASSERT_OK(Flush());
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));

  // Check DB is not in read-only state.
  ASSERT_OK(Put("boo", "v1"));

  s = db_->Get(ReadOptions(), key1, &value);
  ASSERT_OK(s);
  ASSERT_EQ(kValueSize, value.size());
  ASSERT_EQ('v', value[0]);

  s = db_->Get(ReadOptions(), key2, &value);
  ASSERT_OK(s);
  ASSERT_EQ(kValueSize, value.size());
  ASSERT_EQ('w', value[0]);
}

TEST_F(DBTest, GetFromImmutableLayer) {
  do {
    Options options = CurrentOptions();
    options.env = env_;
    CreateAndReopenWithCF({"pikachu"}, options);

    ASSERT_OK(Put(1, "foo", "v1"));
    ASSERT_EQ("v1", Get(1, "foo"));

    // Block sync calls
    env_->delay_sstable_sync_.store(true, std::memory_order_release);
    ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
    ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush
    ASSERT_EQ("v1", Get(1, "foo"));
    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
    // Release sync calls
    env_->delay_sstable_sync_.store(false, std::memory_order_release);
  } while (ChangeOptions());
}

TEST_F(DBTest, GetLevel0Ordering) {
  do {
    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
    // Check that we process level-0 files in correct order. The code
    // below generates two level-0 files where the earlier one comes
    // before the later one in the level-0 file list since the earlier
    // one has a smaller "smallest" key.
    ASSERT_OK(Put(1, "bar", "b"));
    ASSERT_OK(Put(1, "foo", "v1"));
    ASSERT_OK(Flush(1));
    ASSERT_OK(Put(1, "foo", "v2"));
    ASSERT_OK(Flush(1));
    ASSERT_EQ("v2", Get(1, "foo"));
  } while (ChangeOptions());
}

TEST_F(DBTest, WrongLevel0Config) {
  Options options = CurrentOptions();
  Close();
  ASSERT_OK(DestroyDB(dbname_, options));
  options.level0_stop_writes_trigger = 1;
  options.level0_slowdown_writes_trigger = 2;
  options.level0_file_num_compaction_trigger = 3;
  ASSERT_OK(DB::Open(options, dbname_, &db_));
}

#ifndef ROCKSDB_LITE
TEST_F(DBTest, GetOrderedByLevels) {
  do {
    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
    ASSERT_OK(Put(1, "foo", "v1"));
    Compact(1, "a", "z");
    ASSERT_EQ("v1", Get(1, "foo"));
    ASSERT_OK(Put(1, "foo", "v2"));
    ASSERT_EQ("v2", Get(1, "foo"));
    ASSERT_OK(Flush(1));
    ASSERT_EQ("v2", Get(1, "foo"));
  } while (ChangeOptions());
}

TEST_F(DBTest, GetPicksCorrectFile) {
  do {
    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
    // Arrange to have multiple files in a non-level-0 level.
    ASSERT_OK(Put(1, "a", "va"));
    Compact(1, "a", "b");
    ASSERT_OK(Put(1, "x", "vx"));
    Compact(1, "x", "y");
    ASSERT_OK(Put(1, "f", "vf"));
    Compact(1, "f", "g");
    ASSERT_EQ("va", Get(1, "a"));
    ASSERT_EQ("vf", Get(1, "f"));
    ASSERT_EQ("vx", Get(1, "x"));
  } while (ChangeOptions());
}

TEST_F(DBTest, GetEncountersEmptyLevel) {
  do {
    Options options = CurrentOptions();
    CreateAndReopenWithCF({"pikachu"}, options);
    // Arrange for the following to happen:
    // * sstable A in level 0
    // * nothing in level 1
    // * sstable B in level 2
    // Then do enough Get() calls to arrange for an automatic compaction
    // of sstable A. A bug would cause the compaction to be marked as
    // occurring at level 1 (instead of the correct level 0).

    // Step 1: First place sstables in levels 0 and 2
    ASSERT_OK(Put(1, "a", "begin"));
    ASSERT_OK(Put(1, "z", "end"));
    ASSERT_OK(Flush(1));
    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
    ASSERT_OK(Put(1, "a", "begin"));
    ASSERT_OK(Put(1, "z", "end"));
    ASSERT_OK(Flush(1));
    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
    ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);

    // Step 2: clear level 1 if necessary.
    ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
    ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);

    // Step 3: read a bunch of times
    for (int i = 0; i < 1000; i++) {
      ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
    }

    // Step 4: Wait for compaction to finish
    ASSERT_OK(dbfull()->TEST_WaitForCompact());

    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);  // XXX
  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
}
#endif  // ROCKSDB_LITE

TEST_F(DBTest, FlushMultipleMemtable) {
  do {
    Options options = CurrentOptions();
    WriteOptions writeOpt = WriteOptions();
    writeOpt.disableWAL = true;
    options.max_write_buffer_number = 4;
    options.min_write_buffer_number_to_merge = 3;
    options.max_write_buffer_size_to_maintain = -1;
    CreateAndReopenWithCF({"pikachu"}, options);
    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
    ASSERT_OK(Flush(1));
    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));

    ASSERT_EQ("v1", Get(1, "foo"));
    ASSERT_EQ("v1", Get(1, "bar"));
    ASSERT_OK(Flush(1));
  } while (ChangeCompactOptions());
}
#ifndef ROCKSDB_LITE
TEST_F(DBTest, FlushSchedule) {
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.level0_stop_writes_trigger = 1 << 10;
  options.level0_slowdown_writes_trigger = 1 << 10;
  options.min_write_buffer_number_to_merge = 1;
  options.max_write_buffer_size_to_maintain =
      static_cast<int64_t>(options.write_buffer_size);
  options.max_write_buffer_number = 2;
  options.write_buffer_size = 120 * 1024;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull;
  options.listeners.push_back(flush_listener);
  CreateAndReopenWithCF({"pikachu"}, options);
  std::vector<port::Thread> threads;

  std::atomic<int> thread_num(0);
  // Each column family will have 5 threads, each thread generating 2
  // memtables. Each column family should end up with 10 table files.
  std::function<void()> fill_memtable_func = [&]() {
    int a = thread_num.fetch_add(1);
    Random rnd(a);
    WriteOptions wo;
    // this should fill up 2 memtables
    for (int k = 0; k < 5000; ++k) {
      ASSERT_OK(db_->Put(wo, handles_[a & 1], rnd.RandomString(13), ""));
    }
  };

  for (int i = 0; i < 10; ++i) {
    threads.emplace_back(fill_memtable_func);
  }

  for (auto& t : threads) {
    t.join();
  }

  auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
  auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
  ASSERT_LE(default_tables, static_cast<uint64_t>(10));
  ASSERT_GT(default_tables, static_cast<uint64_t>(0));
  ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
  ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
}
#endif  // ROCKSDB_LITE

namespace {
class KeepFilter : public CompactionFilter {
 public:
  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
              std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return false;
  }

  const char* Name() const override { return "KeepFilter"; }
};

class KeepFilterFactory : public CompactionFilterFactory {
 public:
  explicit KeepFilterFactory(bool check_context = false)
      : check_context_(check_context) {}

  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
      const CompactionFilter::Context& context) override {
    if (check_context_) {
      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
    }
    return std::unique_ptr<CompactionFilter>(new KeepFilter());
  }

  const char* Name() const override { return "KeepFilterFactory"; }
  bool check_context_;
  std::atomic_bool expect_full_compaction_;
  std::atomic_bool expect_manual_compaction_;
};

class DelayFilter : public CompactionFilter {
 public:
  explicit DelayFilter(DBTestBase* d) : db_test(d) {}
  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
              std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    db_test->env_->MockSleepForMicroseconds(1000);
    return true;
  }

  const char* Name() const override { return "DelayFilter"; }

 private:
  DBTestBase* db_test;
};

class DelayFilterFactory : public CompactionFilterFactory {
 public:
  explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
      const CompactionFilter::Context& /*context*/) override {
    return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
  }

  const char* Name() const override { return "DelayFilterFactory"; }

 private:
  DBTestBase* db_test;
};
}  // anonymous namespace
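
// A factory like the ones above is wired into a DB via Options. A minimal
// sketch of how tests typically use these helpers:
//
//   Options opts = CurrentOptions();
//   opts.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
//   // Every compaction will now consult a KeepFilter instance.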

#ifndef ROCKSDB_LITE

static std::string CompressibleString(Random* rnd, int len) {
  std::string r;
  test::CompressibleString(rnd, 0.8, len, &r);
  return r;
}
#endif  // ROCKSDB_LITE

TEST_F(DBTest, FailMoreDbPaths) {
  Options options = CurrentOptions();
  options.db_paths.emplace_back(dbname_, 10000000);
  options.db_paths.emplace_back(dbname_ + "_2", 1000000);
  options.db_paths.emplace_back(dbname_ + "_3", 1000000);
  options.db_paths.emplace_back(dbname_ + "_4", 1000000);
  options.db_paths.emplace_back(dbname_ + "_5", 1000000);
  ASSERT_TRUE(TryReopen(options).IsNotSupported());
}

void CheckColumnFamilyMeta(
    const ColumnFamilyMetaData& cf_meta, const std::string& cf_name,
    const std::vector<std::vector<FileMetaData>>& files_by_level,
    uint64_t start_time, uint64_t end_time) {
  ASSERT_EQ(cf_meta.name, cf_name);
  ASSERT_EQ(cf_meta.levels.size(), files_by_level.size());

  uint64_t cf_size = 0;
  size_t file_count = 0;

  for (size_t i = 0; i < cf_meta.levels.size(); ++i) {
    const auto& level_meta_from_cf = cf_meta.levels[i];
    const auto& level_meta_from_files = files_by_level[i];

    ASSERT_EQ(level_meta_from_cf.level, i);
    ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size());

    file_count += level_meta_from_cf.files.size();

    uint64_t level_size = 0;
    for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) {
      const auto& file_meta_from_cf = level_meta_from_cf.files[j];
      const auto& file_meta_from_files = level_meta_from_files[j];

      level_size += file_meta_from_cf.size;

      ASSERT_EQ(file_meta_from_cf.file_number,
                file_meta_from_files.fd.GetNumber());
      ASSERT_EQ(file_meta_from_cf.file_number,
                TableFileNameToNumber(file_meta_from_cf.name));
      ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size);
      ASSERT_EQ(file_meta_from_cf.smallest_seqno,
                file_meta_from_files.fd.smallest_seqno);
      ASSERT_EQ(file_meta_from_cf.largest_seqno,
                file_meta_from_files.fd.largest_seqno);
      ASSERT_EQ(file_meta_from_cf.smallestkey,
                file_meta_from_files.smallest.user_key().ToString());
      ASSERT_EQ(file_meta_from_cf.largestkey,
                file_meta_from_files.largest.user_key().ToString());
      ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number,
                file_meta_from_files.oldest_blob_file_number);
      ASSERT_EQ(file_meta_from_cf.oldest_ancester_time,
                file_meta_from_files.oldest_ancester_time);
      ASSERT_EQ(file_meta_from_cf.file_creation_time,
                file_meta_from_files.file_creation_time);
      ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
      ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
      ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
      ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
      // More from FileStorageInfo
      ASSERT_EQ(file_meta_from_cf.file_type, kTableFile);
      ASSERT_EQ(file_meta_from_cf.name,
                "/" + file_meta_from_cf.relative_filename);
      ASSERT_EQ(file_meta_from_cf.directory, file_meta_from_cf.db_path);
    }

    ASSERT_EQ(level_meta_from_cf.size, level_size);
    cf_size += level_size;
  }

  ASSERT_EQ(cf_meta.file_count, file_count);
  ASSERT_EQ(cf_meta.size, cf_size);
}

void CheckLiveFilesMeta(
    const std::vector<LiveFileMetaData>& live_file_meta,
    const std::vector<std::vector<FileMetaData>>& files_by_level) {
  size_t total_file_count = 0;
  for (const auto& f : files_by_level) {
    total_file_count += f.size();
  }

  ASSERT_EQ(live_file_meta.size(), total_file_count);

  int level = 0;
  int i = 0;

  for (const auto& meta : live_file_meta) {
    if (level != meta.level) {
      level = meta.level;
      i = 0;
    }

    ASSERT_LT(i, files_by_level[level].size());

    const auto& expected_meta = files_by_level[level][i];

    ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName);
    ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber());
    ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name));
    ASSERT_EQ(meta.size, expected_meta.fd.file_size);
    ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno);
    ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno);
    ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString());
    ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
    ASSERT_EQ(meta.oldest_blob_file_number,
              expected_meta.oldest_blob_file_number);

    // More from FileStorageInfo
    ASSERT_EQ(meta.file_type, kTableFile);
    ASSERT_EQ(meta.name, "/" + meta.relative_filename);
    ASSERT_EQ(meta.directory, meta.db_path);

    ++i;
  }
}

#ifndef ROCKSDB_LITE
void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number,
                 uint64_t total_blob_count, uint64_t total_blob_bytes,
                 const std::string& checksum_method,
                 const std::string& checksum_value,
                 uint64_t garbage_blob_count = 0,
                 uint64_t garbage_blob_bytes = 0) {
  ColumnFamilyData* cfd =
      (static_cast<const ColumnFamilyHandleImpl*>(cfh))->cfd();
  assert(cfd);

  Version* const version = cfd->current();
  assert(version);

  VersionStorageInfo* const storage_info = version->storage_info();
  assert(storage_info);

  // Add a live blob file.

  auto shared_meta = SharedBlobFileMetaData::Create(
      blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
      checksum_value);

  auto meta = BlobFileMetaData::Create(std::move(shared_meta),
                                       BlobFileMetaData::LinkedSsts(),
                                       garbage_blob_count, garbage_blob_bytes);

  storage_info->AddBlobFile(std::move(meta));
}

static void CheckBlobMetaData(
    const BlobMetaData& bmd, uint64_t blob_file_number,
    uint64_t total_blob_count, uint64_t total_blob_bytes,
    const std::string& checksum_method, const std::string& checksum_value,
    uint64_t garbage_blob_count = 0, uint64_t garbage_blob_bytes = 0) {
  ASSERT_EQ(bmd.blob_file_number, blob_file_number);
  ASSERT_EQ(bmd.blob_file_name, BlobFileName("", blob_file_number));
  ASSERT_EQ(bmd.blob_file_size,
            total_blob_bytes + BlobLogHeader::kSize + BlobLogFooter::kSize);

  ASSERT_EQ(bmd.total_blob_count, total_blob_count);
  ASSERT_EQ(bmd.total_blob_bytes, total_blob_bytes);
  ASSERT_EQ(bmd.garbage_blob_count, garbage_blob_count);
  ASSERT_EQ(bmd.garbage_blob_bytes, garbage_blob_bytes);
  ASSERT_EQ(bmd.checksum_method, checksum_method);
  ASSERT_EQ(bmd.checksum_value, checksum_value);
}

TEST_F(DBTest, MetaDataTest) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.disable_auto_compactions = true;

  int64_t temp_time = 0;
  options.env->GetCurrentTime(&temp_time);
  uint64_t start_time = static_cast<uint64_t>(temp_time);

  DestroyAndReopen(options);

  Random rnd(301);
  int key_index = 0;
  for (int i = 0; i < 100; ++i) {
    // Add a single blob reference to each file
    std::string blob_index;
    BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
                          /* offset */ 1234, /* size */ 5678, kNoCompression);

    WriteBatch batch;
    ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
                                               blob_index));
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));

    ++key_index;

    // Fill up the rest of the file with random values.
    GenerateNewFile(&rnd, &key_index, /* nowait */ true);

    ASSERT_OK(Flush());
  }

  std::vector<std::vector<FileMetaData>> files_by_level;
  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);

  options.env->GetCurrentTime(&temp_time);
  uint64_t end_time = static_cast<uint64_t>(temp_time);

  ColumnFamilyMetaData cf_meta;
  db_->GetColumnFamilyMetaData(&cf_meta);
  CheckColumnFamilyMeta(cf_meta, kDefaultColumnFamilyName, files_by_level,
                        start_time, end_time);
  std::vector<LiveFileMetaData> live_file_meta;
  db_->GetLiveFilesMetaData(&live_file_meta);
  CheckLiveFilesMeta(live_file_meta, files_by_level);
}

TEST_F(DBTest, AllMetaDataTest) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.disable_auto_compactions = true;
  DestroyAndReopen(options);
  CreateAndReopenWithCF({"pikachu"}, options);

  constexpr uint64_t blob_file_number = 234;
  constexpr uint64_t total_blob_count = 555;
  constexpr uint64_t total_blob_bytes = 66666;
  constexpr char checksum_method[] = "CRC32";
  constexpr char checksum_value[] = "\x3d\x87\xff\x57";

  int64_t temp_time = 0;
  options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
  uint64_t start_time = static_cast<uint64_t>(temp_time);

  Random rnd(301);
  dbfull()->TEST_LockMutex();
  for (int cf = 0; cf < 2; cf++) {
    AddBlobFile(handles_[cf], blob_file_number * (cf + 1),
                total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
                checksum_method, checksum_value);
  }
  dbfull()->TEST_UnlockMutex();

  std::vector<ColumnFamilyMetaData> all_meta;
  db_->GetAllColumnFamilyMetaData(&all_meta);

  std::vector<std::vector<FileMetaData>> default_files_by_level;
  std::vector<std::vector<FileMetaData>> pikachu_files_by_level;
  dbfull()->TEST_GetFilesMetaData(handles_[0], &default_files_by_level);
  dbfull()->TEST_GetFilesMetaData(handles_[1], &pikachu_files_by_level);

  options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
  uint64_t end_time = static_cast<uint64_t>(temp_time);

  ASSERT_EQ(all_meta.size(), 2);
  for (int cf = 0; cf < 2; cf++) {
    const auto& cfmd = all_meta[cf];
    if (cf == 0) {
      CheckColumnFamilyMeta(cfmd, "default", default_files_by_level, start_time,
                            end_time);
    } else {
      CheckColumnFamilyMeta(cfmd, "pikachu", pikachu_files_by_level, start_time,
                            end_time);
    }
    ASSERT_EQ(cfmd.blob_files.size(), 1U);
    const auto& bmd = cfmd.blob_files[0];
    ASSERT_EQ(cfmd.blob_file_count, 1U);
    ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
    ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
    CheckBlobMetaData(bmd, blob_file_number * (cf + 1),
                      total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
                      checksum_method, checksum_value);
  }
}

namespace {
void MinLevelHelper(DBTest* self, Options& options) {
  Random rnd(301);

  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
       num++) {
    std::vector<std::string> values;
    // Write 120KB (12 values, each 10K)
    for (int i = 0; i < 12; i++) {
      values.push_back(rnd.RandomString(10000));
      ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
    }
    ASSERT_OK(self->dbfull()->TEST_WaitForFlushMemTable());
    ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
  }

  // Generate one more file in level-0, which should trigger level-0 compaction.
  std::vector<std::string> values;
  for (int i = 0; i < 12; i++) {
    values.push_back(rnd.RandomString(10000));
    ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
  }
  ASSERT_OK(self->dbfull()->TEST_WaitForCompact());

  ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
  ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
}

// Returns false if the calling test should be skipped.
bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
                        int lev, int strategy) {
  fprintf(stderr,
          "Test with compression options : window_bits = %d, level = %d, "
          "strategy = %d\n",
          wbits, lev, strategy);
  options.write_buffer_size = 100 << 10;  // 100KB
  options.arena_block_size = 4096;
  options.num_levels = 3;
  options.level0_file_num_compaction_trigger = 3;
  options.create_if_missing = true;

  if (Snappy_Supported()) {
    type = kSnappyCompression;
    fprintf(stderr, "using snappy\n");
  } else if (Zlib_Supported()) {
    type = kZlibCompression;
    fprintf(stderr, "using zlib\n");
  } else if (BZip2_Supported()) {
    type = kBZip2Compression;
    fprintf(stderr, "using bzip2\n");
  } else if (LZ4_Supported()) {
    type = kLZ4Compression;
    fprintf(stderr, "using lz4\n");
  } else if (XPRESS_Supported()) {
    type = kXpressCompression;
    fprintf(stderr, "using xpress\n");
  } else if (ZSTD_Supported()) {
    type = kZSTD;
    fprintf(stderr, "using ZSTD\n");
  } else {
    fprintf(stderr, "skipping test, compression disabled\n");
    return false;
  }
  options.compression_per_level.resize(options.num_levels);

  // do not compress L0
  for (int i = 0; i < 1; i++) {
    options.compression_per_level[i] = kNoCompression;
  }
  for (int i = 1; i < options.num_levels; i++) {
    options.compression_per_level[i] = type;
  }
  return true;
}
}  // anonymous namespace

TEST_F(DBTest, MinLevelToCompress1) {
  Options options = CurrentOptions();
  CompressionType type = kSnappyCompression;
  if (!MinLevelToCompress(type, options, -14, -1, 0)) {
    return;
  }
  Reopen(options);
  MinLevelHelper(this, options);

  // do not compress L0 and L1
  for (int i = 0; i < 2; i++) {
    options.compression_per_level[i] = kNoCompression;
  }
  for (int i = 2; i < options.num_levels; i++) {
    options.compression_per_level[i] = type;
  }
  DestroyAndReopen(options);
  MinLevelHelper(this, options);
}

TEST_F(DBTest, MinLevelToCompress2) {
  Options options = CurrentOptions();
  CompressionType type = kSnappyCompression;
  if (!MinLevelToCompress(type, options, 15, -1, 0)) {
    return;
  }
  Reopen(options);
  MinLevelHelper(this, options);

  // do not compress L0 and L1
  for (int i = 0; i < 2; i++) {
    options.compression_per_level[i] = kNoCompression;
  }
  for (int i = 2; i < options.num_levels; i++) {
    options.compression_per_level[i] = type;
  }
  DestroyAndReopen(options);
  MinLevelHelper(this, options);
}

// This test may fail because of a legitimate case in which multiple L0 files
// are trivially moved to L1.
TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) {
  do {
    Options options = CurrentOptions();
    options.env = env_;
    options.write_buffer_size = 100000;  // Small write buffer
    CreateAndReopenWithCF({"pikachu"}, options);

    // We must have at most one file per level except for level-0,
    // which may have up to kL0_StopWritesTrigger files.
    const int kMaxFiles =
        options.num_levels + options.level0_stop_writes_trigger;

    Random rnd(301);
    std::string value =
        rnd.RandomString(static_cast<int>(2 * options.write_buffer_size));
    for (int i = 0; i < 5 * kMaxFiles; i++) {
      ASSERT_OK(Put(1, "key", value));
      ASSERT_LE(TotalTableFiles(1), kMaxFiles);
    }
  } while (ChangeCompactOptions());
}
#endif  // ROCKSDB_LITE

#ifndef ROCKSDB_LITE
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
  bool result = (val >= low) && (val <= high);
  if (!result) {
    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
            (unsigned long long)(val), (unsigned long long)(low),
            (unsigned long long)(high));
  }
  return result;
}

TEST_F(DBTest, ApproximateSizesMemTable) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100000000;  // Large write buffer
  options.compression = kNoCompression;
  options.create_if_missing = true;
  DestroyAndReopen(options);
  auto default_cf = db_->DefaultColumnFamily();

  const int N = 128;
  Random rnd(301);
  for (int i = 0; i < N; i++) {
    ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
  }

  uint64_t size;
  std::string start = Key(50);
  std::string end = Key(60);
  Range r(start, end);
  SizeApproximationOptions size_approx_options;
  size_approx_options.include_memtables = true;
  size_approx_options.include_files = true;
  ASSERT_OK(
      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
  ASSERT_GT(size, 6000);
  ASSERT_LT(size, 204800);
  // Zero if not including mem table
  ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
  ASSERT_EQ(size, 0);

  start = Key(500);
  end = Key(600);
  r = Range(start, end);
  ASSERT_OK(
      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
  ASSERT_EQ(size, 0);

  for (int i = 0; i < N; i++) {
    ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
  }

  start = Key(500);
  end = Key(600);
  r = Range(start, end);
  ASSERT_OK(
      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
  ASSERT_EQ(size, 0);

  start = Key(100);
  end = Key(1020);
  r = Range(start, end);
  ASSERT_OK(
      db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
  ASSERT_GT(size, 6000);

  options.max_write_buffer_number = 8;
  options.min_write_buffer_number_to_merge = 5;
  options.write_buffer_size = 1024 * N;  // Not very large
  DestroyAndReopen(options);
  default_cf = db_->DefaultColumnFamily();

  int keys[N * 3];
  for (int i = 0; i < N; i++) {
    keys[i * 3] = i * 5;
    keys[i * 3 + 1] = i * 5 + 1;
    keys[i * 3 + 2] = i * 5 + 2;
  }
1637 // MemTable entry counting is an estimate and can vary greatly depending on
1638 // layout. Thus, use a deterministic seed for test stability.
1639 RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
1640
1641 for (int i = 0; i < N * 3; i++) {
1642 ASSERT_OK(Put(Key(keys[i] + 1000), rnd.RandomString(1024)));
1643 }
1644
1645 start = Key(100);
1646 end = Key(300);
1647 r = Range(start, end);
1648 ASSERT_OK(
1649 db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
1650 ASSERT_EQ(size, 0);
1651
1652 start = Key(1050);
1653 end = Key(1080);
1654 r = Range(start, end);
1655 ASSERT_OK(
1656 db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
1657 ASSERT_GT(size, 6000);
1658
1659 start = Key(2100);
1660 end = Key(2300);
1661 r = Range(start, end);
1662 ASSERT_OK(
1663 db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
1664 ASSERT_EQ(size, 0);
1665
1666 start = Key(1050);
1667 end = Key(1080);
1668 r = Range(start, end);
1669 uint64_t size_with_mt, size_without_mt;
1670 ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
1671 &size_with_mt));
1672 ASSERT_GT(size_with_mt, 6000);
1673 ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
1674 ASSERT_EQ(size_without_mt, 0);
1675
1676 ASSERT_OK(Flush());
1677
1678 for (int i = 0; i < N; i++) {
1679 ASSERT_OK(Put(Key(i + 1000), rnd.RandomString(1024)));
1680 }
1681
1682 start = Key(1050);
1683 end = Key(1080);
1684 r = Range(start, end);
1685 ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
1686 &size_with_mt));
1687 ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
1688 ASSERT_GT(size_with_mt, size_without_mt);
1689 ASSERT_GT(size_without_mt, 6000);
1690
1691 // Check that include_memtables flag works as expected
1692 size_approx_options.include_memtables = false;
1693 ASSERT_OK(
1694 db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
1695 ASSERT_EQ(size, size_without_mt);
1696
1697 // Check that files_size_error_margin works as expected when the heuristic
1698 // conditions are not met
1699 start = Key(1);
1700 end = Key(1000 + N - 2);
1701 r = Range(start, end);
1702 size_approx_options.files_size_error_margin = -1.0; // disabled
1703 ASSERT_OK(
1704 db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
1705 uint64_t size2;
1706 size_approx_options.files_size_error_margin = 0.5; // enabled, but not used
1707 ASSERT_OK(
1708 db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2));
1709 ASSERT_EQ(size, size2);
1710 }
1711
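// files_size_error_margin trades accuracy for speed: roughly speaking, when
// it is set, GetApproximateSizes may attribute whole files to the range
// (instead of seeking into their index blocks) as long as the resulting error
// stays within the margin. -1.0 disables the heuristic. The test below checks
// that the fast estimate stays within the requested margin of the precise one.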
1712 TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
1713 // Roughly 4 keys per data block, 1000 keys per file,
1714 // with a filter substantially larger than a data block
1715 BlockBasedTableOptions table_options;
1716 table_options.filter_policy.reset(NewBloomFilterPolicy(16));
1717 table_options.block_size = 100;
1718 Options options = CurrentOptions();
1719 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
1720 options.write_buffer_size = 24 * 1024;
1721 options.compression = kNoCompression;
1722 options.create_if_missing = true;
1723 options.target_file_size_base = 24 * 1024;
1724 DestroyAndReopen(options);
1725 const auto default_cf = db_->DefaultColumnFamily();
1726
1727 const int N = 64000;
1728 Random rnd(301);
1729 for (int i = 0; i < N; i++) {
1730 ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
1731 }
1732 // Flush everything to files
1733 ASSERT_OK(Flush());
1734 // Compact the entire key space into the next level
1735 ASSERT_OK(
1736 db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr));
1737
1738 // Write more keys
1739 for (int i = N; i < (N + N / 4); i++) {
1740 ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
1741 }
1742 // Flush everything to files again
1743 ASSERT_OK(Flush());
1744
1745 // Wait for compaction to finish
1746 ASSERT_OK(dbfull()->TEST_WaitForCompact());
1747
1748 {
1749 const std::string start = Key(0);
1750 const std::string end = Key(2 * N);
1751 const Range r(start, end);
1752
1753 SizeApproximationOptions size_approx_options;
1754 size_approx_options.include_memtables = false;
1755 size_approx_options.include_files = true;
1756 size_approx_options.files_size_error_margin = -1.0; // disabled
1757
1758 // Get the precise size without any approximation heuristic
1759 uint64_t size;
1760 ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
1761 &size));
1762 ASSERT_NE(size, 0);
1763
1764 // Get the size with an approximation heuristic
1765 uint64_t size2;
1766 const double error_margin = 0.2;
1767 size_approx_options.files_size_error_margin = error_margin;
1768 ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
1769 &size2));
1770 ASSERT_LT(size2, size * (1 + error_margin));
1771 ASSERT_GT(size2, size * (1 - error_margin));
1772 }
1773
1774 {
1775 // Ensure that metadata is not falsely attributed only to the last data in
1776 // the file. (In some applications, filters can be a large portion of data
1777 // size.)
1778 // Perform many queries over a small range, enough to ensure crossing a
1779 // file boundary, and make sure we never see a spike for the large filter.
1780 for (int i = 0; i < 3000; i += 10) {
1781 const std::string start = Key(i);
1782 const std::string end = Key(i + 11); // overlap by 1 key
1783 const Range r(start, end);
1784 uint64_t size;
1785 ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
1786 ASSERT_LE(size, 11 * 100);
1787 }
1788 }
1789 }
1790
1791 TEST_F(DBTest, GetApproximateMemTableStats) {
1792 Options options = CurrentOptions();
1793 options.write_buffer_size = 100000000;
1794 options.compression = kNoCompression;
1795 options.create_if_missing = true;
1796 DestroyAndReopen(options);
1797
1798 const int N = 128;
1799 Random rnd(301);
1800 for (int i = 0; i < N; i++) {
1801 ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
1802 }
1803
1804 uint64_t count;
1805 uint64_t size;
1806
1807 std::string start = Key(50);
1808 std::string end = Key(60);
1809 Range r(start, end);
1810 db_->GetApproximateMemTableStats(r, &count, &size);
1811 ASSERT_GT(count, 0);
1812 ASSERT_LE(count, N);
1813 ASSERT_GT(size, 6000);
1814 ASSERT_LT(size, 204800);
1815
1816 start = Key(500);
1817 end = Key(600);
1818 r = Range(start, end);
1819 db_->GetApproximateMemTableStats(r, &count, &size);
1820 ASSERT_EQ(count, 0);
1821 ASSERT_EQ(size, 0);
1822
1823 ASSERT_OK(Flush());
1824
1825 start = Key(50);
1826 end = Key(60);
1827 r = Range(start, end);
1828 db_->GetApproximateMemTableStats(r, &count, &size);
1829 ASSERT_EQ(count, 0);
1830 ASSERT_EQ(size, 0);
1831
1832 for (int i = 0; i < N; i++) {
1833 ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
1834 }
1835
1836 start = Key(100);
1837 end = Key(1020);
1838 r = Range(start, end);
1839 db_->GetApproximateMemTableStats(r, &count, &size);
1840 ASSERT_GT(count, 20);
1841 ASSERT_GT(size, 6000);
1842 }
1843
1844 TEST_F(DBTest, ApproximateSizes) {
1845 do {
1846 Options options = CurrentOptions();
1847 options.write_buffer_size = 100000000; // Large write buffer
1848 options.compression = kNoCompression;
1849 options.create_if_missing = true;
1850 DestroyAndReopen(options);
1851 CreateAndReopenWithCF({"pikachu"}, options);
1852
1853 uint64_t size;
1854 ASSERT_OK(Size("", "xyz", 1, &size));
1855 ASSERT_TRUE(Between(size, 0, 0));
1856 ReopenWithColumnFamilies({"default", "pikachu"}, options);
1857 ASSERT_OK(Size("", "xyz", 1, &size));
1858 ASSERT_TRUE(Between(size, 0, 0));
1859
1860 // Write 8MB (80 values, each 100K)
1861 ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
1862 const int N = 80;
1863 static const int S1 = 100000;
1864 static const int S2 = 105000; // Allow some expansion from metadata
1865 Random rnd(301);
1866 for (int i = 0; i < N; i++) {
1867 ASSERT_OK(Put(1, Key(i), rnd.RandomString(S1)));
1868 }
1869
1870 // 0 because GetApproximateSizes() does not account for memtable space
1871 ASSERT_OK(Size("", Key(50), 1, &size));
1872 ASSERT_TRUE(Between(size, 0, 0));
1873
1874 // Check sizes across recovery by reopening a few times
1875 for (int run = 0; run < 3; run++) {
1876 ReopenWithColumnFamilies({"default", "pikachu"}, options);
1877
1878 for (int compact_start = 0; compact_start < N; compact_start += 10) {
1879 for (int i = 0; i < N; i += 10) {
1880 ASSERT_OK(Size("", Key(i), 1, &size));
1881 ASSERT_TRUE(Between(size, S1 * i, S2 * i));
1882 ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size));
1883 ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1)));
1884 ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size));
1885 ASSERT_TRUE(Between(size, S1 * 10, S2 * 10));
1886 }
1887 ASSERT_OK(Size("", Key(50), 1, &size));
1888 ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
1889 ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size));
1890 ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
1891
1892 std::string cstart_str = Key(compact_start);
1893 std::string cend_str = Key(compact_start + 9);
1894 Slice cstart = cstart_str;
1895 Slice cend = cend_str;
1896 ASSERT_OK(dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]));
1897 }
1898
1899 ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
1900 ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
1901 }
1902 // ApproximateOffsetOf() is not yet implemented in plain table format.
1903 } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
1904 kSkipPlainTable | kSkipHashIndex));
1905 }
1906
1907 TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
1908 do {
1909 Options options = CurrentOptions();
1910 options.compression = kNoCompression;
1911 CreateAndReopenWithCF({"pikachu"}, options);
1912
1913 Random rnd(301);
1914 std::string big1 = rnd.RandomString(100000);
1915 ASSERT_OK(Put(1, Key(0), rnd.RandomString(10000)));
1916 ASSERT_OK(Put(1, Key(1), rnd.RandomString(10000)));
1917 ASSERT_OK(Put(1, Key(2), big1));
1918 ASSERT_OK(Put(1, Key(3), rnd.RandomString(10000)));
1919 ASSERT_OK(Put(1, Key(4), big1));
1920 ASSERT_OK(Put(1, Key(5), rnd.RandomString(10000)));
1921 ASSERT_OK(Put(1, Key(6), rnd.RandomString(300000)));
1922 ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000)));
1923
1924 // Check sizes across recovery by reopening a few times
1925 uint64_t size;
1926 for (int run = 0; run < 3; run++) {
1927 ReopenWithColumnFamilies({"default", "pikachu"}, options);
1928
1929 ASSERT_OK(Size("", Key(0), 1, &size));
1930 ASSERT_TRUE(Between(size, 0, 0));
1931 ASSERT_OK(Size("", Key(1), 1, &size));
1932 ASSERT_TRUE(Between(size, 10000, 11000));
1933 ASSERT_OK(Size("", Key(2), 1, &size));
1934 ASSERT_TRUE(Between(size, 20000, 21000));
1935 ASSERT_OK(Size("", Key(3), 1, &size));
1936 ASSERT_TRUE(Between(size, 120000, 121000));
1937 ASSERT_OK(Size("", Key(4), 1, &size));
1938 ASSERT_TRUE(Between(size, 130000, 131000));
1939 ASSERT_OK(Size("", Key(5), 1, &size));
1940 ASSERT_TRUE(Between(size, 230000, 232000));
1941 ASSERT_OK(Size("", Key(6), 1, &size));
1942 ASSERT_TRUE(Between(size, 240000, 242000));
1943 // Ensure some overhead is accounted for, even without including all
1944 ASSERT_OK(Size("", Key(7), 1, &size));
1945 ASSERT_TRUE(Between(size, 540500, 545000));
1946 ASSERT_OK(Size("", Key(8), 1, &size));
1947 ASSERT_TRUE(Between(size, 550500, 555000));
1948
1949 ASSERT_OK(Size(Key(3), Key(5), 1, &size));
1950 ASSERT_TRUE(Between(size, 110100, 111000));
1951
1952 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
1953 }
1954 // ApproximateOffsetOf() is not yet implemented in plain table format.
1955 } while (ChangeOptions(kSkipPlainTable));
1956 }
1957 #endif // ROCKSDB_LITE
1958
1959 #ifndef ROCKSDB_LITE
1960 TEST_F(DBTest, Snapshot) {
1961 env_->SetMockSleep();
1962 anon::OptionsOverride options_override;
1963 options_override.skip_policy = kSkipNoSnapshot;
1964 do {
1965 CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
1966 ASSERT_OK(Put(0, "foo", "0v1"));
1967 ASSERT_OK(Put(1, "foo", "1v1"));
1968
1969 const Snapshot* s1 = db_->GetSnapshot();
1970 ASSERT_EQ(1U, GetNumSnapshots());
1971 uint64_t time_snap1 = GetTimeOldestSnapshots();
1972 ASSERT_GT(time_snap1, 0U);
1973 ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
1974 ASSERT_EQ(GetTimeOldestSnapshots(),
1975 static_cast<uint64_t>(s1->GetUnixTime()));
1976 ASSERT_OK(Put(0, "foo", "0v2"));
1977 ASSERT_OK(Put(1, "foo", "1v2"));
1978
1979 env_->MockSleepForSeconds(1);
1980
1981 const Snapshot* s2 = db_->GetSnapshot();
1982 ASSERT_EQ(2U, GetNumSnapshots());
1983 ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
1984 ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
1985 ASSERT_EQ(GetTimeOldestSnapshots(),
1986 static_cast<uint64_t>(s1->GetUnixTime()));
1987 ASSERT_OK(Put(0, "foo", "0v3"));
1988 ASSERT_OK(Put(1, "foo", "1v3"));
1989
1990 {
1991 ManagedSnapshot s3(db_);
1992 ASSERT_EQ(3U, GetNumSnapshots());
1993 ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
1994 ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
1995 ASSERT_EQ(GetTimeOldestSnapshots(),
1996 static_cast<uint64_t>(s1->GetUnixTime()));
1997
1998 ASSERT_OK(Put(0, "foo", "0v4"));
1999 ASSERT_OK(Put(1, "foo", "1v4"));
2000 ASSERT_EQ("0v1", Get(0, "foo", s1));
2001 ASSERT_EQ("1v1", Get(1, "foo", s1));
2002 ASSERT_EQ("0v2", Get(0, "foo", s2));
2003 ASSERT_EQ("1v2", Get(1, "foo", s2));
2004 ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
2005 ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
2006 ASSERT_EQ("0v4", Get(0, "foo"));
2007 ASSERT_EQ("1v4", Get(1, "foo"));
2008 }
2009
2010 ASSERT_EQ(2U, GetNumSnapshots());
2011 ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
2012 ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
2013 ASSERT_EQ(GetTimeOldestSnapshots(),
2014 static_cast<uint64_t>(s1->GetUnixTime()));
2015 ASSERT_EQ("0v1", Get(0, "foo", s1));
2016 ASSERT_EQ("1v1", Get(1, "foo", s1));
2017 ASSERT_EQ("0v2", Get(0, "foo", s2));
2018 ASSERT_EQ("1v2", Get(1, "foo", s2));
2019 ASSERT_EQ("0v4", Get(0, "foo"));
2020 ASSERT_EQ("1v4", Get(1, "foo"));
2021
2022 db_->ReleaseSnapshot(s1);
2023 ASSERT_EQ("0v2", Get(0, "foo", s2));
2024 ASSERT_EQ("1v2", Get(1, "foo", s2));
2025 ASSERT_EQ("0v4", Get(0, "foo"));
2026 ASSERT_EQ("1v4", Get(1, "foo"));
2027 ASSERT_EQ(1U, GetNumSnapshots());
2028 ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
2029 ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
2030 ASSERT_EQ(GetTimeOldestSnapshots(),
2031 static_cast<uint64_t>(s2->GetUnixTime()));
2032
2033 db_->ReleaseSnapshot(s2);
2034 ASSERT_EQ(0U, GetNumSnapshots());
2035 ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
2036 ASSERT_EQ("0v4", Get(0, "foo"));
2037 ASSERT_EQ("1v4", Get(1, "foo"));
2038 } while (ChangeOptions());
2039 }
2040
2041 TEST_F(DBTest, HiddenValuesAreRemoved) {
2042 anon::OptionsOverride options_override;
2043 options_override.skip_policy = kSkipNoSnapshot;
2044 uint64_t size;
2045 do {
2046 Options options = CurrentOptions(options_override);
2047 CreateAndReopenWithCF({"pikachu"}, options);
2048 Random rnd(301);
2049 FillLevels("a", "z", 1);
2050
2051 std::string big = rnd.RandomString(50000);
2052 ASSERT_OK(Put(1, "foo", big));
2053 ASSERT_OK(Put(1, "pastfoo", "v"));
2054 const Snapshot* snapshot = db_->GetSnapshot();
2055 ASSERT_OK(Put(1, "foo", "tiny"));
2056 ASSERT_OK(Put(1, "pastfoo2", "v2")); // Advance sequence number one more
2057
2058 ASSERT_OK(Flush(1));
2059 ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
2060
2061 ASSERT_EQ(big, Get(1, "foo", snapshot));
2062 ASSERT_OK(Size("", "pastfoo", 1, &size));
2063 ASSERT_TRUE(Between(size, 50000, 60000));
2064 db_->ReleaseSnapshot(snapshot);
2065 ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
2066 Slice x("x");
2067 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]));
2068 ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
2069 ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
2070 ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
2071 ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]));
2072 ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
2073
2074 ASSERT_OK(Size("", "pastfoo", 1, &size));
2075 ASSERT_TRUE(Between(size, 0, 1000));
2076 // ApproximateOffsetOf() is not yet implemented in plain table format,
2077 // which is used by Size().
2078 } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
2079 kSkipPlainTable));
2080 }
2081 #endif // ROCKSDB_LITE
2082
2083 TEST_F(DBTest, UnremovableSingleDelete) {
2084 // If we compact:
2085 //
2086 // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2)
2087 //
2088 // We do not want to end up with:
2089 //
2090 // Put(A, v1) Snapshot Put(A, v2)
2091 //
2092 // Because a subsequent SingleDelete(A) would delete the Put(A, v2)
2093 // but not Put(A, v1), so Get(A) would return v1.
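// In other words, compaction must preserve the SingleDelete marker (shown as
// SDEL below) for as long as the snapshot can still see v1; the
// AllEntriesFor() assertions verify exactly that.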
2094 anon::OptionsOverride options_override;
2095 options_override.skip_policy = kSkipNoSnapshot;
2096 do {
2097 Options options = CurrentOptions(options_override);
2098 options.disable_auto_compactions = true;
2099 CreateAndReopenWithCF({"pikachu"}, options);
2100
2101 ASSERT_OK(Put(1, "foo", "first"));
2102 const Snapshot* snapshot = db_->GetSnapshot();
2103 ASSERT_OK(SingleDelete(1, "foo"));
2104 ASSERT_OK(Put(1, "foo", "second"));
2105 ASSERT_OK(Flush(1));
2106
2107 ASSERT_EQ("first", Get(1, "foo", snapshot));
2108 ASSERT_EQ("second", Get(1, "foo"));
2109
2110 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
2111 nullptr, nullptr));
2112 ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1));
2113
2114 ASSERT_OK(SingleDelete(1, "foo"));
2115
2116 ASSERT_EQ("first", Get(1, "foo", snapshot));
2117 ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
2118
2119 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
2120 nullptr, nullptr));
2121
2122 ASSERT_EQ("first", Get(1, "foo", snapshot));
2123 ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
2124 db_->ReleaseSnapshot(snapshot);
2125 // Skip FIFO and universal compaction because they do not apply to the test
2126 // case. Skip MergePut because single delete does not get removed when it
2127 // encounters a merge.
2128 } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
2129 kSkipMergePut));
2130 }
2131
2132 #ifndef ROCKSDB_LITE
2133 TEST_F(DBTest, DeletionMarkers1) {
2134 Options options = CurrentOptions();
2135 CreateAndReopenWithCF({"pikachu"}, options);
2136 ASSERT_OK(Put(1, "foo", "v1"));
2137 ASSERT_OK(Flush(1));
2138 const int last = 2;
2139 MoveFilesToLevel(last, 1);
2140 // foo => v1 is now in last level
2141 ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
2142
2143 // Place a table at level last-1 to prevent merging with preceding mutation
2144 ASSERT_OK(Put(1, "a", "begin"));
2145 ASSERT_OK(Put(1, "z", "end"));
2146 ASSERT_OK(Flush(1));
2147 MoveFilesToLevel(last - 1, 1);
2148 ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
2149 ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
2150
2151 ASSERT_OK(Delete(1, "foo"));
2152 ASSERT_OK(Put(1, "foo", "v2"));
2153 ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
2154 ASSERT_OK(Flush(1)); // Moves to level last-2
2155 ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
2156 Slice z("z");
2157 ASSERT_OK(dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]));
2158 // DEL eliminated, but v1 remains because we aren't compacting that level
2159 // (DEL can be eliminated because v2 hides v1).
2160 ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
2161 ASSERT_OK(
2162 dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
2163 // Merging last-1 w/ last, so we are the base level for "foo", so
2164 // DEL is removed (as is v1).
2165 ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
2166 }
2167
2168 TEST_F(DBTest, DeletionMarkers2) {
2169 Options options = CurrentOptions();
2170 CreateAndReopenWithCF({"pikachu"}, options);
2171 ASSERT_OK(Put(1, "foo", "v1"));
2172 ASSERT_OK(Flush(1));
2173 const int last = 2;
2174 MoveFilesToLevel(last, 1);
2175 // foo => v1 is now in last level
2176 ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
2177
2178 // Place a table at level last-1 to prevent merging with preceding mutation
2179 ASSERT_OK(Put(1, "a", "begin"));
2180 ASSERT_OK(Put(1, "z", "end"));
2181 ASSERT_OK(Flush(1));
2182 MoveFilesToLevel(last - 1, 1);
2183 ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
2184 ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
2185
2186 ASSERT_OK(Delete(1, "foo"));
2187 ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
2188 ASSERT_OK(Flush(1)); // Moves to level last-2
2189 ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
2190 ASSERT_OK(
2191 dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]));
2192 // DEL kept: "last" file overlaps
2193 ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
2194 ASSERT_OK(
2195 dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
2196 // Merging last-1 w/ last, so we are the base level for "foo", so
2197 // DEL is removed (as is v1).
2198 ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
2199 }
2200
2201 TEST_F(DBTest, OverlapInLevel0) {
2202 do {
2203 Options options = CurrentOptions();
2204 CreateAndReopenWithCF({"pikachu"}, options);
2205
2206 // Fill levels 1 and 2 to disable the pushing of new memtables to
2207 // levels > 0.
2208 ASSERT_OK(Put(1, "100", "v100"));
2209 ASSERT_OK(Put(1, "999", "v999"));
2210 ASSERT_OK(Flush(1));
2211 MoveFilesToLevel(2, 1);
2212 ASSERT_OK(Delete(1, "100"));
2213 ASSERT_OK(Delete(1, "999"));
2214 ASSERT_OK(Flush(1));
2215 MoveFilesToLevel(1, 1);
2216 ASSERT_EQ("0,1,1", FilesPerLevel(1));
2217
2218 // Make files spanning the following ranges in level-0:
2219 // files[0] 200 .. 900
2220 // files[1] 300 .. 500
2221 // Note that files are sorted by smallest key.
2222 ASSERT_OK(Put(1, "300", "v300"));
2223 ASSERT_OK(Put(1, "500", "v500"));
2224 ASSERT_OK(Flush(1));
2225 ASSERT_OK(Put(1, "200", "v200"));
2226 ASSERT_OK(Put(1, "600", "v600"));
2227 ASSERT_OK(Put(1, "900", "v900"));
2228 ASSERT_OK(Flush(1));
2229 ASSERT_EQ("2,1,1", FilesPerLevel(1));
2230
2231 // BEGIN addition to existing test
2232 // Take this opportunity to verify SST unique ids (including Plain table)
2233 TablePropertiesCollection tbc;
2234 ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc));
2235 VerifySstUniqueIds(tbc);
2236 // END addition to existing test
2237
2238 // Compact away the placeholder files we created initially
2239 ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
2240 ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]));
2241 ASSERT_EQ("2", FilesPerLevel(1));
2242
2243 // Do a memtable compaction. Before the bug fix, the compaction would
2244 // not detect the overlap with level-0 files and would incorrectly place
2245 // the deletion in a deeper level.
2246 ASSERT_OK(Delete(1, "600"));
2247 ASSERT_OK(Flush(1));
2248 ASSERT_EQ("3", FilesPerLevel(1));
2249 ASSERT_EQ("NOT_FOUND", Get(1, "600"));
2250 } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
2251 }
2252 #endif // ROCKSDB_LITE
2253
2254 TEST_F(DBTest, ComparatorCheck) {
2255 class NewComparator : public Comparator {
2256 public:
2257 const char* Name() const override { return "rocksdb.NewComparator"; }
2258 int Compare(const Slice& a, const Slice& b) const override {
2259 return BytewiseComparator()->Compare(a, b);
2260 }
2261 void FindShortestSeparator(std::string* s, const Slice& l) const override {
2262 BytewiseComparator()->FindShortestSeparator(s, l);
2263 }
2264 void FindShortSuccessor(std::string* key) const override {
2265 BytewiseComparator()->FindShortSuccessor(key);
2266 }
2267 };
2268 Options new_options, options;
2269 NewComparator cmp;
2270 do {
2271 options = CurrentOptions();
2272 CreateAndReopenWithCF({"pikachu"}, options);
2273 new_options = CurrentOptions();
2274 new_options.comparator = &cmp;
2275 // only the non-default column family has a non-matching comparator
2276 Status s = TryReopenWithColumnFamilies(
2277 {"default", "pikachu"}, std::vector<Options>({options, new_options}));
2278 ASSERT_TRUE(!s.ok());
2279 ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
2280 << s.ToString();
2281 } while (ChangeCompactOptions());
2282 }
2283
2284 TEST_F(DBTest, CustomComparator) {
2285 class NumberComparator : public Comparator {
2286 public:
2287 const char* Name() const override { return "test.NumberComparator"; }
2288 int Compare(const Slice& a, const Slice& b) const override {
2289 return ToNumber(a) - ToNumber(b);
2290 }
2291 void FindShortestSeparator(std::string* s, const Slice& l) const override {
2292 ToNumber(*s); // Check format
2293 ToNumber(l); // Check format
2294 }
2295 void FindShortSuccessor(std::string* key) const override {
2296 ToNumber(*key); // Check format
2297 }
2298
2299 private:
2300 static int ToNumber(const Slice& x) {
2301 // Check that there are no extra characters.
2302 EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
2303 << EscapeString(x);
2304 int val;
2305 char ignored;
2306 EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
2307 << EscapeString(x);
2308 return val;
2309 }
2310 };
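// Note: sscanf's %i accepts both decimal and hexadecimal input, so under this
// comparator "[20]" and "[0x14]" parse to the same number and act as aliases
// for the same key, which the Get() checks below rely on.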
2311 Options new_options;
2312 NumberComparator cmp;
2313 do {
2314 new_options = CurrentOptions();
2315 new_options.create_if_missing = true;
2316 new_options.comparator = &cmp;
2317 new_options.write_buffer_size = 4096; // Compact more often
2318 new_options.arena_block_size = 4096;
2319 new_options = CurrentOptions(new_options);
2320 DestroyAndReopen(new_options);
2321 CreateAndReopenWithCF({"pikachu"}, new_options);
2322 ASSERT_OK(Put(1, "[10]", "ten"));
2323 ASSERT_OK(Put(1, "[0x14]", "twenty"));
2324 for (int i = 0; i < 2; i++) {
2325 ASSERT_EQ("ten", Get(1, "[10]"));
2326 ASSERT_EQ("ten", Get(1, "[0xa]"));
2327 ASSERT_EQ("twenty", Get(1, "[20]"));
2328 ASSERT_EQ("twenty", Get(1, "[0x14]"));
2329 ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
2330 ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
2331 Compact(1, "[0]", "[9999]");
2332 }
2333
2334 for (int run = 0; run < 2; run++) {
2335 for (int i = 0; i < 1000; i++) {
2336 char buf[100];
2337 snprintf(buf, sizeof(buf), "[%d]", i * 10);
2338 ASSERT_OK(Put(1, buf, buf));
2339 }
2340 Compact(1, "[0]", "[1000000]");
2341 }
2342 } while (ChangeCompactOptions());
2343 }
2344
2345 TEST_F(DBTest, DBOpen_Options) {
2346 Options options = CurrentOptions();
2347 std::string dbname = test::PerThreadDBPath("db_options_test");
2348 ASSERT_OK(DestroyDB(dbname, options));
2349
2350 // Does not exist, and create_if_missing == false: error
2351 DB* db = nullptr;
2352 options.create_if_missing = false;
2353 Status s = DB::Open(options, dbname, &db);
2354 ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
2355 ASSERT_TRUE(db == nullptr);
2356
2357 // Does not exist, and create_if_missing == true: OK
2358 options.create_if_missing = true;
2359 s = DB::Open(options, dbname, &db);
2360 ASSERT_OK(s);
2361 ASSERT_TRUE(db != nullptr);
2362
2363 delete db;
2364 db = nullptr;
2365
2366 // Does exist, and error_if_exists == true: error
2367 options.create_if_missing = false;
2368 options.error_if_exists = true;
2369 s = DB::Open(options, dbname, &db);
2370 ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
2371 ASSERT_TRUE(db == nullptr);
2372
2373 // Does exist, and error_if_exists == false: OK
2374 options.create_if_missing = true;
2375 options.error_if_exists = false;
2376 s = DB::Open(options, dbname, &db);
2377 ASSERT_OK(s);
2378 ASSERT_TRUE(db != nullptr);
2379
2380 delete db;
2381 db = nullptr;
2382 }
2383
2384 TEST_F(DBTest, DBOpen_Change_NumLevels) {
2385 Options options = CurrentOptions();
2386 options.create_if_missing = true;
2387 DestroyAndReopen(options);
2388 ASSERT_TRUE(db_ != nullptr);
2389 CreateAndReopenWithCF({"pikachu"}, options);
2390
2391 ASSERT_OK(Put(1, "a", "123"));
2392 ASSERT_OK(Put(1, "b", "234"));
2393 ASSERT_OK(Flush(1));
2394 MoveFilesToLevel(3, 1);
2395 Close();
2396
2397 options.create_if_missing = false;
2398 options.num_levels = 2;
2399 Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
2400 ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
2401 ASSERT_TRUE(db_ == nullptr);
2402 }
2403
2404 TEST_F(DBTest, DestroyDBMetaDatabase) {
2405 std::string dbname = test::PerThreadDBPath("db_meta");
2406 ASSERT_OK(env_->CreateDirIfMissing(dbname));
2407 std::string metadbname = MetaDatabaseName(dbname, 0);
2408 ASSERT_OK(env_->CreateDirIfMissing(metadbname));
2409 std::string metametadbname = MetaDatabaseName(metadbname, 0);
2410 ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
2411
2412 // Destroy previous versions if they exist, the long way.
2413 Options options = CurrentOptions();
2414 ASSERT_OK(DestroyDB(metametadbname, options));
2415 ASSERT_OK(DestroyDB(metadbname, options));
2416 ASSERT_OK(DestroyDB(dbname, options));
2417
2418 // Setup databases
2419 DB* db = nullptr;
2420 ASSERT_OK(DB::Open(options, dbname, &db));
2421 delete db;
2422 db = nullptr;
2423 ASSERT_OK(DB::Open(options, metadbname, &db));
2424 delete db;
2425 db = nullptr;
2426 ASSERT_OK(DB::Open(options, metametadbname, &db));
2427 delete db;
2428 db = nullptr;
2429
2430 // Delete databases
2431 ASSERT_OK(DestroyDB(dbname, options));
2432
2433 // Check if deletion worked.
2434 options.create_if_missing = false;
2435 ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
2436 ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
2437 ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
2438 }
2439
2440 #ifndef ROCKSDB_LITE
2441 TEST_F(DBTest, SnapshotFiles) {
2442 do {
2443 Options options = CurrentOptions();
2444 options.write_buffer_size = 100000000; // Large write buffer
2445 CreateAndReopenWithCF({"pikachu"}, options);
2446
2447 Random rnd(301);
2448
2449 // Write 8MB (80 values, each 100K)
2450 ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
2451 std::vector<std::string> values;
2452 for (int i = 0; i < 80; i++) {
2453 values.push_back(rnd.RandomString(100000));
2454 ASSERT_OK(Put((i < 40), Key(i), values[i]));
2455 }
2456
2457 // assert that nothing makes it to disk yet.
2458 ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
2459
2460 // get a file snapshot
2461 uint64_t manifest_number = 0;
2462 uint64_t manifest_size = 0;
2463 std::vector<std::string> files;
2464 ASSERT_OK(dbfull()->DisableFileDeletions());
2465 ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
2466
2467 // CURRENT, MANIFEST, OPTIONS, and one *.sst file per CF (2 CFs): 5 files
2468 ASSERT_EQ(files.size(), 5U);
2469
2470 uint64_t number = 0;
2471 FileType type;
2472
2473 // copy these files to a new snapshot directory
2474 std::string snapdir = dbname_ + ".snapdir/";
2475 if (env_->FileExists(snapdir).ok()) {
2476 ASSERT_OK(DestroyDir(env_, snapdir));
2477 }
2478 ASSERT_OK(env_->CreateDir(snapdir));
2479
2480 for (size_t i = 0; i < files.size(); i++) {
2481 // our clients require that GetLiveFiles returns
2482 // files with "/" as first character!
2483 ASSERT_EQ(files[i][0], '/');
2484 std::string src = dbname_ + files[i];
2485 std::string dest = snapdir + files[i];
2486
2487 uint64_t size;
2488 ASSERT_OK(env_->GetFileSize(src, &size));
2489
2490 // record the number and the size of the
2491 // latest manifest file
2492 if (ParseFileName(files[i].substr(1), &number, &type)) {
2493 if (type == kDescriptorFile) {
2494 ASSERT_EQ(manifest_number, 0);
2495 manifest_number = number;
2496 ASSERT_GE(size, manifest_size);
2497 size = manifest_size; // copy only valid MANIFEST data
2498 }
2499 }
2500 CopyFile(src, dest, size);
2501 }
2502
2503 // release file snapshot
2504 ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false));
2505 // overwrite one key; this key should not appear in the snapshot
2506 std::vector<std::string> extras;
2507 for (unsigned int i = 0; i < 1; i++) {
2508 extras.push_back(rnd.RandomString(100000));
2509 ASSERT_OK(Put(0, Key(i), extras[i]));
2510 }
2511
2512 // verify that data in the snapshot are correct
2513 std::vector<ColumnFamilyDescriptor> column_families;
2514 column_families.emplace_back("default", ColumnFamilyOptions());
2515 column_families.emplace_back("pikachu", ColumnFamilyOptions());
2516 std::vector<ColumnFamilyHandle*> cf_handles;
2517 DB* snapdb;
2518 DBOptions opts;
2519 opts.env = env_;
2520 opts.create_if_missing = false;
2521 Status stat =
2522 DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
2523 ASSERT_OK(stat);
2524
2525 ReadOptions roptions;
2526 std::string val;
2527 for (unsigned int i = 0; i < 80; i++) {
2528 ASSERT_OK(snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val));
2529 ASSERT_EQ(values[i].compare(val), 0);
2530 }
2531 for (auto cfh : cf_handles) {
2532 delete cfh;
2533 }
2534 delete snapdb;
2535
2536 // look at the new live files after we added an 'extra' key
2537 // and after we took the first snapshot.
2538 uint64_t new_manifest_number = 0;
2539 uint64_t new_manifest_size = 0;
2540 std::vector<std::string> newfiles;
2541 ASSERT_OK(dbfull()->DisableFileDeletions());
2542 ASSERT_OK(dbfull()->GetLiveFiles(newfiles, &new_manifest_size));
2543
2544 // Find the new manifest file. Assert that this manifest file is
2545 // the same one as in the previous snapshot, but its size should be
2546 // larger because we added an extra key after taking the
2547 // previous snapshot.
2548 for (size_t i = 0; i < newfiles.size(); i++) {
2549 std::string src = dbname_ + "/" + newfiles[i];
2550 // record the file number and the size of the
2551 // latest manifest file
2552 if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
2553 if (type == kDescriptorFile) {
2554 ASSERT_EQ(new_manifest_number, 0);
2555 uint64_t size;
2556 new_manifest_number = number;
2557 ASSERT_OK(env_->GetFileSize(src, &size));
2558 ASSERT_GE(size, new_manifest_size);
2559 }
2560 }
2561 }
2562 ASSERT_EQ(manifest_number, new_manifest_number);
2563 ASSERT_GT(new_manifest_size, manifest_size);
2564
2565 // Also test GetLiveFilesStorageInfo
2566 std::vector<LiveFileStorageInfo> new_infos;
2567 ASSERT_OK(db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(),
2568 &new_infos));
2569
2570 // Close DB (while deletions disabled)
2571 Close();
2572
2573 // Validate
2574 for (auto& info : new_infos) {
2575 std::string path = info.directory + "/" + info.relative_filename;
2576 uint64_t size;
2577 ASSERT_OK(env_->GetFileSize(path, &size));
2578 if (info.trim_to_size) {
2579 ASSERT_LE(info.size, size);
2580 } else if (!info.replacement_contents.empty()) {
2581 ASSERT_EQ(info.size, info.replacement_contents.size());
2582 } else {
2583 ASSERT_EQ(info.size, size);
2584 }
2585 if (info.file_type == kDescriptorFile) {
2586 ASSERT_EQ(info.file_number, manifest_number);
2587 }
2588 }
2589 } while (ChangeCompactOptions());
2590 }
2591
2592 TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) {
2593 do {
2594 Options options = CurrentOptions();
2595 options.level0_file_num_compaction_trigger = 2;
2596 DestroyAndReopen(options);
2597
2598 ASSERT_OK(Put("foo", "bar"));
2599 ASSERT_OK(Flush());
2600 ASSERT_OK(Put("foo", "bar"));
2601 ASSERT_OK(Flush());
2602 ASSERT_OK(dbfull()->TEST_WaitForCompact());
2603
2604 Close();
2605 ASSERT_OK(ReadOnlyReopen(options));
2606
2607 uint64_t manifest_size = 0;
2608 std::vector<std::string> files;
2609 ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
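// In read-only mode nothing can append to the manifest, so the size reported
// by GetLiveFiles should match the descriptor file's size on disk exactly,
// which the loop below verifies.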
2610
2611 for (const std::string& f : files) {
2612 uint64_t number = 0;
2613 FileType type;
2614 if (ParseFileName(f.substr(1), &number, &type)) {
2615 if (type == kDescriptorFile) {
2616 uint64_t size_on_disk;
2617 ASSERT_OK(env_->GetFileSize(dbname_ + "/" + f, &size_on_disk));
2618 ASSERT_EQ(manifest_size, size_on_disk);
2619 break;
2620 }
2621 }
2622 }
2623 Close();
2624 } while (ChangeCompactOptions());
2625 }
2626
2627 TEST_F(DBTest, GetLiveBlobFiles) {
2628 // Note: the following prevents an otherwise harmless data race between the
2629 // test setup code (AddBlobFile) below and the periodic stat dumping thread.
2630 Options options = CurrentOptions();
2631 options.stats_dump_period_sec = 0;
2632
2633 constexpr uint64_t blob_file_number = 234;
2634 constexpr uint64_t total_blob_count = 555;
2635 constexpr uint64_t total_blob_bytes = 66666;
2636 constexpr char checksum_method[] = "CRC32";
2637 constexpr char checksum_value[] = "\x3d\x87\xff\x57";
2638 constexpr uint64_t garbage_blob_count = 0;
2639 constexpr uint64_t garbage_blob_bytes = 0;
2640
2641 Reopen(options);
2642
2643 AddBlobFile(db_->DefaultColumnFamily(), blob_file_number, total_blob_count,
2644 total_blob_bytes, checksum_method, checksum_value,
2645 garbage_blob_count, garbage_blob_bytes);
2646 // Make sure it appears in the results returned by GetLiveFiles.
2647 uint64_t manifest_size = 0;
2648 std::vector<std::string> files;
2649 ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
2650
2651 ASSERT_FALSE(files.empty());
2652 ASSERT_EQ(files[0], BlobFileName("", blob_file_number));
2653
2654 ColumnFamilyMetaData cfmd;
2655
2656 db_->GetColumnFamilyMetaData(&cfmd);
2657 ASSERT_EQ(cfmd.blob_files.size(), 1);
2658 const BlobMetaData& bmd = cfmd.blob_files[0];
2659
2660 CheckBlobMetaData(bmd, blob_file_number, total_blob_count, total_blob_bytes,
2661 checksum_method, checksum_value, garbage_blob_count,
2662 garbage_blob_bytes);
2663 ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
2664 ASSERT_EQ(cfmd.blob_file_count, 1U);
2665 ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
2666 }
2667 #endif
2668
2669 TEST_F(DBTest, PurgeInfoLogs) {
2670 Options options = CurrentOptions();
2671 options.keep_log_file_num = 5;
2672 options.create_if_missing = true;
2673 options.env = env_;
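// keep_log_file_num caps how many info log files (LOG plus rotated
// LOG.old.* files) are retained, so after the 8 reopens below only the
// newest 5 should survive in each mode.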
2674 for (int mode = 0; mode <= 1; mode++) {
2675 if (mode == 1) {
2676 options.db_log_dir = dbname_ + "_logs";
2677 ASSERT_OK(env_->CreateDirIfMissing(options.db_log_dir));
2678 } else {
2679 options.db_log_dir = "";
2680 }
2681 for (int i = 0; i < 8; i++) {
2682 Reopen(options);
2683 }
2684
2685 std::vector<std::string> files;
2686 ASSERT_OK(env_->GetChildren(
2687 options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files));
2688 int info_log_count = 0;
2689 for (std::string file : files) {
2690 if (file.find("LOG") != std::string::npos) {
2691 info_log_count++;
2692 }
2693 }
2694 ASSERT_EQ(5, info_log_count);
2695
2696 Destroy(options);
2697 // For mode 0, DestroyDB() deletes all the info logs under the DB dir;
2698 // for mode 1, no info log file was put under the DB dir to begin with.
2699 // Since dbname_ has no children either way, no need to loop over db_files.
2700 std::vector<std::string> db_files;
2701 ASSERT_TRUE(env_->GetChildren(dbname_, &db_files).IsNotFound());
2702 ASSERT_TRUE(db_files.empty());
2703
2704 if (mode == 1) {
2705 // Cleaning up
2706 ASSERT_OK(env_->GetChildren(options.db_log_dir, &files));
2707 for (std::string file : files) {
2708 ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file));
2709 }
2710 ASSERT_OK(env_->DeleteDir(options.db_log_dir));
2711 }
2712 }
2713 }
2714
2715 #ifndef ROCKSDB_LITE
2716 // Multi-threaded test:
2717 namespace {
2718
2719 static const int kColumnFamilies = 10;
2720 static const int kNumThreads = 10;
2721 static const int kTestSeconds = 10;
2722 static const int kNumKeys = 1000;
2723
2724 struct MTState {
2725 DBTest* test;
2726 std::atomic<int> counter[kNumThreads];
2727 };
2728
2729 struct MTThread {
2730 MTState* state;
2731 int id;
2732 bool multiget_batched;
2733 };
2734
2735 static void MTThreadBody(void* arg) {
2736 MTThread* t = reinterpret_cast<MTThread*>(arg);
2737 int id = t->id;
2738 DB* db = t->state->test->db_;
2739 int counter = 0;
2740 std::shared_ptr<SystemClock> clock = SystemClock::Default();
2741 auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U;
2742
2743 fprintf(stderr, "... starting thread %d\n", id);
2744 Random rnd(1000 + id);
2745 char valbuf[1500];
2746 while (clock->NowMicros() < end_micros) {
2747 t->state->counter[id].store(counter, std::memory_order_release);
2748
2749 int key = rnd.Uniform(kNumKeys);
2750 char keybuf[20];
2751 snprintf(keybuf, sizeof(keybuf), "%016d", key);
2752
2753 if (rnd.OneIn(2)) {
2754 // Write values of the form <key, my id, counter, cf, unique_id>
2755 // into each of the CFs.
2756 // We add some padding to force compactions.
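// For example, thread 3 at counter 17 writing key 42 to CF 5 with
// unique_id 123456 produces "42.3.17.5.123456" followed by spaces, since the
// final field is left-justified in a 1000-character field.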
2757 int unique_id = rnd.Uniform(1000000);
2758
2759 // Half of the time directly use WriteBatch. Half of the time use
2760 // WriteBatchWithIndex.
2761 if (rnd.OneIn(2)) {
2762 WriteBatch batch;
2763 for (int cf = 0; cf < kColumnFamilies; ++cf) {
2764 snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
2765 static_cast<int>(counter), cf, unique_id);
2766 ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
2767 Slice(valbuf)));
2768 }
2769 ASSERT_OK(db->Write(WriteOptions(), &batch));
2770 } else {
2771 WriteBatchWithIndex batch(db->GetOptions().comparator);
2772 for (int cf = 0; cf < kColumnFamilies; ++cf) {
2773 snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
2774 static_cast<int>(counter), cf, unique_id);
2775 ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
2776 Slice(valbuf)));
2777 }
2778 ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
2779 }
2780 } else {
2781 // Read a value and verify that it matches the pattern written above
2782 // and that writes to all column families were atomic (unique_id is the
2783 // same)
2784 std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
2785 std::vector<std::string> values;
2786 std::vector<Status> statuses;
2787 if (!t->multiget_batched) {
2788 statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys,
2789 &values);
2790 } else {
2791 std::vector<PinnableSlice> pin_values(keys.size());
2792 statuses.resize(keys.size());
2793 const Snapshot* snapshot = db->GetSnapshot();
2794 ReadOptions ro;
2795 ro.snapshot = snapshot;
2796 for (int cf = 0; cf < kColumnFamilies; ++cf) {
2797 db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf],
2798 &pin_values[cf], &statuses[cf]);
2799 }
2800 db->ReleaseSnapshot(snapshot);
2801 values.resize(keys.size());
2802 for (int cf = 0; cf < kColumnFamilies; ++cf) {
2803 if (statuses[cf].ok()) {
2804 values[cf].assign(pin_values[cf].data(), pin_values[cf].size());
2805 }
2806 }
2807 }
2808 Status s = statuses[0];
2809 // all statuses have to be the same
2810 for (size_t i = 1; i < statuses.size(); ++i) {
2811 // they are either both ok or both not-found
2812 ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
2813 (s.IsNotFound() && statuses[i].IsNotFound()));
2814 }
2815 if (s.IsNotFound()) {
2816 // Key has not yet been written
2817 } else {
2818 // Check that the writer thread counter is >= the counter in the value
2819 ASSERT_OK(s);
2820 int unique_id = -1;
2821 for (int i = 0; i < kColumnFamilies; ++i) {
2822 int k, w, c, cf, u;
2823 ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c,
2824 &cf, &u))
2825 << values[i];
2826 ASSERT_EQ(k, key);
2827 ASSERT_GE(w, 0);
2828 ASSERT_LT(w, kNumThreads);
2829 ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
2830 ASSERT_EQ(cf, i);
2831 if (i == 0) {
2832 unique_id = u;
2833 } else {
2834 // this checks that updates across column families happened
2835 // atomically -- all unique ids are the same
2836 ASSERT_EQ(u, unique_id);
2837 }
2838 }
2839 }
2840 }
2841 counter++;
2842 }
2843 fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
2844 }
2845
2846 } // anonymous namespace
2847
2848 class MultiThreadedDBTest
2849 : public DBTest,
2850 public ::testing::WithParamInterface<std::tuple<int, bool>> {
2851 public:
2852 void SetUp() override {
2853 std::tie(option_config_, multiget_batched_) = GetParam();
2854 }
2855
2856 static std::vector<int> GenerateOptionConfigs() {
2857 std::vector<int> optionConfigs;
2858 for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
2859 optionConfigs.push_back(optionConfig);
2860 }
2861 return optionConfigs;
2862 }
2863
2864 bool multiget_batched_;
2865 };
2866
2867 TEST_P(MultiThreadedDBTest, MultiThreaded) {
2868 if (option_config_ == kPipelinedWrite) return;
2869 anon::OptionsOverride options_override;
2870 options_override.skip_policy = kSkipNoSnapshot;
2871 Options options = CurrentOptions(options_override);
2872 std::vector<std::string> cfs;
2873 for (int i = 1; i < kColumnFamilies; ++i) {
2874 cfs.push_back(std::to_string(i));
2875 }
2876 Reopen(options);
2877 CreateAndReopenWithCF(cfs, options);
2878 // Initialize state
2879 MTState mt;
2880 mt.test = this;
2881 for (int id = 0; id < kNumThreads; id++) {
2882 mt.counter[id].store(0, std::memory_order_release);
2883 }
2884
2885 // Start threads
2886 MTThread thread[kNumThreads];
2887 for (int id = 0; id < kNumThreads; id++) {
2888 thread[id].state = &mt;
2889 thread[id].id = id;
2890 thread[id].multiget_batched = multiget_batched_;
2891 env_->StartThread(MTThreadBody, &thread[id]);
2892 }
2893
2894 env_->WaitForJoin();
2895 }
2896
2897 INSTANTIATE_TEST_CASE_P(
2898 MultiThreaded, MultiThreadedDBTest,
2899 ::testing::Combine(
2900 ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()),
2901 ::testing::Bool()));
2902 #endif // ROCKSDB_LITE
2903
2904 // Group commit test:
2905 #if !defined(OS_WIN)
2906 // Disable this test temporarily on Travis and AppVeyor as it fails
2907 // intermittently. GitHub issue: #4151
2908 namespace {
2909
2910 static const int kGCNumThreads = 4;
2911 static const int kGCNumKeys = 1000;
2912
2913 struct GCThread {
2914 DB* db;
2915 int id;
2916 std::atomic<bool> done;
2917 };
2918
2919 static void GCThreadBody(void* arg) {
2920 GCThread* t = reinterpret_cast<GCThread*>(arg);
2921 int id = t->id;
2922 DB* db = t->db;
2923 WriteOptions wo;
2924
2925 for (int i = 0; i < kGCNumKeys; ++i) {
2926 std::string kv(std::to_string(i + id * kGCNumKeys));
2927 ASSERT_OK(db->Put(wo, kv, kv));
2928 }
2929 t->done = true;
2930 }
2931
2932 } // anonymous namespace
2933
2934 TEST_F(DBTest, GroupCommitTest) {
2935 do {
2936 Options options = CurrentOptions();
2937 options.env = env_;
2938 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
2939 Reopen(options);
2940
2941 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
2942 {{"WriteThread::JoinBatchGroup:BeganWaiting",
2943 "DBImpl::WriteImpl:BeforeLeaderEnters"},
2944 {"WriteThread::AwaitState:BlockingWaiting",
2945 "WriteThread::EnterAsBatchGroupLeader:End"}});
2946 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
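// Each dependency above forces its first sync point to fire before its
// second may proceed: a writer must have begun waiting before a leader
// enters the write path, and a waiter must be blocking before the leader
// finishes forming its batch group. This reliably produces group commits,
// which is what makes the WRITE_DONE_BY_OTHER assertion below meaningful.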
2947
2948 // Start threads
2949 GCThread thread[kGCNumThreads];
2950 for (int id = 0; id < kGCNumThreads; id++) {
2951 thread[id].id = id;
2952 thread[id].db = db_;
2953 thread[id].done = false;
2954 env_->StartThread(GCThreadBody, &thread[id]);
2955 }
2956 env_->WaitForJoin();
2957
2958 ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
2959
2960 std::vector<std::string> expected_db;
2961 for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
2962 expected_db.push_back(std::to_string(i));
2963 }
2964 std::sort(expected_db.begin(), expected_db.end());
2965
2966 Iterator* itr = db_->NewIterator(ReadOptions());
2967 itr->SeekToFirst();
2968 for (auto x : expected_db) {
2969 ASSERT_TRUE(itr->Valid());
2970 ASSERT_EQ(itr->key().ToString(), x);
2971 ASSERT_EQ(itr->value().ToString(), x);
2972 itr->Next();
2973 }
2974 ASSERT_TRUE(!itr->Valid());
2975 delete itr;
2976
2977 HistogramData hist_data;
2978 options.statistics->histogramData(DB_WRITE, &hist_data);
2979 ASSERT_GT(hist_data.average, 0.0);
2980 } while (ChangeOptions(kSkipNoSeekToLast));
2981 }
2982 #endif // !defined(OS_WIN)
2983
2984 namespace {
2985 using KVMap = std::map<std::string, std::string>;
2986 } // anonymous namespace
2987
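// ModelDB is a minimal reference implementation of the DB interface backed
// by an in-memory std::map. Randomized tests run the same operations against
// a real DB and a ModelDB and compare results; everything not needed for
// that purpose is stubbed out as NotSupported.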
2988 class ModelDB : public DB {
2989 public:
2990 class ModelSnapshot : public Snapshot {
2991 public:
2992 KVMap map_;
2993
2994 SequenceNumber GetSequenceNumber() const override {
2995 // no need to call this
2996 assert(false);
2997 return 0;
2998 }
2999
3000 int64_t GetUnixTime() const override {
3001 // no need to call this
3002 assert(false);
3003 return 0;
3004 }
3005
3006 uint64_t GetTimestamp() const override {
3007 // no need to call this
3008 assert(false);
3009 return 0;
3010 }
3011 };
3012
3013 explicit ModelDB(const Options& options) : options_(options) {}
3014 using DB::Put;
3015 Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
3016 const Slice& v) override {
3017 WriteBatch batch;
3018 Status s = batch.Put(cf, k, v);
3019 if (!s.ok()) {
3020 return s;
3021 }
3022 return Write(o, &batch);
3023 }
3024 Status Put(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
3025 const Slice& /*k*/, const Slice& /*ts*/,
3026 const Slice& /*v*/) override {
3027 return Status::NotSupported();
3028 }
3029
3030 using DB::PutEntity;
3031 Status PutEntity(const WriteOptions& /* options */,
3032 ColumnFamilyHandle* /* column_family */,
3033 const Slice& /* key */,
3034 const WideColumns& /* columns */) override {
3035 return Status::NotSupported();
3036 }
3037
3038 using DB::Close;
3039 Status Close() override { return Status::OK(); }
3040 using DB::Delete;
3041 Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
3042 const Slice& key) override {
3043 WriteBatch batch;
3044 Status s = batch.Delete(cf, key);
3045 if (!s.ok()) {
3046 return s;
3047 }
3048 return Write(o, &batch);
3049 }
3050 Status Delete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
3051 const Slice& /*key*/, const Slice& /*ts*/) override {
3052 return Status::NotSupported();
3053 }
3054 using DB::SingleDelete;
3055 Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
3056 const Slice& key) override {
3057 WriteBatch batch;
3058 Status s = batch.SingleDelete(cf, key);
3059 if (!s.ok()) {
3060 return s;
3061 }
3062 return Write(o, &batch);
3063 }
3064 Status SingleDelete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
3065 const Slice& /*key*/, const Slice& /*ts*/) override {
3066 return Status::NotSupported();
3067 }
3068 using DB::Merge;
3069 Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
3070 const Slice& v) override {
3071 WriteBatch batch;
3072 Status s = batch.Merge(cf, k, v);
3073 if (!s.ok()) {
3074 return s;
3075 }
3076 return Write(o, &batch);
3077 }
3078 Status Merge(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
3079 const Slice& /*k*/, const Slice& /*ts*/,
3080 const Slice& /*value*/) override {
3081 return Status::NotSupported();
3082 }
3083 using DB::Get;
3084 Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/,
3085 const Slice& key, PinnableSlice* /*value*/) override {
3086 return Status::NotSupported(key);
3087 }
3088
3089 using DB::GetMergeOperands;
3090 virtual Status GetMergeOperands(
3091 const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
3092 const Slice& key, PinnableSlice* /*slice*/,
3093 GetMergeOperandsOptions* /*merge_operands_options*/,
3094 int* /*number_of_operands*/) override {
3095 return Status::NotSupported(key);
3096 }
3097
3098 using DB::MultiGet;
3099 std::vector<Status> MultiGet(
3100 const ReadOptions& /*options*/,
3101 const std::vector<ColumnFamilyHandle*>& /*column_family*/,
3102 const std::vector<Slice>& keys,
3103 std::vector<std::string>* /*values*/) override {
3104 std::vector<Status> s(keys.size(),
3105 Status::NotSupported("Not implemented."));
3106 return s;
3107 }
3108
3109 #ifndef ROCKSDB_LITE
3110 using DB::IngestExternalFile;
3111 Status IngestExternalFile(
3112 ColumnFamilyHandle* /*column_family*/,
3113 const std::vector<std::string>& /*external_files*/,
3114 const IngestExternalFileOptions& /*options*/) override {
3115 return Status::NotSupported("Not implemented.");
3116 }
3117
3118 using DB::IngestExternalFiles;
3119 Status IngestExternalFiles(
3120 const std::vector<IngestExternalFileArg>& /*args*/) override {
3121 return Status::NotSupported("Not implemented");
3122 }
3123
3124 using DB::CreateColumnFamilyWithImport;
3125 virtual Status CreateColumnFamilyWithImport(
3126 const ColumnFamilyOptions& /*options*/,
3127 const std::string& /*column_family_name*/,
3128 const ImportColumnFamilyOptions& /*import_options*/,
3129 const ExportImportFilesMetaData& /*metadata*/,
3130 ColumnFamilyHandle** /*handle*/) override {
3131 return Status::NotSupported("Not implemented.");
3132 }
3133
3134 using DB::VerifyChecksum;
3135 Status VerifyChecksum(const ReadOptions&) override {
3136 return Status::NotSupported("Not implemented.");
3137 }
3138
3139 using DB::GetPropertiesOfAllTables;
3140 Status GetPropertiesOfAllTables(
3141 ColumnFamilyHandle* /*column_family*/,
3142 TablePropertiesCollection* /*props*/) override {
3143 return Status();
3144 }
3145
3146 Status GetPropertiesOfTablesInRange(
3147 ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
3148 std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
3149 return Status();
3150 }
3151 #endif // ROCKSDB_LITE
3152
3153 using DB::KeyMayExist;
3154 bool KeyMayExist(const ReadOptions& /*options*/,
3155 ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
3156 std::string* /*value*/,
3157 bool* value_found = nullptr) override {
3158 if (value_found != nullptr) {
3159 *value_found = false;
3160 }
3161 return true; // Not supported directly
3162 }
3163 using DB::NewIterator;
3164 Iterator* NewIterator(const ReadOptions& options,
3165 ColumnFamilyHandle* /*column_family*/) override {
3166 if (options.snapshot == nullptr) {
3167 KVMap* saved = new KVMap;
3168 *saved = map_;
3169 return new ModelIter(saved, true);
3170 } else {
3171 const KVMap* snapshot_state =
3172 &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
3173 return new ModelIter(snapshot_state, false);
3174 }
3175 }
3176 Status NewIterators(const ReadOptions& /*options*/,
3177 const std::vector<ColumnFamilyHandle*>& /*column_family*/,
3178 std::vector<Iterator*>* /*iterators*/) override {
3179 return Status::NotSupported("Not supported yet");
3180 }
3181 const Snapshot* GetSnapshot() override {
3182 ModelSnapshot* snapshot = new ModelSnapshot;
3183 snapshot->map_ = map_;
3184 return snapshot;
3185 }
3186
3187 void ReleaseSnapshot(const Snapshot* snapshot) override {
3188 delete reinterpret_cast<const ModelSnapshot*>(snapshot);
3189 }
3190
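// Write is the one real mutation path: a WriteBatch::Handler replays the
// batch's Put and Delete operations onto the in-memory map (Merge is
// deliberately ignored, see below).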
3191 Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
3192 class Handler : public WriteBatch::Handler {
3193 public:
3194 KVMap* map_;
3195 void Put(const Slice& key, const Slice& value) override {
3196 (*map_)[key.ToString()] = value.ToString();
3197 }
3198 void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
3199 // ignore merge for now
3200 // (*map_)[key.ToString()] = value.ToString();
3201 }
3202 void Delete(const Slice& key) override { map_->erase(key.ToString()); }
3203 };
3204 Handler handler;
3205 handler.map_ = &map_;
3206 return batch->Iterate(&handler);
3207 }
3208
3209 using DB::GetProperty;
3210 bool GetProperty(ColumnFamilyHandle* /*column_family*/,
3211 const Slice& /*property*/, std::string* /*value*/) override {
3212 return false;
3213 }
3214 using DB::GetIntProperty;
3215 bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
3216 const Slice& /*property*/, uint64_t* /*value*/) override {
3217 return false;
3218 }
3219 using DB::GetMapProperty;
3220 bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
3221 const Slice& /*property*/,
3222 std::map<std::string, std::string>* /*value*/) override {
3223 return false;
3224 }
3225 using DB::GetAggregatedIntProperty;
3226 bool GetAggregatedIntProperty(const Slice& /*property*/,
3227 uint64_t* /*value*/) override {
3228 return false;
3229 }
3230 using DB::GetApproximateSizes;
3231 Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
3232 ColumnFamilyHandle* /*column_family*/,
3233 const Range* /*range*/, int n,
3234 uint64_t* sizes) override {
3235 for (int i = 0; i < n; i++) {
3236 sizes[i] = 0;
3237 }
3238 return Status::OK();
3239 }
3240 using DB::GetApproximateMemTableStats;
3241 void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
3242 const Range& /*range*/,
3243 uint64_t* const count,
3244 uint64_t* const size) override {
3245 *count = 0;
3246 *size = 0;
3247 }
3248 using DB::CompactRange;
3249 Status CompactRange(const CompactRangeOptions& /*options*/,
3250 ColumnFamilyHandle* /*column_family*/,
3251 const Slice* /*start*/, const Slice* /*end*/) override {
3252 return Status::NotSupported("Not supported operation.");
3253 }
3254
3255 Status SetDBOptions(
3256 const std::unordered_map<std::string, std::string>& /*new_options*/)
3257 override {
3258 return Status::NotSupported("Not supported operation.");
3259 }
3260
3261 using DB::CompactFiles;
3262 Status CompactFiles(
3263 const CompactionOptions& /*compact_options*/,
3264 ColumnFamilyHandle* /*column_family*/,
3265 const std::vector<std::string>& /*input_file_names*/,
3266 const int /*output_level*/, const int /*output_path_id*/ = -1,
3267 std::vector<std::string>* const /*output_file_names*/ = nullptr,
3268 CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
3269 return Status::NotSupported("Not supported operation.");
3270 }
3271
3272 Status PauseBackgroundWork() override {
3273 return Status::NotSupported("Not supported operation.");
3274 }
3275
3276 Status ContinueBackgroundWork() override {
3277 return Status::NotSupported("Not supported operation.");
3278 }
3279
3280 Status EnableAutoCompaction(
3281 const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/)
3282 override {
3283 return Status::NotSupported("Not supported operation.");
3284 }
3285
3286 void EnableManualCompaction() override {}
3287
3288 void DisableManualCompaction() override {}
3289
3290 using DB::NumberLevels;
3291 int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
3292
3293 using DB::MaxMemCompactionLevel;
3294 int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
3295 return 1;
3296 }
3297
3298 using DB::Level0StopWriteTrigger;
3299 int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
3300 return -1;
3301 }
3302
3303 const std::string& GetName() const override { return name_; }
3304
3305 Env* GetEnv() const override { return nullptr; }
3306
3307 using DB::GetOptions;
3308 Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
3309 return options_;
3310 }
3311
3312 using DB::GetDBOptions;
3313 DBOptions GetDBOptions() const override { return options_; }
3314
3315 using DB::Flush;
3316 Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
3317 ColumnFamilyHandle* /*column_family*/) override {
3318 Status ret;
3319 return ret;
3320 }
3321 Status Flush(
3322 const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
3323 const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
3324 return Status::OK();
3325 }
3326
3327 Status SyncWAL() override { return Status::OK(); }
3328
3329 Status DisableFileDeletions() override { return Status::OK(); }
3330
3331 Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); }
3332 #ifndef ROCKSDB_LITE
3333
3334 Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/,
3335 bool /*flush_memtable*/ = true) override {
3336 return Status::OK();
3337 }
3338
3339 Status GetLiveFilesChecksumInfo(
3340 FileChecksumList* /*checksum_list*/) override {
3341 return Status::OK();
3342 }
3343
3344 Status GetLiveFilesStorageInfo(
3345 const LiveFilesStorageInfoOptions& /*opts*/,
3346 std::vector<LiveFileStorageInfo>* /*files*/) override {
3347 return Status::OK();
3348 }
3349
3350 Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
3351 return Status::OK();
3352 }
3353
3354 Status GetCurrentWalFile(
3355 std::unique_ptr<LogFile>* /*current_log_file*/) override {
3356 return Status::OK();
3357 }
3358
3359 Status GetCreationTimeOfOldestFile(
3360 uint64_t* /*creation_time*/) override {
3361 return Status::NotSupported();
3362 }
3363
3364 Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
3365
3366 Status GetUpdatesSince(
3367 ROCKSDB_NAMESPACE::SequenceNumber,
3368 std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*,
3369 const TransactionLogIterator::ReadOptions& /*read_options*/ =
3370 TransactionLogIterator::ReadOptions()) override {
3371 return Status::NotSupported("Not supported in Model DB");
3372 }
3373
3374 void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
3375 ColumnFamilyMetaData* /*metadata*/) override {}
3376 #endif // ROCKSDB_LITE
3377
3378 Status GetDbIdentity(std::string& /*identity*/) const override {
3379 return Status::OK();
3380 }
3381
3382 Status GetDbSessionId(std::string& /*session_id*/) const override {
3383 return Status::OK();
3384 }
3385
3386 SequenceNumber GetLatestSequenceNumber() const override { return 0; }
3387
3388 Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
3389 std::string /*ts_low*/) override {
3390 return Status::OK();
3391 }
3392
3393 Status GetFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
3394 std::string* /*ts_low*/) override {
3395 return Status::OK();
3396 }
3397
3398 ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
3399
3400 private:
3401 class ModelIter : public Iterator {
3402 public:
3403 ModelIter(const KVMap* map, bool owned)
3404 : map_(map), owned_(owned), iter_(map_->end()) {}
3405 ~ModelIter() override {
3406 if (owned_) delete map_;
3407 }
3408 bool Valid() const override { return iter_ != map_->end(); }
3409 void SeekToFirst() override { iter_ = map_->begin(); }
3410 void SeekToLast() override {
3411 if (map_->empty()) {
3412 iter_ = map_->end();
3413 } else {
3414 iter_ = map_->find(map_->rbegin()->first);
3415 }
3416 }
3417 void Seek(const Slice& k) override {
3418 iter_ = map_->lower_bound(k.ToString());
3419 }
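    // SeekForPrev() positions the iterator at the last key <= k:
    // upper_bound() lands on the first key greater than k, and stepping
    // back once via Prev() yields the desired entry, or invalidates the
    // iterator if no such key exists.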
3420 void SeekForPrev(const Slice& k) override {
3421 iter_ = map_->upper_bound(k.ToString());
3422 Prev();
3423 }
3424 void Next() override { ++iter_; }
3425 void Prev() override {
3426 if (iter_ == map_->begin()) {
3427 iter_ = map_->end();
3428 return;
3429 }
3430 --iter_;
3431 }
3432
3433 Slice key() const override { return iter_->first; }
3434 Slice value() const override { return iter_->second; }
3435 Status status() const override { return Status::OK(); }
3436
3437 private:
3438 const KVMap* const map_;
3439 const bool owned_; // Do we own map_
3440 KVMap::const_iterator iter_;
3441 };
3442 const Options options_;
3443 KVMap map_;
3444 std::string name_;
3445 };
3446
3447 #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
3448 static std::string RandomKey(Random* rnd, int minimum = 0) {
3449 int len;
3450 do {
3451 len = (rnd->OneIn(3)
3452 ? 1 // Short sometimes to encourage collisions
3453 : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
3454 } while (len < minimum);
3455 return test::RandomKey(rnd, len);
3456 }
3457
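// Walks the model DB and the real DB in lockstep under the given snapshots
// and reports the first key or value divergence to stderr; returns true iff
// both iterators agree on every entry and reach the end together.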
3458 static bool CompareIterators(int step, DB* model, DB* db,
3459 const Snapshot* model_snap,
3460 const Snapshot* db_snap) {
3461 ReadOptions options;
3462 options.snapshot = model_snap;
3463 Iterator* miter = model->NewIterator(options);
3464 options.snapshot = db_snap;
3465 Iterator* dbiter = db->NewIterator(options);
3466 bool ok = true;
3467 int count = 0;
3468 for (miter->SeekToFirst(), dbiter->SeekToFirst();
3469 ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
3470 count++;
3471 if (miter->key().compare(dbiter->key()) != 0) {
3472 fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
3473 EscapeString(miter->key()).c_str(),
3474 EscapeString(dbiter->key()).c_str());
3475 ok = false;
3476 break;
3477 }
3478
3479 if (miter->value().compare(dbiter->value()) != 0) {
3480 fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
3481 step, EscapeString(miter->key()).c_str(),
3482 EscapeString(miter->value()).c_str(),
3483 EscapeString(dbiter->value()).c_str());
3484 ok = false;
3485 }
3486 }
3487
3488 if (ok) {
3489 if (miter->Valid() != dbiter->Valid()) {
3490 fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
3491 step, miter->Valid(), dbiter->Valid());
3492 ok = false;
3493 }
3494 }
3495 delete miter;
3496 delete dbiter;
3497 return ok;
3498 }
3499
3500 class DBTestRandomized : public DBTest,
3501 public ::testing::WithParamInterface<int> {
3502 public:
3503 void SetUp() override { option_config_ = GetParam(); }
3504
3505 static std::vector<int> GenerateOptionConfigs() {
3506 std::vector<int> option_configs;
3507 // skip cuckoo hash as it does not support snapshot.
3508 for (int option_config = kDefault; option_config < kEnd; ++option_config) {
3509 if (!ShouldSkipOptions(option_config,
3510 kSkipDeletesFilterFirst | kSkipNoSeekToLast)) {
3511 option_configs.push_back(option_config);
3512 }
3513 }
3514 option_configs.push_back(kBlockBasedTableWithIndexRestartInterval);
3515 return option_configs;
3516 }
3517 };
3518
3519 INSTANTIATE_TEST_CASE_P(
3520 DBTestRandomized, DBTestRandomized,
3521 ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs()));
3522
3523 TEST_P(DBTestRandomized, Randomized) {
3524 anon::OptionsOverride options_override;
3525 options_override.skip_policy = kSkipNoSnapshot;
3526 Options options = CurrentOptions(options_override);
3527 DestroyAndReopen(options);
3528
3529 Random rnd(test::RandomSeed() + GetParam());
3530 ModelDB model(options);
3531 const int N = 10000;
3532 const Snapshot* model_snap = nullptr;
3533 const Snapshot* db_snap = nullptr;
3534 std::string k, v;
3535 for (int step = 0; step < N; step++) {
3536 // TODO(sanjay): Test Get() works
3537 int p = rnd.Uniform(100);
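    // p selects the operation type: [0, 45) single Put, [45, 90) single
    // Delete, [90, 100) multi-key WriteBatch.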
3538 int minimum = 0;
3539 if (option_config_ == kHashSkipList || option_config_ == kHashLinkList ||
3540 option_config_ == kPlainTableFirstBytePrefix ||
3541 option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
3542 option_config_ == kBlockBasedTableWithPrefixHashIndex) {
3543 minimum = 1;
3544 }
3545 if (p < 45) { // Put
3546 k = RandomKey(&rnd, minimum);
3547 v = rnd.RandomString(rnd.OneIn(20) ? 100 + rnd.Uniform(100)
3548 : rnd.Uniform(8));
3549 ASSERT_OK(model.Put(WriteOptions(), k, v));
3550 ASSERT_OK(db_->Put(WriteOptions(), k, v));
3551 } else if (p < 90) { // Delete
3552 k = RandomKey(&rnd, minimum);
3553 ASSERT_OK(model.Delete(WriteOptions(), k));
3554 ASSERT_OK(db_->Delete(WriteOptions(), k));
3555 } else { // Multi-element batch
3556 WriteBatch b;
3557 const int num = rnd.Uniform(8);
3558 for (int i = 0; i < num; i++) {
3559 if (i == 0 || !rnd.OneIn(10)) {
3560 k = RandomKey(&rnd, minimum);
3561 } else {
3562 // Periodically re-use the key from the previous iteration, so the
3563 // write batch contains multiple entries for the same key
3564 }
3565 if (rnd.OneIn(2)) {
3566 v = rnd.RandomString(rnd.Uniform(10));
3567 ASSERT_OK(b.Put(k, v));
3568 } else {
3569 ASSERT_OK(b.Delete(k));
3570 }
3571 }
3572 ASSERT_OK(model.Write(WriteOptions(), &b));
3573 ASSERT_OK(db_->Write(WriteOptions(), &b));
3574 }
3575
3576 if ((step % 100) == 0) {
3577 // For DB instances that use the hash index + block-based table, the
3578 // iterator becomes invalid when seeking a non-existent key, rather
3579 // than returning a key that is close to it.
3580 if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
3581 option_config_ != kBlockBasedTableWithPrefixHashIndex) {
3582 ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
3583 ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
3584 }
3585
3586 // Save a snapshot from each DB that we'll use the next time we
3587 // compare things, to make sure the current state is preserved
3588 // by the snapshot.
3589 if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
3590 if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
3591
3592 Reopen(options);
3593 ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
3594
3595 model_snap = model.GetSnapshot();
3596 db_snap = db_->GetSnapshot();
3597 }
3598 }
3599 if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
3600 if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
3601 }
3602 #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
3603
3604 TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
3605 // create a DB with a block-based hash (prefix) index
3606 BlockBasedTableOptions table_options;
3607 Options options = CurrentOptions();
3608 table_options.index_type = BlockBasedTableOptions::kHashSearch;
3609 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3610 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
3611
3612 Reopen(options);
3613 ASSERT_OK(Put("k1", "v1"));
3614 ASSERT_OK(Flush());
3615 ASSERT_OK(Put("k2", "v2"));
3616
3617 // Reopen with a different prefix extractor; make sure everything still
3618 // works. RocksDB should just fall back to the binary index.
3619 options.prefix_extractor.reset(NewFixedPrefixTransform(2));
3620
3621 Reopen(options);
3622 ASSERT_EQ("v1", Get("k1"));
3623 ASSERT_EQ("v2", Get("k2"));
3624
3625 #ifndef ROCKSDB_LITE
3626 // Back to original
3627 ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
3628 ASSERT_EQ("v1", Get("k1"));
3629 ASSERT_EQ("v2", Get("k2"));
3630 #endif // !ROCKSDB_LITE
3631
3632 // Same if there's a problem initially loading the prefix transform
3633 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
3634 SyncPoint::GetInstance()->SetCallBack(
3635 "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
3636 [&](void* arg) { *static_cast<bool*>(arg) = true; });
3637 SyncPoint::GetInstance()->EnableProcessing();
3638 Reopen(options);
3639 ASSERT_EQ("v1", Get("k1"));
3640 ASSERT_EQ("v2", Get("k2"));
3641
3642 #ifndef ROCKSDB_LITE
3643 // Change again
3644 ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
3645 ASSERT_EQ("v1", Get("k1"));
3646 ASSERT_EQ("v2", Get("k2"));
3647 #endif // !ROCKSDB_LITE
3648 SyncPoint::GetInstance()->DisableProcessing();
3649
3650 // Reopen with no prefix extractor; make sure everything still works.
3651 // RocksDB should just fall back to the binary index.
3652 table_options.index_type = BlockBasedTableOptions::kBinarySearch;
3653 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3654 options.prefix_extractor.reset();
3655
3656 Reopen(options);
3657 ASSERT_EQ("v1", Get("k1"));
3658 ASSERT_EQ("v2", Get("k2"));
3659 }
3660
3661 TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) {
3662 // create a DB with a block-based hash (prefix) index
3663 BlockBasedTableOptions table_options;
3664 Options options = CurrentOptions();
3665 table_options.index_type = BlockBasedTableOptions::kHashSearch;
3666 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3667 options.prefix_extractor.reset(NewCappedPrefixTransform(2));
3668
3669 Reopen(options);
3670 ASSERT_OK(Put("kk1", "v1"));
3671 ASSERT_OK(Put("kk2", "v2"));
3672 ASSERT_OK(Put("kk", "v3"));
3673 ASSERT_OK(Put("k", "v4"));
3674 ASSERT_OK(Flush());
3675
3676 ASSERT_EQ("v1", Get("kk1"));
3677 ASSERT_EQ("v2", Get("kk2"));
3678
3679 ASSERT_EQ("v3", Get("kk"));
3680 ASSERT_EQ("v4", Get("k"));
3681 }
3682
3683 TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
3684 // create a DB with a block-based hash (prefix) index
3685 BlockBasedTableOptions table_options;
3686 Options options = CurrentOptions();
3687 options.max_open_files = 10;
3688 table_options.index_type = BlockBasedTableOptions::kHashSearch;
3689 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3690 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
3691
3692 // RocksDB sanitizes max_open_files to at least 20. Modify it back.
3693 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3694 "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
3695 int* max_open_files = static_cast<int*>(arg);
3696 *max_open_files = 11;
3697 });
3698 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3699
3700 Reopen(options);
3701 ASSERT_OK(Put("k1", "v1"));
3702 ASSERT_OK(Flush());
3703
3704 CompactRangeOptions cro;
3705 cro.change_level = true;
3706 cro.target_level = 1;
3707 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
3708
3709 // Force-evict all tables from the table cache.
3710 dbfull()->TEST_table_cache()->SetCapacity(0);
3711 // Then let the table cache keep a single entry.
3712 dbfull()->TEST_table_cache()->SetCapacity(1);
3713
3714 ReadOptions read_options;
3715 read_options.total_order_seek = true;
3716 {
3717 std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
3718 iter->Seek("k1");
3719 ASSERT_TRUE(iter->Valid());
3720 ASSERT_EQ("k1", iter->key().ToString());
3721 }
3722
3723 // After a total-order seek, the prefix index should still be usable.
3724 read_options.total_order_seek = false;
3725 {
3726 std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
3727 iter->Seek("k1");
3728 ASSERT_TRUE(iter->Valid());
3729 ASSERT_EQ("k1", iter->key().ToString());
3730 }
3731 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3732 }
3733
3734 TEST_F(DBTest, ChecksumTest) {
3735 BlockBasedTableOptions table_options;
3736 Options options = CurrentOptions();
3737
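  // Each SST file records the checksum type it was written with in its
  // footer, so files created under kCRC32c and kxxHash can coexist in one
  // DB and remain readable regardless of the currently configured checksum.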
3738 table_options.checksum = kCRC32c;
3739 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3740 Reopen(options);
3741 ASSERT_OK(Put("a", "b"));
3742 ASSERT_OK(Put("c", "d"));
3743 ASSERT_OK(Flush()); // table with crc checksum
3744
3745 table_options.checksum = kxxHash;
3746 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3747 Reopen(options);
3748 ASSERT_OK(Put("e", "f"));
3749 ASSERT_OK(Put("g", "h"));
3750 ASSERT_OK(Flush()); // table with xxhash checksum
3751
3752 table_options.checksum = kCRC32c;
3753 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3754 Reopen(options);
3755 ASSERT_EQ("b", Get("a"));
3756 ASSERT_EQ("d", Get("c"));
3757 ASSERT_EQ("f", Get("e"));
3758 ASSERT_EQ("h", Get("g"));
3759
3760 table_options.checksum = kCRC32c;
3761 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
3762 Reopen(options);
3763 ASSERT_EQ("b", Get("a"));
3764 ASSERT_EQ("d", Get("c"));
3765 ASSERT_EQ("f", Get("e"));
3766 ASSERT_EQ("h", Get("g"));
3767 }
3768
3769 #ifndef ROCKSDB_LITE
3770 TEST_P(DBTestWithParam, FIFOCompactionTest) {
3771 for (int iter = 0; iter < 2; ++iter) {
3772 // first iteration -- auto compaction
3773 // second iteration -- manual compaction
3774 Options options;
3775 options.compaction_style = kCompactionStyleFIFO;
3776 options.write_buffer_size = 100 << 10; // 100KB
3777 options.arena_block_size = 4096;
3778 options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
3779 options.compression = kNoCompression;
3780 options.create_if_missing = true;
3781 options.max_subcompactions = max_subcompactions_;
3782 if (iter == 1) {
3783 options.disable_auto_compactions = true;
3784 }
3785 options = CurrentOptions(options);
3786 DestroyAndReopen(options);
3787
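    // Each loop iteration fills the ~100KB write buffer, producing one
    // ~100KB L0 file; with max_table_files_size = 500KB, FIFO compaction
    // should drop the oldest files until roughly 5 remain.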
3788 Random rnd(301);
3789 for (int i = 0; i < 6; ++i) {
3790 for (int j = 0; j < 110; ++j) {
3791 ASSERT_OK(Put(std::to_string(i * 100 + j), rnd.RandomString(980)));
3792 }
3793 // flush should happen here
3794 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
3795 }
3796 if (iter == 0) {
3797 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3798 } else {
3799 CompactRangeOptions cro;
3800 cro.exclusive_manual_compaction = exclusive_manual_compaction_;
3801 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
3802 }
3803 // only 5 files should survive
3804 ASSERT_EQ(NumTableFilesAtLevel(0), 5);
3805 for (int i = 0; i < 50; ++i) {
3806 // these keys should have been deleted by the previous compaction
3807 ASSERT_EQ("NOT_FOUND", Get(std::to_string(i)));
3808 }
3809 }
3810 }
3811
3812 TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
3813 Options options;
3814 options.compaction_style = kCompactionStyleFIFO;
3815 options.write_buffer_size = 20 << 10; // 20K
3816 options.arena_block_size = 4096;
3817 options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
3818 options.compaction_options_fifo.allow_compaction = true;
3819 options.level0_file_num_compaction_trigger = 6;
3820 options.compression = kNoCompression;
3821 options.create_if_missing = true;
3822 options = CurrentOptions(options);
3823 DestroyAndReopen(options);
3824
3825 Random rnd(301);
3826 for (int i = 0; i < 60; i++) {
3827 // Generate and flush a file about 20KB.
3828 for (int j = 0; j < 20; j++) {
3829 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
3830 }
3831 ASSERT_OK(Flush());
3832 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3833 }
3834 // It should be compacted to 10 files.
3835 ASSERT_EQ(NumTableFilesAtLevel(0), 10);
3836
3837 for (int i = 0; i < 60; i++) {
3838 // Generate and flush a file about 20KB.
3839 for (int j = 0; j < 20; j++) {
3840 ASSERT_OK(Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
3841 }
3842 ASSERT_OK(Flush());
3843 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3844 }
3845
3846 // It should be compacted to more than 10 but fewer than 18 files.
3847 ASSERT_GT(NumTableFilesAtLevel(0), 10);
3848 ASSERT_LT(NumTableFilesAtLevel(0), 18);
3849 // Size limit is still guaranteed.
3850 ASSERT_LE(SizeAtLevel(0),
3851 options.compaction_options_fifo.max_table_files_size);
3852 }
3853
3854 TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
3855 Options options;
3856 options.compaction_style = kCompactionStyleFIFO;
3857 options.write_buffer_size = 20 << 10; // 20K
3858 options.arena_block_size = 4096;
3859 options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
3860 options.compaction_options_fifo.allow_compaction = true;
3861 options.level0_file_num_compaction_trigger = 3;
3862 options.compression = kNoCompression;
3863 options.create_if_missing = true;
3864 options = CurrentOptions(options);
3865 DestroyAndReopen(options);
3866
3867 Random rnd(301);
3868 for (int i = 0; i < 3; i++) {
3869 // Each file contains a different key which will be dropped later.
3870 ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
3871 ASSERT_OK(Put("key" + std::to_string(i), ""));
3872 ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
3873 ASSERT_OK(Flush());
3874 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3875 }
3876 ASSERT_EQ(NumTableFilesAtLevel(0), 1);
3877 for (int i = 0; i < 3; i++) {
3878 ASSERT_EQ("", Get("key" + std::to_string(i)));
3879 }
3880 for (int i = 0; i < 3; i++) {
3881 // Each file contains a different key which will be dropped later.
3882 ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
3883 ASSERT_OK(Delete("key" + std::to_string(i)));
3884 ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
3885 ASSERT_OK(Flush());
3886 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3887 }
3888 ASSERT_EQ(NumTableFilesAtLevel(0), 2);
3889 for (int i = 0; i < 3; i++) {
3890 ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
3891 }
3892 }
3893
3894 // Check that FIFO-with-TTL is not supported with max_open_files != -1.
3895 // Github issue #8014
3896 TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
3897 Options options = CurrentOptions();
3898 options.compaction_style = kCompactionStyleFIFO;
3899 options.create_if_missing = true;
3900 options.ttl = 600; // seconds
3901
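  // FIFO-with-TTL needs each file's creation time, which is read from table
  // properties; requiring max_open_files == -1 (all tables kept open) is,
  // presumably, what keeps that metadata cheaply available.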
3902 // TTL is not supported with max_open_files != -1.
3903 options.max_open_files = 0;
3904 ASSERT_TRUE(TryReopen(options).IsNotSupported());
3905
3906 options.max_open_files = 100;
3907 ASSERT_TRUE(TryReopen(options).IsNotSupported());
3908
3909 // TTL is supported with unlimited max_open_files
3910 options.max_open_files = -1;
3911 ASSERT_OK(TryReopen(options));
3912 }
3913
3914 // Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
3915 TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
3916 Options options;
3917 options.compaction_style = kCompactionStyleFIFO;
3918 options.create_if_missing = true;
3919 options.ttl = 600; // seconds
3920
3921 options = CurrentOptions(options);
3922 options.table_factory.reset(NewBlockBasedTableFactory());
3923 ASSERT_OK(TryReopen(options));
3924
3925 Destroy(options);
3926 options.table_factory.reset(NewPlainTableFactory());
3927 ASSERT_TRUE(TryReopen(options).IsNotSupported());
3928
3929 Destroy(options);
3930 options.table_factory.reset(NewAdaptiveTableFactory());
3931 ASSERT_TRUE(TryReopen(options).IsNotSupported());
3932 }
3933
3934 TEST_F(DBTest, FIFOCompactionWithTTLTest) {
3935 Options options;
3936 options.compaction_style = kCompactionStyleFIFO;
3937 options.write_buffer_size = 10 << 10; // 10KB
3938 options.arena_block_size = 4096;
3939 options.compression = kNoCompression;
3940 options.create_if_missing = true;
3941 env_->SetMockSleep();
3942 options.env = env_;
3943
3944 // Test to make sure that all files with expired ttl are deleted on next
3945 // manual compaction.
3946 {
3947 // NOTE: Presumed unnecessary and removed: resetting mock time in env
3948
3949 options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
3950 options.compaction_options_fifo.allow_compaction = false;
3951 options.ttl = 1 * 60 * 60; // 1 hour
3952 options = CurrentOptions(options);
3953 DestroyAndReopen(options);
3954
3955 Random rnd(301);
3956 for (int i = 0; i < 10; i++) {
3957 // Generate and flush a file about 10KB.
3958 for (int j = 0; j < 10; j++) {
3959 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
3960 }
3961 ASSERT_OK(Flush());
3962 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3963 }
3964 ASSERT_EQ(NumTableFilesAtLevel(0), 10);
3965
3966 // Sleep for 2 hours -- which is much greater than TTL.
3967 env_->MockSleepForSeconds(2 * 60 * 60);
3968
3969 // Since no flushes and compactions have run, the db should still be in
3970 // the same state even after considerable time has passed.
3971 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3972 ASSERT_EQ(NumTableFilesAtLevel(0), 10);
3973
3974 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
3975 ASSERT_EQ(NumTableFilesAtLevel(0), 0);
3976 }
3977
3978 // Test to make sure that all files with expired ttl are deleted on next
3979 // automatic compaction.
3980 {
3981 options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
3982 options.compaction_options_fifo.allow_compaction = false;
3983 options.ttl = 1 * 60 * 60; // 1 hour
3984 options = CurrentOptions(options);
3985 DestroyAndReopen(options);
3986
3987 Random rnd(301);
3988 for (int i = 0; i < 10; i++) {
3989 // Generate and flush a file about 10KB.
3990 for (int j = 0; j < 10; j++) {
3991 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
3992 }
3993 ASSERT_OK(Flush());
3994 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3995 }
3996 ASSERT_EQ(NumTableFilesAtLevel(0), 10);
3997
3998 // Sleep for 2 hours -- which is much greater than TTL.
3999 env_->MockSleepForSeconds(2 * 60 * 60);
4000 // Just to make sure that we are in the same state even after sleeping.
4001 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4002 ASSERT_EQ(NumTableFilesAtLevel(0), 10);
4003
4004 // Create 1 more file to trigger TTL compaction. The old files are dropped.
4005 for (int i = 0; i < 1; i++) {
4006 for (int j = 0; j < 10; j++) {
4007 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
4008 }
4009 ASSERT_OK(Flush());
4010 }
4011
4012 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4013 // Only the newly created file remains.
4014 ASSERT_EQ(NumTableFilesAtLevel(0), 1);
4015 ASSERT_LE(SizeAtLevel(0),
4016 options.compaction_options_fifo.max_table_files_size);
4017 }
4018
4019 // Test that shows the fallback to size-based FIFO compaction if TTL-based
4020 // deletion doesn't bring the total size below max_table_files_size.
4021 {
4022 options.write_buffer_size = 10 << 10; // 10KB
4023 options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
4024 options.compaction_options_fifo.allow_compaction = false;
4025 options.ttl = 1 * 60 * 60; // 1 hour
4026 options = CurrentOptions(options);
4027 DestroyAndReopen(options);
4028
4029 Random rnd(301);
4030 for (int i = 0; i < 3; i++) {
4031 // Generate and flush a file about 10KB.
4032 for (int j = 0; j < 10; j++) {
4033 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
4034 }
4035 ASSERT_OK(Flush());
4036 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4037 }
4038 ASSERT_EQ(NumTableFilesAtLevel(0), 3);
4039
4040 // Sleep for 2 hours -- which is much greater than TTL.
4041 env_->MockSleepForSeconds(2 * 60 * 60);
4042 // Just to make sure that we are in the same state even after sleeping.
4043 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4044 ASSERT_EQ(NumTableFilesAtLevel(0), 3);
4045
4046 for (int i = 0; i < 5; i++) {
4047 for (int j = 0; j < 140; j++) {
4048 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
4049 }
4050 ASSERT_OK(Flush());
4051 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4052 }
4053 // Size limit is still guaranteed.
4054 ASSERT_LE(SizeAtLevel(0),
4055 options.compaction_options_fifo.max_table_files_size);
4056 }
4057
4058 // Test with TTL + Intra-L0 compactions.
4059 {
4060 options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
4061 options.compaction_options_fifo.allow_compaction = true;
4062 options.ttl = 1 * 60 * 60; // 1 hour
4063 options.level0_file_num_compaction_trigger = 6;
4064 options = CurrentOptions(options);
4065 DestroyAndReopen(options);
4066
4067 Random rnd(301);
4068 for (int i = 0; i < 10; i++) {
4069 // Generate and flush a file about 10KB.
4070 for (int j = 0; j < 10; j++) {
4071 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
4072 }
4073 ASSERT_OK(Flush());
4074 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4075 }
4076 // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1
4077 // (due to level0_file_num_compaction_trigger = 6).
4078 // So total files = 1 + remaining 4 = 5.
4079 ASSERT_EQ(NumTableFilesAtLevel(0), 5);
4080
4081 // Sleep for 2 hours -- which is much greater than TTL.
4082 env_->MockSleepForSeconds(2 * 60 * 60);
4083 // Just to make sure that we are in the same state even after sleeping.
4084 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4085 ASSERT_EQ(NumTableFilesAtLevel(0), 5);
4086
4087 // Create 10 more files. The old 5 files are dropped as their ttl expired.
4088 for (int i = 0; i < 10; i++) {
4089 for (int j = 0; j < 10; j++) {
4090 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
4091 }
4092 ASSERT_OK(Flush());
4093 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4094 }
4095 ASSERT_EQ(NumTableFilesAtLevel(0), 5);
4096 ASSERT_LE(SizeAtLevel(0),
4097 options.compaction_options_fifo.max_table_files_size);
4098 }
4099
4100 // Test with large TTL + Intra-L0 compactions.
4101 // Files are dropped based on size, as the TTL doesn't kick in.
4102 {
4103 options.write_buffer_size = 20 << 10; // 20K
4104 options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
4105 options.compaction_options_fifo.allow_compaction = true;
4106 options.ttl = 1 * 60 * 60; // 1 hour
4107 options.level0_file_num_compaction_trigger = 6;
4108 options = CurrentOptions(options);
4109 DestroyAndReopen(options);
4110
4111 Random rnd(301);
4112 for (int i = 0; i < 60; i++) {
4113 // Generate and flush a file about 20KB.
4114 for (int j = 0; j < 20; j++) {
4115 ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
4116 }
4117 ASSERT_OK(Flush());
4118 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4119 }
4120 // It should be compacted to 10 files.
4121 ASSERT_EQ(NumTableFilesAtLevel(0), 10);
4122
4123 for (int i = 0; i < 60; i++) {
4124 // Generate and flush a file about 20KB.
4125 for (int j = 0; j < 20; j++) {
4126 ASSERT_OK(
4127 Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
4128 }
4129 ASSERT_OK(Flush());
4130 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4131 }
4132
4133 // It should be compacted to more than 10 but fewer than 18 files.
4134 ASSERT_GT(NumTableFilesAtLevel(0), 10);
4135 ASSERT_LT(NumTableFilesAtLevel(0), 18);
4136 // Size limit is still guaranteed.
4137 ASSERT_LE(SizeAtLevel(0),
4138 options.compaction_options_fifo.max_table_files_size);
4139 }
4140 }
4141 #endif // ROCKSDB_LITE
4142
4143 #ifndef ROCKSDB_LITE
4144 /*
4145 * This test is not reliable enough as it heavily depends on disk behavior.
4146 * Disable as it is flaky.
4147 */
4148 TEST_F(DBTest, DISABLED_RateLimitingTest) {
4149 Options options = CurrentOptions();
4150 options.write_buffer_size = 1 << 20; // 1MB
4151 options.level0_file_num_compaction_trigger = 2;
4152 options.target_file_size_base = 1 << 20; // 1MB
4153 options.max_bytes_for_level_base = 4 << 20; // 4MB
4154 options.max_bytes_for_level_multiplier = 4;
4155 options.compression = kNoCompression;
4156 options.create_if_missing = true;
4157 options.env = env_;
4158 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
4159 options.IncreaseParallelism(4);
4160 DestroyAndReopen(options);
4161
4162 WriteOptions wo;
4163 wo.disableWAL = true;
4164
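  // Methodology: first measure the raw (unthrottled) write rate, then repeat
  // the same workload with a rate limiter set to a fraction of that rate and
  // check that the achieved throughput ratio lands near the fraction.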
4165 // # no rate limiting
4166 Random rnd(301);
4167 uint64_t start = env_->NowMicros();
4168 // Write ~96M data
4169 for (int64_t i = 0; i < (96 << 10); ++i) {
4170 ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
4171 }
4172 uint64_t elapsed = env_->NowMicros() - start;
4173 double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed;
4174 uint64_t rate_limiter_drains =
4175 TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS);
4176 ASSERT_EQ(0, rate_limiter_drains);
4177 Close();
4178
4179 // # rate limiting with 0.7 x threshold
4180 options.rate_limiter.reset(
4181 NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
4182 env_->bytes_written_ = 0;
4183 DestroyAndReopen(options);
4184
4185 start = env_->NowMicros();
4186 // Write ~96M data
4187 for (int64_t i = 0; i < (96 << 10); ++i) {
4188 ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
4189 }
4190 rate_limiter_drains =
4191 TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
4192 rate_limiter_drains;
4193 elapsed = env_->NowMicros() - start;
4194 Close();
4195 ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
4196 // Most intervals should've been drained (interval time is 100ms, elapsed is
4197 // micros)
4198 ASSERT_GT(rate_limiter_drains, 0);
4199 ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
4200 double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
4201 fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
4202 ASSERT_LT(ratio, 0.8);
4203
4204 // # rate limiting with half of the raw_rate
4205 options.rate_limiter.reset(
4206 NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
4207 env_->bytes_written_ = 0;
4208 DestroyAndReopen(options);
4209
4210 start = env_->NowMicros();
4211 // Write ~96M data
4212 for (int64_t i = 0; i < (96 << 10); ++i) {
4213 ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
4214 }
4215 elapsed = env_->NowMicros() - start;
4216 rate_limiter_drains =
4217 TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
4218 rate_limiter_drains;
4219 Close();
4220 ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
4221 // Most intervals should've been drained (interval time is 100ms, elapsed is
4222 // micros)
4223 ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
4224 ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
4225 ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
4226 fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
4227 ASSERT_LT(ratio, 0.6);
4228 }
4229
4230 // A mocked custom rate limiter that does not implement the optional APIs
4231 // (e.g., RateLimiter::GetTotalPendingRequests()).
4232 class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter {
4233 public:
4234 MockedRateLimiterWithNoOptionalAPIImpl() {}
4235
4236 ~MockedRateLimiterWithNoOptionalAPIImpl() override {}
4237
4238 void SetBytesPerSecond(int64_t bytes_per_second) override {
4239 (void)bytes_per_second;
4240 }
4241
4242 using RateLimiter::Request;
4243 void Request(const int64_t bytes, const Env::IOPriority pri,
4244 Statistics* stats) override {
4245 (void)bytes;
4246 (void)pri;
4247 (void)stats;
4248 }
4249
4250 int64_t GetSingleBurstBytes() const override { return 200; }
4251
4252 int64_t GetTotalBytesThrough(
4253 const Env::IOPriority pri = Env::IO_TOTAL) const override {
4254 (void)pri;
4255 return 0;
4256 }
4257
4258 int64_t GetTotalRequests(
4259 const Env::IOPriority pri = Env::IO_TOTAL) const override {
4260 (void)pri;
4261 return 0;
4262 }
4263
4264 int64_t GetBytesPerSecond() const override { return 0; }
4265 };
4266
4267 // Tests that a custom rate limiter that does not implement the optional
4268 // APIs (e.g., RateLimiter::GetTotalPendingRequests()) works fine with basic
4269 // RocksDB operations (e.g., Put, Get, Flush).
4270 TEST_F(DBTest, CustomedRateLimiterWithNoOptionalAPIImplTest) {
4271 Options options = CurrentOptions();
4272 options.rate_limiter.reset(new MockedRateLimiterWithNoOptionalAPIImpl());
4273 DestroyAndReopen(options);
4274 ASSERT_OK(Put("abc", "def"));
4275 ASSERT_EQ(Get("abc"), "def");
4276 ASSERT_OK(Flush());
4277 ASSERT_EQ(Get("abc"), "def");
4278 }
4279
4280 TEST_F(DBTest, TableOptionsSanitizeTest) {
4281 Options options = CurrentOptions();
4282 options.create_if_missing = true;
4283 DestroyAndReopen(options);
4284 ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
4285
4286 options.table_factory.reset(NewPlainTableFactory());
4287 options.prefix_extractor.reset(NewNoopTransform());
4288 Destroy(options);
4289 ASSERT_TRUE(!TryReopen(options).IsNotSupported());
4290
4291 // Test the prefix_extractor check when the hash index is used for a
4292 // block-based table.
4293 BlockBasedTableOptions to;
4294 to.index_type = BlockBasedTableOptions::kHashSearch;
4295 options = CurrentOptions();
4296 options.create_if_missing = true;
4297 options.table_factory.reset(NewBlockBasedTableFactory(to));
4298 ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
4299 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
4300 ASSERT_OK(TryReopen(options));
4301 }
4302
4303 TEST_F(DBTest, ConcurrentMemtableNotSupported) {
4304 Options options = CurrentOptions();
4305 options.allow_concurrent_memtable_write = true;
4306 options.soft_pending_compaction_bytes_limit = 0;
4307 options.hard_pending_compaction_bytes_limit = 100;
4308 options.create_if_missing = true;
4309
4310 DestroyDB(dbname_, options);
4311 options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
4312 ASSERT_NOK(TryReopen(options));
4313
4314 options.memtable_factory.reset(new SkipListFactory);
4315 ASSERT_OK(TryReopen(options));
4316
4317 ColumnFamilyOptions cf_options(options);
4318 cf_options.memtable_factory.reset(
4319 NewHashLinkListRepFactory(4, 0, 3, true, 4));
4320 ColumnFamilyHandle* handle;
4321 ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
4322 }
4323
4324 #endif // ROCKSDB_LITE
4325
4326 TEST_F(DBTest, SanitizeNumThreads) {
4327 for (int attempt = 0; attempt < 2; attempt++) {
4328 const size_t kTotalTasks = 8;
4329 test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
4330
4331 Options options = CurrentOptions();
4332 if (attempt == 0) {
4333 options.max_background_compactions = 3;
4334 options.max_background_flushes = 2;
4335 }
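    // Attempt 0 grows the env thread pools (3 LOW / 2 HIGH) through the
    // options; attempt 1 leaves the options at their defaults and relies on
    // the pools keeping those sizes, since option sanitization only ever
    // grows a pool, never shrinks it.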
4336 options.create_if_missing = true;
4337 DestroyAndReopen(options);
4338
4339 for (size_t i = 0; i < kTotalTasks; i++) {
4340 // Insert 4 tasks into the low-priority queue and 4 into the high-priority queue
4341 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
4342 &sleeping_tasks[i],
4343 (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
4344 }
4345
4346 // Wait up to 10s for them to be scheduled.
4347 for (int i = 0; i < 10000; i++) {
4348 if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
4349 options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
4350 break;
4351 }
4352 env_->SleepForMicroseconds(1000);
4353 }
4354
4355 // pool size 3, total tasks 4. Queue size should be 1.
4356 ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
4357 // pool size 2, total tasks 4. Queue size should be 2.
4358 ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
4359
4360 for (size_t i = 0; i < kTotalTasks; i++) {
4361 sleeping_tasks[i].WakeUp();
4362 sleeping_tasks[i].WaitUntilDone();
4363 }
4364
4365 ASSERT_OK(Put("abc", "def"));
4366 ASSERT_EQ("def", Get("abc"));
4367 ASSERT_OK(Flush());
4368 ASSERT_EQ("def", Get("abc"));
4369 }
4370 }
4371
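// Holds the DB mutex and an open write slot so that the concurrent Put and
// Flush threads must queue behind it; ending the write should let both
// threads complete without deadlocking.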
4372 TEST_F(DBTest, WriteSingleThreadEntry) {
4373 std::vector<port::Thread> threads;
4374 dbfull()->TEST_LockMutex();
4375 auto w = dbfull()->TEST_BeginWrite();
4376 threads.emplace_back([&] { ASSERT_OK(Put("a", "b")); });
4377 env_->SleepForMicroseconds(10000);
4378 threads.emplace_back([&] { ASSERT_OK(Flush()); });
4379 env_->SleepForMicroseconds(10000);
4380 dbfull()->TEST_UnlockMutex();
4381 dbfull()->TEST_LockMutex();
4382 dbfull()->TEST_EndWrite(w);
4383 dbfull()->TEST_UnlockMutex();
4384
4385 for (auto& t : threads) {
4386 t.join();
4387 }
4388 }
4389
4390 TEST_F(DBTest, ConcurrentFlushWAL) {
4391 const size_t cnt = 100;
4392 Options options;
4393 options.env = env_;
4394 WriteOptions wopt;
4395 ReadOptions ropt;
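  // Exercise FlushWAL() concurrently with writers under all four
  // combinations of two_write_queues and manual_wal_flush, then reopen to
  // verify the WAL replays cleanly.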
4396 for (bool two_write_queues : {false, true}) {
4397 for (bool manual_wal_flush : {false, true}) {
4398 options.two_write_queues = two_write_queues;
4399 options.manual_wal_flush = manual_wal_flush;
4400 options.create_if_missing = true;
4401 DestroyAndReopen(options);
4402 std::vector<port::Thread> threads;
4403 threads.emplace_back([&] {
4404 for (size_t i = 0; i < cnt; i++) {
4405 auto istr = std::to_string(i);
4406 ASSERT_OK(db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr,
4407 "b" + istr));
4408 }
4409 });
4410 if (two_write_queues) {
4411 threads.emplace_back([&] {
4412 for (size_t i = cnt; i < 2 * cnt; i++) {
4413 auto istr = std::to_string(i);
4414 WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
4415 wopt.protection_bytes_per_key,
4416 0 /* default_cf_ts_sz */);
4417 ASSERT_OK(batch.Put("a" + istr, "b" + istr));
4418 ASSERT_OK(
4419 dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true));
4420 }
4421 });
4422 }
4423 threads.emplace_back([&] {
4424 for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put
4425 ASSERT_OK(db_->FlushWAL(false));
4426 }
4427 });
4428 for (auto& t : threads) {
4429 t.join();
4430 }
4431 options.create_if_missing = false;
4432 // Recover from the WAL and make sure that it is not corrupted.
4433 Reopen(options);
4434 for (size_t i = 0; i < cnt; i++) {
4435 PinnableSlice pval;
4436 auto istr = std::to_string(i);
4437 ASSERT_OK(
4438 db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval));
4439 ASSERT_TRUE(pval == ("b" + istr));
4440 }
4441 }
4442 }
4443 }
4444
4445 // A failure in this test is only caught probabilistically.
4446 TEST_F(DBTest, ManualFlushWalAndWriteRace) {
4447 Options options;
4448 options.env = env_;
4449 options.manual_wal_flush = true;
4450 options.create_if_missing = true;
4451
4452 DestroyAndReopen(options);
4453
4454 WriteOptions wopts;
4455 wopts.sync = true;
4456
4457 port::Thread writeThread([&]() {
4458 for (int i = 0; i < 100; i++) {
4459 auto istr = std::to_string(i);
4460 ASSERT_OK(dbfull()->Put(wopts, "key_" + istr, "value_" + istr));
4461 }
4462 });
4463 port::Thread flushThread([&]() {
4464 for (int i = 0; i < 100; i++) {
4465 ASSERT_OK(dbfull()->FlushWAL(false));
4466 }
4467 });
4468
4469 writeThread.join();
4470 flushThread.join();
4471 ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1"));
4472 ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2"));
4473 Reopen(options);
4474 ASSERT_EQ("value1", Get("foo1"));
4475 ASSERT_EQ("value2", Get("foo2"));
4476 }
4477
4478 #ifndef ROCKSDB_LITE
4479 TEST_F(DBTest, DynamicMemtableOptions) {
4480 const uint64_t k64KB = 1 << 16;
4481 const uint64_t k128KB = 1 << 17;
4482 const uint64_t k5KB = 5 * 1024;
4483 Options options;
4484 options.env = env_;
4485 options.create_if_missing = true;
4486 options.compression = kNoCompression;
4487 options.max_background_compactions = 1;
4488 options.write_buffer_size = k64KB;
4489 options.arena_block_size = 16 * 1024;
4490 options.max_write_buffer_number = 2;
4491 // Don't trigger compact/slowdown/stop
4492 options.level0_file_num_compaction_trigger = 1024;
4493 options.level0_slowdown_writes_trigger = 1024;
4494 options.level0_stop_writes_trigger = 1024;
4495 DestroyAndReopen(options);
4496
4497 auto gen_l0_kb = [this](int size) {
4498 const int kNumPutsBeforeWaitForFlush = 64;
4499 Random rnd(301);
4500 for (int i = 0; i < size; i++) {
4501 ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
4502
4503 // The following condition prevents a race condition between flush jobs
4504 // acquiring work and this thread filling up multiple memtables. Without
4505 // this, the flush might produce less files than expected because
4506 // multiple memtables are flushed into a single L0 file. This race
4507 // condition affects assertion (A).
4508 if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
4509 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
4510 }
4511 }
4512 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
4513 };
4514
4515 // Test write_buffer_size
4516 gen_l0_kb(64);
4517 ASSERT_EQ(NumTableFilesAtLevel(0), 1);
4518 ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
4519 ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
4520
4521 // Clean up L0
4522 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
4523 ASSERT_EQ(NumTableFilesAtLevel(0), 0);
4524
4525 // Increase buffer size
4526 ASSERT_OK(dbfull()->SetOptions({
4527 {"write_buffer_size", "131072"},
4528 }));
4529
4530 // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
4531 // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
4532 gen_l0_kb(192);
4533 ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
4534 ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
4535 ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
4536
4537 // Decrease buffer size below current usage
4538 ASSERT_OK(dbfull()->SetOptions({
4539 {"write_buffer_size", "65536"},
4540 }));
4541 // The existing memtable became eligible for flush when we reduced its
4542 // capacity to 64KB. Two keys need to be added to trigger flush: first causes
4543 // memtable to be marked full, second schedules the flush. Then we should have
4544 // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
4545 gen_l0_kb(2);
4546 ASSERT_EQ(NumTableFilesAtLevel(0), 2);
4547 ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
4548 ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
4549
4550 // Test max_write_buffer_number
4551 // Block compaction thread, which will also block the flushes because
4552 // max_background_flushes == 0, so flushes are getting executed by the
4553 // compaction thread
4554 env_->SetBackgroundThreads(1, Env::LOW);
4555 test::SleepingBackgroundTask sleeping_task_low;
4556 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
4557 Env::Priority::LOW);
4558 // Start from scratch and disable compaction/flush. Flush can only happen
4559 // during compaction but trigger is pretty high
4560 options.disable_auto_compactions = true;
4561 DestroyAndReopen(options);
4562 env_->SetBackgroundThreads(0, Env::HIGH);
4563
4564 // Put until writes are stopped, bounded by 256 puts. We should see stop at
4565 // ~128KB
4566 int count = 0;
4567 Random rnd(301);
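  // With write_buffer_size = 64KB and max_write_buffer_number = 2, writes
  // stall after roughly 2 * 64 = 128 puts of ~1KB each, hence the
  // 128 +/- 20% bounds below.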
4568
4569 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
4570 "DBImpl::DelayWrite:Wait",
4571 [&](void* /*arg*/) { sleeping_task_low.WakeUp(); });
4572 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
4573
4574 while (!sleeping_task_low.WokenUp() && count < 256) {
4575 ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
4576 count++;
4577 }
4578 ASSERT_GT(static_cast<double>(count), 128 * 0.8);
4579 ASSERT_LT(static_cast<double>(count), 128 * 1.2);
4580
4581 sleeping_task_low.WaitUntilDone();
4582
4583 // Increase
4584 ASSERT_OK(dbfull()->SetOptions({
4585 {"max_write_buffer_number", "8"},
4586 }));
4587 // Clean up memtable and L0
4588 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
4589
4590 sleeping_task_low.Reset();
4591 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
4592 Env::Priority::LOW);
4593 count = 0;
4594 while (!sleeping_task_low.WokenUp() && count < 1024) {
4595 ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
4596 count++;
4597 }
4598 // Windows fails this test. Will tune in the future and figure out an
4599 // appropriate number.
4600 #ifndef OS_WIN
4601 ASSERT_GT(static_cast<double>(count), 512 * 0.8);
4602 ASSERT_LT(static_cast<double>(count), 512 * 1.2);
4603 #endif
4604 sleeping_task_low.WaitUntilDone();
4605
4606 // Decrease
4607 ASSERT_OK(dbfull()->SetOptions({
4608 {"max_write_buffer_number", "4"},
4609 }));
4610 // Clean up memtable and L0
4611 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
4612
4613 sleeping_task_low.Reset();
4614 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
4615 Env::Priority::LOW);
4616
4617 count = 0;
4618 while (!sleeping_task_low.WokenUp() && count < 1024) {
4619 ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
4620 count++;
4621 }
4622 // Windows fails this test. Will tune in the future and figure out an
4623 // appropriate number.
4624 #ifndef OS_WIN
4625 ASSERT_GT(static_cast<double>(count), 256 * 0.8);
4626 ASSERT_LT(static_cast<double>(count), 256 * 1.2);
4627 #endif
4628 sleeping_task_low.WaitUntilDone();
4629
4630 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
4631 }
4632 #endif // ROCKSDB_LITE
4633
4634 #ifdef ROCKSDB_USING_THREAD_STATUS
4635 namespace {
4636 void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
4637 int expected_count) {
4638 int op_count = 0;
4639 std::vector<ThreadStatus> thread_list;
4640 ASSERT_OK(env->GetThreadList(&thread_list));
4641 for (auto thread : thread_list) {
4642 if (thread.operation_type == op_type) {
4643 op_count++;
4644 }
4645 }
4646 ASSERT_EQ(op_count, expected_count);
4647 }
4648 } // anonymous namespace
4649
4650 TEST_F(DBTest, GetThreadStatus) {
4651 Options options;
4652 options.env = env_;
4653 options.enable_thread_tracking = true;
4654 ASSERT_OK(TryReopen(options));
4655
4656 std::vector<ThreadStatus> thread_list;
4657 Status s = env_->GetThreadList(&thread_list);
4658
4659 for (int i = 0; i < 2; ++i) {
4660 // repeat the test with different numbers of high / low priority threads
4661 const int kTestCount = 3;
4662 const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
4663 const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
4664 const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
4665 for (int test = 0; test < kTestCount; ++test) {
4666 // Change the number of threads in high / low priority pool.
4667 env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
4668 env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
4669 env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
4670 // Wait to ensure that all threads have been registered
4671 unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
4672 // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
4673 // all threads have been registered.
4674 // Try up to 60 seconds.
4675 for (int num_try = 0; num_try < 60000; num_try++) {
4676 env_->SleepForMicroseconds(1000);
4677 thread_list.clear();
4678 s = env_->GetThreadList(&thread_list);
4679 ASSERT_OK(s);
4680 memset(thread_type_counts, 0, sizeof(thread_type_counts));
4681 for (auto thread : thread_list) {
4682 ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
4683 thread_type_counts[thread.thread_type]++;
4684 }
4685 if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
4686 kHighPriCounts[test] &&
4687 thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
4688 kLowPriCounts[test] &&
4689 thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
4690 kBottomPriCounts[test]) {
4691 break;
4692 }
4693 }
4694 // Verify the number of high-priority threads
4695 ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
4696 kHighPriCounts[test]);
4697 // Verify the number of low-priority threads
4698 ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
4699 kLowPriCounts[test]);
4700 // Verify the number of bottom-priority threads
4701 ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
4702 kBottomPriCounts[test]);
4703 }
4704 if (i == 0) {
4705 // repeat the test with multiple column families
4706 CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
4707 env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
4708 true);
4709 }
4710 }
4711 ASSERT_OK(db_->DropColumnFamily(handles_[2]));
4712 delete handles_[2];
4713 handles_.erase(handles_.begin() + 2);
4714 env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
4715 true);
4716 Close();
4717 env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
4718 true);
4719 }
4720
4721 TEST_F(DBTest, DisableThreadStatus) {
4722 Options options;
4723 options.env = env_;
4724 options.enable_thread_tracking = false;
4725 ASSERT_OK(TryReopen(options));
4726 CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
4727 // Verify that none of the column family info exists
4728 env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
4729 false);
4730 }
4731
4732 TEST_F(DBTest, ThreadStatusFlush) {
4733 Options options;
4734 options.env = env_;
4735 options.write_buffer_size = 100000; // Small write buffer
4736 options.enable_thread_tracking = true;
4737 options = CurrentOptions(options);
4738
4739 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
4740 {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
4741 {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
4742 });
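  // Each LoadDependency pair is {predecessor, successor}: the successor sync
  // point blocks until the predecessor has been reached. This guarantees a
  // flush job exists before :1 fires and that the flush cannot write its L0
  // table until :2 has passed.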
4743 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
4744
4745 CreateAndReopenWithCF({"pikachu"}, options);
4746 VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
4747
4748 ASSERT_OK(Put(1, "foo", "v1"));
4749 ASSERT_EQ("v1", Get(1, "foo"));
4750 VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
4751
4752 uint64_t num_running_flushes = 0;
4753 ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
4754 &num_running_flushes));
4755 ASSERT_EQ(num_running_flushes, 0);
4756
4757 ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
4758 ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
4759
4760 // The first sync point is to make sure there's one flush job
4761 // running when we perform VerifyOperationCount().
4762 TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
4763 VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
4764 ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
4765 &num_running_flushes));
4766 ASSERT_EQ(num_running_flushes, 1);
4767 // This second sync point ensures the flush job does not complete
4768 // until we have performed VerifyOperationCount().
4769 TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
4770 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
4771 }
4772
4773 TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
4774 const int kTestKeySize = 16;
4775 const int kTestValueSize = 984;
4776 const int kEntrySize = kTestKeySize + kTestValueSize;
4777 const int kEntriesPerBuffer = 100;
4778 Options options;
4779 options.create_if_missing = true;
4780 options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
4781 options.compaction_style = kCompactionStyleLevel;
4782 options.target_file_size_base = options.write_buffer_size;
4783 options.max_bytes_for_level_base = options.target_file_size_base * 2;
4784 options.max_bytes_for_level_multiplier = 2;
4785 options.compression = kNoCompression;
4786 options = CurrentOptions(options);
4787 options.env = env_;
4788 options.enable_thread_tracking = true;
4789 const int kNumL0Files = 4;
4790 options.level0_file_num_compaction_trigger = kNumL0Files;
4791 options.max_subcompactions = max_subcompactions_;
4792
4793 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
4794 {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
4795 {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
4796 {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
4797 });
4798 for (int tests = 0; tests < 2; ++tests) {
4799 DestroyAndReopen(options);
4800 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
4801 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
4802
4803 Random rnd(301);
4804 // The Put Phase.
4805 for (int file = 0; file < kNumL0Files; ++file) {
4806 for (int key = 0; key < kEntriesPerBuffer; ++key) {
4807 ASSERT_OK(Put(std::to_string(key + file * kEntriesPerBuffer),
4808 rnd.RandomString(kTestValueSize)));
4809 }
4810 ASSERT_OK(Flush());
4811 }
4812 // This makes sure a compaction won't be scheduled until
4813 // we are done with the above Put Phase.
4814 uint64_t num_running_compactions = 0;
4815 ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
4816 &num_running_compactions));
4817 ASSERT_EQ(num_running_compactions, 0);
4818 TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
4819 ASSERT_GE(NumTableFilesAtLevel(0),
4820 options.level0_file_num_compaction_trigger);
4821
4822 // This makes sure at least one compaction is running.
4823 TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
4824
4825 if (options.enable_thread_tracking) {
4826 // Expect a single L0 to L1 compaction.
4827 VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
4828 } else {
4829 // If thread tracking is not enabled, compaction count should be 0.
4830 VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
4831 }
4832 ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
4833 &num_running_compactions));
4834 ASSERT_EQ(num_running_compactions, 1);
4835 // TODO(yhchiang): adding assert to verify each compaction stage.
4836 TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
4837
4838 // Repeat the test with thread tracking disabled.
4839 options.enable_thread_tracking = false;
4840 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
4841 }
4842 }
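
// Note on the loop above: the first iteration runs with thread tracking
// enabled (expecting one tracked compaction) and the second with it disabled
// (expecting a tracked count of zero), while the kNumRunningCompactions
// property reports 1 in both cases since it does not depend on tracking.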
4843
4844 TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
4845 Options options = CurrentOptions();
4846 options.max_subcompactions = max_subcompactions_;
4847 CreateAndReopenWithCF({"pikachu"}, options);
4848
4849 // iter - 0 with 7 levels
4850 // iter - 1 with 3 levels
4851 for (int iter = 0; iter < 2; ++iter) {
4852 MakeTables(3, "p", "q", 1);
4853 ASSERT_EQ("1,1,1", FilesPerLevel(1));
4854
4855 // Compaction range falls before files
4856 Compact(1, "", "c");
4857 ASSERT_EQ("1,1,1", FilesPerLevel(1));
4858
4859 // Compaction range falls after files
4860 Compact(1, "r", "z");
4861 ASSERT_EQ("1,1,1", FilesPerLevel(1));
4862
4863 // Compaction range overlaps files
4864 Compact(1, "p", "q");
4865 ASSERT_EQ("0,0,1", FilesPerLevel(1));
4866
4867 // Populate a different range
4868 MakeTables(3, "c", "e", 1);
4869 ASSERT_EQ("1,1,2", FilesPerLevel(1));
4870
4871 // Compact just the new range
4872 Compact(1, "b", "f");
4873 ASSERT_EQ("0,0,2", FilesPerLevel(1));
4874
4875 // Compact all
4876 MakeTables(1, "a", "z", 1);
4877 ASSERT_EQ("1,0,2", FilesPerLevel(1));
4878 CancelAllBackgroundWork(db_);
4879 ASSERT_TRUE(
4880 db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)
4881 .IsShutdownInProgress());
4882 ASSERT_EQ("1,0,2", FilesPerLevel(1));
4883
4884 if (iter == 0) {
4885 options = CurrentOptions();
4886 options.num_levels = 3;
4887 options.create_if_missing = true;
4888 DestroyAndReopen(options);
4889 CreateAndReopenWithCF({"pikachu"}, options);
4890 }
4891 }
4892 }
4893
4894 TEST_F(DBTest, PreShutdownFlush) {
4895 Options options = CurrentOptions();
4896 CreateAndReopenWithCF({"pikachu"}, options);
4897 ASSERT_OK(Put(1, "key", "value"));
4898 CancelAllBackgroundWork(db_);
4899 Status s =
4900 db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
4901 ASSERT_TRUE(s.IsShutdownInProgress());
4902 }
4903
4904 TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
4905 const int kTestKeySize = 16;
4906 const int kTestValueSize = 984;
4907 const int kEntrySize = kTestKeySize + kTestValueSize;
4908 const int kEntriesPerBuffer = 40;
4909 const int kNumL0Files = 4;
4910
4911 const int kHighPriCount = 3;
4912 const int kLowPriCount = 5;
4913 env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
4914 env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
4915
4916 Options options;
4917 options.create_if_missing = true;
4918 options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
4919 options.compaction_style = kCompactionStyleLevel;
4920 options.target_file_size_base = options.write_buffer_size;
4921 options.max_bytes_for_level_base =
4922 options.target_file_size_base * kNumL0Files;
4923 options.compression = kNoCompression;
4924 options = CurrentOptions(options);
4925 options.env = env_;
4926 options.enable_thread_tracking = true;
4927 options.level0_file_num_compaction_trigger = kNumL0Files;
4928 options.max_bytes_for_level_multiplier = 2;
4929 options.max_background_compactions = kLowPriCount;
4930 options.level0_stop_writes_trigger = 1 << 10;
4931 options.level0_slowdown_writes_trigger = 1 << 10;
4932 options.max_subcompactions = max_subcompactions_;
4933
4934 TryReopen(options);
4935 Random rnd(301);
4936
4937 std::vector<ThreadStatus> thread_list;
4938 // Delay both flush and compaction
4939 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
4940 {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
4941 {"CompactionJob::Run():Start",
4942 "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
4943 {"CompactionJob::Run():Start",
4944 "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
4945 {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
4946 "CompactionJob::Run():End"},
4947 {"CompactionJob::Run():End",
4948 "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
4949
4950 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
4951
4952 // Make rocksdb busy
4953 int key = 0;
4954 // check how many threads are doing compaction using GetThreadList
4955 int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
4956 for (int file = 0; file < 16 * kNumL0Files; ++file) {
4957 for (int k = 0; k < kEntriesPerBuffer; ++k) {
4958 ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
4959 }
4960
4961 ASSERT_OK(env_->GetThreadList(&thread_list));
4962 for (auto thread : thread_list) {
4963 operation_count[thread.operation_type]++;
4964 }
4965
4966 // Speed up the test
4967 if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
4968 operation_count[ThreadStatus::OP_COMPACTION] >
4969 0.6 * options.max_background_compactions) {
4970 break;
4971 }
4972 if (file == 15 * kNumL0Files) {
4973 TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
4974 }
4975 }
4976
4977 TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
4978 ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
4979 CancelAllBackgroundWork(db_);
4980 TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
4981 ASSERT_OK(dbfull()->TEST_WaitForCompact());
4982 // Re-count operations to verify that no compaction is running after shutdown.
4983 for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
4984 operation_count[i] = 0;
4985 }
4986 ASSERT_OK(env_->GetThreadList(&thread_list));
4987 for (auto thread : thread_list) {
4988 operation_count[thread.operation_type]++;
4989 }
4990 ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
4991 }
4992
4993 TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
4994 const int kTestKeySize = 16;
4995 const int kTestValueSize = 984;
4996 const int kEntrySize = kTestKeySize + kTestValueSize;
4997 const int kEntriesPerBuffer = 40;
4998 const int kNumL0Files = 4;
4999
5000 const int kHighPriCount = 3;
5001 const int kLowPriCount = 5;
5002 env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
5003 env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
5004
5005 Options options;
5006 options.create_if_missing = true;
5007 options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
5008 options.compaction_style = kCompactionStyleLevel;
5009 options.target_file_size_base = options.write_buffer_size;
5010 options.max_bytes_for_level_base =
5011 options.target_file_size_base * kNumL0Files;
5012 options.compression = kNoCompression;
5013 options = CurrentOptions(options);
5014 options.env = env_;
5015 options.enable_thread_tracking = true;
5016 options.level0_file_num_compaction_trigger = kNumL0Files;
5017 options.max_bytes_for_level_multiplier = 2;
5018 options.max_background_compactions = kLowPriCount;
5019 options.level0_stop_writes_trigger = 1 << 10;
5020 options.level0_slowdown_writes_trigger = 1 << 10;
5021 options.max_subcompactions = max_subcompactions_;
5022
5023 TryReopen(options);
5024 Random rnd(301);
5025
5026 std::vector<ThreadStatus> thread_list;
5027 // Delay both flush and compaction
5028 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
5029 {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
5030 "CompactionJob::Run():Inprogress"},
5031 {"CompactionJob::Run():Start",
5032 "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
5033 {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
5034 {"CompactionJob::Run():End",
5035 "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
5036
5037 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5038
5039 // Make rocksdb busy
5040 int key = 0;
5041 // check how many threads are doing compaction using GetThreadList
5042 int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
5043 for (int file = 0; file < 16 * kNumL0Files; ++file) {
5044 for (int k = 0; k < kEntriesPerBuffer; ++k) {
5045 ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
5046 }
5047
5048 ASSERT_OK(env_->GetThreadList(&thread_list));
5049 for (auto thread : thread_list) {
5050 operation_count[thread.operation_type]++;
5051 }
5052
5053 // Speed up the test
5054 if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
5055 operation_count[ThreadStatus::OP_COMPACTION] >
5056 0.6 * options.max_background_compactions) {
5057 break;
5058 }
5059 if (file == 15 * kNumL0Files) {
5060 TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
5061 }
5062 }
5063
5064 ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
5065 CancelAllBackgroundWork(db_);
5066 TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
5067 TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
5068 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5069 // Re-count operations to verify that no compaction is running after shutdown.
5070 for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
5071 operation_count[i] = 0;
5072 }
5073 ASSERT_OK(env_->GetThreadList(&thread_list));
5074 for (auto thread : thread_list) {
5075 operation_count[thread.operation_type]++;
5076 }
5077 ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
5078 }
5079
5080 #endif // ROCKSDB_USING_THREAD_STATUS
5081
5082 #ifndef ROCKSDB_LITE
5083 TEST_F(DBTest, FlushOnDestroy) {
5084 WriteOptions wo;
5085 wo.disableWAL = true;
5086 ASSERT_OK(Put("foo", "v1", wo));
5087 CancelAllBackgroundWork(db_);
5088 }
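
// The point of FlushOnDestroy above: with the WAL disabled, the memtable
// holds the only copy of "foo", so correctness depends on the DB flushing it
// during destruction even after background work has been cancelled.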
5089
5090 TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
5091 if (!Snappy_Supported()) {
5092 return;
5093 }
5094 const int kNKeys = 120;
5095 int keys[kNKeys];
5096 for (int i = 0; i < kNKeys; i++) {
5097 keys[i] = i;
5098 }
5099 RandomShuffle(std::begin(keys), std::end(keys));
5100
5101 Random rnd(301);
5102 Options options;
5103 options.env = env_;
5104 options.create_if_missing = true;
5105 options.db_write_buffer_size = 20480;
5106 options.write_buffer_size = 20480;
5107 options.max_write_buffer_number = 2;
5108 options.level0_file_num_compaction_trigger = 2;
5109 options.level0_slowdown_writes_trigger = 2;
5110 options.level0_stop_writes_trigger = 2;
5111 options.target_file_size_base = 20480;
5112 options.level_compaction_dynamic_level_bytes = true;
5113 options.max_bytes_for_level_base = 102400;
5114 options.max_bytes_for_level_multiplier = 4;
5115 options.max_background_compactions = 1;
5116 options.num_levels = 5;
5117
5118 options.compression_per_level.resize(3);
5119 options.compression_per_level[0] = kNoCompression;
5120 options.compression_per_level[1] = kNoCompression;
5121 options.compression_per_level[2] = kSnappyCompression;
5122
5123 OnFileDeletionListener* listener = new OnFileDeletionListener();
5124 options.listeners.emplace_back(listener);
5125
5126 DestroyAndReopen(options);
5127
5128 // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
5129 // be compressed, so total data size should be more than 80K.
5130 for (int i = 0; i < 20; i++) {
5131 ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
5132 }
5133 ASSERT_OK(Flush());
5134 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5135
5136 ASSERT_EQ(NumTableFilesAtLevel(1), 0);
5137 ASSERT_EQ(NumTableFilesAtLevel(2), 0);
5138 ASSERT_EQ(NumTableFilesAtLevel(3), 0);
5139 // Assuming each file's metadata is at least 50 bytes.
5140 ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
5141
5142 // Insert 400KB. Some data will be compressed
5143 for (int i = 21; i < 120; i++) {
5144 ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
5145 }
5146 ASSERT_OK(Flush());
5147 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5148 ASSERT_EQ(NumTableFilesAtLevel(1), 0);
5149 ASSERT_EQ(NumTableFilesAtLevel(2), 0);
5150
5151 ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
5152 120U * 4000U + 50U * 24);
5153 // Make sure the data in L3 files is not compacted away: remove all files
5154 // in L4, then count the number of rows.
5155 ASSERT_OK(dbfull()->SetOptions({
5156 {"disable_auto_compactions", "true"},
5157 }));
5158 ColumnFamilyMetaData cf_meta;
5159 db_->GetColumnFamilyMetaData(&cf_meta);
5160 for (auto file : cf_meta.levels[4].files) {
5161 listener->SetExpectedFileName(dbname_ + file.name);
5162 ASSERT_OK(dbfull()->DeleteFile(file.name));
5163 }
5164 listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
5165
5166 int num_keys = 0;
5167 std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
5168 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
5169 num_keys++;
5170 }
5171 ASSERT_OK(iter->status());
5172 ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
5173 }
5174
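// A note on the mapping exercised by the test above and the one below: with
// level_compaction_dynamic_level_bytes, compression_per_level appears to be
// applied relative to the base level rather than by absolute level number.
// With num_levels = 5 and three entries, L0 takes entry 0, the base level
// takes entry 1, and the level below it takes entry 2, which is why the
// codec assignments shift when the base level moves from L4 to L3.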
5175 TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
5176 if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
5177 return;
5178 }
5179 const int kNKeys = 500;
5180 int keys[kNKeys];
5181 for (int i = 0; i < kNKeys; i++) {
5182 keys[i] = i;
5183 }
5184 RandomShuffle(std::begin(keys), std::end(keys));
5185
5186 Random rnd(301);
5187 Options options;
5188 options.create_if_missing = true;
5189 options.db_write_buffer_size = 6000000;
5190 options.write_buffer_size = 600000;
5191 options.max_write_buffer_number = 2;
5192 options.level0_file_num_compaction_trigger = 2;
5193 options.level0_slowdown_writes_trigger = 2;
5194 options.level0_stop_writes_trigger = 2;
5195 options.soft_pending_compaction_bytes_limit = 1024 * 1024;
5196 options.target_file_size_base = 20;
5197 options.env = env_;
5198 options.level_compaction_dynamic_level_bytes = true;
5199 options.max_bytes_for_level_base = 200;
5200 options.max_bytes_for_level_multiplier = 8;
5201 options.max_background_compactions = 1;
5202 options.num_levels = 5;
5203 std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
5204 options.table_factory = mtf;
5205
5206 options.compression_per_level.resize(3);
5207 options.compression_per_level[0] = kNoCompression;
5208 options.compression_per_level[1] = kLZ4Compression;
5209 options.compression_per_level[2] = kZlibCompression;
5210
5211 DestroyAndReopen(options);
5212 // When base level is L4, L4 is LZ4.
5213 std::atomic<int> num_zlib(0);
5214 std::atomic<int> num_lz4(0);
5215 std::atomic<int> num_no(0);
5216 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5217 "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
5218 Compaction* compaction = reinterpret_cast<Compaction*>(arg);
5219 if (compaction->output_level() == 4) {
5220 ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
5221 num_lz4.fetch_add(1);
5222 }
5223 });
5224 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5225 "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
5226 auto* compression = reinterpret_cast<CompressionType*>(arg);
5227 ASSERT_TRUE(*compression == kNoCompression);
5228 num_no.fetch_add(1);
5229 });
5230 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5231
5232 for (int i = 0; i < 100; i++) {
5233 std::string value = rnd.RandomString(200);
5234 ASSERT_OK(Put(Key(keys[i]), value));
5235 if (i % 25 == 24) {
5236 ASSERT_OK(Flush());
5237 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5238 }
5239 }
5240
5241 ASSERT_OK(Flush());
5242 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
5243 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5244 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5245 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5246
5247 ASSERT_EQ(NumTableFilesAtLevel(1), 0);
5248 ASSERT_EQ(NumTableFilesAtLevel(2), 0);
5249 ASSERT_EQ(NumTableFilesAtLevel(3), 0);
5250 ASSERT_GT(NumTableFilesAtLevel(4), 0);
5251 ASSERT_GT(num_no.load(), 2);
5252 ASSERT_GT(num_lz4.load(), 0);
5253 int prev_num_files_l4 = NumTableFilesAtLevel(4);
5254
5255 // After the base level turns from L4 to L3, L3 becomes LZ4 and L4 becomes Zlib.
5256 num_lz4.store(0);
5257 num_no.store(0);
5258 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5259 "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
5260 Compaction* compaction = reinterpret_cast<Compaction*>(arg);
5261 if (compaction->output_level() == 4 && compaction->start_level() == 3) {
5262 ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
5263 num_zlib.fetch_add(1);
5264 } else {
5265 ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
5266 num_lz4.fetch_add(1);
5267 }
5268 });
5269 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5270 "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
5271 auto* compression = reinterpret_cast<CompressionType*>(arg);
5272 ASSERT_TRUE(*compression == kNoCompression);
5273 num_no.fetch_add(1);
5274 });
5275 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5276
5277 for (int i = 101; i < 500; i++) {
5278 std::string value = rnd.RandomString(200);
5279 ASSERT_OK(Put(Key(keys[i]), value));
5280 if (i % 100 == 99) {
5281 ASSERT_OK(Flush());
5282 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5283 }
5284 }
5285
5286 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5287 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5288 ASSERT_EQ(NumTableFilesAtLevel(1), 0);
5289 ASSERT_EQ(NumTableFilesAtLevel(2), 0);
5290 ASSERT_GT(NumTableFilesAtLevel(3), 0);
5291 ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
5292 ASSERT_GT(num_no.load(), 2);
5293 ASSERT_GT(num_lz4.load(), 0);
5294 ASSERT_GT(num_zlib.load(), 0);
5295 }
5296
5297 TEST_F(DBTest, DynamicCompactionOptions) {
5298 // minimum write buffer size is enforced at 64KB
5299 const uint64_t k32KB = 1 << 15;
5300 const uint64_t k64KB = 1 << 16;
5301 const uint64_t k128KB = 1 << 17;
5302 const uint64_t k1MB = 1 << 20;
5303 const uint64_t k4KB = 1 << 12;
5304 Options options;
5305 options.env = env_;
5306 options.create_if_missing = true;
5307 options.compression = kNoCompression;
5308 options.soft_pending_compaction_bytes_limit = 1024 * 1024;
5309 options.write_buffer_size = k64KB;
5310 options.arena_block_size = 4 * k4KB;
5311 options.max_write_buffer_number = 2;
5312 // Compaction related options
5313 options.level0_file_num_compaction_trigger = 3;
5314 options.level0_slowdown_writes_trigger = 4;
5315 options.level0_stop_writes_trigger = 8;
5316 options.target_file_size_base = k64KB;
5317 options.max_compaction_bytes = options.target_file_size_base * 10;
5318 options.target_file_size_multiplier = 1;
5319 options.max_bytes_for_level_base = k128KB;
5320 options.max_bytes_for_level_multiplier = 4;
5321
5322 // Block flush thread and disable compaction thread
5323 env_->SetBackgroundThreads(1, Env::LOW);
5324 env_->SetBackgroundThreads(1, Env::HIGH);
5325 DestroyAndReopen(options);
5326
5327 auto gen_l0_kb = [this](int start, int size, int stride) {
5328 Random rnd(301);
5329 for (int i = 0; i < size; i++) {
5330 ASSERT_OK(Put(Key(start + stride * i), rnd.RandomString(1024)));
5331 }
5332 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
5333 };
5334
5335 // Write 3 files that have the same key range.
5336 // Since level0_file_num_compaction_trigger is 3, compaction should be
5337 // triggered. The compaction should result in one L1 file
5338 gen_l0_kb(0, 64, 1);
5339 ASSERT_EQ(NumTableFilesAtLevel(0), 1);
5340 gen_l0_kb(0, 64, 1);
5341 ASSERT_EQ(NumTableFilesAtLevel(0), 2);
5342 gen_l0_kb(0, 64, 1);
5343 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5344 ASSERT_EQ("0,1", FilesPerLevel());
5345 std::vector<LiveFileMetaData> metadata;
5346 db_->GetLiveFilesMetaData(&metadata);
5347 ASSERT_EQ(1U, metadata.size());
5348 ASSERT_LE(metadata[0].size, k64KB + k4KB);
5349 ASSERT_GE(metadata[0].size, k64KB - k4KB);
5350
5351 // Test compaction trigger and target_file_size_base
5352 // Reduce compaction trigger to 2, and reduce L1 file size to 32KB.
5353 // Writing two 64KB L0 files should trigger a compaction. Since these
5354 // two L0 files have the same key range, the compaction merges them and
5355 // should result in two 32KB L1 files.
5356 ASSERT_OK(
5357 dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
5358 {"target_file_size_base", std::to_string(k32KB)}}));
5359
5360 gen_l0_kb(0, 64, 1);
5361 ASSERT_EQ("1,1", FilesPerLevel());
5362 gen_l0_kb(0, 64, 1);
5363 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5364 ASSERT_EQ("0,2", FilesPerLevel());
5365 metadata.clear();
5366 db_->GetLiveFilesMetaData(&metadata);
5367 ASSERT_EQ(2U, metadata.size());
5368 ASSERT_LE(metadata[0].size, k32KB + k4KB);
5369 ASSERT_GE(metadata[0].size, k32KB - k4KB);
5370 ASSERT_LE(metadata[1].size, k32KB + k4KB);
5371 ASSERT_GE(metadata[1].size, k32KB - k4KB);
5372
5373 // Test max_bytes_for_level_base
5374 // Increase the level base size to 1MB and write enough data to
5375 // fill L1 and L2. L1 size should be around 1MB while L2 size should be
5376 // around 1MB x 4, matching the assertions below.
5377 ASSERT_OK(dbfull()->SetOptions(
5378 {{"max_bytes_for_level_base", std::to_string(k1MB)}}));
5379
5380 // writing 96 x 64KB => 6 * 1024KB
5381 // (L1 + L2) = (1 + 4) * 1024KB
5382 for (int i = 0; i < 96; ++i) {
5383 gen_l0_kb(i, 64, 96);
5384 }
5385 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5386 ASSERT_GT(SizeAtLevel(1), k1MB / 2);
5387 ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
5388
5389 // Within (0.5, 1.5) of 4MB.
5390 ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
5391 ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
5392
5393 // Test max_bytes_for_level_multiplier and
5394 // max_bytes_for_level_base. Now reduce both the multiplier and the level
5395 // base. After filling enough data to fit in L1 - L3, we should see the L1
5396 // size shrink to around 128KB from the ~1MB asserted previously. Same for L2.
5397 ASSERT_OK(dbfull()->SetOptions(
5398 {{"max_bytes_for_level_multiplier", "2"},
5399 {"max_bytes_for_level_base", std::to_string(k128KB)}}));
5400
5401 // writing 20 x 64KB = 10 x 128KB
5402 // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
5403 for (int i = 0; i < 20; ++i) {
5404 gen_l0_kb(i, 64, 32);
5405 }
5406 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5407 uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
5408 ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
5409
5410 // Test level0_stop_writes_trigger.
5411 // Clean up the memtable and L0, then block the compaction threads. If we
5412 // continue to write and flush memtables, we should see puts stop after 8
5413 // memtable flushes since level0_stop_writes_trigger = 8.
5414 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
5415 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5416 // Block compaction
5417 test::SleepingBackgroundTask sleeping_task_low;
5418 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
5419 Env::Priority::LOW);
5420 sleeping_task_low.WaitUntilSleeping();
5421 ASSERT_EQ(NumTableFilesAtLevel(0), 0);
5422 int count = 0;
5423 Random rnd(301);
5424 WriteOptions wo;
5425 while (count < 64) {
5426 ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
5427 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
5428 count++;
5429 if (dbfull()->TEST_write_controler().IsStopped()) {
5430 sleeping_task_low.WakeUp();
5431 break;
5432 }
5433 }
5434 // Stop trigger = 8
5435 ASSERT_EQ(count, 8);
5436 // Unblock
5437 sleeping_task_low.WaitUntilDone();
5438
5439 // Now reduce level0_stop_writes_trigger to 6. Clean up memtables and L0.
5440 // Block the compaction thread again. Perform puts and memtable flushes
5441 // until we see the stop after 6 memtable flushes.
5442 ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}}));
5443 ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
5444 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5445 ASSERT_EQ(NumTableFilesAtLevel(0), 0);
5446
5447 // Block compaction again
5448 sleeping_task_low.Reset();
5449 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
5450 Env::Priority::LOW);
5451 sleeping_task_low.WaitUntilSleeping();
5452 count = 0;
5453 while (count < 64) {
5454 ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
5455 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
5456 count++;
5457 if (dbfull()->TEST_write_controler().IsStopped()) {
5458 sleeping_task_low.WakeUp();
5459 break;
5460 }
5461 }
5462 ASSERT_EQ(count, 6);
5463 // Unblock
5464 sleeping_task_low.WaitUntilDone();
5465
5466 // Test disable_auto_compactions
5467 // The compaction thread is unblocked but auto compaction is disabled. Write
5468 // 4 L0 files, which would normally trigger a compaction. Since auto
5469 // compaction is disabled, TEST_WaitForCompact waits for nothing, and the
5470 // number of L0 files does not change after the call.
5471 ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
5472 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5473 ASSERT_EQ(NumTableFilesAtLevel(0), 0);
5474
5475 for (int i = 0; i < 4; ++i) {
5476 ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
5477 // Wait for compaction so that put won't stop
5478 ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
5479 }
5480 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5481 ASSERT_EQ(NumTableFilesAtLevel(0), 4);
5482
5483 // Enable auto compaction and perform the same test, # of L0 files should be
5484 // reduced after compaction.
5485 ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
5486 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5487 ASSERT_EQ(NumTableFilesAtLevel(0), 0);
5488
5489 for (int i = 0; i < 4; ++i) {
5490 ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
5491 // Wait for compaction so that put won't stop
5492 ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
5493 }
5494 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5495 ASSERT_LT(NumTableFilesAtLevel(0), 4);
5496 }
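
// A compact distillation (hypothetical test name, illustrative option values)
// of the SetOptions() round trip the test above relies on: mutable options
// take effect without reopening the DB and can be read back via GetOptions().
TEST_F(DBTest, SetOptionsRoundTripSketch) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  DestroyAndReopen(options);
  ASSERT_OK(dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
                                  {"disable_auto_compactions", "true"}}));
  ASSERT_EQ(dbfull()->GetOptions().level0_file_num_compaction_trigger, 2);
  ASSERT_TRUE(dbfull()->GetOptions().disable_auto_compactions);
}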
5497
5498 // Test dynamic FIFO compaction options.
5499 // This test covers option parsing only and makes sure that the options are
5500 // correctly assigned. See also DBOptionsTest.SetFIFOCompactionOptions,
5501 // which makes sure that the FIFO compaction functionality works as
5502 // expected when the options are changed dynamically.
5503 // More FIFO compaction tests live at DBTest.FIFOCompaction*.
5504 TEST_F(DBTest, DynamicFIFOCompactionOptions) {
5505 Options options;
5506 options.ttl = 0;
5507 options.create_if_missing = true;
5508 options.env = env_;
5509 DestroyAndReopen(options);
5510
5511 // Initial defaults
5512 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
5513 1024 * 1024 * 1024);
5514 ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
5515 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
5516 false);
5517
5518 ASSERT_OK(dbfull()->SetOptions(
5519 {{"compaction_options_fifo", "{max_table_files_size=23;}"}}));
5520 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
5521 23);
5522 ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
5523 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
5524 false);
5525
5526 ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}}));
5527 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
5528 23);
5529 ASSERT_EQ(dbfull()->GetOptions().ttl, 97);
5530 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
5531 false);
5532
5533 ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}}));
5534 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
5535 23);
5536 ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
5537 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
5538 false);
5539
5540 ASSERT_OK(dbfull()->SetOptions(
5541 {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
5542 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
5543 23);
5544 ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
5545 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
5546 true);
5547
5548 ASSERT_OK(dbfull()->SetOptions(
5549 {{"compaction_options_fifo", "{max_table_files_size=31;}"}}));
5550 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
5551 31);
5552 ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
5553 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
5554 true);
5555
5556 ASSERT_OK(dbfull()->SetOptions(
5557 {{"compaction_options_fifo",
5558 "{max_table_files_size=51;allow_compaction=true;}"}}));
5559 ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}}));
5560 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
5561 51);
5562 ASSERT_EQ(dbfull()->GetOptions().ttl, 49);
5563 ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
5564 true);
5565 }
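
// Note on the nested option syntax above ("{name=value;name=value;}"): as the
// assertions show, fields not mentioned inside the braces keep their previous
// values, which is why max_table_files_size survives the update that sets
// only allow_compaction.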
5566
5567 TEST_F(DBTest, DynamicUniversalCompactionOptions) {
5568 Options options;
5569 options.create_if_missing = true;
5570 options.env = env_;
5571 DestroyAndReopen(options);
5572
5573 // Initial defaults
5574 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U);
5575 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
5576 2u);
5577 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
5578 UINT_MAX);
5579 ASSERT_EQ(dbfull()
5580 ->GetOptions()
5581 .compaction_options_universal.max_size_amplification_percent,
5582 200u);
5583 ASSERT_EQ(dbfull()
5584 ->GetOptions()
5585 .compaction_options_universal.compression_size_percent,
5586 -1);
5587 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
5588 kCompactionStopStyleTotalSize);
5589 ASSERT_EQ(
5590 dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
5591 false);
5592
5593 ASSERT_OK(dbfull()->SetOptions(
5594 {{"compaction_options_universal", "{size_ratio=7;}"}}));
5595 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
5596 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
5597 2u);
5598 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
5599 UINT_MAX);
5600 ASSERT_EQ(dbfull()
5601 ->GetOptions()
5602 .compaction_options_universal.max_size_amplification_percent,
5603 200u);
5604 ASSERT_EQ(dbfull()
5605 ->GetOptions()
5606 .compaction_options_universal.compression_size_percent,
5607 -1);
5608 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
5609 kCompactionStopStyleTotalSize);
5610 ASSERT_EQ(
5611 dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
5612 false);
5613
5614 ASSERT_OK(dbfull()->SetOptions(
5615 {{"compaction_options_universal", "{min_merge_width=11;}"}}));
5616 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
5617 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
5618 11u);
5619 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
5620 UINT_MAX);
5621 ASSERT_EQ(dbfull()
5622 ->GetOptions()
5623 .compaction_options_universal.max_size_amplification_percent,
5624 200u);
5625 ASSERT_EQ(dbfull()
5626 ->GetOptions()
5627 .compaction_options_universal.compression_size_percent,
5628 -1);
5629 ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
5630 kCompactionStopStyleTotalSize);
5631 ASSERT_EQ(
5632 dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
5633 false);
5634 }
5635 #endif // ROCKSDB_LITE
5636
5637 TEST_F(DBTest, FileCreationRandomFailure) {
5638 Options options;
5639 options.env = env_;
5640 options.create_if_missing = true;
5641 options.write_buffer_size = 100000; // Small write buffer
5642 options.target_file_size_base = 200000;
5643 options.max_bytes_for_level_base = 1000000;
5644 options.max_bytes_for_level_multiplier = 2;
5645
5646 DestroyAndReopen(options);
5647 Random rnd(301);
5648
5649 constexpr int kCDTKeysPerBuffer = 4;
5650 constexpr int kTestSize = kCDTKeysPerBuffer * 4096;
5651 constexpr int kTotalIteration = 20;
5652 // The second half of the test involves random failures
5653 // of file creation.
5654 constexpr int kRandomFailureTest = kTotalIteration / 2;
5655
5656 std::vector<std::string> values;
5657 for (int i = 0; i < kTestSize; ++i) {
5658 values.push_back("NOT_FOUND");
5659 }
5660 for (int j = 0; j < kTotalIteration; ++j) {
5661 if (j == kRandomFailureTest) {
5662 env_->non_writeable_rate_.store(90);
5663 }
5664 for (int k = 0; k < kTestSize; ++k) {
5665 // Here we expect some of the Puts to fail.
5666 std::string value = rnd.RandomString(100);
5667 Status s = Put(Key(k), Slice(value));
5668 if (s.ok()) {
5669 // update the latest successful put
5670 values[k] = value;
5671 }
5672 // But everything before the simulated failures begin should succeed.
5673 if (j < kRandomFailureTest) {
5674 ASSERT_OK(s);
5675 }
5676 }
5677 }
5678
5679 // If rocksdb does not do its job correctly, an internal assert will fail here.
5680 ASSERT_TRUE(dbfull()->TEST_WaitForFlushMemTable().IsIOError());
5681 ASSERT_TRUE(dbfull()->TEST_WaitForCompact().IsIOError());
5682
5683 // verify we have the latest successful update
5684 for (int k = 0; k < kTestSize; ++k) {
5685 auto v = Get(Key(k));
5686 ASSERT_EQ(v, values[k]);
5687 }
5688
5689 // reopen and reverify we have the latest successful update
5690 env_->non_writeable_rate_.store(0);
5691 Reopen(options);
5692 for (int k = 0; k < kTestSize; ++k) {
5693 auto v = Get(Key(k));
5694 ASSERT_EQ(v, values[k]);
5695 }
5696 }
5697
5698 #ifndef ROCKSDB_LITE
5699
5700 TEST_F(DBTest, DynamicMiscOptions) {
5701 // Test max_sequential_skip_in_iterations
5702 Options options;
5703 options.env = env_;
5704 options.create_if_missing = true;
5705 options.max_sequential_skip_in_iterations = 16;
5706 options.compression = kNoCompression;
5707 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
5708 DestroyAndReopen(options);
5709
5710 auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
5711 int key0 = key_start;
5712 int key1 = key_start + 1;
5713 int key2 = key_start + 2;
5714 Random rnd(301);
5715 ASSERT_OK(Put(Key(key0), rnd.RandomString(8)));
5716 for (int i = 0; i < 10; ++i) {
5717 ASSERT_OK(Put(Key(key1), rnd.RandomString(8)));
5718 }
5719 ASSERT_OK(Put(Key(key2), rnd.RandomString(8)));
5720 std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
5721 iter->Seek(Key(key1));
5722 ASSERT_TRUE(iter->Valid());
5723 ASSERT_EQ(iter->key().compare(Key(key1)), 0);
5724 iter->Next();
5725 ASSERT_TRUE(iter->Valid());
5726 ASSERT_EQ(iter->key().compare(Key(key2)), 0);
5727 ASSERT_EQ(num_reseek,
5728 TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
5729 };
5730 // No reseek
5731 assert_reseek_count(100, 0);
5732
5733 ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
5734 // Clear memtable and make new option effective
5735 ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
5736 // Trigger reseek
5737 assert_reseek_count(200, 1);
5738
5739 ASSERT_OK(
5740 dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
5741 // Clear memtable and make new option effective
5742 ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
5743 // No new reseek (the ticker is cumulative, so it should stay at 1)
5744 assert_reseek_count(300, 1);
5745
5746 MutableCFOptions mutable_cf_options;
5747 CreateAndReopenWithCF({"pikachu"}, options);
5748 // Test soft_pending_compaction_bytes_limit,
5749 // hard_pending_compaction_bytes_limit
5750 ASSERT_OK(dbfull()->SetOptions(
5751 handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
5752 {"hard_pending_compaction_bytes_limit", "300"}}));
5753 ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
5754 &mutable_cf_options));
5755 ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
5756 ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
5757 // Test report_bg_io_stats
5758 ASSERT_OK(
5759 dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
5760 // sanity check
5761 ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
5762 &mutable_cf_options));
5763 ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
5764 // Test compression
5765 // sanity check
5766 ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
5767 ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
5768 &mutable_cf_options));
5769 ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
5770
5771 if (Snappy_Supported()) {
5772 ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
5773 ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
5774 &mutable_cf_options));
5775 ASSERT_EQ(CompressionType::kSnappyCompression,
5776 mutable_cf_options.compression);
5777 }
5778
5779 // paranoid_file_checks is already tested in db_block_cache_test
5780 ASSERT_OK(
5781 dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}}));
5782 ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
5783 &mutable_cf_options));
5784 ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
5785 ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order);
5786
5787 ASSERT_OK(dbfull()->SetOptions(
5788 handles_[1], {{"check_flush_compaction_key_order", "false"}}));
5789 ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
5790 &mutable_cf_options));
5791 ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order);
5792 }
5793 #endif // ROCKSDB_LITE
5794
5795 TEST_F(DBTest, L0L1L2AndUpHitCounter) {
5796 const int kNumLevels = 3;
5797 const int kNumKeysPerLevel = 10000;
5798 const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel;
5799
5800 Options options = CurrentOptions();
5801 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
5802 Reopen(options);
5803
5804 // After the below loop there will be one file on each of L0, L1, and L2.
5805 int key = 0;
5806 for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) {
5807 for (int i = 0; i < kNumKeysPerLevel; ++i) {
5808 ASSERT_OK(Put(Key(key), "val"));
5809 key++;
5810 }
5811 ASSERT_OK(Flush());
5812 for (int input_level = 0; input_level < output_level; ++input_level) {
5813 // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to
5814 // `input_level + 1`.
5815 ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr));
5816 }
5817 }
5818 assert(key == kNumKeysPerDb);
5819
5820 ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
5821 ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
5822 ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
5823
5824 for (int i = 0; i < kNumKeysPerDb; i++) {
5825 ASSERT_EQ(Get(Key(i)), "val");
5826 }
5827
5828 ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0));
5829 ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1));
5830 ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
5831
5832 ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) +
5833 TestGetTickerCount(options, GET_HIT_L1) +
5834 TestGetTickerCount(options, GET_HIT_L2_AND_UP));
5835 }
5836
5837 TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
5838 // iter 0 -- zlib
5839 // iter 1 -- bzip2
5840 // iter 2 -- lz4
5841 // iter 3 -- lz4HC
5842 // iter 4 -- xpress
5843 CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
5844 kLZ4Compression, kLZ4HCCompression,
5845 kXpressCompression};
5846 for (auto comp : compressions) {
5847 if (!CompressionTypeSupported(comp)) {
5848 continue;
5849 }
5850 // first_table_version 1 -- generate with table_version == 1, read with
5851 // table_version == 2
5852 // first_table_version 2 -- generate with table_version == 2, read with
5853 // table_version == 1
5854 for (int first_table_version = 1; first_table_version <= 2;
5855 ++first_table_version) {
5856 BlockBasedTableOptions table_options;
5857 table_options.format_version = first_table_version;
5858 table_options.filter_policy.reset(NewBloomFilterPolicy(10));
5859 Options options = CurrentOptions();
5860 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
5861 options.create_if_missing = true;
5862 options.compression = comp;
5863 DestroyAndReopen(options);
5864
5865 int kNumKeysWritten = 1000;
5866
5867 Random rnd(301);
5868 for (int i = 0; i < kNumKeysWritten; ++i) {
5869 // compressible string
5870 ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
5871 }
5872
5873 table_options.format_version = first_table_version == 1 ? 2 : 1;
5874 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
5875 Reopen(options);
5876 for (int i = 0; i < kNumKeysWritten; ++i) {
5877 auto r = Get(Key(i));
5878 ASSERT_EQ(r.substr(128), std::string(128, 'a'));
5879 }
5880 }
5881 }
5882 }
5883
5884 TEST_F(DBTest, CloseSpeedup) {
5885 Options options = CurrentOptions();
5886 options.compaction_style = kCompactionStyleLevel;
5887 options.write_buffer_size = 110 << 10; // 110KB
5888 options.arena_block_size = 4 << 10;
5889 options.level0_file_num_compaction_trigger = 2;
5890 options.num_levels = 4;
5891 options.max_bytes_for_level_base = 400 * 1024;
5892 options.max_write_buffer_number = 16;
5893
5894 // Block background threads
5895 env_->SetBackgroundThreads(1, Env::LOW);
5896 env_->SetBackgroundThreads(1, Env::HIGH);
5897 test::SleepingBackgroundTask sleeping_task_low;
5898 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
5899 Env::Priority::LOW);
5900 test::SleepingBackgroundTask sleeping_task_high;
5901 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
5902 &sleeping_task_high, Env::Priority::HIGH);
5903
5904 std::vector<std::string> filenames;
5905 ASSERT_OK(env_->GetChildren(dbname_, &filenames));
5906 // On Windows, the LOCK file cannot be deleted because it is locked by
5907 // db_test. After closing db_test, the LOCK file is unlocked and can be
5908 // deleted. Delete archival files.
5909 bool deleteDir = true;
5910 for (size_t i = 0; i < filenames.size(); ++i) {
5911 Status s = env_->DeleteFile(dbname_ + "/" + filenames[i]);
5912 if (!s.ok()) {
5913 deleteDir = false;
5914 }
5915 }
5916 if (deleteDir) {
5917 ASSERT_OK(env_->DeleteDir(dbname_));
5918 }
5919 DestroyAndReopen(options);
5920
5921 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5922 env_->SetBackgroundThreads(1, Env::LOW);
5923 env_->SetBackgroundThreads(1, Env::HIGH);
5924 Random rnd(301);
5925 int key_idx = 0;
5926
5927 // First three 110KB files are not going to level 2
5928 // After that, (100K, 200K)
5929 for (int num = 0; num < 5; num++) {
5930 GenerateNewFile(&rnd, &key_idx, true);
5931 }
5932
5933 ASSERT_EQ(0, GetSstFileCount(dbname_));
5934
5935 Close();
5936 ASSERT_EQ(0, GetSstFileCount(dbname_));
5937
5938 // Unblock background threads
5939 sleeping_task_high.WakeUp();
5940 sleeping_task_high.WaitUntilDone();
5941 sleeping_task_low.WakeUp();
5942 sleeping_task_low.WaitUntilDone();
5943
5944 Destroy(options);
5945 }
5946
5947 class DelayedMergeOperator : public MergeOperator {
5948 private:
5949 DBTest* db_test_;
5950
5951 public:
5952 explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
5953
5954 bool FullMergeV2(const MergeOperationInput& merge_in,
5955 MergeOperationOutput* merge_out) const override {
5956 db_test_->env_->MockSleepForMicroseconds(1000 *
5957 merge_in.operand_list.size());
5958 merge_out->new_value = "";
5959 return true;
5960 }
5961
5962 const char* Name() const override { return "DelayedMergeOperator"; }
5963 };
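
// DelayedMergeOperator charges 1000 simulated microseconds per merge operand
// via the mock clock, so the two tests below can assert exact values for
// MERGE_OPERATION_TOTAL_TIME instead of sampling a real timer.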
5964
5965 TEST_F(DBTest, MergeTestTime) {
5966 std::string one, two, three;
5967 PutFixed64(&one, 1);
5968 PutFixed64(&two, 2);
5969 PutFixed64(&three, 3);
5970
5971 // Enable time profiling
5972 SetPerfLevel(kEnableTime);
5973 Options options = CurrentOptions();
5974 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
5975 options.merge_operator.reset(new DelayedMergeOperator(this));
5976 SetTimeElapseOnlySleepOnReopen(&options);
5977 DestroyAndReopen(options);
5978
5979 // NOTE: Presumed unnecessary and removed: resetting mock time in env
5980
5981 ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
5982 ASSERT_OK(db_->Put(WriteOptions(), "foo", one));
5983 ASSERT_OK(Flush());
5984 ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
5985 ASSERT_OK(Flush());
5986 ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
5987 ASSERT_OK(Flush());
5988
5989 ReadOptions opt;
5990 opt.verify_checksums = true;
5991 opt.snapshot = nullptr;
5992 std::string result;
5993 ASSERT_OK(db_->Get(opt, "foo", &result));
5994
5995 ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
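// Arithmetic behind the value above: Get() merged the two operands at
// 1000 simulated microseconds each, and the ticker appears to be recorded
// in nanoseconds: 2 * 1000us = 2,000,000ns. The iteration below repeats
// the merge, doubling the cumulative total to 4,000,000.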
5996
5997 ReadOptions read_options;
5998 std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
5999 int count = 0;
6000 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
6001 ASSERT_OK(iter->status());
6002 ++count;
6003 }
6004
6005 ASSERT_EQ(1, count);
6006 ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
6007 #ifdef ROCKSDB_USING_THREAD_STATUS
6008 ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
6009 #endif // ROCKSDB_USING_THREAD_STATUS
6010 }
6011
6012 #ifndef ROCKSDB_LITE
6013 TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
6014 SetPerfLevel(kEnableTime);
6015 Options options = CurrentOptions();
6016 options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
6017 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
6018 options.merge_operator.reset(new DelayedMergeOperator(this));
6019 options.disable_auto_compactions = true;
6020 options.max_subcompactions = max_subcompactions_;
6021 SetTimeElapseOnlySleepOnReopen(&options);
6022 DestroyAndReopen(options);
6023
6024 constexpr unsigned n = 1000;
6025 for (unsigned i = 0; i < n; i++) {
6026 ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
6027 ASSERT_OK(Flush());
6028 }
6029 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
6030
6031 CompactRangeOptions cro;
6032 cro.exclusive_manual_compaction = exclusive_manual_compaction_;
6033 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
6034
6035 ASSERT_EQ(uint64_t{n} * 1000000U,
6036 TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
6037 }
6038
6039 TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
6040 Options options = CurrentOptions();
6041 options.compaction_filter_factory =
6042 std::make_shared<DelayFilterFactory>(this);
6043 options.disable_auto_compactions = true;
6044 options.create_if_missing = true;
6045 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
6046 options.statistics->set_stats_level(kExceptTimeForMutex);
6047 options.max_subcompactions = max_subcompactions_;
6048 SetTimeElapseOnlySleepOnReopen(&options);
6049 DestroyAndReopen(options);
6050
6051 unsigned n = 0;
6052 // put some data
6053 for (int table = 0; table < 4; ++table) {
6054 for (int i = 0; i < 10 + table; ++i) {
6055 ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
6056 ++n;
6057 }
6058 ASSERT_OK(Flush());
6059 }
6060
6061 CompactRangeOptions cro;
6062 cro.exclusive_manual_compaction = exclusive_manual_compaction_;
6063 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
6064 ASSERT_EQ(0U, CountLiveFiles());
6065
6066 Reopen(options);
6067
6068 Iterator* itr = db_->NewIterator(ReadOptions());
6069 itr->SeekToFirst();
6070 ASSERT_OK(itr->status());
6071 ASSERT_EQ(uint64_t{n} * 1000000U,
6072 TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME));
6073 delete itr;
6074 }
6075 #endif // ROCKSDB_LITE
6076
6077 TEST_F(DBTest, TestLogCleanup) {
6078 Options options = CurrentOptions();
6079 options.write_buffer_size = 64 * 1024; // very small
6080 // only two memtables allowed ==> only two log files
6081 options.max_write_buffer_number = 2;
6082 Reopen(options);
6083
6084 for (int i = 0; i < 100000; ++i) {
6085 ASSERT_OK(Put(Key(i), "val"));
6086 // Only 2 memtables will be alive at a time, so logs_to_free should always
6087 // stay at 2 or below (hence the < 3 assertion)
6088 ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
6089 }
6090 }
6091
6092 #ifndef ROCKSDB_LITE
6093 TEST_F(DBTest, EmptyCompactedDB) {
6094 Options options = CurrentOptions();
6095 options.max_open_files = -1;
6096 Close();
6097 ASSERT_OK(ReadOnlyReopen(options));
6098 Status s = Put("new", "value");
6099 ASSERT_TRUE(s.IsNotSupported());
6100 Close();
6101 }
6102 #endif // ROCKSDB_LITE
6103
6104 #ifndef ROCKSDB_LITE
6105 TEST_F(DBTest, SuggestCompactRangeTest) {
6106 class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
6107 public:
6108 std::unique_ptr<CompactionFilter> CreateCompactionFilter(
6109 const CompactionFilter::Context& context) override {
6110 saved_context = context;
6111 std::unique_ptr<CompactionFilter> empty_filter;
6112 return empty_filter;
6113 }
6114 const char* Name() const override {
6115 return "CompactionFilterFactoryGetContext";
6116 }
6117 static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
6118 return reinterpret_cast<CompactionFilterFactoryGetContext*>(
6119 compaction_filter_factory)
6120 ->saved_context.is_manual_compaction;
6121 }
6122 CompactionFilter::Context saved_context;
6123 };
6124
6125 Options options = CurrentOptions();
6126 options.memtable_factory.reset(test::NewSpecialSkipListFactory(
6127 DBTestBase::kNumKeysByGenerateNewRandomFile));
6128 options.compaction_style = kCompactionStyleLevel;
6129 options.compaction_filter_factory.reset(
6130 new CompactionFilterFactoryGetContext());
6131 options.write_buffer_size = 200 << 10;
6132 options.arena_block_size = 4 << 10;
6133 options.level0_file_num_compaction_trigger = 4;
6134 options.num_levels = 4;
6135 options.compression = kNoCompression;
6136 options.max_bytes_for_level_base = 450 << 10;
6137 options.target_file_size_base = 98 << 10;
6138 options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
6139
6140 Reopen(options);
6141
6142 Random rnd(301);
6143
6144 for (int num = 0; num < 10; num++) {
6145 GenerateNewRandomFile(&rnd);
6146 }
6147
6148 ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
6149 options.compaction_filter_factory.get()));
6150
6151 // Make sure either L0 or L1 has a file
6152 while (NumTableFilesAtLevel(0) == 0 && NumTableFilesAtLevel(1) == 0) {
6153 GenerateNewRandomFile(&rnd);
6154 }
6155
6156 // compact it three times
6157 for (int i = 0; i < 3; ++i) {
6158 ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
6159 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6160 }
6161
6162 // All files are compacted
6163 ASSERT_EQ(0, NumTableFilesAtLevel(0));
6164 ASSERT_EQ(0, NumTableFilesAtLevel(1));
6165
6166 GenerateNewRandomFile(&rnd);
6167 ASSERT_EQ(1, NumTableFilesAtLevel(0));
6168
6169 // nonoverlapping with the file on level 0
6170 Slice start("a"), end("b");
6171 ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
6172 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6173
6174 // should not compact the level 0 file
6175 ASSERT_EQ(1, NumTableFilesAtLevel(0));
6176
6177 start = Slice("j");
6178 end = Slice("m");
6179 ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
6180 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6181 // SuggestCompactRange() is not going to be reported as manual compaction
6182 ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
6183 options.compaction_filter_factory.get()));
6184
6185 // Now it should compact the level 0 file. Since that is a trivial move to
6186 // L1, it triggers another compaction down to L2.
6187 ASSERT_EQ(0, NumTableFilesAtLevel(0));
6188 ASSERT_EQ(0, NumTableFilesAtLevel(1));
6189 }
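
// SuggestCompactRange() only marks the overlapping files as candidates for
// compaction; the actual work is picked up by the background compaction
// scheduler, which is presumably why it is not reported as a manual
// compaction to the compaction filter above.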
6190
6191 TEST_F(DBTest, SuggestCompactRangeUniversal) {
6192 Options options = CurrentOptions();
6193 options.memtable_factory.reset(test::NewSpecialSkipListFactory(
6194 DBTestBase::kNumKeysByGenerateNewRandomFile));
6195 options.compaction_style = kCompactionStyleUniversal;
6196 options.write_buffer_size = 200 << 10;
6197 options.arena_block_size = 4 << 10;
6198 options.level0_file_num_compaction_trigger = 4;
6199 options.num_levels = 4;
6200 options.compression = kNoCompression;
6201 options.max_bytes_for_level_base = 450 << 10;
6202 options.target_file_size_base = 98 << 10;
6203 options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
6204
6205 Reopen(options);
6206
6207 Random rnd(301);
6208
6209 for (int num = 0; num < 10; num++) {
6210 GenerateNewRandomFile(&rnd);
6211 }
6212
6213 ASSERT_EQ("1,2,3,4", FilesPerLevel());
6214 for (int i = 0; i < 3; i++) {
6215 ASSERT_OK(
6216 db_->SuggestCompactRange(db_->DefaultColumnFamily(), nullptr, nullptr));
6217 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6218 }
6219
6220 // All files are compacted
6221 ASSERT_EQ(0, NumTableFilesAtLevel(0));
6222 ASSERT_EQ(0, NumTableFilesAtLevel(1));
6223 ASSERT_EQ(0, NumTableFilesAtLevel(2));
6224
6225 GenerateNewRandomFile(&rnd);
6226 ASSERT_EQ(1, NumTableFilesAtLevel(0));
6227
6228 // nonoverlapping with the file on level 0
6229 Slice start("a"), end("b");
6230 ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
6231 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6232
6233 // should not compact the level 0 file
6234 ASSERT_EQ(1, NumTableFilesAtLevel(0));
6235
6236 start = Slice("j");
6237 end = Slice("m");
6238 ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
6239 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6240
6241 // now it should compact the level 0 file to the last level
6242 ASSERT_EQ(0, NumTableFilesAtLevel(0));
6243 ASSERT_EQ(0, NumTableFilesAtLevel(1));
6244 }
6245
6246 TEST_F(DBTest, PromoteL0) {
6247 Options options = CurrentOptions();
6248 options.disable_auto_compactions = true;
6249 options.write_buffer_size = 10 * 1024 * 1024;
6250 DestroyAndReopen(options);
6251
6252 // non overlapping ranges
6253 std::vector<std::pair<int32_t, int32_t>> ranges = {
6254 {81, 160}, {0, 80}, {161, 240}, {241, 320}};
6255
6256 int32_t value_size = 10 * 1024; // 10 KB
6257
6258 Random rnd(301);
6259 std::map<int32_t, std::string> values;
6260 for (const auto& range : ranges) {
6261 for (int32_t j = range.first; j < range.second; j++) {
6262 values[j] = rnd.RandomString(value_size);
6263 ASSERT_OK(Put(Key(j), values[j]));
6264 }
6265 ASSERT_OK(Flush());
6266 }
6267
6268 int32_t level0_files = NumTableFilesAtLevel(0, 0);
6269 ASSERT_EQ(level0_files, ranges.size());
6270 ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
6271
6272 // Promote L0 level to L2.
6273 ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
6274 // We expect that all the files were trivially moved from L0 to L2
6275 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
6276 ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
6277
6278 for (const auto& kv : values) {
6279 ASSERT_EQ(Get(Key(kv.first)), kv.second);
6280 }
6281 }
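
// PromoteL0 is an experimental API that trivially moves all L0 files to the
// requested target level. As PromoteL0Failure below demonstrates, it refuses
// to run when L0 files overlap one another or when the lower levels involved
// are non-empty.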
6282
6283 TEST_F(DBTest, PromoteL0Failure) {
6284 Options options = CurrentOptions();
6285 options.disable_auto_compactions = true;
6286 options.write_buffer_size = 10 * 1024 * 1024;
6287 DestroyAndReopen(options);
6288
6289 // Produce two L0 files with overlapping ranges.
6290 ASSERT_OK(Put(Key(0), ""));
6291 ASSERT_OK(Put(Key(3), ""));
6292 ASSERT_OK(Flush());
6293 ASSERT_OK(Put(Key(1), ""));
6294 ASSERT_OK(Flush());
6295
6296 Status status;
6297 // Fails because L0 has overlapping files.
6298 status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
6299 ASSERT_TRUE(status.IsInvalidArgument());
6300
6301 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
6302 // Now there is a file in L1.
6303 ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
6304
6305 ASSERT_OK(Put(Key(5), ""));
6306 ASSERT_OK(Flush());
6307 // Fails because L1 is non-empty.
6308 status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
6309 ASSERT_TRUE(status.IsInvalidArgument());
6310 }
6311
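// Editorial sketch (assumes an open DB and a valid handle): PromoteL0()
// trivially moves every L0 file to target_level. As the two tests above show,
// it returns InvalidArgument when L0 files overlap each other or when the
// levels being promoted into are not empty.
[[maybe_unused]] static Status PromoteL0Sketch(DB* db, ColumnFamilyHandle* cf) {
  return experimental::PromoteL0(db, cf, /*target_level=*/2);
}
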
6312 // Github issue #596
6313 TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
6314 const int kNumLevels = 2;
6315 const int kNumL0Files = 2;
6316 Options options = CurrentOptions();
6317 options.disable_auto_compactions = true;
6318 options.num_levels = kNumLevels;
6319 DestroyAndReopen(options);
6320
6321 Random rnd(301);
6322 for (int i = 0; i < kNumL0Files; ++i) {
6323 ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
6324 ASSERT_OK(Flush());
6325 }
6326 ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
6327 ASSERT_EQ(NumTableFilesAtLevel(1), 0);
6328
6329 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
6330 ASSERT_EQ(NumTableFilesAtLevel(0), 0);
6331 ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
6332 }
6333 #endif // ROCKSDB_LITE
6334
6335 TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
6336 const int kNumL0Files = 50;
6337 Options options = CurrentOptions();
6338 options.level0_file_num_compaction_trigger = 4;
6339 // never slow down / stop
6340 options.level0_slowdown_writes_trigger = 999999;
6341 options.level0_stop_writes_trigger = 999999;
6342 options.max_background_compactions = 10;
6343 DestroyAndReopen(options);
6344
6345 // Schedule automatic compactions after the manual one starts, but before
6346 // it finishes, to ensure a conflict.
6347 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
6348 {{"DBImpl::BackgroundCompaction:Start",
6349 "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
6350 {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
6351 "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
6352 std::atomic<int> callback_count(0);
6353 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
6354 "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
6355 [&](void* /*arg*/) { callback_count.fetch_add(1); });
6356 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
6357
6358 Random rnd(301);
6359 for (int i = 0; i < 2; ++i) {
6360 // put two keys to ensure no trivial move
6361 for (int j = 0; j < 2; ++j) {
6362 ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
6363 }
6364 ASSERT_OK(Flush());
6365 }
6366 port::Thread manual_compaction_thread([this]() {
6367 CompactRangeOptions croptions;
6368 croptions.exclusive_manual_compaction = true;
6369 ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
6370 });
6371
6372 TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
6373 for (int i = 0; i < kNumL0Files; ++i) {
6374 // put two keys to ensure no trivial move
6375 for (int j = 0; j < 2; ++j) {
6376 ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
6377 }
6378 ASSERT_OK(Flush());
6379 }
6380 TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
6381
6382 ASSERT_GE(callback_count.load(), 1);
6383 for (int i = 0; i < 2; ++i) {
6384 ASSERT_NE("NOT_FOUND", Get(Key(i)));
6385 }
6386 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
6387 manual_compaction_thread.join();
6388 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6389 }
6390
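// Editorial sketch: the knob under test above. With exclusive_manual_compaction
// set, CompactRange() conflicts with automatic compactions scheduled while it
// runs; each rejected schedule fires the Conflict callback counted above.
[[maybe_unused]] static Status ExclusiveManualCompactionSketch(DB* db) {
  CompactRangeOptions cro;
  cro.exclusive_manual_compaction = true;
  return db->CompactRange(cro, nullptr, nullptr);
}
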
6391 #ifndef ROCKSDB_LITE
6392 TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
6393 Options options = CurrentOptions();
6394 options.max_background_compactions = 1;
6395 options.level0_file_num_compaction_trigger = 4;
6396 options.level0_slowdown_writes_trigger = 36;
6397 options.level0_stop_writes_trigger = 36;
6398 DestroyAndReopen(options);
6399
6400 // generate files for manual compaction
6401 Random rnd(301);
6402 for (int i = 0; i < 2; ++i) {
6403 // put two keys to ensure no trivial move
6404 for (int j = 0; j < 2; ++j) {
6405 ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
6406 }
6407 ASSERT_OK(Flush());
6408 }
6409
6410 ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
6411 db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
6412
6413 std::vector<std::string> input_files;
6414 input_files.push_back(cf_meta_data.levels[0].files[0].name);
6415
6416 SyncPoint::GetInstance()->LoadDependency({
6417 {"CompactFilesImpl:0",
6418 "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
6419 {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
6420 "CompactFilesImpl:1"},
6421 });
6422
6423 SyncPoint::GetInstance()->EnableProcessing();
6424
6425 port::Thread manual_compaction_thread([&]() {
6426 auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(),
6427 input_files, 0);
6428 ASSERT_OK(s);
6429 });
6430
6431 TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
6432 // generate enough files to trigger compaction
6433 for (int i = 0; i < 20; ++i) {
6434 for (int j = 0; j < 2; ++j) {
6435 ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
6436 }
6437 ASSERT_OK(Flush());
6438 }
6439 db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
6440 ASSERT_GT(cf_meta_data.levels[0].files.size(),
6441 options.level0_file_num_compaction_trigger);
6442 TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:End");
6443
6444 manual_compaction_thread.join();
6445 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6446
6447 db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
6448 ASSERT_LE(cf_meta_data.levels[0].files.size(),
6449 options.level0_file_num_compaction_trigger);
6450 }
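
// Editorial sketch (hypothetical helper, mirroring the test above): pick a
// concrete L0 file from fresh metadata and rewrite it in place at level 0.
[[maybe_unused]] static Status CompactFirstL0FileSketch(DB* db) {
  ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(db->DefaultColumnFamily(), &meta);
  if (meta.levels.empty() || meta.levels[0].files.empty()) {
    return Status::InvalidArgument("no L0 files");
  }
  std::vector<std::string> inputs = {meta.levels[0].files[0].name};
  // If L0 is still over the compaction trigger afterwards, an automatic
  // compaction should be scheduled -- the property the test verifies.
  return db->CompactFiles(CompactionOptions(), db->DefaultColumnFamily(),
                          inputs, /*output_level=*/0);
}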
6451 #endif // ROCKSDB_LITE
6452
6453 // Github issue #595
6454 // Large write batch with column families
6455 TEST_F(DBTest, LargeBatchWithColumnFamilies) {
6456 Options options = CurrentOptions();
6457 options.env = env_;
6458 options.write_buffer_size = 100000; // Small write buffer
6459 CreateAndReopenWithCF({"pikachu"}, options);
6460 int64_t j = 0;
6461 for (int i = 0; i < 5; i++) {
6462 for (int pass = 1; pass <= 3; pass++) {
6463 WriteBatch batch;
6464 size_t write_size = 1024 * 1024 * (5 + i);
6465 fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
6466 (write_size / 1024 / 1024), pass);
6467 for (;;) {
6468 std::string data(3000, j++ % 127 + 20);
6469 data += std::to_string(j);
6470 ASSERT_OK(batch.Put(handles_[0], Slice(data), Slice(data)));
6471 if (batch.GetDataSize() > write_size) {
6472 break;
6473 }
6474 }
6475 fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
6476 (batch.GetDataSize() / 1024 / 1024));
6477 ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
6478 fprintf(stderr, "done\n");
6479 }
6480 }
6481 // make sure we can re-open it.
6482 ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
6483 }
6484
6485 // Make sure that Flushes can proceed in parallel with CompactRange()
6486 TEST_F(DBTest, FlushesInParallelWithCompactRange) {
6487 // iter == 0 -- leveled
6488 // iter == 1 -- leveled, but throw in a flush between two levels compacting
6489 // iter == 2 -- universal
6490 for (int iter = 0; iter < 3; ++iter) {
6491 Options options = CurrentOptions();
6492 if (iter < 2) {
6493 options.compaction_style = kCompactionStyleLevel;
6494 } else {
6495 options.compaction_style = kCompactionStyleUniversal;
6496 }
6497 options.write_buffer_size = 110 << 10;
6498 options.level0_file_num_compaction_trigger = 4;
6499 options.num_levels = 4;
6500 options.compression = kNoCompression;
6501 options.max_bytes_for_level_base = 450 << 10;
6502 options.target_file_size_base = 98 << 10;
6503 options.max_write_buffer_number = 2;
6504
6505 DestroyAndReopen(options);
6506
6507 Random rnd(301);
6508 for (int num = 0; num < 14; num++) {
6509 GenerateNewRandomFile(&rnd);
6510 }
6511
6512 if (iter == 1) {
6513 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
6514 {{"DBImpl::RunManualCompaction()::1",
6515 "DBTest::FlushesInParallelWithCompactRange:1"},
6516 {"DBTest::FlushesInParallelWithCompactRange:2",
6517 "DBImpl::RunManualCompaction()::2"}});
6518 } else {
6519 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
6520 {{"CompactionJob::Run():Start",
6521 "DBTest::FlushesInParallelWithCompactRange:1"},
6522 {"DBTest::FlushesInParallelWithCompactRange:2",
6523 "CompactionJob::Run():End"}});
6524 }
6525 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
6526
6527 std::vector<port::Thread> threads;
6528 threads.emplace_back([&]() { Compact("a", "z"); });
6529
6530 TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
6531
6532 // This has to start a flush. If flushes are blocked, this will try to
6533 // create 3 memtables, and that will fail because
6534 // max_write_buffer_number is 2.
6535 for (int num = 0; num < 3; num++) {
6536 GenerateNewRandomFile(&rnd, /* nowait */ true);
6537 }
6538
6539 TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
6540
6541 for (auto& t : threads) {
6542 t.join();
6543 }
6544 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
6545 }
6546 }
6547
6548 TEST_F(DBTest, DelayedWriteRate) {
6549 const int kEntriesPerMemTable = 100;
6550 const int kTotalFlushes = 12;
6551
6552 Options options = CurrentOptions();
6553 env_->SetBackgroundThreads(1, Env::LOW);
6554 options.env = env_;
6555 options.write_buffer_size = 100000000;
6556 options.max_write_buffer_number = 256;
6557 options.max_background_compactions = 1;
6558 options.level0_file_num_compaction_trigger = 3;
6559 options.level0_slowdown_writes_trigger = 3;
6560 options.level0_stop_writes_trigger = 999999;
6561 options.delayed_write_rate = 20000000; // Start at 20MB/s
6562 options.memtable_factory.reset(
6563 test::NewSpecialSkipListFactory(kEntriesPerMemTable));
6564
6565 SetTimeElapseOnlySleepOnReopen(&options);
6566 CreateAndReopenWithCF({"pikachu"}, options);
6567
6568 // Block compactions
6569 test::SleepingBackgroundTask sleeping_task_low;
6570 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
6571 Env::Priority::LOW);
6572
6573 for (int i = 0; i < 3; i++) {
6574 ASSERT_OK(Put(Key(i), std::string(10000, 'x')));
6575 ASSERT_OK(Flush());
6576 }
6577
6578 // These writes will be slowed down by the (decaying) delayed write rate
6579 uint64_t estimated_sleep_time = 0;
6580 Random rnd(301);
6581 ASSERT_OK(Put("", ""));
6582 uint64_t cur_rate = options.delayed_write_rate;
6583 for (int i = 0; i < kTotalFlushes; i++) {
6584 uint64_t size_memtable = 0;
6585 for (int j = 0; j < kEntriesPerMemTable; j++) {
6586 auto rand_num = rnd.Uniform(20);
6587 // Cube the random number to spread entry sizes over a wider range.
6588 size_t entry_size = rand_num * rand_num * rand_num;
6589 WriteOptions wo;
6590 ASSERT_OK(Put(Key(i), std::string(entry_size, 'x'), wo));
6591 size_memtable += entry_size + 18; // plus a rough per-entry overhead
6592 // Occasionally sleep a while
6593 if (rnd.Uniform(20) == 6) {
6594 env_->SleepForMicroseconds(2666);
6595 }
6596 }
6597 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
6598 estimated_sleep_time += size_memtable * 1000000u / cur_rate;
6599 // The rate is cut twice per flush: once at the memtable switch and once when the flush finishes.
6600 cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
6601 kIncSlowdownRatio * kIncSlowdownRatio);
6602 }
6603 // Check that the total sleep time falls into the expected rough range.
6604 ASSERT_GT(env_->NowMicros(), estimated_sleep_time / 2);
6605 ASSERT_LT(env_->NowMicros(), estimated_sleep_time * 2);
6606
6607 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
6608 sleeping_task_low.WakeUp();
6609 sleeping_task_low.WaitUntilDone();
6610 }
6611
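// Editorial sketch of the decay modeled above: each flush cycle applies
// kIncSlowdownRatio twice (memtable switch, then flush completion), so after
// n flushes the delayed rate is roughly initial_rate * kIncSlowdownRatio^(2n).
[[maybe_unused]] static uint64_t DelayedRateAfterFlushes(uint64_t initial_rate,
                                                         int flushes) {
  double rate = static_cast<double>(initial_rate);
  for (int i = 0; i < flushes; ++i) {
    rate *= kIncSlowdownRatio * kIncSlowdownRatio;
  }
  return static_cast<uint64_t>(rate);
}
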
6612 TEST_F(DBTest, HardLimit) {
6613 Options options = CurrentOptions();
6614 options.env = env_;
6615 env_->SetBackgroundThreads(1, Env::LOW);
6616 options.max_write_buffer_number = 256;
6617 options.write_buffer_size = 110 << 10; // 110KB
6618 options.arena_block_size = 4 * 1024;
6619 options.level0_file_num_compaction_trigger = 4;
6620 options.level0_slowdown_writes_trigger = 999999;
6621 options.level0_stop_writes_trigger = 999999;
6622 options.hard_pending_compaction_bytes_limit = 800 << 10;
6623 options.max_bytes_for_level_base = 10000000000u;
6624 options.max_background_compactions = 1;
6625 options.memtable_factory.reset(
6626 test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
6627
6628 env_->SetBackgroundThreads(1, Env::LOW);
6629 test::SleepingBackgroundTask sleeping_task_low;
6630 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
6631 Env::Priority::LOW);
6632
6633 CreateAndReopenWithCF({"pikachu"}, options);
6634
6635 std::atomic<int> callback_count(0);
6636 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
6637 "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
6638 callback_count.fetch_add(1);
6639 sleeping_task_low.WakeUp();
6640 });
6641 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
6642
6643 Random rnd(301);
6644 int key_idx = 0;
6645 for (int num = 0; num < 5; num++) {
6646 GenerateNewFile(&rnd, &key_idx, true);
6647 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
6648 }
6649
6650 ASSERT_EQ(0, callback_count.load());
6651
6652 for (int num = 0; num < 5; num++) {
6653 GenerateNewFile(&rnd, &key_idx, true);
6654 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
6655 }
6656 ASSERT_GE(callback_count.load(), 1);
6657
6658 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
6659 sleeping_task_low.WaitUntilDone();
6660 }
6661
6662 #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
6663 class WriteStallListener : public EventListener {
6664 public:
6665 WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
6666 void OnStallConditionsChanged(const WriteStallInfo& info) override {
6667 MutexLock l(&mutex_);
6668 condition_ = info.condition.cur;
6669 }
6670 bool CheckCondition(WriteStallCondition expected) {
6671 MutexLock l(&mutex_);
6672 return expected == condition_;
6673 }
6674
6675 private:
6676 port::Mutex mutex_;
6677 WriteStallCondition condition_;
6678 };
6679
6680 TEST_F(DBTest, SoftLimit) {
6681 Options options = CurrentOptions();
6682 options.env = env_;
6683 options.write_buffer_size = 100000; // Small write buffer
6684 options.max_write_buffer_number = 256;
6685 options.level0_file_num_compaction_trigger = 1;
6686 options.level0_slowdown_writes_trigger = 3;
6687 options.level0_stop_writes_trigger = 999999;
6688 options.delayed_write_rate = 20000; // About 20KB/s limited rate
6689 options.soft_pending_compaction_bytes_limit = 160000;
6690 options.target_file_size_base = 99999999; // All into one file
6691 options.max_bytes_for_level_base = 50000;
6692 options.max_bytes_for_level_multiplier = 10;
6693 options.max_background_compactions = 1;
6694 options.compression = kNoCompression;
6695 WriteStallListener* listener = new WriteStallListener();
6696 options.listeners.emplace_back(listener);
6697
6698 // FlushMemtable with opt.wait=true does not wait for
6699 // `OnStallConditionsChanged` to be called. The event listener is triggered
6700 // on `JobContext::Clean`, which happens after the flush result is
6701 // installed. We use a sync point to build a custom WaitForFlush that
6702 // waits for the context cleanup.
6703 port::Mutex flush_mutex;
6704 port::CondVar flush_cv(&flush_mutex);
6705 bool flush_finished = false;
6706 auto InstallFlushCallback = [&]() {
6707 {
6708 MutexLock l(&flush_mutex);
6709 flush_finished = false;
6710 }
6711 SyncPoint::GetInstance()->SetCallBack(
6712 "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
6713 {
6714 MutexLock l(&flush_mutex);
6715 flush_finished = true;
6716 }
6717 flush_cv.SignalAll();
6718 });
6719 };
6720 auto WaitForFlush = [&]() {
6721 {
6722 MutexLock l(&flush_mutex);
6723 while (!flush_finished) {
6724 flush_cv.Wait();
6725 }
6726 }
6727 SyncPoint::GetInstance()->ClearCallBack(
6728 "DBImpl::BackgroundCallFlush:ContextCleanedUp");
6729 };
6730
6731 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
6732
6733 Reopen(options);
6734
6735 // Generating 360KB in Level 3
6736 for (int i = 0; i < 72; i++) {
6737 ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
6738 if (i % 10 == 0) {
6739 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
6740 }
6741 }
6742 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6743 MoveFilesToLevel(3);
6744
6745 // Generating 360KB in Level 2
6746 for (int i = 0; i < 72; i++) {
6747 ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
6748 if (i % 10 == 0) {
6749 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
6750 }
6751 }
6752 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6753 MoveFilesToLevel(2);
6754
6755 ASSERT_OK(Put(Key(0), ""));
6756
6757 test::SleepingBackgroundTask sleeping_task_low;
6758 // Block compactions
6759 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
6760 Env::Priority::LOW);
6761 sleeping_task_low.WaitUntilSleeping();
6762
6763 // Create 3 L0 files, bringing the L0 score to 3.
6764 for (int i = 0; i < 3; i++) {
6765 ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
6766 ASSERT_OK(Put(Key(100 - i), std::string(5000, 'x')));
6767 // Flush the file. File size is around 30KB.
6768 InstallFlushCallback();
6769 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
6770 WaitForFlush();
6771 }
6772 ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
6773 ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
6774
6775 sleeping_task_low.WakeUp();
6776 sleeping_task_low.WaitUntilDone();
6777 sleeping_task_low.Reset();
6778 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6779
6780 // Now there is one L1 file, but it doesn't trigger soft_rate_limit.
6781 //
6782 // TODO: soft_rate_limit is deprecated. If this test
6783 // relies on soft_rate_limit, then we need to change the test.
6784 //
6785 // The L1 file size is around 30KB.
6786 ASSERT_EQ(NumTableFilesAtLevel(1), 1);
6787 ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
6788 ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
6789
6790 // Only allow one compaction to go through.
6791 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
6792 "BackgroundCallCompaction:0", [&](void* /*arg*/) {
6793 // Schedule a sleeping task.
6794 sleeping_task_low.Reset();
6795 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
6796 &sleeping_task_low, Env::Priority::LOW);
6797 });
6798
6799 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
6800 Env::Priority::LOW);
6801 sleeping_task_low.WaitUntilSleeping();
6802 // Create 3 L0 files, bringing the L0 score to 3.
6803 for (int i = 0; i < 3; i++) {
6804 ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x')));
6805 ASSERT_OK(Put(Key(90 - i), std::string(5000, 'x')));
6806 // Flush the file. File size is around 30KB.
6807 InstallFlushCallback();
6808 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
6809 WaitForFlush();
6810 }
6811
6812 // Wake up the sleeping task so compaction can run, then wait for it to
6813 // go back to sleep to make sure one compaction
6814 // goes through.
6815 sleeping_task_low.WakeUp();
6816 sleeping_task_low.WaitUntilSleeping();
6817
6818 // Now there is one L1 file (around 60KB), exceeding the 50KB base by 10KB.
6819 // Given the level multiplier of 10, estimated pending compaction is around
6820 // 100KB, which doesn't trigger soft_pending_compaction_bytes_limit.
6821 ASSERT_EQ(NumTableFilesAtLevel(1), 1);
6822 ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
6823 ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
6824
6825 // Create 3 L0 files, bringing the L0 score to 3 (higher than the L1 score).
6826 for (int i = 0; i < 3; i++) {
6827 ASSERT_OK(Put(Key(20 + i), std::string(5000, 'x')));
6828 ASSERT_OK(Put(Key(80 - i), std::string(5000, 'x')));
6829 // Flush the file. File size is around 30KB.
6830 InstallFlushCallback();
6831 ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
6832 WaitForFlush();
6833 }
6834 // Wake up the sleeping task so compaction can run, then wait for it to
6835 // go back to sleep to make sure one compaction
6836 // goes through.
6837 sleeping_task_low.WakeUp();
6838 sleeping_task_low.WaitUntilSleeping();
6839
6840 // Now there is one L1 file (around 90KB), exceeding the 50KB base by 40KB.
6841 // L2 size is 360KB, so the estimated level fanout is 4 and the estimated
6842 // pending compaction is around 200KB,
6843 // triggering soft_pending_compaction_bytes_limit.
6844 ASSERT_EQ(NumTableFilesAtLevel(1), 1);
6845 ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
6846 ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
6847
6848 sleeping_task_low.WakeUp();
6849 sleeping_task_low.WaitUntilSleeping();
6850
6851 ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
6852 ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
6853
6854 // Shrink the level base so L2 will hit the soft limit more easily.
6855 ASSERT_OK(dbfull()->SetOptions({
6856 {"max_bytes_for_level_base", "5000"},
6857 }));
6858
6859 ASSERT_OK(Put("", ""));
6860 ASSERT_OK(Flush());
6861 ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
6862 ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
6863
6864 sleeping_task_low.WaitUntilSleeping();
6865 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
6866 sleeping_task_low.WakeUp();
6867 sleeping_task_low.WaitUntilDone();
6868 }
6869
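// Editorial note: SoftLimit above exercises both delay sources -- an L0 score
// above level0_slowdown_writes_trigger and estimated pending compaction bytes
// above soft_pending_compaction_bytes_limit -- and checks that the
// WriteStallListener tracks the write controller through each transition.
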
6870 TEST_F(DBTest, LastWriteBufferDelay) {
6871 Options options = CurrentOptions();
6872 options.env = env_;
6873 options.write_buffer_size = 100000;
6874 options.max_write_buffer_number = 4;
6875 options.delayed_write_rate = 20000;
6876 options.compression = kNoCompression;
6877 options.disable_auto_compactions = true;
6878 int kNumKeysPerMemtable = 3;
6879 options.memtable_factory.reset(
6880 test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
6881
6882 Reopen(options);
6883 test::SleepingBackgroundTask sleeping_task;
6884 // Block flushes
6885 env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
6886 Env::Priority::HIGH);
6887 sleeping_task.WaitUntilSleeping();
6888
6889 // Fill three memtables. Flushes are blocked, so no L0 files appear yet.
6890 for (int i = 0; i < 3; i++) {
6891 // Fill one mem table
6892 for (int j = 0; j < kNumKeysPerMemtable; j++) {
6893 ASSERT_OK(Put(Key(j), ""));
6894 }
6895 ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
6896 }
6897 // Inserting a new entry creates a new memtable, triggering the slowdown.
6898 ASSERT_OK(Put(Key(0), ""));
6899 ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
6900
6901 sleeping_task.WakeUp();
6902 sleeping_task.WaitUntilDone();
6903 }
6904 #endif // !defined(ROCKSDB_LITE) &&
6905 // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
6906
6907 TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
6908 CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
6909 kLZ4Compression, kLZ4HCCompression,
6910 kXpressCompression};
6911 for (auto comp : compressions) {
6912 if (!CompressionTypeSupported(comp)) {
6913 // Not supported, so Open() should fail.
6914 Options options = CurrentOptions();
6915 options.compression = comp;
6916 ASSERT_TRUE(!TryReopen(options).ok());
6917 // Check that CreateColumnFamily fails as well.
6918 options.compression = kNoCompression;
6919 ASSERT_OK(TryReopen(options));
6920 ColumnFamilyOptions cf_options(options);
6921 cf_options.compression = comp;
6922 ColumnFamilyHandle* handle;
6923 ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
6924 }
6925 }
6926 }
6927
6928 TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
6929 Options options = CurrentOptions();
6930 options.max_open_files = 100;
6931 Reopen(options);
6932
6933 ColumnFamilyOptions cf_options(options);
6934 // ttl is now supported even when max_open_files is not -1.
6935 cf_options.ttl = 3600;
6936 ColumnFamilyHandle* handle;
6937 ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
6938 delete handle;
6939 }
6940
6941 #ifndef ROCKSDB_LITE
6942 TEST_F(DBTest, RowCache) {
6943 Options options = CurrentOptions();
6944 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
6945 options.row_cache = NewLRUCache(8192);
6946 DestroyAndReopen(options);
6947
6948 ASSERT_OK(Put("foo", "bar"));
6949 ASSERT_OK(Flush());
6950
6951 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
6952 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
6953 ASSERT_EQ(Get("foo"), "bar");
6954 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
6955 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
6956 ASSERT_EQ(Get("foo"), "bar");
6957 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
6958 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
6959 }
6960
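// Editorial sketch (sizes are arbitrary assumptions): the row cache is enabled
// purely through options; hits and misses are then observable via the
// ROW_CACHE_HIT / ROW_CACHE_MISS tickers asserted above.
[[maybe_unused]] static Options RowCacheOptionsSketch() {
  Options opts;
  opts.row_cache = NewLRUCache(8192);  // capacity in bytes
  opts.statistics = CreateDBStatistics();
  return opts;
}
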
6961 TEST_F(DBTest, PinnableSliceAndRowCache) {
6962 Options options = CurrentOptions();
6963 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
6964 options.row_cache = NewLRUCache(8192);
6965 DestroyAndReopen(options);
6966
6967 ASSERT_OK(Put("foo", "bar"));
6968 ASSERT_OK(Flush());
6969
6970 ASSERT_EQ(Get("foo"), "bar");
6971 ASSERT_EQ(
6972 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6973 1);
6974
6975 {
6976 PinnableSlice pin_slice;
6977 ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
6978 ASSERT_EQ(pin_slice.ToString(), "bar");
6979 // The entry is already in the cache; the lookup removes it from the LRU list.
6980 ASSERT_EQ(
6981 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6982 0);
6983 }
6984 // After the PinnableSlice is destroyed, the element is added back to the LRU list.
6985 ASSERT_EQ(
6986 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
6987 1);
6988 }
6989
6990 TEST_F(DBTest, ReusePinnableSlice) {
6991 Options options = CurrentOptions();
6992 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
6993 options.row_cache = NewLRUCache(8192);
6994 DestroyAndReopen(options);
6995
6996 ASSERT_OK(Put("foo", "bar"));
6997 ASSERT_OK(Flush());
6998
6999 ASSERT_EQ(Get("foo"), "bar");
7000 ASSERT_EQ(
7001 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
7002 1);
7003
7004 {
7005 PinnableSlice pin_slice;
7006 ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
7007 ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
7008 ASSERT_EQ(pin_slice.ToString(), "bar");
7009
7010 // The entry is already in the cache; the lookup removes it from the LRU list.
7011 ASSERT_EQ(
7012 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
7013 0);
7014 }
7015 // After the PinnableSlice is destroyed, the element is added back to the LRU list.
7016 ASSERT_EQ(
7017 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
7018 1);
7019
7020 {
7021 std::vector<Slice> multiget_keys;
7022 multiget_keys.push_back("foo");
7023 std::vector<PinnableSlice> multiget_values(1);
7024 std::vector<Status> statuses({Status::NotFound()});
7025 ReadOptions ropt;
7026 dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
7027 multiget_keys.size(), multiget_keys.data(),
7028 multiget_values.data(), statuses.data());
7029 ASSERT_EQ(Status::OK(), statuses[0]);
7030 dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
7031 multiget_keys.size(), multiget_keys.data(),
7032 multiget_values.data(), statuses.data());
7033 ASSERT_EQ(Status::OK(), statuses[0]);
7034
7035 // The entry is already in the cache; the lookup removes it from the LRU list.
7036 ASSERT_EQ(
7037 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
7038 0);
7039 }
7040 // After the PinnableSlice is destroyed, the element is added back to the LRU list.
7041 ASSERT_EQ(
7042 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
7043 1);
7044
7045 {
7046 std::vector<ColumnFamilyHandle*> multiget_cfs;
7047 multiget_cfs.push_back(dbfull()->DefaultColumnFamily());
7048 std::vector<Slice> multiget_keys;
7049 multiget_keys.push_back("foo");
7050 std::vector<PinnableSlice> multiget_values(1);
7051 std::vector<Status> statuses({Status::NotFound()});
7052 ReadOptions ropt;
7053 dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
7054 multiget_keys.data(), multiget_values.data(),
7055 statuses.data());
7056 ASSERT_EQ(Status::OK(), statuses[0]);
7057 dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
7058 multiget_keys.data(), multiget_values.data(),
7059 statuses.data());
7060 ASSERT_EQ(Status::OK(), statuses[0]);
7061
7062 // The entry is already in the cache; the lookup removes it from the LRU list.
7063 ASSERT_EQ(
7064 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
7065 0);
7066 }
7067 // After the PinnableSlice is destroyed, the element is added back to the LRU list.
7068 ASSERT_EQ(
7069 reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
7070 1);
7071 }
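
// Editorial note: a live PinnableSlice holds a reference on the row-cache
// entry, which is why TEST_GetLRUSize() drops to 0 inside each scope above and
// returns to 1 once the slice is destroyed. Reusing one slice across lookups,
// as this test does, works because each lookup releases the previous pin
// before assigning the new value.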
7072
7073 #endif // ROCKSDB_LITE
7074
7075 TEST_F(DBTest, DeletingOldWalAfterDrop) {
7076 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
7077 {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
7078 {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
7079 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
7080
7081 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
7082 Options options = CurrentOptions();
7083 options.max_total_wal_size = 8192;
7084 options.compression = kNoCompression;
7085 options.write_buffer_size = 1 << 20;
7086 options.level0_file_num_compaction_trigger = (1 << 30);
7087 options.level0_slowdown_writes_trigger = (1 << 30);
7088 options.level0_stop_writes_trigger = (1 << 30);
7089 options.disable_auto_compactions = true;
7090 DestroyAndReopen(options);
7091 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
7092
7093 CreateColumnFamilies({"cf1", "cf2"}, options);
7094 ASSERT_OK(Put(0, "key1", DummyString(8192)));
7095 ASSERT_OK(Put(0, "key2", DummyString(8192)));
7096 // The oldest WAL should now be getting flushed.
7097 ASSERT_OK(db_->DropColumnFamily(handles_[0]));
7098 // all flushes should now do nothing because their CF is dropped
7099 TEST_SYNC_POINT("Test:AllowFlushes");
7100 TEST_SYNC_POINT("Test:WaitForFlush");
7101 uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
7102 ASSERT_OK(Put(1, "key3", DummyString(8192)));
7103 ASSERT_OK(Put(1, "key4", DummyString(8192)));
7104 // A new WAL should have been created.
7105 uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
7106 EXPECT_GT(lognum2, lognum1);
7107 }
7108
7109 TEST_F(DBTest, UnsupportedManualSync) {
7110 DestroyAndReopen(CurrentOptions());
7111 env_->is_wal_sync_thread_safe_.store(false);
7112 Status s = db_->SyncWAL();
7113 ASSERT_TRUE(s.IsNotSupported());
7114 }
7115
7116 INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
7117 ::testing::Combine(::testing::Values(1, 4),
7118 ::testing::Bool()));
7119
7120 TEST_F(DBTest, PauseBackgroundWorkTest) {
7121 Options options = CurrentOptions();
7122 options.write_buffer_size = 100000; // Small write buffer
7123 Reopen(options);
7124
7125 std::vector<port::Thread> threads;
7126 std::atomic<bool> done(false);
7127 ASSERT_OK(db_->PauseBackgroundWork());
7128 threads.emplace_back([&]() {
7129 Random rnd(301);
7130 for (int i = 0; i < 10000; ++i) {
7131 ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
7132 }
7133 done.store(true);
7134 });
7135 env_->SleepForMicroseconds(200000);
7136 // make sure the thread is not done
7137 ASSERT_FALSE(done.load());
7138 ASSERT_OK(db_->ContinueBackgroundWork());
7139 for (auto& t : threads) {
7140 t.join();
7141 }
7142 // now it's done
7143 ASSERT_TRUE(done.load());
7144 }
7145
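// Editorial sketch: the pause/resume bracket used above. Writes still land in
// the memtable while background work is paused; only flushes and compactions
// stop, so sustained writes eventually stall on memtable limits -- the effect
// the test gives 200ms to materialize.
[[maybe_unused]] static Status PausedWriteSketch(DB* db) {
  Status s = db->PauseBackgroundWork();
  if (!s.ok()) {
    return s;
  }
  s = db->Put(WriteOptions(), "key", "value");  // fine until memtables fill
  Status resume = db->ContinueBackgroundWork();
  return s.ok() ? resume : s;
}
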
7146 // Keep spawning short-lived threads that create an iterator and quit.
7147 // Meanwhile in another thread keep flushing memtables.
7148 // This used to cause a deadlock.
7149 TEST_F(DBTest, ThreadLocalPtrDeadlock) {
7150 std::atomic<int> flushes_done{0};
7151 std::atomic<int> threads_destroyed{0};
7152 auto done = [&] { return flushes_done.load() > 10; };
7153
7154 port::Thread flushing_thread([&] {
7155 for (int i = 0; !done(); ++i) {
7156 ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
7157 Slice(std::to_string(i).c_str())));
7158 ASSERT_OK(db_->Flush(FlushOptions()));
7159 int cnt = ++flushes_done;
7160 fprintf(stderr, "Flushed %d times\n", cnt);
7161 }
7162 });
7163
7164 std::vector<port::Thread> thread_spawning_threads(10);
7165 for (auto& t : thread_spawning_threads) {
7166 t = port::Thread([&] {
7167 while (!done()) {
7168 {
7169 port::Thread tmp_thread([&] {
7170 auto it = db_->NewIterator(ReadOptions());
7171 ASSERT_OK(it->status());
7172 delete it;
7173 });
7174 tmp_thread.join();
7175 }
7176 ++threads_destroyed;
7177 }
7178 });
7179 }
7180
7181 for (auto& t : thread_spawning_threads) {
7182 t.join();
7183 }
7184 flushing_thread.join();
7185 fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
7186 flushes_done.load(), threads_destroyed.load());
7187 }
7188
7189 TEST_F(DBTest, LargeBlockSizeTest) {
7190 Options options = CurrentOptions();
7191 CreateAndReopenWithCF({"pikachu"}, options);
7192 ASSERT_OK(Put(0, "foo", "bar"));
7193 BlockBasedTableOptions table_options;
7194 table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
7195 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
7196 ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
7197 }
7198
7199 #ifndef ROCKSDB_LITE
7200
7201 TEST_F(DBTest, CreationTimeOfOldestFile) {
7202 const int kNumKeysPerFile = 32;
7203 const int kNumLevelFiles = 2;
7204 const int kValueSize = 100;
7205
7206 Options options = CurrentOptions();
7207 options.max_open_files = -1;
7208 env_->SetMockSleep();
7209 options.env = env_;
7210
7211 // NOTE: Presumed unnecessary and removed: resetting mock time in env
7212
7213 DestroyAndReopen(options);
7214
7215 bool set_file_creation_time_to_zero = true;
7216 int idx = 0;
7217
7218 int64_t time_1 = 0;
7219 env_->GetCurrentTime(&time_1);
7220 const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
7221
7222 // Add 50 hours
7223 env_->MockSleepForSeconds(50 * 60 * 60);
7224
7225 int64_t time_2 = 0;
7226 env_->GetCurrentTime(&time_2);
7227 const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
7228
7229 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
7230 "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
7231 TableProperties* props = reinterpret_cast<TableProperties*>(arg);
7232 if (set_file_creation_time_to_zero) {
7233 if (idx == 0) {
7234 props->file_creation_time = 0;
7235 idx++;
7236 } else if (idx == 1) {
7237 props->file_creation_time = uint_time_1;
7238 idx = 0;
7239 }
7240 } else {
7241 if (idx == 0) {
7242 props->file_creation_time = uint_time_1;
7243 idx++;
7244 } else if (idx == 1) {
7245 props->file_creation_time = uint_time_2;
7246 }
7247 }
7248 });
7249 // Set file creation time in manifest all to 0.
7250 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
7251 "FileMetaData::FileMetaData", [&](void* arg) {
7252 FileMetaData* meta = static_cast<FileMetaData*>(arg);
7253 meta->file_creation_time = 0;
7254 });
7255 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
7256
7257 Random rnd(301);
7258 for (int i = 0; i < kNumLevelFiles; ++i) {
7259 for (int j = 0; j < kNumKeysPerFile; ++j) {
7260 ASSERT_OK(
7261 Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
7262 }
7263 ASSERT_OK(Flush());
7264 }
7265
7266 // At this point there should be 2 files, one with file_creation_time = 0 and
7267 // the other non-zero. GetCreationTimeOfOldestFile API should return 0.
7268 uint64_t creation_time;
7269 Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
7270 ASSERT_EQ(0, creation_time);
7271 ASSERT_EQ(s1, Status::OK());
7272
7273 // Testing with non-zero file creation time.
7274 set_file_creation_time_to_zero = false;
7275 options = CurrentOptions();
7276 options.max_open_files = -1;
7277 options.env = env_;
7278
7279 // NOTE: Presumed unnecessary and removed: resetting mock time in env
7280
7281 DestroyAndReopen(options);
7282
7283 for (int i = 0; i < kNumLevelFiles; ++i) {
7284 for (int j = 0; j < kNumKeysPerFile; ++j) {
7285 ASSERT_OK(
7286 Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
7287 }
7288 ASSERT_OK(Flush());
7289 }
7290
7291 // At this point there should be 2 files with non-zero file creation time.
7292 // GetCreationTimeOfOldestFile API should return non-zero value.
7293 uint64_t ctime;
7294 Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
7295 ASSERT_EQ(uint_time_1, ctime);
7296 ASSERT_EQ(s2, Status::OK());
7297
7298 // Testing with max_open_files != -1
7299 options = CurrentOptions();
7300 options.max_open_files = 10;
7301 DestroyAndReopen(options);
7302 Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
7303 ASSERT_EQ(s3, Status::NotSupported());
7304
7305 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
7306 }
7307
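// Editorial sketch: the API contract exercised above. The oldest-file creation
// time is only tracked when the DB keeps every file open.
[[maybe_unused]] static Status OldestFileTimeSketch(DB* db, uint64_t* t) {
  // Returns NotSupported unless the DB was opened with max_open_files == -1.
  return db->GetCreationTimeOfOldestFile(t);
}
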
7308 TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) {
7309 Options options = CurrentOptions();
7310 options.max_write_buffer_size_to_maintain = 10000;
7311 options.write_buffer_size = 160000;
7312 Reopen(options);
7313 Random rnd(301);
7314 bool memory_limit_exceeded = false;
7315
7316 ColumnFamilyData* cfd =
7317 static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
7318
7319 for (int i = 0; i < 1000; i++) {
7320 std::string value = rnd.RandomString(1000);
7321 ASSERT_OK(Put("keykey_" + std::to_string(i), value));
7322
7323 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
7324
7325 const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage();
7326 const uint64_t size_all_mem_table =
7327 cur_active_mem + cfd->imm()->ApproximateMemoryUsage();
7328
7329 // Errors out if memory usage keeps increasing beyond the limit.
7330 // Once the limit is exceeded, memory_limit_exceeded is set; if
7331 // size_all_mem_table doesn't drop on the next write, the test errors out
7332 // (unexpected behaviour). If memory usage drops,
7333 // memory_limit_exceeded is reset to false.
7334 if ((size_all_mem_table > cur_active_mem) &&
7335 (cur_active_mem >=
7336 static_cast<uint64_t>(options.max_write_buffer_size_to_maintain)) &&
7337 (size_all_mem_table >
7338 static_cast<uint64_t>(options.max_write_buffer_size_to_maintain) +
7339 options.write_buffer_size)) {
7340 ASSERT_FALSE(memory_limit_exceeded);
7341 memory_limit_exceeded = true;
7342 } else {
7343 memory_limit_exceeded = false;
7344 }
7345 }
7346 }
7347
7348 TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) {
7349 Options options = CurrentOptions();
7350 options.disable_auto_compactions = true;
7351 Reopen(options);
7352 Random rnd(403);
7353
7354 for (int i = 0; i < 20; i++) {
7355 ASSERT_OK(Put("key_" + std::to_string(i), rnd.RandomString(10)));
7356 ASSERT_OK(Flush());
7357 }
7358 ASSERT_EQ(GetSstFileCount(dbname_), 20);
7359
7360 // We need !disable_auto_compactions for writes to stall, but we also want
7361 // to delay compaction so stalled writes get unblocked by
7362 // kShutdownInProgress. BG compaction will first wait for the sync point
7363 // DBTest::ShuttingDownNotBlockStalledWrites, then wait an extra 2 sec to
7364 // allow CancelAllBackgroundWork() to set shutting_down_.
7365 SyncPoint::GetInstance()->SetCallBack(
7366 "BackgroundCallCompaction:0",
7367 [&](void* /* arg */) { env_->SleepForMicroseconds(2 * 1000 * 1000); });
7368 SyncPoint::GetInstance()->LoadDependency(
7369 {{"DBImpl::DelayWrite:Wait", "DBTest::ShuttingDownNotBlockStalledWrites"},
7370 {"DBTest::ShuttingDownNotBlockStalledWrites",
7371 "BackgroundCallCompaction:0"}});
7372 SyncPoint::GetInstance()->EnableProcessing();
7373
7374 options.level0_stop_writes_trigger = 20;
7375 options.disable_auto_compactions = false;
7376 Reopen(options);
7377
7378 std::thread thd([&]() {
7379 Status s = Put("key_" + std::to_string(101), "101");
7380 ASSERT_EQ(s.code(), Status::kShutdownInProgress);
7381 });
7382
7383 TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites");
7384 CancelAllBackgroundWork(db_, true);
7385
7386 thd.join();
7387 }
7388 #endif
7389
7390 } // namespace ROCKSDB_NAMESPACE
7391
7392 int main(int argc, char** argv) {
7393 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
7394 ::testing::InitGoogleTest(&argc, argv);
7395 RegisterCustomObjects(argc, argv);
7396 return RUN_ALL_TESTS();
7397 }